HTMLparser

Name

HTMLparser -- 

Synopsis



typedef     htmlParserCtxt;
typedef     htmlParserCtxtPtr;
typedef     htmlParserNodeInfo;
typedef     htmlSAXHandler;
typedef     htmlSAXHandlerPtr;
typedef     htmlParserInput;
typedef     htmlParserInputPtr;
typedef     htmlDocPtr;
typedef     htmlNodePtr;
struct      htmlElemDesc;
typedef     htmlElemDescPtr;
struct      htmlEntityDesc;
typedef     htmlEntityDescPtr;
const htmlElemDesc* htmlTagLookup           (const xmlChar *tag);
const htmlEntityDesc* htmlEntityLookup      (const xmlChar *name);
const htmlEntityDesc* htmlEntityValueLookup (unsigned int value);
int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);
int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);
const htmlEntityDesc* htmlParseEntityRef    (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);
int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);
void        htmlParseElement                (htmlParserCtxtPtr ctxt);
htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);
htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);
int         UTF8ToHtml                      (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);
int         htmlEncodeEntities              (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen,
                                             int quoteChar);
int         htmlIsScriptAttribute           (const xmlChar *name);
int         htmlHandleOmittedElem           (int val);
void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);
htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);
int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Description

Details

>htmlParserCtxt

typedef xmlParserCtxt htmlParserCtxt;


>htmlParserCtxtPtr

typedef xmlParserCtxtPtr htmlParserCtxtPtr;


>htmlParserNodeInfo

typedef xmlParserNodeInfo htmlParserNodeInfo;


>htmlSAXHandler

typedef xmlSAXHandler htmlSAXHandler;


>htmlSAXHandlerPtr

typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;


>htmlParserInput

typedef xmlParserInput htmlParserInput;


>htmlParserInputPtr

typedef xmlParserInputPtr htmlParserInputPtr;


>htmlDocPtr

typedef xmlDocPtr htmlDocPtr;


>htmlNodePtr

typedef xmlNodePtr htmlNodePtr;


>struct htmlElemDesc

struct htmlElemDesc {
    const char *name;	/* The tag name */
    char startTag;      /* Whether the start tag can be implied */
    char endTag;        /* Whether the end tag can be implied */
    char saveEndTag;    /* Whether the end tag should be saved */
    char empty;         /* Is this an empty element ? */
    char depr;          /* Is this a deprecated element ? */
    char dtd;           /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;      /* is this a block 0 or inline 1 element */
    const char *desc;   /* the description */
};


>htmlElemDescPtr

typedef htmlElemDesc *htmlElemDescPtr;


>struct htmlEntityDesc

struct htmlEntityDesc {
    unsigned int value;	/* the UNICODE value for the character */
    const char *name;	/* The entity name */
    const char *desc;   /* the description */
};


>htmlEntityDescPtr

typedef htmlEntityDesc *htmlEntityDescPtr;


>htmlTagLookup ()

const htmlElemDesc* htmlTagLookup           (const xmlChar *tag);

Lookup the HTML tag in the ElementTable

tag : The tag name in lowercase
Returns :the related htmlElemDescPtr or NULL if not found.


>htmlEntityLookup ()

const htmlEntityDesc* htmlEntityLookup      (const xmlChar *name);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly, a hash table is really needed.

name : the entity name
Returns :the associated htmlEntityDescPtr if found, NULL otherwise.


>htmlEntityValueLookup ()

const htmlEntityDesc* htmlEntityValueLookup (unsigned int value);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly, a hash table is really needed.

value : the entity's unicode value
Returns :the associated htmlEntityDescPtr if found, NULL otherwise.


>htmlIsAutoClosed ()

int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if a tag is autoclosed by one of its children.

doc : the HTML document
elem : the HTML element
Returns :1 if autoclosed, 0 otherwise


>htmlAutoCloseTag ()

int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.

doc : the HTML document
name : The tag name
elem : the HTML element
Returns :1 if autoclose, 0 otherwise


>htmlParseEntityRef ()

const htmlEntityDesc* htmlParseEntityRef    (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);

parse an HTML ENTITY reference

[68] EntityRef ::= '&' Name ';'

ctxt : an HTML parser context
str : location to store the entity name
Returns :the associated htmlEntityDescPtr if found, or NULL otherwise. If non-NULL, *str will have to be freed by the caller.


>htmlParseCharRef ()

int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);

parse Reference declarations

[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'

ctxt : an HTML parser context
Returns :the value parsed (as an int)


>htmlParseElement ()

void        htmlParseElement                (htmlParserCtxtPtr ctxt);

parse an HTML element, this is highly recursive

[39] element ::= EmptyElemTag | STag content ETag

[41] Attribute ::= Name Eq AttValue

ctxt : an HTML parser context


>htmlSAXParseDoc ()

htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fall back to the default DOM behavior and return a tree.

cur : a pointer to an array of xmlChar
encoding : a free form C string describing the HTML document encoding, or NULL
sax : the SAX handler block
userData : if using SAX, this pointer will be provided on callbacks.
Returns :the resulting document tree, unless sax is not NULL or the document is not well formed.


>htmlParseDoc ()

htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);

parse an HTML in-memory document and build a tree.

cur : a pointer to an array of xmlChar
encoding : a free form C string describing the HTML document encoding, or NULL
Returns :the resulting document tree


>htmlSAXParseFile ()

htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.

filename : the filename
encoding : a free form C string describing the HTML document encoding, or NULL
sax : the SAX handler block
userData : if using SAX, this pointer will be provided on callbacks.
Returns :the resulting document tree, unless sax is not NULL or the document is not well formed.


>htmlParseFile ()

htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time.

filename : the filename
encoding : a free form C string describing the HTML document encoding, or NULL
Returns :the resulting document tree


>UTF8ToHtml ()

int         UTF8ToHtml                      (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out : a pointer to an array of bytes to store the result
outlen : the length of out
in : a pointer to an array of UTF-8 chars
inlen : the length of in
Returns :0 if success, -2 if the transcoding fails, or -1 otherwise. The value of inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of outlen after return is the number of octets produced.


>htmlEncodeEntities ()

int         htmlEncodeEntities              (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen,
                                             int quoteChar);

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out : a pointer to an array of bytes to store the result
outlen : the length of out
in : a pointer to an array of UTF-8 chars
inlen : the length of in
quoteChar : the quote character to escape (' or ") or zero.
Returns :0 if success, -2 if the transcoding fails, or -1 otherwise. The value of inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of outlen after return is the number of octets produced.


>htmlIsScriptAttribute ()

int         htmlIsScriptAttribute           (const xmlChar *name);

Check if an attribute is of content type Script

name : an attribute name
Returns :1 if the attribute is a script, 0 otherwise


>htmlHandleOmittedElem ()

int         htmlHandleOmittedElem           (int val);

Set and return the previous value for handling HTML omitted tags.

val : int 0 or 1
Returns :the previous value: 0 for no handling, 1 for auto insertion.


>htmlFreeParserCtxt ()

void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);

Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.

ctxt : an HTML parser context


>htmlCreatePushParserCtxt ()

htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);

Create a parser context for using the HTML parser in push mode. To allow content encoding detection, size should be >= 4. The value of filename is used for fetching external entities and error/warning reports.

sax : a SAX handler
user_data : The user data returned on SAX callbacks
chunk : a pointer to an array of chars
size : number of chars in the array
filename : an optional file name or URI
enc : an optional encoding
Returns :the new parser context or NULL


>htmlParseChunk ()

int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Parse a Chunk of memory

ctxt : an HTML parser context
chunk : a char array
size : the size in bytes of the chunk
terminate : last chunk indicator
Returns :zero if no error, the xmlParserErrors otherwise.