libxml2 Reference Manual |
---|
HTMLparser - interface for an HTML 4.0 non-verifying parser
This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.
Author(s): Daniel Veillard
#define htmlDefaultSubelement(elt); #define htmlElementAllowedHereDesc(parent, elt); #define htmlRequiredAttrs(elt); typedef xmlDocPtr htmlDocPtr; typedef struct _htmlElemDesc htmlElemDesc; typedef htmlElemDesc * htmlElemDescPtr; typedef struct _htmlEntityDesc htmlEntityDesc; typedef htmlEntityDesc * htmlEntityDescPtr; typedef xmlNodePtr htmlNodePtr; typedef xmlParserCtxt htmlParserCtxt; typedef xmlParserCtxtPtr htmlParserCtxtPtr; typedef xmlParserInput htmlParserInput; typedef xmlParserInputPtr htmlParserInputPtr; typedef xmlParserNodeInfo htmlParserNodeInfo; typedef enum htmlParserOption; typedef xmlSAXHandler htmlSAXHandler; typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; typedef enum htmlStatus; int UTF8ToHtml (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); htmlStatus htmlAttrAllowed (const htmlElemDesc * elt,
const xmlChar * attr,
int legacy); int htmlAutoCloseTag (htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem); htmlParserCtxtPtr htmlCreateFileParserCtxt (const char * filename,
const char * encoding); htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer,
int size); htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc); htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt,
const xmlChar * str,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options); void htmlCtxtReset (htmlParserCtxtPtr ctxt); int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options); int htmlElementAllowedHere (const htmlElemDesc * parent,
const xmlChar * elt); htmlStatus htmlElementStatusHere (const htmlElemDesc * parent,
const htmlElemDesc * elt); int htmlEncodeEntities (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar); const htmlEntityDesc * htmlEntityLookup (const xmlChar * name); const htmlEntityDesc * htmlEntityValueLookup (unsigned int value); void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); int htmlHandleOmittedElem (int val); void htmlInitAutoClose (void); int htmlIsAutoClosed (htmlDocPtr doc,
htmlNodePtr elem); int htmlIsScriptAttribute (const xmlChar * name); htmlParserCtxtPtr htmlNewParserCtxt (void); htmlParserCtxtPtr htmlNewSAXParserCtxt (const htmlSAXHandler * sax,
void * userData); htmlStatus htmlNodeStatus (const htmlNodePtr node,
int legacy); int htmlParseCharRef (htmlParserCtxtPtr ctxt); int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate); htmlDocPtr htmlParseDoc (const xmlChar * cur,
const char * encoding); int htmlParseDocument (htmlParserCtxtPtr ctxt); void htmlParseElement (htmlParserCtxtPtr ctxt); const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt,
const xmlChar ** str); htmlDocPtr htmlParseFile (const char * filename,
const char * encoding); htmlDocPtr htmlReadDoc (const xmlChar * cur,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlReadFd (int fd,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlReadFile (const char * filename,
const char * encoding,
int options); htmlDocPtr htmlReadIO (xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlReadMemory (const char * buffer,
int size,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlSAXParseDoc (const xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); htmlDocPtr htmlSAXParseFile (const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); const htmlElemDesc * htmlTagLookup (const xmlChar * tag);
#define htmlDefaultSubelement(elt);
Returns the default subelement for this element
elt: | HTML element |
#define htmlElementAllowedHereDesc(parent, elt);
Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.
parent: | HTML parent element |
elt: | HTML element |
#define htmlRequiredAttrs(elt);
Returns the attributes required for the specified element.
elt: | HTML element |
struct _htmlElemDesc { const char * name : The tag name char startTag : Whether the start tag can be implied char endTag : Whether the end tag can be implied char saveEndTag : Whether the end tag should be saved char empty : Is this an empty element ? char depr : Is this a deprecated element ? char dtd : 1: only in Loose DTD, 2: only Frameset one char isinline : is this a block 0 or inline 1 element const char * desc : the description NRK Jan.2003 * New fields encapsulating HTML structure const char ** subelts : allowed sub-elements of this element const char * defaultsubelt : subelement for suggested auto-repair if necessary or NULL const char ** attrs_opt : Optional Attributes const char ** attrs_depr : Additional deprecated attributes const char ** attrs_req : Required attributes } htmlElemDesc;
htmlElemDesc * htmlElemDescPtr;
struct _htmlEntityDesc { unsigned int value : the UNICODE value for the character const char * name : The entity name const char * desc : the description } htmlEntityDesc;
htmlEntityDesc * htmlEntityDescPtr;
xmlNodePtr htmlNodePtr;
xmlParserCtxt htmlParserCtxt;
xmlParserCtxtPtr htmlParserCtxtPtr;
xmlParserInput htmlParserInput;
xmlParserInputPtr htmlParserInputPtr;
xmlParserNodeInfo htmlParserNodeInfo;
enum htmlParserOption { HTML_PARSE_RECOVER = 1 /* Relaxed parsing */ HTML_PARSE_NODEFDTD = 4 /* do not default a doctype if not found */ HTML_PARSE_NOERROR = 32 /* suppress error reports */ HTML_PARSE_NOWARNING = 64 /* suppress warning reports */ HTML_PARSE_PEDANTIC = 128 /* pedantic error reporting */ HTML_PARSE_NOBLANKS = 256 /* remove blank nodes */ HTML_PARSE_NONET = 2048 /* Forbid network access */ HTML_PARSE_NOIMPLIED = 8192 /* Do not add implied html/body... elements */ HTML_PARSE_COMPACT = 65536 /* compact small text nodes */ HTML_PARSE_IGNORE_ENC = 2097152 /* ignore internal document encoding hint */ };
xmlSAXHandler htmlSAXHandler;
xmlSAXHandlerPtr htmlSAXHandlerPtr;
enum htmlStatus { HTML_NA = 0 /* something we don't check at all */ HTML_INVALID = 1 HTML_DEPRECATED = 2 HTML_VALID = 4 HTML_REQUIRED = 12 /* VALID bit set so ( & HTML_VALID ) is TRUE */ };
int UTF8ToHtml (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out: | a pointer to an array of bytes to store the result |
outlen: | the length of @out |
in: | a pointer to an array of UTF-8 chars |
inlen: | the length of @in |
Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced. |
htmlStatus htmlAttrAllowed (const htmlElemDesc * elt,
const xmlChar * attr,
int legacy)
Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.
elt: | HTML element |
attr: | HTML attribute |
legacy: | whether to allow deprecated attributes |
Returns: | one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
int htmlAutoCloseTag (htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
doc: | the HTML document |
name: | The tag name |
elem: | the HTML element |
Returns: | 1 if autoclose, 0 otherwise |
htmlParserCtxtPtr htmlCreateFileParserCtxt (const char * filename,
const char * encoding)
Create a parser context for a file content. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.
filename: | the filename |
encoding: | a free form C string describing the HTML document encoding, or NULL |
Returns: | the new parser context or NULL |
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer,
int size)
Create a parser context for an HTML in-memory document.
buffer: | a pointer to a char array |
size: | the size of the array |
Returns: | the new parser context or NULL |
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)
Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and error/warning reports.
sax: | a SAX handler |
user_data: | The user data returned on SAX callbacks |
chunk: | a pointer to an array of chars |
size: | number of chars in the array |
filename: | an optional file name or URI |
enc: | an optional encoding |
Returns: | the new parser context or NULL |
htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt,
const xmlChar * str,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context |
str: | a pointer to a zero terminated string |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options)
Parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context |
fd: | an open file descriptor |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options)
Parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context |
filename: | a file or URL |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context
ctxt: | an HTML parser context |
ioread: | an I/O read function |
ioclose: | an I/O close function |
ioctx: | an I/O handler |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context |
buffer: | a pointer to a char array |
size: | the size of the array |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
void htmlCtxtReset (htmlParserCtxtPtr ctxt)
Reset a parser context
ctxt: | an HTML parser context |
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options)
Applies the options to the parser context
ctxt: | an HTML parser context |
options: | a combination of htmlParserOption(s) |
Returns: | 0 in case of success, the set of unknown or unimplemented options in case of error. |
int htmlElementAllowedHere (const htmlElemDesc * parent,
const xmlChar * elt)
Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements
parent: | HTML parent element |
elt: | HTML element |
Returns: | 1 if allowed; 0 otherwise. |
htmlStatus htmlElementStatusHere (const htmlElemDesc * parent,
const htmlElemDesc * elt)
Checks whether an HTML element may be a direct child of a parent element, and if so whether it is valid or deprecated.
parent: | HTML parent element |
elt: | HTML element |
Returns: | one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
int htmlEncodeEntities (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out: | a pointer to an array of bytes to store the result |
outlen: | the length of @out |
in: | a pointer to an array of UTF-8 chars |
inlen: | the length of @in |
quoteChar: | the quote character to escape (' or ") or zero. |
Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced. |
const htmlEntityDesc * htmlEntityLookup (const xmlChar * name)
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
name: | the entity name |
Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. |
const htmlEntityDesc * htmlEntityValueLookup (unsigned int value)
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
value: | the entity's unicode value |
Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. |
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt)
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
ctxt: | an HTML parser context |
int htmlHandleOmittedElem (int val)
Set and return the previous value for handling HTML omitted tags.
val: | int 0 or 1 |
Returns: | the previous value: 0 for no handling, 1 for auto insertion. |
int htmlIsAutoClosed (htmlDocPtr doc,
htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
doc: | the HTML document |
elem: | the HTML element |
Returns: | 1 if autoclosed, 0 otherwise |
int htmlIsScriptAttribute (const xmlChar * name)
Check if an attribute is of content type Script
htmlParserCtxtPtr htmlNewParserCtxt (void)
Allocate and initialize a new parser context.
Returns: | the htmlParserCtxtPtr or NULL in case of allocation error |
htmlParserCtxtPtr htmlNewSAXParserCtxt (const htmlSAXHandler * sax,
void * userData)
Allocate and initialize a new SAX parser context. If userData is NULL, the parser context will be passed as user data.
sax: | SAX handler |
userData: | user data |
Returns: | the htmlParserCtxtPtr or NULL in case of allocation error |
htmlStatus htmlNodeStatus (const htmlNodePtr node,
int legacy)
Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)
node: | an htmlNodePtr in a tree |
legacy: | whether to allow deprecated elements (YES is faster here for Element nodes) |
Returns: | for Element nodes, a return from htmlElementAllowedHere (if legacy allowed) or htmlElementStatusHere (otherwise); for Attribute nodes, a return from htmlAttrAllowed; for other nodes, HTML_NA (no checks performed). |
int htmlParseCharRef (htmlParserCtxtPtr ctxt)
DEPRECATED: Internal function, don't use. parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
ctxt: | an HTML parser context |
Returns: | the value parsed (as an int) |
int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate)
Parse a Chunk of memory
ctxt: | an HTML parser context |
chunk: | a char array |
size: | the size in bytes of the chunk |
terminate: | last chunk indicator |
Returns: | zero if no error, the xmlParserErrors otherwise. |
htmlDocPtr htmlParseDoc (const xmlChar * cur,
const char * encoding)
parse an HTML in-memory document and build a tree.
cur: | a pointer to an array of xmlChar |
encoding: | a free form C string describing the HTML document encoding, or NULL |
Returns: | the resulting document tree |
int htmlParseDocument (htmlParserCtxtPtr ctxt)
parse an HTML document (and build a tree if using the standard SAX interface).
ctxt: | an HTML parser context |
Returns: | 0 on success, -1 in case of error. The parser context is augmented as a result of the parsing. |
void htmlParseElement (htmlParserCtxtPtr ctxt)
DEPRECATED: Internal function, don't use. Parse an HTML element; this is highly recursive. It is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue
ctxt: | an HTML parser context |
const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt,
const xmlChar ** str)
DEPRECATED: Internal function, don't use. Parse an HTML entity reference. [68] EntityRef ::= '&' Name ';'
ctxt: | an HTML parser context |
str: | location to store the entity name |
Returns: | the associated htmlEntityDescPtr if found, or NULL otherwise. If non-NULL, *str will have to be freed by the caller. |
htmlDocPtr htmlParseFile (const char * filename,
const char * encoding)
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.
filename: | the filename |
encoding: | a free form C string describing the HTML document encoding, or NULL |
Returns: | the resulting document tree |
htmlDocPtr htmlReadDoc (const xmlChar * cur,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree.
cur: | a pointer to a zero terminated string |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlReadFd (int fd,
const char * URL,
const char * encoding,
int options)
Parse an HTML document from a file descriptor and build a tree. NOTE that the file descriptor will not be closed when the reader is closed or reset.
fd: | an open file descriptor |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlReadFile (const char * filename,
const char * encoding,
int options)
Parse an HTML file from the filesystem or the network.
filename: | a file or URL |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
parse an HTML document from I/O functions and source and build a tree.
ioread: | an I/O read function |
ioclose: | an I/O close function |
ioctx: | an I/O handler |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlReadMemory (const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree.
buffer: | a pointer to a char array |
size: | the size of the array |
URL: | the base URL to use for the document |
encoding: | the document encoding, or NULL |
options: | a combination of htmlParserOption(s) |
Returns: | the resulting document tree |
htmlDocPtr htmlSAXParseDoc (const xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc. Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
cur: | a pointer to an array of xmlChar |
encoding: | a free form C string describing the HTML document encoding, or NULL |
sax: | the SAX handler block |
userData: | if using SAX, this pointer will be provided on callbacks. |
Returns: | the resulting document tree unless sax is non-NULL or the document is not well formed. |
htmlDocPtr htmlSAXParseFile (const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile. Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.
filename: | the filename |
encoding: | a free form C string describing the HTML document encoding, or NULL |
sax: | the SAX handler block |
userData: | if using SAX, this pointer will be provided on callbacks. |
Returns: | the resulting document tree unless sax is non-NULL or the document is not well formed. |
const htmlElemDesc * htmlTagLookup (const xmlChar * tag)
Lookup the HTML tag in the ElementTable
tag: | The tag name in lowercase |
Returns: | the related htmlElemDescPtr or NULL if not found. |