1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

html: Rework options

Introduce htmlCtxtSetOptions, see similar changes made to XML parser.

Add HTML_PARSE_HUGE alias. Support HTML_PARSE_BIG_LINES.
This commit is contained in:
Nick Wellnhofer 2024-09-11 19:06:06 +02:00
parent 16de1346eb
commit 462bf0b7a5
2 changed files with 195 additions and 70 deletions

View File

@ -2623,7 +2623,7 @@ static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
int len = 0, l;
int c;
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_TEXT_LENGTH :
XML_MAX_NAME_LENGTH;
const xmlChar *base = ctxt->input->base;
@ -2796,7 +2796,7 @@ static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, int stop) {
xmlChar *buffer = NULL;
int buffer_size = 0;
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_HUGE_LENGTH :
XML_MAX_TEXT_LENGTH;
xmlChar *out = NULL;
@ -3162,7 +3162,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
int len;
int size = HTML_PARSER_BUFFER_SIZE;
int cur, l;
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_HUGE_LENGTH :
XML_MAX_TEXT_LENGTH;
xmlParserInputState state;
@ -3356,7 +3356,7 @@ htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
int len;
int size = HTML_PARSER_BUFFER_SIZE;
int quote, cur, l;
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_TEXT_LENGTH :
XML_MAX_NAME_LENGTH;
@ -3424,7 +3424,7 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
xmlChar *publicId = NULL;
xmlChar *URI = NULL;
int nameCap, nameSize;
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
XML_MAX_TEXT_LENGTH :
XML_MAX_NAME_LENGTH;
@ -5714,12 +5714,178 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
xmlResetError(&ctxt->lastError);
}
static int
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
{
int allMask;
if (ctxt == NULL)
return(-1);
allMask = HTML_PARSE_RECOVER |
HTML_PARSE_HTML5 |
HTML_PARSE_NODEFDTD |
HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING |
HTML_PARSE_PEDANTIC |
HTML_PARSE_NOBLANKS |
HTML_PARSE_NONET |
HTML_PARSE_NOIMPLIED |
HTML_PARSE_COMPACT |
HTML_PARSE_HUGE |
HTML_PARSE_IGNORE_ENC |
HTML_PARSE_BIG_LINES;
ctxt->options = (ctxt->options & keepMask) | (options & allMask);
/*
* For some options, struct members are historically the source
* of truth. See xmlCtxtSetOptionsInternal.
*/
ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
/*
* Changing SAX callbacks is a bad idea. This should be fixed.
*/
if (options & HTML_PARSE_NOBLANKS) {
ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
}
if (options & HTML_PARSE_HUGE) {
if (ctxt->dict != NULL)
xmlDictSetLimit(ctxt->dict, 0);
}
/*
* It would be useful to allow this feature.
*/
ctxt->dictNames = 0;
ctxt->linenumbers = 1;
return(options & ~allMask);
}
/**
* htmlCtxtSetOptions:
* @ctxt: an HTML parser context
* @options: a bitmask of xmlParserOption values
*
* Applies the options to the parser context. Unset options are
* cleared.
*
* Available since 2.14.0. With older versions, you can use
* htmlCtxtUseOptions.
*
* HTML_PARSE_RECOVER
*
* No effect as of 2.14.0.
*
* HTML_PARSE_HTML5
*
* Make the tokenizer emit a SAX callback for each token. This results
* in unbalanced invocations of startElement and endElement.
*
* For now, this is only usable with custom SAX callbacks.
*
* HTML_PARSE_NODEFDTD
*
* Do not default to a doctype if none was found.
*
* HTML_PARSE_NOERROR
*
* Disable error and warning reports to the error handlers.
* Errors are still accessible with xmlCtxtGetLastError.
*
* HTML_PARSE_NOWARNING
*
* Disable warning reports.
*
* HTML_PARSE_PEDANTIC
*
* No effect.
*
* HTML_PARSE_NOBLANKS
*
* Remove some text nodes containing only whitespace from the
* result document. Which nodes are removed depends on a conservative
* heuristic. The reindenting feature of the serialization code relies
* on this option to be set when parsing. Use of this option is
* DISCOURAGED.
*
* HTML_PARSE_NONET
*
* No effect.
*
* HTML_PARSE_NOIMPLIED
*
* Do not add implied html, head or body elements.
*
* HTML_PARSE_COMPACT
*
* Store small strings directly in the node struct to save
* memory.
*
* HTML_PARSE_HUGE
*
* Relax some internal limits.
*
* Available since 2.14.0. Use XML_PARSE_HUGE works with older
* versions.
*
* Maximum size of text nodes, tags, comments, CDATA sections
*
* normal: 10M
* huge: 1B
*
* Maximum size of names, system literals, pubid literals
*
* normal: 50K
* huge: 10M
*
* Maximum nesting depth of elements
*
* normal: 256
* huge: 2048
*
* HTML_PARSE_IGNORE_ENC
*
* Ignore the encoding in the HTML declaration. This option is
* mostly unneeded these days. The only effect is to enforce
* UTF-8 decoding of ASCII-like data.
*
* HTML_PARSE_BIG_LINES
*
* Enable reporting of line numbers larger than 65535.
*
* Available since 2.14.0.
*
* Returns 0 in case of success, the set of unknown or unimplemented options
* in case of error.
*/
int
htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options)
{
return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
}
/**
* htmlCtxtUseOptions:
* @ctxt: an HTML parser context
* @options: a combination of htmlParserOption(s)
*
* Applies the options to the parser context
* DEPRECATED: Use htmlCtxtSetOptions.
*
* Applies the options to the parser context. The following options
* are never cleared and can only be enabled:
*
* HTML_PARSE_NODEFDTD
* HTML_PARSE_NOERROR
* HTML_PARSE_NOWARNING
* HTML_PARSE_NOIMPLIED
* HTML_PARSE_COMPACT
* HTML_PARSE_HUGE
* HTML_PARSE_IGNORE_ENC
* HTML_PARSE_BIG_LINES
*
* Returns 0 in case of success, the set of unknown or unimplemented options
* in case of error.
@ -5727,67 +5893,21 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
{
if (ctxt == NULL)
return(-1);
int keepMask;
if (options & HTML_PARSE_NOWARNING) {
ctxt->sax->warning = NULL;
ctxt->vctxt.warning = NULL;
options -= XML_PARSE_NOWARNING;
ctxt->options |= XML_PARSE_NOWARNING;
}
if (options & HTML_PARSE_NOERROR) {
ctxt->sax->error = NULL;
ctxt->vctxt.error = NULL;
ctxt->sax->fatalError = NULL;
options -= XML_PARSE_NOERROR;
ctxt->options |= XML_PARSE_NOERROR;
}
if (options & HTML_PARSE_PEDANTIC) {
ctxt->pedantic = 1;
options -= XML_PARSE_PEDANTIC;
ctxt->options |= XML_PARSE_PEDANTIC;
} else
ctxt->pedantic = 0;
if (options & XML_PARSE_NOBLANKS) {
ctxt->keepBlanks = 0;
ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
options -= XML_PARSE_NOBLANKS;
ctxt->options |= XML_PARSE_NOBLANKS;
} else
ctxt->keepBlanks = 1;
if (options & HTML_PARSE_RECOVER) {
ctxt->recovery = 1;
options -= HTML_PARSE_RECOVER;
} else
ctxt->recovery = 0;
if (options & HTML_PARSE_COMPACT) {
ctxt->options |= HTML_PARSE_COMPACT;
options -= HTML_PARSE_COMPACT;
}
if (options & XML_PARSE_HUGE) {
ctxt->options |= XML_PARSE_HUGE;
options -= XML_PARSE_HUGE;
}
if (options & HTML_PARSE_NODEFDTD) {
ctxt->options |= HTML_PARSE_NODEFDTD;
options -= HTML_PARSE_NODEFDTD;
}
if (options & HTML_PARSE_IGNORE_ENC) {
ctxt->options |= HTML_PARSE_IGNORE_ENC;
options -= HTML_PARSE_IGNORE_ENC;
}
if (options & HTML_PARSE_NOIMPLIED) {
ctxt->options |= HTML_PARSE_NOIMPLIED;
options -= HTML_PARSE_NOIMPLIED;
}
if (options & HTML_PARSE_HTML5) {
ctxt->options |= HTML_PARSE_HTML5;
options -= HTML_PARSE_HTML5;
}
ctxt->dictNames = 0;
ctxt->linenumbers = 1;
return (options);
/*
* For historic reasons, some options can only be enabled.
*/
keepMask = HTML_PARSE_NODEFDTD |
HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING |
HTML_PARSE_NOIMPLIED |
HTML_PARSE_COMPACT |
HTML_PARSE_HUGE |
HTML_PARSE_IGNORE_ENC |
HTML_PARSE_BIG_LINES;
return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
}
/**

View File

@ -205,21 +205,26 @@ XMLPUBFUN void
* to the xmlReadDoc() and similar calls.
*/
typedef enum {
HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
HTML_PARSE_RECOVER = 1<<0, /* No effect */
HTML_PARSE_HTML5 = 1<<1, /* HTML5 support */
HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
HTML_PARSE_PEDANTIC = 1<<7, /* No effect */
HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
HTML_PARSE_NONET = 1<<11,/* Forbid network access */
HTML_PARSE_NONET = 1<<11,/* No effect */
HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */
HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */
HTML_PARSE_HUGE = 1<<19,/* relax any hardcoded limit from the parser */
HTML_PARSE_IGNORE_ENC=1<<21,/* ignore internal document encoding hint */
HTML_PARSE_BIG_LINES= 1<<22 /* Store big lines numbers in text PSVI field */
} htmlParserOption;
XMLPUBFUN void
htmlCtxtReset (htmlParserCtxtPtr ctxt);
XMLPUBFUN int
htmlCtxtSetOptions (htmlParserCtxtPtr ctxt,
int options);
XMLPUBFUN int
htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options);