1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-04-24 18:50:07 +03:00

html: Stop reporting syntax errors

It doesn't make much sense to keep the old syntax error handling, which
doesn't conform to HTML5.

Handling HTML5 parser errors is rather involved and not essential for
parsers.
This commit is contained in:
Nick Wellnhofer 2024-09-11 17:29:59 +02:00
parent c6af101728
commit e179f3ec0e
8 changed files with 53 additions and 129 deletions

View File

@ -2958,28 +2958,18 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
if (CUR == '"') { if (CUR == '"') {
SKIP(1); SKIP(1);
ret = htmlParseHTMLAttribute(ctxt, '"'); ret = htmlParseHTMLAttribute(ctxt, '"');
if (CUR != '"') { if (CUR == '"')
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
"AttValue: \" expected\n", NULL, NULL);
} else
SKIP(1); SKIP(1);
} else if (CUR == '\'') { } else if (CUR == '\'') {
SKIP(1); SKIP(1);
ret = htmlParseHTMLAttribute(ctxt, '\''); ret = htmlParseHTMLAttribute(ctxt, '\'');
if (CUR != '\'') { if (CUR == '\'')
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
"AttValue: ' expected\n", NULL, NULL);
} else
SKIP(1); SKIP(1);
} else { } else {
/* /*
* That's an HTMLism, the attribute value may not be quoted * That's an HTMLism, the attribute value may not be quoted
*/ */
ret = htmlParseHTMLAttribute(ctxt, 0); ret = htmlParseHTMLAttribute(ctxt, 0);
if (ret == NULL) {
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
"AttValue: no value found\n", NULL, NULL);
}
} }
return(ret); return(ret);
} }
@ -3561,11 +3551,8 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
*value = NULL; *value = NULL;
name = htmlParseHTMLName(ctxt, 1); name = htmlParseHTMLName(ctxt, 1);
if (name == NULL) { if (name == NULL)
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"error parsing attribute name\n", NULL, NULL);
return(NULL); return(NULL);
}
/* /*
* read the value * read the value
@ -3702,55 +3689,53 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
GROW; GROW;
name = htmlParseHTMLName(ctxt, 0); name = htmlParseHTMLName(ctxt, 0);
if (name == NULL) { if (name == NULL)
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
return -1; return -1;
}
if (xmlStrEqual(name, BAD_CAST"meta")) if (xmlStrEqual(name, BAD_CAST"meta"))
meta = 1; meta = 1;
/* if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
* Check for auto-closure of HTML elements. /*
*/ * Check for auto-closure of HTML elements.
htmlAutoClose(ctxt, name); */
htmlAutoClose(ctxt, name);
/* /*
* Check for implied HTML elements. * Check for implied HTML elements.
*/ */
htmlCheckImplied(ctxt, name); htmlCheckImplied(ctxt, name);
/* /*
* Avoid html at any level > 0, head at any level != 1 * Avoid html at any level > 0, head at any level != 1
* or any attempt to recurse body * or any attempt to recurse body
*/ */
if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"htmlParseStartTag: misplaced <html> tag\n", "htmlParseStartTag: misplaced <html> tag\n",
name, NULL); name, NULL);
discardtag = 1; discardtag = 1;
ctxt->depth++; ctxt->depth++;
} }
if ((ctxt->nameNr != 1) && if ((ctxt->nameNr != 1) &&
(xmlStrEqual(name, BAD_CAST"head"))) { (xmlStrEqual(name, BAD_CAST"head"))) {
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"htmlParseStartTag: misplaced <head> tag\n", "htmlParseStartTag: misplaced <head> tag\n",
name, NULL); name, NULL);
discardtag = 1; discardtag = 1;
ctxt->depth++; ctxt->depth++;
} }
if (xmlStrEqual(name, BAD_CAST"body")) { if (xmlStrEqual(name, BAD_CAST"body")) {
int indx; int indx;
for (indx = 0;indx < ctxt->nameNr;indx++) { for (indx = 0;indx < ctxt->nameNr;indx++) {
if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"htmlParseStartTag: misplaced <body> tag\n", "htmlParseStartTag: misplaced <body> tag\n",
name, NULL); name, NULL);
discardtag = 1; discardtag = 1;
ctxt->depth++; ctxt->depth++;
} }
} }
}
} }
/* /*
@ -3778,8 +3763,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
*/ */
for (i = 0; i < nbatts;i += 2) { for (i = 0; i < nbatts;i += 2) {
if (xmlStrEqual(atts[i], attname)) { if (xmlStrEqual(atts[i], attname)) {
htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
"Attribute %s redefined\n", attname, NULL);
if (attvalue != NULL) if (attvalue != NULL)
xmlFree(attvalue); xmlFree(attvalue);
goto failed; goto failed;
@ -3894,8 +3877,6 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
int i, ret; int i, ret;
if ((CUR != '<') || (NXT(1) != '/')) { if ((CUR != '<') || (NXT(1) != '/')) {
htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
"htmlParseEndTag: '</' not found\n", NULL, NULL);
return (0); return (0);
} }
SKIP(2); SKIP(2);
@ -4177,12 +4158,8 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
* Lookup the info for that element. * Lookup the info for that element.
*/ */
info = htmlTagLookup(name); info = htmlTagLookup(name);
if (info == NULL) { if (info != NULL)
htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
"Tag %s invalid\n", name, NULL);
} else {
ctxt->endCheckState = info->dataMode; ctxt->endCheckState = info->dataMode;
}
if (ctxt->record_info) if (ctxt->record_info)
htmlNodeInfoPush(ctxt, &node_info); htmlNodeInfoPush(ctxt, &node_info);
@ -4201,22 +4178,9 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
return(0); return(0);
} }
if (CUR == '>') { if (CUR != '>')
SKIP(1); return(0);
} else { SKIP(1);
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
"Couldn't find end of Start Tag %s\n", name, NULL);
/*
* end of parsing of this node.
*/
if (xmlStrEqual(name, ctxt->name)) {
htmlParserFinishElementParsing(ctxt);
nodePop(ctxt);
htmlnamePop(ctxt);
}
return(0);
}
/* /*
* Check for an Empty Element from DTD definition * Check for an Empty Element from DTD definition
@ -4358,10 +4322,6 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
* Wipe out everything which is before the first '<' * Wipe out everything which is before the first '<'
*/ */
SKIP_BLANKS; SKIP_BLANKS;
if (ctxt->input->cur >= ctxt->input->end) {
htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
"Document is empty\n", NULL, NULL);
}
if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
ctxt->sax->startDocument(ctxt->userData); ctxt->sax->startDocument(ctxt->userData);
@ -5018,12 +4978,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
* Lookup the info for that element. * Lookup the info for that element.
*/ */
info = htmlTagLookup(name); info = htmlTagLookup(name);
if (info == NULL) { if (info != NULL)
htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
"Tag %s invalid\n", name, NULL);
} else {
ctxt->endCheckState = info->dataMode; ctxt->endCheckState = info->dataMode;
}
/* /*
* Check for an Empty Element labeled the XML/SGML way * Check for an Empty Element labeled the XML/SGML way
@ -5041,28 +4997,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
break; break;
} }
if (CUR == '>') { if (CUR != '>')
SKIP(1); break;
} else { SKIP(1);
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
"Couldn't find end of Start Tag %s\n",
name, NULL);
/*
* end of parsing of this node.
*/
if (xmlStrEqual(name, ctxt->name)) {
htmlParserFinishElementParsing(ctxt);
nodePop(ctxt);
htmlnamePop(ctxt);
}
if (ctxt->record_info)
htmlNodeInfoPush(ctxt, &node_info);
ctxt->instate = XML_PARSER_CONTENT;
break;
}
/* /*
* Check for an Empty Element from DTD definition * Check for an Empty Element from DTD definition

View File

@ -50,8 +50,7 @@ chunk = "ar</foo>"
ctxt.htmlParseChunk(chunk, len(chunk), 1) ctxt.htmlParseChunk(chunk, len(chunk), 1)
ctxt=None ctxt=None
reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:error: Tag foo invalid reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:characters: bar:endElement foo:endElement body:endElement html:endDocument:"""
:characters: bar:endElement foo:endElement body:endElement html:endDocument:"""
if log != reference: if log != reference:
print("Error got: %s" % log) print("Error got: %s" % log)
print("Exprected: %s" % reference) print("Exprected: %s" % reference)

View File

@ -1,3 +0,0 @@
./test/HTML/names.html:3: HTML parser error : Tag o:p invalid
<o:p></o:p>
^

View File

@ -7,7 +7,6 @@ SAX.startElement(body)
SAX.characters( SAX.characters(
, 3) , 3)
SAX.startElement(o:p) SAX.startElement(o:p)
SAX.error: Tag o:p invalid
SAX.endElement(o:p) SAX.endElement(o:p)
SAX.characters( SAX.characters(
, 1) , 1)

View File

@ -1,3 +0,0 @@
./test/HTML/utf8bug.html:121: HTML parser error : Tag s1 invalid
ز همکاران است. روی آن کلیک کند.</FONT></FONT></STRONG><S1
^

View File

@ -422,7 +422,6 @@ SAX.endElement(font)
SAX.endElement(font) SAX.endElement(font)
SAX.endElement(strong) SAX.endElement(strong)
SAX.startElement(s1) SAX.startElement(s1)
SAX.error: Tag s1 invalid
SAX.characters( , 2) SAX.characters( , 2)
SAX.endElement(s1) SAX.endElement(s1)
SAX.endElement(div) SAX.endElement(div)

View File

@ -1,6 +1,3 @@
./test/HTML/wired.html:25: HTML parser error : Tag nobr invalid
<td bgcolor="#FF0000" align="left" valign="center"><nobr><img src="http://static
^
./test/HTML/wired.html:125: HTML parser error : Unexpected end tag : form ./test/HTML/wired.html:125: HTML parser error : Unexpected end tag : form
</tr> </form> </tr> </form>
^ ^

View File

@ -357,7 +357,6 @@ SAX.characters(
, 3) , 3)
SAX.startElement(td, bgcolor='#FF0000', align='left', valign='center') SAX.startElement(td, bgcolor='#FF0000', align='left', valign='center')
SAX.startElement(nobr) SAX.startElement(nobr)
SAX.error: Tag nobr invalid
SAX.startElement(img, src='http://static.wired.com/news/images/spacer.gif', width='344', height='1') SAX.startElement(img, src='http://static.wired.com/news/images/spacer.gif', width='344', height='1')
SAX.endElement(img) SAX.endElement(img)
SAX.startElement(br) SAX.startElement(br)