html: Stop reporting syntax errors

It doesn't make much sense to keep the old syntax error handling, which doesn't conform to HTML5. Handling HTML5 parser errors is rather involved and not essential for parsers.
2025-04-24 18:50:07 +03:00 · 2024-09-11 17:29:59 +02:00 · 2024-09-11 17:29:59 +02:00 · e179f3ec0e
commit e179f3ec0e
parent c6af101728
8 changed files with 53 additions and 129 deletions
--- a/HTMLparser.c
+++ b/HTMLparser.c
@ -2958,28 +2958,18 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
    if (CUR == '"') {
        SKIP(1);
 	ret = htmlParseHTMLAttribute(ctxt, '"');
-        if (CUR != '"') {
-	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
-	                 "AttValue: \" expected\n", NULL, NULL);
-	} else
+        if (CUR == '"')
 	    SKIP(1);
    } else if (CUR == '\'') {
        SKIP(1);
 	ret = htmlParseHTMLAttribute(ctxt, '\'');
-        if (CUR != '\'') {
-	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
-	                 "AttValue: ' expected\n", NULL, NULL);
-	} else
+        if (CUR == '\'')
 	    SKIP(1);
    } else {
        /*
 	 * That's an HTMLism, the attribute value may not be quoted
 	 */
 	ret = htmlParseHTMLAttribute(ctxt, 0);
-	if (ret == NULL) {
-	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
-	                 "AttValue: no value found\n", NULL, NULL);
-	}
    }
    return(ret);
 }
@ -3561,11 +3551,8 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {

    *value = NULL;
    name = htmlParseHTMLName(ctxt, 1);
-    if (name == NULL) {
-	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
-	             "error parsing attribute name\n", NULL, NULL);
+    if (name == NULL)
        return(NULL);
-    }

    /*
     * read the value
@ -3702,15 +3689,12 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {

    GROW;
    name = htmlParseHTMLName(ctxt, 0);
-    if (name == NULL) {
-	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
-	             "htmlParseStartTag: invalid element name\n",
-		     NULL, NULL);
+    if (name == NULL)
        return -1;
-    }
    if (xmlStrEqual(name, BAD_CAST"meta"))
 	meta = 1;

+    if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
        /*
         * Check for auto-closure of HTML elements.
         */
@ -3752,6 +3736,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
                }
            }
        }
+    }

    /*
     * Now parse the attributes, it ends up with the ending
@ -3778,8 +3763,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	     */
 	    for (i = 0; i < nbatts;i += 2) {
 	        if (xmlStrEqual(atts[i], attname)) {
-		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
-		                 "Attribute %s redefined\n", attname, NULL);
 		    if (attvalue != NULL)
 			xmlFree(attvalue);
 		    goto failed;
@ -3894,8 +3877,6 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
    int i, ret;

    if ((CUR != '<') || (NXT(1) != '/')) {
-        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
-	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
        return (0);
    }
    SKIP(2);
@ -4177,12 +4158,8 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
     * Lookup the info for that element.
     */
    info = htmlTagLookup(name);
-    if (info == NULL) {
-	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
-	             "Tag %s invalid\n", name, NULL);
-    } else {
+    if (info != NULL)
        ctxt->endCheckState = info->dataMode;
-    }

    if (ctxt->record_info)
        htmlNodeInfoPush(ctxt, &node_info);
@ -4201,22 +4178,9 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
 	return(0);
    }

-    if (CUR == '>') {
-        SKIP(1);
-    } else {
-	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
-	             "Couldn't find end of Start Tag %s\n", name, NULL);
-
-	/*
-	 * end of parsing of this node.
-	 */
-	if (xmlStrEqual(name, ctxt->name)) {
-            htmlParserFinishElementParsing(ctxt);
-	    nodePop(ctxt);
-	    htmlnamePop(ctxt);
-	}
+    if (CUR != '>')
        return(0);
-    }
+    SKIP(1);

    /*
     * Check for an Empty Element from DTD definition
@ -4358,10 +4322,6 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
     * Wipe out everything which is before the first '<'
     */
    SKIP_BLANKS;
-    if (ctxt->input->cur >= ctxt->input->end) {
-	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
-	             "Document is empty\n", NULL, NULL);
-    }

    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
 	ctxt->sax->startDocument(ctxt->userData);
@ -5018,12 +4978,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		 * Lookup the info for that element.
 		 */
 		info = htmlTagLookup(name);
-		if (info == NULL) {
-		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
-		                 "Tag %s invalid\n", name, NULL);
-                } else {
+		if (info != NULL)
                    ctxt->endCheckState = info->dataMode;
-		}

 		/*
 		 * Check for an Empty Element labeled the XML/SGML way
@ -5041,28 +4997,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    break;
 		}

-		if (CUR == '>') {
-		    SKIP(1);
-		} else {
-		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
-		                 "Couldn't find end of Start Tag %s\n",
-				 name, NULL);
-
-		    /*
-		     * end of parsing of this node.
-		     */
-		    if (xmlStrEqual(name, ctxt->name)) {
-                        htmlParserFinishElementParsing(ctxt);
-			nodePop(ctxt);
-			htmlnamePop(ctxt);
-		    }
-
-		    if (ctxt->record_info)
-		        htmlNodeInfoPush(ctxt, &node_info);
-
-		    ctxt->instate = XML_PARSER_CONTENT;
+		if (CUR != '>')
                    break;
-		}
+		SKIP(1);

 		/*
 		 * Check for an Empty Element from DTD definition
--- a/python/tests/pushSAXhtml.py
+++ b/python/tests/pushSAXhtml.py
@ -50,8 +50,7 @@ chunk = "ar</foo>"
 ctxt.htmlParseChunk(chunk, len(chunk), 1)
 ctxt=None

-reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:error: Tag foo invalid
-:characters: bar:endElement foo:endElement body:endElement html:endDocument:"""
+reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:characters: bar:endElement foo:endElement body:endElement html:endDocument:"""
 if log != reference:
    print("Error got: %s" % log)
    print("Exprected: %s" % reference)
--- a/result/HTML/names.html.err
+++ b/result/HTML/names.html.err
@ -1,3 +0,0 @@
-./test/HTML/names.html:3: HTML parser error : Tag o:p invalid
-  <o:p></o:p>
-      ^
--- a/result/HTML/names.html.sax
+++ b/result/HTML/names.html.sax
@ -7,7 +7,6 @@ SAX.startElement(body)
 SAX.characters(
  , 3)
 SAX.startElement(o:p)
-SAX.error: Tag o:p invalid
 SAX.endElement(o:p)
 SAX.characters(
 , 1)
--- a/result/HTML/utf8bug.html.err
+++ b/result/HTML/utf8bug.html.err
@ -1,3 +0,0 @@
-./test/HTML/utf8bug.html:121: HTML parser error : Tag s1 invalid
-ز همکاران است. روی آن کلیک کند.</FONT></FONT></STRONG><S1
-                                                                               ^
--- a/result/HTML/utf8bug.html.sax
+++ b/result/HTML/utf8bug.html.sax
@ -422,7 +422,6 @@ SAX.endElement(font)
 SAX.endElement(font)
 SAX.endElement(strong)
 SAX.startElement(s1)
-SAX.error: Tag s1 invalid
 SAX.characters(  , 2)
 SAX.endElement(s1)
 SAX.endElement(div)
--- a/result/HTML/wired.html.err
+++ b/result/HTML/wired.html.err
@ -1,6 +1,3 @@
-./test/HTML/wired.html:25: HTML parser error : Tag nobr invalid
-<td bgcolor="#FF0000" align="left" valign="center"><nobr><img src="http://static
-                                                        ^
 ./test/HTML/wired.html:125: HTML parser error : Unexpected end tag : form
 	</tr>    </form>
 	                ^
--- a/result/HTML/wired.html.sax
+++ b/result/HTML/wired.html.sax
@ -357,7 +357,6 @@ SAX.characters(
 , 3)
 SAX.startElement(td, bgcolor='#FF0000', align='left', valign='center')
 SAX.startElement(nobr)
-SAX.error: Tag nobr invalid
 SAX.startElement(img, src='http://static.wired.com/news/images/spacer.gif', width='344', height='1')
 SAX.endElement(img)
 SAX.startElement(br)