diff --git a/HTMLparser.c b/HTMLparser.c index d5c8e0e2..a48b2318 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -40,8 +40,6 @@ static int htmlOmittedDefaultValue = 1; -static void htmlParseComment(htmlParserCtxtPtr ctxt); - static int htmlParseElementInternal(htmlParserCtxtPtr ctxt); @@ -2545,23 +2543,6 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); -static void -htmlSkipBogusComment(htmlParserCtxtPtr ctxt) { - int c; - - htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT, - "Incorrectly opened comment\n", NULL, NULL); - - while (PARSER_STOPPED(ctxt) == 0) { - c = CUR; - if (c == 0) - break; - NEXT; - if (c == '>') - break; - } -} - /** * htmlParseHTMLName: * @ctxt: an HTML parser context @@ -3368,147 +3349,27 @@ htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { return(URI); } -/** - * htmlParsePI: - * @ctxt: an HTML parser context - * - * Parse an XML Processing Instruction. HTML5 doesn't allow processing - * instructions, so this will be removed at some point. - */ -static void -htmlParsePI(htmlParserCtxtPtr ctxt) { - xmlChar *buf = NULL; - int len = 0; - int size = HTML_PARSER_BUFFER_SIZE; - int cur, l; - int maxLength = (ctxt->options & XML_PARSE_HUGE) ? - XML_MAX_HUGE_LENGTH : - XML_MAX_TEXT_LENGTH; - const xmlChar *target; - xmlParserInputState state; - - if ((RAW == '<') && (NXT(1) == '?')) { - state = ctxt->instate; - ctxt->instate = XML_PARSER_PI; - /* - * this is a Processing Instruction. - */ - SKIP(2); - - /* - * Parse the target name and check for special support like - * namespace. - */ - target = htmlParseName(ctxt); - if (target != NULL) { - if (RAW == '>') { - SKIP(1); - - /* - * SAX: PI detected. - */ - if ((ctxt->sax) && (!ctxt->disableSAX) && - (ctxt->sax->processingInstruction != NULL)) - ctxt->sax->processingInstruction(ctxt->userData, - target, NULL); - goto done; - } - buf = xmlMalloc(size); - if (buf == NULL) { - htmlErrMemory(ctxt); - return; - } - cur = CUR; - if (!IS_BLANK(cur)) { - htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, - "ParsePI: PI %s space expected\n", target, NULL); - } - SKIP_BLANKS; - cur = CUR_CHAR(l); - while ((cur != 0) && (cur != '>')) { - if (len + 5 >= size) { - xmlChar *tmp; - - size *= 2; - tmp = (xmlChar *) xmlRealloc(buf, size); - if (tmp == NULL) { - htmlErrMemory(ctxt); - xmlFree(buf); - return; - } - buf = tmp; - } - if (IS_CHAR(cur)) { - COPY_BUF(buf,len,cur); - } else { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in processing instruction " - "0x%X\n", cur); - } - if (len > maxLength) { - htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, - "PI %s too long", target, NULL); - xmlFree(buf); - goto done; - } - NEXTL(l); - cur = CUR_CHAR(l); - } - buf[len] = 0; - if (cur != '>') { - htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, - "ParsePI: PI %s never end ...\n", target, NULL); - } else { - SKIP(1); - - /* - * SAX: PI detected. - */ - if ((ctxt->sax) && (!ctxt->disableSAX) && - (ctxt->sax->processingInstruction != NULL)) - ctxt->sax->processingInstruction(ctxt->userData, - target, buf); - } - xmlFree(buf); - } else { - htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, - "PI is not started correctly", NULL, NULL); - } - -done: - ctxt->instate = state; - } -} - /** * htmlParseComment: * @ctxt: an HTML parser context + * @bogus: true if this is a bogus comment * * Parse an HTML comment */ static void -htmlParseComment(htmlParserCtxtPtr ctxt) { +htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) { xmlChar *buf = NULL; int len; int size = HTML_PARSER_BUFFER_SIZE; - int q, ql; - int r, rl; int cur, l; - int next, nl; int maxLength = (ctxt->options & XML_PARSE_HUGE) ? XML_MAX_HUGE_LENGTH : XML_MAX_TEXT_LENGTH; xmlParserInputState state; - /* - * Check that there is a comment right here. - */ - if ((RAW != '<') || (NXT(1) != '!') || - (NXT(2) != '-') || (NXT(3) != '-')) return; - state = ctxt->instate; ctxt->instate = XML_PARSER_COMMENT; - SKIP(4); + buf = xmlMalloc(size); if (buf == NULL) { htmlErrMemory(ctxt); @@ -3516,36 +3377,34 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { } len = 0; buf[len] = 0; - q = CUR_CHAR(ql); - if (q == 0) - goto unfinished; - if (q == '>') { - htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL); - cur = '>'; - goto finished; - } - NEXTL(ql); - r = CUR_CHAR(rl); - if (r == 0) - goto unfinished; - if (q == '-' && r == '>') { - htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL); - cur = '>'; - goto finished; - } - NEXTL(rl); - cur = CUR_CHAR(l); - while ((cur != 0) && - ((cur != '>') || - (r != '-') || (q != '-'))) { - NEXTL(l); - next = CUR_CHAR(nl); - if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) { - htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, - "Comment incorrectly closed by '--!>'", NULL, NULL); - cur = '>'; - break; + cur = CUR_CHAR(l); + if (!bogus) { + if (cur == '>') { + SKIP(1); + goto done; + } else if ((cur == '-') && (NXT(1) == '>')) { + SKIP(2); + goto done; + } + } + + while (cur != 0) { + if (bogus) { + if (cur == '>') { + SKIP(1); + break; + } + } else { + if ((cur == '-') && (NXT(1) == '-')) { + if (NXT(2) == '>') { + SKIP(3); + break; + } else if ((NXT(2) == '!') && (NXT(3) == '>')) { + SKIP(4); + break; + } + } } if (len + 5 >= size) { @@ -3556,15 +3415,16 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { if (tmp == NULL) { xmlFree(buf); htmlErrMemory(ctxt); + ctxt->instate = state; return; } buf = tmp; } - if (IS_CHAR(q)) { - COPY_BUF(buf,len,q); + if (IS_CHAR(cur)) { + COPY_BUF(buf,len,cur); } else { htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in comment 0x%X\n", q); + "Invalid char in comment 0x%X\n", cur); } if (len > maxLength) { htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, @@ -3574,29 +3434,19 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { return; } - q = r; - ql = rl; - r = cur; - rl = l; - cur = next; - l = nl; - } -finished: - buf[len] = 0; - if (cur == '>') { - SKIP(1); - if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && - (!ctxt->disableSAX)) - ctxt->sax->comment(ctxt->userData, buf); - xmlFree(buf); - ctxt->instate = state; - return; + NEXTL(l); + cur = CUR_CHAR(l); } -unfinished: - htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, - "Comment not terminated \n diff --git a/result/HTML/758518-tag.html.err b/result/HTML/758518-tag.html.err deleted file mode 100644 index c912c91f..00000000 --- a/result/HTML/758518-tag.html.err +++ /dev/null @@ -1,3 +0,0 @@ -./test/HTML/758518-tag.html:1: HTML parser error : PI is not started correctly -“ -^ diff --git a/result/HTML/758518-tag.html.sax b/result/HTML/758518-tag.html.sax index fd4aa949..d94eb193 100644 --- a/result/HTML/758518-tag.html.sax +++ b/result/HTML/758518-tag.html.sax @@ -1,10 +1,4 @@ SAX.setDocumentLocator() SAX.startDocument() -SAX.error: PI is not started correctlySAX.startElement(html) -SAX.startElement(body) -SAX.startElement(p) -SAX.characters(“, 2) -SAX.endElement(p) -SAX.endElement(body) -SAX.endElement(html) +SAX.comment(?a“) SAX.endDocument() diff --git a/result/HTML/758606.html b/result/HTML/758606.html index 3974ca90..ee62ed7b 100644 --- a/result/HTML/758606.html +++ b/result/HTML/758606.html @@ -1,2 +1,3 @@ - + diff --git a/result/HTML/758606.html.err b/result/HTML/758606.html.err index bcb253eb..523b8b8b 100644 --- a/result/HTML/758606.html.err +++ b/result/HTML/758606.html.err @@ -1,7 +1,3 @@ ./test/HTML/758606.html:1: HTML parser error : Invalid char in comment 0xC diff --git a/result/HTML/758606_2.html.err b/result/HTML/758606_2.html.err index 88bcde6b..104a5e43 100644 --- a/result/HTML/758606_2.html.err +++ b/result/HTML/758606_2.html.err @@ -1,7 +1,3 @@ ./test/HTML/758606_2.html:1: HTML parser error : Invalid char in comment 0xC -‘' - whatwg guidance is - ^ diff --git a/result/HTML/comments.html.sax b/result/HTML/comments.html.sax index ee8fcd7b..011c6dc0 100644 --- a/result/HTML/comments.html.sax +++ b/result/HTML/comments.html.sax @@ -24,7 +24,7 @@ SAX.characters( SAX.startElement(div) SAX.characters( , 9) -SAX.error: Comment incorrectly closed by '--!>'SAX.comment(incorrectly closed comment) +SAX.comment(incorrectly closed comment) SAX.startElement(span, id='under-test') SAX.characters(whatwg guidance is that this s, 49) SAX.endElement(span) diff --git a/result/HTML/comments2.html.err b/result/HTML/comments2.html.err deleted file mode 100644 index 8d1f5926..00000000 --- a/result/HTML/comments2.html.err +++ /dev/null @@ -1,3 +0,0 @@ -./test/HTML/comments2.html:10: HTML parser error : Comment incorrectly closed by '--!>' - whatwg guidance is - ^ diff --git a/result/HTML/comments2.html.sax b/result/HTML/comments2.html.sax index d694f04f..4958a4f1 100644 --- a/result/HTML/comments2.html.sax +++ b/result/HTML/comments2.html.sax @@ -24,7 +24,7 @@ SAX.characters( SAX.startElement(div) SAX.characters( , 9) -SAX.error: Comment incorrectly closed by '--!>'SAX.comment(incorrectly closed comment) +SAX.comment(incorrectly closed comment) SAX.startElement(span, id='under-test') SAX.characters(whatwg guidance is that this s, 49) SAX.endElement(span) diff --git a/result/HTML/comments3.html.err b/result/HTML/comments3.html.err deleted file mode 100644 index 60e927ba..00000000 --- a/result/HTML/comments3.html.err +++ /dev/null @@ -1,6 +0,0 @@ -./test/HTML/comments3.html:10: HTML parser error : Comment abruptly ended - the previous node should be an empty comment, and this should be a - ^ -./test/HTML/comments3.html:13: HTML parser error : Comment abruptly ended - the previous node should be an empty comment, and this should be a - ^ diff --git a/result/HTML/comments3.html.sax b/result/HTML/comments3.html.sax index ab783f94..9f40a450 100644 --- a/result/HTML/comments3.html.sax +++ b/result/HTML/comments3.html.sax @@ -24,7 +24,7 @@ SAX.characters( SAX.startElement(div) SAX.characters( , 9) -SAX.error: Comment abruptly endedSAX.comment() +SAX.comment() SAX.characters(the previous node should be an, 86) SAX.endElement(div) SAX.characters( @@ -32,7 +32,7 @@ SAX.characters( SAX.startElement(div) SAX.characters( , 9) -SAX.error: Comment abruptly endedSAX.comment() +SAX.comment() SAX.characters(the previous node should be an, 86) SAX.endElement(div) SAX.characters( diff --git a/result/HTML/issue380.html b/result/HTML/issue380.html index 1fcf4965..13b7db1f 100644 --- a/result/HTML/issue380.html +++ b/result/HTML/issue380.html @@ -1,6 +1,6 @@ - ... + ... diff --git a/result/HTML/issue380.html.err b/result/HTML/issue380.html.err deleted file mode 100644 index efbb8bdf..00000000 --- a/result/HTML/issue380.html.err +++ /dev/null @@ -1,6 +0,0 @@ -./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment - ... - ^ -./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment - ... - ^ diff --git a/result/HTML/issue380.html.sax b/result/HTML/issue380.html.sax index 5df2b506..b53845cf 100644 --- a/result/HTML/issue380.html.sax +++ b/result/HTML/issue380.html.sax @@ -6,9 +6,9 @@ SAX.characters( SAX.startElement(body) SAX.characters( , 5) -SAX.error: Incorrectly opened comment +SAX.comment([if !supportLists]) SAX.characters(..., 3) -SAX.error: Incorrectly opened comment +SAX.comment([endif]) SAX.characters( , 3) SAX.endElement(body) diff --git a/result/HTML/python.html b/result/HTML/python.html index ea0be18e..befe71e0 100644 --- a/result/HTML/python.html +++ b/result/HTML/python.html @@ -1,5 +1,5 @@ - + Python Programming Language diff --git a/result/HTML/python.html.sax b/result/HTML/python.html.sax index ec300d35..f825954f 100644 --- a/result/HTML/python.html.sax +++ b/result/HTML/python.html.sax @@ -1,7 +1,7 @@ SAX.setDocumentLocator() SAX.startDocument() SAX.internalSubset(html, -//W3C//DTD HTML 4.01 Transitional//EN, http://www.w3.org/TR/html4/loose.dtd) -SAX.processingInstruction(xml-stylesheet, href="./css/ht2html.css" type="text/css"?) +SAX.comment(?xml-stylesheet href="./css/ht2html.css" type="text/css"?) SAX.startElement(html) SAX.characters( , 1) diff --git a/result/HTML/xml-declaration-1.html b/result/HTML/xml-declaration-1.html index 8c9ebe39..1d0ca6c0 100644 --- a/result/HTML/xml-declaration-1.html +++ b/result/HTML/xml-declaration-1.html @@ -1,4 +1,4 @@ - +

öäüß

diff --git a/result/HTML/xml-declaration-1.html.sax b/result/HTML/xml-declaration-1.html.sax index 83fe8eb6..c1ce23ee 100644 --- a/result/HTML/xml-declaration-1.html.sax +++ b/result/HTML/xml-declaration-1.html.sax @@ -1,6 +1,6 @@ SAX.setDocumentLocator() SAX.startDocument() -SAX.processingInstruction(xml, encoding="UTF-8") +SAX.comment(?xml encoding="UTF-8") SAX.startElement(html) SAX.startElement(body) SAX.startElement(p)