1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

html: Parse bogus comments instead of ignoring them

Also treat XML processing instructions as bogus comments.
This commit is contained in:
Nick Wellnhofer 2024-09-07 15:18:13 +02:00
parent 8444017578
commit 3adb396d87
23 changed files with 113 additions and 275 deletions

View File

@ -40,8 +40,6 @@
static int htmlOmittedDefaultValue = 1;
static void htmlParseComment(htmlParserCtxtPtr ctxt);
static int
htmlParseElementInternal(htmlParserCtxtPtr ctxt);
@ -2545,23 +2543,6 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
/**
 * htmlSkipBogusComment:
 * @ctxt: an HTML parser context
 *
 * Report an "Incorrectly opened comment" error and then consume input up
 * to and including the next '>'. The skipped characters are discarded:
 * this function does not buffer them and does not invoke any SAX callback.
 */
static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
int c;
htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
"Incorrectly opened comment\n", NULL, NULL);
/* Give up as soon as the parser reports that it has been stopped. */
while (PARSER_STOPPED(ctxt) == 0) {
c = CUR;
/* A zero current character (presumably end of input) ends the scan. */
if (c == 0)
break;
NEXT;
/* c was read before NEXT, so the closing '>' is already consumed. */
if (c == '>')
break;
}
}
/**
* htmlParseHTMLName:
* @ctxt: an HTML parser context
@ -3368,147 +3349,27 @@ htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
return(URI);
}
/**
 * htmlParsePI:
 * @ctxt: an HTML parser context
 *
 * Parse an XML Processing Instruction. HTML5 doesn't allow processing
 * instructions, so this will be removed at some point.
 *
 * Does nothing unless the input is positioned at "<?". The target name is
 * read with htmlParseName() and the remaining text up to the next '>' is
 * collected as the PI data. Note that the scan stops at '>' alone, not at
 * "?>", so a trailing '?' ends up in the data. On success the target and
 * data are handed to the SAX processingInstruction callback, if one is set
 * and SAX is not disabled.
 */
static void
htmlParsePI(htmlParserCtxtPtr ctxt) {
xmlChar *buf = NULL;
int len = 0;
int size = HTML_PARSER_BUFFER_SIZE;
int cur, l;
/* Upper bound on the PI data length; raised when XML_PARSE_HUGE is set. */
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
XML_MAX_HUGE_LENGTH :
XML_MAX_TEXT_LENGTH;
const xmlChar *target;
xmlParserInputState state;
if ((RAW == '<') && (NXT(1) == '?')) {
/* Remember the caller's state; it is restored at the 'done' label. */
state = ctxt->instate;
ctxt->instate = XML_PARSER_PI;
/*
* this is a Processing Instruction.
*/
SKIP(2);
/*
* Parse the target name and check for special support like
* namespace.
*/
target = htmlParseName(ctxt);
if (target != NULL) {
/* "<?target>": a PI with no data at all. */
if (RAW == '>') {
SKIP(1);
/*
* SAX: PI detected.
*/
if ((ctxt->sax) && (!ctxt->disableSAX) &&
(ctxt->sax->processingInstruction != NULL))
ctxt->sax->processingInstruction(ctxt->userData,
target, NULL);
goto done;
}
buf = xmlMalloc(size);
if (buf == NULL) {
htmlErrMemory(ctxt);
/*
* NOTE(review): this return bypasses 'done', so the saved
* instate is not restored here - confirm that htmlErrMemory
* makes this irrelevant.
*/
return;
}
cur = CUR;
/* A missing blank after the target is reported but tolerated. */
if (!IS_BLANK(cur)) {
htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
"ParsePI: PI %s space expected\n", target, NULL);
}
SKIP_BLANKS;
cur = CUR_CHAR(l);
/* Collect the data until '>' or a zero character is reached. */
while ((cur != 0) && (cur != '>')) {
/*
* Grow the buffer by doubling before it can fill up. The margin
* of 5 presumably leaves room for one multi-byte character plus
* the terminator - TODO confirm against COPY_BUF.
*/
if (len + 5 >= size) {
xmlChar *tmp;
size *= 2;
tmp = (xmlChar *) xmlRealloc(buf, size);
if (tmp == NULL) {
htmlErrMemory(ctxt);
xmlFree(buf);
/* NOTE(review): same as above, 'done' is bypassed here. */
return;
}
buf = tmp;
}
/* Characters failing IS_CHAR are reported and left out of the data. */
if (IS_CHAR(cur)) {
COPY_BUF(buf,len,cur);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in processing instruction "
"0x%X\n", cur);
}
/* Over-long data aborts the PI without invoking the SAX callback. */
if (len > maxLength) {
htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
"PI %s too long", target, NULL);
xmlFree(buf);
goto done;
}
NEXTL(l);
cur = CUR_CHAR(l);
}
buf[len] = 0;
/* The loop only exits on '>' or zero, so this branch means cur == 0. */
if (cur != '>') {
htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
"ParsePI: PI %s never end ...\n", target, NULL);
} else {
SKIP(1);
/*
* SAX: PI detected.
*/
if ((ctxt->sax) && (!ctxt->disableSAX) &&
(ctxt->sax->processingInstruction != NULL))
ctxt->sax->processingInstruction(ctxt->userData,
target, buf);
}
xmlFree(buf);
} else {
/* No valid target name follows "<?". */
htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
"PI is not started correctly", NULL, NULL);
}
done:
ctxt->instate = state;
}
}
/**
* htmlParseComment:
* @ctxt: an HTML parser context
* @bogus: true if this is a bogus comment
*
* Parse an HTML comment
*/
static void
htmlParseComment(htmlParserCtxtPtr ctxt) {
htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
xmlChar *buf = NULL;
int len;
int size = HTML_PARSER_BUFFER_SIZE;
int q, ql;
int r, rl;
int cur, l;
int next, nl;
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
XML_MAX_HUGE_LENGTH :
XML_MAX_TEXT_LENGTH;
xmlParserInputState state;
/*
* Check that there is a comment right here.
*/
if ((RAW != '<') || (NXT(1) != '!') ||
(NXT(2) != '-') || (NXT(3) != '-')) return;
state = ctxt->instate;
ctxt->instate = XML_PARSER_COMMENT;
SKIP(4);
buf = xmlMalloc(size);
if (buf == NULL) {
htmlErrMemory(ctxt);
@ -3516,36 +3377,34 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
}
len = 0;
buf[len] = 0;
q = CUR_CHAR(ql);
if (q == 0)
goto unfinished;
if (q == '>') {
htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
cur = '>';
goto finished;
}
NEXTL(ql);
r = CUR_CHAR(rl);
if (r == 0)
goto unfinished;
if (q == '-' && r == '>') {
htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
cur = '>';
goto finished;
}
NEXTL(rl);
cur = CUR_CHAR(l);
while ((cur != 0) &&
((cur != '>') ||
(r != '-') || (q != '-'))) {
NEXTL(l);
next = CUR_CHAR(nl);
if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
"Comment incorrectly closed by '--!>'", NULL, NULL);
cur = '>';
break;
cur = CUR_CHAR(l);
if (!bogus) {
if (cur == '>') {
SKIP(1);
goto done;
} else if ((cur == '-') && (NXT(1) == '>')) {
SKIP(2);
goto done;
}
}
while (cur != 0) {
if (bogus) {
if (cur == '>') {
SKIP(1);
break;
}
} else {
if ((cur == '-') && (NXT(1) == '-')) {
if (NXT(2) == '>') {
SKIP(3);
break;
} else if ((NXT(2) == '!') && (NXT(3) == '>')) {
SKIP(4);
break;
}
}
}
if (len + 5 >= size) {
@ -3556,15 +3415,16 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
if (tmp == NULL) {
xmlFree(buf);
htmlErrMemory(ctxt);
ctxt->instate = state;
return;
}
buf = tmp;
}
if (IS_CHAR(q)) {
COPY_BUF(buf,len,q);
if (IS_CHAR(cur)) {
COPY_BUF(buf,len,cur);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in comment 0x%X\n", q);
"Invalid char in comment 0x%X\n", cur);
}
if (len > maxLength) {
htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
@ -3574,29 +3434,19 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
return;
}
q = r;
ql = rl;
r = cur;
rl = l;
cur = next;
l = nl;
}
finished:
buf[len] = 0;
if (cur == '>') {
SKIP(1);
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
(!ctxt->disableSAX))
ctxt->sax->comment(ctxt->userData, buf);
xmlFree(buf);
ctxt->instate = state;
return;
NEXTL(l);
cur = CUR_CHAR(l);
}
unfinished:
htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
"Comment not terminated \n<!--%.50s\n", buf, NULL);
done:
buf[len] = 0;
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
(!ctxt->disableSAX))
ctxt->sax->comment(ctxt->userData, buf);
xmlFree(buf);
ctxt->instate = state;
return;
}
/**
@ -4294,12 +4144,15 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
BAD_CAST "DOCTYPE" , NULL);
htmlParseDocTypeDecl(ctxt);
} else if ((NXT(2) == '-') && (NXT(3) == '-')) {
htmlParseComment(ctxt);
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
} else {
htmlSkipBogusComment(ctxt);
SKIP(2);
htmlParseComment(ctxt, /* bogus */ 1);
}
} else if (NXT(1) == '?') {
htmlParsePI(ctxt);
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
} else if (IS_ASCII_LETTER(NXT(1))) {
htmlParseElementInternal(ctxt);
} else {
@ -4551,15 +4404,19 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
/*
* Parse possible comments and PIs before any content
*/
while (((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) ||
((CUR == '<') && (NXT(1) == '?'))) {
htmlParseComment(ctxt);
htmlParsePI(ctxt);
while (CUR == '<') {
if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
} else if (NXT(1) == '?') {
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
} else {
break;
}
SKIP_BLANKS;
}
/*
* Then possibly doc type declaration(s) and more Misc
* (doctypedecl Misc*)?
@ -4576,12 +4433,16 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
/*
* Parse possible comments and PIs before any content
*/
while ((PARSER_STOPPED(ctxt) == 0) &&
(((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) ||
((CUR == '<') && (NXT(1) == '?')))) {
htmlParseComment(ctxt);
htmlParsePI(ctxt);
while (CUR == '<') {
if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
} else if (NXT(1) == '?') {
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
} else {
break;
}
SKIP_BLANKS;
}
@ -5200,13 +5061,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
htmlParseComment(ctxt);
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
htmlParsePI(ctxt);
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '!') &&
(UPP(2) == 'D') && (UPP(3) == 'O') &&
@ -5236,13 +5099,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
htmlParseComment(ctxt);
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
htmlParsePI(ctxt);
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
@ -5267,13 +5132,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
htmlParseComment(ctxt);
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
htmlParsePI(ctxt);
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
@ -5489,19 +5356,23 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if ((!terminate) &&
(htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
htmlParseComment(ctxt);
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_CONTENT;
} else {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
htmlSkipBogusComment(ctxt);
SKIP(2);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_CONTENT;
}
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
htmlParsePI(ctxt);
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '/')) {
ctxt->instate = XML_PARSER_END_TAG;

View File

@ -1,2 +1,2 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><p>&#147;</p></body></html>
<!--?a&#147;-->

View File

@ -1,3 +0,0 @@
./test/HTML/758518-tag.html:1: HTML parser error : PI is not started correctly
“
^

View File

@ -1,10 +1,4 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.error: PI is not started correctlySAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&#147;, 2)
SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)
SAX.comment(?a“)
SAX.endDocument()

View File

@ -1,2 +1,3 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!--<!doctype
-->

View File

@ -1,7 +1,3 @@
./test/HTML/758606.html:1: HTML parser error : Invalid char in comment 0xC
<!-- <!doctype
^
./test/HTML/758606.html:2: HTML parser error : Comment not terminated
<!--<!doctyp
^
^

View File

@ -1,6 +1,6 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.error: Invalid char in comment 0xC
SAX.error: Comment not terminated
<!--<!doctyp
SAX.comment(<!doctype
)
SAX.endDocument()

View File

@ -1,2 +1,3 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!--&#145;<!dOctYPE
-->

View File

@ -1,7 +1,3 @@
./test/HTML/758606_2.html:1: HTML parser error : Invalid char in comment 0xC
‘<!dOctYPE
^
./test/HTML/758606_2.html:2: HTML parser error : Comment not terminated
<!--‘<!dOctYP
^
<!--
^

View File

@ -1,6 +1,6 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.error: Invalid char in comment 0xC
SAX.error: Comment not terminated
<!--‘<!dOctYP
SAX.comment(‘<!dOctYPE
)
SAX.endDocument()

View File

@ -1,3 +0,0 @@
./test/HTML/comments.html:10: HTML parser error : Comment incorrectly closed by '--!>'
<!--incorrectly closed comment--!><span id=under-test>whatwg guidance is
^

View File

@ -24,7 +24,7 @@ SAX.characters(
SAX.startElement(div)
SAX.characters(
, 9)
SAX.error: Comment incorrectly closed by '--!>'SAX.comment(incorrectly closed comment)
SAX.comment(incorrectly closed comment)
SAX.startElement(span, id='under-test')
SAX.characters(whatwg guidance is that this s, 49)
SAX.endElement(span)

View File

@ -1,3 +0,0 @@
./test/HTML/comments2.html:10: HTML parser error : Comment incorrectly closed by '--!>'
<!--incorrectly closed comment--!><span id=under-test>whatwg guidance is
^

View File

@ -24,7 +24,7 @@ SAX.characters(
SAX.startElement(div)
SAX.characters(
, 9)
SAX.error: Comment incorrectly closed by '--!>'SAX.comment(incorrectly closed comment)
SAX.comment(incorrectly closed comment)
SAX.startElement(span, id='under-test')
SAX.characters(whatwg guidance is that this s, 49)
SAX.endElement(span)

View File

@ -1,6 +0,0 @@
./test/HTML/comments3.html:10: HTML parser error : Comment abruptly ended
<!-->the previous node should be an empty comment, and this should be a
^
./test/HTML/comments3.html:13: HTML parser error : Comment abruptly ended
<!--->the previous node should be an empty comment, and this should be a
^

View File

@ -24,7 +24,7 @@ SAX.characters(
SAX.startElement(div)
SAX.characters(
, 9)
SAX.error: Comment abruptly endedSAX.comment()
SAX.comment()
SAX.characters(the previous node should be an, 86)
SAX.endElement(div)
SAX.characters(
@ -32,7 +32,7 @@ SAX.characters(
SAX.startElement(div)
SAX.characters(
, 9)
SAX.error: Comment abruptly endedSAX.comment()
SAX.comment()
SAX.characters(the previous node should be an, 86)
SAX.endElement(div)
SAX.characters(

View File

@ -1,6 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<body>
...
<!--[if !supportLists]-->...<!--[endif]-->
</body>
</html>

View File

@ -1,6 +0,0 @@
./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
<![if !supportLists]>...<![endif]>
^
./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
<![if !supportLists]>...<![endif]>
^

View File

@ -6,9 +6,9 @@ SAX.characters(
SAX.startElement(body)
SAX.characters(
, 5)
SAX.error: Incorrectly opened comment
SAX.comment([if !supportLists])
SAX.characters(..., 3)
SAX.error: Incorrectly opened comment
SAX.comment([endif])
SAX.characters(
, 3)
SAX.endElement(body)

View File

@ -1,5 +1,5 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<?xml-stylesheet href="./css/ht2html.css" type="text/css"?><html>
<!--?xml-stylesheet href="./css/ht2html.css" type="text/css"?--><html>
<!-- THIS PAGE IS AUTOMATICALLY GENERATED. DO NOT EDIT. -->
<head>
<title>Python Programming Language</title>

View File

@ -1,7 +1,7 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.internalSubset(html, -//W3C//DTD HTML 4.01 Transitional//EN, http://www.w3.org/TR/html4/loose.dtd)
SAX.processingInstruction(xml-stylesheet, href="./css/ht2html.css" type="text/css"?)
SAX.comment(?xml-stylesheet href="./css/ht2html.css" type="text/css"?)
SAX.startElement(html)
SAX.characters(
, 1)

View File

@ -1,4 +1,4 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<?xml encoding="UTF-8"><html><body>
<!--?xml encoding="UTF-8"--><html><body>
<p>&ouml;&auml;&uuml;&szlig;</p>
</body></html>

View File

@ -1,6 +1,6 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.processingInstruction(xml, encoding="UTF-8")
SAX.comment(?xml encoding="UTF-8")
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)