1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-13 20:58:16 +03:00

Fix several quadratic runtime issues in HTML push parser

Fix a few remaining cases where the HTML push parser would scan more
content during lookahead than was actually parsed later.

Make sure that htmlParseDocTypeDecl consumes all content up to the
final '>' in case of errors. The old comment said "We shouldn't try to
resynchronize", but ignoring invalid content is also what the HTML5
spec mandates.

Likewise, make htmlParseEndTag skip to the final '>' in invalid end
tags even if not in recovery mode. This is probably the most visible
change in practice and leads to different output for some tests but is
also more in line with HTML5.

Make sure that htmlParsePI and htmlParseComment don't abort if invalid
characters are encountered but log an error and ignore the character.

Change some other end-of-buffer checks to test for a zero byte instead
of relying on IS_CHAR.

Fix usage of IS_CHAR macro in htmlParseScript.
This commit is contained in:
Nick Wellnhofer 2020-07-23 17:34:08 +02:00
parent 10d0947249
commit 93ce33c2b8
14 changed files with 122 additions and 155 deletions

View File

@ -2802,47 +2802,39 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
size_t len = 0, startPosition = 0;
int err = 0;
int quote;
xmlChar *ret = NULL;
if (CUR == '"') {
NEXT;
if (CUR_PTR < BASE_PTR)
return(ret);
startPosition = CUR_PTR - BASE_PTR;
while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
NEXT;
len++;
}
if (!IS_CHAR_CH(CUR)) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished SystemLiteral\n", NULL, NULL);
} else {
ret = xmlStrndup((BASE_PTR+startPosition), len);
NEXT;
}
} else if (CUR == '\'') {
NEXT;
if (CUR_PTR < BASE_PTR)
return(ret);
startPosition = CUR_PTR - BASE_PTR;
while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
NEXT;
len++;
}
if (!IS_CHAR_CH(CUR)) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished SystemLiteral\n", NULL, NULL);
} else {
ret = xmlStrndup((BASE_PTR+startPosition), len);
NEXT;
}
} else {
if ((CUR != '"') && (CUR != '\'')) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
" or ' expected\n", NULL, NULL);
"SystemLiteral \" or ' expected\n", NULL, NULL);
return(NULL);
}
quote = CUR;
NEXT;
if (CUR_PTR < BASE_PTR)
return(ret);
startPosition = CUR_PTR - BASE_PTR;
while ((CUR != 0) && (CUR != quote)) {
/* TODO: Handle UTF-8 */
if (!IS_CHAR_CH(CUR)) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in SystemLiteral 0x%X\n", CUR);
err = 1;
}
NEXT;
len++;
}
if (CUR != quote) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished SystemLiteral\n", NULL, NULL);
} else {
NEXT;
if (err == 0)
ret = xmlStrndup((BASE_PTR+startPosition), len);
}
return(ret);
@ -2862,51 +2854,42 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
size_t len = 0, startPosition = 0;
int err = 0;
int quote;
xmlChar *ret = NULL;
if ((CUR != '"') && (CUR != '\'')) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
"PubidLiteral \" or ' expected\n", NULL, NULL);
return(NULL);
}
quote = CUR;
NEXT;
/*
* Name ::= (Letter | '_') (NameChar)*
*/
if (CUR == '"') {
NEXT;
if (CUR_PTR < BASE_PTR)
return(ret);
startPosition = CUR_PTR - BASE_PTR;
if (CUR_PTR < BASE_PTR)
return(ret);
startPosition = CUR_PTR - BASE_PTR;
while (IS_PUBIDCHAR_CH(CUR)) {
len++;
NEXT;
while ((CUR != 0) && (CUR != quote)) {
if (!IS_PUBIDCHAR_CH(CUR)) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in PubidLiteral 0x%X\n", CUR);
err = 1;
}
if (CUR != '"') {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished PubidLiteral\n", NULL, NULL);
} else {
ret = xmlStrndup((BASE_PTR + startPosition), len);
NEXT;
}
} else if (CUR == '\'') {
len++;
NEXT;
}
if (CUR_PTR < BASE_PTR)
return(ret);
startPosition = CUR_PTR - BASE_PTR;
while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
len++;
NEXT;
}
if (CUR != '\'') {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished PubidLiteral\n", NULL, NULL);
} else {
ret = xmlStrndup((BASE_PTR + startPosition), len);
NEXT;
}
if (CUR != quote) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
"Unfinished PubidLiteral\n", NULL, NULL);
} else {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
"PubidLiteral \" or ' expected\n", NULL, NULL);
NEXT;
if (err == 0)
ret = xmlStrndup((BASE_PTR + startPosition), len);
}
return(ret);
@ -2972,7 +2955,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
}
}
}
if (IS_CHAR_CH(cur)) {
if (IS_CHAR(cur)) {
COPY_BUF(l,buf,nbchar,cur);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
@ -3242,7 +3225,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {
}
SKIP_BLANKS;
cur = CUR_CHAR(l);
while (IS_CHAR(cur) && (cur != '>')) {
while ((cur != 0) && (cur != '>')) {
if (len + 5 >= size) {
xmlChar *tmp;
@ -3261,7 +3244,13 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {
GROW;
count = 0;
}
COPY_BUF(l,buf,len,cur);
if (IS_CHAR(cur)) {
COPY_BUF(l,buf,len,cur);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in processing instruction "
"0x%X\n", cur);
}
NEXTL(l);
cur = CUR_CHAR(l);
if (cur == 0) {
@ -3331,15 +3320,15 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
len = 0;
buf[len] = 0;
q = CUR_CHAR(ql);
if (!IS_CHAR(q))
if (q == 0)
goto unfinished;
NEXTL(ql);
r = CUR_CHAR(rl);
if (!IS_CHAR(r))
if (r == 0)
goto unfinished;
NEXTL(rl);
cur = CUR_CHAR(l);
while (IS_CHAR(cur) &&
while ((cur != 0) &&
((cur != '>') ||
(r != '-') || (q != '-'))) {
if (len + 5 >= size) {
@ -3355,7 +3344,12 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
}
buf = tmp;
}
COPY_BUF(ql,buf,len,q);
if (IS_CHAR(q)) {
COPY_BUF(ql,buf,len,q);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in comment 0x%X\n", q);
}
q = r;
ql = rl;
r = cur;
@ -3369,7 +3363,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) {
}
}
buf[len] = 0;
if (IS_CHAR(cur)) {
if (cur == '>') {
NEXT;
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
(!ctxt->disableSAX))
@ -3516,9 +3510,12 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
if (CUR != '>') {
htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
"DOCTYPE improperly terminated\n", NULL, NULL);
/* We shouldn't try to resynchronize ... */
/* Ignore bogus content */
while ((CUR != 0) && (CUR != '>'))
NEXT;
}
NEXT;
if (CUR == '>')
NEXT;
/*
* Create or update the document accordingly to the DOCTYPE
@ -3996,19 +3993,14 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
* We should definitely be at the ending "S? '>'" part
*/
SKIP_BLANKS;
if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
if (CUR != '>') {
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
"End tag : expected '>'\n", NULL, NULL);
if (ctxt->recovery) {
/*
* We're not at the ending > !!
* Error, unless in recover mode where we search forwards
* until we find a >
*/
while (CUR != '\0' && CUR != '>') NEXT;
NEXT;
}
} else
/* Skip to next '>' */
while ((CUR != 0) && (CUR != '>'))
NEXT;
}
if (CUR == '>')
NEXT;
/*
@ -4198,7 +4190,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
/* Dump the bogus tag like browsers do */
while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
while ((CUR != 0) && (CUR != '>'))
NEXT;
if (currentNode != NULL)
@ -4413,7 +4405,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
*/
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
while (IS_CHAR_CH(CUR)) {
while (CUR != 0) {
oldptr = ctxt->input->cur;
htmlParseContent(ctxt);
if (oldptr==ctxt->input->cur) break;
@ -4430,7 +4422,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info);
}
if (!IS_CHAR_CH(CUR)) {
if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
}
@ -4451,7 +4443,7 @@ htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
htmlNodeInfoPop(ctxt);
}
if (!IS_CHAR_CH(CUR)) {
if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
}
}
@ -4600,7 +4592,7 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
/* Dump the bogus tag like browsers do */
while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
while ((CUR != 0) && (CUR != '>'))
NEXT;
htmlParserFinishElementParsing(ctxt);

View File

@ -96,6 +96,10 @@ attr_style=" style=\"\""
comment="<!-- -->"
doctype="<!DOCTYPE d>"
doctype_system="<!DOCTYPE s SYSTEM \"u\">"
doctype_public="<!DOCTYPE p PUBLIC \"i\" \"u\">"
pi="<?a?>"
ref_lt="&lt;"

View File

@ -1,2 +1,2 @@
<!DOCTYPE >
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">

View File

@ -1,16 +1,7 @@
./test/HTML/758606.html:1: HTML parser error : Comment not terminated
<!--
./test/HTML/758606.html:1: HTML parser error : Invalid char in comment 0xC
<!-- <!doctype
^
./test/HTML/758606.html:1: HTML parser error : Invalid char in CDATA 0xC
<!-- <!doctype
^
./test/HTML/758606.html:1: HTML parser error : Misplaced DOCTYPE declaration
<!-- <!doctype
^
./test/HTML/758606.html:2: HTML parser error : htmlParseDocTypeDecl : no DOCTYPE name !
^
./test/HTML/758606.html:2: HTML parser error : DOCTYPE improperly terminated
^
./test/HTML/758606.html:2: HTML parser error : Comment not terminated
<!--<!doctyp
^

View File

@ -1,10 +1,6 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.error: Invalid char in comment 0xC
SAX.error: Comment not terminated
<!--
SAX.error: Invalid char in CDATA 0xC
SAX.error: Misplaced DOCTYPE declaration
SAX.error: htmlParseDocTypeDecl : no DOCTYPE name !
SAX.error: DOCTYPE improperly terminated
SAX.internalSubset((null), , )
<!--<!doctyp
SAX.endDocument()

View File

@ -1,2 +1,2 @@
<!DOCTYPE >
<html><body><p>&#145;</p></body></html>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">

View File

@ -1,16 +1,7 @@
./test/HTML/758606_2.html:1: HTML parser error : Comment not terminated
<!--
<!-- <!dOctYPE
^
./test/HTML/758606_2.html:1: HTML parser error : Invalid char in CDATA 0xC
<!-- <!dOctYPE
^
./test/HTML/758606_2.html:1: HTML parser error : Misplaced DOCTYPE declaration
./test/HTML/758606_2.html:1: HTML parser error : Invalid char in comment 0xC
‘<!dOctYPE
^
./test/HTML/758606_2.html:2: HTML parser error : htmlParseDocTypeDecl : no DOCTYPE name !
^
./test/HTML/758606_2.html:2: HTML parser error : DOCTYPE improperly terminated
./test/HTML/758606_2.html:2: HTML parser error : Comment not terminated
<!--‘<!dOctYP
^

View File

@ -1,17 +1,6 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.error: Invalid char in comment 0xC
SAX.error: Comment not terminated
<!--
SAX.error: Invalid char in CDATA 0xC
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&#145;, 2)
SAX.error: Misplaced DOCTYPE declaration
SAX.error: htmlParseDocTypeDecl : no DOCTYPE name !
SAX.error: DOCTYPE improperly terminated
SAX.internalSubset((null), , )
SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)
<!--‘<!dOctYP
SAX.endDocument()

View File

@ -521,7 +521,6 @@ eval("page" + id + " = window.open(URL, '" + id + "', 'toolbars=0, scrollbars=0,
document.write("ype=js&size=100x90&url=http://www.goto.com/");
document.write("d/search/ssn/&target=_blank&Partner=SSN8042");
document.write("DF8478957377>");
document.write("RIPT>");
} else {
document.write("<A TARGET=_blank ");
document.write("HREF=http://www.goto.com/d/search/ssn/?from");

View File

@ -46,9 +46,9 @@ om/ad_static.asp?pid=2097&sid=1881&asid=7708"></a></IFRAME></CENTER></LI></FONT>
./test/HTML/doc3.htm:803: HTML parser error : End tag : expected '>'
document.write("DF8478957377></SC");
^
./test/HTML/doc3.htm:803: HTML parser error : Unexpected end tag : sc
document.write("DF8478957377></SC");
^
./test/HTML/doc3.htm:804: HTML parser error : Unexpected end tag : sc
document.write("RIPT>");
^
./test/HTML/doc3.htm:811: HTML parser error : Unexpected end tag : a
document.write("ype=gif&size=100x90></A>");
^

View File

@ -2700,7 +2700,8 @@ SAX.cdata(");
SAX.error: End tag : expected '>'
SAX.error: Unexpected end tag : sc
SAX.cdata(");
document.write("RI, 361)
} else {
d, 328)
SAX.error: Unexpected end tag : a
SAX.cdata(");
}

View File

@ -462,13 +462,13 @@ or <a href="/news/pointcast/0,1366,,00.html">PointCast</a></font><br>
<font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/school/0,1383,,00.html">Making the Grade</a></b></font><br><font size="2" face="Arial, Helvetica, sans-serif"><font size="1" face="Arial, Geneva, sans-serif" color="#000000">Reading, writing, and ROM. <br><i>Sponsored by <a href="http://r.hotwired.com/r/wn_sch_r_nav_uop/http://ads25.focalink.com/SmartBanner/page?12630.53" style="text-decoration:none"><font color="#000000">U of Phoenix</font></a></i></font><br><br>
<font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/infostructure/0,1377,,00.html">Infostructure</a></b></font><br><font size="1" face="Arial, Helvetica, sans-serif" color="#000000">An IS/IT resource <br><i>Sponsored by <a href="http://r.wired.com/r/wn_is_r_ssec/http://ad.doubleclick.net/clk;653163;3599571;s?http://www.sprintbiz.com/s%0Aervlet/appservlet?from=/wired/sprint/&amp;template=/security/security.html&amp;SITE=%0Awired.com&amp;BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</font></a></i></font></font><br><br>
<font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/infostructure/0,1377,,00.html">Infostructure</a></b></font><br><font size="1" face="Arial, Helvetica, sans-serif" color="#000000">An IS/IT resource <br><i>Sponsored by <a href="http://r.wired.com/r/wn_is_r_ssec/http://ad.doubleclick.net/clk;653163;3599571;s?http://www.sprintbiz.com/s%0Aervlet/appservlet?from=/wired/sprint/&amp;template=/security/security.html&amp;SITE=%0Awired.com&amp;BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</font></a></i></font><br><br>
<font size="2" face="Arial,Helvetica, sans-serif"><b><a href="/news/y2k/0,1360,,00.html">Y2K Watch</a></b></font><br><font size="2" face="Arial, Helvetica, sans-serif"><font size="1" face="Arial, Geneva, sans-serif" color="#000000">Tick... Tick... Tick...</font><br><br>
<font face="Arial, Helvetica, sans-serif" size="2"><b><i><a href="/news/special_reports/1,1293,,00.html">More Hoo-Ha</a></i></b></font><br>&nbsp;<br>
</font></font></font></font></font></font></font></font>
</font></font></font></font></font></font></font></font></font>
</td>
</tr>
<!-- start of Gen News -->

View File

@ -242,6 +242,9 @@ com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</a>
</td>
^
./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font
</td>
^
./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font
</td>
^
./test/HTML/wired.html:432: HTML parser error : htmlParseEntityRef: expecting ';'

View File

@ -1962,7 +1962,6 @@ SAX.endElement(a)
SAX.endElement(i)
SAX.error: End tag : expected '>'
SAX.endElement(font)
SAX.endElement(font)
SAX.startElement(br)
SAX.endElement(br)
SAX.startElement(br)
@ -2023,6 +2022,8 @@ SAX.error: Opening and ending tag mismatch: td and font
SAX.endElement(font)
SAX.error: Opening and ending tag mismatch: td and font
SAX.endElement(font)
SAX.error: Opening and ending tag mismatch: td and font
SAX.endElement(font)
SAX.endElement(td)
SAX.characters(
, 1)