1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-28 22:50:07 +03:00

html: Make implied <p> tags more deterministic

libxml2's HTML parser adds <p> start tags in some situations. This
behavior, which doesn't follow any standard, was added in 2000, see
here: http://veillard.com/XML/messages/0655.html

Text nodes that only contain whitespace don't imply a <p> tag, but the
whitespace check cannot work reliably when we're parsing partial text data,
which can happen with both the pull and the push parser.

The logic in `areBlanks` is hard to follow. The checks involving `CUR`
depend on the position of the input pointer and seem dubious. It's also
possible that the behavior changed inadvertently with a later commit.
As a result, it's hard to come up with good test cases.

We now process leading whitespace before creating implied tags. This is
more in line with HTML5 and should avoid at least some issues with
partial text data.

For example, parsing the string "<head>   x" used to result in:

<html>
<head></head>
<body><p>   x</p></body>
</html>

And now results in:

<html>
<head>   </head>
<body><p>x</p></body>
</html>

Except for the implied <p> tag, this matches HTML5.
This commit is contained in:
Nick Wellnhofer 2025-02-13 14:04:10 +01:00
parent ebbc31cc6b
commit 71122421a1
4 changed files with 56 additions and 7 deletions

View File

@ -2965,16 +2965,44 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
if ((mode == 0) || (mode == DATA_RCDATA) ||
(ctxt->sax->cdataBlock == NULL)) {
int blank = areBlanks(ctxt, buf, size);
if ((ctxt->name == NULL) ||
(xmlStrEqual(ctxt->name, BAD_CAST "html")) ||
(xmlStrEqual(ctxt->name, BAD_CAST "head"))) {
int i;
if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) {
/*
* Add leading whitespace to html or head elements before
* calling htmlCheckParagraph.
*/
for (i = 0; i < size; i++)
if (!IS_WS_HTML(buf[i]))
break;
if (i > 0) {
if (!ctxt->keepBlanks) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, i);
}
buf += i;
size -= i;
}
if (size <= 0)
return;
htmlCheckParagraph(ctxt);
}
if ((mode == 0) &&
(!ctxt->keepBlanks) &&
(areBlanks(ctxt, buf, size))) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, size);
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size);
} else {
if ((mode == 0) && (blank < 0))
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, size);
}

View File

@ -0,0 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head> </head>
<body><p>x
</p></body>
</html>

View File

@ -0,0 +1,14 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
SAX.startElement(head)
SAX.characters( , 3)
SAX.endElement(head)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(x
, 2)
SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)
SAX.endDocument()

1
test/HTML/implied1.html Normal file
View File

@ -0,0 +1 @@
<head> x