mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-03-28 22:50:07 +03:00
html: Make implied <p> tags more deterministic
libxml2's HTML parser adds <p> start tags in some situations. This behavior, which doesn't follow any standard, was added in 2000; see http://veillard.com/XML/messages/0655.html. Text nodes that only contain whitespace don't imply a <p> tag, but the whitespace check cannot work reliably if we're parsing partial text data, which can happen with both the pull and the push parser. The logic in `areBlanks` is hard to follow. The checks involving `CUR` depend on the position of the input pointer and seem dubious. It's also possible that the behavior changed inadvertently with a later commit. As a result, it's hard to come up with good test cases. We now process leading whitespace before creating implied tags. This is more in line with HTML5 and should avoid at least some issues with partial text data. For example, parsing the string "<head> x" used to result in: <html> <head></head> <body><p> x</p></body> </html> And now results in: <html> <head> </head> <body><p>x</p></body> </html> Except for the implied <p> tag, this matches HTML5.
This commit is contained in:
parent
ebbc31cc6b
commit
71122421a1
42
HTMLparser.c
42
HTMLparser.c
@ -2965,16 +2965,44 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
|
||||
|
||||
if ((mode == 0) || (mode == DATA_RCDATA) ||
|
||||
(ctxt->sax->cdataBlock == NULL)) {
|
||||
int blank = areBlanks(ctxt, buf, size);
|
||||
if ((ctxt->name == NULL) ||
|
||||
(xmlStrEqual(ctxt->name, BAD_CAST "html")) ||
|
||||
(xmlStrEqual(ctxt->name, BAD_CAST "head"))) {
|
||||
int i;
|
||||
|
||||
if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) {
|
||||
/*
|
||||
* Add leading whitespace to html or head elements before
|
||||
* calling htmlCheckParagraph.
|
||||
*/
|
||||
for (i = 0; i < size; i++)
|
||||
if (!IS_WS_HTML(buf[i]))
|
||||
break;
|
||||
|
||||
if (i > 0) {
|
||||
if (!ctxt->keepBlanks) {
|
||||
if (ctxt->sax->ignorableWhitespace != NULL)
|
||||
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i);
|
||||
} else {
|
||||
if (ctxt->sax->characters != NULL)
|
||||
ctxt->sax->characters(ctxt->userData, buf, i);
|
||||
}
|
||||
|
||||
buf += i;
|
||||
size -= i;
|
||||
}
|
||||
|
||||
if (size <= 0)
|
||||
return;
|
||||
|
||||
htmlCheckParagraph(ctxt);
|
||||
}
|
||||
|
||||
if ((mode == 0) &&
|
||||
(!ctxt->keepBlanks) &&
|
||||
(areBlanks(ctxt, buf, size))) {
|
||||
if (ctxt->sax->ignorableWhitespace != NULL)
|
||||
ctxt->sax->ignorableWhitespace(ctxt->userData,
|
||||
buf, size);
|
||||
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size);
|
||||
} else {
|
||||
if ((mode == 0) && (blank < 0))
|
||||
htmlCheckParagraph(ctxt);
|
||||
|
||||
if (ctxt->sax->characters != NULL)
|
||||
ctxt->sax->characters(ctxt->userData, buf, size);
|
||||
}
|
||||
|
6
result/HTML/implied1.html
Normal file
6
result/HTML/implied1.html
Normal file
@ -0,0 +1,6 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
<head> </head>
|
||||
<body><p>x
|
||||
</p></body>
|
||||
</html>
|
14
result/HTML/implied1.html.sax
Normal file
14
result/HTML/implied1.html.sax
Normal file
@ -0,0 +1,14 @@
|
||||
SAX.setDocumentLocator()
|
||||
SAX.startDocument()
|
||||
SAX.startElement(html)
|
||||
SAX.startElement(head)
|
||||
SAX.characters( , 3)
|
||||
SAX.endElement(head)
|
||||
SAX.startElement(body)
|
||||
SAX.startElement(p)
|
||||
SAX.characters(x
|
||||
, 2)
|
||||
SAX.endElement(p)
|
||||
SAX.endElement(body)
|
||||
SAX.endElement(html)
|
||||
SAX.endDocument()
|
1
test/HTML/implied1.html
Normal file
1
test/HTML/implied1.html
Normal file
@ -0,0 +1 @@
|
||||
<head> x
|
Loading…
x
Reference in New Issue
Block a user