1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-13 12:59:29 +03:00

Work around lxml API abuse

Make xmlNodeDumpOutput and htmlNodeDumpFormatOutput work with corrupted
parent pointers. This used to work with the old recursive code, but the
non-recursive rewrite required parent pointers to be set correctly.

Unfortunately, lxml relies on the old behavior and passes subtrees with
a corrupted structure. Fall back to a recursive function call if an
invalid parent pointer is detected.

Fixes #255.
This commit is contained in:
Nick Wellnhofer
2021-05-18 20:08:28 +02:00
parent a7b9f3ebdf
commit 85b1792e37
2 changed files with 49 additions and 28 deletions

View File

@ -744,7 +744,7 @@ void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
int format) { int format) {
xmlNodePtr root; xmlNodePtr root, parent;
xmlAttrPtr attr; xmlAttrPtr attr;
const htmlElemDesc * info; const htmlElemDesc * info;
@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
} }
root = cur; root = cur;
parent = cur->parent;
while (1) { while (1) {
switch (cur->type) { switch (cur->type) {
case XML_HTML_DOCUMENT_NODE: case XML_HTML_DOCUMENT_NODE:
@ -762,13 +763,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
if (((xmlDocPtr) cur)->intSubset != NULL) { if (((xmlDocPtr) cur)->intSubset != NULL) {
htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
} }
if (cur->children != NULL) { /* Always validate cur->parent when descending. */
if ((cur->parent == parent) && (cur->children != NULL)) {
parent = cur;
cur = cur->children; cur = cur->children;
continue; continue;
} }
break; break;
case XML_ELEMENT_NODE: case XML_ELEMENT_NODE:
/*
* Some users like lxml are known to pass nodes with a corrupted
* tree structure. Fall back to a recursive call to handle this
* case.
*/
if ((cur->parent != parent) && (cur->children != NULL)) {
htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
break;
}
/* /*
* Get specific HTML info for that node. * Get specific HTML info for that node.
*/ */
@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
(cur->name != NULL) && (cur->name != NULL) &&
(cur->name[0] != 'p')) /* p, pre, param */ (cur->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWriteString(buf, "\n"); xmlOutputBufferWriteString(buf, "\n");
parent = cur;
cur = cur->children; cur = cur->children;
continue; continue;
} }
@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
(info != NULL) && (!info->isinline)) { (info != NULL) && (!info->isinline)) {
if ((cur->next->type != HTML_TEXT_NODE) && if ((cur->next->type != HTML_TEXT_NODE) &&
(cur->next->type != HTML_ENTITY_REF_NODE) && (cur->next->type != HTML_ENTITY_REF_NODE) &&
(cur->parent != NULL) && (parent != NULL) &&
(cur->parent->name != NULL) && (parent->name != NULL) &&
(cur->parent->name[0] != 'p')) /* p, pre, param */ (parent->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWriteString(buf, "\n"); xmlOutputBufferWriteString(buf, "\n");
} }
@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
break; break;
if (((cur->name == (const xmlChar *)xmlStringText) || if (((cur->name == (const xmlChar *)xmlStringText) ||
(cur->name != (const xmlChar *)xmlStringTextNoenc)) && (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
((cur->parent == NULL) || ((parent == NULL) ||
((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
(xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
xmlChar *buffer; xmlChar *buffer;
buffer = xmlEncodeEntitiesReentrant(doc, cur->content); buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
break; break;
} }
/* cur = parent;
* The parent should never be NULL here but we want to handle /* cur->parent was validated when descending. */
* corrupted documents gracefully. parent = cur->parent;
*/
if (cur->parent == NULL)
return;
cur = cur->parent;
if ((cur->type == XML_HTML_DOCUMENT_NODE) || if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
(cur->type == XML_DOCUMENT_NODE)) { (cur->type == XML_DOCUMENT_NODE)) {
@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
(cur->next != NULL)) { (cur->next != NULL)) {
if ((cur->next->type != HTML_TEXT_NODE) && if ((cur->next->type != HTML_TEXT_NODE) &&
(cur->next->type != HTML_ENTITY_REF_NODE) && (cur->next->type != HTML_ENTITY_REF_NODE) &&
(cur->parent != NULL) && (parent != NULL) &&
(cur->parent->name != NULL) && (parent->name != NULL) &&
(cur->parent->name[0] != 'p')) /* p, pre, param */ (parent->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWriteString(buf, "\n"); xmlOutputBufferWriteString(buf, "\n");
} }
} }

View File

@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
static void static void
xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
int format = ctxt->format; int format = ctxt->format;
xmlNodePtr tmp, root, unformattedNode = NULL; xmlNodePtr tmp, root, unformattedNode = NULL, parent;
xmlAttrPtr attr; xmlAttrPtr attr;
xmlChar *start, *end; xmlChar *start, *end;
xmlOutputBufferPtr buf; xmlOutputBufferPtr buf;
@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
buf = ctxt->buf; buf = ctxt->buf;
root = cur; root = cur;
parent = cur->parent;
while (1) { while (1) {
switch (cur->type) { switch (cur->type) {
case XML_DOCUMENT_NODE: case XML_DOCUMENT_NODE:
@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
break; break;
case XML_DOCUMENT_FRAG_NODE: case XML_DOCUMENT_FRAG_NODE:
if (cur->children != NULL) { /* Always validate cur->parent when descending. */
if ((cur->parent == parent) && (cur->children != NULL)) {
parent = cur;
cur = cur->children; cur = cur->children;
continue; continue;
} }
@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
break; break;
case XML_ELEMENT_NODE: case XML_ELEMENT_NODE:
if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput)) /*
* Some users like lxml are known to pass nodes with a corrupted
* tree structure. Fall back to a recursive call to handle this
* case.
*/
if ((cur->parent != parent) && (cur->children != NULL)) {
xmlNodeDumpOutputInternal(ctxt, cur);
break;
}
if ((ctxt->level > 0) && (ctxt->format == 1) &&
(xmlIndentTreeOutput))
xmlOutputBufferWrite(buf, ctxt->indent_size * xmlOutputBufferWrite(buf, ctxt->indent_size *
(ctxt->level > ctxt->indent_nr ? (ctxt->level > ctxt->indent_nr ?
ctxt->indent_nr : ctxt->level), ctxt->indent_nr : ctxt->level),
@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
xmlOutputBufferWrite(buf, 1, ">"); xmlOutputBufferWrite(buf, 1, ">");
if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n"); if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n");
if (ctxt->level >= 0) ctxt->level++; if (ctxt->level >= 0) ctxt->level++;
parent = cur;
cur = cur->children; cur = cur->children;
continue; continue;
} }
@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
break; break;
} }
/* cur = parent;
* The parent should never be NULL here but we want to handle /* cur->parent was validated when descending. */
* corrupted documents gracefully. parent = cur->parent;
*/
if (cur->parent == NULL)
return;
cur = cur->parent;
if (cur->type == XML_ELEMENT_NODE) { if (cur->type == XML_ELEMENT_NODE) {
if (ctxt->level > 0) ctxt->level--; if (ctxt->level > 0) ctxt->level--;