1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-31 06:50:06 +03:00

parser: Improve handling of NOBLANKS option

Don't change the SAX handler.

Use a helper function to invoke "characters" SAX callback.

The old code didn't advance the input pointer consistently before
invoking the callback. There was also some inconsistency wrt to
ctxt->space handling. I don't understand the ctxt->space thing, but
now we always behave like the non-complex case before.
This commit is contained in:
Nick Wellnhofer 2025-01-31 16:21:20 +01:00
parent 7a8722f557
commit 1f5b5371cf
2 changed files with 39 additions and 97 deletions

2
SAX2.c
View File

@ -2733,7 +2733,7 @@ xmlSAXVersion(xmlSAXHandler *hdlr, int version)
hdlr->reference = xmlSAX2Reference;
hdlr->characters = xmlSAX2Characters;
hdlr->cdataBlock = xmlSAX2CDataBlock;
hdlr->ignorableWhitespace = xmlSAX2Characters;
hdlr->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
hdlr->processingInstruction = xmlSAX2ProcessingInstruction;
hdlr->comment = xmlSAX2Comment;
hdlr->warning = xmlParserWarning;

134
parser.c
View File

@ -2978,13 +2978,6 @@ static int areBlanks(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
int i;
xmlNodePtr lastChild;
/*
* Don't spend time trying to differentiate them, the same callback is
* used !
*/
if (ctxt->sax->ignorableWhitespace == ctxt->sax->characters)
return(0);
/*
* Check for xml:space value.
*/
@ -4865,6 +4858,34 @@ static const unsigned char test_char_data[256] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
static void
xmlCharacters(xmlParserCtxtPtr ctxt, const xmlChar *buf, int size) {
if ((ctxt->sax == NULL) || (ctxt->disableSAX))
return;
/*
* Calling areBlanks with only parts of a text node
* is fundamentally broken, making the NOBLANKS option
* essentially unusable.
*/
if ((!ctxt->keepBlanks) &&
(ctxt->sax->ignorableWhitespace != ctxt->sax->characters) &&
(areBlanks(ctxt, buf, size, 1))) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, size);
/*
* The old code used to update this value for "complex" data
* even if keepBlanks was true. This was probably a bug.
*/
if ((!ctxt->keepBlanks) && (*ctxt->space == -1))
*ctxt->space = -2;
}
}
/**
* xmlParseCharDataInternal:
* @ctxt: an XML parser context
@ -4910,32 +4931,7 @@ get_more_space:
const xmlChar *tmp = ctxt->input->cur;
ctxt->input->cur = in;
if ((ctxt->sax != NULL) &&
(ctxt->disableSAX == 0) &&
(ctxt->sax->ignorableWhitespace !=
ctxt->sax->characters)) {
/*
* Calling areBlanks with only parts of a text node
* is fundamentally broken, making the NOBLANKS option
* essentially unusable.
*/
if (areBlanks(ctxt, tmp, nbchar, 1)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
tmp, nbchar);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
tmp, nbchar);
if (*ctxt->space == -1)
*ctxt->space = -2;
}
} else if ((ctxt->sax != NULL) &&
(ctxt->disableSAX == 0) &&
(ctxt->sax->characters != NULL)) {
ctxt->sax->characters(ctxt->userData,
tmp, nbchar);
}
xmlCharacters(ctxt, tmp, nbchar);
}
return;
}
@ -4968,35 +4964,13 @@ get_more:
}
nbchar = in - ctxt->input->cur;
if (nbchar > 0) {
if ((ctxt->sax != NULL) &&
(ctxt->disableSAX == 0) &&
(ctxt->sax->ignorableWhitespace !=
ctxt->sax->characters) &&
(IS_BLANK_CH(*ctxt->input->cur))) {
const xmlChar *tmp = ctxt->input->cur;
ctxt->input->cur = in;
const xmlChar *tmp = ctxt->input->cur;
ctxt->input->cur = in;
if (areBlanks(ctxt, tmp, nbchar, 0)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
tmp, nbchar);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
tmp, nbchar);
if (*ctxt->space == -1)
*ctxt->space = -2;
}
line = ctxt->input->line;
col = ctxt->input->col;
} else if ((ctxt->sax != NULL) &&
(ctxt->disableSAX == 0)) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
ctxt->input->cur, nbchar);
line = ctxt->input->line;
col = ctxt->input->col;
}
xmlCharacters(ctxt, tmp, nbchar);
line = ctxt->input->line;
col = ctxt->input->col;
}
ctxt->input->cur = in;
if (*in == 0xD) {
@ -5060,23 +5034,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
if (nbchar >= XML_PARSER_BIG_BUFFER_SIZE) {
buf[nbchar] = 0;
/*
* OK the segment is to be consumed as chars.
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar, 0)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, nbchar);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
if ((ctxt->sax->characters !=
ctxt->sax->ignorableWhitespace) &&
(*ctxt->space == -1))
*ctxt->space = -2;
}
}
xmlCharacters(ctxt, buf, nbchar);
nbchar = 0;
SHRINK;
}
@ -5084,21 +5042,8 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
}
if (nbchar != 0) {
buf[nbchar] = 0;
/*
* OK the segment is to be consumed as chars.
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar, 0)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
if ((ctxt->sax->characters != ctxt->sax->ignorableWhitespace) &&
(*ctxt->space == -1))
*ctxt->space = -2;
}
}
xmlCharacters(ctxt, buf, nbchar);
}
/*
* cur == 0 can mean
@ -13633,9 +13578,6 @@ xmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
/*
* Changing SAX callbacks is a bad idea. This should be fixed.
*/
if (options & XML_PARSE_NOBLANKS) {
ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
}
if (options & XML_PARSE_NOCDATA) {
ctxt->sax->cdataBlock = NULL;
}