1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

parser: Align push parsing of CDATA sections with pull parser

Remove special handling of CDATA sections in push parser. This makes
sure that only a single callback is generated for large sections.

Fixes #22 and needed for #412.
This commit is contained in:
Nick Wellnhofer 2024-08-28 23:58:20 +02:00
parent 4d10e53af1
commit dde62ae5d5

164
parser.c
View File

@ -11013,74 +11013,6 @@ xmlParseLookupInternalSubset(xmlParserCtxtPtr ctxt) {
return(0);
}
/**
 * xmlCheckCdataPush:
 * @utf: pointer to the block of characters
 * @len: length of the block in bytes
 * @complete: 1 if complete CDATA block is passed in, 0 if partial block
 *
 * Check that the block of characters is okay as CData content [20]:
 * the bytes must form well-formed UTF-8 and every decoded code point
 * must satisfy xmlIsCharQ. Overlong encodings (a code point stored in
 * more bytes than necessary) are rejected.
 *
 * If @complete is 0 and the block ends in the middle of a multi-byte
 * sequence, the number of bytes preceding that sequence is returned so
 * that the caller can pass on the valid prefix. If @complete is 1, a
 * truncated sequence is reported as an error.
 *
 * Returns the number of bytes to pass if okay, or the negated index of
 * the offending sequence otherwise. Note that an error at index 0
 * yields 0, so a result of 0 for a non-empty block must be treated as
 * an error by the caller. Also returns 0 if @utf is NULL or @len <= 0.
 */
static int
xmlCheckCdataPush(const xmlChar *utf, int len, int complete) {
    int ix;
    unsigned char c;
    int codepoint;

    if ((utf == NULL) || (len <= 0))
        return(0);

    for (ix = 0; ix < len;) { /* bounded by len, no terminator required */
        c = utf[ix];
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            if (c >= 0x20)
                ix++;
            else if ((c == 0xA) || (c == 0xD) || (c == 0x9))
                ix++;
            else
                return(-ix);
        } else if ((c & 0xe0) == 0xc0) { /* 2-byte code, starts with 110 */
            /* "len - ix" cannot overflow, unlike "ix + 2" */
            if (len - ix < 2)
                return(complete ? -ix : ix);
            if ((utf[ix+1] & 0xc0) != 0x80)
                return(-ix);
            codepoint = (utf[ix] & 0x1f) << 6;
            codepoint |= utf[ix+1] & 0x3f;
            /* values below 0x80 must use the 1-byte form */
            if ((codepoint < 0x80) || (!xmlIsCharQ(codepoint)))
                return(-ix);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) { /* 3-byte code, starts with 1110 */
            if (len - ix < 3)
                return(complete ? -ix : ix);
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80))
                return(-ix);
            codepoint = (utf[ix] & 0xf) << 12;
            codepoint |= (utf[ix+1] & 0x3f) << 6;
            codepoint |= utf[ix+2] & 0x3f;
            /* values below 0x800 must use a shorter form */
            if ((codepoint < 0x800) || (!xmlIsCharQ(codepoint)))
                return(-ix);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) { /* 4-byte code, starts with 11110 */
            if (len - ix < 4)
                return(complete ? -ix : ix);
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80) ||
                ((utf[ix+3] & 0xc0) != 0x80))
                return(-ix);
            codepoint = (utf[ix] & 0x7) << 18;
            codepoint |= (utf[ix+1] & 0x3f) << 12;
            codepoint |= (utf[ix+2] & 0x3f) << 6;
            codepoint |= utf[ix+3] & 0x3f;
            /* values below 0x10000 must use a shorter form */
            if ((codepoint < 0x10000) || (!xmlIsCharQ(codepoint)))
                return(-ix);
            ix += 4;
        } else {                         /* unknown encoding */
            return(-ix);
        }
    }

    return(ix);
}
/**
* xmlParseTryOrFinish:
* @ctxt: an XML parser context
@ -11308,8 +11240,12 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
(ctxt->input->cur[6] == 'T') &&
(ctxt->input->cur[7] == 'A') &&
(ctxt->input->cur[8] == '[')) {
SKIP(9);
if ((!terminate) &&
(!xmlParseLookupString(ctxt, 9, "]]>", 3)))
goto done;
ctxt->instate = XML_PARSER_CDATA_SECTION;
xmlParseCDSect(ctxt);
ctxt->instate = XML_PARSER_CONTENT;
break;
}
}
@ -11361,89 +11297,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_CONTENT;
}
break;
case XML_PARSER_CDATA_SECTION: {
/*
* In push mode, the SAX cdataBlock callback needs to merge
* contiguous callbacks back together.
*/
const xmlChar *term;
if (terminate) {
/*
* Don't call xmlParseLookupString. If 'terminate'
* is set, checkIndex is invalid.
*/
term = BAD_CAST strstr((const char *) ctxt->input->cur,
"]]>");
} else {
term = xmlParseLookupString(ctxt, 0, "]]>", 3);
}
if (term == NULL) {
int tmp, size;
if (terminate) {
/* Unfinished CDATA section */
size = ctxt->input->end - ctxt->input->cur;
} else {
if (avail < XML_PARSER_BIG_BUFFER_SIZE + 2)
goto done;
ctxt->checkIndex = 0;
/* XXX: Why don't we pass the full buffer? */
size = XML_PARSER_BIG_BUFFER_SIZE;
}
tmp = xmlCheckCdataPush(ctxt->input->cur, size, 0);
if (tmp <= 0) {
tmp = -tmp;
ctxt->input->cur += tmp;
goto encoding_error;
}
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (ctxt->sax->cdataBlock != NULL)
ctxt->sax->cdataBlock(ctxt->userData,
ctxt->input->cur, tmp);
else if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
ctxt->input->cur, tmp);
}
SKIPL(tmp);
} else {
int base = term - CUR_PTR;
int tmp;
tmp = xmlCheckCdataPush(ctxt->input->cur, base, 1);
if ((tmp < 0) || (tmp != base)) {
tmp = -tmp;
ctxt->input->cur += tmp;
goto encoding_error;
}
if ((ctxt->sax != NULL) && (base == 0) &&
(ctxt->sax->cdataBlock != NULL) &&
(!ctxt->disableSAX)) {
/*
* Special case to provide identical behaviour
* between pull and push parsers on empty CDATA
* sections
*/
if ((ctxt->input->cur - ctxt->input->base >= 9) &&
(!strncmp((const char *)&ctxt->input->cur[-9],
"<![CDATA[", 9)))
ctxt->sax->cdataBlock(ctxt->userData,
BAD_CAST "", 0);
} else if ((ctxt->sax != NULL) && (base > 0) &&
(!ctxt->disableSAX)) {
if (ctxt->sax->cdataBlock != NULL)
ctxt->sax->cdataBlock(ctxt->userData,
ctxt->input->cur, base);
else if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
ctxt->input->cur, base);
}
SKIPL(base + 3);
ctxt->instate = XML_PARSER_CONTENT;
}
break;
}
case XML_PARSER_MISC:
case XML_PARSER_PROLOG:
case XML_PARSER_EPILOG:
@ -11546,13 +11399,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
}
done:
return(ret);
encoding_error:
/* Only report the first error */
if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);
ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
}
return(0);
}
/**