1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-25 10:50:08 +03:00

parser: Decode all data in xmlCharEncInput

Even with flush set to true, xmlCharEncInput was not guaranteed to decode
all available data. This complicated the push parser.

Remove the flush flag and always decode all available data.

Also fix ICU code where the flush flag has a different meaning. Always
set flush to false and retry even with empty input buffers.
This commit is contained in:
Nick Wellnhofer 2023-08-08 15:21:31 +02:00
parent 834b8123ef
commit 95e81a360c
6 changed files with 66 additions and 123 deletions

View File

@ -6110,29 +6110,6 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
#if 0
if ((terminate) || (ctxt->input->buf->buffer->use > 80))
htmlParseTryOrFinish(ctxt, terminate);
#endif
} else if (ctxt->instate != XML_PARSER_EOF) {
if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
xmlParserInputBufferPtr in = ctxt->input->buf;
if ((in->encoder != NULL) && (in->buffer != NULL) &&
(in->raw != NULL)) {
int nbchars;
size_t pos = ctxt->input->cur - ctxt->input->base;
nbchars = xmlCharEncInput(in, terminate);
xmlBufUpdateInput(in->buffer, ctxt->input, pos);
if (nbchars < 0) {
htmlParseErr(ctxt, in->error,
"encoder error\n", NULL, NULL);
xmlHaltParser(ctxt);
return(XML_ERR_INVALID_ENCODING);
}
}
}
}
htmlParseTryOrFinish(ctxt, terminate);
if (terminate) {

View File

@ -1915,7 +1915,6 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
* @flush: if true, indicates end of input
*
* Returns an XML_ENC_ERR code.
*
@ -1925,7 +1924,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
*/
static int
xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
const unsigned char *in, int *inlen, int flush) {
const unsigned char *in, int *inlen) {
const char *ucv_in = (const char *) in;
char *ucv_out = (char *) out;
UErrorCode err = U_ZERO_ERROR;
@ -1935,25 +1934,36 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
return(XML_ENC_ERR_INTERNAL);
}
/*
* Note that the ICU API is stateful. It can always consume a certain
* amount of input even if the output buffer would overflow. The
* remaining input must be processed by calling ucnv_convertEx with a
* possibly empty input buffer.
*
* ucnv_convertEx is always called with reset and flush set to 0,
* so we don't mess up the state. This should never generate
* U_TRUNCATED_CHAR_FOUND errors.
*
* This also means that ICU xmlCharEncodingHandlers should never be
* reused. It would be a lot nicer if there was a way to emulate the
* stateless iconv API.
*/
if (toUnicode) {
/* encoding => UTF-16 => UTF-8 */
ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
&cd->pivot_source, &cd->pivot_target,
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
} else {
/* UTF-8 => UTF-16 => encoding */
ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
&cd->pivot_source, &cd->pivot_target,
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
}
*inlen = ucv_in - (const char*) in;
*outlen = ucv_out - (char *) out;
if (U_SUCCESS(err)) {
/* reset pivot buf if this is the last call for input (flush==TRUE) */
if (flush)
cd->pivot_source = cd->pivot_target = cd->pivot_buf;
return(XML_ENC_ERR_SUCCESS);
}
if (err == U_BUFFER_OVERFLOW_ERROR)
@ -2005,7 +2015,6 @@ xmlEncConvertError(int code) {
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
* @flush: flush (ICU-related)
*
* Returns an XML_ENC_ERR code.
*
@ -2015,9 +2024,8 @@ xmlEncConvertError(int code) {
*/
int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
int *outlen, const unsigned char *in, int *inlen, int flush) {
int *outlen, const unsigned char *in, int *inlen) {
int ret;
(void)flush;
if (handler->input != NULL) {
ret = handler->input(out, outlen, in, inlen);
@ -2031,8 +2039,7 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
#endif /* LIBXML_ICONV_ENABLED */
#ifdef LIBXML_ICU_ENABLED
else if (handler->uconv_in != NULL) {
ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
flush);
ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen);
}
#endif /* LIBXML_ICU_ENABLED */
else {
@ -2041,8 +2048,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
ret = XML_ENC_ERR_INTERNAL;
}
/* Ignore space and partial errors when reading. */
if ((ret == XML_ENC_ERR_SPACE) || (ret == XML_ENC_ERR_PARTIAL))
/* Ignore partial errors when reading. */
if (ret == XML_ENC_ERR_PARTIAL)
ret = XML_ENC_ERR_SUCCESS;
return(ret);
@ -2079,8 +2086,7 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
#endif /* LIBXML_ICONV_ENABLED */
#ifdef LIBXML_ICU_ENABLED
else if (handler->uconv_out != NULL) {
ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
1);
ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen);
}
#endif /* LIBXML_ICU_ENABLED */
else {
@ -2113,22 +2119,23 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
/**
* xmlCharEncInput:
* @input: a parser input buffer
* @flush: try to flush all the raw buffer
*
* Generic front-end for the encoding handler on parser input
*
* Returns the number of bytes written or an XML_ENC_ERR code.
*/
int
xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
xmlCharEncInput(xmlParserInputBufferPtr input)
{
int ret;
size_t written;
size_t avail;
size_t toconv;
int c_in;
int c_out;
xmlBufPtr in;
xmlBufPtr out;
const xmlChar *inData;
size_t inTotal = 0;
if ((input == NULL) || (input->encoder == NULL) ||
(input->buffer == NULL) || (input->raw == NULL))
@ -2139,25 +2146,34 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
toconv = xmlBufUse(in);
if (toconv == 0)
return (0);
if ((toconv > 64 * 1024) && (flush == 0))
toconv = 64 * 1024;
written = xmlBufAvail(out);
if (toconv * 2 >= written) {
if (xmlBufGrow(out, toconv * 2) < 0) {
input->error = XML_ERR_NO_MEMORY;
return(XML_ENC_ERR_MEMORY);
}
written = xmlBufAvail(out);
}
if ((written > 128 * 1024) && (flush == 0))
written = 128 * 1024;
inData = xmlBufContent(in);
inTotal = 0;
c_in = toconv;
c_out = written;
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
xmlBufContent(in), &c_in, flush);
xmlBufShrink(in, c_in);
xmlBufAddLen(out, c_out);
do {
c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv;
avail = xmlBufAvail(out);
if (avail > INT_MAX)
avail = INT_MAX;
if (avail < toconv * 2) {
if (xmlBufGrow(out, toconv * 2) < 0) {
input->error = XML_ERR_NO_MEMORY;
return(XML_ENC_ERR_MEMORY);
}
avail = xmlBufAvail(out);
}
c_in = toconv;
c_out = avail;
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
inData, &c_in);
inTotal += c_in;
inData += c_in;
toconv -= c_in;
xmlBufAddLen(out, c_out);
} while (ret == XML_ENC_ERR_SPACE);
xmlBufShrink(in, inTotal);
if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in)
input->rawconsumed = ULONG_MAX;
@ -2207,7 +2223,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
written = out->size - out->use - 1;
}
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
in->content, &toconv, 1);
in->content, &toconv);
xmlBufferShrink(in, toconv);
out->use += written;
out->content[out->use] = 0;
@ -2273,8 +2289,6 @@ retry:
* Conversion itself.
*/
toconv = xmlBufUse(in);
if (toconv == 0)
return (writtentot);
if (toconv > 64 * 1024)
toconv = 64 * 1024;
if (toconv * 4 >= written) {
@ -2404,8 +2418,6 @@ retry:
* Conversion itself.
*/
toconv = in->use;
if (toconv == 0)
return(0);
if (toconv * 4 >= written) {
xmlBufferGrow(out, toconv * 4);
written = out->size - out->use - 1;

View File

@ -9,9 +9,9 @@ xmlInitEncodingInternal(void);
XML_HIDDEN int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
int *outlen, const unsigned char *in, int *inlen, int flush);
int *outlen, const unsigned char *in, int *inlen);
XML_HIDDEN int
xmlCharEncInput(xmlParserInputBufferPtr input, int flush);
xmlCharEncInput(xmlParserInputBufferPtr input);
XML_HIDDEN int
xmlCharEncOutput(xmlOutputBufferPtr output, int init);

View File

@ -10993,27 +10993,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1))
return(0);
if (ctxt->input == NULL) break;
if (ctxt->input->buf != NULL) {
/*
* If we are operating on converted input, try to flush
* remaining chars to avoid them stalling in the non-converted
* buffer.
*/
if ((ctxt->input->buf->raw != NULL) &&
(xmlBufIsEmpty(ctxt->input->buf->raw) == 0)) {
size_t pos = ctxt->input->cur - ctxt->input->base;
int res;
res = xmlParserInputBufferPush(ctxt->input->buf, 0, "");
xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
if (res < 0) {
xmlFatalErr(ctxt, ctxt->input->buf->error, NULL);
xmlHaltParser(ctxt);
return(0);
}
}
}
avail = ctxt->input->end - ctxt->input->cur;
if (avail < 1)
goto done;
@ -11667,24 +11646,6 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "PP: pushed %d\n", size);
#endif
} else if (ctxt->instate != XML_PARSER_EOF) {
if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
xmlParserInputBufferPtr in = ctxt->input->buf;
if ((in->encoder != NULL) && (in->buffer != NULL) &&
(in->raw != NULL)) {
int nbchars;
size_t pos = ctxt->input->cur - ctxt->input->base;
nbchars = xmlCharEncInput(in, terminate);
xmlBufUpdateInput(in->buffer, ctxt->input, pos);
if (nbchars < 0) {
xmlFatalErr(ctxt, in->error, NULL);
xmlHaltParser(ctxt);
return(ctxt->errNo);
}
}
}
}
xmlParseTryOrFinish(ctxt, terminate);

View File

@ -1253,7 +1253,7 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
return(NULL);
outlen = sizeof(out) - 1;
inlen = input->end - input->cur;
res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0);
res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen);
if (res < 0)
return(handler);
out[outlen] = 0;
@ -1287,12 +1287,15 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
break;
out[i] = 0;
xmlCharEncCloseFunc(handler);
handler = xmlFindCharEncodingHandler((char *) out + start);
break;
return(xmlFindCharEncodingHandler((char *) out + start));
}
}
return(handler);
/*
* ICU handlers are stateful, so we have to recreate them.
*/
xmlCharEncCloseFunc(handler);
return(xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC));
}
/**
@ -1420,17 +1423,7 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
in->buffer = xmlBufCreate();
in->rawconsumed = processed;
/*
* TODO: We must flush and decode the whole buffer to make functions
* like xmlReadMemory work with a user-provided encoding. If the
* encoding is specified directly, we should probably set
* XML_PARSE_IGNORE_ENC in xmlDoRead to avoid switching encodings
* twice. Then we could set "flush" to false which should save
* a considerable amount of memory when parsing from memory.
* It's probably even possible to remove this whole if-block
* completely.
*/
nbchars = xmlCharEncInput(in, 1);
nbchars = xmlCharEncInput(in);
xmlBufResetInput(in->buffer, input);
if (nbchars < 0) {
/* TODO: This could be an out of memory or an encoding error. */

View File

@ -3218,7 +3218,7 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in,
/*
* convert as much as possible to the parser reading buffer.
*/
nbchars = xmlCharEncInput(in, 1);
nbchars = xmlCharEncInput(in);
if (nbchars < 0)
return(-1);
} else {
@ -3319,7 +3319,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
}
if (in->encoder != NULL) {
res = xmlCharEncInput(in, 1);
res = xmlCharEncInput(in);
if (res < 0)
return(-1);
}