mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-03-25 10:50:08 +03:00
parser: Decode all data in xmlCharEncInput
Even with flush set to true, xmlCharEncInput did not guarantee that all data would be decoded, which complicated the push parser. Remove the flush flag and always decode all available data. Also fix the ICU code, where the flush flag has a different meaning: always set flush to false and retry, even with empty input buffers.
This commit is contained in:
parent
834b8123ef
commit
95e81a360c
23
HTMLparser.c
23
HTMLparser.c
@ -6110,29 +6110,6 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
|
||||
#ifdef DEBUG_PUSH
|
||||
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
if ((terminate) || (ctxt->input->buf->buffer->use > 80))
|
||||
htmlParseTryOrFinish(ctxt, terminate);
|
||||
#endif
|
||||
} else if (ctxt->instate != XML_PARSER_EOF) {
|
||||
if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
|
||||
xmlParserInputBufferPtr in = ctxt->input->buf;
|
||||
if ((in->encoder != NULL) && (in->buffer != NULL) &&
|
||||
(in->raw != NULL)) {
|
||||
int nbchars;
|
||||
size_t pos = ctxt->input->cur - ctxt->input->base;
|
||||
|
||||
nbchars = xmlCharEncInput(in, terminate);
|
||||
xmlBufUpdateInput(in->buffer, ctxt->input, pos);
|
||||
if (nbchars < 0) {
|
||||
htmlParseErr(ctxt, in->error,
|
||||
"encoder error\n", NULL, NULL);
|
||||
xmlHaltParser(ctxt);
|
||||
return(XML_ERR_INVALID_ENCODING);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
htmlParseTryOrFinish(ctxt, terminate);
|
||||
if (terminate) {
|
||||
|
96
encoding.c
96
encoding.c
@ -1915,7 +1915,6 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
|
||||
* @outlen: the length of @out
|
||||
* @in: a pointer to an array of input bytes
|
||||
* @inlen: the length of @in
|
||||
* @flush: if true, indicates end of input
|
||||
*
|
||||
* Returns an XML_ENC_ERR code.
|
||||
*
|
||||
@ -1925,7 +1924,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
|
||||
*/
|
||||
static int
|
||||
xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
||||
const unsigned char *in, int *inlen, int flush) {
|
||||
const unsigned char *in, int *inlen) {
|
||||
const char *ucv_in = (const char *) in;
|
||||
char *ucv_out = (char *) out;
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
@ -1935,25 +1934,36 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
||||
return(XML_ENC_ERR_INTERNAL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note that the ICU API is stateful. It can always consume a certain
|
||||
* amount of input even if the output buffer would overflow. The
|
||||
* remaining input must be processed by calling ucnv_convertEx with a
|
||||
* possibly empty input buffer.
|
||||
*
|
||||
* ucnv_convertEx is always called with reset and flush set to 0,
|
||||
* so we don't mess up the state. This should never generate
|
||||
* U_TRUNCATED_CHAR_FOUND errors.
|
||||
*
|
||||
* This also means that ICU xmlCharEncodingHandlers should never be
|
||||
* reused. It would be a lot nicer if there was a way to emulate the
|
||||
* stateless iconv API.
|
||||
*/
|
||||
if (toUnicode) {
|
||||
/* encoding => UTF-16 => UTF-8 */
|
||||
ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
|
||||
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
|
||||
&cd->pivot_source, &cd->pivot_target,
|
||||
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
|
||||
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
|
||||
} else {
|
||||
/* UTF-8 => UTF-16 => encoding */
|
||||
ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
|
||||
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
|
||||
&cd->pivot_source, &cd->pivot_target,
|
||||
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
|
||||
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
|
||||
}
|
||||
*inlen = ucv_in - (const char*) in;
|
||||
*outlen = ucv_out - (char *) out;
|
||||
if (U_SUCCESS(err)) {
|
||||
/* reset pivot buf if this is the last call for input (flush==TRUE) */
|
||||
if (flush)
|
||||
cd->pivot_source = cd->pivot_target = cd->pivot_buf;
|
||||
return(XML_ENC_ERR_SUCCESS);
|
||||
}
|
||||
if (err == U_BUFFER_OVERFLOW_ERROR)
|
||||
@ -2005,7 +2015,6 @@ xmlEncConvertError(int code) {
|
||||
* @outlen: the length of @out
|
||||
* @in: a pointer to an array of input bytes
|
||||
* @inlen: the length of @in
|
||||
* @flush: flush (ICU-related)
|
||||
*
|
||||
* Returns an XML_ENC_ERR code.
|
||||
*
|
||||
@ -2015,9 +2024,8 @@ xmlEncConvertError(int code) {
|
||||
*/
|
||||
int
|
||||
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
||||
int *outlen, const unsigned char *in, int *inlen, int flush) {
|
||||
int *outlen, const unsigned char *in, int *inlen) {
|
||||
int ret;
|
||||
(void)flush;
|
||||
|
||||
if (handler->input != NULL) {
|
||||
ret = handler->input(out, outlen, in, inlen);
|
||||
@ -2031,8 +2039,7 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef LIBXML_ICU_ENABLED
|
||||
else if (handler->uconv_in != NULL) {
|
||||
ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
|
||||
flush);
|
||||
ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen);
|
||||
}
|
||||
#endif /* LIBXML_ICU_ENABLED */
|
||||
else {
|
||||
@ -2041,8 +2048,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
||||
ret = XML_ENC_ERR_INTERNAL;
|
||||
}
|
||||
|
||||
/* Ignore space and partial errors when reading. */
|
||||
if ((ret == XML_ENC_ERR_SPACE) || (ret == XML_ENC_ERR_PARTIAL))
|
||||
/* Ignore partial errors when reading. */
|
||||
if (ret == XML_ENC_ERR_PARTIAL)
|
||||
ret = XML_ENC_ERR_SUCCESS;
|
||||
|
||||
return(ret);
|
||||
@ -2079,8 +2086,7 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef LIBXML_ICU_ENABLED
|
||||
else if (handler->uconv_out != NULL) {
|
||||
ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
|
||||
1);
|
||||
ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen);
|
||||
}
|
||||
#endif /* LIBXML_ICU_ENABLED */
|
||||
else {
|
||||
@ -2113,22 +2119,23 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||||
/**
|
||||
* xmlCharEncInput:
|
||||
* @input: a parser input buffer
|
||||
* @flush: try to flush all the raw buffer
|
||||
*
|
||||
* Generic front-end for the encoding handler on parser input
|
||||
*
|
||||
* Returns the number of bytes written or an XML_ENC_ERR code.
|
||||
*/
|
||||
int
|
||||
xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
|
||||
xmlCharEncInput(xmlParserInputBufferPtr input)
|
||||
{
|
||||
int ret;
|
||||
size_t written;
|
||||
size_t avail;
|
||||
size_t toconv;
|
||||
int c_in;
|
||||
int c_out;
|
||||
xmlBufPtr in;
|
||||
xmlBufPtr out;
|
||||
const xmlChar *inData;
|
||||
size_t inTotal = 0;
|
||||
|
||||
if ((input == NULL) || (input->encoder == NULL) ||
|
||||
(input->buffer == NULL) || (input->raw == NULL))
|
||||
@ -2139,25 +2146,34 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
|
||||
toconv = xmlBufUse(in);
|
||||
if (toconv == 0)
|
||||
return (0);
|
||||
if ((toconv > 64 * 1024) && (flush == 0))
|
||||
toconv = 64 * 1024;
|
||||
written = xmlBufAvail(out);
|
||||
if (toconv * 2 >= written) {
|
||||
if (xmlBufGrow(out, toconv * 2) < 0) {
|
||||
input->error = XML_ERR_NO_MEMORY;
|
||||
return(XML_ENC_ERR_MEMORY);
|
||||
}
|
||||
written = xmlBufAvail(out);
|
||||
}
|
||||
if ((written > 128 * 1024) && (flush == 0))
|
||||
written = 128 * 1024;
|
||||
inData = xmlBufContent(in);
|
||||
inTotal = 0;
|
||||
|
||||
c_in = toconv;
|
||||
c_out = written;
|
||||
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
|
||||
xmlBufContent(in), &c_in, flush);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
do {
|
||||
c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv;
|
||||
|
||||
avail = xmlBufAvail(out);
|
||||
if (avail > INT_MAX)
|
||||
avail = INT_MAX;
|
||||
if (avail < toconv * 2) {
|
||||
if (xmlBufGrow(out, toconv * 2) < 0) {
|
||||
input->error = XML_ERR_NO_MEMORY;
|
||||
return(XML_ENC_ERR_MEMORY);
|
||||
}
|
||||
avail = xmlBufAvail(out);
|
||||
}
|
||||
|
||||
c_in = toconv;
|
||||
c_out = avail;
|
||||
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
|
||||
inData, &c_in);
|
||||
inTotal += c_in;
|
||||
inData += c_in;
|
||||
toconv -= c_in;
|
||||
xmlBufAddLen(out, c_out);
|
||||
} while (ret == XML_ENC_ERR_SPACE);
|
||||
|
||||
xmlBufShrink(in, inTotal);
|
||||
|
||||
if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in)
|
||||
input->rawconsumed = ULONG_MAX;
|
||||
@ -2207,7 +2223,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
|
||||
written = out->size - out->use - 1;
|
||||
}
|
||||
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
|
||||
in->content, &toconv, 1);
|
||||
in->content, &toconv);
|
||||
xmlBufferShrink(in, toconv);
|
||||
out->use += written;
|
||||
out->content[out->use] = 0;
|
||||
@ -2273,8 +2289,6 @@ retry:
|
||||
* Conversion itself.
|
||||
*/
|
||||
toconv = xmlBufUse(in);
|
||||
if (toconv == 0)
|
||||
return (writtentot);
|
||||
if (toconv > 64 * 1024)
|
||||
toconv = 64 * 1024;
|
||||
if (toconv * 4 >= written) {
|
||||
@ -2404,8 +2418,6 @@ retry:
|
||||
* Conversion itself.
|
||||
*/
|
||||
toconv = in->use;
|
||||
if (toconv == 0)
|
||||
return(0);
|
||||
if (toconv * 4 >= written) {
|
||||
xmlBufferGrow(out, toconv * 4);
|
||||
written = out->size - out->use - 1;
|
||||
|
@ -9,9 +9,9 @@ xmlInitEncodingInternal(void);
|
||||
|
||||
XML_HIDDEN int
|
||||
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
||||
int *outlen, const unsigned char *in, int *inlen, int flush);
|
||||
int *outlen, const unsigned char *in, int *inlen);
|
||||
XML_HIDDEN int
|
||||
xmlCharEncInput(xmlParserInputBufferPtr input, int flush);
|
||||
xmlCharEncInput(xmlParserInputBufferPtr input);
|
||||
XML_HIDDEN int
|
||||
xmlCharEncOutput(xmlOutputBufferPtr output, int init);
|
||||
|
||||
|
39
parser.c
39
parser.c
@ -10993,27 +10993,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
|
||||
if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1))
|
||||
return(0);
|
||||
|
||||
if (ctxt->input == NULL) break;
|
||||
if (ctxt->input->buf != NULL) {
|
||||
/*
|
||||
* If we are operating on converted input, try to flush
|
||||
* remaining chars to avoid them stalling in the non-converted
|
||||
* buffer.
|
||||
*/
|
||||
if ((ctxt->input->buf->raw != NULL) &&
|
||||
(xmlBufIsEmpty(ctxt->input->buf->raw) == 0)) {
|
||||
size_t pos = ctxt->input->cur - ctxt->input->base;
|
||||
int res;
|
||||
|
||||
res = xmlParserInputBufferPush(ctxt->input->buf, 0, "");
|
||||
xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
|
||||
if (res < 0) {
|
||||
xmlFatalErr(ctxt, ctxt->input->buf->error, NULL);
|
||||
xmlHaltParser(ctxt);
|
||||
return(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
if (avail < 1)
|
||||
goto done;
|
||||
@ -11667,24 +11646,6 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
|
||||
#ifdef DEBUG_PUSH
|
||||
xmlGenericError(xmlGenericErrorContext, "PP: pushed %d\n", size);
|
||||
#endif
|
||||
|
||||
} else if (ctxt->instate != XML_PARSER_EOF) {
|
||||
if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
|
||||
xmlParserInputBufferPtr in = ctxt->input->buf;
|
||||
if ((in->encoder != NULL) && (in->buffer != NULL) &&
|
||||
(in->raw != NULL)) {
|
||||
int nbchars;
|
||||
size_t pos = ctxt->input->cur - ctxt->input->base;
|
||||
|
||||
nbchars = xmlCharEncInput(in, terminate);
|
||||
xmlBufUpdateInput(in->buffer, ctxt->input, pos);
|
||||
if (nbchars < 0) {
|
||||
xmlFatalErr(ctxt, in->error, NULL);
|
||||
xmlHaltParser(ctxt);
|
||||
return(ctxt->errNo);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
xmlParseTryOrFinish(ctxt, terminate);
|
||||
|
@ -1253,7 +1253,7 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
|
||||
return(NULL);
|
||||
outlen = sizeof(out) - 1;
|
||||
inlen = input->end - input->cur;
|
||||
res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0);
|
||||
res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen);
|
||||
if (res < 0)
|
||||
return(handler);
|
||||
out[outlen] = 0;
|
||||
@ -1287,12 +1287,15 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
|
||||
break;
|
||||
out[i] = 0;
|
||||
xmlCharEncCloseFunc(handler);
|
||||
handler = xmlFindCharEncodingHandler((char *) out + start);
|
||||
break;
|
||||
return(xmlFindCharEncodingHandler((char *) out + start));
|
||||
}
|
||||
}
|
||||
|
||||
return(handler);
|
||||
/*
|
||||
* ICU handlers are stateful, so we have to recreate them.
|
||||
*/
|
||||
xmlCharEncCloseFunc(handler);
|
||||
return(xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1420,17 +1423,7 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
|
||||
in->buffer = xmlBufCreate();
|
||||
in->rawconsumed = processed;
|
||||
|
||||
/*
|
||||
* TODO: We must flush and decode the whole buffer to make functions
|
||||
* like xmlReadMemory work with a user-provided encoding. If the
|
||||
* encoding is specified directly, we should probably set
|
||||
* XML_PARSE_IGNORE_ENC in xmlDoRead to avoid switching encodings
|
||||
* twice. Then we could set "flush" to false which should save
|
||||
* a considerable amount of memory when parsing from memory.
|
||||
* It's probably even possible to remove this whole if-block
|
||||
* completely.
|
||||
*/
|
||||
nbchars = xmlCharEncInput(in, 1);
|
||||
nbchars = xmlCharEncInput(in);
|
||||
xmlBufResetInput(in->buffer, input);
|
||||
if (nbchars < 0) {
|
||||
/* TODO: This could be an out of memory or an encoding error. */
|
||||
|
4
xmlIO.c
4
xmlIO.c
@ -3218,7 +3218,7 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in,
|
||||
/*
|
||||
* convert as much as possible to the parser reading buffer.
|
||||
*/
|
||||
nbchars = xmlCharEncInput(in, 1);
|
||||
nbchars = xmlCharEncInput(in);
|
||||
if (nbchars < 0)
|
||||
return(-1);
|
||||
} else {
|
||||
@ -3319,7 +3319,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
|
||||
}
|
||||
|
||||
if (in->encoder != NULL) {
|
||||
res = xmlCharEncInput(in, 1);
|
||||
res = xmlCharEncInput(in);
|
||||
if (res < 0)
|
||||
return(-1);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user