diff --git a/HTMLparser.c b/HTMLparser.c
index e7ac8d41..ba3eb16d 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -6110,29 +6110,6 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
-
-#if 0
- if ((terminate) || (ctxt->input->buf->buffer->use > 80))
- htmlParseTryOrFinish(ctxt, terminate);
-#endif
- } else if (ctxt->instate != XML_PARSER_EOF) {
- if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
- xmlParserInputBufferPtr in = ctxt->input->buf;
- if ((in->encoder != NULL) && (in->buffer != NULL) &&
- (in->raw != NULL)) {
- int nbchars;
- size_t pos = ctxt->input->cur - ctxt->input->base;
-
- nbchars = xmlCharEncInput(in, terminate);
- xmlBufUpdateInput(in->buffer, ctxt->input, pos);
- if (nbchars < 0) {
- htmlParseErr(ctxt, in->error,
- "encoder error\n", NULL, NULL);
- xmlHaltParser(ctxt);
- return(XML_ERR_INVALID_ENCODING);
- }
- }
- }
}
htmlParseTryOrFinish(ctxt, terminate);
if (terminate) {
diff --git a/encoding.c b/encoding.c
index b59df26e..7a19c2be 100644
--- a/encoding.c
+++ b/encoding.c
@@ -1915,7 +1915,6 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
- * @flush: if true, indicates end of input
*
* Returns an XML_ENC_ERR code.
*
@@ -1925,7 +1924,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
*/
static int
xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
- const unsigned char *in, int *inlen, int flush) {
+ const unsigned char *in, int *inlen) {
const char *ucv_in = (const char *) in;
char *ucv_out = (char *) out;
UErrorCode err = U_ZERO_ERROR;
@@ -1935,25 +1934,36 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
return(XML_ENC_ERR_INTERNAL);
}
+ /*
+ * Note that the ICU API is stateful. It can always consume a certain
+ * amount of input even if the output buffer would overflow. The
+ * remaining input must be processed by calling ucnv_convertEx with a
+ * possibly empty input buffer.
+ *
+ * ucnv_convertEx is always called with reset and flush set to 0,
+ * so we don't mess up the state. This should never generate
+ * U_TRUNCATED_CHAR_FOUND errors.
+ *
+ * This also means that ICU xmlCharEncodingHandlers should never be
+ * reused. It would be a lot nicer if there was a way to emulate the
+ * stateless iconv API.
+ */
if (toUnicode) {
/* encoding => UTF-16 => UTF-8 */
ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
&cd->pivot_source, &cd->pivot_target,
- cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
+ cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
} else {
/* UTF-8 => UTF-16 => encoding */
ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
&cd->pivot_source, &cd->pivot_target,
- cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
+ cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
}
*inlen = ucv_in - (const char*) in;
*outlen = ucv_out - (char *) out;
if (U_SUCCESS(err)) {
- /* reset pivot buf if this is the last call for input (flush==TRUE) */
- if (flush)
- cd->pivot_source = cd->pivot_target = cd->pivot_buf;
return(XML_ENC_ERR_SUCCESS);
}
if (err == U_BUFFER_OVERFLOW_ERROR)
@@ -2005,7 +2015,6 @@ xmlEncConvertError(int code) {
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
- * @flush: flush (ICU-related)
*
* Returns an XML_ENC_ERR code.
*
@@ -2015,9 +2024,8 @@ xmlEncConvertError(int code) {
*/
int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
- int *outlen, const unsigned char *in, int *inlen, int flush) {
+ int *outlen, const unsigned char *in, int *inlen) {
int ret;
- (void)flush;
if (handler->input != NULL) {
ret = handler->input(out, outlen, in, inlen);
@@ -2031,8 +2039,7 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
#endif /* LIBXML_ICONV_ENABLED */
#ifdef LIBXML_ICU_ENABLED
else if (handler->uconv_in != NULL) {
- ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
- flush);
+ ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen);
}
#endif /* LIBXML_ICU_ENABLED */
else {
@@ -2041,8 +2048,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
ret = XML_ENC_ERR_INTERNAL;
}
- /* Ignore space and partial errors when reading. */
- if ((ret == XML_ENC_ERR_SPACE) || (ret == XML_ENC_ERR_PARTIAL))
+ /* Ignore partial errors when reading. */
+ if (ret == XML_ENC_ERR_PARTIAL)
ret = XML_ENC_ERR_SUCCESS;
return(ret);
@@ -2079,8 +2086,7 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
#endif /* LIBXML_ICONV_ENABLED */
#ifdef LIBXML_ICU_ENABLED
else if (handler->uconv_out != NULL) {
- ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
- 1);
+ ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen);
}
#endif /* LIBXML_ICU_ENABLED */
else {
@@ -2113,22 +2119,23 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
/**
* xmlCharEncInput:
* @input: a parser input buffer
- * @flush: try to flush all the raw buffer
*
* Generic front-end for the encoding handler on parser input
*
* Returns the number of bytes written or an XML_ENC_ERR code.
*/
int
-xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
+xmlCharEncInput(xmlParserInputBufferPtr input)
{
int ret;
- size_t written;
+ size_t avail;
size_t toconv;
int c_in;
int c_out;
xmlBufPtr in;
xmlBufPtr out;
+ const xmlChar *inData;
+ size_t inTotal = 0;
if ((input == NULL) || (input->encoder == NULL) ||
(input->buffer == NULL) || (input->raw == NULL))
@@ -2139,25 +2146,34 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
toconv = xmlBufUse(in);
if (toconv == 0)
return (0);
- if ((toconv > 64 * 1024) && (flush == 0))
- toconv = 64 * 1024;
- written = xmlBufAvail(out);
- if (toconv * 2 >= written) {
- if (xmlBufGrow(out, toconv * 2) < 0) {
- input->error = XML_ERR_NO_MEMORY;
- return(XML_ENC_ERR_MEMORY);
- }
- written = xmlBufAvail(out);
- }
- if ((written > 128 * 1024) && (flush == 0))
- written = 128 * 1024;
+ inData = xmlBufContent(in);
+ inTotal = 0;
- c_in = toconv;
- c_out = written;
- ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
- xmlBufContent(in), &c_in, flush);
- xmlBufShrink(in, c_in);
- xmlBufAddLen(out, c_out);
+    /* Convert; retry while the handler reports XML_ENC_ERR_SPACE. */
+    do {
+        /* xmlEncInputChunk takes int lengths, so cap the input chunk. */
+        c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv;
+        avail = xmlBufAvail(out);
+        if (avail < toconv * 2) {
+            if (xmlBufGrow(out, toconv * 2) < 0) {
+                input->error = XML_ERR_NO_MEMORY;
+                return(XML_ENC_ERR_MEMORY);
+            }
+            avail = xmlBufAvail(out);
+        }
+        /* Clamp after growing so the value stored in c_out fits an int. */
+        if (avail > INT_MAX)
+            avail = INT_MAX;
+        c_out = avail;
+        ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
+                               inData, &c_in);
+        inTotal += c_in;
+        inData += c_in;
+        toconv -= c_in;
+        xmlBufAddLen(out, c_out);
+    } while (ret == XML_ENC_ERR_SPACE);
+
+ xmlBufShrink(in, inTotal);
if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in)
input->rawconsumed = ULONG_MAX;
@@ -2207,7 +2223,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
written = out->size - out->use - 1;
}
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
- in->content, &toconv, 1);
+ in->content, &toconv);
xmlBufferShrink(in, toconv);
out->use += written;
out->content[out->use] = 0;
@@ -2273,8 +2289,6 @@ retry:
* Conversion itself.
*/
toconv = xmlBufUse(in);
- if (toconv == 0)
- return (writtentot);
if (toconv > 64 * 1024)
toconv = 64 * 1024;
if (toconv * 4 >= written) {
@@ -2404,8 +2418,6 @@ retry:
* Conversion itself.
*/
toconv = in->use;
- if (toconv == 0)
- return(0);
if (toconv * 4 >= written) {
xmlBufferGrow(out, toconv * 4);
written = out->size - out->use - 1;
diff --git a/include/private/enc.h b/include/private/enc.h
index cbdc2b33..cd549145 100644
--- a/include/private/enc.h
+++ b/include/private/enc.h
@@ -9,9 +9,9 @@ xmlInitEncodingInternal(void);
XML_HIDDEN int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
- int *outlen, const unsigned char *in, int *inlen, int flush);
+ int *outlen, const unsigned char *in, int *inlen);
XML_HIDDEN int
-xmlCharEncInput(xmlParserInputBufferPtr input, int flush);
+xmlCharEncInput(xmlParserInputBufferPtr input);
XML_HIDDEN int
xmlCharEncOutput(xmlOutputBufferPtr output, int init);
diff --git a/parser.c b/parser.c
index 97c087e1..2ebc3799 100644
--- a/parser.c
+++ b/parser.c
@@ -10993,27 +10993,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1))
return(0);
- if (ctxt->input == NULL) break;
- if (ctxt->input->buf != NULL) {
- /*
- * If we are operating on converted input, try to flush
- * remaining chars to avoid them stalling in the non-converted
- * buffer.
- */
- if ((ctxt->input->buf->raw != NULL) &&
- (xmlBufIsEmpty(ctxt->input->buf->raw) == 0)) {
- size_t pos = ctxt->input->cur - ctxt->input->base;
- int res;
-
- res = xmlParserInputBufferPush(ctxt->input->buf, 0, "");
- xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
- if (res < 0) {
- xmlFatalErr(ctxt, ctxt->input->buf->error, NULL);
- xmlHaltParser(ctxt);
- return(0);
- }
- }
- }
avail = ctxt->input->end - ctxt->input->cur;
if (avail < 1)
goto done;
@@ -11667,24 +11646,6 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "PP: pushed %d\n", size);
#endif
-
- } else if (ctxt->instate != XML_PARSER_EOF) {
- if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
- xmlParserInputBufferPtr in = ctxt->input->buf;
- if ((in->encoder != NULL) && (in->buffer != NULL) &&
- (in->raw != NULL)) {
- int nbchars;
- size_t pos = ctxt->input->cur - ctxt->input->base;
-
- nbchars = xmlCharEncInput(in, terminate);
- xmlBufUpdateInput(in->buffer, ctxt->input, pos);
- if (nbchars < 0) {
- xmlFatalErr(ctxt, in->error, NULL);
- xmlHaltParser(ctxt);
- return(ctxt->errNo);
- }
- }
- }
}
xmlParseTryOrFinish(ctxt, terminate);
diff --git a/parserInternals.c b/parserInternals.c
index 706515ce..eccdb32e 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1253,7 +1253,7 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
return(NULL);
outlen = sizeof(out) - 1;
inlen = input->end - input->cur;
- res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0);
+ res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen);
if (res < 0)
return(handler);
out[outlen] = 0;
@@ -1287,12 +1287,15 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
break;
out[i] = 0;
xmlCharEncCloseFunc(handler);
- handler = xmlFindCharEncodingHandler((char *) out + start);
- break;
+ return(xmlFindCharEncodingHandler((char *) out + start));
}
}
- return(handler);
+ /*
+ * ICU handlers are stateful, so we have to recreate them.
+ */
+ xmlCharEncCloseFunc(handler);
+ return(xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC));
}
/**
@@ -1420,17 +1423,7 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
in->buffer = xmlBufCreate();
in->rawconsumed = processed;
- /*
- * TODO: We must flush and decode the whole buffer to make functions
- * like xmlReadMemory work with a user-provided encoding. If the
- * encoding is specified directly, we should probably set
- * XML_PARSE_IGNORE_ENC in xmlDoRead to avoid switching encodings
- * twice. Then we could set "flush" to false which should save
- * a considerable amount of memory when parsing from memory.
- * It's probably even possible to remove this whole if-block
- * completely.
- */
- nbchars = xmlCharEncInput(in, 1);
+ nbchars = xmlCharEncInput(in);
xmlBufResetInput(in->buffer, input);
if (nbchars < 0) {
/* TODO: This could be an out of memory or an encoding error. */
diff --git a/xmlIO.c b/xmlIO.c
index 6d94d5ee..9c263814 100644
--- a/xmlIO.c
+++ b/xmlIO.c
@@ -3218,7 +3218,7 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in,
/*
* convert as much as possible to the parser reading buffer.
*/
- nbchars = xmlCharEncInput(in, 1);
+ nbchars = xmlCharEncInput(in);
if (nbchars < 0)
return(-1);
} else {
@@ -3319,7 +3319,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
}
if (in->encoder != NULL) {
- res = xmlCharEncInput(in, 1);
+ res = xmlCharEncInput(in);
if (res < 0)
return(-1);
}