encoding: Detect truncated multi-byte sequences with ICU

Unlike iconv or the internal converters, ICU consumes truncated multi-byte sequences at the end of an input buffer. We currently check for a non-empty raw input buffer to detect truncated sequences, so this fails with ICU. It might be possible to inspect the pivot buffer pointers, but it seems cleaner to implement a `flush` flag for some encoding and I/O functions. After flushing, we can check for U_TRUNCATED_CHAR_FOUND with ICU, or detect remaining input with other converters. Also fix detection of truncated sequences for HTML, XML content and DTDs with iconv.
2025-03-27 18:50:07 +03:00 · 2025-03-10 02:18:51 +01:00 · 2025-03-10 02:18:51 +01:00 · 69b83bb68e
commit 69b83bb68e
parent 76c6ddfef9
14 changed files with 287 additions and 133 deletions
--- a/HTMLparser.c
+++ b/HTMLparser.c
@ -4385,6 +4385,11 @@ htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) {

    htmlParseContent(ctxt);

+    /*
+     * Only check for truncated multi-byte sequences
+     */
+    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
+
    /* TODO: Use xmlCtxtIsCatastrophicError */
    if (ctxt->errNo != XML_ERR_NO_MEMORY) {
        xmlNodePtr cur;
@ -4509,11 +4514,9 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
    htmlParseContent(ctxt);

    /*
-     * autoclose
+     * Only check for truncated multi-byte sequences
     */
-    if (CUR == 0)
-	htmlAutoCloseOnEnd(ctxt);
-
+    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);

    /*
     * SAX: end of the document processing.
@ -5237,12 +5240,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 int
 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
              int terminate) {
-    if ((ctxt == NULL) || (ctxt->input == NULL))
+    if ((ctxt == NULL) ||
+        (ctxt->input == NULL) || (ctxt->input->buf == NULL) ||
+        (size < 0) ||
+        ((size > 0) && (chunk == NULL)))
 	return(XML_ERR_ARGUMENT);
    if (PARSER_STOPPED(ctxt) != 0)
        return(ctxt->errNo);
-    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
-        (ctxt->input->buf != NULL))  {
+
+    if (size > 0)  {
 	size_t pos = ctxt->input->cur - ctxt->input->base;
 	int res;

@ -5261,6 +5267,11 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
    if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
        htmlAutoCloseOnEnd(ctxt);

+        /*
+         * Only check for truncated multi-byte sequences
+         */
+        xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
+
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
            ctxt->sax->endDocument(ctxt->userData);

--- a/doc/libxml2-api.xml
+++ b/doc/libxml2-api.xml
@ -8107,13 +8107,14 @@ crash if you try to modify the tree)'/>
      <arg name='vctxt' type='void *' info='conversion context'/>
    </functype>
    <functype name='xmlCharEncConvFunc' file='encoding' module='encoding'>
-      <info>Convert between character encodings.  On success, the value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced.</info>
-      <return type='int' info='the number of bytes written or an XML_ENC_ERR code.'/>
+      <info>Convert between character encodings.  The value of @inlen after return is the number of bytes consumed and @outlen is the number of bytes produced.  If the converter can consume partial multi-byte sequences, the @flush flag can be used to detect truncated sequences at EOF. Otherwise, the flag can be ignored.</info>
+      <return type='int' info='a non-negative number on success or an XML_ENC_ERR code.'/>
+      <arg name='vctxt' type='void *' info='conversion context'/>
      <arg name='out' type='unsigned char *' info='a pointer to an array of bytes to store the result'/>
      <arg name='outlen' type='int *' info='the length of @out'/>
      <arg name='in' type='const unsigned char *' info='a pointer to an array of input bytes'/>
      <arg name='inlen' type='int *' info='the length of @in'/>
-      <arg name='vctxt' type='void *' info='conversion context'/>
+      <arg name='flush' type='int' info='end of input'/>
    </functype>
    <functype name='xmlCharEncConvImpl' file='encoding' module='encoding'>
      <info>If this function returns XML_ERR_OK, it must fill the @out pointer with an encoding handler. The handler can be obtained from xmlCharEncNewCustomHandler.</info>
--- a/encoding.c
+++ b/encoding.c
@ -113,35 +113,35 @@ static const xmlEncTableEntry xmlEncTable[] = {
 };

 static int
-asciiToAscii(unsigned char* out, int *outlen,
-             const unsigned char* in, int *inlen, void *vctxt);
+asciiToAscii(void *vctxt, unsigned char* out, int *outlen,
+             const unsigned char* in, int *inlen, int flush);
 static int
-UTF8ToUTF8(unsigned char* out, int *outlen,
-           const unsigned char* inb, int *inlenb, void *vctxt);
+UTF8ToUTF8(void *vctxt, unsigned char* out, int *outlen,
+           const unsigned char* inb, int *inlenb, int flush);
 static int
-latin1ToUTF8(unsigned char* out, int *outlen,
-             const unsigned char* in, int *inlen, void *vctxt);
+latin1ToUTF8(void *vctxt, unsigned char* out, int *outlen,
+             const unsigned char* in, int *inlen, int flush);
 static int
-UTF16LEToUTF8(unsigned char* out, int *outlen,
-              const unsigned char* inb, int *inlenb, void *vctxt);
+UTF16LEToUTF8(void *vctxt, unsigned char* out, int *outlen,
+              const unsigned char* inb, int *inlenb, int flush);
 static int
-UTF16BEToUTF8(unsigned char* out, int *outlen,
-              const unsigned char* inb, int *inlenb, void *vctxt);
+UTF16BEToUTF8(void *vctxt, unsigned char* out, int *outlen,
+              const unsigned char* inb, int *inlenb, int flush);

 #ifdef LIBXML_OUTPUT_ENABLED

 static int
-UTF8ToLatin1(unsigned char* outb, int *outlen,
-             const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToLatin1(void *vctxt, unsigned char* outb, int *outlen,
+             const unsigned char* in, int *inlen, int flush);
 static int
-UTF8ToUTF16(unsigned char* outb, int *outlen,
-            const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToUTF16(void *vctxt, unsigned char* outb, int *outlen,
+            const unsigned char* in, int *inlen, int flush);
 static int
-UTF8ToUTF16LE(unsigned char* outb, int *outlen,
-              const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToUTF16LE(void *vctxt, unsigned char* outb, int *outlen,
+              const unsigned char* in, int *inlen, int flush);
 static int
-UTF8ToUTF16BE(unsigned char* outb, int *outlen,
-              const unsigned char* in, int *inlen, void *vctxt);
+UTF8ToUTF16BE(void *vctxt, unsigned char* outb, int *outlen,
+              const unsigned char* in, int *inlen, int flush);

 #else /* LIBXML_OUTPUT_ENABLED */

@ -154,8 +154,8 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,

 #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
 static int
-UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
-                  const unsigned char *in, int *inlen, void *vctxt);
+UTF8ToHtmlWrapper(void *vctxt, unsigned char *out, int *outlen,
+                  const unsigned char *in, int *inlen, int flush);
 #else
 #define UTF8ToHtmlWrapper NULL
 #endif
@ -166,11 +166,11 @@ UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
 #include "iso8859x.inc"

 static int
-ISO8859xToUTF8(unsigned char* out, int *outlen,
-               const unsigned char* in, int *inlen, void *vctxt);
+ISO8859xToUTF8(void *vctxt, unsigned char* out, int *outlen,
+               const unsigned char* in, int *inlen, int flush);
 static int
-UTF8ToISO8859x(unsigned char *out, int *outlen,
-               const unsigned char *in, int *inlen, void *vctxt);
+UTF8ToISO8859x(void *vctxt, unsigned char *out, int *outlen,
+               const unsigned char *in, int *inlen, int flush);

 #define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, { ISO8859xToUTF8 }, { UTF8ToISO8859x }, \
@ -1073,6 +1073,7 @@ typedef struct {
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
+ * @flush:  end of input
 *
 * Returns an XML_ENC_ERR code.
 *
@ -1081,8 +1082,9 @@ typedef struct {
 * The value of @outlen after return is the number of octets produced.
 */
 static int
-xmlIconvConvert(unsigned char *out, int *outlen,
-                const unsigned char *in, int *inlen, void *vctxt) {
+xmlIconvConvert(void *vctxt, unsigned char *out, int *outlen,
+                const unsigned char *in, int *inlen,
+                int flush ATTRIBUTE_UNUSED) {
    xmlIconvCtxt *ctxt = vctxt;
    size_t icv_inlen, icv_outlen;
    const char *icv_in = (const char *) in;
@ -1293,6 +1295,7 @@ struct _uconv_t {
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
+ * @flush:  end of input
 *
 * Returns an XML_ENC_ERR code.
 *
@ -1301,8 +1304,8 @@ struct _uconv_t {
 * The value of @outlen after return is the number of octets produced.
 */
 static int
-xmlUconvConvert(unsigned char *out, int *outlen,
-                const unsigned char *in, int *inlen, void *vctxt) {
+xmlUconvConvert(void *vctxt, unsigned char *out, int *outlen,
+                const unsigned char *in, int *inlen, int flush) {
    xmlUconvCtxt *cd = vctxt;
    const char *ucv_in = (const char *) in;
    char *ucv_out = (char *) out;
@ -1317,14 +1320,10 @@ xmlUconvConvert(unsigned char *out, int *outlen,
    }

    /*
-     * Note that the ICU API is stateful. It can always consume a certain
-     * amount of input even if the output buffer would overflow. The
-     * remaining input must be processed by calling ucnv_convertEx with a
-     * possibly empty input buffer.
-     *
-     * ucnv_convertEx is always called with reset and flush set to 0,
-     * so we don't mess up the state. This should never generate
-     * U_TRUNCATED_CHAR_FOUND errors.
+     * The ICU API can consume input, including partial sequences,
+     * even if the output buffer would overflow. The remaining input
+     * must be processed by calling ucnv_convertEx with a possibly
+     * empty input buffer.
     */
    if (cd->isInput) {
        source = cd->uconv;
@ -1337,7 +1336,8 @@ xmlUconvConvert(unsigned char *out, int *outlen,
    ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
                   &ucv_in, ucv_in + *inlen, cd->pivot_buf,
                   &cd->pivot_source, &cd->pivot_target,
-                   cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
+                   cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
+                   /* reset */ 0, flush, &err);

    *inlen = ucv_in - (const char*) in;
    *outlen = ucv_out - (char *) out;
@ -1347,8 +1347,8 @@ xmlUconvConvert(unsigned char *out, int *outlen,
    } else {
        switch (err) {
            case U_TRUNCATED_CHAR_FOUND:
-                /* Shouldn't happen without flush */
-                ret = XML_ENC_ERR_SUCCESS;
+                /* Should only happen with flush */
+                ret = XML_ENC_ERR_INPUT;
                break;

            case U_BUFFER_OVERFLOW_ERROR:
@ -1510,6 +1510,7 @@ xmlEncConvertError(int code) {
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
+ * @flush:  end of input
 *
 * The value of @inlen after return is the number of octets consumed
 *     as the return value is 0, else unpredictable.
@ -1519,7 +1520,8 @@ xmlEncConvertError(int code) {
 */
 int
 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
-                 int *outlen, const unsigned char *in, int *inlen) {
+                 int *outlen, const unsigned char *in, int *inlen,
+                 int flush) {
    int ret;

    if (handler->flags & XML_HANDLER_LEGACY) {
@ -1534,6 +1536,7 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
        ret = func(out, outlen, in, inlen);
    } else {
        xmlCharEncConvFunc func = handler->input.func;
+        int oldInlen;

        if (func == NULL) {
            *outlen = 0;
@ -1541,7 +1544,14 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
            return(XML_ENC_ERR_INTERNAL);
        }

-        ret = func(out, outlen, in, inlen, handler->inputCtxt);
+        oldInlen = *inlen;
+        ret = func(handler->inputCtxt, out, outlen, in, inlen, flush);
+
+        /*
+         * Check for truncated multi-byte sequence.
+         */
+        if ((flush) && (ret == XML_ENC_ERR_SUCCESS) && (*inlen != oldInlen))
+            ret = XML_ENC_ERR_INPUT;
    }

    if (ret > 0)
@ -1588,7 +1598,7 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
            return(XML_ENC_ERR_INTERNAL);
        }

-        ret = func(out, outlen, in, inlen, handler->outputCtxt);
+        ret = func(handler->outputCtxt, out, outlen, in, inlen, /* flush */ 0);
    }

    if (ret > 0)
@ -1617,6 +1627,7 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 * xmlCharEncInput:
 * @input: a parser input buffer
 * @sizeOut:  pointer to output size
+ * @flush:  end of input
 *
 * @sizeOut should be set to the maximum output size (or SIZE_MAX).
 * After return, it is set to the number of bytes written.
@ -1626,7 +1637,7 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 * Returns an XML_ENC_ERR code.
 */
 int
-xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
+xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut, int flush)
 {
    xmlBufPtr out, in;
    const xmlChar *dataIn;
@ -1644,7 +1655,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
    *sizeOut = 0;

    availIn = xmlBufUse(in);
-    if (availIn == 0)
+    if ((availIn == 0) && (!flush))
        return(0);
    dataIn = xmlBufContent(in);
    totalIn = 0;
@ -1675,7 +1686,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
        }

        ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
-                               dataIn, &c_in);
+                               dataIn, &c_in, flush && completeIn);

        totalIn += c_in;
        dataIn += c_in;
@ -1750,7 +1761,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
        written = out->size - out->use - 1;
    }
    ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
-                           in->content, &toconv);
+                           in->content, &toconv, /* flush */ 0);
    xmlBufferShrink(in, toconv);
    out->use += written;
    out->content[out->use] = 0;
@ -2077,9 +2088,10 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
 ************************************************************************/

 static int
-asciiToAscii(unsigned char* out, int *poutlen,
+asciiToAscii(void *vctxt ATTRIBUTE_UNUSED,
+             unsigned char* out, int *poutlen,
             const unsigned char* in, int *pinlen,
-             void *vctxt ATTRIBUTE_UNUSED) {
+             int flush ATTRIBUTE_UNUSED) {
    const unsigned char *inend;
    const unsigned char *instart = in;
    int inlen, outlen, ret;
@ -2121,9 +2133,10 @@ asciiToAscii(unsigned char* out, int *poutlen,
 }

 static int
-latin1ToUTF8(unsigned char* out, int *outlen,
+latin1ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+             unsigned char* out, int *outlen,
             const unsigned char* in, int *inlen,
-             void *vctxt ATTRIBUTE_UNUSED) {
+             int flush ATTRIBUTE_UNUSED) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    unsigned char* outend;
@ -2180,13 +2193,15 @@ done:
 int
 xmlIsolat1ToUTF8(unsigned char* out, int *outlen,
                 const unsigned char* in, int *inlen) {
-    return(latin1ToUTF8(out, outlen, in, inlen, NULL));
+    return(latin1ToUTF8(/* ctxt */ NULL, out, outlen, in, inlen,
+                        /* flush */ 0));
 }

 static int
-UTF8ToUTF8(unsigned char* out, int *outlen,
+UTF8ToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+           unsigned char* out, int *outlen,
           const unsigned char* in, int *inlen,
-           void *vctxt ATTRIBUTE_UNUSED) {
+           int flush ATTRIBUTE_UNUSED) {
    int len;
    int ret;

@ -2214,9 +2229,10 @@ UTF8ToUTF8(unsigned char* out, int *outlen,

 #ifdef LIBXML_OUTPUT_ENABLED
 static int
-UTF8ToLatin1(unsigned char* out, int *outlen,
+UTF8ToLatin1(void *vctxt ATTRIBUTE_UNUSED,
+             unsigned char* out, int *outlen,
             const unsigned char* in, int *inlen,
-             void *vctxt ATTRIBUTE_UNUSED) {
+             int flush ATTRIBUTE_UNUSED) {
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
@ -2286,14 +2302,16 @@ xmlUTF8ToIsolat1(unsigned char* out, int *outlen,
    if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);

-    return(UTF8ToLatin1(out, outlen, in, inlen, NULL));
+    return(UTF8ToLatin1(/* ctxt */ NULL, out, outlen, in, inlen,
+                        /* flush */ 0));
 }
 #endif /* LIBXML_OUTPUT_ENABLED */

 static int
-UTF16LEToUTF8(unsigned char *out, int *outlen,
+UTF16LEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+              unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
-              void *vctxt ATTRIBUTE_UNUSED) {
+              int flush ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend = in + (*inlen & ~1);
    unsigned char *outstart = out;
@ -2360,9 +2378,10 @@ done:

 #ifdef LIBXML_OUTPUT_ENABLED
 static int
-UTF8ToUTF16LE(unsigned char *out, int *outlen,
+UTF8ToUTF16LE(void *vctxt ATTRIBUTE_UNUSED,
+              unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
-              void *vctxt ATTRIBUTE_UNUSED) {
+              int flush ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
@ -2462,9 +2481,10 @@ done:
 }

 static int
-UTF8ToUTF16(unsigned char* outb, int *outlen,
+UTF8ToUTF16(void *vctxt,
+            unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen,
-            void *vctxt ATTRIBUTE_UNUSED) {
+            int flush) {
    if (in == NULL) {
 	/*
 	 * initialization, add the Byte Order Mark for UTF-16LE
@ -2480,14 +2500,15 @@ UTF8ToUTF16(unsigned char* outb, int *outlen,
 	*inlen = 0;
 	return(0);
    }
-    return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));
+    return (UTF8ToUTF16LE(vctxt, outb, outlen, in, inlen, flush));
 }
 #endif /* LIBXML_OUTPUT_ENABLED */

 static int
-UTF16BEToUTF8(unsigned char *out, int *outlen,
+UTF16BEToUTF8(void *vctxt ATTRIBUTE_UNUSED,
+              unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
-              void *vctxt ATTRIBUTE_UNUSED) {
+              int flush ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend = in + (*inlen & ~1);
    unsigned char *outstart = out;
@ -2554,9 +2575,10 @@ done:

 #ifdef LIBXML_OUTPUT_ENABLED
 static int
-UTF8ToUTF16BE(unsigned char *out, int *outlen,
+UTF8ToUTF16BE(void *vctxt ATTRIBUTE_UNUSED,
+              unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
-              void *vctxt ATTRIBUTE_UNUSED) {
+              int flush ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
@ -2657,10 +2679,11 @@ done:

 #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
 static int
-UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
+UTF8ToHtmlWrapper(void *vctxt ATTRIBUTE_UNUSED,
+                  unsigned char *out, int *outlen,
                  const unsigned char *in, int *inlen,
-                  void *vctxt ATTRIBUTE_UNUSED) {
-    return(UTF8ToHtml(out, outlen, in, inlen));
+                  int flush ATTRIBUTE_UNUSED) {
+    return(htmlUTF8ToHtml(out, outlen, in, inlen));
 }
 #endif

@ -2668,8 +2691,10 @@ UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
    defined(LIBXML_ISO8859X_ENABLED)

 static int
-UTF8ToISO8859x(unsigned char *out, int *outlen,
-               const unsigned char *in, int *inlen, void *vctxt) {
+UTF8ToISO8859x(void *vctxt,
+               unsigned char *out, int *outlen,
+               const unsigned char *in, int *inlen,
+               int flush ATTRIBUTE_UNUSED) {
    const unsigned char *xlattable = vctxt;
    const unsigned char *instart = in;
    const unsigned char *inend;
@ -2748,8 +2773,10 @@ done:
 }

 static int
-ISO8859xToUTF8(unsigned char* out, int *outlen,
-               const unsigned char* in, int *inlen, void *vctxt) {
+ISO8859xToUTF8(void *vctxt,
+               unsigned char* out, int *outlen,
+               const unsigned char* in, int *inlen,
+               int flush ATTRIBUTE_UNUSED) {
    unsigned short const *unicodetable = vctxt;
    const unsigned char* instart = in;
    const unsigned char* inend;
--- a/example/icu.c
+++ b/example/icu.c
@ -31,8 +31,8 @@ typedef struct {
 } myConvCtxt;

 static int
-icuConvert(unsigned char *out, int *outlen,
-           const unsigned char *in, int *inlen, void *vctxt) {
+icuConvert(void *vctxt, unsigned char *out, int *outlen,
+           const unsigned char *in, int *inlen, int flush) {
    myConvCtxt *cd = vctxt;
    const char *ucv_in = (const char *) in;
    char *ucv_out = (char *) out;
@ -47,14 +47,10 @@ icuConvert(unsigned char *out, int *outlen,
    }

    /*
-     * Note that the ICU API is stateful. It can always consume a certain
-     * amount of input even if the output buffer would overflow. The
-     * remaining input must be processed by calling ucnv_convertEx with a
-     * possibly empty input buffer.
-     *
-     * ucnv_convertEx is always called with reset and flush set to 0,
-     * so we don't mess up the state. This should never generate
-     * U_TRUNCATED_CHAR_FOUND errors.
+     * The ICU API can consume input, including partial sequences,
+     * even if the output buffer would overflow. The remaining input
+     * must be processed by calling ucnv_convertEx with a possibly
+     * empty input buffer.
     */
    if (cd->isInput) {
        source = cd->uconv;
@ -67,7 +63,8 @@ icuConvert(unsigned char *out, int *outlen,
    ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
                   &ucv_in, ucv_in + *inlen, cd->pivot_buf,
                   &cd->pivot_source, &cd->pivot_target,
-                   cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
+                   cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
+                   /* reset */ 0, flush, &err);

    *inlen = ucv_in - (const char*) in;
    *outlen = ucv_out - (char *) out;
@ -77,8 +74,8 @@ icuConvert(unsigned char *out, int *outlen,
    } else {
        switch (err) {
            case U_TRUNCATED_CHAR_FOUND:
-                /* Shouldn't happen without flush */
-                ret = XML_ENC_ERR_SUCCESS;
+                /* Should only happen with flush */
+                ret = XML_ENC_ERR_INPUT;
                break;

            case U_BUFFER_OVERFLOW_ERROR:
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@ -126,17 +126,22 @@ typedef int (*xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
+ * @flush:  end of input
 *
 * Convert between character encodings.
 *
- * On success, the value of @inlen after return is the number of
- * bytes consumed and @outlen is the number of bytes produced.
+ * The value of @inlen after return is the number of bytes consumed
+ * and @outlen is the number of bytes produced.
 *
- * Returns the number of bytes written or an XML_ENC_ERR code.
+ * If the converter can consume partial multi-byte sequences, the
+ * @flush flag can be used to detect truncated sequences at EOF.
+ * Otherwise, the flag can be ignored.
+ *
+ * Returns a non-negative number on success or an XML_ENC_ERR code.
 */
 typedef int
-(*xmlCharEncConvFunc)(unsigned char *out, int *outlen,
-                      const unsigned char *in, int *inlen, void *vctxt);
+(*xmlCharEncConvFunc)(void *vctxt, unsigned char *out, int *outlen,
+                      const unsigned char *in, int *inlen, int flush);

 /**
 * xmlCharEncConvCtxtDtor:
--- a/include/private/enc.h
+++ b/include/private/enc.h
@ -9,9 +9,10 @@ xmlInitEncodingInternal(void);

 XML_HIDDEN int
 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
-                 int *outlen, const unsigned char *in, int *inlen);
+                 int *outlen, const unsigned char *in, int *inlen,
+                 int flush);
 XML_HIDDEN int
-xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut);
+xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut, int flush);
 XML_HIDDEN int
 xmlCharEncOutput(xmlOutputBufferPtr output, int init);

--- a/include/private/parser.h
+++ b/include/private/parser.h
@ -140,4 +140,7 @@ XML_HIDDEN xmlChar *
 xmlExpandEntitiesInAttValue(xmlParserCtxtPtr ctxt, const xmlChar *str,
                            int normalize);

+XML_HIDDEN void
+xmlParserCheckEOF(xmlParserCtxtPtr ctxt, xmlParserErrors code);
+
 #endif /* XML_PARSER_H_PRIVATE__ */
--- a/parser.c
+++ b/parser.c
@ -7300,9 +7300,7 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
    while (ctxt->inputNr > oldInputNr)
        xmlPopPE(ctxt);

-    if (RAW != 0) {
-	xmlFatalErr(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED, NULL);
-    }
+    xmlParserCheckEOF(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED);
 }

 /**
@ -9875,8 +9873,7 @@ xmlParseContent(xmlParserCtxtPtr ctxt) {

    xmlParseContentInternal(ctxt);

-    if (ctxt->input->cur < ctxt->input->end)
-	xmlFatalErr(ctxt, XML_ERR_NOT_WELL_BALANCED, NULL);
+    xmlParserCheckEOF(ctxt, XML_ERR_NOT_WELL_BALANCED);
 }

 /**
@ -10737,16 +10734,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
 	 */
 	xmlParseMisc(ctxt);

-        if (ctxt->input->cur < ctxt->input->end) {
-            if (ctxt->wellFormed)
-	        xmlFatalErr(ctxt, XML_ERR_DOCUMENT_END, NULL);
-        } else if ((ctxt->input->buf != NULL) &&
-                   (ctxt->input->buf->encoder != NULL) &&
-                   (ctxt->input->buf->error == 0) &&
-                   (!xmlBufIsEmpty(ctxt->input->buf->raw))) {
-            xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR,
-                           "Truncated multi-byte sequence at EOF\n");
-        }
+        xmlParserCheckEOF(ctxt, XML_ERR_DOCUMENT_END);
    }

    ctxt->instate = XML_PARSER_EOF;
@ -11596,11 +11584,8 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
                xmlFatalErrMsg(ctxt, XML_ERR_DOCUMENT_EMPTY,
                               "Start tag expected, '<' not found\n");
            }
-        } else if ((ctxt->input->buf->encoder != NULL) &&
-                   (ctxt->input->buf->error == 0) &&
-                   (!xmlBufIsEmpty(ctxt->input->buf->raw))) {
-            xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR,
-                           "Truncated multi-byte sequence at EOF\n");
+        } else {
+            xmlParserCheckEOF(ctxt, XML_ERR_DOCUMENT_END);
        }
 	if (ctxt->instate != XML_PARSER_EOF) {
            ctxt->instate = XML_PARSER_EOF;
--- a/parserInternals.c
+++ b/parserInternals.c
@ -596,6 +596,49 @@ xmlParserGrow(xmlParserCtxtPtr ctxt) {
    return(ret);
 }

+/**
+ * xmlParserCheckEOF:
+ * @ctxt:  parser ctxt
+ * @code:  error code raised when unparsed input is left over
+ *
+ * Does nothing if an error is already recorded. Otherwise raises @code
+ * for leftover input, then flushes the decoder and reports its errors.
+ */
+void
+xmlParserCheckEOF(xmlParserCtxtPtr ctxt, xmlParserErrors code) {
+    xmlParserInputPtr in = ctxt->input;
+    xmlParserInputBufferPtr buf;
+
+    if (ctxt->errNo != XML_ERR_OK)
+        return;
+
+    if (in->cur < in->end) {
+        xmlFatalErr(ctxt, code, NULL);
+        return;
+    }
+
+    buf = in->buf;
+    if ((buf != NULL) && (buf->encoder != NULL)) {
+        size_t curBase = in->cur - in->base;
+        size_t sizeOut = 64;
+        int ret;
+
+        /*
+         * Flush the decoder to detect a truncated multi-byte sequence
+         */
+        ret = xmlCharEncInput(buf, &sizeOut, /* flush */ 1);
+        xmlBufUpdateInput(buf->buffer, in, curBase);
+        if (ret < 0) {
+            xmlCtxtErrIO(ctxt, buf->error, NULL);
+            return;
+        }
+
+        /* Shouldn't happen: the flush yielded more input to parse */
+        if (in->cur < in->end)
+            xmlFatalErr(ctxt, XML_ERR_INTERNAL_ERROR, "expected EOF");
+    }
+}
+
 /**
 * xmlParserInputGrow:
 * @in:  an XML parser input
@ -1105,7 +1148,8 @@ xmlDetectEBCDIC(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr *hout) {
        return(res);
    outlen = sizeof(out) - 1;
    inlen = input->end - input->cur;
-    res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen);
+    res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen,
+                           /* flush */ 0);
    /*
     * Return the EBCDIC handler if decoding failed. The error will
     * be reported later.
@ -1354,7 +1398,7 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input,
            nbchars = SIZE_MAX;
        else
            nbchars = 4000 /* MINLEN */;
-        res = xmlCharEncInput(in, &nbchars);
+        res = xmlCharEncInput(in, &nbchars, /* flush */ 0);
        if (res < 0)
            code = in->error;
    }
--- a/result/errors/truncated-utf16.xml.ent
+++ b/result/errors/truncated-utf16.xml.ent
@ -1,3 +1,3 @@
-./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF
+./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding
 <d/>
    ^
--- a/result/errors/truncated-utf16.xml.err
+++ b/result/errors/truncated-utf16.xml.err
@ -1,3 +1,3 @@
-./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF
+./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding
 <d/>
    ^
--- a/result/errors/truncated-utf16.xml.str
+++ b/result/errors/truncated-utf16.xml.str
@ -1,4 +1,4 @@
-./test/errors/truncated-utf16.xml:1: parser error : Truncated multi-byte sequence at EOF
+./test/errors/truncated-utf16.xml:1: I/O error : Invalid bytes in character encoding
 <d/>
    ^
 ./test/errors/truncated-utf16.xml : failed to parse
--- a/testparser.c
+++ b/testparser.c
@ -952,11 +952,88 @@ testWindowsUri(void) {
 }
 #endif /* WIN32 */

+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
+static int
+testTruncatedMultiByte(void) {
+    const char xml[] =
+        "<?xml version=\"1.0\" encoding=\"EUC-JP\"?>\n"
+        "<doc/>\xC3"; /* input ends in a lone 0xC3 byte */
+#ifdef LIBXML_HTML_ENABLED
+    const char html[] =
+        "<meta charset=\"EUC-JP\">\n"
+        "<div/>\xC3"; /* input ends in a lone 0xC3 byte */
+#endif
+    xmlDocPtr doc;
+    const xmlError *error;
+    int err = 0;
+
+    xmlResetLastError();
+    doc = xmlReadDoc(BAD_CAST xml, NULL, NULL, XML_PARSE_NOERROR);
+    error = xmlGetLastError();
+    if (error == NULL || error->code != XML_ERR_INVALID_ENCODING) {
+        fprintf(stderr, "xml, pull: expected XML_ERR_INVALID_ENCODING\n");
+        err = 1;
+    }
+    xmlFreeDoc(doc);
+
+#ifdef LIBXML_HTML_ENABLED
+    xmlResetLastError();
+    doc = htmlReadDoc(BAD_CAST html, NULL, NULL, XML_PARSE_NOERROR);
+    error = xmlGetLastError();
+    if (error == NULL || error->code != XML_ERR_INVALID_ENCODING) {
+        fprintf(stderr, "html, pull: expected XML_ERR_INVALID_ENCODING\n");
+        err = 1;
+    }
+    xmlFreeDoc(doc);
+#endif /* LIBXML_HTML_ENABLED */
+
+#ifdef LIBXML_PUSH_ENABLED
+    {
+        xmlParserCtxtPtr ctxt;
+
+        ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL);
+        xmlCtxtSetOptions(ctxt, XML_PARSE_NOERROR);
+
+        xmlParseChunk(ctxt, xml, sizeof(xml) - 1, 0);
+        xmlParseChunk(ctxt, "", 0, 1); /* empty chunk, terminate */
+
+        if (ctxt->errNo != XML_ERR_INVALID_ENCODING) {
+            fprintf(stderr, "xml, push: expected XML_ERR_INVALID_ENCODING\n");
+            err = 1;
+        }
+
+        xmlFreeDoc(ctxt->myDoc);
+        xmlFreeParserCtxt(ctxt);
+
+#ifdef LIBXML_HTML_ENABLED
+        ctxt = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL,
+                                        XML_CHAR_ENCODING_NONE);
+        xmlCtxtSetOptions(ctxt, XML_PARSE_NOERROR);
+
+        htmlParseChunk(ctxt, html, sizeof(html) - 1, 0);
+        htmlParseChunk(ctxt, "", 0, 1); /* empty chunk, terminate */
+
+        if (ctxt->errNo != XML_ERR_INVALID_ENCODING) {
+            fprintf(stderr, "html, push: expected XML_ERR_INVALID_ENCODING\n");
+            err = 1;
+        }
+
+        xmlFreeDoc(ctxt->myDoc);
+        htmlFreeParserCtxt(ctxt);
+#endif /* LIBXML_HTML_ENABLED */
+    }
+#endif /* LIBXML_PUSH_ENABLED */
+
+    return err; /* 1 if any check above failed, else 0 */
+}
+#endif /* iconv || icu */
+
 static int charEncConvImplError;

 static int
-rot13Convert(unsigned char *out, int *outlen,
-             const unsigned char *in, int *inlen, void *vctxt) {
+rot13Convert(void *vctxt, unsigned char *out, int *outlen,
+             const unsigned char *in, int *inlen,
+             int flush ATTRIBUTE_UNUSED) {
    int *ctxt = vctxt;
    int inSize = *inlen;
    int outSize = *outlen;
@ -1075,6 +1152,9 @@ main(void) {
    err |= testBuildRelativeUri();
 #if defined(_WIN32) || defined(__CYGWIN__)
    err |= testWindowsUri();
+#endif
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
+    err |= testTruncatedMultiByte();
 #endif
    err |= testCharEncConvImpl();

--- a/xmlIO.c
+++ b/xmlIO.c
@ -2201,7 +2201,7 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in,
 	 * convert as much as possible to the parser reading buffer.
 	 */
        nbchars = SIZE_MAX;
-	if (xmlCharEncInput(in, &nbchars) < 0)
+	if (xmlCharEncInput(in, &nbchars, /* flush */ 0) < 0)
            return(-1);
        if (nbchars > INT_MAX)
            nbchars = INT_MAX;
@ -2312,7 +2312,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
        else
            sizeOut = SIZE_MAX;

-	if (xmlCharEncInput(in, &sizeOut) < 0)
+	if (xmlCharEncInput(in, &sizeOut, /* flush */ 0) < 0)
 	    return(-1);
        res = sizeOut;
    }