diff --git a/HTMLparser.c b/HTMLparser.c index e7ac8d41..ba3eb16d 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -6110,29 +6110,6 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif - -#if 0 - if ((terminate) || (ctxt->input->buf->buffer->use > 80)) - htmlParseTryOrFinish(ctxt, terminate); -#endif - } else if (ctxt->instate != XML_PARSER_EOF) { - if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { - xmlParserInputBufferPtr in = ctxt->input->buf; - if ((in->encoder != NULL) && (in->buffer != NULL) && - (in->raw != NULL)) { - int nbchars; - size_t pos = ctxt->input->cur - ctxt->input->base; - - nbchars = xmlCharEncInput(in, terminate); - xmlBufUpdateInput(in->buffer, ctxt->input, pos); - if (nbchars < 0) { - htmlParseErr(ctxt, in->error, - "encoder error\n", NULL, NULL); - xmlHaltParser(ctxt); - return(XML_ERR_INVALID_ENCODING); - } - } - } } htmlParseTryOrFinish(ctxt, terminate); if (terminate) { diff --git a/encoding.c b/encoding.c index b59df26e..7a19c2be 100644 --- a/encoding.c +++ b/encoding.c @@ -1915,7 +1915,6 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in - * @flush: if true, indicates end of input * * Returns an XML_ENC_ERR code. * @@ -1925,7 +1924,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, */ static int xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, - const unsigned char *in, int *inlen, int flush) { + const unsigned char *in, int *inlen) { const char *ucv_in = (const char *) in; char *ucv_out = (char *) out; UErrorCode err = U_ZERO_ERROR; @@ -1935,25 +1934,36 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, return(XML_ENC_ERR_INTERNAL); } + /* + * Note that the ICU API is stateful. It can always consume a certain + * amount of input even if the output buffer would overflow. The + * remaining input must be processed by calling ucnv_convertEx with a + * possibly empty input buffer. + * + * ucnv_convertEx is always called with reset and flush set to 0, + * so we don't mess up the state. This should never generate + * U_TRUNCATED_CHAR_FOUND errors. + * + * This also means that ICU xmlCharEncodingHandlers should never be + * reused. It would be a lot nicer if there was a way to emulate the + * stateless iconv API. + */ if (toUnicode) { /* encoding => UTF-16 => UTF-8 */ ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, &ucv_in, ucv_in + *inlen, cd->pivot_buf, &cd->pivot_source, &cd->pivot_target, - cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err); + cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err); } else { /* UTF-8 => UTF-16 => encoding */ ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, &ucv_in, ucv_in + *inlen, cd->pivot_buf, &cd->pivot_source, &cd->pivot_target, - cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err); + cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err); } *inlen = ucv_in - (const char*) in; *outlen = ucv_out - (char *) out; if (U_SUCCESS(err)) { - /* reset pivot buf if this is the last call for input (flush==TRUE) */ - if (flush) - cd->pivot_source = cd->pivot_target = cd->pivot_buf; return(XML_ENC_ERR_SUCCESS); } if (err == U_BUFFER_OVERFLOW_ERROR) @@ -2005,7 +2015,6 @@ xmlEncConvertError(int code) { * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in - * @flush: flush (ICU-related) * * Returns an XML_ENC_ERR code. * @@ -2015,9 +2024,8 @@ xmlEncConvertError(int code) { */ int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, - int *outlen, const unsigned char *in, int *inlen, int flush) { + int *outlen, const unsigned char *in, int *inlen) { int ret; - (void)flush; if (handler->input != NULL) { ret = handler->input(out, outlen, in, inlen); @@ -2031,8 +2039,7 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, #endif /* LIBXML_ICONV_ENABLED */ #ifdef LIBXML_ICU_ENABLED else if (handler->uconv_in != NULL) { - ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen, - flush); + ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen); } #endif /* LIBXML_ICU_ENABLED */ else { @@ -2041,8 +2048,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, ret = XML_ENC_ERR_INTERNAL; } - /* Ignore space and partial errors when reading. */ - if ((ret == XML_ENC_ERR_SPACE) || (ret == XML_ENC_ERR_PARTIAL)) + /* Ignore partial errors when reading. */ + if (ret == XML_ENC_ERR_PARTIAL) ret = XML_ENC_ERR_SUCCESS; return(ret); @@ -2079,8 +2086,7 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out, #endif /* LIBXML_ICONV_ENABLED */ #ifdef LIBXML_ICU_ENABLED else if (handler->uconv_out != NULL) { - ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen, - 1); + ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen); } #endif /* LIBXML_ICU_ENABLED */ else { @@ -2113,22 +2119,23 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, /** * xmlCharEncInput: * @input: a parser input buffer - * @flush: try to flush all the raw buffer * * Generic front-end for the encoding handler on parser input * * Returns the number of bytes written or an XML_ENC_ERR code. */ int -xmlCharEncInput(xmlParserInputBufferPtr input, int flush) +xmlCharEncInput(xmlParserInputBufferPtr input) { int ret; - size_t written; + size_t avail; size_t toconv; int c_in; int c_out; xmlBufPtr in; xmlBufPtr out; + const xmlChar *inData; + size_t inTotal = 0; if ((input == NULL) || (input->encoder == NULL) || (input->buffer == NULL) || (input->raw == NULL)) @@ -2139,25 +2146,34 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush) toconv = xmlBufUse(in); if (toconv == 0) return (0); - if ((toconv > 64 * 1024) && (flush == 0)) - toconv = 64 * 1024; - written = xmlBufAvail(out); - if (toconv * 2 >= written) { - if (xmlBufGrow(out, toconv * 2) < 0) { - input->error = XML_ERR_NO_MEMORY; - return(XML_ENC_ERR_MEMORY); - } - written = xmlBufAvail(out); - } - if ((written > 128 * 1024) && (flush == 0)) - written = 128 * 1024; + inData = xmlBufContent(in); + inTotal = 0; - c_in = toconv; - c_out = written; - ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in, flush); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); + do { + c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv; + + avail = xmlBufAvail(out); + if (avail > INT_MAX) + avail = INT_MAX; + if (avail < toconv * 2) { + if (xmlBufGrow(out, toconv * 2) < 0) { + input->error = XML_ERR_NO_MEMORY; + return(XML_ENC_ERR_MEMORY); + } + avail = xmlBufAvail(out); + } + + c_in = toconv; + c_out = avail; + ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, + inData, &c_in); + inTotal += c_in; + inData += c_in; + toconv -= c_in; + xmlBufAddLen(out, c_out); + } while (ret == XML_ENC_ERR_SPACE); + + xmlBufShrink(in, inTotal); if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in) input->rawconsumed = ULONG_MAX; @@ -2207,7 +2223,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, written = out->size - out->use - 1; } ret = xmlEncInputChunk(handler, &out->content[out->use], &written, - in->content, &toconv, 1); + in->content, &toconv); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; @@ -2273,8 +2289,6 @@ retry: * Conversion itself. */ toconv = xmlBufUse(in); - if (toconv == 0) - return (writtentot); if (toconv > 64 * 1024) toconv = 64 * 1024; if (toconv * 4 >= written) { @@ -2404,8 +2418,6 @@ retry: * Conversion itself. */ toconv = in->use; - if (toconv == 0) - return(0); if (toconv * 4 >= written) { xmlBufferGrow(out, toconv * 4); written = out->size - out->use - 1; diff --git a/include/private/enc.h b/include/private/enc.h index cbdc2b33..cd549145 100644 --- a/include/private/enc.h +++ b/include/private/enc.h @@ -9,9 +9,9 @@ xmlInitEncodingInternal(void); XML_HIDDEN int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, - int *outlen, const unsigned char *in, int *inlen, int flush); + int *outlen, const unsigned char *in, int *inlen); XML_HIDDEN int -xmlCharEncInput(xmlParserInputBufferPtr input, int flush); +xmlCharEncInput(xmlParserInputBufferPtr input); XML_HIDDEN int xmlCharEncOutput(xmlOutputBufferPtr output, int init); diff --git a/parser.c b/parser.c index 97c087e1..2ebc3799 100644 --- a/parser.c +++ b/parser.c @@ -10993,27 +10993,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1)) return(0); - if (ctxt->input == NULL) break; - if (ctxt->input->buf != NULL) { - /* - * If we are operating on converted input, try to flush - * remaining chars to avoid them stalling in the non-converted - * buffer. - */ - if ((ctxt->input->buf->raw != NULL) && - (xmlBufIsEmpty(ctxt->input->buf->raw) == 0)) { - size_t pos = ctxt->input->cur - ctxt->input->base; - int res; - - res = xmlParserInputBufferPush(ctxt->input->buf, 0, ""); - xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos); - if (res < 0) { - xmlFatalErr(ctxt, ctxt->input->buf->error, NULL); - xmlHaltParser(ctxt); - return(0); - } - } - } avail = ctxt->input->end - ctxt->input->cur; if (avail < 1) goto done; @@ -11667,24 +11646,6 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: pushed %d\n", size); #endif - - } else if (ctxt->instate != XML_PARSER_EOF) { - if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { - xmlParserInputBufferPtr in = ctxt->input->buf; - if ((in->encoder != NULL) && (in->buffer != NULL) && - (in->raw != NULL)) { - int nbchars; - size_t pos = ctxt->input->cur - ctxt->input->base; - - nbchars = xmlCharEncInput(in, terminate); - xmlBufUpdateInput(in->buffer, ctxt->input, pos); - if (nbchars < 0) { - xmlFatalErr(ctxt, in->error, NULL); - xmlHaltParser(ctxt); - return(ctxt->errNo); - } - } - } } xmlParseTryOrFinish(ctxt, terminate); diff --git a/parserInternals.c b/parserInternals.c index 706515ce..eccdb32e 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -1253,7 +1253,7 @@ xmlDetectEBCDIC(xmlParserInputPtr input) { return(NULL); outlen = sizeof(out) - 1; inlen = input->end - input->cur; - res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0); + res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen); if (res < 0) return(handler); out[outlen] = 0; @@ -1287,12 +1287,15 @@ xmlDetectEBCDIC(xmlParserInputPtr input) { break; out[i] = 0; xmlCharEncCloseFunc(handler); - handler = xmlFindCharEncodingHandler((char *) out + start); - break; + return(xmlFindCharEncodingHandler((char *) out + start)); } } - return(handler); + /* + * ICU handlers are stateful, so we have to recreate them. + */ + xmlCharEncCloseFunc(handler); + return(xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC)); } /** @@ -1420,17 +1423,7 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, in->buffer = xmlBufCreate(); in->rawconsumed = processed; - /* - * TODO: We must flush and decode the whole buffer to make functions - * like xmlReadMemory work with a user-provided encoding. If the - * encoding is specified directly, we should probably set - * XML_PARSE_IGNORE_ENC in xmlDoRead to avoid switching encodings - * twice. Then we could set "flush" to false which should save - * a considerable amount of memory when parsing from memory. - * It's probably even possible to remove this whole if-block - * completely. - */ - nbchars = xmlCharEncInput(in, 1); + nbchars = xmlCharEncInput(in); xmlBufResetInput(in->buffer, input); if (nbchars < 0) { /* TODO: This could be an out of memory or an encoding error. */ diff --git a/xmlIO.c b/xmlIO.c index 6d94d5ee..9c263814 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -3218,7 +3218,7 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, /* * convert as much as possible to the parser reading buffer. */ - nbchars = xmlCharEncInput(in, 1); + nbchars = xmlCharEncInput(in); if (nbchars < 0) return(-1); } else { @@ -3319,7 +3319,7 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) { } if (in->encoder != NULL) { - res = xmlCharEncInput(in, 1); + res = xmlCharEncInput(in); if (res < 0) return(-1); }