1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-23 02:50:08 +03:00

encoding: Add sizeOut argument to xmlCharEncInput

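When push parsing, we want to convert as much of the input as possible.
When pull parsing memory buffers, we want to convert data chunk by chunk
to save memory. The new sizeOut argument lets the caller cap the number of
bytes converted per call: xmlParserInputBufferPush passes SIZE_MAX, while
xmlParserInputBufferGrow passes the requested length when the buffer has
no read callback (i.e. when reading from memory) and SIZE_MAX otherwise.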
This commit is contained in:
Nick Wellnhofer 2024-07-07 18:38:31 +02:00
parent 8e871a31f8
commit 34c9108f15
5 changed files with 126 additions and 80 deletions

View File

@ -1537,75 +1537,104 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
/**
* xmlCharEncInput:
* @input: a parser input buffer
* @sizeOut: pointer to output size
*
* @sizeOut should be set to the maximum output size (or SIZE_MAX).
* After return, it is set to the number of bytes written.
*
* Generic front-end for the encoding handler on parser input
*
* Returns the number of bytes written or an XML_ENC_ERR code.
* Returns an XML_ENC_ERR code.
*/
int
xmlCharEncInput(xmlParserInputBufferPtr input)
xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
{
xmlBufPtr out, in;
const xmlChar *dataIn;
size_t availIn;
size_t maxOut;
size_t totalIn, totalOut;
int ret;
size_t avail;
size_t toconv;
int c_in;
int c_out;
xmlBufPtr in;
xmlBufPtr out;
const xmlChar *inData;
size_t inTotal = 0;
if ((input == NULL) || (input->encoder == NULL) ||
(input->buffer == NULL) || (input->raw == NULL))
return(XML_ENC_ERR_INTERNAL);
out = input->buffer;
in = input->raw;
toconv = xmlBufUse(in);
if (toconv == 0)
return (0);
inData = xmlBufContent(in);
inTotal = 0;
maxOut = *sizeOut;
totalOut = 0;
do {
c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv;
*sizeOut = 0;
avail = xmlBufAvail(out);
if (avail > INT_MAX)
avail = INT_MAX;
if (avail < 4096) {
availIn = xmlBufUse(in);
if (availIn == 0)
return(0);
dataIn = xmlBufContent(in);
totalIn = 0;
while (1) {
size_t availOut;
int completeOut, completeIn;
int c_out, c_in;
availOut = xmlBufAvail(out);
if (availOut > INT_MAX / 2)
availOut = INT_MAX / 2;
if (availOut < maxOut) {
c_out = availOut;
completeOut = 0;
} else {
c_out = maxOut;
completeOut = 1;
}
if (availIn > INT_MAX / 2) {
c_in = INT_MAX / 2;
completeIn = 0;
} else {
c_in = availIn;
completeIn = 1;
}
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
dataIn, &c_in);
totalIn += c_in;
dataIn += c_in;
availIn -= c_in;
totalOut += c_out;
maxOut -= c_out;
xmlBufAddLen(out, c_out);
if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) {
input->error = xmlEncConvertError(ret);
return(ret);
}
if ((completeOut) && (completeIn))
break;
if ((completeOut) && (ret == XML_ENC_ERR_SPACE))
break;
if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS))
break;
if (ret == XML_ENC_ERR_SPACE) {
if (xmlBufGrow(out, 4096) < 0) {
input->error = XML_ERR_NO_MEMORY;
return(XML_ENC_ERR_MEMORY);
}
avail = xmlBufAvail(out);
}
c_in = toconv;
c_out = avail;
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
inData, &c_in);
inTotal += c_in;
inData += c_in;
toconv -= c_in;
xmlBufAddLen(out, c_out);
} while (ret == XML_ENC_ERR_SPACE);
xmlBufShrink(in, inTotal);
if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in)
input->rawconsumed = ULONG_MAX;
else
input->rawconsumed += c_in;
if (((ret != 0) && (c_out == 0)) ||
(ret == XML_ENC_ERR_MEMORY)) {
if (input->error == 0)
input->error = xmlEncConvertError(ret);
return(ret);
}
return (c_out);
xmlBufShrink(in, totalIn);
if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn)
input->rawconsumed = ULONG_MAX;
else
input->rawconsumed += totalIn;
*sizeOut = totalOut;
return(XML_ERR_OK);
}
/**

View File

@ -11,7 +11,7 @@ XML_HIDDEN int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
int *outlen, const unsigned char *in, int *inlen);
XML_HIDDEN int
xmlCharEncInput(xmlParserInputBufferPtr input);
xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut);
XML_HIDDEN int
xmlCharEncOutput(xmlOutputBufferPtr output, int init);

View File

@ -11561,14 +11561,18 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
int terminate) {
size_t curBase;
size_t maxLength;
size_t pos;
int end_in_lf = 0;
int res;
if ((ctxt == NULL) || (size < 0))
return(XML_ERR_ARGUMENT);
if ((chunk == NULL) && (size > 0))
return(XML_ERR_ARGUMENT);
if ((ctxt->input == NULL) || (ctxt->input->buf == NULL))
return(XML_ERR_ARGUMENT);
if (ctxt->disableSAX != 0)
return(ctxt->errNo);
if (ctxt->input == NULL)
return(XML_ERR_INTERNAL_ERROR);
ctxt->input->flags |= XML_INPUT_PROGRESSIVE;
if (ctxt->instate == XML_PARSER_START)
@ -11579,18 +11583,17 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
size--;
}
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
(ctxt->input->buf != NULL)) {
size_t pos = ctxt->input->cur - ctxt->input->base;
int res;
res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
if (res < 0) {
xmlCtxtErrIO(ctxt, ctxt->input->buf->error, NULL);
xmlHaltParser(ctxt);
return(ctxt->errNo);
}
/*
* Also push an empty chunk to make sure that the raw buffer
* will be flushed if there is an encoder.
*/
pos = ctxt->input->cur - ctxt->input->base;
res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
if (res < 0) {
xmlCtxtErrIO(ctxt, ctxt->input->buf->error, NULL);
xmlHaltParser(ctxt);
return(ctxt->errNo);
}
xmlParseTryOrFinish(ctxt, terminate);
@ -11608,11 +11611,8 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1))
return(ctxt->errNo);
if ((end_in_lf == 1) && (ctxt->input != NULL) &&
(ctxt->input->buf != NULL)) {
size_t pos = ctxt->input->cur - ctxt->input->base;
int res;
if (end_in_lf == 1) {
pos = ctxt->input->cur - ctxt->input->base;
res = xmlParserInputBufferPush(ctxt->input->buf, 1, "\r");
xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
if (res < 0) {
@ -11639,8 +11639,7 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
xmlFatalErrMsg(ctxt, XML_ERR_DOCUMENT_EMPTY,
"Start tag expected, '<' not found\n");
}
} else if ((ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder != NULL) &&
} else if ((ctxt->input->buf->encoder != NULL) &&
(ctxt->input->buf->error == 0) &&
(!xmlBufIsEmpty(ctxt->input->buf->raw))) {
xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR,

View File

@ -1273,7 +1273,6 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler) {
xmlParserInputBufferPtr in;
xmlBufPtr buf;
int nbchars;
int code = XML_ERR_OK;
if ((input == NULL) || (input->buf == NULL)) {
@ -1326,6 +1325,8 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input,
*/
if (input->end > input->base) {
size_t processed;
size_t nbchars;
int res;
/*
* Shrink the current input buffer.
@ -1336,8 +1337,9 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input,
input->consumed += processed;
in->rawconsumed = processed;
nbchars = xmlCharEncInput(in);
if (nbchars < 0)
nbchars = 4000 /* MINLEN */;
res = xmlCharEncInput(in, &nbchars);
if (res < 0)
code = in->error;
}

28
xmlIO.c
View File

@ -48,6 +48,10 @@
#include "private/error.h"
#include "private/io.h"
#ifndef SIZE_MAX
#define SIZE_MAX ((size_t) -1)
#endif
/* #define VERBOSE_FAILURE */
#define MINLEN 4000
@ -2105,7 +2109,7 @@ xmlOutputBufferCreateFilenameDefault(xmlOutputBufferCreateFilenameFunc func)
int
xmlParserInputBufferPush(xmlParserInputBufferPtr in,
int len, const char *buf) {
int nbchars = 0;
size_t nbchars = 0;
int ret;
if (len < 0) return(0);
@ -2130,9 +2134,11 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in,
/*
* convert as much as possible to the parser reading buffer.
*/
nbchars = xmlCharEncInput(in);
if (nbchars < 0)
return(-1);
nbchars = SIZE_MAX;
if (xmlCharEncInput(in, &nbchars) < 0)
return(-1);
if (nbchars > INT_MAX)
nbchars = INT_MAX;
} else {
nbchars = len;
ret = xmlBufAdd(in->buffer, (xmlChar *) buf, nbchars);
@ -2229,9 +2235,19 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
}
if (in->encoder != NULL) {
res = xmlCharEncInput(in);
if (res < 0)
size_t sizeOut;
/*
* Don't convert whole buffer when reading from memory.
*/
if (in->readcallback == NULL)
sizeOut = len;
else
sizeOut = SIZE_MAX;
if (xmlCharEncInput(in, &sizeOut) < 0)
return(-1);
res = sizeOut;
}
return(res);
}