mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-03-25 10:50:08 +03:00
parser: Stream data when reading from memory
Don't create a copy of the whole input buffer. Read the data chunk by chunk to save memory. Historically, it was probably envisioned to read data from memory without additional copying. This doesn't work reliably with the current design of the XML parser, which requires a terminating null byte at the end of input buffers. This led to xmlReadMemory interfaces, which expect pointer and size arguments, being changed to make a zero-terminated copy of the input buffer. Interfaces based on xmlReadDoc, which actually expect a zero-terminated string and would make zero-copy operation work, were then simplified to rely on xmlReadMemory, resulting in an unnecessary copy. To avoid copying (possibly gigabytes of) memory temporarily, we now stream in-memory input just like content read from files in a chunk-by-chunk fashion (using a somewhat outdated INPUT_CHUNK size of 250 bytes). As a side effect, we also avoid another copy of the whole input when handling non-UTF-8 data, which was made possible by some earlier commits. Interfaces expecting zero-terminated strings now make use of strnlen which unfortunately isn't part of the standard C library and has only been mandated since POSIX 2008.
This commit is contained in:
parent
5aff27ae78
commit
834b8123ef
64
HTMLparser.c
64
HTMLparser.c
@ -32,6 +32,7 @@
|
||||
#include "private/enc.h"
|
||||
#include "private/error.h"
|
||||
#include "private/html.h"
|
||||
#include "private/io.h"
|
||||
#include "private/parser.h"
|
||||
#include "private/tree.h"
|
||||
|
||||
@ -5169,7 +5170,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) {
|
||||
|
||||
/**
|
||||
* htmlCreateDocParserCtxt:
|
||||
* @cur: a pointer to an array of xmlChar
|
||||
* @str: a pointer to an array of xmlChar
|
||||
* @encoding: a free form C string describing the HTML document encoding, or NULL
|
||||
*
|
||||
* Create a parser context for an HTML document.
|
||||
@ -5179,17 +5180,37 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) {
|
||||
* Returns the new parser context or NULL
|
||||
*/
|
||||
static htmlParserCtxtPtr
|
||||
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
|
||||
int len;
|
||||
htmlParserCtxtPtr ctxt;
|
||||
htmlCreateDocParserCtxt(const xmlChar *str, const char *encoding) {
|
||||
xmlParserCtxtPtr ctxt;
|
||||
xmlParserInputPtr input;
|
||||
xmlParserInputBufferPtr buf;
|
||||
|
||||
if (cur == NULL)
|
||||
if (str == NULL)
|
||||
return(NULL);
|
||||
len = xmlStrlen(cur);
|
||||
ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
|
||||
|
||||
ctxt = htmlNewParserCtxt();
|
||||
if (ctxt == NULL)
|
||||
return(NULL);
|
||||
|
||||
buf = xmlParserInputBufferCreateString(str);
|
||||
if (buf == NULL) {
|
||||
xmlFreeParserCtxt(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
input = xmlNewInputStream(ctxt);
|
||||
if (input == NULL) {
|
||||
xmlFreeParserInputBuffer(buf);
|
||||
xmlFreeParserCtxt(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
input->filename = NULL;
|
||||
input->buf = buf;
|
||||
xmlBufResetInput(buf->buffer, input);
|
||||
|
||||
inputPush(ctxt, input);
|
||||
|
||||
if (encoding != NULL) {
|
||||
xmlCharEncoding enc;
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
@ -5219,6 +5240,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return(ctxt);
|
||||
}
|
||||
|
||||
@ -6932,13 +6954,33 @@ htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
|
||||
* Returns the resulting document tree
|
||||
*/
|
||||
htmlDocPtr
|
||||
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
|
||||
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
|
||||
const char *URL, const char *encoding, int options)
|
||||
{
|
||||
if (cur == NULL)
|
||||
xmlParserInputBufferPtr input;
|
||||
xmlParserInputPtr stream;
|
||||
|
||||
if (ctxt == NULL)
|
||||
return (NULL);
|
||||
return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
|
||||
encoding, options));
|
||||
if (str == NULL)
|
||||
return (NULL);
|
||||
xmlInitParser();
|
||||
|
||||
htmlCtxtReset(ctxt);
|
||||
|
||||
input = xmlParserInputBufferCreateString(str);
|
||||
if (input == NULL) {
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
|
||||
if (stream == NULL) {
|
||||
xmlFreeParserInputBuffer(input);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
inputPush(ctxt, stream);
|
||||
return (htmlDoRead(ctxt, URL, encoding, options, 1));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -364,7 +364,7 @@ xmlFuzzEntityLoader(const char *URL, const char *ID ATTRIBUTE_UNUSED,
|
||||
return(NULL);
|
||||
}
|
||||
input->base = input->cur = xmlBufContent(input->buf->buffer);
|
||||
input->end = input->base + entity->size;
|
||||
input->end = input->base + xmlBufUse(input->buf->buffer);
|
||||
|
||||
return input;
|
||||
}
|
||||
|
@ -11,6 +11,9 @@ XML_HIDDEN void
|
||||
__xmlLoaderErr(void *ctx, const char *msg,
|
||||
const char *filename) LIBXML_ATTR_FORMAT(2,0);
|
||||
|
||||
xmlParserInputBufferPtr
|
||||
xmlParserInputBufferCreateString(const xmlChar *str);
|
||||
|
||||
#ifdef LIBXML_OUTPUT_ENABLED
|
||||
XML_HIDDEN xmlOutputBufferPtr
|
||||
xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
|
||||
|
72
parser.c
72
parser.c
@ -12468,7 +12468,6 @@ xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt,
|
||||
xmlSAXHandlerPtr oldsax = NULL;
|
||||
xmlNodePtr content = NULL;
|
||||
xmlNodePtr last = NULL;
|
||||
int size;
|
||||
xmlParserErrors ret = XML_ERR_OK;
|
||||
#ifdef SAX2
|
||||
int i;
|
||||
@ -12487,9 +12486,7 @@ xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt,
|
||||
if (string == NULL)
|
||||
return(XML_ERR_INTERNAL_ERROR);
|
||||
|
||||
size = xmlStrlen(string);
|
||||
|
||||
ctxt = xmlCreateMemoryParserCtxt((char *) string, size);
|
||||
ctxt = xmlCreateDocParserCtxt(string);
|
||||
if (ctxt == NULL) return(XML_WAR_UNDECLARED_ENTITY);
|
||||
ctxt->nbErrors = oldctxt->nbErrors;
|
||||
ctxt->nbWarnings = oldctxt->nbWarnings;
|
||||
@ -12896,7 +12893,6 @@ xmlParseBalancedChunkMemoryRecover(xmlDocPtr doc, xmlSAXHandlerPtr sax,
|
||||
xmlDocPtr newDoc;
|
||||
xmlSAXHandlerPtr oldsax = NULL;
|
||||
xmlNodePtr content, newRoot;
|
||||
int size;
|
||||
int ret = 0;
|
||||
|
||||
if (depth > 40) {
|
||||
@ -12909,9 +12905,7 @@ xmlParseBalancedChunkMemoryRecover(xmlDocPtr doc, xmlSAXHandlerPtr sax,
|
||||
if (string == NULL)
|
||||
return(-1);
|
||||
|
||||
size = xmlStrlen(string);
|
||||
|
||||
ctxt = xmlCreateMemoryParserCtxt((char *) string, size);
|
||||
ctxt = xmlCreateDocParserCtxt(string);
|
||||
if (ctxt == NULL) return(-1);
|
||||
ctxt->userData = ctxt;
|
||||
if (sax != NULL) {
|
||||
@ -13721,13 +13715,37 @@ int xmlSAXUserParseMemory(xmlSAXHandlerPtr sax, void *user_data,
|
||||
* Returns the new parser context or NULL
|
||||
*/
|
||||
xmlParserCtxtPtr
|
||||
xmlCreateDocParserCtxt(const xmlChar *cur) {
|
||||
int len;
|
||||
xmlCreateDocParserCtxt(const xmlChar *str) {
|
||||
xmlParserCtxtPtr ctxt;
|
||||
xmlParserInputPtr input;
|
||||
xmlParserInputBufferPtr buf;
|
||||
|
||||
if (cur == NULL)
|
||||
if (str == NULL)
|
||||
return(NULL);
|
||||
len = xmlStrlen(cur);
|
||||
return(xmlCreateMemoryParserCtxt((const char *)cur, len));
|
||||
|
||||
ctxt = xmlNewParserCtxt();
|
||||
if (ctxt == NULL)
|
||||
return(NULL);
|
||||
|
||||
buf = xmlParserInputBufferCreateString(str);
|
||||
if (buf == NULL) {
|
||||
xmlFreeParserCtxt(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
input = xmlNewInputStream(ctxt);
|
||||
if (input == NULL) {
|
||||
xmlFreeParserInputBuffer(buf);
|
||||
xmlFreeParserCtxt(ctxt);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
input->filename = NULL;
|
||||
input->buf = buf;
|
||||
xmlBufResetInput(input->buf->buffer, input);
|
||||
|
||||
inputPush(ctxt, input);
|
||||
return(ctxt);
|
||||
}
|
||||
|
||||
#ifdef LIBXML_SAX1_ENABLED
|
||||
@ -14540,13 +14558,33 @@ xmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
|
||||
* Returns the resulting document tree
|
||||
*/
|
||||
xmlDocPtr
|
||||
xmlCtxtReadDoc(xmlParserCtxtPtr ctxt, const xmlChar * cur,
|
||||
xmlCtxtReadDoc(xmlParserCtxtPtr ctxt, const xmlChar *str,
|
||||
const char *URL, const char *encoding, int options)
|
||||
{
|
||||
if (cur == NULL)
|
||||
xmlParserInputBufferPtr input;
|
||||
xmlParserInputPtr stream;
|
||||
|
||||
if (ctxt == NULL)
|
||||
return (NULL);
|
||||
return (xmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
|
||||
encoding, options));
|
||||
if (str == NULL)
|
||||
return (NULL);
|
||||
xmlInitParser();
|
||||
|
||||
xmlCtxtReset(ctxt);
|
||||
|
||||
input = xmlParserInputBufferCreateString(str);
|
||||
if (input == NULL) {
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
|
||||
if (stream == NULL) {
|
||||
xmlFreeParserInputBuffer(input);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
inputPush(ctxt, stream);
|
||||
return (xmlDoRead(ctxt, URL, encoding, options, 1));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -546,9 +546,6 @@ xmlParserGrow(xmlParserCtxtPtr ctxt) {
|
||||
/* Don't grow push parser buffer. */
|
||||
if (ctxt->progressive)
|
||||
return(0);
|
||||
/* Don't grow memory buffers. */
|
||||
if ((buf->encoder == NULL) && (buf->readcallback == NULL))
|
||||
return(0);
|
||||
if (buf->error != 0)
|
||||
return(-1);
|
||||
|
||||
@ -603,10 +600,6 @@ xmlParserInputGrow(xmlParserInputPtr in, int len) {
|
||||
if (in->cur == NULL) return(-1);
|
||||
if (in->buf->buffer == NULL) return(-1);
|
||||
|
||||
/* Don't grow memory buffers. */
|
||||
if ((in->buf->encoder == NULL) && (in->buf->readcallback == NULL))
|
||||
return(0);
|
||||
|
||||
CHECK_BUFFER(in);
|
||||
|
||||
indx = in->cur - in->base;
|
||||
@ -1838,9 +1831,7 @@ xmlNewStringInputStream(xmlParserCtxtPtr ctxt, const xmlChar *buffer) {
|
||||
if (xmlParserDebugEntities)
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"new fixed input: %.30s\n", buffer);
|
||||
buf = xmlParserInputBufferCreateMem((const char *) buffer,
|
||||
xmlStrlen(buffer),
|
||||
XML_CHAR_ENCODING_NONE);
|
||||
buf = xmlParserInputBufferCreateString(buffer);
|
||||
if (buf == NULL) {
|
||||
xmlErrMemory(ctxt, NULL);
|
||||
return(NULL);
|
||||
|
108
xmlIO.c
108
xmlIO.c
@ -2909,6 +2909,31 @@ xmlParserInputBufferCreateFd(int fd, xmlCharEncoding enc) {
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/*
 * Read-callback state for an in-memory input.
 *
 * xmlMemRead() advances `mem` and decrements `size` by the number of
 * bytes it hands out, so `mem` is the next unread byte and `size` the
 * number of bytes still unread.
 */
typedef struct {
|
||||
const char *mem;
|
||||
size_t size;
|
||||
} xmlMemIOCtxt;
|
||||
|
||||
/*
 * xmlMemRead:
 * @vctxt: the xmlMemIOCtxt describing the remaining input
 * @buf: destination buffer
 * @size: maximum number of bytes to copy
 *
 * Read callback for memory inputs. Copies up to @size bytes of the
 * remaining input into @buf and advances the context past them.
 *
 * Returns the number of bytes copied, which is 0 once no input is left.
 */
static int
|
||||
xmlMemRead(void *vctxt, char *buf, int size) {
|
||||
xmlMemIOCtxt *ctxt = vctxt;
|
||||
|
||||
/* Clamp the request to the number of bytes still unread. */
if ((size_t) size > ctxt->size)
|
||||
size = ctxt->size;
|
||||
|
||||
memcpy(buf, ctxt->mem, size);
|
||||
/* Consume the copied bytes so the next call continues after them. */
ctxt->mem += size;
|
||||
ctxt->size -= size;
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
/*
 * xmlMemClose:
 * @vctxt: the xmlMemIOCtxt allocated by xmlParserInputBufferCreateMem()
 *
 * Close callback for memory inputs. Frees the context structure only;
 * the memory block it pointed into is not touched here.
 *
 * Returns 0 always.
 */
static int
|
||||
xmlMemClose(void *vctxt) {
|
||||
xmlFree(vctxt);
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlParserInputBufferCreateMem:
|
||||
* @mem: the memory input
|
||||
@ -2923,22 +2948,26 @@ xmlParserInputBufferCreateFd(int fd, xmlCharEncoding enc) {
|
||||
xmlParserInputBufferPtr
|
||||
xmlParserInputBufferCreateMem(const char *mem, int size, xmlCharEncoding enc) {
|
||||
xmlParserInputBufferPtr ret;
|
||||
int errcode;
|
||||
xmlMemIOCtxt *ctxt;
|
||||
|
||||
if (size < 0) return(NULL);
|
||||
if (mem == NULL) return(NULL);
|
||||
|
||||
ret = xmlAllocParserInputBuffer(enc);
|
||||
if (ret != NULL) {
|
||||
ret->context = (void *) mem;
|
||||
ret->readcallback = NULL;
|
||||
ret->closecallback = NULL;
|
||||
errcode = xmlBufAdd(ret->buffer, (const xmlChar *) mem, size);
|
||||
if (errcode != 0) {
|
||||
xmlFreeParserInputBuffer(ret);
|
||||
return(NULL);
|
||||
}
|
||||
if (ret == NULL)
|
||||
return(NULL);
|
||||
|
||||
ctxt = xmlMalloc(sizeof(*ctxt));
|
||||
if (ctxt == NULL) {
|
||||
xmlFreeParserInputBuffer(ret);
|
||||
return(NULL);
|
||||
}
|
||||
ctxt->mem = mem;
|
||||
ctxt->size = size;
|
||||
|
||||
ret->context = ctxt;
|
||||
ret->readcallback = xmlMemRead;
|
||||
ret->closecallback = xmlMemClose;
|
||||
|
||||
return(ret);
|
||||
}
|
||||
@ -2959,6 +2988,65 @@ xmlParserInputBufferCreateStatic(const char *mem, int size,
|
||||
return(xmlParserInputBufferCreateMem(mem, size, enc));
|
||||
}
|
||||
|
||||
/*
 * Read-callback state for a zero-terminated string input.
 *
 * xmlStringRead() advances `str` past the bytes it has handed out; once
 * all data is consumed it points at the terminating 0 byte.
 */
typedef struct {
|
||||
const xmlChar *str;
|
||||
} xmlStringIOCtxt;
|
||||
|
||||
/*
 * xmlStringRead:
 * @vctxt: the xmlStringIOCtxt holding the current read position
 * @buf: destination buffer
 * @size: maximum number of bytes to copy
 *
 * Read callback for zero-terminated string inputs. Copies at most @size
 * bytes into @buf, stopping before the terminating 0 byte, which is
 * never copied.
 *
 * Returns the number of bytes copied, which is 0 once the terminator
 * has been reached.
 */
static int
|
||||
xmlStringRead(void *vctxt, char *buf, int size) {
|
||||
xmlStringIOCtxt *ctxt = vctxt;
|
||||
const xmlChar *zero;
|
||||
size_t len;
|
||||
|
||||
/* Look for the terminator within the next `size` bytes. */
zero = memchr(ctxt->str, 0, size);
|
||||
/* No terminator found: a full chunk of `size` bytes is available. */
len = zero ? zero - ctxt->str : size;
|
||||
|
||||
memcpy(buf, ctxt->str, len);
|
||||
ctxt->str += len;
|
||||
|
||||
return(len);
|
||||
}
|
||||
|
||||
/*
 * xmlStringClose:
 * @vctxt: the xmlStringIOCtxt allocated by
 *         xmlParserInputBufferCreateString()
 *
 * Close callback for string inputs. Frees the context structure only;
 * the string it pointed into is not touched here.
 *
 * Returns 0 always.
 */
static int
|
||||
xmlStringClose(void *vctxt) {
|
||||
xmlFree(vctxt);
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlParserInputBufferCreateString:
|
||||
* @str: a null-terminated string
|
||||
*
|
||||
* Create a buffered parser input for the progressive parsing for the input
|
||||
* from a null-terminated C string.
|
||||
*
|
||||
* Returns the new parser input or NULL
|
||||
*/
|
||||
xmlParserInputBufferPtr
|
||||
xmlParserInputBufferCreateString(const xmlChar *str) {
|
||||
xmlParserInputBufferPtr ret;
|
||||
xmlStringIOCtxt *ctxt;
|
||||
|
||||
if (str == NULL) return(NULL);
|
||||
|
||||
ret = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE);
|
||||
if (ret == NULL)
|
||||
return(NULL);
|
||||
|
||||
ctxt = xmlMalloc(sizeof(*ctxt));
|
||||
if (ctxt == NULL) {
|
||||
/* Release the buffer allocated above before reporting failure. */
xmlFreeParserInputBuffer(ret);
|
||||
return(NULL);
|
||||
}
|
||||
/*
 * Only the pointer is stored; the string is not copied here.
 * NOTE(review): this implies @str must stay valid for as long as the
 * returned buffer is read from - confirm against callers.
 */
ctxt->str = str;
|
||||
|
||||
/*
 * Data is pulled on demand through xmlStringRead(); xmlStringClose()
 * frees ctxt.
 */
ret->context = ctxt;
|
||||
ret->readcallback = xmlStringRead;
|
||||
ret->closecallback = xmlStringClose;
|
||||
|
||||
return(ret);
|
||||
}
|
||||
|
||||
#ifdef LIBXML_OUTPUT_ENABLED
|
||||
/**
|
||||
* xmlOutputBufferCreateFd:
|
||||
|
Loading…
x
Reference in New Issue
Block a user