mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2024-10-26 12:25:09 +03:00
Adding new encoding function to deal with the new structures
* encoding.c: adds xmlCharEncFirstLineInput, xmlCharEncInput and xmlCharEncOutput * enc.h: the functions are not made public but added to this new header
This commit is contained in:
parent
ade10f2c57
commit
18d0db2503
29
enc.h
Normal file
29
enc.h
Normal file
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* enc.h: Internal Interfaces for encoding in libxml2
|
||||
*
|
||||
* See Copyright for the status of this software.
|
||||
*
|
||||
* daniel@veillard.com
|
||||
*/
|
||||
|
||||
#ifndef __XML_ENC_H__
|
||||
#define __XML_ENC_H__
|
||||
|
||||
#include <libxml/tree.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||||
xmlBufferPtr in, int len);
|
||||
int xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len);
|
||||
int xmlCharEncInput(xmlParserInputBufferPtr input);
|
||||
int xmlCharEncOutput(xmlOutputBufferPtr output, int init);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif /* __XML_ENC_H__ */
|
||||
|
||||
|
483
encoding.c
483
encoding.c
@ -24,6 +24,7 @@
|
||||
#include "libxml.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#ifdef HAVE_CTYPE_H
|
||||
#include <ctype.h>
|
||||
@ -44,6 +45,9 @@
|
||||
#include <libxml/globals.h>
|
||||
#include <libxml/xmlerror.h>
|
||||
|
||||
#include "buf.h"
|
||||
#include "enc.h"
|
||||
|
||||
static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
|
||||
static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
|
||||
|
||||
@ -1897,9 +1901,6 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
||||
* The real API used by libxml for on-the-fly conversion *
|
||||
* *
|
||||
************************************************************************/
|
||||
int
|
||||
xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||||
xmlBufferPtr in, int len);
|
||||
|
||||
/**
|
||||
* xmlCharEncFirstLineInt:
|
||||
@ -1946,7 +1947,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||||
toconv = 180;
|
||||
}
|
||||
if (toconv * 2 >= written) {
|
||||
xmlBufferGrow(out, toconv);
|
||||
xmlBufferGrow(out, toconv * 2);
|
||||
written = out->size - out->use - 1;
|
||||
}
|
||||
|
||||
@ -2028,6 +2029,251 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||||
return(xmlCharEncFirstLineInt(handler, out, in, -1));
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlCharEncInput:
|
||||
* @input: a parser input buffer
|
||||
* @len: number of bytes to convert for the first line, or -1
|
||||
*
|
||||
* Front-end for the encoding handler input function, but handle only
|
||||
* the very first line. Point is that this is based on autodetection
|
||||
* of the encoding and once that first line is converted we may find
|
||||
* out that a different decoder is needed to process the input.
|
||||
*
|
||||
* Returns the number of byte written if success, or
|
||||
* -1 general error
|
||||
* -2 if the transcoding fails (for *in is not valid utf8 string or
|
||||
* the result of transformation can't fit into the encoding we want), or
|
||||
*/
|
||||
int
|
||||
xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
|
||||
{
|
||||
int ret = -2;
|
||||
size_t written;
|
||||
size_t toconv;
|
||||
int c_in;
|
||||
int c_out;
|
||||
xmlBufPtr in;
|
||||
xmlBufPtr out;
|
||||
|
||||
if ((input == NULL) || (input->encoder == NULL) ||
|
||||
(input->buffer == NULL) || (input->raw == NULL))
|
||||
return (-1);
|
||||
out = input->buffer;
|
||||
in = input->raw;
|
||||
|
||||
toconv = xmlBufUse(in);
|
||||
if (toconv == 0)
|
||||
return (0);
|
||||
written = xmlBufAvail(out) - 1; /* count '\0' */
|
||||
/*
|
||||
* echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
|
||||
* 45 chars should be sufficient to reach the end of the encoding
|
||||
* declaration without going too far inside the document content.
|
||||
* on UTF-16 this means 90bytes, on UCS4 this means 180
|
||||
* The actual value depending on guessed encoding is passed as @len
|
||||
* if provided
|
||||
*/
|
||||
if (len >= 0) {
|
||||
if (toconv > (unsigned int) len)
|
||||
toconv = len;
|
||||
} else {
|
||||
if (toconv > 180)
|
||||
toconv = 180;
|
||||
}
|
||||
if (toconv * 2 >= written) {
|
||||
xmlBufGrow(out, toconv * 2);
|
||||
written = xmlBufAvail(out) - 1;
|
||||
}
|
||||
if (written > 360)
|
||||
written = 360;
|
||||
|
||||
c_in = toconv;
|
||||
c_out = written;
|
||||
if (input->encoder->input != NULL) {
|
||||
ret = input->encoder->input(xmlBufEnd(out), &c_out,
|
||||
xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
}
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
else if (input->encoder->iconv_in != NULL) {
|
||||
ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out),
|
||||
&c_out, xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
if (ret == -1)
|
||||
ret = -3;
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef LIBXML_ICU_ENABLED
|
||||
else if (input->encoder->uconv_in != NULL) {
|
||||
ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out),
|
||||
&c_out, xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
if (ret == -1)
|
||||
ret = -3;
|
||||
}
|
||||
#endif /* LIBXML_ICU_ENABLED */
|
||||
switch (ret) {
|
||||
case 0:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"converted %d bytes to %d bytes of input\n",
|
||||
c_in, c_out);
|
||||
#endif
|
||||
break;
|
||||
case -1:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"converted %d bytes to %d bytes of input, %d left\n",
|
||||
c_in, c_out, (int)xmlBufUse(in));
|
||||
#endif
|
||||
break;
|
||||
case -3:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"converted %d bytes to %d bytes of input, %d left\n",
|
||||
c_in, c_out, (int)xmlBufUse(in));
|
||||
#endif
|
||||
break;
|
||||
case -2: {
|
||||
char buf[50];
|
||||
const xmlChar *content = xmlBufContent(in);
|
||||
|
||||
snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
|
||||
content[0], content[1],
|
||||
content[2], content[3]);
|
||||
buf[49] = 0;
|
||||
xmlEncodingErr(XML_I18N_CONV_FAILED,
|
||||
"input conversion failed due to input error, bytes %s\n",
|
||||
buf);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Ignore when input buffer is not on a boundary
|
||||
*/
|
||||
if (ret == -3) ret = 0;
|
||||
if (ret == -1) ret = 0;
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlCharEncInput:
|
||||
* @input: a parser input buffer
|
||||
*
|
||||
* Generic front-end for the encoding handler on parser input
|
||||
*
|
||||
* Returns the number of byte written if success, or
|
||||
* -1 general error
|
||||
* -2 if the transcoding fails (for *in is not valid utf8 string or
|
||||
* the result of transformation can't fit into the encoding we want), or
|
||||
*/
|
||||
int
|
||||
xmlCharEncInput(xmlParserInputBufferPtr input)
|
||||
{
|
||||
int ret = -2;
|
||||
size_t written;
|
||||
size_t toconv;
|
||||
int c_in;
|
||||
int c_out;
|
||||
xmlBufPtr in;
|
||||
xmlBufPtr out;
|
||||
|
||||
if ((input == NULL) || (input->encoder == NULL) ||
|
||||
(input->buffer == NULL) || (input->raw == NULL))
|
||||
return (-1);
|
||||
out = input->buffer;
|
||||
in = input->raw;
|
||||
|
||||
toconv = xmlBufUse(in);
|
||||
if (toconv == 0)
|
||||
return (0);
|
||||
if (toconv > 64 * 1024)
|
||||
toconv = 64 * 1024;
|
||||
written = xmlBufAvail(out);
|
||||
if (written > 0)
|
||||
written--; /* count '\0' */
|
||||
if (toconv * 2 >= written) {
|
||||
xmlBufGrow(out, toconv * 2);
|
||||
written = xmlBufAvail(out);
|
||||
if (written > 0)
|
||||
written--; /* count '\0' */
|
||||
}
|
||||
if (written > 128 * 1024)
|
||||
written = 128 * 1024;
|
||||
|
||||
c_in = toconv;
|
||||
c_out = written;
|
||||
if (input->encoder->input != NULL) {
|
||||
ret = input->encoder->input(xmlBufEnd(out), &c_out,
|
||||
xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
}
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
else if (input->encoder->iconv_in != NULL) {
|
||||
ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out),
|
||||
&c_out, xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
if (ret == -1)
|
||||
ret = -3;
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef LIBXML_ICU_ENABLED
|
||||
else if (input->encoder->uconv_in != NULL) {
|
||||
ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out),
|
||||
&c_out, xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
if (ret == -1)
|
||||
ret = -3;
|
||||
}
|
||||
#endif /* LIBXML_ICU_ENABLED */
|
||||
switch (ret) {
|
||||
case 0:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"converted %d bytes to %d bytes of input\n",
|
||||
c_in, c_out);
|
||||
#endif
|
||||
break;
|
||||
case -1:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"converted %d bytes to %d bytes of input, %d left\n",
|
||||
c_in, c_out, (int)xmlBufUse(in));
|
||||
#endif
|
||||
break;
|
||||
case -3:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"converted %d bytes to %d bytes of input, %d left\n",
|
||||
c_in, c_out, (int)xmlBufUse(in));
|
||||
#endif
|
||||
break;
|
||||
case -2: {
|
||||
char buf[50];
|
||||
const xmlChar *content = xmlBufContent(in);
|
||||
|
||||
snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
|
||||
content[0], content[1],
|
||||
content[2], content[3]);
|
||||
buf[49] = 0;
|
||||
xmlEncodingErr(XML_I18N_CONV_FAILED,
|
||||
"input conversion failed due to input error, bytes %s\n",
|
||||
buf);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Ignore when input buffer is not on a boundary
|
||||
*/
|
||||
if (ret == -3)
|
||||
ret = 0;
|
||||
return (c_out? c_out : ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlCharEncInFunc:
|
||||
* @handler: char encoding transformation data structure
|
||||
@ -2135,6 +2381,235 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
|
||||
return (written? written : ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlCharEncOutput:
|
||||
* @input: a parser input buffer
|
||||
* @init: is this an initialization call without data
|
||||
*
|
||||
* Generic front-end for the encoding handler on parser output
|
||||
* a first call with @init == 1 has to be made first to initiate the
|
||||
* output in case of non-stateless encoding needing to initiate their
|
||||
* state or the output (like the BOM in UTF16).
|
||||
* In case of UTF8 sequence conversion errors for the given encoder,
|
||||
* the content will be automatically remapped to a CharRef sequence.
|
||||
*
|
||||
* Returns the number of byte written if success, or
|
||||
* -1 general error
|
||||
* -2 if the transcoding fails (for *in is not valid utf8 string or
|
||||
* the result of transformation can't fit into the encoding we want), or
|
||||
*/
|
||||
int
|
||||
xmlCharEncOutput(xmlOutputBufferPtr output, int init)
|
||||
{
|
||||
int ret = -2;
|
||||
size_t written;
|
||||
size_t writtentot = 0;
|
||||
size_t toconv;
|
||||
int c_in;
|
||||
int c_out;
|
||||
xmlBufPtr in;
|
||||
xmlBufPtr out;
|
||||
int charref_len = 0;
|
||||
|
||||
if ((output == NULL) || (output->encoder == NULL) ||
|
||||
(output->buffer == NULL) || (output->conv == NULL))
|
||||
return (-1);
|
||||
out = output->conv;
|
||||
in = output->buffer;
|
||||
|
||||
retry:
|
||||
|
||||
written = xmlBufAvail(out);
|
||||
if (written > 0)
|
||||
written--; /* count '\0' */
|
||||
|
||||
/*
|
||||
* First specific handling of the initialization call
|
||||
*/
|
||||
if (init) {
|
||||
c_in = 0;
|
||||
c_out = written;
|
||||
if (output->encoder->output != NULL) {
|
||||
ret = output->encoder->output(xmlBufEnd(out), &c_out,
|
||||
NULL, &c_in);
|
||||
if (ret > 0) /* Gennady: check return value */
|
||||
xmlBufAddLen(out, c_out);
|
||||
}
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
else if (output->encoder->iconv_out != NULL) {
|
||||
ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out),
|
||||
&c_out, NULL, &c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef LIBXML_ICU_ENABLED
|
||||
else if (output->encoder->uconv_out != NULL) {
|
||||
ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out),
|
||||
&c_out, NULL, &c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
}
|
||||
#endif /* LIBXML_ICU_ENABLED */
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"initialized encoder\n");
|
||||
#endif
|
||||
return(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Conversion itself.
|
||||
*/
|
||||
toconv = xmlBufUse(in);
|
||||
if (toconv == 0)
|
||||
return (0);
|
||||
if (toconv > 64 * 1024)
|
||||
toconv = 64 * 1024;
|
||||
if (toconv * 4 >= written) {
|
||||
xmlBufGrow(out, toconv * 4);
|
||||
written = xmlBufAvail(out) - 1;
|
||||
}
|
||||
if (written > 256 * 1024)
|
||||
written = 256 * 1024;
|
||||
|
||||
c_in = toconv;
|
||||
c_out = written;
|
||||
if (output->encoder->output != NULL) {
|
||||
ret = output->encoder->output(xmlBufEnd(out), &c_out,
|
||||
xmlBufContent(in), &c_in);
|
||||
if (c_out > 0) {
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
writtentot += c_out;
|
||||
}
|
||||
}
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
else if (output->encoder->iconv_out != NULL) {
|
||||
ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out),
|
||||
&c_out, xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
writtentot += c_out;
|
||||
if (ret == -1) {
|
||||
if (c_out > 0) {
|
||||
/*
|
||||
* Can be a limitation of iconv
|
||||
*/
|
||||
charref_len = 0;
|
||||
goto retry;
|
||||
}
|
||||
ret = -3;
|
||||
}
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef LIBXML_ICU_ENABLED
|
||||
else if (output->encoder->uconv_out != NULL) {
|
||||
ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out),
|
||||
&c_out, xmlBufContent(in), &c_in);
|
||||
xmlBufShrink(in, c_in);
|
||||
xmlBufAddLen(out, c_out);
|
||||
writtentot += c_out;
|
||||
if (ret == -1) {
|
||||
if (c_out > 0) {
|
||||
/*
|
||||
* Can be a limitation of uconv
|
||||
*/
|
||||
charref_len = 0;
|
||||
goto retry;
|
||||
}
|
||||
ret = -3;
|
||||
}
|
||||
}
|
||||
#endif /* LIBXML_ICU_ENABLED */
|
||||
else {
|
||||
xmlEncodingErr(XML_I18N_NO_OUTPUT,
|
||||
"xmlCharEncOutFunc: no output function !\n", NULL);
|
||||
return(-1);
|
||||
}
|
||||
|
||||
if (ret >= 0) output += ret;
|
||||
|
||||
/*
|
||||
* Attempt to handle error cases
|
||||
*/
|
||||
switch (ret) {
|
||||
case 0:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"converted %d bytes to %d bytes of output\n",
|
||||
c_in, c_out);
|
||||
#endif
|
||||
break;
|
||||
case -1:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"output conversion failed by lack of space\n");
|
||||
#endif
|
||||
break;
|
||||
case -3:
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
|
||||
c_in, c_out, (int) xmlBufUse(in));
|
||||
#endif
|
||||
break;
|
||||
case -2: {
|
||||
int len = (int) xmlBufUse(in);
|
||||
xmlChar *content = xmlBufContent(in);
|
||||
int cur;
|
||||
|
||||
cur = xmlGetUTF8Char(content, &len);
|
||||
if ((charref_len != 0) && (c_out < charref_len)) {
|
||||
/*
|
||||
* We attempted to insert a character reference and failed.
|
||||
* Undo what was written and skip the remaining charref.
|
||||
*/
|
||||
xmlBufErase(out, c_out);
|
||||
writtentot -= c_out;
|
||||
xmlBufShrink(in, charref_len - c_out);
|
||||
charref_len = 0;
|
||||
|
||||
ret = -1;
|
||||
break;
|
||||
} else if (cur > 0) {
|
||||
xmlChar charref[20];
|
||||
|
||||
#ifdef DEBUG_ENCODING
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"handling output conversion error\n");
|
||||
xmlGenericError(xmlGenericErrorContext,
|
||||
"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
content[0], content[1],
|
||||
content[2], content[3]);
|
||||
#endif
|
||||
/*
|
||||
* Removes the UTF8 sequence, and replace it by a charref
|
||||
* and continue the transcoding phase, hoping the error
|
||||
* did not mangle the encoder state.
|
||||
*/
|
||||
charref_len = snprintf((char *) &charref[0], sizeof(charref),
|
||||
"&#%d;", cur);
|
||||
xmlBufShrink(in, len);
|
||||
xmlBufAddHead(in, charref, -1);
|
||||
|
||||
goto retry;
|
||||
} else {
|
||||
char buf[50];
|
||||
|
||||
snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
|
||||
content[0], content[1],
|
||||
content[2], content[3]);
|
||||
buf[49] = 0;
|
||||
xmlEncodingErr(XML_I18N_CONV_FAILED,
|
||||
"output conversion failed due to conv error, bytes %s\n",
|
||||
buf);
|
||||
if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE)
|
||||
content[0] = ' ';
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlCharEncOutFunc:
|
||||
* @handler: char enconding transformation data structure
|
||||
|
Loading…
Reference in New Issue
Block a user