1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-01-27 14:03:36 +03:00

encoding: Rework simple conversions function

Use a single function for ASCII conversion. Optimize code. Check for
XML_ENC_ERR_SPACE.
This commit is contained in:
Nick Wellnhofer 2024-06-28 04:36:14 +02:00
parent 501e5d195d
commit bb8e81c788

View File

@ -107,8 +107,8 @@ static const xmlEncTableEntry xmlEncTable[] = {
};
static int
asciiToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen);
asciiToAscii(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen);
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb);
@ -121,9 +121,6 @@ UTF16BEToUTF8(unsigned char* out, int *outlen,
#ifdef LIBXML_OUTPUT_ENABLED
static int
UTF8Toascii(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen);
static int
UTF8ToUTF16(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen);
@ -136,7 +133,6 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
#else /* LIBXML_OUTPUT_ENABLED */
#define UTF8Toascii NULL
#define UTF8Toisolat1 NULL
#define UTF8ToUTF16 NULL
#define UTF8ToUTF16LE NULL
@ -252,7 +248,7 @@ static const xmlCharEncodingHandler defaultHandlers[31] = {
MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
MAKE_HANDLER("Shift_JIS", NULL, NULL),
MAKE_HANDLER("EUC-JP", NULL, NULL),
MAKE_HANDLER("US-ASCII", asciiToUTF8, UTF8Toascii),
MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
MAKE_HANDLER("HTML", NULL, UTF8ToHtml),
MAKE_HANDLER("ISO-8859-10", ISO8859_10ToUTF8, UTF8ToISO8859_10),
@ -287,336 +283,7 @@ xmlCharEncUconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
* *
************************************************************************/
/**
* asciiToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of ASCII chars
* @inlen: the length of @in
*
* Take a block of ASCII chars in and try to convert it to an UTF-8
* block of chars out.
*
* Returns the number of bytes written or an XML_ENC_ERR code.
*
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
static int
asciiToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* base = in;
const unsigned char* processed = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend;
unsigned int c;
inend = in + (*inlen);
while ((in < inend) && (out - outstart + 5 < *outlen)) {
c= *in++;
if (out >= outend)
break;
if (c < 0x80) {
*out++ = c;
} else {
*outlen = out - outstart;
*inlen = processed - base;
return(XML_ENC_ERR_INPUT);
}
processed = (const unsigned char*) in;
}
*outlen = out - outstart;
*inlen = processed - base;
return(*outlen);
}
#ifdef LIBXML_OUTPUT_ENABLED
/**
* UTF8Toascii:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an ASCII
* block of chars out.
*
* Returns the number of bytes written or an XML_ENC_ERR code.
*
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
static int
UTF8Toascii(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
const unsigned char* processed = in;
const unsigned char* outend;
const unsigned char* outstart = out;
const unsigned char* instart = in;
const unsigned char* inend;
unsigned int c, d;
int trailing;
if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
return(XML_ENC_ERR_INTERNAL);
if (in == NULL) {
/*
* initialization nothing to do
*/
*outlen = 0;
*inlen = 0;
return(0);
}
inend = in + (*inlen);
outend = out + (*outlen);
while (in < inend) {
d = *in++;
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = out - outstart;
*inlen = processed - instart;
return(XML_ENC_ERR_INPUT);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in Ascii */
*outlen = out - outstart;
*inlen = processed - instart;
return(XML_ENC_ERR_INPUT);
}
if (inend - in < trailing) {
break;
}
for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
break;
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c < 0x80) {
if (out >= outend)
break;
*out++ = c;
} else {
/* no chance for this in Ascii */
*outlen = out - outstart;
*inlen = processed - instart;
return(XML_ENC_ERR_INPUT);
}
processed = in;
}
*outlen = out - outstart;
*inlen = processed - instart;
return(*outlen);
}
#endif /* LIBXML_OUTPUT_ENABLED */
/**
* isolat1ToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of ISO Latin 1 chars
* @inlen: the length of @in
*
* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
* block of chars out.
*
* Returns the number of bytes written or an XML_ENC_ERR code.
*
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
int
isolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* base = in;
unsigned char* outend;
const unsigned char* inend;
const unsigned char* instop;
if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
return(XML_ENC_ERR_INTERNAL);
outend = out + *outlen;
inend = in + (*inlen);
instop = inend;
while ((in < inend) && (out < outend - 1)) {
if (*in >= 0x80) {
*out++ = (((*in) >> 6) & 0x1F) | 0xC0;
*out++ = ((*in) & 0x3F) | 0x80;
++in;
}
if ((instop - in) > (outend - out)) instop = in + (outend - out);
while ((in < instop) && (*in < 0x80)) {
*out++ = *in++;
}
}
if ((in < inend) && (out < outend) && (*in < 0x80)) {
*out++ = *in++;
}
*outlen = out - outstart;
*inlen = in - base;
return(*outlen);
}
/**
* UTF8ToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @inb: a pointer to an array of UTF-8 chars
* @inlenb: the length of @in in UTF-8 chars
*
* No op copy operation for UTF8 handling.
*
* Returns the number of bytes written or an XML_ENC_ERR code.
*
* The value of *inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
*/
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb)
{
int len;
if ((out == NULL) || (outlen == NULL) || (inlenb == NULL))
return(XML_ENC_ERR_INTERNAL);
if (inb == NULL) {
/* inb == NULL means output is initialized. */
*outlen = 0;
*inlenb = 0;
return(0);
}
if (*outlen > *inlenb) {
len = *inlenb;
} else {
len = *outlen;
}
if (len < 0)
return(XML_ENC_ERR_INTERNAL);
/*
* FIXME: Conversion functions must assure valid UTF-8, so we have
* to check for UTF-8 validity. Preferably, this converter shouldn't
* be used at all.
*/
memcpy(out, inb, len);
*outlen = len;
*inlenb = len;
return(*outlen);
}
#ifdef LIBXML_OUTPUT_ENABLED
/**
* UTF8Toisolat1:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
* block of chars out.
*
* Returns the number of bytes written or an XML_ENC_ERR code.
*
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
int
UTF8Toisolat1(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
const unsigned char* processed = in;
const unsigned char* outend;
const unsigned char* outstart = out;
const unsigned char* instart = in;
const unsigned char* inend;
unsigned int c, d;
int trailing;
if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
return(XML_ENC_ERR_INTERNAL);
if (in == NULL) {
/*
* initialization nothing to do
*/
*outlen = 0;
*inlen = 0;
return(0);
}
inend = in + (*inlen);
outend = out + (*outlen);
while (in < inend) {
d = *in++;
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = out - outstart;
*inlen = processed - instart;
return(XML_ENC_ERR_INPUT);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in IsoLat1 */
*outlen = out - outstart;
*inlen = processed - instart;
return(XML_ENC_ERR_INPUT);
}
if (inend - in < trailing) {
break;
}
for ( ; trailing; trailing--) {
if (in >= inend)
break;
d = *in++;
if ((d & 0xC0) != 0x80) {
*outlen = out - outstart;
*inlen = processed - instart;
return(XML_ENC_ERR_INPUT);
}
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c <= 0xFF) {
if (out >= outend)
break;
*out++ = c;
} else {
/* no chance for this in IsoLat1 */
*outlen = out - outstart;
*inlen = processed - instart;
return(XML_ENC_ERR_INPUT);
}
processed = in;
}
*outlen = out - outstart;
*inlen = processed - instart;
return(*outlen);
}
#endif /* LIBXML_OUTPUT_ENABLED */
/**
/*
* UTF16LEToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
@ -2719,6 +2386,176 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
return(in->consumed + (in->cur - in->base));
}
/************************************************************************
* *
* Conversions To/From UTF8 encoding *
* *
************************************************************************/
static int
asciiToAscii(unsigned char* out, int *poutlen,
const unsigned char* in, int *pinlen) {
const unsigned char *inend;
const unsigned char *instart = in;
int inlen, outlen, ret;
if (in == NULL) {
*pinlen = 0;
*poutlen = 0;
return(XML_ENC_ERR_SUCCESS);
}
inlen = *pinlen;
outlen = *poutlen;
if (outlen < inlen) {
inlen = outlen;
ret = XML_ENC_ERR_SPACE;
} else {
ret = inlen;
}
inend = in + inlen;
*poutlen = inlen;
*pinlen = inlen;
while (in < inend) {
unsigned c = *in;
if (c >= 0x80) {
*poutlen = in - instart;
*pinlen = in - instart;
return(XML_ENC_ERR_INPUT);
}
in++;
*out++ = c;
}
return(ret);
}
int
isolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* instart = in;
unsigned char* outend;
const unsigned char* inend;
int ret = XML_ENC_ERR_SPACE;
if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
return(XML_ENC_ERR_INTERNAL);
outend = out + *outlen;
inend = in + *inlen;
while (in < inend) {
unsigned c = *in;
if (c < 0x80) {
if (out >= outend)
goto done;
*out++ = c;
} else {
if (outend - out < 2)
goto done;
*out++ = (c >> 6) | 0xC0;
*out++ = (c & 0x3F) | 0x80;
}
in++;
}
ret = out - outstart;
done:
*outlen = out - outstart;
*inlen = in - instart;
return(ret);
}
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen)
{
int len;
int ret;
if (in == NULL) {
*inlen = 0;
*outlen = 0;
return(XML_ENC_ERR_SUCCESS);
}
if (*outlen < *inlen) {
len = *outlen;
ret = XML_ENC_ERR_SPACE;
} else {
len = *inlen;
ret = len;
}
memcpy(out, in, len);
*outlen = len;
*inlen = len;
return(ret);
}
#ifdef LIBXML_OUTPUT_ENABLED
int
UTF8Toisolat1(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
const unsigned char* outend;
const unsigned char* outstart = out;
const unsigned char* instart = in;
const unsigned char* inend;
unsigned c;
int ret = XML_ENC_ERR_SPACE;
if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
return(XML_ENC_ERR_INTERNAL);
if (in == NULL) {
*inlen = 0;
*outlen = 0;
return(XML_ENC_ERR_SUCCESS);
}
inend = in + *inlen;
outend = out + *outlen;
while (in < inend) {
if (out >= outend)
goto done;
c = *in;
if (c < 0x80) {
*out++ = c;
} else if (c < 0xC4) {
if (inend - in < 2)
break;
in++;
*out++ = (unsigned char) ((c << 6) | (*in & 0x3F));
} else {
ret = XML_ENC_ERR_INPUT;
goto done;
}
in++;
}
ret = out - outstart;
done:
*outlen = out - outstart;
*inlen = in - instart;
return(ret);
}
#endif /* LIBXML_OUTPUT_ENABLED */
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
#ifdef LIBXML_ISO8859X_ENABLED