mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-02-04 01:47:02 +03:00
69f12d6d47
This was only used by Chromium/WebKit to detect whether xmlParseContent really succeeded. It's a horrible, overcomplicated hack. See 8c5848bd and #767.
2697 lines
72 KiB
C
2697 lines
72 KiB
C
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 */
|
|
|
|
#define IN_LIBXML
|
|
#include "libxml.h"
|
|
|
|
#include <string.h>
|
|
#include <limits.h>
|
|
#include <ctype.h>
|
|
#include <stdlib.h>
|
|
|
|
#ifdef LIBXML_ICONV_ENABLED
|
|
#include <iconv.h>
|
|
#include <errno.h>
|
|
#endif
|
|
|
|
#include <libxml/encoding.h>
|
|
#include <libxml/xmlmemory.h>
|
|
#include <libxml/parser.h>
|
|
#ifdef LIBXML_HTML_ENABLED
|
|
#include <libxml/HTMLparser.h>
|
|
#endif
|
|
#include <libxml/xmlerror.h>
|
|
|
|
#include "private/buf.h"
|
|
#include "private/enc.h"
|
|
#include "private/error.h"
|
|
|
|
#ifdef LIBXML_ICU_ENABLED
|
|
#include <unicode/ucnv.h>
|
|
#endif
|
|
|
|
/*
 * Value stored in the flags field of the built-in handler table entries
 * and of handlers created by xmlNewCharEncodingHandler().
 * NOTE(review): presumably tells the close function not to free such a
 * handler -- confirm against xmlCharEncCloseFunc().
 */
#define XML_HANDLER_STATIC 1
|
|
|
|
/*
 * One user-registered alias: @alias (kept upper-cased by
 * xmlAddEncodingAlias) maps to the encoding @name. Both strings are
 * heap copies released by the alias cleanup/delete functions.
 */
struct _xmlCharEncodingAlias {
    const char *name;
    const char *alias;
};
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
|
|
|
|
/* Alias registry: growable array, number of used entries, capacity. */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;

/* Host byte order flag (1 = little endian), set by xmlInitEncodingInternal. */
static int xmlLittleEndian = 1;
|
|
|
|
typedef struct {
|
|
const char *name;
|
|
xmlCharEncoding enc;
|
|
} xmlEncTableEntry;
|
|
|
|
static const xmlEncTableEntry xmlEncTable[] = {
|
|
{ "ASCII", XML_CHAR_ENCODING_ASCII },
|
|
{ "EUC-JP", XML_CHAR_ENCODING_EUC_JP },
|
|
{ "HTML", XML_CHAR_ENCODING_HTML },
|
|
{ "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 },
|
|
{ "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 },
|
|
{ "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 },
|
|
{ "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE },
|
|
{ "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP },
|
|
{ "ISO-8859-1", XML_CHAR_ENCODING_8859_1 },
|
|
{ "ISO-8859-10", XML_CHAR_ENCODING_8859_10 },
|
|
{ "ISO-8859-11", XML_CHAR_ENCODING_8859_11 },
|
|
{ "ISO-8859-13", XML_CHAR_ENCODING_8859_13 },
|
|
{ "ISO-8859-14", XML_CHAR_ENCODING_8859_14 },
|
|
{ "ISO-8859-15", XML_CHAR_ENCODING_8859_15 },
|
|
{ "ISO-8859-16", XML_CHAR_ENCODING_8859_16 },
|
|
{ "ISO-8859-2", XML_CHAR_ENCODING_8859_2 },
|
|
{ "ISO-8859-3", XML_CHAR_ENCODING_8859_3 },
|
|
{ "ISO-8859-4", XML_CHAR_ENCODING_8859_4 },
|
|
{ "ISO-8859-5", XML_CHAR_ENCODING_8859_5 },
|
|
{ "ISO-8859-6", XML_CHAR_ENCODING_8859_6 },
|
|
{ "ISO-8859-7", XML_CHAR_ENCODING_8859_7 },
|
|
{ "ISO-8859-8", XML_CHAR_ENCODING_8859_8 },
|
|
{ "ISO-8859-9", XML_CHAR_ENCODING_8859_9 },
|
|
{ "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 },
|
|
{ "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 },
|
|
{ "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS },
|
|
{ "UCS-2", XML_CHAR_ENCODING_UCS2 },
|
|
{ "UCS-4", XML_CHAR_ENCODING_UCS4LE },
|
|
{ "UCS2", XML_CHAR_ENCODING_UCS2 },
|
|
{ "UCS4", XML_CHAR_ENCODING_UCS4LE },
|
|
{ "US-ASCII", XML_CHAR_ENCODING_ASCII },
|
|
{ "UTF-16", XML_CHAR_ENCODING_UTF16 },
|
|
{ "UTF-16BE", XML_CHAR_ENCODING_UTF16BE },
|
|
{ "UTF-16LE", XML_CHAR_ENCODING_UTF16LE },
|
|
{ "UTF-8", XML_CHAR_ENCODING_UTF8 },
|
|
{ "UTF16", XML_CHAR_ENCODING_UTF16LE },
|
|
{ "UTF8", XML_CHAR_ENCODING_UTF8 }
|
|
};
|
|
|
|
/* Built-in converters to UTF-8, defined later in this file. */
static int asciiToAscii(unsigned char *out, int *outlen,
                        const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF8(unsigned char *out, int *outlen,
                      const unsigned char *inb, int *inlenb, void *vctxt);
static int latin1ToUTF8(unsigned char *out, int *outlen,
                        const unsigned char *in, int *inlen, void *vctxt);
static int UTF16LEToUTF8(unsigned char *out, int *outlen,
                         const unsigned char *inb, int *inlenb, void *vctxt);
static int UTF16BEToUTF8(unsigned char *out, int *outlen,
                         const unsigned char *inb, int *inlenb, void *vctxt);
|
|
|
|
/*
 * Built-in converters from UTF-8. Without output support the names
 * expand to NULL so the handler table can still reference them.
 */
#ifdef LIBXML_OUTPUT_ENABLED

static int UTF8ToLatin1(unsigned char *outb, int *outlen,
                        const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF16(unsigned char *outb, int *outlen,
                       const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF16LE(unsigned char *outb, int *outlen,
                         const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF16BE(unsigned char *outb, int *outlen,
                         const unsigned char *in, int *inlen, void *vctxt);

#else /* LIBXML_OUTPUT_ENABLED */

#define UTF8ToLatin1 NULL
#define UTF8ToUTF16 NULL
#define UTF8ToUTF16LE NULL
#define UTF8ToUTF16BE NULL

#endif /* LIBXML_OUTPUT_ENABLED */
|
|
|
|
/* HTML output converter; only available with both output and HTML support. */
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
static int UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
                             const unsigned char *in, int *inlen,
                             void *vctxt);
#else
#define UTF8ToHtmlWrapper NULL
#endif
|
|
|
|
/*
 * Initializer fragment for the two iconv members of a handler. It
 * expands to nothing when iconv support is compiled out.
 */
#ifdef LIBXML_ICONV_ENABLED
#define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0
#else
#define EMPTY_ICONV
#endif
|
|
|
|
/*
 * MAKE_ISO_HANDLER(name, n) builds the static handler entry for
 * ISO-8859-n. The table-driven converters are only compiled in when
 * neither iconv nor ICU is available and ISO-8859-x support is enabled;
 * otherwise the entry carries no converter and only the name.
 */
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
    defined(LIBXML_ISO8859X_ENABLED)

#include "iso8859x.inc"

static int
ISO8859xToUTF8(unsigned char* out, int *outlen,
               const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToISO8859x(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen, void *vctxt);

/* The output slot is cast to the output function type, as in MAKE_HANDLER. */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) UTF8ToISO8859x \
      EMPTY_ICONV, \
      (void *) xmlunicodetable_ISO8859_##n, \
      (void *) xmltranscodetable_ISO8859_##n, \
      NULL, XML_HANDLER_STATIC }

#else /* LIBXML_ISO8859X_ENABLED */

#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \
      XML_HANDLER_STATIC }

#endif /* LIBXML_ISO8859X_ENABLED */
|
|
|
|
/*
 * MAKE_HANDLER(name, in, out) builds a static handler entry from a name
 * and the two converter functions (either may be NULL). The functions
 * go through a generic function pointer cast first.
 */
#define MAKE_HANDLER(name, in, out) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) in, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) out \
      EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC }
|
|
|
|
/*
|
|
* The layout must match enum xmlCharEncoding.
|
|
*
|
|
* Names should match the IANA registry if possible:
|
|
* https://www.iana.org/assignments/character-sets/character-sets.xhtml
|
|
*/
|
|
static const xmlCharEncodingHandler defaultHandlers[31] = {
|
|
MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
|
|
MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
|
|
MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
|
|
MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
|
|
MAKE_HANDLER("UCS-4LE", NULL, NULL),
|
|
MAKE_HANDLER("UCS-4BE", NULL, NULL),
|
|
MAKE_HANDLER("IBM037", NULL, NULL),
|
|
MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
|
|
MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
|
|
MAKE_HANDLER("ISO-10646-UCS-2", NULL, NULL),
|
|
MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
|
|
MAKE_ISO_HANDLER("ISO-8859-2", 2),
|
|
MAKE_ISO_HANDLER("ISO-8859-3", 3),
|
|
MAKE_ISO_HANDLER("ISO-8859-4", 4),
|
|
MAKE_ISO_HANDLER("ISO-8859-5", 5),
|
|
MAKE_ISO_HANDLER("ISO-8859-6", 6),
|
|
MAKE_ISO_HANDLER("ISO-8859-7", 7),
|
|
MAKE_ISO_HANDLER("ISO-8859-8", 8),
|
|
MAKE_ISO_HANDLER("ISO-8859-9", 9),
|
|
MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
|
|
MAKE_HANDLER("Shift_JIS", NULL, NULL),
|
|
MAKE_HANDLER("EUC-JP", NULL, NULL),
|
|
MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
|
|
MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
|
|
MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
|
|
MAKE_ISO_HANDLER("ISO-8859-10", 10),
|
|
MAKE_ISO_HANDLER("ISO-8859-11", 11),
|
|
MAKE_ISO_HANDLER("ISO-8859-13", 13),
|
|
MAKE_ISO_HANDLER("ISO-8859-14", 14),
|
|
MAKE_ISO_HANDLER("ISO-8859-15", 15),
|
|
MAKE_ISO_HANDLER("ISO-8859-16", 16),
|
|
};
|
|
|
|
#define NUM_DEFAULT_HANDLERS \
|
|
(sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))
|
|
|
|
/* the size should be growable, but it's not a big deal ... */
|
|
#define MAX_ENCODING_HANDLERS 50
|
|
static xmlCharEncodingHandlerPtr *globalHandlers = NULL;
|
|
static int nbCharEncodingHandler = 0;
|
|
|
|
/* Converter factories for the optional iconv and ICU back ends. */
#ifdef LIBXML_ICONV_ENABLED
static int xmlCharEncIconv(void *vctxt, const char *name,
                           xmlCharEncConverter *conv);
#endif

#ifdef LIBXML_ICU_ENABLED
static int xmlCharEncUconv(void *vctxt, const char *name,
                           xmlCharEncConverter *conv);
#endif
|
|
|
|
/************************************************************************
 *                                                                      *
 *              Generic encoding handling routines                      *
 *                                                                      *
 ************************************************************************/
|
|
|
|
/**
|
|
* xmlDetectCharEncoding:
|
|
* @in: a pointer to the first bytes of the XML entity, must be at least
|
|
* 2 bytes long (at least 4 if encoding is UTF4 variant).
|
|
* @len: pointer to the length of the buffer
|
|
*
|
|
* Guess the encoding of the entity using the first bytes of the entity content
|
|
* according to the non-normative appendix F of the XML-1.0 recommendation.
|
|
*
|
|
* Returns one of the XML_CHAR_ENCODING_... values.
|
|
*/
|
|
xmlCharEncoding
|
|
xmlDetectCharEncoding(const unsigned char* in, int len)
|
|
{
|
|
if (in == NULL)
|
|
return(XML_CHAR_ENCODING_NONE);
|
|
if (len >= 4) {
|
|
if ((in[0] == 0x00) && (in[1] == 0x00) &&
|
|
(in[2] == 0x00) && (in[3] == 0x3C))
|
|
return(XML_CHAR_ENCODING_UCS4BE);
|
|
if ((in[0] == 0x3C) && (in[1] == 0x00) &&
|
|
(in[2] == 0x00) && (in[3] == 0x00))
|
|
return(XML_CHAR_ENCODING_UCS4LE);
|
|
if ((in[0] == 0x00) && (in[1] == 0x00) &&
|
|
(in[2] == 0x3C) && (in[3] == 0x00))
|
|
return(XML_CHAR_ENCODING_UCS4_2143);
|
|
if ((in[0] == 0x00) && (in[1] == 0x3C) &&
|
|
(in[2] == 0x00) && (in[3] == 0x00))
|
|
return(XML_CHAR_ENCODING_UCS4_3412);
|
|
if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
|
|
(in[2] == 0xA7) && (in[3] == 0x94))
|
|
return(XML_CHAR_ENCODING_EBCDIC);
|
|
if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
|
|
(in[2] == 0x78) && (in[3] == 0x6D))
|
|
return(XML_CHAR_ENCODING_UTF8);
|
|
/*
|
|
* Although not part of the recommendation, we also
|
|
* attempt an "auto-recognition" of UTF-16LE and
|
|
* UTF-16BE encodings.
|
|
*/
|
|
if ((in[0] == 0x3C) && (in[1] == 0x00) &&
|
|
(in[2] == 0x3F) && (in[3] == 0x00))
|
|
return(XML_CHAR_ENCODING_UTF16LE);
|
|
if ((in[0] == 0x00) && (in[1] == 0x3C) &&
|
|
(in[2] == 0x00) && (in[3] == 0x3F))
|
|
return(XML_CHAR_ENCODING_UTF16BE);
|
|
}
|
|
if (len >= 3) {
|
|
/*
|
|
* Errata on XML-1.0 June 20 2001
|
|
* We now allow an UTF8 encoded BOM
|
|
*/
|
|
if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
|
|
(in[2] == 0xBF))
|
|
return(XML_CHAR_ENCODING_UTF8);
|
|
}
|
|
/* For UTF-16 we can recognize by the BOM */
|
|
if (len >= 2) {
|
|
if ((in[0] == 0xFE) && (in[1] == 0xFF))
|
|
return(XML_CHAR_ENCODING_UTF16BE);
|
|
if ((in[0] == 0xFF) && (in[1] == 0xFE))
|
|
return(XML_CHAR_ENCODING_UTF16LE);
|
|
}
|
|
return(XML_CHAR_ENCODING_NONE);
|
|
}
|
|
|
|
/**
|
|
* xmlCleanupEncodingAliases:
|
|
*
|
|
* DEPRECATED: This function modifies global state and is not
|
|
* thread-safe.
|
|
*
|
|
* Unregisters all aliases
|
|
*/
|
|
void
|
|
xmlCleanupEncodingAliases(void) {
|
|
int i;
|
|
|
|
if (xmlCharEncodingAliases == NULL)
|
|
return;
|
|
|
|
for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
|
|
if (xmlCharEncodingAliases[i].name != NULL)
|
|
xmlFree((char *) xmlCharEncodingAliases[i].name);
|
|
if (xmlCharEncodingAliases[i].alias != NULL)
|
|
xmlFree((char *) xmlCharEncodingAliases[i].alias);
|
|
}
|
|
xmlCharEncodingAliasesNb = 0;
|
|
xmlCharEncodingAliasesMax = 0;
|
|
xmlFree(xmlCharEncodingAliases);
|
|
xmlCharEncodingAliases = NULL;
|
|
}
|
|
|
|
/**
|
|
* xmlGetEncodingAlias:
|
|
* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
|
|
*
|
|
* DEPRECATED: This function is not thread-safe.
|
|
*
|
|
* Lookup an encoding name for the given alias.
|
|
*
|
|
* Returns NULL if not found, otherwise the original name
|
|
*/
|
|
const char *
|
|
xmlGetEncodingAlias(const char *alias) {
|
|
int i;
|
|
char upper[100];
|
|
|
|
if (alias == NULL)
|
|
return(NULL);
|
|
|
|
if (xmlCharEncodingAliases == NULL)
|
|
return(NULL);
|
|
|
|
for (i = 0;i < 99;i++) {
|
|
upper[i] = (char) toupper((unsigned char) alias[i]);
|
|
if (upper[i] == 0) break;
|
|
}
|
|
upper[i] = 0;
|
|
|
|
/*
|
|
* Walk down the list looking for a definition of the alias
|
|
*/
|
|
for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
|
|
if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
|
|
return(xmlCharEncodingAliases[i].name);
|
|
}
|
|
}
|
|
return(NULL);
|
|
}
|
|
|
|
/**
|
|
* xmlAddEncodingAlias:
|
|
* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
|
|
* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
|
|
*
|
|
* DEPRECATED: This function modifies global state and is not
|
|
* thread-safe.
|
|
*
|
|
* Registers an alias @alias for an encoding named @name. Existing alias
|
|
* will be overwritten.
|
|
*
|
|
* Returns 0 in case of success, -1 in case of error
|
|
*/
|
|
int
|
|
xmlAddEncodingAlias(const char *name, const char *alias) {
|
|
int i;
|
|
char upper[100];
|
|
char *nameCopy, *aliasCopy;
|
|
|
|
if ((name == NULL) || (alias == NULL))
|
|
return(-1);
|
|
|
|
for (i = 0;i < 99;i++) {
|
|
upper[i] = (char) toupper((unsigned char) alias[i]);
|
|
if (upper[i] == 0) break;
|
|
}
|
|
upper[i] = 0;
|
|
|
|
if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
|
|
xmlCharEncodingAliasPtr tmp;
|
|
size_t newSize = xmlCharEncodingAliasesMax ?
|
|
xmlCharEncodingAliasesMax * 2 :
|
|
20;
|
|
|
|
tmp = (xmlCharEncodingAliasPtr)
|
|
xmlRealloc(xmlCharEncodingAliases,
|
|
newSize * sizeof(xmlCharEncodingAlias));
|
|
if (tmp == NULL)
|
|
return(-1);
|
|
xmlCharEncodingAliases = tmp;
|
|
xmlCharEncodingAliasesMax = newSize;
|
|
}
|
|
|
|
/*
|
|
* Walk down the list looking for a definition of the alias
|
|
*/
|
|
for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
|
|
if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
|
|
/*
|
|
* Replace the definition.
|
|
*/
|
|
nameCopy = xmlMemStrdup(name);
|
|
if (nameCopy == NULL)
|
|
return(-1);
|
|
xmlFree((char *) xmlCharEncodingAliases[i].name);
|
|
xmlCharEncodingAliases[i].name = nameCopy;
|
|
return(0);
|
|
}
|
|
}
|
|
/*
|
|
* Add the definition
|
|
*/
|
|
nameCopy = xmlMemStrdup(name);
|
|
if (nameCopy == NULL)
|
|
return(-1);
|
|
aliasCopy = xmlMemStrdup(upper);
|
|
if (aliasCopy == NULL) {
|
|
xmlFree(nameCopy);
|
|
return(-1);
|
|
}
|
|
xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy;
|
|
xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy;
|
|
xmlCharEncodingAliasesNb++;
|
|
return(0);
|
|
}
|
|
|
|
/**
|
|
* xmlDelEncodingAlias:
|
|
* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
|
|
*
|
|
* DEPRECATED: This function modifies global state and is not
|
|
* thread-safe.
|
|
*
|
|
* Unregisters an encoding alias @alias
|
|
*
|
|
* Returns 0 in case of success, -1 in case of error
|
|
*/
|
|
int
|
|
xmlDelEncodingAlias(const char *alias) {
|
|
int i;
|
|
|
|
if (alias == NULL)
|
|
return(-1);
|
|
|
|
if (xmlCharEncodingAliases == NULL)
|
|
return(-1);
|
|
/*
|
|
* Walk down the list looking for a definition of the alias
|
|
*/
|
|
for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
|
|
if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
|
|
xmlFree((char *) xmlCharEncodingAliases[i].name);
|
|
xmlFree((char *) xmlCharEncodingAliases[i].alias);
|
|
xmlCharEncodingAliasesNb--;
|
|
memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
|
|
sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
|
|
return(0);
|
|
}
|
|
}
|
|
return(-1);
|
|
}
|
|
|
|
static int
|
|
xmlCompareEncTableEntries(const void *vkey, const void *ventry) {
|
|
const char *key = vkey;
|
|
const xmlEncTableEntry *entry = ventry;
|
|
|
|
return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name));
|
|
}
|
|
|
|
static xmlCharEncoding
|
|
xmlParseCharEncodingInternal(const char *name)
|
|
{
|
|
const xmlEncTableEntry *entry;
|
|
|
|
if (name == NULL)
|
|
return(XML_CHAR_ENCODING_NONE);
|
|
|
|
entry = bsearch(name, xmlEncTable,
|
|
sizeof(xmlEncTable) / sizeof(xmlEncTable[0]),
|
|
sizeof(xmlEncTable[0]), xmlCompareEncTableEntries);
|
|
if (entry != NULL)
|
|
return(entry->enc);
|
|
|
|
return(XML_CHAR_ENCODING_ERROR);
|
|
}
|
|
|
|
/**
|
|
* xmlParseCharEncoding:
|
|
* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
|
|
*
|
|
* Compare the string to the encoding schemes already known. Note
|
|
* that the comparison is case insensitive accordingly to the section
|
|
* [XML] 4.3.3 Character Encoding in Entities.
|
|
*
|
|
* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
|
|
* if not recognized.
|
|
*/
|
|
xmlCharEncoding
|
|
xmlParseCharEncoding(const char *name)
|
|
{
|
|
xmlCharEncoding enc = xmlParseCharEncodingInternal(name);
|
|
|
|
/* Backward compatibility */
|
|
if (enc == XML_CHAR_ENCODING_UTF16)
|
|
enc = XML_CHAR_ENCODING_UTF16LE;
|
|
|
|
return(enc);
|
|
}
|
|
|
|
/**
|
|
* xmlGetCharEncodingName:
|
|
* @enc: the encoding
|
|
*
|
|
* The "canonical" name for XML encoding.
|
|
* C.f. http://www.w3.org/TR/REC-xml#charencoding
|
|
* Section 4.3.3 Character Encoding in Entities
|
|
*
|
|
* Returns the canonical name for the given encoding
|
|
*/
|
|
const char*
|
|
xmlGetCharEncodingName(xmlCharEncoding enc) {
|
|
switch (enc) {
|
|
case XML_CHAR_ENCODING_UTF16LE:
|
|
return("UTF-16");
|
|
case XML_CHAR_ENCODING_UTF16BE:
|
|
return("UTF-16");
|
|
case XML_CHAR_ENCODING_UCS4LE:
|
|
return("ISO-10646-UCS-4");
|
|
case XML_CHAR_ENCODING_UCS4BE:
|
|
return("ISO-10646-UCS-4");
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
|
|
return(NULL);
|
|
|
|
return(defaultHandlers[enc].name);
|
|
}
|
|
|
|
/************************************************************************
 *                                                                      *
 *                      Char encoding handlers                          *
 *                                                                      *
 ************************************************************************/
|
|
|
|
/**
|
|
* xmlNewCharEncodingHandler:
|
|
* @name: the encoding name, in UTF-8 format (ASCII actually)
|
|
* @input: the xmlCharEncodingInputFunc to read that encoding
|
|
* @output: the xmlCharEncodingOutputFunc to write that encoding
|
|
*
|
|
* DEPRECATED: This function modifies global state and is not
|
|
* thread-safe.
|
|
*
|
|
* Create and registers an xmlCharEncodingHandler.
|
|
*
|
|
* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
|
|
*/
|
|
xmlCharEncodingHandlerPtr
|
|
xmlNewCharEncodingHandler(const char *name,
|
|
xmlCharEncodingInputFunc input,
|
|
xmlCharEncodingOutputFunc output) {
|
|
xmlCharEncodingHandlerPtr handler;
|
|
const char *alias;
|
|
char upper[500];
|
|
int i;
|
|
char *up = NULL;
|
|
|
|
/*
|
|
* Do the alias resolution
|
|
*/
|
|
alias = xmlGetEncodingAlias(name);
|
|
if (alias != NULL)
|
|
name = alias;
|
|
|
|
/*
|
|
* Keep only the uppercase version of the encoding.
|
|
*/
|
|
if (name == NULL)
|
|
return(NULL);
|
|
for (i = 0;i < 499;i++) {
|
|
upper[i] = (char) toupper((unsigned char) name[i]);
|
|
if (upper[i] == 0) break;
|
|
}
|
|
upper[i] = 0;
|
|
up = xmlMemStrdup(upper);
|
|
if (up == NULL)
|
|
return(NULL);
|
|
|
|
/*
|
|
* allocate and fill-up an handler block.
|
|
*/
|
|
handler = (xmlCharEncodingHandlerPtr)
|
|
xmlMalloc(sizeof(xmlCharEncodingHandler));
|
|
if (handler == NULL) {
|
|
xmlFree(up);
|
|
return(NULL);
|
|
}
|
|
memset(handler, 0, sizeof(xmlCharEncodingHandler));
|
|
handler->input = input;
|
|
handler->output = output;
|
|
handler->name = up;
|
|
handler->flags = XML_HANDLER_STATIC;
|
|
|
|
#ifdef LIBXML_ICONV_ENABLED
|
|
handler->iconv_in = NULL;
|
|
handler->iconv_out = NULL;
|
|
#endif
|
|
|
|
/*
|
|
* registers and returns the handler.
|
|
*/
|
|
xmlRegisterCharEncodingHandler(handler);
|
|
return(handler);
|
|
}
|
|
|
|
/**
 * xmlInitCharEncodingHandlers:
 *
 * DEPRECATED: Alias for xmlInitParser.
 *
 * Kept as a thin wrapper: it only forwards to xmlInitParser().
 */
void
xmlInitCharEncodingHandlers(void)
{
    xmlInitParser();
}
|
|
|
|
/**
|
|
* xmlInitEncodingInternal:
|
|
*
|
|
* Initialize the char encoding support.
|
|
*/
|
|
void
|
|
xmlInitEncodingInternal(void) {
|
|
unsigned short int tst = 0x1234;
|
|
unsigned char *ptr = (unsigned char *) &tst;
|
|
|
|
if (*ptr == 0x12) xmlLittleEndian = 0;
|
|
else xmlLittleEndian = 1;
|
|
}
|
|
|
|
/**
|
|
* xmlCleanupCharEncodingHandlers:
|
|
*
|
|
* DEPRECATED: This function will be made private. Call xmlCleanupParser
|
|
* to free global state but see the warnings there. xmlCleanupParser
|
|
* should be only called once at program exit. In most cases, you don't
|
|
* have call cleanup functions at all.
|
|
*
|
|
* Cleanup the memory allocated for the char encoding support, it
|
|
* unregisters all the encoding handlers and the aliases.
|
|
*/
|
|
void
|
|
xmlCleanupCharEncodingHandlers(void) {
|
|
xmlCleanupEncodingAliases();
|
|
|
|
if (globalHandlers == NULL) return;
|
|
|
|
for (;nbCharEncodingHandler > 0;) {
|
|
xmlCharEncodingHandler *handler;
|
|
|
|
nbCharEncodingHandler--;
|
|
handler = globalHandlers[nbCharEncodingHandler];
|
|
if (handler != NULL) {
|
|
if (handler->name != NULL)
|
|
xmlFree(handler->name);
|
|
xmlFree(handler);
|
|
}
|
|
}
|
|
xmlFree(globalHandlers);
|
|
globalHandlers = NULL;
|
|
nbCharEncodingHandler = 0;
|
|
}
|
|
|
|
/**
|
|
* xmlRegisterCharEncodingHandler:
|
|
* @handler: the xmlCharEncodingHandlerPtr handler block
|
|
*
|
|
* DEPRECATED: This function modifies global state and is not
|
|
* thread-safe.
|
|
*
|
|
* Register the char encoding handler.
|
|
*/
|
|
void
|
|
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
|
|
if (handler == NULL)
|
|
return;
|
|
if (globalHandlers == NULL) {
|
|
globalHandlers = xmlMalloc(
|
|
MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));
|
|
if (globalHandlers == NULL)
|
|
goto free_handler;
|
|
}
|
|
|
|
if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS)
|
|
goto free_handler;
|
|
globalHandlers[nbCharEncodingHandler++] = handler;
|
|
return;
|
|
|
|
free_handler:
|
|
if (handler != NULL) {
|
|
if (handler->name != NULL) {
|
|
xmlFree(handler->name);
|
|
}
|
|
xmlFree(handler);
|
|
}
|
|
}
|
|
|
|
static int
|
|
xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt,
|
|
const char *name, xmlCharEncodingHandler *handler) {
|
|
xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL };
|
|
int ret;
|
|
|
|
ret = impl(implCtxt, name, &conv);
|
|
|
|
if (ret == XML_ERR_OK) {
|
|
handler->input =
|
|
(xmlCharEncodingInputFunc) (void (*)(void)) conv.input;
|
|
handler->output =
|
|
(xmlCharEncodingOutputFunc) (void (*)(void)) conv.output;
|
|
handler->ctxtDtor = conv.ctxtDtor;
|
|
handler->inputCtxt = conv.inputCtxt;
|
|
handler->outputCtxt = conv.outputCtxt;
|
|
}
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlFindExtraHandler:
|
|
* @norig: name of the char encoding
|
|
* @name: potentially aliased name of the encoding
|
|
* @output: boolean, use handler for output
|
|
* @impl: a conversion implementation (optional)
|
|
* @implCtxt: user data for conversion implementation (optional)
|
|
* @out: pointer to resulting handler
|
|
*
|
|
* Search the non-default handlers for an exact match.
|
|
*
|
|
* Returns an xmlParserErrors error code.
|
|
*/
|
|
static int
|
|
xmlFindExtraHandler(const char *norig, const char *name, int output,
|
|
xmlCharEncConvImpl impl, void *implCtxt,
|
|
xmlCharEncodingHandler **out) {
|
|
xmlCharEncodingHandler *handler;
|
|
int ret;
|
|
int i;
|
|
|
|
handler = xmlMalloc(sizeof(*handler));
|
|
if (handler == NULL)
|
|
return(XML_ERR_NO_MEMORY);
|
|
memset(handler, 0, sizeof(*handler));
|
|
|
|
handler->name = xmlMemStrdup(name);
|
|
if (handler->name == NULL) {
|
|
ret = XML_ERR_NO_MEMORY;
|
|
goto done;
|
|
}
|
|
|
|
/*
|
|
* Try custom implementation before deprecated global handlers.
|
|
*
|
|
* Note that we pass the original name without deprecated
|
|
* alias resolution.
|
|
*/
|
|
if (impl != NULL) {
|
|
ret = xmlInvokeConvImpl(impl, implCtxt, norig, handler);
|
|
if (ret != XML_ERR_OK)
|
|
goto done;
|
|
|
|
*out = handler;
|
|
return(XML_ERR_OK);
|
|
}
|
|
|
|
/*
|
|
* Deprecated
|
|
*/
|
|
if (globalHandlers != NULL) {
|
|
for (i = 0; i < nbCharEncodingHandler; i++) {
|
|
xmlCharEncodingHandler *h = globalHandlers[i];
|
|
|
|
if (!xmlStrcasecmp((const xmlChar *) name,
|
|
(const xmlChar *) h->name)) {
|
|
if ((output ? h->output : h->input) != NULL) {
|
|
*out = h;
|
|
ret = XML_ERR_OK;
|
|
goto done;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef LIBXML_ICONV_ENABLED
|
|
ret = xmlInvokeConvImpl(xmlCharEncIconv, handler, name, handler);
|
|
if (ret == XML_ERR_OK) {
|
|
*out = handler;
|
|
return(XML_ERR_OK);
|
|
}
|
|
if (ret != XML_ERR_UNSUPPORTED_ENCODING)
|
|
goto done;
|
|
#endif /* LIBXML_ICONV_ENABLED */
|
|
|
|
#ifdef LIBXML_ICU_ENABLED
|
|
ret = xmlInvokeConvImpl(xmlCharEncUconv, handler, name, handler);
|
|
if (ret == XML_ERR_OK) {
|
|
*out = handler;
|
|
return(XML_ERR_OK);
|
|
}
|
|
if (ret != XML_ERR_UNSUPPORTED_ENCODING)
|
|
goto done;
|
|
#endif /* LIBXML_ICU_ENABLED */
|
|
|
|
ret = XML_ERR_UNSUPPORTED_ENCODING;
|
|
|
|
done:
|
|
if (handler != NULL) {
|
|
xmlFree(handler->name);
|
|
xmlFree(handler);
|
|
}
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlLookupCharEncodingHandler:
|
|
* @enc: an xmlCharEncoding value.
|
|
* @out: pointer to result
|
|
*
|
|
* Find or create a handler matching the encoding. The following
|
|
* converters are looked up in order:
|
|
*
|
|
* - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
|
|
* - User-registered global handler (deprecated)
|
|
* - iconv if enabled
|
|
* - ICU if enabled
|
|
*
|
|
* The handler must be closed with xmlCharEncCloseFunc.
|
|
*
|
|
* If the encoding is UTF-8, a NULL handler and no error code will
|
|
* be returned.
|
|
*
|
|
* Available since 2.13.0.
|
|
*
|
|
* Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
|
|
* xmlParserErrors error code.
|
|
*/
|
|
int
|
|
xmlLookupCharEncodingHandler(xmlCharEncoding enc,
|
|
xmlCharEncodingHandler **out) {
|
|
const xmlCharEncodingHandler *handler;
|
|
|
|
if (out == NULL)
|
|
return(XML_ERR_ARGUMENT);
|
|
*out = NULL;
|
|
|
|
if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
|
|
return(XML_ERR_UNSUPPORTED_ENCODING);
|
|
|
|
/* Return NULL handler for UTF-8 */
|
|
if ((enc == XML_CHAR_ENCODING_UTF8) ||
|
|
(enc == XML_CHAR_ENCODING_NONE))
|
|
return(XML_ERR_OK);
|
|
|
|
handler = &defaultHandlers[enc];
|
|
if ((handler->input != NULL) || (handler->output != NULL)) {
|
|
*out = (xmlCharEncodingHandler *) handler;
|
|
return(XML_ERR_OK);
|
|
}
|
|
|
|
if (handler->name != NULL)
|
|
return(xmlFindExtraHandler(handler->name, handler->name, 0,
|
|
NULL, NULL, out));
|
|
|
|
return(XML_ERR_UNSUPPORTED_ENCODING);
|
|
}
|
|
|
|
/**
|
|
* xmlGetCharEncodingHandler:
|
|
* @enc: an xmlCharEncoding value.
|
|
*
|
|
* DEPRECATED: Use xmlLookupCharEncodingHandler which has better error
|
|
* reporting.
|
|
*
|
|
* Returns the handler or NULL if no handler was found or an error
|
|
* occurred.
|
|
*/
|
|
xmlCharEncodingHandlerPtr
|
|
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
|
|
xmlCharEncodingHandler *ret;
|
|
|
|
xmlLookupCharEncodingHandler(enc, &ret);
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlCreateCharEncodingHandler:
|
|
* @name: a string describing the char encoding.
|
|
* @output: boolean, use handler for output
|
|
* @impl: a conversion implementation (optional)
|
|
* @implCtxt: user data for conversion implementation (optional)
|
|
* @out: pointer to result
|
|
*
|
|
* Find or create a handler matching the encoding. The following
|
|
* converters are looked up in order:
|
|
*
|
|
* - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
|
|
* - Custom implementation if provided
|
|
* - User-registered global handler (deprecated)
|
|
* - iconv if enabled
|
|
* - ICU if enabled
|
|
*
|
|
* The handler must be closed with xmlCharEncCloseFunc.
|
|
*
|
|
* If the encoding is UTF-8, a NULL handler and no error code will
|
|
* be returned.
|
|
*
|
|
* Available since 2.14.0.
|
|
*
|
|
* Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
|
|
* xmlParserErrors error code.
|
|
*/
|
|
int
|
|
xmlCreateCharEncodingHandler(const char *name, int output,
|
|
xmlCharEncConvImpl impl, void *implCtxt,
|
|
xmlCharEncodingHandler **out) {
|
|
const xmlCharEncodingHandler *handler;
|
|
const char *norig, *nalias;
|
|
xmlCharEncoding enc;
|
|
|
|
if (out == NULL)
|
|
return(XML_ERR_ARGUMENT);
|
|
*out = NULL;
|
|
|
|
if (name == NULL)
|
|
return(XML_ERR_ARGUMENT);
|
|
|
|
norig = name;
|
|
nalias = xmlGetEncodingAlias(name);
|
|
if (nalias != NULL)
|
|
name = nalias;
|
|
|
|
enc = xmlParseCharEncodingInternal(name);
|
|
|
|
/* Return NULL handler for UTF-8 */
|
|
if (enc == XML_CHAR_ENCODING_UTF8)
|
|
return(XML_ERR_OK);
|
|
|
|
if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
|
|
handler = &defaultHandlers[enc];
|
|
if ((output ? handler->output : handler->input) != NULL) {
|
|
*out = (xmlCharEncodingHandler *) handler;
|
|
return(XML_ERR_OK);
|
|
}
|
|
}
|
|
|
|
return(xmlFindExtraHandler(norig, name, output, impl, implCtxt, out));
|
|
}
|
|
|
|
/**
|
|
* xmlOpenCharEncodingHandler:
|
|
* @name: a string describing the char encoding.
|
|
* @output: boolean, use handler for output
|
|
* @out: pointer to result
|
|
*
|
|
* Find or create a handler matching the encoding. The following
|
|
* converters are looked up in order:
|
|
*
|
|
* - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
|
|
* - User-registered global handler (deprecated)
|
|
* - iconv if enabled
|
|
* - ICU if enabled
|
|
*
|
|
* The handler must be closed with xmlCharEncCloseFunc.
|
|
*
|
|
* If the encoding is UTF-8, a NULL handler and no error code will
|
|
* be returned.
|
|
*
|
|
* Available since 2.13.0.
|
|
*
|
|
* Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
|
|
* xmlParserErrors error code.
|
|
*/
|
|
int
|
|
xmlOpenCharEncodingHandler(const char *name, int output,
|
|
xmlCharEncodingHandler **out) {
|
|
return(xmlCreateCharEncodingHandler(name, output, NULL, NULL, out));
|
|
}
|
|
|
|
/**
|
|
* xmlFindCharEncodingHandler:
|
|
* @name: a string describing the char encoding.
|
|
*
|
|
* DEPRECATED: Use xmlOpenCharEncodingHandler which has better error
|
|
* reporting.
|
|
*
|
|
* If the encoding is UTF-8, this will return a no-op handler that
|
|
* shouldn't be used.
|
|
*
|
|
* Returns the handler or NULL if no handler was found or an error
|
|
* occurred.
|
|
*/
|
|
xmlCharEncodingHandlerPtr
|
|
xmlFindCharEncodingHandler(const char *name) {
|
|
xmlCharEncodingHandler *ret;
|
|
|
|
/*
|
|
* This handler shouldn't be used, but we must return a non-NULL
|
|
* handler.
|
|
*/
|
|
if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) ||
|
|
(xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0))
|
|
return((xmlCharEncodingHandlerPtr)
|
|
&defaultHandlers[XML_CHAR_ENCODING_UTF8]);
|
|
|
|
xmlOpenCharEncodingHandler(name, 0, &ret);
|
|
return(ret);
|
|
}
|
|
|
|
/************************************************************************
 *                                                                      *
 *              ICONV based generic conversion functions                *
 *                                                                      *
 ************************************************************************/
|
|
|
|
#ifdef LIBXML_ICONV_ENABLED
|
|
/* Per-direction conversion context used by the iconv-backed converters. */
typedef struct {
    iconv_t cd; /* descriptor from iconv_open, (iconv_t) -1 while not open */
} xmlIconvCtxt;
|
|
|
|
/**
|
|
* xmlIconvConvert:
|
|
* @vctxt: conversion context
|
|
* @out: a pointer to an array of bytes to store the result
|
|
* @outlen: the length of @out
|
|
* @in: a pointer to an array of input bytes
|
|
* @inlen: the length of @in
|
|
*
|
|
* Returns an XML_ENC_ERR code.
|
|
*
|
|
* The value of @inlen after return is the number of octets consumed
|
|
* as the return value is positive, else unpredictable.
|
|
* The value of @outlen after return is the number of octets produced.
|
|
*/
|
|
static int
|
|
xmlIconvConvert(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen, void *vctxt) {
|
|
xmlIconvCtxt *ctxt = vctxt;
|
|
size_t icv_inlen, icv_outlen;
|
|
const char *icv_in = (const char *) in;
|
|
char *icv_out = (char *) out;
|
|
size_t ret;
|
|
|
|
if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
|
|
if (outlen != NULL) *outlen = 0;
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
}
|
|
icv_inlen = *inlen;
|
|
icv_outlen = *outlen;
|
|
/*
|
|
* Some versions take const, other versions take non-const input.
|
|
*/
|
|
ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
|
|
*inlen -= icv_inlen;
|
|
*outlen -= icv_outlen;
|
|
if (ret == (size_t) -1) {
|
|
if (errno == EILSEQ)
|
|
return(XML_ENC_ERR_INPUT);
|
|
if (errno == E2BIG)
|
|
return(XML_ENC_ERR_SPACE);
|
|
/*
|
|
* EINVAL means a truncated multi-byte sequence at the end
|
|
* of the input buffer. We treat this as success.
|
|
*/
|
|
if (errno == EINVAL)
|
|
return(XML_ENC_ERR_SUCCESS);
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
}
|
|
return(XML_ENC_ERR_SUCCESS);
|
|
}
|
|
|
|
static void
|
|
xmlIconvFree(void *vctxt) {
|
|
xmlIconvCtxt *ctxt = vctxt;
|
|
|
|
if (ctxt->cd != (iconv_t) -1)
|
|
iconv_close(ctxt->cd);
|
|
|
|
xmlFree(ctxt);
|
|
}
|
|
|
|
static int
|
|
xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv) {
|
|
xmlCharEncodingHandler *handler = vctxt;
|
|
xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL;
|
|
iconv_t icv_in;
|
|
iconv_t icv_out;
|
|
int ret;
|
|
|
|
inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
|
|
if (inputCtxt == NULL) {
|
|
ret = XML_ERR_NO_MEMORY;
|
|
goto error;
|
|
}
|
|
inputCtxt->cd = (iconv_t) -1;
|
|
|
|
icv_in = iconv_open("UTF-8", name);
|
|
if (icv_in == (iconv_t) -1) {
|
|
if (errno == EINVAL)
|
|
ret = XML_ERR_UNSUPPORTED_ENCODING;
|
|
else if (errno == ENOMEM)
|
|
ret = XML_ERR_NO_MEMORY;
|
|
else
|
|
ret = XML_ERR_SYSTEM;
|
|
goto error;
|
|
}
|
|
inputCtxt->cd = icv_in;
|
|
|
|
outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt));
|
|
if (outputCtxt == NULL) {
|
|
ret = XML_ERR_NO_MEMORY;
|
|
goto error;
|
|
}
|
|
outputCtxt->cd = (iconv_t) -1;
|
|
|
|
icv_out = iconv_open(name, "UTF-8");
|
|
if (icv_out == (iconv_t) -1) {
|
|
if (errno == EINVAL)
|
|
ret = XML_ERR_UNSUPPORTED_ENCODING;
|
|
else if (errno == ENOMEM)
|
|
ret = XML_ERR_NO_MEMORY;
|
|
else
|
|
ret = XML_ERR_SYSTEM;
|
|
goto error;
|
|
}
|
|
outputCtxt->cd = icv_out;
|
|
|
|
conv->input = xmlIconvConvert;
|
|
conv->output = xmlIconvConvert;
|
|
conv->ctxtDtor = xmlIconvFree;
|
|
conv->inputCtxt = inputCtxt;
|
|
conv->outputCtxt = outputCtxt;
|
|
|
|
/* Backward compatibility */
|
|
if (handler != NULL) {
|
|
handler->iconv_in = icv_in;
|
|
handler->iconv_out = icv_out;
|
|
}
|
|
|
|
return(XML_ERR_OK);
|
|
|
|
error:
|
|
if (inputCtxt != NULL)
|
|
xmlIconvFree(inputCtxt);
|
|
if (outputCtxt != NULL)
|
|
xmlIconvFree(outputCtxt);
|
|
return(ret);
|
|
}
|
|
#endif /* LIBXML_ICONV_ENABLED */
|
|
|
|
/************************************************************************
|
|
* *
|
|
* ICU based generic conversion functions *
|
|
* *
|
|
************************************************************************/
|
|
|
|
#ifdef LIBXML_ICU_ENABLED
|
|
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
|
|
#define ICU_PIVOT_BUF_SIZE 1024
|
|
|
|
typedef struct _uconv_t xmlUconvCtxt;
|
|
struct _uconv_t {
|
|
UConverter *uconv; /* for conversion between an encoding and UTF-16 */
|
|
UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
|
|
UChar *pivot_source;
|
|
UChar *pivot_target;
|
|
int isInput;
|
|
UChar pivot_buf[ICU_PIVOT_BUF_SIZE];
|
|
};
|
|
|
|
/**
|
|
* xmlUconvConvert:
|
|
* @vctxt: converison context
|
|
* @out: a pointer to an array of bytes to store the result
|
|
* @outlen: the length of @out
|
|
* @in: a pointer to an array of input bytes
|
|
* @inlen: the length of @in
|
|
*
|
|
* Returns an XML_ENC_ERR code.
|
|
*
|
|
* The value of @inlen after return is the number of octets consumed
|
|
* as the return value is positive, else unpredictable.
|
|
* The value of @outlen after return is the number of octets produced.
|
|
*/
|
|
static int
|
|
xmlUconvConvert(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen, void *vctxt) {
|
|
xmlUconvCtxt *cd = vctxt;
|
|
const char *ucv_in = (const char *) in;
|
|
char *ucv_out = (char *) out;
|
|
UConverter *target, *source;
|
|
UErrorCode err = U_ZERO_ERROR;
|
|
int ret;
|
|
|
|
if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
|
|
if (outlen != NULL)
|
|
*outlen = 0;
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
}
|
|
|
|
/*
|
|
* Note that the ICU API is stateful. It can always consume a certain
|
|
* amount of input even if the output buffer would overflow. The
|
|
* remaining input must be processed by calling ucnv_convertEx with a
|
|
* possibly empty input buffer.
|
|
*
|
|
* ucnv_convertEx is always called with reset and flush set to 0,
|
|
* so we don't mess up the state. This should never generate
|
|
* U_TRUNCATED_CHAR_FOUND errors.
|
|
*/
|
|
if (cd->isInput) {
|
|
source = cd->uconv;
|
|
target = cd->utf8;
|
|
} else {
|
|
source = cd->utf8;
|
|
target = cd->uconv;
|
|
}
|
|
|
|
ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
|
|
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
|
|
&cd->pivot_source, &cd->pivot_target,
|
|
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
|
|
|
|
*inlen = ucv_in - (const char*) in;
|
|
*outlen = ucv_out - (char *) out;
|
|
|
|
if (U_SUCCESS(err)) {
|
|
ret = XML_ENC_ERR_SUCCESS;
|
|
} else {
|
|
switch (err) {
|
|
case U_TRUNCATED_CHAR_FOUND:
|
|
/* Shouldn't happen without flush */
|
|
ret = XML_ENC_ERR_SUCCESS;
|
|
break;
|
|
|
|
case U_BUFFER_OVERFLOW_ERROR:
|
|
ret = XML_ENC_ERR_SPACE;
|
|
break;
|
|
|
|
case U_INVALID_CHAR_FOUND:
|
|
case U_ILLEGAL_CHAR_FOUND:
|
|
ret = XML_ENC_ERR_INPUT;
|
|
break;
|
|
|
|
case U_MEMORY_ALLOCATION_ERROR:
|
|
ret = XML_ERR_NO_MEMORY;
|
|
break;
|
|
|
|
default:
|
|
ret = XML_ENC_ERR_INTERNAL;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return(ret);
|
|
}
|
|
|
|
static int
|
|
openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out)
|
|
{
|
|
UErrorCode status;
|
|
xmlUconvCtxt *conv;
|
|
|
|
*out = NULL;
|
|
|
|
conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt));
|
|
if (conv == NULL)
|
|
return(XML_ERR_NO_MEMORY);
|
|
|
|
conv->isInput = isInput;
|
|
conv->pivot_source = conv->pivot_buf;
|
|
conv->pivot_target = conv->pivot_buf;
|
|
|
|
status = U_ZERO_ERROR;
|
|
conv->uconv = ucnv_open(name, &status);
|
|
if (U_FAILURE(status))
|
|
goto error;
|
|
|
|
status = U_ZERO_ERROR;
|
|
if (isInput) {
|
|
ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
|
|
NULL, NULL, NULL, &status);
|
|
}
|
|
else {
|
|
ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
|
|
NULL, NULL, NULL, &status);
|
|
}
|
|
if (U_FAILURE(status))
|
|
goto error;
|
|
|
|
status = U_ZERO_ERROR;
|
|
conv->utf8 = ucnv_open("UTF-8", &status);
|
|
if (U_FAILURE(status))
|
|
goto error;
|
|
|
|
*out = conv;
|
|
return(0);
|
|
|
|
error:
|
|
if (conv->uconv)
|
|
ucnv_close(conv->uconv);
|
|
xmlFree(conv);
|
|
|
|
if (status == U_FILE_ACCESS_ERROR)
|
|
return(XML_ERR_UNSUPPORTED_ENCODING);
|
|
if (status == U_MEMORY_ALLOCATION_ERROR)
|
|
return(XML_ERR_NO_MEMORY);
|
|
return(XML_ERR_SYSTEM);
|
|
}
|
|
|
|
static void
|
|
closeIcuConverter(xmlUconvCtxt *conv)
|
|
{
|
|
if (conv == NULL)
|
|
return;
|
|
ucnv_close(conv->uconv);
|
|
ucnv_close(conv->utf8);
|
|
xmlFree(conv);
|
|
}
|
|
|
|
static void
|
|
xmlUconvFree(void *vctxt) {
|
|
closeIcuConverter(vctxt);
|
|
}
|
|
|
|
static int
|
|
xmlCharEncUconv(void *vctxt ATTRIBUTE_UNUSED, const char *name,
|
|
xmlCharEncConverter *conv) {
|
|
xmlUconvCtxt *ucv_in = NULL;
|
|
xmlUconvCtxt *ucv_out = NULL;
|
|
int ret;
|
|
|
|
ret = openIcuConverter(name, 1, &ucv_in);
|
|
if (ret != 0)
|
|
goto error;
|
|
ret = openIcuConverter(name, 0, &ucv_out);
|
|
if (ret != 0)
|
|
goto error;
|
|
|
|
conv->input = xmlUconvConvert;
|
|
conv->output = xmlUconvConvert;
|
|
conv->ctxtDtor = xmlUconvFree;
|
|
conv->inputCtxt = ucv_in;
|
|
conv->outputCtxt = ucv_out;
|
|
|
|
return(XML_ERR_OK);
|
|
|
|
error:
|
|
if (ucv_in != NULL)
|
|
closeIcuConverter(ucv_in);
|
|
if (ucv_out != NULL)
|
|
closeIcuConverter(ucv_out);
|
|
return(ret);
|
|
}
|
|
#endif /* LIBXML_ICU_ENABLED */
|
|
|
|
/************************************************************************
|
|
* *
|
|
* The real API used by libxml for on-the-fly conversion *
|
|
* *
|
|
************************************************************************/
|
|
|
|
/**
|
|
* xmlEncConvertError:
|
|
* @code: XML_ENC_ERR code
|
|
*
|
|
* Convert XML_ENC_ERR to libxml2 error codes.
|
|
*/
|
|
static int
|
|
xmlEncConvertError(int code) {
|
|
int ret;
|
|
|
|
switch (code) {
|
|
case XML_ENC_ERR_SUCCESS:
|
|
ret = XML_ERR_OK;
|
|
break;
|
|
case XML_ENC_ERR_INPUT:
|
|
ret = XML_ERR_INVALID_ENCODING;
|
|
break;
|
|
case XML_ENC_ERR_MEMORY:
|
|
ret = XML_ERR_NO_MEMORY;
|
|
break;
|
|
default:
|
|
ret = XML_ERR_INTERNAL_ERROR;
|
|
break;
|
|
}
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlEncInputChunk:
|
|
* @handler: encoding handler
|
|
* @out: a pointer to an array of bytes to store the result
|
|
* @outlen: the length of @out
|
|
* @in: a pointer to an array of input bytes
|
|
* @inlen: the length of @in
|
|
*
|
|
* The value of @inlen after return is the number of octets consumed
|
|
* as the return value is 0, else unpredictable.
|
|
* The value of @outlen after return is the number of octets produced.
|
|
*
|
|
* Returns an XML_ENC_ERR code.
|
|
*/
|
|
int
|
|
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
|
int *outlen, const unsigned char *in, int *inlen) {
|
|
int ret;
|
|
|
|
if (handler->input != NULL) {
|
|
xmlCharEncConvFunc conv =
|
|
(xmlCharEncConvFunc) (void (*)(void)) handler->input;
|
|
|
|
ret = conv(out, outlen, in, inlen, handler->inputCtxt);
|
|
if (ret > 0)
|
|
ret = XML_ENC_ERR_SUCCESS;
|
|
}
|
|
else {
|
|
*outlen = 0;
|
|
*inlen = 0;
|
|
ret = XML_ENC_ERR_INTERNAL;
|
|
}
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlEncOutputChunk:
|
|
* @handler: encoding handler
|
|
* @out: a pointer to an array of bytes to store the result
|
|
* @outlen: the length of @out
|
|
* @in: a pointer to an array of input bytes
|
|
* @inlen: the length of @in
|
|
*
|
|
* Returns an XML_ENC_ERR code.
|
|
*
|
|
* The value of @inlen after return is the number of octets consumed
|
|
* as the return value is 0, else unpredictable.
|
|
* The value of @outlen after return is the number of octets produced.
|
|
*/
|
|
static int
|
|
xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
|
int *outlen, const unsigned char *in, int *inlen) {
|
|
int ret;
|
|
|
|
if (handler->output != NULL) {
|
|
xmlCharEncConvFunc conv =
|
|
(xmlCharEncConvFunc) (void (*)(void)) handler->output;
|
|
|
|
ret = conv(out, outlen, in, inlen, handler->outputCtxt);
|
|
if (ret > 0)
|
|
ret = XML_ENC_ERR_SUCCESS;
|
|
}
|
|
else {
|
|
*outlen = 0;
|
|
*inlen = 0;
|
|
ret = XML_ENC_ERR_INTERNAL;
|
|
}
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlCharEncFirstLine:
|
|
* @handler: char encoding transformation data structure
|
|
* @out: an xmlBuffer for the output.
|
|
* @in: an xmlBuffer for the input
|
|
*
|
|
* DEPERECATED: Don't use.
|
|
*
|
|
* Returns the number of bytes written or an XML_ENC_ERR code.
|
|
*/
|
|
int
|
|
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
|
xmlBufferPtr in) {
|
|
return(xmlCharEncInFunc(handler, out, in));
|
|
}
|
|
|
|
/**
|
|
* xmlCharEncInput:
|
|
* @input: a parser input buffer
|
|
*
|
|
* Generic front-end for the encoding handler on parser input
|
|
*
|
|
* Returns the number of bytes written or an XML_ENC_ERR code.
|
|
*/
|
|
int
|
|
xmlCharEncInput(xmlParserInputBufferPtr input)
|
|
{
|
|
int ret;
|
|
size_t avail;
|
|
size_t toconv;
|
|
int c_in;
|
|
int c_out;
|
|
xmlBufPtr in;
|
|
xmlBufPtr out;
|
|
const xmlChar *inData;
|
|
size_t inTotal = 0;
|
|
|
|
if ((input == NULL) || (input->encoder == NULL) ||
|
|
(input->buffer == NULL) || (input->raw == NULL))
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
out = input->buffer;
|
|
in = input->raw;
|
|
|
|
toconv = xmlBufUse(in);
|
|
if (toconv == 0)
|
|
return (0);
|
|
inData = xmlBufContent(in);
|
|
inTotal = 0;
|
|
|
|
do {
|
|
c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv;
|
|
|
|
avail = xmlBufAvail(out);
|
|
if (avail > INT_MAX)
|
|
avail = INT_MAX;
|
|
if (avail < 4096) {
|
|
if (xmlBufGrow(out, 4096) < 0) {
|
|
input->error = XML_ERR_NO_MEMORY;
|
|
return(XML_ENC_ERR_MEMORY);
|
|
}
|
|
avail = xmlBufAvail(out);
|
|
}
|
|
|
|
c_in = toconv;
|
|
c_out = avail;
|
|
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
|
|
inData, &c_in);
|
|
inTotal += c_in;
|
|
inData += c_in;
|
|
toconv -= c_in;
|
|
xmlBufAddLen(out, c_out);
|
|
} while (ret == XML_ENC_ERR_SPACE);
|
|
|
|
xmlBufShrink(in, inTotal);
|
|
|
|
if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in)
|
|
input->rawconsumed = ULONG_MAX;
|
|
else
|
|
input->rawconsumed += c_in;
|
|
|
|
if (((ret != 0) && (c_out == 0)) ||
|
|
(ret == XML_ENC_ERR_MEMORY)) {
|
|
if (input->error == 0)
|
|
input->error = xmlEncConvertError(ret);
|
|
return(ret);
|
|
}
|
|
|
|
return (c_out);
|
|
}
|
|
|
|
/**
|
|
* xmlCharEncInFunc:
|
|
* @handler: char encoding transformation data structure
|
|
* @out: an xmlBuffer for the output.
|
|
* @in: an xmlBuffer for the input
|
|
*
|
|
* Generic front-end for the encoding handler input function
|
|
*
|
|
* Returns the number of bytes written or an XML_ENC_ERR code.
|
|
*/
|
|
int
|
|
xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
|
|
xmlBufferPtr in)
|
|
{
|
|
int ret;
|
|
int written;
|
|
int toconv;
|
|
|
|
if (handler == NULL)
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
if (out == NULL)
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
if (in == NULL)
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
|
|
toconv = in->use;
|
|
if (toconv == 0)
|
|
return (0);
|
|
written = out->size - out->use -1; /* count '\0' */
|
|
if (toconv * 2 >= written) {
|
|
xmlBufferGrow(out, out->size + toconv * 2);
|
|
written = out->size - out->use - 1;
|
|
}
|
|
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
|
|
in->content, &toconv);
|
|
xmlBufferShrink(in, toconv);
|
|
out->use += written;
|
|
out->content[out->use] = 0;
|
|
|
|
return (written? written : ret);
|
|
}
|
|
|
|
#ifdef LIBXML_OUTPUT_ENABLED
|
|
/**
|
|
* xmlCharEncOutput:
|
|
* @output: a parser output buffer
|
|
* @init: is this an initialization call without data
|
|
*
|
|
* Generic front-end for the encoding handler on parser output
|
|
* a first call with @init == 1 has to be made first to initiate the
|
|
* output in case of non-stateless encoding needing to initiate their
|
|
* state or the output (like the BOM in UTF16).
|
|
* In case of UTF8 sequence conversion errors for the given encoder,
|
|
* the content will be automatically remapped to a CharRef sequence.
|
|
*
|
|
* Returns the number of bytes written or an XML_ENC_ERR code.
|
|
*/
|
|
int
|
|
xmlCharEncOutput(xmlOutputBufferPtr output, int init)
|
|
{
|
|
int ret;
|
|
size_t written;
|
|
int writtentot = 0;
|
|
size_t toconv;
|
|
int c_in;
|
|
int c_out;
|
|
xmlBufPtr in;
|
|
xmlBufPtr out;
|
|
|
|
if ((output == NULL) || (output->encoder == NULL) ||
|
|
(output->buffer == NULL) || (output->conv == NULL))
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
out = output->conv;
|
|
in = output->buffer;
|
|
|
|
retry:
|
|
|
|
written = xmlBufAvail(out);
|
|
|
|
/*
|
|
* First specific handling of the initialization call
|
|
*/
|
|
if (init) {
|
|
c_in = 0;
|
|
c_out = written;
|
|
/* TODO: Check return value. */
|
|
xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
|
|
NULL, &c_in);
|
|
xmlBufAddLen(out, c_out);
|
|
return(c_out);
|
|
}
|
|
|
|
/*
|
|
* Conversion itself.
|
|
*/
|
|
toconv = xmlBufUse(in);
|
|
if (toconv > 64 * 1024)
|
|
toconv = 64 * 1024;
|
|
if (toconv * 4 >= written) {
|
|
if (xmlBufGrow(out, toconv * 4) < 0) {
|
|
ret = XML_ENC_ERR_MEMORY;
|
|
goto error;
|
|
}
|
|
written = xmlBufAvail(out);
|
|
}
|
|
if (written > 256 * 1024)
|
|
written = 256 * 1024;
|
|
|
|
c_in = toconv;
|
|
c_out = written;
|
|
ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
|
|
xmlBufContent(in), &c_in);
|
|
xmlBufShrink(in, c_in);
|
|
xmlBufAddLen(out, c_out);
|
|
writtentot += c_out;
|
|
|
|
if (ret == XML_ENC_ERR_SPACE)
|
|
goto retry;
|
|
|
|
/*
|
|
* Attempt to handle error cases
|
|
*/
|
|
if (ret == XML_ENC_ERR_INPUT) {
|
|
xmlChar charref[20];
|
|
int len = xmlBufUse(in);
|
|
xmlChar *content = xmlBufContent(in);
|
|
int cur, charrefLen;
|
|
|
|
cur = xmlGetUTF8Char(content, &len);
|
|
if (cur <= 0)
|
|
goto error;
|
|
|
|
/*
|
|
* Removes the UTF8 sequence, and replace it by a charref
|
|
* and continue the transcoding phase, hoping the error
|
|
* did not mangle the encoder state.
|
|
*/
|
|
charrefLen = snprintf((char *) &charref[0], sizeof(charref),
|
|
"&#%d;", cur);
|
|
xmlBufGrow(out, charrefLen * 4);
|
|
c_out = xmlBufAvail(out);
|
|
c_in = charrefLen;
|
|
ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
|
|
charref, &c_in);
|
|
if ((ret < 0) || (c_in != charrefLen)) {
|
|
ret = XML_ENC_ERR_INTERNAL;
|
|
goto error;
|
|
}
|
|
|
|
xmlBufShrink(in, len);
|
|
xmlBufAddLen(out, c_out);
|
|
writtentot += c_out;
|
|
goto retry;
|
|
}
|
|
|
|
error:
|
|
if (((writtentot <= 0) && (ret != 0)) ||
|
|
(ret == XML_ENC_ERR_MEMORY)) {
|
|
if (output->error == 0)
|
|
output->error = xmlEncConvertError(ret);
|
|
return(ret);
|
|
}
|
|
|
|
return(writtentot);
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* xmlCharEncOutFunc:
|
|
* @handler: char encoding transformation data structure
|
|
* @out: an xmlBuffer for the output.
|
|
* @in: an xmlBuffer for the input
|
|
*
|
|
* Generic front-end for the encoding handler output function
|
|
* a first call with @in == NULL has to be made firs to initiate the
|
|
* output in case of non-stateless encoding needing to initiate their
|
|
* state or the output (like the BOM in UTF16).
|
|
* In case of UTF8 sequence conversion errors for the given encoder,
|
|
* the content will be automatically remapped to a CharRef sequence.
|
|
*
|
|
* Returns the number of bytes written or an XML_ENC_ERR code.
|
|
*/
|
|
int
|
|
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
|
xmlBufferPtr in) {
|
|
int ret;
|
|
int written;
|
|
int writtentot = 0;
|
|
int toconv;
|
|
|
|
if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
|
|
if (out == NULL) return(XML_ENC_ERR_INTERNAL);
|
|
|
|
retry:
|
|
|
|
written = out->size - out->use;
|
|
|
|
if (written > 0)
|
|
written--; /* Gennady: count '/0' */
|
|
|
|
/*
|
|
* First specific handling of in = NULL, i.e. the initialization call
|
|
*/
|
|
if (in == NULL) {
|
|
toconv = 0;
|
|
/* TODO: Check return value. */
|
|
xmlEncOutputChunk(handler, &out->content[out->use], &written,
|
|
NULL, &toconv);
|
|
out->use += written;
|
|
out->content[out->use] = 0;
|
|
return(0);
|
|
}
|
|
|
|
/*
|
|
* Conversion itself.
|
|
*/
|
|
toconv = in->use;
|
|
if (toconv * 4 >= written) {
|
|
xmlBufferGrow(out, toconv * 4);
|
|
written = out->size - out->use - 1;
|
|
}
|
|
ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
|
|
in->content, &toconv);
|
|
xmlBufferShrink(in, toconv);
|
|
out->use += written;
|
|
writtentot += written;
|
|
out->content[out->use] = 0;
|
|
|
|
if (ret == XML_ENC_ERR_SPACE)
|
|
goto retry;
|
|
|
|
/*
|
|
* Attempt to handle error cases
|
|
*/
|
|
if (ret == XML_ENC_ERR_INPUT) {
|
|
xmlChar charref[20];
|
|
int len = in->use;
|
|
const xmlChar *utf = (const xmlChar *) in->content;
|
|
int cur, charrefLen;
|
|
|
|
cur = xmlGetUTF8Char(utf, &len);
|
|
if (cur <= 0)
|
|
return(ret);
|
|
|
|
/*
|
|
* Removes the UTF8 sequence, and replace it by a charref
|
|
* and continue the transcoding phase, hoping the error
|
|
* did not mangle the encoder state.
|
|
*/
|
|
charrefLen = snprintf((char *) &charref[0], sizeof(charref),
|
|
"&#%d;", cur);
|
|
xmlBufferShrink(in, len);
|
|
xmlBufferGrow(out, charrefLen * 4);
|
|
written = out->size - out->use - 1;
|
|
toconv = charrefLen;
|
|
ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
|
|
charref, &toconv);
|
|
if ((ret < 0) || (toconv != charrefLen))
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
|
|
out->use += written;
|
|
writtentot += written;
|
|
out->content[out->use] = 0;
|
|
goto retry;
|
|
}
|
|
return(writtentot ? writtentot : ret);
|
|
}
|
|
|
|
/**
|
|
* xmlCharEncCloseFunc:
|
|
* @handler: char encoding transformation data structure
|
|
*
|
|
* Releases an xmlCharEncodingHandler. Must be called after
|
|
* a handler is no longer in use.
|
|
*
|
|
* Returns 0.
|
|
*/
|
|
int
|
|
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
|
|
if (handler == NULL)
|
|
return(0);
|
|
|
|
if (handler->flags & XML_HANDLER_STATIC)
|
|
return(0);
|
|
|
|
xmlFree(handler->name);
|
|
if (handler->ctxtDtor != NULL) {
|
|
handler->ctxtDtor(handler->inputCtxt);
|
|
handler->ctxtDtor(handler->outputCtxt);
|
|
}
|
|
xmlFree(handler);
|
|
return(0);
|
|
}
|
|
|
|
/**
|
|
* xmlByteConsumed:
|
|
* @ctxt: an XML parser context
|
|
*
|
|
* DEPRECATED: Don't use.
|
|
*
|
|
* This function provides the current index of the parser relative
|
|
* to the start of the current entity. This function is computed in
|
|
* bytes from the beginning starting at zero and finishing at the
|
|
* size in byte of the file if parsing a file. The function is
|
|
* of constant cost if the input is UTF-8 but can be costly if run
|
|
* on non-UTF-8 input.
|
|
*
|
|
* Returns the index in bytes from the beginning of the entity or -1
|
|
* in case the index could not be computed.
|
|
*/
|
|
long
|
|
xmlByteConsumed(xmlParserCtxtPtr ctxt) {
|
|
xmlParserInputPtr in;
|
|
|
|
if (ctxt == NULL)
|
|
return(-1);
|
|
in = ctxt->input;
|
|
if (in == NULL)
|
|
return(-1);
|
|
|
|
if ((in->buf != NULL) && (in->buf->encoder != NULL)) {
|
|
int unused = 0;
|
|
xmlCharEncodingHandler * handler = in->buf->encoder;
|
|
|
|
/*
|
|
* Encoding conversion, compute the number of unused original
|
|
* bytes from the input not consumed and subtract that from
|
|
* the raw consumed value, this is not a cheap operation
|
|
*/
|
|
if (in->end - in->cur > 0) {
|
|
unsigned char *convbuf;
|
|
const unsigned char *cur = (const unsigned char *)in->cur;
|
|
int toconv, ret;
|
|
|
|
convbuf = xmlMalloc(32000);
|
|
if (convbuf == NULL)
|
|
return(-1);
|
|
|
|
toconv = in->end - cur;
|
|
unused = 32000;
|
|
ret = xmlEncOutputChunk(handler, convbuf, &unused, cur, &toconv);
|
|
|
|
xmlFree(convbuf);
|
|
|
|
if (ret != XML_ENC_ERR_SUCCESS)
|
|
return(-1);
|
|
}
|
|
|
|
if (in->buf->rawconsumed < (unsigned long) unused)
|
|
return(-1);
|
|
return(in->buf->rawconsumed - unused);
|
|
}
|
|
|
|
return(in->consumed + (in->cur - in->base));
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Conversions To/From UTF8 encoding *
|
|
* *
|
|
************************************************************************/
|
|
|
|
static int
|
|
asciiToAscii(unsigned char* out, int *poutlen,
|
|
const unsigned char* in, int *pinlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
const unsigned char *inend;
|
|
const unsigned char *instart = in;
|
|
int inlen, outlen, ret;
|
|
|
|
if (in == NULL) {
|
|
*pinlen = 0;
|
|
*poutlen = 0;
|
|
return(XML_ENC_ERR_SUCCESS);
|
|
}
|
|
|
|
inlen = *pinlen;
|
|
outlen = *poutlen;
|
|
|
|
if (outlen < inlen) {
|
|
inlen = outlen;
|
|
ret = XML_ENC_ERR_SPACE;
|
|
} else {
|
|
ret = inlen;
|
|
}
|
|
|
|
inend = in + inlen;
|
|
*poutlen = inlen;
|
|
*pinlen = inlen;
|
|
|
|
while (in < inend) {
|
|
unsigned c = *in;
|
|
|
|
if (c >= 0x80) {
|
|
*poutlen = in - instart;
|
|
*pinlen = in - instart;
|
|
return(XML_ENC_ERR_INPUT);
|
|
}
|
|
|
|
in++;
|
|
*out++ = c;
|
|
}
|
|
|
|
return(ret);
|
|
}
|
|
|
|
static int
|
|
latin1ToUTF8(unsigned char* out, int *outlen,
|
|
const unsigned char* in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
unsigned char* outstart = out;
|
|
const unsigned char* instart = in;
|
|
unsigned char* outend;
|
|
const unsigned char* inend;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
|
|
outend = out + *outlen;
|
|
inend = in + *inlen;
|
|
|
|
while (in < inend) {
|
|
unsigned c = *in;
|
|
|
|
if (c < 0x80) {
|
|
if (out >= outend)
|
|
goto done;
|
|
*out++ = c;
|
|
} else {
|
|
if (outend - out < 2)
|
|
goto done;
|
|
*out++ = (c >> 6) | 0xC0;
|
|
*out++ = (c & 0x3F) | 0x80;
|
|
}
|
|
|
|
in++;
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
|
|
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.
 *
 * Public entry point; the work is done by latin1ToUTF8.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 *
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    int res = latin1ToUTF8(out, outlen, in, inlen, NULL);

    return(res);
}
|
|
|
|
static int
|
|
UTF8ToUTF8(unsigned char* out, int *outlen,
|
|
const unsigned char* in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
int len;
|
|
int ret;
|
|
|
|
if (in == NULL) {
|
|
*inlen = 0;
|
|
*outlen = 0;
|
|
return(XML_ENC_ERR_SUCCESS);
|
|
}
|
|
|
|
if (*outlen < *inlen) {
|
|
len = *outlen;
|
|
ret = XML_ENC_ERR_SPACE;
|
|
} else {
|
|
len = *inlen;
|
|
ret = len;
|
|
}
|
|
|
|
memcpy(out, in, len);
|
|
|
|
*outlen = len;
|
|
*inlen = len;
|
|
return(ret);
|
|
}
|
|
|
|
|
|
#ifdef LIBXML_OUTPUT_ENABLED
|
|
static int
|
|
UTF8ToLatin1(unsigned char* out, int *outlen,
|
|
const unsigned char* in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
const unsigned char* outend;
|
|
const unsigned char* outstart = out;
|
|
const unsigned char* instart = in;
|
|
const unsigned char* inend;
|
|
unsigned c;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
|
|
if (in == NULL) {
|
|
*inlen = 0;
|
|
*outlen = 0;
|
|
return(XML_ENC_ERR_SUCCESS);
|
|
}
|
|
|
|
inend = in + *inlen;
|
|
outend = out + *outlen;
|
|
while (in < inend) {
|
|
if (out >= outend)
|
|
goto done;
|
|
|
|
c = *in;
|
|
|
|
if (c < 0x80) {
|
|
*out++ = c;
|
|
} else if ((c >= 0xC2) && (c <= 0xC3)) {
|
|
if (inend - in < 2)
|
|
break;
|
|
in++;
|
|
*out++ = (unsigned char) ((c << 6) | (*in & 0x3F));
|
|
} else {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
|
|
in++;
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* UTF8Toisolat1:
|
|
* @out: a pointer to an array of bytes to store the result
|
|
* @outlen: the length of @out
|
|
* @in: a pointer to an array of UTF-8 chars
|
|
* @inlen: the length of @in
|
|
*
|
|
* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
|
|
* block of chars out.
|
|
*
|
|
* Returns the number of bytes written or an XML_ENC_ERR code.
|
|
*
|
|
* The value of @inlen after return is the number of octets consumed
|
|
* if the return value is positive, else unpredictable.
|
|
* The value of @outlen after return is the number of octets produced.
|
|
*/
|
|
int
|
|
UTF8Toisolat1(unsigned char* out, int *outlen,
|
|
const unsigned char* in, int *inlen) {
|
|
if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL))
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
|
|
return(UTF8ToLatin1(out, outlen, in, inlen, NULL));
|
|
}
|
|
#endif /* LIBXML_OUTPUT_ENABLED */
|
|
|
|
static int
|
|
UTF16LEToUTF8(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
const unsigned char *instart = in;
|
|
const unsigned char *inend = in + (*inlen & ~1);
|
|
unsigned char *outstart = out;
|
|
unsigned char *outend = out + *outlen;
|
|
unsigned c, d;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
while (in < inend) {
|
|
c = in[0] | (in[1] << 8);
|
|
|
|
if (c < 0x80) {
|
|
if (out >= outend)
|
|
goto done;
|
|
out[0] = c;
|
|
in += 2;
|
|
out += 1;
|
|
} else if (c < 0x800) {
|
|
if (outend - out < 2)
|
|
goto done;
|
|
out[0] = (c >> 6) | 0xC0;
|
|
out[1] = (c & 0x3F) | 0x80;
|
|
in += 2;
|
|
out += 2;
|
|
} else if ((c & 0xF800) != 0xD800) {
|
|
if (outend - out < 3)
|
|
goto done;
|
|
out[0] = (c >> 12) | 0xE0;
|
|
out[1] = ((c >> 6) & 0x3F) | 0x80;
|
|
out[2] = (c & 0x3F) | 0x80;
|
|
in += 2;
|
|
out += 3;
|
|
} else {
|
|
/* Surrogate pair */
|
|
if ((c & 0xFC00) != 0xD800) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
if (inend - in < 4)
|
|
break;
|
|
d = in[2] | (in[3] << 8);
|
|
if ((d & 0xFC00) != 0xDC00) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
if (outend - out < 4)
|
|
goto done;
|
|
c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
|
|
out[0] = (c >> 18) | 0xF0;
|
|
out[1] = ((c >> 12) & 0x3F) | 0x80;
|
|
out[2] = ((c >> 6) & 0x3F) | 0x80;
|
|
out[3] = (c & 0x3F) | 0x80;
|
|
in += 4;
|
|
out += 4;
|
|
}
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
|
|
#ifdef LIBXML_OUTPUT_ENABLED
|
|
static int
|
|
UTF8ToUTF16LE(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
const unsigned char *instart = in;
|
|
const unsigned char *inend;
|
|
unsigned char *outstart = out;
|
|
unsigned char *outend;
|
|
unsigned c, d;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
/* UTF16LE encoding has no BOM */
|
|
if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
|
|
return(XML_ENC_ERR_INTERNAL);
|
|
if (in == NULL) {
|
|
*outlen = 0;
|
|
*inlen = 0;
|
|
return(0);
|
|
}
|
|
inend = in + *inlen;
|
|
outend = out + (*outlen & ~1);
|
|
while (in < inend) {
|
|
c = in[0];
|
|
|
|
if (c < 0x80) {
|
|
if (out >= outend)
|
|
goto done;
|
|
out[0] = c;
|
|
out[1] = 0;
|
|
in += 1;
|
|
out += 2;
|
|
} else {
|
|
int i, len;
|
|
unsigned min;
|
|
|
|
if (c < 0xE0) {
|
|
if (c < 0xC2) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
c &= 0x1F;
|
|
len = 2;
|
|
min = 0x80;
|
|
} else if (c < 0xF0) {
|
|
c &= 0x0F;
|
|
len = 3;
|
|
min = 0x800;
|
|
} else {
|
|
c &= 0x0F;
|
|
len = 4;
|
|
min = 0x10000;
|
|
}
|
|
|
|
if (inend - in < len)
|
|
break;
|
|
|
|
for (i = 1; i < len; i++) {
|
|
if ((in[i] & 0xC0) != 0x80) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
c = (c << 6) | (in[i] & 0x3F);
|
|
}
|
|
|
|
if ((c < min) ||
|
|
((c >= 0xD800) && (c <= 0xDFFF)) ||
|
|
(c > 0x10FFFF)) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
|
|
if (c < 0x10000) {
|
|
if (out >= outend)
|
|
goto done;
|
|
out[0] = c & 0xFF;
|
|
out[1] = c >> 8;
|
|
out += 2;
|
|
} else {
|
|
if (outend - out < 4)
|
|
goto done;
|
|
c -= 0x10000;
|
|
d = (c & 0x03FF) | 0xDC00;
|
|
c = (c >> 10) | 0xD800;
|
|
out[0] = c & 0xFF;
|
|
out[1] = c >> 8;
|
|
out[2] = d & 0xFF;
|
|
out[3] = d >> 8;
|
|
out += 4;
|
|
}
|
|
|
|
in += len;
|
|
}
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
|
|
static int
|
|
UTF8ToUTF16(unsigned char* outb, int *outlen,
|
|
const unsigned char* in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
if (in == NULL) {
|
|
/*
|
|
* initialization, add the Byte Order Mark for UTF-16LE
|
|
*/
|
|
if (*outlen >= 2) {
|
|
outb[0] = 0xFF;
|
|
outb[1] = 0xFE;
|
|
*outlen = 2;
|
|
*inlen = 0;
|
|
return(2);
|
|
}
|
|
*outlen = 0;
|
|
*inlen = 0;
|
|
return(0);
|
|
}
|
|
return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));
|
|
}
|
|
#endif /* LIBXML_OUTPUT_ENABLED */
|
|
|
|
static int
|
|
UTF16BEToUTF8(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
const unsigned char *instart = in;
|
|
const unsigned char *inend = in + (*inlen & ~1);
|
|
unsigned char *outstart = out;
|
|
unsigned char *outend = out + *outlen;
|
|
unsigned c, d;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
while (in < inend) {
|
|
c = (in[0] << 8) | in[1];
|
|
|
|
if (c < 0x80) {
|
|
if (out >= outend)
|
|
goto done;
|
|
out[0] = c;
|
|
in += 2;
|
|
out += 1;
|
|
} else if (c < 0x800) {
|
|
if (outend - out < 2)
|
|
goto done;
|
|
out[0] = (c >> 6) | 0xC0;
|
|
out[1] = (c & 0x3F) | 0x80;
|
|
in += 2;
|
|
out += 2;
|
|
} else if ((c & 0xF800) != 0xD800) {
|
|
if (outend - out < 3)
|
|
goto done;
|
|
out[0] = (c >> 12) | 0xE0;
|
|
out[1] = ((c >> 6) & 0x3F) | 0x80;
|
|
out[2] = (c & 0x3F) | 0x80;
|
|
in += 2;
|
|
out += 3;
|
|
} else {
|
|
/* Surrogate pair */
|
|
if ((c & 0xFC00) != 0xD800) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
if (inend - in < 4)
|
|
break;
|
|
d = (in[2] << 8) | in[3];
|
|
if ((d & 0xFC00) != 0xDC00) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
if (outend - out < 4)
|
|
goto done;
|
|
c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
|
|
out[0] = (c >> 18) | 0xF0;
|
|
out[1] = ((c >> 12) & 0x3F) | 0x80;
|
|
out[2] = ((c >> 6) & 0x3F) | 0x80;
|
|
out[3] = (c & 0x3F) | 0x80;
|
|
in += 4;
|
|
out += 4;
|
|
}
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
|
|
#ifdef LIBXML_OUTPUT_ENABLED
|
|
static int
|
|
UTF8ToUTF16BE(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
const unsigned char *instart = in;
|
|
const unsigned char *inend;
|
|
unsigned char *outstart = out;
|
|
unsigned char *outend;
|
|
unsigned c, d;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
/* UTF-16BE has no BOM */
|
|
if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
|
|
if (in == NULL) {
|
|
*outlen = 0;
|
|
*inlen = 0;
|
|
return(0);
|
|
}
|
|
inend = in + *inlen;
|
|
outend = out + (*outlen & ~1);
|
|
while (in < inend) {
|
|
c = in[0];
|
|
|
|
if (c < 0x80) {
|
|
if (out >= outend)
|
|
goto done;
|
|
out[0] = 0;
|
|
out[1] = c;
|
|
in += 1;
|
|
out += 2;
|
|
} else {
|
|
int i, len;
|
|
unsigned min;
|
|
|
|
if (c < 0xE0) {
|
|
if (c < 0xC2) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
c &= 0x1F;
|
|
len = 2;
|
|
min = 0x80;
|
|
} else if (c < 0xF0) {
|
|
c &= 0x0F;
|
|
len = 3;
|
|
min = 0x800;
|
|
} else {
|
|
c &= 0x0F;
|
|
len = 4;
|
|
min = 0x10000;
|
|
}
|
|
|
|
if (inend - in < len)
|
|
break;
|
|
|
|
for (i = 1; i < len; i++) {
|
|
if ((in[i] & 0xC0) != 0x80) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
c = (c << 6) | (in[i] & 0x3F);
|
|
}
|
|
|
|
if ((c < min) ||
|
|
((c >= 0xD800) && (c <= 0xDFFF)) ||
|
|
(c > 0x10FFFF)) {
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
|
|
if (c < 0x10000) {
|
|
if (out >= outend)
|
|
goto done;
|
|
out[0] = c >> 8;
|
|
out[1] = c & 0xFF;
|
|
out += 2;
|
|
} else {
|
|
if (outend - out < 4)
|
|
goto done;
|
|
c -= 0x10000;
|
|
d = (c & 0x03FF) | 0xDC00;
|
|
c = (c >> 10) | 0xD800;
|
|
out[0] = c >> 8;
|
|
out[1] = c & 0xFF;
|
|
out[2] = d >> 8;
|
|
out[3] = d & 0xFF;
|
|
out += 4;
|
|
}
|
|
|
|
in += len;
|
|
}
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
#endif /* LIBXML_OUTPUT_ENABLED */
|
|
|
|
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
|
|
static int
|
|
UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen,
|
|
void *vctxt ATTRIBUTE_UNUSED) {
|
|
return(UTF8ToHtml(out, outlen, in, inlen));
|
|
}
|
|
#endif
|
|
|
|
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
|
|
defined(LIBXML_ISO8859X_ENABLED)
|
|
|
|
static int
|
|
UTF8ToISO8859x(unsigned char *out, int *outlen,
|
|
const unsigned char *in, int *inlen, void *vctxt) {
|
|
const unsigned char *xlattable = vctxt;
|
|
const unsigned char *instart = in;
|
|
const unsigned char *inend;
|
|
unsigned char *outstart = out;
|
|
unsigned char *outend;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
if (in == NULL) {
|
|
/*
|
|
* initialization nothing to do
|
|
*/
|
|
*outlen = 0;
|
|
*inlen = 0;
|
|
return(XML_ENC_ERR_SUCCESS);
|
|
}
|
|
|
|
inend = in + *inlen;
|
|
outend = out + *outlen;
|
|
while (in < inend) {
|
|
unsigned d = *in;
|
|
|
|
if (d < 0x80) {
|
|
if (out >= outend)
|
|
goto done;
|
|
in += 1;
|
|
} else if (d < 0xE0) {
|
|
unsigned c;
|
|
|
|
if (inend - in < 2)
|
|
break;
|
|
c = in[1] & 0x3F;
|
|
d = d & 0x1F;
|
|
d = xlattable [48 + c + xlattable [d] * 64];
|
|
if (d == 0) {
|
|
/* not in character set */
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
if (out >= outend)
|
|
goto done;
|
|
in += 2;
|
|
} else if (d < 0xF0) {
|
|
unsigned c1;
|
|
unsigned c2;
|
|
|
|
if (inend - in < 3)
|
|
break;
|
|
c1 = in[1] & 0x3F;
|
|
c2 = in[2] & 0x3F;
|
|
d = d & 0x0F;
|
|
d = xlattable [48 + c2 + xlattable [48 + c1 +
|
|
xlattable [32 + d] * 64] * 64];
|
|
if (d == 0) {
|
|
/* not in character set */
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
if (out >= outend)
|
|
goto done;
|
|
in += 3;
|
|
} else {
|
|
/* cannot transcode >= U+010000 */
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
|
|
*out++ = d;
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
|
|
static int
|
|
ISO8859xToUTF8(unsigned char* out, int *outlen,
|
|
const unsigned char* in, int *inlen, void *vctxt) {
|
|
unsigned short const *unicodetable = vctxt;
|
|
const unsigned char* instart = in;
|
|
const unsigned char* inend;
|
|
unsigned char* outstart = out;
|
|
unsigned char* outend;
|
|
int ret = XML_ENC_ERR_SPACE;
|
|
|
|
outend = out + *outlen;
|
|
inend = in + *inlen;
|
|
|
|
while (in < inend) {
|
|
unsigned c = *in;
|
|
|
|
if (c < 0x80) {
|
|
if (out >= outend)
|
|
goto done;
|
|
*out++ = c;
|
|
} else {
|
|
c = unicodetable[c - 0x80];
|
|
if (c == 0) {
|
|
/* undefined code point */
|
|
ret = XML_ENC_ERR_INPUT;
|
|
goto done;
|
|
}
|
|
if (c < 0x800) {
|
|
if (outend - out < 2)
|
|
goto done;
|
|
*out++ = ((c >> 6) & 0x1F) | 0xC0;
|
|
*out++ = (c & 0x3F) | 0x80;
|
|
} else {
|
|
if (outend - out < 3)
|
|
goto done;
|
|
*out++ = ((c >> 12) & 0x0F) | 0xE0;
|
|
*out++ = ((c >> 6) & 0x3F) | 0x80;
|
|
*out++ = (c & 0x3F) | 0x80;
|
|
}
|
|
}
|
|
|
|
in += 1;
|
|
}
|
|
|
|
ret = out - outstart;
|
|
|
|
done:
|
|
*outlen = out - outstart;
|
|
*inlen = in - instart;
|
|
return(ret);
|
|
}
|
|
|
|
#endif
|
|
|