1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00
libxml2/encoding.c
Nick Wellnhofer 34c9108f15 encoding: Add sizeOut argument to xmlCharEncInput
When push parsing, we want to convert as much of the input as possible.
When pull parsing memory buffers, we want to convert data chunk by chunk
to save memory.
2024-07-16 17:42:10 +02:00

2725 lines
73 KiB
C

/*
* encoding.c : implements the encoding conversion functions needed for XML
*
* Related specs:
* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
* [ISO-10646] UTF-8 and UTF-16 in Annexes
* [ISO-8859-1] ISO Latin-1 characters codes.
* [UNICODE] The Unicode Consortium, "The Unicode Standard --
* Worldwide Character Encoding -- Version 1.0", Addison-
* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
* described in Unicode Technical Report #4.
* [US-ASCII] Coded Character Set--7-bit American Standard Code for
* Information Interchange, ANSI X3.4-1986.
*
* See Copyright for the status of this software.
*
* daniel@veillard.com
*
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
*/
#define IN_LIBXML
#include "libxml.h"
#include <string.h>
#include <limits.h>
#include <ctype.h>
#include <stdlib.h>
#ifdef LIBXML_ICONV_ENABLED
#include <iconv.h>
#include <errno.h>
#endif
#include <libxml/encoding.h>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>
#ifdef LIBXML_HTML_ENABLED
#include <libxml/HTMLparser.h>
#endif
#include <libxml/xmlerror.h>
#include "private/buf.h"
#include "private/enc.h"
#include "private/entities.h"
#include "private/error.h"
#ifdef LIBXML_ICU_ENABLED
#include <unicode/ucnv.h>
#endif
#define XML_HANDLER_STATIC 1
/*
 * One user-registered encoding alias: an alternate spelling together with
 * the encoding name it resolves to. Both strings are heap copies made by
 * xmlAddEncodingAlias and released by the alias cleanup/delete functions.
 */
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
struct _xmlCharEncodingAlias {
const char *name; /* encoding name the alias stands for */
const char *alias; /* alias string, stored upper-cased */
};
/* Growable array of registered aliases; NULL until the first alias is added. */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
/* Number of entries of xmlCharEncodingAliases currently in use. */
static int xmlCharEncodingAliasesNb = 0;
/* Number of entries xmlCharEncodingAliases has room for. */
static int xmlCharEncodingAliasesMax = 0;
/* Host byte order, set by xmlInitEncodingInternal: 1 = little, 0 = big. */
static int xmlLittleEndian = 1;
/* Maps one accepted encoding name to its xmlCharEncoding value. */
typedef struct {
const char *name;
xmlCharEncoding enc;
} xmlEncTableEntry;
/*
 * Known encoding names.
 *
 * xmlParseCharEncodingInternal searches this table with bsearch() using a
 * case-insensitive comparison, so the entries MUST stay sorted by name in
 * that order. Insert new names at their sorted position, never at the end.
 */
static const xmlEncTableEntry xmlEncTable[] = {
{ "ASCII", XML_CHAR_ENCODING_ASCII },
{ "EUC-JP", XML_CHAR_ENCODING_EUC_JP },
{ "HTML", XML_CHAR_ENCODING_HTML },
{ "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 },
{ "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 },
{ "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 },
{ "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE },
{ "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP },
{ "ISO-8859-1", XML_CHAR_ENCODING_8859_1 },
{ "ISO-8859-10", XML_CHAR_ENCODING_8859_10 },
{ "ISO-8859-11", XML_CHAR_ENCODING_8859_11 },
{ "ISO-8859-13", XML_CHAR_ENCODING_8859_13 },
{ "ISO-8859-14", XML_CHAR_ENCODING_8859_14 },
{ "ISO-8859-15", XML_CHAR_ENCODING_8859_15 },
{ "ISO-8859-16", XML_CHAR_ENCODING_8859_16 },
{ "ISO-8859-2", XML_CHAR_ENCODING_8859_2 },
{ "ISO-8859-3", XML_CHAR_ENCODING_8859_3 },
{ "ISO-8859-4", XML_CHAR_ENCODING_8859_4 },
{ "ISO-8859-5", XML_CHAR_ENCODING_8859_5 },
{ "ISO-8859-6", XML_CHAR_ENCODING_8859_6 },
{ "ISO-8859-7", XML_CHAR_ENCODING_8859_7 },
{ "ISO-8859-8", XML_CHAR_ENCODING_8859_8 },
{ "ISO-8859-9", XML_CHAR_ENCODING_8859_9 },
{ "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 },
{ "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 },
{ "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS },
{ "UCS-2", XML_CHAR_ENCODING_UCS2 },
{ "UCS-4", XML_CHAR_ENCODING_UCS4LE },
{ "UCS2", XML_CHAR_ENCODING_UCS2 },
{ "UCS4", XML_CHAR_ENCODING_UCS4LE },
{ "US-ASCII", XML_CHAR_ENCODING_ASCII },
{ "UTF-16", XML_CHAR_ENCODING_UTF16 },
{ "UTF-16BE", XML_CHAR_ENCODING_UTF16BE },
{ "UTF-16LE", XML_CHAR_ENCODING_UTF16LE },
{ "UTF-8", XML_CHAR_ENCODING_UTF8 },
{ "UTF16", XML_CHAR_ENCODING_UTF16LE },
{ "UTF8", XML_CHAR_ENCODING_UTF8 }
};
/*
 * Forward declarations of the built-in converters referenced by the
 * defaultHandlers table. They all share one signature: output buffer and
 * its length, input buffer and its length, and an opaque context pointer.
 */
/* Converters used for input (some also serve as output converters). */
static int
asciiToAscii(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb, void *vctxt);
static int
latin1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen, void *vctxt);
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb, void *vctxt);
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb, void *vctxt);
/* Output-only converters; replaced by NULL when output support is disabled. */
#ifdef LIBXML_OUTPUT_ENABLED
static int
UTF8ToLatin1(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToUTF16(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen, void *vctxt);
#else /* LIBXML_OUTPUT_ENABLED */
#define UTF8ToLatin1 NULL
#define UTF8ToUTF16 NULL
#define UTF8ToUTF16LE NULL
#define UTF8ToUTF16BE NULL
#endif /* LIBXML_OUTPUT_ENABLED */
/* The HTML output converter needs both output and HTML support. */
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
static int
UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
const unsigned char *in, int *inlen, void *vctxt);
#else
#define UTF8ToHtmlWrapper NULL
#endif
/*
 * Initializer fragment for the two iconv_t members of a handler; expands
 * to nothing when iconv support is disabled. Note the leading comma: it is
 * appended directly after the output function in the handler macros.
 */
#ifdef LIBXML_ICONV_ENABLED
#define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0
#else
#define EMPTY_ICONV
#endif
/*
 * Built-in ISO-8859-X support is only compiled in when neither iconv nor
 * ICU is available and LIBXML_ISO8859X_ENABLED is set. In that case the
 * handler entries carry the per-charset lookup tables as input/output
 * contexts; otherwise the entries only provide the encoding name and the
 * conversion is resolved later through the extra handlers.
 */
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
    defined(LIBXML_ISO8859X_ENABLED)
#include "iso8859x.inc"
static int
ISO8859xToUTF8(unsigned char* out, int *outlen,
               const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToISO8859x(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen, void *vctxt);
/*
 * The output slot is cast to xmlCharEncodingOutputFunc, matching
 * MAKE_HANDLER below and the type of the member being initialized.
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) UTF8ToISO8859x \
      EMPTY_ICONV, \
      (void *) xmlunicodetable_ISO8859_##n, \
      (void *) xmltranscodetable_ISO8859_##n, \
      NULL, XML_HANDLER_STATIC }
#else /* built-in ISO-8859-X converters not compiled in */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \
      XML_HANDLER_STATIC }
#endif /* built-in ISO-8859-X converters */
/*
 * Initializer for a static handler with the given name and input/output
 * converters. The functions are routed through void (*)(void) so that
 * converters taking the extra context argument can be stored.
 */
#define MAKE_HANDLER(name, in, out) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) in, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) out \
      EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC }
/*
* The layout must match enum xmlCharEncoding: the table is indexed
* directly with xmlCharEncoding values, so entry i describes the encoding
* whose enum value is i.
*
* Names should match the IANA registry if possible:
* https://www.iana.org/assignments/character-sets/character-sets.xhtml
*/
static const xmlCharEncodingHandler defaultHandlers[31] = {
MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
MAKE_HANDLER("UCS-4LE", NULL, NULL),
MAKE_HANDLER("UCS-4BE", NULL, NULL),
MAKE_HANDLER("IBM037", NULL, NULL),
MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_3412 */
MAKE_HANDLER("ISO-10646-UCS-2", NULL, NULL),
MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
MAKE_ISO_HANDLER("ISO-8859-2", 2),
MAKE_ISO_HANDLER("ISO-8859-3", 3),
MAKE_ISO_HANDLER("ISO-8859-4", 4),
MAKE_ISO_HANDLER("ISO-8859-5", 5),
MAKE_ISO_HANDLER("ISO-8859-6", 6),
MAKE_ISO_HANDLER("ISO-8859-7", 7),
MAKE_ISO_HANDLER("ISO-8859-8", 8),
MAKE_ISO_HANDLER("ISO-8859-9", 9),
MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
MAKE_HANDLER("Shift_JIS", NULL, NULL),
MAKE_HANDLER("EUC-JP", NULL, NULL),
MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
MAKE_ISO_HANDLER("ISO-8859-10", 10),
MAKE_ISO_HANDLER("ISO-8859-11", 11),
MAKE_ISO_HANDLER("ISO-8859-13", 13),
MAKE_ISO_HANDLER("ISO-8859-14", 14),
MAKE_ISO_HANDLER("ISO-8859-15", 15),
MAKE_ISO_HANDLER("ISO-8859-16", 16),
};
/* Number of entries in defaultHandlers; used to range-check enum values. */
#define NUM_DEFAULT_HANDLERS \
(sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))
/* the size should be growable, but it's not a big deal ... */
/* Fixed capacity of the globalHandlers array. */
#define MAX_ENCODING_HANDLERS 50
/* User-registered (deprecated) handlers; allocated on first registration. */
static xmlCharEncodingHandlerPtr *globalHandlers = NULL;
/* Number of handlers currently stored in globalHandlers. */
static int nbCharEncodingHandler = 0;
/* Converter factories backed by iconv and ICU, defined further below. */
#ifdef LIBXML_ICONV_ENABLED
static int
xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
#endif
#ifdef LIBXML_ICU_ENABLED
static int
xmlCharEncUconv(void *vctxt, const char *name, xmlCharEncConverter *conv);
#endif
/************************************************************************
* *
* Generic encoding handling routines *
* *
************************************************************************/
/**
 * xmlDetectCharEncoding:
 * @in:  pointer to the first bytes of the XML entity, may be NULL
 * @len:  number of bytes available at @in
 *
 * Guess the encoding of an entity from its leading bytes, following the
 * non-normative appendix F of the XML 1.0 recommendation. Four-byte
 * signatures are only tried when @len is at least 4 (this covers the
 * UCS-4 variants, EBCDIC, "<?xm" and BOM-less UTF-16), then the
 * three-byte UTF-8 BOM, then the two-byte UTF-16 BOMs.
 *
 * Returns one of the XML_CHAR_ENCODING_... values, or
 * XML_CHAR_ENCODING_NONE if @in is NULL or no signature matched.
 */
xmlCharEncoding
xmlDetectCharEncoding(const unsigned char* in, int len)
{
    /* Four-byte signatures; all of them are mutually exclusive. */
    static const struct {
        unsigned char sig[4];
        xmlCharEncoding enc;
    } quads[] = {
        { { 0x00, 0x00, 0x00, 0x3C }, XML_CHAR_ENCODING_UCS4BE },
        { { 0x3C, 0x00, 0x00, 0x00 }, XML_CHAR_ENCODING_UCS4LE },
        { { 0x00, 0x00, 0x3C, 0x00 }, XML_CHAR_ENCODING_UCS4_2143 },
        { { 0x00, 0x3C, 0x00, 0x00 }, XML_CHAR_ENCODING_UCS4_3412 },
        { { 0x4C, 0x6F, 0xA7, 0x94 }, XML_CHAR_ENCODING_EBCDIC },
        { { 0x3C, 0x3F, 0x78, 0x6D }, XML_CHAR_ENCODING_UTF8 },
        /*
         * Although not part of the recommendation, BOM-less UTF-16LE
         * and UTF-16BE are also recognized from "<?".
         */
        { { 0x3C, 0x00, 0x3F, 0x00 }, XML_CHAR_ENCODING_UTF16LE },
        { { 0x00, 0x3C, 0x00, 0x3F }, XML_CHAR_ENCODING_UTF16BE }
    };
    size_t k;

    if (in == NULL)
        return(XML_CHAR_ENCODING_NONE);

    if (len >= 4) {
        for (k = 0; k < sizeof(quads) / sizeof(quads[0]); k++) {
            if (memcmp(in, quads[k].sig, 4) == 0)
                return(quads[k].enc);
        }
    }

    /* Errata on XML-1.0 June 20 2001: a UTF-8 encoded BOM is allowed. */
    if ((len >= 3) &&
        (in[0] == 0xEF) && (in[1] == 0xBB) && (in[2] == 0xBF))
        return(XML_CHAR_ENCODING_UTF8);

    /* UTF-16 can be recognized by its BOM. */
    if (len >= 2) {
        if ((in[0] == 0xFE) && (in[1] == 0xFF))
            return(XML_CHAR_ENCODING_UTF16BE);
        if ((in[0] == 0xFF) && (in[1] == 0xFE))
            return(XML_CHAR_ENCODING_UTF16LE);
    }

    return(XML_CHAR_ENCODING_NONE);
}
/**
* xmlCleanupEncodingAliases:
*
* DEPRECATED: This function modifies global state and is not
* thread-safe.
*
* Unregisters all aliases
*/
void
xmlCleanupEncodingAliases(void) {
int i;
if (xmlCharEncodingAliases == NULL)
return;
for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
if (xmlCharEncodingAliases[i].name != NULL)
xmlFree((char *) xmlCharEncodingAliases[i].name);
if (xmlCharEncodingAliases[i].alias != NULL)
xmlFree((char *) xmlCharEncodingAliases[i].alias);
}
xmlCharEncodingAliasesNb = 0;
xmlCharEncodingAliasesMax = 0;
xmlFree(xmlCharEncodingAliases);
xmlCharEncodingAliases = NULL;
}
/**
 * xmlGetEncodingAlias:
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
 *
 * DEPRECATED: This function is not thread-safe.
 *
 * Lookup an encoding name for the given alias. The alias is upper-cased
 * (and truncated to 99 characters) before being compared against the
 * registered aliases.
 *
 * Returns NULL if @alias is NULL or not registered, otherwise the
 * encoding name the alias was registered for.
 */
const char *
xmlGetEncodingAlias(const char *alias) {
    char key[100];
    size_t len = 0;
    int idx;

    if ((alias == NULL) || (xmlCharEncodingAliases == NULL))
        return(NULL);

    /* Build the upper-cased lookup key. */
    while (len < sizeof(key) - 1) {
        key[len] = (char) toupper((unsigned char) alias[len]);
        if (key[len] == '\0')
            break;
        len++;
    }
    key[len] = '\0';

    /* Linear scan over the registered aliases. */
    for (idx = 0; idx < xmlCharEncodingAliasesNb; idx++) {
        if (strcmp(xmlCharEncodingAliases[idx].alias, key) == 0)
            return(xmlCharEncodingAliases[idx].name);
    }

    return(NULL);
}
/**
* xmlAddEncodingAlias:
* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
*
* DEPRECATED: This function modifies global state and is not
* thread-safe.
*
* Registers an alias @alias for an encoding named @name. Existing alias
* will be overwritten.
*
* Returns 0 in case of success, -1 in case of error
*/
int
xmlAddEncodingAlias(const char *name, const char *alias) {
int i;
char upper[100];
char *nameCopy, *aliasCopy;
if ((name == NULL) || (alias == NULL))
return(-1);
for (i = 0;i < 99;i++) {
upper[i] = (char) toupper((unsigned char) alias[i]);
if (upper[i] == 0) break;
}
upper[i] = 0;
if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
xmlCharEncodingAliasPtr tmp;
size_t newSize = xmlCharEncodingAliasesMax ?
xmlCharEncodingAliasesMax * 2 :
20;
tmp = (xmlCharEncodingAliasPtr)
xmlRealloc(xmlCharEncodingAliases,
newSize * sizeof(xmlCharEncodingAlias));
if (tmp == NULL)
return(-1);
xmlCharEncodingAliases = tmp;
xmlCharEncodingAliasesMax = newSize;
}
/*
* Walk down the list looking for a definition of the alias
*/
for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
/*
* Replace the definition.
*/
nameCopy = xmlMemStrdup(name);
if (nameCopy == NULL)
return(-1);
xmlFree((char *) xmlCharEncodingAliases[i].name);
xmlCharEncodingAliases[i].name = nameCopy;
return(0);
}
}
/*
* Add the definition
*/
nameCopy = xmlMemStrdup(name);
if (nameCopy == NULL)
return(-1);
aliasCopy = xmlMemStrdup(upper);
if (aliasCopy == NULL) {
xmlFree(nameCopy);
return(-1);
}
xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy;
xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy;
xmlCharEncodingAliasesNb++;
return(0);
}
/**
 * xmlDelEncodingAlias:
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
 *
 * DEPRECATED: This function modifies global state and is not
 * thread-safe.
 *
 * Unregisters an encoding alias @alias. The alias is upper-cased (and
 * truncated to 99 characters) before the lookup, exactly like
 * xmlAddEncodingAlias and xmlGetEncodingAlias do, so an alias can be
 * removed using the same spelling it was registered with.
 *
 * Returns 0 in case of success, -1 if @alias is NULL or not registered.
 */
int
xmlDelEncodingAlias(const char *alias) {
    int i;
    char upper[100];

    if (alias == NULL)
        return(-1);

    if (xmlCharEncodingAliases == NULL)
        return(-1);

    /*
     * Aliases are stored upper-cased, so the key must be normalized the
     * same way or mixed-case aliases could never be deleted.
     */
    for (i = 0;i < 99;i++) {
        upper[i] = (char) toupper((unsigned char) alias[i]);
        if (upper[i] == 0) break;
    }
    upper[i] = 0;

    /*
     * Walk down the list looking for a definition of the alias
     */
    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
        if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
            xmlFree((char *) xmlCharEncodingAliases[i].name);
            xmlFree((char *) xmlCharEncodingAliases[i].alias);
            xmlCharEncodingAliasesNb--;
            /* Close the gap left by the removed entry. */
            memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
                    sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
            return(0);
        }
    }
    return(-1);
}
static int
xmlCompareEncTableEntries(const void *vkey, const void *ventry) {
const char *key = vkey;
const xmlEncTableEntry *entry = ventry;
return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name));
}
/*
 * Map an encoding name to its xmlCharEncoding value using a
 * case-insensitive binary search over xmlEncTable.
 *
 * Returns XML_CHAR_ENCODING_NONE for a NULL name and
 * XML_CHAR_ENCODING_ERROR for a name that is not in the table.
 */
static xmlCharEncoding
xmlParseCharEncodingInternal(const char *name)
{
    const size_t count = sizeof(xmlEncTable) / sizeof(xmlEncTable[0]);
    const xmlEncTableEntry *hit;

    if (name == NULL)
        return(XML_CHAR_ENCODING_NONE);

    hit = bsearch(name, xmlEncTable, count, sizeof(xmlEncTable[0]),
                  xmlCompareEncTableEntries);

    return((hit != NULL) ? hit->enc : XML_CHAR_ENCODING_ERROR);
}
/**
 * xmlParseCharEncoding:
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
 *
 * Compare the string to the encoding schemes already known. Note
 * that the comparison is case insensitive accordingly to the section
 * [XML] 4.3.3 Character Encoding in Entities.
 *
 * For backward compatibility, a name that maps to
 * XML_CHAR_ENCODING_UTF16 is reported as XML_CHAR_ENCODING_UTF16LE.
 *
 * Returns one of the XML_CHAR_ENCODING_... values,
 * XML_CHAR_ENCODING_NONE if @name is NULL, or XML_CHAR_ENCODING_ERROR
 * if the name is not recognized.
 */
xmlCharEncoding
xmlParseCharEncoding(const char *name)
{
    xmlCharEncoding parsed = xmlParseCharEncodingInternal(name);

    return((parsed == XML_CHAR_ENCODING_UTF16) ?
           XML_CHAR_ENCODING_UTF16LE : parsed);
}
/**
 * xmlGetCharEncodingName:
 * @enc:  the encoding
 *
 * The "canonical" name for XML encoding.
 * C.f. http://www.w3.org/TR/REC-xml#charencoding
 * Section 4.3.3 Character Encoding in Entities
 *
 * The endian-specific UTF-16 and UCS-4 values are reported under their
 * generic names ("UTF-16", "ISO-10646-UCS-4"); every other value uses
 * the name stored in the default handler table.
 *
 * Returns the canonical name for the given encoding, or NULL if @enc
 * is XML_CHAR_ENCODING_NONE, negative, or beyond the handler table.
 */
const char*
xmlGetCharEncodingName(xmlCharEncoding enc) {
    if ((enc == XML_CHAR_ENCODING_UTF16LE) ||
        (enc == XML_CHAR_ENCODING_UTF16BE))
        return("UTF-16");

    if ((enc == XML_CHAR_ENCODING_UCS4LE) ||
        (enc == XML_CHAR_ENCODING_UCS4BE))
        return("ISO-10646-UCS-4");

    if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS))
        return(defaultHandlers[enc].name);

    return(NULL);
}
/************************************************************************
* *
* Char encoding handlers *
* *
************************************************************************/
/**
 * xmlNewCharEncodingHandler:
 * @name:  the encoding name, in UTF-8 format (ASCII actually)
 * @input:  the xmlCharEncodingInputFunc to read that encoding
 * @output:  the xmlCharEncodingOutputFunc to write that encoding
 *
 * DEPRECATED: This function modifies global state and is not
 * thread-safe.
 *
 * Create and registers an xmlCharEncodingHandler. The name is resolved
 * through the alias table first and stored upper-cased (truncated to
 * 499 characters).
 *
 * Returns the xmlCharEncodingHandlerPtr created, or NULL in case of
 * error: NULL @name, allocation failure, or the handler could not be
 * registered (in which case it has already been freed).
 */
xmlCharEncodingHandlerPtr
xmlNewCharEncodingHandler(const char *name,
                          xmlCharEncodingInputFunc input,
                          xmlCharEncodingOutputFunc output) {
    xmlCharEncodingHandlerPtr handler;
    const char *alias;
    char upper[500];
    int i;
    int nbBefore;
    char *up = NULL;

    /*
     * Do the alias resolution
     */
    alias = xmlGetEncodingAlias(name);
    if (alias != NULL)
        name = alias;

    /*
     * Keep only the uppercase version of the encoding.
     */
    if (name == NULL)
        return(NULL);
    for (i = 0;i < 499;i++) {
        upper[i] = (char) toupper((unsigned char) name[i]);
        if (upper[i] == 0) break;
    }
    upper[i] = 0;
    up = xmlMemStrdup(upper);
    if (up == NULL)
        return(NULL);

    /*
     * allocate and fill-up an handler block.
     */
    handler = (xmlCharEncodingHandlerPtr)
              xmlMalloc(sizeof(xmlCharEncodingHandler));
    if (handler == NULL) {
        xmlFree(up);
        return(NULL);
    }
    memset(handler, 0, sizeof(xmlCharEncodingHandler));
    handler->input = input;
    handler->output = output;
    handler->name = up;
    handler->flags = XML_HANDLER_STATIC;

#ifdef LIBXML_ICONV_ENABLED
    handler->iconv_in = NULL;
    handler->iconv_out = NULL;
#endif

    /*
     * registers and returns the handler.
     *
     * xmlRegisterCharEncodingHandler frees the handler when it cannot
     * store it (table full or allocation failure) and has no return
     * value. Detect that case through the handler count so that a
     * dangling pointer is never handed back to the caller.
     */
    nbBefore = nbCharEncodingHandler;
    xmlRegisterCharEncodingHandler(handler);
    if (nbCharEncodingHandler == nbBefore)
        return(NULL);

    return(handler);
}
/**
* xmlInitCharEncodingHandlers:
*
* DEPRECATED: Alias for xmlInitParser.
*
* Kept for API compatibility; it only forwards to xmlInitParser and
* does no encoding-specific work of its own.
*/
void
xmlInitCharEncodingHandlers(void) {
xmlInitParser();
}
/**
 * xmlInitEncodingInternal:
 *
 * Initialize the char encoding support. Currently this only detects the
 * host byte order by looking at the first byte of a 16-bit probe value
 * and records the result in xmlLittleEndian.
 */
void
xmlInitEncodingInternal(void) {
    const unsigned short int probe = 0x1234;
    const unsigned char firstByte = *((const unsigned char *) &probe);

    /* Most significant byte first in memory means big-endian. */
    xmlLittleEndian = (firstByte == 0x12) ? 0 : 1;
}
/**
 * xmlCleanupCharEncodingHandlers:
 *
 * DEPRECATED: This function will be made private. Call xmlCleanupParser
 * to free global state but see the warnings there. xmlCleanupParser
 * should be only called once at program exit. In most cases, you don't
 * have to call cleanup functions at all.
 *
 * Cleanup the memory allocated for the char encoding support: the
 * aliases are always unregistered, then every registered handler is
 * freed (last registered first) along with the handler array.
 */
void
xmlCleanupCharEncodingHandlers(void) {
    xmlCleanupEncodingAliases();

    if (globalHandlers == NULL)
        return;

    while (nbCharEncodingHandler > 0) {
        xmlCharEncodingHandler *cur = globalHandlers[--nbCharEncodingHandler];

        if (cur == NULL)
            continue;
        if (cur->name != NULL)
            xmlFree(cur->name);
        xmlFree(cur);
    }

    xmlFree(globalHandlers);
    globalHandlers = NULL;
    nbCharEncodingHandler = 0;
}
/**
 * xmlRegisterCharEncodingHandler:
 * @handler:  the xmlCharEncodingHandlerPtr handler block
 *
 * DEPRECATED: This function modifies global state and is not
 * thread-safe.
 *
 * Register the char encoding handler. The handler table is allocated on
 * first use and holds at most MAX_ENCODING_HANDLERS entries. If the
 * handler cannot be stored (allocation failure or table full), the
 * handler and its name are freed, so the caller must not use @handler
 * afterwards in that case.
 */
void
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
    if (handler == NULL)
        return;

    if (globalHandlers == NULL)
        globalHandlers = xmlMalloc(
                MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));

    if ((globalHandlers != NULL) &&
        (nbCharEncodingHandler < MAX_ENCODING_HANDLERS)) {
        globalHandlers[nbCharEncodingHandler++] = handler;
        return;
    }

    /* Could not store the handler: take ownership and discard it. */
    if (handler->name != NULL)
        xmlFree(handler->name);
    xmlFree(handler);
}
/*
 * Ask the conversion implementation @impl for a converter for encoding
 * @name and, on success, copy the resulting functions, contexts and
 * destructor into @handler. @handler is left untouched on failure.
 *
 * Returns the code reported by @impl (XML_ERR_OK on success).
 */
static int
xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt,
                  const char *name, xmlCharEncodingHandler *handler) {
    xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL };
    int code = impl(implCtxt, name, &conv);

    if (code != XML_ERR_OK)
        return(code);

    /* Route through void (*)(void) to convert between function types. */
    handler->input =
        (xmlCharEncodingInputFunc) (void (*)(void)) conv.input;
    handler->output =
        (xmlCharEncodingOutputFunc) (void (*)(void)) conv.output;
    handler->inputCtxt = conv.inputCtxt;
    handler->outputCtxt = conv.outputCtxt;
    handler->ctxtDtor = conv.ctxtDtor;

    return(code);
}
/**
* xmlFindExtraHandler:
* @norig: name of the char encoding
* @name: potentially aliased name of the encoding
* @output: boolean, use handler for output
* @impl: a conversion implementation (optional)
* @implCtxt: user data for conversion implementation (optional)
* @out: pointer to resulting handler
*
* Search the non-default handlers for an exact match. The sources are
* tried in this order: the custom implementation @impl (if given, it is
* the only source consulted), the deprecated global handlers, iconv and
* finally ICU, the last two only when compiled in.
*
* A scratch handler is allocated up front. It is handed to the caller
* when a converter implementation succeeds, and freed on every other
* path, including the one that returns a registered global handler.
*
* Returns an xmlParserErrors error code.
*/
static int
xmlFindExtraHandler(const char *norig, const char *name, int output,
xmlCharEncConvImpl impl, void *implCtxt,
xmlCharEncodingHandler **out) {
xmlCharEncodingHandler *handler;
int ret;
int i;
/* Scratch handler, filled in by whichever implementation succeeds. */
handler = xmlMalloc(sizeof(*handler));
if (handler == NULL)
return(XML_ERR_NO_MEMORY);
memset(handler, 0, sizeof(*handler));
handler->name = xmlMemStrdup(name);
if (handler->name == NULL) {
ret = XML_ERR_NO_MEMORY;
goto done;
}
/*
* Try custom implementation before deprecated global handlers.
*
* Note that we pass the original name without deprecated
* alias resolution.
*/
if (impl != NULL) {
ret = xmlInvokeConvImpl(impl, implCtxt, norig, handler);
/* A failing custom implementation is final: no fallback is tried. */
if (ret != XML_ERR_OK)
goto done;
*out = handler;
return(XML_ERR_OK);
}
/*
* Deprecated
*/
if (globalHandlers != NULL) {
for (i = 0; i < nbCharEncodingHandler; i++) {
xmlCharEncodingHandler *h = globalHandlers[i];
if (!xmlStrcasecmp((const xmlChar *) name,
(const xmlChar *) h->name)) {
/* Only accept a handler that supports the requested direction. */
if ((output ? h->output : h->input) != NULL) {
*out = h;
ret = XML_ERR_OK;
/* The scratch handler is not needed; "done" frees it. */
goto done;
}
}
}
}
#ifdef LIBXML_ICONV_ENABLED
/* The handler doubles as context so the iconv descriptors can be stored. */
ret = xmlInvokeConvImpl(xmlCharEncIconv, handler, name, handler);
if (ret == XML_ERR_OK) {
*out = handler;
return(XML_ERR_OK);
}
/* Only "unsupported" falls through to the next implementation. */
if (ret != XML_ERR_UNSUPPORTED_ENCODING)
goto done;
#endif /* LIBXML_ICONV_ENABLED */
#ifdef LIBXML_ICU_ENABLED
ret = xmlInvokeConvImpl(xmlCharEncUconv, handler, name, handler);
if (ret == XML_ERR_OK) {
*out = handler;
return(XML_ERR_OK);
}
if (ret != XML_ERR_UNSUPPORTED_ENCODING)
goto done;
#endif /* LIBXML_ICU_ENABLED */
ret = XML_ERR_UNSUPPORTED_ENCODING;
done:
/* Reached whenever the scratch handler was not returned to the caller. */
if (handler != NULL) {
xmlFree(handler->name);
xmlFree(handler);
}
return(ret);
}
/**
 * xmlLookupCharEncodingHandler:
 * @enc:  an xmlCharEncoding value.
 * @out:  pointer to result
 *
 * Find or create a handler matching the encoding. The following
 * converters are looked up in order:
 *
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
 * - User-registered global handler (deprecated)
 * - iconv if enabled
 * - ICU if enabled
 *
 * The handler must be closed with xmlCharEncCloseFunc.
 *
 * If the encoding is UTF-8 or XML_CHAR_ENCODING_NONE, a NULL handler
 * and no error code will be returned.
 *
 * Available since 2.13.0.
 *
 * Returns XML_ERR_OK, XML_ERR_ARGUMENT if @out is NULL,
 * XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code.
 */
int
xmlLookupCharEncodingHandler(xmlCharEncoding enc,
                             xmlCharEncodingHandler **out) {
    const xmlCharEncodingHandler *handler;

    if (out == NULL)
        return(XML_ERR_ARGUMENT);
    *out = NULL;

    /*
     * Reject only negative values and values beyond the table.
     * XML_CHAR_ENCODING_NONE (0) must pass this check so that it
     * reaches the "NULL handler" case below.
     */
    if ((enc < 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS))
        return(XML_ERR_UNSUPPORTED_ENCODING);

    /* Return NULL handler for UTF-8 */
    if ((enc == XML_CHAR_ENCODING_UTF8) ||
        (enc == XML_CHAR_ENCODING_NONE))
        return(XML_ERR_OK);

    handler = &defaultHandlers[enc];
    if ((handler->input != NULL) || (handler->output != NULL)) {
        /* Built-in handlers are static; the const is cast away. */
        *out = (xmlCharEncodingHandler *) handler;
        return(XML_ERR_OK);
    }

    /* Name-only entry: resolve it through the extra handlers. */
    if (handler->name != NULL)
        return(xmlFindExtraHandler(handler->name, handler->name, 0,
                                   NULL, NULL, out));

    return(XML_ERR_UNSUPPORTED_ENCODING);
}
/**
* xmlGetCharEncodingHandler:
* @enc: an xmlCharEncoding value.
*
* DEPRECATED: Use xmlLookupCharEncodingHandler which has better error
* reporting.
*
* Thin wrapper around xmlLookupCharEncodingHandler that discards the
* error code and only returns the handler.
*
* Returns the handler or NULL if no handler was found or an error
* occurred.
*/
xmlCharEncodingHandlerPtr
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
xmlCharEncodingHandler *ret;
/* The lookup always stores a value (NULL on failure) in ret. */
xmlLookupCharEncodingHandler(enc, &ret);
return(ret);
}
/**
 * xmlCreateCharEncodingHandler:
 * @name:  a string describing the char encoding.
 * @output:  boolean, use handler for output
 * @impl:  a conversion implementation (optional)
 * @implCtxt:  user data for conversion implementation (optional)
 * @out:  pointer to result
 *
 * Find or create a handler matching the encoding. The following
 * converters are looked up in order:
 *
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
 * - Custom implementation if provided
 * - User-registered global handler (deprecated)
 * - iconv if enabled
 * - ICU if enabled
 *
 * The handler must be closed with xmlCharEncCloseFunc.
 *
 * If the encoding is UTF-8, a NULL handler and no error code will
 * be returned.
 *
 * Available since 2.14.0.
 *
 * Returns XML_ERR_OK, XML_ERR_ARGUMENT if @out or @name is NULL,
 * XML_ERR_UNSUPPORTED_ENCODING or another xmlParserErrors error code.
 */
int
xmlCreateCharEncodingHandler(const char *name, int output,
                             xmlCharEncConvImpl impl, void *implCtxt,
                             xmlCharEncodingHandler **out) {
    const char *aliased;
    const char *resolved;
    xmlCharEncoding enc;

    if (out == NULL)
        return(XML_ERR_ARGUMENT);
    *out = NULL;

    if (name == NULL)
        return(XML_ERR_ARGUMENT);

    /* Built-in lookup uses the alias-resolved name. */
    aliased = xmlGetEncodingAlias(name);
    resolved = (aliased != NULL) ? aliased : name;

    enc = xmlParseCharEncodingInternal(resolved);

    /* Return NULL handler for UTF-8 */
    if (enc == XML_CHAR_ENCODING_UTF8)
        return(XML_ERR_OK);

    if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
        const xmlCharEncodingHandler *builtin = &defaultHandlers[enc];
        int usable;

        /* The built-in handler must support the requested direction. */
        if (output)
            usable = (builtin->output != NULL);
        else
            usable = (builtin->input != NULL);

        if (usable) {
            *out = (xmlCharEncodingHandler *) builtin;
            return(XML_ERR_OK);
        }
    }

    /* Pass both the original and the resolved name along. */
    return(xmlFindExtraHandler(name, resolved, output, impl, implCtxt, out));
}
/**
* xmlOpenCharEncodingHandler:
* @name: a string describing the char encoding.
* @output: boolean, use handler for output
* @out: pointer to result
*
* Find or create a handler matching the encoding. The following
* converters are looked up in order:
*
* - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
* - User-registered global handler (deprecated)
* - iconv if enabled
* - ICU if enabled
*
* The handler must be closed with xmlCharEncCloseFunc.
*
* If the encoding is UTF-8, a NULL handler and no error code will
* be returned.
*
* This is xmlCreateCharEncodingHandler without a custom conversion
* implementation.
*
* Available since 2.13.0.
*
* Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
* xmlParserErrors error code.
*/
int
xmlOpenCharEncodingHandler(const char *name, int output,
xmlCharEncodingHandler **out) {
return(xmlCreateCharEncodingHandler(name, output, NULL, NULL, out));
}
/**
 * xmlFindCharEncodingHandler:
 * @name:  a string describing the char encoding.
 *
 * DEPRECATED: Use xmlOpenCharEncodingHandler which has better error
 * reporting.
 *
 * If the encoding is UTF-8, this will return a no-op handler that
 * shouldn't be used. All other names are looked up as input handlers
 * through xmlOpenCharEncodingHandler, discarding the error code.
 *
 * Returns the handler or NULL if no handler was found or an error
 * occurred.
 */
xmlCharEncodingHandlerPtr
xmlFindCharEncodingHandler(const char *name) {
    static const char *const utf8Names[] = { "UTF-8", "UTF8" };
    xmlCharEncodingHandler *found;
    size_t k;

    /*
     * This handler shouldn't be used, but we must return a non-NULL
     * handler.
     */
    for (k = 0; k < sizeof(utf8Names) / sizeof(utf8Names[0]); k++) {
        if (xmlStrcasecmp(BAD_CAST name, BAD_CAST utf8Names[k]) == 0)
            return((xmlCharEncodingHandlerPtr)
                    &defaultHandlers[XML_CHAR_ENCODING_UTF8]);
    }

    xmlOpenCharEncodingHandler(name, 0, &found);
    return(found);
}
/************************************************************************
* *
* ICONV based generic conversion functions *
* *
************************************************************************/
#ifdef LIBXML_ICONV_ENABLED
/*
 * Conversion context for one direction of an iconv-based handler.
 * cd is (iconv_t) -1 while no descriptor has been opened.
 */
typedef struct {
iconv_t cd;
} xmlIconvCtxt;
/**
 * xmlIconvConvert:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
 * @vctxt:  conversion context (an xmlIconvCtxt)
 *
 * Convert a chunk of data with iconv. A truncated multi-byte sequence
 * at the end of the input (EINVAL) is not an error: the bytes are
 * simply left unconsumed.
 *
 * Returns an XML_ENC_ERR code.
 *
 * On return, @inlen holds the number of octets consumed and @outlen
 * the number of octets produced. If one of the pointer arguments is
 * NULL, XML_ENC_ERR_INTERNAL is returned and @outlen (if non-NULL) is
 * set to 0.
 */
static int
xmlIconvConvert(unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen, void *vctxt) {
    xmlIconvCtxt *ctxt = vctxt;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    size_t srcLeft, dstLeft;
    size_t rc;
    int err;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
        if (outlen != NULL)
            *outlen = 0;
        return(XML_ENC_ERR_INTERNAL);
    }

    srcLeft = *inlen;
    dstLeft = *outlen;

    /*
     * Some versions take const, other versions take non-const input.
     */
    rc = iconv(ctxt->cd, (void *) &src, &srcLeft, &dst, &dstLeft);

    /* iconv reports what is left; turn that into consumed/produced. */
    *inlen -= srcLeft;
    *outlen -= dstLeft;

    if (rc != (size_t) -1)
        return(XML_ENC_ERR_SUCCESS);

    err = errno;
    if (err == EILSEQ)
        return(XML_ENC_ERR_INPUT);
    if (err == E2BIG)
        return(XML_ENC_ERR_SPACE);
    /*
     * EINVAL means a truncated multi-byte sequence at the end
     * of the input buffer. We treat this as success.
     */
    if (err == EINVAL)
        return(XML_ENC_ERR_SUCCESS);

    return(XML_ENC_ERR_INTERNAL);
}
/*
 * Context destructor for iconv-based handlers: closes the conversion
 * descriptor if one was opened, then frees the context itself.
 * @vctxt must not be NULL.
 */
static void
xmlIconvFree(void *vctxt) {
xmlIconvCtxt *ctxt = vctxt;
/* (iconv_t) -1 marks a context whose descriptor was never opened. */
if (ctxt->cd != (iconv_t) -1)
iconv_close(ctxt->cd);
xmlFree(ctxt);
}
/*
 * Conversion implementation backed by iconv: opens one descriptor for
 * each direction (@name -> UTF-8 and UTF-8 -> @name) and describes the
 * resulting converter in @conv.
 *
 * @vctxt is an optional xmlCharEncodingHandler; when non-NULL, the raw
 * iconv descriptors are also stored in it for backward compatibility.
 *
 * Returns XML_ERR_OK on success, XML_ERR_UNSUPPORTED_ENCODING if iconv
 * doesn't know the encoding, XML_ERR_NO_MEMORY or XML_ERR_SYSTEM
 * otherwise. Nothing is leaked on failure.
 */
static int
xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv) {
    xmlCharEncodingHandler *handler = vctxt;
    xmlIconvCtxt *rd = NULL, *wr = NULL;
    iconv_t cdIn;
    iconv_t cdOut;
    int ret;

    /* Input direction: @name to UTF-8. */
    rd = xmlMalloc(sizeof(xmlIconvCtxt));
    if (rd == NULL) {
        ret = XML_ERR_NO_MEMORY;
        goto error;
    }
    rd->cd = (iconv_t) -1;

    cdIn = iconv_open("UTF-8", name);
    if (cdIn == (iconv_t) -1)
        goto open_failed;
    rd->cd = cdIn;

    /* Output direction: UTF-8 to @name. */
    wr = xmlMalloc(sizeof(xmlIconvCtxt));
    if (wr == NULL) {
        ret = XML_ERR_NO_MEMORY;
        goto error;
    }
    wr->cd = (iconv_t) -1;

    cdOut = iconv_open(name, "UTF-8");
    if (cdOut == (iconv_t) -1)
        goto open_failed;
    wr->cd = cdOut;

    conv->input = xmlIconvConvert;
    conv->output = xmlIconvConvert;
    conv->ctxtDtor = xmlIconvFree;
    conv->inputCtxt = rd;
    conv->outputCtxt = wr;

    /* Backward compatibility */
    if (handler != NULL) {
        handler->iconv_in = cdIn;
        handler->iconv_out = cdOut;
    }

    return(XML_ERR_OK);

open_failed:
    /* Map errno from iconv_open before any other call can clobber it. */
    if (errno == EINVAL)
        ret = XML_ERR_UNSUPPORTED_ENCODING;
    else if (errno == ENOMEM)
        ret = XML_ERR_NO_MEMORY;
    else
        ret = XML_ERR_SYSTEM;

error:
    if (rd != NULL)
        xmlIconvFree(rd);
    if (wr != NULL)
        xmlIconvFree(wr);
    return(ret);
}
#endif /* LIBXML_ICONV_ENABLED */
/************************************************************************
* *
* ICU based generic conversion functions *
* *
************************************************************************/
#ifdef LIBXML_ICU_ENABLED
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
#define ICU_PIVOT_BUF_SIZE 1024
/*
 * Conversion context for one direction of an ICU-based handler. ICU
 * converts through UTF-16, so each context owns a pivot buffer whose
 * cursors persist between calls to xmlUconvConvert.
 */
typedef struct _uconv_t xmlUconvCtxt;
struct _uconv_t {
UConverter *uconv; /* for conversion between an encoding and UTF-16 */
UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
UChar *pivot_source; /* pivot cursor passed by address to ucnv_convertEx */
UChar *pivot_target; /* pivot cursor passed by address to ucnv_convertEx */
int isInput; /* non-zero: uconv's encoding -> UTF-8, zero: UTF-8 -> uconv's encoding */
UChar pivot_buf[ICU_PIVOT_BUF_SIZE]; /* intermediate UTF-16 buffer */
};
/**
* xmlUconvConvert:
* @vctxt: converison context
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of input bytes
* @inlen: the length of @in
*
* Returns an XML_ENC_ERR code.
*
* The value of @inlen after return is the number of octets consumed
* as the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
static int
xmlUconvConvert(unsigned char *out, int *outlen,
const unsigned char *in, int *inlen, void *vctxt) {
xmlUconvCtxt *cd = vctxt;
const char *ucv_in = (const char *) in;
char *ucv_out = (char *) out;
UConverter *target, *source;
UErrorCode err = U_ZERO_ERROR;
int ret;
if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
if (outlen != NULL)
*outlen = 0;
return(XML_ENC_ERR_INTERNAL);
}
/*
* Note that the ICU API is stateful. It can always consume a certain
* amount of input even if the output buffer would overflow. The
* remaining input must be processed by calling ucnv_convertEx with a
* possibly empty input buffer.
*
* ucnv_convertEx is always called with reset and flush set to 0,
* so we don't mess up the state. This should never generate
* U_TRUNCATED_CHAR_FOUND errors.
*/
if (cd->isInput) {
source = cd->uconv;
target = cd->utf8;
} else {
source = cd->utf8;
target = cd->uconv;
}
ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
&cd->pivot_source, &cd->pivot_target,
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
*inlen = ucv_in - (const char*) in;
*outlen = ucv_out - (char *) out;
if (U_SUCCESS(err)) {
ret = XML_ENC_ERR_SUCCESS;
} else {
switch (err) {
case U_TRUNCATED_CHAR_FOUND:
/* Shouldn't happen without flush */
ret = XML_ENC_ERR_SUCCESS;
break;
case U_BUFFER_OVERFLOW_ERROR:
ret = XML_ENC_ERR_SPACE;
break;
case U_INVALID_CHAR_FOUND:
case U_ILLEGAL_CHAR_FOUND:
ret = XML_ENC_ERR_INPUT;
break;
case U_MEMORY_ALLOCATION_ERROR:
ret = XML_ERR_NO_MEMORY;
break;
default:
ret = XML_ENC_ERR_INTERNAL;
break;
}
}
return(ret);
}
/*
 * Allocates an ICU conversion context for encoding @name and stores it
 * in @out (set to NULL on failure). @isInput selects which "stop"
 * callback is installed on the encoding's converter.
 *
 * Returns 0 on success, otherwise XML_ERR_NO_MEMORY,
 * XML_ERR_UNSUPPORTED_ENCODING or XML_ERR_SYSTEM.
 */
static int
openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out)
{
    UErrorCode err = U_ZERO_ERROR;
    xmlUconvCtxt *ctx;

    *out = NULL;

    ctx = xmlMalloc(sizeof(*ctx));
    if (ctx == NULL)
        return(XML_ERR_NO_MEMORY);

    ctx->isInput = isInput;
    ctx->pivot_source = ctx->pivot_buf;
    ctx->pivot_target = ctx->pivot_buf;

    ctx->uconv = ucnv_open(name, &err);
    if (U_FAILURE(err))
        goto fail;

    /* Stop on unconvertible data instead of substituting. */
    err = U_ZERO_ERROR;
    if (isInput)
        ucnv_setToUCallBack(ctx->uconv, UCNV_TO_U_CALLBACK_STOP,
                            NULL, NULL, NULL, &err);
    else
        ucnv_setFromUCallBack(ctx->uconv, UCNV_FROM_U_CALLBACK_STOP,
                              NULL, NULL, NULL, &err);
    if (U_FAILURE(err))
        goto fail;

    err = U_ZERO_ERROR;
    ctx->utf8 = ucnv_open("UTF-8", &err);
    if (U_FAILURE(err))
        goto fail;

    *out = ctx;
    return(0);

fail:
    if (ctx->uconv != NULL)
        ucnv_close(ctx->uconv);
    xmlFree(ctx);

    switch (err) {
        case U_FILE_ACCESS_ERROR:
            return(XML_ERR_UNSUPPORTED_ENCODING);
        case U_MEMORY_ALLOCATION_ERROR:
            return(XML_ERR_NO_MEMORY);
        default:
            return(XML_ERR_SYSTEM);
    }
}
/*
 * Closes both ICU converters of @conv and frees the context.
 * A NULL context is ignored.
 */
static void
closeIcuConverter(xmlUconvCtxt *conv)
{
    if (conv != NULL) {
        ucnv_close(conv->uconv);
        ucnv_close(conv->utf8);
        xmlFree(conv);
    }
}
static void
xmlUconvFree(void *vctxt) {
closeIcuConverter(vctxt);
}
/*
 * Opens an input and an output ICU context for encoding @name and
 * installs them, with the ICU conversion and destructor callbacks,
 * in @conv.
 *
 * Returns XML_ERR_OK on success or the error code reported by
 * openIcuConverter; on failure @conv is left untouched.
 */
static int
xmlCharEncUconv(void *vctxt ATTRIBUTE_UNUSED, const char *name,
                xmlCharEncConverter *conv) {
    xmlUconvCtxt *inCtx = NULL;
    xmlUconvCtxt *outCtx = NULL;
    int res;

    res = openIcuConverter(name, 1, &inCtx);
    if (res == 0)
        res = openIcuConverter(name, 0, &outCtx);

    if (res != 0) {
        /* closeIcuConverter ignores NULL, so no guards are needed */
        closeIcuConverter(inCtx);
        closeIcuConverter(outCtx);
        return(res);
    }

    conv->input = xmlUconvConvert;
    conv->output = xmlUconvConvert;
    conv->ctxtDtor = xmlUconvFree;
    conv->inputCtxt = inCtx;
    conv->outputCtxt = outCtx;

    return(XML_ERR_OK);
}
#endif /* LIBXML_ICU_ENABLED */
/************************************************************************
* *
* The real API used by libxml for on-the-fly conversion *
* *
************************************************************************/
/**
 * xmlEncConvertError:
 * @code:  XML_ENC_ERR code
 *
 * Maps an XML_ENC_ERR code to the corresponding libxml2 error code.
 * Codes without a dedicated mapping yield XML_ERR_INTERNAL_ERROR.
 *
 * Returns the libxml2 error code.
 */
static int
xmlEncConvertError(int code) {
    switch (code) {
        case XML_ENC_ERR_SUCCESS:
            return(XML_ERR_OK);
        case XML_ENC_ERR_INPUT:
            return(XML_ERR_INVALID_ENCODING);
        case XML_ENC_ERR_MEMORY:
            return(XML_ERR_NO_MEMORY);
        default:
            break;
    }

    return(XML_ERR_INTERNAL_ERROR);
}
/**
 * xmlEncInputChunk:
 * @handler:  encoding handler
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
 *
 * Runs the handler's input conversion function on one chunk. Positive
 * return values of the conversion function are normalized to
 * XML_ENC_ERR_SUCCESS. Without an input function, both lengths are
 * set to 0 and XML_ENC_ERR_INTERNAL is returned.
 *
 * The value of @inlen after return is the number of octets consumed
 * as the return value is 0, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 *
 * Returns an XML_ENC_ERR code.
 */
int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
                 int *outlen, const unsigned char *in, int *inlen) {
    xmlCharEncConvFunc convert;
    int res;

    if (handler->input == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(XML_ENC_ERR_INTERNAL);
    }

    convert = (xmlCharEncConvFunc) (void (*)(void)) handler->input;
    res = convert(out, outlen, in, inlen, handler->inputCtxt);

    return((res > 0) ? XML_ENC_ERR_SUCCESS : res);
}
/**
 * xmlEncOutputChunk:
 * @handler:  encoding handler
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes
 * @inlen:  the length of @in
 *
 * Runs the handler's output conversion function on one chunk. Positive
 * return values of the conversion function are normalized to
 * XML_ENC_ERR_SUCCESS. Without an output function, both lengths are
 * set to 0 and XML_ENC_ERR_INTERNAL is returned.
 *
 * The value of @inlen after return is the number of octets consumed
 * as the return value is 0, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 *
 * Returns an XML_ENC_ERR code.
 */
static int
xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
                  int *outlen, const unsigned char *in, int *inlen) {
    xmlCharEncConvFunc convert;
    int res;

    if (handler->output == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(XML_ENC_ERR_INTERNAL);
    }

    convert = (xmlCharEncConvFunc) (void (*)(void)) handler->output;
    res = convert(out, outlen, in, inlen, handler->outputCtxt);

    return((res > 0) ? XML_ENC_ERR_SUCCESS : res);
}
/**
 * xmlCharEncFirstLine:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
 * DEPRECATED: Don't use.
 *
 * Forwards to xmlCharEncInFunc.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 */
int
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                    xmlBufferPtr in) {
    int res = xmlCharEncInFunc(handler, out, in);

    return(res);
}
/**
 * xmlCharEncInput:
 * @input:  a parser input buffer
 * @sizeOut:  pointer to output size
 *
 * @sizeOut should be set to the maximum output size (or SIZE_MAX).
 * After return, it is set to the number of bytes written.
 *
 * Generic front-end for the encoding handler on parser input.
 * Converts data from the raw buffer into the decoded buffer chunk by
 * chunk, growing the decoded buffer as needed, until the raw data or
 * the output budget is exhausted. On success, the consumed raw bytes
 * are removed and accounted for in rawconsumed.
 *
 * Returns an XML_ENC_ERR code.
 */
int
xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut)
{
    xmlBufPtr dst = input->buffer;
    xmlBufPtr src = input->raw;
    const xmlChar *cur;
    size_t remainIn;
    size_t budget = *sizeOut;
    size_t consumed = 0;
    size_t produced = 0;
    int ret;

    *sizeOut = 0;

    remainIn = xmlBufUse(src);
    if (remainIn == 0)
        return(0);
    cur = xmlBufContent(src);

    for (;;) {
        size_t room;
        int lastOut, lastIn;
        int nOut, nIn;

        /* Chunk sizes are handed over as int, so cap them. */
        room = xmlBufAvail(dst);
        if (room > INT_MAX / 2)
            room = INT_MAX / 2;

        /* lastOut: this chunk may use up the whole remaining budget */
        lastOut = (room >= budget);
        nOut = (int) (lastOut ? budget : room);

        /* lastIn: this chunk covers all remaining raw input */
        lastIn = (remainIn <= INT_MAX / 2);
        nIn = lastIn ? (int) remainIn : INT_MAX / 2;

        ret = xmlEncInputChunk(input->encoder, xmlBufEnd(dst), &nOut,
                               cur, &nIn);

        consumed += nIn;
        cur += nIn;
        remainIn -= nIn;

        produced += nOut;
        budget -= nOut;
        xmlBufAddLen(dst, nOut);

        if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) {
            input->error = xmlEncConvertError(ret);
            return(ret);
        }

        if (ret == XML_ENC_ERR_SPACE) {
            /* Out of budget: stop. Out of buffer space: grow and retry. */
            if (lastOut)
                break;
            if (xmlBufGrow(dst, 4096) < 0) {
                input->error = XML_ERR_NO_MEMORY;
                return(XML_ENC_ERR_MEMORY);
            }
        } else if (lastIn) {
            /* Everything that was offered has been converted. */
            break;
        }
    }

    xmlBufShrink(src, consumed);

    /* Saturate instead of wrapping around. */
    if (input->rawconsumed > ULONG_MAX - (unsigned long) consumed)
        input->rawconsumed = ULONG_MAX;
    else
        input->rawconsumed += consumed;

    *sizeOut = produced;
    return(XML_ERR_OK);
}
/**
 * xmlCharEncInFunc:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
 * Generic front-end for the encoding handler input function.
 * Converts the content of @in in a single pass, appends the result to
 * @out (kept zero-terminated) and removes the consumed bytes from @in.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code
 * (XML_ENC_ERR_INTERNAL for NULL arguments, XML_ENC_ERR_MEMORY if the
 * output buffer can't be grown).
 */
int
xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
                 xmlBufferPtr in)
{
    int ret;
    int written;
    int toconv;

    if (handler == NULL)
        return(XML_ENC_ERR_INTERNAL);
    if (out == NULL)
        return(XML_ENC_ERR_INTERNAL);
    if (in == NULL)
        return(XML_ENC_ERR_INTERNAL);

    toconv = in->use;
    if (toconv == 0)
        return (0);

    written = out->size - out->use - 1; /* count '\0' */
    if (toconv * 2 >= written) {
        /*
         * Don't convert into a buffer that couldn't be enlarged: the
         * available size computed below could be zero or negative.
         */
        if (xmlBufferGrow(out, out->size + toconv * 2) < 0)
            return(XML_ENC_ERR_MEMORY);
        written = out->size - out->use - 1;
    }

    ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
                           in->content, &toconv);
    xmlBufferShrink(in, toconv);
    out->use += written;
    out->content[out->use] = 0;

    /* Partial output takes precedence over the status code. */
    return (written? written : ret);
}
#ifdef LIBXML_OUTPUT_ENABLED
/**
 * xmlCharEncOutput:
 * @output:  a parser output buffer
 * @init:  is this an initialization call without data
 *
 * Generic front-end for the encoding handler on parser output
 * a first call with @init == 1 has to be made first to initiate the
 * output in case of non-stateless encoding needing to initiate their
 * state or the output (like the BOM in UTF16).
 * In case of UTF8 sequence conversion errors for the given encoder,
 * the content will be automatically remapped to a CharRef sequence.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 */
int
xmlCharEncOutput(xmlOutputBufferPtr output, int init)
{
    int ret;
    size_t written;
    int writtentot = 0;
    size_t toconv;
    int c_in;
    int c_out;
    xmlBufPtr in;
    xmlBufPtr out;

    if ((output == NULL) || (output->encoder == NULL) ||
        (output->buffer == NULL) || (output->conv == NULL))
        return(XML_ENC_ERR_INTERNAL);
    out = output->conv;
    in = output->buffer;

retry:

    written = xmlBufAvail(out);

    /*
     * First specific handling of the initialization call
     */
    if (init) {
        c_in = 0;
        c_out = written;
        /* TODO: Check return value. */
        xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
                          NULL, &c_in);
        xmlBufAddLen(out, c_out);
        return(c_out);
    }

    /*
     * Conversion itself. Work in bounded chunks: at most 64 KiB of
     * input and 256 KiB of output per round.
     */
    toconv = xmlBufUse(in);
    if (toconv > 64 * 1024)
        toconv = 64 * 1024;
    if (toconv * 4 >= written) {
        if (xmlBufGrow(out, toconv * 4) < 0) {
            ret = XML_ENC_ERR_MEMORY;
            goto error;
        }
        written = xmlBufAvail(out);
    }
    if (written > 256 * 1024)
        written = 256 * 1024;

    c_in = toconv;
    c_out = written;
    ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
                            xmlBufContent(in), &c_in);
    xmlBufShrink(in, c_in);
    xmlBufAddLen(out, c_out);
    writtentot += c_out;

    if (ret == XML_ENC_ERR_SPACE)
        goto retry;

    /*
     * Attempt to handle error cases
     */
    if (ret == XML_ENC_ERR_INPUT) {
        xmlChar charref[20];
        int len = xmlBufUse(in);
        xmlChar *content = xmlBufContent(in);
        int cur, charrefLen;

        cur = xmlGetUTF8Char(content, &len);
        if (cur <= 0)
            goto error;

        /*
         * Removes the UTF8 sequence, and replace it by a charref
         * and continue the transcoding phase, hoping the error
         * did not mangle the encoder state.
         */
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
        /*
         * Report allocation failures as such, like the grow call
         * above, instead of letting the conversion below run out of
         * space and fail with an internal error.
         */
        if (xmlBufGrow(out, charrefLen * 4) < 0) {
            ret = XML_ENC_ERR_MEMORY;
            goto error;
        }
        c_out = xmlBufAvail(out);
        c_in = charrefLen;
        ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
                                charref, &c_in);
        if ((ret < 0) || (c_in != charrefLen)) {
            ret = XML_ENC_ERR_INTERNAL;
            goto error;
        }

        xmlBufShrink(in, len);
        xmlBufAddLen(out, c_out);
        writtentot += c_out;
        goto retry;
    }

error:
    /*
     * Memory errors are always reported. Other errors are only
     * reported if nothing was written.
     */
    if (((writtentot <= 0) && (ret != 0)) ||
        (ret == XML_ENC_ERR_MEMORY)) {
        if (output->error == 0)
            output->error = xmlEncConvertError(ret);
        return(ret);
    }

    return(writtentot);
}
#endif
/**
 * xmlCharEncOutFunc:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
 * Generic front-end for the encoding handler output function
 * a first call with @in == NULL has to be made first to initiate the
 * output in case of non-stateless encoding needing to initiate their
 * state or the output (like the BOM in UTF16).
 * In case of UTF8 sequence conversion errors for the given encoder,
 * the content will be automatically remapped to a CharRef sequence.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code
 * (XML_ENC_ERR_MEMORY if the output buffer can't be grown).
 */
int
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                  xmlBufferPtr in) {
    int ret;
    int written;
    int writtentot = 0;
    int toconv;

    if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
    if (out == NULL) return(XML_ENC_ERR_INTERNAL);

retry:

    written = out->size - out->use;

    if (written > 0)
        written--; /* Gennady: count '/0' */

    /*
     * First specific handling of in = NULL, i.e. the initialization call
     */
    if (in == NULL) {
        toconv = 0;
        /* TODO: Check return value. */
        xmlEncOutputChunk(handler, &out->content[out->use], &written,
                          NULL, &toconv);
        out->use += written;
        out->content[out->use] = 0;
        return(0);
    }

    /*
     * Conversion itself.
     */
    toconv = in->use;
    if (toconv * 4 >= written) {
        /*
         * Without this check, a failed grow leaves no room, the
         * converter keeps reporting XML_ENC_ERR_SPACE and the retry
         * loop never terminates.
         */
        if (xmlBufferGrow(out, toconv * 4) < 0)
            return(XML_ENC_ERR_MEMORY);
        written = out->size - out->use - 1;
    }
    ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
                            in->content, &toconv);
    xmlBufferShrink(in, toconv);
    out->use += written;
    writtentot += written;
    out->content[out->use] = 0;

    if (ret == XML_ENC_ERR_SPACE)
        goto retry;

    /*
     * Attempt to handle error cases
     */
    if (ret == XML_ENC_ERR_INPUT) {
        xmlChar charref[20];
        int len = in->use;
        const xmlChar *utf = (const xmlChar *) in->content;
        int cur, charrefLen;

        cur = xmlGetUTF8Char(utf, &len);
        if (cur <= 0)
            return(ret);

        /*
         * Removes the UTF8 sequence, and replace it by a charref
         * and continue the transcoding phase, hoping the error
         * did not mangle the encoder state.
         */
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
        /* Make room first so a failure doesn't drop the input sequence. */
        if (xmlBufferGrow(out, charrefLen * 4) < 0)
            return(XML_ENC_ERR_MEMORY);
        xmlBufferShrink(in, len);
        written = out->size - out->use - 1;
        toconv = charrefLen;
        ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
                                charref, &toconv);
        if ((ret < 0) || (toconv != charrefLen))
            return(XML_ENC_ERR_INTERNAL);

        out->use += written;
        writtentot += written;
        out->content[out->use] = 0;
        goto retry;
    }

    /* Partial output takes precedence over the status code. */
    return(writtentot ? writtentot : ret);
}
/**
 * xmlCharEncCloseFunc:
 * @handler:  char encoding transformation data structure
 *
 * Releases an xmlCharEncodingHandler. Must be called after
 * a handler is no longer in use. NULL handlers and handlers flagged
 * as static are left alone.
 *
 * Returns 0.
 */
int
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
    if ((handler != NULL) &&
        ((handler->flags & XML_HANDLER_STATIC) == 0)) {
        xmlFree(handler->name);

        /* Both conversion contexts share one destructor. */
        if (handler->ctxtDtor != NULL) {
            handler->ctxtDtor(handler->inputCtxt);
            handler->ctxtDtor(handler->outputCtxt);
        }

        xmlFree(handler);
    }

    return(0);
}
/**
 * xmlByteConsumed:
 * @ctxt:  an XML parser context
 *
 * DEPRECATED: Don't use.
 *
 * This function provides the current index of the parser relative
 * to the start of the current entity. This function is computed in
 * bytes from the beginning starting at zero and finishing at the
 * size in byte of the file if parsing a file. The function is
 * of constant cost if the input is UTF-8 but can be costly if run
 * on non-UTF-8 input.
 *
 * Returns the index in bytes from the beginning of the entity or -1
 * in case the index could not be computed.
 */
long
xmlByteConsumed(xmlParserCtxtPtr ctxt) {
    xmlParserInputPtr in;
    xmlCharEncodingHandler *handler;
    int unused = 0;

    if (ctxt == NULL)
        return(-1);
    in = ctxt->input;
    if (in == NULL)
        return(-1);

    /* No encoder: the position follows directly from the input. */
    if ((in->buf == NULL) || (in->buf->encoder == NULL))
        return(in->consumed + (in->cur - in->base));

    handler = in->buf->encoder;

    /*
     * Encoding conversion, compute the number of unused original
     * bytes from the input not consumed and subtract that from
     * the raw consumed value, this is not a cheap operation
     */
    if (in->end - in->cur > 0) {
        const unsigned char *cur = (const unsigned char *)in->cur;
        unsigned char *scratch;
        int toconv, ret;

        /* Only the length of the converted data is of interest. */
        scratch = xmlMalloc(32000);
        if (scratch == NULL)
            return(-1);

        toconv = in->end - cur;
        unused = 32000;
        ret = xmlEncOutputChunk(handler, scratch, &unused, cur, &toconv);

        xmlFree(scratch);

        if (ret != XML_ENC_ERR_SUCCESS)
            return(-1);
    }

    if (in->buf->rawconsumed < (unsigned long) unused)
        return(-1);
    return(in->buf->rawconsumed - unused);
}
/************************************************************************
* *
* Conversions To/From UTF8 encoding *
* *
************************************************************************/
/*
 * Copies ASCII bytes from @in to @out, stopping at the first byte
 * >= 0x80. A NULL @in is an initialization call producing nothing.
 *
 * Both lengths are set to the number of bytes copied.
 *
 * Returns the input length if everything fit, XML_ENC_ERR_SPACE if the
 * output was too small, or XML_ENC_ERR_INPUT on a non-ASCII byte.
 */
static int
asciiToAscii(unsigned char* out, int *poutlen,
             const unsigned char* in, int *pinlen,
             void *vctxt ATTRIBUTE_UNUSED) {
    int count, res, i;

    if (in == NULL) {
        *pinlen = 0;
        *poutlen = 0;
        return(XML_ENC_ERR_SUCCESS);
    }

    /* Copy at most as many bytes as fit into the output. */
    count = *pinlen;
    res = count;
    if (*poutlen < count) {
        count = *poutlen;
        res = XML_ENC_ERR_SPACE;
    }

    for (i = 0; i < count; i++) {
        unsigned c = in[i];

        if (c >= 0x80) {
            *poutlen = i;
            *pinlen = i;
            return(XML_ENC_ERR_INPUT);
        }

        out[i] = c;
    }

    *poutlen = count;
    *pinlen = count;
    return(res);
}
/*
 * Converts ISO Latin 1 bytes to UTF-8: bytes below 0x80 are copied,
 * all others become two-byte sequences.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed, unless an argument was NULL.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE if the output
 * is full, or XML_ENC_ERR_INTERNAL for NULL arguments.
 */
static int
latin1ToUTF8(unsigned char* out, int *outlen,
             const unsigned char* in, int *inlen,
             void *vctxt ATTRIBUTE_UNUSED) {
    unsigned char *dst = out;
    const unsigned char *src = in;
    unsigned char *dstEnd;
    const unsigned char *srcEnd;
    int res = XML_ENC_ERR_SPACE;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);

    dstEnd = out + *outlen;
    srcEnd = in + *inlen;

    for (;;) {
        unsigned c;

        if (src >= srcEnd) {
            /* all input consumed */
            res = dst - out;
            break;
        }

        c = *src;
        if (c < 0x80) {
            if (dst >= dstEnd)
                break;
            *dst++ = c;
        } else {
            if (dstEnd - dst < 2)
                break;
            *dst++ = (c >> 6) | 0xC0;
            *dst++ = (c & 0x3F) | 0x80;
        }

        src++;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(res);
}
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Public entry point for latin1ToUTF8.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 *
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    int res = latin1ToUTF8(out, outlen, in, inlen, NULL);

    return(res);
}
/*
 * Identity conversion: copies as many bytes from @in to @out as fit.
 * A NULL @in is an initialization call producing nothing.
 *
 * Both lengths are set to the number of bytes copied.
 *
 * Returns the input length if everything fit, XML_ENC_ERR_SPACE
 * otherwise.
 */
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
           const unsigned char* in, int *inlen,
           void *vctxt ATTRIBUTE_UNUSED) {
    int count, res;

    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
        return(XML_ENC_ERR_SUCCESS);
    }

    count = *inlen;
    res = count;
    if (*outlen < count) {
        /* truncate the copy to the available space */
        count = *outlen;
        res = XML_ENC_ERR_SPACE;
    }

    memcpy(out, in, count);

    *outlen = count;
    *inlen = count;
    return(res);
}
#ifdef LIBXML_OUTPUT_ENABLED
/*
 * Converts UTF-8 to ISO Latin 1. Bytes below 0x80 are copied, two-byte
 * sequences starting with 0xC2 or 0xC3 are folded into a single byte,
 * any other lead byte is rejected. A NULL @in is an initialization
 * call producing nothing.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed, unless @out, @outlen or @inlen was NULL. A sequence cut
 * off at the end of the input is left unconsumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE if the output
 * is full, XML_ENC_ERR_INPUT for unsupported input, or
 * XML_ENC_ERR_INTERNAL for NULL arguments.
 */
static int
UTF8ToLatin1(unsigned char* out, int *outlen,
             const unsigned char* in, int *inlen,
             void *vctxt ATTRIBUTE_UNUSED) {
    unsigned char *dst = out;
    const unsigned char *src = in;
    const unsigned char *dstEnd;
    const unsigned char *srcEnd;
    int res = XML_ENC_ERR_SPACE;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);

    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
        return(XML_ENC_ERR_SUCCESS);
    }

    srcEnd = in + *inlen;
    dstEnd = out + *outlen;

    for (;;) {
        unsigned c;

        if (src >= srcEnd) {
            /* all input consumed */
            res = dst - out;
            break;
        }
        if (dst >= dstEnd)
            break;

        c = *src;
        if (c < 0x80) {
            *dst++ = c;
            src += 1;
        } else if ((c == 0xC2) || (c == 0xC3)) {
            if (srcEnd - src < 2) {
                /* incomplete sequence: stop without an error */
                res = dst - out;
                break;
            }
            *dst++ = (unsigned char) ((c << 6) | (src[1] & 0x3F));
            src += 2;
        } else {
            res = XML_ENC_ERR_INPUT;
            break;
        }
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(res);
}
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Unlike the internal converter, a NULL @in is
 * rejected with XML_ENC_ERR_INTERNAL.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 *
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    int haveArgs = (out != NULL) && (outlen != NULL) &&
                   (in != NULL) && (inlen != NULL);

    if (!haveArgs)
        return(XML_ENC_ERR_INTERNAL);

    return(UTF8ToLatin1(out, outlen, in, inlen, NULL));
}
#endif /* LIBXML_OUTPUT_ENABLED */
/*
 * Converts UTF-16LE to UTF-8. A trailing odd byte of the input is
 * never looked at. Surrogate pairs are combined; an unpaired or
 * misordered surrogate is an input error. A high surrogate cut off at
 * the end of the input is left unconsumed.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE if the output
 * is full, or XML_ENC_ERR_INPUT for invalid surrogates.
 */
static int
UTF16LEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *src = in;
    const unsigned char *srcEnd = in + (*inlen & ~1);
    unsigned char *dst = out;
    unsigned char *dstEnd = out + *outlen;
    int res = XML_ENC_ERR_SPACE;

    for (;;) {
        unsigned unit;

        if (src >= srcEnd) {
            /* all complete code units consumed */
            res = dst - out;
            break;
        }

        unit = src[0] | (src[1] << 8);

        if ((unit & 0xF800) == 0xD800) {
            unsigned low, cp;

            /* Surrogate pair: the first unit must be a high surrogate */
            if ((unit & 0xFC00) != 0xD800) {
                res = XML_ENC_ERR_INPUT;
                break;
            }
            if (srcEnd - src < 4) {
                /* second half not available yet: stop without an error */
                res = dst - out;
                break;
            }
            low = src[2] | (src[3] << 8);
            if ((low & 0xFC00) != 0xDC00) {
                res = XML_ENC_ERR_INPUT;
                break;
            }
            if (dstEnd - dst < 4)
                break;

            cp = 0x10000 + ((unit & 0x03FF) << 10) + (low & 0x03FF);
            dst[0] = (cp >> 18) | 0xF0;
            dst[1] = ((cp >> 12) & 0x3F) | 0x80;
            dst[2] = ((cp >> 6) & 0x3F) | 0x80;
            dst[3] = (cp & 0x3F) | 0x80;
            src += 4;
            dst += 4;
        } else if (unit < 0x80) {
            if (dst >= dstEnd)
                break;
            dst[0] = unit;
            src += 2;
            dst += 1;
        } else if (unit < 0x800) {
            if (dstEnd - dst < 2)
                break;
            dst[0] = (unit >> 6) | 0xC0;
            dst[1] = (unit & 0x3F) | 0x80;
            src += 2;
            dst += 2;
        } else {
            if (dstEnd - dst < 3)
                break;
            dst[0] = (unit >> 12) | 0xE0;
            dst[1] = ((unit >> 6) & 0x3F) | 0x80;
            dst[2] = (unit & 0x3F) | 0x80;
            src += 2;
            dst += 3;
        }
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(res);
}
#ifdef LIBXML_OUTPUT_ENABLED
/*
 * Converts UTF-8 to UTF-16LE without a byte order mark. Only an even
 * number of output bytes is used. Invalid lead or continuation bytes,
 * overlong forms, surrogates and values above 0x10FFFF are input
 * errors. A sequence cut off at the end of the input is left
 * unconsumed. A NULL @in is an initialization call producing nothing.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed, unless @out, @outlen or @inlen was NULL.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE,
 * XML_ENC_ERR_INPUT, or XML_ENC_ERR_INTERNAL for NULL arguments.
 */
static int
UTF8ToUTF16LE(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *src = in;
    const unsigned char *srcEnd;
    unsigned char *dst = out;
    unsigned char *dstEnd;
    int res = XML_ENC_ERR_SPACE;

    /* UTF16LE encoding has no BOM */
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    srcEnd = in + *inlen;
    dstEnd = out + (*outlen & ~1);

    for (;;) {
        unsigned cp, lowest;
        int seqLen, k, malformed;

        if (src >= srcEnd) {
            /* all input consumed */
            res = dst - out;
            break;
        }

        cp = src[0];

        if (cp < 0x80) {
            if (dst >= dstEnd)
                break;
            dst[0] = cp;
            dst[1] = 0;
            src += 1;
            dst += 2;
            continue;
        }

        /* Classify the lead byte: payload bits, length, smallest value */
        if (cp < 0xC2) {
            res = XML_ENC_ERR_INPUT;
            break;
        } else if (cp < 0xE0) {
            cp &= 0x1F;
            seqLen = 2;
            lowest = 0x80;
        } else if (cp < 0xF0) {
            cp &= 0x0F;
            seqLen = 3;
            lowest = 0x800;
        } else {
            cp &= 0x0F;
            seqLen = 4;
            lowest = 0x10000;
        }

        if (srcEnd - src < seqLen) {
            /* incomplete sequence: stop without an error */
            res = dst - out;
            break;
        }

        malformed = 0;
        for (k = 1; k < seqLen; k++) {
            if ((src[k] & 0xC0) != 0x80) {
                malformed = 1;
                break;
            }
            cp = (cp << 6) | (src[k] & 0x3F);
        }

        if ((malformed) ||
            (cp < lowest) ||
            ((cp >= 0xD800) && (cp <= 0xDFFF)) ||
            (cp > 0x10FFFF)) {
            res = XML_ENC_ERR_INPUT;
            break;
        }

        if (cp < 0x10000) {
            if (dst >= dstEnd)
                break;
            dst[0] = cp & 0xFF;
            dst[1] = cp >> 8;
            dst += 2;
        } else {
            unsigned hi, lo;

            if (dstEnd - dst < 4)
                break;
            cp -= 0x10000;
            lo = (cp & 0x03FF) | 0xDC00;
            hi = (cp >> 10)    | 0xD800;
            dst[0] = hi & 0xFF;
            dst[1] = hi >> 8;
            dst[2] = lo & 0xFF;
            dst[3] = lo >> 8;
            dst += 4;
        }

        src += seqLen;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(res);
}
/*
 * Converts UTF-8 to UTF-16. The initialization call (@in == NULL)
 * writes the little-endian byte order mark if at least two bytes are
 * available; all other calls are forwarded to UTF8ToUTF16LE.
 *
 * Returns 2 or 0 for the initialization call, depending on whether the
 * mark was written, otherwise the result of UTF8ToUTF16LE.
 */
static int
UTF8ToUTF16(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen,
            void *vctxt ATTRIBUTE_UNUSED) {
    if (in != NULL)
        return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));

    /*
     * initialization, add the Byte Order Mark for UTF-16LE
     */
    if (*outlen < 2) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    outb[0] = 0xFF;
    outb[1] = 0xFE;
    *outlen = 2;
    *inlen = 0;
    return(2);
}
#endif /* LIBXML_OUTPUT_ENABLED */
/*
 * Converts UTF-16BE to UTF-8. A trailing odd byte of the input is
 * never looked at. Surrogate pairs are combined; an unpaired or
 * misordered surrogate is an input error. A high surrogate cut off at
 * the end of the input is left unconsumed.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE if the output
 * is full, or XML_ENC_ERR_INPUT for invalid surrogates.
 */
static int
UTF16BEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *src = in;
    const unsigned char *srcEnd = in + (*inlen & ~1);
    unsigned char *dst = out;
    unsigned char *dstEnd = out + *outlen;
    int res = XML_ENC_ERR_SPACE;

    for (;;) {
        unsigned unit;

        if (src >= srcEnd) {
            /* all complete code units consumed */
            res = dst - out;
            break;
        }

        unit = (src[0] << 8) | src[1];

        if ((unit & 0xF800) == 0xD800) {
            unsigned low, cp;

            /* Surrogate pair: the first unit must be a high surrogate */
            if ((unit & 0xFC00) != 0xD800) {
                res = XML_ENC_ERR_INPUT;
                break;
            }
            if (srcEnd - src < 4) {
                /* second half not available yet: stop without an error */
                res = dst - out;
                break;
            }
            low = (src[2] << 8) | src[3];
            if ((low & 0xFC00) != 0xDC00) {
                res = XML_ENC_ERR_INPUT;
                break;
            }
            if (dstEnd - dst < 4)
                break;

            cp = 0x10000 + ((unit & 0x03FF) << 10) + (low & 0x03FF);
            dst[0] = (cp >> 18) | 0xF0;
            dst[1] = ((cp >> 12) & 0x3F) | 0x80;
            dst[2] = ((cp >> 6) & 0x3F) | 0x80;
            dst[3] = (cp & 0x3F) | 0x80;
            src += 4;
            dst += 4;
        } else if (unit < 0x80) {
            if (dst >= dstEnd)
                break;
            dst[0] = unit;
            src += 2;
            dst += 1;
        } else if (unit < 0x800) {
            if (dstEnd - dst < 2)
                break;
            dst[0] = (unit >> 6) | 0xC0;
            dst[1] = (unit & 0x3F) | 0x80;
            src += 2;
            dst += 2;
        } else {
            if (dstEnd - dst < 3)
                break;
            dst[0] = (unit >> 12) | 0xE0;
            dst[1] = ((unit >> 6) & 0x3F) | 0x80;
            dst[2] = (unit & 0x3F) | 0x80;
            src += 2;
            dst += 3;
        }
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(res);
}
#ifdef LIBXML_OUTPUT_ENABLED
/*
 * Converts UTF-8 to UTF-16BE without a byte order mark. Only an even
 * number of output bytes is used. Invalid lead or continuation bytes,
 * overlong forms, surrogates and values above 0x10FFFF are input
 * errors. A sequence cut off at the end of the input is left
 * unconsumed. A NULL @in is an initialization call producing nothing.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed, unless @out, @outlen or @inlen was NULL.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE,
 * XML_ENC_ERR_INPUT, or XML_ENC_ERR_INTERNAL for NULL arguments.
 */
static int
UTF8ToUTF16BE(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
    unsigned char *outend;
    unsigned c, d;
    int ret = XML_ENC_ERR_SPACE;

    /* UTF-16BE has no BOM */
    /* Named error code instead of a bare -1, as in UTF8ToUTF16LE. */
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + *inlen;
    outend = out + (*outlen & ~1);

    while (in < inend) {
        c = in[0];

        if (c < 0x80) {
            if (out >= outend)
                goto done;
            out[0] = 0;
            out[1] = c;
            in += 1;
            out += 2;
        } else {
            int i, len;
            unsigned min;

            /* Classify the lead byte: payload bits, length, smallest value */
            if (c < 0xE0) {
                if (c < 0xC2) {
                    ret = XML_ENC_ERR_INPUT;
                    goto done;
                }
                c &= 0x1F;
                len = 2;
                min = 0x80;
            } else if (c < 0xF0) {
                c &= 0x0F;
                len = 3;
                min = 0x800;
            } else {
                c &= 0x0F;
                len = 4;
                min = 0x10000;
            }

            /* incomplete sequence: stop without an error */
            if (inend - in < len)
                break;

            for (i = 1; i < len; i++) {
                if ((in[i] & 0xC0) != 0x80) {
                    ret = XML_ENC_ERR_INPUT;
                    goto done;
                }
                c = (c << 6) | (in[i] & 0x3F);
            }

            if ((c < min) ||
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
                (c > 0x10FFFF)) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }

            if (c < 0x10000) {
                if (out >= outend)
                    goto done;
                out[0] = c >> 8;
                out[1] = c & 0xFF;
                out += 2;
            } else {
                if (outend - out < 4)
                    goto done;
                c -= 0x10000;
                d = (c & 0x03FF) | 0xDC00;
                c = (c >> 10)    | 0xD800;
                out[0] = c >> 8;
                out[1] = c & 0xFF;
                out[2] = d >> 8;
                out[3] = d & 0xFF;
                out += 4;
            }

            in += len;
        }
    }

    ret = out - outstart;

done:
    *outlen = out - outstart;
    *inlen = in - instart;
    return(ret);
}
#endif /* LIBXML_OUTPUT_ENABLED */
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
/* Adapts UTF8ToHtml to the converter signature; the context is unused. */
static int
UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
                  const unsigned char *in, int *inlen,
                  void *vctxt ATTRIBUTE_UNUSED) {
    int res = UTF8ToHtml(out, outlen, in, inlen);

    return(res);
}
#endif
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
defined(LIBXML_ISO8859X_ENABLED)
/*
 * Converts UTF-8 to a single-byte encoding using the lookup table
 * passed as @vctxt. Bytes below 0x80 are copied; two- and three-byte
 * sequences are looked up, with a zero table entry meaning the
 * character is not in the target set. Longer sequences are rejected.
 * A sequence cut off at the end of the input is left unconsumed.
 * A NULL @in is an initialization call producing nothing.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE if the output
 * is full, or XML_ENC_ERR_INPUT for unconvertible input.
 */
static int
UTF8ToISO8859x(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen, void *vctxt) {
    const unsigned char *table = vctxt;
    const unsigned char *src = in;
    const unsigned char *srcEnd;
    unsigned char *dst = out;
    unsigned char *dstEnd;
    int res = XML_ENC_ERR_SPACE;

    if (in == NULL) {
        /*
        * initialization nothing to do
        */
        *outlen = 0;
        *inlen = 0;
        return(XML_ENC_ERR_SUCCESS);
    }

    srcEnd = in + *inlen;
    dstEnd = out + *outlen;

    for (;;) {
        unsigned d;
        int step;

        if (src >= srcEnd) {
            /* all input consumed */
            res = dst - out;
            break;
        }

        d = *src;

        if (d < 0x80) {
            step = 1;
        } else if (d < 0xE0) {
            unsigned c;

            if (srcEnd - src < 2) {
                /* incomplete sequence: stop without an error */
                res = dst - out;
                break;
            }
            c = src[1] & 0x3F;
            d = d & 0x1F;
            d = table [48 + c + table [d] * 64];
            if (d == 0) {
                /* not in character set */
                res = XML_ENC_ERR_INPUT;
                break;
            }
            step = 2;
        } else if (d < 0xF0) {
            unsigned c1;
            unsigned c2;

            if (srcEnd - src < 3) {
                /* incomplete sequence: stop without an error */
                res = dst - out;
                break;
            }
            c1 = src[1] & 0x3F;
            c2 = src[2] & 0x3F;
            d = d & 0x0F;
            d = table [48 + c2 + table [48 + c1 +
                       table [32 + d] * 64] * 64];
            if (d == 0) {
                /* not in character set */
                res = XML_ENC_ERR_INPUT;
                break;
            }
            step = 3;
        } else {
            /* cannot transcode >= U+010000 */
            res = XML_ENC_ERR_INPUT;
            break;
        }

        /* The output byte is known; store it if there is room. */
        if (dst >= dstEnd)
            break;
        src += step;
        *dst++ = d;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(res);
}
/*
 * Converts a single-byte encoding to UTF-8. Bytes below 0x80 are
 * copied; other bytes are mapped through the 16-bit table passed as
 * @vctxt (indexed by byte - 0x80), with a zero entry meaning the byte
 * is undefined. Mapped values are written as two or three bytes.
 *
 * On return @outlen holds the bytes written and @inlen the bytes
 * consumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE if the output
 * is full, or XML_ENC_ERR_INPUT for an undefined byte.
 */
static int
ISO8859xToUTF8(unsigned char* out, int *outlen,
               const unsigned char* in, int *inlen, void *vctxt) {
    unsigned short const *table = vctxt;
    const unsigned char *src = in;
    const unsigned char *srcEnd;
    unsigned char *dst = out;
    unsigned char *dstEnd;
    int res = XML_ENC_ERR_SPACE;

    dstEnd = out + *outlen;
    srcEnd = in + *inlen;

    for (;;) {
        unsigned c;

        if (src >= srcEnd) {
            /* all input consumed */
            res = dst - out;
            break;
        }

        c = *src;

        if (c < 0x80) {
            if (dst >= dstEnd)
                break;
            *dst++ = c;
        } else {
            c = table[c - 0x80];
            if (c == 0) {
                /* undefined code point */
                res = XML_ENC_ERR_INPUT;
                break;
            }

            if (c < 0x800) {
                if (dstEnd - dst < 2)
                    break;
                *dst++ = ((c >> 6) & 0x1F) | 0xC0;
                *dst++ = (c & 0x3F) | 0x80;
            } else {
                if (dstEnd - dst < 3)
                    break;
                *dst++ = ((c >> 12) & 0x0F) | 0xE0;
                *dst++ = ((c >> 6) & 0x3F) | 0x80;
                *dst++ = (c & 0x3F) | 0x80;
            }
        }

        src += 1;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(res);
}
#endif