2001-02-23 20:55:21 +03:00
/*
* encoding . c : implements the encoding conversion functions needed for XML
*
* Related specs :
* rfc2044 ( UTF - 8 and UTF - 16 ) F . Yergeau Alis Technologies
* rfc2781 UTF - 16 , an encoding of ISO 10646 , P . Hoffman , F . Yergeau
* [ ISO - 10646 ] UTF - 8 and UTF - 16 in Annexes
* [ ISO - 8859 - 1 ] ISO Latin - 1 characters codes .
* [ UNICODE ] The Unicode Consortium , " The Unicode Standard --
* Worldwide Character Encoding - - Version 1.0 " , Addison-
* Wesley , Volume 1 , 1991 , Volume 2 , 1992. UTF - 8 is
* described in Unicode Technical Report # 4.
* [ US - ASCII ] Coded Character Set - - 7 - bit American Standard Code for
* Information Interchange , ANSI X3 .4 - 1986.
*
* See Copyright for the status of this software .
*
2001-06-24 16:13:24 +04:00
* daniel @ veillard . com
2001-05-30 23:14:17 +04:00
*
* UTF8 string routines from :
* " William M. Brack " < wbrack @ mmm . com . hk >
*
* Original code for IsoLatin1 and UTF - 16 by " Martin J. Duerst " < duerst @ w3 . org >
2001-02-23 20:55:21 +03:00
*/
2002-03-18 22:37:11 +03:00
# define IN_LIBXML
2001-04-21 20:57:29 +04:00
# include "libxml.h"
2001-02-23 20:55:21 +03:00
# include <string.h>
# ifdef HAVE_CTYPE_H
# include <ctype.h>
# endif
# ifdef HAVE_STDLIB_H
# include <stdlib.h>
# endif
# ifdef LIBXML_ICONV_ENABLED
# ifdef HAVE_ERRNO_H
# include <errno.h>
# endif
# endif
# include <libxml/encoding.h>
# include <libxml/xmlmemory.h>
# ifdef LIBXML_HTML_ENABLED
# include <libxml/HTMLparser.h>
# endif
2001-10-15 16:32:07 +04:00
# include <libxml/globals.h>
2001-11-04 23:19:12 +03:00
# include <libxml/xmlerror.h>
2001-02-23 20:55:21 +03:00
2001-07-16 04:06:07 +04:00
/*
 * File-local slots for the UTF-16 little endian and big endian encoding
 * handlers. Both start out as NULL.
 * NOTE(review): presumably filled in when the default handlers get
 * registered; that code is not part of this excerpt - confirm.
 */
static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL ;
static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL ;
2001-02-23 20:55:21 +03:00
/*
 * One entry of the encoding alias table: @alias is the alternate spelling
 * that gets looked up, @name is the encoding name handed back for it.
 */
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias ;
typedef xmlCharEncodingAlias * xmlCharEncodingAliasPtr ;
struct _xmlCharEncodingAlias {
const char * name ; /* encoding name returned by xmlGetEncodingAlias() */
const char * alias ; /* string compared with the upper-cased lookup key */
} ;
/*
 * The alias table itself: the array, the number of entries in use and the
 * number of entries allocated. The array is NULL until an alias is added.
 */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL ;
static int xmlCharEncodingAliasesNb = 0 ;
static int xmlCharEncodingAliasesMax = 0 ;
# ifdef LIBXML_ICONV_ENABLED
/* Encoding traces are compiled out; switch the 0 to 1 to enable them. */
#if 0
# define DEBUG_ENCODING /* Define this to get encoding traces */
# endif
# endif
/*
 * Byte order flag read by the UTF-16 converters of this file: when it is
 * non-zero they access little endian data as native 16-bit words and swap
 * bytes for big endian data, and the other way round when it is zero.
 * It is initialized to 1 here.
 * NOTE(review): presumably recomputed for the running host during
 * initialization, which is not part of this excerpt - confirm.
 */
static int xmlLittleEndian = 1 ;
2001-05-30 23:14:17 +04:00
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)     UTF-8 octet sequence (binary)                 *
 * 0000 0000-0000 007F    0xxxxxxx                                      *
 * 0000 0080-0000 07FF    110xxxxx 10xxxxxx                             *
 * 0000 0800-0000 FFFF    1110xxxx 10xxxxxx 10xxxxxx                    *
 * 0001 0000-001F FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           *
 *                                                                      *
 * Values above 0xFFFF were originally hoped not to show up anytime     *
 * soon; the routines below nevertheless accept the 4-byte form.        *
 *                                                                      *
 ************************************************************************/
2001-02-23 20:55:21 +03:00
2001-04-16 18:08:07 +04:00
/**
* xmlUTF8Strlen :
* @ utf : a sequence of UTF - 8 encoded bytes
*
2001-10-10 13:45:09 +04:00
* compute the length of an UTF8 string , it doesn ' t do a full UTF8
2001-04-16 18:08:07 +04:00
* checking of the content of the string .
*
* Returns the number of characters in the string or - 1 in case of error
*/
int
2001-05-30 23:14:17 +04:00
xmlUTF8Strlen ( const xmlChar * utf ) {
2001-04-16 18:08:07 +04:00
int ret = 0 ;
if ( utf = = NULL )
return ( - 1 ) ;
while ( * utf ! = 0 ) {
if ( utf [ 0 ] & 0x80 ) {
if ( ( utf [ 1 ] & 0xc0 ) ! = 0x80 )
return ( - 1 ) ;
if ( ( utf [ 0 ] & 0xe0 ) = = 0xe0 ) {
if ( ( utf [ 2 ] & 0xc0 ) ! = 0x80 )
return ( - 1 ) ;
if ( ( utf [ 0 ] & 0xf0 ) = = 0xf0 ) {
if ( ( utf [ 0 ] & 0xf8 ) ! = 0xf0 | | ( utf [ 3 ] & 0xc0 ) ! = 0x80 )
return ( - 1 ) ;
utf + = 4 ;
} else {
utf + = 3 ;
}
} else {
utf + = 2 ;
}
} else {
utf + + ;
}
ret + + ;
}
return ( ret ) ;
}
2001-02-23 20:55:21 +03:00
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the number of bytes available in @utf, updated
 *        with the number of bytes used by the character
 *
 * Read one UTF-8 character from @utf. Sequences of 1 to 4 bytes are
 * decoded; the trailing bytes must all carry the 10xxxxxx marker and
 * must fit in the *@len bytes available.
 *
 * Returns the character value, or -1 in case of error. On error *@len is
 * set to 0, provided @len itself is not NULL.
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if (((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len is NULL, so guard the store */
    if (len != NULL)
        *len = 0;
    return(-1);
}
/**
 * xmlCheckUTF8:
 * @utf:  pointer to a putative zero terminated UTF-8 encoded string
 *
 * Checks @utf for being valid UTF-8. The accepted byte patterns are:
 *    0xxxxxxx                                  1-byte code
 *    110xxxxx 10xxxxxx                         2-byte code
 *    1110xxxx 10xxxxxx 10xxxxxx                3-byte code
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx       4-byte code
 * This function is not super-strict: sequences longer than necessary for
 * the value they carry are accepted, and the 0x10ffff maximum value is
 * not checked. Any other lead byte, including a 10xxxxxx byte in lead
 * position, is rejected.
 *
 * Returns 1 if @utf is valid, 0 otherwise (including when @utf is NULL)
 */
int
xmlCheckUTF8(const unsigned char *utf)
{
    unsigned char c;
    int width;

    if (utf == NULL)
        return(0);

    while ((c = utf[0]) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            width = 1;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[1] & 0xc0) != 0x80)
                return(0);
            width = 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code; the tests are ordered so that no byte past
             * the terminating zero is ever read
             */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80))
                return(0);
            width = 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[1] & 0xc0) != 0x80) ||
                ((utf[2] & 0xc0) != 0x80) ||
                ((utf[3] & 0xc0) != 0x80))
                return(0);
            width = 4;
        } else {
            /* trailing byte in lead position or lead byte too long */
            return(0);
        }
        utf += width;
    }
    return(1);
}
2001-05-30 23:14:17 +04:00
/**
 * xmlUTF8Strsize:
 * @utf:  a zero terminated sequence of UTF-8 encoded bytes
 * @len:  the number of characters to measure
 *
 * Compute the storage size of the first @len characters of @utf. The
 * width of each character is derived from the number of leading 1 bits
 * of its lead byte. Counting stops at the end of the string, also when
 * the string ends in the middle of a multi-byte sequence.
 *
 * Returns the storage size in bytes of the first @len characters, or 0
 * if @utf is NULL or @len is not positive
 */
int
xmlUTF8Strsize(const xmlChar *utf, int len) {
    const xmlChar *ptr = utf;
    xmlChar ch;

    if (utf == NULL)
        return(0);
    if (len <= 0)
        return(0);

    while (len-- > 0) {
        if (*ptr == 0)
            break;
        ch = *ptr++;
        if (ch & 0x80) {
            /* one trailing byte for each further leading 1 bit */
            while ((ch <<= 1) & 0x80) {
                /* truncated sequence: never step over the terminator */
                if (*ptr == 0)
                    break;
                ptr++;
            }
        }
    }
    return(ptr - utf);
}
/**
 * xmlUTF8Strndup:
 * @utf:  the input UTF-8 string
 * @len:  the number of characters of @utf to copy
 *
 * A strndup for UTF-8 strings: the length is given in characters, the
 * matching byte size is obtained from xmlUTF8Strsize(). The copy is
 * always zero terminated.
 *
 * Returns a newly allocated string, or NULL if @utf is NULL, @len is
 * negative or the allocation failed
 */
xmlChar *
xmlUTF8Strndup(const xmlChar *utf, int len) {
    xmlChar *ret;
    int i;

    if ((utf == NULL) || (len < 0))
        return(NULL);

    i = xmlUTF8Strsize(utf, len);
    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
    if (ret == NULL) {
        /* report the size really requested, which is i + 1 bytes */
        xmlGenericError(xmlGenericErrorContext,
                " malloc of %ld byte failed \n ",
                (i + 1) * (long) sizeof(xmlChar));
        return(NULL);
    }
    memcpy(ret, utf, i * sizeof(xmlChar));
    ret[i] = 0;
    return(ret);
}
/**
 * xmlUTF8Strpos:
 * @utf:  the input UTF-8 string
 * @pos:  the index of the wanted character, counted in characters
 *
 * The UTF-8 equivalent of indexing a character in a string array. @pos
 * must be lower than the length reported by xmlUTF8Strlen().
 *
 * Returns a pointer to the first byte of the character, or NULL if @utf
 * is NULL, @pos is out of range or a malformed sequence is met
 */
xmlChar *
xmlUTF8Strpos(const xmlChar *utf, int pos) {
    const xmlChar *cur = utf;
    int remaining;

    if (cur == NULL)
        return(NULL);
    if (pos < 0)
        return(NULL);
    if (pos >= xmlUTF8Strlen(cur))
        return(NULL);

    for (remaining = pos; remaining != 0; remaining--) {
        xmlChar lead = *cur++;

        if (lead == 0)
            return(NULL);
        if ((lead & 0x80) == 0)
            continue;
        /* a multi-byte character has to start with 11xxxxxx */
        if ((lead & 0xc0) != 0xc0)
            return(NULL);
        /* step over one trailing byte per further leading 1 bit */
        for (lead <<= 1; lead & 0x80; lead <<= 1) {
            if ((*cur++ & 0xc0) != 0x80)
                return(NULL);
        }
    }
    return((xmlChar *) cur);
}
/**
 * xmlUTF8Strloc:
 * @utf:  the input UTF-8 string
 * @utfchar:  the UTF-8 character to be found
 *
 * Look for the first occurrence of the character starting @utfchar in
 * @utf. The comparison is done on the bytes of that single character.
 *
 * Returns the position of the character counted in characters, or -1 if
 * it is not found, an argument is NULL or @utf is malformed
 */
int
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
    int index = 0;
    int width;
    xmlChar lead;

    if ((utf == NULL) || (utfchar == NULL))
        return(-1);

    /* byte width of the character searched for */
    width = xmlUTF8Strsize(utfchar, 1);

    while ((lead = *utf) != 0) {
        if (xmlStrncmp(utf, utfchar, width) == 0)
            return(index);
        utf++;
        if (lead & 0x80) {
            /* a multi-byte character has to start with 11xxxxxx */
            if ((lead & 0xc0) != 0xc0)
                return(-1);
            /* step over one trailing byte per further leading 1 bit */
            for (lead <<= 1; lead & 0x80; lead <<= 1) {
                if ((*utf++ & 0xc0) != 0x80)
                    return(-1);
            }
        }
        index++;
    }
    return(-1);
}
/**
 * xmlUTF8Strsub:
 * @utf:  a zero terminated sequence of UTF-8 encoded bytes
 * @start:  position of the first character to copy
 * @len:  number of characters to copy
 *
 * Extract a substring; both @start and @len are counted in UTF-8
 * characters, not in bytes. The copy is made by xmlUTF8Strndup().
 *
 * Returns a pointer to a newly created string, or NULL if an argument is
 * invalid, @utf ends or is malformed before @start, or the copy fails
 */
xmlChar *
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
    int skipped = 0;

    if ((utf == NULL) || (start < 0) || (len < 0))
        return(NULL);

    /* walk over the characters located before the substring */
    while (skipped < start) {
        xmlChar lead = *utf++;

        if (lead == 0)
            return(NULL);
        if (lead & 0x80) {
            /* a multi-byte character has to start with 11xxxxxx */
            if ((lead & 0xc0) != 0xc0)
                return(NULL);
            /* step over one trailing byte per further leading 1 bit */
            for (lead <<= 1; lead & 0x80; lead <<= 1) {
                if ((*utf++ & 0xc0) != 0x80)
                    return(NULL);
            }
        }
        skipped++;
    }

    return(xmlUTF8Strndup(utf, len));
}
/************************************************************************
* *
* Conversions To / From UTF8 encoding *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2001-02-23 20:55:21 +03:00
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. An ASCII char is its own UTF-8 encoding, so each
 * input byte yields exactly one output byte and the conversion goes on
 * until either the input or the output space is exhausted.
 *
 * Returns 0 if success, or -1 if a byte above 0x7F is met.
 * In both cases the value of @inlen after return is the number of octets
 * consumed and the value of @outlen is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char *out, int *outlen,
            const unsigned char *in, int *inlen) {
    unsigned char *outstart = out;
    unsigned char *outend = out + *outlen;
    const unsigned char *base = in;
    const unsigned char *inend = in + *inlen;

    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not ASCII: report what was converted up to this byte */
            *outlen = out - outstart;
            *inlen = in - base;
            return(-1);
        }
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is an initialization call for which
 * there is nothing to do. A multi-byte sequence cut by the end of the
 * input is left unconsumed so that it can be completed by a later call.
 *
 * Returns 0 if success, or -2 if the transcoding fails: invalid lead
 * byte, invalid trailing byte or character outside of the ASCII range.
 * In both cases the value of @inlen after return is the number of octets
 * consumed and the value of @outlen is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char *out, int *outlen,
            const unsigned char *in, int *inlen) {
    const unsigned char *processed = in;
    const unsigned char *outend;
    const unsigned char *outstart = out;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in Ascii */
            goto invalid;
        }

        /* sequence cut by the end of the input, wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /* never use a value assembled from a broken sequence */
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            goto invalid;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

invalid:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied, the other ones are
 * expanded to a 2-byte sequence. The conversion stops when the input is
 * exhausted or when the next character does not fit in @out anymore.
 *
 * Returns 0 if success, or -1 if one of the arguments is NULL.
 * On success the value of @inlen after return is the number of octets
 * consumed and the value of @outlen is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen) {
    unsigned char *outstart = out;
    const unsigned char *base = in;
    unsigned char *outend;
    const unsigned char *inend;
    const unsigned char *instop;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);

    outend = out + *outlen;
    inend = in + *inlen;
    instop = inend;

    /*
     * Each pass converts at most one non ASCII char, which needs 2 bytes
     * of output, followed by a run of ASCII chars. *in is only read after
     * in has been checked against the end of the input.
     */
    while ((in < inend) && (outend - out > 1)) {
        if (*in >= 0x80) {
            *out++ = ((*in >> 6) & 0x1F) | 0xC0;
            *out++ = (*in & 0x3F) | 0x80;
            ++in;
        }
        /* bound the ASCII run by the output space still available */
        if ((instop - in) > (outend - out))
            instop = in + (outend - out);
        while ((in < instop) && (*in < 0x80))
            *out++ = *in++;
    }
    /* a single byte of output left can still take one ASCII char */
    if ((in < inend) && (out < outend) && (*in < 0x80))
        *out++ = *in++;

    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
2002-05-31 13:47:30 +04:00
2001-02-23 20:55:21 +03:00
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is an initialization call for which
 * there is nothing to do. A multi-byte sequence cut by the end of the
 * input is left unconsumed so that it can be completed by a later call.
 *
 * Returns 0 if success, -2 if the transcoding fails (invalid lead byte,
 * invalid trailing byte or character above 0xFF), or -1 if @outlen or
 * @inlen is NULL, or if @out is NULL while @in is not.
 * Unless -1 is returned the value of @inlen after return is the number
 * of octets consumed and the value of @outlen is the number of octets
 * produced.
 */
int
UTF8Toisolat1(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen) {
    const unsigned char *processed = in;
    const unsigned char *outend;
    const unsigned char *outstart = out;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned int c, d;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL)
        return(-1);

    inend = in + (*inlen);
    outend = out + (*outlen);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in IsoLat1 */
            goto invalid;
        }

        /* sequence cut by the end of the input, wait for the rest */
        if (inend - in < trailing)
            break;

        /* the test above guarantees that all trailing bytes are there */
        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c > 0xFF) {
            /* no chance for this in IsoLat1 */
            goto invalid;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

invalid:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE data passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE data in and try to convert it to an UTF-8
 * block of chars out. The 16-bit units are assembled from the individual
 * input bytes, so the result depends neither on the byte order of the
 * host nor on the alignment of @inb. A character is only written when
 * its whole UTF-8 sequence fits in @out. A trailing odd byte, or a high
 * surrogate whose low surrogate is not yet available, is left
 * unconsumed.
 *
 * Returns 0 if success, or -2 if the transcoding fails because a high
 * surrogate is not followed by a low surrogate.
 * In both cases the value of @inlenb after return is the number of
 * octets consumed and the value of @outlen is the number of octets
 * produced.
 */
static int
UTF16LEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend = out + *outlen;
    const unsigned char *in = inb;
    const unsigned char *inend;
    const unsigned char *processed = inb;
    unsigned int c, d, lead;
    int needed, shift;

    /* an odd trailing byte cannot form a 16-bit unit */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        /* little endian: low byte first */
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - in < 2) {
                /* the second half of the pair is not there yet */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) {
            needed = 1; lead = 0x00;
        } else if (c < 0x800) {
            needed = 2; lead = 0xC0;
        } else if (c < 0x10000) {
            needed = 3; lead = 0xE0;
        } else {
            needed = 4; lead = 0xF0;
        }
        /* never emit a truncated sequence */
        if (outend - out < needed)
            break;

        shift = 6 * (needed - 1);
        *out++ = (unsigned char) (lead | (c >> shift));
        while (shift > 0) {
            shift -= 6;
            *out++ = (unsigned char) (0x80 | ((c >> shift) & 0x3F));
        }
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. A NULL @in is an initialization call: the Byte
 * Order Mark FF FE is written if @outb can hold it. The 16-bit units are
 * stored byte by byte, low byte first, so the result depends neither on
 * the byte order of the host nor on the alignment of @outb. A multi-byte
 * sequence cut by the end of the input is left unconsumed, and the
 * conversion stops silently on a value of 0x110000 or more.
 *
 * Returns 0 if success, 2 if the Byte Order Mark was written by the
 * initialization call, or -2 if the transcoding fails because of an
 * invalid lead byte or an invalid trailing byte.
 * The value of @inlen after return is the number of octets consumed and
 * the value of @outlen is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char *outb, int *outlen,
              const unsigned char *in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    " Added FFFE Byte Order Mark \n ");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in UTF-16 */
            goto invalid;
        }

        /* sequence cut by the end of the input, wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /* never use a value assembled from a broken sequence */
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            unsigned int high, low;

            /* a surrogate pair needs two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high & 0xFF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) (low & 0xFF);
            *out++ = (unsigned char) (low >> 8);
        } else {
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    /* consumed octets are counted from the start of the input */
    *inlen = processed - instart;
    return(0);

invalid:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE data passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE data in and try to convert it to an UTF-8
 * block of chars out. The 16-bit units are assembled from the individual
 * input bytes, so the result depends neither on the byte order of the
 * host nor on the alignment of @inb. A character is only written when
 * its whole UTF-8 sequence fits in @out. A trailing odd byte, or a high
 * surrogate whose low surrogate is not yet available, is left
 * unconsumed, like UTF16LEToUTF8() does.
 *
 * Returns 0 if success, or -2 if the transcoding fails because a high
 * surrogate is not followed by a low surrogate.
 * In both cases the value of @inlenb after return is the number of
 * octets consumed and the value of @outlen is the number of octets
 * produced.
 */
static int
UTF16BEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend = out + *outlen;
    const unsigned char *in = inb;
    const unsigned char *inend;
    const unsigned char *processed = inb;
    unsigned int c, d, lead;
    int needed, shift;

    /* an odd trailing byte cannot form a 16-bit unit */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        /* big endian: high byte first */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - in < 2) {
                /* the second half of the pair is not there yet */
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) {
            needed = 1; lead = 0x00;
        } else if (c < 0x800) {
            needed = 2; lead = 0xC0;
        } else if (c < 0x10000) {
            needed = 3; lead = 0xE0;
        } else {
            needed = 4; lead = 0xF0;
        }
        /* never emit a truncated sequence and flag it as processed */
        if (outend - out < needed)
            break;

        shift = 6 * (needed - 1);
        *out++ = (unsigned char) (lead | (c >> shift));
        while (shift > 0) {
            shift -= 6;
            *out++ = (unsigned char) (0x80 | ((c >> shift) & 0x3F));
        }
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. A NULL @in is an initialization call: the Byte
 * Order Mark FE FF is written if @outb can hold it. The 16-bit units are
 * stored byte by byte, high byte first, so the result depends neither on
 * the byte order of the host nor on the alignment of @outb. A multi-byte
 * sequence cut by the end of the input is left unconsumed, and the
 * conversion stops silently on a value of 0x110000 or more.
 *
 * Returns 0 if success, 2 if the Byte Order Mark was written by the
 * initialization call, or -2 if the transcoding fails because of an
 * invalid lead byte or an invalid trailing byte.
 * The value of @inlen after return is the number of octets consumed and
 * the value of @outlen is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char *outb, int *outlen,
              const unsigned char *in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    " Added FEFF Byte Order Mark \n ");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in UTF-16 */
            goto invalid;
        }

        /* sequence cut by the end of the input, wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /* never use a value assembled from a broken sequence */
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        } else if (c < 0x110000) {
            unsigned int high, low;

            /* a surrogate pair needs two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) (high & 0xFF);
            *out++ = (unsigned char) (low >> 8);
            *out++ = (unsigned char) (low & 0xFF);
        } else {
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    /* consumed octets are counted from the start of the input */
    *inlen = processed - instart;
    return(0);

invalid:
    /* the produced length is reported in bytes on the error path too */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
2001-05-30 23:14:17 +04:00
/************************************************************************
* *
* Generic encoding handling routines *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2001-02-23 20:55:21 +03:00
/**
* xmlDetectCharEncoding :
* @ in : a pointer to the first bytes of the XML entity , must be at least
* 4 bytes long .
* @ len : pointer to the length of the buffer
*
* Guess the encoding of the entity using the first bytes of the entity content
* accordingly of the non - normative appendix F of the XML - 1.0 recommendation .
*
* Returns one of the XML_CHAR_ENCODING_ . . . values .
*/
xmlCharEncoding
xmlDetectCharEncoding ( const unsigned char * in , int len )
{
if ( len > = 4 ) {
if ( ( in [ 0 ] = = 0x00 ) & & ( in [ 1 ] = = 0x00 ) & &
( in [ 2 ] = = 0x00 ) & & ( in [ 3 ] = = 0x3C ) )
return ( XML_CHAR_ENCODING_UCS4BE ) ;
if ( ( in [ 0 ] = = 0x3C ) & & ( in [ 1 ] = = 0x00 ) & &
( in [ 2 ] = = 0x00 ) & & ( in [ 3 ] = = 0x00 ) )
return ( XML_CHAR_ENCODING_UCS4LE ) ;
if ( ( in [ 0 ] = = 0x00 ) & & ( in [ 1 ] = = 0x00 ) & &
( in [ 2 ] = = 0x3C ) & & ( in [ 3 ] = = 0x00 ) )
return ( XML_CHAR_ENCODING_UCS4_2143 ) ;
if ( ( in [ 0 ] = = 0x00 ) & & ( in [ 1 ] = = 0x3C ) & &
( in [ 2 ] = = 0x00 ) & & ( in [ 3 ] = = 0x00 ) )
return ( XML_CHAR_ENCODING_UCS4_3412 ) ;
if ( ( in [ 0 ] = = 0x4C ) & & ( in [ 1 ] = = 0x6F ) & &
( in [ 2 ] = = 0xA7 ) & & ( in [ 3 ] = = 0x94 ) )
return ( XML_CHAR_ENCODING_EBCDIC ) ;
if ( ( in [ 0 ] = = 0x3C ) & & ( in [ 1 ] = = 0x3F ) & &
( in [ 2 ] = = 0x78 ) & & ( in [ 3 ] = = 0x6D ) )
return ( XML_CHAR_ENCODING_UTF8 ) ;
}
2001-06-20 21:41:10 +04:00
if ( len > = 3 ) {
/*
* Errata on XML - 1.0 June 20 2001
* We now allow an UTF8 encoded BOM
*/
if ( ( in [ 0 ] = = 0xEF ) & & ( in [ 1 ] = = 0xBB ) & &
( in [ 2 ] = = 0xBF ) )
return ( XML_CHAR_ENCODING_UTF8 ) ;
}
2001-02-23 20:55:21 +03:00
if ( len > = 2 ) {
if ( ( in [ 0 ] = = 0xFE ) & & ( in [ 1 ] = = 0xFF ) )
return ( XML_CHAR_ENCODING_UTF16BE ) ;
if ( ( in [ 0 ] = = 0xFF ) & & ( in [ 1 ] = = 0xFE ) )
return ( XML_CHAR_ENCODING_UTF16LE ) ;
}
return ( XML_CHAR_ENCODING_NONE ) ;
}
/**
* xmlCleanupEncodingAliases :
*
* Unregisters all aliases
*/
void
xmlCleanupEncodingAliases ( void ) {
int i ;
if ( xmlCharEncodingAliases = = NULL )
return ;
for ( i = 0 ; i < xmlCharEncodingAliasesNb ; i + + ) {
if ( xmlCharEncodingAliases [ i ] . name ! = NULL )
xmlFree ( ( char * ) xmlCharEncodingAliases [ i ] . name ) ;
if ( xmlCharEncodingAliases [ i ] . alias ! = NULL )
xmlFree ( ( char * ) xmlCharEncodingAliases [ i ] . alias ) ;
}
xmlCharEncodingAliasesNb = 0 ;
xmlCharEncodingAliasesMax = 0 ;
xmlFree ( xmlCharEncodingAliases ) ;
2002-01-08 16:15:33 +03:00
xmlCharEncodingAliases = NULL ;
2001-02-23 20:55:21 +03:00
}
/**
 * xmlGetEncodingAlias:
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
 *
 * Lookup an encoding name for the given alias. The alias is converted to
 * upper case before being compared with the registered ones; only its
 * first 99 characters take part in the comparison.
 *
 * Returns NULL if not found, the original name otherwise
 */
const char *
xmlGetEncodingAlias(const char *alias) {
    int i;
    char upper[100];

    if (alias == NULL)
        return(NULL);

    if (xmlCharEncodingAliases == NULL)
        return(NULL);

    for (i = 0; i < 99; i++) {
        /*
         * toupper() is only defined for values representable as an
         * unsigned char, a plain char may be negative
         */
        upper[i] = toupper((unsigned char) alias[i]);
        if (upper[i] == 0)
            break;
    }
    upper[i] = 0;

    /*
     * Walk down the list looking for a definition of the alias
     */
    for (i = 0; i < xmlCharEncodingAliasesNb; i++) {
        if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
            return(xmlCharEncodingAliases[i].name);
        }
    }
    return(NULL);
}
/**
* xmlAddEncodingAlias :
* @ name : the encoding name as parsed , in UTF - 8 format ( ASCII actually )
* @ alias : the alias name as parsed , in UTF - 8 format ( ASCII actually )
*
* Registers and alias @ alias for an encoding named @ name . Existing alias
* will be overwritten .
*
* Returns 0 in case of success , - 1 in case of error
*/
int
xmlAddEncodingAlias ( const char * name , const char * alias ) {
int i ;
char upper [ 100 ] ;
if ( ( name = = NULL ) | | ( alias = = NULL ) )
return ( - 1 ) ;
for ( i = 0 ; i < 99 ; i + + ) {
upper [ i ] = toupper ( alias [ i ] ) ;
if ( upper [ i ] = = 0 ) break ;
}
upper [ i ] = 0 ;
if ( xmlCharEncodingAliases = = NULL ) {
xmlCharEncodingAliasesNb = 0 ;
xmlCharEncodingAliasesMax = 20 ;
xmlCharEncodingAliases = ( xmlCharEncodingAliasPtr )
xmlMalloc ( xmlCharEncodingAliasesMax * sizeof ( xmlCharEncodingAlias ) ) ;
if ( xmlCharEncodingAliases = = NULL )
return ( - 1 ) ;
} else if ( xmlCharEncodingAliasesNb > = xmlCharEncodingAliasesMax ) {
xmlCharEncodingAliasesMax * = 2 ;
xmlCharEncodingAliases = ( xmlCharEncodingAliasPtr )
xmlRealloc ( xmlCharEncodingAliases ,
xmlCharEncodingAliasesMax * sizeof ( xmlCharEncodingAlias ) ) ;
}
/*
* Walk down the list looking for a definition of the alias
*/
for ( i = 0 ; i < xmlCharEncodingAliasesNb ; i + + ) {
if ( ! strcmp ( xmlCharEncodingAliases [ i ] . alias , upper ) ) {
/*
* Replace the definition .
*/
xmlFree ( ( char * ) xmlCharEncodingAliases [ i ] . name ) ;
xmlCharEncodingAliases [ i ] . name = xmlMemStrdup ( name ) ;
return ( 0 ) ;
}
}
/*
* Add the definition
*/
xmlCharEncodingAliases [ xmlCharEncodingAliasesNb ] . name = xmlMemStrdup ( name ) ;
xmlCharEncodingAliases [ xmlCharEncodingAliasesNb ] . alias = xmlMemStrdup ( upper ) ;
xmlCharEncodingAliasesNb + + ;
return ( 0 ) ;
}
/**
 * xmlDelEncodingAlias:
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
 *
 * Unregisters an encoding alias @alias. Aliases are stored upper-cased,
 * so @alias is upper-cased (first 99 bytes only) before the comparison;
 * this makes removal case insensitive like registration and lookup.
 *
 * Returns 0 in case of success, -1 in case of error (NULL argument, no
 * alias registered, or alias not found)
 */
int
xmlDelEncodingAlias(const char *alias) {
    int i;
    char upper[100];

    if (alias == NULL)
        return(-1);

    if (xmlCharEncodingAliases == NULL)
        return(-1);

    /* Same normalisation as the one applied when the alias was stored. */
    for (i = 0; i < 99; i++) {
        upper[i] = (char) toupper((unsigned char) alias[i]);
        if (upper[i] == 0) break;
    }
    upper[i] = 0;

    /*
     * Walk down the list looking for a definition of the alias
     */
    for (i = 0; i < xmlCharEncodingAliasesNb; i++) {
        if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
            xmlFree((char *) xmlCharEncodingAliases[i].name);
            xmlFree((char *) xmlCharEncodingAliases[i].alias);
            xmlCharEncodingAliasesNb--;
            /* Close the gap by shifting the following entries down. */
            memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
                    sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
            return(0);
        }
    }
    return(-1);
}
/**
 * xmlParseCharEncoding:
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
 *
 * Compare the string to the encoding schemes already known. Note
 * that the comparison is case insensitive accordingly to the section
 * [XML] 4.3.3 Character Encoding in Entities.
 *
 * Registered aliases are resolved first; only the first 499 bytes of the
 * resulting name are considered.
 *
 * Returns one of the XML_CHAR_ENCODING_... values, XML_CHAR_ENCODING_NONE
 * if @name is NULL or empty, or XML_CHAR_ENCODING_ERROR if not recognized.
 */
xmlCharEncoding
xmlParseCharEncoding(const char *name)
{
    /* Upper-case spellings recognised directly, scanned in order. */
    static const struct {
        const char *name;
        xmlCharEncoding enc;
    } known[] = {
        { "UTF-8",           XML_CHAR_ENCODING_UTF8 },
        { "UTF8",            XML_CHAR_ENCODING_UTF8 },
        /*
         * NOTE: if we were able to parse this, the endianness of UTF16 is
         *       already found and in use
         */
        { "UTF-16",          XML_CHAR_ENCODING_UTF16LE },
        { "UTF16",           XML_CHAR_ENCODING_UTF16LE },
        { "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 },
        { "UCS-2",           XML_CHAR_ENCODING_UCS2 },
        { "UCS2",            XML_CHAR_ENCODING_UCS2 },
        /*
         * NOTE: if we were able to parse this, the endianness of UCS4 is
         *       already found and in use
         */
        { "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE },
        { "UCS-4",           XML_CHAR_ENCODING_UCS4LE },
        { "UCS4",            XML_CHAR_ENCODING_UCS4LE },
        { "ISO-8859-1",      XML_CHAR_ENCODING_8859_1 },
        { "ISO-LATIN-1",     XML_CHAR_ENCODING_8859_1 },
        { "ISO LATIN 1",     XML_CHAR_ENCODING_8859_1 },
        { "ISO-8859-2",      XML_CHAR_ENCODING_8859_2 },
        { "ISO-LATIN-2",     XML_CHAR_ENCODING_8859_2 },
        { "ISO LATIN 2",     XML_CHAR_ENCODING_8859_2 },
        { "ISO-8859-3",      XML_CHAR_ENCODING_8859_3 },
        { "ISO-8859-4",      XML_CHAR_ENCODING_8859_4 },
        { "ISO-8859-5",      XML_CHAR_ENCODING_8859_5 },
        { "ISO-8859-6",      XML_CHAR_ENCODING_8859_6 },
        { "ISO-8859-7",      XML_CHAR_ENCODING_8859_7 },
        { "ISO-8859-8",      XML_CHAR_ENCODING_8859_8 },
        { "ISO-8859-9",      XML_CHAR_ENCODING_8859_9 },
        { "ISO-2022-JP",     XML_CHAR_ENCODING_2022_JP },
        { "SHIFT_JIS",       XML_CHAR_ENCODING_SHIFT_JIS },
        { "EUC-JP",          XML_CHAR_ENCODING_EUC_JP }
    };
    const char *alias;
    char upper[500];
    int i;

    if (name == NULL)
        return(XML_CHAR_ENCODING_NONE);

    /*
     * Do the alias resolution
     */
    alias = xmlGetEncodingAlias(name);
    if (alias != NULL)
        name = alias;

    /* toupper() requires an unsigned char value; plain char may be negative. */
    for (i = 0; i < 499; i++) {
        upper[i] = (char) toupper((unsigned char) name[i]);
        if (upper[i] == 0) break;
    }
    upper[i] = 0;

    if (upper[0] == 0)
        return(XML_CHAR_ENCODING_NONE);

    for (i = 0; i < (int) (sizeof(known) / sizeof(known[0])); i++) {
        if (!strcmp(upper, known[i].name))
            return(known[i].enc);
    }

#ifdef DEBUG_ENCODING
    xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
#endif
    return(XML_CHAR_ENCODING_ERROR);
}
/**
 * xmlGetCharEncodingName:
 * @enc:  the encoding
 *
 * The "canonical" name for XML encoding.
 * C.f. http://www.w3.org/TR/REC-xml#charencoding
 * Section 4.3.3 Character Encoding in Entities
 *
 * The mapping is not one-to-one: both UTF-16 byte orders share one name,
 * and so do the four UCS-4 byte orders. The returned pointer refers to a
 * string literal and must not be freed or modified.
 *
 * Returns the canonical name for the given encoding, or NULL for
 * XML_CHAR_ENCODING_ERROR, XML_CHAR_ENCODING_NONE, XML_CHAR_ENCODING_ASCII
 * and any value not listed in the switch.
 */
const char *
xmlGetCharEncodingName ( xmlCharEncoding enc ) {
switch ( enc ) {
/* No name exists for the error / "no encoding" markers. */
case XML_CHAR_ENCODING_ERROR :
return ( NULL ) ;
case XML_CHAR_ENCODING_NONE :
return ( NULL ) ;
case XML_CHAR_ENCODING_UTF8 :
return ( " UTF-8 " ) ;
/* The byte order is not part of the name: LE and BE return the same string. */
case XML_CHAR_ENCODING_UTF16LE :
return ( " UTF-16 " ) ;
case XML_CHAR_ENCODING_UTF16BE :
return ( " UTF-16 " ) ;
case XML_CHAR_ENCODING_EBCDIC :
return ( " EBCDIC " ) ;
/* All four UCS-4 byte orders return the same string. */
case XML_CHAR_ENCODING_UCS4LE :
return ( " ISO-10646-UCS-4 " ) ;
case XML_CHAR_ENCODING_UCS4BE :
return ( " ISO-10646-UCS-4 " ) ;
case XML_CHAR_ENCODING_UCS4_2143 :
return ( " ISO-10646-UCS-4 " ) ;
case XML_CHAR_ENCODING_UCS4_3412 :
return ( " ISO-10646-UCS-4 " ) ;
case XML_CHAR_ENCODING_UCS2 :
return ( " ISO-10646-UCS-2 " ) ;
case XML_CHAR_ENCODING_8859_1 :
return ( " ISO-8859-1 " ) ;
case XML_CHAR_ENCODING_8859_2 :
return ( " ISO-8859-2 " ) ;
case XML_CHAR_ENCODING_8859_3 :
return ( " ISO-8859-3 " ) ;
case XML_CHAR_ENCODING_8859_4 :
return ( " ISO-8859-4 " ) ;
case XML_CHAR_ENCODING_8859_5 :
return ( " ISO-8859-5 " ) ;
case XML_CHAR_ENCODING_8859_6 :
return ( " ISO-8859-6 " ) ;
case XML_CHAR_ENCODING_8859_7 :
return ( " ISO-8859-7 " ) ;
case XML_CHAR_ENCODING_8859_8 :
return ( " ISO-8859-8 " ) ;
case XML_CHAR_ENCODING_8859_9 :
return ( " ISO-8859-9 " ) ;
case XML_CHAR_ENCODING_2022_JP :
return ( " ISO-2022-JP " ) ;
case XML_CHAR_ENCODING_SHIFT_JIS :
return ( " Shift-JIS " ) ;
case XML_CHAR_ENCODING_EUC_JP :
return ( " EUC-JP " ) ;
/* ASCII is a known enum value but has no name here. */
case XML_CHAR_ENCODING_ASCII :
return ( NULL ) ;
}
/* Reached only for values not covered by a case label above. */
return ( NULL ) ;
}
2001-05-30 23:14:17 +04:00
/************************************************************************
* *
* Char encoding handlers *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2001-02-23 20:55:21 +03:00
/*
 * Capacity of the handler table. The size should be growable, but it's
 * not a big deal: registration is refused with an error message once
 * this many handlers are registered.
 */
# define MAX_ENCODING_HANDLERS 50
/*
 * Table of registered handlers, allocated by xmlInitCharEncodingHandlers()
 * with MAX_ENCODING_HANDLERS slots; NULL means "not initialized yet".
 */
static xmlCharEncodingHandlerPtr * handlers = NULL ;
/* Number of slots of the table currently in use. */
static int nbCharEncodingHandler = 0 ;
/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 */
static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL ;
/**
* xmlNewCharEncodingHandler :
* @ name : the encoding name , in UTF - 8 format ( ASCII actually )
* @ input : the xmlCharEncodingInputFunc to read that encoding
* @ output : the xmlCharEncodingOutputFunc to write that encoding
*
* Create and registers an xmlCharEncodingHandler .
2002-08-01 16:22:24 +04:00
*
2001-02-23 20:55:21 +03:00
* Returns the xmlCharEncodingHandlerPtr created ( or NULL in case of error ) .
*/
2002-08-01 16:22:24 +04:00
xmlCharEncodingHandlerPtr
2001-02-23 20:55:21 +03:00
xmlNewCharEncodingHandler ( const char * name ,
xmlCharEncodingInputFunc input ,
xmlCharEncodingOutputFunc output ) {
xmlCharEncodingHandlerPtr handler ;
const char * alias ;
char upper [ 500 ] ;
int i ;
char * up = 0 ;
/*
* Do the alias resolution
*/
alias = xmlGetEncodingAlias ( name ) ;
if ( alias ! = NULL )
name = alias ;
/*
* Keep only the uppercase version of the encoding .
*/
if ( name = = NULL ) {
xmlGenericError ( xmlGenericErrorContext ,
" xmlNewCharEncodingHandler : no name ! \n " ) ;
return ( NULL ) ;
}
for ( i = 0 ; i < 499 ; i + + ) {
upper [ i ] = toupper ( name [ i ] ) ;
if ( upper [ i ] = = 0 ) break ;
}
upper [ i ] = 0 ;
up = xmlMemStrdup ( upper ) ;
if ( up = = NULL ) {
xmlGenericError ( xmlGenericErrorContext ,
" xmlNewCharEncodingHandler : out of memory ! \n " ) ;
return ( NULL ) ;
}
/*
* allocate and fill - up an handler block .
*/
handler = ( xmlCharEncodingHandlerPtr )
xmlMalloc ( sizeof ( xmlCharEncodingHandler ) ) ;
if ( handler = = NULL ) {
xmlGenericError ( xmlGenericErrorContext ,
" xmlNewCharEncodingHandler : out of memory ! \n " ) ;
return ( NULL ) ;
}
handler - > input = input ;
handler - > output = output ;
handler - > name = up ;
# ifdef LIBXML_ICONV_ENABLED
handler - > iconv_in = NULL ;
handler - > iconv_out = NULL ;
# endif /* LIBXML_ICONV_ENABLED */
/*
* registers and returns the handler .
*/
xmlRegisterCharEncodingHandler ( handler ) ;
# ifdef DEBUG_ENCODING
xmlGenericError ( xmlGenericErrorContext ,
" Registered encoding handler for %s \n " , name ) ;
# endif
return ( handler ) ;
}
/**
 * xmlInitCharEncodingHandlers:
 *
 * Initialize the char encoding support: allocates the handler table,
 * records the byte order of the machine in xmlLittleEndian and registers
 * the built-in handlers.
 * NOTE: while public, this function usually doesn't need to be called
 *       in normal processing.
 *
 * The call is a no-op when the table already exists. If the table cannot
 * be allocated an error is reported and nothing is registered.
 */
void
xmlInitCharEncodingHandlers ( void ) {
/* Probe value: the first byte in memory tells the byte order. */
unsigned short int tst = 0x1234 ;
unsigned char * ptr = ( unsigned char * ) & tst ;
/* Already initialized. */
if ( handlers ! = NULL ) return ;
/*
 * The table must be assigned before any handler is registered below:
 * xmlRegisterCharEncodingHandler() calls back into this function while
 * handlers is still NULL.
 */
handlers = ( xmlCharEncodingHandlerPtr * )
xmlMalloc ( MAX_ENCODING_HANDLERS * sizeof ( xmlCharEncodingHandlerPtr ) ) ;
/* Most significant byte first: big endian; least significant first: little endian. */
if ( * ptr = = 0x12 ) xmlLittleEndian = 0 ;
else if ( * ptr = = 0x34 ) xmlLittleEndian = 1 ;
else xmlGenericError ( xmlGenericErrorContext ,
" Odd problem at endianness detection \n " ) ;
if ( handlers = = NULL ) {
xmlGenericError ( xmlGenericErrorContext ,
" xmlInitCharEncodingHandlers : out of memory ! \n " ) ;
return ;
}
/* Registered with neither an input nor an output conversion function. */
xmlNewCharEncodingHandler ( " UTF-8 " , NULL , NULL ) ;
/* The two UTF-16 handlers are also kept in file-level variables. */
xmlUTF16LEHandler =
xmlNewCharEncodingHandler ( " UTF-16LE " , UTF16LEToUTF8 , UTF8ToUTF16LE ) ;
xmlUTF16BEHandler =
xmlNewCharEncodingHandler ( " UTF-16BE " , UTF16BEToUTF8 , UTF8ToUTF16BE ) ;
xmlNewCharEncodingHandler ( " ISO-8859-1 " , isolat1ToUTF8 , UTF8Toisolat1 ) ;
/* Both spellings of ASCII use the same pair of converters. */
xmlNewCharEncodingHandler ( " ASCII " , asciiToUTF8 , UTF8Toascii ) ;
2001-05-31 22:22:04 +04:00
xmlNewCharEncodingHandler ( " US-ASCII " , asciiToUTF8 , UTF8Toascii ) ;
2001-02-23 20:55:21 +03:00
# ifdef LIBXML_HTML_ENABLED
/* Output only: no input function is provided for this handler. */
xmlNewCharEncodingHandler ( " HTML " , NULL , UTF8ToHtml ) ;
# endif
}
/**
* xmlCleanupCharEncodingHandlers :
*
* Cleanup the memory allocated for the char encoding support , it
* unregisters all the encoding handlers and the aliases .
*/
void
xmlCleanupCharEncodingHandlers ( void ) {
xmlCleanupEncodingAliases ( ) ;
if ( handlers = = NULL ) return ;
for ( ; nbCharEncodingHandler > 0 ; ) {
nbCharEncodingHandler - - ;
if ( handlers [ nbCharEncodingHandler ] ! = NULL ) {
if ( handlers [ nbCharEncodingHandler ] - > name ! = NULL )
xmlFree ( handlers [ nbCharEncodingHandler ] - > name ) ;
xmlFree ( handlers [ nbCharEncodingHandler ] ) ;
}
}
xmlFree ( handlers ) ;
handlers = NULL ;
nbCharEncodingHandler = 0 ;
xmlDefaultCharEncodingHandler = NULL ;
}
/**
 * xmlRegisterCharEncodingHandler:
 * @handler:  the xmlCharEncodingHandlerPtr handler block
 *
 * Register the char encoding handler by appending it to the handler
 * table, initializing the table on first use. Nothing is registered
 * (and an error is reported) when @handler is NULL or the table already
 * holds MAX_ENCODING_HANDLERS entries. Nothing is registered either when
 * the table could not be allocated.
 */
void
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
    if (handlers == NULL) xmlInitCharEncodingHandlers();
    if (handler == NULL) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlRegisterCharEncodingHandler: NULL handler !\n");
        return;
    }

    /*
     * Initialization reports its own error when the table allocation
     * fails; in that case there is nowhere to store the handler.
     */
    if (handlers == NULL)
        return;

    if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlRegisterCharEncodingHandler: Too many handler registered\n");
        xmlGenericError(xmlGenericErrorContext,
                "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
        return;
    }
    handlers[nbCharEncodingHandler++] = handler;
}
/*
 * xmlFindHandlerAmong:
 * Try up to three spellings of an encoding name, in order, and return
 * the first handler found. Unused trailing names are passed as NULL.
 */
static xmlCharEncodingHandlerPtr
xmlFindHandlerAmong(const char *first, const char *second,
                    const char *third) {
    xmlCharEncodingHandlerPtr handler;

    handler = xmlFindCharEncodingHandler(first);
    if ((handler == NULL) && (second != NULL))
        handler = xmlFindCharEncodingHandler(second);
    if ((handler == NULL) && (third != NULL))
        handler = xmlFindCharEncodingHandler(third);
    return(handler);
}

/**
 * xmlGetCharEncodingHandler:
 * @enc:  an xmlCharEncoding value.
 *
 * Search in the registered set the handler able to read/write that encoding.
 * UTF-8 needs no conversion and therefore has no handler; the UTF-16
 * handlers are the ones recorded at initialization; every other encoding
 * is looked up by name, trying several spellings where more than one is
 * in use.
 *
 * Returns the handler or NULL if not found
 */
xmlCharEncodingHandlerPtr
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
    xmlCharEncodingHandlerPtr handler = NULL;

    if (handlers == NULL) xmlInitCharEncodingHandlers();
    switch (enc) {
        case XML_CHAR_ENCODING_ERROR:
        case XML_CHAR_ENCODING_NONE:
        case XML_CHAR_ENCODING_UTF8:
            return(NULL);
        case XML_CHAR_ENCODING_UTF16LE:
            return(xmlUTF16LEHandler);
        case XML_CHAR_ENCODING_UTF16BE:
            return(xmlUTF16BEHandler);
        case XML_CHAR_ENCODING_EBCDIC:
            handler = xmlFindHandlerAmong("EBCDIC", "ebcdic", NULL);
            break;
        /* Both byte orders are looked up under the same names. */
        case XML_CHAR_ENCODING_UCS4BE:
        case XML_CHAR_ENCODING_UCS4LE:
            handler = xmlFindHandlerAmong("ISO-10646-UCS-4", "UCS-4", "UCS4");
            break;
        /* No lookup is attempted for the unusual UCS-4 byte orders. */
        case XML_CHAR_ENCODING_UCS4_2143:
        case XML_CHAR_ENCODING_UCS4_3412:
            break;
        case XML_CHAR_ENCODING_UCS2:
            handler = xmlFindHandlerAmong("ISO-10646-UCS-2", "UCS-2", "UCS2");
            break;

            /*
             * We used to keep ISO Latin encodings native in the
             * generated data. This led to so many problems that
             * this has been removed. One can still change this
             * back by registering no-ops encoders for those
             */
        case XML_CHAR_ENCODING_8859_1:
            handler = xmlFindHandlerAmong("ISO-8859-1", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_2:
            handler = xmlFindHandlerAmong("ISO-8859-2", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_3:
            handler = xmlFindHandlerAmong("ISO-8859-3", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_4:
            handler = xmlFindHandlerAmong("ISO-8859-4", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_5:
            handler = xmlFindHandlerAmong("ISO-8859-5", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_6:
            handler = xmlFindHandlerAmong("ISO-8859-6", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_7:
            handler = xmlFindHandlerAmong("ISO-8859-7", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_8:
            handler = xmlFindHandlerAmong("ISO-8859-8", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_8859_9:
            handler = xmlFindHandlerAmong("ISO-8859-9", NULL, NULL);
            break;

        case XML_CHAR_ENCODING_2022_JP:
            handler = xmlFindHandlerAmong("ISO-2022-JP", NULL, NULL);
            break;
        case XML_CHAR_ENCODING_SHIFT_JIS:
            handler = xmlFindHandlerAmong("SHIFT-JIS", "SHIFT_JIS",
                                          "Shift_JIS");
            break;
        case XML_CHAR_ENCODING_EUC_JP:
            handler = xmlFindHandlerAmong("EUC-JP", NULL, NULL);
            break;
        default:
            break;
    }
    if (handler != NULL)
        return(handler);

#ifdef DEBUG_ENCODING
    xmlGenericError(xmlGenericErrorContext,
                    "No handler found for encoding %d\n", enc);
#endif
    return(NULL);
}
/**
2001-07-18 23:30:27 +04:00
* xmlFindCharEncodingHandler :
* @ name : a string describing the char encoding .
2001-02-23 20:55:21 +03:00
*
2001-12-31 19:16:02 +03:00
* Search in the registered set the handler able to read / write that encoding .
2001-02-23 20:55:21 +03:00
*
* Returns the handler or NULL if not found
*/
xmlCharEncodingHandlerPtr
xmlFindCharEncodingHandler ( const char * name ) {
const char * nalias ;
const char * norig ;
xmlCharEncoding alias ;
# ifdef LIBXML_ICONV_ENABLED
xmlCharEncodingHandlerPtr enc ;
iconv_t icv_in , icv_out ;
# endif /* LIBXML_ICONV_ENABLED */
char upper [ 100 ] ;
int i ;
if ( handlers = = NULL ) xmlInitCharEncodingHandlers ( ) ;
if ( name = = NULL ) return ( xmlDefaultCharEncodingHandler ) ;
if ( name [ 0 ] = = 0 ) return ( xmlDefaultCharEncodingHandler ) ;
/*
* Do the alias resolution
*/
norig = name ;
nalias = xmlGetEncodingAlias ( name ) ;
if ( nalias ! = NULL )
name = nalias ;
/*
* Check first for directly registered encoding names
*/
for ( i = 0 ; i < 99 ; i + + ) {
upper [ i ] = toupper ( name [ i ] ) ;
if ( upper [ i ] = = 0 ) break ;
}
upper [ i ] = 0 ;
for ( i = 0 ; i < nbCharEncodingHandler ; i + + )
if ( ! strcmp ( upper , handlers [ i ] - > name ) ) {
# ifdef DEBUG_ENCODING
xmlGenericError ( xmlGenericErrorContext ,
" Found registered handler for encoding %s \n " , name ) ;
# endif
return ( handlers [ i ] ) ;
}
# ifdef LIBXML_ICONV_ENABLED
/* check whether iconv can handle this */
icv_in = iconv_open ( " UTF-8 " , name ) ;
icv_out = iconv_open ( name , " UTF-8 " ) ;
if ( ( icv_in ! = ( iconv_t ) - 1 ) & & ( icv_out ! = ( iconv_t ) - 1 ) ) {
enc = ( xmlCharEncodingHandlerPtr )
xmlMalloc ( sizeof ( xmlCharEncodingHandler ) ) ;
if ( enc = = NULL ) {
iconv_close ( icv_in ) ;
iconv_close ( icv_out ) ;
return ( NULL ) ;
}
enc - > name = xmlMemStrdup ( name ) ;
enc - > input = NULL ;
enc - > output = NULL ;
enc - > iconv_in = icv_in ;
enc - > iconv_out = icv_out ;
# ifdef DEBUG_ENCODING
xmlGenericError ( xmlGenericErrorContext ,
" Found iconv handler for encoding %s \n " , name ) ;
# endif
return enc ;
} else if ( ( icv_in ! = ( iconv_t ) - 1 ) | | icv_out ! = ( iconv_t ) - 1 ) {
xmlGenericError ( xmlGenericErrorContext ,
" iconv : problems with filters for '%s' \n " , name ) ;
}
# endif /* LIBXML_ICONV_ENABLED */
# ifdef DEBUG_ENCODING
xmlGenericError ( xmlGenericErrorContext ,
" No handler found for encoding %s \n " , name ) ;
# endif
/*
* Fallback using the canonical names
*/
alias = xmlParseCharEncoding ( norig ) ;
if ( alias ! = XML_CHAR_ENCODING_ERROR ) {
const char * canon ;
canon = xmlGetCharEncodingName ( alias ) ;
if ( ( canon ! = NULL ) & & ( strcmp ( name , canon ) ) ) {
return ( xmlFindCharEncodingHandler ( canon ) ) ;
}
}
return ( NULL ) ;
}
2001-05-30 23:14:17 +04:00
/************************************************************************
* *
* ICONV based generic conversion functions *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2001-02-23 20:55:21 +03:00
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert, or NULL
 * @inlen:  the length of @in
 *
 * Runs a single iconv() call and translates its outcome into the status
 * codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG), or
 *     -2 if the transcoding fails because of an invalid sequence (EILSEQ), or
 *     -3 for any other failure, including an incomplete sequence at the
 *        end of the input (EINVAL).
 *
 * On return @inlen holds the number of input bytes consumed and @outlen
 * the number of output bytes produced; both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {
    size_t srcLeft = *inlen;
    size_t dstLeft = *outlen;
    const char *srcPos = (const char *) in;
    char *dstPos = (char *) out;
    int rc;

    rc = iconv(cd, (char **) &srcPos, &srcLeft, &dstPos, &dstLeft);

    /* iconv() leaves the remaining counts; turn them into amounts used. */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= srcLeft;
        *outlen -= dstLeft;
    }

    /* Everything consumed and no error reported. */
    if ((srcLeft == 0) && (rc != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL and every other errno value share the same status. */
    return -3;
}
#endif /* LIBXML_ICONV_ENABLED */
2001-05-30 23:14:17 +04:00
/************************************************************************
* *
* The real API used by libxml for on - the - fly conversion *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2001-02-23 20:55:21 +03:00
/**
 * xmlCharEncFirstLine:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
 * Front-end for the encoding handler input function, but handle only
 * the very first line, i.e. limit itself to 45 chars of output.
 * The converted bytes are appended to @out (which is kept zero
 * terminated) and the consumed bytes are removed from @in.
 *
 * Returns the status of the conversion routine, where "output full" (-1)
 * and "input not on a boundary" (-3) are reported as 0, or
 *     -1 general error (NULL argument, no room in @out)
 *     -2 if the transcoding fails (for *in is not valid utf8 string or
 *        the result of transformation can't fit into the encoding we want),
 *        or if @handler has no input conversion
 */
int
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                    xmlBufferPtr in) {
    int ret = -2;
    int written;
    int toconv;

    if (handler == NULL) return(-1);
    if (out == NULL) return(-1);
    if (in == NULL) return(-1);

    written = out->size - out->use;
    toconv = in->use;
    if (toconv * 2 >= written) {
        xmlBufferGrow(out, toconv);
    }

    /*
     * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
     * 45 chars should be sufficient to reach the end of the encoding
     * declaration without going too far inside the document content.
     *
     * The budget given to the converter must never exceed the room really
     * available, one byte being kept for the terminating 0 written below.
     */
    written = out->size - out->use - 1;
    if (written < 45) {
        xmlBufferGrow(out, 45 + 1);
        written = out->size - out->use - 1;
    }
    if (written <= 0)
        return(-1);
    if (written > 45)
        written = 45;

    if (handler->input != NULL) {
        ret = handler->input(&out->content[out->use], &written,
                             in->content, &toconv);
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
    }
#ifdef LIBXML_ICONV_ENABLED
    else if (handler->iconv_in != NULL) {
        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                              &written, in->content, &toconv);
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
        if (ret == -1) ret = -3;
    }
#endif /* LIBXML_ICONV_ENABLED */
#ifdef DEBUG_ENCODING
    switch (ret) {
        case 0:
            xmlGenericError(xmlGenericErrorContext,
                    "converted %d bytes to %d bytes of input\n",
                    toconv, written);
            break;
        case -1:
            xmlGenericError(xmlGenericErrorContext,
                    "converted %d bytes to %d bytes of input, %d left\n",
                    toconv, written, in->use);
            break;
        case -2:
            xmlGenericError(xmlGenericErrorContext,
                    "input conversion failed due to input error\n");
            break;
        case -3:
            xmlGenericError(xmlGenericErrorContext,
                    "converted %d bytes to %d bytes of input, %d left\n",
                    toconv, written, in->use);
            break;
        default:
            xmlGenericError(xmlGenericErrorContext,
                    "Unknown input conversion failed %d\n", ret);
    }
#endif /* DEBUG_ENCODING */
    /*
     * Ignore when input buffer is not on a boundary
     */
    if (ret == -3) ret = 0;
    if (ret == -1) ret = 0;
    return(ret);
}
/**
* xmlCharEncInFunc :
2001-12-31 19:16:02 +03:00
* @ handler : char encoding transformation data structure
2001-02-23 20:55:21 +03:00
* @ out : an xmlBuffer for the output .
* @ in : an xmlBuffer for the input
*
* Generic front - end for the encoding handler input function
*
* Returns the number of byte written if success , or
* - 1 general error
* - 2 if the transcoding fails ( for * in is not valid utf8 string or
* the result of transformation can ' t fit into the encoding we want ) , or
*/
int
2001-06-22 02:07:42 +04:00
xmlCharEncInFunc ( xmlCharEncodingHandler * handler , xmlBufferPtr out ,
xmlBufferPtr in )
{
2001-02-23 20:55:21 +03:00
int ret = - 2 ;
int written ;
int toconv ;
2001-06-22 02:07:42 +04:00
if ( handler = = NULL )
return ( - 1 ) ;
if ( out = = NULL )
return ( - 1 ) ;
if ( in = = NULL )
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
toconv = in - > use ;
if ( toconv = = 0 )
2001-06-22 02:07:42 +04:00
return ( 0 ) ;
2001-02-23 20:55:21 +03:00
written = out - > size - out - > use ;
if ( toconv * 2 > = written ) {
xmlBufferGrow ( out , out - > size + toconv * 2 ) ;
2001-06-22 02:07:42 +04:00
written = out - > size - out - > use - 1 ;
2001-02-23 20:55:21 +03:00
}
if ( handler - > input ! = NULL ) {
2001-06-22 02:07:42 +04:00
ret = handler - > input ( & out - > content [ out - > use ] , & written ,
in - > content , & toconv ) ;
xmlBufferShrink ( in , toconv ) ;
out - > use + = written ;
out - > content [ out - > use ] = 0 ;
2001-02-23 20:55:21 +03:00
}
# ifdef LIBXML_ICONV_ENABLED
else if ( handler - > iconv_in ! = NULL ) {
2001-06-22 02:07:42 +04:00
ret = xmlIconvWrapper ( handler - > iconv_in , & out - > content [ out - > use ] ,
& written , in - > content , & toconv ) ;
xmlBufferShrink ( in , toconv ) ;
out - > use + = written ;
out - > content [ out - > use ] = 0 ;
if ( ret = = - 1 )
ret = - 3 ;
2001-02-23 20:55:21 +03:00
}
# endif /* LIBXML_ICONV_ENABLED */
switch ( ret ) {
case 0 :
2001-06-22 02:07:42 +04:00
# ifdef DEBUG_ENCODING
xmlGenericError ( xmlGenericErrorContext ,
" converted %d bytes to %d bytes of input \n " ,
toconv , written ) ;
# endif
break ;
2001-02-23 20:55:21 +03:00
case - 1 :
2001-06-22 02:07:42 +04:00
# ifdef DEBUG_ENCODING
xmlGenericError ( xmlGenericErrorContext ,
" converted %d bytes to %d bytes of input, %d left \n " ,
toconv , written , in - > use ) ;
# endif
break ;
2001-02-23 20:55:21 +03:00
case - 3 :
2001-06-22 02:07:42 +04:00
# ifdef DEBUG_ENCODING
xmlGenericError ( xmlGenericErrorContext ,
" converted %d bytes to %d bytes of input, %d left \n " ,
toconv , written , in - > use ) ;
2001-02-23 20:55:21 +03:00
# endif
2001-06-22 02:07:42 +04:00
break ;
2001-02-23 20:55:21 +03:00
case - 2 :
2001-06-22 02:07:42 +04:00
xmlGenericError ( xmlGenericErrorContext ,
" input conversion failed due to input error \n " ) ;
xmlGenericError ( xmlGenericErrorContext ,
" Bytes: 0x%02X 0x%02X 0x%02X 0x%02X \n " ,
in - > content [ 0 ] , in - > content [ 1 ] ,
in - > content [ 2 ] , in - > content [ 3 ] ) ;
2001-02-23 20:55:21 +03:00
}
/*
* Ignore when input buffer is not on a boundary
*/
2001-06-22 02:07:42 +04:00
if ( ret = = - 3 )
ret = 0 ;
return ( ret ) ;
2001-02-23 20:55:21 +03:00
}
/**
 * xmlCharEncOutFunc:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input, or NULL for the initialization call
 *
 * Generic front-end for the encoding handler output function.
 * A first call with @in == NULL has to be made first to initiate the
 * output in case of non-stateless encodings needing to initiate their
 * state or the output (like the BOM in UTF16).
 * In case of UTF8 sequence conversion errors for the given encoder,
 * the offending sequence is removed from @in, replaced by a CharRef
 * and the conversion is attempted again.
 *
 * The bytes consumed by the converter are removed from @in and the
 * content of @out is kept 0 terminated.
 *
 * Returns 0 for the initialization call or when @in is empty, otherwise
 *     the value reported by the converter for the last conversion pass, or
 *     -1 general error (NULL @handler or @out, no output function
 *        available, or the converter reported a lack of space)
 *     -2 if the transcoding fails (for *in is not valid utf8 string or
 *        the result of transformation can't fit into the encoding we want)
 *     -3 when an iconv pass reports -1 without having written anything
 *        (a converter may presumably also return it directly)
 */
int
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                  xmlBufferPtr in) {
    int ret = -2;
    int written;
    int toconv;

    if (handler == NULL) return(-1);
    if (out == NULL) return(-1);

retry:

    /*
     * Room left in the output buffer. One byte is always held back for
     * the terminating 0 stored after each pass, otherwise a converter
     * filling the whole space would push that 0 past the end of the
     * buffer.
     */
    written = out->size - out->use;
    if (written > 0)
        written--;

    /*
     * First specific handling of in = NULL, i.e. the initialization call
     */
    if (in == NULL) {
        toconv = 0;
        if (handler->output != NULL) {
            ret = handler->output(&out->content[out->use], &written,
                                  NULL, &toconv);
            out->use += written;
            out->content[out->use] = 0;
        }
#ifdef LIBXML_ICONV_ENABLED
        else if (handler->iconv_out != NULL) {
            ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                                  &written, NULL, &toconv);
            out->use += written;
            out->content[out->use] = 0;
        }
#endif /* LIBXML_ICONV_ENABLED */
#ifdef DEBUG_ENCODING
        xmlGenericError(xmlGenericErrorContext,
                " initialized encoder \n ");
#endif
        return(0);
    }

    /*
     * Conversion itself.
     */
    toconv = in->use;
    if (toconv == 0)
        return(0);
    /* Make sure the output can hold twice the size of the input. */
    if (toconv * 2 >= written) {
        xmlBufferGrow(out, toconv * 2);
        written = out->size - out->use - 1;
    }
    if (handler->output != NULL) {
        ret = handler->output(&out->content[out->use], &written,
                              in->content, &toconv);
        /* toconv now holds what was consumed, written what was produced */
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
    }
#ifdef LIBXML_ICONV_ENABLED
    else if (handler->iconv_out != NULL) {
        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                              &written, in->content, &toconv);
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
        if (ret == -1) {
            if (written > 0) {
                /*
                 * Can be a limitation of iconv: some output was
                 * produced, so run another pass on what is left.
                 */
                goto retry;
            }
            ret = -3;
        }
    }
#endif /* LIBXML_ICONV_ENABLED */
    else {
        xmlGenericError(xmlGenericErrorContext,
                " xmlCharEncOutFunc: no output function ! \n ");
        return(-1);
    }

    /*
     * Attempt to handle error cases
     */
    switch (ret) {
        case 0:
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    " converted %d bytes to %d bytes of output \n ",
                    toconv, written);
#endif
            break;
        case -1:
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    " output conversion failed by lack of space \n ");
#endif
            break;
        case -3:
            /*
             * Pure progress trace: only emitted in debug builds, like
             * the other status traces of this function.
             */
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext, " converted %d bytes to %d bytes of output %d left \n ",
                    toconv, written, in->use);
#endif
            break;
        case -2: {
            /* len is updated by xmlGetUTF8Char and then used as the
             * length of the sequence to drop from the input */
            int len = in->use;
            const xmlChar *utf = (const xmlChar *) in->content;
            int cur;

            cur = xmlGetUTF8Char(utf, &len);
            if (cur > 0) {
                xmlChar charref[20];

#ifdef DEBUG_ENCODING
                xmlGenericError(xmlGenericErrorContext,
                        " handling output conversion error \n ");
                xmlGenericError(xmlGenericErrorContext,
                        " Bytes: 0x%02X 0x%02X 0x%02X 0x%02X \n ",
                        in->content[0], in->content[1],
                        in->content[2], in->content[3]);
#endif
                /*
                 * Removes the UTF8 sequence, and replace it by a charref
                 * and continue the transcoding phase, hoping the error
                 * did not mangle the encoder state.
                 */
                snprintf((char *) charref, sizeof(charref), " &#%d; ", cur);
                xmlBufferShrink(in, len);
                xmlBufferAddHead(in, charref, -1);

                goto retry;
            } else {
                xmlGenericError(xmlGenericErrorContext,
                        " output conversion failed due to conv error \n ");
                xmlGenericError(xmlGenericErrorContext,
                        " Bytes: 0x%02X 0x%02X 0x%02X 0x%02X \n ",
                        in->content[0], in->content[1],
                        in->content[2], in->content[3]);
                /* overwrite the first byte of the input that could not
                 * be decoded */
                in->content[0] = ' ';
            }
            break;
        }
    }
    return(ret);
}
/**
 * xmlCharEncCloseFunc:
 * @handler:  char encoding transformation data structure
 *
 * Generic front-end for the encoding handler close function.
 * When the handler carries iconv descriptors (iconv support compiled
 * in), the descriptors are closed and both the handler name and the
 * handler itself are freed, so @handler must not be used afterwards.
 * Handlers without iconv descriptors are left untouched.
 *
 * Returns 0 if success, or -1 in case of error (NULL @handler, handler
 *     without a name, or failure to close an iconv descriptor)
 */
int
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
    int status = 0;

    if ((handler == NULL) || (handler->name == NULL))
        return(-1);

#ifdef LIBXML_ICONV_ENABLED
    /*
     * Iconv handlers can be used only once, free the whole block
     * and the associated iconv resources.
     */
    if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
        /* the name is known to be non-NULL from the guard above */
        xmlFree(handler->name);
        handler->name = NULL;

        if (handler->iconv_out != NULL) {
            if (iconv_close(handler->iconv_out) != 0)
                status = -1;
            handler->iconv_out = NULL;
        }

        if (handler->iconv_in != NULL) {
            if (iconv_close(handler->iconv_in) != 0)
                status = -1;
            handler->iconv_in = NULL;
        }

        xmlFree(handler);
    }
#endif /* LIBXML_ICONV_ENABLED */

#ifdef DEBUG_ENCODING
    if (status == 0)
        xmlGenericError(xmlGenericErrorContext,
                " closed the encoding handler \n ");
    else
        xmlGenericError(xmlGenericErrorContext,
                " failed to close the encoding handler \n ");
#endif

    return(status);
}