2001-02-23 20:55:21 +03:00
/*
* encoding . c : implements the encoding conversion functions needed for XML
*
2010-11-04 19:42:42 +03:00
* Related specs :
2001-02-23 20:55:21 +03:00
* rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
* rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
* [ISO-10646]    UTF-8 and UTF-16 in Annexes
* [ISO-8859-1]   ISO Latin-1 characters codes.
* [UNICODE]      The Unicode Consortium, "The Unicode Standard --
*                Worldwide Character Encoding -- Version 1.0", Addison-
*                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
*                described in Unicode Technical Report #4.
* [US-ASCII]     Coded Character Set -- 7-bit American Standard Code for
*                Information Interchange, ANSI X3.4-1986.
*
* See Copyright for the status of this software.
*
2001-06-24 16:13:24 +04:00
* daniel @ veillard . com
2001-05-30 23:14:17 +04:00
*
* Original code for IsoLatin1 and UTF - 16 by " Martin J. Duerst " < duerst @ w3 . org >
2001-02-23 20:55:21 +03:00
*/
2002-03-18 22:37:11 +03:00
# define IN_LIBXML
2001-04-21 20:57:29 +04:00
# include "libxml.h"
2001-02-23 20:55:21 +03:00
# include <string.h>
2012-07-13 15:51:15 +04:00
# include <limits.h>
2001-02-23 20:55:21 +03:00
# include <ctype.h>
# include <stdlib.h>
2022-03-02 02:29:17 +03:00
2001-02-23 20:55:21 +03:00
# ifdef LIBXML_ICONV_ENABLED
2024-06-28 22:51:21 +03:00
# include <iconv.h>
2001-02-23 20:55:21 +03:00
# include <errno.h>
# endif
2022-03-02 02:29:17 +03:00
2001-02-23 20:55:21 +03:00
# include <libxml/encoding.h>
# include <libxml/xmlmemory.h>
2023-09-20 18:38:26 +03:00
# include <libxml/parser.h>
2001-02-23 20:55:21 +03:00
# ifdef LIBXML_HTML_ENABLED
# include <libxml/HTMLparser.h>
# endif
2001-11-04 23:19:12 +03:00
# include <libxml/xmlerror.h>
2001-02-23 20:55:21 +03:00
2022-08-26 02:22:33 +03:00
# include "private/buf.h"
# include "private/enc.h"
2024-07-12 04:07:57 +03:00
# include "private/entities.h"
2022-08-26 02:22:33 +03:00
# include "private/error.h"
2012-07-13 15:51:15 +04:00
2022-03-01 14:39:02 +03:00
# ifdef LIBXML_ICU_ENABLED
# include <unicode/ucnv.h>
# endif
2024-06-28 20:06:57 +03:00
# define XML_HANDLER_STATIC 1
2001-02-23 20:55:21 +03:00
/*
 * One entry of the user-defined alias table: @alias is the alternative
 * spelling and @name the encoding name returned when it is looked up.
 */
typedef struct _xmlCharEncodingAlias {
    const char *name;
    const char *alias;
} xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;

/* Heap-allocated alias table, with its used and allocated entry counts. */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;

/* Byte-order flag; starts at 1 and is reset by xmlInitEncodingInternal(). */
static int xmlLittleEndian = 1;
2024-06-27 21:39:52 +03:00
/* Associates one accepted encoding name with its xmlCharEncoding value. */
typedef struct {
    const char *name;
    xmlCharEncoding enc;
} xmlEncTableEntry;

/*
 * Names accepted by xmlParseCharEncodingInternal().
 *
 * The table is searched with bsearch() through xmlCompareEncTableEntries(),
 * which compares with xmlStrcasecmp(), so entries must stay sorted in that
 * order whenever a name is added.
 *
 * NOTE(review): the literals are reproduced exactly as they appear in this
 * copy of the file, including the blank just inside each quote -- confirm
 * against the upstream file.
 */
static const xmlEncTableEntry xmlEncTable[] = {
    { " ASCII ",           XML_CHAR_ENCODING_ASCII },
    { " EUC-JP ",          XML_CHAR_ENCODING_EUC_JP },
    { " HTML ",            XML_CHAR_ENCODING_HTML },
    { " ISO LATIN 1 ",     XML_CHAR_ENCODING_8859_1 },
    { " ISO LATIN 2 ",     XML_CHAR_ENCODING_8859_2 },
    { " ISO-10646-UCS-2 ", XML_CHAR_ENCODING_UCS2 },
    { " ISO-10646-UCS-4 ", XML_CHAR_ENCODING_UCS4LE },
    { " ISO-2022-JP ",     XML_CHAR_ENCODING_2022_JP },
    { " ISO-8859-1 ",      XML_CHAR_ENCODING_8859_1 },
    { " ISO-8859-10 ",     XML_CHAR_ENCODING_8859_10 },
    { " ISO-8859-11 ",     XML_CHAR_ENCODING_8859_11 },
    { " ISO-8859-13 ",     XML_CHAR_ENCODING_8859_13 },
    { " ISO-8859-14 ",     XML_CHAR_ENCODING_8859_14 },
    { " ISO-8859-15 ",     XML_CHAR_ENCODING_8859_15 },
    { " ISO-8859-16 ",     XML_CHAR_ENCODING_8859_16 },
    { " ISO-8859-2 ",      XML_CHAR_ENCODING_8859_2 },
    { " ISO-8859-3 ",      XML_CHAR_ENCODING_8859_3 },
    { " ISO-8859-4 ",      XML_CHAR_ENCODING_8859_4 },
    { " ISO-8859-5 ",      XML_CHAR_ENCODING_8859_5 },
    { " ISO-8859-6 ",      XML_CHAR_ENCODING_8859_6 },
    { " ISO-8859-7 ",      XML_CHAR_ENCODING_8859_7 },
    { " ISO-8859-8 ",      XML_CHAR_ENCODING_8859_8 },
    { " ISO-8859-9 ",      XML_CHAR_ENCODING_8859_9 },
    { " ISO-LATIN-1 ",     XML_CHAR_ENCODING_8859_1 },
    { " ISO-LATIN-2 ",     XML_CHAR_ENCODING_8859_2 },
    { " SHIFT_JIS ",       XML_CHAR_ENCODING_SHIFT_JIS },
    { " UCS-2 ",           XML_CHAR_ENCODING_UCS2 },
    { " UCS-4 ",           XML_CHAR_ENCODING_UCS4LE },
    { " UCS2 ",            XML_CHAR_ENCODING_UCS2 },
    { " UCS4 ",            XML_CHAR_ENCODING_UCS4LE },
    { " US-ASCII ",        XML_CHAR_ENCODING_ASCII },
    { " UTF-16 ",          XML_CHAR_ENCODING_UTF16 },
    { " UTF-16BE ",        XML_CHAR_ENCODING_UTF16BE },
    { " UTF-16LE ",        XML_CHAR_ENCODING_UTF16LE },
    { " UTF-8 ",           XML_CHAR_ENCODING_UTF8 },
    { " UTF16 ",           XML_CHAR_ENCODING_UTF16LE },
    { " UTF8 ",            XML_CHAR_ENCODING_UTF8 }
};
/*
 * Forward declarations of the built-in converters that defaultHandlers
 * refers to.  All of them share the same five-argument signature.
 */

/* Converters used in the input slot of the built-in handlers. */
static int
asciiToAscii(unsigned char *out, int *outlen,
             const unsigned char *in, int *inlen, void *vctxt);
static int
UTF8ToUTF8(unsigned char *out, int *outlen,
           const unsigned char *inb, int *inlenb, void *vctxt);
static int
latin1ToUTF8(unsigned char *out, int *outlen,
             const unsigned char *in, int *inlen, void *vctxt);
static int
UTF16LEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *inb, int *inlenb, void *vctxt);
static int
UTF16BEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *inb, int *inlenb, void *vctxt);

/*
 * Output-only converters.  Without LIBXML_OUTPUT_ENABLED the names expand
 * to NULL so the handler table simply gets empty output slots.
 */
#ifdef LIBXML_OUTPUT_ENABLED
static int
UTF8ToLatin1(unsigned char *outb, int *outlen,
             const unsigned char *in, int *inlen, void *vctxt);
static int
UTF8ToUTF16(unsigned char *outb, int *outlen,
            const unsigned char *in, int *inlen, void *vctxt);
static int
UTF8ToUTF16LE(unsigned char *outb, int *outlen,
              const unsigned char *in, int *inlen, void *vctxt);
static int
UTF8ToUTF16BE(unsigned char *outb, int *outlen,
              const unsigned char *in, int *inlen, void *vctxt);
#else /* LIBXML_OUTPUT_ENABLED */
#define UTF8ToLatin1 NULL
#define UTF8ToUTF16 NULL
#define UTF8ToUTF16LE NULL
#define UTF8ToUTF16BE NULL
#endif /* LIBXML_OUTPUT_ENABLED */

/* The HTML output wrapper additionally requires LIBXML_HTML_ENABLED. */
#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
static int
UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
                  const unsigned char *in, int *inlen, void *vctxt);
#else
#define UTF8ToHtmlWrapper NULL
#endif
2024-06-28 21:37:47 +03:00
/*
 * EMPTY_ICONV supplies the initializers for the two iconv descriptor
 * members (iconv_in / iconv_out) that a handler carries when iconv support
 * is compiled in; it expands to nothing otherwise.  Note the leading comma:
 * the macro is appended directly after the output-function initializer.
 */
#ifdef LIBXML_ICONV_ENABLED
#define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0
#else
#define EMPTY_ICONV
#endif

/*
 * MAKE_ISO_HANDLER(name, n) builds the static initializer for ISO-8859-n.
 *
 * The table-driven converters from iso8859x.inc are only used when neither
 * iconv nor ICU is available and LIBXML_ISO8859X_ENABLED is set; in every
 * other configuration the entry has no built-in converters.
 */
#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
    defined(LIBXML_ISO8859X_ENABLED)

#include "iso8859x.inc"

static int
ISO8859xToUTF8(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen, void *vctxt);
static int
UTF8ToISO8859x(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen, void *vctxt);

/*
 * The output slot is cast with xmlCharEncodingOutputFunc, matching
 * MAKE_HANDLER below (it used to go through the input typedef).
 * The two table pointers are passed as the initializers following the
 * iconv members -- presumably the input and output contexts; confirm
 * against the xmlCharEncodingHandler definition.
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) UTF8ToISO8859x \
      EMPTY_ICONV, \
      (void *) xmlunicodetable_ISO8859_##n, \
      (void *) xmltranscodetable_ISO8859_##n, \
      NULL, XML_HANDLER_STATIC }

#else /* LIBXML_ISO8859X_ENABLED */

#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \
      XML_HANDLER_STATIC }

#endif /* LIBXML_ISO8859X_ENABLED */

/*
 * MAKE_HANDLER(name, in, out) builds the static initializer for a handler
 * with the given input/output converters (either may be NULL).  The
 * intermediate cast through void (*)(void) converts between the
 * five-argument converter signature and the public function typedefs.
 */
#define MAKE_HANDLER(name, in, out) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) in, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) out \
      EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC }
2024-06-27 21:39:52 +03:00
/*
* The layout must match enum xmlCharEncoding .
*
* Names should match the IANA registry if possible :
* https : //www.iana.org/assignments/character-sets/character-sets.xhtml
*/
static const xmlCharEncodingHandler defaultHandlers [ 31 ] = {
MAKE_HANDLER ( NULL , NULL , NULL ) , /* NONE */
MAKE_HANDLER ( " UTF-8 " , UTF8ToUTF8 , UTF8ToUTF8 ) ,
MAKE_HANDLER ( " UTF-16LE " , UTF16LEToUTF8 , UTF8ToUTF16LE ) ,
MAKE_HANDLER ( " UTF-16BE " , UTF16BEToUTF8 , UTF8ToUTF16BE ) ,
MAKE_HANDLER ( " UCS-4LE " , NULL , NULL ) ,
MAKE_HANDLER ( " UCS-4BE " , NULL , NULL ) ,
MAKE_HANDLER ( " IBM037 " , NULL , NULL ) ,
MAKE_HANDLER ( " ISO-10646-UCS-4 " , NULL , NULL ) , /* UCS4_2143 */
MAKE_HANDLER ( " ISO-10646-UCS-4 " , NULL , NULL ) , /* UCS4_2143 */
MAKE_HANDLER ( " ISO-10646-UCS-2 " , NULL , NULL ) ,
2024-06-28 20:06:57 +03:00
MAKE_HANDLER ( " ISO-8859-1 " , latin1ToUTF8 , UTF8ToLatin1 ) ,
2024-06-28 21:37:47 +03:00
MAKE_ISO_HANDLER ( " ISO-8859-2 " , 2 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-3 " , 3 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-4 " , 4 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-5 " , 5 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-6 " , 6 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-7 " , 7 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-8 " , 8 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-9 " , 9 ) ,
2024-06-27 21:39:52 +03:00
MAKE_HANDLER ( " ISO-2022-JP " , NULL , NULL ) ,
MAKE_HANDLER ( " Shift_JIS " , NULL , NULL ) ,
MAKE_HANDLER ( " EUC-JP " , NULL , NULL ) ,
2024-06-28 05:36:14 +03:00
MAKE_HANDLER ( " US-ASCII " , asciiToAscii , asciiToAscii ) ,
2024-06-27 21:39:52 +03:00
MAKE_HANDLER ( " UTF-16 " , UTF16LEToUTF8 , UTF8ToUTF16 ) ,
2024-06-28 20:06:57 +03:00
MAKE_HANDLER ( " HTML " , NULL , UTF8ToHtmlWrapper ) ,
2024-06-28 21:37:47 +03:00
MAKE_ISO_HANDLER ( " ISO-8859-10 " , 10 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-11 " , 11 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-13 " , 13 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-14 " , 14 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-15 " , 15 ) ,
MAKE_ISO_HANDLER ( " ISO-8859-16 " , 16 ) ,
2024-06-27 21:39:52 +03:00
} ;
# define NUM_DEFAULT_HANDLERS \
( sizeof ( defaultHandlers ) / sizeof ( defaultHandlers [ 0 ] ) )
/* the size should be growable, but it's not a big deal ... */
# define MAX_ENCODING_HANDLERS 50
2024-06-28 00:32:58 +03:00
static xmlCharEncodingHandlerPtr * globalHandlers = NULL ;
2024-06-27 21:39:52 +03:00
static int nbCharEncodingHandler = 0 ;
2024-06-27 22:37:18 +03:00
# ifdef LIBXML_ICONV_ENABLED
static int
2024-06-28 00:32:58 +03:00
xmlCharEncIconv ( void * vctxt , const char * name , xmlCharEncConverter * conv ) ;
2024-06-27 22:37:18 +03:00
# endif
# ifdef LIBXML_ICU_ENABLED
static int
2024-06-28 00:32:58 +03:00
xmlCharEncUconv ( void * vctxt , const char * name , xmlCharEncConverter * conv ) ;
2024-06-27 22:37:18 +03:00
# endif
2001-05-30 23:14:17 +04:00
/************************************************************************
 *                                                                      *
 *              Generic encoding handling routines                      *
 *                                                                      *
 ************************************************************************/
2001-02-23 20:55:21 +03:00
/**
* xmlDetectCharEncoding :
* @ in : a pointer to the first bytes of the XML entity , must be at least
2003-11-28 12:39:10 +03:00
* 2 bytes long ( at least 4 if encoding is UTF4 variant ) .
2001-02-23 20:55:21 +03:00
* @ len : pointer to the length of the buffer
*
* Guess the encoding of the entity using the first bytes of the entity content
2003-11-28 12:39:10 +03:00
* according to the non - normative appendix F of the XML - 1.0 recommendation .
2010-11-04 19:42:42 +03:00
*
2001-02-23 20:55:21 +03:00
* Returns one of the XML_CHAR_ENCODING_ . . . values .
*/
xmlCharEncoding
xmlDetectCharEncoding ( const unsigned char * in , int len )
{
2010-11-04 19:42:42 +03:00
if ( in = = NULL )
2004-11-05 20:22:25 +03:00
return ( XML_CHAR_ENCODING_NONE ) ;
2001-02-23 20:55:21 +03:00
if ( len > = 4 ) {
if ( ( in [ 0 ] = = 0x00 ) & & ( in [ 1 ] = = 0x00 ) & &
( in [ 2 ] = = 0x00 ) & & ( in [ 3 ] = = 0x3C ) )
return ( XML_CHAR_ENCODING_UCS4BE ) ;
if ( ( in [ 0 ] = = 0x3C ) & & ( in [ 1 ] = = 0x00 ) & &
( in [ 2 ] = = 0x00 ) & & ( in [ 3 ] = = 0x00 ) )
return ( XML_CHAR_ENCODING_UCS4LE ) ;
if ( ( in [ 0 ] = = 0x00 ) & & ( in [ 1 ] = = 0x00 ) & &
( in [ 2 ] = = 0x3C ) & & ( in [ 3 ] = = 0x00 ) )
return ( XML_CHAR_ENCODING_UCS4_2143 ) ;
if ( ( in [ 0 ] = = 0x00 ) & & ( in [ 1 ] = = 0x3C ) & &
( in [ 2 ] = = 0x00 ) & & ( in [ 3 ] = = 0x00 ) )
return ( XML_CHAR_ENCODING_UCS4_3412 ) ;
if ( ( in [ 0 ] = = 0x4C ) & & ( in [ 1 ] = = 0x6F ) & &
( in [ 2 ] = = 0xA7 ) & & ( in [ 3 ] = = 0x94 ) )
return ( XML_CHAR_ENCODING_EBCDIC ) ;
if ( ( in [ 0 ] = = 0x3C ) & & ( in [ 1 ] = = 0x3F ) & &
( in [ 2 ] = = 0x78 ) & & ( in [ 3 ] = = 0x6D ) )
return ( XML_CHAR_ENCODING_UTF8 ) ;
2003-11-28 12:39:10 +03:00
/*
* Although not part of the recommendation , we also
* attempt an " auto-recognition " of UTF - 16L E and
* UTF - 16 BE encodings .
*/
if ( ( in [ 0 ] = = 0x3C ) & & ( in [ 1 ] = = 0x00 ) & &
( in [ 2 ] = = 0x3F ) & & ( in [ 3 ] = = 0x00 ) )
return ( XML_CHAR_ENCODING_UTF16LE ) ;
if ( ( in [ 0 ] = = 0x00 ) & & ( in [ 1 ] = = 0x3C ) & &
( in [ 2 ] = = 0x00 ) & & ( in [ 3 ] = = 0x3F ) )
return ( XML_CHAR_ENCODING_UTF16BE ) ;
2001-02-23 20:55:21 +03:00
}
2001-06-20 21:41:10 +04:00
if ( len > = 3 ) {
/*
* Errata on XML - 1.0 June 20 2001
* We now allow an UTF8 encoded BOM
*/
if ( ( in [ 0 ] = = 0xEF ) & & ( in [ 1 ] = = 0xBB ) & &
( in [ 2 ] = = 0xBF ) )
return ( XML_CHAR_ENCODING_UTF8 ) ;
}
2003-11-28 12:39:10 +03:00
/* For UTF-16 we can recognize by the BOM */
2001-02-23 20:55:21 +03:00
if ( len > = 2 ) {
if ( ( in [ 0 ] = = 0xFE ) & & ( in [ 1 ] = = 0xFF ) )
return ( XML_CHAR_ENCODING_UTF16BE ) ;
if ( ( in [ 0 ] = = 0xFF ) & & ( in [ 1 ] = = 0xFE ) )
return ( XML_CHAR_ENCODING_UTF16LE ) ;
}
return ( XML_CHAR_ENCODING_NONE ) ;
}
/**
* xmlCleanupEncodingAliases :
*
2024-07-02 03:18:03 +03:00
* DEPRECATED : This function modifies global state and is not
* thread - safe .
*
2001-02-23 20:55:21 +03:00
* Unregisters all aliases
*/
void
xmlCleanupEncodingAliases ( void ) {
int i ;
if ( xmlCharEncodingAliases = = NULL )
return ;
for ( i = 0 ; i < xmlCharEncodingAliasesNb ; i + + ) {
if ( xmlCharEncodingAliases [ i ] . name ! = NULL )
xmlFree ( ( char * ) xmlCharEncodingAliases [ i ] . name ) ;
if ( xmlCharEncodingAliases [ i ] . alias ! = NULL )
xmlFree ( ( char * ) xmlCharEncodingAliases [ i ] . alias ) ;
}
xmlCharEncodingAliasesNb = 0 ;
xmlCharEncodingAliasesMax = 0 ;
xmlFree ( xmlCharEncodingAliases ) ;
2002-01-08 16:15:33 +03:00
xmlCharEncodingAliases = NULL ;
2001-02-23 20:55:21 +03:00
}
/**
 * xmlGetEncodingAlias:
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
 *
 * DEPRECATED: This function is not thread-safe.
 *
 * Lookup an encoding name for the given alias.  The alias is upper-cased
 * (and cut to at most 99 characters) before being compared with the
 * registered aliases.
 *
 * Returns NULL if not found, otherwise the original name
 */
const char *
xmlGetEncodingAlias(const char *alias) {
    char upper[100];
    int pos;
    int idx;

    if ((alias == NULL) || (xmlCharEncodingAliases == NULL))
        return(NULL);

    /* Build the upper-case key; the loop leaves pos at the terminator. */
    for (pos = 0; pos < 99; pos++) {
        upper[pos] = (char) toupper((unsigned char) alias[pos]);
        if (upper[pos] == 0)
            break;
    }
    upper[pos] = 0;

    /*
     * Walk down the list looking for a definition of the alias
     */
    for (idx = 0; idx < xmlCharEncodingAliasesNb; idx++) {
        if (strcmp(xmlCharEncodingAliases[idx].alias, upper) == 0)
            return(xmlCharEncodingAliases[idx].name);
    }

    return(NULL);
}
/**
* xmlAddEncodingAlias :
* @ name : the encoding name as parsed , in UTF - 8 format ( ASCII actually )
* @ alias : the alias name as parsed , in UTF - 8 format ( ASCII actually )
*
2024-07-02 03:18:03 +03:00
* DEPRECATED : This function modifies global state and is not
* thread - safe .
*
2003-11-28 12:39:10 +03:00
* Registers an alias @ alias for an encoding named @ name . Existing alias
2001-02-23 20:55:21 +03:00
* will be overwritten .
2010-11-04 19:42:42 +03:00
*
2001-02-23 20:55:21 +03:00
* Returns 0 in case of success , - 1 in case of error
*/
int
xmlAddEncodingAlias ( const char * name , const char * alias ) {
int i ;
char upper [ 100 ] ;
2023-06-06 15:25:30 +03:00
char * nameCopy , * aliasCopy ;
2001-02-23 20:55:21 +03:00
if ( ( name = = NULL ) | | ( alias = = NULL ) )
return ( - 1 ) ;
for ( i = 0 ; i < 99 ; i + + ) {
2023-02-16 13:50:52 +03:00
upper [ i ] = ( char ) toupper ( ( unsigned char ) alias [ i ] ) ;
2001-02-23 20:55:21 +03:00
if ( upper [ i ] = = 0 ) break ;
}
upper [ i ] = 0 ;
2023-06-06 15:25:30 +03:00
if ( xmlCharEncodingAliasesNb > = xmlCharEncodingAliasesMax ) {
xmlCharEncodingAliasPtr tmp ;
size_t newSize = xmlCharEncodingAliasesMax ?
xmlCharEncodingAliasesMax * 2 :
20 ;
tmp = ( xmlCharEncodingAliasPtr )
xmlRealloc ( xmlCharEncodingAliases ,
newSize * sizeof ( xmlCharEncodingAlias ) ) ;
if ( tmp = = NULL )
return ( - 1 ) ;
xmlCharEncodingAliases = tmp ;
xmlCharEncodingAliasesMax = newSize ;
2001-02-23 20:55:21 +03:00
}
2023-06-06 15:25:30 +03:00
2001-02-23 20:55:21 +03:00
/*
* Walk down the list looking for a definition of the alias
*/
for ( i = 0 ; i < xmlCharEncodingAliasesNb ; i + + ) {
if ( ! strcmp ( xmlCharEncodingAliases [ i ] . alias , upper ) ) {
/*
* Replace the definition .
*/
2023-06-06 15:25:30 +03:00
nameCopy = xmlMemStrdup ( name ) ;
if ( nameCopy = = NULL )
return ( - 1 ) ;
2001-02-23 20:55:21 +03:00
xmlFree ( ( char * ) xmlCharEncodingAliases [ i ] . name ) ;
2023-06-06 15:25:30 +03:00
xmlCharEncodingAliases [ i ] . name = nameCopy ;
2001-02-23 20:55:21 +03:00
return ( 0 ) ;
}
}
/*
* Add the definition
*/
2023-06-06 15:25:30 +03:00
nameCopy = xmlMemStrdup ( name ) ;
if ( nameCopy = = NULL )
return ( - 1 ) ;
aliasCopy = xmlMemStrdup ( upper ) ;
if ( aliasCopy = = NULL ) {
xmlFree ( nameCopy ) ;
return ( - 1 ) ;
}
xmlCharEncodingAliases [ xmlCharEncodingAliasesNb ] . name = nameCopy ;
xmlCharEncodingAliases [ xmlCharEncodingAliasesNb ] . alias = aliasCopy ;
2001-02-23 20:55:21 +03:00
xmlCharEncodingAliasesNb + + ;
return ( 0 ) ;
}
/**
 * xmlDelEncodingAlias:
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
 *
 * DEPRECATED: This function modifies global state and is not
 * thread-safe.
 *
 * Unregisters an encoding alias @alias.  The alias is normalized the same
 * way as in xmlAddEncodingAlias() and xmlGetEncodingAlias() (upper-cased,
 * at most 99 characters), so an alias can be removed using the spelling
 * it was registered with.
 *
 * Returns 0 in case of success, -1 in case of error
 */
int
xmlDelEncodingAlias(const char *alias) {
    int i;
    char upper[100];

    if (alias == NULL)
        return(-1);

    if (xmlCharEncodingAliases == NULL)
        return(-1);

    /* Aliases are stored upper-cased, so compare against that form. */
    for (i = 0; i < 99; i++) {
        upper[i] = (char) toupper((unsigned char) alias[i]);
        if (upper[i] == 0) break;
    }
    upper[i] = 0;

    /*
     * Walk down the list looking for a definition of the alias
     */
    for (i = 0; i < xmlCharEncodingAliasesNb; i++) {
        if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
            xmlFree((char *) xmlCharEncodingAliases[i].name);
            xmlFree((char *) xmlCharEncodingAliases[i].alias);
            xmlCharEncodingAliasesNb--;
            /* Close the gap by shifting the remaining entries down. */
            memmove(&xmlCharEncodingAliases[i],
                    &xmlCharEncodingAliases[i + 1],
                    sizeof(xmlCharEncodingAlias) *
                    (xmlCharEncodingAliasesNb - i));
            return(0);
        }
    }
    return(-1);
}
2024-06-27 21:39:52 +03:00
static int
xmlCompareEncTableEntries ( const void * vkey , const void * ventry ) {
const char * key = vkey ;
const xmlEncTableEntry * entry = ventry ;
return ( xmlStrcasecmp ( BAD_CAST key , BAD_CAST entry - > name ) ) ;
}
2024-07-04 16:14:54 +03:00
static xmlCharEncoding
xmlParseCharEncodingInternal ( const char * name )
{
const xmlEncTableEntry * entry ;
if ( name = = NULL )
return ( XML_CHAR_ENCODING_NONE ) ;
entry = bsearch ( name , xmlEncTable ,
sizeof ( xmlEncTable ) / sizeof ( xmlEncTable [ 0 ] ) ,
sizeof ( xmlEncTable [ 0 ] ) , xmlCompareEncTableEntries ) ;
if ( entry ! = NULL )
return ( entry - > enc ) ;
return ( XML_CHAR_ENCODING_ERROR ) ;
}
2001-02-23 20:55:21 +03:00
/**
* xmlParseCharEncoding :
* @ name : the encoding name as parsed , in UTF - 8 format ( ASCII actually )
*
2003-11-28 12:39:10 +03:00
* Compare the string to the encoding schemes already known . Note
2001-02-23 20:55:21 +03:00
* that the comparison is case insensitive accordingly to the section
* [ XML ] 4.3 .3 Character Encoding in Entities .
2010-11-04 19:42:42 +03:00
*
2001-02-23 20:55:21 +03:00
* Returns one of the XML_CHAR_ENCODING_ . . . values or XML_CHAR_ENCODING_NONE
* if not recognized .
*/
xmlCharEncoding
2024-06-27 21:39:52 +03:00
xmlParseCharEncoding ( const char * name )
2001-02-23 20:55:21 +03:00
{
2024-07-04 16:14:54 +03:00
xmlCharEncoding enc = xmlParseCharEncodingInternal ( name ) ;
2001-02-23 20:55:21 +03:00
2024-07-04 16:14:54 +03:00
/* Backward compatibility */
if ( enc = = XML_CHAR_ENCODING_UTF16 )
enc = XML_CHAR_ENCODING_UTF16LE ;
2001-02-23 20:55:21 +03:00
2024-07-04 16:14:54 +03:00
return ( enc ) ;
2001-02-23 20:55:21 +03:00
}
/**
 * xmlGetCharEncodingName:
 * @enc:  the encoding
 *
 * The "canonical" name for XML encoding.
 * C.f. http://www.w3.org/TR/REC-xml#charencoding
 * Section 4.3.3  Character Encoding in Entities
 *
 * Both UTF-16 byte orders and both UCS-4 byte orders are reported under
 * one common name; every other value takes its name from defaultHandlers.
 *
 * Returns the canonical name for the given encoding, or NULL if @enc is
 * not positive or lies beyond the built-in handler table.
 */
const char *
xmlGetCharEncodingName(xmlCharEncoding enc) {
    if ((enc == XML_CHAR_ENCODING_UTF16LE) ||
        (enc == XML_CHAR_ENCODING_UTF16BE))
        return(" UTF-16 ");

    if ((enc == XML_CHAR_ENCODING_UCS4LE) ||
        (enc == XML_CHAR_ENCODING_UCS4BE))
        return(" ISO-10646-UCS-4 ");

    if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS))
        return(defaultHandlers[enc].name);

    return(NULL);
}
2001-05-30 23:14:17 +04:00
/************************************************************************
 *                                                                      *
 *              Char encoding handlers                                  *
 *                                                                      *
 ************************************************************************/
2001-02-23 20:55:21 +03:00
/**
* xmlNewCharEncodingHandler :
* @ name : the encoding name , in UTF - 8 format ( ASCII actually )
* @ input : the xmlCharEncodingInputFunc to read that encoding
* @ output : the xmlCharEncodingOutputFunc to write that encoding
*
2024-07-02 03:18:03 +03:00
* DEPRECATED : This function modifies global state and is not
* thread - safe .
*
2001-02-23 20:55:21 +03:00
* Create and registers an xmlCharEncodingHandler .
2002-08-01 16:22:24 +04:00
*
2001-02-23 20:55:21 +03:00
* Returns the xmlCharEncodingHandlerPtr created ( or NULL in case of error ) .
*/
2002-08-01 16:22:24 +04:00
xmlCharEncodingHandlerPtr
2010-11-04 19:42:42 +03:00
xmlNewCharEncodingHandler ( const char * name ,
2001-02-23 20:55:21 +03:00
xmlCharEncodingInputFunc input ,
xmlCharEncodingOutputFunc output ) {
xmlCharEncodingHandlerPtr handler ;
const char * alias ;
char upper [ 500 ] ;
int i ;
2005-07-29 03:49:35 +04:00
char * up = NULL ;
2001-02-23 20:55:21 +03:00
/*
* Do the alias resolution
*/
alias = xmlGetEncodingAlias ( name ) ;
if ( alias ! = NULL )
name = alias ;
/*
* Keep only the uppercase version of the encoding .
*/
2023-04-30 19:46:05 +03:00
if ( name = = NULL )
2001-02-23 20:55:21 +03:00
return ( NULL ) ;
for ( i = 0 ; i < 499 ; i + + ) {
2023-02-16 13:50:52 +03:00
upper [ i ] = ( char ) toupper ( ( unsigned char ) name [ i ] ) ;
2001-02-23 20:55:21 +03:00
if ( upper [ i ] = = 0 ) break ;
}
upper [ i ] = 0 ;
up = xmlMemStrdup ( upper ) ;
2023-04-30 19:46:05 +03:00
if ( up = = NULL )
2001-02-23 20:55:21 +03:00
return ( NULL ) ;
/*
* allocate and fill - up an handler block .
*/
handler = ( xmlCharEncodingHandlerPtr )
xmlMalloc ( sizeof ( xmlCharEncodingHandler ) ) ;
if ( handler = = NULL ) {
2004-07-31 20:24:01 +04:00
xmlFree ( up ) ;
2001-02-23 20:55:21 +03:00
return ( NULL ) ;
}
2010-11-04 19:42:42 +03:00
memset ( handler , 0 , sizeof ( xmlCharEncodingHandler ) ) ;
2001-02-23 20:55:21 +03:00
handler - > input = input ;
handler - > output = output ;
handler - > name = up ;
2024-06-28 20:06:57 +03:00
handler - > flags = XML_HANDLER_STATIC ;
2001-02-23 20:55:21 +03:00
# ifdef LIBXML_ICONV_ENABLED
handler - > iconv_in = NULL ;
handler - > iconv_out = NULL ;
2010-11-04 19:42:42 +03:00
# endif
2001-02-23 20:55:21 +03:00
/*
* registers and returns the handler .
*/
xmlRegisterCharEncodingHandler ( handler ) ;
return ( handler ) ;
}
/**
 * xmlInitCharEncodingHandlers:
 *
 * DEPRECATED: Alias for xmlInitParser.
 *
 * Kept as a public entry point; it only forwards to xmlInitParser().
 */
void
xmlInitCharEncodingHandlers(void)
{
    xmlInitParser();
}
/**
* xmlInitEncodingInternal :
*
* Initialize the char encoding support .
*/
void
xmlInitEncodingInternal ( void ) {
2001-02-23 20:55:21 +03:00
unsigned short int tst = 0x1234 ;
2010-11-04 19:42:42 +03:00
unsigned char * ptr = ( unsigned char * ) & tst ;
2001-02-23 20:55:21 +03:00
if ( * ptr = = 0x12 ) xmlLittleEndian = 0 ;
2023-04-30 19:46:05 +03:00
else xmlLittleEndian = 1 ;
2001-02-23 20:55:21 +03:00
}
/**
* xmlCleanupCharEncodingHandlers :
*
2022-03-06 15:55:48 +03:00
* DEPRECATED : This function will be made private . Call xmlCleanupParser
* to free global state but see the warnings there . xmlCleanupParser
* should be only called once at program exit . In most cases , you don ' t
* have call cleanup functions at all .
*
2001-02-23 20:55:21 +03:00
* Cleanup the memory allocated for the char encoding support , it
* unregisters all the encoding handlers and the aliases .
*/
void
xmlCleanupCharEncodingHandlers ( void ) {
xmlCleanupEncodingAliases ( ) ;
2024-06-28 00:32:58 +03:00
if ( globalHandlers = = NULL ) return ;
2001-02-23 20:55:21 +03:00
for ( ; nbCharEncodingHandler > 0 ; ) {
2024-06-28 00:32:58 +03:00
xmlCharEncodingHandler * handler ;
2001-02-23 20:55:21 +03:00
nbCharEncodingHandler - - ;
2024-06-28 00:32:58 +03:00
handler = globalHandlers [ nbCharEncodingHandler ] ;
if ( handler ! = NULL ) {
if ( handler - > name ! = NULL )
xmlFree ( handler - > name ) ;
xmlFree ( handler ) ;
2001-02-23 20:55:21 +03:00
}
}
2024-06-28 00:32:58 +03:00
xmlFree ( globalHandlers ) ;
globalHandlers = NULL ;
2001-02-23 20:55:21 +03:00
nbCharEncodingHandler = 0 ;
}
/**
 * xmlRegisterCharEncodingHandler:
 * @handler:  the xmlCharEncodingHandlerPtr handler block
 *
 * DEPRECATED: This function modifies global state and is not
 * thread-safe.
 *
 * Register the char encoding handler.  The global table is allocated on
 * first use with MAX_ENCODING_HANDLERS slots.  If the table cannot be
 * allocated or is already full, @handler and its name are freed; the
 * caller gets no indication of that failure.
 */
void
xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
    if (handler == NULL)
        return;

    if (globalHandlers == NULL)
        globalHandlers = xmlMalloc(
                MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0]));

    if ((globalHandlers != NULL) &&
        (nbCharEncodingHandler < MAX_ENCODING_HANDLERS)) {
        globalHandlers[nbCharEncodingHandler] = handler;
        nbCharEncodingHandler++;
        return;
    }

    /* Could not store the handler: release it. */
    if (handler->name != NULL)
        xmlFree(handler->name);
    xmlFree(handler);
}
2024-06-28 00:32:58 +03:00
/*
 * Ask the conversion implementation @impl for a converter named @name and,
 * if it succeeds, copy the converter's functions, contexts and destructor
 * into @handler.  @handler is left untouched on failure.
 *
 * Returns the code returned by @impl (XML_ERR_OK on success).
 */
static int
xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt,
                  const char *name, xmlCharEncodingHandler *handler) {
    xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL };
    int code = impl(implCtxt, name, &conv);

    if (code != XML_ERR_OK)
        return(code);

    handler->input =
        (xmlCharEncodingInputFunc) (void (*)(void)) conv.input;
    handler->output =
        (xmlCharEncodingOutputFunc) (void (*)(void)) conv.output;
    handler->ctxtDtor = conv.ctxtDtor;
    handler->inputCtxt = conv.inputCtxt;
    handler->outputCtxt = conv.outputCtxt;

    return(code);
}
2001-02-23 20:55:21 +03:00
/**
2023-12-10 16:56:21 +03:00
* xmlFindExtraHandler :
2024-06-28 00:32:58 +03:00
* @ norig : name of the char encoding
* @ name : potentially aliased name of the encoding
2024-01-02 20:33:57 +03:00
* @ output : boolean , use handler for output
2024-06-28 00:32:58 +03:00
* @ impl : a conversion implementation ( optional )
* @ implCtxt : user data for conversion implementation ( optional )
2023-12-10 16:56:21 +03:00
* @ out : pointer to resulting handler
*
* Search the non - default handlers for an exact match .
*
2024-06-28 00:32:58 +03:00
* Returns an xmlParserErrors error code .
2023-12-10 16:56:21 +03:00
*/
static int
2024-06-28 00:32:58 +03:00
xmlFindExtraHandler ( const char * norig , const char * name , int output ,
xmlCharEncConvImpl impl , void * implCtxt ,
2024-01-02 20:33:57 +03:00
xmlCharEncodingHandler * * out ) {
2024-06-28 00:32:58 +03:00
xmlCharEncodingHandler * handler ;
2023-12-10 16:56:21 +03:00
int ret ;
int i ;
2024-06-28 00:32:58 +03:00
handler = xmlMalloc ( sizeof ( * handler ) ) ;
if ( handler = = NULL )
return ( XML_ERR_NO_MEMORY ) ;
memset ( handler , 0 , sizeof ( * handler ) ) ;
handler - > name = xmlMemStrdup ( name ) ;
if ( handler - > name = = NULL ) {
ret = XML_ERR_NO_MEMORY ;
goto done ;
}
/*
* Try custom implementation before deprecated global handlers .
*
* Note that we pass the original name without deprecated
* alias resolution .
*/
if ( impl ! = NULL ) {
ret = xmlInvokeConvImpl ( impl , implCtxt , norig , handler ) ;
if ( ret ! = XML_ERR_OK )
goto done ;
* out = handler ;
return ( XML_ERR_OK ) ;
}
2023-12-10 16:56:21 +03:00
2024-06-28 00:32:58 +03:00
/*
* Deprecated
*/
if ( globalHandlers ! = NULL ) {
2023-12-10 16:56:21 +03:00
for ( i = 0 ; i < nbCharEncodingHandler ; i + + ) {
2024-06-28 00:32:58 +03:00
xmlCharEncodingHandler * h = globalHandlers [ i ] ;
2024-01-02 20:33:57 +03:00
2023-12-10 16:56:21 +03:00
if ( ! xmlStrcasecmp ( ( const xmlChar * ) name ,
2024-06-28 00:32:58 +03:00
( const xmlChar * ) h - > name ) ) {
if ( ( output ? h - > output : h - > input ) ! = NULL ) {
* out = h ;
ret = XML_ERR_OK ;
goto done ;
2024-01-02 20:33:57 +03:00
}
2023-12-10 16:56:21 +03:00
}
}
}
# ifdef LIBXML_ICONV_ENABLED
2024-06-28 00:32:58 +03:00
ret = xmlInvokeConvImpl ( xmlCharEncIconv , handler , name , handler ) ;
if ( ret = = XML_ERR_OK ) {
* out = handler ;
return ( XML_ERR_OK ) ;
}
2023-12-10 16:56:21 +03:00
if ( ret ! = XML_ERR_UNSUPPORTED_ENCODING )
2024-06-28 00:32:58 +03:00
goto done ;
2023-12-10 16:56:21 +03:00
# endif /* LIBXML_ICONV_ENABLED */
# ifdef LIBXML_ICU_ENABLED
2024-06-28 00:32:58 +03:00
ret = xmlInvokeConvImpl ( xmlCharEncUconv , handler , name , handler ) ;
if ( ret = = XML_ERR_OK ) {
* out = handler ;
return ( XML_ERR_OK ) ;
}
2023-12-10 16:56:21 +03:00
if ( ret ! = XML_ERR_UNSUPPORTED_ENCODING )
2024-06-28 00:32:58 +03:00
goto done ;
2023-12-10 16:56:21 +03:00
# endif /* LIBXML_ICU_ENABLED */
2024-06-28 00:32:58 +03:00
ret = XML_ERR_UNSUPPORTED_ENCODING ;
done :
if ( handler ! = NULL ) {
xmlFree ( handler - > name ) ;
xmlFree ( handler ) ;
}
return ( ret ) ;
2023-12-10 16:56:21 +03:00
}
/**
* xmlLookupCharEncodingHandler :
2001-02-23 20:55:21 +03:00
* @ enc : an xmlCharEncoding value .
2023-12-10 16:56:21 +03:00
* @ out : pointer to result
*
2024-07-02 03:18:03 +03:00
* Find or create a handler matching the encoding . The following
* converters are looked up in order :
*
* - Built - in handler ( UTF - 8 , UTF - 16 , ISO - 8859 - 1 , ASCII )
* - User - registered global handler ( deprecated )
* - iconv if enabled
* - ICU if enabled
2023-12-10 16:56:21 +03:00
*
* The handler must be closed with xmlCharEncCloseFunc .
2001-02-23 20:55:21 +03:00
*
2024-07-02 03:18:03 +03:00
* If the encoding is UTF - 8 , a NULL handler and no error code will
* be returned .
*
2023-12-10 16:56:21 +03:00
* Available since 2.13 .0 .
2001-02-23 20:55:21 +03:00
*
2024-07-02 03:18:03 +03:00
* Returns XML_ERR_OK , XML_ERR_UNSUPPORTED_ENCODING or another
* xmlParserErrors error code .
2001-02-23 20:55:21 +03:00
*/
2023-12-10 16:56:21 +03:00
int
xmlLookupCharEncodingHandler ( xmlCharEncoding enc ,
xmlCharEncodingHandler * * out ) {
2024-06-27 21:39:52 +03:00
const xmlCharEncodingHandler * handler ;
2023-12-10 16:56:21 +03:00
if ( out = = NULL )
return ( XML_ERR_ARGUMENT ) ;
* out = NULL ;
2001-02-23 20:55:21 +03:00
2024-06-27 21:39:52 +03:00
if ( ( enc < = 0 ) | | ( ( size_t ) enc > = NUM_DEFAULT_HANDLERS ) )
return ( XML_ERR_UNSUPPORTED_ENCODING ) ;
2001-02-23 20:55:21 +03:00
2024-06-27 21:39:52 +03:00
/* Return NULL handler for UTF-8 */
2024-06-29 00:13:38 +03:00
if ( ( enc = = XML_CHAR_ENCODING_UTF8 ) | |
( enc = = XML_CHAR_ENCODING_NONE ) )
2024-06-27 21:39:52 +03:00
return ( XML_ERR_OK ) ;
2001-02-23 20:55:21 +03:00
2024-06-27 21:39:52 +03:00
handler = & defaultHandlers [ enc ] ;
if ( ( handler - > input ! = NULL ) | | ( handler - > output ! = NULL ) ) {
* out = ( xmlCharEncodingHandler * ) handler ;
return ( XML_ERR_OK ) ;
2001-02-23 20:55:21 +03:00
}
2010-11-04 19:42:42 +03:00
2024-06-27 21:39:52 +03:00
if ( handler - > name ! = NULL )
2024-06-28 00:32:58 +03:00
return ( xmlFindExtraHandler ( handler - > name , handler - > name , 0 ,
NULL , NULL , out ) ) ;
2023-12-10 16:56:21 +03:00
return ( XML_ERR_UNSUPPORTED_ENCODING ) ;
2001-02-23 20:55:21 +03:00
}
/**
2023-12-10 16:56:21 +03:00
* xmlGetCharEncodingHandler :
* @ enc : an xmlCharEncoding value .
2001-02-23 20:55:21 +03:00
*
2023-12-10 16:56:21 +03:00
* DEPRECATED : Use xmlLookupCharEncodingHandler which has better error
* reporting .
2001-02-23 20:55:21 +03:00
*
2023-12-10 16:56:21 +03:00
* Returns the handler or NULL if no handler was found or an error
* occurred .
2001-02-23 20:55:21 +03:00
*/
xmlCharEncodingHandlerPtr
2023-12-10 16:56:21 +03:00
xmlGetCharEncodingHandler ( xmlCharEncoding enc ) {
xmlCharEncodingHandler * ret ;
xmlLookupCharEncodingHandler ( enc , & ret ) ;
return ( ret ) ;
}
/**
2024-06-28 00:32:58 +03:00
* xmlCreateCharEncodingHandler :
2023-12-10 16:56:21 +03:00
* @ name : a string describing the char encoding .
2024-01-02 20:33:57 +03:00
* @ output : boolean , use handler for output
2024-06-28 00:32:58 +03:00
* @ impl : a conversion implementation ( optional )
* @ implCtxt : user data for conversion implementation ( optional )
2023-12-10 16:56:21 +03:00
* @ out : pointer to result
*
2024-07-02 03:18:03 +03:00
* Find or create a handler matching the encoding . The following
* converters are looked up in order :
*
* - Built - in handler ( UTF - 8 , UTF - 16 , ISO - 8859 - 1 , ASCII )
* - Custom implementation if provided
* - User - registered global handler ( deprecated )
* - iconv if enabled
* - ICU if enabled
2023-12-10 16:56:21 +03:00
*
* The handler must be closed with xmlCharEncCloseFunc .
*
2024-06-24 20:41:32 +03:00
* If the encoding is UTF - 8 , a NULL handler and no error code will
* be returned .
*
2024-07-02 03:18:03 +03:00
* Available since 2.14 .0 .
2023-12-10 16:56:21 +03:00
*
2024-07-02 03:18:03 +03:00
* Returns XML_ERR_OK , XML_ERR_UNSUPPORTED_ENCODING or another
* xmlParserErrors error code .
2023-12-10 16:56:21 +03:00
*/
int
2024-06-28 00:32:58 +03:00
xmlCreateCharEncodingHandler ( const char * name , int output ,
xmlCharEncConvImpl impl , void * implCtxt ,
xmlCharEncodingHandler * * out ) {
2024-06-27 21:39:52 +03:00
const xmlCharEncodingHandler * handler ;
2024-06-28 00:32:58 +03:00
const char * norig , * nalias ;
2023-12-10 16:56:21 +03:00
xmlCharEncoding enc ;
2001-02-23 20:55:21 +03:00
2023-12-10 16:56:21 +03:00
if ( out = = NULL )
return ( XML_ERR_ARGUMENT ) ;
* out = NULL ;
if ( name = = NULL )
return ( XML_ERR_ARGUMENT ) ;
2001-02-23 20:55:21 +03:00
2024-06-28 00:32:58 +03:00
norig = name ;
2001-02-23 20:55:21 +03:00
nalias = xmlGetEncodingAlias ( name ) ;
if ( nalias ! = NULL )
name = nalias ;
2024-07-04 16:14:54 +03:00
enc = xmlParseCharEncodingInternal ( name ) ;
2024-06-27 13:00:45 +03:00
2024-06-27 21:39:52 +03:00
/* Return NULL handler for UTF-8 */
if ( enc = = XML_CHAR_ENCODING_UTF8 )
return ( XML_ERR_OK ) ;
if ( ( enc > 0 ) & & ( ( size_t ) enc < NUM_DEFAULT_HANDLERS ) ) {
handler = & defaultHandlers [ enc ] ;
if ( ( output ? handler - > output : handler - > input ) ! = NULL ) {
* out = ( xmlCharEncodingHandler * ) handler ;
return ( XML_ERR_OK ) ;
}
}
2024-06-28 00:32:58 +03:00
return ( xmlFindExtraHandler ( norig , name , output , impl , implCtxt , out ) ) ;
}
/**
 * xmlOpenCharEncodingHandler:
 * @name:  a string describing the char encoding.
 * @output:  boolean, use handler for output
 * @out:  pointer to result
 *
 * Find or create a handler matching the encoding. The following
 * converters are looked up in order:
 *
 * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII)
 * - User-registered global handler (deprecated)
 * - iconv if enabled
 * - ICU if enabled
 *
 * The handler must be closed with xmlCharEncCloseFunc.
 *
 * If the encoding is UTF-8, a NULL handler and no error code will
 * be returned.
 *
 * Available since 2.13.0.
 *
 * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another
 * xmlParserErrors error code.
 */
int
xmlOpenCharEncodingHandler(const char *name, int output,
                           xmlCharEncodingHandler **out) {
    int code;

    /* Same lookup as xmlCreateCharEncodingHandler, without a custom
     * conversion implementation. */
    code = xmlCreateCharEncodingHandler(name, output, NULL, NULL, out);

    return(code);
}
2001-02-23 20:55:21 +03:00
2023-12-10 16:56:21 +03:00
/**
* xmlFindCharEncodingHandler :
* @ name : a string describing the char encoding .
*
* DEPRECATED : Use xmlOpenCharEncodingHandler which has better error
* reporting .
*
2024-07-02 03:18:03 +03:00
* If the encoding is UTF - 8 , this will return a no - op handler that
* shouldn ' t be used .
*
2023-12-10 16:56:21 +03:00
* Returns the handler or NULL if no handler was found or an error
* occurred .
*/
xmlCharEncodingHandlerPtr
xmlFindCharEncodingHandler ( const char * name ) {
xmlCharEncodingHandler * ret ;
2024-06-24 20:41:32 +03:00
/*
* This handler shouldn ' t be used , but we must return a non - NULL
* handler .
*/
if ( ( xmlStrcasecmp ( BAD_CAST name , BAD_CAST " UTF-8 " ) = = 0 ) | |
( xmlStrcasecmp ( BAD_CAST name , BAD_CAST " UTF8 " ) = = 0 ) )
2024-06-27 21:39:52 +03:00
return ( ( xmlCharEncodingHandlerPtr )
& defaultHandlers [ XML_CHAR_ENCODING_UTF8 ] ) ;
2024-06-24 20:41:32 +03:00
2024-01-02 20:33:57 +03:00
xmlOpenCharEncodingHandler ( name , 0 , & ret ) ;
2023-12-10 16:56:21 +03:00
return ( ret ) ;
2001-02-23 20:55:21 +03:00
}
2001-05-30 23:14:17 +04:00
/************************************************************************
* *
* ICONV based generic conversion functions *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2001-02-23 20:55:21 +03:00
# ifdef LIBXML_ICONV_ENABLED
2024-06-27 22:37:18 +03:00
/*
 * Conversion context for the iconv backend. One context is allocated
 * per conversion direction and wraps a single iconv descriptor.
 */
typedef struct {
/* iconv descriptor; (iconv_t) -1 while no descriptor has been opened */
iconv_t cd ;
} xmlIconvCtxt ;
2001-02-23 20:55:21 +03:00
/**
2024-06-27 22:37:18 +03:00
* xmlIconvConvert :
* @ vctxt : conversion context
2001-02-23 20:55:21 +03:00
* @ out : a pointer to an array of bytes to store the result
* @ outlen : the length of @ out
2020-06-30 03:43:57 +03:00
* @ in : a pointer to an array of input bytes
2001-02-23 20:55:21 +03:00
* @ inlen : the length of @ in
*
2023-02-22 19:11:20 +03:00
* Returns an XML_ENC_ERR code .
2010-11-04 19:42:42 +03:00
*
2001-02-23 20:55:21 +03:00
* The value of @ inlen after return is the number of octets consumed
2001-12-31 19:16:02 +03:00
* as the return value is positive , else unpredictable .
2020-06-30 03:43:57 +03:00
* The value of @ outlen after return is the number of octets produced .
2001-02-23 20:55:21 +03:00
*/
static int
2024-06-28 20:06:57 +03:00
xmlIconvConvert ( unsigned char * out , int * outlen ,
const unsigned char * in , int * inlen , void * vctxt ) {
2024-06-27 22:37:18 +03:00
xmlIconvCtxt * ctxt = vctxt ;
2004-11-05 20:22:25 +03:00
size_t icv_inlen , icv_outlen ;
2001-05-28 15:00:53 +04:00
const char * icv_in = ( const char * ) in ;
char * icv_out = ( char * ) out ;
2022-01-25 04:27:53 +03:00
size_t ret ;
2001-05-28 15:00:53 +04:00
2004-11-06 16:26:59 +03:00
if ( ( out = = NULL ) | | ( outlen = = NULL ) | | ( inlen = = NULL ) | | ( in = = NULL ) ) {
if ( outlen ! = NULL ) * outlen = 0 ;
2023-02-22 19:11:20 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
2004-11-06 16:26:59 +03:00
}
2004-11-05 20:22:25 +03:00
icv_inlen = * inlen ;
icv_outlen = * outlen ;
2022-03-04 03:07:40 +03:00
/*
* Some versions take const , other versions take non - const input .
*/
2024-06-27 22:37:18 +03:00
ret = iconv ( ctxt - > cd , ( void * ) & icv_in , & icv_inlen , & icv_out , & icv_outlen ) ;
2006-03-09 19:49:24 +03:00
* inlen - = icv_inlen ;
* outlen - = icv_outlen ;
2023-02-22 19:11:20 +03:00
if ( ret = = ( size_t ) - 1 ) {
if ( errno = = EILSEQ )
return ( XML_ENC_ERR_INPUT ) ;
if ( errno = = E2BIG )
return ( XML_ENC_ERR_SPACE ) ;
2024-06-28 05:10:03 +03:00
/*
* EINVAL means a truncated multi - byte sequence at the end
* of the input buffer . We treat this as success .
*/
2023-02-22 19:11:20 +03:00
if ( errno = = EINVAL )
2024-06-28 05:10:03 +03:00
return ( XML_ENC_ERR_SUCCESS ) ;
2023-02-22 19:11:20 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
}
return ( XML_ENC_ERR_SUCCESS ) ;
2001-02-23 20:55:21 +03:00
}
2024-06-27 22:37:18 +03:00
static void
xmlIconvFree ( void * vctxt ) {
xmlIconvCtxt * ctxt = vctxt ;
if ( ctxt - > cd ! = ( iconv_t ) - 1 )
iconv_close ( ctxt - > cd ) ;
xmlFree ( ctxt ) ;
}
static int
2024-06-28 00:32:58 +03:00
xmlCharEncIconv ( void * vctxt , const char * name , xmlCharEncConverter * conv ) {
xmlCharEncodingHandler * handler = vctxt ;
2024-06-27 22:37:18 +03:00
xmlIconvCtxt * inputCtxt = NULL , * outputCtxt = NULL ;
iconv_t icv_in ;
iconv_t icv_out ;
int ret ;
inputCtxt = xmlMalloc ( sizeof ( xmlIconvCtxt ) ) ;
if ( inputCtxt = = NULL ) {
ret = XML_ERR_NO_MEMORY ;
goto error ;
}
inputCtxt - > cd = ( iconv_t ) - 1 ;
icv_in = iconv_open ( " UTF-8 " , name ) ;
if ( icv_in = = ( iconv_t ) - 1 ) {
if ( errno = = EINVAL )
ret = XML_ERR_UNSUPPORTED_ENCODING ;
else if ( errno = = ENOMEM )
ret = XML_ERR_NO_MEMORY ;
else
ret = XML_ERR_SYSTEM ;
goto error ;
}
inputCtxt - > cd = icv_in ;
outputCtxt = xmlMalloc ( sizeof ( xmlIconvCtxt ) ) ;
if ( outputCtxt = = NULL ) {
ret = XML_ERR_NO_MEMORY ;
goto error ;
}
outputCtxt - > cd = ( iconv_t ) - 1 ;
icv_out = iconv_open ( name , " UTF-8 " ) ;
if ( icv_out = = ( iconv_t ) - 1 ) {
if ( errno = = EINVAL )
ret = XML_ERR_UNSUPPORTED_ENCODING ;
else if ( errno = = ENOMEM )
ret = XML_ERR_NO_MEMORY ;
else
ret = XML_ERR_SYSTEM ;
goto error ;
}
outputCtxt - > cd = icv_out ;
2024-06-28 20:06:57 +03:00
conv - > input = xmlIconvConvert ;
conv - > output = xmlIconvConvert ;
conv - > ctxtDtor = xmlIconvFree ;
2024-06-28 00:32:58 +03:00
conv - > inputCtxt = inputCtxt ;
conv - > outputCtxt = outputCtxt ;
2024-06-27 22:37:18 +03:00
/* Backward compatibility */
2024-06-28 00:32:58 +03:00
if ( handler ! = NULL ) {
handler - > iconv_in = icv_in ;
handler - > iconv_out = icv_out ;
}
2024-06-27 22:37:18 +03:00
2024-06-28 00:32:58 +03:00
return ( XML_ERR_OK ) ;
2024-06-27 22:37:18 +03:00
error :
if ( inputCtxt ! = NULL )
xmlIconvFree ( inputCtxt ) ;
if ( outputCtxt ! = NULL )
xmlIconvFree ( outputCtxt ) ;
return ( ret ) ;
}
2001-02-23 20:55:21 +03:00
# endif /* LIBXML_ICONV_ENABLED */
2010-11-04 19:42:42 +03:00
/************************************************************************
* *
2012-09-11 09:26:36 +04:00
* ICU based generic conversion functions *
2010-11-04 19:42:42 +03:00
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
# ifdef LIBXML_ICU_ENABLED
2024-06-27 22:37:18 +03:00
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
# define ICU_PIVOT_BUF_SIZE 1024
/*
 * Conversion context for the ICU backend. One context is created per
 * conversion direction; it owns both converters and the pivot buffer
 * that is handed to ucnv_convertEx.
 */
typedef struct _uconv_t xmlUconvCtxt ;
struct _uconv_t {
UConverter * uconv ; /* for conversion between an encoding and UTF-16 */
UConverter * utf8 ; /* for conversion between UTF-8 and UTF-16 */
/* pivot read/write cursors; both start at pivot_buf and are passed by
 * address to ucnv_convertEx on every call */
UChar * pivot_source ;
UChar * pivot_target ;
/* non-zero: convert from the encoding to UTF-8; zero: UTF-8 to the encoding */
int isInput ;
UChar pivot_buf [ ICU_PIVOT_BUF_SIZE ] ;
} ;
2010-11-04 19:42:42 +03:00
/**
2024-06-27 22:37:18 +03:00
* xmlUconvConvert :
* @ vctxt : converison context
2010-11-04 19:42:42 +03:00
* @ out : a pointer to an array of bytes to store the result
* @ outlen : the length of @ out
2020-06-30 03:43:57 +03:00
* @ in : a pointer to an array of input bytes
2010-11-04 19:42:42 +03:00
* @ inlen : the length of @ in
*
2023-02-22 19:11:20 +03:00
* Returns an XML_ENC_ERR code .
2010-11-04 19:42:42 +03:00
*
* The value of @ inlen after return is the number of octets consumed
* as the return value is positive , else unpredictable .
2020-06-30 03:43:57 +03:00
* The value of @ outlen after return is the number of octets produced .
2010-11-04 19:42:42 +03:00
*/
static int
2024-06-28 20:06:57 +03:00
xmlUconvConvert ( unsigned char * out , int * outlen ,
const unsigned char * in , int * inlen , void * vctxt ) {
2024-06-27 22:37:18 +03:00
xmlUconvCtxt * cd = vctxt ;
2010-11-04 19:42:42 +03:00
const char * ucv_in = ( const char * ) in ;
char * ucv_out = ( char * ) out ;
2024-07-03 16:48:01 +03:00
UConverter * target , * source ;
2010-11-04 19:42:42 +03:00
UErrorCode err = U_ZERO_ERROR ;
2024-07-03 16:48:01 +03:00
int ret ;
2010-11-04 19:42:42 +03:00
if ( ( out = = NULL ) | | ( outlen = = NULL ) | | ( inlen = = NULL ) | | ( in = = NULL ) ) {
2024-07-03 16:48:01 +03:00
if ( outlen ! = NULL )
* outlen = 0 ;
2023-02-22 19:11:20 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
2010-11-04 19:42:42 +03:00
}
2023-08-08 16:21:31 +03:00
/*
* Note that the ICU API is stateful . It can always consume a certain
* amount of input even if the output buffer would overflow . The
* remaining input must be processed by calling ucnv_convertEx with a
* possibly empty input buffer .
*
* ucnv_convertEx is always called with reset and flush set to 0 ,
* so we don ' t mess up the state . This should never generate
* U_TRUNCATED_CHAR_FOUND errors .
*/
2024-06-27 22:37:18 +03:00
if ( cd - > isInput ) {
2024-07-03 16:48:01 +03:00
source = cd - > uconv ;
target = cd - > utf8 ;
2010-11-04 19:42:42 +03:00
} else {
2024-07-03 16:48:01 +03:00
source = cd - > utf8 ;
target = cd - > uconv ;
2010-11-04 19:42:42 +03:00
}
2024-07-03 16:48:01 +03:00
ucnv_convertEx ( target , source , & ucv_out , ucv_out + * outlen ,
& ucv_in , ucv_in + * inlen , cd - > pivot_buf ,
& cd - > pivot_source , & cd - > pivot_target ,
cd - > pivot_buf + ICU_PIVOT_BUF_SIZE , 0 , 0 , & err ) ;
2010-11-04 19:42:42 +03:00
* inlen = ucv_in - ( const char * ) in ;
* outlen = ucv_out - ( char * ) out ;
2024-07-03 16:48:01 +03:00
2017-10-26 04:11:12 +03:00
if ( U_SUCCESS ( err ) ) {
2024-07-03 16:48:01 +03:00
ret = XML_ENC_ERR_SUCCESS ;
} else {
switch ( err ) {
case U_TRUNCATED_CHAR_FOUND :
/* Shouldn't happen without flush */
ret = XML_ENC_ERR_SUCCESS ;
break ;
case U_BUFFER_OVERFLOW_ERROR :
ret = XML_ENC_ERR_SPACE ;
break ;
case U_INVALID_CHAR_FOUND :
case U_ILLEGAL_CHAR_FOUND :
ret = XML_ENC_ERR_INPUT ;
break ;
case U_MEMORY_ALLOCATION_ERROR :
ret = XML_ERR_NO_MEMORY ;
break ;
default :
ret = XML_ENC_ERR_INTERNAL ;
break ;
}
2017-10-26 04:11:12 +03:00
}
2024-07-03 16:48:01 +03:00
return ( ret ) ;
2010-11-04 19:42:42 +03:00
}
2024-06-27 22:37:18 +03:00
/*
 * Allocate and set up an ICU conversion context for encoding @name.
 *
 * @isInput selects which stop callback is installed on the converter
 * for @name (to-Unicode for input, from-Unicode for output). A second
 * converter for UTF-8 is opened as well. On success the new context is
 * stored in @out and 0 is returned. On failure @out is left NULL and
 * XML_ERR_UNSUPPORTED_ENCODING, XML_ERR_NO_MEMORY or XML_ERR_SYSTEM is
 * returned.
 */
static int
openIcuConverter(const char *name, int isInput, xmlUconvCtxt **out)
{
    UErrorCode icuErr;
    xmlUconvCtxt *ctxt;

    *out = NULL;

    ctxt = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt));
    if (ctxt == NULL)
        return(XML_ERR_NO_MEMORY);

    ctxt->isInput = isInput;
    /* Empty pivot: both cursors start at the beginning of the buffer. */
    ctxt->pivot_source = ctxt->pivot_buf;
    ctxt->pivot_target = ctxt->pivot_buf;

    icuErr = U_ZERO_ERROR;
    ctxt->uconv = ucnv_open(name, &icuErr);
    if (U_FAILURE(icuErr))
        goto error;

    /* Make the converter stop on bad data instead of substituting. */
    icuErr = U_ZERO_ERROR;
    if (isInput) {
        ucnv_setToUCallBack(ctxt->uconv, UCNV_TO_U_CALLBACK_STOP,
                            NULL, NULL, NULL, &icuErr);
    } else {
        ucnv_setFromUCallBack(ctxt->uconv, UCNV_FROM_U_CALLBACK_STOP,
                              NULL, NULL, NULL, &icuErr);
    }
    if (U_FAILURE(icuErr))
        goto error;

    icuErr = U_ZERO_ERROR;
    ctxt->utf8 = ucnv_open("UTF-8", &icuErr);
    if (U_FAILURE(icuErr))
        goto error;

    *out = ctxt;
    return(0);

error:
    if (ctxt->uconv)
        ucnv_close(ctxt->uconv);
    xmlFree(ctxt);

    switch (icuErr) {
        case U_FILE_ACCESS_ERROR:
            return(XML_ERR_UNSUPPORTED_ENCODING);
        case U_MEMORY_ALLOCATION_ERROR:
            return(XML_ERR_NO_MEMORY);
        default:
            return(XML_ERR_SYSTEM);
    }
}
/*
 * Close both ICU converters of @conv and free the context.
 * A NULL @conv is accepted and ignored.
 */
static void
closeIcuConverter(xmlUconvCtxt *conv)
{
    if (conv != NULL) {
        ucnv_close(conv->uconv);
        ucnv_close(conv->utf8);
        xmlFree(conv);
    }
}
static void
xmlUconvFree ( void * vctxt ) {
closeIcuConverter ( vctxt ) ;
}
static int
2024-07-03 15:35:49 +03:00
xmlCharEncUconv ( void * vctxt ATTRIBUTE_UNUSED , const char * name ,
xmlCharEncConverter * conv ) {
2024-06-27 22:37:18 +03:00
xmlUconvCtxt * ucv_in = NULL ;
xmlUconvCtxt * ucv_out = NULL ;
int ret ;
ret = openIcuConverter ( name , 1 , & ucv_in ) ;
if ( ret ! = 0 )
goto error ;
ret = openIcuConverter ( name , 0 , & ucv_out ) ;
if ( ret ! = 0 )
goto error ;
2024-06-28 20:06:57 +03:00
conv - > input = xmlUconvConvert ;
conv - > output = xmlUconvConvert ;
conv - > ctxtDtor = xmlUconvFree ;
2024-06-28 00:32:58 +03:00
conv - > inputCtxt = ucv_in ;
conv - > outputCtxt = ucv_out ;
2024-06-27 22:37:18 +03:00
2024-06-28 00:32:58 +03:00
return ( XML_ERR_OK ) ;
2024-06-27 22:37:18 +03:00
error :
if ( ucv_in ! = NULL )
closeIcuConverter ( ucv_in ) ;
if ( ucv_out ! = NULL )
closeIcuConverter ( ucv_out ) ;
return ( ret ) ;
}
2010-11-04 19:42:42 +03:00
# endif /* LIBXML_ICU_ENABLED */
2001-05-30 23:14:17 +04:00
/************************************************************************
* *
* The real API used by libxml for on - the - fly conversion *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2023-04-30 19:25:09 +03:00
/**
 * xmlEncConvertError:
 * @code:  XML_ENC_ERR code
 *
 * Convert XML_ENC_ERR to libxml2 error codes.
 *
 * Returns XML_ERR_OK for success, XML_ERR_INVALID_ENCODING for input
 * errors, XML_ERR_NO_MEMORY for memory errors and
 * XML_ERR_INTERNAL_ERROR for every other code.
 */
static int
xmlEncConvertError(int code) {
    if (code == XML_ENC_ERR_SUCCESS)
        return(XML_ERR_OK);

    if (code == XML_ENC_ERR_INPUT)
        return(XML_ERR_INVALID_ENCODING);

    if (code == XML_ENC_ERR_MEMORY)
        return(XML_ERR_NO_MEMORY);

    /* Everything else is reported as an internal error. */
    return(XML_ERR_INTERNAL_ERROR);
}
2020-06-30 03:43:57 +03:00
/**
* xmlEncInputChunk :
* @ handler : encoding handler
* @ out : a pointer to an array of bytes to store the result
* @ outlen : the length of @ out
* @ in : a pointer to an array of input bytes
* @ inlen : the length of @ in
*
* The value of @ inlen after return is the number of octets consumed
* as the return value is 0 , else unpredictable .
* The value of @ outlen after return is the number of octets produced .
2023-09-21 23:57:33 +03:00
*
* Returns an XML_ENC_ERR code .
2020-06-30 03:43:57 +03:00
*/
2023-03-21 21:07:12 +03:00
int
2017-06-19 15:57:43 +03:00
xmlEncInputChunk ( xmlCharEncodingHandler * handler , unsigned char * out ,
2023-08-08 16:21:31 +03:00
int * outlen , const unsigned char * in , int * inlen ) {
2017-06-19 15:57:43 +03:00
int ret ;
if ( handler - > input ! = NULL ) {
2024-06-28 20:06:57 +03:00
xmlCharEncConvFunc conv =
( xmlCharEncConvFunc ) ( void ( * ) ( void ) ) handler - > input ;
ret = conv ( out , outlen , in , inlen , handler - > inputCtxt ) ;
if ( ret > 0 )
ret = XML_ENC_ERR_SUCCESS ;
2017-06-19 15:57:43 +03:00
}
else {
* outlen = 0 ;
* inlen = 0 ;
2023-02-22 19:11:20 +03:00
ret = XML_ENC_ERR_INTERNAL ;
2017-06-19 15:57:43 +03:00
}
return ( ret ) ;
}
2020-06-30 03:43:57 +03:00
/**
* xmlEncOutputChunk :
* @ handler : encoding handler
* @ out : a pointer to an array of bytes to store the result
* @ outlen : the length of @ out
* @ in : a pointer to an array of input bytes
* @ inlen : the length of @ in
*
2023-02-22 19:11:20 +03:00
* Returns an XML_ENC_ERR code .
2020-06-30 03:43:57 +03:00
*
* The value of @ inlen after return is the number of octets consumed
* as the return value is 0 , else unpredictable .
* The value of @ outlen after return is the number of octets produced .
*/
2017-06-19 15:57:43 +03:00
static int
xmlEncOutputChunk ( xmlCharEncodingHandler * handler , unsigned char * out ,
int * outlen , const unsigned char * in , int * inlen ) {
int ret ;
if ( handler - > output ! = NULL ) {
2024-06-28 20:06:57 +03:00
xmlCharEncConvFunc conv =
( xmlCharEncConvFunc ) ( void ( * ) ( void ) ) handler - > output ;
ret = conv ( out , outlen , in , inlen , handler - > outputCtxt ) ;
if ( ret > 0 )
ret = XML_ENC_ERR_SUCCESS ;
2017-06-19 15:57:43 +03:00
}
else {
* outlen = 0 ;
* inlen = 0 ;
2023-02-22 19:11:20 +03:00
ret = XML_ENC_ERR_INTERNAL ;
2017-06-19 15:57:43 +03:00
}
return ( ret ) ;
}
2001-02-23 20:55:21 +03:00
/**
2022-08-26 02:22:33 +03:00
* xmlCharEncFirstLine :
2023-02-22 19:11:20 +03:00
* @ handler : char encoding transformation data structure
2001-02-23 20:55:21 +03:00
* @ out : an xmlBuffer for the output .
* @ in : an xmlBuffer for the input
2009-08-26 13:38:49 +04:00
*
2023-03-21 21:07:12 +03:00
* DEPERECATED : Don ' t use .
2023-09-21 23:57:33 +03:00
*
* Returns the number of bytes written or an XML_ENC_ERR code .
2001-02-23 20:55:21 +03:00
*/
int
2022-08-26 02:22:33 +03:00
xmlCharEncFirstLine ( xmlCharEncodingHandler * handler , xmlBufferPtr out ,
xmlBufferPtr in ) {
2023-03-21 21:07:12 +03:00
return ( xmlCharEncInFunc ( handler , out , in ) ) ;
2012-07-13 15:51:15 +04:00
}
/**
* xmlCharEncInput :
* @ input : a parser input buffer
*
* Generic front - end for the encoding handler on parser input
*
2023-02-22 19:11:20 +03:00
* Returns the number of bytes written or an XML_ENC_ERR code .
2012-07-13 15:51:15 +04:00
*/
int
2023-08-08 16:21:31 +03:00
xmlCharEncInput ( xmlParserInputBufferPtr input )
2012-07-13 15:51:15 +04:00
{
2017-06-19 15:57:43 +03:00
int ret ;
2023-08-08 16:21:31 +03:00
size_t avail ;
2012-07-13 15:51:15 +04:00
size_t toconv ;
int c_in ;
int c_out ;
xmlBufPtr in ;
xmlBufPtr out ;
2023-08-08 16:21:31 +03:00
const xmlChar * inData ;
size_t inTotal = 0 ;
2012-07-13 15:51:15 +04:00
if ( ( input = = NULL ) | | ( input - > encoder = = NULL ) | |
( input - > buffer = = NULL ) | | ( input - > raw = = NULL ) )
2023-02-22 19:11:20 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
2012-07-13 15:51:15 +04:00
out = input - > buffer ;
in = input - > raw ;
toconv = xmlBufUse ( in ) ;
if ( toconv = = 0 )
return ( 0 ) ;
2023-08-08 16:21:31 +03:00
inData = xmlBufContent ( in ) ;
inTotal = 0 ;
do {
c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv ;
avail = xmlBufAvail ( out ) ;
if ( avail > INT_MAX )
avail = INT_MAX ;
2023-09-29 03:45:20 +03:00
if ( avail < 4096 ) {
if ( xmlBufGrow ( out , 4096 ) < 0 ) {
2023-08-08 16:21:31 +03:00
input - > error = XML_ERR_NO_MEMORY ;
return ( XML_ENC_ERR_MEMORY ) ;
}
avail = xmlBufAvail ( out ) ;
2023-06-08 22:53:05 +03:00
}
2012-07-13 15:51:15 +04:00
2023-08-08 16:21:31 +03:00
c_in = toconv ;
c_out = avail ;
ret = xmlEncInputChunk ( input - > encoder , xmlBufEnd ( out ) , & c_out ,
inData , & c_in ) ;
inTotal + = c_in ;
inData + = c_in ;
toconv - = c_in ;
xmlBufAddLen ( out , c_out ) ;
} while ( ret = = XML_ENC_ERR_SPACE ) ;
xmlBufShrink ( in , inTotal ) ;
2017-06-19 15:57:43 +03:00
2023-08-08 16:19:51 +03:00
if ( input - > rawconsumed > ULONG_MAX - ( unsigned long ) c_in )
input - > rawconsumed = ULONG_MAX ;
else
input - > rawconsumed + = c_in ;
2023-12-10 16:56:21 +03:00
if ( ( ( ret ! = 0 ) & & ( c_out = = 0 ) ) | |
( ret = = XML_ENC_ERR_MEMORY ) ) {
2023-04-30 19:25:09 +03:00
if ( input - > error = = 0 )
input - > error = xmlEncConvertError ( ret ) ;
return ( ret ) ;
}
return ( c_out ) ;
2012-07-13 15:51:15 +04:00
}
2001-02-23 20:55:21 +03:00
/**
* xmlCharEncInFunc :
2001-12-31 19:16:02 +03:00
* @ handler : char encoding transformation data structure
2001-02-23 20:55:21 +03:00
* @ out : an xmlBuffer for the output .
* @ in : an xmlBuffer for the input
2009-08-26 13:38:49 +04:00
*
2001-02-23 20:55:21 +03:00
* Generic front - end for the encoding handler input function
2009-08-26 13:38:49 +04:00
*
2023-02-22 19:11:20 +03:00
* Returns the number of bytes written or an XML_ENC_ERR code .
2001-02-23 20:55:21 +03:00
*/
int
2001-06-22 02:07:42 +04:00
xmlCharEncInFunc ( xmlCharEncodingHandler * handler , xmlBufferPtr out ,
xmlBufferPtr in )
{
2017-06-19 15:57:43 +03:00
int ret ;
2001-02-23 20:55:21 +03:00
int written ;
int toconv ;
2001-06-22 02:07:42 +04:00
if ( handler = = NULL )
2023-02-22 19:11:20 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
2001-06-22 02:07:42 +04:00
if ( out = = NULL )
2023-02-22 19:11:20 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
2001-06-22 02:07:42 +04:00
if ( in = = NULL )
2023-02-22 19:11:20 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
2001-02-23 20:55:21 +03:00
toconv = in - > use ;
if ( toconv = = 0 )
2001-06-22 02:07:42 +04:00
return ( 0 ) ;
2011-08-19 07:05:04 +04:00
written = out - > size - out - > use - 1 ; /* count '\0' */
2001-02-23 20:55:21 +03:00
if ( toconv * 2 > = written ) {
xmlBufferGrow ( out , out - > size + toconv * 2 ) ;
2001-06-22 02:07:42 +04:00
written = out - > size - out - > use - 1 ;
2001-02-23 20:55:21 +03:00
}
2017-06-19 15:57:43 +03:00
ret = xmlEncInputChunk ( handler , & out - > content [ out - > use ] , & written ,
2023-08-08 16:21:31 +03:00
in - > content , & toconv ) ;
2017-06-19 15:57:43 +03:00
xmlBufferShrink ( in , toconv ) ;
out - > use + = written ;
out - > content [ out - > use ] = 0 ;
2005-08-24 18:22:55 +04:00
return ( written ? written : ret ) ;
2001-02-23 20:55:21 +03:00
}
2013-08-03 15:22:54 +04:00
# ifdef LIBXML_OUTPUT_ENABLED
2012-07-13 15:51:15 +04:00
/**
 * xmlCharEncOutput:
 * @output:  a parser output buffer
 * @init:  is this an initialization call without data
 *
 * Generic front-end for the encoding handler on parser output
 * a first call with @init == 1 has to be made first to initiate the
 * output in case of non-stateless encoding needing to initiate their
 * state or the output (like the BOM in UTF16).
 * In case of UTF8 sequence conversion errors for the given encoder,
 * the content will be automatically remapped to a CharRef sequence.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code. On
 * error, the error member of @output is set as well.
 */
int
xmlCharEncOutput(xmlOutputBufferPtr output, int init)
{
    int ret;
    size_t written;
    int writtentot = 0;
    size_t toconv;
    int c_in;
    int c_out;
    xmlBufPtr in;
    xmlBufPtr out;

    if ((output == NULL) || (output->encoder == NULL) ||
        (output->buffer == NULL) || (output->conv == NULL))
        return(XML_ENC_ERR_INTERNAL);
    out = output->conv;
    in = output->buffer;

retry:

    written = xmlBufAvail(out);

    /*
     * First specific handling of the initialization call
     */
    if (init) {
        c_in = 0;
        c_out = written;
        /* TODO: Check return value. */
        xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
                          NULL, &c_in);
        xmlBufAddLen(out, c_out);
        return(c_out);
    }

    /*
     * Conversion itself. At most 64 KiB of input is converted per
     * round, with output space for four bytes per input byte.
     */
    toconv = xmlBufUse(in);
    if (toconv > 64 * 1024)
        toconv = 64 * 1024;
    if (toconv * 4 >= written) {
        if (xmlBufGrow(out, toconv * 4) < 0) {
            ret = XML_ENC_ERR_MEMORY;
            goto error;
        }
        written = xmlBufAvail(out);
    }
    if (written > 256 * 1024)
        written = 256 * 1024;

    c_in = toconv;
    c_out = written;
    ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
                            xmlBufContent(in), &c_in);
    xmlBufShrink(in, c_in);
    xmlBufAddLen(out, c_out);
    writtentot += c_out;

    if (ret == XML_ENC_ERR_SPACE)
        goto retry;

    /*
     * Attempt to handle error cases
     */
    if (ret == XML_ENC_ERR_INPUT) {
        xmlChar charref[20];
        size_t pending = xmlBufUse(in);
        /* xmlGetUTF8Char takes an int length; avoid truncation. */
        int len = (pending > INT_MAX) ? INT_MAX : (int) pending;
        xmlChar *content = xmlBufContent(in);
        int cur, charrefLen;

        cur = xmlGetUTF8Char(content, &len);
        if (cur <= 0)
            goto error;

        /*
         * Removes the UTF8 sequence, and replace it by a charref
         * and continue the transcoding phase, hoping the error
         * did not mangle the encoder state.
         */
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
        /* Checked like the grow above, so a failure is reported as a
         * memory error rather than surfacing from the converter. */
        if (xmlBufGrow(out, charrefLen * 4) < 0) {
            ret = XML_ENC_ERR_MEMORY;
            goto error;
        }
        c_out = xmlBufAvail(out);
        c_in = charrefLen;
        ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out,
                                charref, &c_in);
        if ((ret < 0) || (c_in != charrefLen)) {
            ret = XML_ENC_ERR_INTERNAL;
            goto error;
        }

        xmlBufShrink(in, len);
        xmlBufAddLen(out, c_out);
        writtentot += c_out;
        goto retry;
    }

error:
    /* Report an error only if nothing was written, or on memory
     * errors; otherwise return what was written so far. */
    if (((writtentot <= 0) && (ret != 0)) ||
        (ret == XML_ENC_ERR_MEMORY)) {
        if (output->error == 0)
            output->error = xmlEncConvertError(ret);
        return(ret);
    }

    return(writtentot);
}
2013-08-03 15:22:54 +04:00
# endif
2012-07-13 15:51:15 +04:00
2001-02-23 20:55:21 +03:00
/**
 * xmlCharEncOutFunc:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
 * Generic front-end for the encoding handler output function
 * a first call with @in == NULL has to be made first to initiate the
 * output in case of non-stateless encoding needing to initiate their
 * state or the output (like the BOM in UTF16).
 * In case of UTF8 sequence conversion errors for the given encoder,
 * the content will be automatically remapped to a CharRef sequence.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 * The initialization call (@in == NULL) returns 0.
 * XML_ENC_ERR_MEMORY is returned if the output buffer cannot be grown.
 */
int
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                  xmlBufferPtr in) {
    int ret;
    int written;
    int writtentot = 0;
    int toconv;

    if (handler == NULL) return(XML_ENC_ERR_INTERNAL);
    if (out == NULL) return(XML_ENC_ERR_INTERNAL);

retry:

    written = out->size - out->use;

    if (written > 0)
        written--; /* count '\0' */

    /*
     * First specific handling of in = NULL, i.e. the initialization call
     */
    if (in == NULL) {
        toconv = 0;
        /* TODO: Check return value. */
        xmlEncOutputChunk(handler, &out->content[out->use], &written,
                          NULL, &toconv);
        out->use += written;
        out->content[out->use] = 0;
        return(0);
    }

    /*
     * Conversion itself.
     */
    toconv = in->use;
    if (toconv * 4 >= written) {
        /*
         * A failed grow must end the conversion: continuing with the
         * old buffer can make the converter report a lack of space on
         * every round and loop through "retry" forever.
         */
        if (xmlBufferGrow(out, toconv * 4) < 0)
            return(XML_ENC_ERR_MEMORY);
        written = out->size - out->use - 1;
    }
    ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
                            in->content, &toconv);
    xmlBufferShrink(in, toconv);
    out->use += written;
    writtentot += written;
    out->content[out->use] = 0;

    if (ret == XML_ENC_ERR_SPACE)
        goto retry;

    /*
     * Attempt to handle error cases
     */
    if (ret == XML_ENC_ERR_INPUT) {
        xmlChar charref[20];
        int len = in->use;
        const xmlChar *utf = (const xmlChar *) in->content;
        int cur, charrefLen;

        cur = xmlGetUTF8Char(utf, &len);
        if (cur <= 0)
            return(ret);

        /*
         * Removes the UTF8 sequence, and replace it by a charref
         * and continue the transcoding phase, hoping the error
         * did not mangle the encoder state.
         */
        charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
        xmlBufferShrink(in, len);
        if (xmlBufferGrow(out, charrefLen * 4) < 0)
            return(XML_ENC_ERR_MEMORY);
        written = out->size - out->use - 1;
        toconv = charrefLen;
        ret = xmlEncOutputChunk(handler, &out->content[out->use], &written,
                                charref, &toconv);
        if ((ret < 0) || (toconv != charrefLen))
            return(XML_ENC_ERR_INTERNAL);

        out->use += written;
        writtentot += written;
        out->content[out->use] = 0;
        goto retry;
    }

    /* Prefer the byte count; the status is only returned if nothing
     * was written. */
    return(writtentot ? writtentot : ret);
}
/**
 * xmlCharEncCloseFunc:
 * @handler:  char encoding transformation data structure
 *
 * Releases an xmlCharEncodingHandler. Must be called after
 * a handler is no longer in use.
 *
 * A NULL @handler and handlers flagged XML_HANDLER_STATIC are left
 * untouched.
 *
 * Returns 0.
 */
int
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
    if ((handler != NULL) && ((handler->flags & XML_HANDLER_STATIC) == 0)) {
        xmlFree(handler->name);

        /* The same destructor is applied to both conversion contexts. */
        if (handler->ctxtDtor != NULL) {
            handler->ctxtDtor(handler->inputCtxt);
            handler->ctxtDtor(handler->outputCtxt);
        }

        xmlFree(handler);
    }

    return(0);
}
2004-02-11 16:25:26 +03:00
/**
 * xmlByteConsumed:
 * @ctxt: an XML parser context
 *
 * DEPRECATED: Don't use.
 *
 * This function provides the current index of the parser relative
 * to the start of the current entity. The index is counted in bytes,
 * starting at zero and ending at the size in bytes of the file when
 * parsing a file. The call has constant cost when no encoder is attached
 * to the input (UTF-8 input) but can be costly otherwise, because the
 * unparsed content has to be converted back to the original encoding.
 *
 * Returns the index in bytes from the beginning of the entity or -1
 *         in case the index could not be computed.
 */
long
xmlByteConsumed(xmlParserCtxtPtr ctxt) {
    xmlParserInputPtr input;
    xmlCharEncodingHandler *encoder;
    int pendingRaw = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL))
        return(-1);
    input = ctxt->input;

    /* No encoder attached: the position follows from the input counters. */
    if ((input->buf == NULL) || (input->buf->encoder == NULL))
        return(input->consumed + (input->cur - input->base));

    encoder = input->buf->encoder;

    /*
     * Encoding conversion: compute the number of original bytes that
     * correspond to the not yet parsed content by encoding that content
     * back, then subtract it from the raw consumed value. This is not a
     * cheap operation.
     */
    if (input->end - input->cur > 0) {
        const unsigned char *cur = (const unsigned char *) input->cur;
        unsigned char *scratch;
        int toconv, ret;

        scratch = xmlMalloc(32000);
        if (scratch == NULL)
            return(-1);

        toconv = input->end - cur;
        pendingRaw = 32000;
        /* Only the produced length matters; the bytes are thrown away. */
        ret = xmlEncOutputChunk(encoder, scratch, &pendingRaw, cur, &toconv);
        xmlFree(scratch);

        if (ret != XML_ENC_ERR_SUCCESS)
            return(-1);
    }

    if (input->buf->rawconsumed < (unsigned long) pendingRaw)
        return(-1);
    return(input->buf->rawconsumed - pendingRaw);
}
2024-06-28 05:36:14 +03:00
/************************************************************************
* *
* Conversions To / From UTF8 encoding *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
 * Copy 7-bit ASCII from @in to @out unchanged.
 *
 * At most min(*pinlen, *poutlen) bytes are handled. On return *pinlen and
 * *poutlen both hold the number of bytes copied. A NULL @in resets both
 * lengths to 0.
 *
 * Returns the number of bytes copied, XML_ENC_ERR_SPACE if @out is smaller
 * than the input, or XML_ENC_ERR_INPUT when a byte >= 0x80 is met.
 */
static int
asciiToAscii(unsigned char *out, int *poutlen,
             const unsigned char *in, int *pinlen,
             void *vctxt ATTRIBUTE_UNUSED) {
    int count, ret, i;

    if (in == NULL) {
        *pinlen = 0;
        *poutlen = 0;
        return(XML_ENC_ERR_SUCCESS);
    }

    /* Clamp the amount of work to the smaller of the two buffers. */
    if (*poutlen < *pinlen) {
        count = *poutlen;
        ret = XML_ENC_ERR_SPACE;
    } else {
        count = *pinlen;
        ret = count;
    }

    for (i = 0; i < count; i++) {
        unsigned c = in[i];

        if (c >= 0x80) {
            /* Report how far we got before the non-ASCII byte. */
            *poutlen = i;
            *pinlen = i;
            return(XML_ENC_ERR_INPUT);
        }
        out[i] = c;
    }

    *poutlen = count;
    *pinlen = count;
    return(ret);
}
2024-06-28 20:06:57 +03:00
/*
 * Convert ISO Latin 1 bytes from @in into UTF-8 in @out.
 *
 * Bytes below 0x80 are copied; all others become a two-byte sequence.
 * On return *outlen holds the bytes written and *inlen the bytes consumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE when @out cannot
 * hold the next character, or XML_ENC_ERR_INTERNAL on a NULL argument.
 */
static int
latin1ToUTF8(unsigned char *out, int *outlen,
             const unsigned char *in, int *inlen,
             void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *src = in;
    const unsigned char *srcEnd;
    unsigned char *dst = out;
    unsigned char *dstEnd;
    int ret = XML_ENC_ERR_SPACE;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);

    dstEnd = out + *outlen;
    srcEnd = in + *inlen;

    for (; src < srcEnd; src++) {
        unsigned c = *src;
        int need = (c < 0x80) ? 1 : 2;

        /* Leaving through done keeps @src on the unconverted byte. */
        if (dstEnd - dst < need)
            goto done;

        if (need == 1) {
            *dst++ = c;
        } else {
            *dst++ = (c >> 6) | 0xC0;
            *dst++ = (c & 0x3F) | 0x80;
        }
    }

    ret = dst - out;

done:
    *outlen = dst - out;
    *inlen = src - in;
    return(ret);
}
2024-06-28 20:06:57 +03:00
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 *
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen) {
    int ret;

    /* Public entry point: forward to the handler-style converter. */
    ret = latin1ToUTF8(out, outlen, in, inlen, NULL);
    return(ret);
}
2024-06-28 05:36:14 +03:00
/*
 * Identity conversion: copy bytes from @in to @out without inspecting them.
 *
 * At most min(*inlen, *outlen) bytes are copied. On return both lengths
 * hold the number of bytes copied. A NULL @in resets both lengths to 0.
 *
 * Returns the number of bytes copied, or XML_ENC_ERR_SPACE if @out is
 * smaller than the input.
 */
static int
UTF8ToUTF8(unsigned char *out, int *outlen,
           const unsigned char *in, int *inlen,
           void *vctxt ATTRIBUTE_UNUSED) {
    int truncated;
    int count;

    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
        return(XML_ENC_ERR_SUCCESS);
    }

    truncated = (*outlen < *inlen);
    count = truncated ? *outlen : *inlen;

    memcpy(out, in, count);
    *outlen = count;
    *inlen = count;

    return(truncated ? XML_ENC_ERR_SPACE : count);
}
# ifdef LIBXML_OUTPUT_ENABLED
2024-06-28 20:06:57 +03:00
/*
 * Convert UTF-8 from @in into ISO Latin 1 bytes in @out.
 *
 * Only code points up to U+00FF can be represented: ASCII bytes are copied
 * and two-byte sequences with a lead byte of 0xC2 or 0xC3 are folded into
 * a single byte. On return *outlen holds the bytes written and *inlen the
 * bytes consumed; after an input error @in is left on the lead byte of the
 * offending sequence. A NULL @in resets both lengths to 0.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE when @out is full,
 * XML_ENC_ERR_INPUT for input that is malformed or not representable, or
 * XML_ENC_ERR_INTERNAL on a NULL @out, @outlen or @inlen.
 */
static int
UTF8ToLatin1(unsigned char *out, int *outlen,
             const unsigned char *in, int *inlen,
             void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *outend;
    const unsigned char *outstart = out;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned c;
    int ret = XML_ENC_ERR_SPACE;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
        return(XML_ENC_ERR_SUCCESS);
    }

    inend = in + *inlen;
    outend = out + *outlen;

    while (in < inend) {
        /* Every accepted character produces exactly one output byte. */
        if (out >= outend)
            goto done;

        c = *in;

        if (c < 0x80) {
            *out++ = c;
            in += 1;
        } else if ((c >= 0xC2) && (c <= 0xC3)) {
            /* Truncated sequence: stop and leave it unconsumed. */
            if (inend - in < 2)
                break;
            /*
             * The trail byte must be a continuation byte (10xxxxxx).
             * Without this check a malformed pair such as C3 41 would
             * silently be turned into a Latin 1 character.
             */
            if ((in[1] & 0xC0) != 0x80) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }
            *out++ = (unsigned char) ((c << 6) | (in[1] & 0x3F));
            in += 2;
        } else {
            /* Stray continuation byte, overlong lead or beyond U+00FF. */
            ret = XML_ENC_ERR_INPUT;
            goto done;
        }
    }

    ret = out - outstart;

done:
    *outlen = out - outstart;
    *inlen = in - instart;
    return(ret);
}
2024-07-03 16:11:20 +03:00
2024-06-28 20:06:57 +03:00
/**
* UTF8Toisolat1 :
* @ out : a pointer to an array of bytes to store the result
* @ outlen : the length of @ out
* @ in : a pointer to an array of UTF - 8 chars
* @ inlen : the length of @ in
*
* Take a block of UTF - 8 chars in and try to convert it to an ISO Latin 1
* block of chars out .
*
* Returns the number of bytes written or an XML_ENC_ERR code .
*
* The value of @ inlen after return is the number of octets consumed
* if the return value is positive , else unpredictable .
* The value of @ outlen after return is the number of octets produced .
*/
2024-06-28 05:36:14 +03:00
int
UTF8Toisolat1 ( unsigned char * out , int * outlen ,
const unsigned char * in , int * inlen ) {
2024-07-03 16:11:20 +03:00
if ( ( out = = NULL ) | | ( outlen = = NULL ) | | ( in = = NULL ) | | ( inlen = = NULL ) )
2024-06-28 05:36:14 +03:00
return ( XML_ENC_ERR_INTERNAL ) ;
2024-07-03 16:11:20 +03:00
return ( UTF8ToLatin1 ( out , outlen , in , inlen , NULL ) ) ;
2024-06-28 05:36:14 +03:00
}
# endif /* LIBXML_OUTPUT_ENABLED */
2017-06-17 22:43:48 +03:00
/*
 * Convert little-endian UTF-16 from @in into UTF-8 in @out.
 *
 * Only an even number of input bytes is considered; a trailing odd byte
 * and a high surrogate whose partner has not arrived yet are left
 * unconsumed. On return *outlen holds the bytes written and *inlen the
 * bytes consumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE when @out cannot
 * hold the next character, or XML_ENC_ERR_INPUT on an unpaired surrogate.
 */
static int
UTF16LEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend = in + (*inlen & ~1);
    unsigned char *outstart = out;
    unsigned char *outend = out + *outlen;
    int ret = XML_ENC_ERR_SPACE;

    while (in < inend) {
        unsigned c = in[0] | (in[1] << 8);
        int consumed = 2;   /* input bytes used by this character */
        int need;           /* UTF-8 bytes required for it */

        if ((c & 0xF800) == 0xD800) {
            unsigned d;

            /* A surrogate pair must start with a high surrogate... */
            if ((c & 0xFC00) != 0xD800) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }
            /* ...whose second half may still be missing... */
            if (inend - in < 4)
                break;
            /* ...and must be followed by a low surrogate. */
            d = in[2] | (in[3] << 8);
            if ((d & 0xFC00) != 0xDC00) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
            consumed = 4;
            need = 4;
        } else if (c < 0x80) {
            need = 1;
        } else if (c < 0x800) {
            need = 2;
        } else {
            need = 3;
        }

        if (outend - out < need)
            goto done;

        if (need == 1) {
            out[0] = c;
        } else if (need == 2) {
            out[0] = (c >> 6) | 0xC0;
            out[1] = (c & 0x3F) | 0x80;
        } else if (need == 3) {
            out[0] = (c >> 12) | 0xE0;
            out[1] = ((c >> 6) & 0x3F) | 0x80;
            out[2] = (c & 0x3F) | 0x80;
        } else {
            out[0] = (c >> 18) | 0xF0;
            out[1] = ((c >> 12) & 0x3F) | 0x80;
            out[2] = ((c >> 6) & 0x3F) | 0x80;
            out[3] = (c & 0x3F) | 0x80;
        }

        in += consumed;
        out += need;
    }

    ret = out - outstart;

done:
    *outlen = out - outstart;
    *inlen = in - instart;
    return(ret);
}
# ifdef LIBXML_OUTPUT_ENABLED
/*
 * Convert UTF-8 from @in into little-endian UTF-16 in @out, without a
 * byte order mark.
 *
 * Only an even number of output bytes is used. An incomplete sequence at
 * the end of the input is left unconsumed. On return *outlen holds the
 * bytes written and *inlen the bytes consumed. A NULL @in resets both
 * lengths to 0.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE when @out cannot
 * hold the next character, XML_ENC_ERR_INPUT on malformed UTF-8, or
 * XML_ENC_ERR_INTERNAL on a NULL @out, @outlen or @inlen.
 */
static int
UTF8ToUTF16LE(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
    unsigned char *outend;
    int ret = XML_ENC_ERR_SPACE;

    /* UTF16LE encoding has no BOM */
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + *inlen;
    outend = out + (*outlen & ~1);

    while (in < inend) {
        unsigned c = in[0];
        unsigned min;       /* smallest code point allowed for this length */
        int len, i;

        /* Classify the lead byte; ASCII is a sequence of length one. */
        if (c < 0x80) {
            len = 1;
            min = 0;
        } else if (c < 0xC2) {
            ret = XML_ENC_ERR_INPUT;
            goto done;
        } else if (c < 0xE0) {
            c &= 0x1F;
            len = 2;
            min = 0x80;
        } else if (c < 0xF0) {
            c &= 0x0F;
            len = 3;
            min = 0x800;
        } else {
            c &= 0x0F;
            len = 4;
            min = 0x10000;
        }

        /* Incomplete sequence: wait for more input. */
        if (inend - in < len)
            break;

        for (i = 1; i < len; i++) {
            if ((in[i] & 0xC0) != 0x80) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }
            c = (c << 6) | (in[i] & 0x3F);
        }

        /* Reject overlong forms, surrogates and values past U+10FFFF. */
        if ((c < min) ||
            ((c >= 0xD800) && (c <= 0xDFFF)) ||
            (c > 0x10FFFF)) {
            ret = XML_ENC_ERR_INPUT;
            goto done;
        }

        if (c < 0x10000) {
            if (out >= outend)
                goto done;
            out[0] = c & 0xFF;
            out[1] = c >> 8;
            out += 2;
        } else {
            unsigned d;

            if (outend - out < 4)
                goto done;
            /* Split into a high (c) and a low (d) surrogate. */
            c -= 0x10000;
            d = (c & 0x03FF) | 0xDC00;
            c = (c >> 10) | 0xD800;
            out[0] = c & 0xFF;
            out[1] = c >> 8;
            out[2] = d & 0xFF;
            out[3] = d >> 8;
            out += 4;
        }

        in += len;
    }

    ret = out - outstart;

done:
    *outlen = out - outstart;
    *inlen = in - instart;
    return(ret);
}
/*
 * Convert UTF-8 into UTF-16 with a leading byte order mark.
 *
 * A NULL @in is the initialization call: the bytes 0xFF 0xFE (the
 * little-endian byte order mark) are written if @out has room for them.
 * Any other call is forwarded to UTF8ToUTF16LE.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 */
static int
UTF8ToUTF16(unsigned char *outb, int *outlen,
            const unsigned char *in, int *inlen,
            void *vctxt ATTRIBUTE_UNUSED) {
    int bomLen = 0;

    if (in != NULL)
        return(UTF8ToUTF16LE(outb, outlen, in, inlen, NULL));

    /* initialization, add the Byte Order Mark for UTF-16LE */
    if (*outlen >= 2) {
        outb[0] = 0xFF;
        outb[1] = 0xFE;
        bomLen = 2;
    }

    *outlen = bomLen;
    *inlen = 0;
    return(bomLen);
}
# endif /* LIBXML_OUTPUT_ENABLED */
/*
 * Convert big-endian UTF-16 from @in into UTF-8 in @out.
 *
 * Only an even number of input bytes is considered; a trailing odd byte
 * and a high surrogate whose partner has not arrived yet are left
 * unconsumed. On return *outlen holds the bytes written and *inlen the
 * bytes consumed.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE when @out cannot
 * hold the next character, or XML_ENC_ERR_INPUT on an unpaired surrogate.
 */
static int
UTF16BEToUTF8(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend = in + (*inlen & ~1);
    unsigned char *outstart = out;
    unsigned char *outend = out + *outlen;
    int ret = XML_ENC_ERR_SPACE;

    while (in < inend) {
        unsigned c = (in[0] << 8) | in[1];
        int consumed = 2;   /* input bytes used by this character */
        int need;           /* UTF-8 bytes required for it */

        if ((c & 0xF800) == 0xD800) {
            unsigned d;

            /* A surrogate pair must start with a high surrogate... */
            if ((c & 0xFC00) != 0xD800) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }
            /* ...whose second half may still be missing... */
            if (inend - in < 4)
                break;
            /* ...and must be followed by a low surrogate. */
            d = (in[2] << 8) | in[3];
            if ((d & 0xFC00) != 0xDC00) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }
            c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000);
            consumed = 4;
            need = 4;
        } else if (c < 0x80) {
            need = 1;
        } else if (c < 0x800) {
            need = 2;
        } else {
            need = 3;
        }

        if (outend - out < need)
            goto done;

        if (need == 1) {
            out[0] = c;
        } else if (need == 2) {
            out[0] = (c >> 6) | 0xC0;
            out[1] = (c & 0x3F) | 0x80;
        } else if (need == 3) {
            out[0] = (c >> 12) | 0xE0;
            out[1] = ((c >> 6) & 0x3F) | 0x80;
            out[2] = (c & 0x3F) | 0x80;
        } else {
            out[0] = (c >> 18) | 0xF0;
            out[1] = ((c >> 12) & 0x3F) | 0x80;
            out[2] = ((c >> 6) & 0x3F) | 0x80;
            out[3] = (c & 0x3F) | 0x80;
        }

        in += consumed;
        out += need;
    }

    ret = out - outstart;

done:
    *outlen = out - outstart;
    *inlen = in - instart;
    return(ret);
}
# ifdef LIBXML_OUTPUT_ENABLED
/*
 * Convert UTF-8 from @in into big-endian UTF-16 in @out, without a byte
 * order mark.
 *
 * Only an even number of output bytes is used. An incomplete sequence at
 * the end of the input is left unconsumed. On return *outlen holds the
 * bytes written and *inlen the bytes consumed. A NULL @in resets both
 * lengths to 0.
 *
 * Returns the number of bytes written, XML_ENC_ERR_SPACE when @out cannot
 * hold the next character, XML_ENC_ERR_INPUT on malformed UTF-8, or
 * XML_ENC_ERR_INTERNAL on a NULL @out, @outlen or @inlen.
 */
static int
UTF8ToUTF16BE(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen,
              void *vctxt ATTRIBUTE_UNUSED) {
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
    unsigned char *outend;
    unsigned c, d;
    int ret = XML_ENC_ERR_SPACE;

    /* UTF-16BE has no BOM */
    /*
     * Report bad arguments with the symbolic code, as the little-endian
     * sibling does, instead of a raw -1 that may alias another
     * XML_ENC_ERR value.
     */
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(XML_ENC_ERR_INTERNAL);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + *inlen;
    outend = out + (*outlen & ~1);

    while (in < inend) {
        c = in[0];

        if (c < 0x80) {
            if (out >= outend)
                goto done;
            out[0] = 0;
            out[1] = c;
            in += 1;
            out += 2;
        } else {
            int i, len;
            unsigned min;   /* smallest code point allowed for this length */

            if (c < 0xE0) {
                /* Continuation bytes and overlong two-byte leads. */
                if (c < 0xC2) {
                    ret = XML_ENC_ERR_INPUT;
                    goto done;
                }
                c &= 0x1F;
                len = 2;
                min = 0x80;
            } else if (c < 0xF0) {
                c &= 0x0F;
                len = 3;
                min = 0x800;
            } else {
                c &= 0x0F;
                len = 4;
                min = 0x10000;
            }

            /* Incomplete sequence: wait for more input. */
            if (inend - in < len)
                break;

            for (i = 1; i < len; i++) {
                if ((in[i] & 0xC0) != 0x80) {
                    ret = XML_ENC_ERR_INPUT;
                    goto done;
                }
                c = (c << 6) | (in[i] & 0x3F);
            }

            /* Reject overlong forms, surrogates and values past U+10FFFF. */
            if ((c < min) ||
                ((c >= 0xD800) && (c <= 0xDFFF)) ||
                (c > 0x10FFFF)) {
                ret = XML_ENC_ERR_INPUT;
                goto done;
            }

            if (c < 0x10000) {
                if (out >= outend)
                    goto done;
                out[0] = c >> 8;
                out[1] = c & 0xFF;
                out += 2;
            } else {
                if (outend - out < 4)
                    goto done;
                /* Split into a high (c) and a low (d) surrogate. */
                c -= 0x10000;
                d = (c & 0x03FF) | 0xDC00;
                c = (c >> 10) | 0xD800;
                out[0] = c >> 8;
                out[1] = c & 0xFF;
                out[2] = d >> 8;
                out[3] = d & 0xFF;
                out += 4;
            }

            in += len;
        }
    }

    ret = out - outstart;

done:
    *outlen = out - outstart;
    *inlen = in - instart;
    return(ret);
}
# endif /* LIBXML_OUTPUT_ENABLED */
2024-06-28 20:06:57 +03:00
# if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
/*
 * Adapter giving UTF8ToHtml the five-argument converter signature used by
 * the other converters in this file; the context argument is ignored.
 */
static int
UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
                  const unsigned char *in, int *inlen,
                  void *vctxt ATTRIBUTE_UNUSED) {
    int ret;

    ret = UTF8ToHtml(out, outlen, in, inlen);
    return(ret);
}
# endif
2024-06-28 21:37:47 +03:00
# if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
defined ( LIBXML_ISO8859X_ENABLED )
2003-07-30 19:12:01 +04:00
static int
2024-06-28 21:37:47 +03:00
UTF8ToISO8859x ( unsigned char * out , int * outlen ,
const unsigned char * in , int * inlen , void * vctxt ) {
const unsigned char * xlattable = vctxt ;
const unsigned char * instart = in ;
const unsigned char * inend ;
unsigned char * outstart = out ;
unsigned char * outend ;
int ret = XML_ENC_ERR_SPACE ;
2003-07-30 19:12:01 +04:00
if ( in = = NULL ) {
/*
* initialization nothing to do
*/
* outlen = 0 ;
* inlen = 0 ;
2024-06-28 21:37:47 +03:00
return ( XML_ENC_ERR_SUCCESS ) ;
2003-07-30 19:12:01 +04:00
}
2024-06-28 21:37:47 +03:00
inend = in + * inlen ;
outend = out + * outlen ;
2003-07-30 19:12:01 +04:00
while ( in < inend ) {
2024-06-28 21:37:47 +03:00
unsigned d = * in ;
2003-07-30 19:12:01 +04:00
if ( d < 0x80 ) {
2024-06-28 21:37:47 +03:00
if ( out > = outend )
goto done ;
in + = 1 ;
2003-07-30 19:12:01 +04:00
} else if ( d < 0xE0 ) {
2024-06-28 21:37:47 +03:00
unsigned c ;
if ( inend - in < 2 )
break ;
c = in [ 1 ] & 0x3F ;
2003-07-30 19:12:01 +04:00
d = d & 0x1F ;
d = xlattable [ 48 + c + xlattable [ d ] * 64 ] ;
if ( d = = 0 ) {
/* not in character set */
2024-06-28 21:37:47 +03:00
ret = XML_ENC_ERR_INPUT ;
goto done ;
2003-07-30 19:12:01 +04:00
}
2024-06-28 21:37:47 +03:00
if ( out > = outend )
goto done ;
in + = 2 ;
2003-07-30 19:12:01 +04:00
} else if ( d < 0xF0 ) {
2024-06-28 21:37:47 +03:00
unsigned c1 ;
unsigned c2 ;
if ( inend - in < 3 )
break ;
c1 = in [ 1 ] & 0x3F ;
c2 = in [ 2 ] & 0x3F ;
2004-09-09 18:35:17 +04:00
d = d & 0x0F ;
2010-11-04 19:42:42 +03:00
d = xlattable [ 48 + c2 + xlattable [ 48 + c1 +
2012-09-11 09:26:36 +04:00
xlattable [ 32 + d ] * 64 ] * 64 ] ;
2003-07-30 19:12:01 +04:00
if ( d = = 0 ) {
/* not in character set */
2024-06-28 21:37:47 +03:00
ret = XML_ENC_ERR_INPUT ;
goto done ;
2003-07-30 19:12:01 +04:00
}
2024-06-28 21:37:47 +03:00
if ( out > = outend )
goto done ;
in + = 3 ;
2003-07-30 19:12:01 +04:00
} else {
/* cannot transcode >= U+010000 */
2024-06-28 21:37:47 +03:00
ret = XML_ENC_ERR_INPUT ;
goto done ;
2003-07-30 19:12:01 +04:00
}
2024-06-28 21:37:47 +03:00
* out + + = d ;
2003-07-30 19:12:01 +04:00
}
2024-06-28 21:37:47 +03:00
ret = out - outstart ;
done :
2003-07-30 19:12:01 +04:00
* outlen = out - outstart ;
2024-06-28 21:37:47 +03:00
* inlen = in - instart ;
return ( ret ) ;
2003-07-30 19:12:01 +04:00
}
static int
ISO8859xToUTF8 ( unsigned char * out , int * outlen ,
2024-06-28 21:37:47 +03:00
const unsigned char * in , int * inlen , void * vctxt ) {
unsigned short const * unicodetable = vctxt ;
2003-07-30 19:12:01 +04:00
const unsigned char * instart = in ;
2004-11-05 20:22:25 +03:00
const unsigned char * inend ;
2024-06-28 21:37:47 +03:00
unsigned char * outstart = out ;
unsigned char * outend ;
int ret = XML_ENC_ERR_SPACE ;
2003-07-30 19:12:01 +04:00
2004-11-05 20:22:25 +03:00
outend = out + * outlen ;
inend = in + * inlen ;
2010-11-03 21:26:35 +03:00
2024-06-28 21:37:47 +03:00
while ( in < inend ) {
unsigned c = * in ;
if ( c < 0x80 ) {
if ( out > = outend )
goto done ;
* out + + = c ;
} else {
c = unicodetable [ c - 0x80 ] ;
2003-07-30 19:12:01 +04:00
if ( c = = 0 ) {
/* undefined code point */
2024-06-28 21:37:47 +03:00
ret = XML_ENC_ERR_INPUT ;
goto done ;
2010-11-03 21:26:35 +03:00
}
2003-07-30 19:12:01 +04:00
if ( c < 0x800 ) {
2024-06-28 21:37:47 +03:00
if ( outend - out < 2 )
goto done ;
2003-07-30 19:12:01 +04:00
* out + + = ( ( c > > 6 ) & 0x1F ) | 0xC0 ;
* out + + = ( c & 0x3F ) | 0x80 ;
} else {
2024-06-28 21:37:47 +03:00
if ( outend - out < 3 )
goto done ;
2003-07-30 19:12:01 +04:00
* out + + = ( ( c > > 12 ) & 0x0F ) | 0xE0 ;
* out + + = ( ( c > > 6 ) & 0x3F ) | 0x80 ;
* out + + = ( c & 0x3F ) | 0x80 ;
2010-11-03 21:26:35 +03:00
}
2003-07-30 19:12:01 +04:00
}
2024-06-28 21:37:47 +03:00
in + = 1 ;
}
2003-07-30 19:12:01 +04:00
2024-06-28 21:37:47 +03:00
ret = out - outstart ;
2003-07-30 19:12:01 +04:00
2024-06-28 21:37:47 +03:00
done :
* outlen = out - outstart ;
* inlen = in - instart ;
return ( ret ) ;
2003-07-30 19:12:01 +04:00
}
# endif