Action against software patentsGnome2 LogoW3C LogoRed Hat Logo
Made with Libxml2 Logo

Module encoding from libxml2

API Menu
API Indexes
Related links

Interface for the encoding conversion functions needed for XML basic encoding and iconv() support. Related specs are: rfc2044 (UTF-8 and UTF-16), F. Yergeau, Alis Technologies; [ISO-10646] UTF-8 and UTF-16 in Annexes; [ISO-8859-1] ISO Latin-1 characters codes; [UNICODE] The Unicode Consortium, "The Unicode Standard -- Worldwide Character Encoding -- Version 1.0", Addison-Wesley, Volume 1, 1991, Volume 2, 1992 (UTF-8 is described in Unicode Technical Report #4); [US-ASCII] Coded Character Set--7-bit American Standard Code for Information Interchange, ANSI X3.4-1986.

Table of Contents

Enum xmlCharEncoding
Structure xmlCharEncodingHandler
struct _xmlCharEncodingHandler
Typedef xmlCharEncodingHandler * xmlCharEncodingHandlerPtr
int	UTF8Toisolat1			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)
int	isolat1ToUTF8			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)
int	xmlAddEncodingAlias		(const char * name, 
const char * alias)
int	xmlCharEncCloseFunc		(xmlCharEncodingHandler * handler)
int	xmlCharEncFirstLine		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)
int	xmlCharEncInFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)
int	xmlCharEncOutFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)
Function type: xmlCharEncodingInputFunc
int	xmlCharEncodingInputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)
Function type: xmlCharEncodingOutputFunc
int	xmlCharEncodingOutputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)
int	xmlCheckUTF8			(const unsigned char * utf)
void	xmlCleanupCharEncodingHandlers	(void)
void	xmlCleanupEncodingAliases	(void)
int	xmlDelEncodingAlias		(const char * alias)
xmlCharEncoding	xmlDetectCharEncoding	(const unsigned char * in, 
int len)
xmlCharEncodingHandlerPtr	xmlFindCharEncodingHandler	(const char * name)
xmlCharEncodingHandlerPtr	xmlGetCharEncodingHandler	(xmlCharEncoding enc)
const char *	xmlGetCharEncodingName	(xmlCharEncoding enc)
const char *	xmlGetEncodingAlias	(const char * alias)
int	xmlGetUTF8Char			(const unsigned char * utf, 
int * len)
void	xmlInitCharEncodingHandlers	(void)
xmlCharEncodingHandlerPtr	xmlNewCharEncodingHandler	(const char * name, 
xmlCharEncodingInputFunc input,
xmlCharEncodingOutputFunc output)
xmlCharEncoding	xmlParseCharEncoding	(const char * name)
void	xmlRegisterCharEncodingHandler	(xmlCharEncodingHandlerPtr handler)
int	xmlUTF8Charcmp			(const xmlChar * utf1, 
const xmlChar * utf2)
int	xmlUTF8Size			(const xmlChar * utf)
int	xmlUTF8Strlen			(const xmlChar * utf)
int	xmlUTF8Strloc			(const xmlChar * utf, 
const xmlChar * utfchar)
xmlChar *	xmlUTF8Strndup		(const xmlChar * utf, 
int len)
xmlChar *	xmlUTF8Strpos		(const xmlChar * utf, 
int pos)
int	xmlUTF8Strsize			(const xmlChar * utf, 
int len)
xmlChar *	xmlUTF8Strsub		(const xmlChar * utf, 
int start,
int len)

Description

Enum xmlCharEncoding

Enum xmlCharEncoding {
    XML_CHAR_ENCODING_ERROR = -1 : No char encoding detected
    XML_CHAR_ENCODING_NONE = 0 : No char encoding detected
    XML_CHAR_ENCODING_UTF8 = 1 : UTF-8
    XML_CHAR_ENCODING_UTF16LE = 2 : UTF-16 little endian
    XML_CHAR_ENCODING_UTF16BE = 3 : UTF-16 big endian
    XML_CHAR_ENCODING_UCS4LE = 4 : UCS-4 little endian
    XML_CHAR_ENCODING_UCS4BE = 5 : UCS-4 big endian
    XML_CHAR_ENCODING_EBCDIC = 6 : EBCDIC uh!
    XML_CHAR_ENCODING_UCS4_2143 = 7 : UCS-4 unusual ordering
    XML_CHAR_ENCODING_UCS4_3412 = 8 : UCS-4 unusual ordering
    XML_CHAR_ENCODING_UCS2 = 9 : UCS-2
    XML_CHAR_ENCODING_8859_1 = 10 : ISO-8859-1 ISO Latin 1
    XML_CHAR_ENCODING_8859_2 = 11 : ISO-8859-2 ISO Latin 2
    XML_CHAR_ENCODING_8859_3 = 12 : ISO-8859-3
    XML_CHAR_ENCODING_8859_4 = 13 : ISO-8859-4
    XML_CHAR_ENCODING_8859_5 = 14 : ISO-8859-5
    XML_CHAR_ENCODING_8859_6 = 15 : ISO-8859-6
    XML_CHAR_ENCODING_8859_7 = 16 : ISO-8859-7
    XML_CHAR_ENCODING_8859_8 = 17 : ISO-8859-8
    XML_CHAR_ENCODING_8859_9 = 18 : ISO-8859-9
    XML_CHAR_ENCODING_2022_JP = 19 : ISO-2022-JP
    XML_CHAR_ENCODING_SHIFT_JIS = 20 : Shift_JIS
    XML_CHAR_ENCODING_EUC_JP = 21 : EUC-JP
    XML_CHAR_ENCODING_ASCII = 22 : pure ASCII
}

Structure xmlCharEncodingHandler

Structure xmlCharEncodingHandler
struct _xmlCharEncodingHandler { char * name xmlCharEncodingInputFunc input xmlCharEncodingOutputFunc output iconv_t iconv_in iconv_t iconv_out }

Function: UTF8Toisolat1

int	UTF8Toisolat1			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 block of chars out.

out:a pointer to an array of bytes to store the result
outlen:the length of @out
in:a pointer to an array of UTF-8 chars
inlen:the length of @in
Returns:0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.

Function: isolat1ToUTF8

int	isolat1ToUTF8			(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 block of chars out.

out:a pointer to an array of bytes to store the result
outlen:the length of @out
in:a pointer to an array of ISO Latin 1 chars
inlen:the length of @in
Returns:0 if success, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.

Function: xmlAddEncodingAlias

int	xmlAddEncodingAlias		(const char * name, 
const char * alias)

Registers an alias @alias for an encoding named @name. Existing alias will be overwritten.

name:the encoding name as parsed, in UTF-8 format (ASCII actually)
alias:the alias name as parsed, in UTF-8 format (ASCII actually)
Returns:0 in case of success, -1 in case of error

Function: xmlCharEncCloseFunc

int	xmlCharEncCloseFunc		(xmlCharEncodingHandler * handler)

Generic front-end for encoding handler close function

handler:char encoding transformation data structure
Returns:0 if success, or -1 in case of error

Function: xmlCharEncFirstLine

int	xmlCharEncFirstLine		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)

Front-end for the encoding handler input function, but handle only the very first line, i.e. limit itself to 45 chars.

handler:char encoding transformation data structure
out:an xmlBuffer for the output.
in:an xmlBuffer for the input
Returns:the number of bytes written if success, -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want).

Function: xmlCharEncInFunc

int	xmlCharEncInFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)

Generic front-end for the encoding handler input function

handler:char encoding transformation data structure
out:an xmlBuffer for the output.
in:an xmlBuffer for the input
Returns:the number of bytes written if success, -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want).

Function: xmlCharEncOutFunc

int	xmlCharEncOutFunc		(xmlCharEncodingHandler * handler, 
xmlBufferPtr out,
xmlBufferPtr in)

Generic front-end for the encoding handler output function. A first call with @in == NULL has to be made first to initiate the output in case of non-stateless encoding needing to initiate their state or the output (like the BOM in UTF16). In case of UTF8 sequence conversion errors for the given encoder, the content will be automatically remapped to a CharRef sequence.

handler:char encoding transformation data structure
out:an xmlBuffer for the output.
in:an xmlBuffer for the input
Returns:the number of bytes written if success, -1 in case of general error, or -2 if the transcoding fails (because *in is not a valid UTF-8 string or the result of the transformation can't fit into the encoding we want).

Function type: xmlCharEncodingInputFunc

Function type: xmlCharEncodingInputFunc
int	xmlCharEncodingInputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of chars in the original encoding and try to convert it to an UTF-8 block of chars out.

out:a pointer to an array of bytes to store the UTF-8 result
outlen:the length of @out
in:a pointer to an array of chars in the original encoding
inlen:the length of @in
Returns:the number of bytes written, -1 if lack of space, or -2 if the transcoding failed. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.

Function type: xmlCharEncodingOutputFunc

Function type: xmlCharEncodingOutputFunc
int	xmlCharEncodingOutputFunc	(unsigned char * out, 
int * outlen,
const unsigned char * in,
int * inlen)

Take a block of UTF-8 chars in and try to convert it to another encoding. Note: a first call designed to produce heading info is called with in = NULL. If stateful this should also initialize the encoder state.

out:a pointer to an array of bytes to store the result
outlen:the length of @out
in:a pointer to an array of UTF-8 chars
inlen:the length of @in
Returns:the number of bytes written, -1 if lack of space, or -2 if the transcoding failed. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets produced.

Function: xmlCheckUTF8

int	xmlCheckUTF8			(const unsigned char * utf)

Checks @utf for being valid UTF-8. @utf is assumed to be null-terminated. This function is not super-strict, as it will allow longer UTF-8 sequences than necessary. Note that Java is capable of producing these sequences if provoked. Also note, this routine checks for the 4-byte maximum size, but does not check for 0x10ffff maximum value.

utf:Pointer to putative UTF-8 encoded string.
Returns:value: true if @utf is valid.

Function: xmlCleanupCharEncodingHandlers

void	xmlCleanupCharEncodingHandlers	(void)

Cleanup the memory allocated for the char encoding support, it unregisters all the encoding handlers and the aliases.

Function: xmlCleanupEncodingAliases

void	xmlCleanupEncodingAliases	(void)

Unregisters all aliases

Function: xmlDelEncodingAlias

int	xmlDelEncodingAlias		(const char * alias)

Unregisters an encoding alias @alias

alias:the alias name as parsed, in UTF-8 format (ASCII actually)
Returns:0 in case of success, -1 in case of error

Function: xmlDetectCharEncoding

xmlCharEncoding	xmlDetectCharEncoding	(const unsigned char * in, 
int len)

Guess the encoding of the entity using the first bytes of the entity content according to the non-normative appendix F of the XML-1.0 recommendation.

in:a pointer to the first bytes of the XML entity, must be at least 2 bytes long (at least 4 if encoding is a UCS-4 variant).
len:the length of the buffer
Returns:one of the XML_CHAR_ENCODING_... values.

Function: xmlFindCharEncodingHandler

xmlCharEncodingHandlerPtr	xmlFindCharEncodingHandler	(const char * name)

Search in the registered set the handler able to read/write that encoding.

name:a string describing the char encoding.
Returns:the handler or NULL if not found

Function: xmlGetCharEncodingHandler

xmlCharEncodingHandlerPtr	xmlGetCharEncodingHandler	(xmlCharEncoding enc)

Search in the registered set the handler able to read/write that encoding.

enc:an xmlCharEncoding value.
Returns:the handler or NULL if not found

Function: xmlGetCharEncodingName

const char *	xmlGetCharEncodingName	(xmlCharEncoding enc)

The "canonical" name for XML encoding. Cf. http://www.w3.org/TR/REC-xml#charencoding Section 4.3.3 Character Encoding in Entities

enc:the encoding
Returns:the canonical name for the given encoding

Function: xmlGetEncodingAlias

const char *	xmlGetEncodingAlias	(const char * alias)

Lookup an encoding name for the given alias.

alias:the alias name as parsed, in UTF-8 format (ASCII actually)
Returns:NULL if not found, otherwise the original name

Function: xmlGetUTF8Char

int	xmlGetUTF8Char			(const unsigned char * utf, 
int * len)

Read one UTF8 Char from @utf

utf:a sequence of UTF-8 encoded bytes
len:a pointer to the length of @utf in bytes
Returns:the char value or -1 in case of error, and updates *len with the number of bytes consumed

Function: xmlInitCharEncodingHandlers

void	xmlInitCharEncodingHandlers	(void)

Initialize the char encoding support; it registers the default encodings supported. NOTE: while public, this function usually doesn't need to be called in normal processing.

Function: xmlNewCharEncodingHandler

xmlCharEncodingHandlerPtr	xmlNewCharEncodingHandler	(const char * name, 
xmlCharEncodingInputFunc input,
xmlCharEncodingOutputFunc output)

Creates and registers an xmlCharEncodingHandler.

name:the encoding name, in UTF-8 format (ASCII actually)
input:the xmlCharEncodingInputFunc to read that encoding
output:the xmlCharEncodingOutputFunc to write that encoding
Returns:the xmlCharEncodingHandlerPtr created (or NULL in case of error).

Function: xmlParseCharEncoding

xmlCharEncoding	xmlParseCharEncoding	(const char * name)

Compare the string to the encoding schemes already known. Note that the comparison is case insensitive, according to section [XML] 4.3.3 Character Encoding in Entities.

name:the encoding name as parsed, in UTF-8 format (ASCII actually)
Returns:one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE if not recognized.

Function: xmlRegisterCharEncodingHandler

void	xmlRegisterCharEncodingHandler	(xmlCharEncodingHandlerPtr handler)

Register the char encoding handler, surprising, isn't it ?

handler:the xmlCharEncodingHandlerPtr handler block

Function: xmlUTF8Charcmp

int	xmlUTF8Charcmp			(const xmlChar * utf1, 
const xmlChar * utf2)

compares the two UCS4 values

utf1:pointer to first UTF8 char
utf2:pointer to second UTF8 char
Returns:result of the compare as with xmlStrncmp

Function: xmlUTF8Size

int	xmlUTF8Size			(const xmlChar * utf)

calculates the internal size of a UTF8 character

utf:pointer to the UTF8 character
Returns:the number of bytes in the character, -1 on format error

Function: xmlUTF8Strlen

int	xmlUTF8Strlen			(const xmlChar * utf)

compute the length of an UTF8 string, it doesn't do a full UTF8 checking of the content of the string.

utf:a sequence of UTF-8 encoded bytes
Returns:the number of characters in the string or -1 in case of error

Function: xmlUTF8Strloc

int	xmlUTF8Strloc			(const xmlChar * utf, 
const xmlChar * utfchar)

a function to provide the relative location of a UTF8 char

utf:the input UTF8 *
utfchar:the UTF8 character to be found
Returns:the relative character position of the desired char or -1 if not found

Function: xmlUTF8Strndup

xmlChar *	xmlUTF8Strndup		(const xmlChar * utf, 
int len)

a strndup for array of UTF8's

utf:the input UTF8 *
len:the len of @utf (in chars)
Returns:a new UTF8 * or NULL

Function: xmlUTF8Strpos

xmlChar *	xmlUTF8Strpos		(const xmlChar * utf, 
int pos)

a function to provide the equivalent of fetching a character from a string array

utf:the input UTF8 *
pos:the position of the desired UTF8 char (in chars)
Returns:a pointer to the UTF8 character or NULL

Function: xmlUTF8Strsize

int	xmlUTF8Strsize			(const xmlChar * utf, 
int len)

storage size of an UTF8 string

utf:a sequence of UTF-8 encoded bytes
len:the number of characters in the array
Returns:the storage size of the first 'len' characters of @utf

Function: xmlUTF8Strsub

xmlChar *	xmlUTF8Strsub		(const xmlChar * utf, 
int start,
int len)

Create a substring from a given UTF-8 string Note: positions are given in units of UTF-8 chars

utf:a sequence of UTF-8 encoded bytes
start:relative pos of first char
len:total number to copy
Returns:a pointer to a newly created string or NULL if any problem

Daniel Veillard