Enhanced the handling of UTF-16, UTF-16LE and UTF-16BE encodings.

* encoding.c, include/libxml/encoding.h: Enhanced the handling of UTF-16, UTF-16LE and UTF-16BE encodings. Now UTF-16 output is handled internally by default, with proper BOM and UTF-16LE encoding. Native UTF-16LE and UTF-16BE encoding will not generate a BOM on output, and will be automatically recognized on input. * test/utf16lebom.xml, test/utf16bebom.xml, result/utf16?ebom*: added regression tests for above.
2025-03-27 18:50:07 +03:00 · 2003-11-28 09:39:10 +00:00 · 2003-11-28 09:39:10 +00:00 · f9415e4989
commit f9415e4989
parent ae8c9642d8
13 changed files with 138 additions and 81 deletions
--- a/10
+++ b/10
@ -1,3 +1,13 @@
+Fri Nov 28 17:28:47 HKT 2003 William Brack <wbrack@mmm.com.hk>
+
+	* encoding.c, include/libxml/encoding.h: Enhanced the handling of UTF-16,
+	  UTF-16LE and UTF-16BE encodings.  Now UTF-16 output is handled internally
+	  by default, with proper BOM and UTF-16LE encoding.  Native UTF-16LE and
+	  UTF-16BE encoding will not generate a BOM on output, and will be
+	  automatically recognized on input.
+	* test/utf16lebom.xml, test/utf16bebom.xml, result/utf16?ebom*: added
+	  regression tests for above.
+	  
 Thu Nov 27 19:25:10 CET 2003 Igor Zlatkovic <igor@zlatkovic.com>

 	* win32/Makefile.* win32/configure.js: Modified to allow coexistent 
--- a/encoding.c
+++ b/encoding.c
@ -92,7 +92,7 @@ static int xmlLittleEndian = 1;
 * xmlUTF8Size:
 * @utf: pointer to the UTF8 character
 *
- * calulates the internal size of a UTF8 character
+ * calculates the internal size of a UTF8 character
 *
 * returns the numbers of bytes in the character, -1 on format error
 */
@ -186,8 +186,8 @@ xmlUTF8Strlen(const xmlChar *utf) {
 *
 * Read one UTF8 Char from @utf
 *
- * Returns the char value or -1 in case of error and update @len with the
- *        number of bytes used
+ * Returns the char value or -1 in case of error, and updates *len with the
+ *        number of bytes consumed
 */
 int
 xmlGetUTF8Char(const unsigned char *utf, int *len) {
@ -248,11 +248,11 @@ error:

 /**
 * xmlCheckUTF8:
- * @utf: Pointer to putative utf-8 encoded string.
+ * @utf: Pointer to putative UTF-8 encoded string.
 *
- * Checks @utf for being valid utf-8. @utf is assumed to be
+ * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
- * allow longer utf-8 sequences than necessary. Note that Java is
+ * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
@ -386,7 +386,7 @@ xmlUTF8Strpos(const xmlChar *utf, int pos) {
 * @utf:  the input UTF8 *
 * @utfchar:  the UTF8 character to be found
 *
- * a function to provide relative location of a UTF8 char
+ * a function to provide the relative location of a UTF8 char
 *
 * Returns the relative character position of the desired char
 * or -1 if not found
@ -421,6 +421,7 @@ xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
 * @start: relative pos of first char
 * @len:   total number to copy
 *
+ * Create a substring from a given UTF-8 string
 * Note:  positions are given in units of UTF-8 chars
 *
 * Returns a pointer to a newly created string
@ -472,8 +473,8 @@ xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
 * block of chars out.
 * Returns 0 if success, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictable.
- * The value of @outlen after return is the number of ocetes consumed.
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
 static int
 asciiToUTF8(unsigned char* out, int *outlen,
@ -525,8 +526,8 @@ asciiToUTF8(unsigned char* out, int *outlen,
 *
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictable.
- * The value of @outlen after return is the number of ocetes consumed.
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
 static int
 UTF8Toascii(unsigned char* out, int *outlen,
@ -608,8 +609,8 @@ UTF8Toascii(unsigned char* out, int *outlen,
 * block of chars out.
 * Returns 0 if success, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictable.
- * The value of @outlen after return is the number of ocetes consumed.
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
 int
 isolat1ToUTF8(unsigned char* out, int *outlen,
@ -656,10 +657,9 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
 *
 * No op copy operation for UTF8 handling.
 *
- * Returns the number of byte written, or -1 by lack of space, or -2
- *     if the transcoding fails (for *in is not valid utf16 string)
+ * Returns the number of bytes written, or -1 if lack of space.
 *     The value of *inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictable.
+ *     if the return value is positive, else unpredictable.
 */
 static int
 UTF8ToUTF8(unsigned char* out, int *outlen,
@ -698,8 +698,8 @@ UTF8ToUTF8(unsigned char* out, int *outlen,
 *
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictable.
- * The value of @outlen after return is the number of ocetes consumed.
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
 int
 UTF8Toisolat1(unsigned char* out, int *outlen,
@ -783,14 +783,14 @@ UTF8Toisolat1(unsigned char* out, int *outlen,
 * @inlenb:  the length of @in in UTF-16LE chars
 *
 * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
- * block of chars out. This function assume the endian property
+ * block of chars out. This function assumes the endian property
 * is the same between the native type of this machine and the
 * inputed one.
 *
- * Returns the number of byte written, or -1 by lack of space, or -2
- *     if the transcoding fails (for *in is not valid utf16 string)
+ * Returns the number of bytes written, or -1 if lack of space, or -2
+ *     if the transcoding fails (if *in is not a valid utf16 string)
 *     The value of *inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictable.
+ *     if the return value is positive, else unpredictable.
 */
 static int
 UTF16LEToUTF8(unsigned char* out, int *outlen,
@ -874,7 +874,7 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.
 *
- * Returns the number of byte written, or -1 by lack of space, or -2
+ * Returns the number of bytes written, or -1 if lack of space, or -2
 *     if the transcoding failed. 
 */
 static int
@ -892,21 +892,8 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
    unsigned char *tmp;
    unsigned short tmp1, tmp2;

+    /* UTF-16LE encoding has no BOM */
    if (in == NULL) {
-        /*
-	 * initialization, add the Byte Order Mark
-	 */
-        if (*outlen >= 2) {
-	    outb[0] = 0xFF;
-	    outb[1] = 0xFE;
-	    *outlen = 2;
-	    *inlen = 0;
-#ifdef DEBUG_ENCODING
-            xmlGenericError(xmlGenericErrorContext,
-		    "Added FFFE Byte Order Mark\n");
-#endif
-	    return(2);
-	}
 	*outlen = 0;
 	*inlen = 0;
 	return(0);
@ -985,22 +972,61 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
 }
 #endif /* LIBXML_OUTPUT_ENABLED */

+/**
+ * UTF8ToUTF16:
+ * @outb:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @outb
+ * @in:  a pointer to an array of UTF-8 chars
+ * @inlen:  the length of @in
+ *
+ * Take a block of UTF-8 chars in and try to convert it to an UTF-16
+ * block of chars out.
+ *
+ * Returns the number of bytes written, or -1 if lack of space, or -2
+ *     if the transcoding failed. 
+ */
+static int
+UTF8ToUTF16(unsigned char* outb, int *outlen,
+            const unsigned char* in, int *inlen)
+{
+    if (in == NULL) {
+	/*
+	 * initialization, add the Byte Order Mark for UTF-16LE
+	 */
+        if (*outlen >= 2) {
+	    outb[0] = 0xFF;
+	    outb[1] = 0xFE;
+	    *outlen = 2;
+	    *inlen = 0;
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+		    "Added FFFE Byte Order Mark\n");
+#endif
+	    return(2);
+	}
+	*outlen = 0;
+	*inlen = 0;
+	return(0);
+    }
+    return (UTF8ToUTF16LE(outb, outlen, in, inlen));
+}
+
 /**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
- * @inb:  a pointer to an array of UTF-16 passwd as a byte array
+ * @inb:  a pointer to an array of UTF-16 passed as a byte array
 * @inlenb:  the length of @in in UTF-16 chars
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
- * block of chars out. This function assume the endian property
+ * block of chars out. This function assumes the endian property
 * is the same between the native type of this machine and the
 * inputed one.
 *
- * Returns the number of byte written, or -1 by lack of space, or -2
- *     if the transcoding fails (for *in is not valid utf16 string)
+ * Returns the number of bytes written, or -1 if lack of space, or -2
+ *     if the transcoding fails (if *in is not a valid utf16 string)
 * The value of *inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictable.
+ *     if the return value is positive, else unpredictable.
 */
 static int
 UTF16BEToUTF8(unsigned char* out, int *outlen,
@ -1106,21 +1132,8 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
    unsigned char *tmp;
    unsigned short tmp1, tmp2;

+    /* UTF-16BE has no BOM */
    if (in == NULL) {
-        /*
-	 * initialization, add the Byte Order Mark
-	 */
-        if (*outlen >= 2) {
-	    outb[0] = 0xFE;
-	    outb[1] = 0xFF;
-	    *outlen = 2;
-	    *inlen = 0;
-#ifdef DEBUG_ENCODING
-            xmlGenericError(xmlGenericErrorContext,
-		    "Added FEFF Byte Order Mark\n");
-#endif
-	    return(2);
-	}
 	*outlen = 0;
 	*inlen = 0;
 	return(0);
@ -1205,11 +1218,11 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
 /**
 * xmlDetectCharEncoding:
 * @in:  a pointer to the first bytes of the XML entity, must be at least
- *       4 bytes long.
+ *       2 bytes long (at least 4 if encoding is a UCS4 variant).
 * @len:  pointer to the length of the buffer
 *
 * Guess the encoding of the entity using the first bytes of the entity content
- * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
+ * according to the non-normative appendix F of the XML-1.0 recommendation.
 * 
 * Returns one of the XML_CHAR_ENCODING_... values.
 */
@ -1235,6 +1248,17 @@ xmlDetectCharEncoding(const unsigned char* in, int len)
 	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
 	    (in[2] == 0x78) && (in[3] == 0x6D))
 	    return(XML_CHAR_ENCODING_UTF8);
+	/*
+	 * Although not part of the recommendation, we also
+	 * attempt an "auto-recognition" of UTF-16LE and
+	 * UTF-16BE encodings.
+	 */
+	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
+	    (in[2] == 0x3F) && (in[3] == 0x00))
+	    return(XML_CHAR_ENCODING_UTF16LE);
+	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
+	    (in[2] == 0x00) && (in[3] == 0x3F))
+	    return(XML_CHAR_ENCODING_UTF16BE);
    }
    if (len >= 3) {
 	/*
@ -1245,6 +1269,7 @@ xmlDetectCharEncoding(const unsigned char* in, int len)
 	    (in[2] == 0xBF))
 	    return(XML_CHAR_ENCODING_UTF8);
    }
+    /* For UTF-16 we can recognize by the BOM */
    if (len >= 2) {
 	if ((in[0] == 0xFE) && (in[1] == 0xFF))
 	    return(XML_CHAR_ENCODING_UTF16BE);
@ -1284,7 +1309,7 @@ xmlCleanupEncodingAliases(void) {
 *
 * Lookup an encoding name for the given alias.
 * 
- * Returns NULL if not found the original name otherwise
+ * Returns NULL if not found, otherwise the original name
 */
 const char *
 xmlGetEncodingAlias(const char *alias) {
@ -1319,7 +1344,7 @@ xmlGetEncodingAlias(const char *alias) {
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
 *
- * Registers and alias @alias for an encoding named @name. Existing alias
+ * Registers an alias @alias for an encoding named @name. Existing alias
 * will be overwritten.
 * 
 * Returns 0 in case of success, -1 in case of error
@ -1410,7 +1435,7 @@ xmlDelEncodingAlias(const char *alias) {
 * xmlParseCharEncoding:
 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
 *
- * Compare the string to the known encoding schemes already known. Note
+ * Compare the string to the encoding schemes already known. Note
 * that the comparison is case insensitive accordingly to the section
 * [XML] 4.3.3 Character Encoding in Entities.
 * 
@ -1686,6 +1711,7 @@ xmlInitCharEncodingHandlers(void) {
          xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
    xmlUTF16BEHandler = 
          xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
+    xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, UTF8ToUTF16);
    xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
    xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
    xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
@ -1697,6 +1723,7 @@ xmlInitCharEncodingHandlers(void) {
          xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, NULL);
    xmlUTF16BEHandler = 
          xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, NULL);
+    xmlNewCharEncodingHandler("UTF-16", UTF16LEToUTF8, NULL);
    xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, NULL);
    xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
    xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
@ -1985,14 +2012,7 @@ xmlFindCharEncodingHandler(const char *name) {
        }
    }

-    /*
-     * If nothing was found and it is "UTF-16" then use the Little endian
-     * version.
-     */
-    if ((xmlStrEqual(BAD_CAST upper, BAD_CAST "UTF-16")) ||
-	(xmlStrEqual(BAD_CAST upper, BAD_CAST "UTF16")))
-        return(xmlUTF16LEHandler);
-
+    /* If "none of the above", give up */
    return(NULL);
 }

--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@ -35,19 +35,22 @@ extern "C" {
 * xmlCharEncoding:
 *
 * Predefined values for some standard encodings.
- * Libxml don't do beforehand translation on UTF8, ISOLatinX.
- * It also support UTF16 (LE and BE) by default.
+ * Libxml does not do beforehand translation on UTF8 and ISOLatinX.
+ * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default.
 *
 * Anything else would have to be translated to UTF8 before being
 * given to the parser itself. The BOM for UTF16 and the encoding
 * declaration are looked at and a converter is looked for at that
- * point. If not found the parser stops here as asked by the XML REC
- * Converter can be registered by the user using xmlRegisterCharEncodingHandler
+ * point. If not found the parser stops here as asked by the XML REC. A
+ * converter can be registered by the user using xmlRegisterCharEncodingHandler
 * but the current form doesn't allow stateful transcoding (a serious
 * problem agreed !). If iconv has been found it will be used
 * automatically and allow stateful transcoding, the simplest is then
- * to be sure to enable icon and to provide iconv libs for the encoding
+ * to be sure to enable iconv and to provide iconv libs for the encoding
 * support needed.
+ *
+ * Note that the generic "UTF-16" is not a predefined value.  Instead, only
+ * the specific UTF-16LE and UTF-16BE are present.
 */
 typedef enum {
    XML_CHAR_ENCODING_ERROR=   -1, /* No char encoding detected */
@ -86,10 +89,10 @@ typedef enum {
 * Take a block of chars in the original encoding and try to convert
 * it to an UTF-8 block of chars out.
 *
- * Returns the number of byte written, or -1 by lack of space, or -2
+ * Returns the number of bytes written, -1 if lack of space, or -2
 *     if the transcoding failed.
 * The value of @inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictiable.
+ *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets consumed.
 */
 typedef int (* xmlCharEncodingInputFunc)(unsigned char *out, int *outlen,
@ -103,15 +106,15 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char *out, int *outlen,
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
- * Take a block of UTF-8 chars in and try to convert it to an other
+ * Take a block of UTF-8 chars in and try to convert it to another
 * encoding.
 * Note: a first call designed to produce heading info is called with
 * in = NULL. If stateful this should also initialize the encoder state.
 *
- * Returns the number of byte written, or -1 by lack of space, or -2
+ * Returns the number of bytes written, -1 if lack of space, or -2
 *     if the transcoding failed.
 * The value of @inlen after return is the number of octets consumed
- *     as the return value is positive, else unpredictiable.
+ *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of ocetes consumed.
 */
 typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
@ -120,7 +123,7 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,

 /*
 * Block defining the handlers for non UTF-8 encodings.
- * If iconv is supported, there is two extra fields.
+ * If iconv is supported, there are two extra fields.
 */

 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
@ -218,7 +221,7 @@ XMLPUBFUN int XMLCALL
 	xmlGetUTF8Char			(const unsigned char *utf,
 					 int *len);
 /*
- * exports additional "UTF-8 aware" string routines which are.
+ * Export additional string routines which are "UTF-8 aware".
 */

 XMLPUBFUN int XMLCALL	
--- a/result/noent/utf16bebom.xml
+++ b/result/noent/utf16bebom.xml
--- a/result/noent/utf16lebom.xml
+++ b/result/noent/utf16lebom.xml
--- a/result/utf16bebom.xml
+++ b/result/utf16bebom.xml
--- a/result/utf16bebom.xml.rdr
+++ b/result/utf16bebom.xml.rdr
@ -0,0 +1,4 @@
+0 8 #comment 0 1  This file is encoded in UTF-16BE 
+0 1 repository 0 0
+1 1 namespace 1 0
+0 15 repository 0 0
--- a/result/utf16bebom.xml.sax
+++ b/result/utf16bebom.xml.sax
@ -0,0 +1,8 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.comment( This file is encoded in UTF-16BE )
+SAX.startElement(repository, repositroy_id='test')
+SAX.startElement(namespace, name='test')
+SAX.endElement(namespace)
+SAX.endElement(repository)
+SAX.endDocument()
--- a/result/utf16lebom.xml
+++ b/result/utf16lebom.xml
--- a/result/utf16lebom.xml.rdr
+++ b/result/utf16lebom.xml.rdr
@ -0,0 +1,4 @@
+0 8 #comment 0 1  This file is encoded in UTF-16LE 
+0 1 repository 0 0
+1 1 namespace 1 0
+0 15 repository 0 0
--- a/result/utf16lebom.xml.sax
+++ b/result/utf16lebom.xml.sax
@ -0,0 +1,8 @@
+SAX.setDocumentLocator()
+SAX.startDocument()
+SAX.comment( This file is encoded in UTF-16LE )
+SAX.startElement(repository, repositroy_id='test')
+SAX.startElement(namespace, name='test')
+SAX.endElement(namespace)
+SAX.endElement(repository)
+SAX.endDocument()
--- a/test/utf16bebom.xml
+++ b/test/utf16bebom.xml
--- a/test/utf16lebom.xml
+++ b/test/utf16lebom.xml