enhanced for range checking, updated to Unicode version 4.0.1 (API docs also updated)

* genUnicode.py, xmlunicode.c, include/libxml/xmlunicode.h, python/libxml2class.txt: enhanced for range checking, updated to Unicode version 4.0.1 (API docs also updated) * python/generator.py: minor change to fix a warning
2025-03-12 16:58:16 +03:00 · 2003-11-09 12:45:26 +00:00 · 2003-11-09 12:45:26 +00:00 · ea939087b9
commit ea939087b9
parent fe9fc792ed
8 changed files with 2732 additions and 3224 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+Sun Nov  9 20:28:21 HKT 2003 William Brack <wbrack@mmm.com.hk>
+
+	* genUnicode.py, xmlunicode.c, include/libxml/xmlunicode.h,
+	  python/libxml2class.txt: enhanced for range checking,
+	  updated to Unicode version 4.0.1 (API docs also updated)
+	* python/generator.py: minor change to fix a warning
+	
 Wed Nov  5 23:46:36 CET 2003 Daniel Veillard <daniel@veillard.com>

 	* Makefile.am: apply fix from Karl Eichwalder for script path
--- a/doc/libxml2-api.xml
+++ b/doc/libxml2-api.xml
--- a/genUnicode.py
+++ b/genUnicode.py
@ -1,40 +1,88 @@
 #!/usr/bin/python -u
+#
+# Original script modified in November 2003 to take advantage of
+# the character-validation range routines, and updated to the
+# current Unicode information (Version 4.0.1)
+#
+# NOTE: there is an 'alias' facility for blocks which are not present in
+#	the current release, but are needed for ABI compatibility.  This
+#	must be accomplished MANUALLY!  Define the alias in the variable
+#	'blockAliases', then MANUALLY provide a function to return the
+#	appropriate value.
+#
 import sys
 import string
 import time

-sources = "Blocks-4.txt UnicodeData-3.1.0.txt"
+webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1d5b.html"
+sources = "Blocks-4.0.1d1b.txt UnicodeData-4.0.1d1b.txt"

+blockAliases = "CombiningMarksforSymbols Greek PrivateUse"
+
+# minTableSize is the threshold for producing a range table: a table is
+# generated only for categories with more than this number of ranges;
+# otherwise inline comparisons are generated
+minTableSize = 8
+
+(blockfile, catfile) = string.split(sources)
+
+#
+# First create a dictionary for the block names
+#
+BlockNames = {}
+
+#
+# Next put in aliases for blocks not currently present, but needed
+# for ABI compatibility (THIS IS A HORRIBLE HACK!)
+#
+aliases = string.split(blockAliases, ' ')
+for name in aliases:
+    BlockNames[name] = []
+
+#
+# Now process the "blocks" file, reducing it to a dictionary
+# indexed by blockname, containing a tuple with the applicable
+# block range
+#
 try:
-    blocks = open("Blocks-4.txt", "r")
+    blocks = open(blockfile, "r")
 except:
-    print "Missing Blocks-4.txt, aborting ..."
+    print "Missing %s, aborting ..." % blockfile
    sys.exit(1)

-BlockNames = {}
 for line in blocks.readlines():
    if line[0] == '#':
        continue
    line = string.strip(line)
    if line == '':
-	continue
+        continue
    try:
-	fields = string.split(line, ';')
-	range = string.strip(fields[0])
-	(start, end) = string.split(range, "..")
-	name = string.strip(fields[1])
-	name = string.replace(name, ' ', '')
+        fields = string.split(line, ';')
+        range = string.strip(fields[0])
+        (start, end) = string.split(range, "..")
+        name = string.strip(fields[1])
+        name = string.replace(name, ' ', '')
    except:
        print "Failed to process line: %s" % (line)
-	continue
+        continue
    BlockNames[name] = ("0x"+start, "0x"+end)
 blocks.close()
 print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))

+#
+# Next process the Categories file. This is more complex, since
+# the file is in code sequence, and we need to invert it.  We use
+# a dictionary with index category-name, with each entry containing
+# all the ranges (codepoints) of that category.  Note that category
+# names comprise two parts - the general category, and the "subclass"
+# within that category.  Therefore, both "general category" (which is
+# the first character of the 2-character category-name) and the full
+# (2-character) name are entered into this dictionary.
+#
 try:
-    data = open("UnicodeData-3.1.0.txt", "r")
+    data = open(catfile, "r")
 except:
-    print "Missing UnicodeData-3.1.0.txt, aborting ..."
+    print "Missing %s, aborting ..." % catfile
    sys.exit(1)

 nbchar = 0;
@ -44,44 +92,50 @@ for line in data.readlines():
        continue
    line = string.strip(line)
    if line == '':
-	continue
+        continue
    try:
-	fields = string.split(line, ';')
-	point = string.strip(fields[0])
-	value = 0
-	while point != '':
-	    value = value * 16
-	    if point[0] >= '0' and point[0] <= '9':
-	        value = value + ord(point[0]) - ord('0')
-	    elif point[0] >= 'A' and point[0] <= 'F':
-	        value = value + 10 + ord(point[0]) - ord('A')
-	    elif point[0] >= 'a' and point[0] <= 'f':
-	        value = value + 10 + ord(point[0]) - ord('a')
-	    point = point[1:]
-	name = fields[2]
+        fields = string.split(line, ';')
+        point = string.strip(fields[0])
+        value = 0
+        while point != '':
+            value = value * 16
+            if point[0] >= '0' and point[0] <= '9':
+                value = value + ord(point[0]) - ord('0')
+            elif point[0] >= 'A' and point[0] <= 'F':
+                value = value + 10 + ord(point[0]) - ord('A')
+            elif point[0] >= 'a' and point[0] <= 'f':
+                value = value + 10 + ord(point[0]) - ord('a')
+            point = point[1:]
+        name = fields[2]
    except:
        print "Failed to process line: %s" % (line)
-	continue
+        continue
    
    nbchar = nbchar + 1
+    # update entry for "full name"
    try:
-	Categories[name].append(value)
+        Categories[name].append(value)
    except:
        try:
-	    Categories[name] = [value]
-	except:
-	    print "Failed to process line: %s" % (line)
+            Categories[name] = [value]
+        except:
+            print "Failed to process line: %s" % (line)
+    # update "general category" name
    try:
-	Categories[name[0]].append(value)
+        Categories[name[0]].append(value)
    except:
        try:
-	    Categories[name[0]] = [value]
-	except:
-	    print "Failed to process line: %s" % (line)
-	
+            Categories[name[0]] = [value]
+        except:
+            print "Failed to process line: %s" % (line)
+
 blocks.close()
 print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
-#reduce the number list into ranges
+
+#
+# The data is now all read.  Time to process it into a more useful form.
+#
+# reduce the number list into ranges
 for cat in Categories.keys():
    list = Categories[cat]
    start = -1
@ -90,35 +144,45 @@ for cat in Categories.keys():
    ranges = []
    for val in list:
        if start == -1:
-	    start = val
-	    prev = val
-	    continue
-	elif val == prev + 1:
-	    prev = val
-	    continue
-	elif prev == start:
-	    ranges.append((prev, prev))
-	    start = val
-	    prev = val
-	    continue
-	else:
-	    ranges.append((start, prev))
-	    start = val
-	    prev = val
-	    continue
+            start = val
+            prev = val
+            continue
+        elif val == prev + 1:
+            prev = val
+            continue
+        elif prev == start:
+            ranges.append((prev, prev))
+            start = val
+            prev = val
+            continue
+        else:
+            ranges.append((start, prev))
+            start = val
+            prev = val
+            continue
    if prev == start:
        ranges.append((prev, prev))
    else:
        ranges.append((start, prev))
    Categories[cat] = ranges
-        
+
+#
+# Ensure all data is in alphabetic order, since we will be doing binary
+# searches on the tables.
+#
+bkeys = BlockNames.keys()
+bkeys.sort()
+
+ckeys = Categories.keys()
+ckeys.sort()
+
 #
 # Generate the resulting files
 #
 try:
-    header = open("xmlunicode.h", "w")
+    header = open("include/libxml/xmlunicode.h", "w")
 except:
-    print "Failed to open xmlunicode.h"
+    print "Failed to open include/libxml/xmlunicode.h"
    sys.exit(1)

 try:
@ -135,7 +199,7 @@ header.write(
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
- * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
+ * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
@ -152,14 +216,15 @@ header.write(
 extern "C" {
 #endif

-""" % (date, sources));
+""" % (webpage, date, sources));
+
 output.write(
 """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
- * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
+ * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
@ -175,15 +240,124 @@ output.write(
 #include <string.h>
 #include <libxml/xmlversion.h>
 #include <libxml/xmlunicode.h>
+#include <libxml/chvalid.h>

-""" % (date, sources));
+typedef int (xmlIntFunc)(int);	/* just to keep one's mind untwisted */

-keys = BlockNames.keys()
-keys.sort()
-for block in keys:
-    (start, end) = BlockNames[block]
+typedef struct {
+    const char *rangename;
+    xmlIntFunc *func;
+} xmlUnicodeRange;
+
+typedef struct {
+    xmlUnicodeRange *table;
+    int		    numentries;
+} xmlUnicodeNameTable;
+
+
+static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
+
+static xmlUnicodeRange xmlUnicodeBlocks[] = {
+""" % (webpage, date, sources));
+
+flag = 0
+for block in bkeys:
+    name = string.replace(block, '-', '')
+    if flag:
+        output.write(',\n')
+    else:
+        flag = 1
+    output.write('  {"%s", xmlUCSIs%s}' % (name, name))
+output.write('};\n\n')
+
+output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
+flag = 0;
+for name in ckeys:
+    if flag:
+        output.write(',\n')
+    else:
+        flag = 1
+    output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
+output.write('};\n\n')
+
+#
+# For any categories with more than minTableSize ranges we generate
+# a range table suitable for xmlCharInRange
+#
+for name in ckeys:
+  if len(Categories[name]) > minTableSize:
+    numshort = 0
+    numlong = 0
+    ranges = Categories[name]
+    sptr = "NULL"
+    lptr = "NULL"
+    for range in ranges:
+      (low, high) = range
+      if high < 0x10000:
+        if numshort == 0:
+          pline = "static xmlChSRange xml%sS[] = {" % name
+          sptr = "xml%sS" % name
+        else:
+          pline += ", "
+        numshort += 1
+      else:
+        if numlong == 0:
+          if numshort > 0:
+            output.write(pline + " };\n")
+          pline = "static xmlChLRange xml%sL[] = {" % name
+          lptr = "xml%sL" % name
+        else:
+          pline += ", "
+        numlong += 1
+      if len(pline) > 60:
+        output.write(pline + "\n")
+        pline = "    "
+      pline += "{%s, %s}" % (hex(low), hex(high))
+    output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
+         % (name, numshort, numlong, sptr, lptr))
+
+
+output.write(
+"""xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
+xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
+
+/**
+ * xmlUnicodeLookup:
+ * @tptr: pointer to the name table
+ * @name: name to be found
+ *
+ * binary table lookup for user-supplied name
+ *
+ * Returns pointer to range function if found, otherwise NULL
+ */
+static xmlIntFunc
+*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
+    int low, high, mid, cmp;
+    xmlUnicodeRange *sptr;
+
+    low = 0;
+    high = tptr->numentries - 1;
+    sptr = tptr->table;
+    while (low <= high) {
+	mid = (low + high) / 2;
+	if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
+	    return (sptr[mid].func);
+	if (cmp < 0)
+	    high = mid - 1;
+	else
+	    low = mid + 1;
+    }
+    return (NULL);    
+}
+
+""" % (len(BlockNames), len(Categories)) )
+
+for block in bkeys:
    name = string.replace(block, '-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
+    if len(BlockNames[block]) == 0:	# ignore aliases
+        continue
+    (start, end) = BlockNames[block]
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
                 (block))
@ -192,24 +366,30 @@ for block in keys:
    output.write("    return((code >= %s) && (code <= %s));\n" % (start, end))
    output.write("}\n\n")

-header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code,\n\t\t\t const char *block);\n\n")
-output.write("/**\n * xmlUCSIsBlock:\n * @code: UCS code point\n")
-output.write(" * @block: UCS block name\n")
-output.write(" *\n * Check whether the caracter is part of the UCS Block\n")
-output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown block\n */\n");
-output.write("int\nxmlUCSIsBlock(int code, const char *block) {\n")
-keys = BlockNames.keys()
-keys.sort()
-for block in keys:
-    name = string.replace(block, '-', '')
-    output.write("    if (!strcmp(block, \"%s\"))\n        return(xmlUCSIs%s(code));\n" %
-                 (block, name));
-output.write("    return(-1);\n}\n\n")
+header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
+output.write(
+"""/**
+ * xmlUCSIsBlock:
+ * @code: UCS code point
+ * @block: UCS block name
+ *
+ * Check whether the character is part of the UCS Block
+ *
+ * Returns 1 if true, 0 if false and -1 on unknown block
+ */
+int
+xmlUCSIsBlock(int code, const char *block) {
+    xmlIntFunc *func;

+    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
+    if (func == NULL)
+	return (-1);
+    return (func(code));
+}

-keys = Categories.keys()
-keys.sort()
-for name in keys:
+""")
+
+for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
@ -217,33 +397,99 @@ for name in keys:
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
-    start = 1
-    for range in ranges:
-        (begin, end) = range;
-	if start:
-	    output.write("    return(");
-	    start = 0
-	else:
-	    output.write(" ||\n           ");
-	if (begin == end):
-	    output.write("(code == %s)" % (hex(begin)))
-	else:
-	    output.write("((code >= %s) && (code <= %s))" % (
-	                 hex(begin), hex(end)))
+    if len(Categories[name]) > minTableSize:
+        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
+            % name)
+    else:
+        start = 1
+        for range in ranges:
+            (begin, end) = range;
+            if start:
+                output.write("    return(");
+                start = 0
+            else:
+                output.write(" ||\n           ");
+            if (begin == end):
+                output.write("(code == %s)" % (hex(begin)))
+            else:
+                output.write("((code >= %s) && (code <= %s))" % (
+                         hex(begin), hex(end)))
    output.write(");\n}\n\n")

-header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code,\n\t\t\t const char *cat);\n")
-output.write("/**\n * xmlUCSIsCat:\n * @code: UCS code point\n")
-output.write(" * @cat: UCS Category name\n")
-output.write(" *\n * Check whether the caracter is part of the UCS Category\n")
-output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown category\n */\n");
-output.write("int\nxmlUCSIsCat(int code, const char *cat) {\n")
-keys = Categories.keys()
-keys.sort()
-for name in keys:
-    output.write("    if (!strcmp(cat, \"%s\"))\n        return(xmlUCSIsCat%s(code));\n" %
-                 (name, name));
-output.write("    return(-1);\n}\n\n")
+header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
+output.write(
+"""/**
+ * xmlUCSIsCat:
+ * @code: UCS code point
+ * @cat: UCS Category name
+ *
+ * Check whether the character is part of the UCS Category
+ *
+ * Returns 1 if true, 0 if false and -1 on unknown category
+ */
+int
+xmlUCSIsCat(int code, const char *cat) {
+    xmlIntFunc *func;
+
+    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
+    if (func == NULL)
+	return (-1);
+    return (func(code));
+}
+
+/*
+    The following routines are an UGLY HACK to provide aliases for block
+    names which are not in the current release, but are needed for ABI
+    compatibility.
+*/
+
+/**
+ * xmlUCSIsCombiningMarksforSymbols:
+ * @code: UCS code point
+ *
+ * Check whether the character is part of CombiningMarksforSymbols UCS Block
+ *
+ * Returns 1 if true 0 otherwise
+ */
+int
+xmlUCSIsCombiningMarksforSymbols(int code) {
+    return((code >= 0x20D0) && (code <= 0x20FF));
+}
+
+/**
+ * xmlUCSIsGreek:
+ * @code: UCS code point
+ *
+ * Check whether the character is part of Greek UCS Block
+ *
+ * Returns 1 if true 0 otherwise
+ */
+int
+xmlUCSIsGreek(int code) {
+    return((code >= 0x370) && (code <= 0x3FF));
+}
+
+/**
+ * xmlUCSIsPrivateUse:
+ * @code: UCS code point
+ *
+ * Check whether the character is part of PrivateUse UCS Block
+ *
+ * Returns 1 if true 0 otherwise
+ */
+int
+xmlUCSIsPrivateUse(int code) {
+    if ( ((code >= 0xE000)  && (code <= 0xF8FF)) ||
+	 ((code >= 0xF0000) && (code <= 0xFFFFD))||
+	 ((code >= 0x100000)&& (code <= 0x10FFFD)) )
+	return (1);
+    else
+	return (0);
+}
+
+
+#endif /* LIBXML_UNICODE_ENABLED */
+""")

 header.write("""
 #ifdef __cplusplus
@ -251,8 +497,6 @@ header.write("""
 #endif
 #endif /* __XML_UNICODE_H__ */
 """);
-output.write("""
-#endif /* LIBXML_UNICODE_ENABLED */
-""");
+
 header.close()
 output.close()
--- a/include/libxml/xmlunicode.h
+++ b/include/libxml/xmlunicode.h
@ -3,11 +3,11 @@
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
- * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
+ * http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1d5b.html
 * using the genUnicode.py Python script.
 *
- * Generation date: Mon Aug 25 10:45:50 2003
- * Sources: Blocks-4.txt UnicodeData-3.1.0.txt
+ * Generation date: Sun Nov  9 20:13:11 2003
+ * Sources: Blocks-4.0.1d1b.txt UnicodeData-4.0.1d1b.txt
 * Daniel Veillard <veillard@redhat.com>
 */

@ -20,6 +20,7 @@
 extern "C" {
 #endif

+XMLPUBFUN int XMLCALL xmlUCSIsAegeanNumbers	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsAlphabeticPresentationForms	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsArabic	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsArabicPresentationFormsA	(int code);
@ -33,6 +34,7 @@ XMLPUBFUN int XMLCALL xmlUCSIsBopomofo	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsBopomofoExtended	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsBoxDrawing	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsBraillePatterns	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsBuhid	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsByzantineMusicalSymbols	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCJKCompatibility	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCJKCompatibilityForms	(int code);
@ -45,11 +47,14 @@ XMLPUBFUN int XMLCALL xmlUCSIsCJKUnifiedIdeographsExtensionA	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCJKUnifiedIdeographsExtensionB	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCherokee	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCombiningDiacriticalMarks	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsCombiningDiacriticalMarksforSymbols	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCombiningHalfMarks	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCombiningMarksforSymbols	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsControlPictures	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCurrencySymbols	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsCypriotSyllabary	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCyrillic	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsCyrillicSupplement	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsDeseret	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsDevanagari	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsDingbats	(int code);
@ -62,12 +67,14 @@ XMLPUBFUN int XMLCALL xmlUCSIsGeorgian	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsGothic	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsGreek	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsGreekExtended	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsGreekandCoptic	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsGujarati	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsGurmukhi	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsHalfwidthandFullwidthForms	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsHangulCompatibilityJamo	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsHangulJamo	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsHangulSyllables	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsHanunoo	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsHebrew	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsHighPrivateUseSurrogates	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsHighSurrogates	(int code);
@ -78,18 +85,26 @@ XMLPUBFUN int XMLCALL xmlUCSIsKanbun	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsKangxiRadicals	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsKannada	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsKatakana	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsKatakanaPhoneticExtensions	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsKhmer	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsKhmerSymbols	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsLao	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsLatin1Supplement	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsLatinExtendedA	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsLatinExtendedB	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsLatinExtendedAdditional	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsLetterlikeSymbols	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsLimbu	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsLinearBIdeograms	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsLinearBSyllabary	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsLowSurrogates	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsMalayalam	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsMathematicalAlphanumericSymbols	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsMathematicalOperators	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsMiscellaneousMathematicalSymbolsA	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsMiscellaneousMathematicalSymbolsB	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsMiscellaneousSymbols	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsMiscellaneousSymbolsandArrows	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsMiscellaneousTechnical	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsMongolian	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsMusicalSymbols	(int code);
@ -99,26 +114,42 @@ XMLPUBFUN int XMLCALL xmlUCSIsOgham	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsOldItalic	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsOpticalCharacterRecognition	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsOriya	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsOsmanya	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsPhoneticExtensions	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsPrivateUse	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsPrivateUseArea	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsRunic	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsShavian	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsSinhala	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsSmallFormVariants	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsSpacingModifierLetters	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsSpecials	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsSuperscriptsandSubscripts	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsSupplementalArrowsA	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsSupplementalArrowsB	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsSupplementalMathematicalOperators	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsSupplementaryPrivateUseAreaA	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsSupplementaryPrivateUseAreaB	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsSyriac	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsTagalog	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsTagbanwa	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsTags	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsTaiLe	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsTaiXuanJingSymbols	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsTamil	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsTelugu	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsThaana	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsThai	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsTibetan	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsUgaritic	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsUnifiedCanadianAboriginalSyllabics	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsVariationSelectors	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsVariationSelectorsSupplement	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsYiRadicals	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsYiSyllables	(int code);
+XMLPUBFUN int XMLCALL xmlUCSIsYijingHexagramSymbols	(int code);

-XMLPUBFUN int XMLCALL xmlUCSIsBlock	(int code,
-			 const char *block);
+XMLPUBFUN int XMLCALL xmlUCSIsBlock	(int code, const char *block);

 XMLPUBFUN int XMLCALL xmlUCSIsCatC	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCatCc	(int code);
@ -157,8 +188,7 @@ XMLPUBFUN int XMLCALL xmlUCSIsCatZl	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCatZp	(int code);
 XMLPUBFUN int XMLCALL xmlUCSIsCatZs	(int code);

-XMLPUBFUN int XMLCALL xmlUCSIsCat	(int code,
-			 const char *cat);
+XMLPUBFUN int XMLCALL xmlUCSIsCat	(int code, const char *cat);

 #ifdef __cplusplus
 }
--- a/python/generator.py
+++ b/python/generator.py
@ -570,6 +570,7 @@ def buildStubs():
    wrapper.write("/* Generated */\n\n")
    wrapper.write("#include <Python.h>\n")
 #    wrapper.write("#include \"config.h\"\n")
+    wrapper.write("#define IN_LIBXML\n")
    wrapper.write("#include <libxml/xmlversion.h>\n")
    wrapper.write("#include <libxml/tree.h>\n")
    wrapper.write("#include <libxml/xmlschemastypes.h>\n")
--- a/python/libxml2class.txt
+++ b/python/libxml2class.txt
@ -229,6 +229,7 @@ schemaCleanupTypes()
 schemaInitTypes()

 # functions from module xmlunicode
+uCSIsAegeanNumbers()
 uCSIsAlphabeticPresentationForms()
 uCSIsArabic()
 uCSIsArabicPresentationFormsA()
@ -243,6 +244,7 @@ uCSIsBopomofo()
 uCSIsBopomofoExtended()
 uCSIsBoxDrawing()
 uCSIsBraillePatterns()
+uCSIsBuhid()
 uCSIsByzantineMusicalSymbols()
 uCSIsCJKCompatibility()
 uCSIsCJKCompatibilityForms()
@ -292,11 +294,14 @@ uCSIsCatZp()
 uCSIsCatZs()
 uCSIsCherokee()
 uCSIsCombiningDiacriticalMarks()
+uCSIsCombiningDiacriticalMarksforSymbols()
 uCSIsCombiningHalfMarks()
 uCSIsCombiningMarksforSymbols()
 uCSIsControlPictures()
 uCSIsCurrencySymbols()
+uCSIsCypriotSyllabary()
 uCSIsCyrillic()
+uCSIsCyrillicSupplement()
 uCSIsDeseret()
 uCSIsDevanagari()
 uCSIsDingbats()
@ -309,12 +314,14 @@ uCSIsGeorgian()
 uCSIsGothic()
 uCSIsGreek()
 uCSIsGreekExtended()
+uCSIsGreekandCoptic()
 uCSIsGujarati()
 uCSIsGurmukhi()
 uCSIsHalfwidthandFullwidthForms()
 uCSIsHangulCompatibilityJamo()
 uCSIsHangulJamo()
 uCSIsHangulSyllables()
+uCSIsHanunoo()
 uCSIsHebrew()
 uCSIsHighPrivateUseSurrogates()
 uCSIsHighSurrogates()
@ -325,18 +332,26 @@ uCSIsKanbun()
 uCSIsKangxiRadicals()
 uCSIsKannada()
 uCSIsKatakana()
+uCSIsKatakanaPhoneticExtensions()
 uCSIsKhmer()
+uCSIsKhmerSymbols()
 uCSIsLao()
 uCSIsLatin1Supplement()
 uCSIsLatinExtendedA()
 uCSIsLatinExtendedAdditional()
 uCSIsLatinExtendedB()
 uCSIsLetterlikeSymbols()
+uCSIsLimbu()
+uCSIsLinearBIdeograms()
+uCSIsLinearBSyllabary()
 uCSIsLowSurrogates()
 uCSIsMalayalam()
 uCSIsMathematicalAlphanumericSymbols()
 uCSIsMathematicalOperators()
+uCSIsMiscellaneousMathematicalSymbolsA()
+uCSIsMiscellaneousMathematicalSymbolsB()
 uCSIsMiscellaneousSymbols()
+uCSIsMiscellaneousSymbolsandArrows()
 uCSIsMiscellaneousTechnical()
 uCSIsMongolian()
 uCSIsMusicalSymbols()
@ -346,23 +361,40 @@ uCSIsOgham()
 uCSIsOldItalic()
 uCSIsOpticalCharacterRecognition()
 uCSIsOriya()
+uCSIsOsmanya()
+uCSIsPhoneticExtensions()
 uCSIsPrivateUse()
+uCSIsPrivateUseArea()
 uCSIsRunic()
+uCSIsShavian()
 uCSIsSinhala()
 uCSIsSmallFormVariants()
 uCSIsSpacingModifierLetters()
 uCSIsSpecials()
 uCSIsSuperscriptsandSubscripts()
+uCSIsSupplementalArrowsA()
+uCSIsSupplementalArrowsB()
+uCSIsSupplementalMathematicalOperators()
+uCSIsSupplementaryPrivateUseAreaA()
+uCSIsSupplementaryPrivateUseAreaB()
 uCSIsSyriac()
+uCSIsTagalog()
+uCSIsTagbanwa()
 uCSIsTags()
+uCSIsTaiLe()
+uCSIsTaiXuanJingSymbols()
 uCSIsTamil()
 uCSIsTelugu()
 uCSIsThaana()
 uCSIsThai()
 uCSIsTibetan()
+uCSIsUgaritic()
 uCSIsUnifiedCanadianAboriginalSyllabics()
+uCSIsVariationSelectors()
+uCSIsVariationSelectorsSupplement()
 uCSIsYiRadicals()
 uCSIsYiSyllables()
+uCSIsYijingHexagramSymbols()

 # functions from module xmlversion
 checkVersion()
--- a/win32/libxml2.def.src
+++ b/win32/libxml2.def.src
@ -674,6 +674,7 @@ xmlCreateIntSubset
 xmlCreateMemoryParserCtxt
 xmlCreatePushParserCtxt
 xmlCreateURI
+xmlCreateURLParserCtxt
 xmlCtxtGetLastError
 xmlCtxtReadDoc
 xmlCtxtReadFd
@ -682,6 +683,7 @@ xmlCtxtReadIO
 xmlCtxtReadMemory
 xmlCtxtReset
 xmlCtxtResetLastError
+xmlCtxtResetPush
 xmlCtxtUseOptions
 xmlCurrentChar
 #ifdef LIBXML_DEBUG_ENABLED
@ -1477,6 +1479,9 @@ xmlSchemaGetPredefinedType
 xmlSchemaInitTypes
 #endif
 #ifdef LIBXML_SCHEMAS_ENABLED
+xmlSchemaNewDocParserCtxt
+#endif
+#ifdef LIBXML_SCHEMAS_ENABLED
 xmlSchemaNewFacet
 #endif
 #ifdef LIBXML_SCHEMAS_ENABLED
@ -1580,6 +1585,7 @@ xmlStopParser
 xmlStrEqual
 xmlStrPrintf
 xmlStrQEqual
+xmlStrVPrintf
 xmlStrcasecmp
 xmlStrcasestr
 xmlStrcat
@ -1747,6 +1753,9 @@ xmlThrDefSetStructuredErrorFunc
 xmlThrDefSubstituteEntitiesDefaultValue
 xmlThrDefTreeIndentString
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsAegeanNumbers
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsAlphabeticPresentationForms
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -1789,6 +1798,9 @@ xmlUCSIsBoxDrawing
 xmlUCSIsBraillePatterns
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsBuhid
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsByzantineMusicalSymbols
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -1936,6 +1948,9 @@ xmlUCSIsCherokee
 xmlUCSIsCombiningDiacriticalMarks
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsCombiningDiacriticalMarksforSymbols
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsCombiningHalfMarks
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -1948,9 +1963,15 @@ xmlUCSIsControlPictures
 xmlUCSIsCurrencySymbols
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsCypriotSyllabary
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsCyrillic
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsCyrillicSupplement
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsDeseret
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -1987,6 +2008,9 @@ xmlUCSIsGreek
 xmlUCSIsGreekExtended
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsGreekandCoptic
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsGujarati
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -2005,6 +2029,9 @@ xmlUCSIsHangulJamo
 xmlUCSIsHangulSyllables
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsHanunoo
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsHebrew
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -2035,9 +2062,15 @@ xmlUCSIsKannada
 xmlUCSIsKatakana
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsKatakanaPhoneticExtensions
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsKhmer
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsKhmerSymbols
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsLao
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -2056,6 +2089,15 @@ xmlUCSIsLatinExtendedB
 xmlUCSIsLetterlikeSymbols
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsLimbu
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsLinearBIdeograms
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsLinearBSyllabary
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsLowSurrogates
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -2068,9 +2110,18 @@ xmlUCSIsMathematicalAlphanumericSymbols
 xmlUCSIsMathematicalOperators
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsMiscellaneousMathematicalSymbolsA
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsMiscellaneousMathematicalSymbolsB
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsMiscellaneousSymbols
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsMiscellaneousSymbolsandArrows
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsMiscellaneousTechnical
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -2098,12 +2149,24 @@ xmlUCSIsOpticalCharacterRecognition
 xmlUCSIsOriya
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsOsmanya
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsPhoneticExtensions
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsPrivateUse
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsPrivateUseArea
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsRunic
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsShavian
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsSinhala
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -2119,12 +2182,39 @@ xmlUCSIsSpecials
 xmlUCSIsSuperscriptsandSubscripts
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsSupplementalArrowsA
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsSupplementalArrowsB
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsSupplementalMathematicalOperators
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsSupplementaryPrivateUseAreaA
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsSupplementaryPrivateUseAreaB
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsSyriac
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsTagalog
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsTagbanwa
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsTags
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsTaiLe
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsTaiXuanJingSymbols
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsTamil
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
@ -2140,14 +2230,26 @@ xmlUCSIsThai
 xmlUCSIsTibetan
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsUgaritic
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsUnifiedCanadianAboriginalSyllabics
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsVariationSelectors
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsVariationSelectorsSupplement
+#endif
+#ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsYiRadicals
 #endif
 #ifdef LIBXML_REGEXP_ENABLED
 xmlUCSIsYiSyllables
 #endif
+#ifdef LIBXML_REGEXP_ENABLED
+xmlUCSIsYijingHexagramSymbols
+#endif
 xmlURIEscape
 xmlURIEscapeStr
 xmlURIUnescapeString
@ -2196,9 +2298,18 @@ xmlValidatePushElement
 xmlValidateQName
 xmlValidateRoot
 #ifdef LIBXML_XINCLUDE_ENABLED
+xmlXIncludeFreeContext
+#endif
+#ifdef LIBXML_XINCLUDE_ENABLED
+xmlXIncludeNewContext
+#endif
+#ifdef LIBXML_XINCLUDE_ENABLED
 xmlXIncludeProcess
 #endif
 #ifdef LIBXML_XINCLUDE_ENABLED
+xmlXIncludeProcessNode
+#endif
+#ifdef LIBXML_XINCLUDE_ENABLED
 xmlXIncludeProcessTree
 #endif
 #ifdef LIBXML_XPATH_ENABLED
--- a/xmlunicode.c
+++ b/xmlunicode.c