2022-12-07 00:40:01 +03:00
#!/usr/bin/env python3
2003-11-09 15:45:26 +03:00
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
# the current release, but are needed for ABI compatibility. This
2003-11-10 18:49:27 +03:00
# must be accomplished MANUALLY! Please see the comments below under
# 'blockAliases'
2003-11-09 15:45:26 +03:00
#
2002-04-16 19:50:10 +04:00
import sys
import string
import time
2006-03-27 13:30:13 +04:00
# Location of the Unicode Character Database release notes and the names of
# the two data files (block ranges first, per-code-point data second) that
# this script opens relative to the current directory.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are fewer than this
# number, inline comparisons are generated
minTableSize = 8

# sources holds exactly two whitespace-separated file names.
blockfile, catfile = sources.split()
2003-11-09 15:45:26 +03:00
2002-04-16 19:50:10 +04:00
2003-11-09 15:45:26 +03:00
#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, containing a tuple with the applicable
# block range
#
2003-11-10 18:49:27 +03:00
# BlockNames maps a block name (with spaces removed) to a list of
# (start, end) tuples; both bounds are kept as "0x"-prefixed hex strings
# because they are later pasted verbatim into generated C comparisons.
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except OSError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

with blocks:
    for line in blocks:
        line = line.strip()
        # Skip blank lines and comment lines.
        if not line or line.startswith('#'):
            continue
        try:
            # Expected shape: "START..END; Block Name"
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # Missing ';' field or a range that is not exactly "A..B".
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))

print("Parsed %d blocks descriptions" % len(BlockNames))
2002-04-16 19:50:10 +04:00
2003-11-10 18:49:27 +03:00
# Register each legacy block name as the union of the ranges of the
# current blocks it maps to.  Components that are not present in the
# parsed Blocks file are reported and skipped.
for block in blockAliases:
    # partition() tolerates an entry without ':' (it then has no components
    # to merge) instead of raising IndexError.
    old_name, _, new_names = block.partition(':')
    for comp in new_names.split(','):
        if comp in BlockNames:
            BlockNames.setdefault(old_name, []).extend(BlockNames[comp])
        else:
            print("Alias %s: %s not in Blocks" % (old_name, comp))
2003-11-09 15:45:26 +03:00
#
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it. We use
# a dictionary with index category-name, with each entry containing
# all the ranges (codepoints) of that category. Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category. Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
2002-04-16 19:50:10 +04:00
try:
    data = open(catfile, "r")
except OSError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

# nbchar counts the successfully parsed code points.  Categories maps both
# the full category name and its first character (the "general category")
# to the list of code point values, in file order.
nbchar = 0
Categories = {}
with data:
    for line in data:
        line = line.strip()
        # Skip blank lines and comment lines.
        if not line or line.startswith('#'):
            continue
        try:
            fields = line.split(';')
            # Field 0 is the code point in hexadecimal, field 2 the category.
            value = int(fields[0].strip(), 16)
            name = fields[2]
            if not name:
                raise ValueError("empty category name")
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(name[0], []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
2003-11-09 15:45:26 +03:00
#
# The data is now all read. Time to process it into a more useful form.
#
# reduce the number list into ranges
2002-04-16 19:50:10 +04:00
def _collapse_to_ranges(codepoints):
    """Collapse a sequence of code points into inclusive (first, last) runs.

    A run is extended only while each value is exactly one more than the
    previous one; any other value closes the current run and starts a new
    one.  An empty input yields an empty list.
    """
    runs = []
    first = last = None
    for val in codepoints:
        if first is None:
            first = last = val
        elif val == last + 1:
            last = val
        else:
            runs.append((first, last))
            first = last = val
    if first is not None:
        runs.append((first, last))
    return runs


# Replace every per-category code point list by its list of ranges.
# Only values are reassigned, so iterating the dict while doing so is safe.
for cat in Categories:
    Categories[cat] = _collapse_to_ranges(Categories[cat])
2003-11-09 15:45:26 +03:00
#
# Ensure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
2022-04-21 08:51:23 +03:00
# Sorted name lists; the generated C name tables are written in this order.
bkeys = sorted(BlockNames)
ckeys = sorted(Categories)
2003-11-09 15:45:26 +03:00
2002-04-16 19:50:10 +04:00
#
# Generate the resulting files
#
# Open both generated files up front so that a missing directory or a
# permission problem aborts the run before anything is written.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except OSError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except OSError:
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Human-readable local timestamp embedded in both generated files.
date = time.asctime(time.localtime(time.time()))

# Preamble of the public header: banner comment, include guard, feature
# guard and the opening of the extern "C" section.
header.write(
"""/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */
#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources))

# Preamble of the C source: banner, includes, the lookup table types, the
# forward declaration of xmlUnicodeLookup and the opening of the block
# name table (its entries are written by the code that follows).
output.write(
"""/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_UNICODE_ENABLED
#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;
typedef struct {
    const xmlUnicodeRange *table;
    int numentries;
} xmlUnicodeNameTable;

static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources))
# Entries of xmlUnicodeBlocks (the array was opened by the preamble).
# The C function name is the block name with '-' removed, since '-' is not
# valid in a C identifier; the lookup string keeps the original name.
output.write(",\n".join(
    '  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
    for block in bkeys))
output.write('};\n\n')

# Same layout for the category name table.
output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
output.write(",\n".join(
    '  {"%s", xmlUCSIsCat%s}' % (name, name)
    for name in ckeys))
output.write('};\n\n')
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
def _format_range_array(declaration, ranges):
    """Return the C text of one range array initialiser.

    declaration is the text up to and including the opening '{'.  Entries
    are appended as "{low, high}" and the current line is flushed once it
    is longer than 60 characters; continuation lines are indented by four
    spaces.  The result ends with "};" and a newline.
    """
    chunks = []
    pline = declaration
    for index, (low, high) in enumerate(ranges):
        if index:
            pline += ","
        if len(pline) > 60:
            chunks.append(pline + "\n")
            pline = "    "
        elif pline.endswith(","):
            pline += " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    chunks.append(pline + "};\n")
    return "".join(chunks)


for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        # Small categories are emitted as inline comparisons instead.
        continue
    # Ranges ending below 0x10000 go in the "short" table (xmlChSRange),
    # all others in the "long" table (xmlChLRange).
    short_ranges = [r for r in ranges if r[1] < 0x10000]
    long_ranges = [r for r in ranges if r[1] >= 0x10000]
    sptr = "NULL"
    lptr = "NULL"
    if short_ranges:
        sptr = "xml%sS" % name
        output.write(_format_range_array(
            "static const xmlChSRange %s[] = {" % sptr, short_ranges))
    if long_ranges:
        lptr = "xml%sL" % name
        output.write(_format_range_array(
            "static const xmlChLRange %s[] = {" % lptr, long_ranges))
    # The group ties both tables together; an absent table is NULL with a
    # count of 0.
    output.write("static const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (name, len(short_ranges), len(long_ranges), sptr, lptr))
# Emit the two name-table descriptors (entry counts come from the parsed
# dictionaries) followed by the binary-search helper shared by
# xmlUCSIsBlock and xmlUCSIsCat.  The search relies on the tables having
# been written in sorted order.
output.write(
"""static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    const xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);
    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp = strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);
}
""" % (len(BlockNames), len(Categories)))
# One predicate per block: a prototype in the header and, in the C file,
# a function whose body is the OR of all (start <= code <= end) tests for
# that block.  start/end are already "0x"-prefixed strings.
for block in bkeys:
    name = block.replace('-', '')
    header.write("XMLPUBFUN int xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % name)
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % block)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
    output.write(" ||\n           ".join(
        "((code >= %s) && (code <= %s))" % (start, end)
        for (start, end) in BlockNames[block]))
    output.write(");\n}\n\n")

# Name-based entry point dispatching through the block name table.
header.write("\nXMLPUBFUN int xmlUCSIsBlock\t(int code, const char *block);\n\n")
output.write(
"""/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;
    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

""")
2002-04-16 19:50:10 +04:00
2003-11-09 15:45:26 +03:00
# One predicate per category.  Categories with more than minTableSize
# ranges delegate to xmlCharInRange on the range group generated earlier;
# the others get inline comparisons.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % name)
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"
                 % name)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                tests.append("(code == %s)" % hex(begin))
            else:
                tests.append("((code >= %s) && (code <= %s))"
                             % (hex(begin), hex(end)))
        # A category with no ranges matches nothing; emit a valid
        # expression rather than a dangling ");".
        output.write("    return(" + (" ||\n           ".join(tests) or "0"))
    output.write(");\n}\n\n")

# Name-based entry point dispatching through the category name table; this
# template also closes the LIBXML_UNICODE_ENABLED guard of the C file.
header.write("\nXMLPUBFUN int xmlUCSIsCat\t(int code, const char *cat);\n")
output.write(
"""/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;
    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}
#endif /* LIBXML_UNICODE_ENABLED */
""")
2002-04-16 19:50:10 +04:00
# Close, in order, the extern "C" section, the feature guard and the
# include guard of the generated header, then close both output files.
header_footer = """
#ifdef __cplusplus
}
#endif
#endif /* LIBXML_UNICODE_ENABLED */
#endif /* __XML_UNICODE_H__ */
"""
header.write(header_footer)

for generated in (header, output):
    generated.close()