#!/usr/bin/python -u
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#
# 18 October 2003
# Modified to maintain binary compatibility with previous library versions
# by adding a suffix 'Q' ('quick') to the macro generated for the original
# function, and adding generation of a function (with the original name) which
# instantiates the macro.
#
import sys
import string
import time
#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of ranges. This will later be used to determine whether
# to generate single-byte lookup tables, or inline comparisons
#
def makeRange(lst):
    """Collapse a list of yes/no (1, 0) flags into inclusive index ranges.

    Each maximal run of 1 values becomes one ``(first, last)`` tuple, where
    both ends are indexes into ``lst``.  Tuples are returned in ascending
    order; an empty or all-zero list yields ``[]``.
    """
    ret = []
    size = len(lst)
    pos = 0
    while pos < size:
        # index() raises ValueError when the value is absent; searching
        # from 'pos' avoids copying the tail of the list on every pass
        try:
            pos = lst.index(1, pos)     # start of the next range
        except ValueError:
            break                       # no more ranges, finished
        try:
            end = lst.index(0, pos)     # first entry past the range
        except ValueError:
            end = size                  # range runs to the end of the list
        ret.append((pos, end - 1))
        pos = end + 1                   # lst[end] is 0 (or past the end), skip it
    return ret
sources = "chvalid.def"         # input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are fewer than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

# parser state: 0 = outside a function block, 1 = inside a 'name'..'end' block
state = 0

# open the file named by 'sources' so the name reported in the generated
# headers can never drift from the file actually read
try:
    defines = open(sources, "r")
except IOError:
    print("Missing %s, aborting ..." % sources)
    sys.exit(1)
#
# The lines in the .def file have three types:-
# name: Defines a new function block
# ur: Defines individual or ranges of unicode values
# end: Indicates the end of the function block
#
# These lines are processed below.
#
class ValidationError(Exception):
    """Raised when the definition file holds a malformed or duplicate entry."""


def _parseValue(text):
    """Convert one field to an int.

    Accepts a '0x' prefixed hex number, a quoted character such as 'a'
    (its ordinal is used), or a plain decimal number.
    """
    if text[0:2] == '0x':
        return int(text[2:], 16)
    if text[0] == "'":
        return ord(text[1])
    return int(text)


for line in defines:
    # ignore blank lines, or lines beginning with '#'
    if line[0] == '#':
        continue
    line = line.strip()
    if line == '':
        continue
    # split line into space-separated fields, then split on type
    try:
        fields = line.split(' ')
        #
        # name line:
        #   validate any previous function block already ended
        #   validate this function not already defined
        #   initialize an entry in the function dictionary
        #       including a mask table with no values yet defined
        #
        if fields[0] == 'name':
            if state != 0:
                # the offending line is skipped entirely: 'name' must keep
                # pointing at the block which is still open
                print("'name' %s found before previous name completed"
                      % (fields[1]))
                continue
            name = fields[1]
            state = 1
            if name in Functs:
                print("name '%s' already present - may give wrong results"
                      % (name))
            else:
                # dict entry with two list elements (chdata, rangedata)
                Functs[name] = [[0] * 256, []]
        #
        # end line:
        #   validate there was a preceding function name line
        #   set state to show no current function active
        #
        elif fields[0] == 'end':
            if state == 0:
                print("'end' found outside of function block")
                continue
            state = 0
        #
        # ur line:
        #   validate function has been defined
        #   process remaining fields on the line, which may be either
        #       individual unicode values or ranges of values
        #
        elif fields[0] == 'ur':
            if state != 1:
                raise ValidationError("'ur' found outside of 'name' block")
            for el in fields[1:]:
                pos = el.find('..')
                # pos <= 0 means not a range, so must be individual value
                if pos <= 0:
                    value = _parseValue(el)
                    if value < 0 or value > 0x1fffff:
                        raise ValidationError(
                            'Illegal value (%s) in ch for name %s'
                            % (el, name))
                    # for ur we have only ranges (makes things simpler),
                    # so convert val to range
                    currange = (value, value)
                # pos > 0 means this is a range, so isolate/validate
                # the interval
                else:
                    # split the range into its first-val, last-val
                    (first, last) = el.split("..")
                    start = _parseValue(first)
                    end = _parseValue(last)
                    if start < 0 or end > 0x1fffff or start > end:
                        raise ValidationError("Invalid range '%s'" % el)
                    currange = (start, end)
                # common path - 'currange' has the range, now take care of it
                # We split on single-byte values vs. multibyte
                if currange[1] < 0x100:         # single-byte
                    for ch in range(currange[0], currange[1] + 1):
                        # validate that value not previously defined
                        if Functs[name][0][ch]:
                            raise ValidationError(
                                "Duplicate ch value '%s' for name '%s'"
                                % (el, name))
                        Functs[name][0][ch] = 1
                else:                           # multi-byte
                    if currange in Functs[name][1]:
                        raise ValidationError(
                            "range already defined in function")
                    Functs[name][1].append(currange)
    except Exception:
        # report which input line was at fault, then abort with the
        # original traceback
        print("Failed to process line: %s" % (line))
        raise

# the definition file is fully consumed; release the handle
defines.close()
#
# At this point, the entire definition file has been processed.  Now we
# enter the output phase, where we generate the two files chvalid.c and
# chvalid.h
#
# To do this, we first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results
# of the above processing), and finally output closing 'static' data
# (e.g. the subroutine to process the ranges)
#
#
# Generate the headings:
#
# the public header receives the macros and prototypes
try:
    header = open("include/libxml/chvalid.h", "w")
except IOError:
    print("Failed to open include/libxml/chvalid.h")
    sys.exit(1)

# the C module receives the lookup tables and function bodies
try:
    output = open("chvalid.c", "w")
except IOError:
    print("Failed to open chvalid.c")
    header.close()      # don't leave the header handle dangling on abort
    sys.exit(1)
# timestamp recorded in both generated files (asctime() with no argument
# formats the current local time)
date = time.asctime()

# fixed opening part of the public header: banner, include guard, the
# range structures and the prototype of the range search routine
header.write(
"""/*
 * Summary: Unicode character range checking
 * Description: this module exports interfaces for the character
 *               range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * Author: William Brack <wbrack@mmm.com.hk>
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#include <libxml/xmlversion.h>
#include <libxml/xmlstring.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short      low;
    unsigned short      high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
    unsigned int        low;
    unsigned int        high;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int                 nbShortRange;
    int                 nbLongRange;
    const xmlChSRange   *shortRange;    /* points to an array of ranges */
    const xmlChLRange   *longRange;
};

/**
 * Range checking routine
 */
XMLPUBFUN int XMLCALL
                xmlCharInRange(unsigned int val, const xmlChRangeGroup *group);

""" % (date, sources))

# fixed opening part of the C module: banner and includes
output.write(
"""/*
 * chvalid.c:   this module implements the character range
 *              validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#define IN_LIBXML
#include "libxml.h"
#include <libxml/chvalid.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group.  Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters.  If the table byte is set, the character is
 * allowed.
 *
 */
""" % (date, sources))
#
# Now output the generated data.
# We try to produce the best execution times. Tests have shown that validation
# with direct table lookup is, when there are a "small" number of valid items,
# still not as fast as a sequence of inline compares. So, if the single-byte
# portion of a range has a "small" number of ranges, we output a macro for inline
# compares, otherwise we output a 256-byte table and a macro to use it.
#
def _macroDoc(symbol):
    """Return the doc comment placed in the header above macro *symbol*."""
    return """
/**
 * %s:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % symbol


def _tabPad(text):
    """Return the tabs which carry *text* out to column 33 (8-column stops).

    Floor division is required: a float count would break on Python 3.
    """
    return "\t" * max(0, 4 - len(text) // 8)


def _rangeTest(rg, byteWide=False):
    """Return the C expression testing (c) against the range tuple *rg*.

    When *byteWide* is set the value is known to be below 0x100, so a range
    ending at 0xff needs no upper-bound comparison.
    """
    if rg[0] == rg[1]:                  # single value - check equal
        return "((c) == 0x%x)" % rg[0]
    if byteWide and rg[1] == 0xff:
        return " (0x%x <= (c))" % rg[0]
    return "((0x%x <= (c)) && ((c) <= 0x%x))" % (rg[0], rg[1])


# separator placed between the alternatives of an inline macro
_OR_CONT = " || \\\n\t\t\t\t "

fkeys = sorted(Functs)          # names of all defined functions, in order

for f in fkeys:
    chmap = Functs[f][0]        # 256 flags for the single-byte values
    longRanges = Functs[f][1]   # (start, end) tuples for values >= 0x100
    hasBytes = max(chmap) > 0

    # First we convert the specified single-byte values into a group of
    # ranges.  If the total number of such ranges is less than minTableSize,
    # we generate an inline macro for direct comparisons; otherwise we
    # generate a lookup table.
    if hasBytes:                # only check if at least one entry
        rangeTable = makeRange(chmap)
        if len(rangeTable) >= minTableSize:     # table is worthwhile
            header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
            header.write(_macroDoc("%s_ch" % f))
            header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

            # write the constant data to the code file
            output.write("const unsigned char %s_tab[256] = {\n" % f)
            pline = "   "
            for n in range(255):
                pline += " 0x%02x," % chmap[n]
                if len(pline) > 72:
                    output.write(pline + "\n")
                    pline = "   "
            output.write(pline + " 0x%02x };\n\n" % chmap[255])
        else:                   # inline check is used
            # first another little optimisation - if space is present,
            # put it at the front of the list so it is checked first
            space = (0x20, 0x20)
            if space in rangeTable:
                rangeTable.remove(space)
                rangeTable.insert(0, space)

            header.write(_macroDoc("%s_ch" % f))
            pline = "#define %s_ch(c)" % f
            pline += _tabPad(pline) + "("
            pline += _OR_CONT.join([_rangeTest(rg, True) for rg in rangeTable])
            header.write(pline + ")\n")

    # the 'Q' macro handles the full range: single-byte values go through
    # the _ch macro above, larger ones through inline tests or a search
    header.write(_macroDoc("%sQ" % f))
    pline = "#define %sQ(c)" % f
    header.write(pline + _tabPad(pline) + "(((c) < 0x100) ? \\\n\t\t\t\t ")
    if hasBytes:
        header.write("%s_ch((c)) :" % f)
    else:
        header.write("0 :")

    if not longRanges:
        # if no ranges defined, value invalid if >= 0x100
        header.write(" 0)\n\n")
    elif len(longRanges) >= minTableSize:
        header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n" % f)
    else:                       # if < minTableSize, generate inline code
        tests = _OR_CONT.join([_rangeTest(rg) for rg in longRanges])
        header.write("\\\n\t\t\t\t(" + tests + "))\n\n")

    if longRanges:
        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
2003-10-11 19:22:13 +04:00
#
# Next we do the unicode ranges
#
def _writeRangeArray(out, ctype, symbol, ranges):
    """Write a static C array *symbol* of *ctype* holding *ranges* to *out*.

    Entries are packed several to a line; a new line is started once the
    current one has grown past 60 characters.
    """
    pline = "static const %s %s[] = { " % (ctype, symbol)
    first = True
    for rg in ranges:
        if not first:
            pline += ", "
        first = False
        if len(pline) > 60:
            out.write(pline + "\n")
            pline = "    "
        pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
    out.write(pline + "};\n")


for f in fkeys:
    rangeTable = Functs[f][1]
    if len(rangeTable) == 0:    # only generate if unicode ranges present
        continue
    rangeTable.sort()           # ascending tuple sequence

    # Ranges ending below 0x10000 fit the 'short' structure, the others
    # need the 'long' one.  Partitioning up front keeps both arrays intact
    # even if a long range happens to sort before a short one.
    shortRanges = [rg for rg in rangeTable if rg[1] < 0x10000]
    longRanges = [rg for rg in rangeTable if rg[1] >= 0x10000]

    if shortRanges:
        _writeRangeArray(output, "xmlChSRange", "%s_srng" % f, shortRanges)
        shortRef = "%s_srng" % f
    else:
        shortRef = "(xmlChSRangePtr)0"
    if longRanges:
        _writeRangeArray(output, "xmlChLRange", "%s_lrng" % f, longRanges)
        longRef = "%s_lrng" % f
    else:
        longRef = "(xmlChLRangePtr)0"

    # the group structure ties both arrays together for xmlCharInRange
    output.write("const xmlChRangeGroup %sGroup =\n\t{%d, %d, %s, %s};\n\n"
                 % (f, len(shortRanges), len(longRanges), shortRef, longRef))
# fixed C code: the routine which binary-searches a range group.  Values
# below 0x10000 are looked up in the 'short' array, all others in the
# 'long' array.
output.write(
"""
/**
 * xmlCharInRange:
 * @val: character to be validated
 * @rptr: pointer to range to be used to validate
 *
 * Does a binary search of the range table to determine if char
 * is valid
 *
 * Returns: true if character valid, false otherwise
 */
int
xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
    int low, high, mid;
    const xmlChSRange *sptr;
    const xmlChLRange *lptr;

    if (rptr == NULL) return(0);
    if (val < 0x10000) {        /* is val in 'short' or 'long'  array? */
        if (rptr->nbShortRange == 0)
            return 0;
        low = 0;
        high = rptr->nbShortRange - 1;
        sptr = rptr->shortRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if ((unsigned short) val < sptr[mid].low) {
                high = mid - 1;
            } else {
                if ((unsigned short) val > sptr[mid].high) {
                    low = mid + 1;
                } else {
                    return 1;
                }
            }
        }
    } else {
        if (rptr->nbLongRange == 0) {
            return 0;
        }
        low = 0;
        high = rptr->nbLongRange - 1;
        lptr = rptr->longRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if (val < lptr[mid].low) {
                high = mid - 1;
            } else {
                if (val > lptr[mid].high) {
                    low = mid + 1;
                } else {
                    return 1;
                }
            }
        }
    }
    return 0;
}

""")
2003-10-18 08:53:14 +04:00
#
# finally, generate the ABI compatibility functions
#
# For every defined name emit a real function with the original name which
# simply instantiates the 'Q' macro, plus its prototype in the header.
for f in fkeys:
    # a _ch macro only exists when single-byte values were defined
    if max(Functs[f][0]) > 0:
        advice = " * Use %s_ch or %sQ instead" % (f, f)
    else:
        advice = " * Use %sQ instead" % f
    output.write("""
/**
 * %s:
 * @ch: character to validate
 *
 * This function is DEPRECATED.
%s
 *
 * Returns true if argument valid, false otherwise
 */
""" % (f, advice))
    output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n"
                 % (f, f))
    header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f)
#
# Run complete - write trailers and close the output files
#
# close the extern "C" block and the include guard opened in the prologue
header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
""")
header.close()

# trailer expected at the bottom of the generated C module
output.write("""#define bottom_chvalid
#include "elfgcchack.h"
""")
output.close()
2003-10-18 08:53:14 +04:00