2019-05-29 17:18:08 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2019-04-25 20:45:46 +03:00
/*
* Copyright ( c ) 2014 SGI .
* All rights reserved .
*/
# include "utf8n.h"
2021-09-15 10:00:05 +03:00
int utf8version_is_supported ( const struct unicode_map * um , unsigned int version )
2019-04-25 20:45:46 +03:00
{
2021-09-15 10:00:05 +03:00
int i = um - > tables - > utf8agetab_size - 1 ;
2019-04-25 20:45:46 +03:00
2021-09-15 10:00:05 +03:00
while ( i > = 0 & & um - > tables - > utf8agetab [ i ] ! = 0 ) {
if ( version = = um - > tables - > utf8agetab [ i ] )
2019-04-25 20:45:46 +03:00
return 1 ;
i - - ;
}
return 0 ;
}
/*
* UTF - 8 valid ranges .
*
* The UTF - 8 encoding spreads the bits of a 32 bit word over several
* bytes . This table gives the ranges that can be held and how they ' d
* be represented .
*
* 0x00000000 0x0000007F : 0 xxxxxxx
* 0x00000000 0x000007FF : 110 xxxxx 10 xxxxxx
* 0x00000000 0x0000FFFF : 1110 xxxx 10 xxxxxx 10 xxxxxx
* 0x00000000 0x001FFFFF : 11110 xxx 10 xxxxxx 10 xxxxxx 10 xxxxxx
* 0x00000000 0x03FFFFFF : 111110 xx 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx
* 0x00000000 0x7FFFFFFF : 1111110 x 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx
*
* There is an additional requirement on UTF - 8 , in that only the
* shortest representation of a 32 bit value is to be used . A decoder
* must not decode sequences that do not satisfy this requirement .
* Thus the allowed ranges have a lower bound .
*
* 0x00000000 0x0000007F : 0 xxxxxxx
* 0x00000080 0x000007FF : 110 xxxxx 10 xxxxxx
* 0x00000800 0x0000FFFF : 1110 xxxx 10 xxxxxx 10 xxxxxx
* 0x00010000 0x001FFFFF : 11110 xxx 10 xxxxxx 10 xxxxxx 10 xxxxxx
* 0x00200000 0x03FFFFFF : 111110 xx 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx
* 0x04000000 0x7FFFFFFF : 1111110 x 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx
*
* Actual unicode characters are limited to the range 0x0 - 0x10FFFF ,
* 17 planes of 65536 values . This limits the sequences actually seen
* even more , to just the following .
*
* 0 - 0x7F : 0 - 0x7F
* 0x80 - 0x7FF : 0xC2 0x80 - 0xDF 0xBF
* 0x800 - 0xFFFF : 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
* 0x10000 - 0x10FFFF : 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
*
* Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed .
*
* Note that the longest sequence seen with valid usage is 4 bytes ,
* the same a single UTF - 32 character . This makes the UTF - 8
* representation of Unicode strictly smaller than UTF - 32.
*
* The shortest sequence requirement was introduced by :
* Corrigendum # 1 : UTF - 8 Shortest Form
* It can be found here :
* http : //www.unicode.org/versions/corrigendum1.html
*
*/
/*
* Return the number of bytes used by the current UTF - 8 sequence .
* Assumes the input points to the first byte of a valid UTF - 8
* sequence .
*/
static inline int utf8clen ( const char * s )
{
unsigned char c = * s ;
return 1 + ( c > = 0xC0 ) + ( c > = 0xE0 ) + ( c > = 0xF0 ) ;
}
2019-04-25 20:49:18 +03:00
/*
* Decode a 3 - byte UTF - 8 sequence .
*/
static unsigned int
utf8decode3 ( const char * str )
{
unsigned int uc ;
uc = * str + + & 0x0F ;
uc < < = 6 ;
uc | = * str + + & 0x3F ;
uc < < = 6 ;
uc | = * str + + & 0x3F ;
return uc ;
}
/*
* Encode a 3 - byte UTF - 8 sequence .
*/
static int
utf8encode3 ( char * str , unsigned int val )
{
str [ 2 ] = ( val & 0x3F ) | 0x80 ;
val > > = 6 ;
str [ 1 ] = ( val & 0x3F ) | 0x80 ;
val > > = 6 ;
str [ 0 ] = val | 0xE0 ;
return 3 ;
}
2019-04-25 20:45:46 +03:00
/*
* utf8trie_t
*
* A compact binary tree , used to decode UTF - 8 characters .
*
* Internal nodes are one byte for the node itself , and up to three
* bytes for an offset into the tree . The first byte contains the
* following information :
* NEXTBYTE - flag - advance to next byte if set
* BITNUM - 3 bit field - the bit number to tested
* OFFLEN - 2 bit field - number of bytes in the offset
* if offlen = = 0 ( non - branching node )
* RIGHTPATH - 1 bit field - set if the following node is for the
* right - hand path ( tested bit is set )
* TRIENODE - 1 bit field - set if the following node is an internal
* node , otherwise it is a leaf node
* if offlen ! = 0 ( branching node )
* LEFTNODE - 1 bit field - set if the left - hand node is internal
* RIGHTNODE - 1 bit field - set if the right - hand node is internal
*
* Due to the way utf8 works , there cannot be branching nodes with
* NEXTBYTE set , and moreover those nodes always have a righthand
* descendant .
*/
typedef const unsigned char utf8trie_t ;
# define BITNUM 0x07
# define NEXTBYTE 0x08
# define OFFLEN 0x30
# define OFFLEN_SHIFT 4
# define RIGHTPATH 0x40
# define TRIENODE 0x80
# define RIGHTNODE 0x40
# define LEFTNODE 0x80
/*
* utf8leaf_t
*
* The leaves of the trie are embedded in the trie , and so the same
* underlying datatype : unsigned char .
*
* leaf [ 0 ] : The unicode version , stored as a generation number that is
2021-09-15 10:00:05 +03:00
* an index into - > utf8agetab [ ] . With this we can filter code
2019-04-25 20:45:46 +03:00
* points based on the unicode version in which they were
* defined . The CCC of a non - defined code point is 0.
* leaf [ 1 ] : Canonical Combining Class . During normalization , we need
* to do a stable sort into ascending order of all characters
* with a non - zero CCC that occur between two characters with
* a CCC of 0 , or at the begin or end of a string .
* The unicode standard guarantees that all CCC values are
* between 0 and 254 inclusive , which leaves 255 available as
* a special value .
* Code points with CCC 0 are known as stoppers .
* leaf [ 2 ] : Decomposition . If leaf [ 1 ] = = 255 , then leaf [ 2 ] is the
* start of a NUL - terminated string that is the decomposition
* of the character .
* The CCC of a decomposable character is the same as the CCC
* of the first character of its decomposition .
* Some characters decompose as the empty string : these are
* characters with the Default_Ignorable_Code_Point property .
* These do affect normalization , as they all have CCC 0.
*
2019-04-25 20:49:18 +03:00
* The decompositions in the trie have been fully expanded , with the
* exception of Hangul syllables , which are decomposed algorithmically .
2019-04-25 20:45:46 +03:00
*
* Casefolding , if applicable , is also done using decompositions .
*
* The trie is constructed in such a way that leaves exist for all
* UTF - 8 sequences that match the criteria from the " UTF-8 valid
* ranges " comment above, and only for those sequences. Therefore a
* lookup in the trie can be used to validate the UTF - 8 input .
*/
typedef const unsigned char utf8leaf_t ;
# define LEAF_GEN(LEAF) ((LEAF)[0])
# define LEAF_CCC(LEAF) ((LEAF)[1])
# define LEAF_STR(LEAF) ((const char *)((LEAF) + 2))
# define MINCCC (0)
# define MAXCCC (254)
# define STOPPER (0)
# define DECOMPOSE (255)
2019-04-25 20:49:18 +03:00
/* Marker for hangul syllable decomposition. */
# define HANGUL ((char)(255))
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
# define UTF8HANGULLEAF (12)
/*
* Hangul decomposition ( algorithm from Section 3.12 of Unicode 6.3 .0 )
*
* AC00 ; < Hangul Syllable , First > ; Lo ; 0 ; L ; ; ; ; ; N ; ; ; ; ;
* D7A3 ; < Hangul Syllable , Last > ; Lo ; 0 ; L ; ; ; ; ; N ; ; ; ; ;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 ( VCount * TCount )
* SCount = 11172 ( LCount * NCount )
*
* Decomposition :
* SIndex = s - SBase
*
* LV ( Canonical / Full )
* LIndex = SIndex / NCount
* VIndex = ( Sindex % NCount ) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT ( Canonical )
* LVIndex = ( SIndex / TCount ) * TCount
* TIndex = ( Sindex % TCount )
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT ( Full )
* LIndex = SIndex / NCount
* VIndex = ( Sindex % NCount ) / TCount
* TIndex = ( Sindex % TCount )
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if ( TIndex = = 0 ) {
* d = < LPart , VPart >
* } else {
* TPart = TBase + TIndex
* d = < LPart , TPart , VPart >
* }
*/
/* Constants */
# define SB (0xAC00)
# define LB (0x1100)
# define VB (0x1161)
# define TB (0x11A7)
# define LC (19)
# define VC (21)
# define TC (28)
# define NC (VC * TC)
# define SC (LC * NC)
/* Algorithmic decomposition of hangul syllable. */
static utf8leaf_t *
utf8hangul ( const char * str , unsigned char * hangul )
{
unsigned int si ;
unsigned int li ;
unsigned int vi ;
unsigned int ti ;
unsigned char * h ;
/* Calculate the SI, LI, VI, and TI values. */
si = utf8decode3 ( str ) - SB ;
li = si / NC ;
vi = ( si % NC ) / TC ;
ti = si % TC ;
/* Fill in base of leaf. */
h = hangul ;
LEAF_GEN ( h ) = 2 ;
LEAF_CCC ( h ) = DECOMPOSE ;
h + = 2 ;
/* Add LPart, a 3-byte UTF-8 sequence. */
h + = utf8encode3 ( ( char * ) h , li + LB ) ;
/* Add VPart, a 3-byte UTF-8 sequence. */
h + = utf8encode3 ( ( char * ) h , vi + VB ) ;
/* Add TPart if required, also a 3-byte UTF-8 sequence. */
if ( ti )
h + = utf8encode3 ( ( char * ) h , ti + TB ) ;
/* Terminate string. */
h [ 0 ] = ' \0 ' ;
return hangul ;
}
2019-04-25 20:45:46 +03:00
/*
* Use trie to scan s , touching at most len bytes .
* Returns the leaf if one exists , NULL otherwise .
*
* A non - NULL return guarantees that the UTF - 8 sequence starting at s
* is well - formed and corresponds to a known unicode code point . The
* shorthand for this will be " is valid UTF-8 unicode " .
*/
2021-09-15 10:00:04 +03:00
static utf8leaf_t * utf8nlookup ( const struct unicode_map * um ,
enum utf8_normalization n , unsigned char * hangul , const char * s ,
size_t len )
2019-04-25 20:45:46 +03:00
{
2021-09-15 10:00:05 +03:00
utf8trie_t * trie = um - > tables - > utf8data + um - > ntab [ n ] - > offset ;
2019-04-25 20:45:46 +03:00
int offlen ;
int offset ;
int mask ;
int node ;
if ( len = = 0 )
return NULL ;
node = 1 ;
while ( node ) {
offlen = ( * trie & OFFLEN ) > > OFFLEN_SHIFT ;
if ( * trie & NEXTBYTE ) {
if ( - - len = = 0 )
return NULL ;
s + + ;
}
mask = 1 < < ( * trie & BITNUM ) ;
if ( * s & mask ) {
/* Right leg */
if ( offlen ) {
/* Right node at offset of trie */
node = ( * trie & RIGHTNODE ) ;
offset = trie [ offlen ] ;
while ( - - offlen ) {
offset < < = 8 ;
offset | = trie [ offlen ] ;
}
trie + = offset ;
} else if ( * trie & RIGHTPATH ) {
/* Right node after this node */
node = ( * trie & TRIENODE ) ;
trie + + ;
} else {
/* No right node. */
2019-04-25 20:49:18 +03:00
return NULL ;
2019-04-25 20:45:46 +03:00
}
} else {
/* Left leg */
if ( offlen ) {
/* Left node after this node. */
node = ( * trie & LEFTNODE ) ;
trie + = offlen + 1 ;
} else if ( * trie & RIGHTPATH ) {
/* No left node. */
2019-04-25 20:49:18 +03:00
return NULL ;
2019-04-25 20:45:46 +03:00
} else {
/* Left node after this node */
node = ( * trie & TRIENODE ) ;
trie + + ;
}
}
}
2019-04-25 20:49:18 +03:00
/*
* Hangul decomposition is done algorithmically . These are the
* codepoints > = 0xAC00 and < = 0xD7A3 . Their UTF - 8 encoding is
* always 3 bytes long , so s has been advanced twice , and the
* start of the sequence is at s - 2.
*/
if ( LEAF_CCC ( trie ) = = DECOMPOSE & & LEAF_STR ( trie ) [ 0 ] = = HANGUL )
trie = utf8hangul ( s - 2 , hangul ) ;
2019-04-25 20:45:46 +03:00
return trie ;
}
/*
* Use trie to scan s .
* Returns the leaf if one exists , NULL otherwise .
*
* Forwards to utf8nlookup ( ) .
*/
2021-09-15 10:00:04 +03:00
static utf8leaf_t * utf8lookup ( const struct unicode_map * um ,
enum utf8_normalization n , unsigned char * hangul , const char * s )
2019-04-25 20:45:46 +03:00
{
2021-09-15 10:00:04 +03:00
return utf8nlookup ( um , n , hangul , s , ( size_t ) - 1 ) ;
2019-04-25 20:45:46 +03:00
}
/*
* Length of the normalization of s , touch at most len bytes .
* Return - 1 if s is not valid UTF - 8 unicode .
*/
2021-09-15 10:00:04 +03:00
ssize_t utf8nlen ( const struct unicode_map * um , enum utf8_normalization n ,
const char * s , size_t len )
2019-04-25 20:45:46 +03:00
{
utf8leaf_t * leaf ;
size_t ret = 0 ;
2019-04-25 20:49:18 +03:00
unsigned char hangul [ UTF8HANGULLEAF ] ;
2019-04-25 20:45:46 +03:00
while ( len & & * s ) {
2021-09-15 10:00:04 +03:00
leaf = utf8nlookup ( um , n , hangul , s , len ) ;
2019-04-25 20:45:46 +03:00
if ( ! leaf )
return - 1 ;
2021-09-15 10:00:05 +03:00
if ( um - > tables - > utf8agetab [ LEAF_GEN ( leaf ) ] >
um - > ntab [ n ] - > maxage )
2019-04-25 20:45:46 +03:00
ret + = utf8clen ( s ) ;
else if ( LEAF_CCC ( leaf ) = = DECOMPOSE )
ret + = strlen ( LEAF_STR ( leaf ) ) ;
else
ret + = utf8clen ( s ) ;
len - = utf8clen ( s ) ;
s + = utf8clen ( s ) ;
}
return ret ;
}
/*
* Set up an utf8cursor for use by utf8byte ( ) .
*
* u8c : pointer to cursor .
* data : const struct utf8data to use for normalization .
* s : string .
* len : length of s .
*
* Returns - 1 on error , 0 on success .
*/
2021-09-15 10:00:04 +03:00
int utf8ncursor ( struct utf8cursor * u8c , const struct unicode_map * um ,
enum utf8_normalization n , const char * s , size_t len )
2019-04-25 20:45:46 +03:00
{
if ( ! s )
return - 1 ;
2021-09-15 10:00:04 +03:00
u8c - > um = um ;
u8c - > n = n ;
2019-04-25 20:45:46 +03:00
u8c - > s = s ;
u8c - > p = NULL ;
u8c - > ss = NULL ;
u8c - > sp = NULL ;
u8c - > len = len ;
u8c - > slen = 0 ;
u8c - > ccc = STOPPER ;
u8c - > nccc = STOPPER ;
/* Check we didn't clobber the maximum length. */
if ( u8c - > len ! = len )
return - 1 ;
/* The first byte of s may not be an utf8 continuation. */
if ( len > 0 & & ( * s & 0xC0 ) = = 0x80 )
return - 1 ;
return 0 ;
}
/*
* Get one byte from the normalized form of the string described by u8c .
*
* Returns the byte cast to an unsigned char on succes , and - 1 on failure .
*
* The cursor keeps track of the location in the string in u8c - > s .
* When a character is decomposed , the current location is stored in
* u8c - > p , and u8c - > s is set to the start of the decomposition . Note
* that bytes from a decomposition do not count against u8c - > len .
*
* Characters are emitted if they match the current CCC in u8c - > ccc .
* Hitting end - of - string while u8c - > ccc = = STOPPER means we ' re done ,
* and the function returns 0 in that case .
*
* Sorting by CCC is done by repeatedly scanning the string . The
* values of u8c - > s and u8c - > p are stored in u8c - > ss and u8c - > sp at
* the start of the scan . The first pass finds the lowest CCC to be
* emitted and stores it in u8c - > nccc , the second pass emits the
* characters with this CCC and finds the next lowest CCC . This limits
* the number of passes to 1 + the number of different CCCs in the
* sequence being scanned .
*
* Therefore :
* u8c - > p ! = NULL - > a decomposition is being scanned .
* u8c - > ss ! = NULL - > this is a repeating scan .
* u8c - > ccc = = - 1 - > this is the first scan of a repeating scan .
*/
int utf8byte ( struct utf8cursor * u8c )
{
utf8leaf_t * leaf ;
int ccc ;
for ( ; ; ) {
/* Check for the end of a decomposed character. */
if ( u8c - > p & & * u8c - > s = = ' \0 ' ) {
u8c - > s = u8c - > p ;
u8c - > p = NULL ;
}
/* Check for end-of-string. */
if ( ! u8c - > p & & ( u8c - > len = = 0 | | * u8c - > s = = ' \0 ' ) ) {
/* There is no next byte. */
if ( u8c - > ccc = = STOPPER )
return 0 ;
/* End-of-string during a scan counts as a stopper. */
ccc = STOPPER ;
goto ccc_mismatch ;
} else if ( ( * u8c - > s & 0xC0 ) = = 0x80 ) {
/* This is a continuation of the current character. */
if ( ! u8c - > p )
u8c - > len - - ;
return ( unsigned char ) * u8c - > s + + ;
}
/* Look up the data for the current character. */
2019-04-25 20:49:18 +03:00
if ( u8c - > p ) {
2021-09-15 10:00:04 +03:00
leaf = utf8lookup ( u8c - > um , u8c - > n , u8c - > hangul , u8c - > s ) ;
2019-04-25 20:49:18 +03:00
} else {
2021-09-15 10:00:04 +03:00
leaf = utf8nlookup ( u8c - > um , u8c - > n , u8c - > hangul ,
2019-04-25 20:49:18 +03:00
u8c - > s , u8c - > len ) ;
}
2019-04-25 20:45:46 +03:00
/* No leaf found implies that the input is a binary blob. */
if ( ! leaf )
return - 1 ;
ccc = LEAF_CCC ( leaf ) ;
/* Characters that are too new have CCC 0. */
2021-09-15 10:00:05 +03:00
if ( u8c - > um - > tables - > utf8agetab [ LEAF_GEN ( leaf ) ] >
2021-09-15 10:00:04 +03:00
u8c - > um - > ntab [ u8c - > n ] - > maxage ) {
2019-04-25 20:45:46 +03:00
ccc = STOPPER ;
} else if ( ccc = = DECOMPOSE ) {
u8c - > len - = utf8clen ( u8c - > s ) ;
u8c - > p = u8c - > s + utf8clen ( u8c - > s ) ;
u8c - > s = LEAF_STR ( leaf ) ;
/* Empty decomposition implies CCC 0. */
if ( * u8c - > s = = ' \0 ' ) {
if ( u8c - > ccc = = STOPPER )
continue ;
ccc = STOPPER ;
goto ccc_mismatch ;
}
2019-04-25 20:49:18 +03:00
2021-09-15 10:00:04 +03:00
leaf = utf8lookup ( u8c - > um , u8c - > n , u8c - > hangul , u8c - > s ) ;
2019-05-12 11:56:51 +03:00
if ( ! leaf )
return - 1 ;
2019-04-25 20:49:18 +03:00
ccc = LEAF_CCC ( leaf ) ;
2019-04-25 20:45:46 +03:00
}
/*
* If this is not a stopper , then see if it updates
* the next canonical class to be emitted .
*/
if ( ccc ! = STOPPER & & u8c - > ccc < ccc & & ccc < u8c - > nccc )
u8c - > nccc = ccc ;
/*
* Return the current byte if this is the current
* combining class .
*/
if ( ccc = = u8c - > ccc ) {
if ( ! u8c - > p )
u8c - > len - - ;
return ( unsigned char ) * u8c - > s + + ;
}
/* Current combining class mismatch. */
ccc_mismatch :
if ( u8c - > nccc = = STOPPER ) {
/*
* Scan forward for the first canonical class
* to be emitted . Save the position from
* which to restart .
*/
u8c - > ccc = MINCCC - 1 ;
u8c - > nccc = ccc ;
u8c - > sp = u8c - > p ;
u8c - > ss = u8c - > s ;
u8c - > slen = u8c - > len ;
if ( ! u8c - > p )
u8c - > len - = utf8clen ( u8c - > s ) ;
u8c - > s + = utf8clen ( u8c - > s ) ;
} else if ( ccc ! = STOPPER ) {
/* Not a stopper, and not the ccc we're emitting. */
if ( ! u8c - > p )
u8c - > len - = utf8clen ( u8c - > s ) ;
u8c - > s + = utf8clen ( u8c - > s ) ;
} else if ( u8c - > nccc ! = MAXCCC + 1 ) {
/* At a stopper, restart for next ccc. */
u8c - > ccc = u8c - > nccc ;
u8c - > nccc = MAXCCC + 1 ;
u8c - > s = u8c - > ss ;
u8c - > p = u8c - > sp ;
u8c - > len = u8c - > slen ;
} else {
/* All done, proceed from here. */
u8c - > ccc = STOPPER ;
u8c - > nccc = STOPPER ;
u8c - > sp = NULL ;
u8c - > ss = NULL ;
u8c - > slen = 0 ;
}
}
}
2021-09-15 10:00:06 +03:00
# ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
EXPORT_SYMBOL_GPL ( utf8version_is_supported ) ;
EXPORT_SYMBOL_GPL ( utf8nlen ) ;
EXPORT_SYMBOL_GPL ( utf8ncursor ) ;
EXPORT_SYMBOL_GPL ( utf8byte ) ;
# endif