linux/fs/unicode/mkutf8data.c

3420 lines
79 KiB
C
Raw Normal View History

unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
/*
* Copyright (c) 2014 SGI.
* All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Generator for a compact trie for unicode normalization */
#include <sys/types.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
/* Default names of the in- and output files. */
#define AGE_NAME "DerivedAge.txt"
#define CCC_NAME "DerivedCombiningClass.txt"
#define PROP_NAME "DerivedCoreProperties.txt"
#define DATA_NAME "UnicodeData.txt"
#define FOLD_NAME "CaseFolding.txt"
#define NORM_NAME "NormalizationCorrections.txt"
#define TEST_NAME "NormalizationTest.txt"
#define UTF8_NAME "utf8data.h"
const char *age_name = AGE_NAME;
const char *ccc_name = CCC_NAME;
const char *prop_name = PROP_NAME;
const char *data_name = DATA_NAME;
const char *fold_name = FOLD_NAME;
const char *norm_name = NORM_NAME;
const char *test_name = TEST_NAME;
const char *utf8_name = UTF8_NAME;
int verbose = 0;
/* An arbitrary line size limit on input lines. */
#define LINESIZE 1024
char line[LINESIZE];
char buf0[LINESIZE];
char buf1[LINESIZE];
char buf2[LINESIZE];
char buf3[LINESIZE];
const char *argv0;
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/* ------------------------------------------------------------------ */
/*
* Unicode version numbers consist of three parts: major, minor, and a
* revision. These numbers are packed into an unsigned int to obtain
* a single version number.
*
* To save space in the generated trie, the unicode version is not
* stored directly, instead we calculate a generation number from the
* unicode versions seen in the DerivedAge file, and use that as an
* index into a table of unicode versions.
*/
#define UNICODE_MAJ_SHIFT (16)
#define UNICODE_MIN_SHIFT (8)
#define UNICODE_MAJ_MAX ((unsigned short)-1)
#define UNICODE_MIN_MAX ((unsigned char)-1)
#define UNICODE_REV_MAX ((unsigned char)-1)
#define UNICODE_AGE(MAJ,MIN,REV) \
(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
((unsigned int)(REV)))
unsigned int *ages;
int ages_count;
unsigned int unicode_maxage;
static int age_valid(unsigned int major, unsigned int minor,
unsigned int revision)
{
if (major > UNICODE_MAJ_MAX)
return 0;
if (minor > UNICODE_MIN_MAX)
return 0;
if (revision > UNICODE_REV_MAX)
return 0;
return 1;
}
/* ------------------------------------------------------------------ */
/*
* utf8trie_t
*
* A compact binary tree, used to decode UTF-8 characters.
*
* Internal nodes are one byte for the node itself, and up to three
* bytes for an offset into the tree. The first byte contains the
* following information:
* NEXTBYTE - flag - advance to next byte if set
* BITNUM - 3 bit field - the bit number to tested
* OFFLEN - 2 bit field - number of bytes in the offset
* if offlen == 0 (non-branching node)
* RIGHTPATH - 1 bit field - set if the following node is for the
* right-hand path (tested bit is set)
* TRIENODE - 1 bit field - set if the following node is an internal
* node, otherwise it is a leaf node
* if offlen != 0 (branching node)
* LEFTNODE - 1 bit field - set if the left-hand node is internal
* RIGHTNODE - 1 bit field - set if the right-hand node is internal
*
* Due to the way utf8 works, there cannot be branching nodes with
* NEXTBYTE set, and moreover those nodes always have a righthand
* descendant.
*/
typedef unsigned char utf8trie_t;
#define BITNUM 0x07
#define NEXTBYTE 0x08
#define OFFLEN 0x30
#define OFFLEN_SHIFT 4
#define RIGHTPATH 0x40
#define TRIENODE 0x80
#define RIGHTNODE 0x40
#define LEFTNODE 0x80
/*
* utf8leaf_t
*
* The leaves of the trie are embedded in the trie, and so the same
* underlying datatype, unsigned char.
*
* leaf[0]: The unicode version, stored as a generation number that is
* an index into utf8agetab[]. With this we can filter code
* points based on the unicode version in which they were
* defined. The CCC of a non-defined code point is 0.
* leaf[1]: Canonical Combining Class. During normalization, we need
* to do a stable sort into ascending order of all characters
* with a non-zero CCC that occur between two characters with
* a CCC of 0, or at the begin or end of a string.
* The unicode standard guarantees that all CCC values are
* between 0 and 254 inclusive, which leaves 255 available as
* a special value.
* Code points with CCC 0 are known as stoppers.
* leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
* start of a NUL-terminated string that is the decomposition
* of the character.
* The CCC of a decomposable character is the same as the CCC
* of the first character of its decomposition.
* Some characters decompose as the empty string: these are
* characters with the Default_Ignorable_Code_Point property.
* These do affect normalization, as they all have CCC 0.
*
* The decompositions in the trie have been fully expanded.
*
* Casefolding, if applicable, is also done using decompositions.
*/
typedef unsigned char utf8leaf_t;
#define LEAF_GEN(LEAF) ((LEAF)[0])
#define LEAF_CCC(LEAF) ((LEAF)[1])
#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
#define MAXGEN (255)
#define MINCCC (0)
#define MAXCCC (254)
#define STOPPER (0)
#define DECOMPOSE (255)
#define HANGUL ((char)(255))
#define UTF8HANGULLEAF (12)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
struct tree;
static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
const char *, size_t);
static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
unsigned char *utf8data;
size_t utf8data_size;
utf8trie_t *nfdi;
utf8trie_t *nfdicf;
/* ------------------------------------------------------------------ */
/*
* UTF8 valid ranges.
*
* The UTF-8 encoding spreads the bits of a 32bit word over several
* bytes. This table gives the ranges that can be held and how they'd
* be represented.
*
* 0x00000000 0x0000007F: 0xxxxxxx
* 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
* 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* There is an additional requirement on UTF-8, in that only the
* shortest representation of a 32bit value is to be used. A decoder
* must not decode sequences that do not satisfy this requirement.
* Thus the allowed ranges have a lower bound.
*
* 0x00000000 0x0000007F: 0xxxxxxx
* 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
* 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
* 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
* 17 planes of 65536 values. This limits the sequences actually seen
* even more, to just the following.
*
* 0 - 0x7f: 0 0x7f
* 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
* 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
* 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
*
* Even within those ranges not all values are allowed: the surrogates
* 0xd800 - 0xdfff should never be seen.
*
* Note that the longest sequence seen with valid usage is 4 bytes,
* the same a single UTF-32 character. This makes the UTF-8
* representation of Unicode strictly smaller than UTF-32.
*
* The shortest sequence requirement was introduced by:
* Corrigendum #1: UTF-8 Shortest Form
* It can be found here:
* http://www.unicode.org/versions/corrigendum1.html
*
*/
#define UTF8_2_BITS 0xC0
#define UTF8_3_BITS 0xE0
#define UTF8_4_BITS 0xF0
#define UTF8_N_BITS 0x80
#define UTF8_2_MASK 0xE0
#define UTF8_3_MASK 0xF0
#define UTF8_4_MASK 0xF8
#define UTF8_N_MASK 0xC0
#define UTF8_V_MASK 0x3F
#define UTF8_V_SHIFT 6
static int utf8encode(char *str, unsigned int val)
{
int len;
if (val < 0x80) {
str[0] = val;
len = 1;
} else if (val < 0x800) {
str[1] = val & UTF8_V_MASK;
str[1] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[0] = val;
str[0] |= UTF8_2_BITS;
len = 2;
} else if (val < 0x10000) {
str[2] = val & UTF8_V_MASK;
str[2] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[1] = val & UTF8_V_MASK;
str[1] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[0] = val;
str[0] |= UTF8_3_BITS;
len = 3;
} else if (val < 0x110000) {
str[3] = val & UTF8_V_MASK;
str[3] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[2] = val & UTF8_V_MASK;
str[2] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[1] = val & UTF8_V_MASK;
str[1] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[0] = val;
str[0] |= UTF8_4_BITS;
len = 4;
} else {
printf("%#x: illegal val\n", val);
len = 0;
}
return len;
}
static unsigned int utf8decode(const char *str)
{
const unsigned char *s = (const unsigned char*)str;
unsigned int unichar = 0;
if (*s < 0x80) {
unichar = *s;
} else if (*s < UTF8_3_BITS) {
unichar = *s++ & 0x1F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s & 0x3F;
} else if (*s < UTF8_4_BITS) {
unichar = *s++ & 0x0F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s++ & 0x3F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s & 0x3F;
} else {
unichar = *s++ & 0x0F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s++ & 0x3F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s++ & 0x3F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s & 0x3F;
}
return unichar;
}
static int utf32valid(unsigned int unichar)
{
return unichar < 0x110000;
}
#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
#define NODE 1
#define LEAF 0
struct tree {
void *root;
int childnode;
const char *type;
unsigned int maxage;
struct tree *next;
int (*leaf_equal)(void *, void *);
void (*leaf_print)(void *, int);
int (*leaf_mark)(void *);
int (*leaf_size)(void *);
int *(*leaf_index)(struct tree *, void *);
unsigned char *(*leaf_emit)(void *, unsigned char *);
int leafindex[0x110000];
int index;
};
struct node {
int index;
int offset;
int mark;
int size;
struct node *parent;
void *left;
void *right;
unsigned char bitnum;
unsigned char nextbyte;
unsigned char leftnode;
unsigned char rightnode;
unsigned int keybits;
unsigned int keymask;
};
/*
* Example lookup function for a tree.
*/
static void *lookup(struct tree *tree, const char *key)
{
struct node *node;
void *leaf = NULL;
node = tree->root;
while (!leaf && node) {
if (node->nextbyte)
key++;
if (*key & (1 << (node->bitnum & 7))) {
/* Right leg */
if (node->rightnode == NODE) {
node = node->right;
} else if (node->rightnode == LEAF) {
leaf = node->right;
} else {
node = NULL;
}
} else {
/* Left leg */
if (node->leftnode == NODE) {
node = node->left;
} else if (node->leftnode == LEAF) {
leaf = node->left;
} else {
node = NULL;
}
}
}
return leaf;
}
/*
* A simple non-recursive tree walker: keep track of visits to the
* left and right branches in the leftmask and rightmask.
*/
static void tree_walk(struct tree *tree)
{
struct node *node;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int indent = 1;
int nodes, singletons, leaves;
nodes = singletons = leaves = 0;
printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
if (tree->childnode == LEAF) {
assert(tree->root);
tree->leaf_print(tree->root, indent);
leaves = 1;
} else {
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
printf("%*snode @ %p bitnum %d nextbyte %d"
" left %p right %p mask %x bits %x\n",
indent, "", node,
node->bitnum, node->nextbyte,
node->left, node->right,
node->keymask, node->keybits);
nodes += 1;
if (!(node->left && node->right))
singletons += 1;
while (node) {
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
tree->leaf_print(node->left,
indent+1);
leaves += 1;
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if ((rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
tree->leaf_print(node->right,
indent+1);
leaves += 1;
} else if (node->right) {
assert(node->rightnode == NODE);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
}
printf("nodes %d leaves %d singletons %d\n",
nodes, leaves, singletons);
}
/*
* Allocate an initialize a new internal node.
*/
static struct node *alloc_node(struct node *parent)
{
struct node *node;
int bitnum;
node = malloc(sizeof(*node));
node->left = node->right = NULL;
node->parent = parent;
node->leftnode = NODE;
node->rightnode = NODE;
node->keybits = 0;
node->keymask = 0;
node->mark = 0;
node->index = 0;
node->offset = -1;
node->size = 4;
if (node->parent) {
bitnum = parent->bitnum;
if ((bitnum & 7) == 0) {
node->bitnum = bitnum + 7 + 8;
node->nextbyte = 1;
} else {
node->bitnum = bitnum - 1;
node->nextbyte = 0;
}
} else {
node->bitnum = 7;
node->nextbyte = 0;
}
return node;
}
/*
* Insert a new leaf into the tree, and collapse any subtrees that are
* fully populated and end in identical leaves. A nextbyte tagged
* internal node will not be removed to preserve the tree's integrity.
* Note that due to the structure of utf8, no nextbyte tagged node
* will be a candidate for removal.
*/
static int insert(struct tree *tree, char *key, int keylen, void *leaf)
{
struct node *node;
struct node *parent;
void **cursor;
int keybits;
assert(keylen >= 1 && keylen <= 4);
node = NULL;
cursor = &tree->root;
keybits = 8 * keylen;
/* Insert, creating path along the way. */
while (keybits) {
if (!*cursor)
*cursor = alloc_node(node);
node = *cursor;
if (node->nextbyte)
key++;
if (*key & (1 << (node->bitnum & 7)))
cursor = &node->right;
else
cursor = &node->left;
keybits--;
}
*cursor = leaf;
/* Merge subtrees if possible. */
while (node) {
if (*key & (1 << (node->bitnum & 7)))
node->rightnode = LEAF;
else
node->leftnode = LEAF;
if (node->nextbyte)
break;
if (node->leftnode == NODE || node->rightnode == NODE)
break;
assert(node->left);
assert(node->right);
/* Compare */
if (! tree->leaf_equal(node->left, node->right))
break;
/* Keep left, drop right leaf. */
leaf = node->left;
/* Check in parent */
parent = node->parent;
if (!parent) {
/* root of tree! */
tree->root = leaf;
tree->childnode = LEAF;
} else if (parent->left == node) {
parent->left = leaf;
parent->leftnode = LEAF;
if (parent->right) {
parent->keymask = 0;
parent->keybits = 0;
} else {
parent->keymask |= (1 << node->bitnum);
}
} else if (parent->right == node) {
parent->right = leaf;
parent->rightnode = LEAF;
if (parent->left) {
parent->keymask = 0;
parent->keybits = 0;
} else {
parent->keymask |= (1 << node->bitnum);
parent->keybits |= (1 << node->bitnum);
}
} else {
/* internal tree error */
assert(0);
}
free(node);
node = parent;
}
/* Propagate keymasks up along singleton chains. */
while (node) {
parent = node->parent;
if (!parent)
break;
/* Nix the mask for parents with two children. */
if (node->keymask == 0) {
parent->keymask = 0;
parent->keybits = 0;
} else if (parent->left && parent->right) {
parent->keymask = 0;
parent->keybits = 0;
} else {
assert((parent->keymask & node->keymask) == 0);
parent->keymask |= node->keymask;
parent->keymask |= (1 << parent->bitnum);
parent->keybits |= node->keybits;
if (parent->right)
parent->keybits |= (1 << parent->bitnum);
}
node = parent;
}
return 0;
}
/*
* Prune internal nodes.
*
* Fully populated subtrees that end at the same leaf have already
* been collapsed. There are still internal nodes that have for both
* their left and right branches a sequence of singletons that make
* identical choices and end in identical leaves. The keymask and
* keybits collected in the nodes describe the choices made in these
* singleton chains. When they are identical for the left and right
* branch of a node, and the two leaves comare identical, the node in
* question can be removed.
*
* Note that nodes with the nextbyte tag set will not be removed by
* this to ensure tree integrity. Note as well that the structure of
* utf8 ensures that these nodes would not have been candidates for
* removal in any case.
*/
static void prune(struct tree *tree)
{
struct node *node;
struct node *left;
struct node *right;
struct node *parent;
void *leftleaf;
void *rightleaf;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int count;
if (verbose > 0)
printf("Pruning %s_%x\n", tree->type, tree->maxage);
count = 0;
if (tree->childnode == LEAF)
return;
if (!tree->root)
return;
leftmask = rightmask = 0;
node = tree->root;
while (node) {
if (node->nextbyte)
goto advance;
if (node->leftnode == LEAF)
goto advance;
if (node->rightnode == LEAF)
goto advance;
if (!node->left)
goto advance;
if (!node->right)
goto advance;
left = node->left;
right = node->right;
if (left->keymask == 0)
goto advance;
if (right->keymask == 0)
goto advance;
if (left->keymask != right->keymask)
goto advance;
if (left->keybits != right->keybits)
goto advance;
leftleaf = NULL;
while (!leftleaf) {
assert(left->left || left->right);
if (left->leftnode == LEAF)
leftleaf = left->left;
else if (left->rightnode == LEAF)
leftleaf = left->right;
else if (left->left)
left = left->left;
else if (left->right)
left = left->right;
else
assert(0);
}
rightleaf = NULL;
while (!rightleaf) {
assert(right->left || right->right);
if (right->leftnode == LEAF)
rightleaf = right->left;
else if (right->rightnode == LEAF)
rightleaf = right->right;
else if (right->left)
right = right->left;
else if (right->right)
right = right->right;
else
assert(0);
}
if (! tree->leaf_equal(leftleaf, rightleaf))
goto advance;
/*
* This node has identical singleton-only subtrees.
* Remove it.
*/
parent = node->parent;
left = node->left;
right = node->right;
if (parent->left == node)
parent->left = left;
else if (parent->right == node)
parent->right = left;
else
assert(0);
left->parent = parent;
left->keymask |= (1 << node->bitnum);
node->left = NULL;
while (node) {
bitmask = 1 << node->bitnum;
leftmask &= ~bitmask;
rightmask &= ~bitmask;
if (node->leftnode == NODE && node->left) {
left = node->left;
free(node);
count++;
node = left;
} else if (node->rightnode == NODE && node->right) {
right = node->right;
free(node);
count++;
node = right;
} else {
node = NULL;
}
}
/* Propagate keymasks up along singleton chains. */
node = parent;
/* Force re-check */
bitmask = 1 << node->bitnum;
leftmask &= ~bitmask;
rightmask &= ~bitmask;
for (;;) {
if (node->left && node->right)
break;
if (node->left) {
left = node->left;
node->keymask |= left->keymask;
node->keybits |= left->keybits;
}
if (node->right) {
right = node->right;
node->keymask |= right->keymask;
node->keybits |= right->keybits;
}
node->keymask |= (1 << node->bitnum);
node = node->parent;
/* Force re-check */
bitmask = 1 << node->bitnum;
leftmask &= ~bitmask;
rightmask &= ~bitmask;
}
advance:
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0 &&
node->leftnode == NODE &&
node->left) {
leftmask |= bitmask;
node = node->left;
} else if ((rightmask & bitmask) == 0 &&
node->rightnode == NODE &&
node->right) {
rightmask |= bitmask;
node = node->right;
} else {
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
}
}
if (verbose > 0)
printf("Pruned %d nodes\n", count);
}
/*
* Mark the nodes in the tree that lead to leaves that must be
* emitted.
*/
static void mark_nodes(struct tree *tree)
{
struct node *node;
struct node *n;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int marked;
marked = 0;
if (verbose > 0)
printf("Marking %s_%x\n", tree->type, tree->maxage);
if (tree->childnode == LEAF)
goto done;
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
if (tree->leaf_mark(node->left)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->left) {
assert(node->leftnode == NODE);
node = node->left;
continue;
}
}
if ((rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
if (tree->leaf_mark(node->right)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->right) {
assert(node->rightnode == NODE);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
node = node->right;
continue;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
}
/* second pass: left siblings and singletons */
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
if (tree->leaf_mark(node->left)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->left) {
assert(node->leftnode == NODE);
node = node->left;
if (!node->mark && node->parent->mark) {
marked++;
node->mark = 1;
}
continue;
}
}
if ((rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
if (tree->leaf_mark(node->right)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->right) {
assert(node->rightnode == NODE);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
node = node->right;
if (!node->mark && node->parent->mark &&
!node->parent->left) {
marked++;
node->mark = 1;
}
continue;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
}
done:
if (verbose > 0)
printf("Marked %d nodes\n", marked);
}
/*
* Compute the index of each node and leaf, which is the offset in the
* emitted trie. These values must be pre-computed because relative
* offsets between nodes are used to navigate the tree.
*/
static int index_nodes(struct tree *tree, int index)
{
struct node *node;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int count;
int indent;
/* Align to a cache line (or half a cache line?). */
while (index % 64)
index++;
tree->index = index;
indent = 1;
count = 0;
if (verbose > 0)
printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
if (tree->childnode == LEAF) {
index += tree->leaf_size(tree->root);
goto done;
}
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
if (!node->mark)
goto skip;
count++;
if (node->index != index)
node->index = index;
index += node->size;
skip:
while (node) {
bitmask = 1 << node->bitnum;
if (node->mark && (leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
*tree->leaf_index(tree, node->left) =
index;
index += tree->leaf_size(node->left);
count++;
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if (node->mark && (rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
*tree->leaf_index(tree, node->right) = index;
index += tree->leaf_size(node->right);
count++;
} else if (node->right) {
assert(node->rightnode == NODE);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
done:
/* Round up to a multiple of 16 */
while (index % 16)
index++;
if (verbose > 0)
printf("Final index %d\n", index);
return index;
}
/*
* Mark the nodes in a subtree, helper for size_nodes().
*/
static int mark_subtree(struct node *node)
{
int changed;
if (!node || node->mark)
return 0;
node->mark = 1;
node->index = node->parent->index;
changed = 1;
if (node->leftnode == NODE)
changed += mark_subtree(node->left);
if (node->rightnode == NODE)
changed += mark_subtree(node->right);
return changed;
}
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
/*
* Compute the size of nodes and leaves. We start by assuming that
* each node needs to store a three-byte offset. The indexes of the
* nodes are calculated based on that, and then this function is
* called to see if the sizes of some nodes can be reduced. This is
* repeated until no more changes are seen.
*/
static int size_nodes(struct tree *tree)
{
struct tree *next;
struct node *node;
struct node *right;
struct node *n;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
unsigned int pathbits;
unsigned int pathmask;
unsigned int nbit;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
int changed;
int offset;
int size;
int indent;
indent = 1;
changed = 0;
size = 0;
if (verbose > 0)
printf("Sizing %s_%x\n", tree->type, tree->maxage);
if (tree->childnode == LEAF)
goto done;
assert(tree->childnode == NODE);
pathbits = 0;
pathmask = 0;
node = tree->root;
leftmask = rightmask = 0;
while (node) {
if (!node->mark)
goto skip;
offset = 0;
if (!node->left || !node->right) {
size = 1;
} else {
if (node->rightnode == NODE) {
/*
* If the right node is not marked,
* look for a corresponding node in
* the next tree. Such a node need
* not exist.
*/
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
right = node->right;
next = tree->next;
while (!right->mark) {
assert(next);
n = next->root;
while (n->bitnum != node->bitnum) {
nbit = 1 << n->bitnum;
if (!(pathmask & nbit))
break;
if (pathbits & nbit) {
if (n->rightnode == LEAF)
break;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
n = n->right;
} else {
if (n->leftnode == LEAF)
break;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
n = n->left;
}
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
}
if (n->bitnum != node->bitnum)
break;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
n = n->right;
right = n;
next = next->next;
}
/* Make sure the right node is marked. */
if (!right->mark)
changed += mark_subtree(right);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
offset = right->index - node->index;
} else {
offset = *tree->leaf_index(tree, node->right);
offset -= node->index;
}
assert(offset >= 0);
assert(offset <= 0xffffff);
if (offset <= 0xff) {
size = 2;
} else if (offset <= 0xffff) {
size = 3;
} else { /* offset <= 0xffffff */
size = 4;
}
}
if (node->size != size || node->offset != offset) {
node->size = size;
node->offset = offset;
changed++;
}
skip:
while (node) {
bitmask = 1 << node->bitnum;
pathmask |= bitmask;
if (node->mark && (leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if (node->mark && (rightmask & bitmask) == 0) {
rightmask |= bitmask;
pathbits |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
} else if (node->right) {
assert(node->rightnode == NODE);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
pathmask &= ~bitmask;
pathbits &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
done:
if (verbose > 0)
printf("Found %d changes\n", changed);
return changed;
}
/*
* Emit a trie for the given tree into the data array.
*/
static void emit(struct tree *tree, unsigned char *data)
{
struct node *node;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int offlen;
int offset;
int index;
int indent;
int size;
int bytes;
int leaves;
int nodes[4];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
unsigned char byte;
nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
leaves = 0;
bytes = 0;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
index = tree->index;
data += index;
indent = 1;
if (verbose > 0)
printf("Emitting %s_%x\n", tree->type, tree->maxage);
if (tree->childnode == LEAF) {
assert(tree->root);
tree->leaf_emit(tree->root, data);
size = tree->leaf_size(tree->root);
index += size;
leaves++;
goto done;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
}
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
if (!node->mark)
goto skip;
assert(node->offset != -1);
assert(node->index == index);
byte = 0;
if (node->nextbyte)
byte |= NEXTBYTE;
byte |= (node->bitnum & BITNUM);
if (node->left && node->right) {
if (node->leftnode == NODE)
byte |= LEFTNODE;
if (node->rightnode == NODE)
byte |= RIGHTNODE;
if (node->offset <= 0xff)
offlen = 1;
else if (node->offset <= 0xffff)
offlen = 2;
else
offlen = 3;
nodes[offlen]++;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
offset = node->offset;
byte |= offlen << OFFLEN_SHIFT;
*data++ = byte;
index++;
while (offlen--) {
*data++ = offset & 0xff;
index++;
offset >>= 8;
}
} else if (node->left) {
if (node->leftnode == NODE)
byte |= TRIENODE;
nodes[0]++;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
*data++ = byte;
index++;
} else if (node->right) {
byte |= RIGHTNODE;
if (node->rightnode == NODE)
byte |= TRIENODE;
nodes[0]++;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
*data++ = byte;
index++;
} else {
assert(0);
}
skip:
while (node) {
bitmask = 1 << node->bitnum;
if (node->mark && (leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
data = tree->leaf_emit(node->left,
data);
size = tree->leaf_size(node->left);
index += size;
bytes += size;
leaves++;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if (node->mark && (rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
data = tree->leaf_emit(node->right,
data);
size = tree->leaf_size(node->right);
index += size;
bytes += size;
leaves++;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
} else if (node->right) {
assert(node->rightnode == NODE);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
done:
if (verbose > 0) {
printf("Emitted %d (%d) leaves",
leaves, bytes);
printf(" %d (%d+%d+%d+%d) nodes",
nodes[0] + nodes[1] + nodes[2] + nodes[3],
nodes[0], nodes[1], nodes[2], nodes[3]);
printf(" %d total\n", index - tree->index);
}
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
}
/* ------------------------------------------------------------------ */
/*
* Unicode data.
*
* We need to keep track of the Canonical Combining Class, the Age,
* and decompositions for a code point.
*
* For the Age, we store the index into the ages table. Effectively
* this is a generation number that the table maps to a unicode
* version.
*
* The correction field is used to indicate that this entry is in the
* corrections array, which contains decompositions that were
* corrected in later revisions. The value of the correction field is
* the Unicode version in which the mapping was corrected.
*/
struct unicode_data {
unsigned int code;
int ccc;
int gen;
int correction;
unsigned int *utf32nfdi;
unsigned int *utf32nfdicf;
char *utf8nfdi;
char *utf8nfdicf;
};
struct unicode_data unicode_data[0x110000];
struct unicode_data *corrections;
int corrections_count;
struct tree *nfdi_tree;
struct tree *nfdicf_tree;
struct tree *trees;
int trees_count;
/*
* Check the corrections array to see if this entry was corrected at
* some point.
*/
static struct unicode_data *corrections_lookup(struct unicode_data *u)
{
int i;
for (i = 0; i != corrections_count; i++)
if (u->code == corrections[i].code)
return &corrections[i];
return u;
}
static int nfdi_equal(void *l, void *r)
{
struct unicode_data *left = l;
struct unicode_data *right = r;
if (left->gen != right->gen)
return 0;
if (left->ccc != right->ccc)
return 0;
if (left->utf8nfdi && right->utf8nfdi &&
strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
return 1;
if (left->utf8nfdi || right->utf8nfdi)
return 0;
return 1;
}
static int nfdicf_equal(void *l, void *r)
{
struct unicode_data *left = l;
struct unicode_data *right = r;
if (left->gen != right->gen)
return 0;
if (left->ccc != right->ccc)
return 0;
if (left->utf8nfdicf && right->utf8nfdicf &&
strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
return 1;
if (left->utf8nfdicf && right->utf8nfdicf)
return 0;
if (left->utf8nfdicf || right->utf8nfdicf)
return 0;
if (left->utf8nfdi && right->utf8nfdi &&
strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
return 1;
if (left->utf8nfdi || right->utf8nfdi)
return 0;
return 1;
}
static void nfdi_print(void *l, int indent)
{
struct unicode_data *leaf = l;
printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
leaf->code, leaf->ccc, leaf->gen);
if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
else if (leaf->utf8nfdi)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
printf("\n");
}
static void nfdicf_print(void *l, int indent)
{
struct unicode_data *leaf = l;
printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
leaf->code, leaf->ccc, leaf->gen);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (leaf->utf8nfdicf)
printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
else if (leaf->utf8nfdi)
printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
printf("\n");
}
static int nfdi_mark(void *l)
{
return 1;
}
static int nfdicf_mark(void *l)
{
struct unicode_data *leaf = l;
if (leaf->utf8nfdicf)
return 1;
return 0;
}
static int correction_mark(void *l)
{
struct unicode_data *leaf = l;
return leaf->correction;
}
static int nfdi_size(void *l)
{
struct unicode_data *leaf = l;
int size = 2;
if (HANGUL_SYLLABLE(leaf->code))
size += 1;
else if (leaf->utf8nfdi)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
size += strlen(leaf->utf8nfdi) + 1;
return size;
}
static int nfdicf_size(void *l)
{
struct unicode_data *leaf = l;
int size = 2;
if (HANGUL_SYLLABLE(leaf->code))
size += 1;
else if (leaf->utf8nfdicf)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
size += strlen(leaf->utf8nfdicf) + 1;
else if (leaf->utf8nfdi)
size += strlen(leaf->utf8nfdi) + 1;
return size;
}
static int *nfdi_index(struct tree *tree, void *l)
{
struct unicode_data *leaf = l;
return &tree->leafindex[leaf->code];
}
static int *nfdicf_index(struct tree *tree, void *l)
{
struct unicode_data *leaf = l;
return &tree->leafindex[leaf->code];
}
static unsigned char *nfdi_emit(void *l, unsigned char *data)
{
struct unicode_data *leaf = l;
unsigned char *s;
*data++ = leaf->gen;
if (HANGUL_SYLLABLE(leaf->code)) {
*data++ = DECOMPOSE;
*data++ = HANGUL;
} else if (leaf->utf8nfdi) {
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
*data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdi;
while ((*data++ = *s++) != 0)
;
} else {
*data++ = leaf->ccc;
}
return data;
}
static unsigned char *nfdicf_emit(void *l, unsigned char *data)
{
struct unicode_data *leaf = l;
unsigned char *s;
*data++ = leaf->gen;
if (HANGUL_SYLLABLE(leaf->code)) {
*data++ = DECOMPOSE;
*data++ = HANGUL;
} else if (leaf->utf8nfdicf) {
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
*data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdicf;
while ((*data++ = *s++) != 0)
;
} else if (leaf->utf8nfdi) {
*data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdi;
while ((*data++ = *s++) != 0)
;
} else {
*data++ = leaf->ccc;
}
return data;
}
static void utf8_create(struct unicode_data *data)
{
char utf[18*4+1];
char *u;
unsigned int *um;
int i;
if (data->utf8nfdi) {
assert(data->utf8nfdi[0] == HANGUL);
return;
}
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
u = utf;
um = data->utf32nfdi;
if (um) {
for (i = 0; um[i]; i++)
u += utf8encode(u, um[i]);
*u = '\0';
data->utf8nfdi = strdup(utf);
}
u = utf;
um = data->utf32nfdicf;
if (um) {
for (i = 0; um[i]; i++)
u += utf8encode(u, um[i]);
*u = '\0';
if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
data->utf8nfdicf = strdup(utf);
}
}
static void utf8_init(void)
{
unsigned int unichar;
int i;
for (unichar = 0; unichar != 0x110000; unichar++)
utf8_create(&unicode_data[unichar]);
for (i = 0; i != corrections_count; i++)
utf8_create(&corrections[i]);
}
static void trees_init(void)
{
struct unicode_data *data;
unsigned int maxage;
unsigned int nextage;
int count;
int i;
int j;
/* Count the number of different ages. */
count = 0;
nextage = (unsigned int)-1;
do {
maxage = nextage;
nextage = 0;
for (i = 0; i <= corrections_count; i++) {
data = &corrections[i];
if (nextage < data->correction &&
data->correction < maxage)
nextage = data->correction;
}
count++;
} while (nextage);
/* Two trees per age: nfdi and nfdicf */
trees_count = count * 2;
trees = calloc(trees_count, sizeof(struct tree));
/* Assign ages to the trees. */
count = trees_count;
nextage = (unsigned int)-1;
do {
maxage = nextage;
trees[--count].maxage = maxage;
trees[--count].maxage = maxage;
nextage = 0;
for (i = 0; i <= corrections_count; i++) {
data = &corrections[i];
if (nextage < data->correction &&
data->correction < maxage)
nextage = data->correction;
}
} while (nextage);
/* The ages assigned above are off by one. */
for (i = 0; i != trees_count; i++) {
j = 0;
while (ages[j] < trees[i].maxage)
j++;
trees[i].maxage = ages[j-1];
}
/* Set up the forwarding between trees. */
trees[trees_count-2].next = &trees[trees_count-1];
trees[trees_count-1].leaf_mark = nfdi_mark;
trees[trees_count-2].leaf_mark = nfdicf_mark;
for (i = 0; i != trees_count-2; i += 2) {
trees[i].next = &trees[trees_count-2];
trees[i].leaf_mark = correction_mark;
trees[i+1].next = &trees[trees_count-1];
trees[i+1].leaf_mark = correction_mark;
}
/* Assign the callouts. */
for (i = 0; i != trees_count; i += 2) {
trees[i].type = "nfdicf";
trees[i].leaf_equal = nfdicf_equal;
trees[i].leaf_print = nfdicf_print;
trees[i].leaf_size = nfdicf_size;
trees[i].leaf_index = nfdicf_index;
trees[i].leaf_emit = nfdicf_emit;
trees[i+1].type = "nfdi";
trees[i+1].leaf_equal = nfdi_equal;
trees[i+1].leaf_print = nfdi_print;
trees[i+1].leaf_size = nfdi_size;
trees[i+1].leaf_index = nfdi_index;
trees[i+1].leaf_emit = nfdi_emit;
}
/* Finish init. */
for (i = 0; i != trees_count; i++)
trees[i].childnode = NODE;
}
static void trees_populate(void)
{
struct unicode_data *data;
unsigned int unichar;
char keyval[4];
int keylen;
int i;
for (i = 0; i != trees_count; i++) {
if (verbose > 0) {
printf("Populating %s_%x\n",
trees[i].type, trees[i].maxage);
}
for (unichar = 0; unichar != 0x110000; unichar++) {
if (unicode_data[unichar].gen < 0)
continue;
keylen = utf8encode(keyval, unichar);
data = corrections_lookup(&unicode_data[unichar]);
if (data->correction <= trees[i].maxage)
data = &unicode_data[unichar];
insert(&trees[i], keyval, keylen, data);
}
}
}
static void trees_reduce(void)
{
int i;
int size;
int changed;
for (i = 0; i != trees_count; i++)
prune(&trees[i]);
for (i = 0; i != trees_count; i++)
mark_nodes(&trees[i]);
do {
size = 0;
for (i = 0; i != trees_count; i++)
size = index_nodes(&trees[i], size);
changed = 0;
for (i = 0; i != trees_count; i++)
changed += size_nodes(&trees[i]);
} while (changed);
utf8data = calloc(size, 1);
utf8data_size = size;
for (i = 0; i != trees_count; i++)
emit(&trees[i], utf8data);
if (verbose > 0) {
for (i = 0; i != trees_count; i++) {
printf("%s_%x idx %d\n",
trees[i].type, trees[i].maxage, trees[i].index);
}
}
nfdi = utf8data + trees[trees_count-1].index;
nfdicf = utf8data + trees[trees_count-2].index;
nfdi_tree = &trees[trees_count-1];
nfdicf_tree = &trees[trees_count-2];
}
static void verify(struct tree *tree)
{
struct unicode_data *data;
utf8leaf_t *leaf;
unsigned int unichar;
char key[4];
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
int report;
int nocf;
if (verbose > 0)
printf("Verifying %s_%x\n", tree->type, tree->maxage);
nocf = strcmp(tree->type, "nfdicf");
for (unichar = 0; unichar != 0x110000; unichar++) {
report = 0;
data = corrections_lookup(&unicode_data[unichar]);
if (data->correction <= tree->maxage)
data = &unicode_data[unichar];
utf8encode(key,unichar);
leaf = utf8lookup(tree, hangul, key);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!leaf) {
if (data->gen != -1)
report++;
if (unichar < 0xd800 || unichar > 0xdfff)
report++;
} else {
if (unichar >= 0xd800 && unichar <= 0xdfff)
report++;
if (data->gen == -1)
report++;
if (data->gen != LEAF_GEN(leaf))
report++;
if (LEAF_CCC(leaf) == DECOMPOSE) {
if (HANGUL_SYLLABLE(data->code)) {
if (data->utf8nfdi[0] != HANGUL)
report++;
} else if (nocf) {
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!data->utf8nfdi) {
report++;
} else if (strcmp(data->utf8nfdi,
LEAF_STR(leaf))) {
report++;
}
} else {
if (!data->utf8nfdicf &&
!data->utf8nfdi) {
report++;
} else if (data->utf8nfdicf) {
if (strcmp(data->utf8nfdicf,
LEAF_STR(leaf)))
report++;
} else if (strcmp(data->utf8nfdi,
LEAF_STR(leaf))) {
report++;
}
}
} else if (data->ccc != LEAF_CCC(leaf)) {
report++;
}
}
if (report) {
printf("%X code %X gen %d ccc %d"
" nfdi -> \"%s\"",
unichar, data->code, data->gen,
data->ccc,
data->utf8nfdi);
if (leaf) {
printf(" gen %d ccc %d"
" nfdi -> \"%s\"",
LEAF_GEN(leaf),
LEAF_CCC(leaf),
LEAF_CCC(leaf) == DECOMPOSE ?
LEAF_STR(leaf) : "");
}
printf("\n");
}
}
}
static void trees_verify(void)
{
int i;
for (i = 0; i != trees_count; i++)
verify(&trees[i]);
}
/* ------------------------------------------------------------------ */
static void help(void)
{
printf("Usage: %s [options]\n", argv0);
printf("\n");
printf("This program creates an a data trie used for parsing and\n");
printf("normalization of UTF-8 strings. The trie is derived from\n");
printf("a set of input files from the Unicode character database\n");
printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
printf("\n");
printf("The generated tree supports two normalization forms:\n");
printf("\n");
printf("\tnfdi:\n");
printf("\t- Apply unicode normalization form NFD.\n");
printf("\t- Remove any Default_Ignorable_Code_Point.\n");
printf("\n");
printf("\tnfdicf:\n");
printf("\t- Apply unicode normalization form NFD.\n");
printf("\t- Remove any Default_Ignorable_Code_Point.\n");
printf("\t- Apply a full casefold (C + F).\n");
printf("\n");
printf("These forms were chosen as being most useful when dealing\n");
printf("with file names: NFD catches most cases where characters\n");
printf("should be considered equivalent. The ignorables are mostly\n");
printf("invisible, making names hard to type.\n");
printf("\n");
printf("The options to specify the files to be used are listed\n");
printf("below with their default values, which are the names used\n");
printf("by version 11.0.0 of the Unicode Character Database.\n");
printf("\n");
printf("The input files:\n");
printf("\t-a %s\n", AGE_NAME);
printf("\t-c %s\n", CCC_NAME);
printf("\t-p %s\n", PROP_NAME);
printf("\t-d %s\n", DATA_NAME);
printf("\t-f %s\n", FOLD_NAME);
printf("\t-n %s\n", NORM_NAME);
printf("\n");
printf("Additionally, the generated tables are tested using:\n");
printf("\t-t %s\n", TEST_NAME);
printf("\n");
printf("Finally, the output file:\n");
printf("\t-o %s\n", UTF8_NAME);
printf("\n");
}
static void usage(void)
{
help();
exit(1);
}
static void open_fail(const char *name, int error)
{
printf("Error %d opening %s: %s\n", error, name, strerror(error));
exit(1);
}
static void file_fail(const char *filename)
{
printf("Error parsing %s\n", filename);
exit(1);
}
static void line_fail(const char *filename, const char *line)
{
printf("Error parsing %s:%s\n", filename, line);
exit(1);
}
/* ------------------------------------------------------------------ */
static void print_utf32(unsigned int *utf32str)
{
int i;
for (i = 0; utf32str[i]; i++)
printf(" %X", utf32str[i]);
}
static void print_utf32nfdi(unsigned int unichar)
{
printf(" %X ->", unichar);
print_utf32(unicode_data[unichar].utf32nfdi);
printf("\n");
}
static void print_utf32nfdicf(unsigned int unichar)
{
printf(" %X ->", unichar);
print_utf32(unicode_data[unichar].utf32nfdicf);
printf("\n");
}
/* ------------------------------------------------------------------ */
static void age_init(void)
{
FILE *file;
unsigned int first;
unsigned int last;
unsigned int unichar;
unsigned int major;
unsigned int minor;
unsigned int revision;
int gen;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", age_name);
file = fopen(age_name, "r");
if (!file)
open_fail(age_name, errno);
count = 0;
gen = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "# Age=V%d_%d_%d",
&major, &minor, &revision);
if (ret == 3) {
ages_count++;
if (verbose > 1)
printf(" Age V%d_%d_%d\n",
major, minor, revision);
if (!age_valid(major, minor, revision))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
if (ret == 2) {
ages_count++;
if (verbose > 1)
printf(" Age V%d_%d\n", major, minor);
if (!age_valid(major, minor, 0))
line_fail(age_name, line);
continue;
}
}
/* We must have found something above. */
if (verbose > 1)
printf("%d age entries\n", ages_count);
if (ages_count == 0 || ages_count > MAXGEN)
file_fail(age_name);
/* There is a 0 entry. */
ages_count++;
ages = calloc(ages_count + 1, sizeof(*ages));
/* And a guard entry. */
ages[ages_count] = (unsigned int)-1;
rewind(file);
count = 0;
gen = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "# Age=V%d_%d_%d",
&major, &minor, &revision);
if (ret == 3) {
ages[++gen] =
UNICODE_AGE(major, minor, revision);
if (verbose > 1)
printf(" Age V%d_%d_%d = gen %d\n",
major, minor, revision, gen);
if (!age_valid(major, minor, revision))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
if (ret == 2) {
ages[++gen] = UNICODE_AGE(major, minor, 0);
if (verbose > 1)
printf(" Age V%d_%d = %d\n",
major, minor, gen);
if (!age_valid(major, minor, 0))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "%X..%X ; %d.%d #",
&first, &last, &major, &minor);
if (ret == 4) {
for (unichar = first; unichar <= last; unichar++)
unicode_data[unichar].gen = gen;
count += 1 + last - first;
if (verbose > 1)
printf(" %X..%X gen %d\n", first, last, gen);
if (!utf32valid(first) || !utf32valid(last))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
if (ret == 3) {
unicode_data[unichar].gen = gen;
count++;
if (verbose > 1)
printf(" %X gen %d\n", unichar, gen);
if (!utf32valid(unichar))
line_fail(age_name, line);
continue;
}
}
unicode_maxage = ages[gen];
fclose(file);
/* Nix surrogate block */
if (verbose > 1)
printf(" Removing surrogate block D800..DFFF\n");
for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
unicode_data[unichar].gen = -1;
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(age_name);
}
static void ccc_init(void)
{
FILE *file;
unsigned int first;
unsigned int last;
unsigned int unichar;
unsigned int value;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", ccc_name);
file = fopen(ccc_name, "r");
if (!file)
open_fail(ccc_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
if (ret == 3) {
for (unichar = first; unichar <= last; unichar++) {
unicode_data[unichar].ccc = value;
count++;
}
if (verbose > 1)
printf(" %X..%X ccc %d\n", first, last, value);
if (!utf32valid(first) || !utf32valid(last))
line_fail(ccc_name, line);
continue;
}
ret = sscanf(line, "%X ; %d #", &unichar, &value);
if (ret == 2) {
unicode_data[unichar].ccc = value;
count++;
if (verbose > 1)
printf(" %X ccc %d\n", unichar, value);
if (!utf32valid(unichar))
line_fail(ccc_name, line);
continue;
}
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(ccc_name);
}
static int ignore_compatibility_form(char *type)
{
int i;
char *ignored_types[] = {"font", "noBreak", "initial", "medial",
"final", "isolated", "circle", "super",
"sub", "vertical", "wide", "narrow",
"small", "square", "fraction", "compat"};
for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
if (strcmp(type, ignored_types[i]) == 0)
return 1;
return 0;
}
static void nfdi_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
char *s;
char *type;
unsigned int *um;
int count;
int i;
int ret;
if (verbose > 0)
printf("Parsing %s\n", data_name);
file = fopen(data_name, "r");
if (!file)
open_fail(data_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
&unichar, buf0);
if (ret != 2)
continue;
if (!utf32valid(unichar))
line_fail(data_name, line);
s = buf0;
/* skip over <tag> */
if (*s == '<') {
type = ++s;
while (*++s != '>');
*s++ = '\0';
if(ignore_compatibility_form(type))
continue;
}
/* decode the decomposition into UTF-32 */
i = 0;
while (*s) {
mapping[i] = strtoul(s, &s, 16);
if (!utf32valid(mapping[i]))
line_fail(data_name, line);
i++;
}
mapping[i++] = 0;
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
if (verbose > 1)
print_utf32nfdi(unichar);
count++;
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(data_name);
}
static void nfdicf_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
char status;
char *s;
unsigned int *um;
int i;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", fold_name);
file = fopen(fold_name, "r");
if (!file)
open_fail(fold_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
if (ret != 3)
continue;
if (!utf32valid(unichar))
line_fail(fold_name, line);
/* Use the C+F casefold. */
if (status != 'C' && status != 'F')
continue;
s = buf0;
if (*s == '<')
while (*s++ != ' ')
;
i = 0;
while (*s) {
mapping[i] = strtoul(s, &s, 16);
if (!utf32valid(mapping[i]))
line_fail(fold_name, line);
i++;
}
mapping[i++] = 0;
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
if (verbose > 1)
print_utf32nfdicf(unichar);
count++;
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(fold_name);
}
static void ignore_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int first;
unsigned int last;
unsigned int *um;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", prop_name);
file = fopen(prop_name, "r");
if (!file)
open_fail(prop_name, errno);
assert(file);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
if (ret == 3) {
if (strcmp(buf0, "Default_Ignorable_Code_Point"))
continue;
if (!utf32valid(first) || !utf32valid(last))
line_fail(prop_name, line);
for (unichar = first; unichar <= last; unichar++) {
free(unicode_data[unichar].utf32nfdi);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
count++;
}
if (verbose > 1)
printf(" %X..%X Default_Ignorable_Code_Point\n",
first, last);
continue;
}
ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
if (ret == 2) {
if (strcmp(buf0, "Default_Ignorable_Code_Point"))
continue;
if (!utf32valid(unichar))
line_fail(prop_name, line);
free(unicode_data[unichar].utf32nfdi);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
if (verbose > 1)
printf(" %X Default_Ignorable_Code_Point\n",
unichar);
count++;
continue;
}
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(prop_name);
}
static void corrections_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int major;
unsigned int minor;
unsigned int revision;
unsigned int age;
unsigned int *um;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
char *s;
int i;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", norm_name);
file = fopen(norm_name, "r");
if (!file)
open_fail(norm_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
&unichar, buf0, buf1,
&major, &minor, &revision);
if (ret != 6)
continue;
if (!utf32valid(unichar) || !age_valid(major, minor, revision))
line_fail(norm_name, line);
count++;
}
corrections = calloc(count, sizeof(struct unicode_data));
corrections_count = count;
rewind(file);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
&unichar, buf0, buf1,
&major, &minor, &revision);
if (ret != 6)
continue;
if (!utf32valid(unichar) || !age_valid(major, minor, revision))
line_fail(norm_name, line);
corrections[count] = unicode_data[unichar];
assert(corrections[count].code == unichar);
age = UNICODE_AGE(major, minor, revision);
corrections[count].correction = age;
i = 0;
s = buf0;
while (*s) {
mapping[i] = strtoul(s, &s, 16);
if (!utf32valid(mapping[i]))
line_fail(norm_name, line);
i++;
}
mapping[i++] = 0;
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
corrections[count].utf32nfdi = um;
if (verbose > 1)
printf(" %X -> %s -> %s V%d_%d_%d\n",
unichar, buf0, buf1, major, minor, revision);
count++;
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(norm_name);
}
/* ------------------------------------------------------------------ */
/*
* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
*
* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 (VCount * TCount)
* SCount = 11172 (LCount * NCount)
*
* Decomposition:
* SIndex = s - SBase
*
* LV (Canonical/Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT (Canonical)
* LVIndex = (SIndex / TCount) * TCount
* TIndex = (Sindex % TCount)
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT (Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* TIndex = (Sindex % TCount)
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if (TIndex == 0) {
* d = <LPart, VPart>
* } else {
* TPart = TBase + TIndex
* d = <LPart, VPart, TPart>
* }
*
*/
static void hangul_decompose(void)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
{
unsigned int sb = 0xAC00;
unsigned int lb = 0x1100;
unsigned int vb = 0x1161;
unsigned int tb = 0x11a7;
/* unsigned int lc = 19; */
unsigned int vc = 21;
unsigned int tc = 28;
unsigned int nc = (vc * tc);
/* unsigned int sc = (lc * nc); */
unsigned int unichar;
unsigned int mapping[4];
unsigned int *um;
int count;
int i;
if (verbose > 0)
printf("Decomposing hangul\n");
/* Hangul */
count = 0;
for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
unsigned int si = unichar - sb;
unsigned int li = si / nc;
unsigned int vi = (si % nc) / tc;
unsigned int ti = si % tc;
i = 0;
mapping[i++] = lb + li;
mapping[i++] = vb + vi;
if (ti)
mapping[i++] = tb + ti;
mapping[i++] = 0;
assert(!unicode_data[unichar].utf32nfdi);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
assert(!unicode_data[unichar].utf32nfdicf);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
/*
* Add a cookie as a reminder that the hangul syllable
* decompositions must not be stored in the generated
* trie.
*/
unicode_data[unichar].utf8nfdi = malloc(2);
unicode_data[unichar].utf8nfdi[0] = HANGUL;
unicode_data[unichar].utf8nfdi[1] = '\0';
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (verbose > 1)
print_utf32nfdi(unichar);
count++;
}
if (verbose > 0)
printf("Created %d entries\n", count);
}
static void nfdi_decompose(void)
{
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
unsigned int *um;
unsigned int *dc;
int count;
int i;
int j;
int ret;
if (verbose > 0)
printf("Decomposing nfdi\n");
count = 0;
for (unichar = 0; unichar != 0x110000; unichar++) {
if (!unicode_data[unichar].utf32nfdi)
continue;
for (;;) {
ret = 1;
i = 0;
um = unicode_data[unichar].utf32nfdi;
while (*um) {
dc = unicode_data[*um].utf32nfdi;
if (dc) {
for (j = 0; dc[j]; j++)
mapping[i++] = dc[j];
ret = 0;
} else {
mapping[i++] = *um;
}
um++;
}
mapping[i++] = 0;
if (ret)
break;
free(unicode_data[unichar].utf32nfdi);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
}
/* Add this decomposition to nfdicf if there is no entry. */
if (!unicode_data[unichar].utf32nfdicf) {
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
}
if (verbose > 1)
print_utf32nfdi(unichar);
count++;
}
if (verbose > 0)
printf("Processed %d entries\n", count);
}
static void nfdicf_decompose(void)
{
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
unsigned int *um;
unsigned int *dc;
int count;
int i;
int j;
int ret;
if (verbose > 0)
printf("Decomposing nfdicf\n");
count = 0;
for (unichar = 0; unichar != 0x110000; unichar++) {
if (!unicode_data[unichar].utf32nfdicf)
continue;
for (;;) {
ret = 1;
i = 0;
um = unicode_data[unichar].utf32nfdicf;
while (*um) {
dc = unicode_data[*um].utf32nfdicf;
if (dc) {
for (j = 0; dc[j]; j++)
mapping[i++] = dc[j];
ret = 0;
} else {
mapping[i++] = *um;
}
um++;
}
mapping[i++] = 0;
if (ret)
break;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
}
if (verbose > 1)
print_utf32nfdicf(unichar);
count++;
}
if (verbose > 0)
printf("Processed %d entries\n", count);
}
/* ------------------------------------------------------------------ */
int utf8agemax(struct tree *, const char *);
int utf8nagemax(struct tree *, const char *, size_t);
int utf8agemin(struct tree *, const char *);
int utf8nagemin(struct tree *, const char *, size_t);
ssize_t utf8len(struct tree *, const char *);
ssize_t utf8nlen(struct tree *, const char *, size_t);
struct utf8cursor;
int utf8cursor(struct utf8cursor *, struct tree *, const char *);
int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
int utf8byte(struct utf8cursor *);
/*
* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
*
* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 (VCount * TCount)
* SCount = 11172 (LCount * NCount)
*
* Decomposition:
* SIndex = s - SBase
*
* LV (Canonical/Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT (Canonical)
* LVIndex = (SIndex / TCount) * TCount
* TIndex = (Sindex % TCount)
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT (Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* TIndex = (Sindex % TCount)
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if (TIndex == 0) {
* d = <LPart, VPart>
* } else {
* TPart = TBase + TIndex
* d = <LPart, VPart, TPart>
* }
*/
/* Constants */
#define SB (0xAC00)
#define LB (0x1100)
#define VB (0x1161)
#define TB (0x11A7)
#define LC (19)
#define VC (21)
#define TC (28)
#define NC (VC * TC)
#define SC (LC * NC)
/* Algorithmic decomposition of hangul syllable. */
static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
{
unsigned int si;
unsigned int li;
unsigned int vi;
unsigned int ti;
unsigned char *h;
/* Calculate the SI, LI, VI, and TI values. */
si = utf8decode(str) - SB;
li = si / NC;
vi = (si % NC) / TC;
ti = si % TC;
/* Fill in base of leaf. */
h = hangul;
LEAF_GEN(h) = 2;
LEAF_CCC(h) = DECOMPOSE;
h += 2;
/* Add LPart, a 3-byte UTF-8 sequence. */
h += utf8encode((char *)h, li + LB);
/* Add VPart, a 3-byte UTF-8 sequence. */
h += utf8encode((char *)h, vi + VB);
/* Add TPart if required, also a 3-byte UTF-8 sequence. */
if (ti)
h += utf8encode((char *)h, ti + TB);
/* Terminate string. */
h[0] = '\0';
return hangul;
}
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
/*
* Use trie to scan s, touching at most len bytes.
* Returns the leaf if one exists, NULL otherwise.
*
* A non-NULL return guarantees that the UTF-8 sequence starting at s
* is well-formed and corresponds to a known unicode code point. The
* shorthand for this will be "is valid UTF-8 unicode".
*/
static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
const char *s, size_t len)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
{
utf8trie_t *trie;
int offlen;
int offset;
int mask;
int node;
if (!tree)
return NULL;
if (len == 0)
return NULL;
node = 1;
trie = utf8data + tree->index;
while (node) {
offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
if (*trie & NEXTBYTE) {
if (--len == 0)
return NULL;
s++;
}
mask = 1 << (*trie & BITNUM);
if (*s & mask) {
/* Right leg */
if (offlen) {
/* Right node at offset of trie */
node = (*trie & RIGHTNODE);
offset = trie[offlen];
while (--offlen) {
offset <<= 8;
offset |= trie[offlen];
}
trie += offset;
} else if (*trie & RIGHTPATH) {
/* Right node after this node */
node = (*trie & TRIENODE);
trie++;
} else {
/* No right node. */
return NULL;
}
} else {
/* Left leg */
if (offlen) {
/* Left node after this node. */
node = (*trie & LEFTNODE);
trie += offlen + 1;
} else if (*trie & RIGHTPATH) {
/* No left node. */
return NULL;
} else {
/* Left node after this node */
node = (*trie & TRIENODE);
trie++;
}
}
}
/*
* Hangul decomposition is done algorithmically. These are the
* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
* always 3 bytes long, so s has been advanced twice, and the
* start of the sequence is at s-2.
*/
if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
trie = utf8hangul(s - 2, hangul);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
return trie;
}
/*
* Use trie to scan s.
* Returns the leaf if one exists, NULL otherwise.
*
* Forwards to trie_nlookup().
*/
static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
const char *s)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
{
return utf8nlookup(tree, hangul, s, (size_t)-1);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
}
/*
* Return the number of bytes used by the current UTF-8 sequence.
* Assumes the input points to the first byte of a valid UTF-8
* sequence.
*/
static inline int utf8clen(const char *s)
{
unsigned char c = *s;
return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
}
/*
* Maximum age of any character in s.
* Return -1 if s is not valid UTF-8 unicode.
* Return 0 if only non-assigned code points are used.
*/
int utf8agemax(struct tree *tree, const char *s)
{
utf8leaf_t *leaf;
int age = 0;
int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!tree)
return -1;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
while (*s) {
leaf = utf8lookup(tree, hangul, s);
if (!leaf)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age > age)
age = leaf_age;
s += utf8clen(s);
}
return age;
}
/*
* Minimum age of any character in s.
* Return -1 if s is not valid UTF-8 unicode.
* Return 0 if non-assigned code points are used.
*/
int utf8agemin(struct tree *tree, const char *s)
{
utf8leaf_t *leaf;
int age;
int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!tree)
return -1;
age = tree->maxage;
while (*s) {
leaf = utf8lookup(tree, hangul, s);
if (!leaf)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age < age)
age = leaf_age;
s += utf8clen(s);
}
return age;
}
/*
* Maximum age of any character in s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
int utf8nagemax(struct tree *tree, const char *s, size_t len)
{
utf8leaf_t *leaf;
int age = 0;
int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!tree)
return -1;
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
while (len && *s) {
leaf = utf8nlookup(tree, hangul, s, len);
if (!leaf)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age > age)
age = leaf_age;
len -= utf8clen(s);
s += utf8clen(s);
}
return age;
}
/*
* Maximum age of any character in s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
int utf8nagemin(struct tree *tree, const char *s, size_t len)
{
utf8leaf_t *leaf;
int leaf_age;
int age;
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!tree)
return -1;
age = tree->maxage;
while (len && *s) {
leaf = utf8nlookup(tree, hangul, s, len);
if (!leaf)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age < age)
age = leaf_age;
len -= utf8clen(s);
s += utf8clen(s);
}
return age;
}
/*
* Length of the normalization of s.
* Return -1 if s is not valid UTF-8 unicode.
*
* A string of Default_Ignorable_Code_Point has length 0.
*/
ssize_t utf8len(struct tree *tree, const char *s)
{
utf8leaf_t *leaf;
size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!tree)
return -1;
while (*s) {
leaf = utf8lookup(tree, hangul, s);
if (!leaf)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
return -1;
if (ages[LEAF_GEN(leaf)] > tree->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
else
ret += utf8clen(s);
s += utf8clen(s);
}
return ret;
}
/*
* Length of the normalization of s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
{
utf8leaf_t *leaf;
size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
if (!tree)
return -1;
while (len && *s) {
leaf = utf8nlookup(tree, hangul, s, len);
if (!leaf)
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
return -1;
if (ages[LEAF_GEN(leaf)] > tree->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
else
ret += utf8clen(s);
len -= utf8clen(s);
s += utf8clen(s);
}
return ret;
}
/*
* Cursor structure used by the normalizer.
*/
struct utf8cursor {
struct tree *tree;
const char *s;
const char *p;
const char *ss;
const char *sp;
unsigned int len;
unsigned int slen;
short int ccc;
short int nccc;
unsigned int unichar;
unsigned char hangul[UTF8HANGULLEAF];
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
};
/*
* Set up an utf8cursor for use by utf8byte().
*
* s : string.
* len : length of s.
* u8c : pointer to cursor.
* trie : utf8trie_t to use for normalization.
*
* Returns -1 on error, 0 on success.
*/
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
size_t len)
{
if (!tree)
return -1;
if (!s)
return -1;
u8c->tree = tree;
u8c->s = s;
u8c->p = NULL;
u8c->ss = NULL;
u8c->sp = NULL;
u8c->len = len;
u8c->slen = 0;
u8c->ccc = STOPPER;
u8c->nccc = STOPPER;
u8c->unichar = 0;
/* Check we didn't clobber the maximum length. */
if (u8c->len != len)
return -1;
/* The first byte of s may not be an utf8 continuation. */
if (len > 0 && (*s & 0xC0) == 0x80)
return -1;
return 0;
}
/*
* Set up an utf8cursor for use by utf8byte().
*
* s : NUL-terminated string.
* u8c : pointer to cursor.
* trie : utf8trie_t to use for normalization.
*
* Returns -1 on error, 0 on success.
*/
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
return utf8ncursor(u8c, tree, s, (unsigned int)-1);
}
/*
* Get one byte from the normalized form of the string described by u8c.
*
* Returns the byte cast to an unsigned char on succes, and -1 on failure.
*
* The cursor keeps track of the location in the string in u8c->s.
* When a character is decomposed, the current location is stored in
* u8c->p, and u8c->s is set to the start of the decomposition. Note
* that bytes from a decomposition do not count against u8c->len.
*
* Characters are emitted if they match the current CCC in u8c->ccc.
* Hitting end-of-string while u8c->ccc == STOPPER means we're done,
* and the function returns 0 in that case.
*
* Sorting by CCC is done by repeatedly scanning the string. The
* values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
* the start of the scan. The first pass finds the lowest CCC to be
* emitted and stores it in u8c->nccc, the second pass emits the
* characters with this CCC and finds the next lowest CCC. This limits
* the number of passes to 1 + the number of different CCCs in the
* sequence being scanned.
*
* Therefore:
* u8c->p != NULL -> a decomposition is being scanned.
* u8c->ss != NULL -> this is a repeating scan.
* u8c->ccc == -1 -> this is the first scan of a repeating scan.
*/
int utf8byte(struct utf8cursor *u8c)
{
utf8leaf_t *leaf;
int ccc;
for (;;) {
/* Check for the end of a decomposed character. */
if (u8c->p && *u8c->s == '\0') {
u8c->s = u8c->p;
u8c->p = NULL;
}
/* Check for end-of-string. */
if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
/* There is no next byte. */
if (u8c->ccc == STOPPER)
return 0;
/* End-of-string during a scan counts as a stopper. */
ccc = STOPPER;
goto ccc_mismatch;
} else if ((*u8c->s & 0xC0) == 0x80) {
/* This is a continuation of the current character. */
if (!u8c->p)
u8c->len--;
return (unsigned char)*u8c->s++;
}
/* Look up the data for the current character. */
if (u8c->p) {
leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
} else {
leaf = utf8nlookup(u8c->tree, u8c->hangul,
u8c->s, u8c->len);
}
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
/* No leaf found implies that the input is a binary blob. */
if (!leaf)
return -1;
/* Characters that are too new have CCC 0. */
if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
ccc = STOPPER;
} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
u8c->len -= utf8clen(u8c->s);
u8c->p = u8c->s + utf8clen(u8c->s);
u8c->s = LEAF_STR(leaf);
/* Empty decomposition implies CCC 0. */
if (*u8c->s == '\0') {
if (u8c->ccc == STOPPER)
continue;
ccc = STOPPER;
goto ccc_mismatch;
}
leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
unicode: introduce UTF-8 character database The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is a generate from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-04-25 13:38:44 -04:00
ccc = LEAF_CCC(leaf);
}
u8c->unichar = utf8decode(u8c->s);
/*
* If this is not a stopper, then see if it updates
* the next canonical class to be emitted.
*/
if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
u8c->nccc = ccc;
/*
* Return the current byte if this is the current
* combining class.
*/
if (ccc == u8c->ccc) {
if (!u8c->p)
u8c->len--;
return (unsigned char)*u8c->s++;
}
/* Current combining class mismatch. */
ccc_mismatch:
if (u8c->nccc == STOPPER) {
/*
* Scan forward for the first canonical class
* to be emitted. Save the position from
* which to restart.
*/
assert(u8c->ccc == STOPPER);
u8c->ccc = MINCCC - 1;
u8c->nccc = ccc;
u8c->sp = u8c->p;
u8c->ss = u8c->s;
u8c->slen = u8c->len;
if (!u8c->p)
u8c->len -= utf8clen(u8c->s);
u8c->s += utf8clen(u8c->s);
} else if (ccc != STOPPER) {
/* Not a stopper, and not the ccc we're emitting. */
if (!u8c->p)
u8c->len -= utf8clen(u8c->s);
u8c->s += utf8clen(u8c->s);
} else if (u8c->nccc != MAXCCC + 1) {
/* At a stopper, restart for next ccc. */
u8c->ccc = u8c->nccc;
u8c->nccc = MAXCCC + 1;
u8c->s = u8c->ss;
u8c->p = u8c->sp;
u8c->len = u8c->slen;
} else {
/* All done, proceed from here. */
u8c->ccc = STOPPER;
u8c->nccc = STOPPER;
u8c->sp = NULL;
u8c->ss = NULL;
u8c->slen = 0;
}
}
}
/* ------------------------------------------------------------------ */
static int normalize_line(struct tree *tree)
{
char *s;
char *t;
int c;
struct utf8cursor u8c;
/* First test: null-terminated string. */
s = buf2;
t = buf3;
if (utf8cursor(&u8c, tree, s))
return -1;
while ((c = utf8byte(&u8c)) > 0)
if (c != (unsigned char)*t++)
return -1;
if (c < 0)
return -1;
if (*t != 0)
return -1;
/* Second test: length-limited string. */
s = buf2;
/* Replace NUL with a value that will cause an error if seen. */
s[strlen(s) + 1] = -1;
t = buf3;
if (utf8cursor(&u8c, tree, s))
return -1;
while ((c = utf8byte(&u8c)) > 0)
if (c != (unsigned char)*t++)
return -1;
if (c < 0)
return -1;
if (*t != 0)
return -1;
return 0;
}
static void normalization_test(void)
{
FILE *file;
unsigned int unichar;
struct unicode_data *data;
char *s;
char *t;
int ret;
int ignorables;
int tests = 0;
int failures = 0;
if (verbose > 0)
printf("Parsing %s\n", test_name);
/* Step one, read data from file. */
file = fopen(test_name, "r");
if (!file)
open_fail(test_name, errno);
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
buf0, buf1);
if (ret != 2 || *line == '#')
continue;
s = buf0;
t = buf2;
while (*s) {
unichar = strtoul(s, &s, 16);
t += utf8encode(t, unichar);
}
*t = '\0';
ignorables = 0;
s = buf1;
t = buf3;
while (*s) {
unichar = strtoul(s, &s, 16);
data = &unicode_data[unichar];
if (data->utf8nfdi && !*data->utf8nfdi)
ignorables = 1;
else
t += utf8encode(t, unichar);
}
*t = '\0';
tests++;
if (normalize_line(nfdi_tree) < 0) {
printf("Line %s -> %s", buf0, buf1);
if (ignorables)
printf(" (ignorables removed)");
printf(" failure\n");
failures++;
}
}
fclose(file);
if (verbose > 0)
printf("Ran %d tests with %d failures\n", tests, failures);
if (failures)
file_fail(test_name);
}
/* ------------------------------------------------------------------ */
static void write_file(void)
{
FILE *file;
int i;
int j;
int t;
int gen;
if (verbose > 0)
printf("Writing %s\n", utf8_name);
file = fopen(utf8_name, "w");
if (!file)
open_fail(utf8_name, errno);
fprintf(file, "/* This file is generated code, do not edit. */\n");
fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
fprintf(file, "#endif\n");
fprintf(file, "\n");
fprintf(file, "static const unsigned int utf8vers = %#x;\n",
unicode_maxage);
fprintf(file, "\n");
fprintf(file, "static const unsigned int utf8agetab[] = {\n");
for (i = 0; i != ages_count; i++)
fprintf(file, "\t%#x%s\n", ages[i],
ages[i] == unicode_maxage ? "" : ",");
fprintf(file, "};\n");
fprintf(file, "\n");
fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
t = 0;
for (gen = 0; gen < ages_count; gen++) {
fprintf(file, "\t{ %#x, %d }%s\n",
ages[gen], trees[t].index,
ages[gen] == unicode_maxage ? "" : ",");
if (trees[t].maxage == ages[gen])
t += 2;
}
fprintf(file, "};\n");
fprintf(file, "\n");
fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
t = 1;
for (gen = 0; gen < ages_count; gen++) {
fprintf(file, "\t{ %#x, %d }%s\n",
ages[gen], trees[t].index,
ages[gen] == unicode_maxage ? "" : ",");
if (trees[t].maxage == ages[gen])
t += 2;
}
fprintf(file, "};\n");
fprintf(file, "\n");
fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
utf8data_size);
t = 0;
for (i = 0; i != utf8data_size; i += 16) {
if (i == trees[t].index) {
fprintf(file, "\t/* %s_%x */\n",
trees[t].type, trees[t].maxage);
if (t < trees_count-1)
t++;
}
fprintf(file, "\t");
for (j = i; j != i + 16; j++)
fprintf(file, "0x%.2x%s", utf8data[j],
(j < utf8data_size -1 ? "," : ""));
fprintf(file, "\n");
}
fprintf(file, "};\n");
fclose(file);
}
/* ------------------------------------------------------------------ */
int main(int argc, char *argv[])
{
unsigned int unichar;
int opt;
argv0 = argv[0];
while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
switch (opt) {
case 'a':
age_name = optarg;
break;
case 'c':
ccc_name = optarg;
break;
case 'd':
data_name = optarg;
break;
case 'f':
fold_name = optarg;
break;
case 'n':
norm_name = optarg;
break;
case 'o':
utf8_name = optarg;
break;
case 'p':
prop_name = optarg;
break;
case 't':
test_name = optarg;
break;
case 'v':
verbose++;
break;
case 'h':
help();
exit(0);
default:
usage();
}
}
if (verbose > 1)
help();
for (unichar = 0; unichar != 0x110000; unichar++)
unicode_data[unichar].code = unichar;
age_init();
ccc_init();
nfdi_init();
nfdicf_init();
ignore_init();
corrections_init();
hangul_decompose();
nfdi_decompose();
nfdicf_decompose();
utf8_init();
trees_init();
trees_populate();
trees_reduce();
trees_verify();
/* Prevent "unused function" warning. */
(void)lookup(nfdi_tree, " ");
if (verbose > 2)
tree_walk(nfdi_tree);
if (verbose > 2)
tree_walk(nfdicf_tree);
normalization_test();
write_file();
return 0;
}