2 * Copyright (c) 2014 SGI.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 /* Generator for a compact trie for unicode normalization */
21 #include <sys/types.h>
30 /* Default names of the in- and output files. */
32 #define AGE_NAME "DerivedAge.txt"
33 #define CCC_NAME "DerivedCombiningClass.txt"
34 #define PROP_NAME "DerivedCoreProperties.txt"
35 #define DATA_NAME "UnicodeData.txt"
36 #define FOLD_NAME "CaseFolding.txt"
37 #define NORM_NAME "NormalizationCorrections.txt"
38 #define TEST_NAME "NormalizationTest.txt"
39 #define UTF8_NAME "utf8data.h"
41 const char *age_name = AGE_NAME;
42 const char *ccc_name = CCC_NAME;
43 const char *prop_name = PROP_NAME;
44 const char *data_name = DATA_NAME;
45 const char *fold_name = FOLD_NAME;
46 const char *norm_name = NORM_NAME;
47 const char *test_name = TEST_NAME;
48 const char *utf8_name = UTF8_NAME;
52 /* An arbitrary line size limit on input lines. */
63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
65 /* ------------------------------------------------------------------ */
68 * Unicode version numbers consist of three parts: major, minor, and a
69 * revision. These numbers are packed into an unsigned int to obtain
70 * a single version number.
72 * To save space in the generated trie, the unicode version is not
73 * stored directly, instead we calculate a generation number from the
74 * unicode versions seen in the DerivedAge file, and use that as an
75 * index into a table of unicode versions.
77 #define UNICODE_MAJ_SHIFT (16)
78 #define UNICODE_MIN_SHIFT (8)
80 #define UNICODE_MAJ_MAX ((unsigned short)-1)
81 #define UNICODE_MIN_MAX ((unsigned char)-1)
82 #define UNICODE_REV_MAX ((unsigned char)-1)
84 #define UNICODE_AGE(MAJ,MIN,REV) \
85 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
86 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
87 ((unsigned int)(REV)))
92 unsigned int unicode_maxage;
94 static int age_valid(unsigned int major, unsigned int minor,
95 unsigned int revision)
97 if (major > UNICODE_MAJ_MAX)
99 if (minor > UNICODE_MIN_MAX)
101 if (revision > UNICODE_REV_MAX)
106 /* ------------------------------------------------------------------ */
111 * A compact binary tree, used to decode UTF-8 characters.
113 * Internal nodes are one byte for the node itself, and up to three
114 * bytes for an offset into the tree. The first byte contains the
115 * following information:
116 * NEXTBYTE - flag - advance to next byte if set
117 * BITNUM - 3 bit field - the bit number to tested
118 * OFFLEN - 2 bit field - number of bytes in the offset
119 * if offlen == 0 (non-branching node)
120 * RIGHTPATH - 1 bit field - set if the following node is for the
121 * right-hand path (tested bit is set)
122 * TRIENODE - 1 bit field - set if the following node is an internal
123 * node, otherwise it is a leaf node
124 * if offlen != 0 (branching node)
125 * LEFTNODE - 1 bit field - set if the left-hand node is internal
126 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
128 * Due to the way utf8 works, there cannot be branching nodes with
129 * NEXTBYTE set, and moreover those nodes always have a righthand
132 typedef unsigned char utf8trie_t;
134 #define NEXTBYTE 0x08
136 #define OFFLEN_SHIFT 4
137 #define RIGHTPATH 0x40
138 #define TRIENODE 0x80
139 #define RIGHTNODE 0x40
140 #define LEFTNODE 0x80
145 * The leaves of the trie are embedded in the trie, and so the same
146 * underlying datatype, unsigned char.
148 * leaf[0]: The unicode version, stored as a generation number that is
149 * an index into utf8agetab[]. With this we can filter code
150 * points based on the unicode version in which they were
151 * defined. The CCC of a non-defined code point is 0.
152 * leaf[1]: Canonical Combining Class. During normalization, we need
153 * to do a stable sort into ascending order of all characters
154 * with a non-zero CCC that occur between two characters with
155 * a CCC of 0, or at the begin or end of a string.
156 * The unicode standard guarantees that all CCC values are
157 * between 0 and 254 inclusive, which leaves 255 available as
159 * Code points with CCC 0 are known as stoppers.
160 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161 * start of a NUL-terminated string that is the decomposition
163 * The CCC of a decomposable character is the same as the CCC
164 * of the first character of its decomposition.
165 * Some characters decompose as the empty string: these are
166 * characters with the Default_Ignorable_Code_Point property.
167 * These do affect normalization, as they all have CCC 0.
169 * The decompositions in the trie have been fully expanded.
171 * Casefolding, if applicable, is also done using decompositions.
173 typedef unsigned char utf8leaf_t;
175 #define LEAF_GEN(LEAF) ((LEAF)[0])
176 #define LEAF_CCC(LEAF) ((LEAF)[1])
177 #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
184 #define DECOMPOSE (255)
187 static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t);
188 static utf8leaf_t *utf8lookup(struct tree *, const char *);
190 unsigned char *utf8data;
191 size_t utf8data_size;
196 /* ------------------------------------------------------------------ */
201 * The UTF-8 encoding spreads the bits of a 32bit word over several
202 * bytes. This table gives the ranges that can be held and how they'd
205 * 0x00000000 0x0000007F: 0xxxxxxx
206 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
207 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
208 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
209 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
210 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
212 * There is an additional requirement on UTF-8, in that only the
213 * shortest representation of a 32bit value is to be used. A decoder
214 * must not decode sequences that do not satisfy this requirement.
215 * Thus the allowed ranges have a lower bound.
217 * 0x00000000 0x0000007F: 0xxxxxxx
218 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
219 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
220 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
221 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
222 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
224 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
225 * 17 planes of 65536 values. This limits the sequences actually seen
226 * even more, to just the following.
229 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
230 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
231 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
233 * Even within those ranges not all values are allowed: the surrogates
234 * 0xd800 - 0xdfff should never be seen.
236 * Note that the longest sequence seen with valid usage is 4 bytes,
237 * the same a single UTF-32 character. This makes the UTF-8
238 * representation of Unicode strictly smaller than UTF-32.
240 * The shortest sequence requirement was introduced by:
241 * Corrigendum #1: UTF-8 Shortest Form
242 * It can be found here:
243 * http://www.unicode.org/versions/corrigendum1.html
247 #define UTF8_2_BITS 0xC0
248 #define UTF8_3_BITS 0xE0
249 #define UTF8_4_BITS 0xF0
250 #define UTF8_N_BITS 0x80
251 #define UTF8_2_MASK 0xE0
252 #define UTF8_3_MASK 0xF0
253 #define UTF8_4_MASK 0xF8
254 #define UTF8_N_MASK 0xC0
255 #define UTF8_V_MASK 0x3F
256 #define UTF8_V_SHIFT 6
258 static int utf8encode(char *str, unsigned int val)
265 } else if (val < 0x800) {
266 str[1] = val & UTF8_V_MASK;
267 str[1] |= UTF8_N_BITS;
268 val >>= UTF8_V_SHIFT;
270 str[0] |= UTF8_2_BITS;
272 } else if (val < 0x10000) {
273 str[2] = val & UTF8_V_MASK;
274 str[2] |= UTF8_N_BITS;
275 val >>= UTF8_V_SHIFT;
276 str[1] = val & UTF8_V_MASK;
277 str[1] |= UTF8_N_BITS;
278 val >>= UTF8_V_SHIFT;
280 str[0] |= UTF8_3_BITS;
282 } else if (val < 0x110000) {
283 str[3] = val & UTF8_V_MASK;
284 str[3] |= UTF8_N_BITS;
285 val >>= UTF8_V_SHIFT;
286 str[2] = val & UTF8_V_MASK;
287 str[2] |= UTF8_N_BITS;
288 val >>= UTF8_V_SHIFT;
289 str[1] = val & UTF8_V_MASK;
290 str[1] |= UTF8_N_BITS;
291 val >>= UTF8_V_SHIFT;
293 str[0] |= UTF8_4_BITS;
296 printf("%#x: illegal val\n", val);
302 static unsigned int utf8decode(const char *str)
304 const unsigned char *s = (const unsigned char*)str;
305 unsigned int unichar = 0;
309 } else if (*s < UTF8_3_BITS) {
310 unichar = *s++ & 0x1F;
311 unichar <<= UTF8_V_SHIFT;
312 unichar |= *s & 0x3F;
313 } else if (*s < UTF8_4_BITS) {
314 unichar = *s++ & 0x0F;
315 unichar <<= UTF8_V_SHIFT;
316 unichar |= *s++ & 0x3F;
317 unichar <<= UTF8_V_SHIFT;
318 unichar |= *s & 0x3F;
320 unichar = *s++ & 0x0F;
321 unichar <<= UTF8_V_SHIFT;
322 unichar |= *s++ & 0x3F;
323 unichar <<= UTF8_V_SHIFT;
324 unichar |= *s++ & 0x3F;
325 unichar <<= UTF8_V_SHIFT;
326 unichar |= *s & 0x3F;
331 static int utf32valid(unsigned int unichar)
333 return unichar < 0x110000;
345 int (*leaf_equal)(void *, void *);
346 void (*leaf_print)(void *, int);
347 int (*leaf_mark)(void *);
348 int (*leaf_size)(void *);
349 int *(*leaf_index)(struct tree *, void *);
350 unsigned char *(*leaf_emit)(void *, unsigned char *);
351 int leafindex[0x110000];
363 unsigned char bitnum;
364 unsigned char nextbyte;
365 unsigned char leftnode;
366 unsigned char rightnode;
367 unsigned int keybits;
368 unsigned int keymask;
372 * Example lookup function for a tree.
374 static void *lookup(struct tree *tree, const char *key)
380 while (!leaf && node) {
383 if (*key & (1 << (node->bitnum & 7))) {
385 if (node->rightnode == NODE) {
387 } else if (node->rightnode == LEAF) {
394 if (node->leftnode == NODE) {
396 } else if (node->leftnode == LEAF) {
408 * A simple non-recursive tree walker: keep track of visits to the
409 * left and right branches in the leftmask and rightmask.
411 static void tree_walk(struct tree *tree)
414 unsigned int leftmask;
415 unsigned int rightmask;
416 unsigned int bitmask;
418 int nodes, singletons, leaves;
420 nodes = singletons = leaves = 0;
422 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
423 if (tree->childnode == LEAF) {
425 tree->leaf_print(tree->root, indent);
428 assert(tree->childnode == NODE);
430 leftmask = rightmask = 0;
432 printf("%*snode @ %p bitnum %d nextbyte %d"
433 " left %p right %p mask %x bits %x\n",
435 node->bitnum, node->nextbyte,
436 node->left, node->right,
437 node->keymask, node->keybits);
439 if (!(node->left && node->right))
443 bitmask = 1 << node->bitnum;
444 if ((leftmask & bitmask) == 0) {
446 if (node->leftnode == LEAF) {
448 tree->leaf_print(node->left,
451 } else if (node->left) {
452 assert(node->leftnode == NODE);
458 if ((rightmask & bitmask) == 0) {
459 rightmask |= bitmask;
460 if (node->rightnode == LEAF) {
462 tree->leaf_print(node->right,
465 } else if (node->right) {
466 assert(node->rightnode==NODE);
472 leftmask &= ~bitmask;
473 rightmask &= ~bitmask;
479 printf("nodes %d leaves %d singletons %d\n",
480 nodes, leaves, singletons);
484 * Allocate an initialize a new internal node.
486 static struct node *alloc_node(struct node *parent)
491 node = malloc(sizeof(*node));
492 node->left = node->right = NULL;
493 node->parent = parent;
494 node->leftnode = NODE;
495 node->rightnode = NODE;
504 bitnum = parent->bitnum;
505 if ((bitnum & 7) == 0) {
506 node->bitnum = bitnum + 7 + 8;
509 node->bitnum = bitnum - 1;
521 * Insert a new leaf into the tree, and collapse any subtrees that are
522 * fully populated and end in identical leaves. A nextbyte tagged
523 * internal node will not be removed to preserve the tree's integrity.
524 * Note that due to the structure of utf8, no nextbyte tagged node
525 * will be a candidate for removal.
527 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
534 assert(keylen >= 1 && keylen <= 4);
537 cursor = &tree->root;
538 keybits = 8 * keylen;
540 /* Insert, creating path along the way. */
543 *cursor = alloc_node(node);
547 if (*key & (1 << (node->bitnum & 7)))
548 cursor = &node->right;
550 cursor = &node->left;
555 /* Merge subtrees if possible. */
557 if (*key & (1 << (node->bitnum & 7)))
558 node->rightnode = LEAF;
560 node->leftnode = LEAF;
563 if (node->leftnode == NODE || node->rightnode == NODE)
568 if (! tree->leaf_equal(node->left, node->right))
570 /* Keep left, drop right leaf. */
572 /* Check in parent */
573 parent = node->parent;
577 tree->childnode = LEAF;
578 } else if (parent->left == node) {
580 parent->leftnode = LEAF;
585 parent->keymask |= (1 << node->bitnum);
587 } else if (parent->right == node) {
588 parent->right = leaf;
589 parent->rightnode = LEAF;
594 parent->keymask |= (1 << node->bitnum);
595 parent->keybits |= (1 << node->bitnum);
598 /* internal tree error */
605 /* Propagate keymasks up along singleton chains. */
607 parent = node->parent;
610 /* Nix the mask for parents with two children. */
611 if (node->keymask == 0) {
614 } else if (parent->left && parent->right) {
618 assert((parent->keymask & node->keymask) == 0);
619 parent->keymask |= node->keymask;
620 parent->keymask |= (1 << parent->bitnum);
621 parent->keybits |= node->keybits;
623 parent->keybits |= (1 << parent->bitnum);
632 * Prune internal nodes.
634 * Fully populated subtrees that end at the same leaf have already
635 * been collapsed. There are still internal nodes that have for both
636 * their left and right branches a sequence of singletons that make
637 * identical choices and end in identical leaves. The keymask and
638 * keybits collected in the nodes describe the choices made in these
639 * singleton chains. When they are identical for the left and right
640 * branch of a node, and the two leaves comare identical, the node in
641 * question can be removed.
643 * Note that nodes with the nextbyte tag set will not be removed by
644 * this to ensure tree integrity. Note as well that the structure of
645 * utf8 ensures that these nodes would not have been candidates for
646 * removal in any case.
648 static void prune(struct tree *tree)
656 unsigned int leftmask;
657 unsigned int rightmask;
658 unsigned int bitmask;
662 printf("Pruning %s_%x\n", tree->type, tree->maxage);
665 if (tree->childnode == LEAF)
670 leftmask = rightmask = 0;
675 if (node->leftnode == LEAF)
677 if (node->rightnode == LEAF)
685 if (left->keymask == 0)
687 if (right->keymask == 0)
689 if (left->keymask != right->keymask)
691 if (left->keybits != right->keybits)
695 assert(left->left || left->right);
696 if (left->leftnode == LEAF)
697 leftleaf = left->left;
698 else if (left->rightnode == LEAF)
699 leftleaf = left->right;
702 else if (left->right)
709 assert(right->left || right->right);
710 if (right->leftnode == LEAF)
711 rightleaf = right->left;
712 else if (right->rightnode == LEAF)
713 rightleaf = right->right;
714 else if (right->left)
716 else if (right->right)
717 right = right->right;
721 if (! tree->leaf_equal(leftleaf, rightleaf))
724 * This node has identical singleton-only subtrees.
727 parent = node->parent;
730 if (parent->left == node)
732 else if (parent->right == node)
733 parent->right = left;
736 left->parent = parent;
737 left->keymask |= (1 << node->bitnum);
740 bitmask = 1 << node->bitnum;
741 leftmask &= ~bitmask;
742 rightmask &= ~bitmask;
743 if (node->leftnode == NODE && node->left) {
748 } else if (node->rightnode == NODE && node->right) {
757 /* Propagate keymasks up along singleton chains. */
760 bitmask = 1 << node->bitnum;
761 leftmask &= ~bitmask;
762 rightmask &= ~bitmask;
764 if (node->left && node->right)
768 node->keymask |= left->keymask;
769 node->keybits |= left->keybits;
773 node->keymask |= right->keymask;
774 node->keybits |= right->keybits;
776 node->keymask |= (1 << node->bitnum);
779 bitmask = 1 << node->bitnum;
780 leftmask &= ~bitmask;
781 rightmask &= ~bitmask;
784 bitmask = 1 << node->bitnum;
785 if ((leftmask & bitmask) == 0 &&
786 node->leftnode == NODE &&
790 } else if ((rightmask & bitmask) == 0 &&
791 node->rightnode == NODE &&
793 rightmask |= bitmask;
796 leftmask &= ~bitmask;
797 rightmask &= ~bitmask;
802 printf("Pruned %d nodes\n", count);
806 * Mark the nodes in the tree that lead to leaves that must be
809 static void mark_nodes(struct tree *tree)
813 unsigned int leftmask;
814 unsigned int rightmask;
815 unsigned int bitmask;
820 printf("Marking %s_%x\n", tree->type, tree->maxage);
821 if (tree->childnode == LEAF)
824 assert(tree->childnode == NODE);
826 leftmask = rightmask = 0;
828 bitmask = 1 << node->bitnum;
829 if ((leftmask & bitmask) == 0) {
831 if (node->leftnode == LEAF) {
833 if (tree->leaf_mark(node->left)) {
835 while (n && !n->mark) {
841 } else if (node->left) {
842 assert(node->leftnode == NODE);
847 if ((rightmask & bitmask) == 0) {
848 rightmask |= bitmask;
849 if (node->rightnode == LEAF) {
851 if (tree->leaf_mark(node->right)) {
853 while (n && !n->mark) {
859 } else if (node->right) {
860 assert(node->rightnode==NODE);
865 leftmask &= ~bitmask;
866 rightmask &= ~bitmask;
870 /* second pass: left siblings and singletons */
872 assert(tree->childnode == NODE);
874 leftmask = rightmask = 0;
876 bitmask = 1 << node->bitnum;
877 if ((leftmask & bitmask) == 0) {
879 if (node->leftnode == LEAF) {
881 if (tree->leaf_mark(node->left)) {
883 while (n && !n->mark) {
889 } else if (node->left) {
890 assert(node->leftnode == NODE);
892 if (!node->mark && node->parent->mark) {
899 if ((rightmask & bitmask) == 0) {
900 rightmask |= bitmask;
901 if (node->rightnode == LEAF) {
903 if (tree->leaf_mark(node->right)) {
905 while (n && !n->mark) {
911 } else if (node->right) {
912 assert(node->rightnode==NODE);
914 if (!node->mark && node->parent->mark &&
915 !node->parent->left) {
922 leftmask &= ~bitmask;
923 rightmask &= ~bitmask;
928 printf("Marked %d nodes\n", marked);
932 * Compute the index of each node and leaf, which is the offset in the
933 * emitted trie. These values must be pre-computed because relative
934 * offsets between nodes are used to navigate the tree.
936 static int index_nodes(struct tree *tree, int index)
939 unsigned int leftmask;
940 unsigned int rightmask;
941 unsigned int bitmask;
945 /* Align to a cache line (or half a cache line?). */
953 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
954 if (tree->childnode == LEAF) {
955 index += tree->leaf_size(tree->root);
959 assert(tree->childnode == NODE);
961 leftmask = rightmask = 0;
966 if (node->index != index)
971 bitmask = 1 << node->bitnum;
972 if (node->mark && (leftmask & bitmask) == 0) {
974 if (node->leftnode == LEAF) {
976 *tree->leaf_index(tree, node->left) =
978 index += tree->leaf_size(node->left);
980 } else if (node->left) {
981 assert(node->leftnode == NODE);
987 if (node->mark && (rightmask & bitmask) == 0) {
988 rightmask |= bitmask;
989 if (node->rightnode == LEAF) {
991 *tree->leaf_index(tree, node->right) = index;
992 index += tree->leaf_size(node->right);
994 } else if (node->right) {
995 assert(node->rightnode==NODE);
1001 leftmask &= ~bitmask;
1002 rightmask &= ~bitmask;
1003 node = node->parent;
1008 /* Round up to a multiple of 16 */
1012 printf("Final index %d\n", index);
1017 * Compute the size of nodes and leaves. We start by assuming that
1018 * each node needs to store a three-byte offset. The indexes of the
1019 * nodes are calculated based on that, and then this function is
1020 * called to see if the sizes of some nodes can be reduced. This is
1021 * repeated until no more changes are seen.
1023 static int size_nodes(struct tree *tree)
1029 unsigned int leftmask;
1030 unsigned int rightmask;
1031 unsigned int bitmask;
1032 unsigned int pathbits;
1033 unsigned int pathmask;
1044 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1045 if (tree->childnode == LEAF)
1048 assert(tree->childnode == NODE);
1052 leftmask = rightmask = 0;
1057 if (!node->left || !node->right) {
1060 if (node->rightnode == NODE) {
1061 right = node->right;
1063 while (!right->mark) {
1066 while (n->bitnum != node->bitnum) {
1067 if (pathbits & (1<<n->bitnum))
1073 assert(right->bitnum == n->bitnum);
1077 offset = right->index - node->index;
1079 offset = *tree->leaf_index(tree, node->right);
1080 offset -= node->index;
1082 assert(offset >= 0);
1083 assert(offset <= 0xffffff);
1084 if (offset <= 0xff) {
1086 } else if (offset <= 0xffff) {
1088 } else { /* offset <= 0xffffff */
1092 if (node->size != size || node->offset != offset) {
1094 node->offset = offset;
1099 bitmask = 1 << node->bitnum;
1100 pathmask |= bitmask;
1101 if (node->mark && (leftmask & bitmask) == 0) {
1102 leftmask |= bitmask;
1103 if (node->leftnode == LEAF) {
1105 } else if (node->left) {
1106 assert(node->leftnode == NODE);
1112 if (node->mark && (rightmask & bitmask) == 0) {
1113 rightmask |= bitmask;
1114 pathbits |= bitmask;
1115 if (node->rightnode == LEAF) {
1116 assert(node->right);
1117 } else if (node->right) {
1118 assert(node->rightnode==NODE);
1124 leftmask &= ~bitmask;
1125 rightmask &= ~bitmask;
1126 pathmask &= ~bitmask;
1127 pathbits &= ~bitmask;
1128 node = node->parent;
1134 printf("Found %d changes\n", changed);
1139 * Emit a trie for the given tree into the data array.
1141 static void emit(struct tree *tree, unsigned char *data)
1144 unsigned int leftmask;
1145 unsigned int rightmask;
1146 unsigned int bitmask;
1153 index = tree->index;
1157 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1158 if (tree->childnode == LEAF) {
1160 tree->leaf_emit(tree->root, data);
1164 assert(tree->childnode == NODE);
1166 leftmask = rightmask = 0;
1170 assert(node->offset != -1);
1171 assert(node->index == index);
1176 byte |= (node->bitnum & BITNUM);
1177 if (node->left && node->right) {
1178 if (node->leftnode == NODE)
1180 if (node->rightnode == NODE)
1182 if (node->offset <= 0xff)
1184 else if (node->offset <= 0xffff)
1188 offset = node->offset;
1189 byte |= offlen << OFFLEN_SHIFT;
1193 *data++ = offset & 0xff;
1197 } else if (node->left) {
1198 if (node->leftnode == NODE)
1202 } else if (node->right) {
1204 if (node->rightnode == NODE)
1213 bitmask = 1 << node->bitnum;
1214 if (node->mark && (leftmask & bitmask) == 0) {
1215 leftmask |= bitmask;
1216 if (node->leftnode == LEAF) {
1218 data = tree->leaf_emit(node->left,
1220 index += tree->leaf_size(node->left);
1221 } else if (node->left) {
1222 assert(node->leftnode == NODE);
1228 if (node->mark && (rightmask & bitmask) == 0) {
1229 rightmask |= bitmask;
1230 if (node->rightnode == LEAF) {
1231 assert(node->right);
1232 data = tree->leaf_emit(node->right,
1234 index += tree->leaf_size(node->right);
1235 } else if (node->right) {
1236 assert(node->rightnode==NODE);
1242 leftmask &= ~bitmask;
1243 rightmask &= ~bitmask;
1244 node = node->parent;
1250 /* ------------------------------------------------------------------ */
1255 * We need to keep track of the Canonical Combining Class, the Age,
1256 * and decompositions for a code point.
1258 * For the Age, we store the index into the ages table. Effectively
1259 * this is a generation number that the table maps to a unicode
1262 * The correction field is used to indicate that this entry is in the
1263 * corrections array, which contains decompositions that were
1264 * corrected in later revisions. The value of the correction field is
1265 * the Unicode version in which the mapping was corrected.
1267 struct unicode_data {
1272 unsigned int *utf32nfdi;
1273 unsigned int *utf32nfdicf;
1278 struct unicode_data unicode_data[0x110000];
1279 struct unicode_data *corrections;
1280 int corrections_count;
1282 struct tree *nfdi_tree;
1283 struct tree *nfdicf_tree;
1289 * Check the corrections array to see if this entry was corrected at
1292 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1296 for (i = 0; i != corrections_count; i++)
1297 if (u->code == corrections[i].code)
1298 return &corrections[i];
1302 static int nfdi_equal(void *l, void *r)
1304 struct unicode_data *left = l;
1305 struct unicode_data *right = r;
1307 if (left->gen != right->gen)
1309 if (left->ccc != right->ccc)
1311 if (left->utf8nfdi && right->utf8nfdi &&
1312 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1314 if (left->utf8nfdi || right->utf8nfdi)
1319 static int nfdicf_equal(void *l, void *r)
1321 struct unicode_data *left = l;
1322 struct unicode_data *right = r;
1324 if (left->gen != right->gen)
1326 if (left->ccc != right->ccc)
1328 if (left->utf8nfdicf && right->utf8nfdicf &&
1329 strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1331 if (left->utf8nfdicf && right->utf8nfdicf)
1333 if (left->utf8nfdicf || right->utf8nfdicf)
1335 if (left->utf8nfdi && right->utf8nfdi &&
1336 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1338 if (left->utf8nfdi || right->utf8nfdi)
1343 static void nfdi_print(void *l, int indent)
1345 struct unicode_data *leaf = l;
1347 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1348 leaf->code, leaf->ccc, leaf->gen);
1350 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1354 static void nfdicf_print(void *l, int indent)
1356 struct unicode_data *leaf = l;
1358 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1359 leaf->code, leaf->ccc, leaf->gen);
1360 if (leaf->utf8nfdicf)
1361 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1362 else if (leaf->utf8nfdi)
1363 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1367 static int nfdi_mark(void *l)
1372 static int nfdicf_mark(void *l)
1374 struct unicode_data *leaf = l;
1376 if (leaf->utf8nfdicf)
1381 static int correction_mark(void *l)
1383 struct unicode_data *leaf = l;
1385 return leaf->correction;
1388 static int nfdi_size(void *l)
1390 struct unicode_data *leaf = l;
1394 size += strlen(leaf->utf8nfdi) + 1;
1398 static int nfdicf_size(void *l)
1400 struct unicode_data *leaf = l;
1403 if (leaf->utf8nfdicf)
1404 size += strlen(leaf->utf8nfdicf) + 1;
1405 else if (leaf->utf8nfdi)
1406 size += strlen(leaf->utf8nfdi) + 1;
1410 static int *nfdi_index(struct tree *tree, void *l)
1412 struct unicode_data *leaf = l;
1414 return &tree->leafindex[leaf->code];
1417 static int *nfdicf_index(struct tree *tree, void *l)
1419 struct unicode_data *leaf = l;
1421 return &tree->leafindex[leaf->code];
1424 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1426 struct unicode_data *leaf = l;
1429 *data++ = leaf->gen;
1430 if (leaf->utf8nfdi) {
1431 *data++ = DECOMPOSE;
1432 s = (unsigned char*)leaf->utf8nfdi;
1433 while ((*data++ = *s++) != 0)
1436 *data++ = leaf->ccc;
1441 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1443 struct unicode_data *leaf = l;
1446 *data++ = leaf->gen;
1447 if (leaf->utf8nfdicf) {
1448 *data++ = DECOMPOSE;
1449 s = (unsigned char*)leaf->utf8nfdicf;
1450 while ((*data++ = *s++) != 0)
1452 } else if (leaf->utf8nfdi) {
1453 *data++ = DECOMPOSE;
1454 s = (unsigned char*)leaf->utf8nfdi;
1455 while ((*data++ = *s++) != 0)
1458 *data++ = leaf->ccc;
1463 static void utf8_create(struct unicode_data *data)
1471 um = data->utf32nfdi;
1473 for (i = 0; um[i]; i++)
1474 u += utf8encode(u, um[i]);
1476 data->utf8nfdi = strdup(utf);
1479 um = data->utf32nfdicf;
1481 for (i = 0; um[i]; i++)
1482 u += utf8encode(u, um[i]);
1484 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1485 data->utf8nfdicf = strdup(utf);
1489 static void utf8_init(void)
1491 unsigned int unichar;
1494 for (unichar = 0; unichar != 0x110000; unichar++)
1495 utf8_create(&unicode_data[unichar]);
1497 for (i = 0; i != corrections_count; i++)
1498 utf8_create(&corrections[i]);
1501 static void trees_init(void)
1503 struct unicode_data *data;
1504 unsigned int maxage;
1505 unsigned int nextage;
1510 /* Count the number of different ages. */
1512 nextage = (unsigned int)-1;
1516 for (i = 0; i <= corrections_count; i++) {
1517 data = &corrections[i];
1518 if (nextage < data->correction &&
1519 data->correction < maxage)
1520 nextage = data->correction;
1525 /* Two trees per age: nfdi and nfdicf */
1526 trees_count = count * 2;
1527 trees = calloc(trees_count, sizeof(struct tree));
1529 /* Assign ages to the trees. */
1530 count = trees_count;
1531 nextage = (unsigned int)-1;
1534 trees[--count].maxage = maxage;
1535 trees[--count].maxage = maxage;
1537 for (i = 0; i <= corrections_count; i++) {
1538 data = &corrections[i];
1539 if (nextage < data->correction &&
1540 data->correction < maxage)
1541 nextage = data->correction;
1545 /* The ages assigned above are off by one. */
1546 for (i = 0; i != trees_count; i++) {
1548 while (ages[j] < trees[i].maxage)
1550 trees[i].maxage = ages[j-1];
1553 /* Set up the forwarding between trees. */
1554 trees[trees_count-2].next = &trees[trees_count-1];
1555 trees[trees_count-1].leaf_mark = nfdi_mark;
1556 trees[trees_count-2].leaf_mark = nfdicf_mark;
1557 for (i = 0; i != trees_count-2; i += 2) {
1558 trees[i].next = &trees[trees_count-2];
1559 trees[i].leaf_mark = correction_mark;
1560 trees[i+1].next = &trees[trees_count-1];
1561 trees[i+1].leaf_mark = correction_mark;
1564 /* Assign the callouts. */
1565 for (i = 0; i != trees_count; i += 2) {
1566 trees[i].type = "nfdicf";
1567 trees[i].leaf_equal = nfdicf_equal;
1568 trees[i].leaf_print = nfdicf_print;
1569 trees[i].leaf_size = nfdicf_size;
1570 trees[i].leaf_index = nfdicf_index;
1571 trees[i].leaf_emit = nfdicf_emit;
1573 trees[i+1].type = "nfdi";
1574 trees[i+1].leaf_equal = nfdi_equal;
1575 trees[i+1].leaf_print = nfdi_print;
1576 trees[i+1].leaf_size = nfdi_size;
1577 trees[i+1].leaf_index = nfdi_index;
1578 trees[i+1].leaf_emit = nfdi_emit;
1582 for (i = 0; i != trees_count; i++)
1583 trees[i].childnode = NODE;
1586 static void trees_populate(void)
1588 struct unicode_data *data;
1589 unsigned int unichar;
1594 for (i = 0; i != trees_count; i++) {
1596 printf("Populating %s_%x\n",
1597 trees[i].type, trees[i].maxage);
1599 for (unichar = 0; unichar != 0x110000; unichar++) {
1600 if (unicode_data[unichar].gen < 0)
1602 keylen = utf8encode(keyval, unichar);
1603 data = corrections_lookup(&unicode_data[unichar]);
1604 if (data->correction <= trees[i].maxage)
1605 data = &unicode_data[unichar];
1606 insert(&trees[i], keyval, keylen, data);
1611 static void trees_reduce(void)
1617 for (i = 0; i != trees_count; i++)
1619 for (i = 0; i != trees_count; i++)
1620 mark_nodes(&trees[i]);
1623 for (i = 0; i != trees_count; i++)
1624 size = index_nodes(&trees[i], size);
1626 for (i = 0; i != trees_count; i++)
1627 changed += size_nodes(&trees[i]);
1630 utf8data = calloc(size, 1);
1631 utf8data_size = size;
1632 for (i = 0; i != trees_count; i++)
1633 emit(&trees[i], utf8data);
1636 for (i = 0; i != trees_count; i++) {
1637 printf("%s_%x idx %d\n",
1638 trees[i].type, trees[i].maxage, trees[i].index);
1642 nfdi = utf8data + trees[trees_count-1].index;
1643 nfdicf = utf8data + trees[trees_count-2].index;
1645 nfdi_tree = &trees[trees_count-1];
1646 nfdicf_tree = &trees[trees_count-2];
1649 static void verify(struct tree *tree)
1651 struct unicode_data *data;
1653 unsigned int unichar;
1659 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1660 nocf = strcmp(tree->type, "nfdicf");
1662 for (unichar = 0; unichar != 0x110000; unichar++) {
1664 data = corrections_lookup(&unicode_data[unichar]);
1665 if (data->correction <= tree->maxage)
1666 data = &unicode_data[unichar];
1667 utf8encode(key,unichar);
1668 leaf = utf8lookup(tree, key);
1670 if (data->gen != -1)
1672 if (unichar < 0xd800 || unichar > 0xdfff)
1675 if (unichar >= 0xd800 && unichar <= 0xdfff)
1677 if (data->gen == -1)
1679 if (data->gen != LEAF_GEN(leaf))
1681 if (LEAF_CCC(leaf) == DECOMPOSE) {
1683 if (!data->utf8nfdi) {
1685 } else if (strcmp(data->utf8nfdi,
1690 if (!data->utf8nfdicf &&
1693 } else if (data->utf8nfdicf) {
1694 if (strcmp(data->utf8nfdicf,
1697 } else if (strcmp(data->utf8nfdi,
1702 } else if (data->ccc != LEAF_CCC(leaf)) {
1707 printf("%X code %X gen %d ccc %d"
1709 unichar, data->code, data->gen,
1713 printf(" gen %d ccc %d"
1717 LEAF_CCC(leaf) == DECOMPOSE ?
1718 LEAF_STR(leaf) : "");
1725 static void trees_verify(void)
1729 for (i = 0; i != trees_count; i++)
1733 /* ------------------------------------------------------------------ */
1735 static void help(void)
1737 printf("Usage: %s [options]\n", argv0);
1739 printf("This program creates an a data trie used for parsing and\n");
1740 printf("normalization of UTF-8 strings. The trie is derived from\n");
1741 printf("a set of input files from the Unicode character database\n");
1742 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1744 printf("The generated tree supports two normalization forms:\n");
1746 printf("\tnfdi:\n");
1747 printf("\t- Apply unicode normalization form NFD.\n");
1748 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1750 printf("\tnfdicf:\n");
1751 printf("\t- Apply unicode normalization form NFD.\n");
1752 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1753 printf("\t- Apply a full casefold (C + F).\n");
1755 printf("These forms were chosen as being most useful when dealing\n");
1756 printf("with file names: NFD catches most cases where characters\n");
1757 printf("should be considered equivalent. The ignorables are mostly\n");
1758 printf("invisible, making names hard to type.\n");
1760 printf("The options to specify the files to be used are listed\n");
1761 printf("below with their default values, which are the names used\n");
1762 printf("by version 11.0.0 of the Unicode Character Database.\n");
1764 printf("The input files:\n");
1765 printf("\t-a %s\n", AGE_NAME);
1766 printf("\t-c %s\n", CCC_NAME);
1767 printf("\t-p %s\n", PROP_NAME);
1768 printf("\t-d %s\n", DATA_NAME);
1769 printf("\t-f %s\n", FOLD_NAME);
1770 printf("\t-n %s\n", NORM_NAME);
1772 printf("Additionally, the generated tables are tested using:\n");
1773 printf("\t-t %s\n", TEST_NAME);
1775 printf("Finally, the output file:\n");
1776 printf("\t-o %s\n", UTF8_NAME);
1780 static void usage(void)
1786 static void open_fail(const char *name, int error)
1788 printf("Error %d opening %s: %s\n", error, name, strerror(error));
1792 static void file_fail(const char *filename)
1794 printf("Error parsing %s\n", filename);
1798 static void line_fail(const char *filename, const char *line)
1800 printf("Error parsing %s:%s\n", filename, line);
1804 /* ------------------------------------------------------------------ */
1806 static void print_utf32(unsigned int *utf32str)
1810 for (i = 0; utf32str[i]; i++)
1811 printf(" %X", utf32str[i]);
1814 static void print_utf32nfdi(unsigned int unichar)
1816 printf(" %X ->", unichar);
1817 print_utf32(unicode_data[unichar].utf32nfdi);
1821 static void print_utf32nfdicf(unsigned int unichar)
1823 printf(" %X ->", unichar);
1824 print_utf32(unicode_data[unichar].utf32nfdicf);
1828 /* ------------------------------------------------------------------ */
1830 static void age_init(void)
1835 unsigned int unichar;
1838 unsigned int revision;
1844 printf("Parsing %s\n", age_name);
1846 file = fopen(age_name, "r");
1848 open_fail(age_name, errno);
1852 while (fgets(line, LINESIZE, file)) {
1853 ret = sscanf(line, "# Age=V%d_%d_%d",
1854 &major, &minor, &revision);
1858 printf(" Age V%d_%d_%d\n",
1859 major, minor, revision);
1860 if (!age_valid(major, minor, revision))
1861 line_fail(age_name, line);
1864 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1868 printf(" Age V%d_%d\n", major, minor);
1869 if (!age_valid(major, minor, 0))
1870 line_fail(age_name, line);
1875 /* We must have found something above. */
1877 printf("%d age entries\n", ages_count);
1878 if (ages_count == 0 || ages_count > MAXGEN)
1879 file_fail(age_name);
1881 /* There is a 0 entry. */
1883 ages = calloc(ages_count + 1, sizeof(*ages));
1884 /* And a guard entry. */
1885 ages[ages_count] = (unsigned int)-1;
1890 while (fgets(line, LINESIZE, file)) {
1891 ret = sscanf(line, "# Age=V%d_%d_%d",
1892 &major, &minor, &revision);
1895 UNICODE_AGE(major, minor, revision);
1897 printf(" Age V%d_%d_%d = gen %d\n",
1898 major, minor, revision, gen);
1899 if (!age_valid(major, minor, revision))
1900 line_fail(age_name, line);
1903 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1905 ages[++gen] = UNICODE_AGE(major, minor, 0);
1907 printf(" Age V%d_%d = %d\n",
1909 if (!age_valid(major, minor, 0))
1910 line_fail(age_name, line);
1913 ret = sscanf(line, "%X..%X ; %d.%d #",
1914 &first, &last, &major, &minor);
1916 for (unichar = first; unichar <= last; unichar++)
1917 unicode_data[unichar].gen = gen;
1918 count += 1 + last - first;
1920 printf(" %X..%X gen %d\n", first, last, gen);
1921 if (!utf32valid(first) || !utf32valid(last))
1922 line_fail(age_name, line);
1925 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
1927 unicode_data[unichar].gen = gen;
1930 printf(" %X gen %d\n", unichar, gen);
1931 if (!utf32valid(unichar))
1932 line_fail(age_name, line);
1936 unicode_maxage = ages[gen];
1939 /* Nix surrogate block */
1941 printf(" Removing surrogate block D800..DFFF\n");
1942 for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
1943 unicode_data[unichar].gen = -1;
1946 printf("Found %d entries\n", count);
1948 file_fail(age_name);
1951 static void ccc_init(void)
1956 unsigned int unichar;
1962 printf("Parsing %s\n", ccc_name);
1964 file = fopen(ccc_name, "r");
1966 open_fail(ccc_name, errno);
1969 while (fgets(line, LINESIZE, file)) {
1970 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
1972 for (unichar = first; unichar <= last; unichar++) {
1973 unicode_data[unichar].ccc = value;
1977 printf(" %X..%X ccc %d\n", first, last, value);
1978 if (!utf32valid(first) || !utf32valid(last))
1979 line_fail(ccc_name, line);
1982 ret = sscanf(line, "%X ; %d #", &unichar, &value);
1984 unicode_data[unichar].ccc = value;
1987 printf(" %X ccc %d\n", unichar, value);
1988 if (!utf32valid(unichar))
1989 line_fail(ccc_name, line);
1996 printf("Found %d entries\n", count);
1998 file_fail(ccc_name);
2001 static int ignore_compatibility_form(char *type)
2004 char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2005 "final", "isolated", "circle", "super",
2006 "sub", "vertical", "wide", "narrow",
2007 "small", "square", "fraction", "compat"};
2009 for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2010 if (strcmp(type, ignored_types[i]) == 0)
2015 static void nfdi_init(void)
2018 unsigned int unichar;
2019 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2028 printf("Parsing %s\n", data_name);
2029 file = fopen(data_name, "r");
2031 open_fail(data_name, errno);
2034 while (fgets(line, LINESIZE, file)) {
2035 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2039 if (!utf32valid(unichar))
2040 line_fail(data_name, line);
2043 /* skip over <tag> */
2046 while (*++s != '>');
2048 if(ignore_compatibility_form(type))
2051 /* decode the decomposition into UTF-32 */
2054 mapping[i] = strtoul(s, &s, 16);
2055 if (!utf32valid(mapping[i]))
2056 line_fail(data_name, line);
2061 um = malloc(i * sizeof(unsigned int));
2062 memcpy(um, mapping, i * sizeof(unsigned int));
2063 unicode_data[unichar].utf32nfdi = um;
2066 print_utf32nfdi(unichar);
2071 printf("Found %d entries\n", count);
2073 file_fail(data_name);
2076 static void nfdicf_init(void)
2079 unsigned int unichar;
2080 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2089 printf("Parsing %s\n", fold_name);
2090 file = fopen(fold_name, "r");
2092 open_fail(fold_name, errno);
2095 while (fgets(line, LINESIZE, file)) {
2096 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2099 if (!utf32valid(unichar))
2100 line_fail(fold_name, line);
2101 /* Use the C+F casefold. */
2102 if (status != 'C' && status != 'F')
2110 mapping[i] = strtoul(s, &s, 16);
2111 if (!utf32valid(mapping[i]))
2112 line_fail(fold_name, line);
2117 um = malloc(i * sizeof(unsigned int));
2118 memcpy(um, mapping, i * sizeof(unsigned int));
2119 unicode_data[unichar].utf32nfdicf = um;
2122 print_utf32nfdicf(unichar);
2127 printf("Found %d entries\n", count);
2129 file_fail(fold_name);
2132 static void ignore_init(void)
2135 unsigned int unichar;
2143 printf("Parsing %s\n", prop_name);
2144 file = fopen(prop_name, "r");
2146 open_fail(prop_name, errno);
2149 while (fgets(line, LINESIZE, file)) {
2150 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2152 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2154 if (!utf32valid(first) || !utf32valid(last))
2155 line_fail(prop_name, line);
2156 for (unichar = first; unichar <= last; unichar++) {
2157 free(unicode_data[unichar].utf32nfdi);
2158 um = malloc(sizeof(unsigned int));
2160 unicode_data[unichar].utf32nfdi = um;
2161 free(unicode_data[unichar].utf32nfdicf);
2162 um = malloc(sizeof(unsigned int));
2164 unicode_data[unichar].utf32nfdicf = um;
2168 printf(" %X..%X Default_Ignorable_Code_Point\n",
2172 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2174 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2176 if (!utf32valid(unichar))
2177 line_fail(prop_name, line);
2178 free(unicode_data[unichar].utf32nfdi);
2179 um = malloc(sizeof(unsigned int));
2181 unicode_data[unichar].utf32nfdi = um;
2182 free(unicode_data[unichar].utf32nfdicf);
2183 um = malloc(sizeof(unsigned int));
2185 unicode_data[unichar].utf32nfdicf = um;
2187 printf(" %X Default_Ignorable_Code_Point\n",
2196 printf("Found %d entries\n", count);
2198 file_fail(prop_name);
2201 static void corrections_init(void)
2204 unsigned int unichar;
2207 unsigned int revision;
2210 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2217 printf("Parsing %s\n", norm_name);
2218 file = fopen(norm_name, "r");
2220 open_fail(norm_name, errno);
2223 while (fgets(line, LINESIZE, file)) {
2224 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2225 &unichar, buf0, buf1,
2226 &major, &minor, &revision);
2229 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2230 line_fail(norm_name, line);
2233 corrections = calloc(count, sizeof(struct unicode_data));
2234 corrections_count = count;
2238 while (fgets(line, LINESIZE, file)) {
2239 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2240 &unichar, buf0, buf1,
2241 &major, &minor, &revision);
2244 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2245 line_fail(norm_name, line);
2246 corrections[count] = unicode_data[unichar];
2247 assert(corrections[count].code == unichar);
2248 age = UNICODE_AGE(major, minor, revision);
2249 corrections[count].correction = age;
2254 mapping[i] = strtoul(s, &s, 16);
2255 if (!utf32valid(mapping[i]))
2256 line_fail(norm_name, line);
2261 um = malloc(i * sizeof(unsigned int));
2262 memcpy(um, mapping, i * sizeof(unsigned int));
2263 corrections[count].utf32nfdi = um;
2266 printf(" %X -> %s -> %s V%d_%d_%d\n",
2267 unichar, buf0, buf1, major, minor, revision);
2273 printf("Found %d entries\n", count);
2275 file_fail(norm_name);
2278 /* ------------------------------------------------------------------ */
2281 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2283 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2284 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2293 * NCount = 588 (VCount * TCount)
2294 * SCount = 11172 (LCount * NCount)
2297 * SIndex = s - SBase
2299 * LV (Canonical/Full)
2300 * LIndex = SIndex / NCount
2301 * VIndex = (Sindex % NCount) / TCount
2302 * LPart = LBase + LIndex
2303 * VPart = VBase + VIndex
2306 * LVIndex = (SIndex / TCount) * TCount
2307 * TIndex = (Sindex % TCount)
2308 * LVPart = SBase + LVIndex
2309 * TPart = TBase + TIndex
2312 * LIndex = SIndex / NCount
2313 * VIndex = (Sindex % NCount) / TCount
2314 * TIndex = (Sindex % TCount)
2315 * LPart = LBase + LIndex
2316 * VPart = VBase + VIndex
2317 * if (TIndex == 0) {
2318 * d = <LPart, VPart>
2320 * TPart = TBase + TIndex
2321 * d = <LPart, VPart, TPart>
2327 hangul_decompose(void)
2329 unsigned int sb = 0xAC00;
2330 unsigned int lb = 0x1100;
2331 unsigned int vb = 0x1161;
2332 unsigned int tb = 0x11a7;
2333 /* unsigned int lc = 19; */
2334 unsigned int vc = 21;
2335 unsigned int tc = 28;
2336 unsigned int nc = (vc * tc);
2337 /* unsigned int sc = (lc * nc); */
2338 unsigned int unichar;
2339 unsigned int mapping[4];
2345 printf("Decomposing hangul\n");
2348 for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2349 unsigned int si = unichar - sb;
2350 unsigned int li = si / nc;
2351 unsigned int vi = (si % nc) / tc;
2352 unsigned int ti = si % tc;
2355 mapping[i++] = lb + li;
2356 mapping[i++] = vb + vi;
2358 mapping[i++] = tb + ti;
2361 assert(!unicode_data[unichar].utf32nfdi);
2362 um = malloc(i * sizeof(unsigned int));
2363 memcpy(um, mapping, i * sizeof(unsigned int));
2364 unicode_data[unichar].utf32nfdi = um;
2366 assert(!unicode_data[unichar].utf32nfdicf);
2367 um = malloc(i * sizeof(unsigned int));
2368 memcpy(um, mapping, i * sizeof(unsigned int));
2369 unicode_data[unichar].utf32nfdicf = um;
2372 print_utf32nfdi(unichar);
2377 printf("Created %d entries\n", count);
2380 static void nfdi_decompose(void)
2382 unsigned int unichar;
2383 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2392 printf("Decomposing nfdi\n");
2395 for (unichar = 0; unichar != 0x110000; unichar++) {
2396 if (!unicode_data[unichar].utf32nfdi)
2401 um = unicode_data[unichar].utf32nfdi;
2403 dc = unicode_data[*um].utf32nfdi;
2405 for (j = 0; dc[j]; j++)
2406 mapping[i++] = dc[j];
2416 free(unicode_data[unichar].utf32nfdi);
2417 um = malloc(i * sizeof(unsigned int));
2418 memcpy(um, mapping, i * sizeof(unsigned int));
2419 unicode_data[unichar].utf32nfdi = um;
2421 /* Add this decomposition to nfdicf if there is no entry. */
2422 if (!unicode_data[unichar].utf32nfdicf) {
2423 um = malloc(i * sizeof(unsigned int));
2424 memcpy(um, mapping, i * sizeof(unsigned int));
2425 unicode_data[unichar].utf32nfdicf = um;
2428 print_utf32nfdi(unichar);
2432 printf("Processed %d entries\n", count);
2435 static void nfdicf_decompose(void)
2437 unsigned int unichar;
2438 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2447 printf("Decomposing nfdicf\n");
2449 for (unichar = 0; unichar != 0x110000; unichar++) {
2450 if (!unicode_data[unichar].utf32nfdicf)
2455 um = unicode_data[unichar].utf32nfdicf;
2457 dc = unicode_data[*um].utf32nfdicf;
2459 for (j = 0; dc[j]; j++)
2460 mapping[i++] = dc[j];
2470 free(unicode_data[unichar].utf32nfdicf);
2471 um = malloc(i * sizeof(unsigned int));
2472 memcpy(um, mapping, i * sizeof(unsigned int));
2473 unicode_data[unichar].utf32nfdicf = um;
2476 print_utf32nfdicf(unichar);
2480 printf("Processed %d entries\n", count);
2483 /* ------------------------------------------------------------------ */
2485 int utf8agemax(struct tree *, const char *);
2486 int utf8nagemax(struct tree *, const char *, size_t);
2487 int utf8agemin(struct tree *, const char *);
2488 int utf8nagemin(struct tree *, const char *, size_t);
2489 ssize_t utf8len(struct tree *, const char *);
2490 ssize_t utf8nlen(struct tree *, const char *, size_t);
2492 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2493 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2494 int utf8byte(struct utf8cursor *);
2497 * Use trie to scan s, touching at most len bytes.
2498 * Returns the leaf if one exists, NULL otherwise.
2500 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2501 * is well-formed and corresponds to a known unicode code point. The
2502 * shorthand for this will be "is valid UTF-8 unicode".
2504 static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
2517 trie = utf8data + tree->index;
2519 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2520 if (*trie & NEXTBYTE) {
2525 mask = 1 << (*trie & BITNUM);
2529 /* Right node at offset of trie */
2530 node = (*trie & RIGHTNODE);
2531 offset = trie[offlen];
2534 offset |= trie[offlen];
2537 } else if (*trie & RIGHTPATH) {
2538 /* Right node after this node */
2539 node = (*trie & TRIENODE);
2542 /* No right node. */
2548 /* Left node after this node. */
2549 node = (*trie & LEFTNODE);
2551 } else if (*trie & RIGHTPATH) {
2555 /* Left node after this node */
2556 node = (*trie & TRIENODE);
2565 * Use trie to scan s.
2566 * Returns the leaf if one exists, NULL otherwise.
2568 * Forwards to trie_nlookup().
2570 static utf8leaf_t *utf8lookup(struct tree *tree, const char *s)
2572 return utf8nlookup(tree, s, (size_t)-1);
2576 * Return the number of bytes used by the current UTF-8 sequence.
2577 * Assumes the input points to the first byte of a valid UTF-8
2580 static inline int utf8clen(const char *s)
2582 unsigned char c = *s;
2583 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2587 * Maximum age of any character in s.
2588 * Return -1 if s is not valid UTF-8 unicode.
2589 * Return 0 if only non-assigned code points are used.
2591 int utf8agemax(struct tree *tree, const char *s)
2600 if (!(leaf = utf8lookup(tree, s)))
2602 leaf_age = ages[LEAF_GEN(leaf)];
2603 if (leaf_age <= tree->maxage && leaf_age > age)
2611 * Minimum age of any character in s.
2612 * Return -1 if s is not valid UTF-8 unicode.
2613 * Return 0 if non-assigned code points are used.
2615 int utf8agemin(struct tree *tree, const char *s)
2625 if (!(leaf = utf8lookup(tree, s)))
2627 leaf_age = ages[LEAF_GEN(leaf)];
2628 if (leaf_age <= tree->maxage && leaf_age < age)
2636 * Maximum age of any character in s, touch at most len bytes.
2637 * Return -1 if s is not valid UTF-8 unicode.
2639 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2648 if (!(leaf = utf8nlookup(tree, s, len)))
2650 leaf_age = ages[LEAF_GEN(leaf)];
2651 if (leaf_age <= tree->maxage && leaf_age > age)
2660 * Maximum age of any character in s, touch at most len bytes.
2661 * Return -1 if s is not valid UTF-8 unicode.
2663 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2673 if (!(leaf = utf8nlookup(tree, s, len)))
2675 leaf_age = ages[LEAF_GEN(leaf)];
2676 if (leaf_age <= tree->maxage && leaf_age < age)
2685 * Length of the normalization of s.
2686 * Return -1 if s is not valid UTF-8 unicode.
2688 * A string of Default_Ignorable_Code_Point has length 0.
2690 ssize_t utf8len(struct tree *tree, const char *s)
2698 if (!(leaf = utf8lookup(tree, s)))
2700 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2702 else if (LEAF_CCC(leaf) == DECOMPOSE)
2703 ret += strlen(LEAF_STR(leaf));
2712 * Length of the normalization of s, touch at most len bytes.
2713 * Return -1 if s is not valid UTF-8 unicode.
2715 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2723 if (!(leaf = utf8nlookup(tree, s, len)))
2725 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2727 else if (LEAF_CCC(leaf) == DECOMPOSE)
2728 ret += strlen(LEAF_STR(leaf));
2738 * Cursor structure used by the normalizer.
2750 unsigned int unichar;
2754 * Set up an utf8cursor for use by utf8byte().
2757 * len : length of s.
2758 * u8c : pointer to cursor.
2759 * trie : utf8trie_t to use for normalization.
2761 * Returns -1 on error, 0 on success.
2763 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2778 u8c->nccc = STOPPER;
2780 /* Check we didn't clobber the maximum length. */
2781 if (u8c->len != len)
2783 /* The first byte of s may not be an utf8 continuation. */
2784 if (len > 0 && (*s & 0xC0) == 0x80)
2790 * Set up an utf8cursor for use by utf8byte().
2792 * s : NUL-terminated string.
2793 * u8c : pointer to cursor.
2794 * trie : utf8trie_t to use for normalization.
2796 * Returns -1 on error, 0 on success.
2798 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
2800 return utf8ncursor(u8c, tree, s, (unsigned int)-1);
2804 * Get one byte from the normalized form of the string described by u8c.
2806 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
2808 * The cursor keeps track of the location in the string in u8c->s.
2809 * When a character is decomposed, the current location is stored in
2810 * u8c->p, and u8c->s is set to the start of the decomposition. Note
2811 * that bytes from a decomposition do not count against u8c->len.
2813 * Characters are emitted if they match the current CCC in u8c->ccc.
2814 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
2815 * and the function returns 0 in that case.
2817 * Sorting by CCC is done by repeatedly scanning the string. The
2818 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
2819 * the start of the scan. The first pass finds the lowest CCC to be
2820 * emitted and stores it in u8c->nccc, the second pass emits the
2821 * characters with this CCC and finds the next lowest CCC. This limits
2822 * the number of passes to 1 + the number of different CCCs in the
2823 * sequence being scanned.
2826 * u8c->p != NULL -> a decomposition is being scanned.
2827 * u8c->ss != NULL -> this is a repeating scan.
2828 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
2830 int utf8byte(struct utf8cursor *u8c)
2836 /* Check for the end of a decomposed character. */
2837 if (u8c->p && *u8c->s == '\0') {
2842 /* Check for end-of-string. */
2843 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
2844 /* There is no next byte. */
2845 if (u8c->ccc == STOPPER)
2847 /* End-of-string during a scan counts as a stopper. */
2850 } else if ((*u8c->s & 0xC0) == 0x80) {
2851 /* This is a continuation of the current character. */
2854 return (unsigned char)*u8c->s++;
2857 /* Look up the data for the current character. */
2859 leaf = utf8lookup(u8c->tree, u8c->s);
2861 leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len);
2863 /* No leaf found implies that the input is a binary blob. */
2867 /* Characters that are too new have CCC 0. */
2868 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
2870 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
2871 u8c->len -= utf8clen(u8c->s);
2872 u8c->p = u8c->s + utf8clen(u8c->s);
2873 u8c->s = LEAF_STR(leaf);
2874 /* Empty decomposition implies CCC 0. */
2875 if (*u8c->s == '\0') {
2876 if (u8c->ccc == STOPPER)
2881 leaf = utf8lookup(u8c->tree, u8c->s);
2882 ccc = LEAF_CCC(leaf);
2884 u8c->unichar = utf8decode(u8c->s);
2887 * If this is not a stopper, then see if it updates
2888 * the next canonical class to be emitted.
2890 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
2894 * Return the current byte if this is the current
2897 if (ccc == u8c->ccc) {
2900 return (unsigned char)*u8c->s++;
2903 /* Current combining class mismatch. */
2905 if (u8c->nccc == STOPPER) {
2907 * Scan forward for the first canonical class
2908 * to be emitted. Save the position from
2911 assert(u8c->ccc == STOPPER);
2912 u8c->ccc = MINCCC - 1;
2916 u8c->slen = u8c->len;
2918 u8c->len -= utf8clen(u8c->s);
2919 u8c->s += utf8clen(u8c->s);
2920 } else if (ccc != STOPPER) {
2921 /* Not a stopper, and not the ccc we're emitting. */
2923 u8c->len -= utf8clen(u8c->s);
2924 u8c->s += utf8clen(u8c->s);
2925 } else if (u8c->nccc != MAXCCC + 1) {
2926 /* At a stopper, restart for next ccc. */
2927 u8c->ccc = u8c->nccc;
2928 u8c->nccc = MAXCCC + 1;
2931 u8c->len = u8c->slen;
2933 /* All done, proceed from here. */
2935 u8c->nccc = STOPPER;
2943 /* ------------------------------------------------------------------ */
2945 static int normalize_line(struct tree *tree)
2950 struct utf8cursor u8c;
2952 /* First test: null-terminated string. */
2955 if (utf8cursor(&u8c, tree, s))
2957 while ((c = utf8byte(&u8c)) > 0)
2958 if (c != (unsigned char)*t++)
2965 /* Second test: length-limited string. */
2967 /* Replace NUL with a value that will cause an error if seen. */
2968 s[strlen(s) + 1] = -1;
2970 if (utf8cursor(&u8c, tree, s))
2972 while ((c = utf8byte(&u8c)) > 0)
2973 if (c != (unsigned char)*t++)
2983 static void normalization_test(void)
2986 unsigned int unichar;
2987 struct unicode_data *data;
2996 printf("Parsing %s\n", test_name);
2997 /* Step one, read data from file. */
2998 file = fopen(test_name, "r");
3000 open_fail(test_name, errno);
3002 while (fgets(line, LINESIZE, file)) {
3003 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3005 if (ret != 2 || *line == '#')
3010 unichar = strtoul(s, &s, 16);
3011 t += utf8encode(t, unichar);
3019 unichar = strtoul(s, &s, 16);
3020 data = &unicode_data[unichar];
3021 if (data->utf8nfdi && !*data->utf8nfdi)
3024 t += utf8encode(t, unichar);
3029 if (normalize_line(nfdi_tree) < 0) {
3030 printf("Line %s -> %s", buf0, buf1);
3032 printf(" (ignorables removed)");
3033 printf(" failure\n");
3039 printf("Ran %d tests with %d failures\n", tests, failures);
3041 file_fail(test_name);
3044 /* ------------------------------------------------------------------ */
3046 static void write_file(void)
3055 printf("Writing %s\n", utf8_name);
3056 file = fopen(utf8_name, "w");
3058 open_fail(utf8_name, errno);
3060 fprintf(file, "/* This file is generated code, do not edit. */\n");
3061 fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3062 fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3063 fprintf(file, "#endif\n");
3064 fprintf(file, "\n");
3065 fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3067 fprintf(file, "\n");
3068 fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3069 for (i = 0; i != ages_count; i++)
3070 fprintf(file, "\t%#x%s\n", ages[i],
3071 ages[i] == unicode_maxage ? "" : ",");
3072 fprintf(file, "};\n");
3073 fprintf(file, "\n");
3074 fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3076 for (gen = 0; gen < ages_count; gen++) {
3077 fprintf(file, "\t{ %#x, %d }%s\n",
3078 ages[gen], trees[t].index,
3079 ages[gen] == unicode_maxage ? "" : ",");
3080 if (trees[t].maxage == ages[gen])
3083 fprintf(file, "};\n");
3084 fprintf(file, "\n");
3085 fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3087 for (gen = 0; gen < ages_count; gen++) {
3088 fprintf(file, "\t{ %#x, %d }%s\n",
3089 ages[gen], trees[t].index,
3090 ages[gen] == unicode_maxage ? "" : ",");
3091 if (trees[t].maxage == ages[gen])
3094 fprintf(file, "};\n");
3095 fprintf(file, "\n");
3096 fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3099 for (i = 0; i != utf8data_size; i += 16) {
3100 if (i == trees[t].index) {
3101 fprintf(file, "\t/* %s_%x */\n",
3102 trees[t].type, trees[t].maxage);
3103 if (t < trees_count-1)
3106 fprintf(file, "\t");
3107 for (j = i; j != i + 16; j++)
3108 fprintf(file, "0x%.2x%s", utf8data[j],
3109 (j < utf8data_size -1 ? "," : ""));
3110 fprintf(file, "\n");
3112 fprintf(file, "};\n");
3116 /* ------------------------------------------------------------------ */
3118 int main(int argc, char *argv[])
3120 unsigned int unichar;
3125 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3164 for (unichar = 0; unichar != 0x110000; unichar++)
3165 unicode_data[unichar].code = unichar;
3180 /* Prevent "unused function" warning. */
3181 (void)lookup(nfdi_tree, " ");
3183 tree_walk(nfdi_tree);
3185 tree_walk(nfdicf_tree);
3186 normalization_test();