unicode: introduce UTF-8 character database
[linux-2.6-microblaze.git] / scripts / mkutf8data.c
1 /*
2  * Copyright (c) 2014 SGI.
3  * All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 /* Generator for a compact trie for unicode normalization */
20
21 #include <sys/types.h>
22 #include <stddef.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <assert.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <errno.h>
29
30 /* Default names of the in- and output files. */
31
32 #define AGE_NAME        "DerivedAge.txt"
33 #define CCC_NAME        "DerivedCombiningClass.txt"
34 #define PROP_NAME       "DerivedCoreProperties.txt"
35 #define DATA_NAME       "UnicodeData.txt"
36 #define FOLD_NAME       "CaseFolding.txt"
37 #define NORM_NAME       "NormalizationCorrections.txt"
38 #define TEST_NAME       "NormalizationTest.txt"
39 #define UTF8_NAME       "utf8data.h"
40
41 const char      *age_name  = AGE_NAME;
42 const char      *ccc_name  = CCC_NAME;
43 const char      *prop_name = PROP_NAME;
44 const char      *data_name = DATA_NAME;
45 const char      *fold_name = FOLD_NAME;
46 const char      *norm_name = NORM_NAME;
47 const char      *test_name = TEST_NAME;
48 const char      *utf8_name = UTF8_NAME;
49
50 int verbose = 0;
51
52 /* An arbitrary line size limit on input lines. */
53
54 #define LINESIZE        1024
55 char line[LINESIZE];
56 char buf0[LINESIZE];
57 char buf1[LINESIZE];
58 char buf2[LINESIZE];
59 char buf3[LINESIZE];
60
61 const char *argv0;
62
63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
64
65 /* ------------------------------------------------------------------ */
66
67 /*
68  * Unicode version numbers consist of three parts: major, minor, and a
69  * revision.  These numbers are packed into an unsigned int to obtain
70  * a single version number.
71  *
72  * To save space in the generated trie, the unicode version is not
73  * stored directly, instead we calculate a generation number from the
74  * unicode versions seen in the DerivedAge file, and use that as an
75  * index into a table of unicode versions.
76  */
77 #define UNICODE_MAJ_SHIFT               (16)
78 #define UNICODE_MIN_SHIFT               (8)
79
80 #define UNICODE_MAJ_MAX                 ((unsigned short)-1)
81 #define UNICODE_MIN_MAX                 ((unsigned char)-1)
82 #define UNICODE_REV_MAX                 ((unsigned char)-1)
83
84 #define UNICODE_AGE(MAJ,MIN,REV)                        \
85         (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
86          ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
87          ((unsigned int)(REV)))
88
89 unsigned int *ages;
90 int ages_count;
91
92 unsigned int unicode_maxage;
93
94 static int age_valid(unsigned int major, unsigned int minor,
95                      unsigned int revision)
96 {
97         if (major > UNICODE_MAJ_MAX)
98                 return 0;
99         if (minor > UNICODE_MIN_MAX)
100                 return 0;
101         if (revision > UNICODE_REV_MAX)
102                 return 0;
103         return 1;
104 }
105
106 /* ------------------------------------------------------------------ */
107
108 /*
109  * utf8trie_t
110  *
111  * A compact binary tree, used to decode UTF-8 characters.
112  *
113  * Internal nodes are one byte for the node itself, and up to three
114  * bytes for an offset into the tree.  The first byte contains the
115  * following information:
116  *  NEXTBYTE  - flag        - advance to next byte if set
117  *  BITNUM    - 3 bit field - the bit number to tested
118  *  OFFLEN    - 2 bit field - number of bytes in the offset
119  * if offlen == 0 (non-branching node)
120  *  RIGHTPATH - 1 bit field - set if the following node is for the
121  *                            right-hand path (tested bit is set)
122  *  TRIENODE  - 1 bit field - set if the following node is an internal
123  *                            node, otherwise it is a leaf node
124  * if offlen != 0 (branching node)
125  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
126  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
127  *
128  * Due to the way utf8 works, there cannot be branching nodes with
129  * NEXTBYTE set, and moreover those nodes always have a righthand
130  * descendant.
131  */
132 typedef unsigned char utf8trie_t;
133 #define BITNUM          0x07
134 #define NEXTBYTE        0x08
135 #define OFFLEN          0x30
136 #define OFFLEN_SHIFT    4
137 #define RIGHTPATH       0x40
138 #define TRIENODE        0x80
139 #define RIGHTNODE       0x40
140 #define LEFTNODE        0x80
141
142 /*
143  * utf8leaf_t
144  *
145  * The leaves of the trie are embedded in the trie, and so the same
146  * underlying datatype, unsigned char.
147  *
148  * leaf[0]: The unicode version, stored as a generation number that is
149  *          an index into utf8agetab[].  With this we can filter code
150  *          points based on the unicode version in which they were
151  *          defined.  The CCC of a non-defined code point is 0.
152  * leaf[1]: Canonical Combining Class. During normalization, we need
153  *          to do a stable sort into ascending order of all characters
154  *          with a non-zero CCC that occur between two characters with
155  *          a CCC of 0, or at the begin or end of a string.
156  *          The unicode standard guarantees that all CCC values are
157  *          between 0 and 254 inclusive, which leaves 255 available as
158  *          a special value.
159  *          Code points with CCC 0 are known as stoppers.
160  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161  *          start of a NUL-terminated string that is the decomposition
162  *          of the character.
163  *          The CCC of a decomposable character is the same as the CCC
164  *          of the first character of its decomposition.
165  *          Some characters decompose as the empty string: these are
166  *          characters with the Default_Ignorable_Code_Point property.
167  *          These do affect normalization, as they all have CCC 0.
168  *
169  * The decompositions in the trie have been fully expanded.
170  *
171  * Casefolding, if applicable, is also done using decompositions.
172  */
173 typedef unsigned char utf8leaf_t;
174
175 #define LEAF_GEN(LEAF)  ((LEAF)[0])
176 #define LEAF_CCC(LEAF)  ((LEAF)[1])
177 #define LEAF_STR(LEAF)  ((const char*)((LEAF) + 2))
178
179 #define MAXGEN          (255)
180
181 #define MINCCC          (0)
182 #define MAXCCC          (254)
183 #define STOPPER         (0)
184 #define DECOMPOSE       (255)
185
186 struct tree;
187 static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t);
188 static utf8leaf_t *utf8lookup(struct tree *, const char *);
189
190 unsigned char *utf8data;
191 size_t utf8data_size;
192
193 utf8trie_t *nfdi;
194 utf8trie_t *nfdicf;
195
196 /* ------------------------------------------------------------------ */
197
198 /*
199  * UTF8 valid ranges.
200  *
201  * The UTF-8 encoding spreads the bits of a 32bit word over several
202  * bytes. This table gives the ranges that can be held and how they'd
203  * be represented.
204  *
205  * 0x00000000 0x0000007F: 0xxxxxxx
206  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
207  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
208  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
209  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
210  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
211  *
212  * There is an additional requirement on UTF-8, in that only the
213  * shortest representation of a 32bit value is to be used.  A decoder
214  * must not decode sequences that do not satisfy this requirement.
215  * Thus the allowed ranges have a lower bound.
216  *
217  * 0x00000000 0x0000007F: 0xxxxxxx
218  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
219  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
220  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
221  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
222  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
223  *
224  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
225  * 17 planes of 65536 values.  This limits the sequences actually seen
226  * even more, to just the following.
227  *
228  *          0 -     0x7f: 0                     0x7f
229  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
230  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
231  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
232  *
233  * Even within those ranges not all values are allowed: the surrogates
234  * 0xd800 - 0xdfff should never be seen.
235  *
236  * Note that the longest sequence seen with valid usage is 4 bytes,
237  * the same a single UTF-32 character.  This makes the UTF-8
238  * representation of Unicode strictly smaller than UTF-32.
239  *
240  * The shortest sequence requirement was introduced by:
241  *    Corrigendum #1: UTF-8 Shortest Form
242  * It can be found here:
243  *    http://www.unicode.org/versions/corrigendum1.html
244  *
245  */
246
247 #define UTF8_2_BITS     0xC0
248 #define UTF8_3_BITS     0xE0
249 #define UTF8_4_BITS     0xF0
250 #define UTF8_N_BITS     0x80
251 #define UTF8_2_MASK     0xE0
252 #define UTF8_3_MASK     0xF0
253 #define UTF8_4_MASK     0xF8
254 #define UTF8_N_MASK     0xC0
255 #define UTF8_V_MASK     0x3F
256 #define UTF8_V_SHIFT    6
257
258 static int utf8encode(char *str, unsigned int val)
259 {
260         int len;
261
262         if (val < 0x80) {
263                 str[0] = val;
264                 len = 1;
265         } else if (val < 0x800) {
266                 str[1] = val & UTF8_V_MASK;
267                 str[1] |= UTF8_N_BITS;
268                 val >>= UTF8_V_SHIFT;
269                 str[0] = val;
270                 str[0] |= UTF8_2_BITS;
271                 len = 2;
272         } else if (val < 0x10000) {
273                 str[2] = val & UTF8_V_MASK;
274                 str[2] |= UTF8_N_BITS;
275                 val >>= UTF8_V_SHIFT;
276                 str[1] = val & UTF8_V_MASK;
277                 str[1] |= UTF8_N_BITS;
278                 val >>= UTF8_V_SHIFT;
279                 str[0] = val;
280                 str[0] |= UTF8_3_BITS;
281                 len = 3;
282         } else if (val < 0x110000) {
283                 str[3] = val & UTF8_V_MASK;
284                 str[3] |= UTF8_N_BITS;
285                 val >>= UTF8_V_SHIFT;
286                 str[2] = val & UTF8_V_MASK;
287                 str[2] |= UTF8_N_BITS;
288                 val >>= UTF8_V_SHIFT;
289                 str[1] = val & UTF8_V_MASK;
290                 str[1] |= UTF8_N_BITS;
291                 val >>= UTF8_V_SHIFT;
292                 str[0] = val;
293                 str[0] |= UTF8_4_BITS;
294                 len = 4;
295         } else {
296                 printf("%#x: illegal val\n", val);
297                 len = 0;
298         }
299         return len;
300 }
301
302 static unsigned int utf8decode(const char *str)
303 {
304         const unsigned char *s = (const unsigned char*)str;
305         unsigned int unichar = 0;
306
307         if (*s < 0x80) {
308                 unichar = *s;
309         } else if (*s < UTF8_3_BITS) {
310                 unichar = *s++ & 0x1F;
311                 unichar <<= UTF8_V_SHIFT;
312                 unichar |= *s & 0x3F;
313         } else if (*s < UTF8_4_BITS) {
314                 unichar = *s++ & 0x0F;
315                 unichar <<= UTF8_V_SHIFT;
316                 unichar |= *s++ & 0x3F;
317                 unichar <<= UTF8_V_SHIFT;
318                 unichar |= *s & 0x3F;
319         } else {
320                 unichar = *s++ & 0x0F;
321                 unichar <<= UTF8_V_SHIFT;
322                 unichar |= *s++ & 0x3F;
323                 unichar <<= UTF8_V_SHIFT;
324                 unichar |= *s++ & 0x3F;
325                 unichar <<= UTF8_V_SHIFT;
326                 unichar |= *s & 0x3F;
327         }
328         return unichar;
329 }
330
331 static int utf32valid(unsigned int unichar)
332 {
333         return unichar < 0x110000;
334 }
335
336 #define NODE 1
337 #define LEAF 0
338
339 struct tree {
340         void *root;
341         int childnode;
342         const char *type;
343         unsigned int maxage;
344         struct tree *next;
345         int (*leaf_equal)(void *, void *);
346         void (*leaf_print)(void *, int);
347         int (*leaf_mark)(void *);
348         int (*leaf_size)(void *);
349         int *(*leaf_index)(struct tree *, void *);
350         unsigned char *(*leaf_emit)(void *, unsigned char *);
351         int leafindex[0x110000];
352         int index;
353 };
354
355 struct node {
356         int index;
357         int offset;
358         int mark;
359         int size;
360         struct node *parent;
361         void *left;
362         void *right;
363         unsigned char bitnum;
364         unsigned char nextbyte;
365         unsigned char leftnode;
366         unsigned char rightnode;
367         unsigned int keybits;
368         unsigned int keymask;
369 };
370
371 /*
372  * Example lookup function for a tree.
373  */
374 static void *lookup(struct tree *tree, const char *key)
375 {
376         struct node *node;
377         void *leaf = NULL;
378
379         node = tree->root;
380         while (!leaf && node) {
381                 if (node->nextbyte)
382                         key++;
383                 if (*key & (1 << (node->bitnum & 7))) {
384                         /* Right leg */
385                         if (node->rightnode == NODE) {
386                                 node = node->right;
387                         } else if (node->rightnode == LEAF) {
388                                 leaf = node->right;
389                         } else {
390                                 node = NULL;
391                         }
392                 } else {
393                         /* Left leg */
394                         if (node->leftnode == NODE) {
395                                 node = node->left;
396                         } else if (node->leftnode == LEAF) {
397                                 leaf = node->left;
398                         } else {
399                                 node = NULL;
400                         }
401                 }
402         }
403
404         return leaf;
405 }
406
407 /*
408  * A simple non-recursive tree walker: keep track of visits to the
409  * left and right branches in the leftmask and rightmask.
410  */
411 static void tree_walk(struct tree *tree)
412 {
413         struct node *node;
414         unsigned int leftmask;
415         unsigned int rightmask;
416         unsigned int bitmask;
417         int indent = 1;
418         int nodes, singletons, leaves;
419
420         nodes = singletons = leaves = 0;
421
422         printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
423         if (tree->childnode == LEAF) {
424                 assert(tree->root);
425                 tree->leaf_print(tree->root, indent);
426                 leaves = 1;
427         } else {
428                 assert(tree->childnode == NODE);
429                 node = tree->root;
430                 leftmask = rightmask = 0;
431                 while (node) {
432                         printf("%*snode @ %p bitnum %d nextbyte %d"
433                                " left %p right %p mask %x bits %x\n",
434                                 indent, "", node,
435                                 node->bitnum, node->nextbyte,
436                                 node->left, node->right,
437                                 node->keymask, node->keybits);
438                         nodes += 1;
439                         if (!(node->left && node->right))
440                                 singletons += 1;
441
442                         while (node) {
443                                 bitmask = 1 << node->bitnum;
444                                 if ((leftmask & bitmask) == 0) {
445                                         leftmask |= bitmask;
446                                         if (node->leftnode == LEAF) {
447                                                 assert(node->left);
448                                                 tree->leaf_print(node->left,
449                                                                  indent+1);
450                                                 leaves += 1;
451                                         } else if (node->left) {
452                                                 assert(node->leftnode == NODE);
453                                                 indent += 1;
454                                                 node = node->left;
455                                                 break;
456                                         }
457                                 }
458                                 if ((rightmask & bitmask) == 0) {
459                                         rightmask |= bitmask;
460                                         if (node->rightnode == LEAF) {
461                                                 assert(node->right);
462                                                 tree->leaf_print(node->right,
463                                                                  indent+1);
464                                                 leaves += 1;
465                                         } else if (node->right) {
466                                                 assert(node->rightnode==NODE);
467                                                 indent += 1;
468                                                 node = node->right;
469                                                 break;
470                                         }
471                                 }
472                                 leftmask &= ~bitmask;
473                                 rightmask &= ~bitmask;
474                                 node = node->parent;
475                                 indent -= 1;
476                         }
477                 }
478         }
479         printf("nodes %d leaves %d singletons %d\n",
480                nodes, leaves, singletons);
481 }
482
483 /*
484  * Allocate an initialize a new internal node.
485  */
486 static struct node *alloc_node(struct node *parent)
487 {
488         struct node *node;
489         int bitnum;
490
491         node = malloc(sizeof(*node));
492         node->left = node->right = NULL;
493         node->parent = parent;
494         node->leftnode = NODE;
495         node->rightnode = NODE;
496         node->keybits = 0;
497         node->keymask = 0;
498         node->mark = 0;
499         node->index = 0;
500         node->offset = -1;
501         node->size = 4;
502
503         if (node->parent) {
504                 bitnum = parent->bitnum;
505                 if ((bitnum & 7) == 0) {
506                         node->bitnum = bitnum + 7 + 8;
507                         node->nextbyte = 1;
508                 } else {
509                         node->bitnum = bitnum - 1;
510                         node->nextbyte = 0;
511                 }
512         } else {
513                 node->bitnum = 7;
514                 node->nextbyte = 0;
515         }
516
517         return node;
518 }
519
520 /*
521  * Insert a new leaf into the tree, and collapse any subtrees that are
522  * fully populated and end in identical leaves. A nextbyte tagged
523  * internal node will not be removed to preserve the tree's integrity.
524  * Note that due to the structure of utf8, no nextbyte tagged node
525  * will be a candidate for removal.
526  */
527 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
528 {
529         struct node *node;
530         struct node *parent;
531         void **cursor;
532         int keybits;
533
534         assert(keylen >= 1 && keylen <= 4);
535
536         node = NULL;
537         cursor = &tree->root;
538         keybits = 8 * keylen;
539
540         /* Insert, creating path along the way. */
541         while (keybits) {
542                 if (!*cursor)
543                         *cursor = alloc_node(node);
544                 node = *cursor;
545                 if (node->nextbyte)
546                         key++;
547                 if (*key & (1 << (node->bitnum & 7)))
548                         cursor = &node->right;
549                 else
550                         cursor = &node->left;
551                 keybits--;
552         }
553         *cursor = leaf;
554
555         /* Merge subtrees if possible. */
556         while (node) {
557                 if (*key & (1 << (node->bitnum & 7)))
558                         node->rightnode = LEAF;
559                 else
560                         node->leftnode = LEAF;
561                 if (node->nextbyte)
562                         break;
563                 if (node->leftnode == NODE || node->rightnode == NODE)
564                         break;
565                 assert(node->left);
566                 assert(node->right);
567                 /* Compare */
568                 if (! tree->leaf_equal(node->left, node->right))
569                         break;
570                 /* Keep left, drop right leaf. */
571                 leaf = node->left;
572                 /* Check in parent */
573                 parent = node->parent;
574                 if (!parent) {
575                         /* root of tree! */
576                         tree->root = leaf;
577                         tree->childnode = LEAF;
578                 } else if (parent->left == node) {
579                         parent->left = leaf;
580                         parent->leftnode = LEAF;
581                         if (parent->right) {
582                                 parent->keymask = 0;
583                                 parent->keybits = 0;
584                         } else {
585                                 parent->keymask |= (1 << node->bitnum);
586                         }
587                 } else if (parent->right == node) {
588                         parent->right = leaf;
589                         parent->rightnode = LEAF;
590                         if (parent->left) {
591                                 parent->keymask = 0;
592                                 parent->keybits = 0;
593                         } else {
594                                 parent->keymask |= (1 << node->bitnum);
595                                 parent->keybits |= (1 << node->bitnum);
596                         }
597                 } else {
598                         /* internal tree error */
599                         assert(0);
600                 }
601                 free(node);
602                 node = parent;
603         }
604
605         /* Propagate keymasks up along singleton chains. */
606         while (node) {
607                 parent = node->parent;
608                 if (!parent)
609                         break;
610                 /* Nix the mask for parents with two children. */
611                 if (node->keymask == 0) {
612                         parent->keymask = 0;
613                         parent->keybits = 0;
614                 } else if (parent->left && parent->right) {
615                         parent->keymask = 0;
616                         parent->keybits = 0;
617                 } else {
618                         assert((parent->keymask & node->keymask) == 0);
619                         parent->keymask |= node->keymask;
620                         parent->keymask |= (1 << parent->bitnum);
621                         parent->keybits |= node->keybits;
622                         if (parent->right)
623                                 parent->keybits |= (1 << parent->bitnum);
624                 }
625                 node = parent;
626         }
627
628         return 0;
629 }
630
631 /*
632  * Prune internal nodes.
633  *
634  * Fully populated subtrees that end at the same leaf have already
635  * been collapsed.  There are still internal nodes that have for both
636  * their left and right branches a sequence of singletons that make
637  * identical choices and end in identical leaves.  The keymask and
638  * keybits collected in the nodes describe the choices made in these
639  * singleton chains.  When they are identical for the left and right
640  * branch of a node, and the two leaves comare identical, the node in
641  * question can be removed.
642  *
643  * Note that nodes with the nextbyte tag set will not be removed by
644  * this to ensure tree integrity.  Note as well that the structure of
645  * utf8 ensures that these nodes would not have been candidates for
646  * removal in any case.
647  */
648 static void prune(struct tree *tree)
649 {
650         struct node *node;
651         struct node *left;
652         struct node *right;
653         struct node *parent;
654         void *leftleaf;
655         void *rightleaf;
656         unsigned int leftmask;
657         unsigned int rightmask;
658         unsigned int bitmask;
659         int count;
660
661         if (verbose > 0)
662                 printf("Pruning %s_%x\n", tree->type, tree->maxage);
663
664         count = 0;
665         if (tree->childnode == LEAF)
666                 return;
667         if (!tree->root)
668                 return;
669
670         leftmask = rightmask = 0;
671         node = tree->root;
672         while (node) {
673                 if (node->nextbyte)
674                         goto advance;
675                 if (node->leftnode == LEAF)
676                         goto advance;
677                 if (node->rightnode == LEAF)
678                         goto advance;
679                 if (!node->left)
680                         goto advance;
681                 if (!node->right)
682                         goto advance;
683                 left = node->left;
684                 right = node->right;
685                 if (left->keymask == 0)
686                         goto advance;
687                 if (right->keymask == 0)
688                         goto advance;
689                 if (left->keymask != right->keymask)
690                         goto advance;
691                 if (left->keybits != right->keybits)
692                         goto advance;
693                 leftleaf = NULL;
694                 while (!leftleaf) {
695                         assert(left->left || left->right);
696                         if (left->leftnode == LEAF)
697                                 leftleaf = left->left;
698                         else if (left->rightnode == LEAF)
699                                 leftleaf = left->right;
700                         else if (left->left)
701                                 left = left->left;
702                         else if (left->right)
703                                 left = left->right;
704                         else
705                                 assert(0);
706                 }
707                 rightleaf = NULL;
708                 while (!rightleaf) {
709                         assert(right->left || right->right);
710                         if (right->leftnode == LEAF)
711                                 rightleaf = right->left;
712                         else if (right->rightnode == LEAF)
713                                 rightleaf = right->right;
714                         else if (right->left)
715                                 right = right->left;
716                         else if (right->right)
717                                 right = right->right;
718                         else
719                                 assert(0);
720                 }
721                 if (! tree->leaf_equal(leftleaf, rightleaf))
722                         goto advance;
723                 /*
724                  * This node has identical singleton-only subtrees.
725                  * Remove it.
726                  */
727                 parent = node->parent;
728                 left = node->left;
729                 right = node->right;
730                 if (parent->left == node)
731                         parent->left = left;
732                 else if (parent->right == node)
733                         parent->right = left;
734                 else
735                         assert(0);
736                 left->parent = parent;
737                 left->keymask |= (1 << node->bitnum);
738                 node->left = NULL;
739                 while (node) {
740                         bitmask = 1 << node->bitnum;
741                         leftmask &= ~bitmask;
742                         rightmask &= ~bitmask;
743                         if (node->leftnode == NODE && node->left) {
744                                 left = node->left;
745                                 free(node);
746                                 count++;
747                                 node = left;
748                         } else if (node->rightnode == NODE && node->right) {
749                                 right = node->right;
750                                 free(node);
751                                 count++;
752                                 node = right;
753                         } else {
754                                 node = NULL;
755                         }
756                 }
757                 /* Propagate keymasks up along singleton chains. */
758                 node = parent;
759                 /* Force re-check */
760                 bitmask = 1 << node->bitnum;
761                 leftmask &= ~bitmask;
762                 rightmask &= ~bitmask;
763                 for (;;) {
764                         if (node->left && node->right)
765                                 break;
766                         if (node->left) {
767                                 left = node->left;
768                                 node->keymask |= left->keymask;
769                                 node->keybits |= left->keybits;
770                         }
771                         if (node->right) {
772                                 right = node->right;
773                                 node->keymask |= right->keymask;
774                                 node->keybits |= right->keybits;
775                         }
776                         node->keymask |= (1 << node->bitnum);
777                         node = node->parent;
778                         /* Force re-check */
779                         bitmask = 1 << node->bitnum;
780                         leftmask &= ~bitmask;
781                         rightmask &= ~bitmask;
782                 }
783         advance:
784                 bitmask = 1 << node->bitnum;
785                 if ((leftmask & bitmask) == 0 &&
786                     node->leftnode == NODE &&
787                     node->left) {
788                         leftmask |= bitmask;
789                         node = node->left;
790                 } else if ((rightmask & bitmask) == 0 &&
791                            node->rightnode == NODE &&
792                            node->right) {
793                         rightmask |= bitmask;
794                         node = node->right;
795                 } else {
796                         leftmask &= ~bitmask;
797                         rightmask &= ~bitmask;
798                         node = node->parent;
799                 }
800         }
801         if (verbose > 0)
802                 printf("Pruned %d nodes\n", count);
803 }
804
805 /*
806  * Mark the nodes in the tree that lead to leaves that must be
807  * emitted.
808  */
809 static void mark_nodes(struct tree *tree)
810 {
811         struct node *node;
812         struct node *n;
813         unsigned int leftmask;
814         unsigned int rightmask;
815         unsigned int bitmask;
816         int marked;
817
818         marked = 0;
819         if (verbose > 0)
820                 printf("Marking %s_%x\n", tree->type, tree->maxage);
821         if (tree->childnode == LEAF)
822                 goto done;
823
824         assert(tree->childnode == NODE);
825         node = tree->root;
826         leftmask = rightmask = 0;
827         while (node) {
828                 bitmask = 1 << node->bitnum;
829                 if ((leftmask & bitmask) == 0) {
830                         leftmask |= bitmask;
831                         if (node->leftnode == LEAF) {
832                                 assert(node->left);
833                                 if (tree->leaf_mark(node->left)) {
834                                         n = node;
835                                         while (n && !n->mark) {
836                                                 marked++;
837                                                 n->mark = 1;
838                                                 n = n->parent;
839                                         }
840                                 }
841                         } else if (node->left) {
842                                 assert(node->leftnode == NODE);
843                                 node = node->left;
844                                 continue;
845                         }
846                 }
847                 if ((rightmask & bitmask) == 0) {
848                         rightmask |= bitmask;
849                         if (node->rightnode == LEAF) {
850                                 assert(node->right);
851                                 if (tree->leaf_mark(node->right)) {
852                                         n = node;
853                                         while (n && !n->mark) {
854                                                 marked++;
855                                                 n->mark = 1;
856                                                 n = n->parent;
857                                         }
858                                 }
859                         } else if (node->right) {
860                                 assert(node->rightnode==NODE);
861                                 node = node->right;
862                                 continue;
863                         }
864                 }
865                 leftmask &= ~bitmask;
866                 rightmask &= ~bitmask;
867                 node = node->parent;
868         }
869
870         /* second pass: left siblings and singletons */
871
872         assert(tree->childnode == NODE);
873         node = tree->root;
874         leftmask = rightmask = 0;
875         while (node) {
876                 bitmask = 1 << node->bitnum;
877                 if ((leftmask & bitmask) == 0) {
878                         leftmask |= bitmask;
879                         if (node->leftnode == LEAF) {
880                                 assert(node->left);
881                                 if (tree->leaf_mark(node->left)) {
882                                         n = node;
883                                         while (n && !n->mark) {
884                                                 marked++;
885                                                 n->mark = 1;
886                                                 n = n->parent;
887                                         }
888                                 }
889                         } else if (node->left) {
890                                 assert(node->leftnode == NODE);
891                                 node = node->left;
892                                 if (!node->mark && node->parent->mark) {
893                                         marked++;
894                                         node->mark = 1;
895                                 }
896                                 continue;
897                         }
898                 }
899                 if ((rightmask & bitmask) == 0) {
900                         rightmask |= bitmask;
901                         if (node->rightnode == LEAF) {
902                                 assert(node->right);
903                                 if (tree->leaf_mark(node->right)) {
904                                         n = node;
905                                         while (n && !n->mark) {
906                                                 marked++;
907                                                 n->mark = 1;
908                                                 n = n->parent;
909                                         }
910                                 }
911                         } else if (node->right) {
912                                 assert(node->rightnode==NODE);
913                                 node = node->right;
914                                 if (!node->mark && node->parent->mark &&
915                                     !node->parent->left) {
916                                         marked++;
917                                         node->mark = 1;
918                                 }
919                                 continue;
920                         }
921                 }
922                 leftmask &= ~bitmask;
923                 rightmask &= ~bitmask;
924                 node = node->parent;
925         }
926 done:
927         if (verbose > 0)
928                 printf("Marked %d nodes\n", marked);
929 }
930
931 /*
932  * Compute the index of each node and leaf, which is the offset in the
933  * emitted trie.  These values must be pre-computed because relative
934  * offsets between nodes are used to navigate the tree.
935  */
936 static int index_nodes(struct tree *tree, int index)
937 {
938         struct node *node;
939         unsigned int leftmask;
940         unsigned int rightmask;
941         unsigned int bitmask;
942         int count;
943         int indent;
944
945         /* Align to a cache line (or half a cache line?). */
946         while (index % 64)
947                 index++;
948         tree->index = index;
949         indent = 1;
950         count = 0;
951
952         if (verbose > 0)
953                 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
954         if (tree->childnode == LEAF) {
955                 index += tree->leaf_size(tree->root);
956                 goto done;
957         }
958
959         assert(tree->childnode == NODE);
960         node = tree->root;
961         leftmask = rightmask = 0;
962         while (node) {
963                 if (!node->mark)
964                         goto skip;
965                 count++;
966                 if (node->index != index)
967                         node->index = index;
968                 index += node->size;
969 skip:
970                 while (node) {
971                         bitmask = 1 << node->bitnum;
972                         if (node->mark && (leftmask & bitmask) == 0) {
973                                 leftmask |= bitmask;
974                                 if (node->leftnode == LEAF) {
975                                         assert(node->left);
976                                         *tree->leaf_index(tree, node->left) =
977                                                                         index;
978                                         index += tree->leaf_size(node->left);
979                                         count++;
980                                 } else if (node->left) {
981                                         assert(node->leftnode == NODE);
982                                         indent += 1;
983                                         node = node->left;
984                                         break;
985                                 }
986                         }
987                         if (node->mark && (rightmask & bitmask) == 0) {
988                                 rightmask |= bitmask;
989                                 if (node->rightnode == LEAF) {
990                                         assert(node->right);
991                                         *tree->leaf_index(tree, node->right) = index;
992                                         index += tree->leaf_size(node->right);
993                                         count++;
994                                 } else if (node->right) {
995                                         assert(node->rightnode==NODE);
996                                         indent += 1;
997                                         node = node->right;
998                                         break;
999                                 }
1000                         }
1001                         leftmask &= ~bitmask;
1002                         rightmask &= ~bitmask;
1003                         node = node->parent;
1004                         indent -= 1;
1005                 }
1006         }
1007 done:
1008         /* Round up to a multiple of 16 */
1009         while (index % 16)
1010                 index++;
1011         if (verbose > 0)
1012                 printf("Final index %d\n", index);
1013         return index;
1014 }
1015
1016 /*
1017  * Compute the size of nodes and leaves. We start by assuming that
1018  * each node needs to store a three-byte offset. The indexes of the
1019  * nodes are calculated based on that, and then this function is
1020  * called to see if the sizes of some nodes can be reduced.  This is
1021  * repeated until no more changes are seen.
1022  */
1023 static int size_nodes(struct tree *tree)
1024 {
1025         struct tree *next;
1026         struct node *node;
1027         struct node *right;
1028         struct node *n;
1029         unsigned int leftmask;
1030         unsigned int rightmask;
1031         unsigned int bitmask;
1032         unsigned int pathbits;
1033         unsigned int pathmask;
1034         int changed;
1035         int offset;
1036         int size;
1037         int indent;
1038
1039         indent = 1;
1040         changed = 0;
1041         size = 0;
1042
1043         if (verbose > 0)
1044                 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1045         if (tree->childnode == LEAF)
1046                 goto done;
1047
1048         assert(tree->childnode == NODE);
1049         pathbits = 0;
1050         pathmask = 0;
1051         node = tree->root;
1052         leftmask = rightmask = 0;
1053         while (node) {
1054                 if (!node->mark)
1055                         goto skip;
1056                 offset = 0;
1057                 if (!node->left || !node->right) {
1058                         size = 1;
1059                 } else {
1060                         if (node->rightnode == NODE) {
1061                                 right = node->right;
1062                                 next = tree->next;
1063                                 while (!right->mark) {
1064                                         assert(next);
1065                                         n = next->root;
1066                                         while (n->bitnum != node->bitnum) {
1067                                                 if (pathbits & (1<<n->bitnum))
1068                                                         n = n->right;
1069                                                 else
1070                                                         n = n->left;
1071                                         }
1072                                         n = n->right;
1073                                         assert(right->bitnum == n->bitnum);
1074                                         right = n;
1075                                         next = next->next;
1076                                 }
1077                                 offset = right->index - node->index;
1078                         } else {
1079                                 offset = *tree->leaf_index(tree, node->right);
1080                                 offset -= node->index;
1081                         }
1082                         assert(offset >= 0);
1083                         assert(offset <= 0xffffff);
1084                         if (offset <= 0xff) {
1085                                 size = 2;
1086                         } else if (offset <= 0xffff) {
1087                                 size = 3;
1088                         } else { /* offset <= 0xffffff */
1089                                 size = 4;
1090                         }
1091                 }
1092                 if (node->size != size || node->offset != offset) {
1093                         node->size = size;
1094                         node->offset = offset;
1095                         changed++;
1096                 }
1097 skip:
1098                 while (node) {
1099                         bitmask = 1 << node->bitnum;
1100                         pathmask |= bitmask;
1101                         if (node->mark && (leftmask & bitmask) == 0) {
1102                                 leftmask |= bitmask;
1103                                 if (node->leftnode == LEAF) {
1104                                         assert(node->left);
1105                                 } else if (node->left) {
1106                                         assert(node->leftnode == NODE);
1107                                         indent += 1;
1108                                         node = node->left;
1109                                         break;
1110                                 }
1111                         }
1112                         if (node->mark && (rightmask & bitmask) == 0) {
1113                                 rightmask |= bitmask;
1114                                 pathbits |= bitmask;
1115                                 if (node->rightnode == LEAF) {
1116                                         assert(node->right);
1117                                 } else if (node->right) {
1118                                         assert(node->rightnode==NODE);
1119                                         indent += 1;
1120                                         node = node->right;
1121                                         break;
1122                                 }
1123                         }
1124                         leftmask &= ~bitmask;
1125                         rightmask &= ~bitmask;
1126                         pathmask &= ~bitmask;
1127                         pathbits &= ~bitmask;
1128                         node = node->parent;
1129                         indent -= 1;
1130                 }
1131         }
1132 done:
1133         if (verbose > 0)
1134                 printf("Found %d changes\n", changed);
1135         return changed;
1136 }
1137
1138 /*
1139  * Emit a trie for the given tree into the data array.
1140  */
1141 static void emit(struct tree *tree, unsigned char *data)
1142 {
1143         struct node *node;
1144         unsigned int leftmask;
1145         unsigned int rightmask;
1146         unsigned int bitmask;
1147         int offlen;
1148         int offset;
1149         int index;
1150         int indent;
1151         unsigned char byte;
1152
1153         index = tree->index;
1154         data += index;
1155         indent = 1;
1156         if (verbose > 0)
1157                 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1158         if (tree->childnode == LEAF) {
1159                 assert(tree->root);
1160                 tree->leaf_emit(tree->root, data);
1161                 return;
1162         }
1163
1164         assert(tree->childnode == NODE);
1165         node = tree->root;
1166         leftmask = rightmask = 0;
1167         while (node) {
1168                 if (!node->mark)
1169                         goto skip;
1170                 assert(node->offset != -1);
1171                 assert(node->index == index);
1172
1173                 byte = 0;
1174                 if (node->nextbyte)
1175                         byte |= NEXTBYTE;
1176                 byte |= (node->bitnum & BITNUM);
1177                 if (node->left && node->right) {
1178                         if (node->leftnode == NODE)
1179                                 byte |= LEFTNODE;
1180                         if (node->rightnode == NODE)
1181                                 byte |= RIGHTNODE;
1182                         if (node->offset <= 0xff)
1183                                 offlen = 1;
1184                         else if (node->offset <= 0xffff)
1185                                 offlen = 2;
1186                         else
1187                                 offlen = 3;
1188                         offset = node->offset;
1189                         byte |= offlen << OFFLEN_SHIFT;
1190                         *data++ = byte;
1191                         index++;
1192                         while (offlen--) {
1193                                 *data++ = offset & 0xff;
1194                                 index++;
1195                                 offset >>= 8;
1196                         }
1197                 } else if (node->left) {
1198                         if (node->leftnode == NODE)
1199                                 byte |= TRIENODE;
1200                         *data++ = byte;
1201                         index++;
1202                 } else if (node->right) {
1203                         byte |= RIGHTNODE;
1204                         if (node->rightnode == NODE)
1205                                 byte |= TRIENODE;
1206                         *data++ = byte;
1207                         index++;
1208                 } else {
1209                         assert(0);
1210                 }
1211 skip:
1212                 while (node) {
1213                         bitmask = 1 << node->bitnum;
1214                         if (node->mark && (leftmask & bitmask) == 0) {
1215                                 leftmask |= bitmask;
1216                                 if (node->leftnode == LEAF) {
1217                                         assert(node->left);
1218                                         data = tree->leaf_emit(node->left,
1219                                                                data);
1220                                         index += tree->leaf_size(node->left);
1221                                 } else if (node->left) {
1222                                         assert(node->leftnode == NODE);
1223                                         indent += 1;
1224                                         node = node->left;
1225                                         break;
1226                                 }
1227                         }
1228                         if (node->mark && (rightmask & bitmask) == 0) {
1229                                 rightmask |= bitmask;
1230                                 if (node->rightnode == LEAF) {
1231                                         assert(node->right);
1232                                         data = tree->leaf_emit(node->right,
1233                                                                data);
1234                                         index += tree->leaf_size(node->right);
1235                                 } else if (node->right) {
1236                                         assert(node->rightnode==NODE);
1237                                         indent += 1;
1238                                         node = node->right;
1239                                         break;
1240                                 }
1241                         }
1242                         leftmask &= ~bitmask;
1243                         rightmask &= ~bitmask;
1244                         node = node->parent;
1245                         indent -= 1;
1246                 }
1247         }
1248 }
1249
1250 /* ------------------------------------------------------------------ */
1251
1252 /*
1253  * Unicode data.
1254  *
1255  * We need to keep track of the Canonical Combining Class, the Age,
1256  * and decompositions for a code point.
1257  *
1258  * For the Age, we store the index into the ages table.  Effectively
1259  * this is a generation number that the table maps to a unicode
1260  * version.
1261  *
1262  * The correction field is used to indicate that this entry is in the
1263  * corrections array, which contains decompositions that were
1264  * corrected in later revisions.  The value of the correction field is
1265  * the Unicode version in which the mapping was corrected.
1266  */
1267 struct unicode_data {
1268         unsigned int code;
1269         int ccc;
1270         int gen;
1271         int correction;
1272         unsigned int *utf32nfdi;
1273         unsigned int *utf32nfdicf;
1274         char *utf8nfdi;
1275         char *utf8nfdicf;
1276 };
1277
1278 struct unicode_data unicode_data[0x110000];
1279 struct unicode_data *corrections;
1280 int    corrections_count;
1281
1282 struct tree *nfdi_tree;
1283 struct tree *nfdicf_tree;
1284
1285 struct tree *trees;
1286 int          trees_count;
1287
1288 /*
1289  * Check the corrections array to see if this entry was corrected at
1290  * some point.
1291  */
1292 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1293 {
1294         int i;
1295
1296         for (i = 0; i != corrections_count; i++)
1297                 if (u->code == corrections[i].code)
1298                         return &corrections[i];
1299         return u;
1300 }
1301
1302 static int nfdi_equal(void *l, void *r)
1303 {
1304         struct unicode_data *left = l;
1305         struct unicode_data *right = r;
1306
1307         if (left->gen != right->gen)
1308                 return 0;
1309         if (left->ccc != right->ccc)
1310                 return 0;
1311         if (left->utf8nfdi && right->utf8nfdi &&
1312             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1313                 return 1;
1314         if (left->utf8nfdi || right->utf8nfdi)
1315                 return 0;
1316         return 1;
1317 }
1318
1319 static int nfdicf_equal(void *l, void *r)
1320 {
1321         struct unicode_data *left = l;
1322         struct unicode_data *right = r;
1323
1324         if (left->gen != right->gen)
1325                 return 0;
1326         if (left->ccc != right->ccc)
1327                 return 0;
1328         if (left->utf8nfdicf && right->utf8nfdicf &&
1329             strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1330                 return 1;
1331         if (left->utf8nfdicf && right->utf8nfdicf)
1332                 return 0;
1333         if (left->utf8nfdicf || right->utf8nfdicf)
1334                 return 0;
1335         if (left->utf8nfdi && right->utf8nfdi &&
1336             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1337                 return 1;
1338         if (left->utf8nfdi || right->utf8nfdi)
1339                 return 0;
1340         return 1;
1341 }
1342
1343 static void nfdi_print(void *l, int indent)
1344 {
1345         struct unicode_data *leaf = l;
1346
1347         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1348                 leaf->code, leaf->ccc, leaf->gen);
1349         if (leaf->utf8nfdi)
1350                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1351         printf("\n");
1352 }
1353
1354 static void nfdicf_print(void *l, int indent)
1355 {
1356         struct unicode_data *leaf = l;
1357
1358         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1359                 leaf->code, leaf->ccc, leaf->gen);
1360         if (leaf->utf8nfdicf)
1361                 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1362         else if (leaf->utf8nfdi)
1363                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1364         printf("\n");
1365 }
1366
1367 static int nfdi_mark(void *l)
1368 {
1369         return 1;
1370 }
1371
1372 static int nfdicf_mark(void *l)
1373 {
1374         struct unicode_data *leaf = l;
1375
1376         if (leaf->utf8nfdicf)
1377                 return 1;
1378         return 0;
1379 }
1380
1381 static int correction_mark(void *l)
1382 {
1383         struct unicode_data *leaf = l;
1384
1385         return leaf->correction;
1386 }
1387
1388 static int nfdi_size(void *l)
1389 {
1390         struct unicode_data *leaf = l;
1391
1392         int size = 2;
1393         if (leaf->utf8nfdi)
1394                 size += strlen(leaf->utf8nfdi) + 1;
1395         return size;
1396 }
1397
1398 static int nfdicf_size(void *l)
1399 {
1400         struct unicode_data *leaf = l;
1401
1402         int size = 2;
1403         if (leaf->utf8nfdicf)
1404                 size += strlen(leaf->utf8nfdicf) + 1;
1405         else if (leaf->utf8nfdi)
1406                 size += strlen(leaf->utf8nfdi) + 1;
1407         return size;
1408 }
1409
1410 static int *nfdi_index(struct tree *tree, void *l)
1411 {
1412         struct unicode_data *leaf = l;
1413
1414         return &tree->leafindex[leaf->code];
1415 }
1416
1417 static int *nfdicf_index(struct tree *tree, void *l)
1418 {
1419         struct unicode_data *leaf = l;
1420
1421         return &tree->leafindex[leaf->code];
1422 }
1423
1424 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1425 {
1426         struct unicode_data *leaf = l;
1427         unsigned char *s;
1428
1429         *data++ = leaf->gen;
1430         if (leaf->utf8nfdi) {
1431                 *data++ = DECOMPOSE;
1432                 s = (unsigned char*)leaf->utf8nfdi;
1433                 while ((*data++ = *s++) != 0)
1434                         ;
1435         } else {
1436                 *data++ = leaf->ccc;
1437         }
1438         return data;
1439 }
1440
1441 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1442 {
1443         struct unicode_data *leaf = l;
1444         unsigned char *s;
1445
1446         *data++ = leaf->gen;
1447         if (leaf->utf8nfdicf) {
1448                 *data++ = DECOMPOSE;
1449                 s = (unsigned char*)leaf->utf8nfdicf;
1450                 while ((*data++ = *s++) != 0)
1451                         ;
1452         } else if (leaf->utf8nfdi) {
1453                 *data++ = DECOMPOSE;
1454                 s = (unsigned char*)leaf->utf8nfdi;
1455                 while ((*data++ = *s++) != 0)
1456                         ;
1457         } else {
1458                 *data++ = leaf->ccc;
1459         }
1460         return data;
1461 }
1462
1463 static void utf8_create(struct unicode_data *data)
1464 {
1465         char utf[18*4+1];
1466         char *u;
1467         unsigned int *um;
1468         int i;
1469
1470         u = utf;
1471         um = data->utf32nfdi;
1472         if (um) {
1473                 for (i = 0; um[i]; i++)
1474                         u += utf8encode(u, um[i]);
1475                 *u = '\0';
1476                 data->utf8nfdi = strdup(utf);
1477         }
1478         u = utf;
1479         um = data->utf32nfdicf;
1480         if (um) {
1481                 for (i = 0; um[i]; i++)
1482                         u += utf8encode(u, um[i]);
1483                 *u = '\0';
1484                 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1485                         data->utf8nfdicf = strdup(utf);
1486         }
1487 }
1488
1489 static void utf8_init(void)
1490 {
1491         unsigned int unichar;
1492         int i;
1493
1494         for (unichar = 0; unichar != 0x110000; unichar++)
1495                 utf8_create(&unicode_data[unichar]);
1496
1497         for (i = 0; i != corrections_count; i++)
1498                 utf8_create(&corrections[i]);
1499 }
1500
1501 static void trees_init(void)
1502 {
1503         struct unicode_data *data;
1504         unsigned int maxage;
1505         unsigned int nextage;
1506         int count;
1507         int i;
1508         int j;
1509
1510         /* Count the number of different ages. */
1511         count = 0;
1512         nextage = (unsigned int)-1;
1513         do {
1514                 maxage = nextage;
1515                 nextage = 0;
1516                 for (i = 0; i <= corrections_count; i++) {
1517                         data = &corrections[i];
1518                         if (nextage < data->correction &&
1519                             data->correction < maxage)
1520                                 nextage = data->correction;
1521                 }
1522                 count++;
1523         } while (nextage);
1524
1525         /* Two trees per age: nfdi and nfdicf */
1526         trees_count = count * 2;
1527         trees = calloc(trees_count, sizeof(struct tree));
1528
1529         /* Assign ages to the trees. */
1530         count = trees_count;
1531         nextage = (unsigned int)-1;
1532         do {
1533                 maxage = nextage;
1534                 trees[--count].maxage = maxage;
1535                 trees[--count].maxage = maxage;
1536                 nextage = 0;
1537                 for (i = 0; i <= corrections_count; i++) {
1538                         data = &corrections[i];
1539                         if (nextage < data->correction &&
1540                             data->correction < maxage)
1541                                 nextage = data->correction;
1542                 }
1543         } while (nextage);
1544
1545         /* The ages assigned above are off by one. */
1546         for (i = 0; i != trees_count; i++) {
1547                 j = 0;
1548                 while (ages[j] < trees[i].maxage)
1549                         j++;
1550                 trees[i].maxage = ages[j-1];
1551         }
1552
1553         /* Set up the forwarding between trees. */
1554         trees[trees_count-2].next = &trees[trees_count-1];
1555         trees[trees_count-1].leaf_mark = nfdi_mark;
1556         trees[trees_count-2].leaf_mark = nfdicf_mark;
1557         for (i = 0; i != trees_count-2; i += 2) {
1558                 trees[i].next = &trees[trees_count-2];
1559                 trees[i].leaf_mark = correction_mark;
1560                 trees[i+1].next = &trees[trees_count-1];
1561                 trees[i+1].leaf_mark = correction_mark;
1562         }
1563
1564         /* Assign the callouts. */
1565         for (i = 0; i != trees_count; i += 2) {
1566                 trees[i].type = "nfdicf";
1567                 trees[i].leaf_equal = nfdicf_equal;
1568                 trees[i].leaf_print = nfdicf_print;
1569                 trees[i].leaf_size = nfdicf_size;
1570                 trees[i].leaf_index = nfdicf_index;
1571                 trees[i].leaf_emit = nfdicf_emit;
1572
1573                 trees[i+1].type = "nfdi";
1574                 trees[i+1].leaf_equal = nfdi_equal;
1575                 trees[i+1].leaf_print = nfdi_print;
1576                 trees[i+1].leaf_size = nfdi_size;
1577                 trees[i+1].leaf_index = nfdi_index;
1578                 trees[i+1].leaf_emit = nfdi_emit;
1579         }
1580
1581         /* Finish init. */
1582         for (i = 0; i != trees_count; i++)
1583                 trees[i].childnode = NODE;
1584 }
1585
1586 static void trees_populate(void)
1587 {
1588         struct unicode_data *data;
1589         unsigned int unichar;
1590         char keyval[4];
1591         int keylen;
1592         int i;
1593
1594         for (i = 0; i != trees_count; i++) {
1595                 if (verbose > 0) {
1596                         printf("Populating %s_%x\n",
1597                                 trees[i].type, trees[i].maxage);
1598                 }
1599                 for (unichar = 0; unichar != 0x110000; unichar++) {
1600                         if (unicode_data[unichar].gen < 0)
1601                                 continue;
1602                         keylen = utf8encode(keyval, unichar);
1603                         data = corrections_lookup(&unicode_data[unichar]);
1604                         if (data->correction <= trees[i].maxage)
1605                                 data = &unicode_data[unichar];
1606                         insert(&trees[i], keyval, keylen, data);
1607                 }
1608         }
1609 }
1610
1611 static void trees_reduce(void)
1612 {
1613         int i;
1614         int size;
1615         int changed;
1616
1617         for (i = 0; i != trees_count; i++)
1618                 prune(&trees[i]);
1619         for (i = 0; i != trees_count; i++)
1620                 mark_nodes(&trees[i]);
1621         do {
1622                 size = 0;
1623                 for (i = 0; i != trees_count; i++)
1624                         size = index_nodes(&trees[i], size);
1625                 changed = 0;
1626                 for (i = 0; i != trees_count; i++)
1627                         changed += size_nodes(&trees[i]);
1628         } while (changed);
1629
1630         utf8data = calloc(size, 1);
1631         utf8data_size = size;
1632         for (i = 0; i != trees_count; i++)
1633                 emit(&trees[i], utf8data);
1634
1635         if (verbose > 0) {
1636                 for (i = 0; i != trees_count; i++) {
1637                         printf("%s_%x idx %d\n",
1638                                 trees[i].type, trees[i].maxage, trees[i].index);
1639                 }
1640         }
1641
1642         nfdi = utf8data + trees[trees_count-1].index;
1643         nfdicf = utf8data + trees[trees_count-2].index;
1644
1645         nfdi_tree = &trees[trees_count-1];
1646         nfdicf_tree = &trees[trees_count-2];
1647 }
1648
1649 static void verify(struct tree *tree)
1650 {
1651         struct unicode_data *data;
1652         utf8leaf_t      *leaf;
1653         unsigned int    unichar;
1654         char            key[4];
1655         int             report;
1656         int             nocf;
1657
1658         if (verbose > 0)
1659                 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1660         nocf = strcmp(tree->type, "nfdicf");
1661
1662         for (unichar = 0; unichar != 0x110000; unichar++) {
1663                 report = 0;
1664                 data = corrections_lookup(&unicode_data[unichar]);
1665                 if (data->correction <= tree->maxage)
1666                         data = &unicode_data[unichar];
1667                 utf8encode(key,unichar);
1668                 leaf = utf8lookup(tree, key);
1669                 if (!leaf) {
1670                         if (data->gen != -1)
1671                                 report++;
1672                         if (unichar < 0xd800 || unichar > 0xdfff)
1673                                 report++;
1674                 } else {
1675                         if (unichar >= 0xd800 && unichar <= 0xdfff)
1676                                 report++;
1677                         if (data->gen == -1)
1678                                 report++;
1679                         if (data->gen != LEAF_GEN(leaf))
1680                                 report++;
1681                         if (LEAF_CCC(leaf) == DECOMPOSE) {
1682                                 if (nocf) {
1683                                         if (!data->utf8nfdi) {
1684                                                 report++;
1685                                         } else if (strcmp(data->utf8nfdi,
1686                                                           LEAF_STR(leaf))) {
1687                                                 report++;
1688                                         }
1689                                 } else {
1690                                         if (!data->utf8nfdicf &&
1691                                             !data->utf8nfdi) {
1692                                                 report++;
1693                                         } else if (data->utf8nfdicf) {
1694                                                 if (strcmp(data->utf8nfdicf,
1695                                                            LEAF_STR(leaf)))
1696                                                         report++;
1697                                         } else if (strcmp(data->utf8nfdi,
1698                                                           LEAF_STR(leaf))) {
1699                                                 report++;
1700                                         }
1701                                 }
1702                         } else if (data->ccc != LEAF_CCC(leaf)) {
1703                                 report++;
1704                         }
1705                 }
1706                 if (report) {
1707                         printf("%X code %X gen %d ccc %d"
1708                                 " nfdi -> \"%s\"",
1709                                 unichar, data->code, data->gen,
1710                                 data->ccc,
1711                                 data->utf8nfdi);
1712                         if (leaf) {
1713                                 printf(" gen %d ccc %d"
1714                                         " nfdi -> \"%s\"",
1715                                         LEAF_GEN(leaf),
1716                                         LEAF_CCC(leaf),
1717                                         LEAF_CCC(leaf) == DECOMPOSE ?
1718                                                 LEAF_STR(leaf) : "");
1719                         }
1720                         printf("\n");
1721                 }
1722         }
1723 }
1724
1725 static void trees_verify(void)
1726 {
1727         int i;
1728
1729         for (i = 0; i != trees_count; i++)
1730                 verify(&trees[i]);
1731 }
1732
1733 /* ------------------------------------------------------------------ */
1734
1735 static void help(void)
1736 {
1737         printf("Usage: %s [options]\n", argv0);
1738         printf("\n");
1739         printf("This program creates an a data trie used for parsing and\n");
1740         printf("normalization of UTF-8 strings. The trie is derived from\n");
1741         printf("a set of input files from the Unicode character database\n");
1742         printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1743         printf("\n");
1744         printf("The generated tree supports two normalization forms:\n");
1745         printf("\n");
1746         printf("\tnfdi:\n");
1747         printf("\t- Apply unicode normalization form NFD.\n");
1748         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1749         printf("\n");
1750         printf("\tnfdicf:\n");
1751         printf("\t- Apply unicode normalization form NFD.\n");
1752         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1753         printf("\t- Apply a full casefold (C + F).\n");
1754         printf("\n");
1755         printf("These forms were chosen as being most useful when dealing\n");
1756         printf("with file names: NFD catches most cases where characters\n");
1757         printf("should be considered equivalent. The ignorables are mostly\n");
1758         printf("invisible, making names hard to type.\n");
1759         printf("\n");
1760         printf("The options to specify the files to be used are listed\n");
1761         printf("below with their default values, which are the names used\n");
1762         printf("by version 11.0.0 of the Unicode Character Database.\n");
1763         printf("\n");
1764         printf("The input files:\n");
1765         printf("\t-a %s\n", AGE_NAME);
1766         printf("\t-c %s\n", CCC_NAME);
1767         printf("\t-p %s\n", PROP_NAME);
1768         printf("\t-d %s\n", DATA_NAME);
1769         printf("\t-f %s\n", FOLD_NAME);
1770         printf("\t-n %s\n", NORM_NAME);
1771         printf("\n");
1772         printf("Additionally, the generated tables are tested using:\n");
1773         printf("\t-t %s\n", TEST_NAME);
1774         printf("\n");
1775         printf("Finally, the output file:\n");
1776         printf("\t-o %s\n", UTF8_NAME);
1777         printf("\n");
1778 }
1779
1780 static void usage(void)
1781 {
1782         help();
1783         exit(1);
1784 }
1785
1786 static void open_fail(const char *name, int error)
1787 {
1788         printf("Error %d opening %s: %s\n", error, name, strerror(error));
1789         exit(1);
1790 }
1791
1792 static void file_fail(const char *filename)
1793 {
1794         printf("Error parsing %s\n", filename);
1795         exit(1);
1796 }
1797
1798 static void line_fail(const char *filename, const char *line)
1799 {
1800         printf("Error parsing %s:%s\n", filename, line);
1801         exit(1);
1802 }
1803
1804 /* ------------------------------------------------------------------ */
1805
1806 static void print_utf32(unsigned int *utf32str)
1807 {
1808         int     i;
1809
1810         for (i = 0; utf32str[i]; i++)
1811                 printf(" %X", utf32str[i]);
1812 }
1813
1814 static void print_utf32nfdi(unsigned int unichar)
1815 {
1816         printf(" %X ->", unichar);
1817         print_utf32(unicode_data[unichar].utf32nfdi);
1818         printf("\n");
1819 }
1820
1821 static void print_utf32nfdicf(unsigned int unichar)
1822 {
1823         printf(" %X ->", unichar);
1824         print_utf32(unicode_data[unichar].utf32nfdicf);
1825         printf("\n");
1826 }
1827
1828 /* ------------------------------------------------------------------ */
1829
1830 static void age_init(void)
1831 {
1832         FILE *file;
1833         unsigned int first;
1834         unsigned int last;
1835         unsigned int unichar;
1836         unsigned int major;
1837         unsigned int minor;
1838         unsigned int revision;
1839         int gen;
1840         int count;
1841         int ret;
1842
1843         if (verbose > 0)
1844                 printf("Parsing %s\n", age_name);
1845
1846         file = fopen(age_name, "r");
1847         if (!file)
1848                 open_fail(age_name, errno);
1849         count = 0;
1850
1851         gen = 0;
1852         while (fgets(line, LINESIZE, file)) {
1853                 ret = sscanf(line, "# Age=V%d_%d_%d",
1854                                 &major, &minor, &revision);
1855                 if (ret == 3) {
1856                         ages_count++;
1857                         if (verbose > 1)
1858                                 printf(" Age V%d_%d_%d\n",
1859                                         major, minor, revision);
1860                         if (!age_valid(major, minor, revision))
1861                                 line_fail(age_name, line);
1862                         continue;
1863                 }
1864                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1865                 if (ret == 2) {
1866                         ages_count++;
1867                         if (verbose > 1)
1868                                 printf(" Age V%d_%d\n", major, minor);
1869                         if (!age_valid(major, minor, 0))
1870                                 line_fail(age_name, line);
1871                         continue;
1872                 }
1873         }
1874
1875         /* We must have found something above. */
1876         if (verbose > 1)
1877                 printf("%d age entries\n", ages_count);
1878         if (ages_count == 0 || ages_count > MAXGEN)
1879                 file_fail(age_name);
1880
1881         /* There is a 0 entry. */
1882         ages_count++;
1883         ages = calloc(ages_count + 1, sizeof(*ages));
1884         /* And a guard entry. */
1885         ages[ages_count] = (unsigned int)-1;
1886
1887         rewind(file);
1888         count = 0;
1889         gen = 0;
1890         while (fgets(line, LINESIZE, file)) {
1891                 ret = sscanf(line, "# Age=V%d_%d_%d",
1892                                 &major, &minor, &revision);
1893                 if (ret == 3) {
1894                         ages[++gen] =
1895                                 UNICODE_AGE(major, minor, revision);
1896                         if (verbose > 1)
1897                                 printf(" Age V%d_%d_%d = gen %d\n",
1898                                         major, minor, revision, gen);
1899                         if (!age_valid(major, minor, revision))
1900                                 line_fail(age_name, line);
1901                         continue;
1902                 }
1903                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1904                 if (ret == 2) {
1905                         ages[++gen] = UNICODE_AGE(major, minor, 0);
1906                         if (verbose > 1)
1907                                 printf(" Age V%d_%d = %d\n",
1908                                         major, minor, gen);
1909                         if (!age_valid(major, minor, 0))
1910                                 line_fail(age_name, line);
1911                         continue;
1912                 }
1913                 ret = sscanf(line, "%X..%X ; %d.%d #",
1914                              &first, &last, &major, &minor);
1915                 if (ret == 4) {
1916                         for (unichar = first; unichar <= last; unichar++)
1917                                 unicode_data[unichar].gen = gen;
1918                         count += 1 + last - first;
1919                         if (verbose > 1)
1920                                 printf("  %X..%X gen %d\n", first, last, gen);
1921                         if (!utf32valid(first) || !utf32valid(last))
1922                                 line_fail(age_name, line);
1923                         continue;
1924                 }
1925                 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
1926                 if (ret == 3) {
1927                         unicode_data[unichar].gen = gen;
1928                         count++;
1929                         if (verbose > 1)
1930                                 printf("  %X gen %d\n", unichar, gen);
1931                         if (!utf32valid(unichar))
1932                                 line_fail(age_name, line);
1933                         continue;
1934                 }
1935         }
1936         unicode_maxage = ages[gen];
1937         fclose(file);
1938
1939         /* Nix surrogate block */
1940         if (verbose > 1)
1941                 printf(" Removing surrogate block D800..DFFF\n");
1942         for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
1943                 unicode_data[unichar].gen = -1;
1944
1945         if (verbose > 0)
1946                 printf("Found %d entries\n", count);
1947         if (count == 0)
1948                 file_fail(age_name);
1949 }
1950
1951 static void ccc_init(void)
1952 {
1953         FILE *file;
1954         unsigned int first;
1955         unsigned int last;
1956         unsigned int unichar;
1957         unsigned int value;
1958         int count;
1959         int ret;
1960
1961         if (verbose > 0)
1962                 printf("Parsing %s\n", ccc_name);
1963
1964         file = fopen(ccc_name, "r");
1965         if (!file)
1966                 open_fail(ccc_name, errno);
1967
1968         count = 0;
1969         while (fgets(line, LINESIZE, file)) {
1970                 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
1971                 if (ret == 3) {
1972                         for (unichar = first; unichar <= last; unichar++) {
1973                                 unicode_data[unichar].ccc = value;
1974                                 count++;
1975                         }
1976                         if (verbose > 1)
1977                                 printf(" %X..%X ccc %d\n", first, last, value);
1978                         if (!utf32valid(first) || !utf32valid(last))
1979                                 line_fail(ccc_name, line);
1980                         continue;
1981                 }
1982                 ret = sscanf(line, "%X ; %d #", &unichar, &value);
1983                 if (ret == 2) {
1984                         unicode_data[unichar].ccc = value;
1985                         count++;
1986                         if (verbose > 1)
1987                                 printf(" %X ccc %d\n", unichar, value);
1988                         if (!utf32valid(unichar))
1989                                 line_fail(ccc_name, line);
1990                         continue;
1991                 }
1992         }
1993         fclose(file);
1994
1995         if (verbose > 0)
1996                 printf("Found %d entries\n", count);
1997         if (count == 0)
1998                 file_fail(ccc_name);
1999 }
2000
2001 static int ignore_compatibility_form(char *type)
2002 {
2003         int i;
2004         char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2005                                  "final", "isolated", "circle", "super",
2006                                  "sub", "vertical", "wide", "narrow",
2007                                  "small", "square", "fraction", "compat"};
2008
2009         for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2010                 if (strcmp(type, ignored_types[i]) == 0)
2011                         return 1;
2012         return 0;
2013 }
2014
2015 static void nfdi_init(void)
2016 {
2017         FILE *file;
2018         unsigned int unichar;
2019         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2020         char *s;
2021         char *type;
2022         unsigned int *um;
2023         int count;
2024         int i;
2025         int ret;
2026
2027         if (verbose > 0)
2028                 printf("Parsing %s\n", data_name);
2029         file = fopen(data_name, "r");
2030         if (!file)
2031                 open_fail(data_name, errno);
2032
2033         count = 0;
2034         while (fgets(line, LINESIZE, file)) {
2035                 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2036                              &unichar, buf0);
2037                 if (ret != 2)
2038                         continue;
2039                 if (!utf32valid(unichar))
2040                         line_fail(data_name, line);
2041
2042                 s = buf0;
2043                 /* skip over <tag> */
2044                 if (*s == '<') {
2045                         type = ++s;
2046                         while (*++s != '>');
2047                         *s++ = '\0';
2048                         if(ignore_compatibility_form(type))
2049                                 continue;
2050                 }
2051                 /* decode the decomposition into UTF-32 */
2052                 i = 0;
2053                 while (*s) {
2054                         mapping[i] = strtoul(s, &s, 16);
2055                         if (!utf32valid(mapping[i]))
2056                                 line_fail(data_name, line);
2057                         i++;
2058                 }
2059                 mapping[i++] = 0;
2060
2061                 um = malloc(i * sizeof(unsigned int));
2062                 memcpy(um, mapping, i * sizeof(unsigned int));
2063                 unicode_data[unichar].utf32nfdi = um;
2064
2065                 if (verbose > 1)
2066                         print_utf32nfdi(unichar);
2067                 count++;
2068         }
2069         fclose(file);
2070         if (verbose > 0)
2071                 printf("Found %d entries\n", count);
2072         if (count == 0)
2073                 file_fail(data_name);
2074 }
2075
2076 static void nfdicf_init(void)
2077 {
2078         FILE *file;
2079         unsigned int unichar;
2080         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2081         char status;
2082         char *s;
2083         unsigned int *um;
2084         int i;
2085         int count;
2086         int ret;
2087
2088         if (verbose > 0)
2089                 printf("Parsing %s\n", fold_name);
2090         file = fopen(fold_name, "r");
2091         if (!file)
2092                 open_fail(fold_name, errno);
2093
2094         count = 0;
2095         while (fgets(line, LINESIZE, file)) {
2096                 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2097                 if (ret != 3)
2098                         continue;
2099                 if (!utf32valid(unichar))
2100                         line_fail(fold_name, line);
2101                 /* Use the C+F casefold. */
2102                 if (status != 'C' && status != 'F')
2103                         continue;
2104                 s = buf0;
2105                 if (*s == '<')
2106                         while (*s++ != ' ')
2107                                 ;
2108                 i = 0;
2109                 while (*s) {
2110                         mapping[i] = strtoul(s, &s, 16);
2111                         if (!utf32valid(mapping[i]))
2112                                 line_fail(fold_name, line);
2113                         i++;
2114                 }
2115                 mapping[i++] = 0;
2116
2117                 um = malloc(i * sizeof(unsigned int));
2118                 memcpy(um, mapping, i * sizeof(unsigned int));
2119                 unicode_data[unichar].utf32nfdicf = um;
2120
2121                 if (verbose > 1)
2122                         print_utf32nfdicf(unichar);
2123                 count++;
2124         }
2125         fclose(file);
2126         if (verbose > 0)
2127                 printf("Found %d entries\n", count);
2128         if (count == 0)
2129                 file_fail(fold_name);
2130 }
2131
2132 static void ignore_init(void)
2133 {
2134         FILE *file;
2135         unsigned int unichar;
2136         unsigned int first;
2137         unsigned int last;
2138         unsigned int *um;
2139         int count;
2140         int ret;
2141
2142         if (verbose > 0)
2143                 printf("Parsing %s\n", prop_name);
2144         file = fopen(prop_name, "r");
2145         if (!file)
2146                 open_fail(prop_name, errno);
2147         assert(file);
2148         count = 0;
2149         while (fgets(line, LINESIZE, file)) {
2150                 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2151                 if (ret == 3) {
2152                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2153                                 continue;
2154                         if (!utf32valid(first) || !utf32valid(last))
2155                                 line_fail(prop_name, line);
2156                         for (unichar = first; unichar <= last; unichar++) {
2157                                 free(unicode_data[unichar].utf32nfdi);
2158                                 um = malloc(sizeof(unsigned int));
2159                                 *um = 0;
2160                                 unicode_data[unichar].utf32nfdi = um;
2161                                 free(unicode_data[unichar].utf32nfdicf);
2162                                 um = malloc(sizeof(unsigned int));
2163                                 *um = 0;
2164                                 unicode_data[unichar].utf32nfdicf = um;
2165                                 count++;
2166                         }
2167                         if (verbose > 1)
2168                                 printf(" %X..%X Default_Ignorable_Code_Point\n",
2169                                         first, last);
2170                         continue;
2171                 }
2172                 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2173                 if (ret == 2) {
2174                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2175                                 continue;
2176                         if (!utf32valid(unichar))
2177                                 line_fail(prop_name, line);
2178                         free(unicode_data[unichar].utf32nfdi);
2179                         um = malloc(sizeof(unsigned int));
2180                         *um = 0;
2181                         unicode_data[unichar].utf32nfdi = um;
2182                         free(unicode_data[unichar].utf32nfdicf);
2183                         um = malloc(sizeof(unsigned int));
2184                         *um = 0;
2185                         unicode_data[unichar].utf32nfdicf = um;
2186                         if (verbose > 1)
2187                                 printf(" %X Default_Ignorable_Code_Point\n",
2188                                         unichar);
2189                         count++;
2190                         continue;
2191                 }
2192         }
2193         fclose(file);
2194
2195         if (verbose > 0)
2196                 printf("Found %d entries\n", count);
2197         if (count == 0)
2198                 file_fail(prop_name);
2199 }
2200
2201 static void corrections_init(void)
2202 {
2203         FILE *file;
2204         unsigned int unichar;
2205         unsigned int major;
2206         unsigned int minor;
2207         unsigned int revision;
2208         unsigned int age;
2209         unsigned int *um;
2210         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2211         char *s;
2212         int i;
2213         int count;
2214         int ret;
2215
2216         if (verbose > 0)
2217                 printf("Parsing %s\n", norm_name);
2218         file = fopen(norm_name, "r");
2219         if (!file)
2220                 open_fail(norm_name, errno);
2221
2222         count = 0;
2223         while (fgets(line, LINESIZE, file)) {
2224                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2225                                 &unichar, buf0, buf1,
2226                                 &major, &minor, &revision);
2227                 if (ret != 6)
2228                         continue;
2229                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2230                         line_fail(norm_name, line);
2231                 count++;
2232         }
2233         corrections = calloc(count, sizeof(struct unicode_data));
2234         corrections_count = count;
2235         rewind(file);
2236
2237         count = 0;
2238         while (fgets(line, LINESIZE, file)) {
2239                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2240                                 &unichar, buf0, buf1,
2241                                 &major, &minor, &revision);
2242                 if (ret != 6)
2243                         continue;
2244                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2245                         line_fail(norm_name, line);
2246                 corrections[count] = unicode_data[unichar];
2247                 assert(corrections[count].code == unichar);
2248                 age = UNICODE_AGE(major, minor, revision);
2249                 corrections[count].correction = age;
2250
2251                 i = 0;
2252                 s = buf0;
2253                 while (*s) {
2254                         mapping[i] = strtoul(s, &s, 16);
2255                         if (!utf32valid(mapping[i]))
2256                                 line_fail(norm_name, line);
2257                         i++;
2258                 }
2259                 mapping[i++] = 0;
2260
2261                 um = malloc(i * sizeof(unsigned int));
2262                 memcpy(um, mapping, i * sizeof(unsigned int));
2263                 corrections[count].utf32nfdi = um;
2264
2265                 if (verbose > 1)
2266                         printf(" %X -> %s -> %s V%d_%d_%d\n",
2267                                 unichar, buf0, buf1, major, minor, revision);
2268                 count++;
2269         }
2270         fclose(file);
2271
2272         if (verbose > 0)
2273                 printf("Found %d entries\n", count);
2274         if (count == 0)
2275                 file_fail(norm_name);
2276 }
2277
2278 /* ------------------------------------------------------------------ */
2279
2280 /*
2281  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2282  *
2283  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2284  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2285  *
2286  * SBase = 0xAC00
2287  * LBase = 0x1100
2288  * VBase = 0x1161
2289  * TBase = 0x11A7
2290  * LCount = 19
2291  * VCount = 21
2292  * TCount = 28
2293  * NCount = 588 (VCount * TCount)
2294  * SCount = 11172 (LCount * NCount)
2295  *
2296  * Decomposition:
2297  *   SIndex = s - SBase
2298  *
2299  * LV (Canonical/Full)
2300  *   LIndex = SIndex / NCount
2301  *   VIndex = (Sindex % NCount) / TCount
2302  *   LPart = LBase + LIndex
2303  *   VPart = VBase + VIndex
2304  *
2305  * LVT (Canonical)
2306  *   LVIndex = (SIndex / TCount) * TCount
2307  *   TIndex = (Sindex % TCount)
2308  *   LVPart = SBase + LVIndex
2309  *   TPart = TBase + TIndex
2310  *
2311  * LVT (Full)
2312  *   LIndex = SIndex / NCount
2313  *   VIndex = (Sindex % NCount) / TCount
2314  *   TIndex = (Sindex % TCount)
2315  *   LPart = LBase + LIndex
2316  *   VPart = VBase + VIndex
2317  *   if (TIndex == 0) {
2318  *          d = <LPart, VPart>
2319  *   } else {
2320  *          TPart = TBase + TIndex
2321  *          d = <LPart, VPart, TPart>
2322  *   }
2323  *
2324  */
2325
2326 static void
2327 hangul_decompose(void)
2328 {
2329         unsigned int sb = 0xAC00;
2330         unsigned int lb = 0x1100;
2331         unsigned int vb = 0x1161;
2332         unsigned int tb = 0x11a7;
2333         /* unsigned int lc = 19; */
2334         unsigned int vc = 21;
2335         unsigned int tc = 28;
2336         unsigned int nc = (vc * tc);
2337         /* unsigned int sc = (lc * nc); */
2338         unsigned int unichar;
2339         unsigned int mapping[4];
2340         unsigned int *um;
2341         int count;
2342         int i;
2343
2344         if (verbose > 0)
2345                 printf("Decomposing hangul\n");
2346         /* Hangul */
2347         count = 0;
2348         for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2349                 unsigned int si = unichar - sb;
2350                 unsigned int li = si / nc;
2351                 unsigned int vi = (si % nc) / tc;
2352                 unsigned int ti = si % tc;
2353
2354                 i = 0;
2355                 mapping[i++] = lb + li;
2356                 mapping[i++] = vb + vi;
2357                 if (ti)
2358                         mapping[i++] = tb + ti;
2359                 mapping[i++] = 0;
2360
2361                 assert(!unicode_data[unichar].utf32nfdi);
2362                 um = malloc(i * sizeof(unsigned int));
2363                 memcpy(um, mapping, i * sizeof(unsigned int));
2364                 unicode_data[unichar].utf32nfdi = um;
2365
2366                 assert(!unicode_data[unichar].utf32nfdicf);
2367                 um = malloc(i * sizeof(unsigned int));
2368                 memcpy(um, mapping, i * sizeof(unsigned int));
2369                 unicode_data[unichar].utf32nfdicf = um;
2370
2371                 if (verbose > 1)
2372                         print_utf32nfdi(unichar);
2373
2374                 count++;
2375         }
2376         if (verbose > 0)
2377                 printf("Created %d entries\n", count);
2378 }
2379
2380 static void nfdi_decompose(void)
2381 {
2382         unsigned int unichar;
2383         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2384         unsigned int *um;
2385         unsigned int *dc;
2386         int count;
2387         int i;
2388         int j;
2389         int ret;
2390
2391         if (verbose > 0)
2392                 printf("Decomposing nfdi\n");
2393
2394         count = 0;
2395         for (unichar = 0; unichar != 0x110000; unichar++) {
2396                 if (!unicode_data[unichar].utf32nfdi)
2397                         continue;
2398                 for (;;) {
2399                         ret = 1;
2400                         i = 0;
2401                         um = unicode_data[unichar].utf32nfdi;
2402                         while (*um) {
2403                                 dc = unicode_data[*um].utf32nfdi;
2404                                 if (dc) {
2405                                         for (j = 0; dc[j]; j++)
2406                                                 mapping[i++] = dc[j];
2407                                         ret = 0;
2408                                 } else {
2409                                         mapping[i++] = *um;
2410                                 }
2411                                 um++;
2412                         }
2413                         mapping[i++] = 0;
2414                         if (ret)
2415                                 break;
2416                         free(unicode_data[unichar].utf32nfdi);
2417                         um = malloc(i * sizeof(unsigned int));
2418                         memcpy(um, mapping, i * sizeof(unsigned int));
2419                         unicode_data[unichar].utf32nfdi = um;
2420                 }
2421                 /* Add this decomposition to nfdicf if there is no entry. */
2422                 if (!unicode_data[unichar].utf32nfdicf) {
2423                         um = malloc(i * sizeof(unsigned int));
2424                         memcpy(um, mapping, i * sizeof(unsigned int));
2425                         unicode_data[unichar].utf32nfdicf = um;
2426                 }
2427                 if (verbose > 1)
2428                         print_utf32nfdi(unichar);
2429                 count++;
2430         }
2431         if (verbose > 0)
2432                 printf("Processed %d entries\n", count);
2433 }
2434
2435 static void nfdicf_decompose(void)
2436 {
2437         unsigned int unichar;
2438         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2439         unsigned int *um;
2440         unsigned int *dc;
2441         int count;
2442         int i;
2443         int j;
2444         int ret;
2445
2446         if (verbose > 0)
2447                 printf("Decomposing nfdicf\n");
2448         count = 0;
2449         for (unichar = 0; unichar != 0x110000; unichar++) {
2450                 if (!unicode_data[unichar].utf32nfdicf)
2451                         continue;
2452                 for (;;) {
2453                         ret = 1;
2454                         i = 0;
2455                         um = unicode_data[unichar].utf32nfdicf;
2456                         while (*um) {
2457                                 dc = unicode_data[*um].utf32nfdicf;
2458                                 if (dc) {
2459                                         for (j = 0; dc[j]; j++)
2460                                                 mapping[i++] = dc[j];
2461                                         ret = 0;
2462                                 } else {
2463                                         mapping[i++] = *um;
2464                                 }
2465                                 um++;
2466                         }
2467                         mapping[i++] = 0;
2468                         if (ret)
2469                                 break;
2470                         free(unicode_data[unichar].utf32nfdicf);
2471                         um = malloc(i * sizeof(unsigned int));
2472                         memcpy(um, mapping, i * sizeof(unsigned int));
2473                         unicode_data[unichar].utf32nfdicf = um;
2474                 }
2475                 if (verbose > 1)
2476                         print_utf32nfdicf(unichar);
2477                 count++;
2478         }
2479         if (verbose > 0)
2480                 printf("Processed %d entries\n", count);
2481 }
2482
2483 /* ------------------------------------------------------------------ */
2484
2485 int utf8agemax(struct tree *, const char *);
2486 int utf8nagemax(struct tree *, const char *, size_t);
2487 int utf8agemin(struct tree *, const char *);
2488 int utf8nagemin(struct tree *, const char *, size_t);
2489 ssize_t utf8len(struct tree *, const char *);
2490 ssize_t utf8nlen(struct tree *, const char *, size_t);
2491 struct utf8cursor;
2492 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2493 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2494 int utf8byte(struct utf8cursor *);
2495
2496 /*
2497  * Use trie to scan s, touching at most len bytes.
2498  * Returns the leaf if one exists, NULL otherwise.
2499  *
2500  * A non-NULL return guarantees that the UTF-8 sequence starting at s
2501  * is well-formed and corresponds to a known unicode code point.  The
2502  * shorthand for this will be "is valid UTF-8 unicode".
2503  */
2504 static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
2505 {
2506         utf8trie_t      *trie;
2507         int             offlen;
2508         int             offset;
2509         int             mask;
2510         int             node;
2511
2512         if (!tree)
2513                 return NULL;
2514         if (len == 0)
2515                 return NULL;
2516         node = 1;
2517         trie = utf8data + tree->index;
2518         while (node) {
2519                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2520                 if (*trie & NEXTBYTE) {
2521                         if (--len == 0)
2522                                 return NULL;
2523                         s++;
2524                 }
2525                 mask = 1 << (*trie & BITNUM);
2526                 if (*s & mask) {
2527                         /* Right leg */
2528                         if (offlen) {
2529                                 /* Right node at offset of trie */
2530                                 node = (*trie & RIGHTNODE);
2531                                 offset = trie[offlen];
2532                                 while (--offlen) {
2533                                         offset <<= 8;
2534                                         offset |= trie[offlen];
2535                                 }
2536                                 trie += offset;
2537                         } else if (*trie & RIGHTPATH) {
2538                                 /* Right node after this node */
2539                                 node = (*trie & TRIENODE);
2540                                 trie++;
2541                         } else {
2542                                 /* No right node. */
2543                                 return NULL;
2544                         }
2545                 } else {
2546                         /* Left leg */
2547                         if (offlen) {
2548                                 /* Left node after this node. */
2549                                 node = (*trie & LEFTNODE);
2550                                 trie += offlen + 1;
2551                         } else if (*trie & RIGHTPATH) {
2552                                 /* No left node. */
2553                                 return NULL;
2554                         } else {
2555                                 /* Left node after this node */
2556                                 node = (*trie & TRIENODE);
2557                                 trie++;
2558                         }
2559                 }
2560         }
2561         return trie;
2562 }
2563
2564 /*
2565  * Use trie to scan s.
2566  * Returns the leaf if one exists, NULL otherwise.
2567  *
2568  * Forwards to trie_nlookup().
2569  */
2570 static utf8leaf_t *utf8lookup(struct tree *tree, const char *s)
2571 {
2572         return utf8nlookup(tree, s, (size_t)-1);
2573 }
2574
2575 /*
2576  * Return the number of bytes used by the current UTF-8 sequence.
2577  * Assumes the input points to the first byte of a valid UTF-8
2578  * sequence.
2579  */
2580 static inline int utf8clen(const char *s)
2581 {
2582         unsigned char c = *s;
2583         return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2584 }
2585
2586 /*
2587  * Maximum age of any character in s.
2588  * Return -1 if s is not valid UTF-8 unicode.
2589  * Return 0 if only non-assigned code points are used.
2590  */
2591 int utf8agemax(struct tree *tree, const char *s)
2592 {
2593         utf8leaf_t      *leaf;
2594         int             age = 0;
2595         int             leaf_age;
2596
2597         if (!tree)
2598                 return -1;
2599         while (*s) {
2600                 if (!(leaf = utf8lookup(tree, s)))
2601                         return -1;
2602                 leaf_age = ages[LEAF_GEN(leaf)];
2603                 if (leaf_age <= tree->maxage && leaf_age > age)
2604                         age = leaf_age;
2605                 s += utf8clen(s);
2606         }
2607         return age;
2608 }
2609
2610 /*
2611  * Minimum age of any character in s.
2612  * Return -1 if s is not valid UTF-8 unicode.
2613  * Return 0 if non-assigned code points are used.
2614  */
2615 int utf8agemin(struct tree *tree, const char *s)
2616 {
2617         utf8leaf_t      *leaf;
2618         int             age;
2619         int             leaf_age;
2620
2621         if (!tree)
2622                 return -1;
2623         age = tree->maxage;
2624         while (*s) {
2625                 if (!(leaf = utf8lookup(tree, s)))
2626                         return -1;
2627                 leaf_age = ages[LEAF_GEN(leaf)];
2628                 if (leaf_age <= tree->maxage && leaf_age < age)
2629                         age = leaf_age;
2630                 s += utf8clen(s);
2631         }
2632         return age;
2633 }
2634
2635 /*
2636  * Maximum age of any character in s, touch at most len bytes.
2637  * Return -1 if s is not valid UTF-8 unicode.
2638  */
2639 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2640 {
2641         utf8leaf_t      *leaf;
2642         int             age = 0;
2643         int             leaf_age;
2644
2645         if (!tree)
2646                 return -1;
2647         while (len && *s) {
2648                 if (!(leaf = utf8nlookup(tree, s, len)))
2649                         return -1;
2650                 leaf_age = ages[LEAF_GEN(leaf)];
2651                 if (leaf_age <= tree->maxage && leaf_age > age)
2652                         age = leaf_age;
2653                 len -= utf8clen(s);
2654                 s += utf8clen(s);
2655         }
2656         return age;
2657 }
2658
2659 /*
2660  * Maximum age of any character in s, touch at most len bytes.
2661  * Return -1 if s is not valid UTF-8 unicode.
2662  */
2663 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2664 {
2665         utf8leaf_t      *leaf;
2666         int             leaf_age;
2667         int             age;
2668
2669         if (!tree)
2670                 return -1;
2671         age = tree->maxage;
2672         while (len && *s) {
2673                 if (!(leaf = utf8nlookup(tree, s, len)))
2674                         return -1;
2675                 leaf_age = ages[LEAF_GEN(leaf)];
2676                 if (leaf_age <= tree->maxage && leaf_age < age)
2677                         age = leaf_age;
2678                 len -= utf8clen(s);
2679                 s += utf8clen(s);
2680         }
2681         return age;
2682 }
2683
2684 /*
2685  * Length of the normalization of s.
2686  * Return -1 if s is not valid UTF-8 unicode.
2687  *
2688  * A string of Default_Ignorable_Code_Point has length 0.
2689  */
2690 ssize_t utf8len(struct tree *tree, const char *s)
2691 {
2692         utf8leaf_t      *leaf;
2693         size_t          ret = 0;
2694
2695         if (!tree)
2696                 return -1;
2697         while (*s) {
2698                 if (!(leaf = utf8lookup(tree, s)))
2699                         return -1;
2700                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2701                         ret += utf8clen(s);
2702                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2703                         ret += strlen(LEAF_STR(leaf));
2704                 else
2705                         ret += utf8clen(s);
2706                 s += utf8clen(s);
2707         }
2708         return ret;
2709 }
2710
2711 /*
2712  * Length of the normalization of s, touch at most len bytes.
2713  * Return -1 if s is not valid UTF-8 unicode.
2714  */
2715 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2716 {
2717         utf8leaf_t      *leaf;
2718         size_t          ret = 0;
2719
2720         if (!tree)
2721                 return -1;
2722         while (len && *s) {
2723                 if (!(leaf = utf8nlookup(tree, s, len)))
2724                         return -1;
2725                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2726                         ret += utf8clen(s);
2727                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2728                         ret += strlen(LEAF_STR(leaf));
2729                 else
2730                         ret += utf8clen(s);
2731                 len -= utf8clen(s);
2732                 s += utf8clen(s);
2733         }
2734         return ret;
2735 }
2736
2737 /*
2738  * Cursor structure used by the normalizer.
2739  */
2740 struct utf8cursor {
2741         struct tree     *tree;
2742         const char      *s;
2743         const char      *p;
2744         const char      *ss;
2745         const char      *sp;
2746         unsigned int    len;
2747         unsigned int    slen;
2748         short int       ccc;
2749         short int       nccc;
2750         unsigned int    unichar;
2751 };
2752
2753 /*
2754  * Set up an utf8cursor for use by utf8byte().
2755  *
2756  *   s      : string.
2757  *   len    : length of s.
2758  *   u8c    : pointer to cursor.
2759  *   trie   : utf8trie_t to use for normalization.
2760  *
2761  * Returns -1 on error, 0 on success.
2762  */
2763 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2764                 size_t len)
2765 {
2766         if (!tree)
2767                 return -1;
2768         if (!s)
2769                 return -1;
2770         u8c->tree = tree;
2771         u8c->s = s;
2772         u8c->p = NULL;
2773         u8c->ss = NULL;
2774         u8c->sp = NULL;
2775         u8c->len = len;
2776         u8c->slen = 0;
2777         u8c->ccc = STOPPER;
2778         u8c->nccc = STOPPER;
2779         u8c->unichar = 0;
2780         /* Check we didn't clobber the maximum length. */
2781         if (u8c->len != len)
2782                 return -1;
2783         /* The first byte of s may not be an utf8 continuation. */
2784         if (len > 0 && (*s & 0xC0) == 0x80)
2785                 return -1;
2786         return 0;
2787 }
2788
2789 /*
2790  * Set up an utf8cursor for use by utf8byte().
2791  *
2792  *   s      : NUL-terminated string.
2793  *   u8c    : pointer to cursor.
2794  *   trie   : utf8trie_t to use for normalization.
2795  *
2796  * Returns -1 on error, 0 on success.
2797  */
2798 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
2799 {
2800         return utf8ncursor(u8c, tree, s, (unsigned int)-1);
2801 }
2802
2803 /*
2804  * Get one byte from the normalized form of the string described by u8c.
2805  *
2806  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
2807  *
2808  * The cursor keeps track of the location in the string in u8c->s.
2809  * When a character is decomposed, the current location is stored in
2810  * u8c->p, and u8c->s is set to the start of the decomposition. Note
2811  * that bytes from a decomposition do not count against u8c->len.
2812  *
2813  * Characters are emitted if they match the current CCC in u8c->ccc.
2814  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
2815  * and the function returns 0 in that case.
2816  *
2817  * Sorting by CCC is done by repeatedly scanning the string.  The
2818  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
2819  * the start of the scan.  The first pass finds the lowest CCC to be
2820  * emitted and stores it in u8c->nccc, the second pass emits the
2821  * characters with this CCC and finds the next lowest CCC. This limits
2822  * the number of passes to 1 + the number of different CCCs in the
2823  * sequence being scanned.
2824  *
2825  * Therefore:
2826  *  u8c->p  != NULL -> a decomposition is being scanned.
2827  *  u8c->ss != NULL -> this is a repeating scan.
2828  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
2829  */
2830 int utf8byte(struct utf8cursor *u8c)
2831 {
2832         utf8leaf_t *leaf;
2833         int ccc;
2834
2835         for (;;) {
2836                 /* Check for the end of a decomposed character. */
2837                 if (u8c->p && *u8c->s == '\0') {
2838                         u8c->s = u8c->p;
2839                         u8c->p = NULL;
2840                 }
2841
2842                 /* Check for end-of-string. */
2843                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
2844                         /* There is no next byte. */
2845                         if (u8c->ccc == STOPPER)
2846                                 return 0;
2847                         /* End-of-string during a scan counts as a stopper. */
2848                         ccc = STOPPER;
2849                         goto ccc_mismatch;
2850                 } else if ((*u8c->s & 0xC0) == 0x80) {
2851                         /* This is a continuation of the current character. */
2852                         if (!u8c->p)
2853                                 u8c->len--;
2854                         return (unsigned char)*u8c->s++;
2855                 }
2856
2857                 /* Look up the data for the current character. */
2858                 if (u8c->p)
2859                         leaf = utf8lookup(u8c->tree, u8c->s);
2860                 else
2861                         leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len);
2862
2863                 /* No leaf found implies that the input is a binary blob. */
2864                 if (!leaf)
2865                         return -1;
2866
2867                 /* Characters that are too new have CCC 0. */
2868                 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
2869                         ccc = STOPPER;
2870                 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
2871                         u8c->len -= utf8clen(u8c->s);
2872                         u8c->p = u8c->s + utf8clen(u8c->s);
2873                         u8c->s = LEAF_STR(leaf);
2874                         /* Empty decomposition implies CCC 0. */
2875                         if (*u8c->s == '\0') {
2876                                 if (u8c->ccc == STOPPER)
2877                                         continue;
2878                                 ccc = STOPPER;
2879                                 goto ccc_mismatch;
2880                         }
2881                         leaf = utf8lookup(u8c->tree, u8c->s);
2882                         ccc = LEAF_CCC(leaf);
2883                 }
2884                 u8c->unichar = utf8decode(u8c->s);
2885
2886                 /*
2887                  * If this is not a stopper, then see if it updates
2888                  * the next canonical class to be emitted.
2889                  */
2890                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
2891                         u8c->nccc = ccc;
2892
2893                 /*
2894                  * Return the current byte if this is the current
2895                  * combining class.
2896                  */
2897                 if (ccc == u8c->ccc) {
2898                         if (!u8c->p)
2899                                 u8c->len--;
2900                         return (unsigned char)*u8c->s++;
2901                 }
2902
2903                 /* Current combining class mismatch. */
2904         ccc_mismatch:
2905                 if (u8c->nccc == STOPPER) {
2906                         /*
2907                          * Scan forward for the first canonical class
2908                          * to be emitted.  Save the position from
2909                          * which to restart.
2910                          */
2911                         assert(u8c->ccc == STOPPER);
2912                         u8c->ccc = MINCCC - 1;
2913                         u8c->nccc = ccc;
2914                         u8c->sp = u8c->p;
2915                         u8c->ss = u8c->s;
2916                         u8c->slen = u8c->len;
2917                         if (!u8c->p)
2918                                 u8c->len -= utf8clen(u8c->s);
2919                         u8c->s += utf8clen(u8c->s);
2920                 } else if (ccc != STOPPER) {
2921                         /* Not a stopper, and not the ccc we're emitting. */
2922                         if (!u8c->p)
2923                                 u8c->len -= utf8clen(u8c->s);
2924                         u8c->s += utf8clen(u8c->s);
2925                 } else if (u8c->nccc != MAXCCC + 1) {
2926                         /* At a stopper, restart for next ccc. */
2927                         u8c->ccc = u8c->nccc;
2928                         u8c->nccc = MAXCCC + 1;
2929                         u8c->s = u8c->ss;
2930                         u8c->p = u8c->sp;
2931                         u8c->len = u8c->slen;
2932                 } else {
2933                         /* All done, proceed from here. */
2934                         u8c->ccc = STOPPER;
2935                         u8c->nccc = STOPPER;
2936                         u8c->sp = NULL;
2937                         u8c->ss = NULL;
2938                         u8c->slen = 0;
2939                 }
2940         }
2941 }
2942
2943 /* ------------------------------------------------------------------ */
2944
2945 static int normalize_line(struct tree *tree)
2946 {
2947         char *s;
2948         char *t;
2949         int c;
2950         struct utf8cursor u8c;
2951
2952         /* First test: null-terminated string. */
2953         s = buf2;
2954         t = buf3;
2955         if (utf8cursor(&u8c, tree, s))
2956                 return -1;
2957         while ((c = utf8byte(&u8c)) > 0)
2958                 if (c != (unsigned char)*t++)
2959                         return -1;
2960         if (c < 0)
2961                 return -1;
2962         if (*t != 0)
2963                 return -1;
2964
2965         /* Second test: length-limited string. */
2966         s = buf2;
2967         /* Replace NUL with a value that will cause an error if seen. */
2968         s[strlen(s) + 1] = -1;
2969         t = buf3;
2970         if (utf8cursor(&u8c, tree, s))
2971                 return -1;
2972         while ((c = utf8byte(&u8c)) > 0)
2973                 if (c != (unsigned char)*t++)
2974                         return -1;
2975         if (c < 0)
2976                 return -1;
2977         if (*t != 0)
2978                 return -1;
2979
2980         return 0;
2981 }
2982
2983 static void normalization_test(void)
2984 {
2985         FILE *file;
2986         unsigned int unichar;
2987         struct unicode_data *data;
2988         char *s;
2989         char *t;
2990         int ret;
2991         int ignorables;
2992         int tests = 0;
2993         int failures = 0;
2994
2995         if (verbose > 0)
2996                 printf("Parsing %s\n", test_name);
2997         /* Step one, read data from file. */
2998         file = fopen(test_name, "r");
2999         if (!file)
3000                 open_fail(test_name, errno);
3001
3002         while (fgets(line, LINESIZE, file)) {
3003                 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3004                              buf0, buf1);
3005                 if (ret != 2 || *line == '#')
3006                         continue;
3007                 s = buf0;
3008                 t = buf2;
3009                 while (*s) {
3010                         unichar = strtoul(s, &s, 16);
3011                         t += utf8encode(t, unichar);
3012                 }
3013                 *t = '\0';
3014
3015                 ignorables = 0;
3016                 s = buf1;
3017                 t = buf3;
3018                 while (*s) {
3019                         unichar = strtoul(s, &s, 16);
3020                         data = &unicode_data[unichar];
3021                         if (data->utf8nfdi && !*data->utf8nfdi)
3022                                 ignorables = 1;
3023                         else
3024                                 t += utf8encode(t, unichar);
3025                 }
3026                 *t = '\0';
3027
3028                 tests++;
3029                 if (normalize_line(nfdi_tree) < 0) {
3030                         printf("Line %s -> %s", buf0, buf1);
3031                         if (ignorables)
3032                                 printf(" (ignorables removed)");
3033                         printf(" failure\n");
3034                         failures++;
3035                 }
3036         }
3037         fclose(file);
3038         if (verbose > 0)
3039                 printf("Ran %d tests with %d failures\n", tests, failures);
3040         if (failures)
3041                 file_fail(test_name);
3042 }
3043
3044 /* ------------------------------------------------------------------ */
3045
3046 static void write_file(void)
3047 {
3048         FILE *file;
3049         int i;
3050         int j;
3051         int t;
3052         int gen;
3053
3054         if (verbose > 0)
3055                 printf("Writing %s\n", utf8_name);
3056         file = fopen(utf8_name, "w");
3057         if (!file)
3058                 open_fail(utf8_name, errno);
3059
3060         fprintf(file, "/* This file is generated code, do not edit. */\n");
3061         fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3062         fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3063         fprintf(file, "#endif\n");
3064         fprintf(file, "\n");
3065         fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3066                 unicode_maxage);
3067         fprintf(file, "\n");
3068         fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3069         for (i = 0; i != ages_count; i++)
3070                 fprintf(file, "\t%#x%s\n", ages[i],
3071                         ages[i] == unicode_maxage ? "" : ",");
3072         fprintf(file, "};\n");
3073         fprintf(file, "\n");
3074         fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3075         t = 0;
3076         for (gen = 0; gen < ages_count; gen++) {
3077                 fprintf(file, "\t{ %#x, %d }%s\n",
3078                         ages[gen], trees[t].index,
3079                         ages[gen] == unicode_maxage ? "" : ",");
3080                 if (trees[t].maxage == ages[gen])
3081                         t += 2;
3082         }
3083         fprintf(file, "};\n");
3084         fprintf(file, "\n");
3085         fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3086         t = 1;
3087         for (gen = 0; gen < ages_count; gen++) {
3088                 fprintf(file, "\t{ %#x, %d }%s\n",
3089                         ages[gen], trees[t].index,
3090                         ages[gen] == unicode_maxage ? "" : ",");
3091                 if (trees[t].maxage == ages[gen])
3092                         t += 2;
3093         }
3094         fprintf(file, "};\n");
3095         fprintf(file, "\n");
3096         fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3097                 utf8data_size);
3098         t = 0;
3099         for (i = 0; i != utf8data_size; i += 16) {
3100                 if (i == trees[t].index) {
3101                         fprintf(file, "\t/* %s_%x */\n",
3102                                 trees[t].type, trees[t].maxage);
3103                         if (t < trees_count-1)
3104                                 t++;
3105                 }
3106                 fprintf(file, "\t");
3107                 for (j = i; j != i + 16; j++)
3108                         fprintf(file, "0x%.2x%s", utf8data[j],
3109                                 (j < utf8data_size -1 ? "," : ""));
3110                 fprintf(file, "\n");
3111         }
3112         fprintf(file, "};\n");
3113         fclose(file);
3114 }
3115
3116 /* ------------------------------------------------------------------ */
3117
3118 int main(int argc, char *argv[])
3119 {
3120         unsigned int unichar;
3121         int opt;
3122
3123         argv0 = argv[0];
3124
3125         while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3126                 switch (opt) {
3127                 case 'a':
3128                         age_name = optarg;
3129                         break;
3130                 case 'c':
3131                         ccc_name = optarg;
3132                         break;
3133                 case 'd':
3134                         data_name = optarg;
3135                         break;
3136                 case 'f':
3137                         fold_name = optarg;
3138                         break;
3139                 case 'n':
3140                         norm_name = optarg;
3141                         break;
3142                 case 'o':
3143                         utf8_name = optarg;
3144                         break;
3145                 case 'p':
3146                         prop_name = optarg;
3147                         break;
3148                 case 't':
3149                         test_name = optarg;
3150                         break;
3151                 case 'v':
3152                         verbose++;
3153                         break;
3154                 case 'h':
3155                         help();
3156                         exit(0);
3157                 default:
3158                         usage();
3159                 }
3160         }
3161
3162         if (verbose > 1)
3163                 help();
3164         for (unichar = 0; unichar != 0x110000; unichar++)
3165                 unicode_data[unichar].code = unichar;
3166         age_init();
3167         ccc_init();
3168         nfdi_init();
3169         nfdicf_init();
3170         ignore_init();
3171         corrections_init();
3172         hangul_decompose();
3173         nfdi_decompose();
3174         nfdicf_decompose();
3175         utf8_init();
3176         trees_init();
3177         trees_populate();
3178         trees_reduce();
3179         trees_verify();
3180         /* Prevent "unused function" warning. */
3181         (void)lookup(nfdi_tree, " ");
3182         if (verbose > 2)
3183                 tree_walk(nfdi_tree);
3184         if (verbose > 2)
3185                 tree_walk(nfdicf_tree);
3186         normalization_test();
3187         write_file();
3188
3189         return 0;
3190 }