scripts/mkutf8data.c

   1 /*
   2  * Copyright (c) 2014 SGI.
   3  * All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 /* Generator for a compact trie for unicode normalization */
  20
  21 #include <sys/types.h>
  22 #include <stddef.h>
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25 #include <assert.h>
  26 #include <string.h>
  27 #include <unistd.h>
  28 #include <errno.h>
  29
  30 /* Default names of the in- and output files. */
  31
  32 #define AGE_NAME        "DerivedAge.txt"
  33 #define CCC_NAME        "DerivedCombiningClass.txt"
  34 #define PROP_NAME       "DerivedCoreProperties.txt"
  35 #define DATA_NAME       "UnicodeData.txt"
  36 #define FOLD_NAME       "CaseFolding.txt"
  37 #define NORM_NAME       "NormalizationCorrections.txt"
  38 #define TEST_NAME       "NormalizationTest.txt"
  39 #define UTF8_NAME       "utf8data.h"
  40
  41 const char      *age_name  = AGE_NAME;
  42 const char      *ccc_name  = CCC_NAME;
  43 const char      *prop_name = PROP_NAME;
  44 const char      *data_name = DATA_NAME;
  45 const char      *fold_name = FOLD_NAME;
  46 const char      *norm_name = NORM_NAME;
  47 const char      *test_name = TEST_NAME;
  48 const char      *utf8_name = UTF8_NAME;
  49
  50 int verbose = 0;
  51
  52 /* An arbitrary line size limit on input lines. */
  53
  54 #define LINESIZE        1024
  55 char line[LINESIZE];
  56 char buf0[LINESIZE];
  57 char buf1[LINESIZE];
  58 char buf2[LINESIZE];
  59 char buf3[LINESIZE];
  60
  61 const char *argv0;
  62
  63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
  64
  65 /* ------------------------------------------------------------------ */
  66
  67 /*
  68  * Unicode version numbers consist of three parts: major, minor, and a
  69  * revision.  These numbers are packed into an unsigned int to obtain
  70  * a single version number.
  71  *
  72  * To save space in the generated trie, the unicode version is not
  73  * stored directly, instead we calculate a generation number from the
  74  * unicode versions seen in the DerivedAge file, and use that as an
  75  * index into a table of unicode versions.
  76  */
  77 #define UNICODE_MAJ_SHIFT               (16)
  78 #define UNICODE_MIN_SHIFT               (8)
  79
  80 #define UNICODE_MAJ_MAX                 ((unsigned short)-1)
  81 #define UNICODE_MIN_MAX                 ((unsigned char)-1)
  82 #define UNICODE_REV_MAX                 ((unsigned char)-1)
  83
  84 #define UNICODE_AGE(MAJ,MIN,REV)                        \
  85         (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
  86          ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
  87          ((unsigned int)(REV)))
  88
  89 unsigned int *ages;
  90 int ages_count;
  91
  92 unsigned int unicode_maxage;
  93
  94 static int age_valid(unsigned int major, unsigned int minor,
  95                      unsigned int revision)
  96 {
  97         if (major > UNICODE_MAJ_MAX)
  98                 return 0;
  99         if (minor > UNICODE_MIN_MAX)
 100                 return 0;
 101         if (revision > UNICODE_REV_MAX)
 102                 return 0;
 103         return 1;
 104 }
 105
 106 /* ------------------------------------------------------------------ */
 107
 108 /*
 109  * utf8trie_t
 110  *
 111  * A compact binary tree, used to decode UTF-8 characters.
 112  *
 113  * Internal nodes are one byte for the node itself, and up to three
 114  * bytes for an offset into the tree.  The first byte contains the
 115  * following information:
 116  *  NEXTBYTE  - flag        - advance to next byte if set
 117  *  BITNUM    - 3 bit field - the bit number to tested
 118  *  OFFLEN    - 2 bit field - number of bytes in the offset
 119  * if offlen == 0 (non-branching node)
 120  *  RIGHTPATH - 1 bit field - set if the following node is for the
 121  *                            right-hand path (tested bit is set)
 122  *  TRIENODE  - 1 bit field - set if the following node is an internal
 123  *                            node, otherwise it is a leaf node
 124  * if offlen != 0 (branching node)
 125  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 126  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 127  *
 128  * Due to the way utf8 works, there cannot be branching nodes with
 129  * NEXTBYTE set, and moreover those nodes always have a righthand
 130  * descendant.
 131  */
 132 typedef unsigned char utf8trie_t;
 133 #define BITNUM          0x07
 134 #define NEXTBYTE        0x08
 135 #define OFFLEN          0x30
 136 #define OFFLEN_SHIFT    4
 137 #define RIGHTPATH       0x40
 138 #define TRIENODE        0x80
 139 #define RIGHTNODE       0x40
 140 #define LEFTNODE        0x80
 141
 142 /*
 143  * utf8leaf_t
 144  *
 145  * The leaves of the trie are embedded in the trie, and so the same
 146  * underlying datatype, unsigned char.
 147  *
 148  * leaf[0]: The unicode version, stored as a generation number that is
 149  *          an index into utf8agetab[].  With this we can filter code
 150  *          points based on the unicode version in which they were
 151  *          defined.  The CCC of a non-defined code point is 0.
 152  * leaf[1]: Canonical Combining Class. During normalization, we need
 153  *          to do a stable sort into ascending order of all characters
 154  *          with a non-zero CCC that occur between two characters with
 155  *          a CCC of 0, or at the begin or end of a string.
 156  *          The unicode standard guarantees that all CCC values are
 157  *          between 0 and 254 inclusive, which leaves 255 available as
 158  *          a special value.
 159  *          Code points with CCC 0 are known as stoppers.
 160  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 161  *          start of a NUL-terminated string that is the decomposition
 162  *          of the character.
 163  *          The CCC of a decomposable character is the same as the CCC
 164  *          of the first character of its decomposition.
 165  *          Some characters decompose as the empty string: these are
 166  *          characters with the Default_Ignorable_Code_Point property.
 167  *          These do affect normalization, as they all have CCC 0.
 168  *
 169  * The decompositions in the trie have been fully expanded.
 170  *
 171  * Casefolding, if applicable, is also done using decompositions.
 172  */
 173 typedef unsigned char utf8leaf_t;
 174
 175 #define LEAF_GEN(LEAF)  ((LEAF)[0])
 176 #define LEAF_CCC(LEAF)  ((LEAF)[1])
 177 #define LEAF_STR(LEAF)  ((const char*)((LEAF) + 2))
 178
 179 #define MAXGEN          (255)
 180
 181 #define MINCCC          (0)
 182 #define MAXCCC          (254)
 183 #define STOPPER         (0)
 184 #define DECOMPOSE       (255)
 185
 186 struct tree;
 187 static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t);
 188 static utf8leaf_t *utf8lookup(struct tree *, const char *);
 189
 190 unsigned char *utf8data;
 191 size_t utf8data_size;
 192
 193 utf8trie_t *nfdi;
 194 utf8trie_t *nfdicf;
 195
 196 /* ------------------------------------------------------------------ */
 197
 198 /*
 199  * UTF8 valid ranges.
 200  *
 201  * The UTF-8 encoding spreads the bits of a 32bit word over several
 202  * bytes. This table gives the ranges that can be held and how they'd
 203  * be represented.
 204  *
 205  * 0x00000000 0x0000007F: 0xxxxxxx
 206  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 207  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 208  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 209  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 210  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 211  *
 212  * There is an additional requirement on UTF-8, in that only the
 213  * shortest representation of a 32bit value is to be used.  A decoder
 214  * must not decode sequences that do not satisfy this requirement.
 215  * Thus the allowed ranges have a lower bound.
 216  *
 217  * 0x00000000 0x0000007F: 0xxxxxxx
 218  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 219  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 220  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 221  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 222  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 223  *
 224  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 225  * 17 planes of 65536 values.  This limits the sequences actually seen
 226  * even more, to just the following.
 227  *
 228  *          0 -     0x7f: 0                     0x7f
 229  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 230  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 231  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 232  *
 233  * Even within those ranges not all values are allowed: the surrogates
 234  * 0xd800 - 0xdfff should never be seen.
 235  *
 236  * Note that the longest sequence seen with valid usage is 4 bytes,
 237  * the same a single UTF-32 character.  This makes the UTF-8
 238  * representation of Unicode strictly smaller than UTF-32.
 239  *
 240  * The shortest sequence requirement was introduced by:
 241  *    Corrigendum #1: UTF-8 Shortest Form
 242  * It can be found here:
 243  *    http://www.unicode.org/versions/corrigendum1.html
 244  *
 245  */
 246
 247 #define UTF8_2_BITS     0xC0
 248 #define UTF8_3_BITS     0xE0
 249 #define UTF8_4_BITS     0xF0
 250 #define UTF8_N_BITS     0x80
 251 #define UTF8_2_MASK     0xE0
 252 #define UTF8_3_MASK     0xF0
 253 #define UTF8_4_MASK     0xF8
 254 #define UTF8_N_MASK     0xC0
 255 #define UTF8_V_MASK     0x3F
 256 #define UTF8_V_SHIFT    6
 257
 258 static int utf8encode(char *str, unsigned int val)
 259 {
 260         int len;
 261
 262         if (val < 0x80) {
 263                 str[0] = val;
 264                 len = 1;
 265         } else if (val < 0x800) {
 266                 str[1] = val & UTF8_V_MASK;
 267                 str[1] |= UTF8_N_BITS;
 268                 val >>= UTF8_V_SHIFT;
 269                 str[0] = val;
 270                 str[0] |= UTF8_2_BITS;
 271                 len = 2;
 272         } else if (val < 0x10000) {
 273                 str[2] = val & UTF8_V_MASK;
 274                 str[2] |= UTF8_N_BITS;
 275                 val >>= UTF8_V_SHIFT;
 276                 str[1] = val & UTF8_V_MASK;
 277                 str[1] |= UTF8_N_BITS;
 278                 val >>= UTF8_V_SHIFT;
 279                 str[0] = val;
 280                 str[0] |= UTF8_3_BITS;
 281                 len = 3;
 282         } else if (val < 0x110000) {
 283                 str[3] = val & UTF8_V_MASK;
 284                 str[3] |= UTF8_N_BITS;
 285                 val >>= UTF8_V_SHIFT;
 286                 str[2] = val & UTF8_V_MASK;
 287                 str[2] |= UTF8_N_BITS;
 288                 val >>= UTF8_V_SHIFT;
 289                 str[1] = val & UTF8_V_MASK;
 290                 str[1] |= UTF8_N_BITS;
 291                 val >>= UTF8_V_SHIFT;
 292                 str[0] = val;
 293                 str[0] |= UTF8_4_BITS;
 294                 len = 4;
 295         } else {
 296                 printf("%#x: illegal val\n", val);
 297                 len = 0;
 298         }
 299         return len;
 300 }
 301
 302 static unsigned int utf8decode(const char *str)
 303 {
 304         const unsigned char *s = (const unsigned char*)str;
 305         unsigned int unichar = 0;
 306
 307         if (*s < 0x80) {
 308                 unichar = *s;
 309         } else if (*s < UTF8_3_BITS) {
 310                 unichar = *s++ & 0x1F;
 311                 unichar <<= UTF8_V_SHIFT;
 312                 unichar |= *s & 0x3F;
 313         } else if (*s < UTF8_4_BITS) {
 314                 unichar = *s++ & 0x0F;
 315                 unichar <<= UTF8_V_SHIFT;
 316                 unichar |= *s++ & 0x3F;
 317                 unichar <<= UTF8_V_SHIFT;
 318                 unichar |= *s & 0x3F;
 319         } else {
 320                 unichar = *s++ & 0x0F;
 321                 unichar <<= UTF8_V_SHIFT;
 322                 unichar |= *s++ & 0x3F;
 323                 unichar <<= UTF8_V_SHIFT;
 324                 unichar |= *s++ & 0x3F;
 325                 unichar <<= UTF8_V_SHIFT;
 326                 unichar |= *s & 0x3F;
 327         }
 328         return unichar;
 329 }
 330
 331 static int utf32valid(unsigned int unichar)
 332 {
 333         return unichar < 0x110000;
 334 }
 335
 336 #define NODE 1
 337 #define LEAF 0
 338
 339 struct tree {
 340         void *root;
 341         int childnode;
 342         const char *type;
 343         unsigned int maxage;
 344         struct tree *next;
 345         int (*leaf_equal)(void *, void *);
 346         void (*leaf_print)(void *, int);
 347         int (*leaf_mark)(void *);
 348         int (*leaf_size)(void *);
 349         int *(*leaf_index)(struct tree *, void *);
 350         unsigned char *(*leaf_emit)(void *, unsigned char *);
 351         int leafindex[0x110000];
 352         int index;
 353 };
 354
 355 struct node {
 356         int index;
 357         int offset;
 358         int mark;
 359         int size;
 360         struct node *parent;
 361         void *left;
 362         void *right;
 363         unsigned char bitnum;
 364         unsigned char nextbyte;
 365         unsigned char leftnode;
 366         unsigned char rightnode;
 367         unsigned int keybits;
 368         unsigned int keymask;
 369 };
 370
 371 /*
 372  * Example lookup function for a tree.
 373  */
 374 static void *lookup(struct tree *tree, const char *key)
 375 {
 376         struct node *node;
 377         void *leaf = NULL;
 378
 379         node = tree->root;
 380         while (!leaf && node) {
 381                 if (node->nextbyte)
 382                         key++;
 383                 if (*key & (1 << (node->bitnum & 7))) {
 384                         /* Right leg */
 385                         if (node->rightnode == NODE) {
 386                                 node = node->right;
 387                         } else if (node->rightnode == LEAF) {
 388                                 leaf = node->right;
 389                         } else {
 390                                 node = NULL;
 391                         }
 392                 } else {
 393                         /* Left leg */
 394                         if (node->leftnode == NODE) {
 395                                 node = node->left;
 396                         } else if (node->leftnode == LEAF) {
 397                                 leaf = node->left;
 398                         } else {
 399                                 node = NULL;
 400                         }
 401                 }
 402         }
 403
 404         return leaf;
 405 }
 406
 407 /*
 408  * A simple non-recursive tree walker: keep track of visits to the
 409  * left and right branches in the leftmask and rightmask.
 410  */
 411 static void tree_walk(struct tree *tree)
 412 {
 413         struct node *node;
 414         unsigned int leftmask;
 415         unsigned int rightmask;
 416         unsigned int bitmask;
 417         int indent = 1;
 418         int nodes, singletons, leaves;
 419
 420         nodes = singletons = leaves = 0;
 421
 422         printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
 423         if (tree->childnode == LEAF) {
 424                 assert(tree->root);
 425                 tree->leaf_print(tree->root, indent);
 426                 leaves = 1;
 427         } else {
 428                 assert(tree->childnode == NODE);
 429                 node = tree->root;
 430                 leftmask = rightmask = 0;
 431                 while (node) {
 432                         printf("%*snode @ %p bitnum %d nextbyte %d"
 433                                " left %p right %p mask %x bits %x\n",
 434                                 indent, "", node,
 435                                 node->bitnum, node->nextbyte,
 436                                 node->left, node->right,
 437                                 node->keymask, node->keybits);
 438                         nodes += 1;
 439                         if (!(node->left && node->right))
 440                                 singletons += 1;
 441
 442                         while (node) {
 443                                 bitmask = 1 << node->bitnum;
 444                                 if ((leftmask & bitmask) == 0) {
 445                                         leftmask |= bitmask;
 446                                         if (node->leftnode == LEAF) {
 447                                                 assert(node->left);
 448                                                 tree->leaf_print(node->left,
 449                                                                  indent+1);
 450                                                 leaves += 1;
 451                                         } else if (node->left) {
 452                                                 assert(node->leftnode == NODE);
 453                                                 indent += 1;
 454                                                 node = node->left;
 455                                                 break;
 456                                         }
 457                                 }
 458                                 if ((rightmask & bitmask) == 0) {
 459                                         rightmask |= bitmask;
 460                                         if (node->rightnode == LEAF) {
 461                                                 assert(node->right);
 462                                                 tree->leaf_print(node->right,
 463                                                                  indent+1);
 464                                                 leaves += 1;
 465                                         } else if (node->right) {
 466                                                 assert(node->rightnode==NODE);
 467                                                 indent += 1;
 468                                                 node = node->right;
 469                                                 break;
 470                                         }
 471                                 }
 472                                 leftmask &= ~bitmask;
 473                                 rightmask &= ~bitmask;
 474                                 node = node->parent;
 475                                 indent -= 1;
 476                         }
 477                 }
 478         }
 479         printf("nodes %d leaves %d singletons %d\n",
 480                nodes, leaves, singletons);
 481 }
 482
 483 /*
 484  * Allocate an initialize a new internal node.
 485  */
 486 static struct node *alloc_node(struct node *parent)
 487 {
 488         struct node *node;
 489         int bitnum;
 490
 491         node = malloc(sizeof(*node));
 492         node->left = node->right = NULL;
 493         node->parent = parent;
 494         node->leftnode = NODE;
 495         node->rightnode = NODE;
 496         node->keybits = 0;
 497         node->keymask = 0;
 498         node->mark = 0;
 499         node->index = 0;
 500         node->offset = -1;
 501         node->size = 4;
 502
 503         if (node->parent) {
 504                 bitnum = parent->bitnum;
 505                 if ((bitnum & 7) == 0) {
 506                         node->bitnum = bitnum + 7 + 8;
 507                         node->nextbyte = 1;
 508                 } else {
 509                         node->bitnum = bitnum - 1;
 510                         node->nextbyte = 0;
 511                 }
 512         } else {
 513                 node->bitnum = 7;
 514                 node->nextbyte = 0;
 515         }
 516
 517         return node;
 518 }
 519
 520 /*
 521  * Insert a new leaf into the tree, and collapse any subtrees that are
 522  * fully populated and end in identical leaves. A nextbyte tagged
 523  * internal node will not be removed to preserve the tree's integrity.
 524  * Note that due to the structure of utf8, no nextbyte tagged node
 525  * will be a candidate for removal.
 526  */
 527 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
 528 {
 529         struct node *node;
 530         struct node *parent;
 531         void **cursor;
 532         int keybits;
 533
 534         assert(keylen >= 1 && keylen <= 4);
 535
 536         node = NULL;
 537         cursor = &tree->root;
 538         keybits = 8 * keylen;
 539
 540         /* Insert, creating path along the way. */
 541         while (keybits) {
 542                 if (!*cursor)
 543                         *cursor = alloc_node(node);
 544                 node = *cursor;
 545                 if (node->nextbyte)
 546                         key++;
 547                 if (*key & (1 << (node->bitnum & 7)))
 548                         cursor = &node->right;
 549                 else
 550                         cursor = &node->left;
 551                 keybits--;
 552         }
 553         *cursor = leaf;
 554
 555         /* Merge subtrees if possible. */
 556         while (node) {
 557                 if (*key & (1 << (node->bitnum & 7)))
 558                         node->rightnode = LEAF;
 559                 else
 560                         node->leftnode = LEAF;
 561                 if (node->nextbyte)
 562                         break;
 563                 if (node->leftnode == NODE || node->rightnode == NODE)
 564                         break;
 565                 assert(node->left);
 566                 assert(node->right);
 567                 /* Compare */
 568                 if (! tree->leaf_equal(node->left, node->right))
 569                         break;
 570                 /* Keep left, drop right leaf. */
 571                 leaf = node->left;
 572                 /* Check in parent */
 573                 parent = node->parent;
 574                 if (!parent) {
 575                         /* root of tree! */
 576                         tree->root = leaf;
 577                         tree->childnode = LEAF;
 578                 } else if (parent->left == node) {
 579                         parent->left = leaf;
 580                         parent->leftnode = LEAF;
 581                         if (parent->right) {
 582                                 parent->keymask = 0;
 583                                 parent->keybits = 0;
 584                         } else {
 585                                 parent->keymask |= (1 << node->bitnum);
 586                         }
 587                 } else if (parent->right == node) {
 588                         parent->right = leaf;
 589                         parent->rightnode = LEAF;
 590                         if (parent->left) {
 591                                 parent->keymask = 0;
 592                                 parent->keybits = 0;
 593                         } else {
 594                                 parent->keymask |= (1 << node->bitnum);
 595                                 parent->keybits |= (1 << node->bitnum);
 596                         }
 597                 } else {
 598                         /* internal tree error */
 599                         assert(0);
 600                 }
 601                 free(node);
 602                 node = parent;
 603         }
 604
 605         /* Propagate keymasks up along singleton chains. */
 606         while (node) {
 607                 parent = node->parent;
 608                 if (!parent)
 609                         break;
 610                 /* Nix the mask for parents with two children. */
 611                 if (node->keymask == 0) {
 612                         parent->keymask = 0;
 613                         parent->keybits = 0;
 614                 } else if (parent->left && parent->right) {
 615                         parent->keymask = 0;
 616                         parent->keybits = 0;
 617                 } else {
 618                         assert((parent->keymask & node->keymask) == 0);
 619                         parent->keymask |= node->keymask;
 620                         parent->keymask |= (1 << parent->bitnum);
 621                         parent->keybits |= node->keybits;
 622                         if (parent->right)
 623                                 parent->keybits |= (1 << parent->bitnum);
 624                 }
 625                 node = parent;
 626         }
 627
 628         return 0;
 629 }
 630
 631 /*
 632  * Prune internal nodes.
 633  *
 634  * Fully populated subtrees that end at the same leaf have already
 635  * been collapsed.  There are still internal nodes that have for both
 636  * their left and right branches a sequence of singletons that make
 637  * identical choices and end in identical leaves.  The keymask and
 638  * keybits collected in the nodes describe the choices made in these
 639  * singleton chains.  When they are identical for the left and right
 640  * branch of a node, and the two leaves comare identical, the node in
 641  * question can be removed.
 642  *
 643  * Note that nodes with the nextbyte tag set will not be removed by
 644  * this to ensure tree integrity.  Note as well that the structure of
 645  * utf8 ensures that these nodes would not have been candidates for
 646  * removal in any case.
 647  */
 648 static void prune(struct tree *tree)
 649 {
 650         struct node *node;
 651         struct node *left;
 652         struct node *right;
 653         struct node *parent;
 654         void *leftleaf;
 655         void *rightleaf;
 656         unsigned int leftmask;
 657         unsigned int rightmask;
 658         unsigned int bitmask;
 659         int count;
 660
 661         if (verbose > 0)
 662                 printf("Pruning %s_%x\n", tree->type, tree->maxage);
 663
 664         count = 0;
 665         if (tree->childnode == LEAF)
 666                 return;
 667         if (!tree->root)
 668                 return;
 669
 670         leftmask = rightmask = 0;
 671         node = tree->root;
 672         while (node) {
 673                 if (node->nextbyte)
 674                         goto advance;
 675                 if (node->leftnode == LEAF)
 676                         goto advance;
 677                 if (node->rightnode == LEAF)
 678                         goto advance;
 679                 if (!node->left)
 680                         goto advance;
 681                 if (!node->right)
 682                         goto advance;
 683                 left = node->left;
 684                 right = node->right;
 685                 if (left->keymask == 0)
 686                         goto advance;
 687                 if (right->keymask == 0)
 688                         goto advance;
 689                 if (left->keymask != right->keymask)
 690                         goto advance;
 691                 if (left->keybits != right->keybits)
 692                         goto advance;
 693                 leftleaf = NULL;
 694                 while (!leftleaf) {
 695                         assert(left->left || left->right);
 696                         if (left->leftnode == LEAF)
 697                                 leftleaf = left->left;
 698                         else if (left->rightnode == LEAF)
 699                                 leftleaf = left->right;
 700                         else if (left->left)
 701                                 left = left->left;
 702                         else if (left->right)
 703                                 left = left->right;
 704                         else
 705                                 assert(0);
 706                 }
 707                 rightleaf = NULL;
 708                 while (!rightleaf) {
 709                         assert(right->left || right->right);
 710                         if (right->leftnode == LEAF)
 711                                 rightleaf = right->left;
 712                         else if (right->rightnode == LEAF)
 713                                 rightleaf = right->right;
 714                         else if (right->left)
 715                                 right = right->left;
 716                         else if (right->right)
 717                                 right = right->right;
 718                         else
 719                                 assert(0);
 720                 }
 721                 if (! tree->leaf_equal(leftleaf, rightleaf))
 722                         goto advance;
 723                 /*
 724                  * This node has identical singleton-only subtrees.
 725                  * Remove it.
 726                  */
 727                 parent = node->parent;
 728                 left = node->left;
 729                 right = node->right;
 730                 if (parent->left == node)
 731                         parent->left = left;
 732                 else if (parent->right == node)
 733                         parent->right = left;
 734                 else
 735                         assert(0);
 736                 left->parent = parent;
 737                 left->keymask |= (1 << node->bitnum);
 738                 node->left = NULL;
 739                 while (node) {
 740                         bitmask = 1 << node->bitnum;
 741                         leftmask &= ~bitmask;
 742                         rightmask &= ~bitmask;
 743                         if (node->leftnode == NODE && node->left) {
 744                                 left = node->left;
 745                                 free(node);
 746                                 count++;
 747                                 node = left;
 748                         } else if (node->rightnode == NODE && node->right) {
 749                                 right = node->right;
 750                                 free(node);
 751                                 count++;
 752                                 node = right;
 753                         } else {
 754                                 node = NULL;
 755                         }
 756                 }
 757                 /* Propagate keymasks up along singleton chains. */
 758                 node = parent;
 759                 /* Force re-check */
 760                 bitmask = 1 << node->bitnum;
 761                 leftmask &= ~bitmask;
 762                 rightmask &= ~bitmask;
 763                 for (;;) {
 764                         if (node->left && node->right)
 765                                 break;
 766                         if (node->left) {
 767                                 left = node->left;
 768                                 node->keymask |= left->keymask;
 769                                 node->keybits |= left->keybits;
 770                         }
 771                         if (node->right) {
 772                                 right = node->right;
 773                                 node->keymask |= right->keymask;
 774                                 node->keybits |= right->keybits;
 775                         }
 776                         node->keymask |= (1 << node->bitnum);
 777                         node = node->parent;
 778                         /* Force re-check */
 779                         bitmask = 1 << node->bitnum;
 780                         leftmask &= ~bitmask;
 781                         rightmask &= ~bitmask;
 782                 }
 783         advance:
 784                 bitmask = 1 << node->bitnum;
 785                 if ((leftmask & bitmask) == 0 &&
 786                     node->leftnode == NODE &&
 787                     node->left) {
 788                         leftmask |= bitmask;
 789                         node = node->left;
 790                 } else if ((rightmask & bitmask) == 0 &&
 791                            node->rightnode == NODE &&
 792                            node->right) {
 793                         rightmask |= bitmask;
 794                         node = node->right;
 795                 } else {
 796                         leftmask &= ~bitmask;
 797                         rightmask &= ~bitmask;
 798                         node = node->parent;
 799                 }
 800         }
 801         if (verbose > 0)
 802                 printf("Pruned %d nodes\n", count);
 803 }
 804
 805 /*
 806  * Mark the nodes in the tree that lead to leaves that must be
 807  * emitted.
 808  */
 809 static void mark_nodes(struct tree *tree)
 810 {
 811         struct node *node;
 812         struct node *n;
 813         unsigned int leftmask;
 814         unsigned int rightmask;
 815         unsigned int bitmask;
 816         int marked;
 817
 818         marked = 0;
 819         if (verbose > 0)
 820                 printf("Marking %s_%x\n", tree->type, tree->maxage);
 821         if (tree->childnode == LEAF)
 822                 goto done;
 823
 824         assert(tree->childnode == NODE);
 825         node = tree->root;
 826         leftmask = rightmask = 0;
 827         while (node) {
 828                 bitmask = 1 << node->bitnum;
 829                 if ((leftmask & bitmask) == 0) {
 830                         leftmask |= bitmask;
 831                         if (node->leftnode == LEAF) {
 832                                 assert(node->left);
 833                                 if (tree->leaf_mark(node->left)) {
 834                                         n = node;
 835                                         while (n && !n->mark) {
 836                                                 marked++;
 837                                                 n->mark = 1;
 838                                                 n = n->parent;
 839                                         }
 840                                 }
 841                         } else if (node->left) {
 842                                 assert(node->leftnode == NODE);
 843                                 node = node->left;
 844                                 continue;
 845                         }
 846                 }
 847                 if ((rightmask & bitmask) == 0) {
 848                         rightmask |= bitmask;
 849                         if (node->rightnode == LEAF) {
 850                                 assert(node->right);
 851                                 if (tree->leaf_mark(node->right)) {
 852                                         n = node;
 853                                         while (n && !n->mark) {
 854                                                 marked++;
 855                                                 n->mark = 1;
 856                                                 n = n->parent;
 857                                         }
 858                                 }
 859                         } else if (node->right) {
 860                                 assert(node->rightnode==NODE);
 861                                 node = node->right;
 862                                 continue;
 863                         }
 864                 }
 865                 leftmask &= ~bitmask;
 866                 rightmask &= ~bitmask;
 867                 node = node->parent;
 868         }
 869
 870         /* second pass: left siblings and singletons */
 871
 872         assert(tree->childnode == NODE);
 873         node = tree->root;
 874         leftmask = rightmask = 0;
 875         while (node) {
 876                 bitmask = 1 << node->bitnum;
 877                 if ((leftmask & bitmask) == 0) {
 878                         leftmask |= bitmask;
 879                         if (node->leftnode == LEAF) {
 880                                 assert(node->left);
 881                                 if (tree->leaf_mark(node->left)) {
 882                                         n = node;
 883                                         while (n && !n->mark) {
 884                                                 marked++;
 885                                                 n->mark = 1;
 886                                                 n = n->parent;
 887                                         }
 888                                 }
 889                         } else if (node->left) {
 890                                 assert(node->leftnode == NODE);
 891                                 node = node->left;
 892                                 if (!node->mark && node->parent->mark) {
 893                                         marked++;
 894                                         node->mark = 1;
 895                                 }
 896                                 continue;
 897                         }
 898                 }
 899                 if ((rightmask & bitmask) == 0) {
 900                         rightmask |= bitmask;
 901                         if (node->rightnode == LEAF) {
 902                                 assert(node->right);
 903                                 if (tree->leaf_mark(node->right)) {
 904                                         n = node;
 905                                         while (n && !n->mark) {
 906                                                 marked++;
 907                                                 n->mark = 1;
 908                                                 n = n->parent;
 909                                         }
 910                                 }
 911                         } else if (node->right) {
 912                                 assert(node->rightnode==NODE);
 913                                 node = node->right;
 914                                 if (!node->mark && node->parent->mark &&
 915                                     !node->parent->left) {
 916                                         marked++;
 917                                         node->mark = 1;
 918                                 }
 919                                 continue;
 920                         }
 921                 }
 922                 leftmask &= ~bitmask;
 923                 rightmask &= ~bitmask;
 924                 node = node->parent;
 925         }
 926 done:
 927         if (verbose > 0)
 928                 printf("Marked %d nodes\n", marked);
 929 }
 930
 931 /*
 932  * Compute the index of each node and leaf, which is the offset in the
 933  * emitted trie.  These values must be pre-computed because relative
 934  * offsets between nodes are used to navigate the tree.
 935  */
 936 static int index_nodes(struct tree *tree, int index)
 937 {
 938         struct node *node;
 939         unsigned int leftmask;
 940         unsigned int rightmask;
 941         unsigned int bitmask;
 942         int count;
 943         int indent;
 944
 945         /* Align to a cache line (or half a cache line?). */
 946         while (index % 64)
 947                 index++;
 948         tree->index = index;
 949         indent = 1;
 950         count = 0;
 951
 952         if (verbose > 0)
 953                 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
 954         if (tree->childnode == LEAF) {
 955                 index += tree->leaf_size(tree->root);
 956                 goto done;
 957         }
 958
 959         assert(tree->childnode == NODE);
 960         node = tree->root;
 961         leftmask = rightmask = 0;
 962         while (node) {
 963                 if (!node->mark)
 964                         goto skip;
 965                 count++;
 966                 if (node->index != index)
 967                         node->index = index;
 968                 index += node->size;
 969 skip:
 970                 while (node) {
 971                         bitmask = 1 << node->bitnum;
 972                         if (node->mark && (leftmask & bitmask) == 0) {
 973                                 leftmask |= bitmask;
 974                                 if (node->leftnode == LEAF) {
 975                                         assert(node->left);
 976                                         *tree->leaf_index(tree, node->left) =
 977                                                                         index;
 978                                         index += tree->leaf_size(node->left);
 979                                         count++;
 980                                 } else if (node->left) {
 981                                         assert(node->leftnode == NODE);
 982                                         indent += 1;
 983                                         node = node->left;
 984                                         break;
 985                                 }
 986                         }
 987                         if (node->mark && (rightmask & bitmask) == 0) {
 988                                 rightmask |= bitmask;
 989                                 if (node->rightnode == LEAF) {
 990                                         assert(node->right);
 991                                         *tree->leaf_index(tree, node->right) = index;
 992                                         index += tree->leaf_size(node->right);
 993                                         count++;
 994                                 } else if (node->right) {
 995                                         assert(node->rightnode==NODE);
 996                                         indent += 1;
 997                                         node = node->right;
 998                                         break;
 999                                 }
1000                         }
1001                         leftmask &= ~bitmask;
1002                         rightmask &= ~bitmask;
1003                         node = node->parent;
1004                         indent -= 1;
1005                 }
1006         }
1007 done:
1008         /* Round up to a multiple of 16 */
1009         while (index % 16)
1010                 index++;
1011         if (verbose > 0)
1012                 printf("Final index %d\n", index);
1013         return index;
1014 }
1015
1016 /*
1017  * Compute the size of nodes and leaves. We start by assuming that
1018  * each node needs to store a three-byte offset. The indexes of the
1019  * nodes are calculated based on that, and then this function is
1020  * called to see if the sizes of some nodes can be reduced.  This is
1021  * repeated until no more changes are seen.
1022  */
1023 static int size_nodes(struct tree *tree)
1024 {
1025         struct tree *next;
1026         struct node *node;
1027         struct node *right;
1028         struct node *n;
1029         unsigned int leftmask;
1030         unsigned int rightmask;
1031         unsigned int bitmask;
1032         unsigned int pathbits;
1033         unsigned int pathmask;
1034         int changed;
1035         int offset;
1036         int size;
1037         int indent;
1038
1039         indent = 1;
1040         changed = 0;
1041         size = 0;
1042
1043         if (verbose > 0)
1044                 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1045         if (tree->childnode == LEAF)
1046                 goto done;
1047
1048         assert(tree->childnode == NODE);
1049         pathbits = 0;
1050         pathmask = 0;
1051         node = tree->root;
1052         leftmask = rightmask = 0;
1053         while (node) {
1054                 if (!node->mark)
1055                         goto skip;
1056                 offset = 0;
1057                 if (!node->left || !node->right) {
1058                         size = 1;
1059                 } else {
1060                         if (node->rightnode == NODE) {
1061                                 right = node->right;
1062                                 next = tree->next;
1063                                 while (!right->mark) {
1064                                         assert(next);
1065                                         n = next->root;
1066                                         while (n->bitnum != node->bitnum) {
1067                                                 if (pathbits & (1<<n->bitnum))
1068                                                         n = n->right;
1069                                                 else
1070                                                         n = n->left;
1071                                         }
1072                                         n = n->right;
1073                                         assert(right->bitnum == n->bitnum);
1074                                         right = n;
1075                                         next = next->next;
1076                                 }
1077                                 offset = right->index - node->index;
1078                         } else {
1079                                 offset = *tree->leaf_index(tree, node->right);
1080                                 offset -= node->index;
1081                         }
1082                         assert(offset >= 0);
1083                         assert(offset <= 0xffffff);
1084                         if (offset <= 0xff) {
1085                                 size = 2;
1086                         } else if (offset <= 0xffff) {
1087                                 size = 3;
1088                         } else { /* offset <= 0xffffff */
1089                                 size = 4;
1090                         }
1091                 }
1092                 if (node->size != size || node->offset != offset) {
1093                         node->size = size;
1094                         node->offset = offset;
1095                         changed++;
1096                 }
1097 skip:
1098                 while (node) {
1099                         bitmask = 1 << node->bitnum;
1100                         pathmask |= bitmask;
1101                         if (node->mark && (leftmask & bitmask) == 0) {
1102                                 leftmask |= bitmask;
1103                                 if (node->leftnode == LEAF) {
1104                                         assert(node->left);
1105                                 } else if (node->left) {
1106                                         assert(node->leftnode == NODE);
1107                                         indent += 1;
1108                                         node = node->left;
1109                                         break;
1110                                 }
1111                         }
1112                         if (node->mark && (rightmask & bitmask) == 0) {
1113                                 rightmask |= bitmask;
1114                                 pathbits |= bitmask;
1115                                 if (node->rightnode == LEAF) {
1116                                         assert(node->right);
1117                                 } else if (node->right) {
1118                                         assert(node->rightnode==NODE);
1119                                         indent += 1;
1120                                         node = node->right;
1121                                         break;
1122                                 }
1123                         }
1124                         leftmask &= ~bitmask;
1125                         rightmask &= ~bitmask;
1126                         pathmask &= ~bitmask;
1127                         pathbits &= ~bitmask;
1128                         node = node->parent;
1129                         indent -= 1;
1130                 }
1131         }
1132 done:
1133         if (verbose > 0)
1134                 printf("Found %d changes\n", changed);
1135         return changed;
1136 }
1137
1138 /*
1139  * Emit a trie for the given tree into the data array.
1140  */
1141 static void emit(struct tree *tree, unsigned char *data)
1142 {
1143         struct node *node;
1144         unsigned int leftmask;
1145         unsigned int rightmask;
1146         unsigned int bitmask;
1147         int offlen;
1148         int offset;
1149         int index;
1150         int indent;
1151         unsigned char byte;
1152
1153         index = tree->index;
1154         data += index;
1155         indent = 1;
1156         if (verbose > 0)
1157                 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1158         if (tree->childnode == LEAF) {
1159                 assert(tree->root);
1160                 tree->leaf_emit(tree->root, data);
1161                 return;
1162         }
1163
1164         assert(tree->childnode == NODE);
1165         node = tree->root;
1166         leftmask = rightmask = 0;
1167         while (node) {
1168                 if (!node->mark)
1169                         goto skip;
1170                 assert(node->offset != -1);
1171                 assert(node->index == index);
1172
1173                 byte = 0;
1174                 if (node->nextbyte)
1175                         byte |= NEXTBYTE;
1176                 byte |= (node->bitnum & BITNUM);
1177                 if (node->left && node->right) {
1178                         if (node->leftnode == NODE)
1179                                 byte |= LEFTNODE;
1180                         if (node->rightnode == NODE)
1181                                 byte |= RIGHTNODE;
1182                         if (node->offset <= 0xff)
1183                                 offlen = 1;
1184                         else if (node->offset <= 0xffff)
1185                                 offlen = 2;
1186                         else
1187                                 offlen = 3;
1188                         offset = node->offset;
1189                         byte |= offlen << OFFLEN_SHIFT;
1190                         *data++ = byte;
1191                         index++;
1192                         while (offlen--) {
1193                                 *data++ = offset & 0xff;
1194                                 index++;
1195                                 offset >>= 8;
1196                         }
1197                 } else if (node->left) {
1198                         if (node->leftnode == NODE)
1199                                 byte |= TRIENODE;
1200                         *data++ = byte;
1201                         index++;
1202                 } else if (node->right) {
1203                         byte |= RIGHTNODE;
1204                         if (node->rightnode == NODE)
1205                                 byte |= TRIENODE;
1206                         *data++ = byte;
1207                         index++;
1208                 } else {
1209                         assert(0);
1210                 }
1211 skip:
1212                 while (node) {
1213                         bitmask = 1 << node->bitnum;
1214                         if (node->mark && (leftmask & bitmask) == 0) {
1215                                 leftmask |= bitmask;
1216                                 if (node->leftnode == LEAF) {
1217                                         assert(node->left);
1218                                         data = tree->leaf_emit(node->left,
1219                                                                data);
1220                                         index += tree->leaf_size(node->left);
1221                                 } else if (node->left) {
1222                                         assert(node->leftnode == NODE);
1223                                         indent += 1;
1224                                         node = node->left;
1225                                         break;
1226                                 }
1227                         }
1228                         if (node->mark && (rightmask & bitmask) == 0) {
1229                                 rightmask |= bitmask;
1230                                 if (node->rightnode == LEAF) {
1231                                         assert(node->right);
1232                                         data = tree->leaf_emit(node->right,
1233                                                                data);
1234                                         index += tree->leaf_size(node->right);
1235                                 } else if (node->right) {
1236                                         assert(node->rightnode==NODE);
1237                                         indent += 1;
1238                                         node = node->right;
1239                                         break;
1240                                 }
1241                         }
1242                         leftmask &= ~bitmask;
1243                         rightmask &= ~bitmask;
1244                         node = node->parent;
1245                         indent -= 1;
1246                 }
1247         }
1248 }
1249
1250 /* ------------------------------------------------------------------ */
1251
1252 /*
1253  * Unicode data.
1254  *
1255  * We need to keep track of the Canonical Combining Class, the Age,
1256  * and decompositions for a code point.
1257  *
1258  * For the Age, we store the index into the ages table.  Effectively
1259  * this is a generation number that the table maps to a unicode
1260  * version.
1261  *
1262  * The correction field is used to indicate that this entry is in the
1263  * corrections array, which contains decompositions that were
1264  * corrected in later revisions.  The value of the correction field is
1265  * the Unicode version in which the mapping was corrected.
1266  */
1267 struct unicode_data {
1268         unsigned int code;
1269         int ccc;
1270         int gen;
1271         int correction;
1272         unsigned int *utf32nfdi;
1273         unsigned int *utf32nfdicf;
1274         char *utf8nfdi;
1275         char *utf8nfdicf;
1276 };
1277
1278 struct unicode_data unicode_data[0x110000];
1279 struct unicode_data *corrections;
1280 int    corrections_count;
1281
1282 struct tree *nfdi_tree;
1283 struct tree *nfdicf_tree;
1284
1285 struct tree *trees;
1286 int          trees_count;
1287
1288 /*
1289  * Check the corrections array to see if this entry was corrected at
1290  * some point.
1291  */
1292 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1293 {
1294         int i;
1295
1296         for (i = 0; i != corrections_count; i++)
1297                 if (u->code == corrections[i].code)
1298                         return &corrections[i];
1299         return u;
1300 }
1301
1302 static int nfdi_equal(void *l, void *r)
1303 {
1304         struct unicode_data *left = l;
1305         struct unicode_data *right = r;
1306
1307         if (left->gen != right->gen)
1308                 return 0;
1309         if (left->ccc != right->ccc)
1310                 return 0;
1311         if (left->utf8nfdi && right->utf8nfdi &&
1312             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1313                 return 1;
1314         if (left->utf8nfdi || right->utf8nfdi)
1315                 return 0;
1316         return 1;
1317 }
1318
1319 static int nfdicf_equal(void *l, void *r)
1320 {
1321         struct unicode_data *left = l;
1322         struct unicode_data *right = r;
1323
1324         if (left->gen != right->gen)
1325                 return 0;
1326         if (left->ccc != right->ccc)
1327                 return 0;
1328         if (left->utf8nfdicf && right->utf8nfdicf &&
1329             strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1330                 return 1;
1331         if (left->utf8nfdicf && right->utf8nfdicf)
1332                 return 0;
1333         if (left->utf8nfdicf || right->utf8nfdicf)
1334                 return 0;
1335         if (left->utf8nfdi && right->utf8nfdi &&
1336             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1337                 return 1;
1338         if (left->utf8nfdi || right->utf8nfdi)
1339                 return 0;
1340         return 1;
1341 }
1342
1343 static void nfdi_print(void *l, int indent)
1344 {
1345         struct unicode_data *leaf = l;
1346
1347         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1348                 leaf->code, leaf->ccc, leaf->gen);
1349         if (leaf->utf8nfdi)
1350                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1351         printf("\n");
1352 }
1353
1354 static void nfdicf_print(void *l, int indent)
1355 {
1356         struct unicode_data *leaf = l;
1357
1358         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1359                 leaf->code, leaf->ccc, leaf->gen);
1360         if (leaf->utf8nfdicf)
1361                 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1362         else if (leaf->utf8nfdi)
1363                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1364         printf("\n");
1365 }
1366
1367 static int nfdi_mark(void *l)
1368 {
1369         return 1;
1370 }
1371
1372 static int nfdicf_mark(void *l)
1373 {
1374         struct unicode_data *leaf = l;
1375
1376         if (leaf->utf8nfdicf)
1377                 return 1;
1378         return 0;
1379 }
1380
1381 static int correction_mark(void *l)
1382 {
1383         struct unicode_data *leaf = l;
1384
1385         return leaf->correction;
1386 }
1387
1388 static int nfdi_size(void *l)
1389 {
1390         struct unicode_data *leaf = l;
1391
1392         int size = 2;
1393         if (leaf->utf8nfdi)
1394                 size += strlen(leaf->utf8nfdi) + 1;
1395         return size;
1396 }
1397
1398 static int nfdicf_size(void *l)
1399 {
1400         struct unicode_data *leaf = l;
1401
1402         int size = 2;
1403         if (leaf->utf8nfdicf)
1404                 size += strlen(leaf->utf8nfdicf) + 1;
1405         else if (leaf->utf8nfdi)
1406                 size += strlen(leaf->utf8nfdi) + 1;
1407         return size;
1408 }
1409
1410 static int *nfdi_index(struct tree *tree, void *l)
1411 {
1412         struct unicode_data *leaf = l;
1413
1414         return &tree->leafindex[leaf->code];
1415 }
1416
1417 static int *nfdicf_index(struct tree *tree, void *l)
1418 {
1419         struct unicode_data *leaf = l;
1420
1421         return &tree->leafindex[leaf->code];
1422 }
1423
1424 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1425 {
1426         struct unicode_data *leaf = l;
1427         unsigned char *s;
1428
1429         *data++ = leaf->gen;
1430         if (leaf->utf8nfdi) {
1431                 *data++ = DECOMPOSE;
1432                 s = (unsigned char*)leaf->utf8nfdi;
1433                 while ((*data++ = *s++) != 0)
1434                         ;
1435         } else {
1436                 *data++ = leaf->ccc;
1437         }
1438         return data;
1439 }
1440
1441 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1442 {
1443         struct unicode_data *leaf = l;
1444         unsigned char *s;
1445
1446         *data++ = leaf->gen;
1447         if (leaf->utf8nfdicf) {
1448                 *data++ = DECOMPOSE;
1449                 s = (unsigned char*)leaf->utf8nfdicf;
1450                 while ((*data++ = *s++) != 0)
1451                         ;
1452         } else if (leaf->utf8nfdi) {
1453                 *data++ = DECOMPOSE;
1454                 s = (unsigned char*)leaf->utf8nfdi;
1455                 while ((*data++ = *s++) != 0)
1456                         ;
1457         } else {
1458                 *data++ = leaf->ccc;
1459         }
1460         return data;
1461 }
1462
1463 static void utf8_create(struct unicode_data *data)
1464 {
1465         char utf[18*4+1];
1466         char *u;
1467         unsigned int *um;
1468         int i;
1469
1470         u = utf;
1471         um = data->utf32nfdi;
1472         if (um) {
1473                 for (i = 0; um[i]; i++)
1474                         u += utf8encode(u, um[i]);
1475                 *u = '\0';
1476                 data->utf8nfdi = strdup(utf);
1477         }
1478         u = utf;
1479         um = data->utf32nfdicf;
1480         if (um) {
1481                 for (i = 0; um[i]; i++)
1482                         u += utf8encode(u, um[i]);
1483                 *u = '\0';
1484                 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1485                         data->utf8nfdicf = strdup(utf);
1486         }
1487 }
1488
1489 static void utf8_init(void)
1490 {
1491         unsigned int unichar;
1492         int i;
1493
1494         for (unichar = 0; unichar != 0x110000; unichar++)
1495                 utf8_create(&unicode_data[unichar]);
1496
1497         for (i = 0; i != corrections_count; i++)
1498                 utf8_create(&corrections[i]);
1499 }
1500
1501 static void trees_init(void)
1502 {
1503         struct unicode_data *data;
1504         unsigned int maxage;
1505         unsigned int nextage;
1506         int count;
1507         int i;
1508         int j;
1509
1510         /* Count the number of different ages. */
1511         count = 0;
1512         nextage = (unsigned int)-1;
1513         do {
1514                 maxage = nextage;
1515                 nextage = 0;
1516                 for (i = 0; i <= corrections_count; i++) {
1517                         data = &corrections[i];
1518                         if (nextage < data->correction &&
1519                             data->correction < maxage)
1520                                 nextage = data->correction;
1521                 }
1522                 count++;
1523         } while (nextage);
1524
1525         /* Two trees per age: nfdi and nfdicf */
1526         trees_count = count * 2;
1527         trees = calloc(trees_count, sizeof(struct tree));
1528
1529         /* Assign ages to the trees. */
1530         count = trees_count;
1531         nextage = (unsigned int)-1;
1532         do {
1533                 maxage = nextage;
1534                 trees[--count].maxage = maxage;
1535                 trees[--count].maxage = maxage;
1536                 nextage = 0;
1537                 for (i = 0; i <= corrections_count; i++) {
1538                         data = &corrections[i];
1539                         if (nextage < data->correction &&
1540                             data->correction < maxage)
1541                                 nextage = data->correction;
1542                 }
1543         } while (nextage);
1544
1545         /* The ages assigned above are off by one. */
1546         for (i = 0; i != trees_count; i++) {
1547                 j = 0;
1548                 while (ages[j] < trees[i].maxage)
1549                         j++;
1550                 trees[i].maxage = ages[j-1];
1551         }
1552
1553         /* Set up the forwarding between trees. */
1554         trees[trees_count-2].next = &trees[trees_count-1];
1555         trees[trees_count-1].leaf_mark = nfdi_mark;
1556         trees[trees_count-2].leaf_mark = nfdicf_mark;
1557         for (i = 0; i != trees_count-2; i += 2) {
1558                 trees[i].next = &trees[trees_count-2];
1559                 trees[i].leaf_mark = correction_mark;
1560                 trees[i+1].next = &trees[trees_count-1];
1561                 trees[i+1].leaf_mark = correction_mark;
1562         }
1563
1564         /* Assign the callouts. */
1565         for (i = 0; i != trees_count; i += 2) {
1566                 trees[i].type = "nfdicf";
1567                 trees[i].leaf_equal = nfdicf_equal;
1568                 trees[i].leaf_print = nfdicf_print;
1569                 trees[i].leaf_size = nfdicf_size;
1570                 trees[i].leaf_index = nfdicf_index;
1571                 trees[i].leaf_emit = nfdicf_emit;
1572
1573                 trees[i+1].type = "nfdi";
1574                 trees[i+1].leaf_equal = nfdi_equal;
1575                 trees[i+1].leaf_print = nfdi_print;
1576                 trees[i+1].leaf_size = nfdi_size;
1577                 trees[i+1].leaf_index = nfdi_index;
1578                 trees[i+1].leaf_emit = nfdi_emit;
1579         }
1580
1581         /* Finish init. */
1582         for (i = 0; i != trees_count; i++)
1583                 trees[i].childnode = NODE;
1584 }
1585
1586 static void trees_populate(void)
1587 {
1588         struct unicode_data *data;
1589         unsigned int unichar;
1590         char keyval[4];
1591         int keylen;
1592         int i;
1593
1594         for (i = 0; i != trees_count; i++) {
1595                 if (verbose > 0) {
1596                         printf("Populating %s_%x\n",
1597                                 trees[i].type, trees[i].maxage);
1598                 }
1599                 for (unichar = 0; unichar != 0x110000; unichar++) {
1600                         if (unicode_data[unichar].gen < 0)
1601                                 continue;
1602                         keylen = utf8encode(keyval, unichar);
1603                         data = corrections_lookup(&unicode_data[unichar]);
1604                         if (data->correction <= trees[i].maxage)
1605                                 data = &unicode_data[unichar];
1606                         insert(&trees[i], keyval, keylen, data);
1607                 }
1608         }
1609 }
1610
1611 static void trees_reduce(void)
1612 {
1613         int i;
1614         int size;
1615         int changed;
1616
1617         for (i = 0; i != trees_count; i++)
1618                 prune(&trees[i]);
1619         for (i = 0; i != trees_count; i++)
1620                 mark_nodes(&trees[i]);
1621         do {
1622                 size = 0;
1623                 for (i = 0; i != trees_count; i++)
1624                         size = index_nodes(&trees[i], size);
1625                 changed = 0;
1626                 for (i = 0; i != trees_count; i++)
1627                         changed += size_nodes(&trees[i]);
1628         } while (changed);
1629
1630         utf8data = calloc(size, 1);
1631         utf8data_size = size;
1632         for (i = 0; i != trees_count; i++)
1633                 emit(&trees[i], utf8data);
1634
1635         if (verbose > 0) {
1636                 for (i = 0; i != trees_count; i++) {
1637                         printf("%s_%x idx %d\n",
1638                                 trees[i].type, trees[i].maxage, trees[i].index);
1639                 }
1640         }
1641
1642         nfdi = utf8data + trees[trees_count-1].index;
1643         nfdicf = utf8data + trees[trees_count-2].index;
1644
1645         nfdi_tree = &trees[trees_count-1];
1646         nfdicf_tree = &trees[trees_count-2];
1647 }
1648
1649 static void verify(struct tree *tree)
1650 {
1651         struct unicode_data *data;
1652         utf8leaf_t      *leaf;
1653         unsigned int    unichar;
1654         char            key[4];
1655         int             report;
1656         int             nocf;
1657
1658         if (verbose > 0)
1659                 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1660         nocf = strcmp(tree->type, "nfdicf");
1661
1662         for (unichar = 0; unichar != 0x110000; unichar++) {
1663                 report = 0;
1664                 data = corrections_lookup(&unicode_data[unichar]);
1665                 if (data->correction <= tree->maxage)
1666                         data = &unicode_data[unichar];
1667                 utf8encode(key,unichar);
1668                 leaf = utf8lookup(tree, key);
1669                 if (!leaf) {
1670                         if (data->gen != -1)
1671                                 report++;
1672                         if (unichar < 0xd800 || unichar > 0xdfff)
1673                                 report++;
1674                 } else {
1675                         if (unichar >= 0xd800 && unichar <= 0xdfff)
1676                                 report++;
1677                         if (data->gen == -1)
1678                                 report++;
1679                         if (data->gen != LEAF_GEN(leaf))
1680                                 report++;
1681                         if (LEAF_CCC(leaf) == DECOMPOSE) {
1682                                 if (nocf) {
1683                                         if (!data->utf8nfdi) {
1684                                                 report++;
1685                                         } else if (strcmp(data->utf8nfdi,
1686                                                           LEAF_STR(leaf))) {
1687                                                 report++;
1688                                         }
1689                                 } else {
1690                                         if (!data->utf8nfdicf &&
1691                                             !data->utf8nfdi) {
1692                                                 report++;
1693                                         } else if (data->utf8nfdicf) {
1694                                                 if (strcmp(data->utf8nfdicf,
1695                                                            LEAF_STR(leaf)))
1696                                                         report++;
1697                                         } else if (strcmp(data->utf8nfdi,
1698                                                           LEAF_STR(leaf))) {
1699                                                 report++;
1700                                         }
1701                                 }
1702                         } else if (data->ccc != LEAF_CCC(leaf)) {
1703                                 report++;
1704                         }
1705                 }
1706                 if (report) {
1707                         printf("%X code %X gen %d ccc %d"
1708                                 " nfdi -> \"%s\"",
1709                                 unichar, data->code, data->gen,
1710                                 data->ccc,
1711                                 data->utf8nfdi);
1712                         if (leaf) {
1713                                 printf(" gen %d ccc %d"
1714                                         " nfdi -> \"%s\"",
1715                                         LEAF_GEN(leaf),
1716                                         LEAF_CCC(leaf),
1717                                         LEAF_CCC(leaf) == DECOMPOSE ?
1718                                                 LEAF_STR(leaf) : "");
1719                         }
1720                         printf("\n");
1721                 }
1722         }
1723 }
1724
1725 static void trees_verify(void)
1726 {
1727         int i;
1728
1729         for (i = 0; i != trees_count; i++)
1730                 verify(&trees[i]);
1731 }
1732
1733 /* ------------------------------------------------------------------ */
1734
1735 static void help(void)
1736 {
1737         printf("Usage: %s [options]\n", argv0);
1738         printf("\n");
1739         printf("This program creates an a data trie used for parsing and\n");
1740         printf("normalization of UTF-8 strings. The trie is derived from\n");
1741         printf("a set of input files from the Unicode character database\n");
1742         printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1743         printf("\n");
1744         printf("The generated tree supports two normalization forms:\n");
1745         printf("\n");
1746         printf("\tnfdi:\n");
1747         printf("\t- Apply unicode normalization form NFD.\n");
1748         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1749         printf("\n");
1750         printf("\tnfdicf:\n");
1751         printf("\t- Apply unicode normalization form NFD.\n");
1752         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1753         printf("\t- Apply a full casefold (C + F).\n");
1754         printf("\n");
1755         printf("These forms were chosen as being most useful when dealing\n");
1756         printf("with file names: NFD catches most cases where characters\n");
1757         printf("should be considered equivalent. The ignorables are mostly\n");
1758         printf("invisible, making names hard to type.\n");
1759         printf("\n");
1760         printf("The options to specify the files to be used are listed\n");
1761         printf("below with their default values, which are the names used\n");
1762         printf("by version 11.0.0 of the Unicode Character Database.\n");
1763         printf("\n");
1764         printf("The input files:\n");
1765         printf("\t-a %s\n", AGE_NAME);
1766         printf("\t-c %s\n", CCC_NAME);
1767         printf("\t-p %s\n", PROP_NAME);
1768         printf("\t-d %s\n", DATA_NAME);
1769         printf("\t-f %s\n", FOLD_NAME);
1770         printf("\t-n %s\n", NORM_NAME);
1771         printf("\n");
1772         printf("Additionally, the generated tables are tested using:\n");
1773         printf("\t-t %s\n", TEST_NAME);
1774         printf("\n");
1775         printf("Finally, the output file:\n");
1776         printf("\t-o %s\n", UTF8_NAME);
1777         printf("\n");
1778 }
1779
1780 static void usage(void)
1781 {
1782         help();
1783         exit(1);
1784 }
1785
1786 static void open_fail(const char *name, int error)
1787 {
1788         printf("Error %d opening %s: %s\n", error, name, strerror(error));
1789         exit(1);
1790 }
1791
1792 static void file_fail(const char *filename)
1793 {
1794         printf("Error parsing %s\n", filename);
1795         exit(1);
1796 }
1797
1798 static void line_fail(const char *filename, const char *line)
1799 {
1800         printf("Error parsing %s:%s\n", filename, line);
1801         exit(1);
1802 }
1803
1804 /* ------------------------------------------------------------------ */
1805
1806 static void print_utf32(unsigned int *utf32str)
1807 {
1808         int     i;
1809
1810         for (i = 0; utf32str[i]; i++)
1811                 printf(" %X", utf32str[i]);
1812 }
1813
1814 static void print_utf32nfdi(unsigned int unichar)
1815 {
1816         printf(" %X ->", unichar);
1817         print_utf32(unicode_data[unichar].utf32nfdi);
1818         printf("\n");
1819 }
1820
1821 static void print_utf32nfdicf(unsigned int unichar)
1822 {
1823         printf(" %X ->", unichar);
1824         print_utf32(unicode_data[unichar].utf32nfdicf);
1825         printf("\n");
1826 }
1827
1828 /* ------------------------------------------------------------------ */
1829
1830 static void age_init(void)
1831 {
1832         FILE *file;
1833         unsigned int first;
1834         unsigned int last;
1835         unsigned int unichar;
1836         unsigned int major;
1837         unsigned int minor;
1838         unsigned int revision;
1839         int gen;
1840         int count;
1841         int ret;
1842
1843         if (verbose > 0)
1844                 printf("Parsing %s\n", age_name);
1845
1846         file = fopen(age_name, "r");
1847         if (!file)
1848                 open_fail(age_name, errno);
1849         count = 0;
1850
1851         gen = 0;
1852         while (fgets(line, LINESIZE, file)) {
1853                 ret = sscanf(line, "# Age=V%d_%d_%d",
1854                                 &major, &minor, &revision);
1855                 if (ret == 3) {
1856                         ages_count++;
1857                         if (verbose > 1)
1858                                 printf(" Age V%d_%d_%d\n",
1859                                         major, minor, revision);
1860                         if (!age_valid(major, minor, revision))
1861                                 line_fail(age_name, line);
1862                         continue;
1863                 }
1864                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1865                 if (ret == 2) {
1866                         ages_count++;
1867                         if (verbose > 1)
1868                                 printf(" Age V%d_%d\n", major, minor);
1869                         if (!age_valid(major, minor, 0))
1870                                 line_fail(age_name, line);
1871                         continue;
1872                 }
1873         }
1874
1875         /* We must have found something above. */
1876         if (verbose > 1)
1877                 printf("%d age entries\n", ages_count);
1878         if (ages_count == 0 || ages_count > MAXGEN)
1879                 file_fail(age_name);
1880
1881         /* There is a 0 entry. */
1882         ages_count++;
1883         ages = calloc(ages_count + 1, sizeof(*ages));
1884         /* And a guard entry. */
1885         ages[ages_count] = (unsigned int)-1;
1886
1887         rewind(file);
1888         count = 0;
1889         gen = 0;
1890         while (fgets(line, LINESIZE, file)) {
1891                 ret = sscanf(line, "# Age=V%d_%d_%d",
1892                                 &major, &minor, &revision);
1893                 if (ret == 3) {
1894                         ages[++gen] =
1895                                 UNICODE_AGE(major, minor, revision);
1896                         if (verbose > 1)
1897                                 printf(" Age V%d_%d_%d = gen %d\n",
1898                                         major, minor, revision, gen);
1899                         if (!age_valid(major, minor, revision))
1900                                 line_fail(age_name, line);
1901                         continue;
1902                 }
1903                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1904                 if (ret == 2) {
1905                         ages[++gen] = UNICODE_AGE(major, minor, 0);
1906                         if (verbose > 1)
1907                                 printf(" Age V%d_%d = %d\n",
1908                                         major, minor, gen);
1909                         if (!age_valid(major, minor, 0))
1910                                 line_fail(age_name, line);
1911                         continue;
1912                 }
1913                 ret = sscanf(line, "%X..%X ; %d.%d #",
1914                              &first, &last, &major, &minor);
1915                 if (ret == 4) {
1916                         for (unichar = first; unichar <= last; unichar++)
1917                                 unicode_data[unichar].gen = gen;
1918                         count += 1 + last - first;
1919                         if (verbose > 1)
1920                                 printf("  %X..%X gen %d\n", first, last, gen);
1921                         if (!utf32valid(first) || !utf32valid(last))
1922                                 line_fail(age_name, line);
1923                         continue;
1924                 }
1925                 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
1926                 if (ret == 3) {
1927                         unicode_data[unichar].gen = gen;
1928                         count++;
1929                         if (verbose > 1)
1930                                 printf("  %X gen %d\n", unichar, gen);
1931                         if (!utf32valid(unichar))
1932                                 line_fail(age_name, line);
1933                         continue;
1934                 }
1935         }
1936         unicode_maxage = ages[gen];
1937         fclose(file);
1938
1939         /* Nix surrogate block */
1940         if (verbose > 1)
1941                 printf(" Removing surrogate block D800..DFFF\n");
1942         for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
1943                 unicode_data[unichar].gen = -1;
1944
1945         if (verbose > 0)
1946                 printf("Found %d entries\n", count);
1947         if (count == 0)
1948                 file_fail(age_name);
1949 }
1950
1951 static void ccc_init(void)
1952 {
1953         FILE *file;
1954         unsigned int first;
1955         unsigned int last;
1956         unsigned int unichar;
1957         unsigned int value;
1958         int count;
1959         int ret;
1960
1961         if (verbose > 0)
1962                 printf("Parsing %s\n", ccc_name);
1963
1964         file = fopen(ccc_name, "r");
1965         if (!file)
1966                 open_fail(ccc_name, errno);
1967
1968         count = 0;
1969         while (fgets(line, LINESIZE, file)) {
1970                 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
1971                 if (ret == 3) {
1972                         for (unichar = first; unichar <= last; unichar++) {
1973                                 unicode_data[unichar].ccc = value;
1974                                 count++;
1975                         }
1976                         if (verbose > 1)
1977                                 printf(" %X..%X ccc %d\n", first, last, value);
1978                         if (!utf32valid(first) || !utf32valid(last))
1979                                 line_fail(ccc_name, line);
1980                         continue;
1981                 }
1982                 ret = sscanf(line, "%X ; %d #", &unichar, &value);
1983                 if (ret == 2) {
1984                         unicode_data[unichar].ccc = value;
1985                         count++;
1986                         if (verbose > 1)
1987                                 printf(" %X ccc %d\n", unichar, value);
1988                         if (!utf32valid(unichar))
1989                                 line_fail(ccc_name, line);
1990                         continue;
1991                 }
1992         }
1993         fclose(file);
1994
1995         if (verbose > 0)
1996                 printf("Found %d entries\n", count);
1997         if (count == 0)
1998                 file_fail(ccc_name);
1999 }
2000
2001 static int ignore_compatibility_form(char *type)
2002 {
2003         int i;
2004         char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2005                                  "final", "isolated", "circle", "super",
2006                                  "sub", "vertical", "wide", "narrow",
2007                                  "small", "square", "fraction", "compat"};
2008
2009         for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2010                 if (strcmp(type, ignored_types[i]) == 0)
2011                         return 1;
2012         return 0;
2013 }
2014
2015 static void nfdi_init(void)
2016 {
2017         FILE *file;
2018         unsigned int unichar;
2019         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2020         char *s;
2021         char *type;
2022         unsigned int *um;
2023         int count;
2024         int i;
2025         int ret;
2026
2027         if (verbose > 0)
2028                 printf("Parsing %s\n", data_name);
2029         file = fopen(data_name, "r");
2030         if (!file)
2031                 open_fail(data_name, errno);
2032
2033         count = 0;
2034         while (fgets(line, LINESIZE, file)) {
2035                 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2036                              &unichar, buf0);
2037                 if (ret != 2)
2038                         continue;
2039                 if (!utf32valid(unichar))
2040                         line_fail(data_name, line);
2041
2042                 s = buf0;
2043                 /* skip over <tag> */
2044                 if (*s == '<') {
2045                         type = ++s;
2046                         while (*++s != '>');
2047                         *s++ = '\0';
2048                         if(ignore_compatibility_form(type))
2049                                 continue;
2050                 }
2051                 /* decode the decomposition into UTF-32 */
2052                 i = 0;
2053                 while (*s) {
2054                         mapping[i] = strtoul(s, &s, 16);
2055                         if (!utf32valid(mapping[i]))
2056                                 line_fail(data_name, line);
2057                         i++;
2058                 }
2059                 mapping[i++] = 0;
2060
2061                 um = malloc(i * sizeof(unsigned int));
2062                 memcpy(um, mapping, i * sizeof(unsigned int));
2063                 unicode_data[unichar].utf32nfdi = um;
2064
2065                 if (verbose > 1)
2066                         print_utf32nfdi(unichar);
2067                 count++;
2068         }
2069         fclose(file);
2070         if (verbose > 0)
2071                 printf("Found %d entries\n", count);
2072         if (count == 0)
2073                 file_fail(data_name);
2074 }
2075
2076 static void nfdicf_init(void)
2077 {
2078         FILE *file;
2079         unsigned int unichar;
2080         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2081         char status;
2082         char *s;
2083         unsigned int *um;
2084         int i;
2085         int count;
2086         int ret;
2087
2088         if (verbose > 0)
2089                 printf("Parsing %s\n", fold_name);
2090         file = fopen(fold_name, "r");
2091         if (!file)
2092                 open_fail(fold_name, errno);
2093
2094         count = 0;
2095         while (fgets(line, LINESIZE, file)) {
2096                 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2097                 if (ret != 3)
2098                         continue;
2099                 if (!utf32valid(unichar))
2100                         line_fail(fold_name, line);
2101                 /* Use the C+F casefold. */
2102                 if (status != 'C' && status != 'F')
2103                         continue;
2104                 s = buf0;
2105                 if (*s == '<')
2106                         while (*s++ != ' ')
2107                                 ;
2108                 i = 0;
2109                 while (*s) {
2110                         mapping[i] = strtoul(s, &s, 16);
2111                         if (!utf32valid(mapping[i]))
2112                                 line_fail(fold_name, line);
2113                         i++;
2114                 }
2115                 mapping[i++] = 0;
2116
2117                 um = malloc(i * sizeof(unsigned int));
2118                 memcpy(um, mapping, i * sizeof(unsigned int));
2119                 unicode_data[unichar].utf32nfdicf = um;
2120
2121                 if (verbose > 1)
2122                         print_utf32nfdicf(unichar);
2123                 count++;
2124         }
2125         fclose(file);
2126         if (verbose > 0)
2127                 printf("Found %d entries\n", count);
2128         if (count == 0)
2129                 file_fail(fold_name);
2130 }
2131
2132 static void ignore_init(void)
2133 {
2134         FILE *file;
2135         unsigned int unichar;
2136         unsigned int first;
2137         unsigned int last;
2138         unsigned int *um;
2139         int count;
2140         int ret;
2141
2142         if (verbose > 0)
2143                 printf("Parsing %s\n", prop_name);
2144         file = fopen(prop_name, "r");
2145         if (!file)
2146                 open_fail(prop_name, errno);
2147         assert(file);
2148         count = 0;
2149         while (fgets(line, LINESIZE, file)) {
2150                 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2151                 if (ret == 3) {
2152                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2153                                 continue;
2154                         if (!utf32valid(first) || !utf32valid(last))
2155                                 line_fail(prop_name, line);
2156                         for (unichar = first; unichar <= last; unichar++) {
2157                                 free(unicode_data[unichar].utf32nfdi);
2158                                 um = malloc(sizeof(unsigned int));
2159                                 *um = 0;
2160                                 unicode_data[unichar].utf32nfdi = um;
2161                                 free(unicode_data[unichar].utf32nfdicf);
2162                                 um = malloc(sizeof(unsigned int));
2163                                 *um = 0;
2164                                 unicode_data[unichar].utf32nfdicf = um;
2165                                 count++;
2166                         }
2167                         if (verbose > 1)
2168                                 printf(" %X..%X Default_Ignorable_Code_Point\n",
2169                                         first, last);
2170                         continue;
2171                 }
2172                 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2173                 if (ret == 2) {
2174                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2175                                 continue;
2176                         if (!utf32valid(unichar))
2177                                 line_fail(prop_name, line);
2178                         free(unicode_data[unichar].utf32nfdi);
2179                         um = malloc(sizeof(unsigned int));
2180                         *um = 0;
2181                         unicode_data[unichar].utf32nfdi = um;
2182                         free(unicode_data[unichar].utf32nfdicf);
2183                         um = malloc(sizeof(unsigned int));
2184                         *um = 0;
2185                         unicode_data[unichar].utf32nfdicf = um;
2186                         if (verbose > 1)
2187                                 printf(" %X Default_Ignorable_Code_Point\n",
2188                                         unichar);
2189                         count++;
2190                         continue;
2191                 }
2192         }
2193         fclose(file);
2194
2195         if (verbose > 0)
2196                 printf("Found %d entries\n", count);
2197         if (count == 0)
2198                 file_fail(prop_name);
2199 }
2200
2201 static void corrections_init(void)
2202 {
2203         FILE *file;
2204         unsigned int unichar;
2205         unsigned int major;
2206         unsigned int minor;
2207         unsigned int revision;
2208         unsigned int age;
2209         unsigned int *um;
2210         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2211         char *s;
2212         int i;
2213         int count;
2214         int ret;
2215
2216         if (verbose > 0)
2217                 printf("Parsing %s\n", norm_name);
2218         file = fopen(norm_name, "r");
2219         if (!file)
2220                 open_fail(norm_name, errno);
2221
2222         count = 0;
2223         while (fgets(line, LINESIZE, file)) {
2224                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2225                                 &unichar, buf0, buf1,
2226                                 &major, &minor, &revision);
2227                 if (ret != 6)
2228                         continue;
2229                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2230                         line_fail(norm_name, line);
2231                 count++;
2232         }
2233         corrections = calloc(count, sizeof(struct unicode_data));
2234         corrections_count = count;
2235         rewind(file);
2236
2237         count = 0;
2238         while (fgets(line, LINESIZE, file)) {
2239                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2240                                 &unichar, buf0, buf1,
2241                                 &major, &minor, &revision);
2242                 if (ret != 6)
2243                         continue;
2244                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2245                         line_fail(norm_name, line);
2246                 corrections[count] = unicode_data[unichar];
2247                 assert(corrections[count].code == unichar);
2248                 age = UNICODE_AGE(major, minor, revision);
2249                 corrections[count].correction = age;
2250
2251                 i = 0;
2252                 s = buf0;
2253                 while (*s) {
2254                         mapping[i] = strtoul(s, &s, 16);
2255                         if (!utf32valid(mapping[i]))
2256                                 line_fail(norm_name, line);
2257                         i++;
2258                 }
2259                 mapping[i++] = 0;
2260
2261                 um = malloc(i * sizeof(unsigned int));
2262                 memcpy(um, mapping, i * sizeof(unsigned int));
2263                 corrections[count].utf32nfdi = um;
2264
2265                 if (verbose > 1)
2266                         printf(" %X -> %s -> %s V%d_%d_%d\n",
2267                                 unichar, buf0, buf1, major, minor, revision);
2268                 count++;
2269         }
2270         fclose(file);
2271
2272         if (verbose > 0)
2273                 printf("Found %d entries\n", count);
2274         if (count == 0)
2275                 file_fail(norm_name);
2276 }
2277
2278 /* ------------------------------------------------------------------ */
2279
2280 /*
2281  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2282  *
2283  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2284  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2285  *
2286  * SBase = 0xAC00
2287  * LBase = 0x1100
2288  * VBase = 0x1161
2289  * TBase = 0x11A7
2290  * LCount = 19
2291  * VCount = 21
2292  * TCount = 28
2293  * NCount = 588 (VCount * TCount)
2294  * SCount = 11172 (LCount * NCount)
2295  *
2296  * Decomposition:
2297  *   SIndex = s - SBase
2298  *
2299  * LV (Canonical/Full)
2300  *   LIndex = SIndex / NCount
2301  *   VIndex = (Sindex % NCount) / TCount
2302  *   LPart = LBase + LIndex
2303  *   VPart = VBase + VIndex
2304  *
2305  * LVT (Canonical)
2306  *   LVIndex = (SIndex / TCount) * TCount
2307  *   TIndex = (Sindex % TCount)
2308  *   LVPart = SBase + LVIndex
2309  *   TPart = TBase + TIndex
2310  *
2311  * LVT (Full)
2312  *   LIndex = SIndex / NCount
2313  *   VIndex = (Sindex % NCount) / TCount
2314  *   TIndex = (Sindex % TCount)
2315  *   LPart = LBase + LIndex
2316  *   VPart = VBase + VIndex
2317  *   if (TIndex == 0) {
2318  *          d = <LPart, VPart>
2319  *   } else {
2320  *          TPart = TBase + TIndex
2321  *          d = <LPart, VPart, TPart>
2322  *   }
2323  *
2324  */
2325
2326 static void
2327 hangul_decompose(void)
2328 {
2329         unsigned int sb = 0xAC00;
2330         unsigned int lb = 0x1100;
2331         unsigned int vb = 0x1161;
2332         unsigned int tb = 0x11a7;
2333         /* unsigned int lc = 19; */
2334         unsigned int vc = 21;
2335         unsigned int tc = 28;
2336         unsigned int nc = (vc * tc);
2337         /* unsigned int sc = (lc * nc); */
2338         unsigned int unichar;
2339         unsigned int mapping[4];
2340         unsigned int *um;
2341         int count;
2342         int i;
2343
2344         if (verbose > 0)
2345                 printf("Decomposing hangul\n");
2346         /* Hangul */
2347         count = 0;
2348         for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2349                 unsigned int si = unichar - sb;
2350                 unsigned int li = si / nc;
2351                 unsigned int vi = (si % nc) / tc;
2352                 unsigned int ti = si % tc;
2353
2354                 i = 0;
2355                 mapping[i++] = lb + li;
2356                 mapping[i++] = vb + vi;
2357                 if (ti)
2358                         mapping[i++] = tb + ti;
2359                 mapping[i++] = 0;
2360
2361                 assert(!unicode_data[unichar].utf32nfdi);
2362                 um = malloc(i * sizeof(unsigned int));
2363                 memcpy(um, mapping, i * sizeof(unsigned int));
2364                 unicode_data[unichar].utf32nfdi = um;
2365
2366                 assert(!unicode_data[unichar].utf32nfdicf);
2367                 um = malloc(i * sizeof(unsigned int));
2368                 memcpy(um, mapping, i * sizeof(unsigned int));
2369                 unicode_data[unichar].utf32nfdicf = um;
2370
2371                 if (verbose > 1)
2372                         print_utf32nfdi(unichar);
2373
2374                 count++;
2375         }
2376         if (verbose > 0)
2377                 printf("Created %d entries\n", count);
2378 }
2379
2380 static void nfdi_decompose(void)
2381 {
2382         unsigned int unichar;
2383         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2384         unsigned int *um;
2385         unsigned int *dc;
2386         int count;
2387         int i;
2388         int j;
2389         int ret;
2390
2391         if (verbose > 0)
2392                 printf("Decomposing nfdi\n");
2393
2394         count = 0;
2395         for (unichar = 0; unichar != 0x110000; unichar++) {
2396                 if (!unicode_data[unichar].utf32nfdi)
2397                         continue;
2398                 for (;;) {
2399                         ret = 1;
2400                         i = 0;
2401                         um = unicode_data[unichar].utf32nfdi;
2402                         while (*um) {
2403                                 dc = unicode_data[*um].utf32nfdi;
2404                                 if (dc) {
2405                                         for (j = 0; dc[j]; j++)
2406                                                 mapping[i++] = dc[j];
2407                                         ret = 0;
2408                                 } else {
2409                                         mapping[i++] = *um;
2410                                 }
2411                                 um++;
2412                         }
2413                         mapping[i++] = 0;
2414                         if (ret)
2415                                 break;
2416                         free(unicode_data[unichar].utf32nfdi);
2417                         um = malloc(i * sizeof(unsigned int));
2418                         memcpy(um, mapping, i * sizeof(unsigned int));
2419                         unicode_data[unichar].utf32nfdi = um;
2420                 }
2421                 /* Add this decomposition to nfdicf if there is no entry. */
2422                 if (!unicode_data[unichar].utf32nfdicf) {
2423                         um = malloc(i * sizeof(unsigned int));
2424                         memcpy(um, mapping, i * sizeof(unsigned int));
2425                         unicode_data[unichar].utf32nfdicf = um;
2426                 }
2427                 if (verbose > 1)
2428                         print_utf32nfdi(unichar);
2429                 count++;
2430         }
2431         if (verbose > 0)
2432                 printf("Processed %d entries\n", count);
2433 }
2434
2435 static void nfdicf_decompose(void)
2436 {
2437         unsigned int unichar;
2438         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2439         unsigned int *um;
2440         unsigned int *dc;
2441         int count;
2442         int i;
2443         int j;
2444         int ret;
2445
2446         if (verbose > 0)
2447                 printf("Decomposing nfdicf\n");
2448         count = 0;
2449         for (unichar = 0; unichar != 0x110000; unichar++) {
2450                 if (!unicode_data[unichar].utf32nfdicf)
2451                         continue;
2452                 for (;;) {
2453                         ret = 1;
2454                         i = 0;
2455                         um = unicode_data[unichar].utf32nfdicf;
2456                         while (*um) {
2457                                 dc = unicode_data[*um].utf32nfdicf;
2458                                 if (dc) {
2459                                         for (j = 0; dc[j]; j++)
2460                                                 mapping[i++] = dc[j];
2461                                         ret = 0;
2462                                 } else {
2463                                         mapping[i++] = *um;
2464                                 }
2465                                 um++;
2466                         }
2467                         mapping[i++] = 0;
2468                         if (ret)
2469                                 break;
2470                         free(unicode_data[unichar].utf32nfdicf);
2471                         um = malloc(i * sizeof(unsigned int));
2472                         memcpy(um, mapping, i * sizeof(unsigned int));
2473                         unicode_data[unichar].utf32nfdicf = um;
2474                 }
2475                 if (verbose > 1)
2476                         print_utf32nfdicf(unichar);
2477                 count++;
2478         }
2479         if (verbose > 0)
2480                 printf("Processed %d entries\n", count);
2481 }
2482
2483 /* ------------------------------------------------------------------ */
2484
2485 int utf8agemax(struct tree *, const char *);
2486 int utf8nagemax(struct tree *, const char *, size_t);
2487 int utf8agemin(struct tree *, const char *);
2488 int utf8nagemin(struct tree *, const char *, size_t);
2489 ssize_t utf8len(struct tree *, const char *);
2490 ssize_t utf8nlen(struct tree *, const char *, size_t);
2491 struct utf8cursor;
2492 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2493 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2494 int utf8byte(struct utf8cursor *);
2495
2496 /*
2497  * Use trie to scan s, touching at most len bytes.
2498  * Returns the leaf if one exists, NULL otherwise.
2499  *
2500  * A non-NULL return guarantees that the UTF-8 sequence starting at s
2501  * is well-formed and corresponds to a known unicode code point.  The
2502  * shorthand for this will be "is valid UTF-8 unicode".
2503  */
2504 static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
2505 {
2506         utf8trie_t      *trie;
2507         int             offlen;
2508         int             offset;
2509         int             mask;
2510         int             node;
2511
2512         if (!tree)
2513                 return NULL;
2514         if (len == 0)
2515                 return NULL;
2516         node = 1;
2517         trie = utf8data + tree->index;
2518         while (node) {
2519                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2520                 if (*trie & NEXTBYTE) {
2521                         if (--len == 0)
2522                                 return NULL;
2523                         s++;
2524                 }
2525                 mask = 1 << (*trie & BITNUM);
2526                 if (*s & mask) {
2527                         /* Right leg */
2528                         if (offlen) {
2529                                 /* Right node at offset of trie */
2530                                 node = (*trie & RIGHTNODE);
2531                                 offset = trie[offlen];
2532                                 while (--offlen) {
2533                                         offset <<= 8;
2534                                         offset |= trie[offlen];
2535                                 }
2536                                 trie += offset;
2537                         } else if (*trie & RIGHTPATH) {
2538                                 /* Right node after this node */
2539                                 node = (*trie & TRIENODE);
2540                                 trie++;
2541                         } else {
2542                                 /* No right node. */
2543                                 return NULL;
2544                         }
2545                 } else {
2546                         /* Left leg */
2547                         if (offlen) {
2548                                 /* Left node after this node. */
2549                                 node = (*trie & LEFTNODE);
2550                                 trie += offlen + 1;
2551                         } else if (*trie & RIGHTPATH) {
2552                                 /* No left node. */
2553                                 return NULL;
2554                         } else {
2555                                 /* Left node after this node */
2556                                 node = (*trie & TRIENODE);
2557                                 trie++;
2558                         }
2559                 }
2560         }
2561         return trie;
2562 }
2563
2564 /*
2565  * Use trie to scan s.
2566  * Returns the leaf if one exists, NULL otherwise.
2567  *
2568  * Forwards to trie_nlookup().
2569  */
2570 static utf8leaf_t *utf8lookup(struct tree *tree, const char *s)
2571 {
2572         return utf8nlookup(tree, s, (size_t)-1);
2573 }
2574
2575 /*
2576  * Return the number of bytes used by the current UTF-8 sequence.
2577  * Assumes the input points to the first byte of a valid UTF-8
2578  * sequence.
2579  */
2580 static inline int utf8clen(const char *s)
2581 {
2582         unsigned char c = *s;
2583         return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2584 }
2585
2586 /*
2587  * Maximum age of any character in s.
2588  * Return -1 if s is not valid UTF-8 unicode.
2589  * Return 0 if only non-assigned code points are used.
2590  */
2591 int utf8agemax(struct tree *tree, const char *s)
2592 {
2593         utf8leaf_t      *leaf;
2594         int             age = 0;
2595         int             leaf_age;
2596
2597         if (!tree)
2598                 return -1;
2599         while (*s) {
2600                 if (!(leaf = utf8lookup(tree, s)))
2601                         return -1;
2602                 leaf_age = ages[LEAF_GEN(leaf)];
2603                 if (leaf_age <= tree->maxage && leaf_age > age)
2604                         age = leaf_age;
2605                 s += utf8clen(s);
2606         }
2607         return age;
2608 }
2609
2610 /*
2611  * Minimum age of any character in s.
2612  * Return -1 if s is not valid UTF-8 unicode.
2613  * Return 0 if non-assigned code points are used.
2614  */
2615 int utf8agemin(struct tree *tree, const char *s)
2616 {
2617         utf8leaf_t      *leaf;
2618         int             age;
2619         int             leaf_age;
2620
2621         if (!tree)
2622                 return -1;
2623         age = tree->maxage;
2624         while (*s) {
2625                 if (!(leaf = utf8lookup(tree, s)))
2626                         return -1;
2627                 leaf_age = ages[LEAF_GEN(leaf)];
2628                 if (leaf_age <= tree->maxage && leaf_age < age)
2629                         age = leaf_age;
2630                 s += utf8clen(s);
2631         }
2632         return age;
2633 }
2634
2635 /*
2636  * Maximum age of any character in s, touch at most len bytes.
2637  * Return -1 if s is not valid UTF-8 unicode.
2638  */
2639 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2640 {
2641         utf8leaf_t      *leaf;
2642         int             age = 0;
2643         int             leaf_age;
2644
2645         if (!tree)
2646                 return -1;
2647         while (len && *s) {
2648                 if (!(leaf = utf8nlookup(tree, s, len)))
2649                         return -1;
2650                 leaf_age = ages[LEAF_GEN(leaf)];
2651                 if (leaf_age <= tree->maxage && leaf_age > age)
2652                         age = leaf_age;
2653                 len -= utf8clen(s);
2654                 s += utf8clen(s);
2655         }
2656         return age;
2657 }
2658
2659 /*
2660  * Maximum age of any character in s, touch at most len bytes.
2661  * Return -1 if s is not valid UTF-8 unicode.
2662  */
2663 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2664 {
2665         utf8leaf_t      *leaf;
2666         int             leaf_age;
2667         int             age;
2668
2669         if (!tree)
2670                 return -1;
2671         age = tree->maxage;
2672         while (len && *s) {
2673                 if (!(leaf = utf8nlookup(tree, s, len)))
2674                         return -1;
2675                 leaf_age = ages[LEAF_GEN(leaf)];
2676                 if (leaf_age <= tree->maxage && leaf_age < age)
2677                         age = leaf_age;
2678                 len -= utf8clen(s);
2679                 s += utf8clen(s);
2680         }
2681         return age;
2682 }
2683
2684 /*
2685  * Length of the normalization of s.
2686  * Return -1 if s is not valid UTF-8 unicode.
2687  *
2688  * A string of Default_Ignorable_Code_Point has length 0.
2689  */
2690 ssize_t utf8len(struct tree *tree, const char *s)
2691 {
2692         utf8leaf_t      *leaf;
2693         size_t          ret = 0;
2694
2695         if (!tree)
2696                 return -1;
2697         while (*s) {
2698                 if (!(leaf = utf8lookup(tree, s)))
2699                         return -1;
2700                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2701                         ret += utf8clen(s);
2702                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2703                         ret += strlen(LEAF_STR(leaf));
2704                 else
2705                         ret += utf8clen(s);
2706                 s += utf8clen(s);
2707         }
2708         return ret;
2709 }
2710
2711 /*
2712  * Length of the normalization of s, touch at most len bytes.
2713  * Return -1 if s is not valid UTF-8 unicode.
2714  */
2715 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2716 {
2717         utf8leaf_t      *leaf;
2718         size_t          ret = 0;
2719
2720         if (!tree)
2721                 return -1;
2722         while (len && *s) {
2723                 if (!(leaf = utf8nlookup(tree, s, len)))
2724                         return -1;
2725                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2726                         ret += utf8clen(s);
2727                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2728                         ret += strlen(LEAF_STR(leaf));
2729                 else
2730                         ret += utf8clen(s);
2731                 len -= utf8clen(s);
2732                 s += utf8clen(s);
2733         }
2734         return ret;
2735 }
2736
2737 /*
2738  * Cursor structure used by the normalizer.
2739  */
2740 struct utf8cursor {
2741         struct tree     *tree;
2742         const char      *s;
2743         const char      *p;
2744         const char      *ss;
2745         const char      *sp;
2746         unsigned int    len;
2747         unsigned int    slen;
2748         short int       ccc;
2749         short int       nccc;
2750         unsigned int    unichar;
2751 };
2752
2753 /*
2754  * Set up an utf8cursor for use by utf8byte().
2755  *
2756  *   s      : string.
2757  *   len    : length of s.
2758  *   u8c    : pointer to cursor.
2759  *   trie   : utf8trie_t to use for normalization.
2760  *
2761  * Returns -1 on error, 0 on success.
2762  */
2763 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2764                 size_t len)
2765 {
2766         if (!tree)
2767                 return -1;
2768         if (!s)
2769                 return -1;
2770         u8c->tree = tree;
2771         u8c->s = s;
2772         u8c->p = NULL;
2773         u8c->ss = NULL;
2774         u8c->sp = NULL;
2775         u8c->len = len;
2776         u8c->slen = 0;
2777         u8c->ccc = STOPPER;
2778         u8c->nccc = STOPPER;
2779         u8c->unichar = 0;
2780         /* Check we didn't clobber the maximum length. */
2781         if (u8c->len != len)
2782                 return -1;
2783         /* The first byte of s may not be an utf8 continuation. */
2784         if (len > 0 && (*s & 0xC0) == 0x80)
2785                 return -1;
2786         return 0;
2787 }
2788
2789 /*
2790  * Set up an utf8cursor for use by utf8byte().
2791  *
2792  *   s      : NUL-terminated string.
2793  *   u8c    : pointer to cursor.
2794  *   trie   : utf8trie_t to use for normalization.
2795  *
2796  * Returns -1 on error, 0 on success.
2797  */
2798 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
2799 {
2800         return utf8ncursor(u8c, tree, s, (unsigned int)-1);
2801 }
2802
2803 /*
2804  * Get one byte from the normalized form of the string described by u8c.
2805  *
2806  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
2807  *
2808  * The cursor keeps track of the location in the string in u8c->s.
2809  * When a character is decomposed, the current location is stored in
2810  * u8c->p, and u8c->s is set to the start of the decomposition. Note
2811  * that bytes from a decomposition do not count against u8c->len.
2812  *
2813  * Characters are emitted if they match the current CCC in u8c->ccc.
2814  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
2815  * and the function returns 0 in that case.
2816  *
2817  * Sorting by CCC is done by repeatedly scanning the string.  The
2818  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
2819  * the start of the scan.  The first pass finds the lowest CCC to be
2820  * emitted and stores it in u8c->nccc, the second pass emits the
2821  * characters with this CCC and finds the next lowest CCC. This limits
2822  * the number of passes to 1 + the number of different CCCs in the
2823  * sequence being scanned.
2824  *
2825  * Therefore:
2826  *  u8c->p  != NULL -> a decomposition is being scanned.
2827  *  u8c->ss != NULL -> this is a repeating scan.
2828  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
2829  */
2830 int utf8byte(struct utf8cursor *u8c)
2831 {
2832         utf8leaf_t *leaf;
2833         int ccc;
2834
2835         for (;;) {
2836                 /* Check for the end of a decomposed character. */
2837                 if (u8c->p && *u8c->s == '\0') {
2838                         u8c->s = u8c->p;
2839                         u8c->p = NULL;
2840                 }
2841
2842                 /* Check for end-of-string. */
2843                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
2844                         /* There is no next byte. */
2845                         if (u8c->ccc == STOPPER)
2846                                 return 0;
2847                         /* End-of-string during a scan counts as a stopper. */
2848                         ccc = STOPPER;
2849                         goto ccc_mismatch;
2850                 } else if ((*u8c->s & 0xC0) == 0x80) {
2851                         /* This is a continuation of the current character. */
2852                         if (!u8c->p)
2853                                 u8c->len--;
2854                         return (unsigned char)*u8c->s++;
2855                 }
2856
2857                 /* Look up the data for the current character. */
2858                 if (u8c->p)
2859                         leaf = utf8lookup(u8c->tree, u8c->s);
2860                 else
2861                         leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len);
2862
2863                 /* No leaf found implies that the input is a binary blob. */
2864                 if (!leaf)
2865                         return -1;
2866
2867                 /* Characters that are too new have CCC 0. */
2868                 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
2869                         ccc = STOPPER;
2870                 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
2871                         u8c->len -= utf8clen(u8c->s);
2872                         u8c->p = u8c->s + utf8clen(u8c->s);
2873                         u8c->s = LEAF_STR(leaf);
2874                         /* Empty decomposition implies CCC 0. */
2875                         if (*u8c->s == '\0') {
2876                                 if (u8c->ccc == STOPPER)
2877                                         continue;
2878                                 ccc = STOPPER;
2879                                 goto ccc_mismatch;
2880                         }
2881                         leaf = utf8lookup(u8c->tree, u8c->s);
2882                         ccc = LEAF_CCC(leaf);
2883                 }
2884                 u8c->unichar = utf8decode(u8c->s);
2885
2886                 /*
2887                  * If this is not a stopper, then see if it updates
2888                  * the next canonical class to be emitted.
2889                  */
2890                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
2891                         u8c->nccc = ccc;
2892
2893                 /*
2894                  * Return the current byte if this is the current
2895                  * combining class.
2896                  */
2897                 if (ccc == u8c->ccc) {
2898                         if (!u8c->p)
2899                                 u8c->len--;
2900                         return (unsigned char)*u8c->s++;
2901                 }
2902
2903                 /* Current combining class mismatch. */
2904         ccc_mismatch:
2905                 if (u8c->nccc == STOPPER) {
2906                         /*
2907                          * Scan forward for the first canonical class
2908                          * to be emitted.  Save the position from
2909                          * which to restart.
2910                          */
2911                         assert(u8c->ccc == STOPPER);
2912                         u8c->ccc = MINCCC - 1;
2913                         u8c->nccc = ccc;
2914                         u8c->sp = u8c->p;
2915                         u8c->ss = u8c->s;
2916                         u8c->slen = u8c->len;
2917                         if (!u8c->p)
2918                                 u8c->len -= utf8clen(u8c->s);
2919                         u8c->s += utf8clen(u8c->s);
2920                 } else if (ccc != STOPPER) {
2921                         /* Not a stopper, and not the ccc we're emitting. */
2922                         if (!u8c->p)
2923                                 u8c->len -= utf8clen(u8c->s);
2924                         u8c->s += utf8clen(u8c->s);
2925                 } else if (u8c->nccc != MAXCCC + 1) {
2926                         /* At a stopper, restart for next ccc. */
2927                         u8c->ccc = u8c->nccc;
2928                         u8c->nccc = MAXCCC + 1;
2929                         u8c->s = u8c->ss;
2930                         u8c->p = u8c->sp;
2931                         u8c->len = u8c->slen;
2932                 } else {
2933                         /* All done, proceed from here. */
2934                         u8c->ccc = STOPPER;
2935                         u8c->nccc = STOPPER;
2936                         u8c->sp = NULL;
2937                         u8c->ss = NULL;
2938                         u8c->slen = 0;
2939                 }
2940         }
2941 }
2942
2943 /* ------------------------------------------------------------------ */
2944
2945 static int normalize_line(struct tree *tree)
2946 {
2947         char *s;
2948         char *t;
2949         int c;
2950         struct utf8cursor u8c;
2951
2952         /* First test: null-terminated string. */
2953         s = buf2;
2954         t = buf3;
2955         if (utf8cursor(&u8c, tree, s))
2956                 return -1;
2957         while ((c = utf8byte(&u8c)) > 0)
2958                 if (c != (unsigned char)*t++)
2959                         return -1;
2960         if (c < 0)
2961                 return -1;
2962         if (*t != 0)
2963                 return -1;
2964
2965         /* Second test: length-limited string. */
2966         s = buf2;
2967         /* Replace NUL with a value that will cause an error if seen. */
2968         s[strlen(s) + 1] = -1;
2969         t = buf3;
2970         if (utf8cursor(&u8c, tree, s))
2971                 return -1;
2972         while ((c = utf8byte(&u8c)) > 0)
2973                 if (c != (unsigned char)*t++)
2974                         return -1;
2975         if (c < 0)
2976                 return -1;
2977         if (*t != 0)
2978                 return -1;
2979
2980         return 0;
2981 }
2982
2983 static void normalization_test(void)
2984 {
2985         FILE *file;
2986         unsigned int unichar;
2987         struct unicode_data *data;
2988         char *s;
2989         char *t;
2990         int ret;
2991         int ignorables;
2992         int tests = 0;
2993         int failures = 0;
2994
2995         if (verbose > 0)
2996                 printf("Parsing %s\n", test_name);
2997         /* Step one, read data from file. */
2998         file = fopen(test_name, "r");
2999         if (!file)
3000                 open_fail(test_name, errno);
3001
3002         while (fgets(line, LINESIZE, file)) {
3003                 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3004                              buf0, buf1);
3005                 if (ret != 2 || *line == '#')
3006                         continue;
3007                 s = buf0;
3008                 t = buf2;
3009                 while (*s) {
3010                         unichar = strtoul(s, &s, 16);
3011                         t += utf8encode(t, unichar);
3012                 }
3013                 *t = '\0';
3014
3015                 ignorables = 0;
3016                 s = buf1;
3017                 t = buf3;
3018                 while (*s) {
3019                         unichar = strtoul(s, &s, 16);
3020                         data = &unicode_data[unichar];
3021                         if (data->utf8nfdi && !*data->utf8nfdi)
3022                                 ignorables = 1;
3023                         else
3024                                 t += utf8encode(t, unichar);
3025                 }
3026                 *t = '\0';
3027
3028                 tests++;
3029                 if (normalize_line(nfdi_tree) < 0) {
3030                         printf("Line %s -> %s", buf0, buf1);
3031                         if (ignorables)
3032                                 printf(" (ignorables removed)");
3033                         printf(" failure\n");
3034                         failures++;
3035                 }
3036         }
3037         fclose(file);
3038         if (verbose > 0)
3039                 printf("Ran %d tests with %d failures\n", tests, failures);
3040         if (failures)
3041                 file_fail(test_name);
3042 }
3043
3044 /* ------------------------------------------------------------------ */
3045
3046 static void write_file(void)
3047 {
3048         FILE *file;
3049         int i;
3050         int j;
3051         int t;
3052         int gen;
3053
3054         if (verbose > 0)
3055                 printf("Writing %s\n", utf8_name);
3056         file = fopen(utf8_name, "w");
3057         if (!file)
3058                 open_fail(utf8_name, errno);
3059
3060         fprintf(file, "/* This file is generated code, do not edit. */\n");
3061         fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3062         fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3063         fprintf(file, "#endif\n");
3064         fprintf(file, "\n");
3065         fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3066                 unicode_maxage);
3067         fprintf(file, "\n");
3068         fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3069         for (i = 0; i != ages_count; i++)
3070                 fprintf(file, "\t%#x%s\n", ages[i],
3071                         ages[i] == unicode_maxage ? "" : ",");
3072         fprintf(file, "};\n");
3073         fprintf(file, "\n");
3074         fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3075         t = 0;
3076         for (gen = 0; gen < ages_count; gen++) {
3077                 fprintf(file, "\t{ %#x, %d }%s\n",
3078                         ages[gen], trees[t].index,
3079                         ages[gen] == unicode_maxage ? "" : ",");
3080                 if (trees[t].maxage == ages[gen])
3081                         t += 2;
3082         }
3083         fprintf(file, "};\n");
3084         fprintf(file, "\n");
3085         fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3086         t = 1;
3087         for (gen = 0; gen < ages_count; gen++) {
3088                 fprintf(file, "\t{ %#x, %d }%s\n",
3089                         ages[gen], trees[t].index,
3090                         ages[gen] == unicode_maxage ? "" : ",");
3091                 if (trees[t].maxage == ages[gen])
3092                         t += 2;
3093         }
3094         fprintf(file, "};\n");
3095         fprintf(file, "\n");
3096         fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3097                 utf8data_size);
3098         t = 0;
3099         for (i = 0; i != utf8data_size; i += 16) {
3100                 if (i == trees[t].index) {
3101                         fprintf(file, "\t/* %s_%x */\n",
3102                                 trees[t].type, trees[t].maxage);
3103                         if (t < trees_count-1)
3104                                 t++;
3105                 }
3106                 fprintf(file, "\t");
3107                 for (j = i; j != i + 16; j++)
3108                         fprintf(file, "0x%.2x%s", utf8data[j],
3109                                 (j < utf8data_size -1 ? "," : ""));
3110                 fprintf(file, "\n");
3111         }
3112         fprintf(file, "};\n");
3113         fclose(file);
3114 }
3115
3116 /* ------------------------------------------------------------------ */
3117
3118 int main(int argc, char *argv[])
3119 {
3120         unsigned int unichar;
3121         int opt;
3122
3123         argv0 = argv[0];
3124
3125         while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3126                 switch (opt) {
3127                 case 'a':
3128                         age_name = optarg;
3129                         break;
3130                 case 'c':
3131                         ccc_name = optarg;
3132                         break;
3133                 case 'd':
3134                         data_name = optarg;
3135                         break;
3136                 case 'f':
3137                         fold_name = optarg;
3138                         break;
3139                 case 'n':
3140                         norm_name = optarg;
3141                         break;
3142                 case 'o':
3143                         utf8_name = optarg;
3144                         break;
3145                 case 'p':
3146                         prop_name = optarg;
3147                         break;
3148                 case 't':
3149                         test_name = optarg;
3150                         break;
3151                 case 'v':
3152                         verbose++;
3153                         break;
3154                 case 'h':
3155                         help();
3156                         exit(0);
3157                 default:
3158                         usage();
3159                 }
3160         }
3161
3162         if (verbose > 1)
3163                 help();
3164         for (unichar = 0; unichar != 0x110000; unichar++)
3165                 unicode_data[unichar].code = unichar;
3166         age_init();
3167         ccc_init();
3168         nfdi_init();
3169         nfdicf_init();
3170         ignore_init();
3171         corrections_init();
3172         hangul_decompose();
3173         nfdi_decompose();
3174         nfdicf_decompose();
3175         utf8_init();
3176         trees_init();
3177         trees_populate();
3178         trees_reduce();
3179         trees_verify();
3180         /* Prevent "unused function" warning. */
3181         (void)lookup(nfdi_tree, " ");
3182         if (verbose > 2)
3183                 tree_walk(nfdi_tree);
3184         if (verbose > 2)
3185                 tree_walk(nfdicf_tree);
3186         normalization_test();
3187         write_file();
3188
3189         return 0;
3190 }