fs/unicode/utf8-norm.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (c) 2014 SGI.
   4  * All rights reserved.
   5  */
   6
   7 #include "utf8n.h"
   8
   9 struct utf8data {    /* describes one normalization table stored inside utf8data[] */
  10         unsigned int maxage;    /* newest unicode age (same encoding as utf8agetab[] entries) this table honours; leaves with a greater age are not decomposed and count as CCC 0 */
  11         unsigned int offset;    /* index into utf8data[] at which this table's trie begins */
  12 };
  13
  14 #define __INCLUDED_FROM_UTF8NORM_C__
  15 #include "utf8data.h"
  16 #undef __INCLUDED_FROM_UTF8NORM_C__
  17
  18 int utf8version_is_supported(u8 maj, u8 min, u8 rev)
  19 {
  20         int i = ARRAY_SIZE(utf8agetab) - 1;
  21         unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
  22
  23         while (i >= 0 && utf8agetab[i] != 0) {
  24                 if (sb_utf8version == utf8agetab[i])
  25                         return 1;
  26                 i--;
  27         }
  28         return 0;
  29 }
  30 EXPORT_SYMBOL(utf8version_is_supported);
  31
  32 int utf8version_latest(void)
  33 {
  34         return utf8vers;
  35 }
  36 EXPORT_SYMBOL(utf8version_latest);
  37
  38 /*
  39  * UTF-8 valid ranges.
  40  *
  41  * The UTF-8 encoding spreads the bits of a 32bit word over several
  42  * bytes. This table gives the ranges that can be held and how they'd
  43  * be represented.
  44  *
  45  * 0x00000000 0x0000007F: 0xxxxxxx
  46  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
  47  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  48  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  49  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  50  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  51  *
  52  * There is an additional requirement on UTF-8, in that only the
  53  * shortest representation of a 32bit value is to be used.  A decoder
  54  * must not decode sequences that do not satisfy this requirement.
  55  * Thus the allowed ranges have a lower bound.
  56  *
  57  * 0x00000000 0x0000007F: 0xxxxxxx
  58  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
  59  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
  60  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  61  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  62  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  63  *
  64  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
  65  * 17 planes of 65536 values.  This limits the sequences actually seen
  66  * even more, to just the following.
  67  *
  68  *          0 -     0x7F: 0                   - 0x7F
  69  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
  70  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
  71  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
  72  *
  73  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
  74  *
  75  * Note that the longest sequence seen with valid usage is 4 bytes,
  76  * the same as a single UTF-32 character.  This makes the UTF-8
  77  * representation of Unicode strictly smaller than UTF-32.
  78  *
  79  * The shortest sequence requirement was introduced by:
  80  *    Corrigendum #1: UTF-8 Shortest Form
  81  * It can be found here:
  82  *    http://www.unicode.org/versions/corrigendum1.html
  83  *
  84  */
  85
  86 /*
  87  * Return the number of bytes used by the current UTF-8 sequence.
  88  * Assumes the input points to the first byte of a valid UTF-8
  89  * sequence.
  90  */
  91 static inline int utf8clen(const char *s)
  92 {
  93         unsigned char c = *s;
  94
  95         return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
  96 }
  97
  98 /*
  99  * Decode a 3-byte UTF-8 sequence.
 100  */
 101 static unsigned int
 102 utf8decode3(const char *str)
 103 {
 104         unsigned int            uc;
 105
 106         uc = *str++ & 0x0F;
 107         uc <<= 6;
 108         uc |= *str++ & 0x3F;
 109         uc <<= 6;
 110         uc |= *str++ & 0x3F;
 111
 112         return uc;
 113 }
 114
 115 /*
 116  * Encode a 3-byte UTF-8 sequence.
 117  */
 118 static int
 119 utf8encode3(char *str, unsigned int val)
 120 {
 121         str[2] = (val & 0x3F) | 0x80;
 122         val >>= 6;
 123         str[1] = (val & 0x3F) | 0x80;
 124         val >>= 6;
 125         str[0] = val | 0xE0;
 126
 127         return 3;
 128 }
 129
 130 /*
 131  * utf8trie_t
 132  *
 133  * A compact binary tree, used to decode UTF-8 characters.
 134  *
 135  * Internal nodes are one byte for the node itself, and up to three
 136  * bytes for an offset into the tree.  The first byte contains the
 137  * following information:
 138  *  NEXTBYTE  - flag        - advance to next byte if set
  139  *  BITNUM    - 3 bit field - the bit number to be tested
 140  *  OFFLEN    - 2 bit field - number of bytes in the offset
 141  * if offlen == 0 (non-branching node)
 142  *  RIGHTPATH - 1 bit field - set if the following node is for the
 143  *                            right-hand path (tested bit is set)
 144  *  TRIENODE  - 1 bit field - set if the following node is an internal
 145  *                            node, otherwise it is a leaf node
 146  * if offlen != 0 (branching node)
 147  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 148  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 149  *
 150  * Due to the way utf8 works, there cannot be branching nodes with
 151  * NEXTBYTE set, and moreover those nodes always have a righthand
 152  * descendant.
 153  */
  154 typedef const unsigned char utf8trie_t;   /* the trie is walked as a read-only byte array */
  155 #define BITNUM          0x07      /* node byte: which bit of the input byte to test */
  156 #define NEXTBYTE        0x08      /* node byte: advance to the next input byte before testing */
  157 #define OFFLEN          0x30      /* node byte: number of offset bytes that follow (0 = non-branching) */
  158 #define OFFLEN_SHIFT    4         /* shift that brings the OFFLEN field down to bit 0 */
  159 #define RIGHTPATH       0x40      /* offlen == 0: the only child is on the right-hand path */
  160 #define TRIENODE        0x80      /* offlen == 0: the only child is an internal node, not a leaf */
  161 #define RIGHTNODE       0x40      /* offlen != 0: right child is internal (shares its bit with RIGHTPATH) */
  162 #define LEFTNODE        0x80      /* offlen != 0: left child is internal (shares its bit with TRIENODE) */
 163
 164 /*
 165  * utf8leaf_t
 166  *
 167  * The leaves of the trie are embedded in the trie, and so the same
 168  * underlying datatype: unsigned char.
 169  *
 170  * leaf[0]: The unicode version, stored as a generation number that is
 171  *          an index into utf8agetab[].  With this we can filter code
 172  *          points based on the unicode version in which they were
 173  *          defined.  The CCC of a non-defined code point is 0.
 174  * leaf[1]: Canonical Combining Class. During normalization, we need
 175  *          to do a stable sort into ascending order of all characters
 176  *          with a non-zero CCC that occur between two characters with
  177  *          a CCC of 0, or at the beginning or end of a string.
 178  *          The unicode standard guarantees that all CCC values are
 179  *          between 0 and 254 inclusive, which leaves 255 available as
 180  *          a special value.
 181  *          Code points with CCC 0 are known as stoppers.
 182  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 183  *          start of a NUL-terminated string that is the decomposition
 184  *          of the character.
 185  *          The CCC of a decomposable character is the same as the CCC
 186  *          of the first character of its decomposition.
 187  *          Some characters decompose as the empty string: these are
 188  *          characters with the Default_Ignorable_Code_Point property.
 189  *          These do affect normalization, as they all have CCC 0.
 190  *
 191  * The decompositions in the trie have been fully expanded, with the
 192  * exception of Hangul syllables, which are decomposed algorithmically.
 193  *
 194  * Casefolding, if applicable, is also done using decompositions.
 195  *
 196  * The trie is constructed in such a way that leaves exist for all
 197  * UTF-8 sequences that match the criteria from the "UTF-8 valid
 198  * ranges" comment above, and only for those sequences.  Therefore a
 199  * lookup in the trie can be used to validate the UTF-8 input.
 200  */
  201 typedef const unsigned char utf8leaf_t;   /* leaves are byte runs embedded in the trie */
  202
  203 #define LEAF_GEN(LEAF)  ((LEAF)[0])       /* generation number, used as an index into utf8agetab[] */
  204 #define LEAF_CCC(LEAF)  ((LEAF)[1])       /* canonical combining class, or DECOMPOSE */
  205 #define LEAF_STR(LEAF)  ((const char *)((LEAF) + 2))      /* decomposition string; only meaningful when LEAF_CCC() == DECOMPOSE */
  206
  207 #define MINCCC          (0)       /* smallest combining class value */
  208 #define MAXCCC          (254)     /* largest combining class value */
  209 #define STOPPER         (0)       /* CCC 0: bounds a run of characters that must be sorted */
  210 #define DECOMPOSE       (255)     /* LEAF_CCC() value meaning "use LEAF_STR() instead" */
  211
  212 /* Marker for hangul syllable decomposition: a decomposable leaf whose
       * LEAF_STR() starts with this byte is expanded by utf8hangul(). */
  213 #define HANGUL          ((char)(255))
  214 /* Size of the synthesized leaf used for Hangul syllable decomposition:
       * 2 header bytes, up to three 3-byte sequences, and a terminating NUL. */
  215 #define UTF8HANGULLEAF  (12)
 216
 217 /*
 218  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 219  *
 220  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 221  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 222  *
 223  * SBase = 0xAC00
 224  * LBase = 0x1100
 225  * VBase = 0x1161
 226  * TBase = 0x11A7
 227  * LCount = 19
 228  * VCount = 21
 229  * TCount = 28
 230  * NCount = 588 (VCount * TCount)
 231  * SCount = 11172 (LCount * NCount)
 232  *
 233  * Decomposition:
 234  *   SIndex = s - SBase
 235  *
 236  * LV (Canonical/Full)
 237  *   LIndex = SIndex / NCount
  238  *   VIndex = (SIndex % NCount) / TCount
 239  *   LPart = LBase + LIndex
 240  *   VPart = VBase + VIndex
 241  *
 242  * LVT (Canonical)
 243  *   LVIndex = (SIndex / TCount) * TCount
  244  *   TIndex = (SIndex % TCount)
 245  *   LVPart = SBase + LVIndex
 246  *   TPart = TBase + TIndex
 247  *
 248  * LVT (Full)
 249  *   LIndex = SIndex / NCount
  250  *   VIndex = (SIndex % NCount) / TCount
  251  *   TIndex = (SIndex % TCount)
 252  *   LPart = LBase + LIndex
 253  *   VPart = VBase + VIndex
 254  *   if (TIndex == 0) {
 255  *          d = <LPart, VPart>
 256  *   } else {
 257  *          TPart = TBase + TIndex
  258  *          d = <LPart, VPart, TPart>
 259  *   }
 260  */
 261
  262 /* Constants: short names for SBase, LBase, ... from the algorithm description. */
  263 #define SB      (0xAC00)  /* SBase: code point of the first Hangul syllable */
  264 #define LB      (0x1100)  /* LBase */
  265 #define VB      (0x1161)  /* VBase */
  266 #define TB      (0x11A7)  /* TBase */
  267 #define LC      (19)      /* LCount */
  268 #define VC      (21)      /* VCount */
  269 #define TC      (28)      /* TCount */
  270 #define NC      (VC * TC) /* NCount = 588 */
  271 #define SC      (LC * NC) /* SCount = 11172 */
 272
 273 /* Algorithmic decomposition of hangul syllable. */
 274 static utf8leaf_t *
 275 utf8hangul(const char *str, unsigned char *hangul)
 276 {
 277         unsigned int    si;
 278         unsigned int    li;
 279         unsigned int    vi;
 280         unsigned int    ti;
 281         unsigned char   *h;
 282
 283         /* Calculate the SI, LI, VI, and TI values. */
 284         si = utf8decode3(str) - SB;
 285         li = si / NC;
 286         vi = (si % NC) / TC;
 287         ti = si % TC;
 288
 289         /* Fill in base of leaf. */
 290         h = hangul;
 291         LEAF_GEN(h) = 2;
 292         LEAF_CCC(h) = DECOMPOSE;
 293         h += 2;
 294
 295         /* Add LPart, a 3-byte UTF-8 sequence. */
 296         h += utf8encode3((char *)h, li + LB);
 297
 298         /* Add VPart, a 3-byte UTF-8 sequence. */
 299         h += utf8encode3((char *)h, vi + VB);
 300
 301         /* Add TPart if required, also a 3-byte UTF-8 sequence. */
 302         if (ti)
 303                 h += utf8encode3((char *)h, ti + TB);
 304
 305         /* Terminate string. */
 306         h[0] = '\0';
 307
 308         return hangul;
 309 }
 310
 311 /*
 312  * Use trie to scan s, touching at most len bytes.
 313  * Returns the leaf if one exists, NULL otherwise.
 314  *
 315  * A non-NULL return guarantees that the UTF-8 sequence starting at s
 316  * is well-formed and corresponds to a known unicode code point.  The
 317  * shorthand for this will be "is valid UTF-8 unicode".
 318  */
 319 static utf8leaf_t *utf8nlookup(const struct utf8data *data,
 320                                unsigned char *hangul, const char *s, size_t len)
 321 {
 322         utf8trie_t      *trie = NULL;
 323         int             offlen;
 324         int             offset;
 325         int             mask;
 326         int             node;
 327
 328         if (!data)
 329                 return NULL;
 330         if (len == 0)
 331                 return NULL;
 332
 333         trie = utf8data + data->offset;
 334         node = 1;
 335         while (node) {
 336                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
 337                 if (*trie & NEXTBYTE) {
 338                         if (--len == 0)
 339                                 return NULL;
 340                         s++;
 341                 }
 342                 mask = 1 << (*trie & BITNUM);
 343                 if (*s & mask) {
 344                         /* Right leg */
 345                         if (offlen) {
 346                                 /* Right node at offset of trie */
 347                                 node = (*trie & RIGHTNODE);
 348                                 offset = trie[offlen];
 349                                 while (--offlen) {
 350                                         offset <<= 8;
 351                                         offset |= trie[offlen];
 352                                 }
 353                                 trie += offset;
 354                         } else if (*trie & RIGHTPATH) {
 355                                 /* Right node after this node */
 356                                 node = (*trie & TRIENODE);
 357                                 trie++;
 358                         } else {
 359                                 /* No right node. */
 360                                 return NULL;
 361                         }
 362                 } else {
 363                         /* Left leg */
 364                         if (offlen) {
 365                                 /* Left node after this node. */
 366                                 node = (*trie & LEFTNODE);
 367                                 trie += offlen + 1;
 368                         } else if (*trie & RIGHTPATH) {
 369                                 /* No left node. */
 370                                 return NULL;
 371                         } else {
 372                                 /* Left node after this node */
 373                                 node = (*trie & TRIENODE);
 374                                 trie++;
 375                         }
 376                 }
 377         }
 378         /*
 379          * Hangul decomposition is done algorithmically. These are the
 380          * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
 381          * always 3 bytes long, so s has been advanced twice, and the
 382          * start of the sequence is at s-2.
 383          */
 384         if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
 385                 trie = utf8hangul(s - 2, hangul);
 386         return trie;
 387 }
 388
 389 /*
 390  * Use trie to scan s.
 391  * Returns the leaf if one exists, NULL otherwise.
 392  *
 393  * Forwards to utf8nlookup().
 394  */
 395 static utf8leaf_t *utf8lookup(const struct utf8data *data,
 396                               unsigned char *hangul, const char *s)
 397 {
 398         return utf8nlookup(data, hangul, s, (size_t)-1);
 399 }
 400
 401 /*
 402  * Maximum age of any character in s.
 403  * Return -1 if s is not valid UTF-8 unicode.
 404  * Return 0 if only non-assigned code points are used.
 405  */
 406 int utf8agemax(const struct utf8data *data, const char *s)
 407 {
 408         utf8leaf_t      *leaf;
 409         int             age = 0;
 410         int             leaf_age;
 411         unsigned char   hangul[UTF8HANGULLEAF];
 412
 413         if (!data)
 414                 return -1;
 415
 416         while (*s) {
 417                 leaf = utf8lookup(data, hangul, s);
 418                 if (!leaf)
 419                         return -1;
 420
 421                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 422                 if (leaf_age <= data->maxage && leaf_age > age)
 423                         age = leaf_age;
 424                 s += utf8clen(s);
 425         }
 426         return age;
 427 }
 428 EXPORT_SYMBOL(utf8agemax);
 429
 430 /*
 431  * Minimum age of any character in s.
 432  * Return -1 if s is not valid UTF-8 unicode.
 433  * Return 0 if non-assigned code points are used.
 434  */
 435 int utf8agemin(const struct utf8data *data, const char *s)
 436 {
 437         utf8leaf_t      *leaf;
 438         int             age;
 439         int             leaf_age;
 440         unsigned char   hangul[UTF8HANGULLEAF];
 441
 442         if (!data)
 443                 return -1;
 444         age = data->maxage;
 445         while (*s) {
 446                 leaf = utf8lookup(data, hangul, s);
 447                 if (!leaf)
 448                         return -1;
 449                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 450                 if (leaf_age <= data->maxage && leaf_age < age)
 451                         age = leaf_age;
 452                 s += utf8clen(s);
 453         }
 454         return age;
 455 }
 456 EXPORT_SYMBOL(utf8agemin);
 457
 458 /*
 459  * Maximum age of any character in s, touch at most len bytes.
 460  * Return -1 if s is not valid UTF-8 unicode.
 461  */
 462 int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
 463 {
 464         utf8leaf_t      *leaf;
 465         int             age = 0;
 466         int             leaf_age;
 467         unsigned char   hangul[UTF8HANGULLEAF];
 468
 469         if (!data)
 470                 return -1;
 471
 472         while (len && *s) {
 473                 leaf = utf8nlookup(data, hangul, s, len);
 474                 if (!leaf)
 475                         return -1;
 476                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 477                 if (leaf_age <= data->maxage && leaf_age > age)
 478                         age = leaf_age;
 479                 len -= utf8clen(s);
 480                 s += utf8clen(s);
 481         }
 482         return age;
 483 }
 484 EXPORT_SYMBOL(utf8nagemax);
 485
 486 /*
 487  * Maximum age of any character in s, touch at most len bytes.
 488  * Return -1 if s is not valid UTF-8 unicode.
 489  */
 490 int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
 491 {
 492         utf8leaf_t      *leaf;
 493         int             leaf_age;
 494         int             age;
 495         unsigned char   hangul[UTF8HANGULLEAF];
 496
 497         if (!data)
 498                 return -1;
 499         age = data->maxage;
 500         while (len && *s) {
 501                 leaf = utf8nlookup(data, hangul, s, len);
 502                 if (!leaf)
 503                         return -1;
 504                 leaf_age = utf8agetab[LEAF_GEN(leaf)];
 505                 if (leaf_age <= data->maxage && leaf_age < age)
 506                         age = leaf_age;
 507                 len -= utf8clen(s);
 508                 s += utf8clen(s);
 509         }
 510         return age;
 511 }
 512 EXPORT_SYMBOL(utf8nagemin);
 513
 514 /*
 515  * Length of the normalization of s.
 516  * Return -1 if s is not valid UTF-8 unicode.
 517  *
 518  * A string of Default_Ignorable_Code_Point has length 0.
 519  */
 520 ssize_t utf8len(const struct utf8data *data, const char *s)
 521 {
 522         utf8leaf_t      *leaf;
 523         size_t          ret = 0;
 524         unsigned char   hangul[UTF8HANGULLEAF];
 525
 526         if (!data)
 527                 return -1;
 528         while (*s) {
 529                 leaf = utf8lookup(data, hangul, s);
 530                 if (!leaf)
 531                         return -1;
 532                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 533                         ret += utf8clen(s);
 534                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 535                         ret += strlen(LEAF_STR(leaf));
 536                 else
 537                         ret += utf8clen(s);
 538                 s += utf8clen(s);
 539         }
 540         return ret;
 541 }
 542 EXPORT_SYMBOL(utf8len);
 543
 544 /*
 545  * Length of the normalization of s, touch at most len bytes.
 546  * Return -1 if s is not valid UTF-8 unicode.
 547  */
 548 ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
 549 {
 550         utf8leaf_t      *leaf;
 551         size_t          ret = 0;
 552         unsigned char   hangul[UTF8HANGULLEAF];
 553
 554         if (!data)
 555                 return -1;
 556         while (len && *s) {
 557                 leaf = utf8nlookup(data, hangul, s, len);
 558                 if (!leaf)
 559                         return -1;
 560                 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
 561                         ret += utf8clen(s);
 562                 else if (LEAF_CCC(leaf) == DECOMPOSE)
 563                         ret += strlen(LEAF_STR(leaf));
 564                 else
 565                         ret += utf8clen(s);
 566                 len -= utf8clen(s);
 567                 s += utf8clen(s);
 568         }
 569         return ret;
 570 }
 571 EXPORT_SYMBOL(utf8nlen);
 572
 573 /*
 574  * Set up an utf8cursor for use by utf8byte().
 575  *
 576  *   u8c    : pointer to cursor.
 577  *   data   : const struct utf8data to use for normalization.
 578  *   s      : string.
 579  *   len    : length of s.
 580  *
 581  * Returns -1 on error, 0 on success.
 582  */
 583 int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
 584                 const char *s, size_t len)
 585 {
 586         if (!data)
 587                 return -1;
 588         if (!s)
 589                 return -1;
 590         u8c->data = data;
 591         u8c->s = s;
 592         u8c->p = NULL;
 593         u8c->ss = NULL;
 594         u8c->sp = NULL;
 595         u8c->len = len;
 596         u8c->slen = 0;
 597         u8c->ccc = STOPPER;
 598         u8c->nccc = STOPPER;
 599         /* Check we didn't clobber the maximum length. */
 600         if (u8c->len != len)
 601                 return -1;
 602         /* The first byte of s may not be an utf8 continuation. */
 603         if (len > 0 && (*s & 0xC0) == 0x80)
 604                 return -1;
 605         return 0;
 606 }
 607 EXPORT_SYMBOL(utf8ncursor);
 608
 609 /*
 610  * Set up an utf8cursor for use by utf8byte().
 611  *
 612  *   u8c    : pointer to cursor.
 613  *   data   : const struct utf8data to use for normalization.
 614  *   s      : NUL-terminated string.
 615  *
 616  * Returns -1 on error, 0 on success.
 617  */
 618 int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
 619                const char *s)
 620 {
 621         return utf8ncursor(u8c, data, s, (unsigned int)-1);
 622 }
 623 EXPORT_SYMBOL(utf8cursor);
 624
  625 /*
  626  * Get one byte from the normalized form of the string described by u8c.
  627  *
  628  * Returns the byte cast to an unsigned char on success, and -1 on failure
       * (no trie leaf could be found for the input at the current position).
  629  *
  630  * The cursor keeps track of the location in the string in u8c->s.
  631  * When a character is decomposed, the current location is stored in
  632  * u8c->p, and u8c->s is set to the start of the decomposition. Note
  633  * that bytes from a decomposition do not count against u8c->len.
  634  *
  635  * Characters are emitted if they match the current CCC in u8c->ccc.
  636  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
  637  * and the function returns 0 in that case.
  638  *
  639  * Sorting by CCC is done by repeatedly scanning the string.  The
  640  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
  641  * the start of the scan.  The first pass finds the lowest CCC to be
  642  * emitted and stores it in u8c->nccc, the second pass emits the
  643  * characters with this CCC and finds the next lowest CCC. This limits
  644  * the number of passes to 1 + the number of different CCCs in the
  645  * sequence being scanned.
  646  *
  647  * Therefore:
  648  *  u8c->p  != NULL -> a decomposition is being scanned.
  649  *  u8c->ss != NULL -> this is a repeating scan.
  650  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
       *
       * Two out-of-range values of u8c->nccc act as markers while scanning:
       *  u8c->nccc == STOPPER    -> no repeating scan has been started.
       *  u8c->nccc == MAXCCC + 1 -> the current pass has not yet seen a CCC
       *                             higher than the one being emitted.
  651  */
  652 int utf8byte(struct utf8cursor *u8c)
  653 {
  654         utf8leaf_t *leaf;
  655         int ccc;
  656
  657         for (;;) {
  658                 /* Check for the end of a decomposed character. */
  659                 if (u8c->p && *u8c->s == '\0') {
  660                         u8c->s = u8c->p;
  661                         u8c->p = NULL;
  662                 }
  663
  664                 /* Check for end-of-string. */
  665                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
  666                         /* There is no next byte. */
  667                         if (u8c->ccc == STOPPER)
  668                                 return 0;
  669                         /* End-of-string during a scan counts as a stopper. */
  670                         ccc = STOPPER;
  671                         goto ccc_mismatch;
  672                 } else if ((*u8c->s & 0xC0) == 0x80) {
  673                         /* This is a continuation of the current character. */
  674                         if (!u8c->p)
  675                                 u8c->len--;
  676                         return (unsigned char)*u8c->s++;
  677                 }
  678
  679                 /* Look up the data for the current character. */
  680                 if (u8c->p) {
  681                         leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
  682                 } else {
  683                         leaf = utf8nlookup(u8c->data, u8c->hangul,
  684                                            u8c->s, u8c->len);
  685                 }
  686
  687                 /* No leaf found implies that the input is a binary blob. */
  688                 if (!leaf)
  689                         return -1;
  690
  691                 ccc = LEAF_CCC(leaf);
  692                 /* Characters that are too new have CCC 0. */
  693                 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
  694                         ccc = STOPPER;
  695                 } else if (ccc == DECOMPOSE) {
  696                         u8c->len -= utf8clen(u8c->s);   /* the source character is consumed in one go */
  697                         u8c->p = u8c->s + utf8clen(u8c->s);     /* where to resume once the decomposition ends */
  698                         u8c->s = LEAF_STR(leaf);
  699                         /* Empty decomposition implies CCC 0. */
  700                         if (*u8c->s == '\0') {
  701                                 if (u8c->ccc == STOPPER)
  702                                         continue;
  703                                 ccc = STOPPER;
  704                                 goto ccc_mismatch;
  705                         }
  706
                              /* From here on the CCC is that of the first decomposed character. */
  707                         leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
  708                         if (!leaf)
  709                                 return -1;
  710                         ccc = LEAF_CCC(leaf);
  711                 }
  712
  713                 /*
  714                  * If this is not a stopper, then see if it updates
  715                  * the next canonical class to be emitted.
  716                  */
  717                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
  718                         u8c->nccc = ccc;
  719
  720                 /*
  721                  * Return the current byte if this is the current
  722                  * combining class.
  723                  */
  724                 if (ccc == u8c->ccc) {
  725                         if (!u8c->p)
  726                                 u8c->len--;
  727                         return (unsigned char)*u8c->s++;
  728                 }
  729
  730                 /* Current combining class mismatch. */
  731 ccc_mismatch:
  732                 if (u8c->nccc == STOPPER) {
  733                         /*
  734                          * Scan forward for the first canonical class
  735                          * to be emitted.  Save the position from
  736                          * which to restart.
  737                          */
  738                         u8c->ccc = MINCCC - 1;  /* -1 matches no leaf CCC, so this pass emits nothing */
  739                         u8c->nccc = ccc;
  740                         u8c->sp = u8c->p;
  741                         u8c->ss = u8c->s;
  742                         u8c->slen = u8c->len;
  743                         if (!u8c->p)
  744                                 u8c->len -= utf8clen(u8c->s);
  745                         u8c->s += utf8clen(u8c->s);
  746                 } else if (ccc != STOPPER) {
  747                         /* Not a stopper, and not the ccc we're emitting. */
  748                         if (!u8c->p)
  749                                 u8c->len -= utf8clen(u8c->s);
  750                         u8c->s += utf8clen(u8c->s);
  751                 } else if (u8c->nccc != MAXCCC + 1) {
  752                         /* At a stopper, restart for next ccc. */
  753                         u8c->ccc = u8c->nccc;
  754                         u8c->nccc = MAXCCC + 1; /* reset the "higher CCC seen" marker for the new pass */
  755                         u8c->s = u8c->ss;
  756                         u8c->p = u8c->sp;
  757                         u8c->len = u8c->slen;
  758                 } else {
  759                         /* All done, proceed from here. */
  760                         u8c->ccc = STOPPER;
  761                         u8c->nccc = STOPPER;
  762                         u8c->sp = NULL;
  763                         u8c->ss = NULL;
  764                         u8c->slen = 0;
  765                 }
  766         }
  767 }
  768 EXPORT_SYMBOL(utf8byte);
 769
 770 const struct utf8data *utf8nfdi(unsigned int maxage)
 771 {
 772         int i = ARRAY_SIZE(utf8nfdidata) - 1;
 773
 774         while (maxage < utf8nfdidata[i].maxage)
 775                 i--;
 776         if (maxage > utf8nfdidata[i].maxage)
 777                 return NULL;
 778         return &utf8nfdidata[i];
 779 }
 780 EXPORT_SYMBOL(utf8nfdi);
 781
 782 const struct utf8data *utf8nfdicf(unsigned int maxage)
 783 {
 784         int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
 785
 786         while (maxage < utf8nfdicfdata[i].maxage)
 787                 i--;
 788         if (maxage > utf8nfdicfdata[i].maxage)
 789                 return NULL;
 790         return &utf8nfdicfdata[i];
 791 }
 792 EXPORT_SYMBOL(utf8nfdicf);