162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (c) 2014 SGI. 462306a36Sopenharmony_ci * All rights reserved. 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci 762306a36Sopenharmony_ci#include "utf8n.h" 862306a36Sopenharmony_ci 962306a36Sopenharmony_ciint utf8version_is_supported(const struct unicode_map *um, unsigned int version) 1062306a36Sopenharmony_ci{ 1162306a36Sopenharmony_ci int i = um->tables->utf8agetab_size - 1; 1262306a36Sopenharmony_ci 1362306a36Sopenharmony_ci while (i >= 0 && um->tables->utf8agetab[i] != 0) { 1462306a36Sopenharmony_ci if (version == um->tables->utf8agetab[i]) 1562306a36Sopenharmony_ci return 1; 1662306a36Sopenharmony_ci i--; 1762306a36Sopenharmony_ci } 1862306a36Sopenharmony_ci return 0; 1962306a36Sopenharmony_ci} 2062306a36Sopenharmony_ci 2162306a36Sopenharmony_ci/* 2262306a36Sopenharmony_ci * UTF-8 valid ranges. 2362306a36Sopenharmony_ci * 2462306a36Sopenharmony_ci * The UTF-8 encoding spreads the bits of a 32bit word over several 2562306a36Sopenharmony_ci * bytes. This table gives the ranges that can be held and how they'd 2662306a36Sopenharmony_ci * be represented. 2762306a36Sopenharmony_ci * 2862306a36Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx 2962306a36Sopenharmony_ci * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx 3062306a36Sopenharmony_ci * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 3162306a36Sopenharmony_ci * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 3262306a36Sopenharmony_ci * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 3362306a36Sopenharmony_ci * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 3462306a36Sopenharmony_ci * 3562306a36Sopenharmony_ci * There is an additional requirement on UTF-8, in that only the 3662306a36Sopenharmony_ci * shortest representation of a 32bit value is to be used. A decoder 3762306a36Sopenharmony_ci * must not decode sequences that do not satisfy this requirement. 3862306a36Sopenharmony_ci * Thus the allowed ranges have a lower bound. 3962306a36Sopenharmony_ci * 4062306a36Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx 4162306a36Sopenharmony_ci * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx 4262306a36Sopenharmony_ci * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 4362306a36Sopenharmony_ci * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4462306a36Sopenharmony_ci * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4562306a36Sopenharmony_ci * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4662306a36Sopenharmony_ci * 4762306a36Sopenharmony_ci * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, 4862306a36Sopenharmony_ci * 17 planes of 65536 values. This limits the sequences actually seen 4962306a36Sopenharmony_ci * even more, to just the following. 5062306a36Sopenharmony_ci * 5162306a36Sopenharmony_ci * 0 - 0x7F: 0 - 0x7F 5262306a36Sopenharmony_ci * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF 5362306a36Sopenharmony_ci * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF 5462306a36Sopenharmony_ci * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF 5562306a36Sopenharmony_ci * 5662306a36Sopenharmony_ci * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. 5762306a36Sopenharmony_ci * 5862306a36Sopenharmony_ci * Note that the longest sequence seen with valid usage is 4 bytes, 5962306a36Sopenharmony_ci * the same a single UTF-32 character. This makes the UTF-8 6062306a36Sopenharmony_ci * representation of Unicode strictly smaller than UTF-32. 6162306a36Sopenharmony_ci * 6262306a36Sopenharmony_ci * The shortest sequence requirement was introduced by: 6362306a36Sopenharmony_ci * Corrigendum #1: UTF-8 Shortest Form 6462306a36Sopenharmony_ci * It can be found here: 6562306a36Sopenharmony_ci * http://www.unicode.org/versions/corrigendum1.html 6662306a36Sopenharmony_ci * 6762306a36Sopenharmony_ci */ 6862306a36Sopenharmony_ci 6962306a36Sopenharmony_ci/* 7062306a36Sopenharmony_ci * Return the number of bytes used by the current UTF-8 sequence. 7162306a36Sopenharmony_ci * Assumes the input points to the first byte of a valid UTF-8 7262306a36Sopenharmony_ci * sequence. 7362306a36Sopenharmony_ci */ 7462306a36Sopenharmony_cistatic inline int utf8clen(const char *s) 7562306a36Sopenharmony_ci{ 7662306a36Sopenharmony_ci unsigned char c = *s; 7762306a36Sopenharmony_ci 7862306a36Sopenharmony_ci return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 7962306a36Sopenharmony_ci} 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci/* 8262306a36Sopenharmony_ci * Decode a 3-byte UTF-8 sequence. 8362306a36Sopenharmony_ci */ 8462306a36Sopenharmony_cistatic unsigned int 8562306a36Sopenharmony_ciutf8decode3(const char *str) 8662306a36Sopenharmony_ci{ 8762306a36Sopenharmony_ci unsigned int uc; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci uc = *str++ & 0x0F; 9062306a36Sopenharmony_ci uc <<= 6; 9162306a36Sopenharmony_ci uc |= *str++ & 0x3F; 9262306a36Sopenharmony_ci uc <<= 6; 9362306a36Sopenharmony_ci uc |= *str++ & 0x3F; 9462306a36Sopenharmony_ci 9562306a36Sopenharmony_ci return uc; 9662306a36Sopenharmony_ci} 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci/* 9962306a36Sopenharmony_ci * Encode a 3-byte UTF-8 sequence. 10062306a36Sopenharmony_ci */ 10162306a36Sopenharmony_cistatic int 10262306a36Sopenharmony_ciutf8encode3(char *str, unsigned int val) 10362306a36Sopenharmony_ci{ 10462306a36Sopenharmony_ci str[2] = (val & 0x3F) | 0x80; 10562306a36Sopenharmony_ci val >>= 6; 10662306a36Sopenharmony_ci str[1] = (val & 0x3F) | 0x80; 10762306a36Sopenharmony_ci val >>= 6; 10862306a36Sopenharmony_ci str[0] = val | 0xE0; 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci return 3; 11162306a36Sopenharmony_ci} 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_ci/* 11462306a36Sopenharmony_ci * utf8trie_t 11562306a36Sopenharmony_ci * 11662306a36Sopenharmony_ci * A compact binary tree, used to decode UTF-8 characters. 11762306a36Sopenharmony_ci * 11862306a36Sopenharmony_ci * Internal nodes are one byte for the node itself, and up to three 11962306a36Sopenharmony_ci * bytes for an offset into the tree. The first byte contains the 12062306a36Sopenharmony_ci * following information: 12162306a36Sopenharmony_ci * NEXTBYTE - flag - advance to next byte if set 12262306a36Sopenharmony_ci * BITNUM - 3 bit field - the bit number to tested 12362306a36Sopenharmony_ci * OFFLEN - 2 bit field - number of bytes in the offset 12462306a36Sopenharmony_ci * if offlen == 0 (non-branching node) 12562306a36Sopenharmony_ci * RIGHTPATH - 1 bit field - set if the following node is for the 12662306a36Sopenharmony_ci * right-hand path (tested bit is set) 12762306a36Sopenharmony_ci * TRIENODE - 1 bit field - set if the following node is an internal 12862306a36Sopenharmony_ci * node, otherwise it is a leaf node 12962306a36Sopenharmony_ci * if offlen != 0 (branching node) 13062306a36Sopenharmony_ci * LEFTNODE - 1 bit field - set if the left-hand node is internal 13162306a36Sopenharmony_ci * RIGHTNODE - 1 bit field - set if the right-hand node is internal 13262306a36Sopenharmony_ci * 13362306a36Sopenharmony_ci * Due to the way utf8 works, there cannot be branching nodes with 13462306a36Sopenharmony_ci * NEXTBYTE set, and moreover those nodes always have a righthand 13562306a36Sopenharmony_ci * descendant. 13662306a36Sopenharmony_ci */ 13762306a36Sopenharmony_citypedef const unsigned char utf8trie_t; 13862306a36Sopenharmony_ci#define BITNUM 0x07 13962306a36Sopenharmony_ci#define NEXTBYTE 0x08 14062306a36Sopenharmony_ci#define OFFLEN 0x30 14162306a36Sopenharmony_ci#define OFFLEN_SHIFT 4 14262306a36Sopenharmony_ci#define RIGHTPATH 0x40 14362306a36Sopenharmony_ci#define TRIENODE 0x80 14462306a36Sopenharmony_ci#define RIGHTNODE 0x40 14562306a36Sopenharmony_ci#define LEFTNODE 0x80 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci/* 14862306a36Sopenharmony_ci * utf8leaf_t 14962306a36Sopenharmony_ci * 15062306a36Sopenharmony_ci * The leaves of the trie are embedded in the trie, and so the same 15162306a36Sopenharmony_ci * underlying datatype: unsigned char. 15262306a36Sopenharmony_ci * 15362306a36Sopenharmony_ci * leaf[0]: The unicode version, stored as a generation number that is 15462306a36Sopenharmony_ci * an index into ->utf8agetab[]. With this we can filter code 15562306a36Sopenharmony_ci * points based on the unicode version in which they were 15662306a36Sopenharmony_ci * defined. The CCC of a non-defined code point is 0. 15762306a36Sopenharmony_ci * leaf[1]: Canonical Combining Class. During normalization, we need 15862306a36Sopenharmony_ci * to do a stable sort into ascending order of all characters 15962306a36Sopenharmony_ci * with a non-zero CCC that occur between two characters with 16062306a36Sopenharmony_ci * a CCC of 0, or at the begin or end of a string. 16162306a36Sopenharmony_ci * The unicode standard guarantees that all CCC values are 16262306a36Sopenharmony_ci * between 0 and 254 inclusive, which leaves 255 available as 16362306a36Sopenharmony_ci * a special value. 16462306a36Sopenharmony_ci * Code points with CCC 0 are known as stoppers. 16562306a36Sopenharmony_ci * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the 16662306a36Sopenharmony_ci * start of a NUL-terminated string that is the decomposition 16762306a36Sopenharmony_ci * of the character. 16862306a36Sopenharmony_ci * The CCC of a decomposable character is the same as the CCC 16962306a36Sopenharmony_ci * of the first character of its decomposition. 17062306a36Sopenharmony_ci * Some characters decompose as the empty string: these are 17162306a36Sopenharmony_ci * characters with the Default_Ignorable_Code_Point property. 17262306a36Sopenharmony_ci * These do affect normalization, as they all have CCC 0. 17362306a36Sopenharmony_ci * 17462306a36Sopenharmony_ci * The decompositions in the trie have been fully expanded, with the 17562306a36Sopenharmony_ci * exception of Hangul syllables, which are decomposed algorithmically. 17662306a36Sopenharmony_ci * 17762306a36Sopenharmony_ci * Casefolding, if applicable, is also done using decompositions. 17862306a36Sopenharmony_ci * 17962306a36Sopenharmony_ci * The trie is constructed in such a way that leaves exist for all 18062306a36Sopenharmony_ci * UTF-8 sequences that match the criteria from the "UTF-8 valid 18162306a36Sopenharmony_ci * ranges" comment above, and only for those sequences. Therefore a 18262306a36Sopenharmony_ci * lookup in the trie can be used to validate the UTF-8 input. 18362306a36Sopenharmony_ci */ 18462306a36Sopenharmony_citypedef const unsigned char utf8leaf_t; 18562306a36Sopenharmony_ci 18662306a36Sopenharmony_ci#define LEAF_GEN(LEAF) ((LEAF)[0]) 18762306a36Sopenharmony_ci#define LEAF_CCC(LEAF) ((LEAF)[1]) 18862306a36Sopenharmony_ci#define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci#define MINCCC (0) 19162306a36Sopenharmony_ci#define MAXCCC (254) 19262306a36Sopenharmony_ci#define STOPPER (0) 19362306a36Sopenharmony_ci#define DECOMPOSE (255) 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci/* Marker for hangul syllable decomposition. */ 19662306a36Sopenharmony_ci#define HANGUL ((char)(255)) 19762306a36Sopenharmony_ci/* Size of the synthesized leaf used for Hangul syllable decomposition. */ 19862306a36Sopenharmony_ci#define UTF8HANGULLEAF (12) 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_ci/* 20162306a36Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 20262306a36Sopenharmony_ci * 20362306a36Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 20462306a36Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 20562306a36Sopenharmony_ci * 20662306a36Sopenharmony_ci * SBase = 0xAC00 20762306a36Sopenharmony_ci * LBase = 0x1100 20862306a36Sopenharmony_ci * VBase = 0x1161 20962306a36Sopenharmony_ci * TBase = 0x11A7 21062306a36Sopenharmony_ci * LCount = 19 21162306a36Sopenharmony_ci * VCount = 21 21262306a36Sopenharmony_ci * TCount = 28 21362306a36Sopenharmony_ci * NCount = 588 (VCount * TCount) 21462306a36Sopenharmony_ci * SCount = 11172 (LCount * NCount) 21562306a36Sopenharmony_ci * 21662306a36Sopenharmony_ci * Decomposition: 21762306a36Sopenharmony_ci * SIndex = s - SBase 21862306a36Sopenharmony_ci * 21962306a36Sopenharmony_ci * LV (Canonical/Full) 22062306a36Sopenharmony_ci * LIndex = SIndex / NCount 22162306a36Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 22262306a36Sopenharmony_ci * LPart = LBase + LIndex 22362306a36Sopenharmony_ci * VPart = VBase + VIndex 22462306a36Sopenharmony_ci * 22562306a36Sopenharmony_ci * LVT (Canonical) 22662306a36Sopenharmony_ci * LVIndex = (SIndex / TCount) * TCount 22762306a36Sopenharmony_ci * TIndex = (Sindex % TCount) 22862306a36Sopenharmony_ci * LVPart = SBase + LVIndex 22962306a36Sopenharmony_ci * TPart = TBase + TIndex 23062306a36Sopenharmony_ci * 23162306a36Sopenharmony_ci * LVT (Full) 23262306a36Sopenharmony_ci * LIndex = SIndex / NCount 23362306a36Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 23462306a36Sopenharmony_ci * TIndex = (Sindex % TCount) 23562306a36Sopenharmony_ci * LPart = LBase + LIndex 23662306a36Sopenharmony_ci * VPart = VBase + VIndex 23762306a36Sopenharmony_ci * if (TIndex == 0) { 23862306a36Sopenharmony_ci * d = <LPart, VPart> 23962306a36Sopenharmony_ci * } else { 24062306a36Sopenharmony_ci * TPart = TBase + TIndex 24162306a36Sopenharmony_ci * d = <LPart, TPart, VPart> 24262306a36Sopenharmony_ci * } 24362306a36Sopenharmony_ci */ 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci/* Constants */ 24662306a36Sopenharmony_ci#define SB (0xAC00) 24762306a36Sopenharmony_ci#define LB (0x1100) 24862306a36Sopenharmony_ci#define VB (0x1161) 24962306a36Sopenharmony_ci#define TB (0x11A7) 25062306a36Sopenharmony_ci#define LC (19) 25162306a36Sopenharmony_ci#define VC (21) 25262306a36Sopenharmony_ci#define TC (28) 25362306a36Sopenharmony_ci#define NC (VC * TC) 25462306a36Sopenharmony_ci#define SC (LC * NC) 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci/* Algorithmic decomposition of hangul syllable. */ 25762306a36Sopenharmony_cistatic utf8leaf_t * 25862306a36Sopenharmony_ciutf8hangul(const char *str, unsigned char *hangul) 25962306a36Sopenharmony_ci{ 26062306a36Sopenharmony_ci unsigned int si; 26162306a36Sopenharmony_ci unsigned int li; 26262306a36Sopenharmony_ci unsigned int vi; 26362306a36Sopenharmony_ci unsigned int ti; 26462306a36Sopenharmony_ci unsigned char *h; 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_ci /* Calculate the SI, LI, VI, and TI values. */ 26762306a36Sopenharmony_ci si = utf8decode3(str) - SB; 26862306a36Sopenharmony_ci li = si / NC; 26962306a36Sopenharmony_ci vi = (si % NC) / TC; 27062306a36Sopenharmony_ci ti = si % TC; 27162306a36Sopenharmony_ci 27262306a36Sopenharmony_ci /* Fill in base of leaf. */ 27362306a36Sopenharmony_ci h = hangul; 27462306a36Sopenharmony_ci LEAF_GEN(h) = 2; 27562306a36Sopenharmony_ci LEAF_CCC(h) = DECOMPOSE; 27662306a36Sopenharmony_ci h += 2; 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci /* Add LPart, a 3-byte UTF-8 sequence. */ 27962306a36Sopenharmony_ci h += utf8encode3((char *)h, li + LB); 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_ci /* Add VPart, a 3-byte UTF-8 sequence. */ 28262306a36Sopenharmony_ci h += utf8encode3((char *)h, vi + VB); 28362306a36Sopenharmony_ci 28462306a36Sopenharmony_ci /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 28562306a36Sopenharmony_ci if (ti) 28662306a36Sopenharmony_ci h += utf8encode3((char *)h, ti + TB); 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci /* Terminate string. */ 28962306a36Sopenharmony_ci h[0] = '\0'; 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci return hangul; 29262306a36Sopenharmony_ci} 29362306a36Sopenharmony_ci 29462306a36Sopenharmony_ci/* 29562306a36Sopenharmony_ci * Use trie to scan s, touching at most len bytes. 29662306a36Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise. 29762306a36Sopenharmony_ci * 29862306a36Sopenharmony_ci * A non-NULL return guarantees that the UTF-8 sequence starting at s 29962306a36Sopenharmony_ci * is well-formed and corresponds to a known unicode code point. The 30062306a36Sopenharmony_ci * shorthand for this will be "is valid UTF-8 unicode". 30162306a36Sopenharmony_ci */ 30262306a36Sopenharmony_cistatic utf8leaf_t *utf8nlookup(const struct unicode_map *um, 30362306a36Sopenharmony_ci enum utf8_normalization n, unsigned char *hangul, const char *s, 30462306a36Sopenharmony_ci size_t len) 30562306a36Sopenharmony_ci{ 30662306a36Sopenharmony_ci utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset; 30762306a36Sopenharmony_ci int offlen; 30862306a36Sopenharmony_ci int offset; 30962306a36Sopenharmony_ci int mask; 31062306a36Sopenharmony_ci int node; 31162306a36Sopenharmony_ci 31262306a36Sopenharmony_ci if (len == 0) 31362306a36Sopenharmony_ci return NULL; 31462306a36Sopenharmony_ci 31562306a36Sopenharmony_ci node = 1; 31662306a36Sopenharmony_ci while (node) { 31762306a36Sopenharmony_ci offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 31862306a36Sopenharmony_ci if (*trie & NEXTBYTE) { 31962306a36Sopenharmony_ci if (--len == 0) 32062306a36Sopenharmony_ci return NULL; 32162306a36Sopenharmony_ci s++; 32262306a36Sopenharmony_ci } 32362306a36Sopenharmony_ci mask = 1 << (*trie & BITNUM); 32462306a36Sopenharmony_ci if (*s & mask) { 32562306a36Sopenharmony_ci /* Right leg */ 32662306a36Sopenharmony_ci if (offlen) { 32762306a36Sopenharmony_ci /* Right node at offset of trie */ 32862306a36Sopenharmony_ci node = (*trie & RIGHTNODE); 32962306a36Sopenharmony_ci offset = trie[offlen]; 33062306a36Sopenharmony_ci while (--offlen) { 33162306a36Sopenharmony_ci offset <<= 8; 33262306a36Sopenharmony_ci offset |= trie[offlen]; 33362306a36Sopenharmony_ci } 33462306a36Sopenharmony_ci trie += offset; 33562306a36Sopenharmony_ci } else if (*trie & RIGHTPATH) { 33662306a36Sopenharmony_ci /* Right node after this node */ 33762306a36Sopenharmony_ci node = (*trie & TRIENODE); 33862306a36Sopenharmony_ci trie++; 33962306a36Sopenharmony_ci } else { 34062306a36Sopenharmony_ci /* No right node. */ 34162306a36Sopenharmony_ci return NULL; 34262306a36Sopenharmony_ci } 34362306a36Sopenharmony_ci } else { 34462306a36Sopenharmony_ci /* Left leg */ 34562306a36Sopenharmony_ci if (offlen) { 34662306a36Sopenharmony_ci /* Left node after this node. */ 34762306a36Sopenharmony_ci node = (*trie & LEFTNODE); 34862306a36Sopenharmony_ci trie += offlen + 1; 34962306a36Sopenharmony_ci } else if (*trie & RIGHTPATH) { 35062306a36Sopenharmony_ci /* No left node. */ 35162306a36Sopenharmony_ci return NULL; 35262306a36Sopenharmony_ci } else { 35362306a36Sopenharmony_ci /* Left node after this node */ 35462306a36Sopenharmony_ci node = (*trie & TRIENODE); 35562306a36Sopenharmony_ci trie++; 35662306a36Sopenharmony_ci } 35762306a36Sopenharmony_ci } 35862306a36Sopenharmony_ci } 35962306a36Sopenharmony_ci /* 36062306a36Sopenharmony_ci * Hangul decomposition is done algorithmically. These are the 36162306a36Sopenharmony_ci * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 36262306a36Sopenharmony_ci * always 3 bytes long, so s has been advanced twice, and the 36362306a36Sopenharmony_ci * start of the sequence is at s-2. 36462306a36Sopenharmony_ci */ 36562306a36Sopenharmony_ci if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 36662306a36Sopenharmony_ci trie = utf8hangul(s - 2, hangul); 36762306a36Sopenharmony_ci return trie; 36862306a36Sopenharmony_ci} 36962306a36Sopenharmony_ci 37062306a36Sopenharmony_ci/* 37162306a36Sopenharmony_ci * Use trie to scan s. 37262306a36Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise. 37362306a36Sopenharmony_ci * 37462306a36Sopenharmony_ci * Forwards to utf8nlookup(). 37562306a36Sopenharmony_ci */ 37662306a36Sopenharmony_cistatic utf8leaf_t *utf8lookup(const struct unicode_map *um, 37762306a36Sopenharmony_ci enum utf8_normalization n, unsigned char *hangul, const char *s) 37862306a36Sopenharmony_ci{ 37962306a36Sopenharmony_ci return utf8nlookup(um, n, hangul, s, (size_t)-1); 38062306a36Sopenharmony_ci} 38162306a36Sopenharmony_ci 38262306a36Sopenharmony_ci/* 38362306a36Sopenharmony_ci * Length of the normalization of s, touch at most len bytes. 38462306a36Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 38562306a36Sopenharmony_ci */ 38662306a36Sopenharmony_cissize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n, 38762306a36Sopenharmony_ci const char *s, size_t len) 38862306a36Sopenharmony_ci{ 38962306a36Sopenharmony_ci utf8leaf_t *leaf; 39062306a36Sopenharmony_ci size_t ret = 0; 39162306a36Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 39262306a36Sopenharmony_ci 39362306a36Sopenharmony_ci while (len && *s) { 39462306a36Sopenharmony_ci leaf = utf8nlookup(um, n, hangul, s, len); 39562306a36Sopenharmony_ci if (!leaf) 39662306a36Sopenharmony_ci return -1; 39762306a36Sopenharmony_ci if (um->tables->utf8agetab[LEAF_GEN(leaf)] > 39862306a36Sopenharmony_ci um->ntab[n]->maxage) 39962306a36Sopenharmony_ci ret += utf8clen(s); 40062306a36Sopenharmony_ci else if (LEAF_CCC(leaf) == DECOMPOSE) 40162306a36Sopenharmony_ci ret += strlen(LEAF_STR(leaf)); 40262306a36Sopenharmony_ci else 40362306a36Sopenharmony_ci ret += utf8clen(s); 40462306a36Sopenharmony_ci len -= utf8clen(s); 40562306a36Sopenharmony_ci s += utf8clen(s); 40662306a36Sopenharmony_ci } 40762306a36Sopenharmony_ci return ret; 40862306a36Sopenharmony_ci} 40962306a36Sopenharmony_ci 41062306a36Sopenharmony_ci/* 41162306a36Sopenharmony_ci * Set up an utf8cursor for use by utf8byte(). 41262306a36Sopenharmony_ci * 41362306a36Sopenharmony_ci * u8c : pointer to cursor. 41462306a36Sopenharmony_ci * data : const struct utf8data to use for normalization. 41562306a36Sopenharmony_ci * s : string. 41662306a36Sopenharmony_ci * len : length of s. 41762306a36Sopenharmony_ci * 41862306a36Sopenharmony_ci * Returns -1 on error, 0 on success. 41962306a36Sopenharmony_ci */ 42062306a36Sopenharmony_ciint utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um, 42162306a36Sopenharmony_ci enum utf8_normalization n, const char *s, size_t len) 42262306a36Sopenharmony_ci{ 42362306a36Sopenharmony_ci if (!s) 42462306a36Sopenharmony_ci return -1; 42562306a36Sopenharmony_ci u8c->um = um; 42662306a36Sopenharmony_ci u8c->n = n; 42762306a36Sopenharmony_ci u8c->s = s; 42862306a36Sopenharmony_ci u8c->p = NULL; 42962306a36Sopenharmony_ci u8c->ss = NULL; 43062306a36Sopenharmony_ci u8c->sp = NULL; 43162306a36Sopenharmony_ci u8c->len = len; 43262306a36Sopenharmony_ci u8c->slen = 0; 43362306a36Sopenharmony_ci u8c->ccc = STOPPER; 43462306a36Sopenharmony_ci u8c->nccc = STOPPER; 43562306a36Sopenharmony_ci /* Check we didn't clobber the maximum length. */ 43662306a36Sopenharmony_ci if (u8c->len != len) 43762306a36Sopenharmony_ci return -1; 43862306a36Sopenharmony_ci /* The first byte of s may not be an utf8 continuation. */ 43962306a36Sopenharmony_ci if (len > 0 && (*s & 0xC0) == 0x80) 44062306a36Sopenharmony_ci return -1; 44162306a36Sopenharmony_ci return 0; 44262306a36Sopenharmony_ci} 44362306a36Sopenharmony_ci 44462306a36Sopenharmony_ci/* 44562306a36Sopenharmony_ci * Get one byte from the normalized form of the string described by u8c. 44662306a36Sopenharmony_ci * 44762306a36Sopenharmony_ci * Returns the byte cast to an unsigned char on succes, and -1 on failure. 44862306a36Sopenharmony_ci * 44962306a36Sopenharmony_ci * The cursor keeps track of the location in the string in u8c->s. 45062306a36Sopenharmony_ci * When a character is decomposed, the current location is stored in 45162306a36Sopenharmony_ci * u8c->p, and u8c->s is set to the start of the decomposition. Note 45262306a36Sopenharmony_ci * that bytes from a decomposition do not count against u8c->len. 45362306a36Sopenharmony_ci * 45462306a36Sopenharmony_ci * Characters are emitted if they match the current CCC in u8c->ccc. 45562306a36Sopenharmony_ci * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 45662306a36Sopenharmony_ci * and the function returns 0 in that case. 45762306a36Sopenharmony_ci * 45862306a36Sopenharmony_ci * Sorting by CCC is done by repeatedly scanning the string. The 45962306a36Sopenharmony_ci * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 46062306a36Sopenharmony_ci * the start of the scan. The first pass finds the lowest CCC to be 46162306a36Sopenharmony_ci * emitted and stores it in u8c->nccc, the second pass emits the 46262306a36Sopenharmony_ci * characters with this CCC and finds the next lowest CCC. This limits 46362306a36Sopenharmony_ci * the number of passes to 1 + the number of different CCCs in the 46462306a36Sopenharmony_ci * sequence being scanned. 46562306a36Sopenharmony_ci * 46662306a36Sopenharmony_ci * Therefore: 46762306a36Sopenharmony_ci * u8c->p != NULL -> a decomposition is being scanned. 46862306a36Sopenharmony_ci * u8c->ss != NULL -> this is a repeating scan. 46962306a36Sopenharmony_ci * u8c->ccc == -1 -> this is the first scan of a repeating scan. 47062306a36Sopenharmony_ci */ 47162306a36Sopenharmony_ciint utf8byte(struct utf8cursor *u8c) 47262306a36Sopenharmony_ci{ 47362306a36Sopenharmony_ci utf8leaf_t *leaf; 47462306a36Sopenharmony_ci int ccc; 47562306a36Sopenharmony_ci 47662306a36Sopenharmony_ci for (;;) { 47762306a36Sopenharmony_ci /* Check for the end of a decomposed character. */ 47862306a36Sopenharmony_ci if (u8c->p && *u8c->s == '\0') { 47962306a36Sopenharmony_ci u8c->s = u8c->p; 48062306a36Sopenharmony_ci u8c->p = NULL; 48162306a36Sopenharmony_ci } 48262306a36Sopenharmony_ci 48362306a36Sopenharmony_ci /* Check for end-of-string. */ 48462306a36Sopenharmony_ci if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 48562306a36Sopenharmony_ci /* There is no next byte. */ 48662306a36Sopenharmony_ci if (u8c->ccc == STOPPER) 48762306a36Sopenharmony_ci return 0; 48862306a36Sopenharmony_ci /* End-of-string during a scan counts as a stopper. */ 48962306a36Sopenharmony_ci ccc = STOPPER; 49062306a36Sopenharmony_ci goto ccc_mismatch; 49162306a36Sopenharmony_ci } else if ((*u8c->s & 0xC0) == 0x80) { 49262306a36Sopenharmony_ci /* This is a continuation of the current character. */ 49362306a36Sopenharmony_ci if (!u8c->p) 49462306a36Sopenharmony_ci u8c->len--; 49562306a36Sopenharmony_ci return (unsigned char)*u8c->s++; 49662306a36Sopenharmony_ci } 49762306a36Sopenharmony_ci 49862306a36Sopenharmony_ci /* Look up the data for the current character. */ 49962306a36Sopenharmony_ci if (u8c->p) { 50062306a36Sopenharmony_ci leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); 50162306a36Sopenharmony_ci } else { 50262306a36Sopenharmony_ci leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul, 50362306a36Sopenharmony_ci u8c->s, u8c->len); 50462306a36Sopenharmony_ci } 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci /* No leaf found implies that the input is a binary blob. */ 50762306a36Sopenharmony_ci if (!leaf) 50862306a36Sopenharmony_ci return -1; 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_ci ccc = LEAF_CCC(leaf); 51162306a36Sopenharmony_ci /* Characters that are too new have CCC 0. */ 51262306a36Sopenharmony_ci if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] > 51362306a36Sopenharmony_ci u8c->um->ntab[u8c->n]->maxage) { 51462306a36Sopenharmony_ci ccc = STOPPER; 51562306a36Sopenharmony_ci } else if (ccc == DECOMPOSE) { 51662306a36Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 51762306a36Sopenharmony_ci u8c->p = u8c->s + utf8clen(u8c->s); 51862306a36Sopenharmony_ci u8c->s = LEAF_STR(leaf); 51962306a36Sopenharmony_ci /* Empty decomposition implies CCC 0. */ 52062306a36Sopenharmony_ci if (*u8c->s == '\0') { 52162306a36Sopenharmony_ci if (u8c->ccc == STOPPER) 52262306a36Sopenharmony_ci continue; 52362306a36Sopenharmony_ci ccc = STOPPER; 52462306a36Sopenharmony_ci goto ccc_mismatch; 52562306a36Sopenharmony_ci } 52662306a36Sopenharmony_ci 52762306a36Sopenharmony_ci leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s); 52862306a36Sopenharmony_ci if (!leaf) 52962306a36Sopenharmony_ci return -1; 53062306a36Sopenharmony_ci ccc = LEAF_CCC(leaf); 53162306a36Sopenharmony_ci } 53262306a36Sopenharmony_ci 53362306a36Sopenharmony_ci /* 53462306a36Sopenharmony_ci * If this is not a stopper, then see if it updates 53562306a36Sopenharmony_ci * the next canonical class to be emitted. 53662306a36Sopenharmony_ci */ 53762306a36Sopenharmony_ci if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 53862306a36Sopenharmony_ci u8c->nccc = ccc; 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_ci /* 54162306a36Sopenharmony_ci * Return the current byte if this is the current 54262306a36Sopenharmony_ci * combining class. 54362306a36Sopenharmony_ci */ 54462306a36Sopenharmony_ci if (ccc == u8c->ccc) { 54562306a36Sopenharmony_ci if (!u8c->p) 54662306a36Sopenharmony_ci u8c->len--; 54762306a36Sopenharmony_ci return (unsigned char)*u8c->s++; 54862306a36Sopenharmony_ci } 54962306a36Sopenharmony_ci 55062306a36Sopenharmony_ci /* Current combining class mismatch. */ 55162306a36Sopenharmony_ciccc_mismatch: 55262306a36Sopenharmony_ci if (u8c->nccc == STOPPER) { 55362306a36Sopenharmony_ci /* 55462306a36Sopenharmony_ci * Scan forward for the first canonical class 55562306a36Sopenharmony_ci * to be emitted. Save the position from 55662306a36Sopenharmony_ci * which to restart. 55762306a36Sopenharmony_ci */ 55862306a36Sopenharmony_ci u8c->ccc = MINCCC - 1; 55962306a36Sopenharmony_ci u8c->nccc = ccc; 56062306a36Sopenharmony_ci u8c->sp = u8c->p; 56162306a36Sopenharmony_ci u8c->ss = u8c->s; 56262306a36Sopenharmony_ci u8c->slen = u8c->len; 56362306a36Sopenharmony_ci if (!u8c->p) 56462306a36Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 56562306a36Sopenharmony_ci u8c->s += utf8clen(u8c->s); 56662306a36Sopenharmony_ci } else if (ccc != STOPPER) { 56762306a36Sopenharmony_ci /* Not a stopper, and not the ccc we're emitting. */ 56862306a36Sopenharmony_ci if (!u8c->p) 56962306a36Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 57062306a36Sopenharmony_ci u8c->s += utf8clen(u8c->s); 57162306a36Sopenharmony_ci } else if (u8c->nccc != MAXCCC + 1) { 57262306a36Sopenharmony_ci /* At a stopper, restart for next ccc. */ 57362306a36Sopenharmony_ci u8c->ccc = u8c->nccc; 57462306a36Sopenharmony_ci u8c->nccc = MAXCCC + 1; 57562306a36Sopenharmony_ci u8c->s = u8c->ss; 57662306a36Sopenharmony_ci u8c->p = u8c->sp; 57762306a36Sopenharmony_ci u8c->len = u8c->slen; 57862306a36Sopenharmony_ci } else { 57962306a36Sopenharmony_ci /* All done, proceed from here. */ 58062306a36Sopenharmony_ci u8c->ccc = STOPPER; 58162306a36Sopenharmony_ci u8c->nccc = STOPPER; 58262306a36Sopenharmony_ci u8c->sp = NULL; 58362306a36Sopenharmony_ci u8c->ss = NULL; 58462306a36Sopenharmony_ci u8c->slen = 0; 58562306a36Sopenharmony_ci } 58662306a36Sopenharmony_ci } 58762306a36Sopenharmony_ci} 58862306a36Sopenharmony_ci 58962306a36Sopenharmony_ci#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE 59062306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(utf8version_is_supported); 59162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(utf8nlen); 59262306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(utf8ncursor); 59362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(utf8byte); 59462306a36Sopenharmony_ci#endif 595