18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Copyright (c) 2014 SGI. 48c2ecf20Sopenharmony_ci * All rights reserved. 58c2ecf20Sopenharmony_ci */ 68c2ecf20Sopenharmony_ci 78c2ecf20Sopenharmony_ci#include "utf8n.h" 88c2ecf20Sopenharmony_ci 98c2ecf20Sopenharmony_cistruct utf8data { 108c2ecf20Sopenharmony_ci unsigned int maxage; 118c2ecf20Sopenharmony_ci unsigned int offset; 128c2ecf20Sopenharmony_ci}; 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ci#define __INCLUDED_FROM_UTF8NORM_C__ 158c2ecf20Sopenharmony_ci#include "utf8data.h" 168c2ecf20Sopenharmony_ci#undef __INCLUDED_FROM_UTF8NORM_C__ 178c2ecf20Sopenharmony_ci 188c2ecf20Sopenharmony_ciint utf8version_is_supported(u8 maj, u8 min, u8 rev) 198c2ecf20Sopenharmony_ci{ 208c2ecf20Sopenharmony_ci int i = ARRAY_SIZE(utf8agetab) - 1; 218c2ecf20Sopenharmony_ci unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); 228c2ecf20Sopenharmony_ci 238c2ecf20Sopenharmony_ci while (i >= 0 && utf8agetab[i] != 0) { 248c2ecf20Sopenharmony_ci if (sb_utf8version == utf8agetab[i]) 258c2ecf20Sopenharmony_ci return 1; 268c2ecf20Sopenharmony_ci i--; 278c2ecf20Sopenharmony_ci } 288c2ecf20Sopenharmony_ci return 0; 298c2ecf20Sopenharmony_ci} 308c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8version_is_supported); 318c2ecf20Sopenharmony_ci 328c2ecf20Sopenharmony_ciint utf8version_latest(void) 338c2ecf20Sopenharmony_ci{ 348c2ecf20Sopenharmony_ci return utf8vers; 358c2ecf20Sopenharmony_ci} 368c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8version_latest); 378c2ecf20Sopenharmony_ci 388c2ecf20Sopenharmony_ci/* 398c2ecf20Sopenharmony_ci * UTF-8 valid ranges. 408c2ecf20Sopenharmony_ci * 418c2ecf20Sopenharmony_ci * The UTF-8 encoding spreads the bits of a 32bit word over several 428c2ecf20Sopenharmony_ci * bytes. This table gives the ranges that can be held and how they'd 438c2ecf20Sopenharmony_ci * be represented. 
/*
 * UTF-8 valid ranges.
 *
 * The UTF-8 encoding spreads the bits of a 32bit word over several
 * bytes. This table gives the ranges that can be held and how they'd
 * be represented.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * There is an additional requirement on UTF-8, in that only the
 * shortest representation of a 32bit value is to be used. A decoder
 * must not decode sequences that do not satisfy this requirement.
 * Thus the allowed ranges have a lower bound.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 * 17 planes of 65536 values. This limits the sequences actually seen
 * even more, to just the following.
 *
 *          0 -     0x7F: 0                   - 0x7F
 *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
 *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
 *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
 *
 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
 *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character. This makes the UTF-8
 * representation of Unicode strictly smaller than UTF-32.
 *
 * The shortest sequence requirement was introduced by:
 *    Corrigendum #1: UTF-8 Shortest Form
 * It can be found here:
 *    http://www.unicode.org/versions/corrigendum1.html
 *
 */

/*
 * Return the number of bytes used by the current UTF-8 sequence.
 * Assumes the input points to the first byte of a valid UTF-8
 * sequence.
 *
 * Only the lead byte is inspected: each threshold it reaches
 * (0xC0, 0xE0, 0xF0) adds one byte to the length, giving 1..4.
 */
static inline int utf8clen(const char *s)
{
	unsigned char c = *s;

	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
}
/*
 * Decode a 3-byte UTF-8 sequence.
 *
 * str must point at the lead byte of the sequence. No validation is
 * done: the low 4 bits of the first byte and the low 6 bits of each of
 * the next two bytes are concatenated, most significant first.
 */
static unsigned int
utf8decode3(const char *str)
{
	unsigned int		uc;

	uc = *str++ & 0x0F;
	uc <<= 6;
	uc |= *str++ & 0x3F;
	uc <<= 6;
	uc |= *str++ & 0x3F;

	return uc;
}

/*
 * Encode a 3-byte UTF-8 sequence.
 *
 * Writes the encoding of val to str[0..2] and returns the number of
 * bytes written, which is always 3. The caller must pass a value that
 * needs exactly three bytes (0x800 - 0xFFFF); the lead byte is not
 * masked, so larger values produce a malformed lead byte.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	str[2] = (val & 0x3F) | 0x80;
	val >>= 6;
	str[1] = (val & 0x3F) | 0x80;
	val >>= 6;
	str[0] = val | 0xE0;

	return 3;
}
/*
 * utf8trie_t
 *
 * A compact binary tree, used to decode UTF-8 characters.
 *
 * Internal nodes are one byte for the node itself, and up to three
 * bytes for an offset into the tree. The first byte contains the
 * following information:
 *  NEXTBYTE  - flag        - advance to next byte if set
 *  BITNUM    - 3 bit field - the bit number to be tested
 *  OFFLEN    - 2 bit field - number of bytes in the offset
 * if offlen == 0 (non-branching node)
 *  RIGHTPATH - 1 bit field - set if the following node is for the
 *                            right-hand path (tested bit is set)
 *  TRIENODE  - 1 bit field - set if the following node is an internal
 *                            node, otherwise it is a leaf node
 * if offlen != 0 (branching node)
 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 *
 * Due to the way utf8 works, there cannot be branching nodes with
 * NEXTBYTE set, and moreover those nodes always have a righthand
 * descendant.
 */
typedef const unsigned char utf8trie_t;
#define BITNUM		0x07
#define NEXTBYTE	0x08
#define OFFLEN		0x30
#define OFFLEN_SHIFT	4
/* Bits 6 and 7 are interpreted according to offlen, see above. */
#define RIGHTPATH	0x40
#define TRIENODE	0x80
#define RIGHTNODE	0x40
#define LEFTNODE	0x80
/*
 * utf8leaf_t
 *
 * The leaves of the trie are embedded in the trie, and so the same
 * underlying datatype: unsigned char.
 *
 * leaf[0]: The unicode version, stored as a generation number that is
 *          an index into utf8agetab[].  With this we can filter code
 *          points based on the unicode version in which they were
 *          defined.  The CCC of a non-defined code point is 0.
 * leaf[1]: Canonical Combining Class. During normalization, we need
 *          to do a stable sort into ascending order of all characters
 *          with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the begin or end of a string.
 *          The unicode standard guarantees that all CCC values are
 *          between 0 and 254 inclusive, which leaves 255 available as
 *          a special value.
 *          Code points with CCC 0 are known as stoppers.
 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 *          start of a NUL-terminated string that is the decomposition
 *          of the character.
 *          The CCC of a decomposable character is the same as the CCC
 *          of the first character of its decomposition.
 *          Some characters decompose as the empty string: these are
 *          characters with the Default_Ignorable_Code_Point property.
 *          These do affect normalization, as they all have CCC 0.
 *
 * The decompositions in the trie have been fully expanded, with the
 * exception of Hangul syllables, which are decomposed algorithmically.
 *
 * Casefolding, if applicable, is also done using decompositions.
 *
 * The trie is constructed in such a way that leaves exist for all
 * UTF-8 sequences that match the criteria from the "UTF-8 valid
 * ranges" comment above, and only for those sequences.  Therefore a
 * lookup in the trie can be used to validate the UTF-8 input.
 */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three leaf fields described above. */
#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))

#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)
#define	DECOMPOSE	(255)

/* Marker for hangul syllable decomposition. */
#define HANGUL		((char)(255))
*/ 2158c2ecf20Sopenharmony_ci#define UTF8HANGULLEAF (12) 2168c2ecf20Sopenharmony_ci 2178c2ecf20Sopenharmony_ci/* 2188c2ecf20Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 2198c2ecf20Sopenharmony_ci * 2208c2ecf20Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 2218c2ecf20Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 2228c2ecf20Sopenharmony_ci * 2238c2ecf20Sopenharmony_ci * SBase = 0xAC00 2248c2ecf20Sopenharmony_ci * LBase = 0x1100 2258c2ecf20Sopenharmony_ci * VBase = 0x1161 2268c2ecf20Sopenharmony_ci * TBase = 0x11A7 2278c2ecf20Sopenharmony_ci * LCount = 19 2288c2ecf20Sopenharmony_ci * VCount = 21 2298c2ecf20Sopenharmony_ci * TCount = 28 2308c2ecf20Sopenharmony_ci * NCount = 588 (VCount * TCount) 2318c2ecf20Sopenharmony_ci * SCount = 11172 (LCount * NCount) 2328c2ecf20Sopenharmony_ci * 2338c2ecf20Sopenharmony_ci * Decomposition: 2348c2ecf20Sopenharmony_ci * SIndex = s - SBase 2358c2ecf20Sopenharmony_ci * 2368c2ecf20Sopenharmony_ci * LV (Canonical/Full) 2378c2ecf20Sopenharmony_ci * LIndex = SIndex / NCount 2388c2ecf20Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 2398c2ecf20Sopenharmony_ci * LPart = LBase + LIndex 2408c2ecf20Sopenharmony_ci * VPart = VBase + VIndex 2418c2ecf20Sopenharmony_ci * 2428c2ecf20Sopenharmony_ci * LVT (Canonical) 2438c2ecf20Sopenharmony_ci * LVIndex = (SIndex / TCount) * TCount 2448c2ecf20Sopenharmony_ci * TIndex = (Sindex % TCount) 2458c2ecf20Sopenharmony_ci * LVPart = SBase + LVIndex 2468c2ecf20Sopenharmony_ci * TPart = TBase + TIndex 2478c2ecf20Sopenharmony_ci * 2488c2ecf20Sopenharmony_ci * LVT (Full) 2498c2ecf20Sopenharmony_ci * LIndex = SIndex / NCount 2508c2ecf20Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 2518c2ecf20Sopenharmony_ci * TIndex = (Sindex % TCount) 2528c2ecf20Sopenharmony_ci * LPart = LBase + LIndex 2538c2ecf20Sopenharmony_ci * VPart = VBase + VIndex 2548c2ecf20Sopenharmony_ci * if (TIndex == 0) { 2558c2ecf20Sopenharmony_ci * d = 
<LPart, VPart> 2568c2ecf20Sopenharmony_ci * } else { 2578c2ecf20Sopenharmony_ci * TPart = TBase + TIndex 2588c2ecf20Sopenharmony_ci * d = <LPart, TPart, VPart> 2598c2ecf20Sopenharmony_ci * } 2608c2ecf20Sopenharmony_ci */ 2618c2ecf20Sopenharmony_ci 2628c2ecf20Sopenharmony_ci/* Constants */ 2638c2ecf20Sopenharmony_ci#define SB (0xAC00) 2648c2ecf20Sopenharmony_ci#define LB (0x1100) 2658c2ecf20Sopenharmony_ci#define VB (0x1161) 2668c2ecf20Sopenharmony_ci#define TB (0x11A7) 2678c2ecf20Sopenharmony_ci#define LC (19) 2688c2ecf20Sopenharmony_ci#define VC (21) 2698c2ecf20Sopenharmony_ci#define TC (28) 2708c2ecf20Sopenharmony_ci#define NC (VC * TC) 2718c2ecf20Sopenharmony_ci#define SC (LC * NC) 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci/* Algorithmic decomposition of hangul syllable. */ 2748c2ecf20Sopenharmony_cistatic utf8leaf_t * 2758c2ecf20Sopenharmony_ciutf8hangul(const char *str, unsigned char *hangul) 2768c2ecf20Sopenharmony_ci{ 2778c2ecf20Sopenharmony_ci unsigned int si; 2788c2ecf20Sopenharmony_ci unsigned int li; 2798c2ecf20Sopenharmony_ci unsigned int vi; 2808c2ecf20Sopenharmony_ci unsigned int ti; 2818c2ecf20Sopenharmony_ci unsigned char *h; 2828c2ecf20Sopenharmony_ci 2838c2ecf20Sopenharmony_ci /* Calculate the SI, LI, VI, and TI values. */ 2848c2ecf20Sopenharmony_ci si = utf8decode3(str) - SB; 2858c2ecf20Sopenharmony_ci li = si / NC; 2868c2ecf20Sopenharmony_ci vi = (si % NC) / TC; 2878c2ecf20Sopenharmony_ci ti = si % TC; 2888c2ecf20Sopenharmony_ci 2898c2ecf20Sopenharmony_ci /* Fill in base of leaf. */ 2908c2ecf20Sopenharmony_ci h = hangul; 2918c2ecf20Sopenharmony_ci LEAF_GEN(h) = 2; 2928c2ecf20Sopenharmony_ci LEAF_CCC(h) = DECOMPOSE; 2938c2ecf20Sopenharmony_ci h += 2; 2948c2ecf20Sopenharmony_ci 2958c2ecf20Sopenharmony_ci /* Add LPart, a 3-byte UTF-8 sequence. */ 2968c2ecf20Sopenharmony_ci h += utf8encode3((char *)h, li + LB); 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_ci /* Add VPart, a 3-byte UTF-8 sequence. 
*/ 2998c2ecf20Sopenharmony_ci h += utf8encode3((char *)h, vi + VB); 3008c2ecf20Sopenharmony_ci 3018c2ecf20Sopenharmony_ci /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 3028c2ecf20Sopenharmony_ci if (ti) 3038c2ecf20Sopenharmony_ci h += utf8encode3((char *)h, ti + TB); 3048c2ecf20Sopenharmony_ci 3058c2ecf20Sopenharmony_ci /* Terminate string. */ 3068c2ecf20Sopenharmony_ci h[0] = '\0'; 3078c2ecf20Sopenharmony_ci 3088c2ecf20Sopenharmony_ci return hangul; 3098c2ecf20Sopenharmony_ci} 3108c2ecf20Sopenharmony_ci 3118c2ecf20Sopenharmony_ci/* 3128c2ecf20Sopenharmony_ci * Use trie to scan s, touching at most len bytes. 3138c2ecf20Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise. 3148c2ecf20Sopenharmony_ci * 3158c2ecf20Sopenharmony_ci * A non-NULL return guarantees that the UTF-8 sequence starting at s 3168c2ecf20Sopenharmony_ci * is well-formed and corresponds to a known unicode code point. The 3178c2ecf20Sopenharmony_ci * shorthand for this will be "is valid UTF-8 unicode". 
3188c2ecf20Sopenharmony_ci */ 3198c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8nlookup(const struct utf8data *data, 3208c2ecf20Sopenharmony_ci unsigned char *hangul, const char *s, size_t len) 3218c2ecf20Sopenharmony_ci{ 3228c2ecf20Sopenharmony_ci utf8trie_t *trie = NULL; 3238c2ecf20Sopenharmony_ci int offlen; 3248c2ecf20Sopenharmony_ci int offset; 3258c2ecf20Sopenharmony_ci int mask; 3268c2ecf20Sopenharmony_ci int node; 3278c2ecf20Sopenharmony_ci 3288c2ecf20Sopenharmony_ci if (!data) 3298c2ecf20Sopenharmony_ci return NULL; 3308c2ecf20Sopenharmony_ci if (len == 0) 3318c2ecf20Sopenharmony_ci return NULL; 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_ci trie = utf8data + data->offset; 3348c2ecf20Sopenharmony_ci node = 1; 3358c2ecf20Sopenharmony_ci while (node) { 3368c2ecf20Sopenharmony_ci offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 3378c2ecf20Sopenharmony_ci if (*trie & NEXTBYTE) { 3388c2ecf20Sopenharmony_ci if (--len == 0) 3398c2ecf20Sopenharmony_ci return NULL; 3408c2ecf20Sopenharmony_ci s++; 3418c2ecf20Sopenharmony_ci } 3428c2ecf20Sopenharmony_ci mask = 1 << (*trie & BITNUM); 3438c2ecf20Sopenharmony_ci if (*s & mask) { 3448c2ecf20Sopenharmony_ci /* Right leg */ 3458c2ecf20Sopenharmony_ci if (offlen) { 3468c2ecf20Sopenharmony_ci /* Right node at offset of trie */ 3478c2ecf20Sopenharmony_ci node = (*trie & RIGHTNODE); 3488c2ecf20Sopenharmony_ci offset = trie[offlen]; 3498c2ecf20Sopenharmony_ci while (--offlen) { 3508c2ecf20Sopenharmony_ci offset <<= 8; 3518c2ecf20Sopenharmony_ci offset |= trie[offlen]; 3528c2ecf20Sopenharmony_ci } 3538c2ecf20Sopenharmony_ci trie += offset; 3548c2ecf20Sopenharmony_ci } else if (*trie & RIGHTPATH) { 3558c2ecf20Sopenharmony_ci /* Right node after this node */ 3568c2ecf20Sopenharmony_ci node = (*trie & TRIENODE); 3578c2ecf20Sopenharmony_ci trie++; 3588c2ecf20Sopenharmony_ci } else { 3598c2ecf20Sopenharmony_ci /* No right node. 
*/ 3608c2ecf20Sopenharmony_ci return NULL; 3618c2ecf20Sopenharmony_ci } 3628c2ecf20Sopenharmony_ci } else { 3638c2ecf20Sopenharmony_ci /* Left leg */ 3648c2ecf20Sopenharmony_ci if (offlen) { 3658c2ecf20Sopenharmony_ci /* Left node after this node. */ 3668c2ecf20Sopenharmony_ci node = (*trie & LEFTNODE); 3678c2ecf20Sopenharmony_ci trie += offlen + 1; 3688c2ecf20Sopenharmony_ci } else if (*trie & RIGHTPATH) { 3698c2ecf20Sopenharmony_ci /* No left node. */ 3708c2ecf20Sopenharmony_ci return NULL; 3718c2ecf20Sopenharmony_ci } else { 3728c2ecf20Sopenharmony_ci /* Left node after this node */ 3738c2ecf20Sopenharmony_ci node = (*trie & TRIENODE); 3748c2ecf20Sopenharmony_ci trie++; 3758c2ecf20Sopenharmony_ci } 3768c2ecf20Sopenharmony_ci } 3778c2ecf20Sopenharmony_ci } 3788c2ecf20Sopenharmony_ci /* 3798c2ecf20Sopenharmony_ci * Hangul decomposition is done algorithmically. These are the 3808c2ecf20Sopenharmony_ci * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 3818c2ecf20Sopenharmony_ci * always 3 bytes long, so s has been advanced twice, and the 3828c2ecf20Sopenharmony_ci * start of the sequence is at s-2. 3838c2ecf20Sopenharmony_ci */ 3848c2ecf20Sopenharmony_ci if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 3858c2ecf20Sopenharmony_ci trie = utf8hangul(s - 2, hangul); 3868c2ecf20Sopenharmony_ci return trie; 3878c2ecf20Sopenharmony_ci} 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ci/* 3908c2ecf20Sopenharmony_ci * Use trie to scan s. 3918c2ecf20Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise. 3928c2ecf20Sopenharmony_ci * 3938c2ecf20Sopenharmony_ci * Forwards to utf8nlookup(). 
3948c2ecf20Sopenharmony_ci */ 3958c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8lookup(const struct utf8data *data, 3968c2ecf20Sopenharmony_ci unsigned char *hangul, const char *s) 3978c2ecf20Sopenharmony_ci{ 3988c2ecf20Sopenharmony_ci return utf8nlookup(data, hangul, s, (size_t)-1); 3998c2ecf20Sopenharmony_ci} 4008c2ecf20Sopenharmony_ci 4018c2ecf20Sopenharmony_ci/* 4028c2ecf20Sopenharmony_ci * Maximum age of any character in s. 4038c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 4048c2ecf20Sopenharmony_ci * Return 0 if only non-assigned code points are used. 4058c2ecf20Sopenharmony_ci */ 4068c2ecf20Sopenharmony_ciint utf8agemax(const struct utf8data *data, const char *s) 4078c2ecf20Sopenharmony_ci{ 4088c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 4098c2ecf20Sopenharmony_ci int age = 0; 4108c2ecf20Sopenharmony_ci int leaf_age; 4118c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 4128c2ecf20Sopenharmony_ci 4138c2ecf20Sopenharmony_ci if (!data) 4148c2ecf20Sopenharmony_ci return -1; 4158c2ecf20Sopenharmony_ci 4168c2ecf20Sopenharmony_ci while (*s) { 4178c2ecf20Sopenharmony_ci leaf = utf8lookup(data, hangul, s); 4188c2ecf20Sopenharmony_ci if (!leaf) 4198c2ecf20Sopenharmony_ci return -1; 4208c2ecf20Sopenharmony_ci 4218c2ecf20Sopenharmony_ci leaf_age = utf8agetab[LEAF_GEN(leaf)]; 4228c2ecf20Sopenharmony_ci if (leaf_age <= data->maxage && leaf_age > age) 4238c2ecf20Sopenharmony_ci age = leaf_age; 4248c2ecf20Sopenharmony_ci s += utf8clen(s); 4258c2ecf20Sopenharmony_ci } 4268c2ecf20Sopenharmony_ci return age; 4278c2ecf20Sopenharmony_ci} 4288c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8agemax); 4298c2ecf20Sopenharmony_ci 4308c2ecf20Sopenharmony_ci/* 4318c2ecf20Sopenharmony_ci * Minimum age of any character in s. 4328c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 4338c2ecf20Sopenharmony_ci * Return 0 if non-assigned code points are used. 
4348c2ecf20Sopenharmony_ci */ 4358c2ecf20Sopenharmony_ciint utf8agemin(const struct utf8data *data, const char *s) 4368c2ecf20Sopenharmony_ci{ 4378c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 4388c2ecf20Sopenharmony_ci int age; 4398c2ecf20Sopenharmony_ci int leaf_age; 4408c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_ci if (!data) 4438c2ecf20Sopenharmony_ci return -1; 4448c2ecf20Sopenharmony_ci age = data->maxage; 4458c2ecf20Sopenharmony_ci while (*s) { 4468c2ecf20Sopenharmony_ci leaf = utf8lookup(data, hangul, s); 4478c2ecf20Sopenharmony_ci if (!leaf) 4488c2ecf20Sopenharmony_ci return -1; 4498c2ecf20Sopenharmony_ci leaf_age = utf8agetab[LEAF_GEN(leaf)]; 4508c2ecf20Sopenharmony_ci if (leaf_age <= data->maxage && leaf_age < age) 4518c2ecf20Sopenharmony_ci age = leaf_age; 4528c2ecf20Sopenharmony_ci s += utf8clen(s); 4538c2ecf20Sopenharmony_ci } 4548c2ecf20Sopenharmony_ci return age; 4558c2ecf20Sopenharmony_ci} 4568c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8agemin); 4578c2ecf20Sopenharmony_ci 4588c2ecf20Sopenharmony_ci/* 4598c2ecf20Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes. 4608c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 
4618c2ecf20Sopenharmony_ci */ 4628c2ecf20Sopenharmony_ciint utf8nagemax(const struct utf8data *data, const char *s, size_t len) 4638c2ecf20Sopenharmony_ci{ 4648c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 4658c2ecf20Sopenharmony_ci int age = 0; 4668c2ecf20Sopenharmony_ci int leaf_age; 4678c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 4688c2ecf20Sopenharmony_ci 4698c2ecf20Sopenharmony_ci if (!data) 4708c2ecf20Sopenharmony_ci return -1; 4718c2ecf20Sopenharmony_ci 4728c2ecf20Sopenharmony_ci while (len && *s) { 4738c2ecf20Sopenharmony_ci leaf = utf8nlookup(data, hangul, s, len); 4748c2ecf20Sopenharmony_ci if (!leaf) 4758c2ecf20Sopenharmony_ci return -1; 4768c2ecf20Sopenharmony_ci leaf_age = utf8agetab[LEAF_GEN(leaf)]; 4778c2ecf20Sopenharmony_ci if (leaf_age <= data->maxage && leaf_age > age) 4788c2ecf20Sopenharmony_ci age = leaf_age; 4798c2ecf20Sopenharmony_ci len -= utf8clen(s); 4808c2ecf20Sopenharmony_ci s += utf8clen(s); 4818c2ecf20Sopenharmony_ci } 4828c2ecf20Sopenharmony_ci return age; 4838c2ecf20Sopenharmony_ci} 4848c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8nagemax); 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ci/* 4878c2ecf20Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes. 4888c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 
4898c2ecf20Sopenharmony_ci */ 4908c2ecf20Sopenharmony_ciint utf8nagemin(const struct utf8data *data, const char *s, size_t len) 4918c2ecf20Sopenharmony_ci{ 4928c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 4938c2ecf20Sopenharmony_ci int leaf_age; 4948c2ecf20Sopenharmony_ci int age; 4958c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_ci if (!data) 4988c2ecf20Sopenharmony_ci return -1; 4998c2ecf20Sopenharmony_ci age = data->maxage; 5008c2ecf20Sopenharmony_ci while (len && *s) { 5018c2ecf20Sopenharmony_ci leaf = utf8nlookup(data, hangul, s, len); 5028c2ecf20Sopenharmony_ci if (!leaf) 5038c2ecf20Sopenharmony_ci return -1; 5048c2ecf20Sopenharmony_ci leaf_age = utf8agetab[LEAF_GEN(leaf)]; 5058c2ecf20Sopenharmony_ci if (leaf_age <= data->maxage && leaf_age < age) 5068c2ecf20Sopenharmony_ci age = leaf_age; 5078c2ecf20Sopenharmony_ci len -= utf8clen(s); 5088c2ecf20Sopenharmony_ci s += utf8clen(s); 5098c2ecf20Sopenharmony_ci } 5108c2ecf20Sopenharmony_ci return age; 5118c2ecf20Sopenharmony_ci} 5128c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8nagemin); 5138c2ecf20Sopenharmony_ci 5148c2ecf20Sopenharmony_ci/* 5158c2ecf20Sopenharmony_ci * Length of the normalization of s. 5168c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 5178c2ecf20Sopenharmony_ci * 5188c2ecf20Sopenharmony_ci * A string of Default_Ignorable_Code_Point has length 0. 
5198c2ecf20Sopenharmony_ci */ 5208c2ecf20Sopenharmony_cissize_t utf8len(const struct utf8data *data, const char *s) 5218c2ecf20Sopenharmony_ci{ 5228c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 5238c2ecf20Sopenharmony_ci size_t ret = 0; 5248c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 5258c2ecf20Sopenharmony_ci 5268c2ecf20Sopenharmony_ci if (!data) 5278c2ecf20Sopenharmony_ci return -1; 5288c2ecf20Sopenharmony_ci while (*s) { 5298c2ecf20Sopenharmony_ci leaf = utf8lookup(data, hangul, s); 5308c2ecf20Sopenharmony_ci if (!leaf) 5318c2ecf20Sopenharmony_ci return -1; 5328c2ecf20Sopenharmony_ci if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 5338c2ecf20Sopenharmony_ci ret += utf8clen(s); 5348c2ecf20Sopenharmony_ci else if (LEAF_CCC(leaf) == DECOMPOSE) 5358c2ecf20Sopenharmony_ci ret += strlen(LEAF_STR(leaf)); 5368c2ecf20Sopenharmony_ci else 5378c2ecf20Sopenharmony_ci ret += utf8clen(s); 5388c2ecf20Sopenharmony_ci s += utf8clen(s); 5398c2ecf20Sopenharmony_ci } 5408c2ecf20Sopenharmony_ci return ret; 5418c2ecf20Sopenharmony_ci} 5428c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8len); 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ci/* 5458c2ecf20Sopenharmony_ci * Length of the normalization of s, touch at most len bytes. 5468c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 
5478c2ecf20Sopenharmony_ci */ 5488c2ecf20Sopenharmony_cissize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) 5498c2ecf20Sopenharmony_ci{ 5508c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 5518c2ecf20Sopenharmony_ci size_t ret = 0; 5528c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 5538c2ecf20Sopenharmony_ci 5548c2ecf20Sopenharmony_ci if (!data) 5558c2ecf20Sopenharmony_ci return -1; 5568c2ecf20Sopenharmony_ci while (len && *s) { 5578c2ecf20Sopenharmony_ci leaf = utf8nlookup(data, hangul, s, len); 5588c2ecf20Sopenharmony_ci if (!leaf) 5598c2ecf20Sopenharmony_ci return -1; 5608c2ecf20Sopenharmony_ci if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 5618c2ecf20Sopenharmony_ci ret += utf8clen(s); 5628c2ecf20Sopenharmony_ci else if (LEAF_CCC(leaf) == DECOMPOSE) 5638c2ecf20Sopenharmony_ci ret += strlen(LEAF_STR(leaf)); 5648c2ecf20Sopenharmony_ci else 5658c2ecf20Sopenharmony_ci ret += utf8clen(s); 5668c2ecf20Sopenharmony_ci len -= utf8clen(s); 5678c2ecf20Sopenharmony_ci s += utf8clen(s); 5688c2ecf20Sopenharmony_ci } 5698c2ecf20Sopenharmony_ci return ret; 5708c2ecf20Sopenharmony_ci} 5718c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8nlen); 5728c2ecf20Sopenharmony_ci 5738c2ecf20Sopenharmony_ci/* 5748c2ecf20Sopenharmony_ci * Set up an utf8cursor for use by utf8byte(). 5758c2ecf20Sopenharmony_ci * 5768c2ecf20Sopenharmony_ci * u8c : pointer to cursor. 5778c2ecf20Sopenharmony_ci * data : const struct utf8data to use for normalization. 5788c2ecf20Sopenharmony_ci * s : string. 5798c2ecf20Sopenharmony_ci * len : length of s. 5808c2ecf20Sopenharmony_ci * 5818c2ecf20Sopenharmony_ci * Returns -1 on error, 0 on success. 
5828c2ecf20Sopenharmony_ci */ 5838c2ecf20Sopenharmony_ciint utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, 5848c2ecf20Sopenharmony_ci const char *s, size_t len) 5858c2ecf20Sopenharmony_ci{ 5868c2ecf20Sopenharmony_ci if (!data) 5878c2ecf20Sopenharmony_ci return -1; 5888c2ecf20Sopenharmony_ci if (!s) 5898c2ecf20Sopenharmony_ci return -1; 5908c2ecf20Sopenharmony_ci u8c->data = data; 5918c2ecf20Sopenharmony_ci u8c->s = s; 5928c2ecf20Sopenharmony_ci u8c->p = NULL; 5938c2ecf20Sopenharmony_ci u8c->ss = NULL; 5948c2ecf20Sopenharmony_ci u8c->sp = NULL; 5958c2ecf20Sopenharmony_ci u8c->len = len; 5968c2ecf20Sopenharmony_ci u8c->slen = 0; 5978c2ecf20Sopenharmony_ci u8c->ccc = STOPPER; 5988c2ecf20Sopenharmony_ci u8c->nccc = STOPPER; 5998c2ecf20Sopenharmony_ci /* Check we didn't clobber the maximum length. */ 6008c2ecf20Sopenharmony_ci if (u8c->len != len) 6018c2ecf20Sopenharmony_ci return -1; 6028c2ecf20Sopenharmony_ci /* The first byte of s may not be an utf8 continuation. */ 6038c2ecf20Sopenharmony_ci if (len > 0 && (*s & 0xC0) == 0x80) 6048c2ecf20Sopenharmony_ci return -1; 6058c2ecf20Sopenharmony_ci return 0; 6068c2ecf20Sopenharmony_ci} 6078c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8ncursor); 6088c2ecf20Sopenharmony_ci 6098c2ecf20Sopenharmony_ci/* 6108c2ecf20Sopenharmony_ci * Set up an utf8cursor for use by utf8byte(). 6118c2ecf20Sopenharmony_ci * 6128c2ecf20Sopenharmony_ci * u8c : pointer to cursor. 6138c2ecf20Sopenharmony_ci * data : const struct utf8data to use for normalization. 6148c2ecf20Sopenharmony_ci * s : NUL-terminated string. 6158c2ecf20Sopenharmony_ci * 6168c2ecf20Sopenharmony_ci * Returns -1 on error, 0 on success. 
6178c2ecf20Sopenharmony_ci */ 6188c2ecf20Sopenharmony_ciint utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, 6198c2ecf20Sopenharmony_ci const char *s) 6208c2ecf20Sopenharmony_ci{ 6218c2ecf20Sopenharmony_ci return utf8ncursor(u8c, data, s, (unsigned int)-1); 6228c2ecf20Sopenharmony_ci} 6238c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8cursor); 6248c2ecf20Sopenharmony_ci 6258c2ecf20Sopenharmony_ci/* 6268c2ecf20Sopenharmony_ci * Get one byte from the normalized form of the string described by u8c. 6278c2ecf20Sopenharmony_ci * 6288c2ecf20Sopenharmony_ci * Returns the byte cast to an unsigned char on succes, and -1 on failure. 6298c2ecf20Sopenharmony_ci * 6308c2ecf20Sopenharmony_ci * The cursor keeps track of the location in the string in u8c->s. 6318c2ecf20Sopenharmony_ci * When a character is decomposed, the current location is stored in 6328c2ecf20Sopenharmony_ci * u8c->p, and u8c->s is set to the start of the decomposition. Note 6338c2ecf20Sopenharmony_ci * that bytes from a decomposition do not count against u8c->len. 6348c2ecf20Sopenharmony_ci * 6358c2ecf20Sopenharmony_ci * Characters are emitted if they match the current CCC in u8c->ccc. 6368c2ecf20Sopenharmony_ci * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 6378c2ecf20Sopenharmony_ci * and the function returns 0 in that case. 6388c2ecf20Sopenharmony_ci * 6398c2ecf20Sopenharmony_ci * Sorting by CCC is done by repeatedly scanning the string. The 6408c2ecf20Sopenharmony_ci * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 6418c2ecf20Sopenharmony_ci * the start of the scan. The first pass finds the lowest CCC to be 6428c2ecf20Sopenharmony_ci * emitted and stores it in u8c->nccc, the second pass emits the 6438c2ecf20Sopenharmony_ci * characters with this CCC and finds the next lowest CCC. This limits 6448c2ecf20Sopenharmony_ci * the number of passes to 1 + the number of different CCCs in the 6458c2ecf20Sopenharmony_ci * sequence being scanned. 
6468c2ecf20Sopenharmony_ci * 6478c2ecf20Sopenharmony_ci * Therefore: 6488c2ecf20Sopenharmony_ci * u8c->p != NULL -> a decomposition is being scanned. 6498c2ecf20Sopenharmony_ci * u8c->ss != NULL -> this is a repeating scan. 6508c2ecf20Sopenharmony_ci * u8c->ccc == -1 -> this is the first scan of a repeating scan. 6518c2ecf20Sopenharmony_ci */ 6528c2ecf20Sopenharmony_ciint utf8byte(struct utf8cursor *u8c) 6538c2ecf20Sopenharmony_ci{ 6548c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 6558c2ecf20Sopenharmony_ci int ccc; 6568c2ecf20Sopenharmony_ci 6578c2ecf20Sopenharmony_ci for (;;) { 6588c2ecf20Sopenharmony_ci /* Check for the end of a decomposed character. */ 6598c2ecf20Sopenharmony_ci if (u8c->p && *u8c->s == '\0') { 6608c2ecf20Sopenharmony_ci u8c->s = u8c->p; 6618c2ecf20Sopenharmony_ci u8c->p = NULL; 6628c2ecf20Sopenharmony_ci } 6638c2ecf20Sopenharmony_ci 6648c2ecf20Sopenharmony_ci /* Check for end-of-string. */ 6658c2ecf20Sopenharmony_ci if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 6668c2ecf20Sopenharmony_ci /* There is no next byte. */ 6678c2ecf20Sopenharmony_ci if (u8c->ccc == STOPPER) 6688c2ecf20Sopenharmony_ci return 0; 6698c2ecf20Sopenharmony_ci /* End-of-string during a scan counts as a stopper. */ 6708c2ecf20Sopenharmony_ci ccc = STOPPER; 6718c2ecf20Sopenharmony_ci goto ccc_mismatch; 6728c2ecf20Sopenharmony_ci } else if ((*u8c->s & 0xC0) == 0x80) { 6738c2ecf20Sopenharmony_ci /* This is a continuation of the current character. */ 6748c2ecf20Sopenharmony_ci if (!u8c->p) 6758c2ecf20Sopenharmony_ci u8c->len--; 6768c2ecf20Sopenharmony_ci return (unsigned char)*u8c->s++; 6778c2ecf20Sopenharmony_ci } 6788c2ecf20Sopenharmony_ci 6798c2ecf20Sopenharmony_ci /* Look up the data for the current character. 
*/ 6808c2ecf20Sopenharmony_ci if (u8c->p) { 6818c2ecf20Sopenharmony_ci leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 6828c2ecf20Sopenharmony_ci } else { 6838c2ecf20Sopenharmony_ci leaf = utf8nlookup(u8c->data, u8c->hangul, 6848c2ecf20Sopenharmony_ci u8c->s, u8c->len); 6858c2ecf20Sopenharmony_ci } 6868c2ecf20Sopenharmony_ci 6878c2ecf20Sopenharmony_ci /* No leaf found implies that the input is a binary blob. */ 6888c2ecf20Sopenharmony_ci if (!leaf) 6898c2ecf20Sopenharmony_ci return -1; 6908c2ecf20Sopenharmony_ci 6918c2ecf20Sopenharmony_ci ccc = LEAF_CCC(leaf); 6928c2ecf20Sopenharmony_ci /* Characters that are too new have CCC 0. */ 6938c2ecf20Sopenharmony_ci if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { 6948c2ecf20Sopenharmony_ci ccc = STOPPER; 6958c2ecf20Sopenharmony_ci } else if (ccc == DECOMPOSE) { 6968c2ecf20Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 6978c2ecf20Sopenharmony_ci u8c->p = u8c->s + utf8clen(u8c->s); 6988c2ecf20Sopenharmony_ci u8c->s = LEAF_STR(leaf); 6998c2ecf20Sopenharmony_ci /* Empty decomposition implies CCC 0. */ 7008c2ecf20Sopenharmony_ci if (*u8c->s == '\0') { 7018c2ecf20Sopenharmony_ci if (u8c->ccc == STOPPER) 7028c2ecf20Sopenharmony_ci continue; 7038c2ecf20Sopenharmony_ci ccc = STOPPER; 7048c2ecf20Sopenharmony_ci goto ccc_mismatch; 7058c2ecf20Sopenharmony_ci } 7068c2ecf20Sopenharmony_ci 7078c2ecf20Sopenharmony_ci leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 7088c2ecf20Sopenharmony_ci if (!leaf) 7098c2ecf20Sopenharmony_ci return -1; 7108c2ecf20Sopenharmony_ci ccc = LEAF_CCC(leaf); 7118c2ecf20Sopenharmony_ci } 7128c2ecf20Sopenharmony_ci 7138c2ecf20Sopenharmony_ci /* 7148c2ecf20Sopenharmony_ci * If this is not a stopper, then see if it updates 7158c2ecf20Sopenharmony_ci * the next canonical class to be emitted. 
7168c2ecf20Sopenharmony_ci */ 7178c2ecf20Sopenharmony_ci if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 7188c2ecf20Sopenharmony_ci u8c->nccc = ccc; 7198c2ecf20Sopenharmony_ci 7208c2ecf20Sopenharmony_ci /* 7218c2ecf20Sopenharmony_ci * Return the current byte if this is the current 7228c2ecf20Sopenharmony_ci * combining class. 7238c2ecf20Sopenharmony_ci */ 7248c2ecf20Sopenharmony_ci if (ccc == u8c->ccc) { 7258c2ecf20Sopenharmony_ci if (!u8c->p) 7268c2ecf20Sopenharmony_ci u8c->len--; 7278c2ecf20Sopenharmony_ci return (unsigned char)*u8c->s++; 7288c2ecf20Sopenharmony_ci } 7298c2ecf20Sopenharmony_ci 7308c2ecf20Sopenharmony_ci /* Current combining class mismatch. */ 7318c2ecf20Sopenharmony_ciccc_mismatch: 7328c2ecf20Sopenharmony_ci if (u8c->nccc == STOPPER) { 7338c2ecf20Sopenharmony_ci /* 7348c2ecf20Sopenharmony_ci * Scan forward for the first canonical class 7358c2ecf20Sopenharmony_ci * to be emitted. Save the position from 7368c2ecf20Sopenharmony_ci * which to restart. 7378c2ecf20Sopenharmony_ci */ 7388c2ecf20Sopenharmony_ci u8c->ccc = MINCCC - 1; 7398c2ecf20Sopenharmony_ci u8c->nccc = ccc; 7408c2ecf20Sopenharmony_ci u8c->sp = u8c->p; 7418c2ecf20Sopenharmony_ci u8c->ss = u8c->s; 7428c2ecf20Sopenharmony_ci u8c->slen = u8c->len; 7438c2ecf20Sopenharmony_ci if (!u8c->p) 7448c2ecf20Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 7458c2ecf20Sopenharmony_ci u8c->s += utf8clen(u8c->s); 7468c2ecf20Sopenharmony_ci } else if (ccc != STOPPER) { 7478c2ecf20Sopenharmony_ci /* Not a stopper, and not the ccc we're emitting. */ 7488c2ecf20Sopenharmony_ci if (!u8c->p) 7498c2ecf20Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 7508c2ecf20Sopenharmony_ci u8c->s += utf8clen(u8c->s); 7518c2ecf20Sopenharmony_ci } else if (u8c->nccc != MAXCCC + 1) { 7528c2ecf20Sopenharmony_ci /* At a stopper, restart for next ccc. 
*/ 7538c2ecf20Sopenharmony_ci u8c->ccc = u8c->nccc; 7548c2ecf20Sopenharmony_ci u8c->nccc = MAXCCC + 1; 7558c2ecf20Sopenharmony_ci u8c->s = u8c->ss; 7568c2ecf20Sopenharmony_ci u8c->p = u8c->sp; 7578c2ecf20Sopenharmony_ci u8c->len = u8c->slen; 7588c2ecf20Sopenharmony_ci } else { 7598c2ecf20Sopenharmony_ci /* All done, proceed from here. */ 7608c2ecf20Sopenharmony_ci u8c->ccc = STOPPER; 7618c2ecf20Sopenharmony_ci u8c->nccc = STOPPER; 7628c2ecf20Sopenharmony_ci u8c->sp = NULL; 7638c2ecf20Sopenharmony_ci u8c->ss = NULL; 7648c2ecf20Sopenharmony_ci u8c->slen = 0; 7658c2ecf20Sopenharmony_ci } 7668c2ecf20Sopenharmony_ci } 7678c2ecf20Sopenharmony_ci} 7688c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8byte); 7698c2ecf20Sopenharmony_ci 7708c2ecf20Sopenharmony_ciconst struct utf8data *utf8nfdi(unsigned int maxage) 7718c2ecf20Sopenharmony_ci{ 7728c2ecf20Sopenharmony_ci int i = ARRAY_SIZE(utf8nfdidata) - 1; 7738c2ecf20Sopenharmony_ci 7748c2ecf20Sopenharmony_ci while (maxage < utf8nfdidata[i].maxage) 7758c2ecf20Sopenharmony_ci i--; 7768c2ecf20Sopenharmony_ci if (maxage > utf8nfdidata[i].maxage) 7778c2ecf20Sopenharmony_ci return NULL; 7788c2ecf20Sopenharmony_ci return &utf8nfdidata[i]; 7798c2ecf20Sopenharmony_ci} 7808c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8nfdi); 7818c2ecf20Sopenharmony_ci 7828c2ecf20Sopenharmony_ciconst struct utf8data *utf8nfdicf(unsigned int maxage) 7838c2ecf20Sopenharmony_ci{ 7848c2ecf20Sopenharmony_ci int i = ARRAY_SIZE(utf8nfdicfdata) - 1; 7858c2ecf20Sopenharmony_ci 7868c2ecf20Sopenharmony_ci while (maxage < utf8nfdicfdata[i].maxage) 7878c2ecf20Sopenharmony_ci i--; 7888c2ecf20Sopenharmony_ci if (maxage > utf8nfdicfdata[i].maxage) 7898c2ecf20Sopenharmony_ci return NULL; 7908c2ecf20Sopenharmony_ci return &utf8nfdicfdata[i]; 7918c2ecf20Sopenharmony_ci} 7928c2ecf20Sopenharmony_ciEXPORT_SYMBOL(utf8nfdicf); 793