18c2ecf20Sopenharmony_ci/* 28c2ecf20Sopenharmony_ci * Copyright (c) 2014 SGI. 38c2ecf20Sopenharmony_ci * All rights reserved. 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * This program is free software; you can redistribute it and/or 68c2ecf20Sopenharmony_ci * modify it under the terms of the GNU General Public License as 78c2ecf20Sopenharmony_ci * published by the Free Software Foundation. 88c2ecf20Sopenharmony_ci * 98c2ecf20Sopenharmony_ci * This program is distributed in the hope that it would be useful, 108c2ecf20Sopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 118c2ecf20Sopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 128c2ecf20Sopenharmony_ci * GNU General Public License for more details. 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * You should have received a copy of the GNU General Public License 158c2ecf20Sopenharmony_ci * along with this program; if not, write the Free Software Foundation, 168c2ecf20Sopenharmony_ci * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 178c2ecf20Sopenharmony_ci */ 188c2ecf20Sopenharmony_ci 198c2ecf20Sopenharmony_ci/* Generator for a compact trie for unicode normalization */ 208c2ecf20Sopenharmony_ci 218c2ecf20Sopenharmony_ci#include <sys/types.h> 228c2ecf20Sopenharmony_ci#include <stddef.h> 238c2ecf20Sopenharmony_ci#include <stdlib.h> 248c2ecf20Sopenharmony_ci#include <stdio.h> 258c2ecf20Sopenharmony_ci#include <assert.h> 268c2ecf20Sopenharmony_ci#include <string.h> 278c2ecf20Sopenharmony_ci#include <unistd.h> 288c2ecf20Sopenharmony_ci#include <errno.h> 298c2ecf20Sopenharmony_ci 308c2ecf20Sopenharmony_ci/* Default names of the in- and output files. */ 318c2ecf20Sopenharmony_ci 328c2ecf20Sopenharmony_ci#define AGE_NAME "DerivedAge.txt" 338c2ecf20Sopenharmony_ci#define CCC_NAME "DerivedCombiningClass.txt" 348c2ecf20Sopenharmony_ci#define PROP_NAME "DerivedCoreProperties.txt" 358c2ecf20Sopenharmony_ci#define DATA_NAME "UnicodeData.txt" 368c2ecf20Sopenharmony_ci#define FOLD_NAME "CaseFolding.txt" 378c2ecf20Sopenharmony_ci#define NORM_NAME "NormalizationCorrections.txt" 388c2ecf20Sopenharmony_ci#define TEST_NAME "NormalizationTest.txt" 398c2ecf20Sopenharmony_ci#define UTF8_NAME "utf8data.h" 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ciconst char *age_name = AGE_NAME; 428c2ecf20Sopenharmony_ciconst char *ccc_name = CCC_NAME; 438c2ecf20Sopenharmony_ciconst char *prop_name = PROP_NAME; 448c2ecf20Sopenharmony_ciconst char *data_name = DATA_NAME; 458c2ecf20Sopenharmony_ciconst char *fold_name = FOLD_NAME; 468c2ecf20Sopenharmony_ciconst char *norm_name = NORM_NAME; 478c2ecf20Sopenharmony_ciconst char *test_name = TEST_NAME; 488c2ecf20Sopenharmony_ciconst char *utf8_name = UTF8_NAME; 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ciint verbose = 0; 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_ci/* An arbitrary line size limit on input lines. */ 538c2ecf20Sopenharmony_ci 548c2ecf20Sopenharmony_ci#define LINESIZE 1024 558c2ecf20Sopenharmony_cichar line[LINESIZE]; 568c2ecf20Sopenharmony_cichar buf0[LINESIZE]; 578c2ecf20Sopenharmony_cichar buf1[LINESIZE]; 588c2ecf20Sopenharmony_cichar buf2[LINESIZE]; 598c2ecf20Sopenharmony_cichar buf3[LINESIZE]; 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ciconst char *argv0; 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 648c2ecf20Sopenharmony_ci 658c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 668c2ecf20Sopenharmony_ci 678c2ecf20Sopenharmony_ci/* 688c2ecf20Sopenharmony_ci * Unicode version numbers consist of three parts: major, minor, and a 698c2ecf20Sopenharmony_ci * revision. These numbers are packed into an unsigned int to obtain 708c2ecf20Sopenharmony_ci * a single version number. 718c2ecf20Sopenharmony_ci * 728c2ecf20Sopenharmony_ci * To save space in the generated trie, the unicode version is not 738c2ecf20Sopenharmony_ci * stored directly, instead we calculate a generation number from the 748c2ecf20Sopenharmony_ci * unicode versions seen in the DerivedAge file, and use that as an 758c2ecf20Sopenharmony_ci * index into a table of unicode versions. 768c2ecf20Sopenharmony_ci */ 778c2ecf20Sopenharmony_ci#define UNICODE_MAJ_SHIFT (16) 788c2ecf20Sopenharmony_ci#define UNICODE_MIN_SHIFT (8) 798c2ecf20Sopenharmony_ci 808c2ecf20Sopenharmony_ci#define UNICODE_MAJ_MAX ((unsigned short)-1) 818c2ecf20Sopenharmony_ci#define UNICODE_MIN_MAX ((unsigned char)-1) 828c2ecf20Sopenharmony_ci#define UNICODE_REV_MAX ((unsigned char)-1) 838c2ecf20Sopenharmony_ci 848c2ecf20Sopenharmony_ci#define UNICODE_AGE(MAJ,MIN,REV) \ 858c2ecf20Sopenharmony_ci (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ 868c2ecf20Sopenharmony_ci ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ 878c2ecf20Sopenharmony_ci ((unsigned int)(REV))) 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ciunsigned int *ages; 908c2ecf20Sopenharmony_ciint ages_count; 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ciunsigned int unicode_maxage; 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_cistatic int age_valid(unsigned int major, unsigned int minor, 958c2ecf20Sopenharmony_ci unsigned int revision) 968c2ecf20Sopenharmony_ci{ 978c2ecf20Sopenharmony_ci if (major > UNICODE_MAJ_MAX) 988c2ecf20Sopenharmony_ci return 0; 998c2ecf20Sopenharmony_ci if (minor > UNICODE_MIN_MAX) 1008c2ecf20Sopenharmony_ci return 0; 1018c2ecf20Sopenharmony_ci if (revision > UNICODE_REV_MAX) 1028c2ecf20Sopenharmony_ci return 0; 1038c2ecf20Sopenharmony_ci return 1; 1048c2ecf20Sopenharmony_ci} 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci/* 1098c2ecf20Sopenharmony_ci * utf8trie_t 1108c2ecf20Sopenharmony_ci * 1118c2ecf20Sopenharmony_ci * A compact binary tree, used to decode UTF-8 characters. 1128c2ecf20Sopenharmony_ci * 1138c2ecf20Sopenharmony_ci * Internal nodes are one byte for the node itself, and up to three 1148c2ecf20Sopenharmony_ci * bytes for an offset into the tree. The first byte contains the 1158c2ecf20Sopenharmony_ci * following information: 1168c2ecf20Sopenharmony_ci * NEXTBYTE - flag - advance to next byte if set 1178c2ecf20Sopenharmony_ci * BITNUM - 3 bit field - the bit number to tested 1188c2ecf20Sopenharmony_ci * OFFLEN - 2 bit field - number of bytes in the offset 1198c2ecf20Sopenharmony_ci * if offlen == 0 (non-branching node) 1208c2ecf20Sopenharmony_ci * RIGHTPATH - 1 bit field - set if the following node is for the 1218c2ecf20Sopenharmony_ci * right-hand path (tested bit is set) 1228c2ecf20Sopenharmony_ci * TRIENODE - 1 bit field - set if the following node is an internal 1238c2ecf20Sopenharmony_ci * node, otherwise it is a leaf node 1248c2ecf20Sopenharmony_ci * if offlen != 0 (branching node) 1258c2ecf20Sopenharmony_ci * LEFTNODE - 1 bit field - set if the left-hand node is internal 1268c2ecf20Sopenharmony_ci * RIGHTNODE - 1 bit field - set if the right-hand node is internal 1278c2ecf20Sopenharmony_ci * 1288c2ecf20Sopenharmony_ci * Due to the way utf8 works, there cannot be branching nodes with 1298c2ecf20Sopenharmony_ci * NEXTBYTE set, and moreover those nodes always have a righthand 1308c2ecf20Sopenharmony_ci * descendant. 1318c2ecf20Sopenharmony_ci */ 1328c2ecf20Sopenharmony_citypedef unsigned char utf8trie_t; 1338c2ecf20Sopenharmony_ci#define BITNUM 0x07 1348c2ecf20Sopenharmony_ci#define NEXTBYTE 0x08 1358c2ecf20Sopenharmony_ci#define OFFLEN 0x30 1368c2ecf20Sopenharmony_ci#define OFFLEN_SHIFT 4 1378c2ecf20Sopenharmony_ci#define RIGHTPATH 0x40 1388c2ecf20Sopenharmony_ci#define TRIENODE 0x80 1398c2ecf20Sopenharmony_ci#define RIGHTNODE 0x40 1408c2ecf20Sopenharmony_ci#define LEFTNODE 0x80 1418c2ecf20Sopenharmony_ci 1428c2ecf20Sopenharmony_ci/* 1438c2ecf20Sopenharmony_ci * utf8leaf_t 1448c2ecf20Sopenharmony_ci * 1458c2ecf20Sopenharmony_ci * The leaves of the trie are embedded in the trie, and so the same 1468c2ecf20Sopenharmony_ci * underlying datatype, unsigned char. 1478c2ecf20Sopenharmony_ci * 1488c2ecf20Sopenharmony_ci * leaf[0]: The unicode version, stored as a generation number that is 1498c2ecf20Sopenharmony_ci * an index into utf8agetab[]. With this we can filter code 1508c2ecf20Sopenharmony_ci * points based on the unicode version in which they were 1518c2ecf20Sopenharmony_ci * defined. The CCC of a non-defined code point is 0. 1528c2ecf20Sopenharmony_ci * leaf[1]: Canonical Combining Class. During normalization, we need 1538c2ecf20Sopenharmony_ci * to do a stable sort into ascending order of all characters 1548c2ecf20Sopenharmony_ci * with a non-zero CCC that occur between two characters with 1558c2ecf20Sopenharmony_ci * a CCC of 0, or at the begin or end of a string. 1568c2ecf20Sopenharmony_ci * The unicode standard guarantees that all CCC values are 1578c2ecf20Sopenharmony_ci * between 0 and 254 inclusive, which leaves 255 available as 1588c2ecf20Sopenharmony_ci * a special value. 1598c2ecf20Sopenharmony_ci * Code points with CCC 0 are known as stoppers. 1608c2ecf20Sopenharmony_ci * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the 1618c2ecf20Sopenharmony_ci * start of a NUL-terminated string that is the decomposition 1628c2ecf20Sopenharmony_ci * of the character. 1638c2ecf20Sopenharmony_ci * The CCC of a decomposable character is the same as the CCC 1648c2ecf20Sopenharmony_ci * of the first character of its decomposition. 1658c2ecf20Sopenharmony_ci * Some characters decompose as the empty string: these are 1668c2ecf20Sopenharmony_ci * characters with the Default_Ignorable_Code_Point property. 1678c2ecf20Sopenharmony_ci * These do affect normalization, as they all have CCC 0. 1688c2ecf20Sopenharmony_ci * 1698c2ecf20Sopenharmony_ci * The decompositions in the trie have been fully expanded. 1708c2ecf20Sopenharmony_ci * 1718c2ecf20Sopenharmony_ci * Casefolding, if applicable, is also done using decompositions. 1728c2ecf20Sopenharmony_ci */ 1738c2ecf20Sopenharmony_citypedef unsigned char utf8leaf_t; 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ci#define LEAF_GEN(LEAF) ((LEAF)[0]) 1768c2ecf20Sopenharmony_ci#define LEAF_CCC(LEAF) ((LEAF)[1]) 1778c2ecf20Sopenharmony_ci#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci#define MAXGEN (255) 1808c2ecf20Sopenharmony_ci 1818c2ecf20Sopenharmony_ci#define MINCCC (0) 1828c2ecf20Sopenharmony_ci#define MAXCCC (254) 1838c2ecf20Sopenharmony_ci#define STOPPER (0) 1848c2ecf20Sopenharmony_ci#define DECOMPOSE (255) 1858c2ecf20Sopenharmony_ci#define HANGUL ((char)(255)) 1868c2ecf20Sopenharmony_ci 1878c2ecf20Sopenharmony_ci#define UTF8HANGULLEAF (12) 1888c2ecf20Sopenharmony_ci 1898c2ecf20Sopenharmony_cistruct tree; 1908c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, 1918c2ecf20Sopenharmony_ci const char *, size_t); 1928c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); 1938c2ecf20Sopenharmony_ci 1948c2ecf20Sopenharmony_ciunsigned char *utf8data; 1958c2ecf20Sopenharmony_cisize_t utf8data_size; 1968c2ecf20Sopenharmony_ci 1978c2ecf20Sopenharmony_ciutf8trie_t *nfdi; 1988c2ecf20Sopenharmony_ciutf8trie_t *nfdicf; 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci/* 2038c2ecf20Sopenharmony_ci * UTF8 valid ranges. 2048c2ecf20Sopenharmony_ci * 2058c2ecf20Sopenharmony_ci * The UTF-8 encoding spreads the bits of a 32bit word over several 2068c2ecf20Sopenharmony_ci * bytes. This table gives the ranges that can be held and how they'd 2078c2ecf20Sopenharmony_ci * be represented. 2088c2ecf20Sopenharmony_ci * 2098c2ecf20Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx 2108c2ecf20Sopenharmony_ci * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx 2118c2ecf20Sopenharmony_ci * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 2128c2ecf20Sopenharmony_ci * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 2138c2ecf20Sopenharmony_ci * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 2148c2ecf20Sopenharmony_ci * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 2158c2ecf20Sopenharmony_ci * 2168c2ecf20Sopenharmony_ci * There is an additional requirement on UTF-8, in that only the 2178c2ecf20Sopenharmony_ci * shortest representation of a 32bit value is to be used. A decoder 2188c2ecf20Sopenharmony_ci * must not decode sequences that do not satisfy this requirement. 2198c2ecf20Sopenharmony_ci * Thus the allowed ranges have a lower bound. 2208c2ecf20Sopenharmony_ci * 2218c2ecf20Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx 2228c2ecf20Sopenharmony_ci * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx 2238c2ecf20Sopenharmony_ci * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 2248c2ecf20Sopenharmony_ci * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 2258c2ecf20Sopenharmony_ci * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 2268c2ecf20Sopenharmony_ci * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 2278c2ecf20Sopenharmony_ci * 2288c2ecf20Sopenharmony_ci * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, 2298c2ecf20Sopenharmony_ci * 17 planes of 65536 values. This limits the sequences actually seen 2308c2ecf20Sopenharmony_ci * even more, to just the following. 2318c2ecf20Sopenharmony_ci * 2328c2ecf20Sopenharmony_ci * 0 - 0x7f: 0 0x7f 2338c2ecf20Sopenharmony_ci * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf 2348c2ecf20Sopenharmony_ci * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf 2358c2ecf20Sopenharmony_ci * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf 2368c2ecf20Sopenharmony_ci * 2378c2ecf20Sopenharmony_ci * Even within those ranges not all values are allowed: the surrogates 2388c2ecf20Sopenharmony_ci * 0xd800 - 0xdfff should never be seen. 2398c2ecf20Sopenharmony_ci * 2408c2ecf20Sopenharmony_ci * Note that the longest sequence seen with valid usage is 4 bytes, 2418c2ecf20Sopenharmony_ci * the same a single UTF-32 character. This makes the UTF-8 2428c2ecf20Sopenharmony_ci * representation of Unicode strictly smaller than UTF-32. 2438c2ecf20Sopenharmony_ci * 2448c2ecf20Sopenharmony_ci * The shortest sequence requirement was introduced by: 2458c2ecf20Sopenharmony_ci * Corrigendum #1: UTF-8 Shortest Form 2468c2ecf20Sopenharmony_ci * It can be found here: 2478c2ecf20Sopenharmony_ci * http://www.unicode.org/versions/corrigendum1.html 2488c2ecf20Sopenharmony_ci * 2498c2ecf20Sopenharmony_ci */ 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_ci#define UTF8_2_BITS 0xC0 2528c2ecf20Sopenharmony_ci#define UTF8_3_BITS 0xE0 2538c2ecf20Sopenharmony_ci#define UTF8_4_BITS 0xF0 2548c2ecf20Sopenharmony_ci#define UTF8_N_BITS 0x80 2558c2ecf20Sopenharmony_ci#define UTF8_2_MASK 0xE0 2568c2ecf20Sopenharmony_ci#define UTF8_3_MASK 0xF0 2578c2ecf20Sopenharmony_ci#define UTF8_4_MASK 0xF8 2588c2ecf20Sopenharmony_ci#define UTF8_N_MASK 0xC0 2598c2ecf20Sopenharmony_ci#define UTF8_V_MASK 0x3F 2608c2ecf20Sopenharmony_ci#define UTF8_V_SHIFT 6 2618c2ecf20Sopenharmony_ci 2628c2ecf20Sopenharmony_cistatic int utf8encode(char *str, unsigned int val) 2638c2ecf20Sopenharmony_ci{ 2648c2ecf20Sopenharmony_ci int len; 2658c2ecf20Sopenharmony_ci 2668c2ecf20Sopenharmony_ci if (val < 0x80) { 2678c2ecf20Sopenharmony_ci str[0] = val; 2688c2ecf20Sopenharmony_ci len = 1; 2698c2ecf20Sopenharmony_ci } else if (val < 0x800) { 2708c2ecf20Sopenharmony_ci str[1] = val & UTF8_V_MASK; 2718c2ecf20Sopenharmony_ci str[1] |= UTF8_N_BITS; 2728c2ecf20Sopenharmony_ci val >>= UTF8_V_SHIFT; 2738c2ecf20Sopenharmony_ci str[0] = val; 2748c2ecf20Sopenharmony_ci str[0] |= UTF8_2_BITS; 2758c2ecf20Sopenharmony_ci len = 2; 2768c2ecf20Sopenharmony_ci } else if (val < 0x10000) { 2778c2ecf20Sopenharmony_ci str[2] = val & UTF8_V_MASK; 2788c2ecf20Sopenharmony_ci str[2] |= UTF8_N_BITS; 2798c2ecf20Sopenharmony_ci val >>= UTF8_V_SHIFT; 2808c2ecf20Sopenharmony_ci str[1] = val & UTF8_V_MASK; 2818c2ecf20Sopenharmony_ci str[1] |= UTF8_N_BITS; 2828c2ecf20Sopenharmony_ci val >>= UTF8_V_SHIFT; 2838c2ecf20Sopenharmony_ci str[0] = val; 2848c2ecf20Sopenharmony_ci str[0] |= UTF8_3_BITS; 2858c2ecf20Sopenharmony_ci len = 3; 2868c2ecf20Sopenharmony_ci } else if (val < 0x110000) { 2878c2ecf20Sopenharmony_ci str[3] = val & UTF8_V_MASK; 2888c2ecf20Sopenharmony_ci str[3] |= UTF8_N_BITS; 2898c2ecf20Sopenharmony_ci val >>= UTF8_V_SHIFT; 2908c2ecf20Sopenharmony_ci str[2] = val & UTF8_V_MASK; 2918c2ecf20Sopenharmony_ci str[2] |= UTF8_N_BITS; 2928c2ecf20Sopenharmony_ci val >>= UTF8_V_SHIFT; 2938c2ecf20Sopenharmony_ci str[1] = val & UTF8_V_MASK; 2948c2ecf20Sopenharmony_ci str[1] |= UTF8_N_BITS; 2958c2ecf20Sopenharmony_ci val >>= UTF8_V_SHIFT; 2968c2ecf20Sopenharmony_ci str[0] = val; 2978c2ecf20Sopenharmony_ci str[0] |= UTF8_4_BITS; 2988c2ecf20Sopenharmony_ci len = 4; 2998c2ecf20Sopenharmony_ci } else { 3008c2ecf20Sopenharmony_ci printf("%#x: illegal val\n", val); 3018c2ecf20Sopenharmony_ci len = 0; 3028c2ecf20Sopenharmony_ci } 3038c2ecf20Sopenharmony_ci return len; 3048c2ecf20Sopenharmony_ci} 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_cistatic unsigned int utf8decode(const char *str) 3078c2ecf20Sopenharmony_ci{ 3088c2ecf20Sopenharmony_ci const unsigned char *s = (const unsigned char*)str; 3098c2ecf20Sopenharmony_ci unsigned int unichar = 0; 3108c2ecf20Sopenharmony_ci 3118c2ecf20Sopenharmony_ci if (*s < 0x80) { 3128c2ecf20Sopenharmony_ci unichar = *s; 3138c2ecf20Sopenharmony_ci } else if (*s < UTF8_3_BITS) { 3148c2ecf20Sopenharmony_ci unichar = *s++ & 0x1F; 3158c2ecf20Sopenharmony_ci unichar <<= UTF8_V_SHIFT; 3168c2ecf20Sopenharmony_ci unichar |= *s & 0x3F; 3178c2ecf20Sopenharmony_ci } else if (*s < UTF8_4_BITS) { 3188c2ecf20Sopenharmony_ci unichar = *s++ & 0x0F; 3198c2ecf20Sopenharmony_ci unichar <<= UTF8_V_SHIFT; 3208c2ecf20Sopenharmony_ci unichar |= *s++ & 0x3F; 3218c2ecf20Sopenharmony_ci unichar <<= UTF8_V_SHIFT; 3228c2ecf20Sopenharmony_ci unichar |= *s & 0x3F; 3238c2ecf20Sopenharmony_ci } else { 3248c2ecf20Sopenharmony_ci unichar = *s++ & 0x0F; 3258c2ecf20Sopenharmony_ci unichar <<= UTF8_V_SHIFT; 3268c2ecf20Sopenharmony_ci unichar |= *s++ & 0x3F; 3278c2ecf20Sopenharmony_ci unichar <<= UTF8_V_SHIFT; 3288c2ecf20Sopenharmony_ci unichar |= *s++ & 0x3F; 3298c2ecf20Sopenharmony_ci unichar <<= UTF8_V_SHIFT; 3308c2ecf20Sopenharmony_ci unichar |= *s & 0x3F; 3318c2ecf20Sopenharmony_ci } 3328c2ecf20Sopenharmony_ci return unichar; 3338c2ecf20Sopenharmony_ci} 3348c2ecf20Sopenharmony_ci 3358c2ecf20Sopenharmony_cistatic int utf32valid(unsigned int unichar) 3368c2ecf20Sopenharmony_ci{ 3378c2ecf20Sopenharmony_ci return unichar < 0x110000; 3388c2ecf20Sopenharmony_ci} 3398c2ecf20Sopenharmony_ci 3408c2ecf20Sopenharmony_ci#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci#define NODE 1 3438c2ecf20Sopenharmony_ci#define LEAF 0 3448c2ecf20Sopenharmony_ci 3458c2ecf20Sopenharmony_cistruct tree { 3468c2ecf20Sopenharmony_ci void *root; 3478c2ecf20Sopenharmony_ci int childnode; 3488c2ecf20Sopenharmony_ci const char *type; 3498c2ecf20Sopenharmony_ci unsigned int maxage; 3508c2ecf20Sopenharmony_ci struct tree *next; 3518c2ecf20Sopenharmony_ci int (*leaf_equal)(void *, void *); 3528c2ecf20Sopenharmony_ci void (*leaf_print)(void *, int); 3538c2ecf20Sopenharmony_ci int (*leaf_mark)(void *); 3548c2ecf20Sopenharmony_ci int (*leaf_size)(void *); 3558c2ecf20Sopenharmony_ci int *(*leaf_index)(struct tree *, void *); 3568c2ecf20Sopenharmony_ci unsigned char *(*leaf_emit)(void *, unsigned char *); 3578c2ecf20Sopenharmony_ci int leafindex[0x110000]; 3588c2ecf20Sopenharmony_ci int index; 3598c2ecf20Sopenharmony_ci}; 3608c2ecf20Sopenharmony_ci 3618c2ecf20Sopenharmony_cistruct node { 3628c2ecf20Sopenharmony_ci int index; 3638c2ecf20Sopenharmony_ci int offset; 3648c2ecf20Sopenharmony_ci int mark; 3658c2ecf20Sopenharmony_ci int size; 3668c2ecf20Sopenharmony_ci struct node *parent; 3678c2ecf20Sopenharmony_ci void *left; 3688c2ecf20Sopenharmony_ci void *right; 3698c2ecf20Sopenharmony_ci unsigned char bitnum; 3708c2ecf20Sopenharmony_ci unsigned char nextbyte; 3718c2ecf20Sopenharmony_ci unsigned char leftnode; 3728c2ecf20Sopenharmony_ci unsigned char rightnode; 3738c2ecf20Sopenharmony_ci unsigned int keybits; 3748c2ecf20Sopenharmony_ci unsigned int keymask; 3758c2ecf20Sopenharmony_ci}; 3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci/* 3788c2ecf20Sopenharmony_ci * Example lookup function for a tree. 3798c2ecf20Sopenharmony_ci */ 3808c2ecf20Sopenharmony_cistatic void *lookup(struct tree *tree, const char *key) 3818c2ecf20Sopenharmony_ci{ 3828c2ecf20Sopenharmony_ci struct node *node; 3838c2ecf20Sopenharmony_ci void *leaf = NULL; 3848c2ecf20Sopenharmony_ci 3858c2ecf20Sopenharmony_ci node = tree->root; 3868c2ecf20Sopenharmony_ci while (!leaf && node) { 3878c2ecf20Sopenharmony_ci if (node->nextbyte) 3888c2ecf20Sopenharmony_ci key++; 3898c2ecf20Sopenharmony_ci if (*key & (1 << (node->bitnum & 7))) { 3908c2ecf20Sopenharmony_ci /* Right leg */ 3918c2ecf20Sopenharmony_ci if (node->rightnode == NODE) { 3928c2ecf20Sopenharmony_ci node = node->right; 3938c2ecf20Sopenharmony_ci } else if (node->rightnode == LEAF) { 3948c2ecf20Sopenharmony_ci leaf = node->right; 3958c2ecf20Sopenharmony_ci } else { 3968c2ecf20Sopenharmony_ci node = NULL; 3978c2ecf20Sopenharmony_ci } 3988c2ecf20Sopenharmony_ci } else { 3998c2ecf20Sopenharmony_ci /* Left leg */ 4008c2ecf20Sopenharmony_ci if (node->leftnode == NODE) { 4018c2ecf20Sopenharmony_ci node = node->left; 4028c2ecf20Sopenharmony_ci } else if (node->leftnode == LEAF) { 4038c2ecf20Sopenharmony_ci leaf = node->left; 4048c2ecf20Sopenharmony_ci } else { 4058c2ecf20Sopenharmony_ci node = NULL; 4068c2ecf20Sopenharmony_ci } 4078c2ecf20Sopenharmony_ci } 4088c2ecf20Sopenharmony_ci } 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_ci return leaf; 4118c2ecf20Sopenharmony_ci} 4128c2ecf20Sopenharmony_ci 4138c2ecf20Sopenharmony_ci/* 4148c2ecf20Sopenharmony_ci * A simple non-recursive tree walker: keep track of visits to the 4158c2ecf20Sopenharmony_ci * left and right branches in the leftmask and rightmask. 4168c2ecf20Sopenharmony_ci */ 4178c2ecf20Sopenharmony_cistatic void tree_walk(struct tree *tree) 4188c2ecf20Sopenharmony_ci{ 4198c2ecf20Sopenharmony_ci struct node *node; 4208c2ecf20Sopenharmony_ci unsigned int leftmask; 4218c2ecf20Sopenharmony_ci unsigned int rightmask; 4228c2ecf20Sopenharmony_ci unsigned int bitmask; 4238c2ecf20Sopenharmony_ci int indent = 1; 4248c2ecf20Sopenharmony_ci int nodes, singletons, leaves; 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci nodes = singletons = leaves = 0; 4278c2ecf20Sopenharmony_ci 4288c2ecf20Sopenharmony_ci printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); 4298c2ecf20Sopenharmony_ci if (tree->childnode == LEAF) { 4308c2ecf20Sopenharmony_ci assert(tree->root); 4318c2ecf20Sopenharmony_ci tree->leaf_print(tree->root, indent); 4328c2ecf20Sopenharmony_ci leaves = 1; 4338c2ecf20Sopenharmony_ci } else { 4348c2ecf20Sopenharmony_ci assert(tree->childnode == NODE); 4358c2ecf20Sopenharmony_ci node = tree->root; 4368c2ecf20Sopenharmony_ci leftmask = rightmask = 0; 4378c2ecf20Sopenharmony_ci while (node) { 4388c2ecf20Sopenharmony_ci printf("%*snode @ %p bitnum %d nextbyte %d" 4398c2ecf20Sopenharmony_ci " left %p right %p mask %x bits %x\n", 4408c2ecf20Sopenharmony_ci indent, "", node, 4418c2ecf20Sopenharmony_ci node->bitnum, node->nextbyte, 4428c2ecf20Sopenharmony_ci node->left, node->right, 4438c2ecf20Sopenharmony_ci node->keymask, node->keybits); 4448c2ecf20Sopenharmony_ci nodes += 1; 4458c2ecf20Sopenharmony_ci if (!(node->left && node->right)) 4468c2ecf20Sopenharmony_ci singletons += 1; 4478c2ecf20Sopenharmony_ci 4488c2ecf20Sopenharmony_ci while (node) { 4498c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 4508c2ecf20Sopenharmony_ci if ((leftmask & bitmask) == 0) { 4518c2ecf20Sopenharmony_ci leftmask |= bitmask; 4528c2ecf20Sopenharmony_ci if (node->leftnode == LEAF) { 4538c2ecf20Sopenharmony_ci assert(node->left); 4548c2ecf20Sopenharmony_ci tree->leaf_print(node->left, 4558c2ecf20Sopenharmony_ci indent+1); 4568c2ecf20Sopenharmony_ci leaves += 1; 4578c2ecf20Sopenharmony_ci } else if (node->left) { 4588c2ecf20Sopenharmony_ci assert(node->leftnode == NODE); 4598c2ecf20Sopenharmony_ci indent += 1; 4608c2ecf20Sopenharmony_ci node = node->left; 4618c2ecf20Sopenharmony_ci break; 4628c2ecf20Sopenharmony_ci } 4638c2ecf20Sopenharmony_ci } 4648c2ecf20Sopenharmony_ci if ((rightmask & bitmask) == 0) { 4658c2ecf20Sopenharmony_ci rightmask |= bitmask; 4668c2ecf20Sopenharmony_ci if (node->rightnode == LEAF) { 4678c2ecf20Sopenharmony_ci assert(node->right); 4688c2ecf20Sopenharmony_ci tree->leaf_print(node->right, 4698c2ecf20Sopenharmony_ci indent+1); 4708c2ecf20Sopenharmony_ci leaves += 1; 4718c2ecf20Sopenharmony_ci } else if (node->right) { 4728c2ecf20Sopenharmony_ci assert(node->rightnode == NODE); 4738c2ecf20Sopenharmony_ci indent += 1; 4748c2ecf20Sopenharmony_ci node = node->right; 4758c2ecf20Sopenharmony_ci break; 4768c2ecf20Sopenharmony_ci } 4778c2ecf20Sopenharmony_ci } 4788c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 4798c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 4808c2ecf20Sopenharmony_ci node = node->parent; 4818c2ecf20Sopenharmony_ci indent -= 1; 4828c2ecf20Sopenharmony_ci } 4838c2ecf20Sopenharmony_ci } 4848c2ecf20Sopenharmony_ci } 4858c2ecf20Sopenharmony_ci printf("nodes %d leaves %d singletons %d\n", 4868c2ecf20Sopenharmony_ci nodes, leaves, singletons); 4878c2ecf20Sopenharmony_ci} 4888c2ecf20Sopenharmony_ci 4898c2ecf20Sopenharmony_ci/* 4908c2ecf20Sopenharmony_ci * Allocate an initialize a new internal node. 4918c2ecf20Sopenharmony_ci */ 4928c2ecf20Sopenharmony_cistatic struct node *alloc_node(struct node *parent) 4938c2ecf20Sopenharmony_ci{ 4948c2ecf20Sopenharmony_ci struct node *node; 4958c2ecf20Sopenharmony_ci int bitnum; 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_ci node = malloc(sizeof(*node)); 4988c2ecf20Sopenharmony_ci node->left = node->right = NULL; 4998c2ecf20Sopenharmony_ci node->parent = parent; 5008c2ecf20Sopenharmony_ci node->leftnode = NODE; 5018c2ecf20Sopenharmony_ci node->rightnode = NODE; 5028c2ecf20Sopenharmony_ci node->keybits = 0; 5038c2ecf20Sopenharmony_ci node->keymask = 0; 5048c2ecf20Sopenharmony_ci node->mark = 0; 5058c2ecf20Sopenharmony_ci node->index = 0; 5068c2ecf20Sopenharmony_ci node->offset = -1; 5078c2ecf20Sopenharmony_ci node->size = 4; 5088c2ecf20Sopenharmony_ci 5098c2ecf20Sopenharmony_ci if (node->parent) { 5108c2ecf20Sopenharmony_ci bitnum = parent->bitnum; 5118c2ecf20Sopenharmony_ci if ((bitnum & 7) == 0) { 5128c2ecf20Sopenharmony_ci node->bitnum = bitnum + 7 + 8; 5138c2ecf20Sopenharmony_ci node->nextbyte = 1; 5148c2ecf20Sopenharmony_ci } else { 5158c2ecf20Sopenharmony_ci node->bitnum = bitnum - 1; 5168c2ecf20Sopenharmony_ci node->nextbyte = 0; 5178c2ecf20Sopenharmony_ci } 5188c2ecf20Sopenharmony_ci } else { 5198c2ecf20Sopenharmony_ci node->bitnum = 7; 5208c2ecf20Sopenharmony_ci node->nextbyte = 0; 5218c2ecf20Sopenharmony_ci } 5228c2ecf20Sopenharmony_ci 5238c2ecf20Sopenharmony_ci return node; 5248c2ecf20Sopenharmony_ci} 5258c2ecf20Sopenharmony_ci 5268c2ecf20Sopenharmony_ci/* 5278c2ecf20Sopenharmony_ci * Insert a new leaf into the tree, and collapse any subtrees that are 5288c2ecf20Sopenharmony_ci * fully populated and end in identical leaves. A nextbyte tagged 5298c2ecf20Sopenharmony_ci * internal node will not be removed to preserve the tree's integrity. 5308c2ecf20Sopenharmony_ci * Note that due to the structure of utf8, no nextbyte tagged node 5318c2ecf20Sopenharmony_ci * will be a candidate for removal. 5328c2ecf20Sopenharmony_ci */ 5338c2ecf20Sopenharmony_cistatic int insert(struct tree *tree, char *key, int keylen, void *leaf) 5348c2ecf20Sopenharmony_ci{ 5358c2ecf20Sopenharmony_ci struct node *node; 5368c2ecf20Sopenharmony_ci struct node *parent; 5378c2ecf20Sopenharmony_ci void **cursor; 5388c2ecf20Sopenharmony_ci int keybits; 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci assert(keylen >= 1 && keylen <= 4); 5418c2ecf20Sopenharmony_ci 5428c2ecf20Sopenharmony_ci node = NULL; 5438c2ecf20Sopenharmony_ci cursor = &tree->root; 5448c2ecf20Sopenharmony_ci keybits = 8 * keylen; 5458c2ecf20Sopenharmony_ci 5468c2ecf20Sopenharmony_ci /* Insert, creating path along the way. */ 5478c2ecf20Sopenharmony_ci while (keybits) { 5488c2ecf20Sopenharmony_ci if (!*cursor) 5498c2ecf20Sopenharmony_ci *cursor = alloc_node(node); 5508c2ecf20Sopenharmony_ci node = *cursor; 5518c2ecf20Sopenharmony_ci if (node->nextbyte) 5528c2ecf20Sopenharmony_ci key++; 5538c2ecf20Sopenharmony_ci if (*key & (1 << (node->bitnum & 7))) 5548c2ecf20Sopenharmony_ci cursor = &node->right; 5558c2ecf20Sopenharmony_ci else 5568c2ecf20Sopenharmony_ci cursor = &node->left; 5578c2ecf20Sopenharmony_ci keybits--; 5588c2ecf20Sopenharmony_ci } 5598c2ecf20Sopenharmony_ci *cursor = leaf; 5608c2ecf20Sopenharmony_ci 5618c2ecf20Sopenharmony_ci /* Merge subtrees if possible. */ 5628c2ecf20Sopenharmony_ci while (node) { 5638c2ecf20Sopenharmony_ci if (*key & (1 << (node->bitnum & 7))) 5648c2ecf20Sopenharmony_ci node->rightnode = LEAF; 5658c2ecf20Sopenharmony_ci else 5668c2ecf20Sopenharmony_ci node->leftnode = LEAF; 5678c2ecf20Sopenharmony_ci if (node->nextbyte) 5688c2ecf20Sopenharmony_ci break; 5698c2ecf20Sopenharmony_ci if (node->leftnode == NODE || node->rightnode == NODE) 5708c2ecf20Sopenharmony_ci break; 5718c2ecf20Sopenharmony_ci assert(node->left); 5728c2ecf20Sopenharmony_ci assert(node->right); 5738c2ecf20Sopenharmony_ci /* Compare */ 5748c2ecf20Sopenharmony_ci if (! tree->leaf_equal(node->left, node->right)) 5758c2ecf20Sopenharmony_ci break; 5768c2ecf20Sopenharmony_ci /* Keep left, drop right leaf. */ 5778c2ecf20Sopenharmony_ci leaf = node->left; 5788c2ecf20Sopenharmony_ci /* Check in parent */ 5798c2ecf20Sopenharmony_ci parent = node->parent; 5808c2ecf20Sopenharmony_ci if (!parent) { 5818c2ecf20Sopenharmony_ci /* root of tree! */ 5828c2ecf20Sopenharmony_ci tree->root = leaf; 5838c2ecf20Sopenharmony_ci tree->childnode = LEAF; 5848c2ecf20Sopenharmony_ci } else if (parent->left == node) { 5858c2ecf20Sopenharmony_ci parent->left = leaf; 5868c2ecf20Sopenharmony_ci parent->leftnode = LEAF; 5878c2ecf20Sopenharmony_ci if (parent->right) { 5888c2ecf20Sopenharmony_ci parent->keymask = 0; 5898c2ecf20Sopenharmony_ci parent->keybits = 0; 5908c2ecf20Sopenharmony_ci } else { 5918c2ecf20Sopenharmony_ci parent->keymask |= (1 << node->bitnum); 5928c2ecf20Sopenharmony_ci } 5938c2ecf20Sopenharmony_ci } else if (parent->right == node) { 5948c2ecf20Sopenharmony_ci parent->right = leaf; 5958c2ecf20Sopenharmony_ci parent->rightnode = LEAF; 5968c2ecf20Sopenharmony_ci if (parent->left) { 5978c2ecf20Sopenharmony_ci parent->keymask = 0; 5988c2ecf20Sopenharmony_ci parent->keybits = 0; 5998c2ecf20Sopenharmony_ci } else { 6008c2ecf20Sopenharmony_ci parent->keymask |= (1 << node->bitnum); 6018c2ecf20Sopenharmony_ci parent->keybits |= (1 << node->bitnum); 6028c2ecf20Sopenharmony_ci } 6038c2ecf20Sopenharmony_ci } else { 6048c2ecf20Sopenharmony_ci /* internal tree error */ 6058c2ecf20Sopenharmony_ci assert(0); 6068c2ecf20Sopenharmony_ci } 6078c2ecf20Sopenharmony_ci free(node); 6088c2ecf20Sopenharmony_ci node = parent; 6098c2ecf20Sopenharmony_ci } 6108c2ecf20Sopenharmony_ci 6118c2ecf20Sopenharmony_ci /* Propagate keymasks up along singleton chains. */ 6128c2ecf20Sopenharmony_ci while (node) { 6138c2ecf20Sopenharmony_ci parent = node->parent; 6148c2ecf20Sopenharmony_ci if (!parent) 6158c2ecf20Sopenharmony_ci break; 6168c2ecf20Sopenharmony_ci /* Nix the mask for parents with two children. */ 6178c2ecf20Sopenharmony_ci if (node->keymask == 0) { 6188c2ecf20Sopenharmony_ci parent->keymask = 0; 6198c2ecf20Sopenharmony_ci parent->keybits = 0; 6208c2ecf20Sopenharmony_ci } else if (parent->left && parent->right) { 6218c2ecf20Sopenharmony_ci parent->keymask = 0; 6228c2ecf20Sopenharmony_ci parent->keybits = 0; 6238c2ecf20Sopenharmony_ci } else { 6248c2ecf20Sopenharmony_ci assert((parent->keymask & node->keymask) == 0); 6258c2ecf20Sopenharmony_ci parent->keymask |= node->keymask; 6268c2ecf20Sopenharmony_ci parent->keymask |= (1 << parent->bitnum); 6278c2ecf20Sopenharmony_ci parent->keybits |= node->keybits; 6288c2ecf20Sopenharmony_ci if (parent->right) 6298c2ecf20Sopenharmony_ci parent->keybits |= (1 << parent->bitnum); 6308c2ecf20Sopenharmony_ci } 6318c2ecf20Sopenharmony_ci node = parent; 6328c2ecf20Sopenharmony_ci } 6338c2ecf20Sopenharmony_ci 6348c2ecf20Sopenharmony_ci return 0; 6358c2ecf20Sopenharmony_ci} 6368c2ecf20Sopenharmony_ci 6378c2ecf20Sopenharmony_ci/* 6388c2ecf20Sopenharmony_ci * Prune internal nodes. 6398c2ecf20Sopenharmony_ci * 6408c2ecf20Sopenharmony_ci * Fully populated subtrees that end at the same leaf have already 6418c2ecf20Sopenharmony_ci * been collapsed. There are still internal nodes that have for both 6428c2ecf20Sopenharmony_ci * their left and right branches a sequence of singletons that make 6438c2ecf20Sopenharmony_ci * identical choices and end in identical leaves. The keymask and 6448c2ecf20Sopenharmony_ci * keybits collected in the nodes describe the choices made in these 6458c2ecf20Sopenharmony_ci * singleton chains. When they are identical for the left and right 6468c2ecf20Sopenharmony_ci * branch of a node, and the two leaves comare identical, the node in 6478c2ecf20Sopenharmony_ci * question can be removed. 6488c2ecf20Sopenharmony_ci * 6498c2ecf20Sopenharmony_ci * Note that nodes with the nextbyte tag set will not be removed by 6508c2ecf20Sopenharmony_ci * this to ensure tree integrity. Note as well that the structure of 6518c2ecf20Sopenharmony_ci * utf8 ensures that these nodes would not have been candidates for 6528c2ecf20Sopenharmony_ci * removal in any case. 6538c2ecf20Sopenharmony_ci */ 6548c2ecf20Sopenharmony_cistatic void prune(struct tree *tree) 6558c2ecf20Sopenharmony_ci{ 6568c2ecf20Sopenharmony_ci struct node *node; 6578c2ecf20Sopenharmony_ci struct node *left; 6588c2ecf20Sopenharmony_ci struct node *right; 6598c2ecf20Sopenharmony_ci struct node *parent; 6608c2ecf20Sopenharmony_ci void *leftleaf; 6618c2ecf20Sopenharmony_ci void *rightleaf; 6628c2ecf20Sopenharmony_ci unsigned int leftmask; 6638c2ecf20Sopenharmony_ci unsigned int rightmask; 6648c2ecf20Sopenharmony_ci unsigned int bitmask; 6658c2ecf20Sopenharmony_ci int count; 6668c2ecf20Sopenharmony_ci 6678c2ecf20Sopenharmony_ci if (verbose > 0) 6688c2ecf20Sopenharmony_ci printf("Pruning %s_%x\n", tree->type, tree->maxage); 6698c2ecf20Sopenharmony_ci 6708c2ecf20Sopenharmony_ci count = 0; 6718c2ecf20Sopenharmony_ci if (tree->childnode == LEAF) 6728c2ecf20Sopenharmony_ci return; 6738c2ecf20Sopenharmony_ci if (!tree->root) 6748c2ecf20Sopenharmony_ci return; 6758c2ecf20Sopenharmony_ci 6768c2ecf20Sopenharmony_ci leftmask = rightmask = 0; 6778c2ecf20Sopenharmony_ci node = tree->root; 6788c2ecf20Sopenharmony_ci while (node) { 6798c2ecf20Sopenharmony_ci if (node->nextbyte) 6808c2ecf20Sopenharmony_ci goto advance; 6818c2ecf20Sopenharmony_ci if (node->leftnode == LEAF) 6828c2ecf20Sopenharmony_ci goto advance; 6838c2ecf20Sopenharmony_ci if (node->rightnode == LEAF) 6848c2ecf20Sopenharmony_ci goto advance; 6858c2ecf20Sopenharmony_ci if (!node->left) 6868c2ecf20Sopenharmony_ci goto advance; 6878c2ecf20Sopenharmony_ci if (!node->right) 6888c2ecf20Sopenharmony_ci goto advance; 6898c2ecf20Sopenharmony_ci left = node->left; 6908c2ecf20Sopenharmony_ci right = node->right; 6918c2ecf20Sopenharmony_ci if (left->keymask == 0) 6928c2ecf20Sopenharmony_ci goto advance; 6938c2ecf20Sopenharmony_ci if (right->keymask == 0) 6948c2ecf20Sopenharmony_ci goto advance; 6958c2ecf20Sopenharmony_ci if (left->keymask != right->keymask) 6968c2ecf20Sopenharmony_ci goto advance; 6978c2ecf20Sopenharmony_ci if (left->keybits != right->keybits) 6988c2ecf20Sopenharmony_ci goto advance; 6998c2ecf20Sopenharmony_ci leftleaf = NULL; 7008c2ecf20Sopenharmony_ci while (!leftleaf) { 7018c2ecf20Sopenharmony_ci assert(left->left || left->right); 7028c2ecf20Sopenharmony_ci if (left->leftnode == LEAF) 7038c2ecf20Sopenharmony_ci leftleaf = left->left; 7048c2ecf20Sopenharmony_ci else if (left->rightnode == LEAF) 7058c2ecf20Sopenharmony_ci leftleaf = left->right; 7068c2ecf20Sopenharmony_ci else if (left->left) 7078c2ecf20Sopenharmony_ci left = left->left; 7088c2ecf20Sopenharmony_ci else if (left->right) 7098c2ecf20Sopenharmony_ci left = left->right; 7108c2ecf20Sopenharmony_ci else 7118c2ecf20Sopenharmony_ci assert(0); 7128c2ecf20Sopenharmony_ci } 7138c2ecf20Sopenharmony_ci rightleaf = NULL; 7148c2ecf20Sopenharmony_ci while (!rightleaf) { 7158c2ecf20Sopenharmony_ci assert(right->left || right->right); 7168c2ecf20Sopenharmony_ci if (right->leftnode == LEAF) 7178c2ecf20Sopenharmony_ci rightleaf = right->left; 7188c2ecf20Sopenharmony_ci else if (right->rightnode == LEAF) 7198c2ecf20Sopenharmony_ci rightleaf = right->right; 7208c2ecf20Sopenharmony_ci else if (right->left) 7218c2ecf20Sopenharmony_ci right = right->left; 7228c2ecf20Sopenharmony_ci else if (right->right) 7238c2ecf20Sopenharmony_ci right = right->right; 7248c2ecf20Sopenharmony_ci else 7258c2ecf20Sopenharmony_ci assert(0); 7268c2ecf20Sopenharmony_ci } 7278c2ecf20Sopenharmony_ci if (! tree->leaf_equal(leftleaf, rightleaf)) 7288c2ecf20Sopenharmony_ci goto advance; 7298c2ecf20Sopenharmony_ci /* 7308c2ecf20Sopenharmony_ci * This node has identical singleton-only subtrees. 7318c2ecf20Sopenharmony_ci * Remove it. 7328c2ecf20Sopenharmony_ci */ 7338c2ecf20Sopenharmony_ci parent = node->parent; 7348c2ecf20Sopenharmony_ci left = node->left; 7358c2ecf20Sopenharmony_ci right = node->right; 7368c2ecf20Sopenharmony_ci if (parent->left == node) 7378c2ecf20Sopenharmony_ci parent->left = left; 7388c2ecf20Sopenharmony_ci else if (parent->right == node) 7398c2ecf20Sopenharmony_ci parent->right = left; 7408c2ecf20Sopenharmony_ci else 7418c2ecf20Sopenharmony_ci assert(0); 7428c2ecf20Sopenharmony_ci left->parent = parent; 7438c2ecf20Sopenharmony_ci left->keymask |= (1 << node->bitnum); 7448c2ecf20Sopenharmony_ci node->left = NULL; 7458c2ecf20Sopenharmony_ci while (node) { 7468c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 7478c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 7488c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 7498c2ecf20Sopenharmony_ci if (node->leftnode == NODE && node->left) { 7508c2ecf20Sopenharmony_ci left = node->left; 7518c2ecf20Sopenharmony_ci free(node); 7528c2ecf20Sopenharmony_ci count++; 7538c2ecf20Sopenharmony_ci node = left; 7548c2ecf20Sopenharmony_ci } else if (node->rightnode == NODE && node->right) { 7558c2ecf20Sopenharmony_ci right = node->right; 7568c2ecf20Sopenharmony_ci free(node); 7578c2ecf20Sopenharmony_ci count++; 7588c2ecf20Sopenharmony_ci node = right; 7598c2ecf20Sopenharmony_ci } else { 7608c2ecf20Sopenharmony_ci node = NULL; 7618c2ecf20Sopenharmony_ci } 7628c2ecf20Sopenharmony_ci } 7638c2ecf20Sopenharmony_ci /* Propagate keymasks up along singleton chains. */ 7648c2ecf20Sopenharmony_ci node = parent; 7658c2ecf20Sopenharmony_ci /* Force re-check */ 7668c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 7678c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 7688c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 7698c2ecf20Sopenharmony_ci for (;;) { 7708c2ecf20Sopenharmony_ci if (node->left && node->right) 7718c2ecf20Sopenharmony_ci break; 7728c2ecf20Sopenharmony_ci if (node->left) { 7738c2ecf20Sopenharmony_ci left = node->left; 7748c2ecf20Sopenharmony_ci node->keymask |= left->keymask; 7758c2ecf20Sopenharmony_ci node->keybits |= left->keybits; 7768c2ecf20Sopenharmony_ci } 7778c2ecf20Sopenharmony_ci if (node->right) { 7788c2ecf20Sopenharmony_ci right = node->right; 7798c2ecf20Sopenharmony_ci node->keymask |= right->keymask; 7808c2ecf20Sopenharmony_ci node->keybits |= right->keybits; 7818c2ecf20Sopenharmony_ci } 7828c2ecf20Sopenharmony_ci node->keymask |= (1 << node->bitnum); 7838c2ecf20Sopenharmony_ci node = node->parent; 7848c2ecf20Sopenharmony_ci /* Force re-check */ 7858c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 7868c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 7878c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 7888c2ecf20Sopenharmony_ci } 7898c2ecf20Sopenharmony_ci advance: 7908c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 7918c2ecf20Sopenharmony_ci if ((leftmask & bitmask) == 0 && 7928c2ecf20Sopenharmony_ci node->leftnode == NODE && 7938c2ecf20Sopenharmony_ci node->left) { 7948c2ecf20Sopenharmony_ci leftmask |= bitmask; 7958c2ecf20Sopenharmony_ci node = node->left; 7968c2ecf20Sopenharmony_ci } else if ((rightmask & bitmask) == 0 && 7978c2ecf20Sopenharmony_ci node->rightnode == NODE && 7988c2ecf20Sopenharmony_ci node->right) { 7998c2ecf20Sopenharmony_ci rightmask |= bitmask; 8008c2ecf20Sopenharmony_ci node = node->right; 8018c2ecf20Sopenharmony_ci } else { 8028c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 8038c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 8048c2ecf20Sopenharmony_ci node = node->parent; 8058c2ecf20Sopenharmony_ci } 8068c2ecf20Sopenharmony_ci } 8078c2ecf20Sopenharmony_ci if (verbose > 0) 8088c2ecf20Sopenharmony_ci printf("Pruned %d nodes\n", count); 8098c2ecf20Sopenharmony_ci} 8108c2ecf20Sopenharmony_ci 8118c2ecf20Sopenharmony_ci/* 8128c2ecf20Sopenharmony_ci * Mark the nodes in the tree that lead to leaves that must be 8138c2ecf20Sopenharmony_ci * emitted. 8148c2ecf20Sopenharmony_ci */ 8158c2ecf20Sopenharmony_cistatic void mark_nodes(struct tree *tree) 8168c2ecf20Sopenharmony_ci{ 8178c2ecf20Sopenharmony_ci struct node *node; 8188c2ecf20Sopenharmony_ci struct node *n; 8198c2ecf20Sopenharmony_ci unsigned int leftmask; 8208c2ecf20Sopenharmony_ci unsigned int rightmask; 8218c2ecf20Sopenharmony_ci unsigned int bitmask; 8228c2ecf20Sopenharmony_ci int marked; 8238c2ecf20Sopenharmony_ci 8248c2ecf20Sopenharmony_ci marked = 0; 8258c2ecf20Sopenharmony_ci if (verbose > 0) 8268c2ecf20Sopenharmony_ci printf("Marking %s_%x\n", tree->type, tree->maxage); 8278c2ecf20Sopenharmony_ci if (tree->childnode == LEAF) 8288c2ecf20Sopenharmony_ci goto done; 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci assert(tree->childnode == NODE); 8318c2ecf20Sopenharmony_ci node = tree->root; 8328c2ecf20Sopenharmony_ci leftmask = rightmask = 0; 8338c2ecf20Sopenharmony_ci while (node) { 8348c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 8358c2ecf20Sopenharmony_ci if ((leftmask & bitmask) == 0) { 8368c2ecf20Sopenharmony_ci leftmask |= bitmask; 8378c2ecf20Sopenharmony_ci if (node->leftnode == LEAF) { 8388c2ecf20Sopenharmony_ci assert(node->left); 8398c2ecf20Sopenharmony_ci if (tree->leaf_mark(node->left)) { 8408c2ecf20Sopenharmony_ci n = node; 8418c2ecf20Sopenharmony_ci while (n && !n->mark) { 8428c2ecf20Sopenharmony_ci marked++; 8438c2ecf20Sopenharmony_ci n->mark = 1; 8448c2ecf20Sopenharmony_ci n = n->parent; 8458c2ecf20Sopenharmony_ci } 8468c2ecf20Sopenharmony_ci } 8478c2ecf20Sopenharmony_ci } else if (node->left) { 8488c2ecf20Sopenharmony_ci assert(node->leftnode == NODE); 8498c2ecf20Sopenharmony_ci node = node->left; 8508c2ecf20Sopenharmony_ci continue; 8518c2ecf20Sopenharmony_ci } 8528c2ecf20Sopenharmony_ci } 8538c2ecf20Sopenharmony_ci if ((rightmask & bitmask) == 0) { 8548c2ecf20Sopenharmony_ci rightmask |= bitmask; 8558c2ecf20Sopenharmony_ci if (node->rightnode == LEAF) { 8568c2ecf20Sopenharmony_ci assert(node->right); 8578c2ecf20Sopenharmony_ci if (tree->leaf_mark(node->right)) { 8588c2ecf20Sopenharmony_ci n = node; 8598c2ecf20Sopenharmony_ci while (n && !n->mark) { 8608c2ecf20Sopenharmony_ci marked++; 8618c2ecf20Sopenharmony_ci n->mark = 1; 8628c2ecf20Sopenharmony_ci n = n->parent; 8638c2ecf20Sopenharmony_ci } 8648c2ecf20Sopenharmony_ci } 8658c2ecf20Sopenharmony_ci } else if (node->right) { 8668c2ecf20Sopenharmony_ci assert(node->rightnode == NODE); 8678c2ecf20Sopenharmony_ci node = node->right; 8688c2ecf20Sopenharmony_ci continue; 8698c2ecf20Sopenharmony_ci } 8708c2ecf20Sopenharmony_ci } 8718c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 8728c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 8738c2ecf20Sopenharmony_ci node = node->parent; 8748c2ecf20Sopenharmony_ci } 8758c2ecf20Sopenharmony_ci 8768c2ecf20Sopenharmony_ci /* second pass: left siblings and singletons */ 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci assert(tree->childnode == NODE); 8798c2ecf20Sopenharmony_ci node = tree->root; 8808c2ecf20Sopenharmony_ci leftmask = rightmask = 0; 8818c2ecf20Sopenharmony_ci while (node) { 8828c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 8838c2ecf20Sopenharmony_ci if ((leftmask & bitmask) == 0) { 8848c2ecf20Sopenharmony_ci leftmask |= bitmask; 8858c2ecf20Sopenharmony_ci if (node->leftnode == LEAF) { 8868c2ecf20Sopenharmony_ci assert(node->left); 8878c2ecf20Sopenharmony_ci if (tree->leaf_mark(node->left)) { 8888c2ecf20Sopenharmony_ci n = node; 8898c2ecf20Sopenharmony_ci while (n && !n->mark) { 8908c2ecf20Sopenharmony_ci marked++; 8918c2ecf20Sopenharmony_ci n->mark = 1; 8928c2ecf20Sopenharmony_ci n = n->parent; 8938c2ecf20Sopenharmony_ci } 8948c2ecf20Sopenharmony_ci } 8958c2ecf20Sopenharmony_ci } else if (node->left) { 8968c2ecf20Sopenharmony_ci assert(node->leftnode == NODE); 8978c2ecf20Sopenharmony_ci node = node->left; 8988c2ecf20Sopenharmony_ci if (!node->mark && node->parent->mark) { 8998c2ecf20Sopenharmony_ci marked++; 9008c2ecf20Sopenharmony_ci node->mark = 1; 9018c2ecf20Sopenharmony_ci } 9028c2ecf20Sopenharmony_ci continue; 9038c2ecf20Sopenharmony_ci } 9048c2ecf20Sopenharmony_ci } 9058c2ecf20Sopenharmony_ci if ((rightmask & bitmask) == 0) { 9068c2ecf20Sopenharmony_ci rightmask |= bitmask; 9078c2ecf20Sopenharmony_ci if (node->rightnode == LEAF) { 9088c2ecf20Sopenharmony_ci assert(node->right); 9098c2ecf20Sopenharmony_ci if (tree->leaf_mark(node->right)) { 9108c2ecf20Sopenharmony_ci n = node; 9118c2ecf20Sopenharmony_ci while (n && !n->mark) { 9128c2ecf20Sopenharmony_ci marked++; 9138c2ecf20Sopenharmony_ci n->mark = 1; 9148c2ecf20Sopenharmony_ci n = n->parent; 9158c2ecf20Sopenharmony_ci } 9168c2ecf20Sopenharmony_ci } 9178c2ecf20Sopenharmony_ci } else if (node->right) { 9188c2ecf20Sopenharmony_ci assert(node->rightnode == NODE); 9198c2ecf20Sopenharmony_ci node = node->right; 9208c2ecf20Sopenharmony_ci if (!node->mark && node->parent->mark && 9218c2ecf20Sopenharmony_ci !node->parent->left) { 9228c2ecf20Sopenharmony_ci marked++; 9238c2ecf20Sopenharmony_ci node->mark = 1; 9248c2ecf20Sopenharmony_ci } 9258c2ecf20Sopenharmony_ci continue; 9268c2ecf20Sopenharmony_ci } 9278c2ecf20Sopenharmony_ci } 9288c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 9298c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 9308c2ecf20Sopenharmony_ci node = node->parent; 9318c2ecf20Sopenharmony_ci } 9328c2ecf20Sopenharmony_cidone: 9338c2ecf20Sopenharmony_ci if (verbose > 0) 9348c2ecf20Sopenharmony_ci printf("Marked %d nodes\n", marked); 9358c2ecf20Sopenharmony_ci} 9368c2ecf20Sopenharmony_ci 9378c2ecf20Sopenharmony_ci/* 9388c2ecf20Sopenharmony_ci * Compute the index of each node and leaf, which is the offset in the 9398c2ecf20Sopenharmony_ci * emitted trie. These values must be pre-computed because relative 9408c2ecf20Sopenharmony_ci * offsets between nodes are used to navigate the tree. 9418c2ecf20Sopenharmony_ci */ 9428c2ecf20Sopenharmony_cistatic int index_nodes(struct tree *tree, int index) 9438c2ecf20Sopenharmony_ci{ 9448c2ecf20Sopenharmony_ci struct node *node; 9458c2ecf20Sopenharmony_ci unsigned int leftmask; 9468c2ecf20Sopenharmony_ci unsigned int rightmask; 9478c2ecf20Sopenharmony_ci unsigned int bitmask; 9488c2ecf20Sopenharmony_ci int count; 9498c2ecf20Sopenharmony_ci int indent; 9508c2ecf20Sopenharmony_ci 9518c2ecf20Sopenharmony_ci /* Align to a cache line (or half a cache line?). */ 9528c2ecf20Sopenharmony_ci while (index % 64) 9538c2ecf20Sopenharmony_ci index++; 9548c2ecf20Sopenharmony_ci tree->index = index; 9558c2ecf20Sopenharmony_ci indent = 1; 9568c2ecf20Sopenharmony_ci count = 0; 9578c2ecf20Sopenharmony_ci 9588c2ecf20Sopenharmony_ci if (verbose > 0) 9598c2ecf20Sopenharmony_ci printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); 9608c2ecf20Sopenharmony_ci if (tree->childnode == LEAF) { 9618c2ecf20Sopenharmony_ci index += tree->leaf_size(tree->root); 9628c2ecf20Sopenharmony_ci goto done; 9638c2ecf20Sopenharmony_ci } 9648c2ecf20Sopenharmony_ci 9658c2ecf20Sopenharmony_ci assert(tree->childnode == NODE); 9668c2ecf20Sopenharmony_ci node = tree->root; 9678c2ecf20Sopenharmony_ci leftmask = rightmask = 0; 9688c2ecf20Sopenharmony_ci while (node) { 9698c2ecf20Sopenharmony_ci if (!node->mark) 9708c2ecf20Sopenharmony_ci goto skip; 9718c2ecf20Sopenharmony_ci count++; 9728c2ecf20Sopenharmony_ci if (node->index != index) 9738c2ecf20Sopenharmony_ci node->index = index; 9748c2ecf20Sopenharmony_ci index += node->size; 9758c2ecf20Sopenharmony_ciskip: 9768c2ecf20Sopenharmony_ci while (node) { 9778c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 9788c2ecf20Sopenharmony_ci if (node->mark && (leftmask & bitmask) == 0) { 9798c2ecf20Sopenharmony_ci leftmask |= bitmask; 9808c2ecf20Sopenharmony_ci if (node->leftnode == LEAF) { 9818c2ecf20Sopenharmony_ci assert(node->left); 9828c2ecf20Sopenharmony_ci *tree->leaf_index(tree, node->left) = 9838c2ecf20Sopenharmony_ci index; 9848c2ecf20Sopenharmony_ci index += tree->leaf_size(node->left); 9858c2ecf20Sopenharmony_ci count++; 9868c2ecf20Sopenharmony_ci } else if (node->left) { 9878c2ecf20Sopenharmony_ci assert(node->leftnode == NODE); 9888c2ecf20Sopenharmony_ci indent += 1; 9898c2ecf20Sopenharmony_ci node = node->left; 9908c2ecf20Sopenharmony_ci break; 9918c2ecf20Sopenharmony_ci } 9928c2ecf20Sopenharmony_ci } 9938c2ecf20Sopenharmony_ci if (node->mark && (rightmask & bitmask) == 0) { 9948c2ecf20Sopenharmony_ci rightmask |= bitmask; 9958c2ecf20Sopenharmony_ci if (node->rightnode == LEAF) { 9968c2ecf20Sopenharmony_ci assert(node->right); 9978c2ecf20Sopenharmony_ci *tree->leaf_index(tree, node->right) = index; 9988c2ecf20Sopenharmony_ci index += tree->leaf_size(node->right); 9998c2ecf20Sopenharmony_ci count++; 10008c2ecf20Sopenharmony_ci } else if (node->right) { 10018c2ecf20Sopenharmony_ci assert(node->rightnode == NODE); 10028c2ecf20Sopenharmony_ci indent += 1; 10038c2ecf20Sopenharmony_ci node = node->right; 10048c2ecf20Sopenharmony_ci break; 10058c2ecf20Sopenharmony_ci } 10068c2ecf20Sopenharmony_ci } 10078c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 10088c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 10098c2ecf20Sopenharmony_ci node = node->parent; 10108c2ecf20Sopenharmony_ci indent -= 1; 10118c2ecf20Sopenharmony_ci } 10128c2ecf20Sopenharmony_ci } 10138c2ecf20Sopenharmony_cidone: 10148c2ecf20Sopenharmony_ci /* Round up to a multiple of 16 */ 10158c2ecf20Sopenharmony_ci while (index % 16) 10168c2ecf20Sopenharmony_ci index++; 10178c2ecf20Sopenharmony_ci if (verbose > 0) 10188c2ecf20Sopenharmony_ci printf("Final index %d\n", index); 10198c2ecf20Sopenharmony_ci return index; 10208c2ecf20Sopenharmony_ci} 10218c2ecf20Sopenharmony_ci 10228c2ecf20Sopenharmony_ci/* 10238c2ecf20Sopenharmony_ci * Mark the nodes in a subtree, helper for size_nodes(). 10248c2ecf20Sopenharmony_ci */ 10258c2ecf20Sopenharmony_cistatic int mark_subtree(struct node *node) 10268c2ecf20Sopenharmony_ci{ 10278c2ecf20Sopenharmony_ci int changed; 10288c2ecf20Sopenharmony_ci 10298c2ecf20Sopenharmony_ci if (!node || node->mark) 10308c2ecf20Sopenharmony_ci return 0; 10318c2ecf20Sopenharmony_ci node->mark = 1; 10328c2ecf20Sopenharmony_ci node->index = node->parent->index; 10338c2ecf20Sopenharmony_ci changed = 1; 10348c2ecf20Sopenharmony_ci if (node->leftnode == NODE) 10358c2ecf20Sopenharmony_ci changed += mark_subtree(node->left); 10368c2ecf20Sopenharmony_ci if (node->rightnode == NODE) 10378c2ecf20Sopenharmony_ci changed += mark_subtree(node->right); 10388c2ecf20Sopenharmony_ci return changed; 10398c2ecf20Sopenharmony_ci} 10408c2ecf20Sopenharmony_ci 10418c2ecf20Sopenharmony_ci/* 10428c2ecf20Sopenharmony_ci * Compute the size of nodes and leaves. We start by assuming that 10438c2ecf20Sopenharmony_ci * each node needs to store a three-byte offset. The indexes of the 10448c2ecf20Sopenharmony_ci * nodes are calculated based on that, and then this function is 10458c2ecf20Sopenharmony_ci * called to see if the sizes of some nodes can be reduced. This is 10468c2ecf20Sopenharmony_ci * repeated until no more changes are seen. 10478c2ecf20Sopenharmony_ci */ 10488c2ecf20Sopenharmony_cistatic int size_nodes(struct tree *tree) 10498c2ecf20Sopenharmony_ci{ 10508c2ecf20Sopenharmony_ci struct tree *next; 10518c2ecf20Sopenharmony_ci struct node *node; 10528c2ecf20Sopenharmony_ci struct node *right; 10538c2ecf20Sopenharmony_ci struct node *n; 10548c2ecf20Sopenharmony_ci unsigned int leftmask; 10558c2ecf20Sopenharmony_ci unsigned int rightmask; 10568c2ecf20Sopenharmony_ci unsigned int bitmask; 10578c2ecf20Sopenharmony_ci unsigned int pathbits; 10588c2ecf20Sopenharmony_ci unsigned int pathmask; 10598c2ecf20Sopenharmony_ci unsigned int nbit; 10608c2ecf20Sopenharmony_ci int changed; 10618c2ecf20Sopenharmony_ci int offset; 10628c2ecf20Sopenharmony_ci int size; 10638c2ecf20Sopenharmony_ci int indent; 10648c2ecf20Sopenharmony_ci 10658c2ecf20Sopenharmony_ci indent = 1; 10668c2ecf20Sopenharmony_ci changed = 0; 10678c2ecf20Sopenharmony_ci size = 0; 10688c2ecf20Sopenharmony_ci 10698c2ecf20Sopenharmony_ci if (verbose > 0) 10708c2ecf20Sopenharmony_ci printf("Sizing %s_%x\n", tree->type, tree->maxage); 10718c2ecf20Sopenharmony_ci if (tree->childnode == LEAF) 10728c2ecf20Sopenharmony_ci goto done; 10738c2ecf20Sopenharmony_ci 10748c2ecf20Sopenharmony_ci assert(tree->childnode == NODE); 10758c2ecf20Sopenharmony_ci pathbits = 0; 10768c2ecf20Sopenharmony_ci pathmask = 0; 10778c2ecf20Sopenharmony_ci node = tree->root; 10788c2ecf20Sopenharmony_ci leftmask = rightmask = 0; 10798c2ecf20Sopenharmony_ci while (node) { 10808c2ecf20Sopenharmony_ci if (!node->mark) 10818c2ecf20Sopenharmony_ci goto skip; 10828c2ecf20Sopenharmony_ci offset = 0; 10838c2ecf20Sopenharmony_ci if (!node->left || !node->right) { 10848c2ecf20Sopenharmony_ci size = 1; 10858c2ecf20Sopenharmony_ci } else { 10868c2ecf20Sopenharmony_ci if (node->rightnode == NODE) { 10878c2ecf20Sopenharmony_ci /* 10888c2ecf20Sopenharmony_ci * If the right node is not marked, 10898c2ecf20Sopenharmony_ci * look for a corresponding node in 10908c2ecf20Sopenharmony_ci * the next tree. Such a node need 10918c2ecf20Sopenharmony_ci * not exist. 10928c2ecf20Sopenharmony_ci */ 10938c2ecf20Sopenharmony_ci right = node->right; 10948c2ecf20Sopenharmony_ci next = tree->next; 10958c2ecf20Sopenharmony_ci while (!right->mark) { 10968c2ecf20Sopenharmony_ci assert(next); 10978c2ecf20Sopenharmony_ci n = next->root; 10988c2ecf20Sopenharmony_ci while (n->bitnum != node->bitnum) { 10998c2ecf20Sopenharmony_ci nbit = 1 << n->bitnum; 11008c2ecf20Sopenharmony_ci if (!(pathmask & nbit)) 11018c2ecf20Sopenharmony_ci break; 11028c2ecf20Sopenharmony_ci if (pathbits & nbit) { 11038c2ecf20Sopenharmony_ci if (n->rightnode == LEAF) 11048c2ecf20Sopenharmony_ci break; 11058c2ecf20Sopenharmony_ci n = n->right; 11068c2ecf20Sopenharmony_ci } else { 11078c2ecf20Sopenharmony_ci if (n->leftnode == LEAF) 11088c2ecf20Sopenharmony_ci break; 11098c2ecf20Sopenharmony_ci n = n->left; 11108c2ecf20Sopenharmony_ci } 11118c2ecf20Sopenharmony_ci } 11128c2ecf20Sopenharmony_ci if (n->bitnum != node->bitnum) 11138c2ecf20Sopenharmony_ci break; 11148c2ecf20Sopenharmony_ci n = n->right; 11158c2ecf20Sopenharmony_ci right = n; 11168c2ecf20Sopenharmony_ci next = next->next; 11178c2ecf20Sopenharmony_ci } 11188c2ecf20Sopenharmony_ci /* Make sure the right node is marked. */ 11198c2ecf20Sopenharmony_ci if (!right->mark) 11208c2ecf20Sopenharmony_ci changed += mark_subtree(right); 11218c2ecf20Sopenharmony_ci offset = right->index - node->index; 11228c2ecf20Sopenharmony_ci } else { 11238c2ecf20Sopenharmony_ci offset = *tree->leaf_index(tree, node->right); 11248c2ecf20Sopenharmony_ci offset -= node->index; 11258c2ecf20Sopenharmony_ci } 11268c2ecf20Sopenharmony_ci assert(offset >= 0); 11278c2ecf20Sopenharmony_ci assert(offset <= 0xffffff); 11288c2ecf20Sopenharmony_ci if (offset <= 0xff) { 11298c2ecf20Sopenharmony_ci size = 2; 11308c2ecf20Sopenharmony_ci } else if (offset <= 0xffff) { 11318c2ecf20Sopenharmony_ci size = 3; 11328c2ecf20Sopenharmony_ci } else { /* offset <= 0xffffff */ 11338c2ecf20Sopenharmony_ci size = 4; 11348c2ecf20Sopenharmony_ci } 11358c2ecf20Sopenharmony_ci } 11368c2ecf20Sopenharmony_ci if (node->size != size || node->offset != offset) { 11378c2ecf20Sopenharmony_ci node->size = size; 11388c2ecf20Sopenharmony_ci node->offset = offset; 11398c2ecf20Sopenharmony_ci changed++; 11408c2ecf20Sopenharmony_ci } 11418c2ecf20Sopenharmony_ciskip: 11428c2ecf20Sopenharmony_ci while (node) { 11438c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 11448c2ecf20Sopenharmony_ci pathmask |= bitmask; 11458c2ecf20Sopenharmony_ci if (node->mark && (leftmask & bitmask) == 0) { 11468c2ecf20Sopenharmony_ci leftmask |= bitmask; 11478c2ecf20Sopenharmony_ci if (node->leftnode == LEAF) { 11488c2ecf20Sopenharmony_ci assert(node->left); 11498c2ecf20Sopenharmony_ci } else if (node->left) { 11508c2ecf20Sopenharmony_ci assert(node->leftnode == NODE); 11518c2ecf20Sopenharmony_ci indent += 1; 11528c2ecf20Sopenharmony_ci node = node->left; 11538c2ecf20Sopenharmony_ci break; 11548c2ecf20Sopenharmony_ci } 11558c2ecf20Sopenharmony_ci } 11568c2ecf20Sopenharmony_ci if (node->mark && (rightmask & bitmask) == 0) { 11578c2ecf20Sopenharmony_ci rightmask |= bitmask; 11588c2ecf20Sopenharmony_ci pathbits |= bitmask; 11598c2ecf20Sopenharmony_ci if (node->rightnode == LEAF) { 11608c2ecf20Sopenharmony_ci assert(node->right); 11618c2ecf20Sopenharmony_ci } else if (node->right) { 11628c2ecf20Sopenharmony_ci assert(node->rightnode == NODE); 11638c2ecf20Sopenharmony_ci indent += 1; 11648c2ecf20Sopenharmony_ci node = node->right; 11658c2ecf20Sopenharmony_ci break; 11668c2ecf20Sopenharmony_ci } 11678c2ecf20Sopenharmony_ci } 11688c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 11698c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 11708c2ecf20Sopenharmony_ci pathmask &= ~bitmask; 11718c2ecf20Sopenharmony_ci pathbits &= ~bitmask; 11728c2ecf20Sopenharmony_ci node = node->parent; 11738c2ecf20Sopenharmony_ci indent -= 1; 11748c2ecf20Sopenharmony_ci } 11758c2ecf20Sopenharmony_ci } 11768c2ecf20Sopenharmony_cidone: 11778c2ecf20Sopenharmony_ci if (verbose > 0) 11788c2ecf20Sopenharmony_ci printf("Found %d changes\n", changed); 11798c2ecf20Sopenharmony_ci return changed; 11808c2ecf20Sopenharmony_ci} 11818c2ecf20Sopenharmony_ci 11828c2ecf20Sopenharmony_ci/* 11838c2ecf20Sopenharmony_ci * Emit a trie for the given tree into the data array. 11848c2ecf20Sopenharmony_ci */ 11858c2ecf20Sopenharmony_cistatic void emit(struct tree *tree, unsigned char *data) 11868c2ecf20Sopenharmony_ci{ 11878c2ecf20Sopenharmony_ci struct node *node; 11888c2ecf20Sopenharmony_ci unsigned int leftmask; 11898c2ecf20Sopenharmony_ci unsigned int rightmask; 11908c2ecf20Sopenharmony_ci unsigned int bitmask; 11918c2ecf20Sopenharmony_ci int offlen; 11928c2ecf20Sopenharmony_ci int offset; 11938c2ecf20Sopenharmony_ci int index; 11948c2ecf20Sopenharmony_ci int indent; 11958c2ecf20Sopenharmony_ci int size; 11968c2ecf20Sopenharmony_ci int bytes; 11978c2ecf20Sopenharmony_ci int leaves; 11988c2ecf20Sopenharmony_ci int nodes[4]; 11998c2ecf20Sopenharmony_ci unsigned char byte; 12008c2ecf20Sopenharmony_ci 12018c2ecf20Sopenharmony_ci nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; 12028c2ecf20Sopenharmony_ci leaves = 0; 12038c2ecf20Sopenharmony_ci bytes = 0; 12048c2ecf20Sopenharmony_ci index = tree->index; 12058c2ecf20Sopenharmony_ci data += index; 12068c2ecf20Sopenharmony_ci indent = 1; 12078c2ecf20Sopenharmony_ci if (verbose > 0) 12088c2ecf20Sopenharmony_ci printf("Emitting %s_%x\n", tree->type, tree->maxage); 12098c2ecf20Sopenharmony_ci if (tree->childnode == LEAF) { 12108c2ecf20Sopenharmony_ci assert(tree->root); 12118c2ecf20Sopenharmony_ci tree->leaf_emit(tree->root, data); 12128c2ecf20Sopenharmony_ci size = tree->leaf_size(tree->root); 12138c2ecf20Sopenharmony_ci index += size; 12148c2ecf20Sopenharmony_ci leaves++; 12158c2ecf20Sopenharmony_ci goto done; 12168c2ecf20Sopenharmony_ci } 12178c2ecf20Sopenharmony_ci 12188c2ecf20Sopenharmony_ci assert(tree->childnode == NODE); 12198c2ecf20Sopenharmony_ci node = tree->root; 12208c2ecf20Sopenharmony_ci leftmask = rightmask = 0; 12218c2ecf20Sopenharmony_ci while (node) { 12228c2ecf20Sopenharmony_ci if (!node->mark) 12238c2ecf20Sopenharmony_ci goto skip; 12248c2ecf20Sopenharmony_ci assert(node->offset != -1); 12258c2ecf20Sopenharmony_ci assert(node->index == index); 12268c2ecf20Sopenharmony_ci 12278c2ecf20Sopenharmony_ci byte = 0; 12288c2ecf20Sopenharmony_ci if (node->nextbyte) 12298c2ecf20Sopenharmony_ci byte |= NEXTBYTE; 12308c2ecf20Sopenharmony_ci byte |= (node->bitnum & BITNUM); 12318c2ecf20Sopenharmony_ci if (node->left && node->right) { 12328c2ecf20Sopenharmony_ci if (node->leftnode == NODE) 12338c2ecf20Sopenharmony_ci byte |= LEFTNODE; 12348c2ecf20Sopenharmony_ci if (node->rightnode == NODE) 12358c2ecf20Sopenharmony_ci byte |= RIGHTNODE; 12368c2ecf20Sopenharmony_ci if (node->offset <= 0xff) 12378c2ecf20Sopenharmony_ci offlen = 1; 12388c2ecf20Sopenharmony_ci else if (node->offset <= 0xffff) 12398c2ecf20Sopenharmony_ci offlen = 2; 12408c2ecf20Sopenharmony_ci else 12418c2ecf20Sopenharmony_ci offlen = 3; 12428c2ecf20Sopenharmony_ci nodes[offlen]++; 12438c2ecf20Sopenharmony_ci offset = node->offset; 12448c2ecf20Sopenharmony_ci byte |= offlen << OFFLEN_SHIFT; 12458c2ecf20Sopenharmony_ci *data++ = byte; 12468c2ecf20Sopenharmony_ci index++; 12478c2ecf20Sopenharmony_ci while (offlen--) { 12488c2ecf20Sopenharmony_ci *data++ = offset & 0xff; 12498c2ecf20Sopenharmony_ci index++; 12508c2ecf20Sopenharmony_ci offset >>= 8; 12518c2ecf20Sopenharmony_ci } 12528c2ecf20Sopenharmony_ci } else if (node->left) { 12538c2ecf20Sopenharmony_ci if (node->leftnode == NODE) 12548c2ecf20Sopenharmony_ci byte |= TRIENODE; 12558c2ecf20Sopenharmony_ci nodes[0]++; 12568c2ecf20Sopenharmony_ci *data++ = byte; 12578c2ecf20Sopenharmony_ci index++; 12588c2ecf20Sopenharmony_ci } else if (node->right) { 12598c2ecf20Sopenharmony_ci byte |= RIGHTNODE; 12608c2ecf20Sopenharmony_ci if (node->rightnode == NODE) 12618c2ecf20Sopenharmony_ci byte |= TRIENODE; 12628c2ecf20Sopenharmony_ci nodes[0]++; 12638c2ecf20Sopenharmony_ci *data++ = byte; 12648c2ecf20Sopenharmony_ci index++; 12658c2ecf20Sopenharmony_ci } else { 12668c2ecf20Sopenharmony_ci assert(0); 12678c2ecf20Sopenharmony_ci } 12688c2ecf20Sopenharmony_ciskip: 12698c2ecf20Sopenharmony_ci while (node) { 12708c2ecf20Sopenharmony_ci bitmask = 1 << node->bitnum; 12718c2ecf20Sopenharmony_ci if (node->mark && (leftmask & bitmask) == 0) { 12728c2ecf20Sopenharmony_ci leftmask |= bitmask; 12738c2ecf20Sopenharmony_ci if (node->leftnode == LEAF) { 12748c2ecf20Sopenharmony_ci assert(node->left); 12758c2ecf20Sopenharmony_ci data = tree->leaf_emit(node->left, 12768c2ecf20Sopenharmony_ci data); 12778c2ecf20Sopenharmony_ci size = tree->leaf_size(node->left); 12788c2ecf20Sopenharmony_ci index += size; 12798c2ecf20Sopenharmony_ci bytes += size; 12808c2ecf20Sopenharmony_ci leaves++; 12818c2ecf20Sopenharmony_ci } else if (node->left) { 12828c2ecf20Sopenharmony_ci assert(node->leftnode == NODE); 12838c2ecf20Sopenharmony_ci indent += 1; 12848c2ecf20Sopenharmony_ci node = node->left; 12858c2ecf20Sopenharmony_ci break; 12868c2ecf20Sopenharmony_ci } 12878c2ecf20Sopenharmony_ci } 12888c2ecf20Sopenharmony_ci if (node->mark && (rightmask & bitmask) == 0) { 12898c2ecf20Sopenharmony_ci rightmask |= bitmask; 12908c2ecf20Sopenharmony_ci if (node->rightnode == LEAF) { 12918c2ecf20Sopenharmony_ci assert(node->right); 12928c2ecf20Sopenharmony_ci data = tree->leaf_emit(node->right, 12938c2ecf20Sopenharmony_ci data); 12948c2ecf20Sopenharmony_ci size = tree->leaf_size(node->right); 12958c2ecf20Sopenharmony_ci index += size; 12968c2ecf20Sopenharmony_ci bytes += size; 12978c2ecf20Sopenharmony_ci leaves++; 12988c2ecf20Sopenharmony_ci } else if (node->right) { 12998c2ecf20Sopenharmony_ci assert(node->rightnode == NODE); 13008c2ecf20Sopenharmony_ci indent += 1; 13018c2ecf20Sopenharmony_ci node = node->right; 13028c2ecf20Sopenharmony_ci break; 13038c2ecf20Sopenharmony_ci } 13048c2ecf20Sopenharmony_ci } 13058c2ecf20Sopenharmony_ci leftmask &= ~bitmask; 13068c2ecf20Sopenharmony_ci rightmask &= ~bitmask; 13078c2ecf20Sopenharmony_ci node = node->parent; 13088c2ecf20Sopenharmony_ci indent -= 1; 13098c2ecf20Sopenharmony_ci } 13108c2ecf20Sopenharmony_ci } 13118c2ecf20Sopenharmony_cidone: 13128c2ecf20Sopenharmony_ci if (verbose > 0) { 13138c2ecf20Sopenharmony_ci printf("Emitted %d (%d) leaves", 13148c2ecf20Sopenharmony_ci leaves, bytes); 13158c2ecf20Sopenharmony_ci printf(" %d (%d+%d+%d+%d) nodes", 13168c2ecf20Sopenharmony_ci nodes[0] + nodes[1] + nodes[2] + nodes[3], 13178c2ecf20Sopenharmony_ci nodes[0], nodes[1], nodes[2], nodes[3]); 13188c2ecf20Sopenharmony_ci printf(" %d total\n", index - tree->index); 13198c2ecf20Sopenharmony_ci } 13208c2ecf20Sopenharmony_ci} 13218c2ecf20Sopenharmony_ci 13228c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 13238c2ecf20Sopenharmony_ci 13248c2ecf20Sopenharmony_ci/* 13258c2ecf20Sopenharmony_ci * Unicode data. 13268c2ecf20Sopenharmony_ci * 13278c2ecf20Sopenharmony_ci * We need to keep track of the Canonical Combining Class, the Age, 13288c2ecf20Sopenharmony_ci * and decompositions for a code point. 13298c2ecf20Sopenharmony_ci * 13308c2ecf20Sopenharmony_ci * For the Age, we store the index into the ages table. Effectively 13318c2ecf20Sopenharmony_ci * this is a generation number that the table maps to a unicode 13328c2ecf20Sopenharmony_ci * version. 13338c2ecf20Sopenharmony_ci * 13348c2ecf20Sopenharmony_ci * The correction field is used to indicate that this entry is in the 13358c2ecf20Sopenharmony_ci * corrections array, which contains decompositions that were 13368c2ecf20Sopenharmony_ci * corrected in later revisions. The value of the correction field is 13378c2ecf20Sopenharmony_ci * the Unicode version in which the mapping was corrected. 13388c2ecf20Sopenharmony_ci */ 13398c2ecf20Sopenharmony_cistruct unicode_data { 13408c2ecf20Sopenharmony_ci unsigned int code; 13418c2ecf20Sopenharmony_ci int ccc; 13428c2ecf20Sopenharmony_ci int gen; 13438c2ecf20Sopenharmony_ci int correction; 13448c2ecf20Sopenharmony_ci unsigned int *utf32nfdi; 13458c2ecf20Sopenharmony_ci unsigned int *utf32nfdicf; 13468c2ecf20Sopenharmony_ci char *utf8nfdi; 13478c2ecf20Sopenharmony_ci char *utf8nfdicf; 13488c2ecf20Sopenharmony_ci}; 13498c2ecf20Sopenharmony_ci 13508c2ecf20Sopenharmony_cistruct unicode_data unicode_data[0x110000]; 13518c2ecf20Sopenharmony_cistruct unicode_data *corrections; 13528c2ecf20Sopenharmony_ciint corrections_count; 13538c2ecf20Sopenharmony_ci 13548c2ecf20Sopenharmony_cistruct tree *nfdi_tree; 13558c2ecf20Sopenharmony_cistruct tree *nfdicf_tree; 13568c2ecf20Sopenharmony_ci 13578c2ecf20Sopenharmony_cistruct tree *trees; 13588c2ecf20Sopenharmony_ciint trees_count; 13598c2ecf20Sopenharmony_ci 13608c2ecf20Sopenharmony_ci/* 13618c2ecf20Sopenharmony_ci * Check the corrections array to see if this entry was corrected at 13628c2ecf20Sopenharmony_ci * some point. 13638c2ecf20Sopenharmony_ci */ 13648c2ecf20Sopenharmony_cistatic struct unicode_data *corrections_lookup(struct unicode_data *u) 13658c2ecf20Sopenharmony_ci{ 13668c2ecf20Sopenharmony_ci int i; 13678c2ecf20Sopenharmony_ci 13688c2ecf20Sopenharmony_ci for (i = 0; i != corrections_count; i++) 13698c2ecf20Sopenharmony_ci if (u->code == corrections[i].code) 13708c2ecf20Sopenharmony_ci return &corrections[i]; 13718c2ecf20Sopenharmony_ci return u; 13728c2ecf20Sopenharmony_ci} 13738c2ecf20Sopenharmony_ci 13748c2ecf20Sopenharmony_cistatic int nfdi_equal(void *l, void *r) 13758c2ecf20Sopenharmony_ci{ 13768c2ecf20Sopenharmony_ci struct unicode_data *left = l; 13778c2ecf20Sopenharmony_ci struct unicode_data *right = r; 13788c2ecf20Sopenharmony_ci 13798c2ecf20Sopenharmony_ci if (left->gen != right->gen) 13808c2ecf20Sopenharmony_ci return 0; 13818c2ecf20Sopenharmony_ci if (left->ccc != right->ccc) 13828c2ecf20Sopenharmony_ci return 0; 13838c2ecf20Sopenharmony_ci if (left->utf8nfdi && right->utf8nfdi && 13848c2ecf20Sopenharmony_ci strcmp(left->utf8nfdi, right->utf8nfdi) == 0) 13858c2ecf20Sopenharmony_ci return 1; 13868c2ecf20Sopenharmony_ci if (left->utf8nfdi || right->utf8nfdi) 13878c2ecf20Sopenharmony_ci return 0; 13888c2ecf20Sopenharmony_ci return 1; 13898c2ecf20Sopenharmony_ci} 13908c2ecf20Sopenharmony_ci 13918c2ecf20Sopenharmony_cistatic int nfdicf_equal(void *l, void *r) 13928c2ecf20Sopenharmony_ci{ 13938c2ecf20Sopenharmony_ci struct unicode_data *left = l; 13948c2ecf20Sopenharmony_ci struct unicode_data *right = r; 13958c2ecf20Sopenharmony_ci 13968c2ecf20Sopenharmony_ci if (left->gen != right->gen) 13978c2ecf20Sopenharmony_ci return 0; 13988c2ecf20Sopenharmony_ci if (left->ccc != right->ccc) 13998c2ecf20Sopenharmony_ci return 0; 14008c2ecf20Sopenharmony_ci if (left->utf8nfdicf && right->utf8nfdicf && 14018c2ecf20Sopenharmony_ci strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) 14028c2ecf20Sopenharmony_ci return 1; 14038c2ecf20Sopenharmony_ci if (left->utf8nfdicf && right->utf8nfdicf) 14048c2ecf20Sopenharmony_ci return 0; 14058c2ecf20Sopenharmony_ci if (left->utf8nfdicf || right->utf8nfdicf) 14068c2ecf20Sopenharmony_ci return 0; 14078c2ecf20Sopenharmony_ci if (left->utf8nfdi && right->utf8nfdi && 14088c2ecf20Sopenharmony_ci strcmp(left->utf8nfdi, right->utf8nfdi) == 0) 14098c2ecf20Sopenharmony_ci return 1; 14108c2ecf20Sopenharmony_ci if (left->utf8nfdi || right->utf8nfdi) 14118c2ecf20Sopenharmony_ci return 0; 14128c2ecf20Sopenharmony_ci return 1; 14138c2ecf20Sopenharmony_ci} 14148c2ecf20Sopenharmony_ci 14158c2ecf20Sopenharmony_cistatic void nfdi_print(void *l, int indent) 14168c2ecf20Sopenharmony_ci{ 14178c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 14188c2ecf20Sopenharmony_ci 14198c2ecf20Sopenharmony_ci printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, 14208c2ecf20Sopenharmony_ci leaf->code, leaf->ccc, leaf->gen); 14218c2ecf20Sopenharmony_ci 14228c2ecf20Sopenharmony_ci if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) 14238c2ecf20Sopenharmony_ci printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); 14248c2ecf20Sopenharmony_ci else if (leaf->utf8nfdi) 14258c2ecf20Sopenharmony_ci printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); 14268c2ecf20Sopenharmony_ci 14278c2ecf20Sopenharmony_ci printf("\n"); 14288c2ecf20Sopenharmony_ci} 14298c2ecf20Sopenharmony_ci 14308c2ecf20Sopenharmony_cistatic void nfdicf_print(void *l, int indent) 14318c2ecf20Sopenharmony_ci{ 14328c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 14338c2ecf20Sopenharmony_ci 14348c2ecf20Sopenharmony_ci printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, 14358c2ecf20Sopenharmony_ci leaf->code, leaf->ccc, leaf->gen); 14368c2ecf20Sopenharmony_ci 14378c2ecf20Sopenharmony_ci if (leaf->utf8nfdicf) 14388c2ecf20Sopenharmony_ci printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); 14398c2ecf20Sopenharmony_ci else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) 14408c2ecf20Sopenharmony_ci printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); 14418c2ecf20Sopenharmony_ci else if (leaf->utf8nfdi) 14428c2ecf20Sopenharmony_ci printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); 14438c2ecf20Sopenharmony_ci printf("\n"); 14448c2ecf20Sopenharmony_ci} 14458c2ecf20Sopenharmony_ci 14468c2ecf20Sopenharmony_cistatic int nfdi_mark(void *l) 14478c2ecf20Sopenharmony_ci{ 14488c2ecf20Sopenharmony_ci return 1; 14498c2ecf20Sopenharmony_ci} 14508c2ecf20Sopenharmony_ci 14518c2ecf20Sopenharmony_cistatic int nfdicf_mark(void *l) 14528c2ecf20Sopenharmony_ci{ 14538c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 14548c2ecf20Sopenharmony_ci 14558c2ecf20Sopenharmony_ci if (leaf->utf8nfdicf) 14568c2ecf20Sopenharmony_ci return 1; 14578c2ecf20Sopenharmony_ci return 0; 14588c2ecf20Sopenharmony_ci} 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_cistatic int correction_mark(void *l) 14618c2ecf20Sopenharmony_ci{ 14628c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 14638c2ecf20Sopenharmony_ci 14648c2ecf20Sopenharmony_ci return leaf->correction; 14658c2ecf20Sopenharmony_ci} 14668c2ecf20Sopenharmony_ci 14678c2ecf20Sopenharmony_cistatic int nfdi_size(void *l) 14688c2ecf20Sopenharmony_ci{ 14698c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 14708c2ecf20Sopenharmony_ci int size = 2; 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_ci if (HANGUL_SYLLABLE(leaf->code)) 14738c2ecf20Sopenharmony_ci size += 1; 14748c2ecf20Sopenharmony_ci else if (leaf->utf8nfdi) 14758c2ecf20Sopenharmony_ci size += strlen(leaf->utf8nfdi) + 1; 14768c2ecf20Sopenharmony_ci return size; 14778c2ecf20Sopenharmony_ci} 14788c2ecf20Sopenharmony_ci 14798c2ecf20Sopenharmony_cistatic int nfdicf_size(void *l) 14808c2ecf20Sopenharmony_ci{ 14818c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 14828c2ecf20Sopenharmony_ci int size = 2; 14838c2ecf20Sopenharmony_ci 14848c2ecf20Sopenharmony_ci if (HANGUL_SYLLABLE(leaf->code)) 14858c2ecf20Sopenharmony_ci size += 1; 14868c2ecf20Sopenharmony_ci else if (leaf->utf8nfdicf) 14878c2ecf20Sopenharmony_ci size += strlen(leaf->utf8nfdicf) + 1; 14888c2ecf20Sopenharmony_ci else if (leaf->utf8nfdi) 14898c2ecf20Sopenharmony_ci size += strlen(leaf->utf8nfdi) + 1; 14908c2ecf20Sopenharmony_ci return size; 14918c2ecf20Sopenharmony_ci} 14928c2ecf20Sopenharmony_ci 14938c2ecf20Sopenharmony_cistatic int *nfdi_index(struct tree *tree, void *l) 14948c2ecf20Sopenharmony_ci{ 14958c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 14968c2ecf20Sopenharmony_ci 14978c2ecf20Sopenharmony_ci return &tree->leafindex[leaf->code]; 14988c2ecf20Sopenharmony_ci} 14998c2ecf20Sopenharmony_ci 15008c2ecf20Sopenharmony_cistatic int *nfdicf_index(struct tree *tree, void *l) 15018c2ecf20Sopenharmony_ci{ 15028c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 15038c2ecf20Sopenharmony_ci 15048c2ecf20Sopenharmony_ci return &tree->leafindex[leaf->code]; 15058c2ecf20Sopenharmony_ci} 15068c2ecf20Sopenharmony_ci 15078c2ecf20Sopenharmony_cistatic unsigned char *nfdi_emit(void *l, unsigned char *data) 15088c2ecf20Sopenharmony_ci{ 15098c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 15108c2ecf20Sopenharmony_ci unsigned char *s; 15118c2ecf20Sopenharmony_ci 15128c2ecf20Sopenharmony_ci *data++ = leaf->gen; 15138c2ecf20Sopenharmony_ci 15148c2ecf20Sopenharmony_ci if (HANGUL_SYLLABLE(leaf->code)) { 15158c2ecf20Sopenharmony_ci *data++ = DECOMPOSE; 15168c2ecf20Sopenharmony_ci *data++ = HANGUL; 15178c2ecf20Sopenharmony_ci } else if (leaf->utf8nfdi) { 15188c2ecf20Sopenharmony_ci *data++ = DECOMPOSE; 15198c2ecf20Sopenharmony_ci s = (unsigned char*)leaf->utf8nfdi; 15208c2ecf20Sopenharmony_ci while ((*data++ = *s++) != 0) 15218c2ecf20Sopenharmony_ci ; 15228c2ecf20Sopenharmony_ci } else { 15238c2ecf20Sopenharmony_ci *data++ = leaf->ccc; 15248c2ecf20Sopenharmony_ci } 15258c2ecf20Sopenharmony_ci return data; 15268c2ecf20Sopenharmony_ci} 15278c2ecf20Sopenharmony_ci 15288c2ecf20Sopenharmony_cistatic unsigned char *nfdicf_emit(void *l, unsigned char *data) 15298c2ecf20Sopenharmony_ci{ 15308c2ecf20Sopenharmony_ci struct unicode_data *leaf = l; 15318c2ecf20Sopenharmony_ci unsigned char *s; 15328c2ecf20Sopenharmony_ci 15338c2ecf20Sopenharmony_ci *data++ = leaf->gen; 15348c2ecf20Sopenharmony_ci 15358c2ecf20Sopenharmony_ci if (HANGUL_SYLLABLE(leaf->code)) { 15368c2ecf20Sopenharmony_ci *data++ = DECOMPOSE; 15378c2ecf20Sopenharmony_ci *data++ = HANGUL; 15388c2ecf20Sopenharmony_ci } else if (leaf->utf8nfdicf) { 15398c2ecf20Sopenharmony_ci *data++ = DECOMPOSE; 15408c2ecf20Sopenharmony_ci s = (unsigned char*)leaf->utf8nfdicf; 15418c2ecf20Sopenharmony_ci while ((*data++ = *s++) != 0) 15428c2ecf20Sopenharmony_ci ; 15438c2ecf20Sopenharmony_ci } else if (leaf->utf8nfdi) { 15448c2ecf20Sopenharmony_ci *data++ = DECOMPOSE; 15458c2ecf20Sopenharmony_ci s = (unsigned char*)leaf->utf8nfdi; 15468c2ecf20Sopenharmony_ci while ((*data++ = *s++) != 0) 15478c2ecf20Sopenharmony_ci ; 15488c2ecf20Sopenharmony_ci } else { 15498c2ecf20Sopenharmony_ci *data++ = leaf->ccc; 15508c2ecf20Sopenharmony_ci } 15518c2ecf20Sopenharmony_ci return data; 15528c2ecf20Sopenharmony_ci} 15538c2ecf20Sopenharmony_ci 15548c2ecf20Sopenharmony_cistatic void utf8_create(struct unicode_data *data) 15558c2ecf20Sopenharmony_ci{ 15568c2ecf20Sopenharmony_ci char utf[18*4+1]; 15578c2ecf20Sopenharmony_ci char *u; 15588c2ecf20Sopenharmony_ci unsigned int *um; 15598c2ecf20Sopenharmony_ci int i; 15608c2ecf20Sopenharmony_ci 15618c2ecf20Sopenharmony_ci if (data->utf8nfdi) { 15628c2ecf20Sopenharmony_ci assert(data->utf8nfdi[0] == HANGUL); 15638c2ecf20Sopenharmony_ci return; 15648c2ecf20Sopenharmony_ci } 15658c2ecf20Sopenharmony_ci 15668c2ecf20Sopenharmony_ci u = utf; 15678c2ecf20Sopenharmony_ci um = data->utf32nfdi; 15688c2ecf20Sopenharmony_ci if (um) { 15698c2ecf20Sopenharmony_ci for (i = 0; um[i]; i++) 15708c2ecf20Sopenharmony_ci u += utf8encode(u, um[i]); 15718c2ecf20Sopenharmony_ci *u = '\0'; 15728c2ecf20Sopenharmony_ci data->utf8nfdi = strdup(utf); 15738c2ecf20Sopenharmony_ci } 15748c2ecf20Sopenharmony_ci u = utf; 15758c2ecf20Sopenharmony_ci um = data->utf32nfdicf; 15768c2ecf20Sopenharmony_ci if (um) { 15778c2ecf20Sopenharmony_ci for (i = 0; um[i]; i++) 15788c2ecf20Sopenharmony_ci u += utf8encode(u, um[i]); 15798c2ecf20Sopenharmony_ci *u = '\0'; 15808c2ecf20Sopenharmony_ci if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) 15818c2ecf20Sopenharmony_ci data->utf8nfdicf = strdup(utf); 15828c2ecf20Sopenharmony_ci } 15838c2ecf20Sopenharmony_ci} 15848c2ecf20Sopenharmony_ci 15858c2ecf20Sopenharmony_cistatic void utf8_init(void) 15868c2ecf20Sopenharmony_ci{ 15878c2ecf20Sopenharmony_ci unsigned int unichar; 15888c2ecf20Sopenharmony_ci int i; 15898c2ecf20Sopenharmony_ci 15908c2ecf20Sopenharmony_ci for (unichar = 0; unichar != 0x110000; unichar++) 15918c2ecf20Sopenharmony_ci utf8_create(&unicode_data[unichar]); 15928c2ecf20Sopenharmony_ci 15938c2ecf20Sopenharmony_ci for (i = 0; i != corrections_count; i++) 15948c2ecf20Sopenharmony_ci utf8_create(&corrections[i]); 15958c2ecf20Sopenharmony_ci} 15968c2ecf20Sopenharmony_ci 15978c2ecf20Sopenharmony_cistatic void trees_init(void) 15988c2ecf20Sopenharmony_ci{ 15998c2ecf20Sopenharmony_ci struct unicode_data *data; 16008c2ecf20Sopenharmony_ci unsigned int maxage; 16018c2ecf20Sopenharmony_ci unsigned int nextage; 16028c2ecf20Sopenharmony_ci int count; 16038c2ecf20Sopenharmony_ci int i; 16048c2ecf20Sopenharmony_ci int j; 16058c2ecf20Sopenharmony_ci 16068c2ecf20Sopenharmony_ci /* Count the number of different ages. */ 16078c2ecf20Sopenharmony_ci count = 0; 16088c2ecf20Sopenharmony_ci nextage = (unsigned int)-1; 16098c2ecf20Sopenharmony_ci do { 16108c2ecf20Sopenharmony_ci maxage = nextage; 16118c2ecf20Sopenharmony_ci nextage = 0; 16128c2ecf20Sopenharmony_ci for (i = 0; i <= corrections_count; i++) { 16138c2ecf20Sopenharmony_ci data = &corrections[i]; 16148c2ecf20Sopenharmony_ci if (nextage < data->correction && 16158c2ecf20Sopenharmony_ci data->correction < maxage) 16168c2ecf20Sopenharmony_ci nextage = data->correction; 16178c2ecf20Sopenharmony_ci } 16188c2ecf20Sopenharmony_ci count++; 16198c2ecf20Sopenharmony_ci } while (nextage); 16208c2ecf20Sopenharmony_ci 16218c2ecf20Sopenharmony_ci /* Two trees per age: nfdi and nfdicf */ 16228c2ecf20Sopenharmony_ci trees_count = count * 2; 16238c2ecf20Sopenharmony_ci trees = calloc(trees_count, sizeof(struct tree)); 16248c2ecf20Sopenharmony_ci 16258c2ecf20Sopenharmony_ci /* Assign ages to the trees. */ 16268c2ecf20Sopenharmony_ci count = trees_count; 16278c2ecf20Sopenharmony_ci nextage = (unsigned int)-1; 16288c2ecf20Sopenharmony_ci do { 16298c2ecf20Sopenharmony_ci maxage = nextage; 16308c2ecf20Sopenharmony_ci trees[--count].maxage = maxage; 16318c2ecf20Sopenharmony_ci trees[--count].maxage = maxage; 16328c2ecf20Sopenharmony_ci nextage = 0; 16338c2ecf20Sopenharmony_ci for (i = 0; i <= corrections_count; i++) { 16348c2ecf20Sopenharmony_ci data = &corrections[i]; 16358c2ecf20Sopenharmony_ci if (nextage < data->correction && 16368c2ecf20Sopenharmony_ci data->correction < maxage) 16378c2ecf20Sopenharmony_ci nextage = data->correction; 16388c2ecf20Sopenharmony_ci } 16398c2ecf20Sopenharmony_ci } while (nextage); 16408c2ecf20Sopenharmony_ci 16418c2ecf20Sopenharmony_ci /* The ages assigned above are off by one. */ 16428c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) { 16438c2ecf20Sopenharmony_ci j = 0; 16448c2ecf20Sopenharmony_ci while (ages[j] < trees[i].maxage) 16458c2ecf20Sopenharmony_ci j++; 16468c2ecf20Sopenharmony_ci trees[i].maxage = ages[j-1]; 16478c2ecf20Sopenharmony_ci } 16488c2ecf20Sopenharmony_ci 16498c2ecf20Sopenharmony_ci /* Set up the forwarding between trees. */ 16508c2ecf20Sopenharmony_ci trees[trees_count-2].next = &trees[trees_count-1]; 16518c2ecf20Sopenharmony_ci trees[trees_count-1].leaf_mark = nfdi_mark; 16528c2ecf20Sopenharmony_ci trees[trees_count-2].leaf_mark = nfdicf_mark; 16538c2ecf20Sopenharmony_ci for (i = 0; i != trees_count-2; i += 2) { 16548c2ecf20Sopenharmony_ci trees[i].next = &trees[trees_count-2]; 16558c2ecf20Sopenharmony_ci trees[i].leaf_mark = correction_mark; 16568c2ecf20Sopenharmony_ci trees[i+1].next = &trees[trees_count-1]; 16578c2ecf20Sopenharmony_ci trees[i+1].leaf_mark = correction_mark; 16588c2ecf20Sopenharmony_ci } 16598c2ecf20Sopenharmony_ci 16608c2ecf20Sopenharmony_ci /* Assign the callouts. */ 16618c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i += 2) { 16628c2ecf20Sopenharmony_ci trees[i].type = "nfdicf"; 16638c2ecf20Sopenharmony_ci trees[i].leaf_equal = nfdicf_equal; 16648c2ecf20Sopenharmony_ci trees[i].leaf_print = nfdicf_print; 16658c2ecf20Sopenharmony_ci trees[i].leaf_size = nfdicf_size; 16668c2ecf20Sopenharmony_ci trees[i].leaf_index = nfdicf_index; 16678c2ecf20Sopenharmony_ci trees[i].leaf_emit = nfdicf_emit; 16688c2ecf20Sopenharmony_ci 16698c2ecf20Sopenharmony_ci trees[i+1].type = "nfdi"; 16708c2ecf20Sopenharmony_ci trees[i+1].leaf_equal = nfdi_equal; 16718c2ecf20Sopenharmony_ci trees[i+1].leaf_print = nfdi_print; 16728c2ecf20Sopenharmony_ci trees[i+1].leaf_size = nfdi_size; 16738c2ecf20Sopenharmony_ci trees[i+1].leaf_index = nfdi_index; 16748c2ecf20Sopenharmony_ci trees[i+1].leaf_emit = nfdi_emit; 16758c2ecf20Sopenharmony_ci } 16768c2ecf20Sopenharmony_ci 16778c2ecf20Sopenharmony_ci /* Finish init. */ 16788c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) 16798c2ecf20Sopenharmony_ci trees[i].childnode = NODE; 16808c2ecf20Sopenharmony_ci} 16818c2ecf20Sopenharmony_ci 16828c2ecf20Sopenharmony_cistatic void trees_populate(void) 16838c2ecf20Sopenharmony_ci{ 16848c2ecf20Sopenharmony_ci struct unicode_data *data; 16858c2ecf20Sopenharmony_ci unsigned int unichar; 16868c2ecf20Sopenharmony_ci char keyval[4]; 16878c2ecf20Sopenharmony_ci int keylen; 16888c2ecf20Sopenharmony_ci int i; 16898c2ecf20Sopenharmony_ci 16908c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) { 16918c2ecf20Sopenharmony_ci if (verbose > 0) { 16928c2ecf20Sopenharmony_ci printf("Populating %s_%x\n", 16938c2ecf20Sopenharmony_ci trees[i].type, trees[i].maxage); 16948c2ecf20Sopenharmony_ci } 16958c2ecf20Sopenharmony_ci for (unichar = 0; unichar != 0x110000; unichar++) { 16968c2ecf20Sopenharmony_ci if (unicode_data[unichar].gen < 0) 16978c2ecf20Sopenharmony_ci continue; 16988c2ecf20Sopenharmony_ci keylen = utf8encode(keyval, unichar); 16998c2ecf20Sopenharmony_ci data = corrections_lookup(&unicode_data[unichar]); 17008c2ecf20Sopenharmony_ci if (data->correction <= trees[i].maxage) 17018c2ecf20Sopenharmony_ci data = &unicode_data[unichar]; 17028c2ecf20Sopenharmony_ci insert(&trees[i], keyval, keylen, data); 17038c2ecf20Sopenharmony_ci } 17048c2ecf20Sopenharmony_ci } 17058c2ecf20Sopenharmony_ci} 17068c2ecf20Sopenharmony_ci 17078c2ecf20Sopenharmony_cistatic void trees_reduce(void) 17088c2ecf20Sopenharmony_ci{ 17098c2ecf20Sopenharmony_ci int i; 17108c2ecf20Sopenharmony_ci int size; 17118c2ecf20Sopenharmony_ci int changed; 17128c2ecf20Sopenharmony_ci 17138c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) 17148c2ecf20Sopenharmony_ci prune(&trees[i]); 17158c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) 17168c2ecf20Sopenharmony_ci mark_nodes(&trees[i]); 17178c2ecf20Sopenharmony_ci do { 17188c2ecf20Sopenharmony_ci size = 0; 17198c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) 17208c2ecf20Sopenharmony_ci size = index_nodes(&trees[i], size); 17218c2ecf20Sopenharmony_ci changed = 0; 17228c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) 17238c2ecf20Sopenharmony_ci changed += size_nodes(&trees[i]); 17248c2ecf20Sopenharmony_ci } while (changed); 17258c2ecf20Sopenharmony_ci 17268c2ecf20Sopenharmony_ci utf8data = calloc(size, 1); 17278c2ecf20Sopenharmony_ci utf8data_size = size; 17288c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) 17298c2ecf20Sopenharmony_ci emit(&trees[i], utf8data); 17308c2ecf20Sopenharmony_ci 17318c2ecf20Sopenharmony_ci if (verbose > 0) { 17328c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) { 17338c2ecf20Sopenharmony_ci printf("%s_%x idx %d\n", 17348c2ecf20Sopenharmony_ci trees[i].type, trees[i].maxage, trees[i].index); 17358c2ecf20Sopenharmony_ci } 17368c2ecf20Sopenharmony_ci } 17378c2ecf20Sopenharmony_ci 17388c2ecf20Sopenharmony_ci nfdi = utf8data + trees[trees_count-1].index; 17398c2ecf20Sopenharmony_ci nfdicf = utf8data + trees[trees_count-2].index; 17408c2ecf20Sopenharmony_ci 17418c2ecf20Sopenharmony_ci nfdi_tree = &trees[trees_count-1]; 17428c2ecf20Sopenharmony_ci nfdicf_tree = &trees[trees_count-2]; 17438c2ecf20Sopenharmony_ci} 17448c2ecf20Sopenharmony_ci 17458c2ecf20Sopenharmony_cistatic void verify(struct tree *tree) 17468c2ecf20Sopenharmony_ci{ 17478c2ecf20Sopenharmony_ci struct unicode_data *data; 17488c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 17498c2ecf20Sopenharmony_ci unsigned int unichar; 17508c2ecf20Sopenharmony_ci char key[4]; 17518c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 17528c2ecf20Sopenharmony_ci int report; 17538c2ecf20Sopenharmony_ci int nocf; 17548c2ecf20Sopenharmony_ci 17558c2ecf20Sopenharmony_ci if (verbose > 0) 17568c2ecf20Sopenharmony_ci printf("Verifying %s_%x\n", tree->type, tree->maxage); 17578c2ecf20Sopenharmony_ci nocf = strcmp(tree->type, "nfdicf"); 17588c2ecf20Sopenharmony_ci 17598c2ecf20Sopenharmony_ci for (unichar = 0; unichar != 0x110000; unichar++) { 17608c2ecf20Sopenharmony_ci report = 0; 17618c2ecf20Sopenharmony_ci data = corrections_lookup(&unicode_data[unichar]); 17628c2ecf20Sopenharmony_ci if (data->correction <= tree->maxage) 17638c2ecf20Sopenharmony_ci data = &unicode_data[unichar]; 17648c2ecf20Sopenharmony_ci utf8encode(key,unichar); 17658c2ecf20Sopenharmony_ci leaf = utf8lookup(tree, hangul, key); 17668c2ecf20Sopenharmony_ci 17678c2ecf20Sopenharmony_ci if (!leaf) { 17688c2ecf20Sopenharmony_ci if (data->gen != -1) 17698c2ecf20Sopenharmony_ci report++; 17708c2ecf20Sopenharmony_ci if (unichar < 0xd800 || unichar > 0xdfff) 17718c2ecf20Sopenharmony_ci report++; 17728c2ecf20Sopenharmony_ci } else { 17738c2ecf20Sopenharmony_ci if (unichar >= 0xd800 && unichar <= 0xdfff) 17748c2ecf20Sopenharmony_ci report++; 17758c2ecf20Sopenharmony_ci if (data->gen == -1) 17768c2ecf20Sopenharmony_ci report++; 17778c2ecf20Sopenharmony_ci if (data->gen != LEAF_GEN(leaf)) 17788c2ecf20Sopenharmony_ci report++; 17798c2ecf20Sopenharmony_ci if (LEAF_CCC(leaf) == DECOMPOSE) { 17808c2ecf20Sopenharmony_ci if (HANGUL_SYLLABLE(data->code)) { 17818c2ecf20Sopenharmony_ci if (data->utf8nfdi[0] != HANGUL) 17828c2ecf20Sopenharmony_ci report++; 17838c2ecf20Sopenharmony_ci } else if (nocf) { 17848c2ecf20Sopenharmony_ci if (!data->utf8nfdi) { 17858c2ecf20Sopenharmony_ci report++; 17868c2ecf20Sopenharmony_ci } else if (strcmp(data->utf8nfdi, 17878c2ecf20Sopenharmony_ci LEAF_STR(leaf))) { 17888c2ecf20Sopenharmony_ci report++; 17898c2ecf20Sopenharmony_ci } 17908c2ecf20Sopenharmony_ci } else { 17918c2ecf20Sopenharmony_ci if (!data->utf8nfdicf && 17928c2ecf20Sopenharmony_ci !data->utf8nfdi) { 17938c2ecf20Sopenharmony_ci report++; 17948c2ecf20Sopenharmony_ci } else if (data->utf8nfdicf) { 17958c2ecf20Sopenharmony_ci if (strcmp(data->utf8nfdicf, 17968c2ecf20Sopenharmony_ci LEAF_STR(leaf))) 17978c2ecf20Sopenharmony_ci report++; 17988c2ecf20Sopenharmony_ci } else if (strcmp(data->utf8nfdi, 17998c2ecf20Sopenharmony_ci LEAF_STR(leaf))) { 18008c2ecf20Sopenharmony_ci report++; 18018c2ecf20Sopenharmony_ci } 18028c2ecf20Sopenharmony_ci } 18038c2ecf20Sopenharmony_ci } else if (data->ccc != LEAF_CCC(leaf)) { 18048c2ecf20Sopenharmony_ci report++; 18058c2ecf20Sopenharmony_ci } 18068c2ecf20Sopenharmony_ci } 18078c2ecf20Sopenharmony_ci if (report) { 18088c2ecf20Sopenharmony_ci printf("%X code %X gen %d ccc %d" 18098c2ecf20Sopenharmony_ci " nfdi -> \"%s\"", 18108c2ecf20Sopenharmony_ci unichar, data->code, data->gen, 18118c2ecf20Sopenharmony_ci data->ccc, 18128c2ecf20Sopenharmony_ci data->utf8nfdi); 18138c2ecf20Sopenharmony_ci if (leaf) { 18148c2ecf20Sopenharmony_ci printf(" gen %d ccc %d" 18158c2ecf20Sopenharmony_ci " nfdi -> \"%s\"", 18168c2ecf20Sopenharmony_ci LEAF_GEN(leaf), 18178c2ecf20Sopenharmony_ci LEAF_CCC(leaf), 18188c2ecf20Sopenharmony_ci LEAF_CCC(leaf) == DECOMPOSE ? 18198c2ecf20Sopenharmony_ci LEAF_STR(leaf) : ""); 18208c2ecf20Sopenharmony_ci } 18218c2ecf20Sopenharmony_ci printf("\n"); 18228c2ecf20Sopenharmony_ci } 18238c2ecf20Sopenharmony_ci } 18248c2ecf20Sopenharmony_ci} 18258c2ecf20Sopenharmony_ci 18268c2ecf20Sopenharmony_cistatic void trees_verify(void) 18278c2ecf20Sopenharmony_ci{ 18288c2ecf20Sopenharmony_ci int i; 18298c2ecf20Sopenharmony_ci 18308c2ecf20Sopenharmony_ci for (i = 0; i != trees_count; i++) 18318c2ecf20Sopenharmony_ci verify(&trees[i]); 18328c2ecf20Sopenharmony_ci} 18338c2ecf20Sopenharmony_ci 18348c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 18358c2ecf20Sopenharmony_ci 18368c2ecf20Sopenharmony_cistatic void help(void) 18378c2ecf20Sopenharmony_ci{ 18388c2ecf20Sopenharmony_ci printf("Usage: %s [options]\n", argv0); 18398c2ecf20Sopenharmony_ci printf("\n"); 18408c2ecf20Sopenharmony_ci printf("This program creates an a data trie used for parsing and\n"); 18418c2ecf20Sopenharmony_ci printf("normalization of UTF-8 strings. The trie is derived from\n"); 18428c2ecf20Sopenharmony_ci printf("a set of input files from the Unicode character database\n"); 18438c2ecf20Sopenharmony_ci printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n"); 18448c2ecf20Sopenharmony_ci printf("\n"); 18458c2ecf20Sopenharmony_ci printf("The generated tree supports two normalization forms:\n"); 18468c2ecf20Sopenharmony_ci printf("\n"); 18478c2ecf20Sopenharmony_ci printf("\tnfdi:\n"); 18488c2ecf20Sopenharmony_ci printf("\t- Apply unicode normalization form NFD.\n"); 18498c2ecf20Sopenharmony_ci printf("\t- Remove any Default_Ignorable_Code_Point.\n"); 18508c2ecf20Sopenharmony_ci printf("\n"); 18518c2ecf20Sopenharmony_ci printf("\tnfdicf:\n"); 18528c2ecf20Sopenharmony_ci printf("\t- Apply unicode normalization form NFD.\n"); 18538c2ecf20Sopenharmony_ci printf("\t- Remove any Default_Ignorable_Code_Point.\n"); 18548c2ecf20Sopenharmony_ci printf("\t- Apply a full casefold (C + F).\n"); 18558c2ecf20Sopenharmony_ci printf("\n"); 18568c2ecf20Sopenharmony_ci printf("These forms were chosen as being most useful when dealing\n"); 18578c2ecf20Sopenharmony_ci printf("with file names: NFD catches most cases where characters\n"); 18588c2ecf20Sopenharmony_ci printf("should be considered equivalent. The ignorables are mostly\n"); 18598c2ecf20Sopenharmony_ci printf("invisible, making names hard to type.\n"); 18608c2ecf20Sopenharmony_ci printf("\n"); 18618c2ecf20Sopenharmony_ci printf("The options to specify the files to be used are listed\n"); 18628c2ecf20Sopenharmony_ci printf("below with their default values, which are the names used\n"); 18638c2ecf20Sopenharmony_ci printf("by version 11.0.0 of the Unicode Character Database.\n"); 18648c2ecf20Sopenharmony_ci printf("\n"); 18658c2ecf20Sopenharmony_ci printf("The input files:\n"); 18668c2ecf20Sopenharmony_ci printf("\t-a %s\n", AGE_NAME); 18678c2ecf20Sopenharmony_ci printf("\t-c %s\n", CCC_NAME); 18688c2ecf20Sopenharmony_ci printf("\t-p %s\n", PROP_NAME); 18698c2ecf20Sopenharmony_ci printf("\t-d %s\n", DATA_NAME); 18708c2ecf20Sopenharmony_ci printf("\t-f %s\n", FOLD_NAME); 18718c2ecf20Sopenharmony_ci printf("\t-n %s\n", NORM_NAME); 18728c2ecf20Sopenharmony_ci printf("\n"); 18738c2ecf20Sopenharmony_ci printf("Additionally, the generated tables are tested using:\n"); 18748c2ecf20Sopenharmony_ci printf("\t-t %s\n", TEST_NAME); 18758c2ecf20Sopenharmony_ci printf("\n"); 18768c2ecf20Sopenharmony_ci printf("Finally, the output file:\n"); 18778c2ecf20Sopenharmony_ci printf("\t-o %s\n", UTF8_NAME); 18788c2ecf20Sopenharmony_ci printf("\n"); 18798c2ecf20Sopenharmony_ci} 18808c2ecf20Sopenharmony_ci 18818c2ecf20Sopenharmony_cistatic void usage(void) 18828c2ecf20Sopenharmony_ci{ 18838c2ecf20Sopenharmony_ci help(); 18848c2ecf20Sopenharmony_ci exit(1); 18858c2ecf20Sopenharmony_ci} 18868c2ecf20Sopenharmony_ci 18878c2ecf20Sopenharmony_cistatic void open_fail(const char *name, int error) 18888c2ecf20Sopenharmony_ci{ 18898c2ecf20Sopenharmony_ci printf("Error %d opening %s: %s\n", error, name, strerror(error)); 18908c2ecf20Sopenharmony_ci exit(1); 18918c2ecf20Sopenharmony_ci} 18928c2ecf20Sopenharmony_ci 18938c2ecf20Sopenharmony_cistatic void file_fail(const char *filename) 18948c2ecf20Sopenharmony_ci{ 18958c2ecf20Sopenharmony_ci printf("Error parsing %s\n", filename); 18968c2ecf20Sopenharmony_ci exit(1); 18978c2ecf20Sopenharmony_ci} 18988c2ecf20Sopenharmony_ci 18998c2ecf20Sopenharmony_cistatic void line_fail(const char *filename, const char *line) 19008c2ecf20Sopenharmony_ci{ 19018c2ecf20Sopenharmony_ci printf("Error parsing %s:%s\n", filename, line); 19028c2ecf20Sopenharmony_ci exit(1); 19038c2ecf20Sopenharmony_ci} 19048c2ecf20Sopenharmony_ci 19058c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 19068c2ecf20Sopenharmony_ci 19078c2ecf20Sopenharmony_cistatic void print_utf32(unsigned int *utf32str) 19088c2ecf20Sopenharmony_ci{ 19098c2ecf20Sopenharmony_ci int i; 19108c2ecf20Sopenharmony_ci 19118c2ecf20Sopenharmony_ci for (i = 0; utf32str[i]; i++) 19128c2ecf20Sopenharmony_ci printf(" %X", utf32str[i]); 19138c2ecf20Sopenharmony_ci} 19148c2ecf20Sopenharmony_ci 19158c2ecf20Sopenharmony_cistatic void print_utf32nfdi(unsigned int unichar) 19168c2ecf20Sopenharmony_ci{ 19178c2ecf20Sopenharmony_ci printf(" %X ->", unichar); 19188c2ecf20Sopenharmony_ci print_utf32(unicode_data[unichar].utf32nfdi); 19198c2ecf20Sopenharmony_ci printf("\n"); 19208c2ecf20Sopenharmony_ci} 19218c2ecf20Sopenharmony_ci 19228c2ecf20Sopenharmony_cistatic void print_utf32nfdicf(unsigned int unichar) 19238c2ecf20Sopenharmony_ci{ 19248c2ecf20Sopenharmony_ci printf(" %X ->", unichar); 19258c2ecf20Sopenharmony_ci print_utf32(unicode_data[unichar].utf32nfdicf); 19268c2ecf20Sopenharmony_ci printf("\n"); 19278c2ecf20Sopenharmony_ci} 19288c2ecf20Sopenharmony_ci 19298c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 19308c2ecf20Sopenharmony_ci 19318c2ecf20Sopenharmony_cistatic void age_init(void) 19328c2ecf20Sopenharmony_ci{ 19338c2ecf20Sopenharmony_ci FILE *file; 19348c2ecf20Sopenharmony_ci unsigned int first; 19358c2ecf20Sopenharmony_ci unsigned int last; 19368c2ecf20Sopenharmony_ci unsigned int unichar; 19378c2ecf20Sopenharmony_ci unsigned int major; 19388c2ecf20Sopenharmony_ci unsigned int minor; 19398c2ecf20Sopenharmony_ci unsigned int revision; 19408c2ecf20Sopenharmony_ci int gen; 19418c2ecf20Sopenharmony_ci int count; 19428c2ecf20Sopenharmony_ci int ret; 19438c2ecf20Sopenharmony_ci 19448c2ecf20Sopenharmony_ci if (verbose > 0) 19458c2ecf20Sopenharmony_ci printf("Parsing %s\n", age_name); 19468c2ecf20Sopenharmony_ci 19478c2ecf20Sopenharmony_ci file = fopen(age_name, "r"); 19488c2ecf20Sopenharmony_ci if (!file) 19498c2ecf20Sopenharmony_ci open_fail(age_name, errno); 19508c2ecf20Sopenharmony_ci count = 0; 19518c2ecf20Sopenharmony_ci 19528c2ecf20Sopenharmony_ci gen = 0; 19538c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 19548c2ecf20Sopenharmony_ci ret = sscanf(line, "# Age=V%d_%d_%d", 19558c2ecf20Sopenharmony_ci &major, &minor, &revision); 19568c2ecf20Sopenharmony_ci if (ret == 3) { 19578c2ecf20Sopenharmony_ci ages_count++; 19588c2ecf20Sopenharmony_ci if (verbose > 1) 19598c2ecf20Sopenharmony_ci printf(" Age V%d_%d_%d\n", 19608c2ecf20Sopenharmony_ci major, minor, revision); 19618c2ecf20Sopenharmony_ci if (!age_valid(major, minor, revision)) 19628c2ecf20Sopenharmony_ci line_fail(age_name, line); 19638c2ecf20Sopenharmony_ci continue; 19648c2ecf20Sopenharmony_ci } 19658c2ecf20Sopenharmony_ci ret = sscanf(line, "# Age=V%d_%d", &major, &minor); 19668c2ecf20Sopenharmony_ci if (ret == 2) { 19678c2ecf20Sopenharmony_ci ages_count++; 19688c2ecf20Sopenharmony_ci if (verbose > 1) 19698c2ecf20Sopenharmony_ci printf(" Age V%d_%d\n", major, minor); 19708c2ecf20Sopenharmony_ci if (!age_valid(major, minor, 0)) 19718c2ecf20Sopenharmony_ci line_fail(age_name, line); 19728c2ecf20Sopenharmony_ci continue; 19738c2ecf20Sopenharmony_ci } 19748c2ecf20Sopenharmony_ci } 19758c2ecf20Sopenharmony_ci 19768c2ecf20Sopenharmony_ci /* We must have found something above. */ 19778c2ecf20Sopenharmony_ci if (verbose > 1) 19788c2ecf20Sopenharmony_ci printf("%d age entries\n", ages_count); 19798c2ecf20Sopenharmony_ci if (ages_count == 0 || ages_count > MAXGEN) 19808c2ecf20Sopenharmony_ci file_fail(age_name); 19818c2ecf20Sopenharmony_ci 19828c2ecf20Sopenharmony_ci /* There is a 0 entry. */ 19838c2ecf20Sopenharmony_ci ages_count++; 19848c2ecf20Sopenharmony_ci ages = calloc(ages_count + 1, sizeof(*ages)); 19858c2ecf20Sopenharmony_ci /* And a guard entry. */ 19868c2ecf20Sopenharmony_ci ages[ages_count] = (unsigned int)-1; 19878c2ecf20Sopenharmony_ci 19888c2ecf20Sopenharmony_ci rewind(file); 19898c2ecf20Sopenharmony_ci count = 0; 19908c2ecf20Sopenharmony_ci gen = 0; 19918c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 19928c2ecf20Sopenharmony_ci ret = sscanf(line, "# Age=V%d_%d_%d", 19938c2ecf20Sopenharmony_ci &major, &minor, &revision); 19948c2ecf20Sopenharmony_ci if (ret == 3) { 19958c2ecf20Sopenharmony_ci ages[++gen] = 19968c2ecf20Sopenharmony_ci UNICODE_AGE(major, minor, revision); 19978c2ecf20Sopenharmony_ci if (verbose > 1) 19988c2ecf20Sopenharmony_ci printf(" Age V%d_%d_%d = gen %d\n", 19998c2ecf20Sopenharmony_ci major, minor, revision, gen); 20008c2ecf20Sopenharmony_ci if (!age_valid(major, minor, revision)) 20018c2ecf20Sopenharmony_ci line_fail(age_name, line); 20028c2ecf20Sopenharmony_ci continue; 20038c2ecf20Sopenharmony_ci } 20048c2ecf20Sopenharmony_ci ret = sscanf(line, "# Age=V%d_%d", &major, &minor); 20058c2ecf20Sopenharmony_ci if (ret == 2) { 20068c2ecf20Sopenharmony_ci ages[++gen] = UNICODE_AGE(major, minor, 0); 20078c2ecf20Sopenharmony_ci if (verbose > 1) 20088c2ecf20Sopenharmony_ci printf(" Age V%d_%d = %d\n", 20098c2ecf20Sopenharmony_ci major, minor, gen); 20108c2ecf20Sopenharmony_ci if (!age_valid(major, minor, 0)) 20118c2ecf20Sopenharmony_ci line_fail(age_name, line); 20128c2ecf20Sopenharmony_ci continue; 20138c2ecf20Sopenharmony_ci } 20148c2ecf20Sopenharmony_ci ret = sscanf(line, "%X..%X ; %d.%d #", 20158c2ecf20Sopenharmony_ci &first, &last, &major, &minor); 20168c2ecf20Sopenharmony_ci if (ret == 4) { 20178c2ecf20Sopenharmony_ci for (unichar = first; unichar <= last; unichar++) 20188c2ecf20Sopenharmony_ci unicode_data[unichar].gen = gen; 20198c2ecf20Sopenharmony_ci count += 1 + last - first; 20208c2ecf20Sopenharmony_ci if (verbose > 1) 20218c2ecf20Sopenharmony_ci printf(" %X..%X gen %d\n", first, last, gen); 20228c2ecf20Sopenharmony_ci if (!utf32valid(first) || !utf32valid(last)) 20238c2ecf20Sopenharmony_ci line_fail(age_name, line); 20248c2ecf20Sopenharmony_ci continue; 20258c2ecf20Sopenharmony_ci } 20268c2ecf20Sopenharmony_ci ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); 20278c2ecf20Sopenharmony_ci if (ret == 3) { 20288c2ecf20Sopenharmony_ci unicode_data[unichar].gen = gen; 20298c2ecf20Sopenharmony_ci count++; 20308c2ecf20Sopenharmony_ci if (verbose > 1) 20318c2ecf20Sopenharmony_ci printf(" %X gen %d\n", unichar, gen); 20328c2ecf20Sopenharmony_ci if (!utf32valid(unichar)) 20338c2ecf20Sopenharmony_ci line_fail(age_name, line); 20348c2ecf20Sopenharmony_ci continue; 20358c2ecf20Sopenharmony_ci } 20368c2ecf20Sopenharmony_ci } 20378c2ecf20Sopenharmony_ci unicode_maxage = ages[gen]; 20388c2ecf20Sopenharmony_ci fclose(file); 20398c2ecf20Sopenharmony_ci 20408c2ecf20Sopenharmony_ci /* Nix surrogate block */ 20418c2ecf20Sopenharmony_ci if (verbose > 1) 20428c2ecf20Sopenharmony_ci printf(" Removing surrogate block D800..DFFF\n"); 20438c2ecf20Sopenharmony_ci for (unichar = 0xd800; unichar <= 0xdfff; unichar++) 20448c2ecf20Sopenharmony_ci unicode_data[unichar].gen = -1; 20458c2ecf20Sopenharmony_ci 20468c2ecf20Sopenharmony_ci if (verbose > 0) 20478c2ecf20Sopenharmony_ci printf("Found %d entries\n", count); 20488c2ecf20Sopenharmony_ci if (count == 0) 20498c2ecf20Sopenharmony_ci file_fail(age_name); 20508c2ecf20Sopenharmony_ci} 20518c2ecf20Sopenharmony_ci 20528c2ecf20Sopenharmony_cistatic void ccc_init(void) 20538c2ecf20Sopenharmony_ci{ 20548c2ecf20Sopenharmony_ci FILE *file; 20558c2ecf20Sopenharmony_ci unsigned int first; 20568c2ecf20Sopenharmony_ci unsigned int last; 20578c2ecf20Sopenharmony_ci unsigned int unichar; 20588c2ecf20Sopenharmony_ci unsigned int value; 20598c2ecf20Sopenharmony_ci int count; 20608c2ecf20Sopenharmony_ci int ret; 20618c2ecf20Sopenharmony_ci 20628c2ecf20Sopenharmony_ci if (verbose > 0) 20638c2ecf20Sopenharmony_ci printf("Parsing %s\n", ccc_name); 20648c2ecf20Sopenharmony_ci 20658c2ecf20Sopenharmony_ci file = fopen(ccc_name, "r"); 20668c2ecf20Sopenharmony_ci if (!file) 20678c2ecf20Sopenharmony_ci open_fail(ccc_name, errno); 20688c2ecf20Sopenharmony_ci 20698c2ecf20Sopenharmony_ci count = 0; 20708c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 20718c2ecf20Sopenharmony_ci ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value); 20728c2ecf20Sopenharmony_ci if (ret == 3) { 20738c2ecf20Sopenharmony_ci for (unichar = first; unichar <= last; unichar++) { 20748c2ecf20Sopenharmony_ci unicode_data[unichar].ccc = value; 20758c2ecf20Sopenharmony_ci count++; 20768c2ecf20Sopenharmony_ci } 20778c2ecf20Sopenharmony_ci if (verbose > 1) 20788c2ecf20Sopenharmony_ci printf(" %X..%X ccc %d\n", first, last, value); 20798c2ecf20Sopenharmony_ci if (!utf32valid(first) || !utf32valid(last)) 20808c2ecf20Sopenharmony_ci line_fail(ccc_name, line); 20818c2ecf20Sopenharmony_ci continue; 20828c2ecf20Sopenharmony_ci } 20838c2ecf20Sopenharmony_ci ret = sscanf(line, "%X ; %d #", &unichar, &value); 20848c2ecf20Sopenharmony_ci if (ret == 2) { 20858c2ecf20Sopenharmony_ci unicode_data[unichar].ccc = value; 20868c2ecf20Sopenharmony_ci count++; 20878c2ecf20Sopenharmony_ci if (verbose > 1) 20888c2ecf20Sopenharmony_ci printf(" %X ccc %d\n", unichar, value); 20898c2ecf20Sopenharmony_ci if (!utf32valid(unichar)) 20908c2ecf20Sopenharmony_ci line_fail(ccc_name, line); 20918c2ecf20Sopenharmony_ci continue; 20928c2ecf20Sopenharmony_ci } 20938c2ecf20Sopenharmony_ci } 20948c2ecf20Sopenharmony_ci fclose(file); 20958c2ecf20Sopenharmony_ci 20968c2ecf20Sopenharmony_ci if (verbose > 0) 20978c2ecf20Sopenharmony_ci printf("Found %d entries\n", count); 20988c2ecf20Sopenharmony_ci if (count == 0) 20998c2ecf20Sopenharmony_ci file_fail(ccc_name); 21008c2ecf20Sopenharmony_ci} 21018c2ecf20Sopenharmony_ci 21028c2ecf20Sopenharmony_cistatic int ignore_compatibility_form(char *type) 21038c2ecf20Sopenharmony_ci{ 21048c2ecf20Sopenharmony_ci int i; 21058c2ecf20Sopenharmony_ci char *ignored_types[] = {"font", "noBreak", "initial", "medial", 21068c2ecf20Sopenharmony_ci "final", "isolated", "circle", "super", 21078c2ecf20Sopenharmony_ci "sub", "vertical", "wide", "narrow", 21088c2ecf20Sopenharmony_ci "small", "square", "fraction", "compat"}; 21098c2ecf20Sopenharmony_ci 21108c2ecf20Sopenharmony_ci for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) 21118c2ecf20Sopenharmony_ci if (strcmp(type, ignored_types[i]) == 0) 21128c2ecf20Sopenharmony_ci return 1; 21138c2ecf20Sopenharmony_ci return 0; 21148c2ecf20Sopenharmony_ci} 21158c2ecf20Sopenharmony_ci 21168c2ecf20Sopenharmony_cistatic void nfdi_init(void) 21178c2ecf20Sopenharmony_ci{ 21188c2ecf20Sopenharmony_ci FILE *file; 21198c2ecf20Sopenharmony_ci unsigned int unichar; 21208c2ecf20Sopenharmony_ci unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 21218c2ecf20Sopenharmony_ci char *s; 21228c2ecf20Sopenharmony_ci char *type; 21238c2ecf20Sopenharmony_ci unsigned int *um; 21248c2ecf20Sopenharmony_ci int count; 21258c2ecf20Sopenharmony_ci int i; 21268c2ecf20Sopenharmony_ci int ret; 21278c2ecf20Sopenharmony_ci 21288c2ecf20Sopenharmony_ci if (verbose > 0) 21298c2ecf20Sopenharmony_ci printf("Parsing %s\n", data_name); 21308c2ecf20Sopenharmony_ci file = fopen(data_name, "r"); 21318c2ecf20Sopenharmony_ci if (!file) 21328c2ecf20Sopenharmony_ci open_fail(data_name, errno); 21338c2ecf20Sopenharmony_ci 21348c2ecf20Sopenharmony_ci count = 0; 21358c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 21368c2ecf20Sopenharmony_ci ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];", 21378c2ecf20Sopenharmony_ci &unichar, buf0); 21388c2ecf20Sopenharmony_ci if (ret != 2) 21398c2ecf20Sopenharmony_ci continue; 21408c2ecf20Sopenharmony_ci if (!utf32valid(unichar)) 21418c2ecf20Sopenharmony_ci line_fail(data_name, line); 21428c2ecf20Sopenharmony_ci 21438c2ecf20Sopenharmony_ci s = buf0; 21448c2ecf20Sopenharmony_ci /* skip over <tag> */ 21458c2ecf20Sopenharmony_ci if (*s == '<') { 21468c2ecf20Sopenharmony_ci type = ++s; 21478c2ecf20Sopenharmony_ci while (*++s != '>'); 21488c2ecf20Sopenharmony_ci *s++ = '\0'; 21498c2ecf20Sopenharmony_ci if(ignore_compatibility_form(type)) 21508c2ecf20Sopenharmony_ci continue; 21518c2ecf20Sopenharmony_ci } 21528c2ecf20Sopenharmony_ci /* decode the decomposition into UTF-32 */ 21538c2ecf20Sopenharmony_ci i = 0; 21548c2ecf20Sopenharmony_ci while (*s) { 21558c2ecf20Sopenharmony_ci mapping[i] = strtoul(s, &s, 16); 21568c2ecf20Sopenharmony_ci if (!utf32valid(mapping[i])) 21578c2ecf20Sopenharmony_ci line_fail(data_name, line); 21588c2ecf20Sopenharmony_ci i++; 21598c2ecf20Sopenharmony_ci } 21608c2ecf20Sopenharmony_ci mapping[i++] = 0; 21618c2ecf20Sopenharmony_ci 21628c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 21638c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 21648c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdi = um; 21658c2ecf20Sopenharmony_ci 21668c2ecf20Sopenharmony_ci if (verbose > 1) 21678c2ecf20Sopenharmony_ci print_utf32nfdi(unichar); 21688c2ecf20Sopenharmony_ci count++; 21698c2ecf20Sopenharmony_ci } 21708c2ecf20Sopenharmony_ci fclose(file); 21718c2ecf20Sopenharmony_ci if (verbose > 0) 21728c2ecf20Sopenharmony_ci printf("Found %d entries\n", count); 21738c2ecf20Sopenharmony_ci if (count == 0) 21748c2ecf20Sopenharmony_ci file_fail(data_name); 21758c2ecf20Sopenharmony_ci} 21768c2ecf20Sopenharmony_ci 21778c2ecf20Sopenharmony_cistatic void nfdicf_init(void) 21788c2ecf20Sopenharmony_ci{ 21798c2ecf20Sopenharmony_ci FILE *file; 21808c2ecf20Sopenharmony_ci unsigned int unichar; 21818c2ecf20Sopenharmony_ci unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 21828c2ecf20Sopenharmony_ci char status; 21838c2ecf20Sopenharmony_ci char *s; 21848c2ecf20Sopenharmony_ci unsigned int *um; 21858c2ecf20Sopenharmony_ci int i; 21868c2ecf20Sopenharmony_ci int count; 21878c2ecf20Sopenharmony_ci int ret; 21888c2ecf20Sopenharmony_ci 21898c2ecf20Sopenharmony_ci if (verbose > 0) 21908c2ecf20Sopenharmony_ci printf("Parsing %s\n", fold_name); 21918c2ecf20Sopenharmony_ci file = fopen(fold_name, "r"); 21928c2ecf20Sopenharmony_ci if (!file) 21938c2ecf20Sopenharmony_ci open_fail(fold_name, errno); 21948c2ecf20Sopenharmony_ci 21958c2ecf20Sopenharmony_ci count = 0; 21968c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 21978c2ecf20Sopenharmony_ci ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0); 21988c2ecf20Sopenharmony_ci if (ret != 3) 21998c2ecf20Sopenharmony_ci continue; 22008c2ecf20Sopenharmony_ci if (!utf32valid(unichar)) 22018c2ecf20Sopenharmony_ci line_fail(fold_name, line); 22028c2ecf20Sopenharmony_ci /* Use the C+F casefold. */ 22038c2ecf20Sopenharmony_ci if (status != 'C' && status != 'F') 22048c2ecf20Sopenharmony_ci continue; 22058c2ecf20Sopenharmony_ci s = buf0; 22068c2ecf20Sopenharmony_ci if (*s == '<') 22078c2ecf20Sopenharmony_ci while (*s++ != ' ') 22088c2ecf20Sopenharmony_ci ; 22098c2ecf20Sopenharmony_ci i = 0; 22108c2ecf20Sopenharmony_ci while (*s) { 22118c2ecf20Sopenharmony_ci mapping[i] = strtoul(s, &s, 16); 22128c2ecf20Sopenharmony_ci if (!utf32valid(mapping[i])) 22138c2ecf20Sopenharmony_ci line_fail(fold_name, line); 22148c2ecf20Sopenharmony_ci i++; 22158c2ecf20Sopenharmony_ci } 22168c2ecf20Sopenharmony_ci mapping[i++] = 0; 22178c2ecf20Sopenharmony_ci 22188c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 22198c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 22208c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdicf = um; 22218c2ecf20Sopenharmony_ci 22228c2ecf20Sopenharmony_ci if (verbose > 1) 22238c2ecf20Sopenharmony_ci print_utf32nfdicf(unichar); 22248c2ecf20Sopenharmony_ci count++; 22258c2ecf20Sopenharmony_ci } 22268c2ecf20Sopenharmony_ci fclose(file); 22278c2ecf20Sopenharmony_ci if (verbose > 0) 22288c2ecf20Sopenharmony_ci printf("Found %d entries\n", count); 22298c2ecf20Sopenharmony_ci if (count == 0) 22308c2ecf20Sopenharmony_ci file_fail(fold_name); 22318c2ecf20Sopenharmony_ci} 22328c2ecf20Sopenharmony_ci 22338c2ecf20Sopenharmony_cistatic void ignore_init(void) 22348c2ecf20Sopenharmony_ci{ 22358c2ecf20Sopenharmony_ci FILE *file; 22368c2ecf20Sopenharmony_ci unsigned int unichar; 22378c2ecf20Sopenharmony_ci unsigned int first; 22388c2ecf20Sopenharmony_ci unsigned int last; 22398c2ecf20Sopenharmony_ci unsigned int *um; 22408c2ecf20Sopenharmony_ci int count; 22418c2ecf20Sopenharmony_ci int ret; 22428c2ecf20Sopenharmony_ci 22438c2ecf20Sopenharmony_ci if (verbose > 0) 22448c2ecf20Sopenharmony_ci printf("Parsing %s\n", prop_name); 22458c2ecf20Sopenharmony_ci file = fopen(prop_name, "r"); 22468c2ecf20Sopenharmony_ci if (!file) 22478c2ecf20Sopenharmony_ci open_fail(prop_name, errno); 22488c2ecf20Sopenharmony_ci assert(file); 22498c2ecf20Sopenharmony_ci count = 0; 22508c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 22518c2ecf20Sopenharmony_ci ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0); 22528c2ecf20Sopenharmony_ci if (ret == 3) { 22538c2ecf20Sopenharmony_ci if (strcmp(buf0, "Default_Ignorable_Code_Point")) 22548c2ecf20Sopenharmony_ci continue; 22558c2ecf20Sopenharmony_ci if (!utf32valid(first) || !utf32valid(last)) 22568c2ecf20Sopenharmony_ci line_fail(prop_name, line); 22578c2ecf20Sopenharmony_ci for (unichar = first; unichar <= last; unichar++) { 22588c2ecf20Sopenharmony_ci free(unicode_data[unichar].utf32nfdi); 22598c2ecf20Sopenharmony_ci um = malloc(sizeof(unsigned int)); 22608c2ecf20Sopenharmony_ci *um = 0; 22618c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdi = um; 22628c2ecf20Sopenharmony_ci free(unicode_data[unichar].utf32nfdicf); 22638c2ecf20Sopenharmony_ci um = malloc(sizeof(unsigned int)); 22648c2ecf20Sopenharmony_ci *um = 0; 22658c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdicf = um; 22668c2ecf20Sopenharmony_ci count++; 22678c2ecf20Sopenharmony_ci } 22688c2ecf20Sopenharmony_ci if (verbose > 1) 22698c2ecf20Sopenharmony_ci printf(" %X..%X Default_Ignorable_Code_Point\n", 22708c2ecf20Sopenharmony_ci first, last); 22718c2ecf20Sopenharmony_ci continue; 22728c2ecf20Sopenharmony_ci } 22738c2ecf20Sopenharmony_ci ret = sscanf(line, "%X ; %s # ", &unichar, buf0); 22748c2ecf20Sopenharmony_ci if (ret == 2) { 22758c2ecf20Sopenharmony_ci if (strcmp(buf0, "Default_Ignorable_Code_Point")) 22768c2ecf20Sopenharmony_ci continue; 22778c2ecf20Sopenharmony_ci if (!utf32valid(unichar)) 22788c2ecf20Sopenharmony_ci line_fail(prop_name, line); 22798c2ecf20Sopenharmony_ci free(unicode_data[unichar].utf32nfdi); 22808c2ecf20Sopenharmony_ci um = malloc(sizeof(unsigned int)); 22818c2ecf20Sopenharmony_ci *um = 0; 22828c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdi = um; 22838c2ecf20Sopenharmony_ci free(unicode_data[unichar].utf32nfdicf); 22848c2ecf20Sopenharmony_ci um = malloc(sizeof(unsigned int)); 22858c2ecf20Sopenharmony_ci *um = 0; 22868c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdicf = um; 22878c2ecf20Sopenharmony_ci if (verbose > 1) 22888c2ecf20Sopenharmony_ci printf(" %X Default_Ignorable_Code_Point\n", 22898c2ecf20Sopenharmony_ci unichar); 22908c2ecf20Sopenharmony_ci count++; 22918c2ecf20Sopenharmony_ci continue; 22928c2ecf20Sopenharmony_ci } 22938c2ecf20Sopenharmony_ci } 22948c2ecf20Sopenharmony_ci fclose(file); 22958c2ecf20Sopenharmony_ci 22968c2ecf20Sopenharmony_ci if (verbose > 0) 22978c2ecf20Sopenharmony_ci printf("Found %d entries\n", count); 22988c2ecf20Sopenharmony_ci if (count == 0) 22998c2ecf20Sopenharmony_ci file_fail(prop_name); 23008c2ecf20Sopenharmony_ci} 23018c2ecf20Sopenharmony_ci 23028c2ecf20Sopenharmony_cistatic void corrections_init(void) 23038c2ecf20Sopenharmony_ci{ 23048c2ecf20Sopenharmony_ci FILE *file; 23058c2ecf20Sopenharmony_ci unsigned int unichar; 23068c2ecf20Sopenharmony_ci unsigned int major; 23078c2ecf20Sopenharmony_ci unsigned int minor; 23088c2ecf20Sopenharmony_ci unsigned int revision; 23098c2ecf20Sopenharmony_ci unsigned int age; 23108c2ecf20Sopenharmony_ci unsigned int *um; 23118c2ecf20Sopenharmony_ci unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 23128c2ecf20Sopenharmony_ci char *s; 23138c2ecf20Sopenharmony_ci int i; 23148c2ecf20Sopenharmony_ci int count; 23158c2ecf20Sopenharmony_ci int ret; 23168c2ecf20Sopenharmony_ci 23178c2ecf20Sopenharmony_ci if (verbose > 0) 23188c2ecf20Sopenharmony_ci printf("Parsing %s\n", norm_name); 23198c2ecf20Sopenharmony_ci file = fopen(norm_name, "r"); 23208c2ecf20Sopenharmony_ci if (!file) 23218c2ecf20Sopenharmony_ci open_fail(norm_name, errno); 23228c2ecf20Sopenharmony_ci 23238c2ecf20Sopenharmony_ci count = 0; 23248c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 23258c2ecf20Sopenharmony_ci ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", 23268c2ecf20Sopenharmony_ci &unichar, buf0, buf1, 23278c2ecf20Sopenharmony_ci &major, &minor, &revision); 23288c2ecf20Sopenharmony_ci if (ret != 6) 23298c2ecf20Sopenharmony_ci continue; 23308c2ecf20Sopenharmony_ci if (!utf32valid(unichar) || !age_valid(major, minor, revision)) 23318c2ecf20Sopenharmony_ci line_fail(norm_name, line); 23328c2ecf20Sopenharmony_ci count++; 23338c2ecf20Sopenharmony_ci } 23348c2ecf20Sopenharmony_ci corrections = calloc(count, sizeof(struct unicode_data)); 23358c2ecf20Sopenharmony_ci corrections_count = count; 23368c2ecf20Sopenharmony_ci rewind(file); 23378c2ecf20Sopenharmony_ci 23388c2ecf20Sopenharmony_ci count = 0; 23398c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 23408c2ecf20Sopenharmony_ci ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", 23418c2ecf20Sopenharmony_ci &unichar, buf0, buf1, 23428c2ecf20Sopenharmony_ci &major, &minor, &revision); 23438c2ecf20Sopenharmony_ci if (ret != 6) 23448c2ecf20Sopenharmony_ci continue; 23458c2ecf20Sopenharmony_ci if (!utf32valid(unichar) || !age_valid(major, minor, revision)) 23468c2ecf20Sopenharmony_ci line_fail(norm_name, line); 23478c2ecf20Sopenharmony_ci corrections[count] = unicode_data[unichar]; 23488c2ecf20Sopenharmony_ci assert(corrections[count].code == unichar); 23498c2ecf20Sopenharmony_ci age = UNICODE_AGE(major, minor, revision); 23508c2ecf20Sopenharmony_ci corrections[count].correction = age; 23518c2ecf20Sopenharmony_ci 23528c2ecf20Sopenharmony_ci i = 0; 23538c2ecf20Sopenharmony_ci s = buf0; 23548c2ecf20Sopenharmony_ci while (*s) { 23558c2ecf20Sopenharmony_ci mapping[i] = strtoul(s, &s, 16); 23568c2ecf20Sopenharmony_ci if (!utf32valid(mapping[i])) 23578c2ecf20Sopenharmony_ci line_fail(norm_name, line); 23588c2ecf20Sopenharmony_ci i++; 23598c2ecf20Sopenharmony_ci } 23608c2ecf20Sopenharmony_ci mapping[i++] = 0; 23618c2ecf20Sopenharmony_ci 23628c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 23638c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 23648c2ecf20Sopenharmony_ci corrections[count].utf32nfdi = um; 23658c2ecf20Sopenharmony_ci 23668c2ecf20Sopenharmony_ci if (verbose > 1) 23678c2ecf20Sopenharmony_ci printf(" %X -> %s -> %s V%d_%d_%d\n", 23688c2ecf20Sopenharmony_ci unichar, buf0, buf1, major, minor, revision); 23698c2ecf20Sopenharmony_ci count++; 23708c2ecf20Sopenharmony_ci } 23718c2ecf20Sopenharmony_ci fclose(file); 23728c2ecf20Sopenharmony_ci 23738c2ecf20Sopenharmony_ci if (verbose > 0) 23748c2ecf20Sopenharmony_ci printf("Found %d entries\n", count); 23758c2ecf20Sopenharmony_ci if (count == 0) 23768c2ecf20Sopenharmony_ci file_fail(norm_name); 23778c2ecf20Sopenharmony_ci} 23788c2ecf20Sopenharmony_ci 23798c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 23808c2ecf20Sopenharmony_ci 23818c2ecf20Sopenharmony_ci/* 23828c2ecf20Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 23838c2ecf20Sopenharmony_ci * 23848c2ecf20Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 23858c2ecf20Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 23868c2ecf20Sopenharmony_ci * 23878c2ecf20Sopenharmony_ci * SBase = 0xAC00 23888c2ecf20Sopenharmony_ci * LBase = 0x1100 23898c2ecf20Sopenharmony_ci * VBase = 0x1161 23908c2ecf20Sopenharmony_ci * TBase = 0x11A7 23918c2ecf20Sopenharmony_ci * LCount = 19 23928c2ecf20Sopenharmony_ci * VCount = 21 23938c2ecf20Sopenharmony_ci * TCount = 28 23948c2ecf20Sopenharmony_ci * NCount = 588 (VCount * TCount) 23958c2ecf20Sopenharmony_ci * SCount = 11172 (LCount * NCount) 23968c2ecf20Sopenharmony_ci * 23978c2ecf20Sopenharmony_ci * Decomposition: 23988c2ecf20Sopenharmony_ci * SIndex = s - SBase 23998c2ecf20Sopenharmony_ci * 24008c2ecf20Sopenharmony_ci * LV (Canonical/Full) 24018c2ecf20Sopenharmony_ci * LIndex = SIndex / NCount 24028c2ecf20Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 24038c2ecf20Sopenharmony_ci * LPart = LBase + LIndex 24048c2ecf20Sopenharmony_ci * VPart = VBase + VIndex 24058c2ecf20Sopenharmony_ci * 24068c2ecf20Sopenharmony_ci * LVT (Canonical) 24078c2ecf20Sopenharmony_ci * LVIndex = (SIndex / TCount) * TCount 24088c2ecf20Sopenharmony_ci * TIndex = (Sindex % TCount) 24098c2ecf20Sopenharmony_ci * LVPart = SBase + LVIndex 24108c2ecf20Sopenharmony_ci * TPart = TBase + TIndex 24118c2ecf20Sopenharmony_ci * 24128c2ecf20Sopenharmony_ci * LVT (Full) 24138c2ecf20Sopenharmony_ci * LIndex = SIndex / NCount 24148c2ecf20Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 24158c2ecf20Sopenharmony_ci * TIndex = (Sindex % TCount) 24168c2ecf20Sopenharmony_ci * LPart = LBase + LIndex 24178c2ecf20Sopenharmony_ci * VPart = VBase + VIndex 24188c2ecf20Sopenharmony_ci * if (TIndex == 0) { 24198c2ecf20Sopenharmony_ci * d = <LPart, VPart> 24208c2ecf20Sopenharmony_ci * } else { 24218c2ecf20Sopenharmony_ci * TPart = TBase + TIndex 24228c2ecf20Sopenharmony_ci * d = <LPart, VPart, TPart> 24238c2ecf20Sopenharmony_ci * } 24248c2ecf20Sopenharmony_ci * 24258c2ecf20Sopenharmony_ci */ 24268c2ecf20Sopenharmony_ci 24278c2ecf20Sopenharmony_cistatic void hangul_decompose(void) 24288c2ecf20Sopenharmony_ci{ 24298c2ecf20Sopenharmony_ci unsigned int sb = 0xAC00; 24308c2ecf20Sopenharmony_ci unsigned int lb = 0x1100; 24318c2ecf20Sopenharmony_ci unsigned int vb = 0x1161; 24328c2ecf20Sopenharmony_ci unsigned int tb = 0x11a7; 24338c2ecf20Sopenharmony_ci /* unsigned int lc = 19; */ 24348c2ecf20Sopenharmony_ci unsigned int vc = 21; 24358c2ecf20Sopenharmony_ci unsigned int tc = 28; 24368c2ecf20Sopenharmony_ci unsigned int nc = (vc * tc); 24378c2ecf20Sopenharmony_ci /* unsigned int sc = (lc * nc); */ 24388c2ecf20Sopenharmony_ci unsigned int unichar; 24398c2ecf20Sopenharmony_ci unsigned int mapping[4]; 24408c2ecf20Sopenharmony_ci unsigned int *um; 24418c2ecf20Sopenharmony_ci int count; 24428c2ecf20Sopenharmony_ci int i; 24438c2ecf20Sopenharmony_ci 24448c2ecf20Sopenharmony_ci if (verbose > 0) 24458c2ecf20Sopenharmony_ci printf("Decomposing hangul\n"); 24468c2ecf20Sopenharmony_ci /* Hangul */ 24478c2ecf20Sopenharmony_ci count = 0; 24488c2ecf20Sopenharmony_ci for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { 24498c2ecf20Sopenharmony_ci unsigned int si = unichar - sb; 24508c2ecf20Sopenharmony_ci unsigned int li = si / nc; 24518c2ecf20Sopenharmony_ci unsigned int vi = (si % nc) / tc; 24528c2ecf20Sopenharmony_ci unsigned int ti = si % tc; 24538c2ecf20Sopenharmony_ci 24548c2ecf20Sopenharmony_ci i = 0; 24558c2ecf20Sopenharmony_ci mapping[i++] = lb + li; 24568c2ecf20Sopenharmony_ci mapping[i++] = vb + vi; 24578c2ecf20Sopenharmony_ci if (ti) 24588c2ecf20Sopenharmony_ci mapping[i++] = tb + ti; 24598c2ecf20Sopenharmony_ci mapping[i++] = 0; 24608c2ecf20Sopenharmony_ci 24618c2ecf20Sopenharmony_ci assert(!unicode_data[unichar].utf32nfdi); 24628c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 24638c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 24648c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdi = um; 24658c2ecf20Sopenharmony_ci 24668c2ecf20Sopenharmony_ci assert(!unicode_data[unichar].utf32nfdicf); 24678c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 24688c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 24698c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdicf = um; 24708c2ecf20Sopenharmony_ci 24718c2ecf20Sopenharmony_ci /* 24728c2ecf20Sopenharmony_ci * Add a cookie as a reminder that the hangul syllable 24738c2ecf20Sopenharmony_ci * decompositions must not be stored in the generated 24748c2ecf20Sopenharmony_ci * trie. 24758c2ecf20Sopenharmony_ci */ 24768c2ecf20Sopenharmony_ci unicode_data[unichar].utf8nfdi = malloc(2); 24778c2ecf20Sopenharmony_ci unicode_data[unichar].utf8nfdi[0] = HANGUL; 24788c2ecf20Sopenharmony_ci unicode_data[unichar].utf8nfdi[1] = '\0'; 24798c2ecf20Sopenharmony_ci 24808c2ecf20Sopenharmony_ci if (verbose > 1) 24818c2ecf20Sopenharmony_ci print_utf32nfdi(unichar); 24828c2ecf20Sopenharmony_ci 24838c2ecf20Sopenharmony_ci count++; 24848c2ecf20Sopenharmony_ci } 24858c2ecf20Sopenharmony_ci if (verbose > 0) 24868c2ecf20Sopenharmony_ci printf("Created %d entries\n", count); 24878c2ecf20Sopenharmony_ci} 24888c2ecf20Sopenharmony_ci 24898c2ecf20Sopenharmony_cistatic void nfdi_decompose(void) 24908c2ecf20Sopenharmony_ci{ 24918c2ecf20Sopenharmony_ci unsigned int unichar; 24928c2ecf20Sopenharmony_ci unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 24938c2ecf20Sopenharmony_ci unsigned int *um; 24948c2ecf20Sopenharmony_ci unsigned int *dc; 24958c2ecf20Sopenharmony_ci int count; 24968c2ecf20Sopenharmony_ci int i; 24978c2ecf20Sopenharmony_ci int j; 24988c2ecf20Sopenharmony_ci int ret; 24998c2ecf20Sopenharmony_ci 25008c2ecf20Sopenharmony_ci if (verbose > 0) 25018c2ecf20Sopenharmony_ci printf("Decomposing nfdi\n"); 25028c2ecf20Sopenharmony_ci 25038c2ecf20Sopenharmony_ci count = 0; 25048c2ecf20Sopenharmony_ci for (unichar = 0; unichar != 0x110000; unichar++) { 25058c2ecf20Sopenharmony_ci if (!unicode_data[unichar].utf32nfdi) 25068c2ecf20Sopenharmony_ci continue; 25078c2ecf20Sopenharmony_ci for (;;) { 25088c2ecf20Sopenharmony_ci ret = 1; 25098c2ecf20Sopenharmony_ci i = 0; 25108c2ecf20Sopenharmony_ci um = unicode_data[unichar].utf32nfdi; 25118c2ecf20Sopenharmony_ci while (*um) { 25128c2ecf20Sopenharmony_ci dc = unicode_data[*um].utf32nfdi; 25138c2ecf20Sopenharmony_ci if (dc) { 25148c2ecf20Sopenharmony_ci for (j = 0; dc[j]; j++) 25158c2ecf20Sopenharmony_ci mapping[i++] = dc[j]; 25168c2ecf20Sopenharmony_ci ret = 0; 25178c2ecf20Sopenharmony_ci } else { 25188c2ecf20Sopenharmony_ci mapping[i++] = *um; 25198c2ecf20Sopenharmony_ci } 25208c2ecf20Sopenharmony_ci um++; 25218c2ecf20Sopenharmony_ci } 25228c2ecf20Sopenharmony_ci mapping[i++] = 0; 25238c2ecf20Sopenharmony_ci if (ret) 25248c2ecf20Sopenharmony_ci break; 25258c2ecf20Sopenharmony_ci free(unicode_data[unichar].utf32nfdi); 25268c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 25278c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 25288c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdi = um; 25298c2ecf20Sopenharmony_ci } 25308c2ecf20Sopenharmony_ci /* Add this decomposition to nfdicf if there is no entry. */ 25318c2ecf20Sopenharmony_ci if (!unicode_data[unichar].utf32nfdicf) { 25328c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 25338c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 25348c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdicf = um; 25358c2ecf20Sopenharmony_ci } 25368c2ecf20Sopenharmony_ci if (verbose > 1) 25378c2ecf20Sopenharmony_ci print_utf32nfdi(unichar); 25388c2ecf20Sopenharmony_ci count++; 25398c2ecf20Sopenharmony_ci } 25408c2ecf20Sopenharmony_ci if (verbose > 0) 25418c2ecf20Sopenharmony_ci printf("Processed %d entries\n", count); 25428c2ecf20Sopenharmony_ci} 25438c2ecf20Sopenharmony_ci 25448c2ecf20Sopenharmony_cistatic void nfdicf_decompose(void) 25458c2ecf20Sopenharmony_ci{ 25468c2ecf20Sopenharmony_ci unsigned int unichar; 25478c2ecf20Sopenharmony_ci unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 25488c2ecf20Sopenharmony_ci unsigned int *um; 25498c2ecf20Sopenharmony_ci unsigned int *dc; 25508c2ecf20Sopenharmony_ci int count; 25518c2ecf20Sopenharmony_ci int i; 25528c2ecf20Sopenharmony_ci int j; 25538c2ecf20Sopenharmony_ci int ret; 25548c2ecf20Sopenharmony_ci 25558c2ecf20Sopenharmony_ci if (verbose > 0) 25568c2ecf20Sopenharmony_ci printf("Decomposing nfdicf\n"); 25578c2ecf20Sopenharmony_ci count = 0; 25588c2ecf20Sopenharmony_ci for (unichar = 0; unichar != 0x110000; unichar++) { 25598c2ecf20Sopenharmony_ci if (!unicode_data[unichar].utf32nfdicf) 25608c2ecf20Sopenharmony_ci continue; 25618c2ecf20Sopenharmony_ci for (;;) { 25628c2ecf20Sopenharmony_ci ret = 1; 25638c2ecf20Sopenharmony_ci i = 0; 25648c2ecf20Sopenharmony_ci um = unicode_data[unichar].utf32nfdicf; 25658c2ecf20Sopenharmony_ci while (*um) { 25668c2ecf20Sopenharmony_ci dc = unicode_data[*um].utf32nfdicf; 25678c2ecf20Sopenharmony_ci if (dc) { 25688c2ecf20Sopenharmony_ci for (j = 0; dc[j]; j++) 25698c2ecf20Sopenharmony_ci mapping[i++] = dc[j]; 25708c2ecf20Sopenharmony_ci ret = 0; 25718c2ecf20Sopenharmony_ci } else { 25728c2ecf20Sopenharmony_ci mapping[i++] = *um; 25738c2ecf20Sopenharmony_ci } 25748c2ecf20Sopenharmony_ci um++; 25758c2ecf20Sopenharmony_ci } 25768c2ecf20Sopenharmony_ci mapping[i++] = 0; 25778c2ecf20Sopenharmony_ci if (ret) 25788c2ecf20Sopenharmony_ci break; 25798c2ecf20Sopenharmony_ci free(unicode_data[unichar].utf32nfdicf); 25808c2ecf20Sopenharmony_ci um = malloc(i * sizeof(unsigned int)); 25818c2ecf20Sopenharmony_ci memcpy(um, mapping, i * sizeof(unsigned int)); 25828c2ecf20Sopenharmony_ci unicode_data[unichar].utf32nfdicf = um; 25838c2ecf20Sopenharmony_ci } 25848c2ecf20Sopenharmony_ci if (verbose > 1) 25858c2ecf20Sopenharmony_ci print_utf32nfdicf(unichar); 25868c2ecf20Sopenharmony_ci count++; 25878c2ecf20Sopenharmony_ci } 25888c2ecf20Sopenharmony_ci if (verbose > 0) 25898c2ecf20Sopenharmony_ci printf("Processed %d entries\n", count); 25908c2ecf20Sopenharmony_ci} 25918c2ecf20Sopenharmony_ci 25928c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 25938c2ecf20Sopenharmony_ci 25948c2ecf20Sopenharmony_ciint utf8agemax(struct tree *, const char *); 25958c2ecf20Sopenharmony_ciint utf8nagemax(struct tree *, const char *, size_t); 25968c2ecf20Sopenharmony_ciint utf8agemin(struct tree *, const char *); 25978c2ecf20Sopenharmony_ciint utf8nagemin(struct tree *, const char *, size_t); 25988c2ecf20Sopenharmony_cissize_t utf8len(struct tree *, const char *); 25998c2ecf20Sopenharmony_cissize_t utf8nlen(struct tree *, const char *, size_t); 26008c2ecf20Sopenharmony_cistruct utf8cursor; 26018c2ecf20Sopenharmony_ciint utf8cursor(struct utf8cursor *, struct tree *, const char *); 26028c2ecf20Sopenharmony_ciint utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); 26038c2ecf20Sopenharmony_ciint utf8byte(struct utf8cursor *); 26048c2ecf20Sopenharmony_ci 26058c2ecf20Sopenharmony_ci/* 26068c2ecf20Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 26078c2ecf20Sopenharmony_ci * 26088c2ecf20Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 26098c2ecf20Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 26108c2ecf20Sopenharmony_ci * 26118c2ecf20Sopenharmony_ci * SBase = 0xAC00 26128c2ecf20Sopenharmony_ci * LBase = 0x1100 26138c2ecf20Sopenharmony_ci * VBase = 0x1161 26148c2ecf20Sopenharmony_ci * TBase = 0x11A7 26158c2ecf20Sopenharmony_ci * LCount = 19 26168c2ecf20Sopenharmony_ci * VCount = 21 26178c2ecf20Sopenharmony_ci * TCount = 28 26188c2ecf20Sopenharmony_ci * NCount = 588 (VCount * TCount) 26198c2ecf20Sopenharmony_ci * SCount = 11172 (LCount * NCount) 26208c2ecf20Sopenharmony_ci * 26218c2ecf20Sopenharmony_ci * Decomposition: 26228c2ecf20Sopenharmony_ci * SIndex = s - SBase 26238c2ecf20Sopenharmony_ci * 26248c2ecf20Sopenharmony_ci * LV (Canonical/Full) 26258c2ecf20Sopenharmony_ci * LIndex = SIndex / NCount 26268c2ecf20Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 26278c2ecf20Sopenharmony_ci * LPart = LBase + LIndex 26288c2ecf20Sopenharmony_ci * VPart = VBase + VIndex 26298c2ecf20Sopenharmony_ci * 26308c2ecf20Sopenharmony_ci * LVT (Canonical) 26318c2ecf20Sopenharmony_ci * LVIndex = (SIndex / TCount) * TCount 26328c2ecf20Sopenharmony_ci * TIndex = (Sindex % TCount) 26338c2ecf20Sopenharmony_ci * LVPart = SBase + LVIndex 26348c2ecf20Sopenharmony_ci * TPart = TBase + TIndex 26358c2ecf20Sopenharmony_ci * 26368c2ecf20Sopenharmony_ci * LVT (Full) 26378c2ecf20Sopenharmony_ci * LIndex = SIndex / NCount 26388c2ecf20Sopenharmony_ci * VIndex = (Sindex % NCount) / TCount 26398c2ecf20Sopenharmony_ci * TIndex = (Sindex % TCount) 26408c2ecf20Sopenharmony_ci * LPart = LBase + LIndex 26418c2ecf20Sopenharmony_ci * VPart = VBase + VIndex 26428c2ecf20Sopenharmony_ci * if (TIndex == 0) { 26438c2ecf20Sopenharmony_ci * d = <LPart, VPart> 26448c2ecf20Sopenharmony_ci * } else { 26458c2ecf20Sopenharmony_ci * TPart = TBase + TIndex 26468c2ecf20Sopenharmony_ci * d = <LPart, VPart, TPart> 26478c2ecf20Sopenharmony_ci * } 26488c2ecf20Sopenharmony_ci */ 26498c2ecf20Sopenharmony_ci 26508c2ecf20Sopenharmony_ci/* Constants */ 26518c2ecf20Sopenharmony_ci#define SB (0xAC00) 26528c2ecf20Sopenharmony_ci#define LB (0x1100) 26538c2ecf20Sopenharmony_ci#define VB (0x1161) 26548c2ecf20Sopenharmony_ci#define TB (0x11A7) 26558c2ecf20Sopenharmony_ci#define LC (19) 26568c2ecf20Sopenharmony_ci#define VC (21) 26578c2ecf20Sopenharmony_ci#define TC (28) 26588c2ecf20Sopenharmony_ci#define NC (VC * TC) 26598c2ecf20Sopenharmony_ci#define SC (LC * NC) 26608c2ecf20Sopenharmony_ci 26618c2ecf20Sopenharmony_ci/* Algorithmic decomposition of hangul syllable. */ 26628c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) 26638c2ecf20Sopenharmony_ci{ 26648c2ecf20Sopenharmony_ci unsigned int si; 26658c2ecf20Sopenharmony_ci unsigned int li; 26668c2ecf20Sopenharmony_ci unsigned int vi; 26678c2ecf20Sopenharmony_ci unsigned int ti; 26688c2ecf20Sopenharmony_ci unsigned char *h; 26698c2ecf20Sopenharmony_ci 26708c2ecf20Sopenharmony_ci /* Calculate the SI, LI, VI, and TI values. */ 26718c2ecf20Sopenharmony_ci si = utf8decode(str) - SB; 26728c2ecf20Sopenharmony_ci li = si / NC; 26738c2ecf20Sopenharmony_ci vi = (si % NC) / TC; 26748c2ecf20Sopenharmony_ci ti = si % TC; 26758c2ecf20Sopenharmony_ci 26768c2ecf20Sopenharmony_ci /* Fill in base of leaf. */ 26778c2ecf20Sopenharmony_ci h = hangul; 26788c2ecf20Sopenharmony_ci LEAF_GEN(h) = 2; 26798c2ecf20Sopenharmony_ci LEAF_CCC(h) = DECOMPOSE; 26808c2ecf20Sopenharmony_ci h += 2; 26818c2ecf20Sopenharmony_ci 26828c2ecf20Sopenharmony_ci /* Add LPart, a 3-byte UTF-8 sequence. */ 26838c2ecf20Sopenharmony_ci h += utf8encode((char *)h, li + LB); 26848c2ecf20Sopenharmony_ci 26858c2ecf20Sopenharmony_ci /* Add VPart, a 3-byte UTF-8 sequence. */ 26868c2ecf20Sopenharmony_ci h += utf8encode((char *)h, vi + VB); 26878c2ecf20Sopenharmony_ci 26888c2ecf20Sopenharmony_ci /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 26898c2ecf20Sopenharmony_ci if (ti) 26908c2ecf20Sopenharmony_ci h += utf8encode((char *)h, ti + TB); 26918c2ecf20Sopenharmony_ci 26928c2ecf20Sopenharmony_ci /* Terminate string. */ 26938c2ecf20Sopenharmony_ci h[0] = '\0'; 26948c2ecf20Sopenharmony_ci 26958c2ecf20Sopenharmony_ci return hangul; 26968c2ecf20Sopenharmony_ci} 26978c2ecf20Sopenharmony_ci 26988c2ecf20Sopenharmony_ci/* 26998c2ecf20Sopenharmony_ci * Use trie to scan s, touching at most len bytes. 27008c2ecf20Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise. 27018c2ecf20Sopenharmony_ci * 27028c2ecf20Sopenharmony_ci * A non-NULL return guarantees that the UTF-8 sequence starting at s 27038c2ecf20Sopenharmony_ci * is well-formed and corresponds to a known unicode code point. The 27048c2ecf20Sopenharmony_ci * shorthand for this will be "is valid UTF-8 unicode". 27058c2ecf20Sopenharmony_ci */ 27068c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, 27078c2ecf20Sopenharmony_ci const char *s, size_t len) 27088c2ecf20Sopenharmony_ci{ 27098c2ecf20Sopenharmony_ci utf8trie_t *trie; 27108c2ecf20Sopenharmony_ci int offlen; 27118c2ecf20Sopenharmony_ci int offset; 27128c2ecf20Sopenharmony_ci int mask; 27138c2ecf20Sopenharmony_ci int node; 27148c2ecf20Sopenharmony_ci 27158c2ecf20Sopenharmony_ci if (!tree) 27168c2ecf20Sopenharmony_ci return NULL; 27178c2ecf20Sopenharmony_ci if (len == 0) 27188c2ecf20Sopenharmony_ci return NULL; 27198c2ecf20Sopenharmony_ci node = 1; 27208c2ecf20Sopenharmony_ci trie = utf8data + tree->index; 27218c2ecf20Sopenharmony_ci while (node) { 27228c2ecf20Sopenharmony_ci offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 27238c2ecf20Sopenharmony_ci if (*trie & NEXTBYTE) { 27248c2ecf20Sopenharmony_ci if (--len == 0) 27258c2ecf20Sopenharmony_ci return NULL; 27268c2ecf20Sopenharmony_ci s++; 27278c2ecf20Sopenharmony_ci } 27288c2ecf20Sopenharmony_ci mask = 1 << (*trie & BITNUM); 27298c2ecf20Sopenharmony_ci if (*s & mask) { 27308c2ecf20Sopenharmony_ci /* Right leg */ 27318c2ecf20Sopenharmony_ci if (offlen) { 27328c2ecf20Sopenharmony_ci /* Right node at offset of trie */ 27338c2ecf20Sopenharmony_ci node = (*trie & RIGHTNODE); 27348c2ecf20Sopenharmony_ci offset = trie[offlen]; 27358c2ecf20Sopenharmony_ci while (--offlen) { 27368c2ecf20Sopenharmony_ci offset <<= 8; 27378c2ecf20Sopenharmony_ci offset |= trie[offlen]; 27388c2ecf20Sopenharmony_ci } 27398c2ecf20Sopenharmony_ci trie += offset; 27408c2ecf20Sopenharmony_ci } else if (*trie & RIGHTPATH) { 27418c2ecf20Sopenharmony_ci /* Right node after this node */ 27428c2ecf20Sopenharmony_ci node = (*trie & TRIENODE); 27438c2ecf20Sopenharmony_ci trie++; 27448c2ecf20Sopenharmony_ci } else { 27458c2ecf20Sopenharmony_ci /* No right node. */ 27468c2ecf20Sopenharmony_ci return NULL; 27478c2ecf20Sopenharmony_ci } 27488c2ecf20Sopenharmony_ci } else { 27498c2ecf20Sopenharmony_ci /* Left leg */ 27508c2ecf20Sopenharmony_ci if (offlen) { 27518c2ecf20Sopenharmony_ci /* Left node after this node. */ 27528c2ecf20Sopenharmony_ci node = (*trie & LEFTNODE); 27538c2ecf20Sopenharmony_ci trie += offlen + 1; 27548c2ecf20Sopenharmony_ci } else if (*trie & RIGHTPATH) { 27558c2ecf20Sopenharmony_ci /* No left node. */ 27568c2ecf20Sopenharmony_ci return NULL; 27578c2ecf20Sopenharmony_ci } else { 27588c2ecf20Sopenharmony_ci /* Left node after this node */ 27598c2ecf20Sopenharmony_ci node = (*trie & TRIENODE); 27608c2ecf20Sopenharmony_ci trie++; 27618c2ecf20Sopenharmony_ci } 27628c2ecf20Sopenharmony_ci } 27638c2ecf20Sopenharmony_ci } 27648c2ecf20Sopenharmony_ci /* 27658c2ecf20Sopenharmony_ci * Hangul decomposition is done algorithmically. These are the 27668c2ecf20Sopenharmony_ci * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 27678c2ecf20Sopenharmony_ci * always 3 bytes long, so s has been advanced twice, and the 27688c2ecf20Sopenharmony_ci * start of the sequence is at s-2. 27698c2ecf20Sopenharmony_ci */ 27708c2ecf20Sopenharmony_ci if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 27718c2ecf20Sopenharmony_ci trie = utf8hangul(s - 2, hangul); 27728c2ecf20Sopenharmony_ci return trie; 27738c2ecf20Sopenharmony_ci} 27748c2ecf20Sopenharmony_ci 27758c2ecf20Sopenharmony_ci/* 27768c2ecf20Sopenharmony_ci * Use trie to scan s. 27778c2ecf20Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise. 27788c2ecf20Sopenharmony_ci * 27798c2ecf20Sopenharmony_ci * Forwards to trie_nlookup(). 27808c2ecf20Sopenharmony_ci */ 27818c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, 27828c2ecf20Sopenharmony_ci const char *s) 27838c2ecf20Sopenharmony_ci{ 27848c2ecf20Sopenharmony_ci return utf8nlookup(tree, hangul, s, (size_t)-1); 27858c2ecf20Sopenharmony_ci} 27868c2ecf20Sopenharmony_ci 27878c2ecf20Sopenharmony_ci/* 27888c2ecf20Sopenharmony_ci * Return the number of bytes used by the current UTF-8 sequence. 27898c2ecf20Sopenharmony_ci * Assumes the input points to the first byte of a valid UTF-8 27908c2ecf20Sopenharmony_ci * sequence. 27918c2ecf20Sopenharmony_ci */ 27928c2ecf20Sopenharmony_cistatic inline int utf8clen(const char *s) 27938c2ecf20Sopenharmony_ci{ 27948c2ecf20Sopenharmony_ci unsigned char c = *s; 27958c2ecf20Sopenharmony_ci return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 27968c2ecf20Sopenharmony_ci} 27978c2ecf20Sopenharmony_ci 27988c2ecf20Sopenharmony_ci/* 27998c2ecf20Sopenharmony_ci * Maximum age of any character in s. 28008c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 28018c2ecf20Sopenharmony_ci * Return 0 if only non-assigned code points are used. 28028c2ecf20Sopenharmony_ci */ 28038c2ecf20Sopenharmony_ciint utf8agemax(struct tree *tree, const char *s) 28048c2ecf20Sopenharmony_ci{ 28058c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 28068c2ecf20Sopenharmony_ci int age = 0; 28078c2ecf20Sopenharmony_ci int leaf_age; 28088c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 28098c2ecf20Sopenharmony_ci 28108c2ecf20Sopenharmony_ci if (!tree) 28118c2ecf20Sopenharmony_ci return -1; 28128c2ecf20Sopenharmony_ci 28138c2ecf20Sopenharmony_ci while (*s) { 28148c2ecf20Sopenharmony_ci leaf = utf8lookup(tree, hangul, s); 28158c2ecf20Sopenharmony_ci if (!leaf) 28168c2ecf20Sopenharmony_ci return -1; 28178c2ecf20Sopenharmony_ci leaf_age = ages[LEAF_GEN(leaf)]; 28188c2ecf20Sopenharmony_ci if (leaf_age <= tree->maxage && leaf_age > age) 28198c2ecf20Sopenharmony_ci age = leaf_age; 28208c2ecf20Sopenharmony_ci s += utf8clen(s); 28218c2ecf20Sopenharmony_ci } 28228c2ecf20Sopenharmony_ci return age; 28238c2ecf20Sopenharmony_ci} 28248c2ecf20Sopenharmony_ci 28258c2ecf20Sopenharmony_ci/* 28268c2ecf20Sopenharmony_ci * Minimum age of any character in s. 28278c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 28288c2ecf20Sopenharmony_ci * Return 0 if non-assigned code points are used. 28298c2ecf20Sopenharmony_ci */ 28308c2ecf20Sopenharmony_ciint utf8agemin(struct tree *tree, const char *s) 28318c2ecf20Sopenharmony_ci{ 28328c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 28338c2ecf20Sopenharmony_ci int age; 28348c2ecf20Sopenharmony_ci int leaf_age; 28358c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 28368c2ecf20Sopenharmony_ci 28378c2ecf20Sopenharmony_ci if (!tree) 28388c2ecf20Sopenharmony_ci return -1; 28398c2ecf20Sopenharmony_ci age = tree->maxage; 28408c2ecf20Sopenharmony_ci while (*s) { 28418c2ecf20Sopenharmony_ci leaf = utf8lookup(tree, hangul, s); 28428c2ecf20Sopenharmony_ci if (!leaf) 28438c2ecf20Sopenharmony_ci return -1; 28448c2ecf20Sopenharmony_ci leaf_age = ages[LEAF_GEN(leaf)]; 28458c2ecf20Sopenharmony_ci if (leaf_age <= tree->maxage && leaf_age < age) 28468c2ecf20Sopenharmony_ci age = leaf_age; 28478c2ecf20Sopenharmony_ci s += utf8clen(s); 28488c2ecf20Sopenharmony_ci } 28498c2ecf20Sopenharmony_ci return age; 28508c2ecf20Sopenharmony_ci} 28518c2ecf20Sopenharmony_ci 28528c2ecf20Sopenharmony_ci/* 28538c2ecf20Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes. 28548c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 28558c2ecf20Sopenharmony_ci */ 28568c2ecf20Sopenharmony_ciint utf8nagemax(struct tree *tree, const char *s, size_t len) 28578c2ecf20Sopenharmony_ci{ 28588c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 28598c2ecf20Sopenharmony_ci int age = 0; 28608c2ecf20Sopenharmony_ci int leaf_age; 28618c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 28628c2ecf20Sopenharmony_ci 28638c2ecf20Sopenharmony_ci if (!tree) 28648c2ecf20Sopenharmony_ci return -1; 28658c2ecf20Sopenharmony_ci 28668c2ecf20Sopenharmony_ci while (len && *s) { 28678c2ecf20Sopenharmony_ci leaf = utf8nlookup(tree, hangul, s, len); 28688c2ecf20Sopenharmony_ci if (!leaf) 28698c2ecf20Sopenharmony_ci return -1; 28708c2ecf20Sopenharmony_ci leaf_age = ages[LEAF_GEN(leaf)]; 28718c2ecf20Sopenharmony_ci if (leaf_age <= tree->maxage && leaf_age > age) 28728c2ecf20Sopenharmony_ci age = leaf_age; 28738c2ecf20Sopenharmony_ci len -= utf8clen(s); 28748c2ecf20Sopenharmony_ci s += utf8clen(s); 28758c2ecf20Sopenharmony_ci } 28768c2ecf20Sopenharmony_ci return age; 28778c2ecf20Sopenharmony_ci} 28788c2ecf20Sopenharmony_ci 28798c2ecf20Sopenharmony_ci/* 28808c2ecf20Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes. 28818c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 28828c2ecf20Sopenharmony_ci */ 28838c2ecf20Sopenharmony_ciint utf8nagemin(struct tree *tree, const char *s, size_t len) 28848c2ecf20Sopenharmony_ci{ 28858c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 28868c2ecf20Sopenharmony_ci int leaf_age; 28878c2ecf20Sopenharmony_ci int age; 28888c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 28898c2ecf20Sopenharmony_ci 28908c2ecf20Sopenharmony_ci if (!tree) 28918c2ecf20Sopenharmony_ci return -1; 28928c2ecf20Sopenharmony_ci age = tree->maxage; 28938c2ecf20Sopenharmony_ci while (len && *s) { 28948c2ecf20Sopenharmony_ci leaf = utf8nlookup(tree, hangul, s, len); 28958c2ecf20Sopenharmony_ci if (!leaf) 28968c2ecf20Sopenharmony_ci return -1; 28978c2ecf20Sopenharmony_ci leaf_age = ages[LEAF_GEN(leaf)]; 28988c2ecf20Sopenharmony_ci if (leaf_age <= tree->maxage && leaf_age < age) 28998c2ecf20Sopenharmony_ci age = leaf_age; 29008c2ecf20Sopenharmony_ci len -= utf8clen(s); 29018c2ecf20Sopenharmony_ci s += utf8clen(s); 29028c2ecf20Sopenharmony_ci } 29038c2ecf20Sopenharmony_ci return age; 29048c2ecf20Sopenharmony_ci} 29058c2ecf20Sopenharmony_ci 29068c2ecf20Sopenharmony_ci/* 29078c2ecf20Sopenharmony_ci * Length of the normalization of s. 29088c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 29098c2ecf20Sopenharmony_ci * 29108c2ecf20Sopenharmony_ci * A string of Default_Ignorable_Code_Point has length 0. 29118c2ecf20Sopenharmony_ci */ 29128c2ecf20Sopenharmony_cissize_t utf8len(struct tree *tree, const char *s) 29138c2ecf20Sopenharmony_ci{ 29148c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 29158c2ecf20Sopenharmony_ci size_t ret = 0; 29168c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 29178c2ecf20Sopenharmony_ci 29188c2ecf20Sopenharmony_ci if (!tree) 29198c2ecf20Sopenharmony_ci return -1; 29208c2ecf20Sopenharmony_ci while (*s) { 29218c2ecf20Sopenharmony_ci leaf = utf8lookup(tree, hangul, s); 29228c2ecf20Sopenharmony_ci if (!leaf) 29238c2ecf20Sopenharmony_ci return -1; 29248c2ecf20Sopenharmony_ci if (ages[LEAF_GEN(leaf)] > tree->maxage) 29258c2ecf20Sopenharmony_ci ret += utf8clen(s); 29268c2ecf20Sopenharmony_ci else if (LEAF_CCC(leaf) == DECOMPOSE) 29278c2ecf20Sopenharmony_ci ret += strlen(LEAF_STR(leaf)); 29288c2ecf20Sopenharmony_ci else 29298c2ecf20Sopenharmony_ci ret += utf8clen(s); 29308c2ecf20Sopenharmony_ci s += utf8clen(s); 29318c2ecf20Sopenharmony_ci } 29328c2ecf20Sopenharmony_ci return ret; 29338c2ecf20Sopenharmony_ci} 29348c2ecf20Sopenharmony_ci 29358c2ecf20Sopenharmony_ci/* 29368c2ecf20Sopenharmony_ci * Length of the normalization of s, touch at most len bytes. 29378c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode. 29388c2ecf20Sopenharmony_ci */ 29398c2ecf20Sopenharmony_cissize_t utf8nlen(struct tree *tree, const char *s, size_t len) 29408c2ecf20Sopenharmony_ci{ 29418c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 29428c2ecf20Sopenharmony_ci size_t ret = 0; 29438c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 29448c2ecf20Sopenharmony_ci 29458c2ecf20Sopenharmony_ci if (!tree) 29468c2ecf20Sopenharmony_ci return -1; 29478c2ecf20Sopenharmony_ci while (len && *s) { 29488c2ecf20Sopenharmony_ci leaf = utf8nlookup(tree, hangul, s, len); 29498c2ecf20Sopenharmony_ci if (!leaf) 29508c2ecf20Sopenharmony_ci return -1; 29518c2ecf20Sopenharmony_ci if (ages[LEAF_GEN(leaf)] > tree->maxage) 29528c2ecf20Sopenharmony_ci ret += utf8clen(s); 29538c2ecf20Sopenharmony_ci else if (LEAF_CCC(leaf) == DECOMPOSE) 29548c2ecf20Sopenharmony_ci ret += strlen(LEAF_STR(leaf)); 29558c2ecf20Sopenharmony_ci else 29568c2ecf20Sopenharmony_ci ret += utf8clen(s); 29578c2ecf20Sopenharmony_ci len -= utf8clen(s); 29588c2ecf20Sopenharmony_ci s += utf8clen(s); 29598c2ecf20Sopenharmony_ci } 29608c2ecf20Sopenharmony_ci return ret; 29618c2ecf20Sopenharmony_ci} 29628c2ecf20Sopenharmony_ci 29638c2ecf20Sopenharmony_ci/* 29648c2ecf20Sopenharmony_ci * Cursor structure used by the normalizer. 29658c2ecf20Sopenharmony_ci */ 29668c2ecf20Sopenharmony_cistruct utf8cursor { 29678c2ecf20Sopenharmony_ci struct tree *tree; 29688c2ecf20Sopenharmony_ci const char *s; 29698c2ecf20Sopenharmony_ci const char *p; 29708c2ecf20Sopenharmony_ci const char *ss; 29718c2ecf20Sopenharmony_ci const char *sp; 29728c2ecf20Sopenharmony_ci unsigned int len; 29738c2ecf20Sopenharmony_ci unsigned int slen; 29748c2ecf20Sopenharmony_ci short int ccc; 29758c2ecf20Sopenharmony_ci short int nccc; 29768c2ecf20Sopenharmony_ci unsigned int unichar; 29778c2ecf20Sopenharmony_ci unsigned char hangul[UTF8HANGULLEAF]; 29788c2ecf20Sopenharmony_ci}; 29798c2ecf20Sopenharmony_ci 29808c2ecf20Sopenharmony_ci/* 29818c2ecf20Sopenharmony_ci * Set up an utf8cursor for use by utf8byte(). 29828c2ecf20Sopenharmony_ci * 29838c2ecf20Sopenharmony_ci * s : string. 29848c2ecf20Sopenharmony_ci * len : length of s. 29858c2ecf20Sopenharmony_ci * u8c : pointer to cursor. 29868c2ecf20Sopenharmony_ci * trie : utf8trie_t to use for normalization. 29878c2ecf20Sopenharmony_ci * 29888c2ecf20Sopenharmony_ci * Returns -1 on error, 0 on success. 29898c2ecf20Sopenharmony_ci */ 29908c2ecf20Sopenharmony_ciint utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s, 29918c2ecf20Sopenharmony_ci size_t len) 29928c2ecf20Sopenharmony_ci{ 29938c2ecf20Sopenharmony_ci if (!tree) 29948c2ecf20Sopenharmony_ci return -1; 29958c2ecf20Sopenharmony_ci if (!s) 29968c2ecf20Sopenharmony_ci return -1; 29978c2ecf20Sopenharmony_ci u8c->tree = tree; 29988c2ecf20Sopenharmony_ci u8c->s = s; 29998c2ecf20Sopenharmony_ci u8c->p = NULL; 30008c2ecf20Sopenharmony_ci u8c->ss = NULL; 30018c2ecf20Sopenharmony_ci u8c->sp = NULL; 30028c2ecf20Sopenharmony_ci u8c->len = len; 30038c2ecf20Sopenharmony_ci u8c->slen = 0; 30048c2ecf20Sopenharmony_ci u8c->ccc = STOPPER; 30058c2ecf20Sopenharmony_ci u8c->nccc = STOPPER; 30068c2ecf20Sopenharmony_ci u8c->unichar = 0; 30078c2ecf20Sopenharmony_ci /* Check we didn't clobber the maximum length. */ 30088c2ecf20Sopenharmony_ci if (u8c->len != len) 30098c2ecf20Sopenharmony_ci return -1; 30108c2ecf20Sopenharmony_ci /* The first byte of s may not be an utf8 continuation. */ 30118c2ecf20Sopenharmony_ci if (len > 0 && (*s & 0xC0) == 0x80) 30128c2ecf20Sopenharmony_ci return -1; 30138c2ecf20Sopenharmony_ci return 0; 30148c2ecf20Sopenharmony_ci} 30158c2ecf20Sopenharmony_ci 30168c2ecf20Sopenharmony_ci/* 30178c2ecf20Sopenharmony_ci * Set up an utf8cursor for use by utf8byte(). 30188c2ecf20Sopenharmony_ci * 30198c2ecf20Sopenharmony_ci * s : NUL-terminated string. 30208c2ecf20Sopenharmony_ci * u8c : pointer to cursor. 30218c2ecf20Sopenharmony_ci * trie : utf8trie_t to use for normalization. 30228c2ecf20Sopenharmony_ci * 30238c2ecf20Sopenharmony_ci * Returns -1 on error, 0 on success. 30248c2ecf20Sopenharmony_ci */ 30258c2ecf20Sopenharmony_ciint utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s) 30268c2ecf20Sopenharmony_ci{ 30278c2ecf20Sopenharmony_ci return utf8ncursor(u8c, tree, s, (unsigned int)-1); 30288c2ecf20Sopenharmony_ci} 30298c2ecf20Sopenharmony_ci 30308c2ecf20Sopenharmony_ci/* 30318c2ecf20Sopenharmony_ci * Get one byte from the normalized form of the string described by u8c. 30328c2ecf20Sopenharmony_ci * 30338c2ecf20Sopenharmony_ci * Returns the byte cast to an unsigned char on succes, and -1 on failure. 30348c2ecf20Sopenharmony_ci * 30358c2ecf20Sopenharmony_ci * The cursor keeps track of the location in the string in u8c->s. 30368c2ecf20Sopenharmony_ci * When a character is decomposed, the current location is stored in 30378c2ecf20Sopenharmony_ci * u8c->p, and u8c->s is set to the start of the decomposition. Note 30388c2ecf20Sopenharmony_ci * that bytes from a decomposition do not count against u8c->len. 30398c2ecf20Sopenharmony_ci * 30408c2ecf20Sopenharmony_ci * Characters are emitted if they match the current CCC in u8c->ccc. 30418c2ecf20Sopenharmony_ci * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 30428c2ecf20Sopenharmony_ci * and the function returns 0 in that case. 30438c2ecf20Sopenharmony_ci * 30448c2ecf20Sopenharmony_ci * Sorting by CCC is done by repeatedly scanning the string. The 30458c2ecf20Sopenharmony_ci * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 30468c2ecf20Sopenharmony_ci * the start of the scan. The first pass finds the lowest CCC to be 30478c2ecf20Sopenharmony_ci * emitted and stores it in u8c->nccc, the second pass emits the 30488c2ecf20Sopenharmony_ci * characters with this CCC and finds the next lowest CCC. This limits 30498c2ecf20Sopenharmony_ci * the number of passes to 1 + the number of different CCCs in the 30508c2ecf20Sopenharmony_ci * sequence being scanned. 30518c2ecf20Sopenharmony_ci * 30528c2ecf20Sopenharmony_ci * Therefore: 30538c2ecf20Sopenharmony_ci * u8c->p != NULL -> a decomposition is being scanned. 30548c2ecf20Sopenharmony_ci * u8c->ss != NULL -> this is a repeating scan. 30558c2ecf20Sopenharmony_ci * u8c->ccc == -1 -> this is the first scan of a repeating scan. 30568c2ecf20Sopenharmony_ci */ 30578c2ecf20Sopenharmony_ciint utf8byte(struct utf8cursor *u8c) 30588c2ecf20Sopenharmony_ci{ 30598c2ecf20Sopenharmony_ci utf8leaf_t *leaf; 30608c2ecf20Sopenharmony_ci int ccc; 30618c2ecf20Sopenharmony_ci 30628c2ecf20Sopenharmony_ci for (;;) { 30638c2ecf20Sopenharmony_ci /* Check for the end of a decomposed character. */ 30648c2ecf20Sopenharmony_ci if (u8c->p && *u8c->s == '\0') { 30658c2ecf20Sopenharmony_ci u8c->s = u8c->p; 30668c2ecf20Sopenharmony_ci u8c->p = NULL; 30678c2ecf20Sopenharmony_ci } 30688c2ecf20Sopenharmony_ci 30698c2ecf20Sopenharmony_ci /* Check for end-of-string. */ 30708c2ecf20Sopenharmony_ci if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 30718c2ecf20Sopenharmony_ci /* There is no next byte. */ 30728c2ecf20Sopenharmony_ci if (u8c->ccc == STOPPER) 30738c2ecf20Sopenharmony_ci return 0; 30748c2ecf20Sopenharmony_ci /* End-of-string during a scan counts as a stopper. */ 30758c2ecf20Sopenharmony_ci ccc = STOPPER; 30768c2ecf20Sopenharmony_ci goto ccc_mismatch; 30778c2ecf20Sopenharmony_ci } else if ((*u8c->s & 0xC0) == 0x80) { 30788c2ecf20Sopenharmony_ci /* This is a continuation of the current character. */ 30798c2ecf20Sopenharmony_ci if (!u8c->p) 30808c2ecf20Sopenharmony_ci u8c->len--; 30818c2ecf20Sopenharmony_ci return (unsigned char)*u8c->s++; 30828c2ecf20Sopenharmony_ci } 30838c2ecf20Sopenharmony_ci 30848c2ecf20Sopenharmony_ci /* Look up the data for the current character. */ 30858c2ecf20Sopenharmony_ci if (u8c->p) { 30868c2ecf20Sopenharmony_ci leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); 30878c2ecf20Sopenharmony_ci } else { 30888c2ecf20Sopenharmony_ci leaf = utf8nlookup(u8c->tree, u8c->hangul, 30898c2ecf20Sopenharmony_ci u8c->s, u8c->len); 30908c2ecf20Sopenharmony_ci } 30918c2ecf20Sopenharmony_ci 30928c2ecf20Sopenharmony_ci /* No leaf found implies that the input is a binary blob. */ 30938c2ecf20Sopenharmony_ci if (!leaf) 30948c2ecf20Sopenharmony_ci return -1; 30958c2ecf20Sopenharmony_ci 30968c2ecf20Sopenharmony_ci /* Characters that are too new have CCC 0. */ 30978c2ecf20Sopenharmony_ci if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { 30988c2ecf20Sopenharmony_ci ccc = STOPPER; 30998c2ecf20Sopenharmony_ci } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { 31008c2ecf20Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 31018c2ecf20Sopenharmony_ci u8c->p = u8c->s + utf8clen(u8c->s); 31028c2ecf20Sopenharmony_ci u8c->s = LEAF_STR(leaf); 31038c2ecf20Sopenharmony_ci /* Empty decomposition implies CCC 0. */ 31048c2ecf20Sopenharmony_ci if (*u8c->s == '\0') { 31058c2ecf20Sopenharmony_ci if (u8c->ccc == STOPPER) 31068c2ecf20Sopenharmony_ci continue; 31078c2ecf20Sopenharmony_ci ccc = STOPPER; 31088c2ecf20Sopenharmony_ci goto ccc_mismatch; 31098c2ecf20Sopenharmony_ci } 31108c2ecf20Sopenharmony_ci leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); 31118c2ecf20Sopenharmony_ci ccc = LEAF_CCC(leaf); 31128c2ecf20Sopenharmony_ci } 31138c2ecf20Sopenharmony_ci u8c->unichar = utf8decode(u8c->s); 31148c2ecf20Sopenharmony_ci 31158c2ecf20Sopenharmony_ci /* 31168c2ecf20Sopenharmony_ci * If this is not a stopper, then see if it updates 31178c2ecf20Sopenharmony_ci * the next canonical class to be emitted. 31188c2ecf20Sopenharmony_ci */ 31198c2ecf20Sopenharmony_ci if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 31208c2ecf20Sopenharmony_ci u8c->nccc = ccc; 31218c2ecf20Sopenharmony_ci 31228c2ecf20Sopenharmony_ci /* 31238c2ecf20Sopenharmony_ci * Return the current byte if this is the current 31248c2ecf20Sopenharmony_ci * combining class. 31258c2ecf20Sopenharmony_ci */ 31268c2ecf20Sopenharmony_ci if (ccc == u8c->ccc) { 31278c2ecf20Sopenharmony_ci if (!u8c->p) 31288c2ecf20Sopenharmony_ci u8c->len--; 31298c2ecf20Sopenharmony_ci return (unsigned char)*u8c->s++; 31308c2ecf20Sopenharmony_ci } 31318c2ecf20Sopenharmony_ci 31328c2ecf20Sopenharmony_ci /* Current combining class mismatch. */ 31338c2ecf20Sopenharmony_ci ccc_mismatch: 31348c2ecf20Sopenharmony_ci if (u8c->nccc == STOPPER) { 31358c2ecf20Sopenharmony_ci /* 31368c2ecf20Sopenharmony_ci * Scan forward for the first canonical class 31378c2ecf20Sopenharmony_ci * to be emitted. Save the position from 31388c2ecf20Sopenharmony_ci * which to restart. 31398c2ecf20Sopenharmony_ci */ 31408c2ecf20Sopenharmony_ci assert(u8c->ccc == STOPPER); 31418c2ecf20Sopenharmony_ci u8c->ccc = MINCCC - 1; 31428c2ecf20Sopenharmony_ci u8c->nccc = ccc; 31438c2ecf20Sopenharmony_ci u8c->sp = u8c->p; 31448c2ecf20Sopenharmony_ci u8c->ss = u8c->s; 31458c2ecf20Sopenharmony_ci u8c->slen = u8c->len; 31468c2ecf20Sopenharmony_ci if (!u8c->p) 31478c2ecf20Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 31488c2ecf20Sopenharmony_ci u8c->s += utf8clen(u8c->s); 31498c2ecf20Sopenharmony_ci } else if (ccc != STOPPER) { 31508c2ecf20Sopenharmony_ci /* Not a stopper, and not the ccc we're emitting. */ 31518c2ecf20Sopenharmony_ci if (!u8c->p) 31528c2ecf20Sopenharmony_ci u8c->len -= utf8clen(u8c->s); 31538c2ecf20Sopenharmony_ci u8c->s += utf8clen(u8c->s); 31548c2ecf20Sopenharmony_ci } else if (u8c->nccc != MAXCCC + 1) { 31558c2ecf20Sopenharmony_ci /* At a stopper, restart for next ccc. */ 31568c2ecf20Sopenharmony_ci u8c->ccc = u8c->nccc; 31578c2ecf20Sopenharmony_ci u8c->nccc = MAXCCC + 1; 31588c2ecf20Sopenharmony_ci u8c->s = u8c->ss; 31598c2ecf20Sopenharmony_ci u8c->p = u8c->sp; 31608c2ecf20Sopenharmony_ci u8c->len = u8c->slen; 31618c2ecf20Sopenharmony_ci } else { 31628c2ecf20Sopenharmony_ci /* All done, proceed from here. */ 31638c2ecf20Sopenharmony_ci u8c->ccc = STOPPER; 31648c2ecf20Sopenharmony_ci u8c->nccc = STOPPER; 31658c2ecf20Sopenharmony_ci u8c->sp = NULL; 31668c2ecf20Sopenharmony_ci u8c->ss = NULL; 31678c2ecf20Sopenharmony_ci u8c->slen = 0; 31688c2ecf20Sopenharmony_ci } 31698c2ecf20Sopenharmony_ci } 31708c2ecf20Sopenharmony_ci} 31718c2ecf20Sopenharmony_ci 31728c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 31738c2ecf20Sopenharmony_ci 31748c2ecf20Sopenharmony_cistatic int normalize_line(struct tree *tree) 31758c2ecf20Sopenharmony_ci{ 31768c2ecf20Sopenharmony_ci char *s; 31778c2ecf20Sopenharmony_ci char *t; 31788c2ecf20Sopenharmony_ci int c; 31798c2ecf20Sopenharmony_ci struct utf8cursor u8c; 31808c2ecf20Sopenharmony_ci 31818c2ecf20Sopenharmony_ci /* First test: null-terminated string. */ 31828c2ecf20Sopenharmony_ci s = buf2; 31838c2ecf20Sopenharmony_ci t = buf3; 31848c2ecf20Sopenharmony_ci if (utf8cursor(&u8c, tree, s)) 31858c2ecf20Sopenharmony_ci return -1; 31868c2ecf20Sopenharmony_ci while ((c = utf8byte(&u8c)) > 0) 31878c2ecf20Sopenharmony_ci if (c != (unsigned char)*t++) 31888c2ecf20Sopenharmony_ci return -1; 31898c2ecf20Sopenharmony_ci if (c < 0) 31908c2ecf20Sopenharmony_ci return -1; 31918c2ecf20Sopenharmony_ci if (*t != 0) 31928c2ecf20Sopenharmony_ci return -1; 31938c2ecf20Sopenharmony_ci 31948c2ecf20Sopenharmony_ci /* Second test: length-limited string. */ 31958c2ecf20Sopenharmony_ci s = buf2; 31968c2ecf20Sopenharmony_ci /* Replace NUL with a value that will cause an error if seen. */ 31978c2ecf20Sopenharmony_ci s[strlen(s) + 1] = -1; 31988c2ecf20Sopenharmony_ci t = buf3; 31998c2ecf20Sopenharmony_ci if (utf8cursor(&u8c, tree, s)) 32008c2ecf20Sopenharmony_ci return -1; 32018c2ecf20Sopenharmony_ci while ((c = utf8byte(&u8c)) > 0) 32028c2ecf20Sopenharmony_ci if (c != (unsigned char)*t++) 32038c2ecf20Sopenharmony_ci return -1; 32048c2ecf20Sopenharmony_ci if (c < 0) 32058c2ecf20Sopenharmony_ci return -1; 32068c2ecf20Sopenharmony_ci if (*t != 0) 32078c2ecf20Sopenharmony_ci return -1; 32088c2ecf20Sopenharmony_ci 32098c2ecf20Sopenharmony_ci return 0; 32108c2ecf20Sopenharmony_ci} 32118c2ecf20Sopenharmony_ci 32128c2ecf20Sopenharmony_cistatic void normalization_test(void) 32138c2ecf20Sopenharmony_ci{ 32148c2ecf20Sopenharmony_ci FILE *file; 32158c2ecf20Sopenharmony_ci unsigned int unichar; 32168c2ecf20Sopenharmony_ci struct unicode_data *data; 32178c2ecf20Sopenharmony_ci char *s; 32188c2ecf20Sopenharmony_ci char *t; 32198c2ecf20Sopenharmony_ci int ret; 32208c2ecf20Sopenharmony_ci int ignorables; 32218c2ecf20Sopenharmony_ci int tests = 0; 32228c2ecf20Sopenharmony_ci int failures = 0; 32238c2ecf20Sopenharmony_ci 32248c2ecf20Sopenharmony_ci if (verbose > 0) 32258c2ecf20Sopenharmony_ci printf("Parsing %s\n", test_name); 32268c2ecf20Sopenharmony_ci /* Step one, read data from file. */ 32278c2ecf20Sopenharmony_ci file = fopen(test_name, "r"); 32288c2ecf20Sopenharmony_ci if (!file) 32298c2ecf20Sopenharmony_ci open_fail(test_name, errno); 32308c2ecf20Sopenharmony_ci 32318c2ecf20Sopenharmony_ci while (fgets(line, LINESIZE, file)) { 32328c2ecf20Sopenharmony_ci ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];", 32338c2ecf20Sopenharmony_ci buf0, buf1); 32348c2ecf20Sopenharmony_ci if (ret != 2 || *line == '#') 32358c2ecf20Sopenharmony_ci continue; 32368c2ecf20Sopenharmony_ci s = buf0; 32378c2ecf20Sopenharmony_ci t = buf2; 32388c2ecf20Sopenharmony_ci while (*s) { 32398c2ecf20Sopenharmony_ci unichar = strtoul(s, &s, 16); 32408c2ecf20Sopenharmony_ci t += utf8encode(t, unichar); 32418c2ecf20Sopenharmony_ci } 32428c2ecf20Sopenharmony_ci *t = '\0'; 32438c2ecf20Sopenharmony_ci 32448c2ecf20Sopenharmony_ci ignorables = 0; 32458c2ecf20Sopenharmony_ci s = buf1; 32468c2ecf20Sopenharmony_ci t = buf3; 32478c2ecf20Sopenharmony_ci while (*s) { 32488c2ecf20Sopenharmony_ci unichar = strtoul(s, &s, 16); 32498c2ecf20Sopenharmony_ci data = &unicode_data[unichar]; 32508c2ecf20Sopenharmony_ci if (data->utf8nfdi && !*data->utf8nfdi) 32518c2ecf20Sopenharmony_ci ignorables = 1; 32528c2ecf20Sopenharmony_ci else 32538c2ecf20Sopenharmony_ci t += utf8encode(t, unichar); 32548c2ecf20Sopenharmony_ci } 32558c2ecf20Sopenharmony_ci *t = '\0'; 32568c2ecf20Sopenharmony_ci 32578c2ecf20Sopenharmony_ci tests++; 32588c2ecf20Sopenharmony_ci if (normalize_line(nfdi_tree) < 0) { 32598c2ecf20Sopenharmony_ci printf("Line %s -> %s", buf0, buf1); 32608c2ecf20Sopenharmony_ci if (ignorables) 32618c2ecf20Sopenharmony_ci printf(" (ignorables removed)"); 32628c2ecf20Sopenharmony_ci printf(" failure\n"); 32638c2ecf20Sopenharmony_ci failures++; 32648c2ecf20Sopenharmony_ci } 32658c2ecf20Sopenharmony_ci } 32668c2ecf20Sopenharmony_ci fclose(file); 32678c2ecf20Sopenharmony_ci if (verbose > 0) 32688c2ecf20Sopenharmony_ci printf("Ran %d tests with %d failures\n", tests, failures); 32698c2ecf20Sopenharmony_ci if (failures) 32708c2ecf20Sopenharmony_ci file_fail(test_name); 32718c2ecf20Sopenharmony_ci} 32728c2ecf20Sopenharmony_ci 32738c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 32748c2ecf20Sopenharmony_ci 32758c2ecf20Sopenharmony_cistatic void write_file(void) 32768c2ecf20Sopenharmony_ci{ 32778c2ecf20Sopenharmony_ci FILE *file; 32788c2ecf20Sopenharmony_ci int i; 32798c2ecf20Sopenharmony_ci int j; 32808c2ecf20Sopenharmony_ci int t; 32818c2ecf20Sopenharmony_ci int gen; 32828c2ecf20Sopenharmony_ci 32838c2ecf20Sopenharmony_ci if (verbose > 0) 32848c2ecf20Sopenharmony_ci printf("Writing %s\n", utf8_name); 32858c2ecf20Sopenharmony_ci file = fopen(utf8_name, "w"); 32868c2ecf20Sopenharmony_ci if (!file) 32878c2ecf20Sopenharmony_ci open_fail(utf8_name, errno); 32888c2ecf20Sopenharmony_ci 32898c2ecf20Sopenharmony_ci fprintf(file, "/* This file is generated code, do not edit. */\n"); 32908c2ecf20Sopenharmony_ci fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); 32918c2ecf20Sopenharmony_ci fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); 32928c2ecf20Sopenharmony_ci fprintf(file, "#endif\n"); 32938c2ecf20Sopenharmony_ci fprintf(file, "\n"); 32948c2ecf20Sopenharmony_ci fprintf(file, "static const unsigned int utf8vers = %#x;\n", 32958c2ecf20Sopenharmony_ci unicode_maxage); 32968c2ecf20Sopenharmony_ci fprintf(file, "\n"); 32978c2ecf20Sopenharmony_ci fprintf(file, "static const unsigned int utf8agetab[] = {\n"); 32988c2ecf20Sopenharmony_ci for (i = 0; i != ages_count; i++) 32998c2ecf20Sopenharmony_ci fprintf(file, "\t%#x%s\n", ages[i], 33008c2ecf20Sopenharmony_ci ages[i] == unicode_maxage ? "" : ","); 33018c2ecf20Sopenharmony_ci fprintf(file, "};\n"); 33028c2ecf20Sopenharmony_ci fprintf(file, "\n"); 33038c2ecf20Sopenharmony_ci fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n"); 33048c2ecf20Sopenharmony_ci t = 0; 33058c2ecf20Sopenharmony_ci for (gen = 0; gen < ages_count; gen++) { 33068c2ecf20Sopenharmony_ci fprintf(file, "\t{ %#x, %d }%s\n", 33078c2ecf20Sopenharmony_ci ages[gen], trees[t].index, 33088c2ecf20Sopenharmony_ci ages[gen] == unicode_maxage ? "" : ","); 33098c2ecf20Sopenharmony_ci if (trees[t].maxage == ages[gen]) 33108c2ecf20Sopenharmony_ci t += 2; 33118c2ecf20Sopenharmony_ci } 33128c2ecf20Sopenharmony_ci fprintf(file, "};\n"); 33138c2ecf20Sopenharmony_ci fprintf(file, "\n"); 33148c2ecf20Sopenharmony_ci fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n"); 33158c2ecf20Sopenharmony_ci t = 1; 33168c2ecf20Sopenharmony_ci for (gen = 0; gen < ages_count; gen++) { 33178c2ecf20Sopenharmony_ci fprintf(file, "\t{ %#x, %d }%s\n", 33188c2ecf20Sopenharmony_ci ages[gen], trees[t].index, 33198c2ecf20Sopenharmony_ci ages[gen] == unicode_maxage ? "" : ","); 33208c2ecf20Sopenharmony_ci if (trees[t].maxage == ages[gen]) 33218c2ecf20Sopenharmony_ci t += 2; 33228c2ecf20Sopenharmony_ci } 33238c2ecf20Sopenharmony_ci fprintf(file, "};\n"); 33248c2ecf20Sopenharmony_ci fprintf(file, "\n"); 33258c2ecf20Sopenharmony_ci fprintf(file, "static const unsigned char utf8data[%zd] = {\n", 33268c2ecf20Sopenharmony_ci utf8data_size); 33278c2ecf20Sopenharmony_ci t = 0; 33288c2ecf20Sopenharmony_ci for (i = 0; i != utf8data_size; i += 16) { 33298c2ecf20Sopenharmony_ci if (i == trees[t].index) { 33308c2ecf20Sopenharmony_ci fprintf(file, "\t/* %s_%x */\n", 33318c2ecf20Sopenharmony_ci trees[t].type, trees[t].maxage); 33328c2ecf20Sopenharmony_ci if (t < trees_count-1) 33338c2ecf20Sopenharmony_ci t++; 33348c2ecf20Sopenharmony_ci } 33358c2ecf20Sopenharmony_ci fprintf(file, "\t"); 33368c2ecf20Sopenharmony_ci for (j = i; j != i + 16; j++) 33378c2ecf20Sopenharmony_ci fprintf(file, "0x%.2x%s", utf8data[j], 33388c2ecf20Sopenharmony_ci (j < utf8data_size -1 ? "," : "")); 33398c2ecf20Sopenharmony_ci fprintf(file, "\n"); 33408c2ecf20Sopenharmony_ci } 33418c2ecf20Sopenharmony_ci fprintf(file, "};\n"); 33428c2ecf20Sopenharmony_ci fclose(file); 33438c2ecf20Sopenharmony_ci} 33448c2ecf20Sopenharmony_ci 33458c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */ 33468c2ecf20Sopenharmony_ci 33478c2ecf20Sopenharmony_ciint main(int argc, char *argv[]) 33488c2ecf20Sopenharmony_ci{ 33498c2ecf20Sopenharmony_ci unsigned int unichar; 33508c2ecf20Sopenharmony_ci int opt; 33518c2ecf20Sopenharmony_ci 33528c2ecf20Sopenharmony_ci argv0 = argv[0]; 33538c2ecf20Sopenharmony_ci 33548c2ecf20Sopenharmony_ci while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { 33558c2ecf20Sopenharmony_ci switch (opt) { 33568c2ecf20Sopenharmony_ci case 'a': 33578c2ecf20Sopenharmony_ci age_name = optarg; 33588c2ecf20Sopenharmony_ci break; 33598c2ecf20Sopenharmony_ci case 'c': 33608c2ecf20Sopenharmony_ci ccc_name = optarg; 33618c2ecf20Sopenharmony_ci break; 33628c2ecf20Sopenharmony_ci case 'd': 33638c2ecf20Sopenharmony_ci data_name = optarg; 33648c2ecf20Sopenharmony_ci break; 33658c2ecf20Sopenharmony_ci case 'f': 33668c2ecf20Sopenharmony_ci fold_name = optarg; 33678c2ecf20Sopenharmony_ci break; 33688c2ecf20Sopenharmony_ci case 'n': 33698c2ecf20Sopenharmony_ci norm_name = optarg; 33708c2ecf20Sopenharmony_ci break; 33718c2ecf20Sopenharmony_ci case 'o': 33728c2ecf20Sopenharmony_ci utf8_name = optarg; 33738c2ecf20Sopenharmony_ci break; 33748c2ecf20Sopenharmony_ci case 'p': 33758c2ecf20Sopenharmony_ci prop_name = optarg; 33768c2ecf20Sopenharmony_ci break; 33778c2ecf20Sopenharmony_ci case 't': 33788c2ecf20Sopenharmony_ci test_name = optarg; 33798c2ecf20Sopenharmony_ci break; 33808c2ecf20Sopenharmony_ci case 'v': 33818c2ecf20Sopenharmony_ci verbose++; 33828c2ecf20Sopenharmony_ci break; 33838c2ecf20Sopenharmony_ci case 'h': 33848c2ecf20Sopenharmony_ci help(); 33858c2ecf20Sopenharmony_ci exit(0); 33868c2ecf20Sopenharmony_ci default: 33878c2ecf20Sopenharmony_ci usage(); 33888c2ecf20Sopenharmony_ci } 33898c2ecf20Sopenharmony_ci } 33908c2ecf20Sopenharmony_ci 33918c2ecf20Sopenharmony_ci if (verbose > 1) 33928c2ecf20Sopenharmony_ci help(); 33938c2ecf20Sopenharmony_ci for (unichar = 0; unichar != 0x110000; unichar++) 33948c2ecf20Sopenharmony_ci unicode_data[unichar].code = unichar; 33958c2ecf20Sopenharmony_ci age_init(); 33968c2ecf20Sopenharmony_ci ccc_init(); 33978c2ecf20Sopenharmony_ci nfdi_init(); 33988c2ecf20Sopenharmony_ci nfdicf_init(); 33998c2ecf20Sopenharmony_ci ignore_init(); 34008c2ecf20Sopenharmony_ci corrections_init(); 34018c2ecf20Sopenharmony_ci hangul_decompose(); 34028c2ecf20Sopenharmony_ci nfdi_decompose(); 34038c2ecf20Sopenharmony_ci nfdicf_decompose(); 34048c2ecf20Sopenharmony_ci utf8_init(); 34058c2ecf20Sopenharmony_ci trees_init(); 34068c2ecf20Sopenharmony_ci trees_populate(); 34078c2ecf20Sopenharmony_ci trees_reduce(); 34088c2ecf20Sopenharmony_ci trees_verify(); 34098c2ecf20Sopenharmony_ci /* Prevent "unused function" warning. */ 34108c2ecf20Sopenharmony_ci (void)lookup(nfdi_tree, " "); 34118c2ecf20Sopenharmony_ci if (verbose > 2) 34128c2ecf20Sopenharmony_ci tree_walk(nfdi_tree); 34138c2ecf20Sopenharmony_ci if (verbose > 2) 34148c2ecf20Sopenharmony_ci tree_walk(nfdicf_tree); 34158c2ecf20Sopenharmony_ci normalization_test(); 34168c2ecf20Sopenharmony_ci write_file(); 34178c2ecf20Sopenharmony_ci 34188c2ecf20Sopenharmony_ci return 0; 34198c2ecf20Sopenharmony_ci} 3420