18c2ecf20Sopenharmony_ci/*
28c2ecf20Sopenharmony_ci * Copyright (c) 2014 SGI.
38c2ecf20Sopenharmony_ci * All rights reserved.
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * This program is free software; you can redistribute it and/or
68c2ecf20Sopenharmony_ci * modify it under the terms of the GNU General Public License as
78c2ecf20Sopenharmony_ci * published by the Free Software Foundation.
88c2ecf20Sopenharmony_ci *
98c2ecf20Sopenharmony_ci * This program is distributed in the hope that it would be useful,
108c2ecf20Sopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
118c2ecf20Sopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
128c2ecf20Sopenharmony_ci * GNU General Public License for more details.
138c2ecf20Sopenharmony_ci *
148c2ecf20Sopenharmony_ci * You should have received a copy of the GNU General Public License
158c2ecf20Sopenharmony_ci * along with this program; if not, write the Free Software Foundation,
168c2ecf20Sopenharmony_ci * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
178c2ecf20Sopenharmony_ci */
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_ci/* Generator for a compact trie for unicode normalization */
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci#include <sys/types.h>
228c2ecf20Sopenharmony_ci#include <stddef.h>
238c2ecf20Sopenharmony_ci#include <stdlib.h>
248c2ecf20Sopenharmony_ci#include <stdio.h>
258c2ecf20Sopenharmony_ci#include <assert.h>
268c2ecf20Sopenharmony_ci#include <string.h>
278c2ecf20Sopenharmony_ci#include <unistd.h>
288c2ecf20Sopenharmony_ci#include <errno.h>
298c2ecf20Sopenharmony_ci
308c2ecf20Sopenharmony_ci/* Default names of the in- and output files. */
318c2ecf20Sopenharmony_ci
328c2ecf20Sopenharmony_ci#define AGE_NAME	"DerivedAge.txt"
338c2ecf20Sopenharmony_ci#define CCC_NAME	"DerivedCombiningClass.txt"
348c2ecf20Sopenharmony_ci#define PROP_NAME	"DerivedCoreProperties.txt"
358c2ecf20Sopenharmony_ci#define DATA_NAME	"UnicodeData.txt"
368c2ecf20Sopenharmony_ci#define FOLD_NAME	"CaseFolding.txt"
378c2ecf20Sopenharmony_ci#define NORM_NAME	"NormalizationCorrections.txt"
388c2ecf20Sopenharmony_ci#define TEST_NAME	"NormalizationTest.txt"
398c2ecf20Sopenharmony_ci#define UTF8_NAME	"utf8data.h"
408c2ecf20Sopenharmony_ci
418c2ecf20Sopenharmony_ciconst char	*age_name  = AGE_NAME;
428c2ecf20Sopenharmony_ciconst char	*ccc_name  = CCC_NAME;
438c2ecf20Sopenharmony_ciconst char	*prop_name = PROP_NAME;
448c2ecf20Sopenharmony_ciconst char	*data_name = DATA_NAME;
458c2ecf20Sopenharmony_ciconst char	*fold_name = FOLD_NAME;
468c2ecf20Sopenharmony_ciconst char	*norm_name = NORM_NAME;
478c2ecf20Sopenharmony_ciconst char	*test_name = TEST_NAME;
488c2ecf20Sopenharmony_ciconst char	*utf8_name = UTF8_NAME;
498c2ecf20Sopenharmony_ci
508c2ecf20Sopenharmony_ciint verbose = 0;
518c2ecf20Sopenharmony_ci
528c2ecf20Sopenharmony_ci/* An arbitrary line size limit on input lines. */
538c2ecf20Sopenharmony_ci
548c2ecf20Sopenharmony_ci#define LINESIZE	1024
558c2ecf20Sopenharmony_cichar line[LINESIZE];
568c2ecf20Sopenharmony_cichar buf0[LINESIZE];
578c2ecf20Sopenharmony_cichar buf1[LINESIZE];
588c2ecf20Sopenharmony_cichar buf2[LINESIZE];
598c2ecf20Sopenharmony_cichar buf3[LINESIZE];
608c2ecf20Sopenharmony_ci
618c2ecf20Sopenharmony_ciconst char *argv0;
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
648c2ecf20Sopenharmony_ci
658c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
668c2ecf20Sopenharmony_ci
678c2ecf20Sopenharmony_ci/*
688c2ecf20Sopenharmony_ci * Unicode version numbers consist of three parts: major, minor, and a
698c2ecf20Sopenharmony_ci * revision.  These numbers are packed into an unsigned int to obtain
708c2ecf20Sopenharmony_ci * a single version number.
718c2ecf20Sopenharmony_ci *
728c2ecf20Sopenharmony_ci * To save space in the generated trie, the unicode version is not
738c2ecf20Sopenharmony_ci * stored directly, instead we calculate a generation number from the
748c2ecf20Sopenharmony_ci * unicode versions seen in the DerivedAge file, and use that as an
758c2ecf20Sopenharmony_ci * index into a table of unicode versions.
768c2ecf20Sopenharmony_ci */
778c2ecf20Sopenharmony_ci#define UNICODE_MAJ_SHIFT		(16)
788c2ecf20Sopenharmony_ci#define UNICODE_MIN_SHIFT		(8)
798c2ecf20Sopenharmony_ci
808c2ecf20Sopenharmony_ci#define UNICODE_MAJ_MAX			((unsigned short)-1)
818c2ecf20Sopenharmony_ci#define UNICODE_MIN_MAX			((unsigned char)-1)
828c2ecf20Sopenharmony_ci#define UNICODE_REV_MAX			((unsigned char)-1)
838c2ecf20Sopenharmony_ci
848c2ecf20Sopenharmony_ci#define UNICODE_AGE(MAJ,MIN,REV)			\
858c2ecf20Sopenharmony_ci	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
868c2ecf20Sopenharmony_ci	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
878c2ecf20Sopenharmony_ci	 ((unsigned int)(REV)))
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_ciunsigned int *ages;
908c2ecf20Sopenharmony_ciint ages_count;
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ciunsigned int unicode_maxage;
938c2ecf20Sopenharmony_ci
948c2ecf20Sopenharmony_cistatic int age_valid(unsigned int major, unsigned int minor,
958c2ecf20Sopenharmony_ci		     unsigned int revision)
968c2ecf20Sopenharmony_ci{
978c2ecf20Sopenharmony_ci	if (major > UNICODE_MAJ_MAX)
988c2ecf20Sopenharmony_ci		return 0;
998c2ecf20Sopenharmony_ci	if (minor > UNICODE_MIN_MAX)
1008c2ecf20Sopenharmony_ci		return 0;
1018c2ecf20Sopenharmony_ci	if (revision > UNICODE_REV_MAX)
1028c2ecf20Sopenharmony_ci		return 0;
1038c2ecf20Sopenharmony_ci	return 1;
1048c2ecf20Sopenharmony_ci}
1058c2ecf20Sopenharmony_ci
1068c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
1078c2ecf20Sopenharmony_ci
1088c2ecf20Sopenharmony_ci/*
1098c2ecf20Sopenharmony_ci * utf8trie_t
1108c2ecf20Sopenharmony_ci *
1118c2ecf20Sopenharmony_ci * A compact binary tree, used to decode UTF-8 characters.
1128c2ecf20Sopenharmony_ci *
1138c2ecf20Sopenharmony_ci * Internal nodes are one byte for the node itself, and up to three
1148c2ecf20Sopenharmony_ci * bytes for an offset into the tree.  The first byte contains the
1158c2ecf20Sopenharmony_ci * following information:
1168c2ecf20Sopenharmony_ci *  NEXTBYTE  - flag        - advance to next byte if set
1178c2ecf20Sopenharmony_ci *  BITNUM    - 3 bit field - the bit number to tested
1188c2ecf20Sopenharmony_ci *  OFFLEN    - 2 bit field - number of bytes in the offset
1198c2ecf20Sopenharmony_ci * if offlen == 0 (non-branching node)
1208c2ecf20Sopenharmony_ci *  RIGHTPATH - 1 bit field - set if the following node is for the
1218c2ecf20Sopenharmony_ci *                            right-hand path (tested bit is set)
1228c2ecf20Sopenharmony_ci *  TRIENODE  - 1 bit field - set if the following node is an internal
1238c2ecf20Sopenharmony_ci *                            node, otherwise it is a leaf node
1248c2ecf20Sopenharmony_ci * if offlen != 0 (branching node)
1258c2ecf20Sopenharmony_ci *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
1268c2ecf20Sopenharmony_ci *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
1278c2ecf20Sopenharmony_ci *
1288c2ecf20Sopenharmony_ci * Due to the way utf8 works, there cannot be branching nodes with
1298c2ecf20Sopenharmony_ci * NEXTBYTE set, and moreover those nodes always have a righthand
1308c2ecf20Sopenharmony_ci * descendant.
1318c2ecf20Sopenharmony_ci */
1328c2ecf20Sopenharmony_citypedef unsigned char utf8trie_t;
1338c2ecf20Sopenharmony_ci#define BITNUM		0x07
1348c2ecf20Sopenharmony_ci#define NEXTBYTE	0x08
1358c2ecf20Sopenharmony_ci#define OFFLEN		0x30
1368c2ecf20Sopenharmony_ci#define OFFLEN_SHIFT	4
1378c2ecf20Sopenharmony_ci#define RIGHTPATH	0x40
1388c2ecf20Sopenharmony_ci#define TRIENODE	0x80
1398c2ecf20Sopenharmony_ci#define RIGHTNODE	0x40
1408c2ecf20Sopenharmony_ci#define LEFTNODE	0x80
1418c2ecf20Sopenharmony_ci
1428c2ecf20Sopenharmony_ci/*
1438c2ecf20Sopenharmony_ci * utf8leaf_t
1448c2ecf20Sopenharmony_ci *
1458c2ecf20Sopenharmony_ci * The leaves of the trie are embedded in the trie, and so the same
1468c2ecf20Sopenharmony_ci * underlying datatype, unsigned char.
1478c2ecf20Sopenharmony_ci *
1488c2ecf20Sopenharmony_ci * leaf[0]: The unicode version, stored as a generation number that is
1498c2ecf20Sopenharmony_ci *          an index into utf8agetab[].  With this we can filter code
1508c2ecf20Sopenharmony_ci *          points based on the unicode version in which they were
1518c2ecf20Sopenharmony_ci *          defined.  The CCC of a non-defined code point is 0.
1528c2ecf20Sopenharmony_ci * leaf[1]: Canonical Combining Class. During normalization, we need
1538c2ecf20Sopenharmony_ci *          to do a stable sort into ascending order of all characters
1548c2ecf20Sopenharmony_ci *          with a non-zero CCC that occur between two characters with
1558c2ecf20Sopenharmony_ci *          a CCC of 0, or at the begin or end of a string.
1568c2ecf20Sopenharmony_ci *          The unicode standard guarantees that all CCC values are
1578c2ecf20Sopenharmony_ci *          between 0 and 254 inclusive, which leaves 255 available as
1588c2ecf20Sopenharmony_ci *          a special value.
1598c2ecf20Sopenharmony_ci *          Code points with CCC 0 are known as stoppers.
1608c2ecf20Sopenharmony_ci * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
1618c2ecf20Sopenharmony_ci *          start of a NUL-terminated string that is the decomposition
1628c2ecf20Sopenharmony_ci *          of the character.
1638c2ecf20Sopenharmony_ci *          The CCC of a decomposable character is the same as the CCC
1648c2ecf20Sopenharmony_ci *          of the first character of its decomposition.
1658c2ecf20Sopenharmony_ci *          Some characters decompose as the empty string: these are
1668c2ecf20Sopenharmony_ci *          characters with the Default_Ignorable_Code_Point property.
1678c2ecf20Sopenharmony_ci *          These do affect normalization, as they all have CCC 0.
1688c2ecf20Sopenharmony_ci *
1698c2ecf20Sopenharmony_ci * The decompositions in the trie have been fully expanded.
1708c2ecf20Sopenharmony_ci *
1718c2ecf20Sopenharmony_ci * Casefolding, if applicable, is also done using decompositions.
1728c2ecf20Sopenharmony_ci */
1738c2ecf20Sopenharmony_citypedef unsigned char utf8leaf_t;
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_ci#define LEAF_GEN(LEAF)	((LEAF)[0])
1768c2ecf20Sopenharmony_ci#define LEAF_CCC(LEAF)	((LEAF)[1])
1778c2ecf20Sopenharmony_ci#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci#define MAXGEN		(255)
1808c2ecf20Sopenharmony_ci
1818c2ecf20Sopenharmony_ci#define MINCCC		(0)
1828c2ecf20Sopenharmony_ci#define MAXCCC		(254)
1838c2ecf20Sopenharmony_ci#define STOPPER		(0)
1848c2ecf20Sopenharmony_ci#define DECOMPOSE	(255)
1858c2ecf20Sopenharmony_ci#define HANGUL		((char)(255))
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci#define UTF8HANGULLEAF	(12)
1888c2ecf20Sopenharmony_ci
1898c2ecf20Sopenharmony_cistruct tree;
1908c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
1918c2ecf20Sopenharmony_ci			       const char *, size_t);
1928c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
1938c2ecf20Sopenharmony_ci
1948c2ecf20Sopenharmony_ciunsigned char *utf8data;
1958c2ecf20Sopenharmony_cisize_t utf8data_size;
1968c2ecf20Sopenharmony_ci
1978c2ecf20Sopenharmony_ciutf8trie_t *nfdi;
1988c2ecf20Sopenharmony_ciutf8trie_t *nfdicf;
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
2018c2ecf20Sopenharmony_ci
2028c2ecf20Sopenharmony_ci/*
2038c2ecf20Sopenharmony_ci * UTF8 valid ranges.
2048c2ecf20Sopenharmony_ci *
2058c2ecf20Sopenharmony_ci * The UTF-8 encoding spreads the bits of a 32bit word over several
2068c2ecf20Sopenharmony_ci * bytes. This table gives the ranges that can be held and how they'd
2078c2ecf20Sopenharmony_ci * be represented.
2088c2ecf20Sopenharmony_ci *
2098c2ecf20Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx
2108c2ecf20Sopenharmony_ci * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
2118c2ecf20Sopenharmony_ci * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
2128c2ecf20Sopenharmony_ci * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2138c2ecf20Sopenharmony_ci * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2148c2ecf20Sopenharmony_ci * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2158c2ecf20Sopenharmony_ci *
2168c2ecf20Sopenharmony_ci * There is an additional requirement on UTF-8, in that only the
2178c2ecf20Sopenharmony_ci * shortest representation of a 32bit value is to be used.  A decoder
2188c2ecf20Sopenharmony_ci * must not decode sequences that do not satisfy this requirement.
2198c2ecf20Sopenharmony_ci * Thus the allowed ranges have a lower bound.
2208c2ecf20Sopenharmony_ci *
2218c2ecf20Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx
2228c2ecf20Sopenharmony_ci * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
2238c2ecf20Sopenharmony_ci * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
2248c2ecf20Sopenharmony_ci * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2258c2ecf20Sopenharmony_ci * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2268c2ecf20Sopenharmony_ci * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2278c2ecf20Sopenharmony_ci *
2288c2ecf20Sopenharmony_ci * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
2298c2ecf20Sopenharmony_ci * 17 planes of 65536 values.  This limits the sequences actually seen
2308c2ecf20Sopenharmony_ci * even more, to just the following.
2318c2ecf20Sopenharmony_ci *
2328c2ecf20Sopenharmony_ci *          0 -     0x7f: 0                     0x7f
2338c2ecf20Sopenharmony_ci *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
2348c2ecf20Sopenharmony_ci *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
2358c2ecf20Sopenharmony_ci *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
2368c2ecf20Sopenharmony_ci *
2378c2ecf20Sopenharmony_ci * Even within those ranges not all values are allowed: the surrogates
2388c2ecf20Sopenharmony_ci * 0xd800 - 0xdfff should never be seen.
2398c2ecf20Sopenharmony_ci *
2408c2ecf20Sopenharmony_ci * Note that the longest sequence seen with valid usage is 4 bytes,
2418c2ecf20Sopenharmony_ci * the same a single UTF-32 character.  This makes the UTF-8
2428c2ecf20Sopenharmony_ci * representation of Unicode strictly smaller than UTF-32.
2438c2ecf20Sopenharmony_ci *
2448c2ecf20Sopenharmony_ci * The shortest sequence requirement was introduced by:
2458c2ecf20Sopenharmony_ci *    Corrigendum #1: UTF-8 Shortest Form
2468c2ecf20Sopenharmony_ci * It can be found here:
2478c2ecf20Sopenharmony_ci *    http://www.unicode.org/versions/corrigendum1.html
2488c2ecf20Sopenharmony_ci *
2498c2ecf20Sopenharmony_ci */
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci#define UTF8_2_BITS     0xC0
2528c2ecf20Sopenharmony_ci#define UTF8_3_BITS     0xE0
2538c2ecf20Sopenharmony_ci#define UTF8_4_BITS     0xF0
2548c2ecf20Sopenharmony_ci#define UTF8_N_BITS     0x80
2558c2ecf20Sopenharmony_ci#define UTF8_2_MASK     0xE0
2568c2ecf20Sopenharmony_ci#define UTF8_3_MASK     0xF0
2578c2ecf20Sopenharmony_ci#define UTF8_4_MASK     0xF8
2588c2ecf20Sopenharmony_ci#define UTF8_N_MASK     0xC0
2598c2ecf20Sopenharmony_ci#define UTF8_V_MASK     0x3F
2608c2ecf20Sopenharmony_ci#define UTF8_V_SHIFT    6
2618c2ecf20Sopenharmony_ci
2628c2ecf20Sopenharmony_cistatic int utf8encode(char *str, unsigned int val)
2638c2ecf20Sopenharmony_ci{
2648c2ecf20Sopenharmony_ci	int len;
2658c2ecf20Sopenharmony_ci
2668c2ecf20Sopenharmony_ci	if (val < 0x80) {
2678c2ecf20Sopenharmony_ci		str[0] = val;
2688c2ecf20Sopenharmony_ci		len = 1;
2698c2ecf20Sopenharmony_ci	} else if (val < 0x800) {
2708c2ecf20Sopenharmony_ci		str[1] = val & UTF8_V_MASK;
2718c2ecf20Sopenharmony_ci		str[1] |= UTF8_N_BITS;
2728c2ecf20Sopenharmony_ci		val >>= UTF8_V_SHIFT;
2738c2ecf20Sopenharmony_ci		str[0] = val;
2748c2ecf20Sopenharmony_ci		str[0] |= UTF8_2_BITS;
2758c2ecf20Sopenharmony_ci		len = 2;
2768c2ecf20Sopenharmony_ci	} else if (val < 0x10000) {
2778c2ecf20Sopenharmony_ci		str[2] = val & UTF8_V_MASK;
2788c2ecf20Sopenharmony_ci		str[2] |= UTF8_N_BITS;
2798c2ecf20Sopenharmony_ci		val >>= UTF8_V_SHIFT;
2808c2ecf20Sopenharmony_ci		str[1] = val & UTF8_V_MASK;
2818c2ecf20Sopenharmony_ci		str[1] |= UTF8_N_BITS;
2828c2ecf20Sopenharmony_ci		val >>= UTF8_V_SHIFT;
2838c2ecf20Sopenharmony_ci		str[0] = val;
2848c2ecf20Sopenharmony_ci		str[0] |= UTF8_3_BITS;
2858c2ecf20Sopenharmony_ci		len = 3;
2868c2ecf20Sopenharmony_ci	} else if (val < 0x110000) {
2878c2ecf20Sopenharmony_ci		str[3] = val & UTF8_V_MASK;
2888c2ecf20Sopenharmony_ci		str[3] |= UTF8_N_BITS;
2898c2ecf20Sopenharmony_ci		val >>= UTF8_V_SHIFT;
2908c2ecf20Sopenharmony_ci		str[2] = val & UTF8_V_MASK;
2918c2ecf20Sopenharmony_ci		str[2] |= UTF8_N_BITS;
2928c2ecf20Sopenharmony_ci		val >>= UTF8_V_SHIFT;
2938c2ecf20Sopenharmony_ci		str[1] = val & UTF8_V_MASK;
2948c2ecf20Sopenharmony_ci		str[1] |= UTF8_N_BITS;
2958c2ecf20Sopenharmony_ci		val >>= UTF8_V_SHIFT;
2968c2ecf20Sopenharmony_ci		str[0] = val;
2978c2ecf20Sopenharmony_ci		str[0] |= UTF8_4_BITS;
2988c2ecf20Sopenharmony_ci		len = 4;
2998c2ecf20Sopenharmony_ci	} else {
3008c2ecf20Sopenharmony_ci		printf("%#x: illegal val\n", val);
3018c2ecf20Sopenharmony_ci		len = 0;
3028c2ecf20Sopenharmony_ci	}
3038c2ecf20Sopenharmony_ci	return len;
3048c2ecf20Sopenharmony_ci}
3058c2ecf20Sopenharmony_ci
3068c2ecf20Sopenharmony_cistatic unsigned int utf8decode(const char *str)
3078c2ecf20Sopenharmony_ci{
3088c2ecf20Sopenharmony_ci	const unsigned char *s = (const unsigned char*)str;
3098c2ecf20Sopenharmony_ci	unsigned int unichar = 0;
3108c2ecf20Sopenharmony_ci
3118c2ecf20Sopenharmony_ci	if (*s < 0x80) {
3128c2ecf20Sopenharmony_ci		unichar = *s;
3138c2ecf20Sopenharmony_ci	} else if (*s < UTF8_3_BITS) {
3148c2ecf20Sopenharmony_ci		unichar = *s++ & 0x1F;
3158c2ecf20Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
3168c2ecf20Sopenharmony_ci		unichar |= *s & 0x3F;
3178c2ecf20Sopenharmony_ci	} else if (*s < UTF8_4_BITS) {
3188c2ecf20Sopenharmony_ci		unichar = *s++ & 0x0F;
3198c2ecf20Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
3208c2ecf20Sopenharmony_ci		unichar |= *s++ & 0x3F;
3218c2ecf20Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
3228c2ecf20Sopenharmony_ci		unichar |= *s & 0x3F;
3238c2ecf20Sopenharmony_ci	} else {
3248c2ecf20Sopenharmony_ci		unichar = *s++ & 0x0F;
3258c2ecf20Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
3268c2ecf20Sopenharmony_ci		unichar |= *s++ & 0x3F;
3278c2ecf20Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
3288c2ecf20Sopenharmony_ci		unichar |= *s++ & 0x3F;
3298c2ecf20Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
3308c2ecf20Sopenharmony_ci		unichar |= *s & 0x3F;
3318c2ecf20Sopenharmony_ci	}
3328c2ecf20Sopenharmony_ci	return unichar;
3338c2ecf20Sopenharmony_ci}
3348c2ecf20Sopenharmony_ci
3358c2ecf20Sopenharmony_cistatic int utf32valid(unsigned int unichar)
3368c2ecf20Sopenharmony_ci{
3378c2ecf20Sopenharmony_ci	return unichar < 0x110000;
3388c2ecf20Sopenharmony_ci}
3398c2ecf20Sopenharmony_ci
3408c2ecf20Sopenharmony_ci#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
3418c2ecf20Sopenharmony_ci
3428c2ecf20Sopenharmony_ci#define NODE 1
3438c2ecf20Sopenharmony_ci#define LEAF 0
3448c2ecf20Sopenharmony_ci
3458c2ecf20Sopenharmony_cistruct tree {
3468c2ecf20Sopenharmony_ci	void *root;
3478c2ecf20Sopenharmony_ci	int childnode;
3488c2ecf20Sopenharmony_ci	const char *type;
3498c2ecf20Sopenharmony_ci	unsigned int maxage;
3508c2ecf20Sopenharmony_ci	struct tree *next;
3518c2ecf20Sopenharmony_ci	int (*leaf_equal)(void *, void *);
3528c2ecf20Sopenharmony_ci	void (*leaf_print)(void *, int);
3538c2ecf20Sopenharmony_ci	int (*leaf_mark)(void *);
3548c2ecf20Sopenharmony_ci	int (*leaf_size)(void *);
3558c2ecf20Sopenharmony_ci	int *(*leaf_index)(struct tree *, void *);
3568c2ecf20Sopenharmony_ci	unsigned char *(*leaf_emit)(void *, unsigned char *);
3578c2ecf20Sopenharmony_ci	int leafindex[0x110000];
3588c2ecf20Sopenharmony_ci	int index;
3598c2ecf20Sopenharmony_ci};
3608c2ecf20Sopenharmony_ci
3618c2ecf20Sopenharmony_cistruct node {
3628c2ecf20Sopenharmony_ci	int index;
3638c2ecf20Sopenharmony_ci	int offset;
3648c2ecf20Sopenharmony_ci	int mark;
3658c2ecf20Sopenharmony_ci	int size;
3668c2ecf20Sopenharmony_ci	struct node *parent;
3678c2ecf20Sopenharmony_ci	void *left;
3688c2ecf20Sopenharmony_ci	void *right;
3698c2ecf20Sopenharmony_ci	unsigned char bitnum;
3708c2ecf20Sopenharmony_ci	unsigned char nextbyte;
3718c2ecf20Sopenharmony_ci	unsigned char leftnode;
3728c2ecf20Sopenharmony_ci	unsigned char rightnode;
3738c2ecf20Sopenharmony_ci	unsigned int keybits;
3748c2ecf20Sopenharmony_ci	unsigned int keymask;
3758c2ecf20Sopenharmony_ci};
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci/*
3788c2ecf20Sopenharmony_ci * Example lookup function for a tree.
3798c2ecf20Sopenharmony_ci */
3808c2ecf20Sopenharmony_cistatic void *lookup(struct tree *tree, const char *key)
3818c2ecf20Sopenharmony_ci{
3828c2ecf20Sopenharmony_ci	struct node *node;
3838c2ecf20Sopenharmony_ci	void *leaf = NULL;
3848c2ecf20Sopenharmony_ci
3858c2ecf20Sopenharmony_ci	node = tree->root;
3868c2ecf20Sopenharmony_ci	while (!leaf && node) {
3878c2ecf20Sopenharmony_ci		if (node->nextbyte)
3888c2ecf20Sopenharmony_ci			key++;
3898c2ecf20Sopenharmony_ci		if (*key & (1 << (node->bitnum & 7))) {
3908c2ecf20Sopenharmony_ci			/* Right leg */
3918c2ecf20Sopenharmony_ci			if (node->rightnode == NODE) {
3928c2ecf20Sopenharmony_ci				node = node->right;
3938c2ecf20Sopenharmony_ci			} else if (node->rightnode == LEAF) {
3948c2ecf20Sopenharmony_ci				leaf = node->right;
3958c2ecf20Sopenharmony_ci			} else {
3968c2ecf20Sopenharmony_ci				node = NULL;
3978c2ecf20Sopenharmony_ci			}
3988c2ecf20Sopenharmony_ci		} else {
3998c2ecf20Sopenharmony_ci			/* Left leg */
4008c2ecf20Sopenharmony_ci			if (node->leftnode == NODE) {
4018c2ecf20Sopenharmony_ci				node = node->left;
4028c2ecf20Sopenharmony_ci			} else if (node->leftnode == LEAF) {
4038c2ecf20Sopenharmony_ci				leaf = node->left;
4048c2ecf20Sopenharmony_ci			} else {
4058c2ecf20Sopenharmony_ci				node = NULL;
4068c2ecf20Sopenharmony_ci			}
4078c2ecf20Sopenharmony_ci		}
4088c2ecf20Sopenharmony_ci	}
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_ci	return leaf;
4118c2ecf20Sopenharmony_ci}
4128c2ecf20Sopenharmony_ci
4138c2ecf20Sopenharmony_ci/*
4148c2ecf20Sopenharmony_ci * A simple non-recursive tree walker: keep track of visits to the
4158c2ecf20Sopenharmony_ci * left and right branches in the leftmask and rightmask.
4168c2ecf20Sopenharmony_ci */
4178c2ecf20Sopenharmony_cistatic void tree_walk(struct tree *tree)
4188c2ecf20Sopenharmony_ci{
4198c2ecf20Sopenharmony_ci	struct node *node;
4208c2ecf20Sopenharmony_ci	unsigned int leftmask;
4218c2ecf20Sopenharmony_ci	unsigned int rightmask;
4228c2ecf20Sopenharmony_ci	unsigned int bitmask;
4238c2ecf20Sopenharmony_ci	int indent = 1;
4248c2ecf20Sopenharmony_ci	int nodes, singletons, leaves;
4258c2ecf20Sopenharmony_ci
4268c2ecf20Sopenharmony_ci	nodes = singletons = leaves = 0;
4278c2ecf20Sopenharmony_ci
4288c2ecf20Sopenharmony_ci	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
4298c2ecf20Sopenharmony_ci	if (tree->childnode == LEAF) {
4308c2ecf20Sopenharmony_ci		assert(tree->root);
4318c2ecf20Sopenharmony_ci		tree->leaf_print(tree->root, indent);
4328c2ecf20Sopenharmony_ci		leaves = 1;
4338c2ecf20Sopenharmony_ci	} else {
4348c2ecf20Sopenharmony_ci		assert(tree->childnode == NODE);
4358c2ecf20Sopenharmony_ci		node = tree->root;
4368c2ecf20Sopenharmony_ci		leftmask = rightmask = 0;
4378c2ecf20Sopenharmony_ci		while (node) {
4388c2ecf20Sopenharmony_ci			printf("%*snode @ %p bitnum %d nextbyte %d"
4398c2ecf20Sopenharmony_ci			       " left %p right %p mask %x bits %x\n",
4408c2ecf20Sopenharmony_ci				indent, "", node,
4418c2ecf20Sopenharmony_ci				node->bitnum, node->nextbyte,
4428c2ecf20Sopenharmony_ci				node->left, node->right,
4438c2ecf20Sopenharmony_ci				node->keymask, node->keybits);
4448c2ecf20Sopenharmony_ci			nodes += 1;
4458c2ecf20Sopenharmony_ci			if (!(node->left && node->right))
4468c2ecf20Sopenharmony_ci				singletons += 1;
4478c2ecf20Sopenharmony_ci
4488c2ecf20Sopenharmony_ci			while (node) {
4498c2ecf20Sopenharmony_ci				bitmask = 1 << node->bitnum;
4508c2ecf20Sopenharmony_ci				if ((leftmask & bitmask) == 0) {
4518c2ecf20Sopenharmony_ci					leftmask |= bitmask;
4528c2ecf20Sopenharmony_ci					if (node->leftnode == LEAF) {
4538c2ecf20Sopenharmony_ci						assert(node->left);
4548c2ecf20Sopenharmony_ci						tree->leaf_print(node->left,
4558c2ecf20Sopenharmony_ci								 indent+1);
4568c2ecf20Sopenharmony_ci						leaves += 1;
4578c2ecf20Sopenharmony_ci					} else if (node->left) {
4588c2ecf20Sopenharmony_ci						assert(node->leftnode == NODE);
4598c2ecf20Sopenharmony_ci						indent += 1;
4608c2ecf20Sopenharmony_ci						node = node->left;
4618c2ecf20Sopenharmony_ci						break;
4628c2ecf20Sopenharmony_ci					}
4638c2ecf20Sopenharmony_ci				}
4648c2ecf20Sopenharmony_ci				if ((rightmask & bitmask) == 0) {
4658c2ecf20Sopenharmony_ci					rightmask |= bitmask;
4668c2ecf20Sopenharmony_ci					if (node->rightnode == LEAF) {
4678c2ecf20Sopenharmony_ci						assert(node->right);
4688c2ecf20Sopenharmony_ci						tree->leaf_print(node->right,
4698c2ecf20Sopenharmony_ci								 indent+1);
4708c2ecf20Sopenharmony_ci						leaves += 1;
4718c2ecf20Sopenharmony_ci					} else if (node->right) {
4728c2ecf20Sopenharmony_ci						assert(node->rightnode == NODE);
4738c2ecf20Sopenharmony_ci						indent += 1;
4748c2ecf20Sopenharmony_ci						node = node->right;
4758c2ecf20Sopenharmony_ci						break;
4768c2ecf20Sopenharmony_ci					}
4778c2ecf20Sopenharmony_ci				}
4788c2ecf20Sopenharmony_ci				leftmask &= ~bitmask;
4798c2ecf20Sopenharmony_ci				rightmask &= ~bitmask;
4808c2ecf20Sopenharmony_ci				node = node->parent;
4818c2ecf20Sopenharmony_ci				indent -= 1;
4828c2ecf20Sopenharmony_ci			}
4838c2ecf20Sopenharmony_ci		}
4848c2ecf20Sopenharmony_ci	}
4858c2ecf20Sopenharmony_ci	printf("nodes %d leaves %d singletons %d\n",
4868c2ecf20Sopenharmony_ci	       nodes, leaves, singletons);
4878c2ecf20Sopenharmony_ci}
4888c2ecf20Sopenharmony_ci
4898c2ecf20Sopenharmony_ci/*
4908c2ecf20Sopenharmony_ci * Allocate an initialize a new internal node.
4918c2ecf20Sopenharmony_ci */
4928c2ecf20Sopenharmony_cistatic struct node *alloc_node(struct node *parent)
4938c2ecf20Sopenharmony_ci{
4948c2ecf20Sopenharmony_ci	struct node *node;
4958c2ecf20Sopenharmony_ci	int bitnum;
4968c2ecf20Sopenharmony_ci
4978c2ecf20Sopenharmony_ci	node = malloc(sizeof(*node));
4988c2ecf20Sopenharmony_ci	node->left = node->right = NULL;
4998c2ecf20Sopenharmony_ci	node->parent = parent;
5008c2ecf20Sopenharmony_ci	node->leftnode = NODE;
5018c2ecf20Sopenharmony_ci	node->rightnode = NODE;
5028c2ecf20Sopenharmony_ci	node->keybits = 0;
5038c2ecf20Sopenharmony_ci	node->keymask = 0;
5048c2ecf20Sopenharmony_ci	node->mark = 0;
5058c2ecf20Sopenharmony_ci	node->index = 0;
5068c2ecf20Sopenharmony_ci	node->offset = -1;
5078c2ecf20Sopenharmony_ci	node->size = 4;
5088c2ecf20Sopenharmony_ci
5098c2ecf20Sopenharmony_ci	if (node->parent) {
5108c2ecf20Sopenharmony_ci		bitnum = parent->bitnum;
5118c2ecf20Sopenharmony_ci		if ((bitnum & 7) == 0) {
5128c2ecf20Sopenharmony_ci			node->bitnum = bitnum + 7 + 8;
5138c2ecf20Sopenharmony_ci			node->nextbyte = 1;
5148c2ecf20Sopenharmony_ci		} else {
5158c2ecf20Sopenharmony_ci			node->bitnum = bitnum - 1;
5168c2ecf20Sopenharmony_ci			node->nextbyte = 0;
5178c2ecf20Sopenharmony_ci		}
5188c2ecf20Sopenharmony_ci	} else {
5198c2ecf20Sopenharmony_ci		node->bitnum = 7;
5208c2ecf20Sopenharmony_ci		node->nextbyte = 0;
5218c2ecf20Sopenharmony_ci	}
5228c2ecf20Sopenharmony_ci
5238c2ecf20Sopenharmony_ci	return node;
5248c2ecf20Sopenharmony_ci}
5258c2ecf20Sopenharmony_ci
5268c2ecf20Sopenharmony_ci/*
5278c2ecf20Sopenharmony_ci * Insert a new leaf into the tree, and collapse any subtrees that are
5288c2ecf20Sopenharmony_ci * fully populated and end in identical leaves. A nextbyte tagged
5298c2ecf20Sopenharmony_ci * internal node will not be removed to preserve the tree's integrity.
5308c2ecf20Sopenharmony_ci * Note that due to the structure of utf8, no nextbyte tagged node
5318c2ecf20Sopenharmony_ci * will be a candidate for removal.
5328c2ecf20Sopenharmony_ci */
5338c2ecf20Sopenharmony_cistatic int insert(struct tree *tree, char *key, int keylen, void *leaf)
5348c2ecf20Sopenharmony_ci{
5358c2ecf20Sopenharmony_ci	struct node *node;
5368c2ecf20Sopenharmony_ci	struct node *parent;
5378c2ecf20Sopenharmony_ci	void **cursor;
5388c2ecf20Sopenharmony_ci	int keybits;
5398c2ecf20Sopenharmony_ci
5408c2ecf20Sopenharmony_ci	assert(keylen >= 1 && keylen <= 4);
5418c2ecf20Sopenharmony_ci
5428c2ecf20Sopenharmony_ci	node = NULL;
5438c2ecf20Sopenharmony_ci	cursor = &tree->root;
5448c2ecf20Sopenharmony_ci	keybits = 8 * keylen;
5458c2ecf20Sopenharmony_ci
5468c2ecf20Sopenharmony_ci	/* Insert, creating path along the way. */
5478c2ecf20Sopenharmony_ci	while (keybits) {
5488c2ecf20Sopenharmony_ci		if (!*cursor)
5498c2ecf20Sopenharmony_ci			*cursor = alloc_node(node);
5508c2ecf20Sopenharmony_ci		node = *cursor;
5518c2ecf20Sopenharmony_ci		if (node->nextbyte)
5528c2ecf20Sopenharmony_ci			key++;
5538c2ecf20Sopenharmony_ci		if (*key & (1 << (node->bitnum & 7)))
5548c2ecf20Sopenharmony_ci			cursor = &node->right;
5558c2ecf20Sopenharmony_ci		else
5568c2ecf20Sopenharmony_ci			cursor = &node->left;
5578c2ecf20Sopenharmony_ci		keybits--;
5588c2ecf20Sopenharmony_ci	}
5598c2ecf20Sopenharmony_ci	*cursor = leaf;
5608c2ecf20Sopenharmony_ci
5618c2ecf20Sopenharmony_ci	/* Merge subtrees if possible. */
5628c2ecf20Sopenharmony_ci	while (node) {
5638c2ecf20Sopenharmony_ci		if (*key & (1 << (node->bitnum & 7)))
5648c2ecf20Sopenharmony_ci			node->rightnode = LEAF;
5658c2ecf20Sopenharmony_ci		else
5668c2ecf20Sopenharmony_ci			node->leftnode = LEAF;
5678c2ecf20Sopenharmony_ci		if (node->nextbyte)
5688c2ecf20Sopenharmony_ci			break;
5698c2ecf20Sopenharmony_ci		if (node->leftnode == NODE || node->rightnode == NODE)
5708c2ecf20Sopenharmony_ci			break;
5718c2ecf20Sopenharmony_ci		assert(node->left);
5728c2ecf20Sopenharmony_ci		assert(node->right);
5738c2ecf20Sopenharmony_ci		/* Compare */
5748c2ecf20Sopenharmony_ci		if (! tree->leaf_equal(node->left, node->right))
5758c2ecf20Sopenharmony_ci			break;
5768c2ecf20Sopenharmony_ci		/* Keep left, drop right leaf. */
5778c2ecf20Sopenharmony_ci		leaf = node->left;
5788c2ecf20Sopenharmony_ci		/* Check in parent */
5798c2ecf20Sopenharmony_ci		parent = node->parent;
5808c2ecf20Sopenharmony_ci		if (!parent) {
5818c2ecf20Sopenharmony_ci			/* root of tree! */
5828c2ecf20Sopenharmony_ci			tree->root = leaf;
5838c2ecf20Sopenharmony_ci			tree->childnode = LEAF;
5848c2ecf20Sopenharmony_ci		} else if (parent->left == node) {
5858c2ecf20Sopenharmony_ci			parent->left = leaf;
5868c2ecf20Sopenharmony_ci			parent->leftnode = LEAF;
5878c2ecf20Sopenharmony_ci			if (parent->right) {
5888c2ecf20Sopenharmony_ci				parent->keymask = 0;
5898c2ecf20Sopenharmony_ci				parent->keybits = 0;
5908c2ecf20Sopenharmony_ci			} else {
5918c2ecf20Sopenharmony_ci				parent->keymask |= (1 << node->bitnum);
5928c2ecf20Sopenharmony_ci			}
5938c2ecf20Sopenharmony_ci		} else if (parent->right == node) {
5948c2ecf20Sopenharmony_ci			parent->right = leaf;
5958c2ecf20Sopenharmony_ci			parent->rightnode = LEAF;
5968c2ecf20Sopenharmony_ci			if (parent->left) {
5978c2ecf20Sopenharmony_ci				parent->keymask = 0;
5988c2ecf20Sopenharmony_ci				parent->keybits = 0;
5998c2ecf20Sopenharmony_ci			} else {
6008c2ecf20Sopenharmony_ci				parent->keymask |= (1 << node->bitnum);
6018c2ecf20Sopenharmony_ci				parent->keybits |= (1 << node->bitnum);
6028c2ecf20Sopenharmony_ci			}
6038c2ecf20Sopenharmony_ci		} else {
6048c2ecf20Sopenharmony_ci			/* internal tree error */
6058c2ecf20Sopenharmony_ci			assert(0);
6068c2ecf20Sopenharmony_ci		}
6078c2ecf20Sopenharmony_ci		free(node);
6088c2ecf20Sopenharmony_ci		node = parent;
6098c2ecf20Sopenharmony_ci	}
6108c2ecf20Sopenharmony_ci
6118c2ecf20Sopenharmony_ci	/* Propagate keymasks up along singleton chains. */
6128c2ecf20Sopenharmony_ci	while (node) {
6138c2ecf20Sopenharmony_ci		parent = node->parent;
6148c2ecf20Sopenharmony_ci		if (!parent)
6158c2ecf20Sopenharmony_ci			break;
6168c2ecf20Sopenharmony_ci		/* Nix the mask for parents with two children. */
6178c2ecf20Sopenharmony_ci		if (node->keymask == 0) {
6188c2ecf20Sopenharmony_ci			parent->keymask = 0;
6198c2ecf20Sopenharmony_ci			parent->keybits = 0;
6208c2ecf20Sopenharmony_ci		} else if (parent->left && parent->right) {
6218c2ecf20Sopenharmony_ci			parent->keymask = 0;
6228c2ecf20Sopenharmony_ci			parent->keybits = 0;
6238c2ecf20Sopenharmony_ci		} else {
6248c2ecf20Sopenharmony_ci			assert((parent->keymask & node->keymask) == 0);
6258c2ecf20Sopenharmony_ci			parent->keymask |= node->keymask;
6268c2ecf20Sopenharmony_ci			parent->keymask |= (1 << parent->bitnum);
6278c2ecf20Sopenharmony_ci			parent->keybits |= node->keybits;
6288c2ecf20Sopenharmony_ci			if (parent->right)
6298c2ecf20Sopenharmony_ci				parent->keybits |= (1 << parent->bitnum);
6308c2ecf20Sopenharmony_ci		}
6318c2ecf20Sopenharmony_ci		node = parent;
6328c2ecf20Sopenharmony_ci	}
6338c2ecf20Sopenharmony_ci
6348c2ecf20Sopenharmony_ci	return 0;
6358c2ecf20Sopenharmony_ci}
6368c2ecf20Sopenharmony_ci
6378c2ecf20Sopenharmony_ci/*
6388c2ecf20Sopenharmony_ci * Prune internal nodes.
6398c2ecf20Sopenharmony_ci *
6408c2ecf20Sopenharmony_ci * Fully populated subtrees that end at the same leaf have already
6418c2ecf20Sopenharmony_ci * been collapsed.  There are still internal nodes that have for both
6428c2ecf20Sopenharmony_ci * their left and right branches a sequence of singletons that make
6438c2ecf20Sopenharmony_ci * identical choices and end in identical leaves.  The keymask and
6448c2ecf20Sopenharmony_ci * keybits collected in the nodes describe the choices made in these
6458c2ecf20Sopenharmony_ci * singleton chains.  When they are identical for the left and right
6468c2ecf20Sopenharmony_ci * branch of a node, and the two leaves comare identical, the node in
6478c2ecf20Sopenharmony_ci * question can be removed.
6488c2ecf20Sopenharmony_ci *
6498c2ecf20Sopenharmony_ci * Note that nodes with the nextbyte tag set will not be removed by
6508c2ecf20Sopenharmony_ci * this to ensure tree integrity.  Note as well that the structure of
6518c2ecf20Sopenharmony_ci * utf8 ensures that these nodes would not have been candidates for
6528c2ecf20Sopenharmony_ci * removal in any case.
6538c2ecf20Sopenharmony_ci */
6548c2ecf20Sopenharmony_cistatic void prune(struct tree *tree)
6558c2ecf20Sopenharmony_ci{
6568c2ecf20Sopenharmony_ci	struct node *node;
6578c2ecf20Sopenharmony_ci	struct node *left;
6588c2ecf20Sopenharmony_ci	struct node *right;
6598c2ecf20Sopenharmony_ci	struct node *parent;
6608c2ecf20Sopenharmony_ci	void *leftleaf;
6618c2ecf20Sopenharmony_ci	void *rightleaf;
6628c2ecf20Sopenharmony_ci	unsigned int leftmask;
6638c2ecf20Sopenharmony_ci	unsigned int rightmask;
6648c2ecf20Sopenharmony_ci	unsigned int bitmask;
6658c2ecf20Sopenharmony_ci	int count;
6668c2ecf20Sopenharmony_ci
6678c2ecf20Sopenharmony_ci	if (verbose > 0)
6688c2ecf20Sopenharmony_ci		printf("Pruning %s_%x\n", tree->type, tree->maxage);
6698c2ecf20Sopenharmony_ci
6708c2ecf20Sopenharmony_ci	count = 0;
6718c2ecf20Sopenharmony_ci	if (tree->childnode == LEAF)
6728c2ecf20Sopenharmony_ci		return;
6738c2ecf20Sopenharmony_ci	if (!tree->root)
6748c2ecf20Sopenharmony_ci		return;
6758c2ecf20Sopenharmony_ci
6768c2ecf20Sopenharmony_ci	leftmask = rightmask = 0;
6778c2ecf20Sopenharmony_ci	node = tree->root;
6788c2ecf20Sopenharmony_ci	while (node) {
6798c2ecf20Sopenharmony_ci		if (node->nextbyte)
6808c2ecf20Sopenharmony_ci			goto advance;
6818c2ecf20Sopenharmony_ci		if (node->leftnode == LEAF)
6828c2ecf20Sopenharmony_ci			goto advance;
6838c2ecf20Sopenharmony_ci		if (node->rightnode == LEAF)
6848c2ecf20Sopenharmony_ci			goto advance;
6858c2ecf20Sopenharmony_ci		if (!node->left)
6868c2ecf20Sopenharmony_ci			goto advance;
6878c2ecf20Sopenharmony_ci		if (!node->right)
6888c2ecf20Sopenharmony_ci			goto advance;
6898c2ecf20Sopenharmony_ci		left = node->left;
6908c2ecf20Sopenharmony_ci		right = node->right;
6918c2ecf20Sopenharmony_ci		if (left->keymask == 0)
6928c2ecf20Sopenharmony_ci			goto advance;
6938c2ecf20Sopenharmony_ci		if (right->keymask == 0)
6948c2ecf20Sopenharmony_ci			goto advance;
6958c2ecf20Sopenharmony_ci		if (left->keymask != right->keymask)
6968c2ecf20Sopenharmony_ci			goto advance;
6978c2ecf20Sopenharmony_ci		if (left->keybits != right->keybits)
6988c2ecf20Sopenharmony_ci			goto advance;
6998c2ecf20Sopenharmony_ci		leftleaf = NULL;
7008c2ecf20Sopenharmony_ci		while (!leftleaf) {
7018c2ecf20Sopenharmony_ci			assert(left->left || left->right);
7028c2ecf20Sopenharmony_ci			if (left->leftnode == LEAF)
7038c2ecf20Sopenharmony_ci				leftleaf = left->left;
7048c2ecf20Sopenharmony_ci			else if (left->rightnode == LEAF)
7058c2ecf20Sopenharmony_ci				leftleaf = left->right;
7068c2ecf20Sopenharmony_ci			else if (left->left)
7078c2ecf20Sopenharmony_ci				left = left->left;
7088c2ecf20Sopenharmony_ci			else if (left->right)
7098c2ecf20Sopenharmony_ci				left = left->right;
7108c2ecf20Sopenharmony_ci			else
7118c2ecf20Sopenharmony_ci				assert(0);
7128c2ecf20Sopenharmony_ci		}
7138c2ecf20Sopenharmony_ci		rightleaf = NULL;
7148c2ecf20Sopenharmony_ci		while (!rightleaf) {
7158c2ecf20Sopenharmony_ci			assert(right->left || right->right);
7168c2ecf20Sopenharmony_ci			if (right->leftnode == LEAF)
7178c2ecf20Sopenharmony_ci				rightleaf = right->left;
7188c2ecf20Sopenharmony_ci			else if (right->rightnode == LEAF)
7198c2ecf20Sopenharmony_ci				rightleaf = right->right;
7208c2ecf20Sopenharmony_ci			else if (right->left)
7218c2ecf20Sopenharmony_ci				right = right->left;
7228c2ecf20Sopenharmony_ci			else if (right->right)
7238c2ecf20Sopenharmony_ci				right = right->right;
7248c2ecf20Sopenharmony_ci			else
7258c2ecf20Sopenharmony_ci				assert(0);
7268c2ecf20Sopenharmony_ci		}
7278c2ecf20Sopenharmony_ci		if (! tree->leaf_equal(leftleaf, rightleaf))
7288c2ecf20Sopenharmony_ci			goto advance;
7298c2ecf20Sopenharmony_ci		/*
7308c2ecf20Sopenharmony_ci		 * This node has identical singleton-only subtrees.
7318c2ecf20Sopenharmony_ci		 * Remove it.
7328c2ecf20Sopenharmony_ci		 */
7338c2ecf20Sopenharmony_ci		parent = node->parent;
7348c2ecf20Sopenharmony_ci		left = node->left;
7358c2ecf20Sopenharmony_ci		right = node->right;
7368c2ecf20Sopenharmony_ci		if (parent->left == node)
7378c2ecf20Sopenharmony_ci			parent->left = left;
7388c2ecf20Sopenharmony_ci		else if (parent->right == node)
7398c2ecf20Sopenharmony_ci			parent->right = left;
7408c2ecf20Sopenharmony_ci		else
7418c2ecf20Sopenharmony_ci			assert(0);
7428c2ecf20Sopenharmony_ci		left->parent = parent;
7438c2ecf20Sopenharmony_ci		left->keymask |= (1 << node->bitnum);
7448c2ecf20Sopenharmony_ci		node->left = NULL;
7458c2ecf20Sopenharmony_ci		while (node) {
7468c2ecf20Sopenharmony_ci			bitmask = 1 << node->bitnum;
7478c2ecf20Sopenharmony_ci			leftmask &= ~bitmask;
7488c2ecf20Sopenharmony_ci			rightmask &= ~bitmask;
7498c2ecf20Sopenharmony_ci			if (node->leftnode == NODE && node->left) {
7508c2ecf20Sopenharmony_ci				left = node->left;
7518c2ecf20Sopenharmony_ci				free(node);
7528c2ecf20Sopenharmony_ci				count++;
7538c2ecf20Sopenharmony_ci				node = left;
7548c2ecf20Sopenharmony_ci			} else if (node->rightnode == NODE && node->right) {
7558c2ecf20Sopenharmony_ci				right = node->right;
7568c2ecf20Sopenharmony_ci				free(node);
7578c2ecf20Sopenharmony_ci				count++;
7588c2ecf20Sopenharmony_ci				node = right;
7598c2ecf20Sopenharmony_ci			} else {
7608c2ecf20Sopenharmony_ci				node = NULL;
7618c2ecf20Sopenharmony_ci			}
7628c2ecf20Sopenharmony_ci		}
7638c2ecf20Sopenharmony_ci		/* Propagate keymasks up along singleton chains. */
7648c2ecf20Sopenharmony_ci		node = parent;
7658c2ecf20Sopenharmony_ci		/* Force re-check */
7668c2ecf20Sopenharmony_ci		bitmask = 1 << node->bitnum;
7678c2ecf20Sopenharmony_ci		leftmask &= ~bitmask;
7688c2ecf20Sopenharmony_ci		rightmask &= ~bitmask;
7698c2ecf20Sopenharmony_ci		for (;;) {
7708c2ecf20Sopenharmony_ci			if (node->left && node->right)
7718c2ecf20Sopenharmony_ci				break;
7728c2ecf20Sopenharmony_ci			if (node->left) {
7738c2ecf20Sopenharmony_ci				left = node->left;
7748c2ecf20Sopenharmony_ci				node->keymask |= left->keymask;
7758c2ecf20Sopenharmony_ci				node->keybits |= left->keybits;
7768c2ecf20Sopenharmony_ci			}
7778c2ecf20Sopenharmony_ci			if (node->right) {
7788c2ecf20Sopenharmony_ci				right = node->right;
7798c2ecf20Sopenharmony_ci				node->keymask |= right->keymask;
7808c2ecf20Sopenharmony_ci				node->keybits |= right->keybits;
7818c2ecf20Sopenharmony_ci			}
7828c2ecf20Sopenharmony_ci			node->keymask |= (1 << node->bitnum);
7838c2ecf20Sopenharmony_ci			node = node->parent;
7848c2ecf20Sopenharmony_ci			/* Force re-check */
7858c2ecf20Sopenharmony_ci			bitmask = 1 << node->bitnum;
7868c2ecf20Sopenharmony_ci			leftmask &= ~bitmask;
7878c2ecf20Sopenharmony_ci			rightmask &= ~bitmask;
7888c2ecf20Sopenharmony_ci		}
7898c2ecf20Sopenharmony_ci	advance:
7908c2ecf20Sopenharmony_ci		bitmask = 1 << node->bitnum;
7918c2ecf20Sopenharmony_ci		if ((leftmask & bitmask) == 0 &&
7928c2ecf20Sopenharmony_ci		    node->leftnode == NODE &&
7938c2ecf20Sopenharmony_ci		    node->left) {
7948c2ecf20Sopenharmony_ci			leftmask |= bitmask;
7958c2ecf20Sopenharmony_ci			node = node->left;
7968c2ecf20Sopenharmony_ci		} else if ((rightmask & bitmask) == 0 &&
7978c2ecf20Sopenharmony_ci			   node->rightnode == NODE &&
7988c2ecf20Sopenharmony_ci			   node->right) {
7998c2ecf20Sopenharmony_ci			rightmask |= bitmask;
8008c2ecf20Sopenharmony_ci			node = node->right;
8018c2ecf20Sopenharmony_ci		} else {
8028c2ecf20Sopenharmony_ci			leftmask &= ~bitmask;
8038c2ecf20Sopenharmony_ci			rightmask &= ~bitmask;
8048c2ecf20Sopenharmony_ci			node = node->parent;
8058c2ecf20Sopenharmony_ci		}
8068c2ecf20Sopenharmony_ci	}
8078c2ecf20Sopenharmony_ci	if (verbose > 0)
8088c2ecf20Sopenharmony_ci		printf("Pruned %d nodes\n", count);
8098c2ecf20Sopenharmony_ci}
8108c2ecf20Sopenharmony_ci
8118c2ecf20Sopenharmony_ci/*
8128c2ecf20Sopenharmony_ci * Mark the nodes in the tree that lead to leaves that must be
8138c2ecf20Sopenharmony_ci * emitted.
8148c2ecf20Sopenharmony_ci */
8158c2ecf20Sopenharmony_cistatic void mark_nodes(struct tree *tree)
8168c2ecf20Sopenharmony_ci{
8178c2ecf20Sopenharmony_ci	struct node *node;
8188c2ecf20Sopenharmony_ci	struct node *n;
8198c2ecf20Sopenharmony_ci	unsigned int leftmask;
8208c2ecf20Sopenharmony_ci	unsigned int rightmask;
8218c2ecf20Sopenharmony_ci	unsigned int bitmask;
8228c2ecf20Sopenharmony_ci	int marked;
8238c2ecf20Sopenharmony_ci
8248c2ecf20Sopenharmony_ci	marked = 0;
8258c2ecf20Sopenharmony_ci	if (verbose > 0)
8268c2ecf20Sopenharmony_ci		printf("Marking %s_%x\n", tree->type, tree->maxage);
8278c2ecf20Sopenharmony_ci	if (tree->childnode == LEAF)
8288c2ecf20Sopenharmony_ci		goto done;
8298c2ecf20Sopenharmony_ci
8308c2ecf20Sopenharmony_ci	assert(tree->childnode == NODE);
8318c2ecf20Sopenharmony_ci	node = tree->root;
8328c2ecf20Sopenharmony_ci	leftmask = rightmask = 0;
8338c2ecf20Sopenharmony_ci	while (node) {
8348c2ecf20Sopenharmony_ci		bitmask = 1 << node->bitnum;
8358c2ecf20Sopenharmony_ci		if ((leftmask & bitmask) == 0) {
8368c2ecf20Sopenharmony_ci			leftmask |= bitmask;
8378c2ecf20Sopenharmony_ci			if (node->leftnode == LEAF) {
8388c2ecf20Sopenharmony_ci				assert(node->left);
8398c2ecf20Sopenharmony_ci				if (tree->leaf_mark(node->left)) {
8408c2ecf20Sopenharmony_ci					n = node;
8418c2ecf20Sopenharmony_ci					while (n && !n->mark) {
8428c2ecf20Sopenharmony_ci						marked++;
8438c2ecf20Sopenharmony_ci						n->mark = 1;
8448c2ecf20Sopenharmony_ci						n = n->parent;
8458c2ecf20Sopenharmony_ci					}
8468c2ecf20Sopenharmony_ci				}
8478c2ecf20Sopenharmony_ci			} else if (node->left) {
8488c2ecf20Sopenharmony_ci				assert(node->leftnode == NODE);
8498c2ecf20Sopenharmony_ci				node = node->left;
8508c2ecf20Sopenharmony_ci				continue;
8518c2ecf20Sopenharmony_ci			}
8528c2ecf20Sopenharmony_ci		}
8538c2ecf20Sopenharmony_ci		if ((rightmask & bitmask) == 0) {
8548c2ecf20Sopenharmony_ci			rightmask |= bitmask;
8558c2ecf20Sopenharmony_ci			if (node->rightnode == LEAF) {
8568c2ecf20Sopenharmony_ci				assert(node->right);
8578c2ecf20Sopenharmony_ci				if (tree->leaf_mark(node->right)) {
8588c2ecf20Sopenharmony_ci					n = node;
8598c2ecf20Sopenharmony_ci					while (n && !n->mark) {
8608c2ecf20Sopenharmony_ci						marked++;
8618c2ecf20Sopenharmony_ci						n->mark = 1;
8628c2ecf20Sopenharmony_ci						n = n->parent;
8638c2ecf20Sopenharmony_ci					}
8648c2ecf20Sopenharmony_ci				}
8658c2ecf20Sopenharmony_ci			} else if (node->right) {
8668c2ecf20Sopenharmony_ci				assert(node->rightnode == NODE);
8678c2ecf20Sopenharmony_ci				node = node->right;
8688c2ecf20Sopenharmony_ci				continue;
8698c2ecf20Sopenharmony_ci			}
8708c2ecf20Sopenharmony_ci		}
8718c2ecf20Sopenharmony_ci		leftmask &= ~bitmask;
8728c2ecf20Sopenharmony_ci		rightmask &= ~bitmask;
8738c2ecf20Sopenharmony_ci		node = node->parent;
8748c2ecf20Sopenharmony_ci	}
8758c2ecf20Sopenharmony_ci
8768c2ecf20Sopenharmony_ci	/* second pass: left siblings and singletons */
8778c2ecf20Sopenharmony_ci
8788c2ecf20Sopenharmony_ci	assert(tree->childnode == NODE);
8798c2ecf20Sopenharmony_ci	node = tree->root;
8808c2ecf20Sopenharmony_ci	leftmask = rightmask = 0;
8818c2ecf20Sopenharmony_ci	while (node) {
8828c2ecf20Sopenharmony_ci		bitmask = 1 << node->bitnum;
8838c2ecf20Sopenharmony_ci		if ((leftmask & bitmask) == 0) {
8848c2ecf20Sopenharmony_ci			leftmask |= bitmask;
8858c2ecf20Sopenharmony_ci			if (node->leftnode == LEAF) {
8868c2ecf20Sopenharmony_ci				assert(node->left);
8878c2ecf20Sopenharmony_ci				if (tree->leaf_mark(node->left)) {
8888c2ecf20Sopenharmony_ci					n = node;
8898c2ecf20Sopenharmony_ci					while (n && !n->mark) {
8908c2ecf20Sopenharmony_ci						marked++;
8918c2ecf20Sopenharmony_ci						n->mark = 1;
8928c2ecf20Sopenharmony_ci						n = n->parent;
8938c2ecf20Sopenharmony_ci					}
8948c2ecf20Sopenharmony_ci				}
8958c2ecf20Sopenharmony_ci			} else if (node->left) {
8968c2ecf20Sopenharmony_ci				assert(node->leftnode == NODE);
8978c2ecf20Sopenharmony_ci				node = node->left;
8988c2ecf20Sopenharmony_ci				if (!node->mark && node->parent->mark) {
8998c2ecf20Sopenharmony_ci					marked++;
9008c2ecf20Sopenharmony_ci					node->mark = 1;
9018c2ecf20Sopenharmony_ci				}
9028c2ecf20Sopenharmony_ci				continue;
9038c2ecf20Sopenharmony_ci			}
9048c2ecf20Sopenharmony_ci		}
9058c2ecf20Sopenharmony_ci		if ((rightmask & bitmask) == 0) {
9068c2ecf20Sopenharmony_ci			rightmask |= bitmask;
9078c2ecf20Sopenharmony_ci			if (node->rightnode == LEAF) {
9088c2ecf20Sopenharmony_ci				assert(node->right);
9098c2ecf20Sopenharmony_ci				if (tree->leaf_mark(node->right)) {
9108c2ecf20Sopenharmony_ci					n = node;
9118c2ecf20Sopenharmony_ci					while (n && !n->mark) {
9128c2ecf20Sopenharmony_ci						marked++;
9138c2ecf20Sopenharmony_ci						n->mark = 1;
9148c2ecf20Sopenharmony_ci						n = n->parent;
9158c2ecf20Sopenharmony_ci					}
9168c2ecf20Sopenharmony_ci				}
9178c2ecf20Sopenharmony_ci			} else if (node->right) {
9188c2ecf20Sopenharmony_ci				assert(node->rightnode == NODE);
9198c2ecf20Sopenharmony_ci				node = node->right;
9208c2ecf20Sopenharmony_ci				if (!node->mark && node->parent->mark &&
9218c2ecf20Sopenharmony_ci				    !node->parent->left) {
9228c2ecf20Sopenharmony_ci					marked++;
9238c2ecf20Sopenharmony_ci					node->mark = 1;
9248c2ecf20Sopenharmony_ci				}
9258c2ecf20Sopenharmony_ci				continue;
9268c2ecf20Sopenharmony_ci			}
9278c2ecf20Sopenharmony_ci		}
9288c2ecf20Sopenharmony_ci		leftmask &= ~bitmask;
9298c2ecf20Sopenharmony_ci		rightmask &= ~bitmask;
9308c2ecf20Sopenharmony_ci		node = node->parent;
9318c2ecf20Sopenharmony_ci	}
9328c2ecf20Sopenharmony_cidone:
9338c2ecf20Sopenharmony_ci	if (verbose > 0)
9348c2ecf20Sopenharmony_ci		printf("Marked %d nodes\n", marked);
9358c2ecf20Sopenharmony_ci}
9368c2ecf20Sopenharmony_ci
9378c2ecf20Sopenharmony_ci/*
9388c2ecf20Sopenharmony_ci * Compute the index of each node and leaf, which is the offset in the
9398c2ecf20Sopenharmony_ci * emitted trie.  These values must be pre-computed because relative
9408c2ecf20Sopenharmony_ci * offsets between nodes are used to navigate the tree.
9418c2ecf20Sopenharmony_ci */
9428c2ecf20Sopenharmony_cistatic int index_nodes(struct tree *tree, int index)
9438c2ecf20Sopenharmony_ci{
9448c2ecf20Sopenharmony_ci	struct node *node;
9458c2ecf20Sopenharmony_ci	unsigned int leftmask;
9468c2ecf20Sopenharmony_ci	unsigned int rightmask;
9478c2ecf20Sopenharmony_ci	unsigned int bitmask;
9488c2ecf20Sopenharmony_ci	int count;
9498c2ecf20Sopenharmony_ci	int indent;
9508c2ecf20Sopenharmony_ci
9518c2ecf20Sopenharmony_ci	/* Align to a cache line (or half a cache line?). */
9528c2ecf20Sopenharmony_ci	while (index % 64)
9538c2ecf20Sopenharmony_ci		index++;
9548c2ecf20Sopenharmony_ci	tree->index = index;
9558c2ecf20Sopenharmony_ci	indent = 1;
9568c2ecf20Sopenharmony_ci	count = 0;
9578c2ecf20Sopenharmony_ci
9588c2ecf20Sopenharmony_ci	if (verbose > 0)
9598c2ecf20Sopenharmony_ci		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
9608c2ecf20Sopenharmony_ci	if (tree->childnode == LEAF) {
9618c2ecf20Sopenharmony_ci		index += tree->leaf_size(tree->root);
9628c2ecf20Sopenharmony_ci		goto done;
9638c2ecf20Sopenharmony_ci	}
9648c2ecf20Sopenharmony_ci
9658c2ecf20Sopenharmony_ci	assert(tree->childnode == NODE);
9668c2ecf20Sopenharmony_ci	node = tree->root;
9678c2ecf20Sopenharmony_ci	leftmask = rightmask = 0;
9688c2ecf20Sopenharmony_ci	while (node) {
9698c2ecf20Sopenharmony_ci		if (!node->mark)
9708c2ecf20Sopenharmony_ci			goto skip;
9718c2ecf20Sopenharmony_ci		count++;
9728c2ecf20Sopenharmony_ci		if (node->index != index)
9738c2ecf20Sopenharmony_ci			node->index = index;
9748c2ecf20Sopenharmony_ci		index += node->size;
9758c2ecf20Sopenharmony_ciskip:
9768c2ecf20Sopenharmony_ci		while (node) {
9778c2ecf20Sopenharmony_ci			bitmask = 1 << node->bitnum;
9788c2ecf20Sopenharmony_ci			if (node->mark && (leftmask & bitmask) == 0) {
9798c2ecf20Sopenharmony_ci				leftmask |= bitmask;
9808c2ecf20Sopenharmony_ci				if (node->leftnode == LEAF) {
9818c2ecf20Sopenharmony_ci					assert(node->left);
9828c2ecf20Sopenharmony_ci					*tree->leaf_index(tree, node->left) =
9838c2ecf20Sopenharmony_ci									index;
9848c2ecf20Sopenharmony_ci					index += tree->leaf_size(node->left);
9858c2ecf20Sopenharmony_ci					count++;
9868c2ecf20Sopenharmony_ci				} else if (node->left) {
9878c2ecf20Sopenharmony_ci					assert(node->leftnode == NODE);
9888c2ecf20Sopenharmony_ci					indent += 1;
9898c2ecf20Sopenharmony_ci					node = node->left;
9908c2ecf20Sopenharmony_ci					break;
9918c2ecf20Sopenharmony_ci				}
9928c2ecf20Sopenharmony_ci			}
9938c2ecf20Sopenharmony_ci			if (node->mark && (rightmask & bitmask) == 0) {
9948c2ecf20Sopenharmony_ci				rightmask |= bitmask;
9958c2ecf20Sopenharmony_ci				if (node->rightnode == LEAF) {
9968c2ecf20Sopenharmony_ci					assert(node->right);
9978c2ecf20Sopenharmony_ci					*tree->leaf_index(tree, node->right) = index;
9988c2ecf20Sopenharmony_ci					index += tree->leaf_size(node->right);
9998c2ecf20Sopenharmony_ci					count++;
10008c2ecf20Sopenharmony_ci				} else if (node->right) {
10018c2ecf20Sopenharmony_ci					assert(node->rightnode == NODE);
10028c2ecf20Sopenharmony_ci					indent += 1;
10038c2ecf20Sopenharmony_ci					node = node->right;
10048c2ecf20Sopenharmony_ci					break;
10058c2ecf20Sopenharmony_ci				}
10068c2ecf20Sopenharmony_ci			}
10078c2ecf20Sopenharmony_ci			leftmask &= ~bitmask;
10088c2ecf20Sopenharmony_ci			rightmask &= ~bitmask;
10098c2ecf20Sopenharmony_ci			node = node->parent;
10108c2ecf20Sopenharmony_ci			indent -= 1;
10118c2ecf20Sopenharmony_ci		}
10128c2ecf20Sopenharmony_ci	}
10138c2ecf20Sopenharmony_cidone:
10148c2ecf20Sopenharmony_ci	/* Round up to a multiple of 16 */
10158c2ecf20Sopenharmony_ci	while (index % 16)
10168c2ecf20Sopenharmony_ci		index++;
10178c2ecf20Sopenharmony_ci	if (verbose > 0)
10188c2ecf20Sopenharmony_ci		printf("Final index %d\n", index);
10198c2ecf20Sopenharmony_ci	return index;
10208c2ecf20Sopenharmony_ci}
10218c2ecf20Sopenharmony_ci
10228c2ecf20Sopenharmony_ci/*
10238c2ecf20Sopenharmony_ci * Mark the nodes in a subtree, helper for size_nodes().
10248c2ecf20Sopenharmony_ci */
10258c2ecf20Sopenharmony_cistatic int mark_subtree(struct node *node)
10268c2ecf20Sopenharmony_ci{
10278c2ecf20Sopenharmony_ci	int changed;
10288c2ecf20Sopenharmony_ci
10298c2ecf20Sopenharmony_ci	if (!node || node->mark)
10308c2ecf20Sopenharmony_ci		return 0;
10318c2ecf20Sopenharmony_ci	node->mark = 1;
10328c2ecf20Sopenharmony_ci	node->index = node->parent->index;
10338c2ecf20Sopenharmony_ci	changed = 1;
10348c2ecf20Sopenharmony_ci	if (node->leftnode == NODE)
10358c2ecf20Sopenharmony_ci		changed += mark_subtree(node->left);
10368c2ecf20Sopenharmony_ci	if (node->rightnode == NODE)
10378c2ecf20Sopenharmony_ci		changed += mark_subtree(node->right);
10388c2ecf20Sopenharmony_ci	return changed;
10398c2ecf20Sopenharmony_ci}
10408c2ecf20Sopenharmony_ci
10418c2ecf20Sopenharmony_ci/*
10428c2ecf20Sopenharmony_ci * Compute the size of nodes and leaves. We start by assuming that
10438c2ecf20Sopenharmony_ci * each node needs to store a three-byte offset. The indexes of the
10448c2ecf20Sopenharmony_ci * nodes are calculated based on that, and then this function is
10458c2ecf20Sopenharmony_ci * called to see if the sizes of some nodes can be reduced.  This is
10468c2ecf20Sopenharmony_ci * repeated until no more changes are seen.
10478c2ecf20Sopenharmony_ci */
10488c2ecf20Sopenharmony_cistatic int size_nodes(struct tree *tree)
10498c2ecf20Sopenharmony_ci{
10508c2ecf20Sopenharmony_ci	struct tree *next;
10518c2ecf20Sopenharmony_ci	struct node *node;
10528c2ecf20Sopenharmony_ci	struct node *right;
10538c2ecf20Sopenharmony_ci	struct node *n;
10548c2ecf20Sopenharmony_ci	unsigned int leftmask;
10558c2ecf20Sopenharmony_ci	unsigned int rightmask;
10568c2ecf20Sopenharmony_ci	unsigned int bitmask;
10578c2ecf20Sopenharmony_ci	unsigned int pathbits;
10588c2ecf20Sopenharmony_ci	unsigned int pathmask;
10598c2ecf20Sopenharmony_ci	unsigned int nbit;
10608c2ecf20Sopenharmony_ci	int changed;
10618c2ecf20Sopenharmony_ci	int offset;
10628c2ecf20Sopenharmony_ci	int size;
10638c2ecf20Sopenharmony_ci	int indent;
10648c2ecf20Sopenharmony_ci
10658c2ecf20Sopenharmony_ci	indent = 1;
10668c2ecf20Sopenharmony_ci	changed = 0;
10678c2ecf20Sopenharmony_ci	size = 0;
10688c2ecf20Sopenharmony_ci
10698c2ecf20Sopenharmony_ci	if (verbose > 0)
10708c2ecf20Sopenharmony_ci		printf("Sizing %s_%x\n", tree->type, tree->maxage);
10718c2ecf20Sopenharmony_ci	if (tree->childnode == LEAF)
10728c2ecf20Sopenharmony_ci		goto done;
10738c2ecf20Sopenharmony_ci
10748c2ecf20Sopenharmony_ci	assert(tree->childnode == NODE);
10758c2ecf20Sopenharmony_ci	pathbits = 0;
10768c2ecf20Sopenharmony_ci	pathmask = 0;
10778c2ecf20Sopenharmony_ci	node = tree->root;
10788c2ecf20Sopenharmony_ci	leftmask = rightmask = 0;
10798c2ecf20Sopenharmony_ci	while (node) {
10808c2ecf20Sopenharmony_ci		if (!node->mark)
10818c2ecf20Sopenharmony_ci			goto skip;
10828c2ecf20Sopenharmony_ci		offset = 0;
10838c2ecf20Sopenharmony_ci		if (!node->left || !node->right) {
10848c2ecf20Sopenharmony_ci			size = 1;
10858c2ecf20Sopenharmony_ci		} else {
10868c2ecf20Sopenharmony_ci			if (node->rightnode == NODE) {
10878c2ecf20Sopenharmony_ci				/*
10888c2ecf20Sopenharmony_ci				 * If the right node is not marked,
10898c2ecf20Sopenharmony_ci				 * look for a corresponding node in
10908c2ecf20Sopenharmony_ci				 * the next tree.  Such a node need
10918c2ecf20Sopenharmony_ci				 * not exist.
10928c2ecf20Sopenharmony_ci				 */
10938c2ecf20Sopenharmony_ci				right = node->right;
10948c2ecf20Sopenharmony_ci				next = tree->next;
10958c2ecf20Sopenharmony_ci				while (!right->mark) {
10968c2ecf20Sopenharmony_ci					assert(next);
10978c2ecf20Sopenharmony_ci					n = next->root;
10988c2ecf20Sopenharmony_ci					while (n->bitnum != node->bitnum) {
10998c2ecf20Sopenharmony_ci						nbit = 1 << n->bitnum;
11008c2ecf20Sopenharmony_ci						if (!(pathmask & nbit))
11018c2ecf20Sopenharmony_ci							break;
11028c2ecf20Sopenharmony_ci						if (pathbits & nbit) {
11038c2ecf20Sopenharmony_ci							if (n->rightnode == LEAF)
11048c2ecf20Sopenharmony_ci								break;
11058c2ecf20Sopenharmony_ci							n = n->right;
11068c2ecf20Sopenharmony_ci						} else {
11078c2ecf20Sopenharmony_ci							if (n->leftnode == LEAF)
11088c2ecf20Sopenharmony_ci								break;
11098c2ecf20Sopenharmony_ci							n = n->left;
11108c2ecf20Sopenharmony_ci						}
11118c2ecf20Sopenharmony_ci					}
11128c2ecf20Sopenharmony_ci					if (n->bitnum != node->bitnum)
11138c2ecf20Sopenharmony_ci						break;
11148c2ecf20Sopenharmony_ci					n = n->right;
11158c2ecf20Sopenharmony_ci					right = n;
11168c2ecf20Sopenharmony_ci					next = next->next;
11178c2ecf20Sopenharmony_ci				}
11188c2ecf20Sopenharmony_ci				/* Make sure the right node is marked. */
11198c2ecf20Sopenharmony_ci				if (!right->mark)
11208c2ecf20Sopenharmony_ci					changed += mark_subtree(right);
11218c2ecf20Sopenharmony_ci				offset = right->index - node->index;
11228c2ecf20Sopenharmony_ci			} else {
11238c2ecf20Sopenharmony_ci				offset = *tree->leaf_index(tree, node->right);
11248c2ecf20Sopenharmony_ci				offset -= node->index;
11258c2ecf20Sopenharmony_ci			}
11268c2ecf20Sopenharmony_ci			assert(offset >= 0);
11278c2ecf20Sopenharmony_ci			assert(offset <= 0xffffff);
11288c2ecf20Sopenharmony_ci			if (offset <= 0xff) {
11298c2ecf20Sopenharmony_ci				size = 2;
11308c2ecf20Sopenharmony_ci			} else if (offset <= 0xffff) {
11318c2ecf20Sopenharmony_ci				size = 3;
11328c2ecf20Sopenharmony_ci			} else { /* offset <= 0xffffff */
11338c2ecf20Sopenharmony_ci				size = 4;
11348c2ecf20Sopenharmony_ci			}
11358c2ecf20Sopenharmony_ci		}
11368c2ecf20Sopenharmony_ci		if (node->size != size || node->offset != offset) {
11378c2ecf20Sopenharmony_ci			node->size = size;
11388c2ecf20Sopenharmony_ci			node->offset = offset;
11398c2ecf20Sopenharmony_ci			changed++;
11408c2ecf20Sopenharmony_ci		}
11418c2ecf20Sopenharmony_ciskip:
11428c2ecf20Sopenharmony_ci		while (node) {
11438c2ecf20Sopenharmony_ci			bitmask = 1 << node->bitnum;
11448c2ecf20Sopenharmony_ci			pathmask |= bitmask;
11458c2ecf20Sopenharmony_ci			if (node->mark && (leftmask & bitmask) == 0) {
11468c2ecf20Sopenharmony_ci				leftmask |= bitmask;
11478c2ecf20Sopenharmony_ci				if (node->leftnode == LEAF) {
11488c2ecf20Sopenharmony_ci					assert(node->left);
11498c2ecf20Sopenharmony_ci				} else if (node->left) {
11508c2ecf20Sopenharmony_ci					assert(node->leftnode == NODE);
11518c2ecf20Sopenharmony_ci					indent += 1;
11528c2ecf20Sopenharmony_ci					node = node->left;
11538c2ecf20Sopenharmony_ci					break;
11548c2ecf20Sopenharmony_ci				}
11558c2ecf20Sopenharmony_ci			}
11568c2ecf20Sopenharmony_ci			if (node->mark && (rightmask & bitmask) == 0) {
11578c2ecf20Sopenharmony_ci				rightmask |= bitmask;
11588c2ecf20Sopenharmony_ci				pathbits |= bitmask;
11598c2ecf20Sopenharmony_ci				if (node->rightnode == LEAF) {
11608c2ecf20Sopenharmony_ci					assert(node->right);
11618c2ecf20Sopenharmony_ci				} else if (node->right) {
11628c2ecf20Sopenharmony_ci					assert(node->rightnode == NODE);
11638c2ecf20Sopenharmony_ci					indent += 1;
11648c2ecf20Sopenharmony_ci					node = node->right;
11658c2ecf20Sopenharmony_ci					break;
11668c2ecf20Sopenharmony_ci				}
11678c2ecf20Sopenharmony_ci			}
11688c2ecf20Sopenharmony_ci			leftmask &= ~bitmask;
11698c2ecf20Sopenharmony_ci			rightmask &= ~bitmask;
11708c2ecf20Sopenharmony_ci			pathmask &= ~bitmask;
11718c2ecf20Sopenharmony_ci			pathbits &= ~bitmask;
11728c2ecf20Sopenharmony_ci			node = node->parent;
11738c2ecf20Sopenharmony_ci			indent -= 1;
11748c2ecf20Sopenharmony_ci		}
11758c2ecf20Sopenharmony_ci	}
11768c2ecf20Sopenharmony_cidone:
11778c2ecf20Sopenharmony_ci	if (verbose > 0)
11788c2ecf20Sopenharmony_ci		printf("Found %d changes\n", changed);
11798c2ecf20Sopenharmony_ci	return changed;
11808c2ecf20Sopenharmony_ci}
11818c2ecf20Sopenharmony_ci
11828c2ecf20Sopenharmony_ci/*
11838c2ecf20Sopenharmony_ci * Emit a trie for the given tree into the data array.
11848c2ecf20Sopenharmony_ci */
11858c2ecf20Sopenharmony_cistatic void emit(struct tree *tree, unsigned char *data)
11868c2ecf20Sopenharmony_ci{
11878c2ecf20Sopenharmony_ci	struct node *node;
11888c2ecf20Sopenharmony_ci	unsigned int leftmask;
11898c2ecf20Sopenharmony_ci	unsigned int rightmask;
11908c2ecf20Sopenharmony_ci	unsigned int bitmask;
11918c2ecf20Sopenharmony_ci	int offlen;
11928c2ecf20Sopenharmony_ci	int offset;
11938c2ecf20Sopenharmony_ci	int index;
11948c2ecf20Sopenharmony_ci	int indent;
11958c2ecf20Sopenharmony_ci	int size;
11968c2ecf20Sopenharmony_ci	int bytes;
11978c2ecf20Sopenharmony_ci	int leaves;
11988c2ecf20Sopenharmony_ci	int nodes[4];
11998c2ecf20Sopenharmony_ci	unsigned char byte;
12008c2ecf20Sopenharmony_ci
12018c2ecf20Sopenharmony_ci	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
12028c2ecf20Sopenharmony_ci	leaves = 0;
12038c2ecf20Sopenharmony_ci	bytes = 0;
12048c2ecf20Sopenharmony_ci	index = tree->index;
12058c2ecf20Sopenharmony_ci	data += index;
12068c2ecf20Sopenharmony_ci	indent = 1;
12078c2ecf20Sopenharmony_ci	if (verbose > 0)
12088c2ecf20Sopenharmony_ci		printf("Emitting %s_%x\n", tree->type, tree->maxage);
12098c2ecf20Sopenharmony_ci	if (tree->childnode == LEAF) {
12108c2ecf20Sopenharmony_ci		assert(tree->root);
12118c2ecf20Sopenharmony_ci		tree->leaf_emit(tree->root, data);
12128c2ecf20Sopenharmony_ci		size = tree->leaf_size(tree->root);
12138c2ecf20Sopenharmony_ci		index += size;
12148c2ecf20Sopenharmony_ci		leaves++;
12158c2ecf20Sopenharmony_ci		goto done;
12168c2ecf20Sopenharmony_ci	}
12178c2ecf20Sopenharmony_ci
12188c2ecf20Sopenharmony_ci	assert(tree->childnode == NODE);
12198c2ecf20Sopenharmony_ci	node = tree->root;
12208c2ecf20Sopenharmony_ci	leftmask = rightmask = 0;
12218c2ecf20Sopenharmony_ci	while (node) {
12228c2ecf20Sopenharmony_ci		if (!node->mark)
12238c2ecf20Sopenharmony_ci			goto skip;
12248c2ecf20Sopenharmony_ci		assert(node->offset != -1);
12258c2ecf20Sopenharmony_ci		assert(node->index == index);
12268c2ecf20Sopenharmony_ci
12278c2ecf20Sopenharmony_ci		byte = 0;
12288c2ecf20Sopenharmony_ci		if (node->nextbyte)
12298c2ecf20Sopenharmony_ci			byte |= NEXTBYTE;
12308c2ecf20Sopenharmony_ci		byte |= (node->bitnum & BITNUM);
12318c2ecf20Sopenharmony_ci		if (node->left && node->right) {
12328c2ecf20Sopenharmony_ci			if (node->leftnode == NODE)
12338c2ecf20Sopenharmony_ci				byte |= LEFTNODE;
12348c2ecf20Sopenharmony_ci			if (node->rightnode == NODE)
12358c2ecf20Sopenharmony_ci				byte |= RIGHTNODE;
12368c2ecf20Sopenharmony_ci			if (node->offset <= 0xff)
12378c2ecf20Sopenharmony_ci				offlen = 1;
12388c2ecf20Sopenharmony_ci			else if (node->offset <= 0xffff)
12398c2ecf20Sopenharmony_ci				offlen = 2;
12408c2ecf20Sopenharmony_ci			else
12418c2ecf20Sopenharmony_ci				offlen = 3;
12428c2ecf20Sopenharmony_ci			nodes[offlen]++;
12438c2ecf20Sopenharmony_ci			offset = node->offset;
12448c2ecf20Sopenharmony_ci			byte |= offlen << OFFLEN_SHIFT;
12458c2ecf20Sopenharmony_ci			*data++ = byte;
12468c2ecf20Sopenharmony_ci			index++;
12478c2ecf20Sopenharmony_ci			while (offlen--) {
12488c2ecf20Sopenharmony_ci				*data++ = offset & 0xff;
12498c2ecf20Sopenharmony_ci				index++;
12508c2ecf20Sopenharmony_ci				offset >>= 8;
12518c2ecf20Sopenharmony_ci			}
12528c2ecf20Sopenharmony_ci		} else if (node->left) {
12538c2ecf20Sopenharmony_ci			if (node->leftnode == NODE)
12548c2ecf20Sopenharmony_ci				byte |= TRIENODE;
12558c2ecf20Sopenharmony_ci			nodes[0]++;
12568c2ecf20Sopenharmony_ci			*data++ = byte;
12578c2ecf20Sopenharmony_ci			index++;
12588c2ecf20Sopenharmony_ci		} else if (node->right) {
12598c2ecf20Sopenharmony_ci			byte |= RIGHTNODE;
12608c2ecf20Sopenharmony_ci			if (node->rightnode == NODE)
12618c2ecf20Sopenharmony_ci				byte |= TRIENODE;
12628c2ecf20Sopenharmony_ci			nodes[0]++;
12638c2ecf20Sopenharmony_ci			*data++ = byte;
12648c2ecf20Sopenharmony_ci			index++;
12658c2ecf20Sopenharmony_ci		} else {
12668c2ecf20Sopenharmony_ci			assert(0);
12678c2ecf20Sopenharmony_ci		}
12688c2ecf20Sopenharmony_ciskip:
12698c2ecf20Sopenharmony_ci		while (node) {
12708c2ecf20Sopenharmony_ci			bitmask = 1 << node->bitnum;
12718c2ecf20Sopenharmony_ci			if (node->mark && (leftmask & bitmask) == 0) {
12728c2ecf20Sopenharmony_ci				leftmask |= bitmask;
12738c2ecf20Sopenharmony_ci				if (node->leftnode == LEAF) {
12748c2ecf20Sopenharmony_ci					assert(node->left);
12758c2ecf20Sopenharmony_ci					data = tree->leaf_emit(node->left,
12768c2ecf20Sopenharmony_ci							       data);
12778c2ecf20Sopenharmony_ci					size = tree->leaf_size(node->left);
12788c2ecf20Sopenharmony_ci					index += size;
12798c2ecf20Sopenharmony_ci					bytes += size;
12808c2ecf20Sopenharmony_ci					leaves++;
12818c2ecf20Sopenharmony_ci				} else if (node->left) {
12828c2ecf20Sopenharmony_ci					assert(node->leftnode == NODE);
12838c2ecf20Sopenharmony_ci					indent += 1;
12848c2ecf20Sopenharmony_ci					node = node->left;
12858c2ecf20Sopenharmony_ci					break;
12868c2ecf20Sopenharmony_ci				}
12878c2ecf20Sopenharmony_ci			}
12888c2ecf20Sopenharmony_ci			if (node->mark && (rightmask & bitmask) == 0) {
12898c2ecf20Sopenharmony_ci				rightmask |= bitmask;
12908c2ecf20Sopenharmony_ci				if (node->rightnode == LEAF) {
12918c2ecf20Sopenharmony_ci					assert(node->right);
12928c2ecf20Sopenharmony_ci					data = tree->leaf_emit(node->right,
12938c2ecf20Sopenharmony_ci							       data);
12948c2ecf20Sopenharmony_ci					size = tree->leaf_size(node->right);
12958c2ecf20Sopenharmony_ci					index += size;
12968c2ecf20Sopenharmony_ci					bytes += size;
12978c2ecf20Sopenharmony_ci					leaves++;
12988c2ecf20Sopenharmony_ci				} else if (node->right) {
12998c2ecf20Sopenharmony_ci					assert(node->rightnode == NODE);
13008c2ecf20Sopenharmony_ci					indent += 1;
13018c2ecf20Sopenharmony_ci					node = node->right;
13028c2ecf20Sopenharmony_ci					break;
13038c2ecf20Sopenharmony_ci				}
13048c2ecf20Sopenharmony_ci			}
13058c2ecf20Sopenharmony_ci			leftmask &= ~bitmask;
13068c2ecf20Sopenharmony_ci			rightmask &= ~bitmask;
13078c2ecf20Sopenharmony_ci			node = node->parent;
13088c2ecf20Sopenharmony_ci			indent -= 1;
13098c2ecf20Sopenharmony_ci		}
13108c2ecf20Sopenharmony_ci	}
13118c2ecf20Sopenharmony_cidone:
13128c2ecf20Sopenharmony_ci	if (verbose > 0) {
13138c2ecf20Sopenharmony_ci		printf("Emitted %d (%d) leaves",
13148c2ecf20Sopenharmony_ci			leaves, bytes);
13158c2ecf20Sopenharmony_ci		printf(" %d (%d+%d+%d+%d) nodes",
13168c2ecf20Sopenharmony_ci			nodes[0] + nodes[1] + nodes[2] + nodes[3],
13178c2ecf20Sopenharmony_ci			nodes[0], nodes[1], nodes[2], nodes[3]);
13188c2ecf20Sopenharmony_ci		printf(" %d total\n", index - tree->index);
13198c2ecf20Sopenharmony_ci	}
13208c2ecf20Sopenharmony_ci}
13218c2ecf20Sopenharmony_ci
13228c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
13238c2ecf20Sopenharmony_ci
13248c2ecf20Sopenharmony_ci/*
13258c2ecf20Sopenharmony_ci * Unicode data.
13268c2ecf20Sopenharmony_ci *
13278c2ecf20Sopenharmony_ci * We need to keep track of the Canonical Combining Class, the Age,
13288c2ecf20Sopenharmony_ci * and decompositions for a code point.
13298c2ecf20Sopenharmony_ci *
13308c2ecf20Sopenharmony_ci * For the Age, we store the index into the ages table.  Effectively
13318c2ecf20Sopenharmony_ci * this is a generation number that the table maps to a unicode
13328c2ecf20Sopenharmony_ci * version.
13338c2ecf20Sopenharmony_ci *
13348c2ecf20Sopenharmony_ci * The correction field is used to indicate that this entry is in the
13358c2ecf20Sopenharmony_ci * corrections array, which contains decompositions that were
13368c2ecf20Sopenharmony_ci * corrected in later revisions.  The value of the correction field is
13378c2ecf20Sopenharmony_ci * the Unicode version in which the mapping was corrected.
13388c2ecf20Sopenharmony_ci */
13398c2ecf20Sopenharmony_cistruct unicode_data {
13408c2ecf20Sopenharmony_ci	unsigned int code;
13418c2ecf20Sopenharmony_ci	int ccc;
13428c2ecf20Sopenharmony_ci	int gen;
13438c2ecf20Sopenharmony_ci	int correction;
13448c2ecf20Sopenharmony_ci	unsigned int *utf32nfdi;
13458c2ecf20Sopenharmony_ci	unsigned int *utf32nfdicf;
13468c2ecf20Sopenharmony_ci	char *utf8nfdi;
13478c2ecf20Sopenharmony_ci	char *utf8nfdicf;
13488c2ecf20Sopenharmony_ci};
13498c2ecf20Sopenharmony_ci
13508c2ecf20Sopenharmony_cistruct unicode_data unicode_data[0x110000];
13518c2ecf20Sopenharmony_cistruct unicode_data *corrections;
13528c2ecf20Sopenharmony_ciint    corrections_count;
13538c2ecf20Sopenharmony_ci
13548c2ecf20Sopenharmony_cistruct tree *nfdi_tree;
13558c2ecf20Sopenharmony_cistruct tree *nfdicf_tree;
13568c2ecf20Sopenharmony_ci
13578c2ecf20Sopenharmony_cistruct tree *trees;
13588c2ecf20Sopenharmony_ciint          trees_count;
13598c2ecf20Sopenharmony_ci
13608c2ecf20Sopenharmony_ci/*
13618c2ecf20Sopenharmony_ci * Check the corrections array to see if this entry was corrected at
13628c2ecf20Sopenharmony_ci * some point.
13638c2ecf20Sopenharmony_ci */
13648c2ecf20Sopenharmony_cistatic struct unicode_data *corrections_lookup(struct unicode_data *u)
13658c2ecf20Sopenharmony_ci{
13668c2ecf20Sopenharmony_ci	int i;
13678c2ecf20Sopenharmony_ci
13688c2ecf20Sopenharmony_ci	for (i = 0; i != corrections_count; i++)
13698c2ecf20Sopenharmony_ci		if (u->code == corrections[i].code)
13708c2ecf20Sopenharmony_ci			return &corrections[i];
13718c2ecf20Sopenharmony_ci	return u;
13728c2ecf20Sopenharmony_ci}
13738c2ecf20Sopenharmony_ci
13748c2ecf20Sopenharmony_cistatic int nfdi_equal(void *l, void *r)
13758c2ecf20Sopenharmony_ci{
13768c2ecf20Sopenharmony_ci	struct unicode_data *left = l;
13778c2ecf20Sopenharmony_ci	struct unicode_data *right = r;
13788c2ecf20Sopenharmony_ci
13798c2ecf20Sopenharmony_ci	if (left->gen != right->gen)
13808c2ecf20Sopenharmony_ci		return 0;
13818c2ecf20Sopenharmony_ci	if (left->ccc != right->ccc)
13828c2ecf20Sopenharmony_ci		return 0;
13838c2ecf20Sopenharmony_ci	if (left->utf8nfdi && right->utf8nfdi &&
13848c2ecf20Sopenharmony_ci	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
13858c2ecf20Sopenharmony_ci		return 1;
13868c2ecf20Sopenharmony_ci	if (left->utf8nfdi || right->utf8nfdi)
13878c2ecf20Sopenharmony_ci		return 0;
13888c2ecf20Sopenharmony_ci	return 1;
13898c2ecf20Sopenharmony_ci}
13908c2ecf20Sopenharmony_ci
13918c2ecf20Sopenharmony_cistatic int nfdicf_equal(void *l, void *r)
13928c2ecf20Sopenharmony_ci{
13938c2ecf20Sopenharmony_ci	struct unicode_data *left = l;
13948c2ecf20Sopenharmony_ci	struct unicode_data *right = r;
13958c2ecf20Sopenharmony_ci
13968c2ecf20Sopenharmony_ci	if (left->gen != right->gen)
13978c2ecf20Sopenharmony_ci		return 0;
13988c2ecf20Sopenharmony_ci	if (left->ccc != right->ccc)
13998c2ecf20Sopenharmony_ci		return 0;
14008c2ecf20Sopenharmony_ci	if (left->utf8nfdicf && right->utf8nfdicf &&
14018c2ecf20Sopenharmony_ci	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
14028c2ecf20Sopenharmony_ci		return 1;
14038c2ecf20Sopenharmony_ci	if (left->utf8nfdicf && right->utf8nfdicf)
14048c2ecf20Sopenharmony_ci		return 0;
14058c2ecf20Sopenharmony_ci	if (left->utf8nfdicf || right->utf8nfdicf)
14068c2ecf20Sopenharmony_ci		return 0;
14078c2ecf20Sopenharmony_ci	if (left->utf8nfdi && right->utf8nfdi &&
14088c2ecf20Sopenharmony_ci	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
14098c2ecf20Sopenharmony_ci		return 1;
14108c2ecf20Sopenharmony_ci	if (left->utf8nfdi || right->utf8nfdi)
14118c2ecf20Sopenharmony_ci		return 0;
14128c2ecf20Sopenharmony_ci	return 1;
14138c2ecf20Sopenharmony_ci}
14148c2ecf20Sopenharmony_ci
14158c2ecf20Sopenharmony_cistatic void nfdi_print(void *l, int indent)
14168c2ecf20Sopenharmony_ci{
14178c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
14188c2ecf20Sopenharmony_ci
14198c2ecf20Sopenharmony_ci	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
14208c2ecf20Sopenharmony_ci		leaf->code, leaf->ccc, leaf->gen);
14218c2ecf20Sopenharmony_ci
14228c2ecf20Sopenharmony_ci	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
14238c2ecf20Sopenharmony_ci		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
14248c2ecf20Sopenharmony_ci	else if (leaf->utf8nfdi)
14258c2ecf20Sopenharmony_ci		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
14268c2ecf20Sopenharmony_ci
14278c2ecf20Sopenharmony_ci	printf("\n");
14288c2ecf20Sopenharmony_ci}
14298c2ecf20Sopenharmony_ci
14308c2ecf20Sopenharmony_cistatic void nfdicf_print(void *l, int indent)
14318c2ecf20Sopenharmony_ci{
14328c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
14338c2ecf20Sopenharmony_ci
14348c2ecf20Sopenharmony_ci	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
14358c2ecf20Sopenharmony_ci		leaf->code, leaf->ccc, leaf->gen);
14368c2ecf20Sopenharmony_ci
14378c2ecf20Sopenharmony_ci	if (leaf->utf8nfdicf)
14388c2ecf20Sopenharmony_ci		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
14398c2ecf20Sopenharmony_ci	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
14408c2ecf20Sopenharmony_ci		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
14418c2ecf20Sopenharmony_ci	else if (leaf->utf8nfdi)
14428c2ecf20Sopenharmony_ci		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
14438c2ecf20Sopenharmony_ci	printf("\n");
14448c2ecf20Sopenharmony_ci}
14458c2ecf20Sopenharmony_ci
14468c2ecf20Sopenharmony_cistatic int nfdi_mark(void *l)
14478c2ecf20Sopenharmony_ci{
14488c2ecf20Sopenharmony_ci	return 1;
14498c2ecf20Sopenharmony_ci}
14508c2ecf20Sopenharmony_ci
14518c2ecf20Sopenharmony_cistatic int nfdicf_mark(void *l)
14528c2ecf20Sopenharmony_ci{
14538c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
14548c2ecf20Sopenharmony_ci
14558c2ecf20Sopenharmony_ci	if (leaf->utf8nfdicf)
14568c2ecf20Sopenharmony_ci		return 1;
14578c2ecf20Sopenharmony_ci	return 0;
14588c2ecf20Sopenharmony_ci}
14598c2ecf20Sopenharmony_ci
14608c2ecf20Sopenharmony_cistatic int correction_mark(void *l)
14618c2ecf20Sopenharmony_ci{
14628c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
14638c2ecf20Sopenharmony_ci
14648c2ecf20Sopenharmony_ci	return leaf->correction;
14658c2ecf20Sopenharmony_ci}
14668c2ecf20Sopenharmony_ci
14678c2ecf20Sopenharmony_cistatic int nfdi_size(void *l)
14688c2ecf20Sopenharmony_ci{
14698c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
14708c2ecf20Sopenharmony_ci	int size = 2;
14718c2ecf20Sopenharmony_ci
14728c2ecf20Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code))
14738c2ecf20Sopenharmony_ci		size += 1;
14748c2ecf20Sopenharmony_ci	else if (leaf->utf8nfdi)
14758c2ecf20Sopenharmony_ci		size += strlen(leaf->utf8nfdi) + 1;
14768c2ecf20Sopenharmony_ci	return size;
14778c2ecf20Sopenharmony_ci}
14788c2ecf20Sopenharmony_ci
14798c2ecf20Sopenharmony_cistatic int nfdicf_size(void *l)
14808c2ecf20Sopenharmony_ci{
14818c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
14828c2ecf20Sopenharmony_ci	int size = 2;
14838c2ecf20Sopenharmony_ci
14848c2ecf20Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code))
14858c2ecf20Sopenharmony_ci		size += 1;
14868c2ecf20Sopenharmony_ci	else if (leaf->utf8nfdicf)
14878c2ecf20Sopenharmony_ci		size += strlen(leaf->utf8nfdicf) + 1;
14888c2ecf20Sopenharmony_ci	else if (leaf->utf8nfdi)
14898c2ecf20Sopenharmony_ci		size += strlen(leaf->utf8nfdi) + 1;
14908c2ecf20Sopenharmony_ci	return size;
14918c2ecf20Sopenharmony_ci}
14928c2ecf20Sopenharmony_ci
14938c2ecf20Sopenharmony_cistatic int *nfdi_index(struct tree *tree, void *l)
14948c2ecf20Sopenharmony_ci{
14958c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
14968c2ecf20Sopenharmony_ci
14978c2ecf20Sopenharmony_ci	return &tree->leafindex[leaf->code];
14988c2ecf20Sopenharmony_ci}
14998c2ecf20Sopenharmony_ci
15008c2ecf20Sopenharmony_cistatic int *nfdicf_index(struct tree *tree, void *l)
15018c2ecf20Sopenharmony_ci{
15028c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
15038c2ecf20Sopenharmony_ci
15048c2ecf20Sopenharmony_ci	return &tree->leafindex[leaf->code];
15058c2ecf20Sopenharmony_ci}
15068c2ecf20Sopenharmony_ci
15078c2ecf20Sopenharmony_cistatic unsigned char *nfdi_emit(void *l, unsigned char *data)
15088c2ecf20Sopenharmony_ci{
15098c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
15108c2ecf20Sopenharmony_ci	unsigned char *s;
15118c2ecf20Sopenharmony_ci
15128c2ecf20Sopenharmony_ci	*data++ = leaf->gen;
15138c2ecf20Sopenharmony_ci
15148c2ecf20Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code)) {
15158c2ecf20Sopenharmony_ci		*data++ = DECOMPOSE;
15168c2ecf20Sopenharmony_ci		*data++ = HANGUL;
15178c2ecf20Sopenharmony_ci	} else if (leaf->utf8nfdi) {
15188c2ecf20Sopenharmony_ci		*data++ = DECOMPOSE;
15198c2ecf20Sopenharmony_ci		s = (unsigned char*)leaf->utf8nfdi;
15208c2ecf20Sopenharmony_ci		while ((*data++ = *s++) != 0)
15218c2ecf20Sopenharmony_ci			;
15228c2ecf20Sopenharmony_ci	} else {
15238c2ecf20Sopenharmony_ci		*data++ = leaf->ccc;
15248c2ecf20Sopenharmony_ci	}
15258c2ecf20Sopenharmony_ci	return data;
15268c2ecf20Sopenharmony_ci}
15278c2ecf20Sopenharmony_ci
15288c2ecf20Sopenharmony_cistatic unsigned char *nfdicf_emit(void *l, unsigned char *data)
15298c2ecf20Sopenharmony_ci{
15308c2ecf20Sopenharmony_ci	struct unicode_data *leaf = l;
15318c2ecf20Sopenharmony_ci	unsigned char *s;
15328c2ecf20Sopenharmony_ci
15338c2ecf20Sopenharmony_ci	*data++ = leaf->gen;
15348c2ecf20Sopenharmony_ci
15358c2ecf20Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code)) {
15368c2ecf20Sopenharmony_ci		*data++ = DECOMPOSE;
15378c2ecf20Sopenharmony_ci		*data++ = HANGUL;
15388c2ecf20Sopenharmony_ci	} else if (leaf->utf8nfdicf) {
15398c2ecf20Sopenharmony_ci		*data++ = DECOMPOSE;
15408c2ecf20Sopenharmony_ci		s = (unsigned char*)leaf->utf8nfdicf;
15418c2ecf20Sopenharmony_ci		while ((*data++ = *s++) != 0)
15428c2ecf20Sopenharmony_ci			;
15438c2ecf20Sopenharmony_ci	} else if (leaf->utf8nfdi) {
15448c2ecf20Sopenharmony_ci		*data++ = DECOMPOSE;
15458c2ecf20Sopenharmony_ci		s = (unsigned char*)leaf->utf8nfdi;
15468c2ecf20Sopenharmony_ci		while ((*data++ = *s++) != 0)
15478c2ecf20Sopenharmony_ci			;
15488c2ecf20Sopenharmony_ci	} else {
15498c2ecf20Sopenharmony_ci		*data++ = leaf->ccc;
15508c2ecf20Sopenharmony_ci	}
15518c2ecf20Sopenharmony_ci	return data;
15528c2ecf20Sopenharmony_ci}
15538c2ecf20Sopenharmony_ci
15548c2ecf20Sopenharmony_cistatic void utf8_create(struct unicode_data *data)
15558c2ecf20Sopenharmony_ci{
15568c2ecf20Sopenharmony_ci	char utf[18*4+1];
15578c2ecf20Sopenharmony_ci	char *u;
15588c2ecf20Sopenharmony_ci	unsigned int *um;
15598c2ecf20Sopenharmony_ci	int i;
15608c2ecf20Sopenharmony_ci
15618c2ecf20Sopenharmony_ci	if (data->utf8nfdi) {
15628c2ecf20Sopenharmony_ci		assert(data->utf8nfdi[0] == HANGUL);
15638c2ecf20Sopenharmony_ci		return;
15648c2ecf20Sopenharmony_ci	}
15658c2ecf20Sopenharmony_ci
15668c2ecf20Sopenharmony_ci	u = utf;
15678c2ecf20Sopenharmony_ci	um = data->utf32nfdi;
15688c2ecf20Sopenharmony_ci	if (um) {
15698c2ecf20Sopenharmony_ci		for (i = 0; um[i]; i++)
15708c2ecf20Sopenharmony_ci			u += utf8encode(u, um[i]);
15718c2ecf20Sopenharmony_ci		*u = '\0';
15728c2ecf20Sopenharmony_ci		data->utf8nfdi = strdup(utf);
15738c2ecf20Sopenharmony_ci	}
15748c2ecf20Sopenharmony_ci	u = utf;
15758c2ecf20Sopenharmony_ci	um = data->utf32nfdicf;
15768c2ecf20Sopenharmony_ci	if (um) {
15778c2ecf20Sopenharmony_ci		for (i = 0; um[i]; i++)
15788c2ecf20Sopenharmony_ci			u += utf8encode(u, um[i]);
15798c2ecf20Sopenharmony_ci		*u = '\0';
15808c2ecf20Sopenharmony_ci		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
15818c2ecf20Sopenharmony_ci			data->utf8nfdicf = strdup(utf);
15828c2ecf20Sopenharmony_ci	}
15838c2ecf20Sopenharmony_ci}
15848c2ecf20Sopenharmony_ci
15858c2ecf20Sopenharmony_cistatic void utf8_init(void)
15868c2ecf20Sopenharmony_ci{
15878c2ecf20Sopenharmony_ci	unsigned int unichar;
15888c2ecf20Sopenharmony_ci	int i;
15898c2ecf20Sopenharmony_ci
15908c2ecf20Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++)
15918c2ecf20Sopenharmony_ci		utf8_create(&unicode_data[unichar]);
15928c2ecf20Sopenharmony_ci
15938c2ecf20Sopenharmony_ci	for (i = 0; i != corrections_count; i++)
15948c2ecf20Sopenharmony_ci		utf8_create(&corrections[i]);
15958c2ecf20Sopenharmony_ci}
15968c2ecf20Sopenharmony_ci
15978c2ecf20Sopenharmony_cistatic void trees_init(void)
15988c2ecf20Sopenharmony_ci{
15998c2ecf20Sopenharmony_ci	struct unicode_data *data;
16008c2ecf20Sopenharmony_ci	unsigned int maxage;
16018c2ecf20Sopenharmony_ci	unsigned int nextage;
16028c2ecf20Sopenharmony_ci	int count;
16038c2ecf20Sopenharmony_ci	int i;
16048c2ecf20Sopenharmony_ci	int j;
16058c2ecf20Sopenharmony_ci
16068c2ecf20Sopenharmony_ci	/* Count the number of different ages. */
16078c2ecf20Sopenharmony_ci	count = 0;
16088c2ecf20Sopenharmony_ci	nextage = (unsigned int)-1;
16098c2ecf20Sopenharmony_ci	do {
16108c2ecf20Sopenharmony_ci		maxage = nextage;
16118c2ecf20Sopenharmony_ci		nextage = 0;
16128c2ecf20Sopenharmony_ci		for (i = 0; i <= corrections_count; i++) {
16138c2ecf20Sopenharmony_ci			data = &corrections[i];
16148c2ecf20Sopenharmony_ci			if (nextage < data->correction &&
16158c2ecf20Sopenharmony_ci			    data->correction < maxage)
16168c2ecf20Sopenharmony_ci				nextage = data->correction;
16178c2ecf20Sopenharmony_ci		}
16188c2ecf20Sopenharmony_ci		count++;
16198c2ecf20Sopenharmony_ci	} while (nextage);
16208c2ecf20Sopenharmony_ci
16218c2ecf20Sopenharmony_ci	/* Two trees per age: nfdi and nfdicf */
16228c2ecf20Sopenharmony_ci	trees_count = count * 2;
16238c2ecf20Sopenharmony_ci	trees = calloc(trees_count, sizeof(struct tree));
16248c2ecf20Sopenharmony_ci
16258c2ecf20Sopenharmony_ci	/* Assign ages to the trees. */
16268c2ecf20Sopenharmony_ci	count = trees_count;
16278c2ecf20Sopenharmony_ci	nextage = (unsigned int)-1;
16288c2ecf20Sopenharmony_ci	do {
16298c2ecf20Sopenharmony_ci		maxage = nextage;
16308c2ecf20Sopenharmony_ci		trees[--count].maxage = maxage;
16318c2ecf20Sopenharmony_ci		trees[--count].maxage = maxage;
16328c2ecf20Sopenharmony_ci		nextage = 0;
16338c2ecf20Sopenharmony_ci		for (i = 0; i <= corrections_count; i++) {
16348c2ecf20Sopenharmony_ci			data = &corrections[i];
16358c2ecf20Sopenharmony_ci			if (nextage < data->correction &&
16368c2ecf20Sopenharmony_ci			    data->correction < maxage)
16378c2ecf20Sopenharmony_ci				nextage = data->correction;
16388c2ecf20Sopenharmony_ci		}
16398c2ecf20Sopenharmony_ci	} while (nextage);
16408c2ecf20Sopenharmony_ci
16418c2ecf20Sopenharmony_ci	/* The ages assigned above are off by one. */
16428c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i++) {
16438c2ecf20Sopenharmony_ci		j = 0;
16448c2ecf20Sopenharmony_ci		while (ages[j] < trees[i].maxage)
16458c2ecf20Sopenharmony_ci			j++;
16468c2ecf20Sopenharmony_ci		trees[i].maxage = ages[j-1];
16478c2ecf20Sopenharmony_ci	}
16488c2ecf20Sopenharmony_ci
16498c2ecf20Sopenharmony_ci	/* Set up the forwarding between trees. */
16508c2ecf20Sopenharmony_ci	trees[trees_count-2].next = &trees[trees_count-1];
16518c2ecf20Sopenharmony_ci	trees[trees_count-1].leaf_mark = nfdi_mark;
16528c2ecf20Sopenharmony_ci	trees[trees_count-2].leaf_mark = nfdicf_mark;
16538c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count-2; i += 2) {
16548c2ecf20Sopenharmony_ci		trees[i].next = &trees[trees_count-2];
16558c2ecf20Sopenharmony_ci		trees[i].leaf_mark = correction_mark;
16568c2ecf20Sopenharmony_ci		trees[i+1].next = &trees[trees_count-1];
16578c2ecf20Sopenharmony_ci		trees[i+1].leaf_mark = correction_mark;
16588c2ecf20Sopenharmony_ci	}
16598c2ecf20Sopenharmony_ci
16608c2ecf20Sopenharmony_ci	/* Assign the callouts. */
16618c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i += 2) {
16628c2ecf20Sopenharmony_ci		trees[i].type = "nfdicf";
16638c2ecf20Sopenharmony_ci		trees[i].leaf_equal = nfdicf_equal;
16648c2ecf20Sopenharmony_ci		trees[i].leaf_print = nfdicf_print;
16658c2ecf20Sopenharmony_ci		trees[i].leaf_size = nfdicf_size;
16668c2ecf20Sopenharmony_ci		trees[i].leaf_index = nfdicf_index;
16678c2ecf20Sopenharmony_ci		trees[i].leaf_emit = nfdicf_emit;
16688c2ecf20Sopenharmony_ci
16698c2ecf20Sopenharmony_ci		trees[i+1].type = "nfdi";
16708c2ecf20Sopenharmony_ci		trees[i+1].leaf_equal = nfdi_equal;
16718c2ecf20Sopenharmony_ci		trees[i+1].leaf_print = nfdi_print;
16728c2ecf20Sopenharmony_ci		trees[i+1].leaf_size = nfdi_size;
16738c2ecf20Sopenharmony_ci		trees[i+1].leaf_index = nfdi_index;
16748c2ecf20Sopenharmony_ci		trees[i+1].leaf_emit = nfdi_emit;
16758c2ecf20Sopenharmony_ci	}
16768c2ecf20Sopenharmony_ci
16778c2ecf20Sopenharmony_ci	/* Finish init. */
16788c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i++)
16798c2ecf20Sopenharmony_ci		trees[i].childnode = NODE;
16808c2ecf20Sopenharmony_ci}
16818c2ecf20Sopenharmony_ci
16828c2ecf20Sopenharmony_cistatic void trees_populate(void)
16838c2ecf20Sopenharmony_ci{
16848c2ecf20Sopenharmony_ci	struct unicode_data *data;
16858c2ecf20Sopenharmony_ci	unsigned int unichar;
16868c2ecf20Sopenharmony_ci	char keyval[4];
16878c2ecf20Sopenharmony_ci	int keylen;
16888c2ecf20Sopenharmony_ci	int i;
16898c2ecf20Sopenharmony_ci
16908c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i++) {
16918c2ecf20Sopenharmony_ci		if (verbose > 0) {
16928c2ecf20Sopenharmony_ci			printf("Populating %s_%x\n",
16938c2ecf20Sopenharmony_ci				trees[i].type, trees[i].maxage);
16948c2ecf20Sopenharmony_ci		}
16958c2ecf20Sopenharmony_ci		for (unichar = 0; unichar != 0x110000; unichar++) {
16968c2ecf20Sopenharmony_ci			if (unicode_data[unichar].gen < 0)
16978c2ecf20Sopenharmony_ci				continue;
16988c2ecf20Sopenharmony_ci			keylen = utf8encode(keyval, unichar);
16998c2ecf20Sopenharmony_ci			data = corrections_lookup(&unicode_data[unichar]);
17008c2ecf20Sopenharmony_ci			if (data->correction <= trees[i].maxage)
17018c2ecf20Sopenharmony_ci				data = &unicode_data[unichar];
17028c2ecf20Sopenharmony_ci			insert(&trees[i], keyval, keylen, data);
17038c2ecf20Sopenharmony_ci		}
17048c2ecf20Sopenharmony_ci	}
17058c2ecf20Sopenharmony_ci}
17068c2ecf20Sopenharmony_ci
17078c2ecf20Sopenharmony_cistatic void trees_reduce(void)
17088c2ecf20Sopenharmony_ci{
17098c2ecf20Sopenharmony_ci	int i;
17108c2ecf20Sopenharmony_ci	int size;
17118c2ecf20Sopenharmony_ci	int changed;
17128c2ecf20Sopenharmony_ci
17138c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i++)
17148c2ecf20Sopenharmony_ci		prune(&trees[i]);
17158c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i++)
17168c2ecf20Sopenharmony_ci		mark_nodes(&trees[i]);
17178c2ecf20Sopenharmony_ci	do {
17188c2ecf20Sopenharmony_ci		size = 0;
17198c2ecf20Sopenharmony_ci		for (i = 0; i != trees_count; i++)
17208c2ecf20Sopenharmony_ci			size = index_nodes(&trees[i], size);
17218c2ecf20Sopenharmony_ci		changed = 0;
17228c2ecf20Sopenharmony_ci		for (i = 0; i != trees_count; i++)
17238c2ecf20Sopenharmony_ci			changed += size_nodes(&trees[i]);
17248c2ecf20Sopenharmony_ci	} while (changed);
17258c2ecf20Sopenharmony_ci
17268c2ecf20Sopenharmony_ci	utf8data = calloc(size, 1);
17278c2ecf20Sopenharmony_ci	utf8data_size = size;
17288c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i++)
17298c2ecf20Sopenharmony_ci		emit(&trees[i], utf8data);
17308c2ecf20Sopenharmony_ci
17318c2ecf20Sopenharmony_ci	if (verbose > 0) {
17328c2ecf20Sopenharmony_ci		for (i = 0; i != trees_count; i++) {
17338c2ecf20Sopenharmony_ci			printf("%s_%x idx %d\n",
17348c2ecf20Sopenharmony_ci				trees[i].type, trees[i].maxage, trees[i].index);
17358c2ecf20Sopenharmony_ci		}
17368c2ecf20Sopenharmony_ci	}
17378c2ecf20Sopenharmony_ci
17388c2ecf20Sopenharmony_ci	nfdi = utf8data + trees[trees_count-1].index;
17398c2ecf20Sopenharmony_ci	nfdicf = utf8data + trees[trees_count-2].index;
17408c2ecf20Sopenharmony_ci
17418c2ecf20Sopenharmony_ci	nfdi_tree = &trees[trees_count-1];
17428c2ecf20Sopenharmony_ci	nfdicf_tree = &trees[trees_count-2];
17438c2ecf20Sopenharmony_ci}
17448c2ecf20Sopenharmony_ci
17458c2ecf20Sopenharmony_cistatic void verify(struct tree *tree)
17468c2ecf20Sopenharmony_ci{
17478c2ecf20Sopenharmony_ci	struct unicode_data *data;
17488c2ecf20Sopenharmony_ci	utf8leaf_t	*leaf;
17498c2ecf20Sopenharmony_ci	unsigned int	unichar;
17508c2ecf20Sopenharmony_ci	char		key[4];
17518c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
17528c2ecf20Sopenharmony_ci	int		report;
17538c2ecf20Sopenharmony_ci	int		nocf;
17548c2ecf20Sopenharmony_ci
17558c2ecf20Sopenharmony_ci	if (verbose > 0)
17568c2ecf20Sopenharmony_ci		printf("Verifying %s_%x\n", tree->type, tree->maxage);
17578c2ecf20Sopenharmony_ci	nocf = strcmp(tree->type, "nfdicf");
17588c2ecf20Sopenharmony_ci
17598c2ecf20Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++) {
17608c2ecf20Sopenharmony_ci		report = 0;
17618c2ecf20Sopenharmony_ci		data = corrections_lookup(&unicode_data[unichar]);
17628c2ecf20Sopenharmony_ci		if (data->correction <= tree->maxage)
17638c2ecf20Sopenharmony_ci			data = &unicode_data[unichar];
17648c2ecf20Sopenharmony_ci		utf8encode(key,unichar);
17658c2ecf20Sopenharmony_ci		leaf = utf8lookup(tree, hangul, key);
17668c2ecf20Sopenharmony_ci
17678c2ecf20Sopenharmony_ci		if (!leaf) {
17688c2ecf20Sopenharmony_ci			if (data->gen != -1)
17698c2ecf20Sopenharmony_ci				report++;
17708c2ecf20Sopenharmony_ci			if (unichar < 0xd800 || unichar > 0xdfff)
17718c2ecf20Sopenharmony_ci				report++;
17728c2ecf20Sopenharmony_ci		} else {
17738c2ecf20Sopenharmony_ci			if (unichar >= 0xd800 && unichar <= 0xdfff)
17748c2ecf20Sopenharmony_ci				report++;
17758c2ecf20Sopenharmony_ci			if (data->gen == -1)
17768c2ecf20Sopenharmony_ci				report++;
17778c2ecf20Sopenharmony_ci			if (data->gen != LEAF_GEN(leaf))
17788c2ecf20Sopenharmony_ci				report++;
17798c2ecf20Sopenharmony_ci			if (LEAF_CCC(leaf) == DECOMPOSE) {
17808c2ecf20Sopenharmony_ci				if (HANGUL_SYLLABLE(data->code)) {
17818c2ecf20Sopenharmony_ci					if (data->utf8nfdi[0] != HANGUL)
17828c2ecf20Sopenharmony_ci						report++;
17838c2ecf20Sopenharmony_ci				} else if (nocf) {
17848c2ecf20Sopenharmony_ci					if (!data->utf8nfdi) {
17858c2ecf20Sopenharmony_ci						report++;
17868c2ecf20Sopenharmony_ci					} else if (strcmp(data->utf8nfdi,
17878c2ecf20Sopenharmony_ci							  LEAF_STR(leaf))) {
17888c2ecf20Sopenharmony_ci						report++;
17898c2ecf20Sopenharmony_ci					}
17908c2ecf20Sopenharmony_ci				} else {
17918c2ecf20Sopenharmony_ci					if (!data->utf8nfdicf &&
17928c2ecf20Sopenharmony_ci					    !data->utf8nfdi) {
17938c2ecf20Sopenharmony_ci						report++;
17948c2ecf20Sopenharmony_ci					} else if (data->utf8nfdicf) {
17958c2ecf20Sopenharmony_ci						if (strcmp(data->utf8nfdicf,
17968c2ecf20Sopenharmony_ci							   LEAF_STR(leaf)))
17978c2ecf20Sopenharmony_ci							report++;
17988c2ecf20Sopenharmony_ci					} else if (strcmp(data->utf8nfdi,
17998c2ecf20Sopenharmony_ci							  LEAF_STR(leaf))) {
18008c2ecf20Sopenharmony_ci						report++;
18018c2ecf20Sopenharmony_ci					}
18028c2ecf20Sopenharmony_ci				}
18038c2ecf20Sopenharmony_ci			} else if (data->ccc != LEAF_CCC(leaf)) {
18048c2ecf20Sopenharmony_ci				report++;
18058c2ecf20Sopenharmony_ci			}
18068c2ecf20Sopenharmony_ci		}
18078c2ecf20Sopenharmony_ci		if (report) {
18088c2ecf20Sopenharmony_ci			printf("%X code %X gen %d ccc %d"
18098c2ecf20Sopenharmony_ci				" nfdi -> \"%s\"",
18108c2ecf20Sopenharmony_ci				unichar, data->code, data->gen,
18118c2ecf20Sopenharmony_ci				data->ccc,
18128c2ecf20Sopenharmony_ci				data->utf8nfdi);
18138c2ecf20Sopenharmony_ci			if (leaf) {
18148c2ecf20Sopenharmony_ci				printf(" gen %d ccc %d"
18158c2ecf20Sopenharmony_ci					" nfdi -> \"%s\"",
18168c2ecf20Sopenharmony_ci					LEAF_GEN(leaf),
18178c2ecf20Sopenharmony_ci					LEAF_CCC(leaf),
18188c2ecf20Sopenharmony_ci					LEAF_CCC(leaf) == DECOMPOSE ?
18198c2ecf20Sopenharmony_ci						LEAF_STR(leaf) : "");
18208c2ecf20Sopenharmony_ci			}
18218c2ecf20Sopenharmony_ci			printf("\n");
18228c2ecf20Sopenharmony_ci		}
18238c2ecf20Sopenharmony_ci	}
18248c2ecf20Sopenharmony_ci}
18258c2ecf20Sopenharmony_ci
18268c2ecf20Sopenharmony_cistatic void trees_verify(void)
18278c2ecf20Sopenharmony_ci{
18288c2ecf20Sopenharmony_ci	int i;
18298c2ecf20Sopenharmony_ci
18308c2ecf20Sopenharmony_ci	for (i = 0; i != trees_count; i++)
18318c2ecf20Sopenharmony_ci		verify(&trees[i]);
18328c2ecf20Sopenharmony_ci}
18338c2ecf20Sopenharmony_ci
18348c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
18358c2ecf20Sopenharmony_ci
18368c2ecf20Sopenharmony_cistatic void help(void)
18378c2ecf20Sopenharmony_ci{
18388c2ecf20Sopenharmony_ci	printf("Usage: %s [options]\n", argv0);
18398c2ecf20Sopenharmony_ci	printf("\n");
18408c2ecf20Sopenharmony_ci	printf("This program creates an a data trie used for parsing and\n");
18418c2ecf20Sopenharmony_ci	printf("normalization of UTF-8 strings. The trie is derived from\n");
18428c2ecf20Sopenharmony_ci	printf("a set of input files from the Unicode character database\n");
18438c2ecf20Sopenharmony_ci	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
18448c2ecf20Sopenharmony_ci	printf("\n");
18458c2ecf20Sopenharmony_ci	printf("The generated tree supports two normalization forms:\n");
18468c2ecf20Sopenharmony_ci	printf("\n");
18478c2ecf20Sopenharmony_ci	printf("\tnfdi:\n");
18488c2ecf20Sopenharmony_ci	printf("\t- Apply unicode normalization form NFD.\n");
18498c2ecf20Sopenharmony_ci	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
18508c2ecf20Sopenharmony_ci	printf("\n");
18518c2ecf20Sopenharmony_ci	printf("\tnfdicf:\n");
18528c2ecf20Sopenharmony_ci	printf("\t- Apply unicode normalization form NFD.\n");
18538c2ecf20Sopenharmony_ci	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
18548c2ecf20Sopenharmony_ci	printf("\t- Apply a full casefold (C + F).\n");
18558c2ecf20Sopenharmony_ci	printf("\n");
18568c2ecf20Sopenharmony_ci	printf("These forms were chosen as being most useful when dealing\n");
18578c2ecf20Sopenharmony_ci	printf("with file names: NFD catches most cases where characters\n");
18588c2ecf20Sopenharmony_ci	printf("should be considered equivalent. The ignorables are mostly\n");
18598c2ecf20Sopenharmony_ci	printf("invisible, making names hard to type.\n");
18608c2ecf20Sopenharmony_ci	printf("\n");
18618c2ecf20Sopenharmony_ci	printf("The options to specify the files to be used are listed\n");
18628c2ecf20Sopenharmony_ci	printf("below with their default values, which are the names used\n");
18638c2ecf20Sopenharmony_ci	printf("by version 11.0.0 of the Unicode Character Database.\n");
18648c2ecf20Sopenharmony_ci	printf("\n");
18658c2ecf20Sopenharmony_ci	printf("The input files:\n");
18668c2ecf20Sopenharmony_ci	printf("\t-a %s\n", AGE_NAME);
18678c2ecf20Sopenharmony_ci	printf("\t-c %s\n", CCC_NAME);
18688c2ecf20Sopenharmony_ci	printf("\t-p %s\n", PROP_NAME);
18698c2ecf20Sopenharmony_ci	printf("\t-d %s\n", DATA_NAME);
18708c2ecf20Sopenharmony_ci	printf("\t-f %s\n", FOLD_NAME);
18718c2ecf20Sopenharmony_ci	printf("\t-n %s\n", NORM_NAME);
18728c2ecf20Sopenharmony_ci	printf("\n");
18738c2ecf20Sopenharmony_ci	printf("Additionally, the generated tables are tested using:\n");
18748c2ecf20Sopenharmony_ci	printf("\t-t %s\n", TEST_NAME);
18758c2ecf20Sopenharmony_ci	printf("\n");
18768c2ecf20Sopenharmony_ci	printf("Finally, the output file:\n");
18778c2ecf20Sopenharmony_ci	printf("\t-o %s\n", UTF8_NAME);
18788c2ecf20Sopenharmony_ci	printf("\n");
18798c2ecf20Sopenharmony_ci}
18808c2ecf20Sopenharmony_ci
18818c2ecf20Sopenharmony_cistatic void usage(void)
18828c2ecf20Sopenharmony_ci{
18838c2ecf20Sopenharmony_ci	help();
18848c2ecf20Sopenharmony_ci	exit(1);
18858c2ecf20Sopenharmony_ci}
18868c2ecf20Sopenharmony_ci
18878c2ecf20Sopenharmony_cistatic void open_fail(const char *name, int error)
18888c2ecf20Sopenharmony_ci{
18898c2ecf20Sopenharmony_ci	printf("Error %d opening %s: %s\n", error, name, strerror(error));
18908c2ecf20Sopenharmony_ci	exit(1);
18918c2ecf20Sopenharmony_ci}
18928c2ecf20Sopenharmony_ci
18938c2ecf20Sopenharmony_cistatic void file_fail(const char *filename)
18948c2ecf20Sopenharmony_ci{
18958c2ecf20Sopenharmony_ci	printf("Error parsing %s\n", filename);
18968c2ecf20Sopenharmony_ci	exit(1);
18978c2ecf20Sopenharmony_ci}
18988c2ecf20Sopenharmony_ci
18998c2ecf20Sopenharmony_cistatic void line_fail(const char *filename, const char *line)
19008c2ecf20Sopenharmony_ci{
19018c2ecf20Sopenharmony_ci	printf("Error parsing %s:%s\n", filename, line);
19028c2ecf20Sopenharmony_ci	exit(1);
19038c2ecf20Sopenharmony_ci}
19048c2ecf20Sopenharmony_ci
19058c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
19068c2ecf20Sopenharmony_ci
19078c2ecf20Sopenharmony_cistatic void print_utf32(unsigned int *utf32str)
19088c2ecf20Sopenharmony_ci{
19098c2ecf20Sopenharmony_ci	int	i;
19108c2ecf20Sopenharmony_ci
19118c2ecf20Sopenharmony_ci	for (i = 0; utf32str[i]; i++)
19128c2ecf20Sopenharmony_ci		printf(" %X", utf32str[i]);
19138c2ecf20Sopenharmony_ci}
19148c2ecf20Sopenharmony_ci
19158c2ecf20Sopenharmony_cistatic void print_utf32nfdi(unsigned int unichar)
19168c2ecf20Sopenharmony_ci{
19178c2ecf20Sopenharmony_ci	printf(" %X ->", unichar);
19188c2ecf20Sopenharmony_ci	print_utf32(unicode_data[unichar].utf32nfdi);
19198c2ecf20Sopenharmony_ci	printf("\n");
19208c2ecf20Sopenharmony_ci}
19218c2ecf20Sopenharmony_ci
19228c2ecf20Sopenharmony_cistatic void print_utf32nfdicf(unsigned int unichar)
19238c2ecf20Sopenharmony_ci{
19248c2ecf20Sopenharmony_ci	printf(" %X ->", unichar);
19258c2ecf20Sopenharmony_ci	print_utf32(unicode_data[unichar].utf32nfdicf);
19268c2ecf20Sopenharmony_ci	printf("\n");
19278c2ecf20Sopenharmony_ci}
19288c2ecf20Sopenharmony_ci
19298c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
19308c2ecf20Sopenharmony_ci
19318c2ecf20Sopenharmony_cistatic void age_init(void)
19328c2ecf20Sopenharmony_ci{
19338c2ecf20Sopenharmony_ci	FILE *file;
19348c2ecf20Sopenharmony_ci	unsigned int first;
19358c2ecf20Sopenharmony_ci	unsigned int last;
19368c2ecf20Sopenharmony_ci	unsigned int unichar;
19378c2ecf20Sopenharmony_ci	unsigned int major;
19388c2ecf20Sopenharmony_ci	unsigned int minor;
19398c2ecf20Sopenharmony_ci	unsigned int revision;
19408c2ecf20Sopenharmony_ci	int gen;
19418c2ecf20Sopenharmony_ci	int count;
19428c2ecf20Sopenharmony_ci	int ret;
19438c2ecf20Sopenharmony_ci
19448c2ecf20Sopenharmony_ci	if (verbose > 0)
19458c2ecf20Sopenharmony_ci		printf("Parsing %s\n", age_name);
19468c2ecf20Sopenharmony_ci
19478c2ecf20Sopenharmony_ci	file = fopen(age_name, "r");
19488c2ecf20Sopenharmony_ci	if (!file)
19498c2ecf20Sopenharmony_ci		open_fail(age_name, errno);
19508c2ecf20Sopenharmony_ci	count = 0;
19518c2ecf20Sopenharmony_ci
19528c2ecf20Sopenharmony_ci	gen = 0;
19538c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
19548c2ecf20Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d_%d",
19558c2ecf20Sopenharmony_ci				&major, &minor, &revision);
19568c2ecf20Sopenharmony_ci		if (ret == 3) {
19578c2ecf20Sopenharmony_ci			ages_count++;
19588c2ecf20Sopenharmony_ci			if (verbose > 1)
19598c2ecf20Sopenharmony_ci				printf(" Age V%d_%d_%d\n",
19608c2ecf20Sopenharmony_ci					major, minor, revision);
19618c2ecf20Sopenharmony_ci			if (!age_valid(major, minor, revision))
19628c2ecf20Sopenharmony_ci				line_fail(age_name, line);
19638c2ecf20Sopenharmony_ci			continue;
19648c2ecf20Sopenharmony_ci		}
19658c2ecf20Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
19668c2ecf20Sopenharmony_ci		if (ret == 2) {
19678c2ecf20Sopenharmony_ci			ages_count++;
19688c2ecf20Sopenharmony_ci			if (verbose > 1)
19698c2ecf20Sopenharmony_ci				printf(" Age V%d_%d\n", major, minor);
19708c2ecf20Sopenharmony_ci			if (!age_valid(major, minor, 0))
19718c2ecf20Sopenharmony_ci				line_fail(age_name, line);
19728c2ecf20Sopenharmony_ci			continue;
19738c2ecf20Sopenharmony_ci		}
19748c2ecf20Sopenharmony_ci	}
19758c2ecf20Sopenharmony_ci
19768c2ecf20Sopenharmony_ci	/* We must have found something above. */
19778c2ecf20Sopenharmony_ci	if (verbose > 1)
19788c2ecf20Sopenharmony_ci		printf("%d age entries\n", ages_count);
19798c2ecf20Sopenharmony_ci	if (ages_count == 0 || ages_count > MAXGEN)
19808c2ecf20Sopenharmony_ci		file_fail(age_name);
19818c2ecf20Sopenharmony_ci
19828c2ecf20Sopenharmony_ci	/* There is a 0 entry. */
19838c2ecf20Sopenharmony_ci	ages_count++;
19848c2ecf20Sopenharmony_ci	ages = calloc(ages_count + 1, sizeof(*ages));
19858c2ecf20Sopenharmony_ci	/* And a guard entry. */
19868c2ecf20Sopenharmony_ci	ages[ages_count] = (unsigned int)-1;
19878c2ecf20Sopenharmony_ci
19888c2ecf20Sopenharmony_ci	rewind(file);
19898c2ecf20Sopenharmony_ci	count = 0;
19908c2ecf20Sopenharmony_ci	gen = 0;
19918c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
19928c2ecf20Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d_%d",
19938c2ecf20Sopenharmony_ci				&major, &minor, &revision);
19948c2ecf20Sopenharmony_ci		if (ret == 3) {
19958c2ecf20Sopenharmony_ci			ages[++gen] =
19968c2ecf20Sopenharmony_ci				UNICODE_AGE(major, minor, revision);
19978c2ecf20Sopenharmony_ci			if (verbose > 1)
19988c2ecf20Sopenharmony_ci				printf(" Age V%d_%d_%d = gen %d\n",
19998c2ecf20Sopenharmony_ci					major, minor, revision, gen);
20008c2ecf20Sopenharmony_ci			if (!age_valid(major, minor, revision))
20018c2ecf20Sopenharmony_ci				line_fail(age_name, line);
20028c2ecf20Sopenharmony_ci			continue;
20038c2ecf20Sopenharmony_ci		}
20048c2ecf20Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
20058c2ecf20Sopenharmony_ci		if (ret == 2) {
20068c2ecf20Sopenharmony_ci			ages[++gen] = UNICODE_AGE(major, minor, 0);
20078c2ecf20Sopenharmony_ci			if (verbose > 1)
20088c2ecf20Sopenharmony_ci				printf(" Age V%d_%d = %d\n",
20098c2ecf20Sopenharmony_ci					major, minor, gen);
20108c2ecf20Sopenharmony_ci			if (!age_valid(major, minor, 0))
20118c2ecf20Sopenharmony_ci				line_fail(age_name, line);
20128c2ecf20Sopenharmony_ci			continue;
20138c2ecf20Sopenharmony_ci		}
20148c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X..%X ; %d.%d #",
20158c2ecf20Sopenharmony_ci			     &first, &last, &major, &minor);
20168c2ecf20Sopenharmony_ci		if (ret == 4) {
20178c2ecf20Sopenharmony_ci			for (unichar = first; unichar <= last; unichar++)
20188c2ecf20Sopenharmony_ci				unicode_data[unichar].gen = gen;
20198c2ecf20Sopenharmony_ci			count += 1 + last - first;
20208c2ecf20Sopenharmony_ci			if (verbose > 1)
20218c2ecf20Sopenharmony_ci				printf("  %X..%X gen %d\n", first, last, gen);
20228c2ecf20Sopenharmony_ci			if (!utf32valid(first) || !utf32valid(last))
20238c2ecf20Sopenharmony_ci				line_fail(age_name, line);
20248c2ecf20Sopenharmony_ci			continue;
20258c2ecf20Sopenharmony_ci		}
20268c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
20278c2ecf20Sopenharmony_ci		if (ret == 3) {
20288c2ecf20Sopenharmony_ci			unicode_data[unichar].gen = gen;
20298c2ecf20Sopenharmony_ci			count++;
20308c2ecf20Sopenharmony_ci			if (verbose > 1)
20318c2ecf20Sopenharmony_ci				printf("  %X gen %d\n", unichar, gen);
20328c2ecf20Sopenharmony_ci			if (!utf32valid(unichar))
20338c2ecf20Sopenharmony_ci				line_fail(age_name, line);
20348c2ecf20Sopenharmony_ci			continue;
20358c2ecf20Sopenharmony_ci		}
20368c2ecf20Sopenharmony_ci	}
20378c2ecf20Sopenharmony_ci	unicode_maxage = ages[gen];
20388c2ecf20Sopenharmony_ci	fclose(file);
20398c2ecf20Sopenharmony_ci
20408c2ecf20Sopenharmony_ci	/* Nix surrogate block */
20418c2ecf20Sopenharmony_ci	if (verbose > 1)
20428c2ecf20Sopenharmony_ci		printf(" Removing surrogate block D800..DFFF\n");
20438c2ecf20Sopenharmony_ci	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
20448c2ecf20Sopenharmony_ci		unicode_data[unichar].gen = -1;
20458c2ecf20Sopenharmony_ci
20468c2ecf20Sopenharmony_ci	if (verbose > 0)
20478c2ecf20Sopenharmony_ci	        printf("Found %d entries\n", count);
20488c2ecf20Sopenharmony_ci	if (count == 0)
20498c2ecf20Sopenharmony_ci		file_fail(age_name);
20508c2ecf20Sopenharmony_ci}
20518c2ecf20Sopenharmony_ci
20528c2ecf20Sopenharmony_cistatic void ccc_init(void)
20538c2ecf20Sopenharmony_ci{
20548c2ecf20Sopenharmony_ci	FILE *file;
20558c2ecf20Sopenharmony_ci	unsigned int first;
20568c2ecf20Sopenharmony_ci	unsigned int last;
20578c2ecf20Sopenharmony_ci	unsigned int unichar;
20588c2ecf20Sopenharmony_ci	unsigned int value;
20598c2ecf20Sopenharmony_ci	int count;
20608c2ecf20Sopenharmony_ci	int ret;
20618c2ecf20Sopenharmony_ci
20628c2ecf20Sopenharmony_ci	if (verbose > 0)
20638c2ecf20Sopenharmony_ci		printf("Parsing %s\n", ccc_name);
20648c2ecf20Sopenharmony_ci
20658c2ecf20Sopenharmony_ci	file = fopen(ccc_name, "r");
20668c2ecf20Sopenharmony_ci	if (!file)
20678c2ecf20Sopenharmony_ci		open_fail(ccc_name, errno);
20688c2ecf20Sopenharmony_ci
20698c2ecf20Sopenharmony_ci	count = 0;
20708c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
20718c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
20728c2ecf20Sopenharmony_ci		if (ret == 3) {
20738c2ecf20Sopenharmony_ci			for (unichar = first; unichar <= last; unichar++) {
20748c2ecf20Sopenharmony_ci				unicode_data[unichar].ccc = value;
20758c2ecf20Sopenharmony_ci                                count++;
20768c2ecf20Sopenharmony_ci			}
20778c2ecf20Sopenharmony_ci			if (verbose > 1)
20788c2ecf20Sopenharmony_ci				printf(" %X..%X ccc %d\n", first, last, value);
20798c2ecf20Sopenharmony_ci			if (!utf32valid(first) || !utf32valid(last))
20808c2ecf20Sopenharmony_ci				line_fail(ccc_name, line);
20818c2ecf20Sopenharmony_ci			continue;
20828c2ecf20Sopenharmony_ci		}
20838c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X ; %d #", &unichar, &value);
20848c2ecf20Sopenharmony_ci		if (ret == 2) {
20858c2ecf20Sopenharmony_ci			unicode_data[unichar].ccc = value;
20868c2ecf20Sopenharmony_ci                        count++;
20878c2ecf20Sopenharmony_ci			if (verbose > 1)
20888c2ecf20Sopenharmony_ci				printf(" %X ccc %d\n", unichar, value);
20898c2ecf20Sopenharmony_ci			if (!utf32valid(unichar))
20908c2ecf20Sopenharmony_ci				line_fail(ccc_name, line);
20918c2ecf20Sopenharmony_ci			continue;
20928c2ecf20Sopenharmony_ci		}
20938c2ecf20Sopenharmony_ci	}
20948c2ecf20Sopenharmony_ci	fclose(file);
20958c2ecf20Sopenharmony_ci
20968c2ecf20Sopenharmony_ci	if (verbose > 0)
20978c2ecf20Sopenharmony_ci		printf("Found %d entries\n", count);
20988c2ecf20Sopenharmony_ci	if (count == 0)
20998c2ecf20Sopenharmony_ci		file_fail(ccc_name);
21008c2ecf20Sopenharmony_ci}
21018c2ecf20Sopenharmony_ci
21028c2ecf20Sopenharmony_cistatic int ignore_compatibility_form(char *type)
21038c2ecf20Sopenharmony_ci{
21048c2ecf20Sopenharmony_ci	int i;
21058c2ecf20Sopenharmony_ci	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
21068c2ecf20Sopenharmony_ci				 "final", "isolated", "circle", "super",
21078c2ecf20Sopenharmony_ci				 "sub", "vertical", "wide", "narrow",
21088c2ecf20Sopenharmony_ci				 "small", "square", "fraction", "compat"};
21098c2ecf20Sopenharmony_ci
21108c2ecf20Sopenharmony_ci	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
21118c2ecf20Sopenharmony_ci		if (strcmp(type, ignored_types[i]) == 0)
21128c2ecf20Sopenharmony_ci			return 1;
21138c2ecf20Sopenharmony_ci	return 0;
21148c2ecf20Sopenharmony_ci}
21158c2ecf20Sopenharmony_ci
21168c2ecf20Sopenharmony_cistatic void nfdi_init(void)
21178c2ecf20Sopenharmony_ci{
21188c2ecf20Sopenharmony_ci	FILE *file;
21198c2ecf20Sopenharmony_ci	unsigned int unichar;
21208c2ecf20Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
21218c2ecf20Sopenharmony_ci	char *s;
21228c2ecf20Sopenharmony_ci	char *type;
21238c2ecf20Sopenharmony_ci	unsigned int *um;
21248c2ecf20Sopenharmony_ci	int count;
21258c2ecf20Sopenharmony_ci	int i;
21268c2ecf20Sopenharmony_ci	int ret;
21278c2ecf20Sopenharmony_ci
21288c2ecf20Sopenharmony_ci	if (verbose > 0)
21298c2ecf20Sopenharmony_ci		printf("Parsing %s\n", data_name);
21308c2ecf20Sopenharmony_ci	file = fopen(data_name, "r");
21318c2ecf20Sopenharmony_ci	if (!file)
21328c2ecf20Sopenharmony_ci		open_fail(data_name, errno);
21338c2ecf20Sopenharmony_ci
21348c2ecf20Sopenharmony_ci	count = 0;
21358c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
21368c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
21378c2ecf20Sopenharmony_ci			     &unichar, buf0);
21388c2ecf20Sopenharmony_ci		if (ret != 2)
21398c2ecf20Sopenharmony_ci			continue;
21408c2ecf20Sopenharmony_ci		if (!utf32valid(unichar))
21418c2ecf20Sopenharmony_ci			line_fail(data_name, line);
21428c2ecf20Sopenharmony_ci
21438c2ecf20Sopenharmony_ci		s = buf0;
21448c2ecf20Sopenharmony_ci		/* skip over <tag> */
21458c2ecf20Sopenharmony_ci		if (*s == '<') {
21468c2ecf20Sopenharmony_ci			type = ++s;
21478c2ecf20Sopenharmony_ci			while (*++s != '>');
21488c2ecf20Sopenharmony_ci			*s++ = '\0';
21498c2ecf20Sopenharmony_ci			if(ignore_compatibility_form(type))
21508c2ecf20Sopenharmony_ci				continue;
21518c2ecf20Sopenharmony_ci		}
21528c2ecf20Sopenharmony_ci		/* decode the decomposition into UTF-32 */
21538c2ecf20Sopenharmony_ci		i = 0;
21548c2ecf20Sopenharmony_ci		while (*s) {
21558c2ecf20Sopenharmony_ci			mapping[i] = strtoul(s, &s, 16);
21568c2ecf20Sopenharmony_ci			if (!utf32valid(mapping[i]))
21578c2ecf20Sopenharmony_ci				line_fail(data_name, line);
21588c2ecf20Sopenharmony_ci			i++;
21598c2ecf20Sopenharmony_ci		}
21608c2ecf20Sopenharmony_ci		mapping[i++] = 0;
21618c2ecf20Sopenharmony_ci
21628c2ecf20Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
21638c2ecf20Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
21648c2ecf20Sopenharmony_ci		unicode_data[unichar].utf32nfdi = um;
21658c2ecf20Sopenharmony_ci
21668c2ecf20Sopenharmony_ci		if (verbose > 1)
21678c2ecf20Sopenharmony_ci			print_utf32nfdi(unichar);
21688c2ecf20Sopenharmony_ci		count++;
21698c2ecf20Sopenharmony_ci	}
21708c2ecf20Sopenharmony_ci	fclose(file);
21718c2ecf20Sopenharmony_ci	if (verbose > 0)
21728c2ecf20Sopenharmony_ci		printf("Found %d entries\n", count);
21738c2ecf20Sopenharmony_ci	if (count == 0)
21748c2ecf20Sopenharmony_ci		file_fail(data_name);
21758c2ecf20Sopenharmony_ci}
21768c2ecf20Sopenharmony_ci
21778c2ecf20Sopenharmony_cistatic void nfdicf_init(void)
21788c2ecf20Sopenharmony_ci{
21798c2ecf20Sopenharmony_ci	FILE *file;
21808c2ecf20Sopenharmony_ci	unsigned int unichar;
21818c2ecf20Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
21828c2ecf20Sopenharmony_ci	char status;
21838c2ecf20Sopenharmony_ci	char *s;
21848c2ecf20Sopenharmony_ci	unsigned int *um;
21858c2ecf20Sopenharmony_ci	int i;
21868c2ecf20Sopenharmony_ci	int count;
21878c2ecf20Sopenharmony_ci	int ret;
21888c2ecf20Sopenharmony_ci
21898c2ecf20Sopenharmony_ci	if (verbose > 0)
21908c2ecf20Sopenharmony_ci		printf("Parsing %s\n", fold_name);
21918c2ecf20Sopenharmony_ci	file = fopen(fold_name, "r");
21928c2ecf20Sopenharmony_ci	if (!file)
21938c2ecf20Sopenharmony_ci		open_fail(fold_name, errno);
21948c2ecf20Sopenharmony_ci
21958c2ecf20Sopenharmony_ci	count = 0;
21968c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
21978c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
21988c2ecf20Sopenharmony_ci		if (ret != 3)
21998c2ecf20Sopenharmony_ci			continue;
22008c2ecf20Sopenharmony_ci		if (!utf32valid(unichar))
22018c2ecf20Sopenharmony_ci			line_fail(fold_name, line);
22028c2ecf20Sopenharmony_ci		/* Use the C+F casefold. */
22038c2ecf20Sopenharmony_ci		if (status != 'C' && status != 'F')
22048c2ecf20Sopenharmony_ci			continue;
22058c2ecf20Sopenharmony_ci		s = buf0;
22068c2ecf20Sopenharmony_ci		if (*s == '<')
22078c2ecf20Sopenharmony_ci			while (*s++ != ' ')
22088c2ecf20Sopenharmony_ci				;
22098c2ecf20Sopenharmony_ci		i = 0;
22108c2ecf20Sopenharmony_ci		while (*s) {
22118c2ecf20Sopenharmony_ci			mapping[i] = strtoul(s, &s, 16);
22128c2ecf20Sopenharmony_ci			if (!utf32valid(mapping[i]))
22138c2ecf20Sopenharmony_ci				line_fail(fold_name, line);
22148c2ecf20Sopenharmony_ci			i++;
22158c2ecf20Sopenharmony_ci		}
22168c2ecf20Sopenharmony_ci		mapping[i++] = 0;
22178c2ecf20Sopenharmony_ci
22188c2ecf20Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
22198c2ecf20Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
22208c2ecf20Sopenharmony_ci		unicode_data[unichar].utf32nfdicf = um;
22218c2ecf20Sopenharmony_ci
22228c2ecf20Sopenharmony_ci		if (verbose > 1)
22238c2ecf20Sopenharmony_ci			print_utf32nfdicf(unichar);
22248c2ecf20Sopenharmony_ci		count++;
22258c2ecf20Sopenharmony_ci	}
22268c2ecf20Sopenharmony_ci	fclose(file);
22278c2ecf20Sopenharmony_ci	if (verbose > 0)
22288c2ecf20Sopenharmony_ci		printf("Found %d entries\n", count);
22298c2ecf20Sopenharmony_ci	if (count == 0)
22308c2ecf20Sopenharmony_ci		file_fail(fold_name);
22318c2ecf20Sopenharmony_ci}
22328c2ecf20Sopenharmony_ci
22338c2ecf20Sopenharmony_cistatic void ignore_init(void)
22348c2ecf20Sopenharmony_ci{
22358c2ecf20Sopenharmony_ci	FILE *file;
22368c2ecf20Sopenharmony_ci	unsigned int unichar;
22378c2ecf20Sopenharmony_ci	unsigned int first;
22388c2ecf20Sopenharmony_ci	unsigned int last;
22398c2ecf20Sopenharmony_ci	unsigned int *um;
22408c2ecf20Sopenharmony_ci	int count;
22418c2ecf20Sopenharmony_ci	int ret;
22428c2ecf20Sopenharmony_ci
22438c2ecf20Sopenharmony_ci	if (verbose > 0)
22448c2ecf20Sopenharmony_ci		printf("Parsing %s\n", prop_name);
22458c2ecf20Sopenharmony_ci	file = fopen(prop_name, "r");
22468c2ecf20Sopenharmony_ci	if (!file)
22478c2ecf20Sopenharmony_ci		open_fail(prop_name, errno);
22488c2ecf20Sopenharmony_ci	assert(file);
22498c2ecf20Sopenharmony_ci	count = 0;
22508c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
22518c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
22528c2ecf20Sopenharmony_ci		if (ret == 3) {
22538c2ecf20Sopenharmony_ci			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
22548c2ecf20Sopenharmony_ci				continue;
22558c2ecf20Sopenharmony_ci			if (!utf32valid(first) || !utf32valid(last))
22568c2ecf20Sopenharmony_ci				line_fail(prop_name, line);
22578c2ecf20Sopenharmony_ci			for (unichar = first; unichar <= last; unichar++) {
22588c2ecf20Sopenharmony_ci				free(unicode_data[unichar].utf32nfdi);
22598c2ecf20Sopenharmony_ci				um = malloc(sizeof(unsigned int));
22608c2ecf20Sopenharmony_ci				*um = 0;
22618c2ecf20Sopenharmony_ci				unicode_data[unichar].utf32nfdi = um;
22628c2ecf20Sopenharmony_ci				free(unicode_data[unichar].utf32nfdicf);
22638c2ecf20Sopenharmony_ci				um = malloc(sizeof(unsigned int));
22648c2ecf20Sopenharmony_ci				*um = 0;
22658c2ecf20Sopenharmony_ci				unicode_data[unichar].utf32nfdicf = um;
22668c2ecf20Sopenharmony_ci				count++;
22678c2ecf20Sopenharmony_ci			}
22688c2ecf20Sopenharmony_ci			if (verbose > 1)
22698c2ecf20Sopenharmony_ci				printf(" %X..%X Default_Ignorable_Code_Point\n",
22708c2ecf20Sopenharmony_ci					first, last);
22718c2ecf20Sopenharmony_ci			continue;
22728c2ecf20Sopenharmony_ci		}
22738c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
22748c2ecf20Sopenharmony_ci		if (ret == 2) {
22758c2ecf20Sopenharmony_ci			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
22768c2ecf20Sopenharmony_ci				continue;
22778c2ecf20Sopenharmony_ci			if (!utf32valid(unichar))
22788c2ecf20Sopenharmony_ci				line_fail(prop_name, line);
22798c2ecf20Sopenharmony_ci			free(unicode_data[unichar].utf32nfdi);
22808c2ecf20Sopenharmony_ci			um = malloc(sizeof(unsigned int));
22818c2ecf20Sopenharmony_ci			*um = 0;
22828c2ecf20Sopenharmony_ci			unicode_data[unichar].utf32nfdi = um;
22838c2ecf20Sopenharmony_ci			free(unicode_data[unichar].utf32nfdicf);
22848c2ecf20Sopenharmony_ci			um = malloc(sizeof(unsigned int));
22858c2ecf20Sopenharmony_ci			*um = 0;
22868c2ecf20Sopenharmony_ci			unicode_data[unichar].utf32nfdicf = um;
22878c2ecf20Sopenharmony_ci			if (verbose > 1)
22888c2ecf20Sopenharmony_ci				printf(" %X Default_Ignorable_Code_Point\n",
22898c2ecf20Sopenharmony_ci					unichar);
22908c2ecf20Sopenharmony_ci			count++;
22918c2ecf20Sopenharmony_ci			continue;
22928c2ecf20Sopenharmony_ci		}
22938c2ecf20Sopenharmony_ci	}
22948c2ecf20Sopenharmony_ci	fclose(file);
22958c2ecf20Sopenharmony_ci
22968c2ecf20Sopenharmony_ci	if (verbose > 0)
22978c2ecf20Sopenharmony_ci		printf("Found %d entries\n", count);
22988c2ecf20Sopenharmony_ci	if (count == 0)
22998c2ecf20Sopenharmony_ci		file_fail(prop_name);
23008c2ecf20Sopenharmony_ci}
23018c2ecf20Sopenharmony_ci
23028c2ecf20Sopenharmony_cistatic void corrections_init(void)
23038c2ecf20Sopenharmony_ci{
23048c2ecf20Sopenharmony_ci	FILE *file;
23058c2ecf20Sopenharmony_ci	unsigned int unichar;
23068c2ecf20Sopenharmony_ci	unsigned int major;
23078c2ecf20Sopenharmony_ci	unsigned int minor;
23088c2ecf20Sopenharmony_ci	unsigned int revision;
23098c2ecf20Sopenharmony_ci	unsigned int age;
23108c2ecf20Sopenharmony_ci	unsigned int *um;
23118c2ecf20Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
23128c2ecf20Sopenharmony_ci	char *s;
23138c2ecf20Sopenharmony_ci	int i;
23148c2ecf20Sopenharmony_ci	int count;
23158c2ecf20Sopenharmony_ci	int ret;
23168c2ecf20Sopenharmony_ci
23178c2ecf20Sopenharmony_ci	if (verbose > 0)
23188c2ecf20Sopenharmony_ci		printf("Parsing %s\n", norm_name);
23198c2ecf20Sopenharmony_ci	file = fopen(norm_name, "r");
23208c2ecf20Sopenharmony_ci	if (!file)
23218c2ecf20Sopenharmony_ci		open_fail(norm_name, errno);
23228c2ecf20Sopenharmony_ci
23238c2ecf20Sopenharmony_ci	count = 0;
23248c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
23258c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
23268c2ecf20Sopenharmony_ci				&unichar, buf0, buf1,
23278c2ecf20Sopenharmony_ci				&major, &minor, &revision);
23288c2ecf20Sopenharmony_ci		if (ret != 6)
23298c2ecf20Sopenharmony_ci			continue;
23308c2ecf20Sopenharmony_ci		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
23318c2ecf20Sopenharmony_ci			line_fail(norm_name, line);
23328c2ecf20Sopenharmony_ci		count++;
23338c2ecf20Sopenharmony_ci	}
23348c2ecf20Sopenharmony_ci	corrections = calloc(count, sizeof(struct unicode_data));
23358c2ecf20Sopenharmony_ci	corrections_count = count;
23368c2ecf20Sopenharmony_ci	rewind(file);
23378c2ecf20Sopenharmony_ci
23388c2ecf20Sopenharmony_ci	count = 0;
23398c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
23408c2ecf20Sopenharmony_ci		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
23418c2ecf20Sopenharmony_ci				&unichar, buf0, buf1,
23428c2ecf20Sopenharmony_ci				&major, &minor, &revision);
23438c2ecf20Sopenharmony_ci		if (ret != 6)
23448c2ecf20Sopenharmony_ci			continue;
23458c2ecf20Sopenharmony_ci		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
23468c2ecf20Sopenharmony_ci			line_fail(norm_name, line);
23478c2ecf20Sopenharmony_ci		corrections[count] = unicode_data[unichar];
23488c2ecf20Sopenharmony_ci		assert(corrections[count].code == unichar);
23498c2ecf20Sopenharmony_ci		age = UNICODE_AGE(major, minor, revision);
23508c2ecf20Sopenharmony_ci		corrections[count].correction = age;
23518c2ecf20Sopenharmony_ci
23528c2ecf20Sopenharmony_ci		i = 0;
23538c2ecf20Sopenharmony_ci		s = buf0;
23548c2ecf20Sopenharmony_ci		while (*s) {
23558c2ecf20Sopenharmony_ci			mapping[i] = strtoul(s, &s, 16);
23568c2ecf20Sopenharmony_ci			if (!utf32valid(mapping[i]))
23578c2ecf20Sopenharmony_ci				line_fail(norm_name, line);
23588c2ecf20Sopenharmony_ci			i++;
23598c2ecf20Sopenharmony_ci		}
23608c2ecf20Sopenharmony_ci		mapping[i++] = 0;
23618c2ecf20Sopenharmony_ci
23628c2ecf20Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
23638c2ecf20Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
23648c2ecf20Sopenharmony_ci		corrections[count].utf32nfdi = um;
23658c2ecf20Sopenharmony_ci
23668c2ecf20Sopenharmony_ci		if (verbose > 1)
23678c2ecf20Sopenharmony_ci			printf(" %X -> %s -> %s V%d_%d_%d\n",
23688c2ecf20Sopenharmony_ci				unichar, buf0, buf1, major, minor, revision);
23698c2ecf20Sopenharmony_ci		count++;
23708c2ecf20Sopenharmony_ci	}
23718c2ecf20Sopenharmony_ci	fclose(file);
23728c2ecf20Sopenharmony_ci
23738c2ecf20Sopenharmony_ci	if (verbose > 0)
23748c2ecf20Sopenharmony_ci	        printf("Found %d entries\n", count);
23758c2ecf20Sopenharmony_ci	if (count == 0)
23768c2ecf20Sopenharmony_ci		file_fail(norm_name);
23778c2ecf20Sopenharmony_ci}
23788c2ecf20Sopenharmony_ci
23798c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
23808c2ecf20Sopenharmony_ci
23818c2ecf20Sopenharmony_ci/*
23828c2ecf20Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
23838c2ecf20Sopenharmony_ci *
23848c2ecf20Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
23858c2ecf20Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
23868c2ecf20Sopenharmony_ci *
23878c2ecf20Sopenharmony_ci * SBase = 0xAC00
23888c2ecf20Sopenharmony_ci * LBase = 0x1100
23898c2ecf20Sopenharmony_ci * VBase = 0x1161
23908c2ecf20Sopenharmony_ci * TBase = 0x11A7
23918c2ecf20Sopenharmony_ci * LCount = 19
23928c2ecf20Sopenharmony_ci * VCount = 21
23938c2ecf20Sopenharmony_ci * TCount = 28
23948c2ecf20Sopenharmony_ci * NCount = 588 (VCount * TCount)
23958c2ecf20Sopenharmony_ci * SCount = 11172 (LCount * NCount)
23968c2ecf20Sopenharmony_ci *
23978c2ecf20Sopenharmony_ci * Decomposition:
23988c2ecf20Sopenharmony_ci *   SIndex = s - SBase
23998c2ecf20Sopenharmony_ci *
24008c2ecf20Sopenharmony_ci * LV (Canonical/Full)
24018c2ecf20Sopenharmony_ci *   LIndex = SIndex / NCount
24028c2ecf20Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
24038c2ecf20Sopenharmony_ci *   LPart = LBase + LIndex
24048c2ecf20Sopenharmony_ci *   VPart = VBase + VIndex
24058c2ecf20Sopenharmony_ci *
24068c2ecf20Sopenharmony_ci * LVT (Canonical)
24078c2ecf20Sopenharmony_ci *   LVIndex = (SIndex / TCount) * TCount
24088c2ecf20Sopenharmony_ci *   TIndex = (Sindex % TCount)
24098c2ecf20Sopenharmony_ci *   LVPart = SBase + LVIndex
24108c2ecf20Sopenharmony_ci *   TPart = TBase + TIndex
24118c2ecf20Sopenharmony_ci *
24128c2ecf20Sopenharmony_ci * LVT (Full)
24138c2ecf20Sopenharmony_ci *   LIndex = SIndex / NCount
24148c2ecf20Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
24158c2ecf20Sopenharmony_ci *   TIndex = (Sindex % TCount)
24168c2ecf20Sopenharmony_ci *   LPart = LBase + LIndex
24178c2ecf20Sopenharmony_ci *   VPart = VBase + VIndex
24188c2ecf20Sopenharmony_ci *   if (TIndex == 0) {
24198c2ecf20Sopenharmony_ci *          d = <LPart, VPart>
24208c2ecf20Sopenharmony_ci *   } else {
24218c2ecf20Sopenharmony_ci *          TPart = TBase + TIndex
24228c2ecf20Sopenharmony_ci *          d = <LPart, VPart, TPart>
24238c2ecf20Sopenharmony_ci *   }
24248c2ecf20Sopenharmony_ci *
24258c2ecf20Sopenharmony_ci */
24268c2ecf20Sopenharmony_ci
24278c2ecf20Sopenharmony_cistatic void hangul_decompose(void)
24288c2ecf20Sopenharmony_ci{
24298c2ecf20Sopenharmony_ci	unsigned int sb = 0xAC00;
24308c2ecf20Sopenharmony_ci	unsigned int lb = 0x1100;
24318c2ecf20Sopenharmony_ci	unsigned int vb = 0x1161;
24328c2ecf20Sopenharmony_ci	unsigned int tb = 0x11a7;
24338c2ecf20Sopenharmony_ci	/* unsigned int lc = 19; */
24348c2ecf20Sopenharmony_ci	unsigned int vc = 21;
24358c2ecf20Sopenharmony_ci	unsigned int tc = 28;
24368c2ecf20Sopenharmony_ci	unsigned int nc = (vc * tc);
24378c2ecf20Sopenharmony_ci	/* unsigned int sc = (lc * nc); */
24388c2ecf20Sopenharmony_ci	unsigned int unichar;
24398c2ecf20Sopenharmony_ci	unsigned int mapping[4];
24408c2ecf20Sopenharmony_ci	unsigned int *um;
24418c2ecf20Sopenharmony_ci        int count;
24428c2ecf20Sopenharmony_ci	int i;
24438c2ecf20Sopenharmony_ci
24448c2ecf20Sopenharmony_ci	if (verbose > 0)
24458c2ecf20Sopenharmony_ci		printf("Decomposing hangul\n");
24468c2ecf20Sopenharmony_ci	/* Hangul */
24478c2ecf20Sopenharmony_ci	count = 0;
24488c2ecf20Sopenharmony_ci	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
24498c2ecf20Sopenharmony_ci		unsigned int si = unichar - sb;
24508c2ecf20Sopenharmony_ci		unsigned int li = si / nc;
24518c2ecf20Sopenharmony_ci		unsigned int vi = (si % nc) / tc;
24528c2ecf20Sopenharmony_ci		unsigned int ti = si % tc;
24538c2ecf20Sopenharmony_ci
24548c2ecf20Sopenharmony_ci		i = 0;
24558c2ecf20Sopenharmony_ci		mapping[i++] = lb + li;
24568c2ecf20Sopenharmony_ci		mapping[i++] = vb + vi;
24578c2ecf20Sopenharmony_ci		if (ti)
24588c2ecf20Sopenharmony_ci			mapping[i++] = tb + ti;
24598c2ecf20Sopenharmony_ci		mapping[i++] = 0;
24608c2ecf20Sopenharmony_ci
24618c2ecf20Sopenharmony_ci		assert(!unicode_data[unichar].utf32nfdi);
24628c2ecf20Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
24638c2ecf20Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
24648c2ecf20Sopenharmony_ci		unicode_data[unichar].utf32nfdi = um;
24658c2ecf20Sopenharmony_ci
24668c2ecf20Sopenharmony_ci		assert(!unicode_data[unichar].utf32nfdicf);
24678c2ecf20Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
24688c2ecf20Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
24698c2ecf20Sopenharmony_ci		unicode_data[unichar].utf32nfdicf = um;
24708c2ecf20Sopenharmony_ci
24718c2ecf20Sopenharmony_ci		/*
24728c2ecf20Sopenharmony_ci		 * Add a cookie as a reminder that the hangul syllable
24738c2ecf20Sopenharmony_ci		 * decompositions must not be stored in the generated
24748c2ecf20Sopenharmony_ci		 * trie.
24758c2ecf20Sopenharmony_ci		 */
24768c2ecf20Sopenharmony_ci		unicode_data[unichar].utf8nfdi = malloc(2);
24778c2ecf20Sopenharmony_ci		unicode_data[unichar].utf8nfdi[0] = HANGUL;
24788c2ecf20Sopenharmony_ci		unicode_data[unichar].utf8nfdi[1] = '\0';
24798c2ecf20Sopenharmony_ci
24808c2ecf20Sopenharmony_ci		if (verbose > 1)
24818c2ecf20Sopenharmony_ci			print_utf32nfdi(unichar);
24828c2ecf20Sopenharmony_ci
24838c2ecf20Sopenharmony_ci		count++;
24848c2ecf20Sopenharmony_ci	}
24858c2ecf20Sopenharmony_ci	if (verbose > 0)
24868c2ecf20Sopenharmony_ci		printf("Created %d entries\n", count);
24878c2ecf20Sopenharmony_ci}
24888c2ecf20Sopenharmony_ci
24898c2ecf20Sopenharmony_cistatic void nfdi_decompose(void)
24908c2ecf20Sopenharmony_ci{
24918c2ecf20Sopenharmony_ci	unsigned int unichar;
24928c2ecf20Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
24938c2ecf20Sopenharmony_ci	unsigned int *um;
24948c2ecf20Sopenharmony_ci	unsigned int *dc;
24958c2ecf20Sopenharmony_ci	int count;
24968c2ecf20Sopenharmony_ci	int i;
24978c2ecf20Sopenharmony_ci	int j;
24988c2ecf20Sopenharmony_ci	int ret;
24998c2ecf20Sopenharmony_ci
25008c2ecf20Sopenharmony_ci	if (verbose > 0)
25018c2ecf20Sopenharmony_ci		printf("Decomposing nfdi\n");
25028c2ecf20Sopenharmony_ci
25038c2ecf20Sopenharmony_ci	count = 0;
25048c2ecf20Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++) {
25058c2ecf20Sopenharmony_ci		if (!unicode_data[unichar].utf32nfdi)
25068c2ecf20Sopenharmony_ci			continue;
25078c2ecf20Sopenharmony_ci		for (;;) {
25088c2ecf20Sopenharmony_ci			ret = 1;
25098c2ecf20Sopenharmony_ci			i = 0;
25108c2ecf20Sopenharmony_ci			um = unicode_data[unichar].utf32nfdi;
25118c2ecf20Sopenharmony_ci			while (*um) {
25128c2ecf20Sopenharmony_ci				dc = unicode_data[*um].utf32nfdi;
25138c2ecf20Sopenharmony_ci				if (dc) {
25148c2ecf20Sopenharmony_ci					for (j = 0; dc[j]; j++)
25158c2ecf20Sopenharmony_ci						mapping[i++] = dc[j];
25168c2ecf20Sopenharmony_ci					ret = 0;
25178c2ecf20Sopenharmony_ci				} else {
25188c2ecf20Sopenharmony_ci					mapping[i++] = *um;
25198c2ecf20Sopenharmony_ci				}
25208c2ecf20Sopenharmony_ci				um++;
25218c2ecf20Sopenharmony_ci			}
25228c2ecf20Sopenharmony_ci			mapping[i++] = 0;
25238c2ecf20Sopenharmony_ci			if (ret)
25248c2ecf20Sopenharmony_ci				break;
25258c2ecf20Sopenharmony_ci			free(unicode_data[unichar].utf32nfdi);
25268c2ecf20Sopenharmony_ci			um = malloc(i * sizeof(unsigned int));
25278c2ecf20Sopenharmony_ci			memcpy(um, mapping, i * sizeof(unsigned int));
25288c2ecf20Sopenharmony_ci			unicode_data[unichar].utf32nfdi = um;
25298c2ecf20Sopenharmony_ci		}
25308c2ecf20Sopenharmony_ci		/* Add this decomposition to nfdicf if there is no entry. */
25318c2ecf20Sopenharmony_ci		if (!unicode_data[unichar].utf32nfdicf) {
25328c2ecf20Sopenharmony_ci			um = malloc(i * sizeof(unsigned int));
25338c2ecf20Sopenharmony_ci			memcpy(um, mapping, i * sizeof(unsigned int));
25348c2ecf20Sopenharmony_ci			unicode_data[unichar].utf32nfdicf = um;
25358c2ecf20Sopenharmony_ci		}
25368c2ecf20Sopenharmony_ci		if (verbose > 1)
25378c2ecf20Sopenharmony_ci			print_utf32nfdi(unichar);
25388c2ecf20Sopenharmony_ci		count++;
25398c2ecf20Sopenharmony_ci	}
25408c2ecf20Sopenharmony_ci	if (verbose > 0)
25418c2ecf20Sopenharmony_ci		printf("Processed %d entries\n", count);
25428c2ecf20Sopenharmony_ci}
25438c2ecf20Sopenharmony_ci
25448c2ecf20Sopenharmony_cistatic void nfdicf_decompose(void)
25458c2ecf20Sopenharmony_ci{
25468c2ecf20Sopenharmony_ci	unsigned int unichar;
25478c2ecf20Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
25488c2ecf20Sopenharmony_ci	unsigned int *um;
25498c2ecf20Sopenharmony_ci	unsigned int *dc;
25508c2ecf20Sopenharmony_ci	int count;
25518c2ecf20Sopenharmony_ci	int i;
25528c2ecf20Sopenharmony_ci	int j;
25538c2ecf20Sopenharmony_ci	int ret;
25548c2ecf20Sopenharmony_ci
25558c2ecf20Sopenharmony_ci	if (verbose > 0)
25568c2ecf20Sopenharmony_ci		printf("Decomposing nfdicf\n");
25578c2ecf20Sopenharmony_ci	count = 0;
25588c2ecf20Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++) {
25598c2ecf20Sopenharmony_ci		if (!unicode_data[unichar].utf32nfdicf)
25608c2ecf20Sopenharmony_ci			continue;
25618c2ecf20Sopenharmony_ci		for (;;) {
25628c2ecf20Sopenharmony_ci			ret = 1;
25638c2ecf20Sopenharmony_ci			i = 0;
25648c2ecf20Sopenharmony_ci			um = unicode_data[unichar].utf32nfdicf;
25658c2ecf20Sopenharmony_ci			while (*um) {
25668c2ecf20Sopenharmony_ci				dc = unicode_data[*um].utf32nfdicf;
25678c2ecf20Sopenharmony_ci				if (dc) {
25688c2ecf20Sopenharmony_ci					for (j = 0; dc[j]; j++)
25698c2ecf20Sopenharmony_ci						mapping[i++] = dc[j];
25708c2ecf20Sopenharmony_ci					ret = 0;
25718c2ecf20Sopenharmony_ci				} else {
25728c2ecf20Sopenharmony_ci					mapping[i++] = *um;
25738c2ecf20Sopenharmony_ci				}
25748c2ecf20Sopenharmony_ci				um++;
25758c2ecf20Sopenharmony_ci			}
25768c2ecf20Sopenharmony_ci			mapping[i++] = 0;
25778c2ecf20Sopenharmony_ci			if (ret)
25788c2ecf20Sopenharmony_ci				break;
25798c2ecf20Sopenharmony_ci			free(unicode_data[unichar].utf32nfdicf);
25808c2ecf20Sopenharmony_ci			um = malloc(i * sizeof(unsigned int));
25818c2ecf20Sopenharmony_ci			memcpy(um, mapping, i * sizeof(unsigned int));
25828c2ecf20Sopenharmony_ci			unicode_data[unichar].utf32nfdicf = um;
25838c2ecf20Sopenharmony_ci		}
25848c2ecf20Sopenharmony_ci		if (verbose > 1)
25858c2ecf20Sopenharmony_ci			print_utf32nfdicf(unichar);
25868c2ecf20Sopenharmony_ci		count++;
25878c2ecf20Sopenharmony_ci	}
25888c2ecf20Sopenharmony_ci	if (verbose > 0)
25898c2ecf20Sopenharmony_ci		printf("Processed %d entries\n", count);
25908c2ecf20Sopenharmony_ci}
25918c2ecf20Sopenharmony_ci
25928c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
25938c2ecf20Sopenharmony_ci
25948c2ecf20Sopenharmony_ciint utf8agemax(struct tree *, const char *);
25958c2ecf20Sopenharmony_ciint utf8nagemax(struct tree *, const char *, size_t);
25968c2ecf20Sopenharmony_ciint utf8agemin(struct tree *, const char *);
25978c2ecf20Sopenharmony_ciint utf8nagemin(struct tree *, const char *, size_t);
25988c2ecf20Sopenharmony_cissize_t utf8len(struct tree *, const char *);
25998c2ecf20Sopenharmony_cissize_t utf8nlen(struct tree *, const char *, size_t);
26008c2ecf20Sopenharmony_cistruct utf8cursor;
26018c2ecf20Sopenharmony_ciint utf8cursor(struct utf8cursor *, struct tree *, const char *);
26028c2ecf20Sopenharmony_ciint utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
26038c2ecf20Sopenharmony_ciint utf8byte(struct utf8cursor *);
26048c2ecf20Sopenharmony_ci
26058c2ecf20Sopenharmony_ci/*
26068c2ecf20Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
26078c2ecf20Sopenharmony_ci *
26088c2ecf20Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
26098c2ecf20Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
26108c2ecf20Sopenharmony_ci *
26118c2ecf20Sopenharmony_ci * SBase = 0xAC00
26128c2ecf20Sopenharmony_ci * LBase = 0x1100
26138c2ecf20Sopenharmony_ci * VBase = 0x1161
26148c2ecf20Sopenharmony_ci * TBase = 0x11A7
26158c2ecf20Sopenharmony_ci * LCount = 19
26168c2ecf20Sopenharmony_ci * VCount = 21
26178c2ecf20Sopenharmony_ci * TCount = 28
26188c2ecf20Sopenharmony_ci * NCount = 588 (VCount * TCount)
26198c2ecf20Sopenharmony_ci * SCount = 11172 (LCount * NCount)
26208c2ecf20Sopenharmony_ci *
26218c2ecf20Sopenharmony_ci * Decomposition:
26228c2ecf20Sopenharmony_ci *   SIndex = s - SBase
26238c2ecf20Sopenharmony_ci *
26248c2ecf20Sopenharmony_ci * LV (Canonical/Full)
26258c2ecf20Sopenharmony_ci *   LIndex = SIndex / NCount
26268c2ecf20Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
26278c2ecf20Sopenharmony_ci *   LPart = LBase + LIndex
26288c2ecf20Sopenharmony_ci *   VPart = VBase + VIndex
26298c2ecf20Sopenharmony_ci *
26308c2ecf20Sopenharmony_ci * LVT (Canonical)
26318c2ecf20Sopenharmony_ci *   LVIndex = (SIndex / TCount) * TCount
26328c2ecf20Sopenharmony_ci *   TIndex = (Sindex % TCount)
26338c2ecf20Sopenharmony_ci *   LVPart = SBase + LVIndex
26348c2ecf20Sopenharmony_ci *   TPart = TBase + TIndex
26358c2ecf20Sopenharmony_ci *
26368c2ecf20Sopenharmony_ci * LVT (Full)
26378c2ecf20Sopenharmony_ci *   LIndex = SIndex / NCount
26388c2ecf20Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
26398c2ecf20Sopenharmony_ci *   TIndex = (Sindex % TCount)
26408c2ecf20Sopenharmony_ci *   LPart = LBase + LIndex
26418c2ecf20Sopenharmony_ci *   VPart = VBase + VIndex
26428c2ecf20Sopenharmony_ci *   if (TIndex == 0) {
26438c2ecf20Sopenharmony_ci *          d = <LPart, VPart>
26448c2ecf20Sopenharmony_ci *   } else {
26458c2ecf20Sopenharmony_ci *          TPart = TBase + TIndex
26468c2ecf20Sopenharmony_ci *          d = <LPart, VPart, TPart>
26478c2ecf20Sopenharmony_ci *   }
26488c2ecf20Sopenharmony_ci */
26498c2ecf20Sopenharmony_ci
26508c2ecf20Sopenharmony_ci/* Constants */
26518c2ecf20Sopenharmony_ci#define SB	(0xAC00)
26528c2ecf20Sopenharmony_ci#define LB	(0x1100)
26538c2ecf20Sopenharmony_ci#define VB	(0x1161)
26548c2ecf20Sopenharmony_ci#define TB	(0x11A7)
26558c2ecf20Sopenharmony_ci#define LC	(19)
26568c2ecf20Sopenharmony_ci#define VC	(21)
26578c2ecf20Sopenharmony_ci#define TC	(28)
26588c2ecf20Sopenharmony_ci#define NC	(VC * TC)
26598c2ecf20Sopenharmony_ci#define SC	(LC * NC)
26608c2ecf20Sopenharmony_ci
26618c2ecf20Sopenharmony_ci/* Algorithmic decomposition of hangul syllable. */
26628c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
26638c2ecf20Sopenharmony_ci{
26648c2ecf20Sopenharmony_ci	unsigned int	si;
26658c2ecf20Sopenharmony_ci	unsigned int	li;
26668c2ecf20Sopenharmony_ci	unsigned int	vi;
26678c2ecf20Sopenharmony_ci	unsigned int	ti;
26688c2ecf20Sopenharmony_ci	unsigned char	*h;
26698c2ecf20Sopenharmony_ci
26708c2ecf20Sopenharmony_ci	/* Calculate the SI, LI, VI, and TI values. */
26718c2ecf20Sopenharmony_ci	si = utf8decode(str) - SB;
26728c2ecf20Sopenharmony_ci	li = si / NC;
26738c2ecf20Sopenharmony_ci	vi = (si % NC) / TC;
26748c2ecf20Sopenharmony_ci	ti = si % TC;
26758c2ecf20Sopenharmony_ci
26768c2ecf20Sopenharmony_ci	/* Fill in base of leaf. */
26778c2ecf20Sopenharmony_ci	h = hangul;
26788c2ecf20Sopenharmony_ci	LEAF_GEN(h) = 2;
26798c2ecf20Sopenharmony_ci	LEAF_CCC(h) = DECOMPOSE;
26808c2ecf20Sopenharmony_ci	h += 2;
26818c2ecf20Sopenharmony_ci
26828c2ecf20Sopenharmony_ci	/* Add LPart, a 3-byte UTF-8 sequence. */
26838c2ecf20Sopenharmony_ci	h += utf8encode((char *)h, li + LB);
26848c2ecf20Sopenharmony_ci
26858c2ecf20Sopenharmony_ci	/* Add VPart, a 3-byte UTF-8 sequence. */
26868c2ecf20Sopenharmony_ci	h += utf8encode((char *)h, vi + VB);
26878c2ecf20Sopenharmony_ci
26888c2ecf20Sopenharmony_ci	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
26898c2ecf20Sopenharmony_ci	if (ti)
26908c2ecf20Sopenharmony_ci		h += utf8encode((char *)h, ti + TB);
26918c2ecf20Sopenharmony_ci
26928c2ecf20Sopenharmony_ci	/* Terminate string. */
26938c2ecf20Sopenharmony_ci	h[0] = '\0';
26948c2ecf20Sopenharmony_ci
26958c2ecf20Sopenharmony_ci	return hangul;
26968c2ecf20Sopenharmony_ci}
26978c2ecf20Sopenharmony_ci
26988c2ecf20Sopenharmony_ci/*
26998c2ecf20Sopenharmony_ci * Use trie to scan s, touching at most len bytes.
27008c2ecf20Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise.
27018c2ecf20Sopenharmony_ci *
27028c2ecf20Sopenharmony_ci * A non-NULL return guarantees that the UTF-8 sequence starting at s
27038c2ecf20Sopenharmony_ci * is well-formed and corresponds to a known unicode code point.  The
27048c2ecf20Sopenharmony_ci * shorthand for this will be "is valid UTF-8 unicode".
27058c2ecf20Sopenharmony_ci */
27068c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
27078c2ecf20Sopenharmony_ci			       const char *s, size_t len)
27088c2ecf20Sopenharmony_ci{
27098c2ecf20Sopenharmony_ci	utf8trie_t	*trie;
27108c2ecf20Sopenharmony_ci	int		offlen;
27118c2ecf20Sopenharmony_ci	int		offset;
27128c2ecf20Sopenharmony_ci	int		mask;
27138c2ecf20Sopenharmony_ci	int		node;
27148c2ecf20Sopenharmony_ci
27158c2ecf20Sopenharmony_ci	if (!tree)
27168c2ecf20Sopenharmony_ci		return NULL;
27178c2ecf20Sopenharmony_ci	if (len == 0)
27188c2ecf20Sopenharmony_ci		return NULL;
27198c2ecf20Sopenharmony_ci	node = 1;
27208c2ecf20Sopenharmony_ci	trie = utf8data + tree->index;
27218c2ecf20Sopenharmony_ci	while (node) {
27228c2ecf20Sopenharmony_ci		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
27238c2ecf20Sopenharmony_ci		if (*trie & NEXTBYTE) {
27248c2ecf20Sopenharmony_ci			if (--len == 0)
27258c2ecf20Sopenharmony_ci				return NULL;
27268c2ecf20Sopenharmony_ci			s++;
27278c2ecf20Sopenharmony_ci		}
27288c2ecf20Sopenharmony_ci		mask = 1 << (*trie & BITNUM);
27298c2ecf20Sopenharmony_ci		if (*s & mask) {
27308c2ecf20Sopenharmony_ci			/* Right leg */
27318c2ecf20Sopenharmony_ci			if (offlen) {
27328c2ecf20Sopenharmony_ci				/* Right node at offset of trie */
27338c2ecf20Sopenharmony_ci				node = (*trie & RIGHTNODE);
27348c2ecf20Sopenharmony_ci				offset = trie[offlen];
27358c2ecf20Sopenharmony_ci				while (--offlen) {
27368c2ecf20Sopenharmony_ci					offset <<= 8;
27378c2ecf20Sopenharmony_ci					offset |= trie[offlen];
27388c2ecf20Sopenharmony_ci				}
27398c2ecf20Sopenharmony_ci				trie += offset;
27408c2ecf20Sopenharmony_ci			} else if (*trie & RIGHTPATH) {
27418c2ecf20Sopenharmony_ci				/* Right node after this node */
27428c2ecf20Sopenharmony_ci				node = (*trie & TRIENODE);
27438c2ecf20Sopenharmony_ci				trie++;
27448c2ecf20Sopenharmony_ci			} else {
27458c2ecf20Sopenharmony_ci				/* No right node. */
27468c2ecf20Sopenharmony_ci				return NULL;
27478c2ecf20Sopenharmony_ci			}
27488c2ecf20Sopenharmony_ci		} else {
27498c2ecf20Sopenharmony_ci			/* Left leg */
27508c2ecf20Sopenharmony_ci			if (offlen) {
27518c2ecf20Sopenharmony_ci				/* Left node after this node. */
27528c2ecf20Sopenharmony_ci				node = (*trie & LEFTNODE);
27538c2ecf20Sopenharmony_ci				trie += offlen + 1;
27548c2ecf20Sopenharmony_ci			} else if (*trie & RIGHTPATH) {
27558c2ecf20Sopenharmony_ci				/* No left node. */
27568c2ecf20Sopenharmony_ci				return NULL;
27578c2ecf20Sopenharmony_ci			} else {
27588c2ecf20Sopenharmony_ci				/* Left node after this node */
27598c2ecf20Sopenharmony_ci				node = (*trie & TRIENODE);
27608c2ecf20Sopenharmony_ci				trie++;
27618c2ecf20Sopenharmony_ci			}
27628c2ecf20Sopenharmony_ci		}
27638c2ecf20Sopenharmony_ci	}
27648c2ecf20Sopenharmony_ci	/*
27658c2ecf20Sopenharmony_ci	 * Hangul decomposition is done algorithmically. These are the
27668c2ecf20Sopenharmony_ci	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
27678c2ecf20Sopenharmony_ci	 * always 3 bytes long, so s has been advanced twice, and the
27688c2ecf20Sopenharmony_ci	 * start of the sequence is at s-2.
27698c2ecf20Sopenharmony_ci	 */
27708c2ecf20Sopenharmony_ci	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
27718c2ecf20Sopenharmony_ci		trie = utf8hangul(s - 2, hangul);
27728c2ecf20Sopenharmony_ci	return trie;
27738c2ecf20Sopenharmony_ci}
27748c2ecf20Sopenharmony_ci
27758c2ecf20Sopenharmony_ci/*
27768c2ecf20Sopenharmony_ci * Use trie to scan s.
27778c2ecf20Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise.
27788c2ecf20Sopenharmony_ci *
27798c2ecf20Sopenharmony_ci * Forwards to trie_nlookup().
27808c2ecf20Sopenharmony_ci */
27818c2ecf20Sopenharmony_cistatic utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
27828c2ecf20Sopenharmony_ci			      const char *s)
27838c2ecf20Sopenharmony_ci{
27848c2ecf20Sopenharmony_ci	return utf8nlookup(tree, hangul, s, (size_t)-1);
27858c2ecf20Sopenharmony_ci}
27868c2ecf20Sopenharmony_ci
27878c2ecf20Sopenharmony_ci/*
27888c2ecf20Sopenharmony_ci * Return the number of bytes used by the current UTF-8 sequence.
27898c2ecf20Sopenharmony_ci * Assumes the input points to the first byte of a valid UTF-8
27908c2ecf20Sopenharmony_ci * sequence.
27918c2ecf20Sopenharmony_ci */
27928c2ecf20Sopenharmony_cistatic inline int utf8clen(const char *s)
27938c2ecf20Sopenharmony_ci{
27948c2ecf20Sopenharmony_ci	unsigned char c = *s;
27958c2ecf20Sopenharmony_ci	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
27968c2ecf20Sopenharmony_ci}
27978c2ecf20Sopenharmony_ci
27988c2ecf20Sopenharmony_ci/*
27998c2ecf20Sopenharmony_ci * Maximum age of any character in s.
28008c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
28018c2ecf20Sopenharmony_ci * Return 0 if only non-assigned code points are used.
28028c2ecf20Sopenharmony_ci */
28038c2ecf20Sopenharmony_ciint utf8agemax(struct tree *tree, const char *s)
28048c2ecf20Sopenharmony_ci{
28058c2ecf20Sopenharmony_ci	utf8leaf_t	*leaf;
28068c2ecf20Sopenharmony_ci	int		age = 0;
28078c2ecf20Sopenharmony_ci	int		leaf_age;
28088c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
28098c2ecf20Sopenharmony_ci
28108c2ecf20Sopenharmony_ci	if (!tree)
28118c2ecf20Sopenharmony_ci		return -1;
28128c2ecf20Sopenharmony_ci
28138c2ecf20Sopenharmony_ci	while (*s) {
28148c2ecf20Sopenharmony_ci		leaf = utf8lookup(tree, hangul, s);
28158c2ecf20Sopenharmony_ci		if (!leaf)
28168c2ecf20Sopenharmony_ci			return -1;
28178c2ecf20Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
28188c2ecf20Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age > age)
28198c2ecf20Sopenharmony_ci			age = leaf_age;
28208c2ecf20Sopenharmony_ci		s += utf8clen(s);
28218c2ecf20Sopenharmony_ci	}
28228c2ecf20Sopenharmony_ci	return age;
28238c2ecf20Sopenharmony_ci}
28248c2ecf20Sopenharmony_ci
28258c2ecf20Sopenharmony_ci/*
28268c2ecf20Sopenharmony_ci * Minimum age of any character in s.
28278c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
28288c2ecf20Sopenharmony_ci * Return 0 if non-assigned code points are used.
28298c2ecf20Sopenharmony_ci */
28308c2ecf20Sopenharmony_ciint utf8agemin(struct tree *tree, const char *s)
28318c2ecf20Sopenharmony_ci{
28328c2ecf20Sopenharmony_ci	utf8leaf_t	*leaf;
28338c2ecf20Sopenharmony_ci	int		age;
28348c2ecf20Sopenharmony_ci	int		leaf_age;
28358c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
28368c2ecf20Sopenharmony_ci
28378c2ecf20Sopenharmony_ci	if (!tree)
28388c2ecf20Sopenharmony_ci		return -1;
28398c2ecf20Sopenharmony_ci	age = tree->maxage;
28408c2ecf20Sopenharmony_ci	while (*s) {
28418c2ecf20Sopenharmony_ci		leaf = utf8lookup(tree, hangul, s);
28428c2ecf20Sopenharmony_ci		if (!leaf)
28438c2ecf20Sopenharmony_ci			return -1;
28448c2ecf20Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
28458c2ecf20Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age < age)
28468c2ecf20Sopenharmony_ci			age = leaf_age;
28478c2ecf20Sopenharmony_ci		s += utf8clen(s);
28488c2ecf20Sopenharmony_ci	}
28498c2ecf20Sopenharmony_ci	return age;
28508c2ecf20Sopenharmony_ci}
28518c2ecf20Sopenharmony_ci
28528c2ecf20Sopenharmony_ci/*
28538c2ecf20Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes.
28548c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
28558c2ecf20Sopenharmony_ci */
28568c2ecf20Sopenharmony_ciint utf8nagemax(struct tree *tree, const char *s, size_t len)
28578c2ecf20Sopenharmony_ci{
28588c2ecf20Sopenharmony_ci	utf8leaf_t	*leaf;
28598c2ecf20Sopenharmony_ci	int		age = 0;
28608c2ecf20Sopenharmony_ci	int		leaf_age;
28618c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
28628c2ecf20Sopenharmony_ci
28638c2ecf20Sopenharmony_ci	if (!tree)
28648c2ecf20Sopenharmony_ci		return -1;
28658c2ecf20Sopenharmony_ci
28668c2ecf20Sopenharmony_ci        while (len && *s) {
28678c2ecf20Sopenharmony_ci		leaf = utf8nlookup(tree, hangul, s, len);
28688c2ecf20Sopenharmony_ci		if (!leaf)
28698c2ecf20Sopenharmony_ci			return -1;
28708c2ecf20Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
28718c2ecf20Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age > age)
28728c2ecf20Sopenharmony_ci			age = leaf_age;
28738c2ecf20Sopenharmony_ci		len -= utf8clen(s);
28748c2ecf20Sopenharmony_ci		s += utf8clen(s);
28758c2ecf20Sopenharmony_ci	}
28768c2ecf20Sopenharmony_ci	return age;
28778c2ecf20Sopenharmony_ci}
28788c2ecf20Sopenharmony_ci
28798c2ecf20Sopenharmony_ci/*
28808c2ecf20Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes.
28818c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
28828c2ecf20Sopenharmony_ci */
28838c2ecf20Sopenharmony_ciint utf8nagemin(struct tree *tree, const char *s, size_t len)
28848c2ecf20Sopenharmony_ci{
28858c2ecf20Sopenharmony_ci	utf8leaf_t	*leaf;
28868c2ecf20Sopenharmony_ci	int		leaf_age;
28878c2ecf20Sopenharmony_ci	int		age;
28888c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
28898c2ecf20Sopenharmony_ci
28908c2ecf20Sopenharmony_ci	if (!tree)
28918c2ecf20Sopenharmony_ci		return -1;
28928c2ecf20Sopenharmony_ci	age = tree->maxage;
28938c2ecf20Sopenharmony_ci        while (len && *s) {
28948c2ecf20Sopenharmony_ci		leaf = utf8nlookup(tree, hangul, s, len);
28958c2ecf20Sopenharmony_ci		if (!leaf)
28968c2ecf20Sopenharmony_ci			return -1;
28978c2ecf20Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
28988c2ecf20Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age < age)
28998c2ecf20Sopenharmony_ci			age = leaf_age;
29008c2ecf20Sopenharmony_ci		len -= utf8clen(s);
29018c2ecf20Sopenharmony_ci		s += utf8clen(s);
29028c2ecf20Sopenharmony_ci	}
29038c2ecf20Sopenharmony_ci	return age;
29048c2ecf20Sopenharmony_ci}
29058c2ecf20Sopenharmony_ci
29068c2ecf20Sopenharmony_ci/*
29078c2ecf20Sopenharmony_ci * Length of the normalization of s.
29088c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
29098c2ecf20Sopenharmony_ci *
29108c2ecf20Sopenharmony_ci * A string of Default_Ignorable_Code_Point has length 0.
29118c2ecf20Sopenharmony_ci */
29128c2ecf20Sopenharmony_cissize_t utf8len(struct tree *tree, const char *s)
29138c2ecf20Sopenharmony_ci{
29148c2ecf20Sopenharmony_ci	utf8leaf_t	*leaf;
29158c2ecf20Sopenharmony_ci	size_t		ret = 0;
29168c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
29178c2ecf20Sopenharmony_ci
29188c2ecf20Sopenharmony_ci	if (!tree)
29198c2ecf20Sopenharmony_ci		return -1;
29208c2ecf20Sopenharmony_ci	while (*s) {
29218c2ecf20Sopenharmony_ci		leaf = utf8lookup(tree, hangul, s);
29228c2ecf20Sopenharmony_ci		if (!leaf)
29238c2ecf20Sopenharmony_ci			return -1;
29248c2ecf20Sopenharmony_ci		if (ages[LEAF_GEN(leaf)] > tree->maxage)
29258c2ecf20Sopenharmony_ci			ret += utf8clen(s);
29268c2ecf20Sopenharmony_ci		else if (LEAF_CCC(leaf) == DECOMPOSE)
29278c2ecf20Sopenharmony_ci			ret += strlen(LEAF_STR(leaf));
29288c2ecf20Sopenharmony_ci		else
29298c2ecf20Sopenharmony_ci			ret += utf8clen(s);
29308c2ecf20Sopenharmony_ci		s += utf8clen(s);
29318c2ecf20Sopenharmony_ci	}
29328c2ecf20Sopenharmony_ci	return ret;
29338c2ecf20Sopenharmony_ci}
29348c2ecf20Sopenharmony_ci
29358c2ecf20Sopenharmony_ci/*
29368c2ecf20Sopenharmony_ci * Length of the normalization of s, touch at most len bytes.
29378c2ecf20Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
29388c2ecf20Sopenharmony_ci */
29398c2ecf20Sopenharmony_cissize_t utf8nlen(struct tree *tree, const char *s, size_t len)
29408c2ecf20Sopenharmony_ci{
29418c2ecf20Sopenharmony_ci	utf8leaf_t	*leaf;
29428c2ecf20Sopenharmony_ci	size_t		ret = 0;
29438c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
29448c2ecf20Sopenharmony_ci
29458c2ecf20Sopenharmony_ci	if (!tree)
29468c2ecf20Sopenharmony_ci		return -1;
29478c2ecf20Sopenharmony_ci	while (len && *s) {
29488c2ecf20Sopenharmony_ci		leaf = utf8nlookup(tree, hangul, s, len);
29498c2ecf20Sopenharmony_ci		if (!leaf)
29508c2ecf20Sopenharmony_ci			return -1;
29518c2ecf20Sopenharmony_ci		if (ages[LEAF_GEN(leaf)] > tree->maxage)
29528c2ecf20Sopenharmony_ci			ret += utf8clen(s);
29538c2ecf20Sopenharmony_ci		else if (LEAF_CCC(leaf) == DECOMPOSE)
29548c2ecf20Sopenharmony_ci			ret += strlen(LEAF_STR(leaf));
29558c2ecf20Sopenharmony_ci		else
29568c2ecf20Sopenharmony_ci			ret += utf8clen(s);
29578c2ecf20Sopenharmony_ci		len -= utf8clen(s);
29588c2ecf20Sopenharmony_ci		s += utf8clen(s);
29598c2ecf20Sopenharmony_ci	}
29608c2ecf20Sopenharmony_ci	return ret;
29618c2ecf20Sopenharmony_ci}
29628c2ecf20Sopenharmony_ci
29638c2ecf20Sopenharmony_ci/*
29648c2ecf20Sopenharmony_ci * Cursor structure used by the normalizer.
29658c2ecf20Sopenharmony_ci */
29668c2ecf20Sopenharmony_cistruct utf8cursor {
29678c2ecf20Sopenharmony_ci	struct tree	*tree;
29688c2ecf20Sopenharmony_ci	const char	*s;
29698c2ecf20Sopenharmony_ci	const char	*p;
29708c2ecf20Sopenharmony_ci	const char	*ss;
29718c2ecf20Sopenharmony_ci	const char	*sp;
29728c2ecf20Sopenharmony_ci	unsigned int	len;
29738c2ecf20Sopenharmony_ci	unsigned int	slen;
29748c2ecf20Sopenharmony_ci	short int	ccc;
29758c2ecf20Sopenharmony_ci	short int	nccc;
29768c2ecf20Sopenharmony_ci	unsigned int	unichar;
29778c2ecf20Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
29788c2ecf20Sopenharmony_ci};
29798c2ecf20Sopenharmony_ci
29808c2ecf20Sopenharmony_ci/*
29818c2ecf20Sopenharmony_ci * Set up an utf8cursor for use by utf8byte().
29828c2ecf20Sopenharmony_ci *
29838c2ecf20Sopenharmony_ci *   s      : string.
29848c2ecf20Sopenharmony_ci *   len    : length of s.
29858c2ecf20Sopenharmony_ci *   u8c    : pointer to cursor.
29868c2ecf20Sopenharmony_ci *   trie   : utf8trie_t to use for normalization.
29878c2ecf20Sopenharmony_ci *
29888c2ecf20Sopenharmony_ci * Returns -1 on error, 0 on success.
29898c2ecf20Sopenharmony_ci */
29908c2ecf20Sopenharmony_ciint utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
29918c2ecf20Sopenharmony_ci		size_t len)
29928c2ecf20Sopenharmony_ci{
29938c2ecf20Sopenharmony_ci	if (!tree)
29948c2ecf20Sopenharmony_ci		return -1;
29958c2ecf20Sopenharmony_ci	if (!s)
29968c2ecf20Sopenharmony_ci		return -1;
29978c2ecf20Sopenharmony_ci	u8c->tree = tree;
29988c2ecf20Sopenharmony_ci	u8c->s = s;
29998c2ecf20Sopenharmony_ci	u8c->p = NULL;
30008c2ecf20Sopenharmony_ci	u8c->ss = NULL;
30018c2ecf20Sopenharmony_ci	u8c->sp = NULL;
30028c2ecf20Sopenharmony_ci	u8c->len = len;
30038c2ecf20Sopenharmony_ci	u8c->slen = 0;
30048c2ecf20Sopenharmony_ci	u8c->ccc = STOPPER;
30058c2ecf20Sopenharmony_ci	u8c->nccc = STOPPER;
30068c2ecf20Sopenharmony_ci	u8c->unichar = 0;
30078c2ecf20Sopenharmony_ci	/* Check we didn't clobber the maximum length. */
30088c2ecf20Sopenharmony_ci	if (u8c->len != len)
30098c2ecf20Sopenharmony_ci		return -1;
30108c2ecf20Sopenharmony_ci	/* The first byte of s may not be an utf8 continuation. */
30118c2ecf20Sopenharmony_ci	if (len > 0 && (*s & 0xC0) == 0x80)
30128c2ecf20Sopenharmony_ci		return -1;
30138c2ecf20Sopenharmony_ci	return 0;
30148c2ecf20Sopenharmony_ci}
30158c2ecf20Sopenharmony_ci
30168c2ecf20Sopenharmony_ci/*
30178c2ecf20Sopenharmony_ci * Set up an utf8cursor for use by utf8byte().
30188c2ecf20Sopenharmony_ci *
30198c2ecf20Sopenharmony_ci *   s      : NUL-terminated string.
30208c2ecf20Sopenharmony_ci *   u8c    : pointer to cursor.
30218c2ecf20Sopenharmony_ci *   trie   : utf8trie_t to use for normalization.
30228c2ecf20Sopenharmony_ci *
30238c2ecf20Sopenharmony_ci * Returns -1 on error, 0 on success.
30248c2ecf20Sopenharmony_ci */
30258c2ecf20Sopenharmony_ciint utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
30268c2ecf20Sopenharmony_ci{
30278c2ecf20Sopenharmony_ci	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
30288c2ecf20Sopenharmony_ci}
30298c2ecf20Sopenharmony_ci
30308c2ecf20Sopenharmony_ci/*
30318c2ecf20Sopenharmony_ci * Get one byte from the normalized form of the string described by u8c.
30328c2ecf20Sopenharmony_ci *
30338c2ecf20Sopenharmony_ci * Returns the byte cast to an unsigned char on succes, and -1 on failure.
30348c2ecf20Sopenharmony_ci *
30358c2ecf20Sopenharmony_ci * The cursor keeps track of the location in the string in u8c->s.
30368c2ecf20Sopenharmony_ci * When a character is decomposed, the current location is stored in
30378c2ecf20Sopenharmony_ci * u8c->p, and u8c->s is set to the start of the decomposition. Note
30388c2ecf20Sopenharmony_ci * that bytes from a decomposition do not count against u8c->len.
30398c2ecf20Sopenharmony_ci *
30408c2ecf20Sopenharmony_ci * Characters are emitted if they match the current CCC in u8c->ccc.
30418c2ecf20Sopenharmony_ci * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
30428c2ecf20Sopenharmony_ci * and the function returns 0 in that case.
30438c2ecf20Sopenharmony_ci *
30448c2ecf20Sopenharmony_ci * Sorting by CCC is done by repeatedly scanning the string.  The
30458c2ecf20Sopenharmony_ci * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
30468c2ecf20Sopenharmony_ci * the start of the scan.  The first pass finds the lowest CCC to be
30478c2ecf20Sopenharmony_ci * emitted and stores it in u8c->nccc, the second pass emits the
30488c2ecf20Sopenharmony_ci * characters with this CCC and finds the next lowest CCC. This limits
30498c2ecf20Sopenharmony_ci * the number of passes to 1 + the number of different CCCs in the
30508c2ecf20Sopenharmony_ci * sequence being scanned.
30518c2ecf20Sopenharmony_ci *
30528c2ecf20Sopenharmony_ci * Therefore:
30538c2ecf20Sopenharmony_ci *  u8c->p  != NULL -> a decomposition is being scanned.
30548c2ecf20Sopenharmony_ci *  u8c->ss != NULL -> this is a repeating scan.
30558c2ecf20Sopenharmony_ci *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
30568c2ecf20Sopenharmony_ci */
30578c2ecf20Sopenharmony_ciint utf8byte(struct utf8cursor *u8c)
30588c2ecf20Sopenharmony_ci{
30598c2ecf20Sopenharmony_ci	utf8leaf_t *leaf;
30608c2ecf20Sopenharmony_ci	int ccc;
30618c2ecf20Sopenharmony_ci
30628c2ecf20Sopenharmony_ci	for (;;) {
30638c2ecf20Sopenharmony_ci		/* Check for the end of a decomposed character. */
30648c2ecf20Sopenharmony_ci		if (u8c->p && *u8c->s == '\0') {
30658c2ecf20Sopenharmony_ci			u8c->s = u8c->p;
30668c2ecf20Sopenharmony_ci			u8c->p = NULL;
30678c2ecf20Sopenharmony_ci		}
30688c2ecf20Sopenharmony_ci
30698c2ecf20Sopenharmony_ci		/* Check for end-of-string. */
30708c2ecf20Sopenharmony_ci		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
30718c2ecf20Sopenharmony_ci			/* There is no next byte. */
30728c2ecf20Sopenharmony_ci			if (u8c->ccc == STOPPER)
30738c2ecf20Sopenharmony_ci				return 0;
30748c2ecf20Sopenharmony_ci			/* End-of-string during a scan counts as a stopper. */
30758c2ecf20Sopenharmony_ci			ccc = STOPPER;
30768c2ecf20Sopenharmony_ci			goto ccc_mismatch;
30778c2ecf20Sopenharmony_ci		} else if ((*u8c->s & 0xC0) == 0x80) {
30788c2ecf20Sopenharmony_ci			/* This is a continuation of the current character. */
30798c2ecf20Sopenharmony_ci			if (!u8c->p)
30808c2ecf20Sopenharmony_ci				u8c->len--;
30818c2ecf20Sopenharmony_ci			return (unsigned char)*u8c->s++;
30828c2ecf20Sopenharmony_ci		}
30838c2ecf20Sopenharmony_ci
30848c2ecf20Sopenharmony_ci		/* Look up the data for the current character. */
30858c2ecf20Sopenharmony_ci		if (u8c->p) {
30868c2ecf20Sopenharmony_ci			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
30878c2ecf20Sopenharmony_ci		} else {
30888c2ecf20Sopenharmony_ci			leaf = utf8nlookup(u8c->tree, u8c->hangul,
30898c2ecf20Sopenharmony_ci					   u8c->s, u8c->len);
30908c2ecf20Sopenharmony_ci		}
30918c2ecf20Sopenharmony_ci
30928c2ecf20Sopenharmony_ci		/* No leaf found implies that the input is a binary blob. */
30938c2ecf20Sopenharmony_ci		if (!leaf)
30948c2ecf20Sopenharmony_ci			return -1;
30958c2ecf20Sopenharmony_ci
30968c2ecf20Sopenharmony_ci		/* Characters that are too new have CCC 0. */
30978c2ecf20Sopenharmony_ci		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
30988c2ecf20Sopenharmony_ci			ccc = STOPPER;
30998c2ecf20Sopenharmony_ci		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
31008c2ecf20Sopenharmony_ci			u8c->len -= utf8clen(u8c->s);
31018c2ecf20Sopenharmony_ci			u8c->p = u8c->s + utf8clen(u8c->s);
31028c2ecf20Sopenharmony_ci			u8c->s = LEAF_STR(leaf);
31038c2ecf20Sopenharmony_ci			/* Empty decomposition implies CCC 0. */
31048c2ecf20Sopenharmony_ci			if (*u8c->s == '\0') {
31058c2ecf20Sopenharmony_ci				if (u8c->ccc == STOPPER)
31068c2ecf20Sopenharmony_ci					continue;
31078c2ecf20Sopenharmony_ci				ccc = STOPPER;
31088c2ecf20Sopenharmony_ci				goto ccc_mismatch;
31098c2ecf20Sopenharmony_ci			}
31108c2ecf20Sopenharmony_ci			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
31118c2ecf20Sopenharmony_ci			ccc = LEAF_CCC(leaf);
31128c2ecf20Sopenharmony_ci		}
31138c2ecf20Sopenharmony_ci		u8c->unichar = utf8decode(u8c->s);
31148c2ecf20Sopenharmony_ci
31158c2ecf20Sopenharmony_ci		/*
31168c2ecf20Sopenharmony_ci		 * If this is not a stopper, then see if it updates
31178c2ecf20Sopenharmony_ci		 * the next canonical class to be emitted.
31188c2ecf20Sopenharmony_ci		 */
31198c2ecf20Sopenharmony_ci		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
31208c2ecf20Sopenharmony_ci			u8c->nccc = ccc;
31218c2ecf20Sopenharmony_ci
31228c2ecf20Sopenharmony_ci		/*
31238c2ecf20Sopenharmony_ci		 * Return the current byte if this is the current
31248c2ecf20Sopenharmony_ci		 * combining class.
31258c2ecf20Sopenharmony_ci		 */
31268c2ecf20Sopenharmony_ci		if (ccc == u8c->ccc) {
31278c2ecf20Sopenharmony_ci			if (!u8c->p)
31288c2ecf20Sopenharmony_ci				u8c->len--;
31298c2ecf20Sopenharmony_ci			return (unsigned char)*u8c->s++;
31308c2ecf20Sopenharmony_ci		}
31318c2ecf20Sopenharmony_ci
31328c2ecf20Sopenharmony_ci		/* Current combining class mismatch. */
31338c2ecf20Sopenharmony_ci	ccc_mismatch:
31348c2ecf20Sopenharmony_ci		if (u8c->nccc == STOPPER) {
31358c2ecf20Sopenharmony_ci			/*
31368c2ecf20Sopenharmony_ci			 * Scan forward for the first canonical class
31378c2ecf20Sopenharmony_ci			 * to be emitted.  Save the position from
31388c2ecf20Sopenharmony_ci			 * which to restart.
31398c2ecf20Sopenharmony_ci			 */
31408c2ecf20Sopenharmony_ci			assert(u8c->ccc == STOPPER);
31418c2ecf20Sopenharmony_ci			u8c->ccc = MINCCC - 1;
31428c2ecf20Sopenharmony_ci			u8c->nccc = ccc;
31438c2ecf20Sopenharmony_ci			u8c->sp = u8c->p;
31448c2ecf20Sopenharmony_ci			u8c->ss = u8c->s;
31458c2ecf20Sopenharmony_ci			u8c->slen = u8c->len;
31468c2ecf20Sopenharmony_ci			if (!u8c->p)
31478c2ecf20Sopenharmony_ci				u8c->len -= utf8clen(u8c->s);
31488c2ecf20Sopenharmony_ci			u8c->s += utf8clen(u8c->s);
31498c2ecf20Sopenharmony_ci		} else if (ccc != STOPPER) {
31508c2ecf20Sopenharmony_ci			/* Not a stopper, and not the ccc we're emitting. */
31518c2ecf20Sopenharmony_ci			if (!u8c->p)
31528c2ecf20Sopenharmony_ci				u8c->len -= utf8clen(u8c->s);
31538c2ecf20Sopenharmony_ci			u8c->s += utf8clen(u8c->s);
31548c2ecf20Sopenharmony_ci		} else if (u8c->nccc != MAXCCC + 1) {
31558c2ecf20Sopenharmony_ci			/* At a stopper, restart for next ccc. */
31568c2ecf20Sopenharmony_ci			u8c->ccc = u8c->nccc;
31578c2ecf20Sopenharmony_ci			u8c->nccc = MAXCCC + 1;
31588c2ecf20Sopenharmony_ci			u8c->s = u8c->ss;
31598c2ecf20Sopenharmony_ci			u8c->p = u8c->sp;
31608c2ecf20Sopenharmony_ci			u8c->len = u8c->slen;
31618c2ecf20Sopenharmony_ci		} else {
31628c2ecf20Sopenharmony_ci			/* All done, proceed from here. */
31638c2ecf20Sopenharmony_ci			u8c->ccc = STOPPER;
31648c2ecf20Sopenharmony_ci			u8c->nccc = STOPPER;
31658c2ecf20Sopenharmony_ci			u8c->sp = NULL;
31668c2ecf20Sopenharmony_ci			u8c->ss = NULL;
31678c2ecf20Sopenharmony_ci			u8c->slen = 0;
31688c2ecf20Sopenharmony_ci		}
31698c2ecf20Sopenharmony_ci	}
31708c2ecf20Sopenharmony_ci}
31718c2ecf20Sopenharmony_ci
31728c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
31738c2ecf20Sopenharmony_ci
31748c2ecf20Sopenharmony_cistatic int normalize_line(struct tree *tree)
31758c2ecf20Sopenharmony_ci{
31768c2ecf20Sopenharmony_ci	char *s;
31778c2ecf20Sopenharmony_ci	char *t;
31788c2ecf20Sopenharmony_ci	int c;
31798c2ecf20Sopenharmony_ci	struct utf8cursor u8c;
31808c2ecf20Sopenharmony_ci
31818c2ecf20Sopenharmony_ci	/* First test: null-terminated string. */
31828c2ecf20Sopenharmony_ci	s = buf2;
31838c2ecf20Sopenharmony_ci	t = buf3;
31848c2ecf20Sopenharmony_ci	if (utf8cursor(&u8c, tree, s))
31858c2ecf20Sopenharmony_ci		return -1;
31868c2ecf20Sopenharmony_ci	while ((c = utf8byte(&u8c)) > 0)
31878c2ecf20Sopenharmony_ci		if (c != (unsigned char)*t++)
31888c2ecf20Sopenharmony_ci			return -1;
31898c2ecf20Sopenharmony_ci	if (c < 0)
31908c2ecf20Sopenharmony_ci		return -1;
31918c2ecf20Sopenharmony_ci	if (*t != 0)
31928c2ecf20Sopenharmony_ci		return -1;
31938c2ecf20Sopenharmony_ci
31948c2ecf20Sopenharmony_ci	/* Second test: length-limited string. */
31958c2ecf20Sopenharmony_ci	s = buf2;
31968c2ecf20Sopenharmony_ci	/* Replace NUL with a value that will cause an error if seen. */
31978c2ecf20Sopenharmony_ci	s[strlen(s) + 1] = -1;
31988c2ecf20Sopenharmony_ci	t = buf3;
31998c2ecf20Sopenharmony_ci	if (utf8cursor(&u8c, tree, s))
32008c2ecf20Sopenharmony_ci		return -1;
32018c2ecf20Sopenharmony_ci	while ((c = utf8byte(&u8c)) > 0)
32028c2ecf20Sopenharmony_ci		if (c != (unsigned char)*t++)
32038c2ecf20Sopenharmony_ci			return -1;
32048c2ecf20Sopenharmony_ci	if (c < 0)
32058c2ecf20Sopenharmony_ci		return -1;
32068c2ecf20Sopenharmony_ci	if (*t != 0)
32078c2ecf20Sopenharmony_ci		return -1;
32088c2ecf20Sopenharmony_ci
32098c2ecf20Sopenharmony_ci	return 0;
32108c2ecf20Sopenharmony_ci}
32118c2ecf20Sopenharmony_ci
32128c2ecf20Sopenharmony_cistatic void normalization_test(void)
32138c2ecf20Sopenharmony_ci{
32148c2ecf20Sopenharmony_ci	FILE *file;
32158c2ecf20Sopenharmony_ci	unsigned int unichar;
32168c2ecf20Sopenharmony_ci	struct unicode_data *data;
32178c2ecf20Sopenharmony_ci	char *s;
32188c2ecf20Sopenharmony_ci	char *t;
32198c2ecf20Sopenharmony_ci	int ret;
32208c2ecf20Sopenharmony_ci	int ignorables;
32218c2ecf20Sopenharmony_ci	int tests = 0;
32228c2ecf20Sopenharmony_ci	int failures = 0;
32238c2ecf20Sopenharmony_ci
32248c2ecf20Sopenharmony_ci	if (verbose > 0)
32258c2ecf20Sopenharmony_ci		printf("Parsing %s\n", test_name);
32268c2ecf20Sopenharmony_ci	/* Step one, read data from file. */
32278c2ecf20Sopenharmony_ci	file = fopen(test_name, "r");
32288c2ecf20Sopenharmony_ci	if (!file)
32298c2ecf20Sopenharmony_ci		open_fail(test_name, errno);
32308c2ecf20Sopenharmony_ci
32318c2ecf20Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
32328c2ecf20Sopenharmony_ci		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
32338c2ecf20Sopenharmony_ci			     buf0, buf1);
32348c2ecf20Sopenharmony_ci		if (ret != 2 || *line == '#')
32358c2ecf20Sopenharmony_ci			continue;
32368c2ecf20Sopenharmony_ci		s = buf0;
32378c2ecf20Sopenharmony_ci		t = buf2;
32388c2ecf20Sopenharmony_ci		while (*s) {
32398c2ecf20Sopenharmony_ci			unichar = strtoul(s, &s, 16);
32408c2ecf20Sopenharmony_ci			t += utf8encode(t, unichar);
32418c2ecf20Sopenharmony_ci		}
32428c2ecf20Sopenharmony_ci		*t = '\0';
32438c2ecf20Sopenharmony_ci
32448c2ecf20Sopenharmony_ci		ignorables = 0;
32458c2ecf20Sopenharmony_ci		s = buf1;
32468c2ecf20Sopenharmony_ci		t = buf3;
32478c2ecf20Sopenharmony_ci		while (*s) {
32488c2ecf20Sopenharmony_ci			unichar = strtoul(s, &s, 16);
32498c2ecf20Sopenharmony_ci			data = &unicode_data[unichar];
32508c2ecf20Sopenharmony_ci			if (data->utf8nfdi && !*data->utf8nfdi)
32518c2ecf20Sopenharmony_ci				ignorables = 1;
32528c2ecf20Sopenharmony_ci			else
32538c2ecf20Sopenharmony_ci				t += utf8encode(t, unichar);
32548c2ecf20Sopenharmony_ci		}
32558c2ecf20Sopenharmony_ci		*t = '\0';
32568c2ecf20Sopenharmony_ci
32578c2ecf20Sopenharmony_ci		tests++;
32588c2ecf20Sopenharmony_ci		if (normalize_line(nfdi_tree) < 0) {
32598c2ecf20Sopenharmony_ci			printf("Line %s -> %s", buf0, buf1);
32608c2ecf20Sopenharmony_ci			if (ignorables)
32618c2ecf20Sopenharmony_ci				printf(" (ignorables removed)");
32628c2ecf20Sopenharmony_ci			printf(" failure\n");
32638c2ecf20Sopenharmony_ci			failures++;
32648c2ecf20Sopenharmony_ci		}
32658c2ecf20Sopenharmony_ci	}
32668c2ecf20Sopenharmony_ci	fclose(file);
32678c2ecf20Sopenharmony_ci	if (verbose > 0)
32688c2ecf20Sopenharmony_ci		printf("Ran %d tests with %d failures\n", tests, failures);
32698c2ecf20Sopenharmony_ci	if (failures)
32708c2ecf20Sopenharmony_ci		file_fail(test_name);
32718c2ecf20Sopenharmony_ci}
32728c2ecf20Sopenharmony_ci
32738c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
32748c2ecf20Sopenharmony_ci
32758c2ecf20Sopenharmony_cistatic void write_file(void)
32768c2ecf20Sopenharmony_ci{
32778c2ecf20Sopenharmony_ci	FILE *file;
32788c2ecf20Sopenharmony_ci	int i;
32798c2ecf20Sopenharmony_ci	int j;
32808c2ecf20Sopenharmony_ci	int t;
32818c2ecf20Sopenharmony_ci	int gen;
32828c2ecf20Sopenharmony_ci
32838c2ecf20Sopenharmony_ci	if (verbose > 0)
32848c2ecf20Sopenharmony_ci		printf("Writing %s\n", utf8_name);
32858c2ecf20Sopenharmony_ci	file = fopen(utf8_name, "w");
32868c2ecf20Sopenharmony_ci	if (!file)
32878c2ecf20Sopenharmony_ci		open_fail(utf8_name, errno);
32888c2ecf20Sopenharmony_ci
32898c2ecf20Sopenharmony_ci	fprintf(file, "/* This file is generated code, do not edit. */\n");
32908c2ecf20Sopenharmony_ci	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
32918c2ecf20Sopenharmony_ci	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
32928c2ecf20Sopenharmony_ci	fprintf(file, "#endif\n");
32938c2ecf20Sopenharmony_ci	fprintf(file, "\n");
32948c2ecf20Sopenharmony_ci	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
32958c2ecf20Sopenharmony_ci		unicode_maxage);
32968c2ecf20Sopenharmony_ci	fprintf(file, "\n");
32978c2ecf20Sopenharmony_ci	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
32988c2ecf20Sopenharmony_ci	for (i = 0; i != ages_count; i++)
32998c2ecf20Sopenharmony_ci		fprintf(file, "\t%#x%s\n", ages[i],
33008c2ecf20Sopenharmony_ci			ages[i] == unicode_maxage ? "" : ",");
33018c2ecf20Sopenharmony_ci	fprintf(file, "};\n");
33028c2ecf20Sopenharmony_ci	fprintf(file, "\n");
33038c2ecf20Sopenharmony_ci	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
33048c2ecf20Sopenharmony_ci	t = 0;
33058c2ecf20Sopenharmony_ci	for (gen = 0; gen < ages_count; gen++) {
33068c2ecf20Sopenharmony_ci		fprintf(file, "\t{ %#x, %d }%s\n",
33078c2ecf20Sopenharmony_ci			ages[gen], trees[t].index,
33088c2ecf20Sopenharmony_ci			ages[gen] == unicode_maxage ? "" : ",");
33098c2ecf20Sopenharmony_ci		if (trees[t].maxage == ages[gen])
33108c2ecf20Sopenharmony_ci			t += 2;
33118c2ecf20Sopenharmony_ci	}
33128c2ecf20Sopenharmony_ci	fprintf(file, "};\n");
33138c2ecf20Sopenharmony_ci	fprintf(file, "\n");
33148c2ecf20Sopenharmony_ci	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
33158c2ecf20Sopenharmony_ci	t = 1;
33168c2ecf20Sopenharmony_ci	for (gen = 0; gen < ages_count; gen++) {
33178c2ecf20Sopenharmony_ci		fprintf(file, "\t{ %#x, %d }%s\n",
33188c2ecf20Sopenharmony_ci			ages[gen], trees[t].index,
33198c2ecf20Sopenharmony_ci			ages[gen] == unicode_maxage ? "" : ",");
33208c2ecf20Sopenharmony_ci		if (trees[t].maxage == ages[gen])
33218c2ecf20Sopenharmony_ci			t += 2;
33228c2ecf20Sopenharmony_ci	}
33238c2ecf20Sopenharmony_ci	fprintf(file, "};\n");
33248c2ecf20Sopenharmony_ci	fprintf(file, "\n");
33258c2ecf20Sopenharmony_ci	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
33268c2ecf20Sopenharmony_ci		utf8data_size);
33278c2ecf20Sopenharmony_ci	t = 0;
33288c2ecf20Sopenharmony_ci	for (i = 0; i != utf8data_size; i += 16) {
33298c2ecf20Sopenharmony_ci		if (i == trees[t].index) {
33308c2ecf20Sopenharmony_ci			fprintf(file, "\t/* %s_%x */\n",
33318c2ecf20Sopenharmony_ci				trees[t].type, trees[t].maxage);
33328c2ecf20Sopenharmony_ci			if (t < trees_count-1)
33338c2ecf20Sopenharmony_ci				t++;
33348c2ecf20Sopenharmony_ci		}
33358c2ecf20Sopenharmony_ci		fprintf(file, "\t");
33368c2ecf20Sopenharmony_ci		for (j = i; j != i + 16; j++)
33378c2ecf20Sopenharmony_ci			fprintf(file, "0x%.2x%s", utf8data[j],
33388c2ecf20Sopenharmony_ci				(j < utf8data_size -1 ? "," : ""));
33398c2ecf20Sopenharmony_ci		fprintf(file, "\n");
33408c2ecf20Sopenharmony_ci	}
33418c2ecf20Sopenharmony_ci	fprintf(file, "};\n");
33428c2ecf20Sopenharmony_ci	fclose(file);
33438c2ecf20Sopenharmony_ci}
33448c2ecf20Sopenharmony_ci
33458c2ecf20Sopenharmony_ci/* ------------------------------------------------------------------ */
33468c2ecf20Sopenharmony_ci
33478c2ecf20Sopenharmony_ciint main(int argc, char *argv[])
33488c2ecf20Sopenharmony_ci{
33498c2ecf20Sopenharmony_ci	unsigned int unichar;
33508c2ecf20Sopenharmony_ci	int opt;
33518c2ecf20Sopenharmony_ci
33528c2ecf20Sopenharmony_ci	argv0 = argv[0];
33538c2ecf20Sopenharmony_ci
33548c2ecf20Sopenharmony_ci	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
33558c2ecf20Sopenharmony_ci		switch (opt) {
33568c2ecf20Sopenharmony_ci		case 'a':
33578c2ecf20Sopenharmony_ci			age_name = optarg;
33588c2ecf20Sopenharmony_ci			break;
33598c2ecf20Sopenharmony_ci		case 'c':
33608c2ecf20Sopenharmony_ci			ccc_name = optarg;
33618c2ecf20Sopenharmony_ci			break;
33628c2ecf20Sopenharmony_ci		case 'd':
33638c2ecf20Sopenharmony_ci			data_name = optarg;
33648c2ecf20Sopenharmony_ci			break;
33658c2ecf20Sopenharmony_ci		case 'f':
33668c2ecf20Sopenharmony_ci			fold_name = optarg;
33678c2ecf20Sopenharmony_ci			break;
33688c2ecf20Sopenharmony_ci		case 'n':
33698c2ecf20Sopenharmony_ci			norm_name = optarg;
33708c2ecf20Sopenharmony_ci			break;
33718c2ecf20Sopenharmony_ci		case 'o':
33728c2ecf20Sopenharmony_ci			utf8_name = optarg;
33738c2ecf20Sopenharmony_ci			break;
33748c2ecf20Sopenharmony_ci		case 'p':
33758c2ecf20Sopenharmony_ci			prop_name = optarg;
33768c2ecf20Sopenharmony_ci			break;
33778c2ecf20Sopenharmony_ci		case 't':
33788c2ecf20Sopenharmony_ci			test_name = optarg;
33798c2ecf20Sopenharmony_ci			break;
33808c2ecf20Sopenharmony_ci		case 'v':
33818c2ecf20Sopenharmony_ci			verbose++;
33828c2ecf20Sopenharmony_ci			break;
33838c2ecf20Sopenharmony_ci		case 'h':
33848c2ecf20Sopenharmony_ci			help();
33858c2ecf20Sopenharmony_ci			exit(0);
33868c2ecf20Sopenharmony_ci		default:
33878c2ecf20Sopenharmony_ci			usage();
33888c2ecf20Sopenharmony_ci		}
33898c2ecf20Sopenharmony_ci	}
33908c2ecf20Sopenharmony_ci
33918c2ecf20Sopenharmony_ci	if (verbose > 1)
33928c2ecf20Sopenharmony_ci		help();
33938c2ecf20Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++)
33948c2ecf20Sopenharmony_ci		unicode_data[unichar].code = unichar;
33958c2ecf20Sopenharmony_ci	age_init();
33968c2ecf20Sopenharmony_ci	ccc_init();
33978c2ecf20Sopenharmony_ci	nfdi_init();
33988c2ecf20Sopenharmony_ci	nfdicf_init();
33998c2ecf20Sopenharmony_ci	ignore_init();
34008c2ecf20Sopenharmony_ci	corrections_init();
34018c2ecf20Sopenharmony_ci	hangul_decompose();
34028c2ecf20Sopenharmony_ci	nfdi_decompose();
34038c2ecf20Sopenharmony_ci	nfdicf_decompose();
34048c2ecf20Sopenharmony_ci	utf8_init();
34058c2ecf20Sopenharmony_ci	trees_init();
34068c2ecf20Sopenharmony_ci	trees_populate();
34078c2ecf20Sopenharmony_ci	trees_reduce();
34088c2ecf20Sopenharmony_ci	trees_verify();
34098c2ecf20Sopenharmony_ci	/* Prevent "unused function" warning. */
34108c2ecf20Sopenharmony_ci	(void)lookup(nfdi_tree, " ");
34118c2ecf20Sopenharmony_ci	if (verbose > 2)
34128c2ecf20Sopenharmony_ci		tree_walk(nfdi_tree);
34138c2ecf20Sopenharmony_ci	if (verbose > 2)
34148c2ecf20Sopenharmony_ci		tree_walk(nfdicf_tree);
34158c2ecf20Sopenharmony_ci	normalization_test();
34168c2ecf20Sopenharmony_ci	write_file();
34178c2ecf20Sopenharmony_ci
34188c2ecf20Sopenharmony_ci	return 0;
34198c2ecf20Sopenharmony_ci}
3420