162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * Copyright (c) 2014 SGI.
362306a36Sopenharmony_ci * All rights reserved.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * This program is free software; you can redistribute it and/or
662306a36Sopenharmony_ci * modify it under the terms of the GNU General Public License as
762306a36Sopenharmony_ci * published by the Free Software Foundation.
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci * This program is distributed in the hope that it would be useful,
1062306a36Sopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
1162306a36Sopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1262306a36Sopenharmony_ci * GNU General Public License for more details.
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci * You should have received a copy of the GNU General Public License
1562306a36Sopenharmony_ci * along with this program; if not, write the Free Software Foundation,
1662306a36Sopenharmony_ci * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
1762306a36Sopenharmony_ci */
1862306a36Sopenharmony_ci
1962306a36Sopenharmony_ci/* Generator for a compact trie for unicode normalization */
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci#include <sys/types.h>
2262306a36Sopenharmony_ci#include <stddef.h>
2362306a36Sopenharmony_ci#include <stdlib.h>
2462306a36Sopenharmony_ci#include <stdio.h>
2562306a36Sopenharmony_ci#include <assert.h>
2662306a36Sopenharmony_ci#include <string.h>
2762306a36Sopenharmony_ci#include <unistd.h>
2862306a36Sopenharmony_ci#include <errno.h>
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_ci/* Default names of the in- and output files. */
3162306a36Sopenharmony_ci
3262306a36Sopenharmony_ci#define AGE_NAME	"DerivedAge.txt"
3362306a36Sopenharmony_ci#define CCC_NAME	"DerivedCombiningClass.txt"
3462306a36Sopenharmony_ci#define PROP_NAME	"DerivedCoreProperties.txt"
3562306a36Sopenharmony_ci#define DATA_NAME	"UnicodeData.txt"
3662306a36Sopenharmony_ci#define FOLD_NAME	"CaseFolding.txt"
3762306a36Sopenharmony_ci#define NORM_NAME	"NormalizationCorrections.txt"
3862306a36Sopenharmony_ci#define TEST_NAME	"NormalizationTest.txt"
3962306a36Sopenharmony_ci#define UTF8_NAME	"utf8data.h"
4062306a36Sopenharmony_ci
4162306a36Sopenharmony_ciconst char	*age_name  = AGE_NAME;
4262306a36Sopenharmony_ciconst char	*ccc_name  = CCC_NAME;
4362306a36Sopenharmony_ciconst char	*prop_name = PROP_NAME;
4462306a36Sopenharmony_ciconst char	*data_name = DATA_NAME;
4562306a36Sopenharmony_ciconst char	*fold_name = FOLD_NAME;
4662306a36Sopenharmony_ciconst char	*norm_name = NORM_NAME;
4762306a36Sopenharmony_ciconst char	*test_name = TEST_NAME;
4862306a36Sopenharmony_ciconst char	*utf8_name = UTF8_NAME;
4962306a36Sopenharmony_ci
5062306a36Sopenharmony_ciint verbose = 0;
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_ci/* An arbitrary line size limit on input lines. */
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_ci#define LINESIZE	1024
5562306a36Sopenharmony_cichar line[LINESIZE];
5662306a36Sopenharmony_cichar buf0[LINESIZE];
5762306a36Sopenharmony_cichar buf1[LINESIZE];
5862306a36Sopenharmony_cichar buf2[LINESIZE];
5962306a36Sopenharmony_cichar buf3[LINESIZE];
6062306a36Sopenharmony_ci
6162306a36Sopenharmony_ciconst char *argv0;
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
6462306a36Sopenharmony_ci
6562306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ci/*
6862306a36Sopenharmony_ci * Unicode version numbers consist of three parts: major, minor, and a
6962306a36Sopenharmony_ci * revision.  These numbers are packed into an unsigned int to obtain
7062306a36Sopenharmony_ci * a single version number.
7162306a36Sopenharmony_ci *
7262306a36Sopenharmony_ci * To save space in the generated trie, the unicode version is not
7362306a36Sopenharmony_ci * stored directly, instead we calculate a generation number from the
7462306a36Sopenharmony_ci * unicode versions seen in the DerivedAge file, and use that as an
7562306a36Sopenharmony_ci * index into a table of unicode versions.
7662306a36Sopenharmony_ci */
7762306a36Sopenharmony_ci#define UNICODE_MAJ_SHIFT		(16)
7862306a36Sopenharmony_ci#define UNICODE_MIN_SHIFT		(8)
7962306a36Sopenharmony_ci
8062306a36Sopenharmony_ci#define UNICODE_MAJ_MAX			((unsigned short)-1)
8162306a36Sopenharmony_ci#define UNICODE_MIN_MAX			((unsigned char)-1)
8262306a36Sopenharmony_ci#define UNICODE_REV_MAX			((unsigned char)-1)
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci#define UNICODE_AGE(MAJ,MIN,REV)			\
8562306a36Sopenharmony_ci	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
8662306a36Sopenharmony_ci	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
8762306a36Sopenharmony_ci	 ((unsigned int)(REV)))
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ciunsigned int *ages;
9062306a36Sopenharmony_ciint ages_count;
9162306a36Sopenharmony_ci
9262306a36Sopenharmony_ciunsigned int unicode_maxage;
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_cistatic int age_valid(unsigned int major, unsigned int minor,
9562306a36Sopenharmony_ci		     unsigned int revision)
9662306a36Sopenharmony_ci{
9762306a36Sopenharmony_ci	if (major > UNICODE_MAJ_MAX)
9862306a36Sopenharmony_ci		return 0;
9962306a36Sopenharmony_ci	if (minor > UNICODE_MIN_MAX)
10062306a36Sopenharmony_ci		return 0;
10162306a36Sopenharmony_ci	if (revision > UNICODE_REV_MAX)
10262306a36Sopenharmony_ci		return 0;
10362306a36Sopenharmony_ci	return 1;
10462306a36Sopenharmony_ci}
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
10762306a36Sopenharmony_ci
10862306a36Sopenharmony_ci/*
10962306a36Sopenharmony_ci * utf8trie_t
11062306a36Sopenharmony_ci *
11162306a36Sopenharmony_ci * A compact binary tree, used to decode UTF-8 characters.
11262306a36Sopenharmony_ci *
11362306a36Sopenharmony_ci * Internal nodes are one byte for the node itself, and up to three
11462306a36Sopenharmony_ci * bytes for an offset into the tree.  The first byte contains the
11562306a36Sopenharmony_ci * following information:
11662306a36Sopenharmony_ci *  NEXTBYTE  - flag        - advance to next byte if set
11762306a36Sopenharmony_ci *  BITNUM    - 3 bit field - the bit number to tested
11862306a36Sopenharmony_ci *  OFFLEN    - 2 bit field - number of bytes in the offset
11962306a36Sopenharmony_ci * if offlen == 0 (non-branching node)
12062306a36Sopenharmony_ci *  RIGHTPATH - 1 bit field - set if the following node is for the
12162306a36Sopenharmony_ci *                            right-hand path (tested bit is set)
12262306a36Sopenharmony_ci *  TRIENODE  - 1 bit field - set if the following node is an internal
12362306a36Sopenharmony_ci *                            node, otherwise it is a leaf node
12462306a36Sopenharmony_ci * if offlen != 0 (branching node)
12562306a36Sopenharmony_ci *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
12662306a36Sopenharmony_ci *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
12762306a36Sopenharmony_ci *
12862306a36Sopenharmony_ci * Due to the way utf8 works, there cannot be branching nodes with
12962306a36Sopenharmony_ci * NEXTBYTE set, and moreover those nodes always have a righthand
13062306a36Sopenharmony_ci * descendant.
13162306a36Sopenharmony_ci */
13262306a36Sopenharmony_citypedef unsigned char utf8trie_t;
13362306a36Sopenharmony_ci#define BITNUM		0x07
13462306a36Sopenharmony_ci#define NEXTBYTE	0x08
13562306a36Sopenharmony_ci#define OFFLEN		0x30
13662306a36Sopenharmony_ci#define OFFLEN_SHIFT	4
13762306a36Sopenharmony_ci#define RIGHTPATH	0x40
13862306a36Sopenharmony_ci#define TRIENODE	0x80
13962306a36Sopenharmony_ci#define RIGHTNODE	0x40
14062306a36Sopenharmony_ci#define LEFTNODE	0x80
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci/*
14362306a36Sopenharmony_ci * utf8leaf_t
14462306a36Sopenharmony_ci *
14562306a36Sopenharmony_ci * The leaves of the trie are embedded in the trie, and so the same
14662306a36Sopenharmony_ci * underlying datatype, unsigned char.
14762306a36Sopenharmony_ci *
14862306a36Sopenharmony_ci * leaf[0]: The unicode version, stored as a generation number that is
14962306a36Sopenharmony_ci *          an index into utf8agetab[].  With this we can filter code
15062306a36Sopenharmony_ci *          points based on the unicode version in which they were
15162306a36Sopenharmony_ci *          defined.  The CCC of a non-defined code point is 0.
15262306a36Sopenharmony_ci * leaf[1]: Canonical Combining Class. During normalization, we need
15362306a36Sopenharmony_ci *          to do a stable sort into ascending order of all characters
15462306a36Sopenharmony_ci *          with a non-zero CCC that occur between two characters with
15562306a36Sopenharmony_ci *          a CCC of 0, or at the begin or end of a string.
15662306a36Sopenharmony_ci *          The unicode standard guarantees that all CCC values are
15762306a36Sopenharmony_ci *          between 0 and 254 inclusive, which leaves 255 available as
15862306a36Sopenharmony_ci *          a special value.
15962306a36Sopenharmony_ci *          Code points with CCC 0 are known as stoppers.
16062306a36Sopenharmony_ci * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
16162306a36Sopenharmony_ci *          start of a NUL-terminated string that is the decomposition
16262306a36Sopenharmony_ci *          of the character.
16362306a36Sopenharmony_ci *          The CCC of a decomposable character is the same as the CCC
16462306a36Sopenharmony_ci *          of the first character of its decomposition.
16562306a36Sopenharmony_ci *          Some characters decompose as the empty string: these are
16662306a36Sopenharmony_ci *          characters with the Default_Ignorable_Code_Point property.
16762306a36Sopenharmony_ci *          These do affect normalization, as they all have CCC 0.
16862306a36Sopenharmony_ci *
16962306a36Sopenharmony_ci * The decompositions in the trie have been fully expanded.
17062306a36Sopenharmony_ci *
17162306a36Sopenharmony_ci * Casefolding, if applicable, is also done using decompositions.
17262306a36Sopenharmony_ci */
17362306a36Sopenharmony_citypedef unsigned char utf8leaf_t;
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci#define LEAF_GEN(LEAF)	((LEAF)[0])
17662306a36Sopenharmony_ci#define LEAF_CCC(LEAF)	((LEAF)[1])
17762306a36Sopenharmony_ci#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_ci#define MAXGEN		(255)
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci#define MINCCC		(0)
18262306a36Sopenharmony_ci#define MAXCCC		(254)
18362306a36Sopenharmony_ci#define STOPPER		(0)
18462306a36Sopenharmony_ci#define DECOMPOSE	(255)
18562306a36Sopenharmony_ci#define HANGUL		((char)(255))
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ci#define UTF8HANGULLEAF	(12)
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_cistruct tree;
19062306a36Sopenharmony_cistatic utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
19162306a36Sopenharmony_ci			       const char *, size_t);
19262306a36Sopenharmony_cistatic utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ciunsigned char *utf8data;
19562306a36Sopenharmony_cisize_t utf8data_size;
19662306a36Sopenharmony_ci
19762306a36Sopenharmony_ciutf8trie_t *nfdi;
19862306a36Sopenharmony_ciutf8trie_t *nfdicf;
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
20162306a36Sopenharmony_ci
20262306a36Sopenharmony_ci/*
20362306a36Sopenharmony_ci * UTF8 valid ranges.
20462306a36Sopenharmony_ci *
20562306a36Sopenharmony_ci * The UTF-8 encoding spreads the bits of a 32bit word over several
20662306a36Sopenharmony_ci * bytes. This table gives the ranges that can be held and how they'd
20762306a36Sopenharmony_ci * be represented.
20862306a36Sopenharmony_ci *
20962306a36Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx
21062306a36Sopenharmony_ci * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
21162306a36Sopenharmony_ci * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
21262306a36Sopenharmony_ci * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
21362306a36Sopenharmony_ci * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
21462306a36Sopenharmony_ci * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
21562306a36Sopenharmony_ci *
21662306a36Sopenharmony_ci * There is an additional requirement on UTF-8, in that only the
21762306a36Sopenharmony_ci * shortest representation of a 32bit value is to be used.  A decoder
21862306a36Sopenharmony_ci * must not decode sequences that do not satisfy this requirement.
21962306a36Sopenharmony_ci * Thus the allowed ranges have a lower bound.
22062306a36Sopenharmony_ci *
22162306a36Sopenharmony_ci * 0x00000000 0x0000007F: 0xxxxxxx
22262306a36Sopenharmony_ci * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
22362306a36Sopenharmony_ci * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
22462306a36Sopenharmony_ci * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
22562306a36Sopenharmony_ci * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
22662306a36Sopenharmony_ci * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
22762306a36Sopenharmony_ci *
22862306a36Sopenharmony_ci * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
22962306a36Sopenharmony_ci * 17 planes of 65536 values.  This limits the sequences actually seen
23062306a36Sopenharmony_ci * even more, to just the following.
23162306a36Sopenharmony_ci *
23262306a36Sopenharmony_ci *          0 -     0x7f: 0                     0x7f
23362306a36Sopenharmony_ci *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
23462306a36Sopenharmony_ci *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
23562306a36Sopenharmony_ci *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
23662306a36Sopenharmony_ci *
23762306a36Sopenharmony_ci * Even within those ranges not all values are allowed: the surrogates
23862306a36Sopenharmony_ci * 0xd800 - 0xdfff should never be seen.
23962306a36Sopenharmony_ci *
24062306a36Sopenharmony_ci * Note that the longest sequence seen with valid usage is 4 bytes,
24162306a36Sopenharmony_ci * the same a single UTF-32 character.  This makes the UTF-8
24262306a36Sopenharmony_ci * representation of Unicode strictly smaller than UTF-32.
24362306a36Sopenharmony_ci *
24462306a36Sopenharmony_ci * The shortest sequence requirement was introduced by:
24562306a36Sopenharmony_ci *    Corrigendum #1: UTF-8 Shortest Form
24662306a36Sopenharmony_ci * It can be found here:
24762306a36Sopenharmony_ci *    http://www.unicode.org/versions/corrigendum1.html
24862306a36Sopenharmony_ci *
24962306a36Sopenharmony_ci */
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci#define UTF8_2_BITS     0xC0
25262306a36Sopenharmony_ci#define UTF8_3_BITS     0xE0
25362306a36Sopenharmony_ci#define UTF8_4_BITS     0xF0
25462306a36Sopenharmony_ci#define UTF8_N_BITS     0x80
25562306a36Sopenharmony_ci#define UTF8_2_MASK     0xE0
25662306a36Sopenharmony_ci#define UTF8_3_MASK     0xF0
25762306a36Sopenharmony_ci#define UTF8_4_MASK     0xF8
25862306a36Sopenharmony_ci#define UTF8_N_MASK     0xC0
25962306a36Sopenharmony_ci#define UTF8_V_MASK     0x3F
26062306a36Sopenharmony_ci#define UTF8_V_SHIFT    6
26162306a36Sopenharmony_ci
26262306a36Sopenharmony_cistatic int utf8encode(char *str, unsigned int val)
26362306a36Sopenharmony_ci{
26462306a36Sopenharmony_ci	int len;
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_ci	if (val < 0x80) {
26762306a36Sopenharmony_ci		str[0] = val;
26862306a36Sopenharmony_ci		len = 1;
26962306a36Sopenharmony_ci	} else if (val < 0x800) {
27062306a36Sopenharmony_ci		str[1] = val & UTF8_V_MASK;
27162306a36Sopenharmony_ci		str[1] |= UTF8_N_BITS;
27262306a36Sopenharmony_ci		val >>= UTF8_V_SHIFT;
27362306a36Sopenharmony_ci		str[0] = val;
27462306a36Sopenharmony_ci		str[0] |= UTF8_2_BITS;
27562306a36Sopenharmony_ci		len = 2;
27662306a36Sopenharmony_ci	} else if (val < 0x10000) {
27762306a36Sopenharmony_ci		str[2] = val & UTF8_V_MASK;
27862306a36Sopenharmony_ci		str[2] |= UTF8_N_BITS;
27962306a36Sopenharmony_ci		val >>= UTF8_V_SHIFT;
28062306a36Sopenharmony_ci		str[1] = val & UTF8_V_MASK;
28162306a36Sopenharmony_ci		str[1] |= UTF8_N_BITS;
28262306a36Sopenharmony_ci		val >>= UTF8_V_SHIFT;
28362306a36Sopenharmony_ci		str[0] = val;
28462306a36Sopenharmony_ci		str[0] |= UTF8_3_BITS;
28562306a36Sopenharmony_ci		len = 3;
28662306a36Sopenharmony_ci	} else if (val < 0x110000) {
28762306a36Sopenharmony_ci		str[3] = val & UTF8_V_MASK;
28862306a36Sopenharmony_ci		str[3] |= UTF8_N_BITS;
28962306a36Sopenharmony_ci		val >>= UTF8_V_SHIFT;
29062306a36Sopenharmony_ci		str[2] = val & UTF8_V_MASK;
29162306a36Sopenharmony_ci		str[2] |= UTF8_N_BITS;
29262306a36Sopenharmony_ci		val >>= UTF8_V_SHIFT;
29362306a36Sopenharmony_ci		str[1] = val & UTF8_V_MASK;
29462306a36Sopenharmony_ci		str[1] |= UTF8_N_BITS;
29562306a36Sopenharmony_ci		val >>= UTF8_V_SHIFT;
29662306a36Sopenharmony_ci		str[0] = val;
29762306a36Sopenharmony_ci		str[0] |= UTF8_4_BITS;
29862306a36Sopenharmony_ci		len = 4;
29962306a36Sopenharmony_ci	} else {
30062306a36Sopenharmony_ci		printf("%#x: illegal val\n", val);
30162306a36Sopenharmony_ci		len = 0;
30262306a36Sopenharmony_ci	}
30362306a36Sopenharmony_ci	return len;
30462306a36Sopenharmony_ci}
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_cistatic unsigned int utf8decode(const char *str)
30762306a36Sopenharmony_ci{
30862306a36Sopenharmony_ci	const unsigned char *s = (const unsigned char*)str;
30962306a36Sopenharmony_ci	unsigned int unichar = 0;
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci	if (*s < 0x80) {
31262306a36Sopenharmony_ci		unichar = *s;
31362306a36Sopenharmony_ci	} else if (*s < UTF8_3_BITS) {
31462306a36Sopenharmony_ci		unichar = *s++ & 0x1F;
31562306a36Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
31662306a36Sopenharmony_ci		unichar |= *s & 0x3F;
31762306a36Sopenharmony_ci	} else if (*s < UTF8_4_BITS) {
31862306a36Sopenharmony_ci		unichar = *s++ & 0x0F;
31962306a36Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
32062306a36Sopenharmony_ci		unichar |= *s++ & 0x3F;
32162306a36Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
32262306a36Sopenharmony_ci		unichar |= *s & 0x3F;
32362306a36Sopenharmony_ci	} else {
32462306a36Sopenharmony_ci		unichar = *s++ & 0x0F;
32562306a36Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
32662306a36Sopenharmony_ci		unichar |= *s++ & 0x3F;
32762306a36Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
32862306a36Sopenharmony_ci		unichar |= *s++ & 0x3F;
32962306a36Sopenharmony_ci		unichar <<= UTF8_V_SHIFT;
33062306a36Sopenharmony_ci		unichar |= *s & 0x3F;
33162306a36Sopenharmony_ci	}
33262306a36Sopenharmony_ci	return unichar;
33362306a36Sopenharmony_ci}
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_cistatic int utf32valid(unsigned int unichar)
33662306a36Sopenharmony_ci{
33762306a36Sopenharmony_ci	return unichar < 0x110000;
33862306a36Sopenharmony_ci}
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_ci#define NODE 1
34362306a36Sopenharmony_ci#define LEAF 0
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_cistruct tree {
34662306a36Sopenharmony_ci	void *root;
34762306a36Sopenharmony_ci	int childnode;
34862306a36Sopenharmony_ci	const char *type;
34962306a36Sopenharmony_ci	unsigned int maxage;
35062306a36Sopenharmony_ci	struct tree *next;
35162306a36Sopenharmony_ci	int (*leaf_equal)(void *, void *);
35262306a36Sopenharmony_ci	void (*leaf_print)(void *, int);
35362306a36Sopenharmony_ci	int (*leaf_mark)(void *);
35462306a36Sopenharmony_ci	int (*leaf_size)(void *);
35562306a36Sopenharmony_ci	int *(*leaf_index)(struct tree *, void *);
35662306a36Sopenharmony_ci	unsigned char *(*leaf_emit)(void *, unsigned char *);
35762306a36Sopenharmony_ci	int leafindex[0x110000];
35862306a36Sopenharmony_ci	int index;
35962306a36Sopenharmony_ci};
36062306a36Sopenharmony_ci
36162306a36Sopenharmony_cistruct node {
36262306a36Sopenharmony_ci	int index;
36362306a36Sopenharmony_ci	int offset;
36462306a36Sopenharmony_ci	int mark;
36562306a36Sopenharmony_ci	int size;
36662306a36Sopenharmony_ci	struct node *parent;
36762306a36Sopenharmony_ci	void *left;
36862306a36Sopenharmony_ci	void *right;
36962306a36Sopenharmony_ci	unsigned char bitnum;
37062306a36Sopenharmony_ci	unsigned char nextbyte;
37162306a36Sopenharmony_ci	unsigned char leftnode;
37262306a36Sopenharmony_ci	unsigned char rightnode;
37362306a36Sopenharmony_ci	unsigned int keybits;
37462306a36Sopenharmony_ci	unsigned int keymask;
37562306a36Sopenharmony_ci};
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci/*
37862306a36Sopenharmony_ci * Example lookup function for a tree.
37962306a36Sopenharmony_ci */
38062306a36Sopenharmony_cistatic void *lookup(struct tree *tree, const char *key)
38162306a36Sopenharmony_ci{
38262306a36Sopenharmony_ci	struct node *node;
38362306a36Sopenharmony_ci	void *leaf = NULL;
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci	node = tree->root;
38662306a36Sopenharmony_ci	while (!leaf && node) {
38762306a36Sopenharmony_ci		if (node->nextbyte)
38862306a36Sopenharmony_ci			key++;
38962306a36Sopenharmony_ci		if (*key & (1 << (node->bitnum & 7))) {
39062306a36Sopenharmony_ci			/* Right leg */
39162306a36Sopenharmony_ci			if (node->rightnode == NODE) {
39262306a36Sopenharmony_ci				node = node->right;
39362306a36Sopenharmony_ci			} else if (node->rightnode == LEAF) {
39462306a36Sopenharmony_ci				leaf = node->right;
39562306a36Sopenharmony_ci			} else {
39662306a36Sopenharmony_ci				node = NULL;
39762306a36Sopenharmony_ci			}
39862306a36Sopenharmony_ci		} else {
39962306a36Sopenharmony_ci			/* Left leg */
40062306a36Sopenharmony_ci			if (node->leftnode == NODE) {
40162306a36Sopenharmony_ci				node = node->left;
40262306a36Sopenharmony_ci			} else if (node->leftnode == LEAF) {
40362306a36Sopenharmony_ci				leaf = node->left;
40462306a36Sopenharmony_ci			} else {
40562306a36Sopenharmony_ci				node = NULL;
40662306a36Sopenharmony_ci			}
40762306a36Sopenharmony_ci		}
40862306a36Sopenharmony_ci	}
40962306a36Sopenharmony_ci
41062306a36Sopenharmony_ci	return leaf;
41162306a36Sopenharmony_ci}
41262306a36Sopenharmony_ci
41362306a36Sopenharmony_ci/*
41462306a36Sopenharmony_ci * A simple non-recursive tree walker: keep track of visits to the
41562306a36Sopenharmony_ci * left and right branches in the leftmask and rightmask.
41662306a36Sopenharmony_ci */
41762306a36Sopenharmony_cistatic void tree_walk(struct tree *tree)
41862306a36Sopenharmony_ci{
41962306a36Sopenharmony_ci	struct node *node;
42062306a36Sopenharmony_ci	unsigned int leftmask;
42162306a36Sopenharmony_ci	unsigned int rightmask;
42262306a36Sopenharmony_ci	unsigned int bitmask;
42362306a36Sopenharmony_ci	int indent = 1;
42462306a36Sopenharmony_ci	int nodes, singletons, leaves;
42562306a36Sopenharmony_ci
42662306a36Sopenharmony_ci	nodes = singletons = leaves = 0;
42762306a36Sopenharmony_ci
42862306a36Sopenharmony_ci	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
42962306a36Sopenharmony_ci	if (tree->childnode == LEAF) {
43062306a36Sopenharmony_ci		assert(tree->root);
43162306a36Sopenharmony_ci		tree->leaf_print(tree->root, indent);
43262306a36Sopenharmony_ci		leaves = 1;
43362306a36Sopenharmony_ci	} else {
43462306a36Sopenharmony_ci		assert(tree->childnode == NODE);
43562306a36Sopenharmony_ci		node = tree->root;
43662306a36Sopenharmony_ci		leftmask = rightmask = 0;
43762306a36Sopenharmony_ci		while (node) {
43862306a36Sopenharmony_ci			printf("%*snode @ %p bitnum %d nextbyte %d"
43962306a36Sopenharmony_ci			       " left %p right %p mask %x bits %x\n",
44062306a36Sopenharmony_ci				indent, "", node,
44162306a36Sopenharmony_ci				node->bitnum, node->nextbyte,
44262306a36Sopenharmony_ci				node->left, node->right,
44362306a36Sopenharmony_ci				node->keymask, node->keybits);
44462306a36Sopenharmony_ci			nodes += 1;
44562306a36Sopenharmony_ci			if (!(node->left && node->right))
44662306a36Sopenharmony_ci				singletons += 1;
44762306a36Sopenharmony_ci
44862306a36Sopenharmony_ci			while (node) {
44962306a36Sopenharmony_ci				bitmask = 1 << node->bitnum;
45062306a36Sopenharmony_ci				if ((leftmask & bitmask) == 0) {
45162306a36Sopenharmony_ci					leftmask |= bitmask;
45262306a36Sopenharmony_ci					if (node->leftnode == LEAF) {
45362306a36Sopenharmony_ci						assert(node->left);
45462306a36Sopenharmony_ci						tree->leaf_print(node->left,
45562306a36Sopenharmony_ci								 indent+1);
45662306a36Sopenharmony_ci						leaves += 1;
45762306a36Sopenharmony_ci					} else if (node->left) {
45862306a36Sopenharmony_ci						assert(node->leftnode == NODE);
45962306a36Sopenharmony_ci						indent += 1;
46062306a36Sopenharmony_ci						node = node->left;
46162306a36Sopenharmony_ci						break;
46262306a36Sopenharmony_ci					}
46362306a36Sopenharmony_ci				}
46462306a36Sopenharmony_ci				if ((rightmask & bitmask) == 0) {
46562306a36Sopenharmony_ci					rightmask |= bitmask;
46662306a36Sopenharmony_ci					if (node->rightnode == LEAF) {
46762306a36Sopenharmony_ci						assert(node->right);
46862306a36Sopenharmony_ci						tree->leaf_print(node->right,
46962306a36Sopenharmony_ci								 indent+1);
47062306a36Sopenharmony_ci						leaves += 1;
47162306a36Sopenharmony_ci					} else if (node->right) {
47262306a36Sopenharmony_ci						assert(node->rightnode == NODE);
47362306a36Sopenharmony_ci						indent += 1;
47462306a36Sopenharmony_ci						node = node->right;
47562306a36Sopenharmony_ci						break;
47662306a36Sopenharmony_ci					}
47762306a36Sopenharmony_ci				}
47862306a36Sopenharmony_ci				leftmask &= ~bitmask;
47962306a36Sopenharmony_ci				rightmask &= ~bitmask;
48062306a36Sopenharmony_ci				node = node->parent;
48162306a36Sopenharmony_ci				indent -= 1;
48262306a36Sopenharmony_ci			}
48362306a36Sopenharmony_ci		}
48462306a36Sopenharmony_ci	}
48562306a36Sopenharmony_ci	printf("nodes %d leaves %d singletons %d\n",
48662306a36Sopenharmony_ci	       nodes, leaves, singletons);
48762306a36Sopenharmony_ci}
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci/*
49062306a36Sopenharmony_ci * Allocate an initialize a new internal node.
49162306a36Sopenharmony_ci */
49262306a36Sopenharmony_cistatic struct node *alloc_node(struct node *parent)
49362306a36Sopenharmony_ci{
49462306a36Sopenharmony_ci	struct node *node;
49562306a36Sopenharmony_ci	int bitnum;
49662306a36Sopenharmony_ci
49762306a36Sopenharmony_ci	node = malloc(sizeof(*node));
49862306a36Sopenharmony_ci	node->left = node->right = NULL;
49962306a36Sopenharmony_ci	node->parent = parent;
50062306a36Sopenharmony_ci	node->leftnode = NODE;
50162306a36Sopenharmony_ci	node->rightnode = NODE;
50262306a36Sopenharmony_ci	node->keybits = 0;
50362306a36Sopenharmony_ci	node->keymask = 0;
50462306a36Sopenharmony_ci	node->mark = 0;
50562306a36Sopenharmony_ci	node->index = 0;
50662306a36Sopenharmony_ci	node->offset = -1;
50762306a36Sopenharmony_ci	node->size = 4;
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci	if (node->parent) {
51062306a36Sopenharmony_ci		bitnum = parent->bitnum;
51162306a36Sopenharmony_ci		if ((bitnum & 7) == 0) {
51262306a36Sopenharmony_ci			node->bitnum = bitnum + 7 + 8;
51362306a36Sopenharmony_ci			node->nextbyte = 1;
51462306a36Sopenharmony_ci		} else {
51562306a36Sopenharmony_ci			node->bitnum = bitnum - 1;
51662306a36Sopenharmony_ci			node->nextbyte = 0;
51762306a36Sopenharmony_ci		}
51862306a36Sopenharmony_ci	} else {
51962306a36Sopenharmony_ci		node->bitnum = 7;
52062306a36Sopenharmony_ci		node->nextbyte = 0;
52162306a36Sopenharmony_ci	}
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_ci	return node;
52462306a36Sopenharmony_ci}
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_ci/*
52762306a36Sopenharmony_ci * Insert a new leaf into the tree, and collapse any subtrees that are
52862306a36Sopenharmony_ci * fully populated and end in identical leaves. A nextbyte tagged
52962306a36Sopenharmony_ci * internal node will not be removed to preserve the tree's integrity.
53062306a36Sopenharmony_ci * Note that due to the structure of utf8, no nextbyte tagged node
53162306a36Sopenharmony_ci * will be a candidate for removal.
53262306a36Sopenharmony_ci */
53362306a36Sopenharmony_cistatic int insert(struct tree *tree, char *key, int keylen, void *leaf)
53462306a36Sopenharmony_ci{
53562306a36Sopenharmony_ci	struct node *node;
53662306a36Sopenharmony_ci	struct node *parent;
53762306a36Sopenharmony_ci	void **cursor;
53862306a36Sopenharmony_ci	int keybits;
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_ci	assert(keylen >= 1 && keylen <= 4);
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci	node = NULL;
54362306a36Sopenharmony_ci	cursor = &tree->root;
54462306a36Sopenharmony_ci	keybits = 8 * keylen;
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci	/* Insert, creating path along the way. */
54762306a36Sopenharmony_ci	while (keybits) {
54862306a36Sopenharmony_ci		if (!*cursor)
54962306a36Sopenharmony_ci			*cursor = alloc_node(node);
55062306a36Sopenharmony_ci		node = *cursor;
55162306a36Sopenharmony_ci		if (node->nextbyte)
55262306a36Sopenharmony_ci			key++;
55362306a36Sopenharmony_ci		if (*key & (1 << (node->bitnum & 7)))
55462306a36Sopenharmony_ci			cursor = &node->right;
55562306a36Sopenharmony_ci		else
55662306a36Sopenharmony_ci			cursor = &node->left;
55762306a36Sopenharmony_ci		keybits--;
55862306a36Sopenharmony_ci	}
55962306a36Sopenharmony_ci	*cursor = leaf;
56062306a36Sopenharmony_ci
56162306a36Sopenharmony_ci	/* Merge subtrees if possible. */
56262306a36Sopenharmony_ci	while (node) {
56362306a36Sopenharmony_ci		if (*key & (1 << (node->bitnum & 7)))
56462306a36Sopenharmony_ci			node->rightnode = LEAF;
56562306a36Sopenharmony_ci		else
56662306a36Sopenharmony_ci			node->leftnode = LEAF;
56762306a36Sopenharmony_ci		if (node->nextbyte)
56862306a36Sopenharmony_ci			break;
56962306a36Sopenharmony_ci		if (node->leftnode == NODE || node->rightnode == NODE)
57062306a36Sopenharmony_ci			break;
57162306a36Sopenharmony_ci		assert(node->left);
57262306a36Sopenharmony_ci		assert(node->right);
57362306a36Sopenharmony_ci		/* Compare */
57462306a36Sopenharmony_ci		if (! tree->leaf_equal(node->left, node->right))
57562306a36Sopenharmony_ci			break;
57662306a36Sopenharmony_ci		/* Keep left, drop right leaf. */
57762306a36Sopenharmony_ci		leaf = node->left;
57862306a36Sopenharmony_ci		/* Check in parent */
57962306a36Sopenharmony_ci		parent = node->parent;
58062306a36Sopenharmony_ci		if (!parent) {
58162306a36Sopenharmony_ci			/* root of tree! */
58262306a36Sopenharmony_ci			tree->root = leaf;
58362306a36Sopenharmony_ci			tree->childnode = LEAF;
58462306a36Sopenharmony_ci		} else if (parent->left == node) {
58562306a36Sopenharmony_ci			parent->left = leaf;
58662306a36Sopenharmony_ci			parent->leftnode = LEAF;
58762306a36Sopenharmony_ci			if (parent->right) {
58862306a36Sopenharmony_ci				parent->keymask = 0;
58962306a36Sopenharmony_ci				parent->keybits = 0;
59062306a36Sopenharmony_ci			} else {
59162306a36Sopenharmony_ci				parent->keymask |= (1 << node->bitnum);
59262306a36Sopenharmony_ci			}
59362306a36Sopenharmony_ci		} else if (parent->right == node) {
59462306a36Sopenharmony_ci			parent->right = leaf;
59562306a36Sopenharmony_ci			parent->rightnode = LEAF;
59662306a36Sopenharmony_ci			if (parent->left) {
59762306a36Sopenharmony_ci				parent->keymask = 0;
59862306a36Sopenharmony_ci				parent->keybits = 0;
59962306a36Sopenharmony_ci			} else {
60062306a36Sopenharmony_ci				parent->keymask |= (1 << node->bitnum);
60162306a36Sopenharmony_ci				parent->keybits |= (1 << node->bitnum);
60262306a36Sopenharmony_ci			}
60362306a36Sopenharmony_ci		} else {
60462306a36Sopenharmony_ci			/* internal tree error */
60562306a36Sopenharmony_ci			assert(0);
60662306a36Sopenharmony_ci		}
60762306a36Sopenharmony_ci		free(node);
60862306a36Sopenharmony_ci		node = parent;
60962306a36Sopenharmony_ci	}
61062306a36Sopenharmony_ci
61162306a36Sopenharmony_ci	/* Propagate keymasks up along singleton chains. */
61262306a36Sopenharmony_ci	while (node) {
61362306a36Sopenharmony_ci		parent = node->parent;
61462306a36Sopenharmony_ci		if (!parent)
61562306a36Sopenharmony_ci			break;
61662306a36Sopenharmony_ci		/* Nix the mask for parents with two children. */
61762306a36Sopenharmony_ci		if (node->keymask == 0) {
61862306a36Sopenharmony_ci			parent->keymask = 0;
61962306a36Sopenharmony_ci			parent->keybits = 0;
62062306a36Sopenharmony_ci		} else if (parent->left && parent->right) {
62162306a36Sopenharmony_ci			parent->keymask = 0;
62262306a36Sopenharmony_ci			parent->keybits = 0;
62362306a36Sopenharmony_ci		} else {
62462306a36Sopenharmony_ci			assert((parent->keymask & node->keymask) == 0);
62562306a36Sopenharmony_ci			parent->keymask |= node->keymask;
62662306a36Sopenharmony_ci			parent->keymask |= (1 << parent->bitnum);
62762306a36Sopenharmony_ci			parent->keybits |= node->keybits;
62862306a36Sopenharmony_ci			if (parent->right)
62962306a36Sopenharmony_ci				parent->keybits |= (1 << parent->bitnum);
63062306a36Sopenharmony_ci		}
63162306a36Sopenharmony_ci		node = parent;
63262306a36Sopenharmony_ci	}
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	return 0;
63562306a36Sopenharmony_ci}
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci/*
63862306a36Sopenharmony_ci * Prune internal nodes.
63962306a36Sopenharmony_ci *
64062306a36Sopenharmony_ci * Fully populated subtrees that end at the same leaf have already
64162306a36Sopenharmony_ci * been collapsed.  There are still internal nodes that have for both
64262306a36Sopenharmony_ci * their left and right branches a sequence of singletons that make
64362306a36Sopenharmony_ci * identical choices and end in identical leaves.  The keymask and
64462306a36Sopenharmony_ci * keybits collected in the nodes describe the choices made in these
64562306a36Sopenharmony_ci * singleton chains.  When they are identical for the left and right
64662306a36Sopenharmony_ci * branch of a node, and the two leaves comare identical, the node in
64762306a36Sopenharmony_ci * question can be removed.
64862306a36Sopenharmony_ci *
64962306a36Sopenharmony_ci * Note that nodes with the nextbyte tag set will not be removed by
65062306a36Sopenharmony_ci * this to ensure tree integrity.  Note as well that the structure of
65162306a36Sopenharmony_ci * utf8 ensures that these nodes would not have been candidates for
65262306a36Sopenharmony_ci * removal in any case.
65362306a36Sopenharmony_ci */
65462306a36Sopenharmony_cistatic void prune(struct tree *tree)
65562306a36Sopenharmony_ci{
65662306a36Sopenharmony_ci	struct node *node;
65762306a36Sopenharmony_ci	struct node *left;
65862306a36Sopenharmony_ci	struct node *right;
65962306a36Sopenharmony_ci	struct node *parent;
66062306a36Sopenharmony_ci	void *leftleaf;
66162306a36Sopenharmony_ci	void *rightleaf;
66262306a36Sopenharmony_ci	unsigned int leftmask;
66362306a36Sopenharmony_ci	unsigned int rightmask;
66462306a36Sopenharmony_ci	unsigned int bitmask;
66562306a36Sopenharmony_ci	int count;
66662306a36Sopenharmony_ci
66762306a36Sopenharmony_ci	if (verbose > 0)
66862306a36Sopenharmony_ci		printf("Pruning %s_%x\n", tree->type, tree->maxage);
66962306a36Sopenharmony_ci
67062306a36Sopenharmony_ci	count = 0;
67162306a36Sopenharmony_ci	if (tree->childnode == LEAF)
67262306a36Sopenharmony_ci		return;
67362306a36Sopenharmony_ci	if (!tree->root)
67462306a36Sopenharmony_ci		return;
67562306a36Sopenharmony_ci
67662306a36Sopenharmony_ci	leftmask = rightmask = 0;
67762306a36Sopenharmony_ci	node = tree->root;
67862306a36Sopenharmony_ci	while (node) {
67962306a36Sopenharmony_ci		if (node->nextbyte)
68062306a36Sopenharmony_ci			goto advance;
68162306a36Sopenharmony_ci		if (node->leftnode == LEAF)
68262306a36Sopenharmony_ci			goto advance;
68362306a36Sopenharmony_ci		if (node->rightnode == LEAF)
68462306a36Sopenharmony_ci			goto advance;
68562306a36Sopenharmony_ci		if (!node->left)
68662306a36Sopenharmony_ci			goto advance;
68762306a36Sopenharmony_ci		if (!node->right)
68862306a36Sopenharmony_ci			goto advance;
68962306a36Sopenharmony_ci		left = node->left;
69062306a36Sopenharmony_ci		right = node->right;
69162306a36Sopenharmony_ci		if (left->keymask == 0)
69262306a36Sopenharmony_ci			goto advance;
69362306a36Sopenharmony_ci		if (right->keymask == 0)
69462306a36Sopenharmony_ci			goto advance;
69562306a36Sopenharmony_ci		if (left->keymask != right->keymask)
69662306a36Sopenharmony_ci			goto advance;
69762306a36Sopenharmony_ci		if (left->keybits != right->keybits)
69862306a36Sopenharmony_ci			goto advance;
69962306a36Sopenharmony_ci		leftleaf = NULL;
70062306a36Sopenharmony_ci		while (!leftleaf) {
70162306a36Sopenharmony_ci			assert(left->left || left->right);
70262306a36Sopenharmony_ci			if (left->leftnode == LEAF)
70362306a36Sopenharmony_ci				leftleaf = left->left;
70462306a36Sopenharmony_ci			else if (left->rightnode == LEAF)
70562306a36Sopenharmony_ci				leftleaf = left->right;
70662306a36Sopenharmony_ci			else if (left->left)
70762306a36Sopenharmony_ci				left = left->left;
70862306a36Sopenharmony_ci			else if (left->right)
70962306a36Sopenharmony_ci				left = left->right;
71062306a36Sopenharmony_ci			else
71162306a36Sopenharmony_ci				assert(0);
71262306a36Sopenharmony_ci		}
71362306a36Sopenharmony_ci		rightleaf = NULL;
71462306a36Sopenharmony_ci		while (!rightleaf) {
71562306a36Sopenharmony_ci			assert(right->left || right->right);
71662306a36Sopenharmony_ci			if (right->leftnode == LEAF)
71762306a36Sopenharmony_ci				rightleaf = right->left;
71862306a36Sopenharmony_ci			else if (right->rightnode == LEAF)
71962306a36Sopenharmony_ci				rightleaf = right->right;
72062306a36Sopenharmony_ci			else if (right->left)
72162306a36Sopenharmony_ci				right = right->left;
72262306a36Sopenharmony_ci			else if (right->right)
72362306a36Sopenharmony_ci				right = right->right;
72462306a36Sopenharmony_ci			else
72562306a36Sopenharmony_ci				assert(0);
72662306a36Sopenharmony_ci		}
72762306a36Sopenharmony_ci		if (! tree->leaf_equal(leftleaf, rightleaf))
72862306a36Sopenharmony_ci			goto advance;
72962306a36Sopenharmony_ci		/*
73062306a36Sopenharmony_ci		 * This node has identical singleton-only subtrees.
73162306a36Sopenharmony_ci		 * Remove it.
73262306a36Sopenharmony_ci		 */
73362306a36Sopenharmony_ci		parent = node->parent;
73462306a36Sopenharmony_ci		left = node->left;
73562306a36Sopenharmony_ci		right = node->right;
73662306a36Sopenharmony_ci		if (parent->left == node)
73762306a36Sopenharmony_ci			parent->left = left;
73862306a36Sopenharmony_ci		else if (parent->right == node)
73962306a36Sopenharmony_ci			parent->right = left;
74062306a36Sopenharmony_ci		else
74162306a36Sopenharmony_ci			assert(0);
74262306a36Sopenharmony_ci		left->parent = parent;
74362306a36Sopenharmony_ci		left->keymask |= (1 << node->bitnum);
74462306a36Sopenharmony_ci		node->left = NULL;
74562306a36Sopenharmony_ci		while (node) {
74662306a36Sopenharmony_ci			bitmask = 1 << node->bitnum;
74762306a36Sopenharmony_ci			leftmask &= ~bitmask;
74862306a36Sopenharmony_ci			rightmask &= ~bitmask;
74962306a36Sopenharmony_ci			if (node->leftnode == NODE && node->left) {
75062306a36Sopenharmony_ci				left = node->left;
75162306a36Sopenharmony_ci				free(node);
75262306a36Sopenharmony_ci				count++;
75362306a36Sopenharmony_ci				node = left;
75462306a36Sopenharmony_ci			} else if (node->rightnode == NODE && node->right) {
75562306a36Sopenharmony_ci				right = node->right;
75662306a36Sopenharmony_ci				free(node);
75762306a36Sopenharmony_ci				count++;
75862306a36Sopenharmony_ci				node = right;
75962306a36Sopenharmony_ci			} else {
76062306a36Sopenharmony_ci				node = NULL;
76162306a36Sopenharmony_ci			}
76262306a36Sopenharmony_ci		}
76362306a36Sopenharmony_ci		/* Propagate keymasks up along singleton chains. */
76462306a36Sopenharmony_ci		node = parent;
76562306a36Sopenharmony_ci		/* Force re-check */
76662306a36Sopenharmony_ci		bitmask = 1 << node->bitnum;
76762306a36Sopenharmony_ci		leftmask &= ~bitmask;
76862306a36Sopenharmony_ci		rightmask &= ~bitmask;
76962306a36Sopenharmony_ci		for (;;) {
77062306a36Sopenharmony_ci			if (node->left && node->right)
77162306a36Sopenharmony_ci				break;
77262306a36Sopenharmony_ci			if (node->left) {
77362306a36Sopenharmony_ci				left = node->left;
77462306a36Sopenharmony_ci				node->keymask |= left->keymask;
77562306a36Sopenharmony_ci				node->keybits |= left->keybits;
77662306a36Sopenharmony_ci			}
77762306a36Sopenharmony_ci			if (node->right) {
77862306a36Sopenharmony_ci				right = node->right;
77962306a36Sopenharmony_ci				node->keymask |= right->keymask;
78062306a36Sopenharmony_ci				node->keybits |= right->keybits;
78162306a36Sopenharmony_ci			}
78262306a36Sopenharmony_ci			node->keymask |= (1 << node->bitnum);
78362306a36Sopenharmony_ci			node = node->parent;
78462306a36Sopenharmony_ci			/* Force re-check */
78562306a36Sopenharmony_ci			bitmask = 1 << node->bitnum;
78662306a36Sopenharmony_ci			leftmask &= ~bitmask;
78762306a36Sopenharmony_ci			rightmask &= ~bitmask;
78862306a36Sopenharmony_ci		}
78962306a36Sopenharmony_ci	advance:
79062306a36Sopenharmony_ci		bitmask = 1 << node->bitnum;
79162306a36Sopenharmony_ci		if ((leftmask & bitmask) == 0 &&
79262306a36Sopenharmony_ci		    node->leftnode == NODE &&
79362306a36Sopenharmony_ci		    node->left) {
79462306a36Sopenharmony_ci			leftmask |= bitmask;
79562306a36Sopenharmony_ci			node = node->left;
79662306a36Sopenharmony_ci		} else if ((rightmask & bitmask) == 0 &&
79762306a36Sopenharmony_ci			   node->rightnode == NODE &&
79862306a36Sopenharmony_ci			   node->right) {
79962306a36Sopenharmony_ci			rightmask |= bitmask;
80062306a36Sopenharmony_ci			node = node->right;
80162306a36Sopenharmony_ci		} else {
80262306a36Sopenharmony_ci			leftmask &= ~bitmask;
80362306a36Sopenharmony_ci			rightmask &= ~bitmask;
80462306a36Sopenharmony_ci			node = node->parent;
80562306a36Sopenharmony_ci		}
80662306a36Sopenharmony_ci	}
80762306a36Sopenharmony_ci	if (verbose > 0)
80862306a36Sopenharmony_ci		printf("Pruned %d nodes\n", count);
80962306a36Sopenharmony_ci}
81062306a36Sopenharmony_ci
81162306a36Sopenharmony_ci/*
81262306a36Sopenharmony_ci * Mark the nodes in the tree that lead to leaves that must be
81362306a36Sopenharmony_ci * emitted.
81462306a36Sopenharmony_ci */
81562306a36Sopenharmony_cistatic void mark_nodes(struct tree *tree)
81662306a36Sopenharmony_ci{
81762306a36Sopenharmony_ci	struct node *node;
81862306a36Sopenharmony_ci	struct node *n;
81962306a36Sopenharmony_ci	unsigned int leftmask;
82062306a36Sopenharmony_ci	unsigned int rightmask;
82162306a36Sopenharmony_ci	unsigned int bitmask;
82262306a36Sopenharmony_ci	int marked;
82362306a36Sopenharmony_ci
82462306a36Sopenharmony_ci	marked = 0;
82562306a36Sopenharmony_ci	if (verbose > 0)
82662306a36Sopenharmony_ci		printf("Marking %s_%x\n", tree->type, tree->maxage);
82762306a36Sopenharmony_ci	if (tree->childnode == LEAF)
82862306a36Sopenharmony_ci		goto done;
82962306a36Sopenharmony_ci
83062306a36Sopenharmony_ci	assert(tree->childnode == NODE);
83162306a36Sopenharmony_ci	node = tree->root;
83262306a36Sopenharmony_ci	leftmask = rightmask = 0;
83362306a36Sopenharmony_ci	while (node) {
83462306a36Sopenharmony_ci		bitmask = 1 << node->bitnum;
83562306a36Sopenharmony_ci		if ((leftmask & bitmask) == 0) {
83662306a36Sopenharmony_ci			leftmask |= bitmask;
83762306a36Sopenharmony_ci			if (node->leftnode == LEAF) {
83862306a36Sopenharmony_ci				assert(node->left);
83962306a36Sopenharmony_ci				if (tree->leaf_mark(node->left)) {
84062306a36Sopenharmony_ci					n = node;
84162306a36Sopenharmony_ci					while (n && !n->mark) {
84262306a36Sopenharmony_ci						marked++;
84362306a36Sopenharmony_ci						n->mark = 1;
84462306a36Sopenharmony_ci						n = n->parent;
84562306a36Sopenharmony_ci					}
84662306a36Sopenharmony_ci				}
84762306a36Sopenharmony_ci			} else if (node->left) {
84862306a36Sopenharmony_ci				assert(node->leftnode == NODE);
84962306a36Sopenharmony_ci				node = node->left;
85062306a36Sopenharmony_ci				continue;
85162306a36Sopenharmony_ci			}
85262306a36Sopenharmony_ci		}
85362306a36Sopenharmony_ci		if ((rightmask & bitmask) == 0) {
85462306a36Sopenharmony_ci			rightmask |= bitmask;
85562306a36Sopenharmony_ci			if (node->rightnode == LEAF) {
85662306a36Sopenharmony_ci				assert(node->right);
85762306a36Sopenharmony_ci				if (tree->leaf_mark(node->right)) {
85862306a36Sopenharmony_ci					n = node;
85962306a36Sopenharmony_ci					while (n && !n->mark) {
86062306a36Sopenharmony_ci						marked++;
86162306a36Sopenharmony_ci						n->mark = 1;
86262306a36Sopenharmony_ci						n = n->parent;
86362306a36Sopenharmony_ci					}
86462306a36Sopenharmony_ci				}
86562306a36Sopenharmony_ci			} else if (node->right) {
86662306a36Sopenharmony_ci				assert(node->rightnode == NODE);
86762306a36Sopenharmony_ci				node = node->right;
86862306a36Sopenharmony_ci				continue;
86962306a36Sopenharmony_ci			}
87062306a36Sopenharmony_ci		}
87162306a36Sopenharmony_ci		leftmask &= ~bitmask;
87262306a36Sopenharmony_ci		rightmask &= ~bitmask;
87362306a36Sopenharmony_ci		node = node->parent;
87462306a36Sopenharmony_ci	}
87562306a36Sopenharmony_ci
87662306a36Sopenharmony_ci	/* second pass: left siblings and singletons */
87762306a36Sopenharmony_ci
87862306a36Sopenharmony_ci	assert(tree->childnode == NODE);
87962306a36Sopenharmony_ci	node = tree->root;
88062306a36Sopenharmony_ci	leftmask = rightmask = 0;
88162306a36Sopenharmony_ci	while (node) {
88262306a36Sopenharmony_ci		bitmask = 1 << node->bitnum;
88362306a36Sopenharmony_ci		if ((leftmask & bitmask) == 0) {
88462306a36Sopenharmony_ci			leftmask |= bitmask;
88562306a36Sopenharmony_ci			if (node->leftnode == LEAF) {
88662306a36Sopenharmony_ci				assert(node->left);
88762306a36Sopenharmony_ci				if (tree->leaf_mark(node->left)) {
88862306a36Sopenharmony_ci					n = node;
88962306a36Sopenharmony_ci					while (n && !n->mark) {
89062306a36Sopenharmony_ci						marked++;
89162306a36Sopenharmony_ci						n->mark = 1;
89262306a36Sopenharmony_ci						n = n->parent;
89362306a36Sopenharmony_ci					}
89462306a36Sopenharmony_ci				}
89562306a36Sopenharmony_ci			} else if (node->left) {
89662306a36Sopenharmony_ci				assert(node->leftnode == NODE);
89762306a36Sopenharmony_ci				node = node->left;
89862306a36Sopenharmony_ci				if (!node->mark && node->parent->mark) {
89962306a36Sopenharmony_ci					marked++;
90062306a36Sopenharmony_ci					node->mark = 1;
90162306a36Sopenharmony_ci				}
90262306a36Sopenharmony_ci				continue;
90362306a36Sopenharmony_ci			}
90462306a36Sopenharmony_ci		}
90562306a36Sopenharmony_ci		if ((rightmask & bitmask) == 0) {
90662306a36Sopenharmony_ci			rightmask |= bitmask;
90762306a36Sopenharmony_ci			if (node->rightnode == LEAF) {
90862306a36Sopenharmony_ci				assert(node->right);
90962306a36Sopenharmony_ci				if (tree->leaf_mark(node->right)) {
91062306a36Sopenharmony_ci					n = node;
91162306a36Sopenharmony_ci					while (n && !n->mark) {
91262306a36Sopenharmony_ci						marked++;
91362306a36Sopenharmony_ci						n->mark = 1;
91462306a36Sopenharmony_ci						n = n->parent;
91562306a36Sopenharmony_ci					}
91662306a36Sopenharmony_ci				}
91762306a36Sopenharmony_ci			} else if (node->right) {
91862306a36Sopenharmony_ci				assert(node->rightnode == NODE);
91962306a36Sopenharmony_ci				node = node->right;
92062306a36Sopenharmony_ci				if (!node->mark && node->parent->mark &&
92162306a36Sopenharmony_ci				    !node->parent->left) {
92262306a36Sopenharmony_ci					marked++;
92362306a36Sopenharmony_ci					node->mark = 1;
92462306a36Sopenharmony_ci				}
92562306a36Sopenharmony_ci				continue;
92662306a36Sopenharmony_ci			}
92762306a36Sopenharmony_ci		}
92862306a36Sopenharmony_ci		leftmask &= ~bitmask;
92962306a36Sopenharmony_ci		rightmask &= ~bitmask;
93062306a36Sopenharmony_ci		node = node->parent;
93162306a36Sopenharmony_ci	}
93262306a36Sopenharmony_cidone:
93362306a36Sopenharmony_ci	if (verbose > 0)
93462306a36Sopenharmony_ci		printf("Marked %d nodes\n", marked);
93562306a36Sopenharmony_ci}
93662306a36Sopenharmony_ci
93762306a36Sopenharmony_ci/*
93862306a36Sopenharmony_ci * Compute the index of each node and leaf, which is the offset in the
93962306a36Sopenharmony_ci * emitted trie.  These values must be pre-computed because relative
94062306a36Sopenharmony_ci * offsets between nodes are used to navigate the tree.
94162306a36Sopenharmony_ci */
94262306a36Sopenharmony_cistatic int index_nodes(struct tree *tree, int index)
94362306a36Sopenharmony_ci{
94462306a36Sopenharmony_ci	struct node *node;
94562306a36Sopenharmony_ci	unsigned int leftmask;
94662306a36Sopenharmony_ci	unsigned int rightmask;
94762306a36Sopenharmony_ci	unsigned int bitmask;
94862306a36Sopenharmony_ci	int count;
94962306a36Sopenharmony_ci	int indent;
95062306a36Sopenharmony_ci
95162306a36Sopenharmony_ci	/* Align to a cache line (or half a cache line?). */
95262306a36Sopenharmony_ci	while (index % 64)
95362306a36Sopenharmony_ci		index++;
95462306a36Sopenharmony_ci	tree->index = index;
95562306a36Sopenharmony_ci	indent = 1;
95662306a36Sopenharmony_ci	count = 0;
95762306a36Sopenharmony_ci
95862306a36Sopenharmony_ci	if (verbose > 0)
95962306a36Sopenharmony_ci		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
96062306a36Sopenharmony_ci	if (tree->childnode == LEAF) {
96162306a36Sopenharmony_ci		index += tree->leaf_size(tree->root);
96262306a36Sopenharmony_ci		goto done;
96362306a36Sopenharmony_ci	}
96462306a36Sopenharmony_ci
96562306a36Sopenharmony_ci	assert(tree->childnode == NODE);
96662306a36Sopenharmony_ci	node = tree->root;
96762306a36Sopenharmony_ci	leftmask = rightmask = 0;
96862306a36Sopenharmony_ci	while (node) {
96962306a36Sopenharmony_ci		if (!node->mark)
97062306a36Sopenharmony_ci			goto skip;
97162306a36Sopenharmony_ci		count++;
97262306a36Sopenharmony_ci		if (node->index != index)
97362306a36Sopenharmony_ci			node->index = index;
97462306a36Sopenharmony_ci		index += node->size;
97562306a36Sopenharmony_ciskip:
97662306a36Sopenharmony_ci		while (node) {
97762306a36Sopenharmony_ci			bitmask = 1 << node->bitnum;
97862306a36Sopenharmony_ci			if (node->mark && (leftmask & bitmask) == 0) {
97962306a36Sopenharmony_ci				leftmask |= bitmask;
98062306a36Sopenharmony_ci				if (node->leftnode == LEAF) {
98162306a36Sopenharmony_ci					assert(node->left);
98262306a36Sopenharmony_ci					*tree->leaf_index(tree, node->left) =
98362306a36Sopenharmony_ci									index;
98462306a36Sopenharmony_ci					index += tree->leaf_size(node->left);
98562306a36Sopenharmony_ci					count++;
98662306a36Sopenharmony_ci				} else if (node->left) {
98762306a36Sopenharmony_ci					assert(node->leftnode == NODE);
98862306a36Sopenharmony_ci					indent += 1;
98962306a36Sopenharmony_ci					node = node->left;
99062306a36Sopenharmony_ci					break;
99162306a36Sopenharmony_ci				}
99262306a36Sopenharmony_ci			}
99362306a36Sopenharmony_ci			if (node->mark && (rightmask & bitmask) == 0) {
99462306a36Sopenharmony_ci				rightmask |= bitmask;
99562306a36Sopenharmony_ci				if (node->rightnode == LEAF) {
99662306a36Sopenharmony_ci					assert(node->right);
99762306a36Sopenharmony_ci					*tree->leaf_index(tree, node->right) = index;
99862306a36Sopenharmony_ci					index += tree->leaf_size(node->right);
99962306a36Sopenharmony_ci					count++;
100062306a36Sopenharmony_ci				} else if (node->right) {
100162306a36Sopenharmony_ci					assert(node->rightnode == NODE);
100262306a36Sopenharmony_ci					indent += 1;
100362306a36Sopenharmony_ci					node = node->right;
100462306a36Sopenharmony_ci					break;
100562306a36Sopenharmony_ci				}
100662306a36Sopenharmony_ci			}
100762306a36Sopenharmony_ci			leftmask &= ~bitmask;
100862306a36Sopenharmony_ci			rightmask &= ~bitmask;
100962306a36Sopenharmony_ci			node = node->parent;
101062306a36Sopenharmony_ci			indent -= 1;
101162306a36Sopenharmony_ci		}
101262306a36Sopenharmony_ci	}
101362306a36Sopenharmony_cidone:
101462306a36Sopenharmony_ci	/* Round up to a multiple of 16 */
101562306a36Sopenharmony_ci	while (index % 16)
101662306a36Sopenharmony_ci		index++;
101762306a36Sopenharmony_ci	if (verbose > 0)
101862306a36Sopenharmony_ci		printf("Final index %d\n", index);
101962306a36Sopenharmony_ci	return index;
102062306a36Sopenharmony_ci}
102162306a36Sopenharmony_ci
102262306a36Sopenharmony_ci/*
102362306a36Sopenharmony_ci * Mark the nodes in a subtree, helper for size_nodes().
102462306a36Sopenharmony_ci */
102562306a36Sopenharmony_cistatic int mark_subtree(struct node *node)
102662306a36Sopenharmony_ci{
102762306a36Sopenharmony_ci	int changed;
102862306a36Sopenharmony_ci
102962306a36Sopenharmony_ci	if (!node || node->mark)
103062306a36Sopenharmony_ci		return 0;
103162306a36Sopenharmony_ci	node->mark = 1;
103262306a36Sopenharmony_ci	node->index = node->parent->index;
103362306a36Sopenharmony_ci	changed = 1;
103462306a36Sopenharmony_ci	if (node->leftnode == NODE)
103562306a36Sopenharmony_ci		changed += mark_subtree(node->left);
103662306a36Sopenharmony_ci	if (node->rightnode == NODE)
103762306a36Sopenharmony_ci		changed += mark_subtree(node->right);
103862306a36Sopenharmony_ci	return changed;
103962306a36Sopenharmony_ci}
104062306a36Sopenharmony_ci
104162306a36Sopenharmony_ci/*
104262306a36Sopenharmony_ci * Compute the size of nodes and leaves. We start by assuming that
104362306a36Sopenharmony_ci * each node needs to store a three-byte offset. The indexes of the
104462306a36Sopenharmony_ci * nodes are calculated based on that, and then this function is
104562306a36Sopenharmony_ci * called to see if the sizes of some nodes can be reduced.  This is
104662306a36Sopenharmony_ci * repeated until no more changes are seen.
104762306a36Sopenharmony_ci */
104862306a36Sopenharmony_cistatic int size_nodes(struct tree *tree)
104962306a36Sopenharmony_ci{
105062306a36Sopenharmony_ci	struct tree *next;
105162306a36Sopenharmony_ci	struct node *node;
105262306a36Sopenharmony_ci	struct node *right;
105362306a36Sopenharmony_ci	struct node *n;
105462306a36Sopenharmony_ci	unsigned int leftmask;
105562306a36Sopenharmony_ci	unsigned int rightmask;
105662306a36Sopenharmony_ci	unsigned int bitmask;
105762306a36Sopenharmony_ci	unsigned int pathbits;
105862306a36Sopenharmony_ci	unsigned int pathmask;
105962306a36Sopenharmony_ci	unsigned int nbit;
106062306a36Sopenharmony_ci	int changed;
106162306a36Sopenharmony_ci	int offset;
106262306a36Sopenharmony_ci	int size;
106362306a36Sopenharmony_ci	int indent;
106462306a36Sopenharmony_ci
106562306a36Sopenharmony_ci	indent = 1;
106662306a36Sopenharmony_ci	changed = 0;
106762306a36Sopenharmony_ci	size = 0;
106862306a36Sopenharmony_ci
106962306a36Sopenharmony_ci	if (verbose > 0)
107062306a36Sopenharmony_ci		printf("Sizing %s_%x\n", tree->type, tree->maxage);
107162306a36Sopenharmony_ci	if (tree->childnode == LEAF)
107262306a36Sopenharmony_ci		goto done;
107362306a36Sopenharmony_ci
107462306a36Sopenharmony_ci	assert(tree->childnode == NODE);
107562306a36Sopenharmony_ci	pathbits = 0;
107662306a36Sopenharmony_ci	pathmask = 0;
107762306a36Sopenharmony_ci	node = tree->root;
107862306a36Sopenharmony_ci	leftmask = rightmask = 0;
107962306a36Sopenharmony_ci	while (node) {
108062306a36Sopenharmony_ci		if (!node->mark)
108162306a36Sopenharmony_ci			goto skip;
108262306a36Sopenharmony_ci		offset = 0;
108362306a36Sopenharmony_ci		if (!node->left || !node->right) {
108462306a36Sopenharmony_ci			size = 1;
108562306a36Sopenharmony_ci		} else {
108662306a36Sopenharmony_ci			if (node->rightnode == NODE) {
108762306a36Sopenharmony_ci				/*
108862306a36Sopenharmony_ci				 * If the right node is not marked,
108962306a36Sopenharmony_ci				 * look for a corresponding node in
109062306a36Sopenharmony_ci				 * the next tree.  Such a node need
109162306a36Sopenharmony_ci				 * not exist.
109262306a36Sopenharmony_ci				 */
109362306a36Sopenharmony_ci				right = node->right;
109462306a36Sopenharmony_ci				next = tree->next;
109562306a36Sopenharmony_ci				while (!right->mark) {
109662306a36Sopenharmony_ci					assert(next);
109762306a36Sopenharmony_ci					n = next->root;
109862306a36Sopenharmony_ci					while (n->bitnum != node->bitnum) {
109962306a36Sopenharmony_ci						nbit = 1 << n->bitnum;
110062306a36Sopenharmony_ci						if (!(pathmask & nbit))
110162306a36Sopenharmony_ci							break;
110262306a36Sopenharmony_ci						if (pathbits & nbit) {
110362306a36Sopenharmony_ci							if (n->rightnode == LEAF)
110462306a36Sopenharmony_ci								break;
110562306a36Sopenharmony_ci							n = n->right;
110662306a36Sopenharmony_ci						} else {
110762306a36Sopenharmony_ci							if (n->leftnode == LEAF)
110862306a36Sopenharmony_ci								break;
110962306a36Sopenharmony_ci							n = n->left;
111062306a36Sopenharmony_ci						}
111162306a36Sopenharmony_ci					}
111262306a36Sopenharmony_ci					if (n->bitnum != node->bitnum)
111362306a36Sopenharmony_ci						break;
111462306a36Sopenharmony_ci					n = n->right;
111562306a36Sopenharmony_ci					right = n;
111662306a36Sopenharmony_ci					next = next->next;
111762306a36Sopenharmony_ci				}
111862306a36Sopenharmony_ci				/* Make sure the right node is marked. */
111962306a36Sopenharmony_ci				if (!right->mark)
112062306a36Sopenharmony_ci					changed += mark_subtree(right);
112162306a36Sopenharmony_ci				offset = right->index - node->index;
112262306a36Sopenharmony_ci			} else {
112362306a36Sopenharmony_ci				offset = *tree->leaf_index(tree, node->right);
112462306a36Sopenharmony_ci				offset -= node->index;
112562306a36Sopenharmony_ci			}
112662306a36Sopenharmony_ci			assert(offset >= 0);
112762306a36Sopenharmony_ci			assert(offset <= 0xffffff);
112862306a36Sopenharmony_ci			if (offset <= 0xff) {
112962306a36Sopenharmony_ci				size = 2;
113062306a36Sopenharmony_ci			} else if (offset <= 0xffff) {
113162306a36Sopenharmony_ci				size = 3;
113262306a36Sopenharmony_ci			} else { /* offset <= 0xffffff */
113362306a36Sopenharmony_ci				size = 4;
113462306a36Sopenharmony_ci			}
113562306a36Sopenharmony_ci		}
113662306a36Sopenharmony_ci		if (node->size != size || node->offset != offset) {
113762306a36Sopenharmony_ci			node->size = size;
113862306a36Sopenharmony_ci			node->offset = offset;
113962306a36Sopenharmony_ci			changed++;
114062306a36Sopenharmony_ci		}
114162306a36Sopenharmony_ciskip:
114262306a36Sopenharmony_ci		while (node) {
114362306a36Sopenharmony_ci			bitmask = 1 << node->bitnum;
114462306a36Sopenharmony_ci			pathmask |= bitmask;
114562306a36Sopenharmony_ci			if (node->mark && (leftmask & bitmask) == 0) {
114662306a36Sopenharmony_ci				leftmask |= bitmask;
114762306a36Sopenharmony_ci				if (node->leftnode == LEAF) {
114862306a36Sopenharmony_ci					assert(node->left);
114962306a36Sopenharmony_ci				} else if (node->left) {
115062306a36Sopenharmony_ci					assert(node->leftnode == NODE);
115162306a36Sopenharmony_ci					indent += 1;
115262306a36Sopenharmony_ci					node = node->left;
115362306a36Sopenharmony_ci					break;
115462306a36Sopenharmony_ci				}
115562306a36Sopenharmony_ci			}
115662306a36Sopenharmony_ci			if (node->mark && (rightmask & bitmask) == 0) {
115762306a36Sopenharmony_ci				rightmask |= bitmask;
115862306a36Sopenharmony_ci				pathbits |= bitmask;
115962306a36Sopenharmony_ci				if (node->rightnode == LEAF) {
116062306a36Sopenharmony_ci					assert(node->right);
116162306a36Sopenharmony_ci				} else if (node->right) {
116262306a36Sopenharmony_ci					assert(node->rightnode == NODE);
116362306a36Sopenharmony_ci					indent += 1;
116462306a36Sopenharmony_ci					node = node->right;
116562306a36Sopenharmony_ci					break;
116662306a36Sopenharmony_ci				}
116762306a36Sopenharmony_ci			}
116862306a36Sopenharmony_ci			leftmask &= ~bitmask;
116962306a36Sopenharmony_ci			rightmask &= ~bitmask;
117062306a36Sopenharmony_ci			pathmask &= ~bitmask;
117162306a36Sopenharmony_ci			pathbits &= ~bitmask;
117262306a36Sopenharmony_ci			node = node->parent;
117362306a36Sopenharmony_ci			indent -= 1;
117462306a36Sopenharmony_ci		}
117562306a36Sopenharmony_ci	}
117662306a36Sopenharmony_cidone:
117762306a36Sopenharmony_ci	if (verbose > 0)
117862306a36Sopenharmony_ci		printf("Found %d changes\n", changed);
117962306a36Sopenharmony_ci	return changed;
118062306a36Sopenharmony_ci}
118162306a36Sopenharmony_ci
118262306a36Sopenharmony_ci/*
118362306a36Sopenharmony_ci * Emit a trie for the given tree into the data array.
118462306a36Sopenharmony_ci */
118562306a36Sopenharmony_cistatic void emit(struct tree *tree, unsigned char *data)
118662306a36Sopenharmony_ci{
118762306a36Sopenharmony_ci	struct node *node;
118862306a36Sopenharmony_ci	unsigned int leftmask;
118962306a36Sopenharmony_ci	unsigned int rightmask;
119062306a36Sopenharmony_ci	unsigned int bitmask;
119162306a36Sopenharmony_ci	int offlen;
119262306a36Sopenharmony_ci	int offset;
119362306a36Sopenharmony_ci	int index;
119462306a36Sopenharmony_ci	int indent;
119562306a36Sopenharmony_ci	int size;
119662306a36Sopenharmony_ci	int bytes;
119762306a36Sopenharmony_ci	int leaves;
119862306a36Sopenharmony_ci	int nodes[4];
119962306a36Sopenharmony_ci	unsigned char byte;
120062306a36Sopenharmony_ci
120162306a36Sopenharmony_ci	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
120262306a36Sopenharmony_ci	leaves = 0;
120362306a36Sopenharmony_ci	bytes = 0;
120462306a36Sopenharmony_ci	index = tree->index;
120562306a36Sopenharmony_ci	data += index;
120662306a36Sopenharmony_ci	indent = 1;
120762306a36Sopenharmony_ci	if (verbose > 0)
120862306a36Sopenharmony_ci		printf("Emitting %s_%x\n", tree->type, tree->maxage);
120962306a36Sopenharmony_ci	if (tree->childnode == LEAF) {
121062306a36Sopenharmony_ci		assert(tree->root);
121162306a36Sopenharmony_ci		tree->leaf_emit(tree->root, data);
121262306a36Sopenharmony_ci		size = tree->leaf_size(tree->root);
121362306a36Sopenharmony_ci		index += size;
121462306a36Sopenharmony_ci		leaves++;
121562306a36Sopenharmony_ci		goto done;
121662306a36Sopenharmony_ci	}
121762306a36Sopenharmony_ci
121862306a36Sopenharmony_ci	assert(tree->childnode == NODE);
121962306a36Sopenharmony_ci	node = tree->root;
122062306a36Sopenharmony_ci	leftmask = rightmask = 0;
122162306a36Sopenharmony_ci	while (node) {
122262306a36Sopenharmony_ci		if (!node->mark)
122362306a36Sopenharmony_ci			goto skip;
122462306a36Sopenharmony_ci		assert(node->offset != -1);
122562306a36Sopenharmony_ci		assert(node->index == index);
122662306a36Sopenharmony_ci
122762306a36Sopenharmony_ci		byte = 0;
122862306a36Sopenharmony_ci		if (node->nextbyte)
122962306a36Sopenharmony_ci			byte |= NEXTBYTE;
123062306a36Sopenharmony_ci		byte |= (node->bitnum & BITNUM);
123162306a36Sopenharmony_ci		if (node->left && node->right) {
123262306a36Sopenharmony_ci			if (node->leftnode == NODE)
123362306a36Sopenharmony_ci				byte |= LEFTNODE;
123462306a36Sopenharmony_ci			if (node->rightnode == NODE)
123562306a36Sopenharmony_ci				byte |= RIGHTNODE;
123662306a36Sopenharmony_ci			if (node->offset <= 0xff)
123762306a36Sopenharmony_ci				offlen = 1;
123862306a36Sopenharmony_ci			else if (node->offset <= 0xffff)
123962306a36Sopenharmony_ci				offlen = 2;
124062306a36Sopenharmony_ci			else
124162306a36Sopenharmony_ci				offlen = 3;
124262306a36Sopenharmony_ci			nodes[offlen]++;
124362306a36Sopenharmony_ci			offset = node->offset;
124462306a36Sopenharmony_ci			byte |= offlen << OFFLEN_SHIFT;
124562306a36Sopenharmony_ci			*data++ = byte;
124662306a36Sopenharmony_ci			index++;
124762306a36Sopenharmony_ci			while (offlen--) {
124862306a36Sopenharmony_ci				*data++ = offset & 0xff;
124962306a36Sopenharmony_ci				index++;
125062306a36Sopenharmony_ci				offset >>= 8;
125162306a36Sopenharmony_ci			}
125262306a36Sopenharmony_ci		} else if (node->left) {
125362306a36Sopenharmony_ci			if (node->leftnode == NODE)
125462306a36Sopenharmony_ci				byte |= TRIENODE;
125562306a36Sopenharmony_ci			nodes[0]++;
125662306a36Sopenharmony_ci			*data++ = byte;
125762306a36Sopenharmony_ci			index++;
125862306a36Sopenharmony_ci		} else if (node->right) {
125962306a36Sopenharmony_ci			byte |= RIGHTNODE;
126062306a36Sopenharmony_ci			if (node->rightnode == NODE)
126162306a36Sopenharmony_ci				byte |= TRIENODE;
126262306a36Sopenharmony_ci			nodes[0]++;
126362306a36Sopenharmony_ci			*data++ = byte;
126462306a36Sopenharmony_ci			index++;
126562306a36Sopenharmony_ci		} else {
126662306a36Sopenharmony_ci			assert(0);
126762306a36Sopenharmony_ci		}
126862306a36Sopenharmony_ciskip:
126962306a36Sopenharmony_ci		while (node) {
127062306a36Sopenharmony_ci			bitmask = 1 << node->bitnum;
127162306a36Sopenharmony_ci			if (node->mark && (leftmask & bitmask) == 0) {
127262306a36Sopenharmony_ci				leftmask |= bitmask;
127362306a36Sopenharmony_ci				if (node->leftnode == LEAF) {
127462306a36Sopenharmony_ci					assert(node->left);
127562306a36Sopenharmony_ci					data = tree->leaf_emit(node->left,
127662306a36Sopenharmony_ci							       data);
127762306a36Sopenharmony_ci					size = tree->leaf_size(node->left);
127862306a36Sopenharmony_ci					index += size;
127962306a36Sopenharmony_ci					bytes += size;
128062306a36Sopenharmony_ci					leaves++;
128162306a36Sopenharmony_ci				} else if (node->left) {
128262306a36Sopenharmony_ci					assert(node->leftnode == NODE);
128362306a36Sopenharmony_ci					indent += 1;
128462306a36Sopenharmony_ci					node = node->left;
128562306a36Sopenharmony_ci					break;
128662306a36Sopenharmony_ci				}
128762306a36Sopenharmony_ci			}
128862306a36Sopenharmony_ci			if (node->mark && (rightmask & bitmask) == 0) {
128962306a36Sopenharmony_ci				rightmask |= bitmask;
129062306a36Sopenharmony_ci				if (node->rightnode == LEAF) {
129162306a36Sopenharmony_ci					assert(node->right);
129262306a36Sopenharmony_ci					data = tree->leaf_emit(node->right,
129362306a36Sopenharmony_ci							       data);
129462306a36Sopenharmony_ci					size = tree->leaf_size(node->right);
129562306a36Sopenharmony_ci					index += size;
129662306a36Sopenharmony_ci					bytes += size;
129762306a36Sopenharmony_ci					leaves++;
129862306a36Sopenharmony_ci				} else if (node->right) {
129962306a36Sopenharmony_ci					assert(node->rightnode == NODE);
130062306a36Sopenharmony_ci					indent += 1;
130162306a36Sopenharmony_ci					node = node->right;
130262306a36Sopenharmony_ci					break;
130362306a36Sopenharmony_ci				}
130462306a36Sopenharmony_ci			}
130562306a36Sopenharmony_ci			leftmask &= ~bitmask;
130662306a36Sopenharmony_ci			rightmask &= ~bitmask;
130762306a36Sopenharmony_ci			node = node->parent;
130862306a36Sopenharmony_ci			indent -= 1;
130962306a36Sopenharmony_ci		}
131062306a36Sopenharmony_ci	}
131162306a36Sopenharmony_cidone:
131262306a36Sopenharmony_ci	if (verbose > 0) {
131362306a36Sopenharmony_ci		printf("Emitted %d (%d) leaves",
131462306a36Sopenharmony_ci			leaves, bytes);
131562306a36Sopenharmony_ci		printf(" %d (%d+%d+%d+%d) nodes",
131662306a36Sopenharmony_ci			nodes[0] + nodes[1] + nodes[2] + nodes[3],
131762306a36Sopenharmony_ci			nodes[0], nodes[1], nodes[2], nodes[3]);
131862306a36Sopenharmony_ci		printf(" %d total\n", index - tree->index);
131962306a36Sopenharmony_ci	}
132062306a36Sopenharmony_ci}
132162306a36Sopenharmony_ci
132262306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
132362306a36Sopenharmony_ci
132462306a36Sopenharmony_ci/*
132562306a36Sopenharmony_ci * Unicode data.
132662306a36Sopenharmony_ci *
132762306a36Sopenharmony_ci * We need to keep track of the Canonical Combining Class, the Age,
132862306a36Sopenharmony_ci * and decompositions for a code point.
132962306a36Sopenharmony_ci *
133062306a36Sopenharmony_ci * For the Age, we store the index into the ages table.  Effectively
133162306a36Sopenharmony_ci * this is a generation number that the table maps to a unicode
133262306a36Sopenharmony_ci * version.
133362306a36Sopenharmony_ci *
133462306a36Sopenharmony_ci * The correction field is used to indicate that this entry is in the
133562306a36Sopenharmony_ci * corrections array, which contains decompositions that were
133662306a36Sopenharmony_ci * corrected in later revisions.  The value of the correction field is
133762306a36Sopenharmony_ci * the Unicode version in which the mapping was corrected.
133862306a36Sopenharmony_ci */
133962306a36Sopenharmony_cistruct unicode_data {
134062306a36Sopenharmony_ci	unsigned int code;
134162306a36Sopenharmony_ci	int ccc;
134262306a36Sopenharmony_ci	int gen;
134362306a36Sopenharmony_ci	int correction;
134462306a36Sopenharmony_ci	unsigned int *utf32nfdi;
134562306a36Sopenharmony_ci	unsigned int *utf32nfdicf;
134662306a36Sopenharmony_ci	char *utf8nfdi;
134762306a36Sopenharmony_ci	char *utf8nfdicf;
134862306a36Sopenharmony_ci};
134962306a36Sopenharmony_ci
135062306a36Sopenharmony_cistruct unicode_data unicode_data[0x110000];
135162306a36Sopenharmony_cistruct unicode_data *corrections;
135262306a36Sopenharmony_ciint    corrections_count;
135362306a36Sopenharmony_ci
135462306a36Sopenharmony_cistruct tree *nfdi_tree;
135562306a36Sopenharmony_cistruct tree *nfdicf_tree;
135662306a36Sopenharmony_ci
135762306a36Sopenharmony_cistruct tree *trees;
135862306a36Sopenharmony_ciint          trees_count;
135962306a36Sopenharmony_ci
136062306a36Sopenharmony_ci/*
136162306a36Sopenharmony_ci * Check the corrections array to see if this entry was corrected at
136262306a36Sopenharmony_ci * some point.
136362306a36Sopenharmony_ci */
136462306a36Sopenharmony_cistatic struct unicode_data *corrections_lookup(struct unicode_data *u)
136562306a36Sopenharmony_ci{
136662306a36Sopenharmony_ci	int i;
136762306a36Sopenharmony_ci
136862306a36Sopenharmony_ci	for (i = 0; i != corrections_count; i++)
136962306a36Sopenharmony_ci		if (u->code == corrections[i].code)
137062306a36Sopenharmony_ci			return &corrections[i];
137162306a36Sopenharmony_ci	return u;
137262306a36Sopenharmony_ci}
137362306a36Sopenharmony_ci
137462306a36Sopenharmony_cistatic int nfdi_equal(void *l, void *r)
137562306a36Sopenharmony_ci{
137662306a36Sopenharmony_ci	struct unicode_data *left = l;
137762306a36Sopenharmony_ci	struct unicode_data *right = r;
137862306a36Sopenharmony_ci
137962306a36Sopenharmony_ci	if (left->gen != right->gen)
138062306a36Sopenharmony_ci		return 0;
138162306a36Sopenharmony_ci	if (left->ccc != right->ccc)
138262306a36Sopenharmony_ci		return 0;
138362306a36Sopenharmony_ci	if (left->utf8nfdi && right->utf8nfdi &&
138462306a36Sopenharmony_ci	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
138562306a36Sopenharmony_ci		return 1;
138662306a36Sopenharmony_ci	if (left->utf8nfdi || right->utf8nfdi)
138762306a36Sopenharmony_ci		return 0;
138862306a36Sopenharmony_ci	return 1;
138962306a36Sopenharmony_ci}
139062306a36Sopenharmony_ci
139162306a36Sopenharmony_cistatic int nfdicf_equal(void *l, void *r)
139262306a36Sopenharmony_ci{
139362306a36Sopenharmony_ci	struct unicode_data *left = l;
139462306a36Sopenharmony_ci	struct unicode_data *right = r;
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_ci	if (left->gen != right->gen)
139762306a36Sopenharmony_ci		return 0;
139862306a36Sopenharmony_ci	if (left->ccc != right->ccc)
139962306a36Sopenharmony_ci		return 0;
140062306a36Sopenharmony_ci	if (left->utf8nfdicf && right->utf8nfdicf &&
140162306a36Sopenharmony_ci	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
140262306a36Sopenharmony_ci		return 1;
140362306a36Sopenharmony_ci	if (left->utf8nfdicf && right->utf8nfdicf)
140462306a36Sopenharmony_ci		return 0;
140562306a36Sopenharmony_ci	if (left->utf8nfdicf || right->utf8nfdicf)
140662306a36Sopenharmony_ci		return 0;
140762306a36Sopenharmony_ci	if (left->utf8nfdi && right->utf8nfdi &&
140862306a36Sopenharmony_ci	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
140962306a36Sopenharmony_ci		return 1;
141062306a36Sopenharmony_ci	if (left->utf8nfdi || right->utf8nfdi)
141162306a36Sopenharmony_ci		return 0;
141262306a36Sopenharmony_ci	return 1;
141362306a36Sopenharmony_ci}
141462306a36Sopenharmony_ci
141562306a36Sopenharmony_cistatic void nfdi_print(void *l, int indent)
141662306a36Sopenharmony_ci{
141762306a36Sopenharmony_ci	struct unicode_data *leaf = l;
141862306a36Sopenharmony_ci
141962306a36Sopenharmony_ci	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
142062306a36Sopenharmony_ci		leaf->code, leaf->ccc, leaf->gen);
142162306a36Sopenharmony_ci
142262306a36Sopenharmony_ci	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
142362306a36Sopenharmony_ci		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
142462306a36Sopenharmony_ci	else if (leaf->utf8nfdi)
142562306a36Sopenharmony_ci		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
142662306a36Sopenharmony_ci
142762306a36Sopenharmony_ci	printf("\n");
142862306a36Sopenharmony_ci}
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_cistatic void nfdicf_print(void *l, int indent)
143162306a36Sopenharmony_ci{
143262306a36Sopenharmony_ci	struct unicode_data *leaf = l;
143362306a36Sopenharmony_ci
143462306a36Sopenharmony_ci	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
143562306a36Sopenharmony_ci		leaf->code, leaf->ccc, leaf->gen);
143662306a36Sopenharmony_ci
143762306a36Sopenharmony_ci	if (leaf->utf8nfdicf)
143862306a36Sopenharmony_ci		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
143962306a36Sopenharmony_ci	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
144062306a36Sopenharmony_ci		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
144162306a36Sopenharmony_ci	else if (leaf->utf8nfdi)
144262306a36Sopenharmony_ci		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
144362306a36Sopenharmony_ci	printf("\n");
144462306a36Sopenharmony_ci}
144562306a36Sopenharmony_ci
144662306a36Sopenharmony_cistatic int nfdi_mark(void *l)
144762306a36Sopenharmony_ci{
144862306a36Sopenharmony_ci	return 1;
144962306a36Sopenharmony_ci}
145062306a36Sopenharmony_ci
145162306a36Sopenharmony_cistatic int nfdicf_mark(void *l)
145262306a36Sopenharmony_ci{
145362306a36Sopenharmony_ci	struct unicode_data *leaf = l;
145462306a36Sopenharmony_ci
145562306a36Sopenharmony_ci	if (leaf->utf8nfdicf)
145662306a36Sopenharmony_ci		return 1;
145762306a36Sopenharmony_ci	return 0;
145862306a36Sopenharmony_ci}
145962306a36Sopenharmony_ci
146062306a36Sopenharmony_cistatic int correction_mark(void *l)
146162306a36Sopenharmony_ci{
146262306a36Sopenharmony_ci	struct unicode_data *leaf = l;
146362306a36Sopenharmony_ci
146462306a36Sopenharmony_ci	return leaf->correction;
146562306a36Sopenharmony_ci}
146662306a36Sopenharmony_ci
146762306a36Sopenharmony_cistatic int nfdi_size(void *l)
146862306a36Sopenharmony_ci{
146962306a36Sopenharmony_ci	struct unicode_data *leaf = l;
147062306a36Sopenharmony_ci	int size = 2;
147162306a36Sopenharmony_ci
147262306a36Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code))
147362306a36Sopenharmony_ci		size += 1;
147462306a36Sopenharmony_ci	else if (leaf->utf8nfdi)
147562306a36Sopenharmony_ci		size += strlen(leaf->utf8nfdi) + 1;
147662306a36Sopenharmony_ci	return size;
147762306a36Sopenharmony_ci}
147862306a36Sopenharmony_ci
147962306a36Sopenharmony_cistatic int nfdicf_size(void *l)
148062306a36Sopenharmony_ci{
148162306a36Sopenharmony_ci	struct unicode_data *leaf = l;
148262306a36Sopenharmony_ci	int size = 2;
148362306a36Sopenharmony_ci
148462306a36Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code))
148562306a36Sopenharmony_ci		size += 1;
148662306a36Sopenharmony_ci	else if (leaf->utf8nfdicf)
148762306a36Sopenharmony_ci		size += strlen(leaf->utf8nfdicf) + 1;
148862306a36Sopenharmony_ci	else if (leaf->utf8nfdi)
148962306a36Sopenharmony_ci		size += strlen(leaf->utf8nfdi) + 1;
149062306a36Sopenharmony_ci	return size;
149162306a36Sopenharmony_ci}
149262306a36Sopenharmony_ci
149362306a36Sopenharmony_cistatic int *nfdi_index(struct tree *tree, void *l)
149462306a36Sopenharmony_ci{
149562306a36Sopenharmony_ci	struct unicode_data *leaf = l;
149662306a36Sopenharmony_ci
149762306a36Sopenharmony_ci	return &tree->leafindex[leaf->code];
149862306a36Sopenharmony_ci}
149962306a36Sopenharmony_ci
150062306a36Sopenharmony_cistatic int *nfdicf_index(struct tree *tree, void *l)
150162306a36Sopenharmony_ci{
150262306a36Sopenharmony_ci	struct unicode_data *leaf = l;
150362306a36Sopenharmony_ci
150462306a36Sopenharmony_ci	return &tree->leafindex[leaf->code];
150562306a36Sopenharmony_ci}
150662306a36Sopenharmony_ci
150762306a36Sopenharmony_cistatic unsigned char *nfdi_emit(void *l, unsigned char *data)
150862306a36Sopenharmony_ci{
150962306a36Sopenharmony_ci	struct unicode_data *leaf = l;
151062306a36Sopenharmony_ci	unsigned char *s;
151162306a36Sopenharmony_ci
151262306a36Sopenharmony_ci	*data++ = leaf->gen;
151362306a36Sopenharmony_ci
151462306a36Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code)) {
151562306a36Sopenharmony_ci		*data++ = DECOMPOSE;
151662306a36Sopenharmony_ci		*data++ = HANGUL;
151762306a36Sopenharmony_ci	} else if (leaf->utf8nfdi) {
151862306a36Sopenharmony_ci		*data++ = DECOMPOSE;
151962306a36Sopenharmony_ci		s = (unsigned char*)leaf->utf8nfdi;
152062306a36Sopenharmony_ci		while ((*data++ = *s++) != 0)
152162306a36Sopenharmony_ci			;
152262306a36Sopenharmony_ci	} else {
152362306a36Sopenharmony_ci		*data++ = leaf->ccc;
152462306a36Sopenharmony_ci	}
152562306a36Sopenharmony_ci	return data;
152662306a36Sopenharmony_ci}
152762306a36Sopenharmony_ci
152862306a36Sopenharmony_cistatic unsigned char *nfdicf_emit(void *l, unsigned char *data)
152962306a36Sopenharmony_ci{
153062306a36Sopenharmony_ci	struct unicode_data *leaf = l;
153162306a36Sopenharmony_ci	unsigned char *s;
153262306a36Sopenharmony_ci
153362306a36Sopenharmony_ci	*data++ = leaf->gen;
153462306a36Sopenharmony_ci
153562306a36Sopenharmony_ci	if (HANGUL_SYLLABLE(leaf->code)) {
153662306a36Sopenharmony_ci		*data++ = DECOMPOSE;
153762306a36Sopenharmony_ci		*data++ = HANGUL;
153862306a36Sopenharmony_ci	} else if (leaf->utf8nfdicf) {
153962306a36Sopenharmony_ci		*data++ = DECOMPOSE;
154062306a36Sopenharmony_ci		s = (unsigned char*)leaf->utf8nfdicf;
154162306a36Sopenharmony_ci		while ((*data++ = *s++) != 0)
154262306a36Sopenharmony_ci			;
154362306a36Sopenharmony_ci	} else if (leaf->utf8nfdi) {
154462306a36Sopenharmony_ci		*data++ = DECOMPOSE;
154562306a36Sopenharmony_ci		s = (unsigned char*)leaf->utf8nfdi;
154662306a36Sopenharmony_ci		while ((*data++ = *s++) != 0)
154762306a36Sopenharmony_ci			;
154862306a36Sopenharmony_ci	} else {
154962306a36Sopenharmony_ci		*data++ = leaf->ccc;
155062306a36Sopenharmony_ci	}
155162306a36Sopenharmony_ci	return data;
155262306a36Sopenharmony_ci}
155362306a36Sopenharmony_ci
155462306a36Sopenharmony_cistatic void utf8_create(struct unicode_data *data)
155562306a36Sopenharmony_ci{
155662306a36Sopenharmony_ci	char utf[18*4+1];
155762306a36Sopenharmony_ci	char *u;
155862306a36Sopenharmony_ci	unsigned int *um;
155962306a36Sopenharmony_ci	int i;
156062306a36Sopenharmony_ci
156162306a36Sopenharmony_ci	if (data->utf8nfdi) {
156262306a36Sopenharmony_ci		assert(data->utf8nfdi[0] == HANGUL);
156362306a36Sopenharmony_ci		return;
156462306a36Sopenharmony_ci	}
156562306a36Sopenharmony_ci
156662306a36Sopenharmony_ci	u = utf;
156762306a36Sopenharmony_ci	um = data->utf32nfdi;
156862306a36Sopenharmony_ci	if (um) {
156962306a36Sopenharmony_ci		for (i = 0; um[i]; i++)
157062306a36Sopenharmony_ci			u += utf8encode(u, um[i]);
157162306a36Sopenharmony_ci		*u = '\0';
157262306a36Sopenharmony_ci		data->utf8nfdi = strdup(utf);
157362306a36Sopenharmony_ci	}
157462306a36Sopenharmony_ci	u = utf;
157562306a36Sopenharmony_ci	um = data->utf32nfdicf;
157662306a36Sopenharmony_ci	if (um) {
157762306a36Sopenharmony_ci		for (i = 0; um[i]; i++)
157862306a36Sopenharmony_ci			u += utf8encode(u, um[i]);
157962306a36Sopenharmony_ci		*u = '\0';
158062306a36Sopenharmony_ci		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
158162306a36Sopenharmony_ci			data->utf8nfdicf = strdup(utf);
158262306a36Sopenharmony_ci	}
158362306a36Sopenharmony_ci}
158462306a36Sopenharmony_ci
158562306a36Sopenharmony_cistatic void utf8_init(void)
158662306a36Sopenharmony_ci{
158762306a36Sopenharmony_ci	unsigned int unichar;
158862306a36Sopenharmony_ci	int i;
158962306a36Sopenharmony_ci
159062306a36Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++)
159162306a36Sopenharmony_ci		utf8_create(&unicode_data[unichar]);
159262306a36Sopenharmony_ci
159362306a36Sopenharmony_ci	for (i = 0; i != corrections_count; i++)
159462306a36Sopenharmony_ci		utf8_create(&corrections[i]);
159562306a36Sopenharmony_ci}
159662306a36Sopenharmony_ci
159762306a36Sopenharmony_cistatic void trees_init(void)
159862306a36Sopenharmony_ci{
159962306a36Sopenharmony_ci	struct unicode_data *data;
160062306a36Sopenharmony_ci	unsigned int maxage;
160162306a36Sopenharmony_ci	unsigned int nextage;
160262306a36Sopenharmony_ci	int count;
160362306a36Sopenharmony_ci	int i;
160462306a36Sopenharmony_ci	int j;
160562306a36Sopenharmony_ci
160662306a36Sopenharmony_ci	/* Count the number of different ages. */
160762306a36Sopenharmony_ci	count = 0;
160862306a36Sopenharmony_ci	nextage = (unsigned int)-1;
160962306a36Sopenharmony_ci	do {
161062306a36Sopenharmony_ci		maxage = nextage;
161162306a36Sopenharmony_ci		nextage = 0;
161262306a36Sopenharmony_ci		for (i = 0; i <= corrections_count; i++) {
161362306a36Sopenharmony_ci			data = &corrections[i];
161462306a36Sopenharmony_ci			if (nextage < data->correction &&
161562306a36Sopenharmony_ci			    data->correction < maxage)
161662306a36Sopenharmony_ci				nextage = data->correction;
161762306a36Sopenharmony_ci		}
161862306a36Sopenharmony_ci		count++;
161962306a36Sopenharmony_ci	} while (nextage);
162062306a36Sopenharmony_ci
162162306a36Sopenharmony_ci	/* Two trees per age: nfdi and nfdicf */
162262306a36Sopenharmony_ci	trees_count = count * 2;
162362306a36Sopenharmony_ci	trees = calloc(trees_count, sizeof(struct tree));
162462306a36Sopenharmony_ci
162562306a36Sopenharmony_ci	/* Assign ages to the trees. */
162662306a36Sopenharmony_ci	count = trees_count;
162762306a36Sopenharmony_ci	nextage = (unsigned int)-1;
162862306a36Sopenharmony_ci	do {
162962306a36Sopenharmony_ci		maxage = nextage;
163062306a36Sopenharmony_ci		trees[--count].maxage = maxage;
163162306a36Sopenharmony_ci		trees[--count].maxage = maxage;
163262306a36Sopenharmony_ci		nextage = 0;
163362306a36Sopenharmony_ci		for (i = 0; i <= corrections_count; i++) {
163462306a36Sopenharmony_ci			data = &corrections[i];
163562306a36Sopenharmony_ci			if (nextage < data->correction &&
163662306a36Sopenharmony_ci			    data->correction < maxage)
163762306a36Sopenharmony_ci				nextage = data->correction;
163862306a36Sopenharmony_ci		}
163962306a36Sopenharmony_ci	} while (nextage);
164062306a36Sopenharmony_ci
164162306a36Sopenharmony_ci	/* The ages assigned above are off by one. */
164262306a36Sopenharmony_ci	for (i = 0; i != trees_count; i++) {
164362306a36Sopenharmony_ci		j = 0;
164462306a36Sopenharmony_ci		while (ages[j] < trees[i].maxage)
164562306a36Sopenharmony_ci			j++;
164662306a36Sopenharmony_ci		trees[i].maxage = ages[j-1];
164762306a36Sopenharmony_ci	}
164862306a36Sopenharmony_ci
164962306a36Sopenharmony_ci	/* Set up the forwarding between trees. */
165062306a36Sopenharmony_ci	trees[trees_count-2].next = &trees[trees_count-1];
165162306a36Sopenharmony_ci	trees[trees_count-1].leaf_mark = nfdi_mark;
165262306a36Sopenharmony_ci	trees[trees_count-2].leaf_mark = nfdicf_mark;
165362306a36Sopenharmony_ci	for (i = 0; i != trees_count-2; i += 2) {
165462306a36Sopenharmony_ci		trees[i].next = &trees[trees_count-2];
165562306a36Sopenharmony_ci		trees[i].leaf_mark = correction_mark;
165662306a36Sopenharmony_ci		trees[i+1].next = &trees[trees_count-1];
165762306a36Sopenharmony_ci		trees[i+1].leaf_mark = correction_mark;
165862306a36Sopenharmony_ci	}
165962306a36Sopenharmony_ci
166062306a36Sopenharmony_ci	/* Assign the callouts. */
166162306a36Sopenharmony_ci	for (i = 0; i != trees_count; i += 2) {
166262306a36Sopenharmony_ci		trees[i].type = "nfdicf";
166362306a36Sopenharmony_ci		trees[i].leaf_equal = nfdicf_equal;
166462306a36Sopenharmony_ci		trees[i].leaf_print = nfdicf_print;
166562306a36Sopenharmony_ci		trees[i].leaf_size = nfdicf_size;
166662306a36Sopenharmony_ci		trees[i].leaf_index = nfdicf_index;
166762306a36Sopenharmony_ci		trees[i].leaf_emit = nfdicf_emit;
166862306a36Sopenharmony_ci
166962306a36Sopenharmony_ci		trees[i+1].type = "nfdi";
167062306a36Sopenharmony_ci		trees[i+1].leaf_equal = nfdi_equal;
167162306a36Sopenharmony_ci		trees[i+1].leaf_print = nfdi_print;
167262306a36Sopenharmony_ci		trees[i+1].leaf_size = nfdi_size;
167362306a36Sopenharmony_ci		trees[i+1].leaf_index = nfdi_index;
167462306a36Sopenharmony_ci		trees[i+1].leaf_emit = nfdi_emit;
167562306a36Sopenharmony_ci	}
167662306a36Sopenharmony_ci
167762306a36Sopenharmony_ci	/* Finish init. */
167862306a36Sopenharmony_ci	for (i = 0; i != trees_count; i++)
167962306a36Sopenharmony_ci		trees[i].childnode = NODE;
168062306a36Sopenharmony_ci}
168162306a36Sopenharmony_ci
168262306a36Sopenharmony_cistatic void trees_populate(void)
168362306a36Sopenharmony_ci{
168462306a36Sopenharmony_ci	struct unicode_data *data;
168562306a36Sopenharmony_ci	unsigned int unichar;
168662306a36Sopenharmony_ci	char keyval[4];
168762306a36Sopenharmony_ci	int keylen;
168862306a36Sopenharmony_ci	int i;
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_ci	for (i = 0; i != trees_count; i++) {
169162306a36Sopenharmony_ci		if (verbose > 0) {
169262306a36Sopenharmony_ci			printf("Populating %s_%x\n",
169362306a36Sopenharmony_ci				trees[i].type, trees[i].maxage);
169462306a36Sopenharmony_ci		}
169562306a36Sopenharmony_ci		for (unichar = 0; unichar != 0x110000; unichar++) {
169662306a36Sopenharmony_ci			if (unicode_data[unichar].gen < 0)
169762306a36Sopenharmony_ci				continue;
169862306a36Sopenharmony_ci			keylen = utf8encode(keyval, unichar);
169962306a36Sopenharmony_ci			data = corrections_lookup(&unicode_data[unichar]);
170062306a36Sopenharmony_ci			if (data->correction <= trees[i].maxage)
170162306a36Sopenharmony_ci				data = &unicode_data[unichar];
170262306a36Sopenharmony_ci			insert(&trees[i], keyval, keylen, data);
170362306a36Sopenharmony_ci		}
170462306a36Sopenharmony_ci	}
170562306a36Sopenharmony_ci}
170662306a36Sopenharmony_ci
170762306a36Sopenharmony_cistatic void trees_reduce(void)
170862306a36Sopenharmony_ci{
170962306a36Sopenharmony_ci	int i;
171062306a36Sopenharmony_ci	int size;
171162306a36Sopenharmony_ci	int changed;
171262306a36Sopenharmony_ci
171362306a36Sopenharmony_ci	for (i = 0; i != trees_count; i++)
171462306a36Sopenharmony_ci		prune(&trees[i]);
171562306a36Sopenharmony_ci	for (i = 0; i != trees_count; i++)
171662306a36Sopenharmony_ci		mark_nodes(&trees[i]);
171762306a36Sopenharmony_ci	do {
171862306a36Sopenharmony_ci		size = 0;
171962306a36Sopenharmony_ci		for (i = 0; i != trees_count; i++)
172062306a36Sopenharmony_ci			size = index_nodes(&trees[i], size);
172162306a36Sopenharmony_ci		changed = 0;
172262306a36Sopenharmony_ci		for (i = 0; i != trees_count; i++)
172362306a36Sopenharmony_ci			changed += size_nodes(&trees[i]);
172462306a36Sopenharmony_ci	} while (changed);
172562306a36Sopenharmony_ci
172662306a36Sopenharmony_ci	utf8data = calloc(size, 1);
172762306a36Sopenharmony_ci	utf8data_size = size;
172862306a36Sopenharmony_ci	for (i = 0; i != trees_count; i++)
172962306a36Sopenharmony_ci		emit(&trees[i], utf8data);
173062306a36Sopenharmony_ci
173162306a36Sopenharmony_ci	if (verbose > 0) {
173262306a36Sopenharmony_ci		for (i = 0; i != trees_count; i++) {
173362306a36Sopenharmony_ci			printf("%s_%x idx %d\n",
173462306a36Sopenharmony_ci				trees[i].type, trees[i].maxage, trees[i].index);
173562306a36Sopenharmony_ci		}
173662306a36Sopenharmony_ci	}
173762306a36Sopenharmony_ci
173862306a36Sopenharmony_ci	nfdi = utf8data + trees[trees_count-1].index;
173962306a36Sopenharmony_ci	nfdicf = utf8data + trees[trees_count-2].index;
174062306a36Sopenharmony_ci
174162306a36Sopenharmony_ci	nfdi_tree = &trees[trees_count-1];
174262306a36Sopenharmony_ci	nfdicf_tree = &trees[trees_count-2];
174362306a36Sopenharmony_ci}
174462306a36Sopenharmony_ci
174562306a36Sopenharmony_cistatic void verify(struct tree *tree)
174662306a36Sopenharmony_ci{
174762306a36Sopenharmony_ci	struct unicode_data *data;
174862306a36Sopenharmony_ci	utf8leaf_t	*leaf;
174962306a36Sopenharmony_ci	unsigned int	unichar;
175062306a36Sopenharmony_ci	char		key[4];
175162306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
175262306a36Sopenharmony_ci	int		report;
175362306a36Sopenharmony_ci	int		nocf;
175462306a36Sopenharmony_ci
175562306a36Sopenharmony_ci	if (verbose > 0)
175662306a36Sopenharmony_ci		printf("Verifying %s_%x\n", tree->type, tree->maxage);
175762306a36Sopenharmony_ci	nocf = strcmp(tree->type, "nfdicf");
175862306a36Sopenharmony_ci
175962306a36Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++) {
176062306a36Sopenharmony_ci		report = 0;
176162306a36Sopenharmony_ci		data = corrections_lookup(&unicode_data[unichar]);
176262306a36Sopenharmony_ci		if (data->correction <= tree->maxage)
176362306a36Sopenharmony_ci			data = &unicode_data[unichar];
176462306a36Sopenharmony_ci		utf8encode(key,unichar);
176562306a36Sopenharmony_ci		leaf = utf8lookup(tree, hangul, key);
176662306a36Sopenharmony_ci
176762306a36Sopenharmony_ci		if (!leaf) {
176862306a36Sopenharmony_ci			if (data->gen != -1)
176962306a36Sopenharmony_ci				report++;
177062306a36Sopenharmony_ci			if (unichar < 0xd800 || unichar > 0xdfff)
177162306a36Sopenharmony_ci				report++;
177262306a36Sopenharmony_ci		} else {
177362306a36Sopenharmony_ci			if (unichar >= 0xd800 && unichar <= 0xdfff)
177462306a36Sopenharmony_ci				report++;
177562306a36Sopenharmony_ci			if (data->gen == -1)
177662306a36Sopenharmony_ci				report++;
177762306a36Sopenharmony_ci			if (data->gen != LEAF_GEN(leaf))
177862306a36Sopenharmony_ci				report++;
177962306a36Sopenharmony_ci			if (LEAF_CCC(leaf) == DECOMPOSE) {
178062306a36Sopenharmony_ci				if (HANGUL_SYLLABLE(data->code)) {
178162306a36Sopenharmony_ci					if (data->utf8nfdi[0] != HANGUL)
178262306a36Sopenharmony_ci						report++;
178362306a36Sopenharmony_ci				} else if (nocf) {
178462306a36Sopenharmony_ci					if (!data->utf8nfdi) {
178562306a36Sopenharmony_ci						report++;
178662306a36Sopenharmony_ci					} else if (strcmp(data->utf8nfdi,
178762306a36Sopenharmony_ci							  LEAF_STR(leaf))) {
178862306a36Sopenharmony_ci						report++;
178962306a36Sopenharmony_ci					}
179062306a36Sopenharmony_ci				} else {
179162306a36Sopenharmony_ci					if (!data->utf8nfdicf &&
179262306a36Sopenharmony_ci					    !data->utf8nfdi) {
179362306a36Sopenharmony_ci						report++;
179462306a36Sopenharmony_ci					} else if (data->utf8nfdicf) {
179562306a36Sopenharmony_ci						if (strcmp(data->utf8nfdicf,
179662306a36Sopenharmony_ci							   LEAF_STR(leaf)))
179762306a36Sopenharmony_ci							report++;
179862306a36Sopenharmony_ci					} else if (strcmp(data->utf8nfdi,
179962306a36Sopenharmony_ci							  LEAF_STR(leaf))) {
180062306a36Sopenharmony_ci						report++;
180162306a36Sopenharmony_ci					}
180262306a36Sopenharmony_ci				}
180362306a36Sopenharmony_ci			} else if (data->ccc != LEAF_CCC(leaf)) {
180462306a36Sopenharmony_ci				report++;
180562306a36Sopenharmony_ci			}
180662306a36Sopenharmony_ci		}
180762306a36Sopenharmony_ci		if (report) {
180862306a36Sopenharmony_ci			printf("%X code %X gen %d ccc %d"
180962306a36Sopenharmony_ci				" nfdi -> \"%s\"",
181062306a36Sopenharmony_ci				unichar, data->code, data->gen,
181162306a36Sopenharmony_ci				data->ccc,
181262306a36Sopenharmony_ci				data->utf8nfdi);
181362306a36Sopenharmony_ci			if (leaf) {
181462306a36Sopenharmony_ci				printf(" gen %d ccc %d"
181562306a36Sopenharmony_ci					" nfdi -> \"%s\"",
181662306a36Sopenharmony_ci					LEAF_GEN(leaf),
181762306a36Sopenharmony_ci					LEAF_CCC(leaf),
181862306a36Sopenharmony_ci					LEAF_CCC(leaf) == DECOMPOSE ?
181962306a36Sopenharmony_ci						LEAF_STR(leaf) : "");
182062306a36Sopenharmony_ci			}
182162306a36Sopenharmony_ci			printf("\n");
182262306a36Sopenharmony_ci		}
182362306a36Sopenharmony_ci	}
182462306a36Sopenharmony_ci}
182562306a36Sopenharmony_ci
182662306a36Sopenharmony_cistatic void trees_verify(void)
182762306a36Sopenharmony_ci{
182862306a36Sopenharmony_ci	int i;
182962306a36Sopenharmony_ci
183062306a36Sopenharmony_ci	for (i = 0; i != trees_count; i++)
183162306a36Sopenharmony_ci		verify(&trees[i]);
183262306a36Sopenharmony_ci}
183362306a36Sopenharmony_ci
183462306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
183562306a36Sopenharmony_ci
183662306a36Sopenharmony_cistatic void help(void)
183762306a36Sopenharmony_ci{
183862306a36Sopenharmony_ci	printf("Usage: %s [options]\n", argv0);
183962306a36Sopenharmony_ci	printf("\n");
184062306a36Sopenharmony_ci	printf("This program creates an a data trie used for parsing and\n");
184162306a36Sopenharmony_ci	printf("normalization of UTF-8 strings. The trie is derived from\n");
184262306a36Sopenharmony_ci	printf("a set of input files from the Unicode character database\n");
184362306a36Sopenharmony_ci	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
184462306a36Sopenharmony_ci	printf("\n");
184562306a36Sopenharmony_ci	printf("The generated tree supports two normalization forms:\n");
184662306a36Sopenharmony_ci	printf("\n");
184762306a36Sopenharmony_ci	printf("\tnfdi:\n");
184862306a36Sopenharmony_ci	printf("\t- Apply unicode normalization form NFD.\n");
184962306a36Sopenharmony_ci	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
185062306a36Sopenharmony_ci	printf("\n");
185162306a36Sopenharmony_ci	printf("\tnfdicf:\n");
185262306a36Sopenharmony_ci	printf("\t- Apply unicode normalization form NFD.\n");
185362306a36Sopenharmony_ci	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
185462306a36Sopenharmony_ci	printf("\t- Apply a full casefold (C + F).\n");
185562306a36Sopenharmony_ci	printf("\n");
185662306a36Sopenharmony_ci	printf("These forms were chosen as being most useful when dealing\n");
185762306a36Sopenharmony_ci	printf("with file names: NFD catches most cases where characters\n");
185862306a36Sopenharmony_ci	printf("should be considered equivalent. The ignorables are mostly\n");
185962306a36Sopenharmony_ci	printf("invisible, making names hard to type.\n");
186062306a36Sopenharmony_ci	printf("\n");
186162306a36Sopenharmony_ci	printf("The options to specify the files to be used are listed\n");
186262306a36Sopenharmony_ci	printf("below with their default values, which are the names used\n");
186362306a36Sopenharmony_ci	printf("by version 11.0.0 of the Unicode Character Database.\n");
186462306a36Sopenharmony_ci	printf("\n");
186562306a36Sopenharmony_ci	printf("The input files:\n");
186662306a36Sopenharmony_ci	printf("\t-a %s\n", AGE_NAME);
186762306a36Sopenharmony_ci	printf("\t-c %s\n", CCC_NAME);
186862306a36Sopenharmony_ci	printf("\t-p %s\n", PROP_NAME);
186962306a36Sopenharmony_ci	printf("\t-d %s\n", DATA_NAME);
187062306a36Sopenharmony_ci	printf("\t-f %s\n", FOLD_NAME);
187162306a36Sopenharmony_ci	printf("\t-n %s\n", NORM_NAME);
187262306a36Sopenharmony_ci	printf("\n");
187362306a36Sopenharmony_ci	printf("Additionally, the generated tables are tested using:\n");
187462306a36Sopenharmony_ci	printf("\t-t %s\n", TEST_NAME);
187562306a36Sopenharmony_ci	printf("\n");
187662306a36Sopenharmony_ci	printf("Finally, the output file:\n");
187762306a36Sopenharmony_ci	printf("\t-o %s\n", UTF8_NAME);
187862306a36Sopenharmony_ci	printf("\n");
187962306a36Sopenharmony_ci}
188062306a36Sopenharmony_ci
188162306a36Sopenharmony_cistatic void usage(void)
188262306a36Sopenharmony_ci{
188362306a36Sopenharmony_ci	help();
188462306a36Sopenharmony_ci	exit(1);
188562306a36Sopenharmony_ci}
188662306a36Sopenharmony_ci
188762306a36Sopenharmony_cistatic void open_fail(const char *name, int error)
188862306a36Sopenharmony_ci{
188962306a36Sopenharmony_ci	printf("Error %d opening %s: %s\n", error, name, strerror(error));
189062306a36Sopenharmony_ci	exit(1);
189162306a36Sopenharmony_ci}
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_cistatic void file_fail(const char *filename)
189462306a36Sopenharmony_ci{
189562306a36Sopenharmony_ci	printf("Error parsing %s\n", filename);
189662306a36Sopenharmony_ci	exit(1);
189762306a36Sopenharmony_ci}
189862306a36Sopenharmony_ci
189962306a36Sopenharmony_cistatic void line_fail(const char *filename, const char *line)
190062306a36Sopenharmony_ci{
190162306a36Sopenharmony_ci	printf("Error parsing %s:%s\n", filename, line);
190262306a36Sopenharmony_ci	exit(1);
190362306a36Sopenharmony_ci}
190462306a36Sopenharmony_ci
190562306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
190662306a36Sopenharmony_ci
190762306a36Sopenharmony_cistatic void print_utf32(unsigned int *utf32str)
190862306a36Sopenharmony_ci{
190962306a36Sopenharmony_ci	int	i;
191062306a36Sopenharmony_ci
191162306a36Sopenharmony_ci	for (i = 0; utf32str[i]; i++)
191262306a36Sopenharmony_ci		printf(" %X", utf32str[i]);
191362306a36Sopenharmony_ci}
191462306a36Sopenharmony_ci
191562306a36Sopenharmony_cistatic void print_utf32nfdi(unsigned int unichar)
191662306a36Sopenharmony_ci{
191762306a36Sopenharmony_ci	printf(" %X ->", unichar);
191862306a36Sopenharmony_ci	print_utf32(unicode_data[unichar].utf32nfdi);
191962306a36Sopenharmony_ci	printf("\n");
192062306a36Sopenharmony_ci}
192162306a36Sopenharmony_ci
192262306a36Sopenharmony_cistatic void print_utf32nfdicf(unsigned int unichar)
192362306a36Sopenharmony_ci{
192462306a36Sopenharmony_ci	printf(" %X ->", unichar);
192562306a36Sopenharmony_ci	print_utf32(unicode_data[unichar].utf32nfdicf);
192662306a36Sopenharmony_ci	printf("\n");
192762306a36Sopenharmony_ci}
192862306a36Sopenharmony_ci
192962306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
193062306a36Sopenharmony_ci
193162306a36Sopenharmony_cistatic void age_init(void)
193262306a36Sopenharmony_ci{
193362306a36Sopenharmony_ci	FILE *file;
193462306a36Sopenharmony_ci	unsigned int first;
193562306a36Sopenharmony_ci	unsigned int last;
193662306a36Sopenharmony_ci	unsigned int unichar;
193762306a36Sopenharmony_ci	unsigned int major;
193862306a36Sopenharmony_ci	unsigned int minor;
193962306a36Sopenharmony_ci	unsigned int revision;
194062306a36Sopenharmony_ci	int gen;
194162306a36Sopenharmony_ci	int count;
194262306a36Sopenharmony_ci	int ret;
194362306a36Sopenharmony_ci
194462306a36Sopenharmony_ci	if (verbose > 0)
194562306a36Sopenharmony_ci		printf("Parsing %s\n", age_name);
194662306a36Sopenharmony_ci
194762306a36Sopenharmony_ci	file = fopen(age_name, "r");
194862306a36Sopenharmony_ci	if (!file)
194962306a36Sopenharmony_ci		open_fail(age_name, errno);
195062306a36Sopenharmony_ci	count = 0;
195162306a36Sopenharmony_ci
195262306a36Sopenharmony_ci	gen = 0;
195362306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
195462306a36Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d_%d",
195562306a36Sopenharmony_ci				&major, &minor, &revision);
195662306a36Sopenharmony_ci		if (ret == 3) {
195762306a36Sopenharmony_ci			ages_count++;
195862306a36Sopenharmony_ci			if (verbose > 1)
195962306a36Sopenharmony_ci				printf(" Age V%d_%d_%d\n",
196062306a36Sopenharmony_ci					major, minor, revision);
196162306a36Sopenharmony_ci			if (!age_valid(major, minor, revision))
196262306a36Sopenharmony_ci				line_fail(age_name, line);
196362306a36Sopenharmony_ci			continue;
196462306a36Sopenharmony_ci		}
196562306a36Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
196662306a36Sopenharmony_ci		if (ret == 2) {
196762306a36Sopenharmony_ci			ages_count++;
196862306a36Sopenharmony_ci			if (verbose > 1)
196962306a36Sopenharmony_ci				printf(" Age V%d_%d\n", major, minor);
197062306a36Sopenharmony_ci			if (!age_valid(major, minor, 0))
197162306a36Sopenharmony_ci				line_fail(age_name, line);
197262306a36Sopenharmony_ci			continue;
197362306a36Sopenharmony_ci		}
197462306a36Sopenharmony_ci	}
197562306a36Sopenharmony_ci
197662306a36Sopenharmony_ci	/* We must have found something above. */
197762306a36Sopenharmony_ci	if (verbose > 1)
197862306a36Sopenharmony_ci		printf("%d age entries\n", ages_count);
197962306a36Sopenharmony_ci	if (ages_count == 0 || ages_count > MAXGEN)
198062306a36Sopenharmony_ci		file_fail(age_name);
198162306a36Sopenharmony_ci
198262306a36Sopenharmony_ci	/* There is a 0 entry. */
198362306a36Sopenharmony_ci	ages_count++;
198462306a36Sopenharmony_ci	ages = calloc(ages_count + 1, sizeof(*ages));
198562306a36Sopenharmony_ci	/* And a guard entry. */
198662306a36Sopenharmony_ci	ages[ages_count] = (unsigned int)-1;
198762306a36Sopenharmony_ci
198862306a36Sopenharmony_ci	rewind(file);
198962306a36Sopenharmony_ci	count = 0;
199062306a36Sopenharmony_ci	gen = 0;
199162306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
199262306a36Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d_%d",
199362306a36Sopenharmony_ci				&major, &minor, &revision);
199462306a36Sopenharmony_ci		if (ret == 3) {
199562306a36Sopenharmony_ci			ages[++gen] =
199662306a36Sopenharmony_ci				UNICODE_AGE(major, minor, revision);
199762306a36Sopenharmony_ci			if (verbose > 1)
199862306a36Sopenharmony_ci				printf(" Age V%d_%d_%d = gen %d\n",
199962306a36Sopenharmony_ci					major, minor, revision, gen);
200062306a36Sopenharmony_ci			if (!age_valid(major, minor, revision))
200162306a36Sopenharmony_ci				line_fail(age_name, line);
200262306a36Sopenharmony_ci			continue;
200362306a36Sopenharmony_ci		}
200462306a36Sopenharmony_ci		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
200562306a36Sopenharmony_ci		if (ret == 2) {
200662306a36Sopenharmony_ci			ages[++gen] = UNICODE_AGE(major, minor, 0);
200762306a36Sopenharmony_ci			if (verbose > 1)
200862306a36Sopenharmony_ci				printf(" Age V%d_%d = %d\n",
200962306a36Sopenharmony_ci					major, minor, gen);
201062306a36Sopenharmony_ci			if (!age_valid(major, minor, 0))
201162306a36Sopenharmony_ci				line_fail(age_name, line);
201262306a36Sopenharmony_ci			continue;
201362306a36Sopenharmony_ci		}
201462306a36Sopenharmony_ci		ret = sscanf(line, "%X..%X ; %d.%d #",
201562306a36Sopenharmony_ci			     &first, &last, &major, &minor);
201662306a36Sopenharmony_ci		if (ret == 4) {
201762306a36Sopenharmony_ci			for (unichar = first; unichar <= last; unichar++)
201862306a36Sopenharmony_ci				unicode_data[unichar].gen = gen;
201962306a36Sopenharmony_ci			count += 1 + last - first;
202062306a36Sopenharmony_ci			if (verbose > 1)
202162306a36Sopenharmony_ci				printf("  %X..%X gen %d\n", first, last, gen);
202262306a36Sopenharmony_ci			if (!utf32valid(first) || !utf32valid(last))
202362306a36Sopenharmony_ci				line_fail(age_name, line);
202462306a36Sopenharmony_ci			continue;
202562306a36Sopenharmony_ci		}
202662306a36Sopenharmony_ci		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
202762306a36Sopenharmony_ci		if (ret == 3) {
202862306a36Sopenharmony_ci			unicode_data[unichar].gen = gen;
202962306a36Sopenharmony_ci			count++;
203062306a36Sopenharmony_ci			if (verbose > 1)
203162306a36Sopenharmony_ci				printf("  %X gen %d\n", unichar, gen);
203262306a36Sopenharmony_ci			if (!utf32valid(unichar))
203362306a36Sopenharmony_ci				line_fail(age_name, line);
203462306a36Sopenharmony_ci			continue;
203562306a36Sopenharmony_ci		}
203662306a36Sopenharmony_ci	}
203762306a36Sopenharmony_ci	unicode_maxage = ages[gen];
203862306a36Sopenharmony_ci	fclose(file);
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci	/* Nix surrogate block */
204162306a36Sopenharmony_ci	if (verbose > 1)
204262306a36Sopenharmony_ci		printf(" Removing surrogate block D800..DFFF\n");
204362306a36Sopenharmony_ci	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
204462306a36Sopenharmony_ci		unicode_data[unichar].gen = -1;
204562306a36Sopenharmony_ci
204662306a36Sopenharmony_ci	if (verbose > 0)
204762306a36Sopenharmony_ci	        printf("Found %d entries\n", count);
204862306a36Sopenharmony_ci	if (count == 0)
204962306a36Sopenharmony_ci		file_fail(age_name);
205062306a36Sopenharmony_ci}
205162306a36Sopenharmony_ci
205262306a36Sopenharmony_cistatic void ccc_init(void)
205362306a36Sopenharmony_ci{
205462306a36Sopenharmony_ci	FILE *file;
205562306a36Sopenharmony_ci	unsigned int first;
205662306a36Sopenharmony_ci	unsigned int last;
205762306a36Sopenharmony_ci	unsigned int unichar;
205862306a36Sopenharmony_ci	unsigned int value;
205962306a36Sopenharmony_ci	int count;
206062306a36Sopenharmony_ci	int ret;
206162306a36Sopenharmony_ci
206262306a36Sopenharmony_ci	if (verbose > 0)
206362306a36Sopenharmony_ci		printf("Parsing %s\n", ccc_name);
206462306a36Sopenharmony_ci
206562306a36Sopenharmony_ci	file = fopen(ccc_name, "r");
206662306a36Sopenharmony_ci	if (!file)
206762306a36Sopenharmony_ci		open_fail(ccc_name, errno);
206862306a36Sopenharmony_ci
206962306a36Sopenharmony_ci	count = 0;
207062306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
207162306a36Sopenharmony_ci		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
207262306a36Sopenharmony_ci		if (ret == 3) {
207362306a36Sopenharmony_ci			for (unichar = first; unichar <= last; unichar++) {
207462306a36Sopenharmony_ci				unicode_data[unichar].ccc = value;
207562306a36Sopenharmony_ci                                count++;
207662306a36Sopenharmony_ci			}
207762306a36Sopenharmony_ci			if (verbose > 1)
207862306a36Sopenharmony_ci				printf(" %X..%X ccc %d\n", first, last, value);
207962306a36Sopenharmony_ci			if (!utf32valid(first) || !utf32valid(last))
208062306a36Sopenharmony_ci				line_fail(ccc_name, line);
208162306a36Sopenharmony_ci			continue;
208262306a36Sopenharmony_ci		}
208362306a36Sopenharmony_ci		ret = sscanf(line, "%X ; %d #", &unichar, &value);
208462306a36Sopenharmony_ci		if (ret == 2) {
208562306a36Sopenharmony_ci			unicode_data[unichar].ccc = value;
208662306a36Sopenharmony_ci                        count++;
208762306a36Sopenharmony_ci			if (verbose > 1)
208862306a36Sopenharmony_ci				printf(" %X ccc %d\n", unichar, value);
208962306a36Sopenharmony_ci			if (!utf32valid(unichar))
209062306a36Sopenharmony_ci				line_fail(ccc_name, line);
209162306a36Sopenharmony_ci			continue;
209262306a36Sopenharmony_ci		}
209362306a36Sopenharmony_ci	}
209462306a36Sopenharmony_ci	fclose(file);
209562306a36Sopenharmony_ci
209662306a36Sopenharmony_ci	if (verbose > 0)
209762306a36Sopenharmony_ci		printf("Found %d entries\n", count);
209862306a36Sopenharmony_ci	if (count == 0)
209962306a36Sopenharmony_ci		file_fail(ccc_name);
210062306a36Sopenharmony_ci}
210162306a36Sopenharmony_ci
210262306a36Sopenharmony_cistatic int ignore_compatibility_form(char *type)
210362306a36Sopenharmony_ci{
210462306a36Sopenharmony_ci	int i;
210562306a36Sopenharmony_ci	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
210662306a36Sopenharmony_ci				 "final", "isolated", "circle", "super",
210762306a36Sopenharmony_ci				 "sub", "vertical", "wide", "narrow",
210862306a36Sopenharmony_ci				 "small", "square", "fraction", "compat"};
210962306a36Sopenharmony_ci
211062306a36Sopenharmony_ci	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
211162306a36Sopenharmony_ci		if (strcmp(type, ignored_types[i]) == 0)
211262306a36Sopenharmony_ci			return 1;
211362306a36Sopenharmony_ci	return 0;
211462306a36Sopenharmony_ci}
211562306a36Sopenharmony_ci
211662306a36Sopenharmony_cistatic void nfdi_init(void)
211762306a36Sopenharmony_ci{
211862306a36Sopenharmony_ci	FILE *file;
211962306a36Sopenharmony_ci	unsigned int unichar;
212062306a36Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
212162306a36Sopenharmony_ci	char *s;
212262306a36Sopenharmony_ci	char *type;
212362306a36Sopenharmony_ci	unsigned int *um;
212462306a36Sopenharmony_ci	int count;
212562306a36Sopenharmony_ci	int i;
212662306a36Sopenharmony_ci	int ret;
212762306a36Sopenharmony_ci
212862306a36Sopenharmony_ci	if (verbose > 0)
212962306a36Sopenharmony_ci		printf("Parsing %s\n", data_name);
213062306a36Sopenharmony_ci	file = fopen(data_name, "r");
213162306a36Sopenharmony_ci	if (!file)
213262306a36Sopenharmony_ci		open_fail(data_name, errno);
213362306a36Sopenharmony_ci
213462306a36Sopenharmony_ci	count = 0;
213562306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
213662306a36Sopenharmony_ci		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
213762306a36Sopenharmony_ci			     &unichar, buf0);
213862306a36Sopenharmony_ci		if (ret != 2)
213962306a36Sopenharmony_ci			continue;
214062306a36Sopenharmony_ci		if (!utf32valid(unichar))
214162306a36Sopenharmony_ci			line_fail(data_name, line);
214262306a36Sopenharmony_ci
214362306a36Sopenharmony_ci		s = buf0;
214462306a36Sopenharmony_ci		/* skip over <tag> */
214562306a36Sopenharmony_ci		if (*s == '<') {
214662306a36Sopenharmony_ci			type = ++s;
214762306a36Sopenharmony_ci			while (*++s != '>');
214862306a36Sopenharmony_ci			*s++ = '\0';
214962306a36Sopenharmony_ci			if(ignore_compatibility_form(type))
215062306a36Sopenharmony_ci				continue;
215162306a36Sopenharmony_ci		}
215262306a36Sopenharmony_ci		/* decode the decomposition into UTF-32 */
215362306a36Sopenharmony_ci		i = 0;
215462306a36Sopenharmony_ci		while (*s) {
215562306a36Sopenharmony_ci			mapping[i] = strtoul(s, &s, 16);
215662306a36Sopenharmony_ci			if (!utf32valid(mapping[i]))
215762306a36Sopenharmony_ci				line_fail(data_name, line);
215862306a36Sopenharmony_ci			i++;
215962306a36Sopenharmony_ci		}
216062306a36Sopenharmony_ci		mapping[i++] = 0;
216162306a36Sopenharmony_ci
216262306a36Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
216362306a36Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
216462306a36Sopenharmony_ci		unicode_data[unichar].utf32nfdi = um;
216562306a36Sopenharmony_ci
216662306a36Sopenharmony_ci		if (verbose > 1)
216762306a36Sopenharmony_ci			print_utf32nfdi(unichar);
216862306a36Sopenharmony_ci		count++;
216962306a36Sopenharmony_ci	}
217062306a36Sopenharmony_ci	fclose(file);
217162306a36Sopenharmony_ci	if (verbose > 0)
217262306a36Sopenharmony_ci		printf("Found %d entries\n", count);
217362306a36Sopenharmony_ci	if (count == 0)
217462306a36Sopenharmony_ci		file_fail(data_name);
217562306a36Sopenharmony_ci}
217662306a36Sopenharmony_ci
217762306a36Sopenharmony_cistatic void nfdicf_init(void)
217862306a36Sopenharmony_ci{
217962306a36Sopenharmony_ci	FILE *file;
218062306a36Sopenharmony_ci	unsigned int unichar;
218162306a36Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
218262306a36Sopenharmony_ci	char status;
218362306a36Sopenharmony_ci	char *s;
218462306a36Sopenharmony_ci	unsigned int *um;
218562306a36Sopenharmony_ci	int i;
218662306a36Sopenharmony_ci	int count;
218762306a36Sopenharmony_ci	int ret;
218862306a36Sopenharmony_ci
218962306a36Sopenharmony_ci	if (verbose > 0)
219062306a36Sopenharmony_ci		printf("Parsing %s\n", fold_name);
219162306a36Sopenharmony_ci	file = fopen(fold_name, "r");
219262306a36Sopenharmony_ci	if (!file)
219362306a36Sopenharmony_ci		open_fail(fold_name, errno);
219462306a36Sopenharmony_ci
219562306a36Sopenharmony_ci	count = 0;
219662306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
219762306a36Sopenharmony_ci		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
219862306a36Sopenharmony_ci		if (ret != 3)
219962306a36Sopenharmony_ci			continue;
220062306a36Sopenharmony_ci		if (!utf32valid(unichar))
220162306a36Sopenharmony_ci			line_fail(fold_name, line);
220262306a36Sopenharmony_ci		/* Use the C+F casefold. */
220362306a36Sopenharmony_ci		if (status != 'C' && status != 'F')
220462306a36Sopenharmony_ci			continue;
220562306a36Sopenharmony_ci		s = buf0;
220662306a36Sopenharmony_ci		if (*s == '<')
220762306a36Sopenharmony_ci			while (*s++ != ' ')
220862306a36Sopenharmony_ci				;
220962306a36Sopenharmony_ci		i = 0;
221062306a36Sopenharmony_ci		while (*s) {
221162306a36Sopenharmony_ci			mapping[i] = strtoul(s, &s, 16);
221262306a36Sopenharmony_ci			if (!utf32valid(mapping[i]))
221362306a36Sopenharmony_ci				line_fail(fold_name, line);
221462306a36Sopenharmony_ci			i++;
221562306a36Sopenharmony_ci		}
221662306a36Sopenharmony_ci		mapping[i++] = 0;
221762306a36Sopenharmony_ci
221862306a36Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
221962306a36Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
222062306a36Sopenharmony_ci		unicode_data[unichar].utf32nfdicf = um;
222162306a36Sopenharmony_ci
222262306a36Sopenharmony_ci		if (verbose > 1)
222362306a36Sopenharmony_ci			print_utf32nfdicf(unichar);
222462306a36Sopenharmony_ci		count++;
222562306a36Sopenharmony_ci	}
222662306a36Sopenharmony_ci	fclose(file);
222762306a36Sopenharmony_ci	if (verbose > 0)
222862306a36Sopenharmony_ci		printf("Found %d entries\n", count);
222962306a36Sopenharmony_ci	if (count == 0)
223062306a36Sopenharmony_ci		file_fail(fold_name);
223162306a36Sopenharmony_ci}
223262306a36Sopenharmony_ci
223362306a36Sopenharmony_cistatic void ignore_init(void)
223462306a36Sopenharmony_ci{
223562306a36Sopenharmony_ci	FILE *file;
223662306a36Sopenharmony_ci	unsigned int unichar;
223762306a36Sopenharmony_ci	unsigned int first;
223862306a36Sopenharmony_ci	unsigned int last;
223962306a36Sopenharmony_ci	unsigned int *um;
224062306a36Sopenharmony_ci	int count;
224162306a36Sopenharmony_ci	int ret;
224262306a36Sopenharmony_ci
224362306a36Sopenharmony_ci	if (verbose > 0)
224462306a36Sopenharmony_ci		printf("Parsing %s\n", prop_name);
224562306a36Sopenharmony_ci	file = fopen(prop_name, "r");
224662306a36Sopenharmony_ci	if (!file)
224762306a36Sopenharmony_ci		open_fail(prop_name, errno);
224862306a36Sopenharmony_ci	assert(file);
224962306a36Sopenharmony_ci	count = 0;
225062306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
225162306a36Sopenharmony_ci		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
225262306a36Sopenharmony_ci		if (ret == 3) {
225362306a36Sopenharmony_ci			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
225462306a36Sopenharmony_ci				continue;
225562306a36Sopenharmony_ci			if (!utf32valid(first) || !utf32valid(last))
225662306a36Sopenharmony_ci				line_fail(prop_name, line);
225762306a36Sopenharmony_ci			for (unichar = first; unichar <= last; unichar++) {
225862306a36Sopenharmony_ci				free(unicode_data[unichar].utf32nfdi);
225962306a36Sopenharmony_ci				um = malloc(sizeof(unsigned int));
226062306a36Sopenharmony_ci				*um = 0;
226162306a36Sopenharmony_ci				unicode_data[unichar].utf32nfdi = um;
226262306a36Sopenharmony_ci				free(unicode_data[unichar].utf32nfdicf);
226362306a36Sopenharmony_ci				um = malloc(sizeof(unsigned int));
226462306a36Sopenharmony_ci				*um = 0;
226562306a36Sopenharmony_ci				unicode_data[unichar].utf32nfdicf = um;
226662306a36Sopenharmony_ci				count++;
226762306a36Sopenharmony_ci			}
226862306a36Sopenharmony_ci			if (verbose > 1)
226962306a36Sopenharmony_ci				printf(" %X..%X Default_Ignorable_Code_Point\n",
227062306a36Sopenharmony_ci					first, last);
227162306a36Sopenharmony_ci			continue;
227262306a36Sopenharmony_ci		}
227362306a36Sopenharmony_ci		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
227462306a36Sopenharmony_ci		if (ret == 2) {
227562306a36Sopenharmony_ci			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
227662306a36Sopenharmony_ci				continue;
227762306a36Sopenharmony_ci			if (!utf32valid(unichar))
227862306a36Sopenharmony_ci				line_fail(prop_name, line);
227962306a36Sopenharmony_ci			free(unicode_data[unichar].utf32nfdi);
228062306a36Sopenharmony_ci			um = malloc(sizeof(unsigned int));
228162306a36Sopenharmony_ci			*um = 0;
228262306a36Sopenharmony_ci			unicode_data[unichar].utf32nfdi = um;
228362306a36Sopenharmony_ci			free(unicode_data[unichar].utf32nfdicf);
228462306a36Sopenharmony_ci			um = malloc(sizeof(unsigned int));
228562306a36Sopenharmony_ci			*um = 0;
228662306a36Sopenharmony_ci			unicode_data[unichar].utf32nfdicf = um;
228762306a36Sopenharmony_ci			if (verbose > 1)
228862306a36Sopenharmony_ci				printf(" %X Default_Ignorable_Code_Point\n",
228962306a36Sopenharmony_ci					unichar);
229062306a36Sopenharmony_ci			count++;
229162306a36Sopenharmony_ci			continue;
229262306a36Sopenharmony_ci		}
229362306a36Sopenharmony_ci	}
229462306a36Sopenharmony_ci	fclose(file);
229562306a36Sopenharmony_ci
229662306a36Sopenharmony_ci	if (verbose > 0)
229762306a36Sopenharmony_ci		printf("Found %d entries\n", count);
229862306a36Sopenharmony_ci	if (count == 0)
229962306a36Sopenharmony_ci		file_fail(prop_name);
230062306a36Sopenharmony_ci}
230162306a36Sopenharmony_ci
230262306a36Sopenharmony_cistatic void corrections_init(void)
230362306a36Sopenharmony_ci{
230462306a36Sopenharmony_ci	FILE *file;
230562306a36Sopenharmony_ci	unsigned int unichar;
230662306a36Sopenharmony_ci	unsigned int major;
230762306a36Sopenharmony_ci	unsigned int minor;
230862306a36Sopenharmony_ci	unsigned int revision;
230962306a36Sopenharmony_ci	unsigned int age;
231062306a36Sopenharmony_ci	unsigned int *um;
231162306a36Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
231262306a36Sopenharmony_ci	char *s;
231362306a36Sopenharmony_ci	int i;
231462306a36Sopenharmony_ci	int count;
231562306a36Sopenharmony_ci	int ret;
231662306a36Sopenharmony_ci
231762306a36Sopenharmony_ci	if (verbose > 0)
231862306a36Sopenharmony_ci		printf("Parsing %s\n", norm_name);
231962306a36Sopenharmony_ci	file = fopen(norm_name, "r");
232062306a36Sopenharmony_ci	if (!file)
232162306a36Sopenharmony_ci		open_fail(norm_name, errno);
232262306a36Sopenharmony_ci
232362306a36Sopenharmony_ci	count = 0;
232462306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
232562306a36Sopenharmony_ci		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
232662306a36Sopenharmony_ci				&unichar, buf0, buf1,
232762306a36Sopenharmony_ci				&major, &minor, &revision);
232862306a36Sopenharmony_ci		if (ret != 6)
232962306a36Sopenharmony_ci			continue;
233062306a36Sopenharmony_ci		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
233162306a36Sopenharmony_ci			line_fail(norm_name, line);
233262306a36Sopenharmony_ci		count++;
233362306a36Sopenharmony_ci	}
233462306a36Sopenharmony_ci	corrections = calloc(count, sizeof(struct unicode_data));
233562306a36Sopenharmony_ci	corrections_count = count;
233662306a36Sopenharmony_ci	rewind(file);
233762306a36Sopenharmony_ci
233862306a36Sopenharmony_ci	count = 0;
233962306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
234062306a36Sopenharmony_ci		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
234162306a36Sopenharmony_ci				&unichar, buf0, buf1,
234262306a36Sopenharmony_ci				&major, &minor, &revision);
234362306a36Sopenharmony_ci		if (ret != 6)
234462306a36Sopenharmony_ci			continue;
234562306a36Sopenharmony_ci		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
234662306a36Sopenharmony_ci			line_fail(norm_name, line);
234762306a36Sopenharmony_ci		corrections[count] = unicode_data[unichar];
234862306a36Sopenharmony_ci		assert(corrections[count].code == unichar);
234962306a36Sopenharmony_ci		age = UNICODE_AGE(major, minor, revision);
235062306a36Sopenharmony_ci		corrections[count].correction = age;
235162306a36Sopenharmony_ci
235262306a36Sopenharmony_ci		i = 0;
235362306a36Sopenharmony_ci		s = buf0;
235462306a36Sopenharmony_ci		while (*s) {
235562306a36Sopenharmony_ci			mapping[i] = strtoul(s, &s, 16);
235662306a36Sopenharmony_ci			if (!utf32valid(mapping[i]))
235762306a36Sopenharmony_ci				line_fail(norm_name, line);
235862306a36Sopenharmony_ci			i++;
235962306a36Sopenharmony_ci		}
236062306a36Sopenharmony_ci		mapping[i++] = 0;
236162306a36Sopenharmony_ci
236262306a36Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
236362306a36Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
236462306a36Sopenharmony_ci		corrections[count].utf32nfdi = um;
236562306a36Sopenharmony_ci
236662306a36Sopenharmony_ci		if (verbose > 1)
236762306a36Sopenharmony_ci			printf(" %X -> %s -> %s V%d_%d_%d\n",
236862306a36Sopenharmony_ci				unichar, buf0, buf1, major, minor, revision);
236962306a36Sopenharmony_ci		count++;
237062306a36Sopenharmony_ci	}
237162306a36Sopenharmony_ci	fclose(file);
237262306a36Sopenharmony_ci
237362306a36Sopenharmony_ci	if (verbose > 0)
237462306a36Sopenharmony_ci	        printf("Found %d entries\n", count);
237562306a36Sopenharmony_ci	if (count == 0)
237662306a36Sopenharmony_ci		file_fail(norm_name);
237762306a36Sopenharmony_ci}
237862306a36Sopenharmony_ci
237962306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
238062306a36Sopenharmony_ci
238162306a36Sopenharmony_ci/*
238262306a36Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
238362306a36Sopenharmony_ci *
238462306a36Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
238562306a36Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
238662306a36Sopenharmony_ci *
238762306a36Sopenharmony_ci * SBase = 0xAC00
238862306a36Sopenharmony_ci * LBase = 0x1100
238962306a36Sopenharmony_ci * VBase = 0x1161
239062306a36Sopenharmony_ci * TBase = 0x11A7
239162306a36Sopenharmony_ci * LCount = 19
239262306a36Sopenharmony_ci * VCount = 21
239362306a36Sopenharmony_ci * TCount = 28
239462306a36Sopenharmony_ci * NCount = 588 (VCount * TCount)
239562306a36Sopenharmony_ci * SCount = 11172 (LCount * NCount)
239662306a36Sopenharmony_ci *
239762306a36Sopenharmony_ci * Decomposition:
239862306a36Sopenharmony_ci *   SIndex = s - SBase
239962306a36Sopenharmony_ci *
240062306a36Sopenharmony_ci * LV (Canonical/Full)
240162306a36Sopenharmony_ci *   LIndex = SIndex / NCount
240262306a36Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
240362306a36Sopenharmony_ci *   LPart = LBase + LIndex
240462306a36Sopenharmony_ci *   VPart = VBase + VIndex
240562306a36Sopenharmony_ci *
240662306a36Sopenharmony_ci * LVT (Canonical)
240762306a36Sopenharmony_ci *   LVIndex = (SIndex / TCount) * TCount
240862306a36Sopenharmony_ci *   TIndex = (Sindex % TCount)
240962306a36Sopenharmony_ci *   LVPart = SBase + LVIndex
241062306a36Sopenharmony_ci *   TPart = TBase + TIndex
241162306a36Sopenharmony_ci *
241262306a36Sopenharmony_ci * LVT (Full)
241362306a36Sopenharmony_ci *   LIndex = SIndex / NCount
241462306a36Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
241562306a36Sopenharmony_ci *   TIndex = (Sindex % TCount)
241662306a36Sopenharmony_ci *   LPart = LBase + LIndex
241762306a36Sopenharmony_ci *   VPart = VBase + VIndex
241862306a36Sopenharmony_ci *   if (TIndex == 0) {
241962306a36Sopenharmony_ci *          d = <LPart, VPart>
242062306a36Sopenharmony_ci *   } else {
242162306a36Sopenharmony_ci *          TPart = TBase + TIndex
242262306a36Sopenharmony_ci *          d = <LPart, VPart, TPart>
242362306a36Sopenharmony_ci *   }
242462306a36Sopenharmony_ci *
242562306a36Sopenharmony_ci */
242662306a36Sopenharmony_ci
242762306a36Sopenharmony_cistatic void hangul_decompose(void)
242862306a36Sopenharmony_ci{
242962306a36Sopenharmony_ci	unsigned int sb = 0xAC00;
243062306a36Sopenharmony_ci	unsigned int lb = 0x1100;
243162306a36Sopenharmony_ci	unsigned int vb = 0x1161;
243262306a36Sopenharmony_ci	unsigned int tb = 0x11a7;
243362306a36Sopenharmony_ci	/* unsigned int lc = 19; */
243462306a36Sopenharmony_ci	unsigned int vc = 21;
243562306a36Sopenharmony_ci	unsigned int tc = 28;
243662306a36Sopenharmony_ci	unsigned int nc = (vc * tc);
243762306a36Sopenharmony_ci	/* unsigned int sc = (lc * nc); */
243862306a36Sopenharmony_ci	unsigned int unichar;
243962306a36Sopenharmony_ci	unsigned int mapping[4];
244062306a36Sopenharmony_ci	unsigned int *um;
244162306a36Sopenharmony_ci        int count;
244262306a36Sopenharmony_ci	int i;
244362306a36Sopenharmony_ci
244462306a36Sopenharmony_ci	if (verbose > 0)
244562306a36Sopenharmony_ci		printf("Decomposing hangul\n");
244662306a36Sopenharmony_ci	/* Hangul */
244762306a36Sopenharmony_ci	count = 0;
244862306a36Sopenharmony_ci	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
244962306a36Sopenharmony_ci		unsigned int si = unichar - sb;
245062306a36Sopenharmony_ci		unsigned int li = si / nc;
245162306a36Sopenharmony_ci		unsigned int vi = (si % nc) / tc;
245262306a36Sopenharmony_ci		unsigned int ti = si % tc;
245362306a36Sopenharmony_ci
245462306a36Sopenharmony_ci		i = 0;
245562306a36Sopenharmony_ci		mapping[i++] = lb + li;
245662306a36Sopenharmony_ci		mapping[i++] = vb + vi;
245762306a36Sopenharmony_ci		if (ti)
245862306a36Sopenharmony_ci			mapping[i++] = tb + ti;
245962306a36Sopenharmony_ci		mapping[i++] = 0;
246062306a36Sopenharmony_ci
246162306a36Sopenharmony_ci		assert(!unicode_data[unichar].utf32nfdi);
246262306a36Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
246362306a36Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
246462306a36Sopenharmony_ci		unicode_data[unichar].utf32nfdi = um;
246562306a36Sopenharmony_ci
246662306a36Sopenharmony_ci		assert(!unicode_data[unichar].utf32nfdicf);
246762306a36Sopenharmony_ci		um = malloc(i * sizeof(unsigned int));
246862306a36Sopenharmony_ci		memcpy(um, mapping, i * sizeof(unsigned int));
246962306a36Sopenharmony_ci		unicode_data[unichar].utf32nfdicf = um;
247062306a36Sopenharmony_ci
247162306a36Sopenharmony_ci		/*
247262306a36Sopenharmony_ci		 * Add a cookie as a reminder that the hangul syllable
247362306a36Sopenharmony_ci		 * decompositions must not be stored in the generated
247462306a36Sopenharmony_ci		 * trie.
247562306a36Sopenharmony_ci		 */
247662306a36Sopenharmony_ci		unicode_data[unichar].utf8nfdi = malloc(2);
247762306a36Sopenharmony_ci		unicode_data[unichar].utf8nfdi[0] = HANGUL;
247862306a36Sopenharmony_ci		unicode_data[unichar].utf8nfdi[1] = '\0';
247962306a36Sopenharmony_ci
248062306a36Sopenharmony_ci		if (verbose > 1)
248162306a36Sopenharmony_ci			print_utf32nfdi(unichar);
248262306a36Sopenharmony_ci
248362306a36Sopenharmony_ci		count++;
248462306a36Sopenharmony_ci	}
248562306a36Sopenharmony_ci	if (verbose > 0)
248662306a36Sopenharmony_ci		printf("Created %d entries\n", count);
248762306a36Sopenharmony_ci}
248862306a36Sopenharmony_ci
248962306a36Sopenharmony_cistatic void nfdi_decompose(void)
249062306a36Sopenharmony_ci{
249162306a36Sopenharmony_ci	unsigned int unichar;
249262306a36Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
249362306a36Sopenharmony_ci	unsigned int *um;
249462306a36Sopenharmony_ci	unsigned int *dc;
249562306a36Sopenharmony_ci	int count;
249662306a36Sopenharmony_ci	int i;
249762306a36Sopenharmony_ci	int j;
249862306a36Sopenharmony_ci	int ret;
249962306a36Sopenharmony_ci
250062306a36Sopenharmony_ci	if (verbose > 0)
250162306a36Sopenharmony_ci		printf("Decomposing nfdi\n");
250262306a36Sopenharmony_ci
250362306a36Sopenharmony_ci	count = 0;
250462306a36Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++) {
250562306a36Sopenharmony_ci		if (!unicode_data[unichar].utf32nfdi)
250662306a36Sopenharmony_ci			continue;
250762306a36Sopenharmony_ci		for (;;) {
250862306a36Sopenharmony_ci			ret = 1;
250962306a36Sopenharmony_ci			i = 0;
251062306a36Sopenharmony_ci			um = unicode_data[unichar].utf32nfdi;
251162306a36Sopenharmony_ci			while (*um) {
251262306a36Sopenharmony_ci				dc = unicode_data[*um].utf32nfdi;
251362306a36Sopenharmony_ci				if (dc) {
251462306a36Sopenharmony_ci					for (j = 0; dc[j]; j++)
251562306a36Sopenharmony_ci						mapping[i++] = dc[j];
251662306a36Sopenharmony_ci					ret = 0;
251762306a36Sopenharmony_ci				} else {
251862306a36Sopenharmony_ci					mapping[i++] = *um;
251962306a36Sopenharmony_ci				}
252062306a36Sopenharmony_ci				um++;
252162306a36Sopenharmony_ci			}
252262306a36Sopenharmony_ci			mapping[i++] = 0;
252362306a36Sopenharmony_ci			if (ret)
252462306a36Sopenharmony_ci				break;
252562306a36Sopenharmony_ci			free(unicode_data[unichar].utf32nfdi);
252662306a36Sopenharmony_ci			um = malloc(i * sizeof(unsigned int));
252762306a36Sopenharmony_ci			memcpy(um, mapping, i * sizeof(unsigned int));
252862306a36Sopenharmony_ci			unicode_data[unichar].utf32nfdi = um;
252962306a36Sopenharmony_ci		}
253062306a36Sopenharmony_ci		/* Add this decomposition to nfdicf if there is no entry. */
253162306a36Sopenharmony_ci		if (!unicode_data[unichar].utf32nfdicf) {
253262306a36Sopenharmony_ci			um = malloc(i * sizeof(unsigned int));
253362306a36Sopenharmony_ci			memcpy(um, mapping, i * sizeof(unsigned int));
253462306a36Sopenharmony_ci			unicode_data[unichar].utf32nfdicf = um;
253562306a36Sopenharmony_ci		}
253662306a36Sopenharmony_ci		if (verbose > 1)
253762306a36Sopenharmony_ci			print_utf32nfdi(unichar);
253862306a36Sopenharmony_ci		count++;
253962306a36Sopenharmony_ci	}
254062306a36Sopenharmony_ci	if (verbose > 0)
254162306a36Sopenharmony_ci		printf("Processed %d entries\n", count);
254262306a36Sopenharmony_ci}
254362306a36Sopenharmony_ci
254462306a36Sopenharmony_cistatic void nfdicf_decompose(void)
254562306a36Sopenharmony_ci{
254662306a36Sopenharmony_ci	unsigned int unichar;
254762306a36Sopenharmony_ci	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
254862306a36Sopenharmony_ci	unsigned int *um;
254962306a36Sopenharmony_ci	unsigned int *dc;
255062306a36Sopenharmony_ci	int count;
255162306a36Sopenharmony_ci	int i;
255262306a36Sopenharmony_ci	int j;
255362306a36Sopenharmony_ci	int ret;
255462306a36Sopenharmony_ci
255562306a36Sopenharmony_ci	if (verbose > 0)
255662306a36Sopenharmony_ci		printf("Decomposing nfdicf\n");
255762306a36Sopenharmony_ci	count = 0;
255862306a36Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++) {
255962306a36Sopenharmony_ci		if (!unicode_data[unichar].utf32nfdicf)
256062306a36Sopenharmony_ci			continue;
256162306a36Sopenharmony_ci		for (;;) {
256262306a36Sopenharmony_ci			ret = 1;
256362306a36Sopenharmony_ci			i = 0;
256462306a36Sopenharmony_ci			um = unicode_data[unichar].utf32nfdicf;
256562306a36Sopenharmony_ci			while (*um) {
256662306a36Sopenharmony_ci				dc = unicode_data[*um].utf32nfdicf;
256762306a36Sopenharmony_ci				if (dc) {
256862306a36Sopenharmony_ci					for (j = 0; dc[j]; j++)
256962306a36Sopenharmony_ci						mapping[i++] = dc[j];
257062306a36Sopenharmony_ci					ret = 0;
257162306a36Sopenharmony_ci				} else {
257262306a36Sopenharmony_ci					mapping[i++] = *um;
257362306a36Sopenharmony_ci				}
257462306a36Sopenharmony_ci				um++;
257562306a36Sopenharmony_ci			}
257662306a36Sopenharmony_ci			mapping[i++] = 0;
257762306a36Sopenharmony_ci			if (ret)
257862306a36Sopenharmony_ci				break;
257962306a36Sopenharmony_ci			free(unicode_data[unichar].utf32nfdicf);
258062306a36Sopenharmony_ci			um = malloc(i * sizeof(unsigned int));
258162306a36Sopenharmony_ci			memcpy(um, mapping, i * sizeof(unsigned int));
258262306a36Sopenharmony_ci			unicode_data[unichar].utf32nfdicf = um;
258362306a36Sopenharmony_ci		}
258462306a36Sopenharmony_ci		if (verbose > 1)
258562306a36Sopenharmony_ci			print_utf32nfdicf(unichar);
258662306a36Sopenharmony_ci		count++;
258762306a36Sopenharmony_ci	}
258862306a36Sopenharmony_ci	if (verbose > 0)
258962306a36Sopenharmony_ci		printf("Processed %d entries\n", count);
259062306a36Sopenharmony_ci}
259162306a36Sopenharmony_ci
259262306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
259362306a36Sopenharmony_ci
259462306a36Sopenharmony_ciint utf8agemax(struct tree *, const char *);
259562306a36Sopenharmony_ciint utf8nagemax(struct tree *, const char *, size_t);
259662306a36Sopenharmony_ciint utf8agemin(struct tree *, const char *);
259762306a36Sopenharmony_ciint utf8nagemin(struct tree *, const char *, size_t);
259862306a36Sopenharmony_cissize_t utf8len(struct tree *, const char *);
259962306a36Sopenharmony_cissize_t utf8nlen(struct tree *, const char *, size_t);
260062306a36Sopenharmony_cistruct utf8cursor;
260162306a36Sopenharmony_ciint utf8cursor(struct utf8cursor *, struct tree *, const char *);
260262306a36Sopenharmony_ciint utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
260362306a36Sopenharmony_ciint utf8byte(struct utf8cursor *);
260462306a36Sopenharmony_ci
260562306a36Sopenharmony_ci/*
260662306a36Sopenharmony_ci * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
260762306a36Sopenharmony_ci *
260862306a36Sopenharmony_ci * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
260962306a36Sopenharmony_ci * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
261062306a36Sopenharmony_ci *
261162306a36Sopenharmony_ci * SBase = 0xAC00
261262306a36Sopenharmony_ci * LBase = 0x1100
261362306a36Sopenharmony_ci * VBase = 0x1161
261462306a36Sopenharmony_ci * TBase = 0x11A7
261562306a36Sopenharmony_ci * LCount = 19
261662306a36Sopenharmony_ci * VCount = 21
261762306a36Sopenharmony_ci * TCount = 28
261862306a36Sopenharmony_ci * NCount = 588 (VCount * TCount)
261962306a36Sopenharmony_ci * SCount = 11172 (LCount * NCount)
262062306a36Sopenharmony_ci *
262162306a36Sopenharmony_ci * Decomposition:
262262306a36Sopenharmony_ci *   SIndex = s - SBase
262362306a36Sopenharmony_ci *
262462306a36Sopenharmony_ci * LV (Canonical/Full)
262562306a36Sopenharmony_ci *   LIndex = SIndex / NCount
262662306a36Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
262762306a36Sopenharmony_ci *   LPart = LBase + LIndex
262862306a36Sopenharmony_ci *   VPart = VBase + VIndex
262962306a36Sopenharmony_ci *
263062306a36Sopenharmony_ci * LVT (Canonical)
263162306a36Sopenharmony_ci *   LVIndex = (SIndex / TCount) * TCount
263262306a36Sopenharmony_ci *   TIndex = (Sindex % TCount)
263362306a36Sopenharmony_ci *   LVPart = SBase + LVIndex
263462306a36Sopenharmony_ci *   TPart = TBase + TIndex
263562306a36Sopenharmony_ci *
263662306a36Sopenharmony_ci * LVT (Full)
263762306a36Sopenharmony_ci *   LIndex = SIndex / NCount
263862306a36Sopenharmony_ci *   VIndex = (Sindex % NCount) / TCount
263962306a36Sopenharmony_ci *   TIndex = (Sindex % TCount)
264062306a36Sopenharmony_ci *   LPart = LBase + LIndex
264162306a36Sopenharmony_ci *   VPart = VBase + VIndex
264262306a36Sopenharmony_ci *   if (TIndex == 0) {
264362306a36Sopenharmony_ci *          d = <LPart, VPart>
264462306a36Sopenharmony_ci *   } else {
264562306a36Sopenharmony_ci *          TPart = TBase + TIndex
264662306a36Sopenharmony_ci *          d = <LPart, VPart, TPart>
264762306a36Sopenharmony_ci *   }
264862306a36Sopenharmony_ci */
264962306a36Sopenharmony_ci
265062306a36Sopenharmony_ci/* Constants */
265162306a36Sopenharmony_ci#define SB	(0xAC00)
265262306a36Sopenharmony_ci#define LB	(0x1100)
265362306a36Sopenharmony_ci#define VB	(0x1161)
265462306a36Sopenharmony_ci#define TB	(0x11A7)
265562306a36Sopenharmony_ci#define LC	(19)
265662306a36Sopenharmony_ci#define VC	(21)
265762306a36Sopenharmony_ci#define TC	(28)
265862306a36Sopenharmony_ci#define NC	(VC * TC)
265962306a36Sopenharmony_ci#define SC	(LC * NC)
266062306a36Sopenharmony_ci
266162306a36Sopenharmony_ci/* Algorithmic decomposition of hangul syllable. */
266262306a36Sopenharmony_cistatic utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
266362306a36Sopenharmony_ci{
266462306a36Sopenharmony_ci	unsigned int	si;
266562306a36Sopenharmony_ci	unsigned int	li;
266662306a36Sopenharmony_ci	unsigned int	vi;
266762306a36Sopenharmony_ci	unsigned int	ti;
266862306a36Sopenharmony_ci	unsigned char	*h;
266962306a36Sopenharmony_ci
267062306a36Sopenharmony_ci	/* Calculate the SI, LI, VI, and TI values. */
267162306a36Sopenharmony_ci	si = utf8decode(str) - SB;
267262306a36Sopenharmony_ci	li = si / NC;
267362306a36Sopenharmony_ci	vi = (si % NC) / TC;
267462306a36Sopenharmony_ci	ti = si % TC;
267562306a36Sopenharmony_ci
267662306a36Sopenharmony_ci	/* Fill in base of leaf. */
267762306a36Sopenharmony_ci	h = hangul;
267862306a36Sopenharmony_ci	LEAF_GEN(h) = 2;
267962306a36Sopenharmony_ci	LEAF_CCC(h) = DECOMPOSE;
268062306a36Sopenharmony_ci	h += 2;
268162306a36Sopenharmony_ci
268262306a36Sopenharmony_ci	/* Add LPart, a 3-byte UTF-8 sequence. */
268362306a36Sopenharmony_ci	h += utf8encode((char *)h, li + LB);
268462306a36Sopenharmony_ci
268562306a36Sopenharmony_ci	/* Add VPart, a 3-byte UTF-8 sequence. */
268662306a36Sopenharmony_ci	h += utf8encode((char *)h, vi + VB);
268762306a36Sopenharmony_ci
268862306a36Sopenharmony_ci	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
268962306a36Sopenharmony_ci	if (ti)
269062306a36Sopenharmony_ci		h += utf8encode((char *)h, ti + TB);
269162306a36Sopenharmony_ci
269262306a36Sopenharmony_ci	/* Terminate string. */
269362306a36Sopenharmony_ci	h[0] = '\0';
269462306a36Sopenharmony_ci
269562306a36Sopenharmony_ci	return hangul;
269662306a36Sopenharmony_ci}
269762306a36Sopenharmony_ci
269862306a36Sopenharmony_ci/*
269962306a36Sopenharmony_ci * Use trie to scan s, touching at most len bytes.
270062306a36Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise.
270162306a36Sopenharmony_ci *
270262306a36Sopenharmony_ci * A non-NULL return guarantees that the UTF-8 sequence starting at s
270362306a36Sopenharmony_ci * is well-formed and corresponds to a known unicode code point.  The
270462306a36Sopenharmony_ci * shorthand for this will be "is valid UTF-8 unicode".
270562306a36Sopenharmony_ci */
270662306a36Sopenharmony_cistatic utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
270762306a36Sopenharmony_ci			       const char *s, size_t len)
270862306a36Sopenharmony_ci{
270962306a36Sopenharmony_ci	utf8trie_t	*trie;
271062306a36Sopenharmony_ci	int		offlen;
271162306a36Sopenharmony_ci	int		offset;
271262306a36Sopenharmony_ci	int		mask;
271362306a36Sopenharmony_ci	int		node;
271462306a36Sopenharmony_ci
271562306a36Sopenharmony_ci	if (!tree)
271662306a36Sopenharmony_ci		return NULL;
271762306a36Sopenharmony_ci	if (len == 0)
271862306a36Sopenharmony_ci		return NULL;
271962306a36Sopenharmony_ci	node = 1;
272062306a36Sopenharmony_ci	trie = utf8data + tree->index;
272162306a36Sopenharmony_ci	while (node) {
272262306a36Sopenharmony_ci		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
272362306a36Sopenharmony_ci		if (*trie & NEXTBYTE) {
272462306a36Sopenharmony_ci			if (--len == 0)
272562306a36Sopenharmony_ci				return NULL;
272662306a36Sopenharmony_ci			s++;
272762306a36Sopenharmony_ci		}
272862306a36Sopenharmony_ci		mask = 1 << (*trie & BITNUM);
272962306a36Sopenharmony_ci		if (*s & mask) {
273062306a36Sopenharmony_ci			/* Right leg */
273162306a36Sopenharmony_ci			if (offlen) {
273262306a36Sopenharmony_ci				/* Right node at offset of trie */
273362306a36Sopenharmony_ci				node = (*trie & RIGHTNODE);
273462306a36Sopenharmony_ci				offset = trie[offlen];
273562306a36Sopenharmony_ci				while (--offlen) {
273662306a36Sopenharmony_ci					offset <<= 8;
273762306a36Sopenharmony_ci					offset |= trie[offlen];
273862306a36Sopenharmony_ci				}
273962306a36Sopenharmony_ci				trie += offset;
274062306a36Sopenharmony_ci			} else if (*trie & RIGHTPATH) {
274162306a36Sopenharmony_ci				/* Right node after this node */
274262306a36Sopenharmony_ci				node = (*trie & TRIENODE);
274362306a36Sopenharmony_ci				trie++;
274462306a36Sopenharmony_ci			} else {
274562306a36Sopenharmony_ci				/* No right node. */
274662306a36Sopenharmony_ci				return NULL;
274762306a36Sopenharmony_ci			}
274862306a36Sopenharmony_ci		} else {
274962306a36Sopenharmony_ci			/* Left leg */
275062306a36Sopenharmony_ci			if (offlen) {
275162306a36Sopenharmony_ci				/* Left node after this node. */
275262306a36Sopenharmony_ci				node = (*trie & LEFTNODE);
275362306a36Sopenharmony_ci				trie += offlen + 1;
275462306a36Sopenharmony_ci			} else if (*trie & RIGHTPATH) {
275562306a36Sopenharmony_ci				/* No left node. */
275662306a36Sopenharmony_ci				return NULL;
275762306a36Sopenharmony_ci			} else {
275862306a36Sopenharmony_ci				/* Left node after this node */
275962306a36Sopenharmony_ci				node = (*trie & TRIENODE);
276062306a36Sopenharmony_ci				trie++;
276162306a36Sopenharmony_ci			}
276262306a36Sopenharmony_ci		}
276362306a36Sopenharmony_ci	}
276462306a36Sopenharmony_ci	/*
276562306a36Sopenharmony_ci	 * Hangul decomposition is done algorithmically. These are the
276662306a36Sopenharmony_ci	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
276762306a36Sopenharmony_ci	 * always 3 bytes long, so s has been advanced twice, and the
276862306a36Sopenharmony_ci	 * start of the sequence is at s-2.
276962306a36Sopenharmony_ci	 */
277062306a36Sopenharmony_ci	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
277162306a36Sopenharmony_ci		trie = utf8hangul(s - 2, hangul);
277262306a36Sopenharmony_ci	return trie;
277362306a36Sopenharmony_ci}
277462306a36Sopenharmony_ci
277562306a36Sopenharmony_ci/*
277662306a36Sopenharmony_ci * Use trie to scan s.
277762306a36Sopenharmony_ci * Returns the leaf if one exists, NULL otherwise.
277862306a36Sopenharmony_ci *
277962306a36Sopenharmony_ci * Forwards to trie_nlookup().
278062306a36Sopenharmony_ci */
278162306a36Sopenharmony_cistatic utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
278262306a36Sopenharmony_ci			      const char *s)
278362306a36Sopenharmony_ci{
278462306a36Sopenharmony_ci	return utf8nlookup(tree, hangul, s, (size_t)-1);
278562306a36Sopenharmony_ci}
278662306a36Sopenharmony_ci
278762306a36Sopenharmony_ci/*
278862306a36Sopenharmony_ci * Return the number of bytes used by the current UTF-8 sequence.
278962306a36Sopenharmony_ci * Assumes the input points to the first byte of a valid UTF-8
279062306a36Sopenharmony_ci * sequence.
279162306a36Sopenharmony_ci */
279262306a36Sopenharmony_cistatic inline int utf8clen(const char *s)
279362306a36Sopenharmony_ci{
279462306a36Sopenharmony_ci	unsigned char c = *s;
279562306a36Sopenharmony_ci	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
279662306a36Sopenharmony_ci}
279762306a36Sopenharmony_ci
279862306a36Sopenharmony_ci/*
279962306a36Sopenharmony_ci * Maximum age of any character in s.
280062306a36Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
280162306a36Sopenharmony_ci * Return 0 if only non-assigned code points are used.
280262306a36Sopenharmony_ci */
280362306a36Sopenharmony_ciint utf8agemax(struct tree *tree, const char *s)
280462306a36Sopenharmony_ci{
280562306a36Sopenharmony_ci	utf8leaf_t	*leaf;
280662306a36Sopenharmony_ci	int		age = 0;
280762306a36Sopenharmony_ci	int		leaf_age;
280862306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
280962306a36Sopenharmony_ci
281062306a36Sopenharmony_ci	if (!tree)
281162306a36Sopenharmony_ci		return -1;
281262306a36Sopenharmony_ci
281362306a36Sopenharmony_ci	while (*s) {
281462306a36Sopenharmony_ci		leaf = utf8lookup(tree, hangul, s);
281562306a36Sopenharmony_ci		if (!leaf)
281662306a36Sopenharmony_ci			return -1;
281762306a36Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
281862306a36Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age > age)
281962306a36Sopenharmony_ci			age = leaf_age;
282062306a36Sopenharmony_ci		s += utf8clen(s);
282162306a36Sopenharmony_ci	}
282262306a36Sopenharmony_ci	return age;
282362306a36Sopenharmony_ci}
282462306a36Sopenharmony_ci
282562306a36Sopenharmony_ci/*
282662306a36Sopenharmony_ci * Minimum age of any character in s.
282762306a36Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
282862306a36Sopenharmony_ci * Return 0 if non-assigned code points are used.
282962306a36Sopenharmony_ci */
283062306a36Sopenharmony_ciint utf8agemin(struct tree *tree, const char *s)
283162306a36Sopenharmony_ci{
283262306a36Sopenharmony_ci	utf8leaf_t	*leaf;
283362306a36Sopenharmony_ci	int		age;
283462306a36Sopenharmony_ci	int		leaf_age;
283562306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
283662306a36Sopenharmony_ci
283762306a36Sopenharmony_ci	if (!tree)
283862306a36Sopenharmony_ci		return -1;
283962306a36Sopenharmony_ci	age = tree->maxage;
284062306a36Sopenharmony_ci	while (*s) {
284162306a36Sopenharmony_ci		leaf = utf8lookup(tree, hangul, s);
284262306a36Sopenharmony_ci		if (!leaf)
284362306a36Sopenharmony_ci			return -1;
284462306a36Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
284562306a36Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age < age)
284662306a36Sopenharmony_ci			age = leaf_age;
284762306a36Sopenharmony_ci		s += utf8clen(s);
284862306a36Sopenharmony_ci	}
284962306a36Sopenharmony_ci	return age;
285062306a36Sopenharmony_ci}
285162306a36Sopenharmony_ci
285262306a36Sopenharmony_ci/*
285362306a36Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes.
285462306a36Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
285562306a36Sopenharmony_ci */
285662306a36Sopenharmony_ciint utf8nagemax(struct tree *tree, const char *s, size_t len)
285762306a36Sopenharmony_ci{
285862306a36Sopenharmony_ci	utf8leaf_t	*leaf;
285962306a36Sopenharmony_ci	int		age = 0;
286062306a36Sopenharmony_ci	int		leaf_age;
286162306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
286262306a36Sopenharmony_ci
286362306a36Sopenharmony_ci	if (!tree)
286462306a36Sopenharmony_ci		return -1;
286562306a36Sopenharmony_ci
286662306a36Sopenharmony_ci        while (len && *s) {
286762306a36Sopenharmony_ci		leaf = utf8nlookup(tree, hangul, s, len);
286862306a36Sopenharmony_ci		if (!leaf)
286962306a36Sopenharmony_ci			return -1;
287062306a36Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
287162306a36Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age > age)
287262306a36Sopenharmony_ci			age = leaf_age;
287362306a36Sopenharmony_ci		len -= utf8clen(s);
287462306a36Sopenharmony_ci		s += utf8clen(s);
287562306a36Sopenharmony_ci	}
287662306a36Sopenharmony_ci	return age;
287762306a36Sopenharmony_ci}
287862306a36Sopenharmony_ci
287962306a36Sopenharmony_ci/*
288062306a36Sopenharmony_ci * Maximum age of any character in s, touch at most len bytes.
288162306a36Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
288262306a36Sopenharmony_ci */
288362306a36Sopenharmony_ciint utf8nagemin(struct tree *tree, const char *s, size_t len)
288462306a36Sopenharmony_ci{
288562306a36Sopenharmony_ci	utf8leaf_t	*leaf;
288662306a36Sopenharmony_ci	int		leaf_age;
288762306a36Sopenharmony_ci	int		age;
288862306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
288962306a36Sopenharmony_ci
289062306a36Sopenharmony_ci	if (!tree)
289162306a36Sopenharmony_ci		return -1;
289262306a36Sopenharmony_ci	age = tree->maxage;
289362306a36Sopenharmony_ci        while (len && *s) {
289462306a36Sopenharmony_ci		leaf = utf8nlookup(tree, hangul, s, len);
289562306a36Sopenharmony_ci		if (!leaf)
289662306a36Sopenharmony_ci			return -1;
289762306a36Sopenharmony_ci		leaf_age = ages[LEAF_GEN(leaf)];
289862306a36Sopenharmony_ci		if (leaf_age <= tree->maxage && leaf_age < age)
289962306a36Sopenharmony_ci			age = leaf_age;
290062306a36Sopenharmony_ci		len -= utf8clen(s);
290162306a36Sopenharmony_ci		s += utf8clen(s);
290262306a36Sopenharmony_ci	}
290362306a36Sopenharmony_ci	return age;
290462306a36Sopenharmony_ci}
290562306a36Sopenharmony_ci
290662306a36Sopenharmony_ci/*
290762306a36Sopenharmony_ci * Length of the normalization of s.
290862306a36Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
290962306a36Sopenharmony_ci *
291062306a36Sopenharmony_ci * A string of Default_Ignorable_Code_Point has length 0.
291162306a36Sopenharmony_ci */
291262306a36Sopenharmony_cissize_t utf8len(struct tree *tree, const char *s)
291362306a36Sopenharmony_ci{
291462306a36Sopenharmony_ci	utf8leaf_t	*leaf;
291562306a36Sopenharmony_ci	size_t		ret = 0;
291662306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
291762306a36Sopenharmony_ci
291862306a36Sopenharmony_ci	if (!tree)
291962306a36Sopenharmony_ci		return -1;
292062306a36Sopenharmony_ci	while (*s) {
292162306a36Sopenharmony_ci		leaf = utf8lookup(tree, hangul, s);
292262306a36Sopenharmony_ci		if (!leaf)
292362306a36Sopenharmony_ci			return -1;
292462306a36Sopenharmony_ci		if (ages[LEAF_GEN(leaf)] > tree->maxage)
292562306a36Sopenharmony_ci			ret += utf8clen(s);
292662306a36Sopenharmony_ci		else if (LEAF_CCC(leaf) == DECOMPOSE)
292762306a36Sopenharmony_ci			ret += strlen(LEAF_STR(leaf));
292862306a36Sopenharmony_ci		else
292962306a36Sopenharmony_ci			ret += utf8clen(s);
293062306a36Sopenharmony_ci		s += utf8clen(s);
293162306a36Sopenharmony_ci	}
293262306a36Sopenharmony_ci	return ret;
293362306a36Sopenharmony_ci}
293462306a36Sopenharmony_ci
293562306a36Sopenharmony_ci/*
293662306a36Sopenharmony_ci * Length of the normalization of s, touch at most len bytes.
293762306a36Sopenharmony_ci * Return -1 if s is not valid UTF-8 unicode.
293862306a36Sopenharmony_ci */
293962306a36Sopenharmony_cissize_t utf8nlen(struct tree *tree, const char *s, size_t len)
294062306a36Sopenharmony_ci{
294162306a36Sopenharmony_ci	utf8leaf_t	*leaf;
294262306a36Sopenharmony_ci	size_t		ret = 0;
294362306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
294462306a36Sopenharmony_ci
294562306a36Sopenharmony_ci	if (!tree)
294662306a36Sopenharmony_ci		return -1;
294762306a36Sopenharmony_ci	while (len && *s) {
294862306a36Sopenharmony_ci		leaf = utf8nlookup(tree, hangul, s, len);
294962306a36Sopenharmony_ci		if (!leaf)
295062306a36Sopenharmony_ci			return -1;
295162306a36Sopenharmony_ci		if (ages[LEAF_GEN(leaf)] > tree->maxage)
295262306a36Sopenharmony_ci			ret += utf8clen(s);
295362306a36Sopenharmony_ci		else if (LEAF_CCC(leaf) == DECOMPOSE)
295462306a36Sopenharmony_ci			ret += strlen(LEAF_STR(leaf));
295562306a36Sopenharmony_ci		else
295662306a36Sopenharmony_ci			ret += utf8clen(s);
295762306a36Sopenharmony_ci		len -= utf8clen(s);
295862306a36Sopenharmony_ci		s += utf8clen(s);
295962306a36Sopenharmony_ci	}
296062306a36Sopenharmony_ci	return ret;
296162306a36Sopenharmony_ci}
296262306a36Sopenharmony_ci
296362306a36Sopenharmony_ci/*
296462306a36Sopenharmony_ci * Cursor structure used by the normalizer.
296562306a36Sopenharmony_ci */
296662306a36Sopenharmony_cistruct utf8cursor {
296762306a36Sopenharmony_ci	struct tree	*tree;
296862306a36Sopenharmony_ci	const char	*s;
296962306a36Sopenharmony_ci	const char	*p;
297062306a36Sopenharmony_ci	const char	*ss;
297162306a36Sopenharmony_ci	const char	*sp;
297262306a36Sopenharmony_ci	unsigned int	len;
297362306a36Sopenharmony_ci	unsigned int	slen;
297462306a36Sopenharmony_ci	short int	ccc;
297562306a36Sopenharmony_ci	short int	nccc;
297662306a36Sopenharmony_ci	unsigned int	unichar;
297762306a36Sopenharmony_ci	unsigned char	hangul[UTF8HANGULLEAF];
297862306a36Sopenharmony_ci};
297962306a36Sopenharmony_ci
298062306a36Sopenharmony_ci/*
298162306a36Sopenharmony_ci * Set up an utf8cursor for use by utf8byte().
298262306a36Sopenharmony_ci *
298362306a36Sopenharmony_ci *   s      : string.
298462306a36Sopenharmony_ci *   len    : length of s.
298562306a36Sopenharmony_ci *   u8c    : pointer to cursor.
298662306a36Sopenharmony_ci *   trie   : utf8trie_t to use for normalization.
298762306a36Sopenharmony_ci *
298862306a36Sopenharmony_ci * Returns -1 on error, 0 on success.
298962306a36Sopenharmony_ci */
299062306a36Sopenharmony_ciint utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
299162306a36Sopenharmony_ci		size_t len)
299262306a36Sopenharmony_ci{
299362306a36Sopenharmony_ci	if (!tree)
299462306a36Sopenharmony_ci		return -1;
299562306a36Sopenharmony_ci	if (!s)
299662306a36Sopenharmony_ci		return -1;
299762306a36Sopenharmony_ci	u8c->tree = tree;
299862306a36Sopenharmony_ci	u8c->s = s;
299962306a36Sopenharmony_ci	u8c->p = NULL;
300062306a36Sopenharmony_ci	u8c->ss = NULL;
300162306a36Sopenharmony_ci	u8c->sp = NULL;
300262306a36Sopenharmony_ci	u8c->len = len;
300362306a36Sopenharmony_ci	u8c->slen = 0;
300462306a36Sopenharmony_ci	u8c->ccc = STOPPER;
300562306a36Sopenharmony_ci	u8c->nccc = STOPPER;
300662306a36Sopenharmony_ci	u8c->unichar = 0;
300762306a36Sopenharmony_ci	/* Check we didn't clobber the maximum length. */
300862306a36Sopenharmony_ci	if (u8c->len != len)
300962306a36Sopenharmony_ci		return -1;
301062306a36Sopenharmony_ci	/* The first byte of s may not be an utf8 continuation. */
301162306a36Sopenharmony_ci	if (len > 0 && (*s & 0xC0) == 0x80)
301262306a36Sopenharmony_ci		return -1;
301362306a36Sopenharmony_ci	return 0;
301462306a36Sopenharmony_ci}
301562306a36Sopenharmony_ci
301662306a36Sopenharmony_ci/*
301762306a36Sopenharmony_ci * Set up an utf8cursor for use by utf8byte().
301862306a36Sopenharmony_ci *
301962306a36Sopenharmony_ci *   s      : NUL-terminated string.
302062306a36Sopenharmony_ci *   u8c    : pointer to cursor.
302162306a36Sopenharmony_ci *   trie   : utf8trie_t to use for normalization.
302262306a36Sopenharmony_ci *
302362306a36Sopenharmony_ci * Returns -1 on error, 0 on success.
302462306a36Sopenharmony_ci */
302562306a36Sopenharmony_ciint utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
302662306a36Sopenharmony_ci{
302762306a36Sopenharmony_ci	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
302862306a36Sopenharmony_ci}
302962306a36Sopenharmony_ci
303062306a36Sopenharmony_ci/*
303162306a36Sopenharmony_ci * Get one byte from the normalized form of the string described by u8c.
303262306a36Sopenharmony_ci *
303362306a36Sopenharmony_ci * Returns the byte cast to an unsigned char on succes, and -1 on failure.
303462306a36Sopenharmony_ci *
303562306a36Sopenharmony_ci * The cursor keeps track of the location in the string in u8c->s.
303662306a36Sopenharmony_ci * When a character is decomposed, the current location is stored in
303762306a36Sopenharmony_ci * u8c->p, and u8c->s is set to the start of the decomposition. Note
303862306a36Sopenharmony_ci * that bytes from a decomposition do not count against u8c->len.
303962306a36Sopenharmony_ci *
304062306a36Sopenharmony_ci * Characters are emitted if they match the current CCC in u8c->ccc.
304162306a36Sopenharmony_ci * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
304262306a36Sopenharmony_ci * and the function returns 0 in that case.
304362306a36Sopenharmony_ci *
304462306a36Sopenharmony_ci * Sorting by CCC is done by repeatedly scanning the string.  The
304562306a36Sopenharmony_ci * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
304662306a36Sopenharmony_ci * the start of the scan.  The first pass finds the lowest CCC to be
304762306a36Sopenharmony_ci * emitted and stores it in u8c->nccc, the second pass emits the
304862306a36Sopenharmony_ci * characters with this CCC and finds the next lowest CCC. This limits
304962306a36Sopenharmony_ci * the number of passes to 1 + the number of different CCCs in the
305062306a36Sopenharmony_ci * sequence being scanned.
305162306a36Sopenharmony_ci *
305262306a36Sopenharmony_ci * Therefore:
305362306a36Sopenharmony_ci *  u8c->p  != NULL -> a decomposition is being scanned.
305462306a36Sopenharmony_ci *  u8c->ss != NULL -> this is a repeating scan.
305562306a36Sopenharmony_ci *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
305662306a36Sopenharmony_ci */
305762306a36Sopenharmony_ciint utf8byte(struct utf8cursor *u8c)
305862306a36Sopenharmony_ci{
305962306a36Sopenharmony_ci	utf8leaf_t *leaf;
306062306a36Sopenharmony_ci	int ccc;
306162306a36Sopenharmony_ci
306262306a36Sopenharmony_ci	for (;;) {
306362306a36Sopenharmony_ci		/* Check for the end of a decomposed character. */
306462306a36Sopenharmony_ci		if (u8c->p && *u8c->s == '\0') {
306562306a36Sopenharmony_ci			u8c->s = u8c->p;
306662306a36Sopenharmony_ci			u8c->p = NULL;
306762306a36Sopenharmony_ci		}
306862306a36Sopenharmony_ci
306962306a36Sopenharmony_ci		/* Check for end-of-string. */
307062306a36Sopenharmony_ci		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
307162306a36Sopenharmony_ci			/* There is no next byte. */
307262306a36Sopenharmony_ci			if (u8c->ccc == STOPPER)
307362306a36Sopenharmony_ci				return 0;
307462306a36Sopenharmony_ci			/* End-of-string during a scan counts as a stopper. */
307562306a36Sopenharmony_ci			ccc = STOPPER;
307662306a36Sopenharmony_ci			goto ccc_mismatch;
307762306a36Sopenharmony_ci		} else if ((*u8c->s & 0xC0) == 0x80) {
307862306a36Sopenharmony_ci			/* This is a continuation of the current character. */
307962306a36Sopenharmony_ci			if (!u8c->p)
308062306a36Sopenharmony_ci				u8c->len--;
308162306a36Sopenharmony_ci			return (unsigned char)*u8c->s++;
308262306a36Sopenharmony_ci		}
308362306a36Sopenharmony_ci
308462306a36Sopenharmony_ci		/* Look up the data for the current character. */
308562306a36Sopenharmony_ci		if (u8c->p) {
308662306a36Sopenharmony_ci			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
308762306a36Sopenharmony_ci		} else {
308862306a36Sopenharmony_ci			leaf = utf8nlookup(u8c->tree, u8c->hangul,
308962306a36Sopenharmony_ci					   u8c->s, u8c->len);
309062306a36Sopenharmony_ci		}
309162306a36Sopenharmony_ci
309262306a36Sopenharmony_ci		/* No leaf found implies that the input is a binary blob. */
309362306a36Sopenharmony_ci		if (!leaf)
309462306a36Sopenharmony_ci			return -1;
309562306a36Sopenharmony_ci
309662306a36Sopenharmony_ci		/* Characters that are too new have CCC 0. */
309762306a36Sopenharmony_ci		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
309862306a36Sopenharmony_ci			ccc = STOPPER;
309962306a36Sopenharmony_ci		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
310062306a36Sopenharmony_ci			u8c->len -= utf8clen(u8c->s);
310162306a36Sopenharmony_ci			u8c->p = u8c->s + utf8clen(u8c->s);
310262306a36Sopenharmony_ci			u8c->s = LEAF_STR(leaf);
310362306a36Sopenharmony_ci			/* Empty decomposition implies CCC 0. */
310462306a36Sopenharmony_ci			if (*u8c->s == '\0') {
310562306a36Sopenharmony_ci				if (u8c->ccc == STOPPER)
310662306a36Sopenharmony_ci					continue;
310762306a36Sopenharmony_ci				ccc = STOPPER;
310862306a36Sopenharmony_ci				goto ccc_mismatch;
310962306a36Sopenharmony_ci			}
311062306a36Sopenharmony_ci			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
311162306a36Sopenharmony_ci			ccc = LEAF_CCC(leaf);
311262306a36Sopenharmony_ci		}
311362306a36Sopenharmony_ci		u8c->unichar = utf8decode(u8c->s);
311462306a36Sopenharmony_ci
311562306a36Sopenharmony_ci		/*
311662306a36Sopenharmony_ci		 * If this is not a stopper, then see if it updates
311762306a36Sopenharmony_ci		 * the next canonical class to be emitted.
311862306a36Sopenharmony_ci		 */
311962306a36Sopenharmony_ci		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
312062306a36Sopenharmony_ci			u8c->nccc = ccc;
312162306a36Sopenharmony_ci
312262306a36Sopenharmony_ci		/*
312362306a36Sopenharmony_ci		 * Return the current byte if this is the current
312462306a36Sopenharmony_ci		 * combining class.
312562306a36Sopenharmony_ci		 */
312662306a36Sopenharmony_ci		if (ccc == u8c->ccc) {
312762306a36Sopenharmony_ci			if (!u8c->p)
312862306a36Sopenharmony_ci				u8c->len--;
312962306a36Sopenharmony_ci			return (unsigned char)*u8c->s++;
313062306a36Sopenharmony_ci		}
313162306a36Sopenharmony_ci
313262306a36Sopenharmony_ci		/* Current combining class mismatch. */
313362306a36Sopenharmony_ci	ccc_mismatch:
313462306a36Sopenharmony_ci		if (u8c->nccc == STOPPER) {
313562306a36Sopenharmony_ci			/*
313662306a36Sopenharmony_ci			 * Scan forward for the first canonical class
313762306a36Sopenharmony_ci			 * to be emitted.  Save the position from
313862306a36Sopenharmony_ci			 * which to restart.
313962306a36Sopenharmony_ci			 */
314062306a36Sopenharmony_ci			assert(u8c->ccc == STOPPER);
314162306a36Sopenharmony_ci			u8c->ccc = MINCCC - 1;
314262306a36Sopenharmony_ci			u8c->nccc = ccc;
314362306a36Sopenharmony_ci			u8c->sp = u8c->p;
314462306a36Sopenharmony_ci			u8c->ss = u8c->s;
314562306a36Sopenharmony_ci			u8c->slen = u8c->len;
314662306a36Sopenharmony_ci			if (!u8c->p)
314762306a36Sopenharmony_ci				u8c->len -= utf8clen(u8c->s);
314862306a36Sopenharmony_ci			u8c->s += utf8clen(u8c->s);
314962306a36Sopenharmony_ci		} else if (ccc != STOPPER) {
315062306a36Sopenharmony_ci			/* Not a stopper, and not the ccc we're emitting. */
315162306a36Sopenharmony_ci			if (!u8c->p)
315262306a36Sopenharmony_ci				u8c->len -= utf8clen(u8c->s);
315362306a36Sopenharmony_ci			u8c->s += utf8clen(u8c->s);
315462306a36Sopenharmony_ci		} else if (u8c->nccc != MAXCCC + 1) {
315562306a36Sopenharmony_ci			/* At a stopper, restart for next ccc. */
315662306a36Sopenharmony_ci			u8c->ccc = u8c->nccc;
315762306a36Sopenharmony_ci			u8c->nccc = MAXCCC + 1;
315862306a36Sopenharmony_ci			u8c->s = u8c->ss;
315962306a36Sopenharmony_ci			u8c->p = u8c->sp;
316062306a36Sopenharmony_ci			u8c->len = u8c->slen;
316162306a36Sopenharmony_ci		} else {
316262306a36Sopenharmony_ci			/* All done, proceed from here. */
316362306a36Sopenharmony_ci			u8c->ccc = STOPPER;
316462306a36Sopenharmony_ci			u8c->nccc = STOPPER;
316562306a36Sopenharmony_ci			u8c->sp = NULL;
316662306a36Sopenharmony_ci			u8c->ss = NULL;
316762306a36Sopenharmony_ci			u8c->slen = 0;
316862306a36Sopenharmony_ci		}
316962306a36Sopenharmony_ci	}
317062306a36Sopenharmony_ci}
317162306a36Sopenharmony_ci
317262306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
317362306a36Sopenharmony_ci
317462306a36Sopenharmony_cistatic int normalize_line(struct tree *tree)
317562306a36Sopenharmony_ci{
317662306a36Sopenharmony_ci	char *s;
317762306a36Sopenharmony_ci	char *t;
317862306a36Sopenharmony_ci	int c;
317962306a36Sopenharmony_ci	struct utf8cursor u8c;
318062306a36Sopenharmony_ci
318162306a36Sopenharmony_ci	/* First test: null-terminated string. */
318262306a36Sopenharmony_ci	s = buf2;
318362306a36Sopenharmony_ci	t = buf3;
318462306a36Sopenharmony_ci	if (utf8cursor(&u8c, tree, s))
318562306a36Sopenharmony_ci		return -1;
318662306a36Sopenharmony_ci	while ((c = utf8byte(&u8c)) > 0)
318762306a36Sopenharmony_ci		if (c != (unsigned char)*t++)
318862306a36Sopenharmony_ci			return -1;
318962306a36Sopenharmony_ci	if (c < 0)
319062306a36Sopenharmony_ci		return -1;
319162306a36Sopenharmony_ci	if (*t != 0)
319262306a36Sopenharmony_ci		return -1;
319362306a36Sopenharmony_ci
319462306a36Sopenharmony_ci	/* Second test: length-limited string. */
319562306a36Sopenharmony_ci	s = buf2;
319662306a36Sopenharmony_ci	/* Replace NUL with a value that will cause an error if seen. */
319762306a36Sopenharmony_ci	s[strlen(s) + 1] = -1;
319862306a36Sopenharmony_ci	t = buf3;
319962306a36Sopenharmony_ci	if (utf8cursor(&u8c, tree, s))
320062306a36Sopenharmony_ci		return -1;
320162306a36Sopenharmony_ci	while ((c = utf8byte(&u8c)) > 0)
320262306a36Sopenharmony_ci		if (c != (unsigned char)*t++)
320362306a36Sopenharmony_ci			return -1;
320462306a36Sopenharmony_ci	if (c < 0)
320562306a36Sopenharmony_ci		return -1;
320662306a36Sopenharmony_ci	if (*t != 0)
320762306a36Sopenharmony_ci		return -1;
320862306a36Sopenharmony_ci
320962306a36Sopenharmony_ci	return 0;
321062306a36Sopenharmony_ci}
321162306a36Sopenharmony_ci
321262306a36Sopenharmony_cistatic void normalization_test(void)
321362306a36Sopenharmony_ci{
321462306a36Sopenharmony_ci	FILE *file;
321562306a36Sopenharmony_ci	unsigned int unichar;
321662306a36Sopenharmony_ci	struct unicode_data *data;
321762306a36Sopenharmony_ci	char *s;
321862306a36Sopenharmony_ci	char *t;
321962306a36Sopenharmony_ci	int ret;
322062306a36Sopenharmony_ci	int ignorables;
322162306a36Sopenharmony_ci	int tests = 0;
322262306a36Sopenharmony_ci	int failures = 0;
322362306a36Sopenharmony_ci
322462306a36Sopenharmony_ci	if (verbose > 0)
322562306a36Sopenharmony_ci		printf("Parsing %s\n", test_name);
322662306a36Sopenharmony_ci	/* Step one, read data from file. */
322762306a36Sopenharmony_ci	file = fopen(test_name, "r");
322862306a36Sopenharmony_ci	if (!file)
322962306a36Sopenharmony_ci		open_fail(test_name, errno);
323062306a36Sopenharmony_ci
323162306a36Sopenharmony_ci	while (fgets(line, LINESIZE, file)) {
323262306a36Sopenharmony_ci		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
323362306a36Sopenharmony_ci			     buf0, buf1);
323462306a36Sopenharmony_ci		if (ret != 2 || *line == '#')
323562306a36Sopenharmony_ci			continue;
323662306a36Sopenharmony_ci		s = buf0;
323762306a36Sopenharmony_ci		t = buf2;
323862306a36Sopenharmony_ci		while (*s) {
323962306a36Sopenharmony_ci			unichar = strtoul(s, &s, 16);
324062306a36Sopenharmony_ci			t += utf8encode(t, unichar);
324162306a36Sopenharmony_ci		}
324262306a36Sopenharmony_ci		*t = '\0';
324362306a36Sopenharmony_ci
324462306a36Sopenharmony_ci		ignorables = 0;
324562306a36Sopenharmony_ci		s = buf1;
324662306a36Sopenharmony_ci		t = buf3;
324762306a36Sopenharmony_ci		while (*s) {
324862306a36Sopenharmony_ci			unichar = strtoul(s, &s, 16);
324962306a36Sopenharmony_ci			data = &unicode_data[unichar];
325062306a36Sopenharmony_ci			if (data->utf8nfdi && !*data->utf8nfdi)
325162306a36Sopenharmony_ci				ignorables = 1;
325262306a36Sopenharmony_ci			else
325362306a36Sopenharmony_ci				t += utf8encode(t, unichar);
325462306a36Sopenharmony_ci		}
325562306a36Sopenharmony_ci		*t = '\0';
325662306a36Sopenharmony_ci
325762306a36Sopenharmony_ci		tests++;
325862306a36Sopenharmony_ci		if (normalize_line(nfdi_tree) < 0) {
325962306a36Sopenharmony_ci			printf("Line %s -> %s", buf0, buf1);
326062306a36Sopenharmony_ci			if (ignorables)
326162306a36Sopenharmony_ci				printf(" (ignorables removed)");
326262306a36Sopenharmony_ci			printf(" failure\n");
326362306a36Sopenharmony_ci			failures++;
326462306a36Sopenharmony_ci		}
326562306a36Sopenharmony_ci	}
326662306a36Sopenharmony_ci	fclose(file);
326762306a36Sopenharmony_ci	if (verbose > 0)
326862306a36Sopenharmony_ci		printf("Ran %d tests with %d failures\n", tests, failures);
326962306a36Sopenharmony_ci	if (failures)
327062306a36Sopenharmony_ci		file_fail(test_name);
327162306a36Sopenharmony_ci}
327262306a36Sopenharmony_ci
327362306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
327462306a36Sopenharmony_ci
327562306a36Sopenharmony_cistatic void write_file(void)
327662306a36Sopenharmony_ci{
327762306a36Sopenharmony_ci	FILE *file;
327862306a36Sopenharmony_ci	int i;
327962306a36Sopenharmony_ci	int j;
328062306a36Sopenharmony_ci	int t;
328162306a36Sopenharmony_ci	int gen;
328262306a36Sopenharmony_ci
328362306a36Sopenharmony_ci	if (verbose > 0)
328462306a36Sopenharmony_ci		printf("Writing %s\n", utf8_name);
328562306a36Sopenharmony_ci	file = fopen(utf8_name, "w");
328662306a36Sopenharmony_ci	if (!file)
328762306a36Sopenharmony_ci		open_fail(utf8_name, errno);
328862306a36Sopenharmony_ci
328962306a36Sopenharmony_ci	fprintf(file, "/* This file is generated code, do not edit. */\n");
329062306a36Sopenharmony_ci	fprintf(file, "\n");
329162306a36Sopenharmony_ci	fprintf(file, "#include <linux/module.h>\n");
329262306a36Sopenharmony_ci	fprintf(file, "#include <linux/kernel.h>\n");
329362306a36Sopenharmony_ci	fprintf(file, "#include \"utf8n.h\"\n");
329462306a36Sopenharmony_ci	fprintf(file, "\n");
329562306a36Sopenharmony_ci	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
329662306a36Sopenharmony_ci	for (i = 0; i != ages_count; i++)
329762306a36Sopenharmony_ci		fprintf(file, "\t%#x%s\n", ages[i],
329862306a36Sopenharmony_ci			ages[i] == unicode_maxage ? "" : ",");
329962306a36Sopenharmony_ci	fprintf(file, "};\n");
330062306a36Sopenharmony_ci	fprintf(file, "\n");
330162306a36Sopenharmony_ci	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
330262306a36Sopenharmony_ci	t = 0;
330362306a36Sopenharmony_ci	for (gen = 0; gen < ages_count; gen++) {
330462306a36Sopenharmony_ci		fprintf(file, "\t{ %#x, %d }%s\n",
330562306a36Sopenharmony_ci			ages[gen], trees[t].index,
330662306a36Sopenharmony_ci			ages[gen] == unicode_maxage ? "" : ",");
330762306a36Sopenharmony_ci		if (trees[t].maxage == ages[gen])
330862306a36Sopenharmony_ci			t += 2;
330962306a36Sopenharmony_ci	}
331062306a36Sopenharmony_ci	fprintf(file, "};\n");
331162306a36Sopenharmony_ci	fprintf(file, "\n");
331262306a36Sopenharmony_ci	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
331362306a36Sopenharmony_ci	t = 1;
331462306a36Sopenharmony_ci	for (gen = 0; gen < ages_count; gen++) {
331562306a36Sopenharmony_ci		fprintf(file, "\t{ %#x, %d }%s\n",
331662306a36Sopenharmony_ci			ages[gen], trees[t].index,
331762306a36Sopenharmony_ci			ages[gen] == unicode_maxage ? "" : ",");
331862306a36Sopenharmony_ci		if (trees[t].maxage == ages[gen])
331962306a36Sopenharmony_ci			t += 2;
332062306a36Sopenharmony_ci	}
332162306a36Sopenharmony_ci	fprintf(file, "};\n");
332262306a36Sopenharmony_ci	fprintf(file, "\n");
332362306a36Sopenharmony_ci	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
332462306a36Sopenharmony_ci		utf8data_size);
332562306a36Sopenharmony_ci	t = 0;
332662306a36Sopenharmony_ci	for (i = 0; i != utf8data_size; i += 16) {
332762306a36Sopenharmony_ci		if (i == trees[t].index) {
332862306a36Sopenharmony_ci			fprintf(file, "\t/* %s_%x */\n",
332962306a36Sopenharmony_ci				trees[t].type, trees[t].maxage);
333062306a36Sopenharmony_ci			if (t < trees_count-1)
333162306a36Sopenharmony_ci				t++;
333262306a36Sopenharmony_ci		}
333362306a36Sopenharmony_ci		fprintf(file, "\t");
333462306a36Sopenharmony_ci		for (j = i; j != i + 16; j++)
333562306a36Sopenharmony_ci			fprintf(file, "0x%.2x%s", utf8data[j],
333662306a36Sopenharmony_ci				(j < utf8data_size -1 ? "," : ""));
333762306a36Sopenharmony_ci		fprintf(file, "\n");
333862306a36Sopenharmony_ci	}
333962306a36Sopenharmony_ci	fprintf(file, "};\n");
334062306a36Sopenharmony_ci	fprintf(file, "\n");
334162306a36Sopenharmony_ci	fprintf(file, "struct utf8data_table utf8_data_table = {\n");
334262306a36Sopenharmony_ci	fprintf(file, "\t.utf8agetab = utf8agetab,\n");
334362306a36Sopenharmony_ci	fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
334462306a36Sopenharmony_ci	fprintf(file, "\n");
334562306a36Sopenharmony_ci	fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
334662306a36Sopenharmony_ci	fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
334762306a36Sopenharmony_ci	fprintf(file, "\n");
334862306a36Sopenharmony_ci	fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
334962306a36Sopenharmony_ci	fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
335062306a36Sopenharmony_ci	fprintf(file, "\n");
335162306a36Sopenharmony_ci	fprintf(file, "\t.utf8data = utf8data,\n");
335262306a36Sopenharmony_ci	fprintf(file, "};\n");
335362306a36Sopenharmony_ci	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
335462306a36Sopenharmony_ci	fprintf(file, "\n");
335562306a36Sopenharmony_ci	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
335662306a36Sopenharmony_ci	fclose(file);
335762306a36Sopenharmony_ci}
335862306a36Sopenharmony_ci
335962306a36Sopenharmony_ci/* ------------------------------------------------------------------ */
336062306a36Sopenharmony_ci
336162306a36Sopenharmony_ciint main(int argc, char *argv[])
336262306a36Sopenharmony_ci{
336362306a36Sopenharmony_ci	unsigned int unichar;
336462306a36Sopenharmony_ci	int opt;
336562306a36Sopenharmony_ci
336662306a36Sopenharmony_ci	argv0 = argv[0];
336762306a36Sopenharmony_ci
336862306a36Sopenharmony_ci	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
336962306a36Sopenharmony_ci		switch (opt) {
337062306a36Sopenharmony_ci		case 'a':
337162306a36Sopenharmony_ci			age_name = optarg;
337262306a36Sopenharmony_ci			break;
337362306a36Sopenharmony_ci		case 'c':
337462306a36Sopenharmony_ci			ccc_name = optarg;
337562306a36Sopenharmony_ci			break;
337662306a36Sopenharmony_ci		case 'd':
337762306a36Sopenharmony_ci			data_name = optarg;
337862306a36Sopenharmony_ci			break;
337962306a36Sopenharmony_ci		case 'f':
338062306a36Sopenharmony_ci			fold_name = optarg;
338162306a36Sopenharmony_ci			break;
338262306a36Sopenharmony_ci		case 'n':
338362306a36Sopenharmony_ci			norm_name = optarg;
338462306a36Sopenharmony_ci			break;
338562306a36Sopenharmony_ci		case 'o':
338662306a36Sopenharmony_ci			utf8_name = optarg;
338762306a36Sopenharmony_ci			break;
338862306a36Sopenharmony_ci		case 'p':
338962306a36Sopenharmony_ci			prop_name = optarg;
339062306a36Sopenharmony_ci			break;
339162306a36Sopenharmony_ci		case 't':
339262306a36Sopenharmony_ci			test_name = optarg;
339362306a36Sopenharmony_ci			break;
339462306a36Sopenharmony_ci		case 'v':
339562306a36Sopenharmony_ci			verbose++;
339662306a36Sopenharmony_ci			break;
339762306a36Sopenharmony_ci		case 'h':
339862306a36Sopenharmony_ci			help();
339962306a36Sopenharmony_ci			exit(0);
340062306a36Sopenharmony_ci		default:
340162306a36Sopenharmony_ci			usage();
340262306a36Sopenharmony_ci		}
340362306a36Sopenharmony_ci	}
340462306a36Sopenharmony_ci
340562306a36Sopenharmony_ci	if (verbose > 1)
340662306a36Sopenharmony_ci		help();
340762306a36Sopenharmony_ci	for (unichar = 0; unichar != 0x110000; unichar++)
340862306a36Sopenharmony_ci		unicode_data[unichar].code = unichar;
340962306a36Sopenharmony_ci	age_init();
341062306a36Sopenharmony_ci	ccc_init();
341162306a36Sopenharmony_ci	nfdi_init();
341262306a36Sopenharmony_ci	nfdicf_init();
341362306a36Sopenharmony_ci	ignore_init();
341462306a36Sopenharmony_ci	corrections_init();
341562306a36Sopenharmony_ci	hangul_decompose();
341662306a36Sopenharmony_ci	nfdi_decompose();
341762306a36Sopenharmony_ci	nfdicf_decompose();
341862306a36Sopenharmony_ci	utf8_init();
341962306a36Sopenharmony_ci	trees_init();
342062306a36Sopenharmony_ci	trees_populate();
342162306a36Sopenharmony_ci	trees_reduce();
342262306a36Sopenharmony_ci	trees_verify();
342362306a36Sopenharmony_ci	/* Prevent "unused function" warning. */
342462306a36Sopenharmony_ci	(void)lookup(nfdi_tree, " ");
342562306a36Sopenharmony_ci	if (verbose > 2)
342662306a36Sopenharmony_ci		tree_walk(nfdi_tree);
342762306a36Sopenharmony_ci	if (verbose > 2)
342862306a36Sopenharmony_ci		tree_walk(nfdicf_tree);
342962306a36Sopenharmony_ci	normalization_test();
343062306a36Sopenharmony_ci	write_file();
343162306a36Sopenharmony_ci
343262306a36Sopenharmony_ci	return 0;
343362306a36Sopenharmony_ci}
3434