#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
287db96d56Sopenharmony_ci
297db96d56Sopenharmony_ciimport dataclasses
307db96d56Sopenharmony_ciimport os
317db96d56Sopenharmony_ciimport sys
327db96d56Sopenharmony_ciimport zipfile
337db96d56Sopenharmony_ci
347db96d56Sopenharmony_cifrom functools import partial
357db96d56Sopenharmony_cifrom textwrap import dedent
367db96d56Sopenharmony_cifrom typing import Iterator, List, Optional, Set, Tuple
377db96d56Sopenharmony_ci
# Name of this script and its version; both are written into the banner
# comment of every generated header.
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "14.0.0"

# Data file name templates.  The %s slot takes "" for the current version
# or "-<version>" for an older release (see maketables).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 0 (the BMP), 15 and 16.
# PUA_1 is the first of the three areas; it lies in plane 0, not plane 1.
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older database versions for which delta tables are generated.
old_versions = ["3.2.0"]

# The position of a name in each of the lists below is the numeric code
# stored in the generated records, so the order must not change.
#
# NOTE: "Cn" appears twice (positions 0 and 17); list.index() lookups
# always resolve to position 0.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# (each entry is a pair of hexadecimal code point strings: first, last)
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFF'),
    ('20000', '2A6DF'),
    ('2A700', '2B738'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
]
1127db96d56Sopenharmony_ci
1137db96d56Sopenharmony_ci
def maketables(trace=0):
    """Load the Unicode database and run the three table generators.

    Reads the current database (UNIDATA_VERSION), then reads every release
    listed in ``old_versions`` and merges it into the current one with
    merge_old_version().  Finally calls makeunicodename(), makeunicodedata()
    and makeunicodetype(), in that order.

    ``trace`` is passed through unchanged to each generator.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    # Only populated (truthy) table slots count as characters.
    print(sum(1 for record in unicode.table if record), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(sum(1 for record in old_unicode.table if record), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
1317db96d56Sopenharmony_ci
1327db96d56Sopenharmony_ci
1337db96d56Sopenharmony_ci# --------------------------------------------------------------------
1347db96d56Sopenharmony_ci# unicode character properties
1357db96d56Sopenharmony_ci
def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from the loaded Unicode database.

    Builds, in order:
      1) the table of unique property records plus a per-code-point index,
      2) the decomposition data, its per-code-point index, and the list of
         NFC composition pairs,
    and then writes them to FILE, followed by one set of delta tables for
    each (version, table, normalization) entry in ``unicode.changed``.

    ``unicode`` must provide ``chars``, ``table``, ``exclusions`` and
    ``changed``.  ``trace`` is forwarded to splitbins() and Array.dump().

    Raises Exception when a decomposition field splits into more than 19
    whitespace-separated items.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps a property tuple to its position in ``table``.  The seed must be
    # keyed by the tuple (not by the index) so that an all-zero record
    # reuses slot 0 instead of being appended a second time.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        # extract database properties; names are stored as their position
        # in the corresponding *_NAMES list
        item = (
            CATEGORY_NAMES.index(record.general_category),
            int(record.canonical_combining_class),
            BIDIRECTIONAL_NAMES.index(record.bidi_class),
            record.bidi_mirrored == "Y",
            EASTASIANWIDTH_NAMES.index(record.east_asian_width),
            record.quick_check,
            )
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if not record:
            continue
        if not record.decomposition_type:
            # no decomposition: point at the leading 0 in decomp_data
            decomp_index[char] = 0
            continue
        decomp = record.decomposition_type.split()
        if len(decomp) > 19:
            raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
        # prefix: an optional leading "<tag>" item, replaced by its
        # position in decomp_prefix ("" is position 0)
        if decomp[0][0] == "<":
            prefix = decomp.pop(0)
        else:
            prefix = ""
        try:
            i = decomp_prefix.index(prefix)
        except ValueError:
            i = len(decomp_prefix)
            decomp_prefix.append(prefix)
        prefix = i
        assert prefix < 256
        # content: a header word (prefix index in the low 8 bits, number of
        # code points above that) followed by the code points themselves
        decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
        # Collect NFC pairs: unprefixed two-code-point decompositions that
        # are not excluded and whose first code point has combining class 0
        if not prefix and len(decomp) == 3 and \
           char not in unicode.exclusions and \
           unicode.table[decomp[1]].canonical_combining_class == "0":
            _, left, right = decomp
            comp_first[left] = 1
            comp_last[right] = 1
            comp_pairs.append((left, right, char))
        # NOTE: the original code first tried decomp_data.index(decomp),
        # but decomp_data is a flat list of ints and decomp is a list, so
        # that lookup could never succeed.  Every entry is appended; the
        # generated data is unchanged.
        i = len(decomp_data)
        decomp_data.extend(decomp)
        decomp_size = decomp_size + len(decomp) * 2
        decomp_index[char] = i

    # Renumber the marked first/last characters consecutively and group
    # runs of adjacent code points into (start, end) ranges.
    # NOTE(review): the run detection assumes unicode.chars iterates code
    # points in ascending order -- confirm against UnicodeData.
    total_first = total_last = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = total_first
            total_first += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = total_last
            total_last += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    # Flush the final open range, if any character was marked at all;
    # appending None here would break the tuple unpacking further down.
    if prev_f is not None:
        comp_first_ranges.append(prev_f)
    if prev_l is not None:
        comp_last_ranges.append(prev_l)

    # Row-major total_first x total_last matrix of composed characters.
    comp_data = [0]*(total_first*total_last)
    for left, right, char in comp_pairs:
        row = comp_first[left]
        col = comp_last[right]
        comp_data[row*total_last+col] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split NFC pair matrix
        comp_index1, comp_index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", comp_index1).dump(fp, trace)
        Array("comp_data", comp_index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, change_table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            # De-duplicate the change records; slot 0 is always the
            # table's first record.
            records = [change_table[0]]
            change_cache = {change_table[0]:0}
            change_index = [0] * len(change_table)
            for i, record in enumerate(change_table):
                try:
                    change_index[i] = change_cache[record]
                except KeyError:
                    change_index[i] = change_cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(change_index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            # k is formatted with hex(); v is emitted verbatim after "0x"
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")
3847db96d56Sopenharmony_ci
3857db96d56Sopenharmony_ci
3867db96d56Sopenharmony_ci# --------------------------------------------------------------------
3877db96d56Sopenharmony_ci# unicode character type tables
3887db96d56Sopenharmony_ci
3897db96d56Sopenharmony_cidef makeunicodetype(unicode, trace):
3907db96d56Sopenharmony_ci
3917db96d56Sopenharmony_ci    FILE = "Objects/unicodetype_db.h"
3927db96d56Sopenharmony_ci
3937db96d56Sopenharmony_ci    print("--- Preparing", FILE, "...")
3947db96d56Sopenharmony_ci
3957db96d56Sopenharmony_ci    # extract unicode types
3967db96d56Sopenharmony_ci    dummy = (0, 0, 0, 0, 0, 0)
3977db96d56Sopenharmony_ci    table = [dummy]
3987db96d56Sopenharmony_ci    cache = {0: dummy}
3997db96d56Sopenharmony_ci    index = [0] * len(unicode.chars)
4007db96d56Sopenharmony_ci    numeric = {}
4017db96d56Sopenharmony_ci    spaces = []
4027db96d56Sopenharmony_ci    linebreaks = []
4037db96d56Sopenharmony_ci    extra_casing = []
4047db96d56Sopenharmony_ci
4057db96d56Sopenharmony_ci    for char in unicode.chars:
4067db96d56Sopenharmony_ci        record = unicode.table[char]
4077db96d56Sopenharmony_ci        if record:
4087db96d56Sopenharmony_ci            # extract database properties
4097db96d56Sopenharmony_ci            category = record.general_category
4107db96d56Sopenharmony_ci            bidirectional = record.bidi_class
4117db96d56Sopenharmony_ci            properties = record.binary_properties
4127db96d56Sopenharmony_ci            flags = 0
4137db96d56Sopenharmony_ci            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
4147db96d56Sopenharmony_ci                flags |= ALPHA_MASK
4157db96d56Sopenharmony_ci            if "Lowercase" in properties:
4167db96d56Sopenharmony_ci                flags |= LOWER_MASK
4177db96d56Sopenharmony_ci            if 'Line_Break' in properties or bidirectional == "B":
4187db96d56Sopenharmony_ci                flags |= LINEBREAK_MASK
4197db96d56Sopenharmony_ci                linebreaks.append(char)
4207db96d56Sopenharmony_ci            if category == "Zs" or bidirectional in ("WS", "B", "S"):
4217db96d56Sopenharmony_ci                flags |= SPACE_MASK
4227db96d56Sopenharmony_ci                spaces.append(char)
4237db96d56Sopenharmony_ci            if category == "Lt":
4247db96d56Sopenharmony_ci                flags |= TITLE_MASK
4257db96d56Sopenharmony_ci            if "Uppercase" in properties:
4267db96d56Sopenharmony_ci                flags |= UPPER_MASK
4277db96d56Sopenharmony_ci            if char == ord(" ") or category[0] not in ("C", "Z"):
4287db96d56Sopenharmony_ci                flags |= PRINTABLE_MASK
4297db96d56Sopenharmony_ci            if "XID_Start" in properties:
4307db96d56Sopenharmony_ci                flags |= XID_START_MASK
4317db96d56Sopenharmony_ci            if "XID_Continue" in properties:
4327db96d56Sopenharmony_ci                flags |= XID_CONTINUE_MASK
4337db96d56Sopenharmony_ci            if "Cased" in properties:
4347db96d56Sopenharmony_ci                flags |= CASED_MASK
4357db96d56Sopenharmony_ci            if "Case_Ignorable" in properties:
4367db96d56Sopenharmony_ci                flags |= CASE_IGNORABLE_MASK
4377db96d56Sopenharmony_ci            sc = unicode.special_casing.get(char)
4387db96d56Sopenharmony_ci            cf = unicode.case_folding.get(char, [char])
4397db96d56Sopenharmony_ci            if record.simple_uppercase_mapping:
4407db96d56Sopenharmony_ci                upper = int(record.simple_uppercase_mapping, 16)
4417db96d56Sopenharmony_ci            else:
4427db96d56Sopenharmony_ci                upper = char
4437db96d56Sopenharmony_ci            if record.simple_lowercase_mapping:
4447db96d56Sopenharmony_ci                lower = int(record.simple_lowercase_mapping, 16)
4457db96d56Sopenharmony_ci            else:
4467db96d56Sopenharmony_ci                lower = char
4477db96d56Sopenharmony_ci            if record.simple_titlecase_mapping:
4487db96d56Sopenharmony_ci                title = int(record.simple_titlecase_mapping, 16)
4497db96d56Sopenharmony_ci            else:
4507db96d56Sopenharmony_ci                title = upper
4517db96d56Sopenharmony_ci            if sc is None and cf != [lower]:
4527db96d56Sopenharmony_ci                sc = ([lower], [title], [upper])
4537db96d56Sopenharmony_ci            if sc is None:
4547db96d56Sopenharmony_ci                if upper == lower == title:
4557db96d56Sopenharmony_ci                    upper = lower = title = 0
4567db96d56Sopenharmony_ci                else:
4577db96d56Sopenharmony_ci                    upper = upper - char
4587db96d56Sopenharmony_ci                    lower = lower - char
4597db96d56Sopenharmony_ci                    title = title - char
4607db96d56Sopenharmony_ci                    assert (abs(upper) <= 2147483647 and
4617db96d56Sopenharmony_ci                            abs(lower) <= 2147483647 and
4627db96d56Sopenharmony_ci                            abs(title) <= 2147483647)
4637db96d56Sopenharmony_ci            else:
4647db96d56Sopenharmony_ci                # This happens either when some character maps to more than one
4657db96d56Sopenharmony_ci                # character in uppercase, lowercase, or titlecase or the
4667db96d56Sopenharmony_ci                # casefolded version of the character is different from the
4677db96d56Sopenharmony_ci                # lowercase. The extra characters are stored in a different
4687db96d56Sopenharmony_ci                # array.
4697db96d56Sopenharmony_ci                flags |= EXTENDED_CASE_MASK
4707db96d56Sopenharmony_ci                lower = len(extra_casing) | (len(sc[0]) << 24)
4717db96d56Sopenharmony_ci                extra_casing.extend(sc[0])
4727db96d56Sopenharmony_ci                if cf != sc[0]:
4737db96d56Sopenharmony_ci                    lower |= len(cf) << 20
4747db96d56Sopenharmony_ci                    extra_casing.extend(cf)
4757db96d56Sopenharmony_ci                upper = len(extra_casing) | (len(sc[2]) << 24)
4767db96d56Sopenharmony_ci                extra_casing.extend(sc[2])
4777db96d56Sopenharmony_ci                # Title is probably equal to upper.
4787db96d56Sopenharmony_ci                if sc[1] == sc[2]:
4797db96d56Sopenharmony_ci                    title = upper
4807db96d56Sopenharmony_ci                else:
4817db96d56Sopenharmony_ci                    title = len(extra_casing) | (len(sc[1]) << 24)
4827db96d56Sopenharmony_ci                    extra_casing.extend(sc[1])
4837db96d56Sopenharmony_ci            # decimal digit, integer digit
4847db96d56Sopenharmony_ci            decimal = 0
4857db96d56Sopenharmony_ci            if record.decomposition_mapping:
4867db96d56Sopenharmony_ci                flags |= DECIMAL_MASK
4877db96d56Sopenharmony_ci                decimal = int(record.decomposition_mapping)
4887db96d56Sopenharmony_ci            digit = 0
4897db96d56Sopenharmony_ci            if record.numeric_type:
4907db96d56Sopenharmony_ci                flags |= DIGIT_MASK
4917db96d56Sopenharmony_ci                digit = int(record.numeric_type)
4927db96d56Sopenharmony_ci            if record.numeric_value:
4937db96d56Sopenharmony_ci                flags |= NUMERIC_MASK
4947db96d56Sopenharmony_ci                numeric.setdefault(record.numeric_value, []).append(char)
4957db96d56Sopenharmony_ci            item = (
4967db96d56Sopenharmony_ci                upper, lower, title, decimal, digit, flags
4977db96d56Sopenharmony_ci                )
4987db96d56Sopenharmony_ci            # add entry to index and item tables
4997db96d56Sopenharmony_ci            i = cache.get(item)
5007db96d56Sopenharmony_ci            if i is None:
5017db96d56Sopenharmony_ci                cache[item] = i = len(table)
5027db96d56Sopenharmony_ci                table.append(item)
5037db96d56Sopenharmony_ci            index[char] = i
5047db96d56Sopenharmony_ci
5057db96d56Sopenharmony_ci    print(len(table), "unique character type entries")
5067db96d56Sopenharmony_ci    print(sum(map(len, numeric.values())), "numeric code points")
5077db96d56Sopenharmony_ci    print(len(spaces), "whitespace code points")
5087db96d56Sopenharmony_ci    print(len(linebreaks), "linebreak code points")
5097db96d56Sopenharmony_ci    print(len(extra_casing), "extended case array")
5107db96d56Sopenharmony_ci
5117db96d56Sopenharmony_ci    print("--- Writing", FILE, "...")
5127db96d56Sopenharmony_ci
5137db96d56Sopenharmony_ci    with open(FILE, "w") as fp:
5147db96d56Sopenharmony_ci        fprint = partial(print, file=fp)
5157db96d56Sopenharmony_ci
5167db96d56Sopenharmony_ci        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
5177db96d56Sopenharmony_ci        fprint()
5187db96d56Sopenharmony_ci        fprint("/* a list of unique character type descriptors */")
5197db96d56Sopenharmony_ci        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
5207db96d56Sopenharmony_ci        for item in table:
5217db96d56Sopenharmony_ci            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
5227db96d56Sopenharmony_ci        fprint("};")
5237db96d56Sopenharmony_ci        fprint()
5247db96d56Sopenharmony_ci
5257db96d56Sopenharmony_ci        fprint("/* extended case mappings */")
5267db96d56Sopenharmony_ci        fprint()
5277db96d56Sopenharmony_ci        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
5287db96d56Sopenharmony_ci        for c in extra_casing:
5297db96d56Sopenharmony_ci            fprint("    %d," % c)
5307db96d56Sopenharmony_ci        fprint("};")
5317db96d56Sopenharmony_ci        fprint()
5327db96d56Sopenharmony_ci
5337db96d56Sopenharmony_ci        # split decomposition index table
5347db96d56Sopenharmony_ci        index1, index2, shift = splitbins(index, trace)
5357db96d56Sopenharmony_ci
5367db96d56Sopenharmony_ci        fprint("/* type indexes */")
5377db96d56Sopenharmony_ci        fprint("#define SHIFT", shift)
5387db96d56Sopenharmony_ci        Array("index1", index1).dump(fp, trace)
5397db96d56Sopenharmony_ci        Array("index2", index2).dump(fp, trace)
5407db96d56Sopenharmony_ci
5417db96d56Sopenharmony_ci        # Generate code for _PyUnicode_ToNumeric()
5427db96d56Sopenharmony_ci        numeric_items = sorted(numeric.items())
5437db96d56Sopenharmony_ci        fprint('/* Returns the numeric value as double for Unicode characters')
5447db96d56Sopenharmony_ci        fprint(' * having this property, -1.0 otherwise.')
5457db96d56Sopenharmony_ci        fprint(' */')
5467db96d56Sopenharmony_ci        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
5477db96d56Sopenharmony_ci        fprint('{')
5487db96d56Sopenharmony_ci        fprint('    switch (ch) {')
5497db96d56Sopenharmony_ci        for value, codepoints in numeric_items:
5507db96d56Sopenharmony_ci            # Turn text into float literals
5517db96d56Sopenharmony_ci            parts = value.split('/')
5527db96d56Sopenharmony_ci            parts = [repr(float(part)) for part in parts]
5537db96d56Sopenharmony_ci            value = '/'.join(parts)
5547db96d56Sopenharmony_ci
5557db96d56Sopenharmony_ci            codepoints.sort()
5567db96d56Sopenharmony_ci            for codepoint in codepoints:
5577db96d56Sopenharmony_ci                fprint('    case 0x%04X:' % (codepoint,))
5587db96d56Sopenharmony_ci            fprint('        return (double) %s;' % (value,))
5597db96d56Sopenharmony_ci        fprint('    }')
5607db96d56Sopenharmony_ci        fprint('    return -1.0;')
5617db96d56Sopenharmony_ci        fprint('}')
5627db96d56Sopenharmony_ci        fprint()
5637db96d56Sopenharmony_ci
5647db96d56Sopenharmony_ci        # Generate code for _PyUnicode_IsWhitespace()
5657db96d56Sopenharmony_ci        fprint("/* Returns 1 for Unicode characters having the bidirectional")
5667db96d56Sopenharmony_ci        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
5677db96d56Sopenharmony_ci        fprint(" */")
5687db96d56Sopenharmony_ci        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
5697db96d56Sopenharmony_ci        fprint('{')
5707db96d56Sopenharmony_ci        fprint('    switch (ch) {')
5717db96d56Sopenharmony_ci
5727db96d56Sopenharmony_ci        for codepoint in sorted(spaces):
5737db96d56Sopenharmony_ci            fprint('    case 0x%04X:' % (codepoint,))
5747db96d56Sopenharmony_ci        fprint('        return 1;')
5757db96d56Sopenharmony_ci
5767db96d56Sopenharmony_ci        fprint('    }')
5777db96d56Sopenharmony_ci        fprint('    return 0;')
5787db96d56Sopenharmony_ci        fprint('}')
5797db96d56Sopenharmony_ci        fprint()
5807db96d56Sopenharmony_ci
5817db96d56Sopenharmony_ci        # Generate code for _PyUnicode_IsLinebreak()
5827db96d56Sopenharmony_ci        fprint("/* Returns 1 for Unicode characters having the line break")
5837db96d56Sopenharmony_ci        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
5847db96d56Sopenharmony_ci        fprint(" * type 'B', 0 otherwise.")
5857db96d56Sopenharmony_ci        fprint(" */")
5867db96d56Sopenharmony_ci        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
5877db96d56Sopenharmony_ci        fprint('{')
5887db96d56Sopenharmony_ci        fprint('    switch (ch) {')
5897db96d56Sopenharmony_ci        for codepoint in sorted(linebreaks):
5907db96d56Sopenharmony_ci            fprint('    case 0x%04X:' % (codepoint,))
5917db96d56Sopenharmony_ci        fprint('        return 1;')
5927db96d56Sopenharmony_ci
5937db96d56Sopenharmony_ci        fprint('    }')
5947db96d56Sopenharmony_ci        fprint('    return 0;')
5957db96d56Sopenharmony_ci        fprint('}')
5967db96d56Sopenharmony_ci        fprint()
5977db96d56Sopenharmony_ci
5987db96d56Sopenharmony_ci
5997db96d56Sopenharmony_ci# --------------------------------------------------------------------
6007db96d56Sopenharmony_ci# unicode name database
6017db96d56Sopenharmony_ci
def makeunicodename(unicode, trace):
    '''
    Build the character-name tables and write them to
    Modules/unicodename_db.h.

    The generated header contains:
      * a lexicon of the distinct words used in character names,
      * a phrasebook spelling each name as a sequence of lexicon indexes,
      * a name->code hash table,
      * the name-alias and named-sequence tables.

    `unicode` must provide the attributes `chars`, `table`, `aliases` and
    `named_sequences`.  `trace` is passed through unchanged to splitbins()
    and to the dump() methods of the generated tables.
    '''

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names; every stored name is terminated by a NUL character.
    # Names starting with "<" (placeholder entries) and empty names are
    # left as None.
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names.  note that we distinguish between
    # words inside a name and words ending a name: the latter include
    # the trailing NUL character (NUL is not whitespace, so split() keeps
    # it attached to the last word).

    # words maps each word to a list whose *length* is the number of
    # occurrences seen so far; only that length is used below.
    words = {}
    n = b = 0  # n: total number of words, b: total number of characters
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            # NOTE: the loop variable deliberately reuses the name `w`;
            # the list is only needed to drive this loop.
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need: each escape value
    # is the lead byte of a two-byte index and covers 256 lexicon entries
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    # byte values below `short` are one-byte lexicon indexes; the
    # remaining byte values are reserved for the escapes
    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics: how many word occurrences get a one-byte index
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest (intended
    # to maximize overlap between lexicon entries).
    # NOTE(review): the sort key below is the word itself, so the tail is
    # in reverse lexicographic order rather than ordered by length.
    # Changing it would alter the generated tables, so it is left as is.

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    # lexicon_offset[i] is the start of word number i inside `lexicon`;
    # entry 0 is a placeholder, so real word numbers start at 1.
    lexicon_offset = [0]
    lexicon = ""
    # from here on, `words` maps each word to its word number
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    # convert the lexicon string to a list of integer code units
    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon.
    # phrasebook_offset[char] is the position in `phrasebook` where the
    # name of `char` starts; 0 means the character has no name, which is
    # why the phrasebook itself starts with a dummy entry.
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    # store as a single byte
                    phrasebook.append(i)
                else:
                    # store as two bytes: escape (high bits + short),
                    # then the low 8 bits
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    # every phrasebook entry is expected to fit in one byte
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract (name, code point) pairs, using the same filter as the
    # name collection above (but without the trailing NUL)
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split the phrasebook offset table into a two-level index
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)

        # name aliases: entry k of name_aliases is the real code point for
        # the pseudo code point NAME_ALIASES_START + k
        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        # named sequences: entry k belongs to the pseudo code point
        # NAMED_SEQUENCES_START + k
        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')
7937db96d56Sopenharmony_ci
7947db96d56Sopenharmony_ci
def merge_old_version(version, new, old):
    '''
    Record on `new` how the older database `old` differs from it.

    For every code point whose record differs between the two databases,
    the value from `old` is stored in a per-property change table.  The
    result is appended to `new.changed` as a tuple
    (version, per_codepoint_changes, normalization_changes), where
    per_codepoint_changes holds one tuple per code point:
    (bidir, category, decimal, mirrored, east_asian_width, numeric).

    Raises NotImplementedError if the exclusion sets differ, and a local
    Difference exception if a field changed that this function does not
    know how to record.
    '''
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    class Difference(Exception):
        pass

    codepoint_count = 0x110000

    # In these change records, 0xFF means "no change"
    unchanged = 0xFF
    bidir_changes = [unchanged] * codepoint_count
    category_changes = [unchanged] * codepoint_count
    decimal_changes = [unchanged] * codepoint_count
    mirrored_changes = [unchanged] * codepoint_count
    east_asian_width_changes = [unchanged] * codepoint_count
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * codepoint_count
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    # Field indexes whose changes are deliberately not recorded:
    #   11      ISO comment
    #   12-14   simple uppercase / lowercase / titlecase mappings
    #   16      derived (binary) properties; not yet
    #   17      normalization quickchecks are not performed for
    #           older versions
    ignored_indexes = (11, 12, 13, 14, 16, 17)

    for i in range(codepoint_count):
        new_record = new.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue

        old_record = old.table[i]
        if old_record is None:
            # unassigned in the old version; category 0 is "unassigned"
            category_changes[i] = 0
            continue

        if old_record != new_record:
            # walk the fields and record each one that differs
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old_record, field.name)
                if value == getattr(new_record, field.name):
                    continue

                if k == 1 and i in PUA_15:
                    # the name is not set in the old.table, but in the
                    # new.table we are using it for aliases and named seq
                    assert value == ''
                elif k in ignored_indexes:
                    pass
                elif k == 2:
                    category_changes[i] = CATEGORY_NAMES.index(value)
                elif k == 4:
                    bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                elif k == 5:
                    # We assume that all normalization changes are in 1:1 mappings
                    assert " " not in value
                    normalization_changes.append((i, value))
                elif k == 6:
                    # we only support changes where the old value is a single digit
                    assert value in "0123456789"
                    decimal_changes[i] = int(value)
                elif k == 8:
                    # Since 0 encodes "no change", the old value is better not 0
                    if value:
                        numeric_changes[i] = float(value)
                        assert numeric_changes[i] not in (0, -1)
                    else:
                        numeric_changes[i] = -1
                elif k == 9:
                    mirrored_changes[i] = '1' if value == 'Y' else '0'
                elif k == 15:
                    # change to east asian width
                    east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                else:
                    raise Difference(hex(i), k, old_record, new_record)

    per_codepoint_changes = list(zip(
        bidir_changes,
        category_changes,
        decimal_changes,
        mirrored_changes,
        east_asian_width_changes,
        numeric_changes,
    ))
    new.changed.append((version, per_codepoint_changes, normalization_changes))
8867db96d56Sopenharmony_ci
8877db96d56Sopenharmony_ci
# Directory (relative to the current working directory) where downloaded
# Unicode data files are cached.
DATA_DIR = os.path.join('Tools', 'unicode', 'data')

def open_data(template, version):
    '''
    Open a Unicode data file for `version`, downloading it on first use.

    `template` is a file name containing one %s placeholder.  The local
    copy lives in DATA_DIR under the name template % ('-' + version);
    if it does not exist yet it is fetched from www.unicode.org.

    Returns a text file object (UTF-8) for names ending in '.txt', and a
    binary file object for anything else.
    '''
    cached_path = os.path.join(DATA_DIR, template % ('-'+version,))
    if not os.path.exists(cached_path):
        import urllib.request
        if version != '3.2.0':
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        else:
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=cached_path)
    if not cached_path.endswith('.txt'):
        # Unihan.zip
        return open(cached_path, 'rb')
    return open(cached_path, encoding='utf-8')
9067db96d56Sopenharmony_ci
9077db96d56Sopenharmony_ci
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Yield each code point covered by a UAX #44 code point range.

    `char_range` is either a single hexadecimal code point ("0041") or an
    inclusive range written as "first..last" ("0041..005A").  See:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    '''
    bounds = [int(part, 16) for part in char_range.split('..')]
    if len(bounds) == 1:
        # a lone code point is a range of length one
        first = last = bounds[0]
    else:
        first, last = bounds
    yield from range(first, last + 1)
9197db96d56Sopenharmony_ci
9207db96d56Sopenharmony_ci
class UcdFile:
    '''
    One data file laid out in the standard UCD text format.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    The Unihan data files use a separate format of their own (as the
    page above explains) and are not handled by this class.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.version = version
        self.template = template

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def records(self) -> Iterator[List[str]]:
        '''Yield each data line as a list of stripped ';'-separated fields.'''
        with open_data(self.template, self.version) as stream:
            for raw_line in stream:
                # everything from the first '#' on is a comment
                content = raw_line.split('#', 1)[0].strip()
                if content:
                    yield [column.strip() for column in content.split(';')]

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        '''
        Yield (code point, remaining fields) for every code point covered
        by the range in each record's first field.
        '''
        for fields in self.records():
            properties = fields[1:]
            for codepoint in expand_range(fields[0]):
                yield codepoint, properties
9517db96d56Sopenharmony_ci
9527db96d56Sopenharmony_ci
@dataclasses.dataclass
class UcdRecord:
    '''
    The properties stored for a single code point.

    The first 15 fields mirror one row of UnicodeData.txt; the last three
    are filled in from other data files.  Instances are created
    positionally by from_row(), and merge_old_version() dispatches on the
    numeric position of each field, so the field order must not change.
    '''

    # 15 fields from UnicodeData.txt .  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    # NOTE(review): merge_old_version() treats a change in this field
    # (index 5) as a normalization change.
    decomposition_type: str
    # NOTE(review): elsewhere in this file this field (index 6) is read
    # as the decimal digit value and numeric_type (index 7) as the digit
    # value -- the names appear shifted relative to the file columns;
    # confirm before relying on the names.
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    # from_row() initializes this to None.
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    # from_row() initializes this to an empty set.
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    # from_row() initializes this to 0.
    quick_check: int
9867db96d56Sopenharmony_ci
9877db96d56Sopenharmony_ci
def from_row(row: List[str]) -> UcdRecord:
    '''
    Build a UcdRecord from the 15 fields of a UnicodeData.txt row.

    The fields that come from other data files start out empty:
    no east asian width, no binary properties, and a zero quick-check mask.
    '''
    return UcdRecord(
        *row,
        east_asian_width=None,
        binary_properties=set(),
        quick_check=0,
    )
9907db96d56Sopenharmony_ci
9917db96d56Sopenharmony_ci
9927db96d56Sopenharmony_ci# --------------------------------------------------------------------
9937db96d56Sopenharmony_ci# the following support code is taken from the unidb utilities
9947db96d56Sopenharmony_ci# Copyright (c) 1999-2000 by Secret Labs AB
9957db96d56Sopenharmony_ci
9967db96d56Sopenharmony_ci# load a unicode-data file from disk
9977db96d56Sopenharmony_ci
9987db96d56Sopenharmony_ciclass UnicodeData:
9997db96d56Sopenharmony_ci    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned
10007db96d56Sopenharmony_ci
10017db96d56Sopenharmony_ci    def __init__(self, version, cjk_check=True):
10027db96d56Sopenharmony_ci        self.changed = []
10037db96d56Sopenharmony_ci        table = [None] * 0x110000
10047db96d56Sopenharmony_ci        for s in UcdFile(UNICODE_DATA, version):
10057db96d56Sopenharmony_ci            char = int(s[0], 16)
10067db96d56Sopenharmony_ci            table[char] = from_row(s)
10077db96d56Sopenharmony_ci
10087db96d56Sopenharmony_ci        cjk_ranges_found = []
10097db96d56Sopenharmony_ci
10107db96d56Sopenharmony_ci        # expand first-last ranges
10117db96d56Sopenharmony_ci        field = None
10127db96d56Sopenharmony_ci        for i in range(0, 0x110000):
10137db96d56Sopenharmony_ci            # The file UnicodeData.txt has its own distinct way of
10147db96d56Sopenharmony_ci            # expressing ranges.  See:
10157db96d56Sopenharmony_ci            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
10167db96d56Sopenharmony_ci            s = table[i]
10177db96d56Sopenharmony_ci            if s:
10187db96d56Sopenharmony_ci                if s.name[-6:] == "First>":
10197db96d56Sopenharmony_ci                    s.name = ""
10207db96d56Sopenharmony_ci                    field = dataclasses.astuple(s)[:15]
10217db96d56Sopenharmony_ci                elif s.name[-5:] == "Last>":
10227db96d56Sopenharmony_ci                    if s.name.startswith("<CJK Ideograph"):
10237db96d56Sopenharmony_ci                        cjk_ranges_found.append((field[0],
10247db96d56Sopenharmony_ci                                                 s.codepoint))
10257db96d56Sopenharmony_ci                    s.name = ""
10267db96d56Sopenharmony_ci                    field = None
10277db96d56Sopenharmony_ci            elif field:
10287db96d56Sopenharmony_ci                table[i] = from_row(('%X' % i,) + field[1:])
10297db96d56Sopenharmony_ci        if cjk_check and cjk_ranges != cjk_ranges_found:
10307db96d56Sopenharmony_ci            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
10317db96d56Sopenharmony_ci
10327db96d56Sopenharmony_ci        # public attributes
10337db96d56Sopenharmony_ci        self.filename = UNICODE_DATA % ''
10347db96d56Sopenharmony_ci        self.table = table
10357db96d56Sopenharmony_ci        self.chars = list(range(0x110000)) # unicode 3.2
10367db96d56Sopenharmony_ci
10377db96d56Sopenharmony_ci        # check for name aliases and named sequences, see #12753
10387db96d56Sopenharmony_ci        # aliases and named sequences are not in 3.2.0
10397db96d56Sopenharmony_ci        if version != '3.2.0':
10407db96d56Sopenharmony_ci            self.aliases = []
10417db96d56Sopenharmony_ci            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
10427db96d56Sopenharmony_ci            # in order to take advantage of the compression and lookup
10437db96d56Sopenharmony_ci            # algorithms used for the other characters
10447db96d56Sopenharmony_ci            pua_index = NAME_ALIASES_START
10457db96d56Sopenharmony_ci            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
10467db96d56Sopenharmony_ci                char = int(char, 16)
10477db96d56Sopenharmony_ci                self.aliases.append((name, char))
10487db96d56Sopenharmony_ci                # also store the name in the PUA 1
10497db96d56Sopenharmony_ci                self.table[pua_index].name = name
10507db96d56Sopenharmony_ci                pua_index += 1
10517db96d56Sopenharmony_ci            assert pua_index - NAME_ALIASES_START == len(self.aliases)
10527db96d56Sopenharmony_ci
10537db96d56Sopenharmony_ci            self.named_sequences = []
10547db96d56Sopenharmony_ci            # store named sequences in the PUA 1, in range U+F0100..,
10557db96d56Sopenharmony_ci            # in order to take advantage of the compression and lookup
10567db96d56Sopenharmony_ci            # algorithms used for the other characters.
10577db96d56Sopenharmony_ci
10587db96d56Sopenharmony_ci            assert pua_index < NAMED_SEQUENCES_START
10597db96d56Sopenharmony_ci            pua_index = NAMED_SEQUENCES_START
10607db96d56Sopenharmony_ci            for name, chars in UcdFile(NAMED_SEQUENCES, version):
10617db96d56Sopenharmony_ci                chars = tuple(int(char, 16) for char in chars.split())
10627db96d56Sopenharmony_ci                # check that the structure defined in makeunicodename is OK
10637db96d56Sopenharmony_ci                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
10647db96d56Sopenharmony_ci                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
10657db96d56Sopenharmony_ci                    "the NamedSequence struct and in unicodedata_lookup")
10667db96d56Sopenharmony_ci                self.named_sequences.append((name, chars))
10677db96d56Sopenharmony_ci                # also store these in the PUA 1
10687db96d56Sopenharmony_ci                self.table[pua_index].name = name
10697db96d56Sopenharmony_ci                pua_index += 1
10707db96d56Sopenharmony_ci            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
10717db96d56Sopenharmony_ci
10727db96d56Sopenharmony_ci        self.exclusions = {}
10737db96d56Sopenharmony_ci        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
10747db96d56Sopenharmony_ci            char = int(char, 16)
10757db96d56Sopenharmony_ci            self.exclusions[char] = 1
10767db96d56Sopenharmony_ci
10777db96d56Sopenharmony_ci        widths = [None] * 0x110000
10787db96d56Sopenharmony_ci        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
10797db96d56Sopenharmony_ci            widths[char] = width
10807db96d56Sopenharmony_ci
10817db96d56Sopenharmony_ci        for i in range(0, 0x110000):
10827db96d56Sopenharmony_ci            if table[i] is not None:
10837db96d56Sopenharmony_ci                table[i].east_asian_width = widths[i]
10847db96d56Sopenharmony_ci
10857db96d56Sopenharmony_ci        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
10867db96d56Sopenharmony_ci            if table[char]:
10877db96d56Sopenharmony_ci                # Some properties (e.g. Default_Ignorable_Code_Point)
10887db96d56Sopenharmony_ci                # apply to unassigned code points; ignore them
10897db96d56Sopenharmony_ci                table[char].binary_properties.add(p)
10907db96d56Sopenharmony_ci
10917db96d56Sopenharmony_ci        for char_range, value in UcdFile(LINE_BREAK, version):
10927db96d56Sopenharmony_ci            if value not in MANDATORY_LINE_BREAKS:
10937db96d56Sopenharmony_ci                continue
10947db96d56Sopenharmony_ci            for char in expand_range(char_range):
10957db96d56Sopenharmony_ci                table[char].binary_properties.add('Line_Break')
10967db96d56Sopenharmony_ci
10977db96d56Sopenharmony_ci        # We only want the quickcheck properties
10987db96d56Sopenharmony_ci        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
10997db96d56Sopenharmony_ci        # Yes is the default, hence only N and M occur
11007db96d56Sopenharmony_ci        # In 3.2.0, the format was different (NF?_NO)
11017db96d56Sopenharmony_ci        # The parsing will incorrectly determine these as
11027db96d56Sopenharmony_ci        # "yes", however, unicodedata.c will not perform quickchecks
11037db96d56Sopenharmony_ci        # for older versions, and no delta records will be created.
11047db96d56Sopenharmony_ci        quickchecks = [0] * 0x110000
11057db96d56Sopenharmony_ci        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
11067db96d56Sopenharmony_ci        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
11077db96d56Sopenharmony_ci            if len(s) < 2 or s[1] not in qc_order:
11087db96d56Sopenharmony_ci                continue
11097db96d56Sopenharmony_ci            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
11107db96d56Sopenharmony_ci            quickcheck_shift = qc_order.index(s[1])*2
11117db96d56Sopenharmony_ci            quickcheck <<= quickcheck_shift
11127db96d56Sopenharmony_ci            for char in expand_range(s[0]):
11137db96d56Sopenharmony_ci                assert not (quickchecks[char]>>quickcheck_shift)&3
11147db96d56Sopenharmony_ci                quickchecks[char] |= quickcheck
11157db96d56Sopenharmony_ci        for i in range(0, 0x110000):
11167db96d56Sopenharmony_ci            if table[i] is not None:
11177db96d56Sopenharmony_ci                table[i].quick_check = quickchecks[i]
11187db96d56Sopenharmony_ci
11197db96d56Sopenharmony_ci        with open_data(UNIHAN, version) as file:
11207db96d56Sopenharmony_ci            zip = zipfile.ZipFile(file)
11217db96d56Sopenharmony_ci            if version == '3.2.0':
11227db96d56Sopenharmony_ci                data = zip.open('Unihan-3.2.0.txt').read()
11237db96d56Sopenharmony_ci            else:
11247db96d56Sopenharmony_ci                data = zip.open('Unihan_NumericValues.txt').read()
11257db96d56Sopenharmony_ci        for line in data.decode("utf-8").splitlines():
11267db96d56Sopenharmony_ci            if not line.startswith('U+'):
11277db96d56Sopenharmony_ci                continue
11287db96d56Sopenharmony_ci            code, tag, value = line.split(None, 3)[:3]
11297db96d56Sopenharmony_ci            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
11307db96d56Sopenharmony_ci                           'kOtherNumeric'):
11317db96d56Sopenharmony_ci                continue
11327db96d56Sopenharmony_ci            value = value.strip().replace(',', '')
11337db96d56Sopenharmony_ci            i = int(code[2:], 16)
11347db96d56Sopenharmony_ci            # Patch the numeric field
11357db96d56Sopenharmony_ci            if table[i] is not None:
11367db96d56Sopenharmony_ci                table[i].numeric_value = value
11377db96d56Sopenharmony_ci
11387db96d56Sopenharmony_ci        sc = self.special_casing = {}
11397db96d56Sopenharmony_ci        for data in UcdFile(SPECIAL_CASING, version):
11407db96d56Sopenharmony_ci            if data[4]:
11417db96d56Sopenharmony_ci                # We ignore all conditionals (since they depend on
11427db96d56Sopenharmony_ci                # languages) except for one, which is hardcoded. See
11437db96d56Sopenharmony_ci                # handle_capital_sigma in unicodeobject.c.
11447db96d56Sopenharmony_ci                continue
11457db96d56Sopenharmony_ci            c = int(data[0], 16)
11467db96d56Sopenharmony_ci            lower = [int(char, 16) for char in data[1].split()]
11477db96d56Sopenharmony_ci            title = [int(char, 16) for char in data[2].split()]
11487db96d56Sopenharmony_ci            upper = [int(char, 16) for char in data[3].split()]
11497db96d56Sopenharmony_ci            sc[c] = (lower, title, upper)
11507db96d56Sopenharmony_ci
11517db96d56Sopenharmony_ci        cf = self.case_folding = {}
11527db96d56Sopenharmony_ci        if version != '3.2.0':
11537db96d56Sopenharmony_ci            for data in UcdFile(CASE_FOLDING, version):
11547db96d56Sopenharmony_ci                if data[1] in "CF":
11557db96d56Sopenharmony_ci                    c = int(data[0], 16)
11567db96d56Sopenharmony_ci                    cf[c] = [int(char, 16) for char in data[2].split()]
11577db96d56Sopenharmony_ci
11587db96d56Sopenharmony_ci    def uselatin1(self):
11597db96d56Sopenharmony_ci        # restrict character range to ISO Latin 1
11607db96d56Sopenharmony_ci        self.chars = list(range(256))
11617db96d56Sopenharmony_ci
11627db96d56Sopenharmony_ci
11637db96d56Sopenharmony_ci# hash table tools
11647db96d56Sopenharmony_ci
11657db96d56Sopenharmony_ci# this is a straight-forward reimplementation of Python's built-in
11667db96d56Sopenharmony_ci# dictionary type, using a static data structure, and a custom string
11677db96d56Sopenharmony_ci# hash algorithm.
11687db96d56Sopenharmony_ci
def myhash(s, magic):
    """Return the custom string hash of *s* for the multiplier *magic*.

    The string is upper-cased before hashing, so the result does not
    depend on the case of *s*.  For every character the accumulator is
    multiplied by *magic* and the code point is added; whenever bits
    24..31 of the accumulator are set, they are XOR-ed into the low
    byte and the value is cut back to its low 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        top = (acc >> 24) & 0xff
        if top:
            # fold the overflowing byte back in and keep 24 bits
            acc = (acc ^ top) & 0x00ffffff
    return acc
11777db96d56Sopenharmony_ci
11787db96d56Sopenharmony_ci
# (size, poly) pairs, ordered by increasing size.  Hash picks the first
# entry whose size is larger than the number of items to store and uses
# size + poly to scramble the probe increment once it outgrows the mask.
SIZES = [
    (4, 3), (8, 3), (16, 3), (32, 5),
    (64, 3), (128, 3), (256, 29), (512, 17),
    (1024, 9), (2048, 5), (4096, 83), (8192, 27),
    (16384, 43), (32768, 3), (65536, 45), (131072, 9),
    (262144, 39), (524288, 39), (1048576, 9), (2097152, 5),
    (4194304, 3), (8388608, 33), (16777216, 27),
]
11857db96d56Sopenharmony_ci
11867db96d56Sopenharmony_ci
class Hash:
    """Static hash table built from (key, value) pairs.

    Only the values are stored; the keys merely decide which slot a
    value lands in.  The finished table is kept as an Array named
    "<name>_hash" and can be written out as a C array together with the
    parameters (magic, size, poly) that were used to build it.
    """

    def __init__(self, name, data, magic):
        """Place every value of *data* into a freshly sized table.

        name is the prefix used for the generated identifiers, data is a
        sequence of (key, value) pairs and magic is the multiplier
        handed to myhash().  Raises AssertionError when no entry of
        SIZES is larger than len(data).
        """
        # determine table size: first entry strictly larger than the data
        wanted = len(data)
        for size, poly in SIZES:
            if size > wanted:
                break
        else:
            raise AssertionError("ran out of polynomials")
        poly += size

        print(size, "slots in hash table")

        slots = [None] * size
        mask = size - 1
        collisions = 0

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            pos = ~h & mask
            if slots[pos] is None:
                slots[pos] = value
                continue
            # home slot is taken: walk the probe sequence until a free
            # slot shows up, counting every extra probe as a collision
            step = (h ^ (h >> 3)) & mask
            if not step:
                step = mask
            while True:
                collisions += 1
                pos = (pos + step) & mask
                if slots[pos] is None:
                    slots[pos] = value
                    break
                step <<= 1
                if step > mask:
                    step ^= poly

        print(collisions, "collisions")
        self.collisions = collisions

        # slots that were never filled are emitted as 0
        filled = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", filled)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        """Write the table to *file* as a C array plus three #defines."""
        self.data.dump(file, trace)
        prefix = self.name
        file.write("#define %s_magic %d\n" % (prefix, self.magic))
        file.write("#define %s_size %d\n" % (prefix, self.size))
        file.write("#define %s_poly %d\n" % (prefix, self.poly))
12507db96d56Sopenharmony_ci
12517db96d56Sopenharmony_ci
12527db96d56Sopenharmony_ci# stuff to deal with arrays of unsigned integers
12537db96d56Sopenharmony_ci
class Array:
    """A named sequence of unsigned integers that can be dumped as C."""

    def __init__(self, name, data):
        self.name, self.data = name, data

    def dump(self, file, trace=0):
        """Write the data to *file* as a ``static const`` C array.

        The element type is chosen from the item width reported by
        getsize(): unsigned char for 1, unsigned short for 2 and
        unsigned int otherwise.  Items are written as comma-separated
        decimal numbers on lines indented by four spaces and wrapped at
        78 columns.  If *trace* is true, the array name and its size in
        bytes are printed to sys.stderr.
        """
        width = getsize(self.data)
        if trace:
            print(self.name+":", width*len(self.data), "bytes", file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(
            width, "unsigned int")
        file.write("static const ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            indent = "    "
            line = indent
            for item in self.data:
                chunk = str(item) + ", "
                if len(line) + len(chunk) > 78:
                    # current line is full: flush it and start a new one
                    file.write(line.rstrip() + "\n")
                    line = indent + chunk
                else:
                    line += chunk
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("};\n\n")
12857db96d56Sopenharmony_ci
12867db96d56Sopenharmony_ci
def getsize(data):
    """Return the smallest possible integer size, in bytes, for *data*.

    data is an iterable of the unsigned integers to be stored.  The
    result is 1 if the largest item is below 256, 2 if it is below
    65536 and 4 otherwise.

    An empty *data* has no largest item; it is treated as if it only
    held zeros, so 1 is returned instead of letting max() raise
    ValueError.  This keeps callers that accept empty tables working.
    """
    maxdata = max(data, default=0)
    if maxdata < 256:
        return 1
    if maxdata < 65536:
        return 2
    return 4
12967db96d56Sopenharmony_ci
12977db96d56Sopenharmony_ci
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints; splitting pays off when many of them are
    equal.  The table is cut into bins of 2**shift items, identical
    bins are stored once in t2, and t1 records for each bin where its
    copy starts in t2 (divided by the bin size).  Every shift that
    leaves something of the last index is tried and the one with the
    smallest combined size of t1 and t2 (as C arrays) is kept; on a tie
    the smaller shift wins.  For each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    A non-zero trace prints progress information to sys.stderr; values
    above 1 additionally report every shift that was tried.
    """

    def report(first, second, shift, nbytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(first), len(second), shift, nbytes), file=sys.stderr)

    if trace:
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)

    # the most we can shift the last valid index and still have
    # something left
    last = len(t) - 1
    maxshift = last.bit_length() - 1 if last > 0 else 0

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 1 << shift
        index_table = []
        pool = []
        seen = {}
        for start in range(0, len(t), binsize):
            chunk = t[start:start + binsize]
            offset = seen.get(chunk)
            if offset is None:
                # first occurrence of this bin: append it to the pool
                offset = seen[chunk] = len(pool)
                pool.extend(chunk)
            index_table.append(offset >> shift)
        # determine memory size
        cost = (len(index_table)*getsize(index_table)
                + len(pool)*getsize(pool))
        if trace > 1:
            report(index_table, pool, shift, cost)
        if cost < smallest:
            best = index_table, pool, shift
            smallest = cost
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1     # low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
13587db96d56Sopenharmony_ci
13597db96d56Sopenharmony_ci
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
# NOTE(review): the argument 1 is presumably the trace level -- confirm
# against the definition of maketables().
if __name__ == "__main__":
    maketables(1)
1362