#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

# Script name and generator version; both are written into the banner comment
# of the generated header files.
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "14.0.0"

# File-name templates for the UCD input files.  The "%s" slot receives a
# version suffix: "" for the current version, "-<version>" for an old one
# (see the "--- Reading" messages printed by maketables()).
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 0 (BMP), 15 and 16.
# PUA_1 is the first of the three areas; its range 0xE000..0xF8FF lies below
# 0x10000, i.e. in plane 0, not plane 1.
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Older UCD versions for which delta ("change record") tables are generated
# in addition to the tables for UNIDATA_VERSION.
old_versions = ["3.2.0"]

# Category names, indexed by the small integer stored in the generated
# records.  "Cn" is listed twice (index 0 and index 17); list.index(), as
# used in makeunicodedata(), always returns the first occurrence.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# Bidirectional class names, indexed the same way; index 0 is the empty name.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# (each entry is a (first, last) pair of hexadecimal code point strings)
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFF'),
    ('20000', '2A6DF'),
    ('2A700', '2B738'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
]


def maketables(trace=0):
    """Read the Unicode database and regenerate all three header files.

    Loads the data for UNIDATA_VERSION, then for every entry of
    ``old_versions`` loads that version (with ``cjk_check=False``) and hands
    both to merge_old_version().  Finally calls makeunicodename(),
    makeunicodedata() and makeunicodetype(), in that order.  *trace* is
    passed through unchanged to the three generators.
    """

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    # Entries of unicode.table that are falsy are not counted.
    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
    """Generate Modules/unicodedata_db.h from *unicode*.

    The header receives, in this order: the list of unique property records,
    the NFC first/last reindexing tables, the name string tables, the
    index1/index2/shift triple returned by splitbins() for the record index,
    the decomposition data with its own split index, the NFC pair table, and
    one set of change-record tables plus accessor functions for each entry
    of ``unicode.changed``.  *trace* is forwarded to splitbins() and
    Array.dump().

    Raises Exception when a decomposition has more than 19 fields.
    """

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # NOTE(review): the cache is used below as "item -> index"
    # (cache.get(item), cache[item] = i), but it is seeded as "0 -> dummy".
    # A record equal to `dummy` would therefore miss the cache once and be
    # appended a second time.  Left as is, since changing it would alter the
    # layout of the generated table -- confirm before touching.
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    # decomp_data is one flat list of integers; entry 0 is reserved so that
    # a decomp_index value of 0 means "no decomposition".
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    # comp_first/comp_last start out as per-code-point markers (None or 1)
    # and are renumbered to dense indexes further down.
    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # NOTE(review): the whole decomposition string (optional "<tag>"
            # followed by hex code points) is read from the attribute named
            # `decomposition_type` -- presumably the record's field names are
            # offset from their content; verify against the record class.
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                # the prefix index must fit in the low 8 bits of the header
                # word built below
                assert prefix < 256
                # content
                # header word: prefix index in the low byte, number of code
                # points shifted left by 8; followed by the code points
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                # (no prefix tag, exactly two code points, not excluded, and
                # the first code point has combining class "0")
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                # NOTE(review): decomp_data holds integers while `decomp` is
                # a list, so this lookup looks like it can never succeed and
                # every decomposition is appended -- confirm.
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    # size statistic only: two bytes counted per entry
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    # Renumber the marked first/last characters densely (0, 1, 2, ...) and
    # collect runs of consecutive code points as (start, end) tuples.
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    # flush the run that was still open when the loop ended
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    # total_first x total_last matrix stored row by row; 0 means "no pair"
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint(" {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        # each row: first code point of the run, end-start (run length minus
        # one), dense index of the first code point; {0,0,0} ends the table
        for start,end in comp_first_ranges:
            fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint(" {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint(" {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint(" \"%s\"," % name)
        fprint(" NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint(" \"%s\"," % name)
        fprint(" NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint(" \"%s\"," % name)
        fprint(" NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint(" \"%s\"," % name)
        fprint(" NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        # split the NFC pair matrix; note that `index` is rebound here
        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        # (`table`, `cache` and `index` are rebound per version from here on)
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            # deduplicate the change records, keeping first-seen order
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint(" { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            # emit the C lookup function for this version's change records
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint(" int index;")
            fprint(" if (n >= 0x110000) index = 0;")
            fprint(" else {")
            fprint(" index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint(" }")
            fprint(" return change_records_%s+index;" % cversion)
            fprint("}\n")
            # emit the C function for this version's normalization changes;
            # `v` is written after a "0x" prefix, `k` is formatted with hex()
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint(" switch(n) {")
            for k, v in normalization:
                fprint(" case %s: return 0x%s;" % (hex(k), v))
            fprint(" default: return 0;")
            fprint(" }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):
    """Generate Objects/unicodetype_db.h from *unicode*.

    Builds one (upper, lower, title, decimal, digit, flags) record per
    distinct combination, the extended-case array, and the index into the
    records, then writes them together with the C functions
    _PyUnicode_ToNumeric(), _PyUnicode_IsWhitespace() and
    _PyUnicode_IsLinebreak().  *trace* is forwarded to splitbins() and
    Array.dump().
    """

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    # NOTE(review): seeded as "0 -> dummy" although the cache is used as
    # "item -> index" below, so an all-zero record is appended a second time
    # instead of reusing index 0.  Left as is because it affects the layout
    # of the generated table -- confirm before touching.
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            # the space character is printable although its category starts
            # with "Z"
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                # no explicit titlecase mapping: fall back to the uppercase
                title = upper
            # a case folding that differs from the simple lowercase also
            # forces the extended representation below
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                # simple case: store the mappings as deltas from the code
                # point itself (all zero when the character has no mapping)
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                # each field packs an offset into extra_casing in the low
                # bits and the mapping length shifted left by 24; `lower`
                # additionally carries the case-fold length shifted left by
                # 20 when the folding differs from the lowercase mapping
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            # NOTE(review): the decimal value is read from the attribute
            # named `decomposition_mapping` and the digit value from
            # `numeric_type` -- presumably the record's field names are
            # offset from their content; verify against the record class.
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                # group code points by the textual numeric value
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint(" {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint(" %d," % c)
        fprint("};")
        fprint()

        # split type record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint(' switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            # (a value written as "a/b" becomes a division of two floats)
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            # all code points sharing a value fall through to one return
            codepoints.sort()
            for codepoint in codepoints:
                fprint(' case 0x%04X:' % (codepoint,))
            fprint(' return (double) %s;' % (value,))
        fprint(' }')
        fprint(' return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint(' switch (ch) {')

        for codepoint in sorted(spaces):
            fprint(' case 0x%04X:' % (codepoint,))
        fprint(' return 1;')

        fprint(' }')
        fprint(' return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint(' switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint(' case 0x%04X:' % (codepoint,))
        fprint(' return 1;')

        fprint(' }')
        fprint(' return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence. the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and
sort the rest on falling 6687db96d56Sopenharmony_ci # length (to maximize overlap) 6697db96d56Sopenharmony_ci 6707db96d56Sopenharmony_ci wordlist, wordtail = wordlist[:short], wordlist[short:] 6717db96d56Sopenharmony_ci wordtail.sort(key=lambda a: a[0], reverse=True) 6727db96d56Sopenharmony_ci wordlist.extend(wordtail) 6737db96d56Sopenharmony_ci 6747db96d56Sopenharmony_ci # generate lexicon from words 6757db96d56Sopenharmony_ci 6767db96d56Sopenharmony_ci lexicon_offset = [0] 6777db96d56Sopenharmony_ci lexicon = "" 6787db96d56Sopenharmony_ci words = {} 6797db96d56Sopenharmony_ci 6807db96d56Sopenharmony_ci # build a lexicon string 6817db96d56Sopenharmony_ci offset = 0 6827db96d56Sopenharmony_ci for w, x in wordlist: 6837db96d56Sopenharmony_ci # encoding: bit 7 indicates last character in word (chr(128) 6847db96d56Sopenharmony_ci # indicates the last character in an entire string) 6857db96d56Sopenharmony_ci ww = w[:-1] + chr(ord(w[-1])+128) 6867db96d56Sopenharmony_ci # reuse string tails, when possible 6877db96d56Sopenharmony_ci o = lexicon.find(ww) 6887db96d56Sopenharmony_ci if o < 0: 6897db96d56Sopenharmony_ci o = offset 6907db96d56Sopenharmony_ci lexicon = lexicon + ww 6917db96d56Sopenharmony_ci offset = offset + len(w) 6927db96d56Sopenharmony_ci words[w] = len(lexicon_offset) 6937db96d56Sopenharmony_ci lexicon_offset.append(o) 6947db96d56Sopenharmony_ci 6957db96d56Sopenharmony_ci lexicon = list(map(ord, lexicon)) 6967db96d56Sopenharmony_ci 6977db96d56Sopenharmony_ci # generate phrasebook from names and lexicon 6987db96d56Sopenharmony_ci phrasebook = [0] 6997db96d56Sopenharmony_ci phrasebook_offset = [0] * len(unicode.chars) 7007db96d56Sopenharmony_ci for char in unicode.chars: 7017db96d56Sopenharmony_ci name = names[char] 7027db96d56Sopenharmony_ci if name: 7037db96d56Sopenharmony_ci w = name.split() 7047db96d56Sopenharmony_ci phrasebook_offset[char] = len(phrasebook) 7057db96d56Sopenharmony_ci for w in w: 7067db96d56Sopenharmony_ci i = words[w] 
7077db96d56Sopenharmony_ci if i < short: 7087db96d56Sopenharmony_ci phrasebook.append(i) 7097db96d56Sopenharmony_ci else: 7107db96d56Sopenharmony_ci # store as two bytes 7117db96d56Sopenharmony_ci phrasebook.append((i>>8) + short) 7127db96d56Sopenharmony_ci phrasebook.append(i&255) 7137db96d56Sopenharmony_ci 7147db96d56Sopenharmony_ci assert getsize(phrasebook) == 1 7157db96d56Sopenharmony_ci 7167db96d56Sopenharmony_ci # 7177db96d56Sopenharmony_ci # unicode name hash table 7187db96d56Sopenharmony_ci 7197db96d56Sopenharmony_ci # extract names 7207db96d56Sopenharmony_ci data = [] 7217db96d56Sopenharmony_ci for char in unicode.chars: 7227db96d56Sopenharmony_ci record = unicode.table[char] 7237db96d56Sopenharmony_ci if record: 7247db96d56Sopenharmony_ci name = record.name.strip() 7257db96d56Sopenharmony_ci if name and name[0] != "<": 7267db96d56Sopenharmony_ci data.append((name, char)) 7277db96d56Sopenharmony_ci 7287db96d56Sopenharmony_ci # the magic number 47 was chosen to minimize the number of 7297db96d56Sopenharmony_ci # collisions on the current data set. if you like, change it 7307db96d56Sopenharmony_ci # and see what happens... 
7317db96d56Sopenharmony_ci 7327db96d56Sopenharmony_ci codehash = Hash("code", data, 47) 7337db96d56Sopenharmony_ci 7347db96d56Sopenharmony_ci print("--- Writing", FILE, "...") 7357db96d56Sopenharmony_ci 7367db96d56Sopenharmony_ci with open(FILE, "w") as fp: 7377db96d56Sopenharmony_ci fprint = partial(print, file=fp) 7387db96d56Sopenharmony_ci 7397db96d56Sopenharmony_ci fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION)) 7407db96d56Sopenharmony_ci fprint() 7417db96d56Sopenharmony_ci fprint("#define NAME_MAXLEN", 256) 7427db96d56Sopenharmony_ci fprint() 7437db96d56Sopenharmony_ci fprint("/* lexicon */") 7447db96d56Sopenharmony_ci Array("lexicon", lexicon).dump(fp, trace) 7457db96d56Sopenharmony_ci Array("lexicon_offset", lexicon_offset).dump(fp, trace) 7467db96d56Sopenharmony_ci 7477db96d56Sopenharmony_ci # split decomposition index table 7487db96d56Sopenharmony_ci offset1, offset2, shift = splitbins(phrasebook_offset, trace) 7497db96d56Sopenharmony_ci 7507db96d56Sopenharmony_ci fprint("/* code->name phrasebook */") 7517db96d56Sopenharmony_ci fprint("#define phrasebook_shift", shift) 7527db96d56Sopenharmony_ci fprint("#define phrasebook_short", short) 7537db96d56Sopenharmony_ci 7547db96d56Sopenharmony_ci Array("phrasebook", phrasebook).dump(fp, trace) 7557db96d56Sopenharmony_ci Array("phrasebook_offset1", offset1).dump(fp, trace) 7567db96d56Sopenharmony_ci Array("phrasebook_offset2", offset2).dump(fp, trace) 7577db96d56Sopenharmony_ci 7587db96d56Sopenharmony_ci fprint("/* name->code dictionary */") 7597db96d56Sopenharmony_ci codehash.dump(fp, trace) 7607db96d56Sopenharmony_ci 7617db96d56Sopenharmony_ci fprint() 7627db96d56Sopenharmony_ci fprint('static const unsigned int aliases_start = %#x;' % 7637db96d56Sopenharmony_ci NAME_ALIASES_START) 7647db96d56Sopenharmony_ci fprint('static const unsigned int aliases_end = %#x;' % 7657db96d56Sopenharmony_ci (NAME_ALIASES_START + len(unicode.aliases))) 7667db96d56Sopenharmony_ci 7677db96d56Sopenharmony_ci 
fprint('static const unsigned int name_aliases[] = {') 7687db96d56Sopenharmony_ci for name, codepoint in unicode.aliases: 7697db96d56Sopenharmony_ci fprint(' 0x%04X,' % codepoint) 7707db96d56Sopenharmony_ci fprint('};') 7717db96d56Sopenharmony_ci 7727db96d56Sopenharmony_ci # In Unicode 6.0.0, the sequences contain at most 4 BMP chars, 7737db96d56Sopenharmony_ci # so we are using Py_UCS2 seq[4]. This needs to be updated if longer 7747db96d56Sopenharmony_ci # sequences or sequences with non-BMP chars are added. 7757db96d56Sopenharmony_ci # unicodedata_lookup should be adapted too. 7767db96d56Sopenharmony_ci fprint(dedent(""" 7777db96d56Sopenharmony_ci typedef struct NamedSequence { 7787db96d56Sopenharmony_ci int seqlen; 7797db96d56Sopenharmony_ci Py_UCS2 seq[4]; 7807db96d56Sopenharmony_ci } named_sequence; 7817db96d56Sopenharmony_ci """)) 7827db96d56Sopenharmony_ci 7837db96d56Sopenharmony_ci fprint('static const unsigned int named_sequences_start = %#x;' % 7847db96d56Sopenharmony_ci NAMED_SEQUENCES_START) 7857db96d56Sopenharmony_ci fprint('static const unsigned int named_sequences_end = %#x;' % 7867db96d56Sopenharmony_ci (NAMED_SEQUENCES_START + len(unicode.named_sequences))) 7877db96d56Sopenharmony_ci 7887db96d56Sopenharmony_ci fprint('static const named_sequence named_sequences[] = {') 7897db96d56Sopenharmony_ci for name, sequence in unicode.named_sequences: 7907db96d56Sopenharmony_ci seq_str = ', '.join('0x%04X' % cp for cp in sequence) 7917db96d56Sopenharmony_ci fprint(' {%d, {%s}},' % (len(sequence), seq_str)) 7927db96d56Sopenharmony_ci fprint('};') 7937db96d56Sopenharmony_ci 7947db96d56Sopenharmony_ci 7957db96d56Sopenharmony_cidef merge_old_version(version, new, old): 7967db96d56Sopenharmony_ci # Changes to exclusion file not implemented yet 7977db96d56Sopenharmony_ci if old.exclusions != new.exclusions: 7987db96d56Sopenharmony_ci raise NotImplementedError("exclusions differ") 7997db96d56Sopenharmony_ci 8007db96d56Sopenharmony_ci # In these change records, 0xFF 
means "no change" 8017db96d56Sopenharmony_ci bidir_changes = [0xFF]*0x110000 8027db96d56Sopenharmony_ci category_changes = [0xFF]*0x110000 8037db96d56Sopenharmony_ci decimal_changes = [0xFF]*0x110000 8047db96d56Sopenharmony_ci mirrored_changes = [0xFF]*0x110000 8057db96d56Sopenharmony_ci east_asian_width_changes = [0xFF]*0x110000 8067db96d56Sopenharmony_ci # In numeric data, 0 means "no change", 8077db96d56Sopenharmony_ci # -1 means "did not have a numeric value 8087db96d56Sopenharmony_ci numeric_changes = [0] * 0x110000 8097db96d56Sopenharmony_ci # normalization_changes is a list of key-value pairs 8107db96d56Sopenharmony_ci normalization_changes = [] 8117db96d56Sopenharmony_ci for i in range(0x110000): 8127db96d56Sopenharmony_ci if new.table[i] is None: 8137db96d56Sopenharmony_ci # Characters unassigned in the new version ought to 8147db96d56Sopenharmony_ci # be unassigned in the old one 8157db96d56Sopenharmony_ci assert old.table[i] is None 8167db96d56Sopenharmony_ci continue 8177db96d56Sopenharmony_ci # check characters unassigned in the old version 8187db96d56Sopenharmony_ci if old.table[i] is None: 8197db96d56Sopenharmony_ci # category 0 is "unassigned" 8207db96d56Sopenharmony_ci category_changes[i] = 0 8217db96d56Sopenharmony_ci continue 8227db96d56Sopenharmony_ci # check characters that differ 8237db96d56Sopenharmony_ci if old.table[i] != new.table[i]: 8247db96d56Sopenharmony_ci for k, field in enumerate(dataclasses.fields(UcdRecord)): 8257db96d56Sopenharmony_ci value = getattr(old.table[i], field.name) 8267db96d56Sopenharmony_ci new_value = getattr(new.table[i], field.name) 8277db96d56Sopenharmony_ci if value != new_value: 8287db96d56Sopenharmony_ci if k == 1 and i in PUA_15: 8297db96d56Sopenharmony_ci # the name is not set in the old.table, but in the 8307db96d56Sopenharmony_ci # new.table we are using it for aliases and named seq 8317db96d56Sopenharmony_ci assert value == '' 8327db96d56Sopenharmony_ci elif k == 2: 8337db96d56Sopenharmony_ci 
category_changes[i] = CATEGORY_NAMES.index(value) 8347db96d56Sopenharmony_ci elif k == 4: 8357db96d56Sopenharmony_ci bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value) 8367db96d56Sopenharmony_ci elif k == 5: 8377db96d56Sopenharmony_ci # We assume that all normalization changes are in 1:1 mappings 8387db96d56Sopenharmony_ci assert " " not in value 8397db96d56Sopenharmony_ci normalization_changes.append((i, value)) 8407db96d56Sopenharmony_ci elif k == 6: 8417db96d56Sopenharmony_ci # we only support changes where the old value is a single digit 8427db96d56Sopenharmony_ci assert value in "0123456789" 8437db96d56Sopenharmony_ci decimal_changes[i] = int(value) 8447db96d56Sopenharmony_ci elif k == 8: 8457db96d56Sopenharmony_ci # Since 0 encodes "no change", the old value is better not 0 8467db96d56Sopenharmony_ci if not value: 8477db96d56Sopenharmony_ci numeric_changes[i] = -1 8487db96d56Sopenharmony_ci else: 8497db96d56Sopenharmony_ci numeric_changes[i] = float(value) 8507db96d56Sopenharmony_ci assert numeric_changes[i] not in (0, -1) 8517db96d56Sopenharmony_ci elif k == 9: 8527db96d56Sopenharmony_ci if value == 'Y': 8537db96d56Sopenharmony_ci mirrored_changes[i] = '1' 8547db96d56Sopenharmony_ci else: 8557db96d56Sopenharmony_ci mirrored_changes[i] = '0' 8567db96d56Sopenharmony_ci elif k == 11: 8577db96d56Sopenharmony_ci # change to ISO comment, ignore 8587db96d56Sopenharmony_ci pass 8597db96d56Sopenharmony_ci elif k == 12: 8607db96d56Sopenharmony_ci # change to simple uppercase mapping; ignore 8617db96d56Sopenharmony_ci pass 8627db96d56Sopenharmony_ci elif k == 13: 8637db96d56Sopenharmony_ci # change to simple lowercase mapping; ignore 8647db96d56Sopenharmony_ci pass 8657db96d56Sopenharmony_ci elif k == 14: 8667db96d56Sopenharmony_ci # change to simple titlecase mapping; ignore 8677db96d56Sopenharmony_ci pass 8687db96d56Sopenharmony_ci elif k == 15: 8697db96d56Sopenharmony_ci # change to east asian width 8707db96d56Sopenharmony_ci east_asian_width_changes[i] = 
EASTASIANWIDTH_NAMES.index(value) 8717db96d56Sopenharmony_ci elif k == 16: 8727db96d56Sopenharmony_ci # derived property changes; not yet 8737db96d56Sopenharmony_ci pass 8747db96d56Sopenharmony_ci elif k == 17: 8757db96d56Sopenharmony_ci # normalization quickchecks are not performed 8767db96d56Sopenharmony_ci # for older versions 8777db96d56Sopenharmony_ci pass 8787db96d56Sopenharmony_ci else: 8797db96d56Sopenharmony_ci class Difference(Exception):pass 8807db96d56Sopenharmony_ci raise Difference(hex(i), k, old.table[i], new.table[i]) 8817db96d56Sopenharmony_ci new.changed.append((version, list(zip(bidir_changes, category_changes, 8827db96d56Sopenharmony_ci decimal_changes, mirrored_changes, 8837db96d56Sopenharmony_ci east_asian_width_changes, 8847db96d56Sopenharmony_ci numeric_changes)), 8857db96d56Sopenharmony_ci normalization_changes)) 8867db96d56Sopenharmony_ci 8877db96d56Sopenharmony_ci 8887db96d56Sopenharmony_ciDATA_DIR = os.path.join('Tools', 'unicode', 'data') 8897db96d56Sopenharmony_ci 8907db96d56Sopenharmony_cidef open_data(template, version): 8917db96d56Sopenharmony_ci local = os.path.join(DATA_DIR, template % ('-'+version,)) 8927db96d56Sopenharmony_ci if not os.path.exists(local): 8937db96d56Sopenharmony_ci import urllib.request 8947db96d56Sopenharmony_ci if version == '3.2.0': 8957db96d56Sopenharmony_ci # irregular url structure 8967db96d56Sopenharmony_ci url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,) 8977db96d56Sopenharmony_ci else: 8987db96d56Sopenharmony_ci url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '') 8997db96d56Sopenharmony_ci os.makedirs(DATA_DIR, exist_ok=True) 9007db96d56Sopenharmony_ci urllib.request.urlretrieve(url, filename=local) 9017db96d56Sopenharmony_ci if local.endswith('.txt'): 9027db96d56Sopenharmony_ci return open(local, encoding='utf-8') 9037db96d56Sopenharmony_ci else: 9047db96d56Sopenharmony_ci # Unihan.zip 9057db96d56Sopenharmony_ci return open(local, 'rb') 
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges

    char_range is either a single hexadecimal code point ("0041") or an
    inclusive range written "FIRST..LAST" ("0041..005A").  Yields every
    code point in the range as an int, in ascending order.
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    yield from range(first, last + 1)


class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.

    template is one of the "Name%s.txt" file name templates and version
    the UCD version string; both are handed unchanged to open_data()
    each time the file is iterated.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        '''
        Yield one list of stripped, ';'-separated fields per data line.

        Everything from the first '#' on a line is dropped as a comment,
        and lines that are empty afterwards are skipped.
        '''
        with open_data(self.template, self.version) as file:
            for line in file:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                yield [field.strip() for field in line.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        '''
        Yield (code point, remaining fields) for every code point covered
        by the first field of each record.

        The same list object of remaining fields is yielded for every
        code point of one record.
        '''
        for record in self.records():
            char_range, rest = record[0], record[1:]
            for char in expand_range(char_range):
                yield char, rest


@dataclasses.dataclass
class UcdRecord:
    # 15 fields from UnicodeData.txt.  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    '''
    Build a UcdRecord from the 15 fields of one UnicodeData.txt row.

    The three fields that come from other files start out as None (east
    asian width), an empty set (binary properties) and 0 (quick check).
    Any sequence of 15 strings works; UnicodeData also passes tuples.
    '''
    return UcdRecord(*row, None, set(), 0)


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    '''
    The contents of the UCD files for one Unicode version.

    Attributes set by __init__:
      changed         -- starts as an empty list
      filename        -- UNICODE_DATA with an empty version suffix
      table           -- list indexed by code point; None means unassigned
      chars           -- list of the code points to process
      aliases, named_sequences
                      -- lists of (name, value) pairs; NOT set for '3.2.0'
      exclusions      -- dict whose keys are the composition exclusions
      special_casing  -- code point -> (lower, title, upper) lists
      case_folding    -- code point -> folded code point list
    '''
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        '''
        Load all UCD files for the given version string.

        With cjk_check true, ValueError is raised when the CJK ideograph
        ranges found in UnicodeData.txt differ from the module-level
        cjk_ranges.
        '''
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name.endswith("First>"):
                    s.name = ""
                    # template for all code points up to the "Last>" row
                    field = dataclasses.astuple(s)[:15]
                elif s.name.endswith("Last>"):
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                # inside a First/Last range: copy the template row
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, _abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 15, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            # two bits per property, in qc_order: 1 = Maybe, 2 = No
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                # each property may be given at most once per code point
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        if version == '3.2.0':
            unihan_member = 'Unihan-3.2.0.txt'
        else:
            unihan_member = 'Unihan_NumericValues.txt'
        # Close the archive (and the member stream ZipFile.read() opens
        # internally) deterministically instead of leaving it to the GC.
        with open_data(UNIHAN, version) as file:
            with zipfile.ZipFile(file) as archive:
                data = archive.read(unihan_member)
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                # Only statuses C and F are kept.  Compare against the
                # two statuses explicitly: a substring test on "CF" would
                # also accept "" and "CF".
                if data[1] in ("C", "F"):
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        '''Restrict self.chars to the ISO Latin 1 range (0..255).'''
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))


# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    """Return the case-insensitive string hash used for the static tables.

    *s* is upper-cased, then every character code is folded in with
    ``h = h * magic + code``.  Whenever any of bits 24-31 of the running
    value are set, that byte is XORed into the low byte and the value is
    masked down to its low 24 bits.
    """
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff
    return h


# (table size, polynomial constant) pairs, in increasing size order.
# Hash.__init__ picks the first size larger than the number of entries and
# uses ``size + constant`` to fold the probe increment back into range.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]


class Hash:
    """Static open-addressing hash table that can be written out as C code.

    Attributes set by ``__init__``: ``data`` (an ``Array`` named
    ``<name>_hash`` holding the slots), ``magic``, ``name``, ``size``,
    ``poly`` and ``collisions`` (number of extra probe steps needed while
    filling the table).
    """

    def __init__(self, name, data, magic):
        """Turn a list of ``(key, value)`` pairs into a static hash table.

        Keys are hashed with ``myhash(key, magic)``.  Slots that stay unused
        are stored as 0.  Raises AssertionError when *data* has more entries
        than the largest size in SIZES can hold.
        """
        # determine table size: first entry strictly larger than the data
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size
        mask = size - 1
        n = 0  # probe steps taken after an initial collision

        # initialize hash table
        for key, value in data:
            h = myhash(key, magic)
            i = (~h) & mask
            if table[i] is None:
                table[i] = value
                continue
            # slot taken: derive a non-zero probe increment from the hash
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while True:
                n += 1
                i = (i + incr) & mask
                if table[i] is None:
                    table[i] = value
                    break
                # double the increment; fold it back with poly on overflow
                incr <<= 1
                if incr > mask:
                    incr ^= poly

        print(n, "collisions")
        self.collisions = n

        # unused slots are emitted as 0
        table = [0 if slot is None else slot for slot in table]

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace=0):
        """Write the table to *file* as a C array plus its parameters.

        The slot array is written through ``self.data.dump(file, trace)``,
        followed by ``#define`` lines for ``<name>_magic``, ``<name>_size``
        and ``<name>_poly``.  *trace* defaults to 0, matching Array.dump.
        """
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of
# unsigned integers

class Array:
    """A named sequence of ints that can be written out as a C array."""

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        """Write the data to *file* as a ``static const`` C array.

        The element type is chosen from ``getsize(self.data)``: 1 gives
        ``unsigned char``, 2 gives ``unsigned short``, anything else
        ``unsigned int``.  Items are written comma-separated and wrapped so
        that the buffered line never exceeds 78 characters.  An empty array
        produces just the opening and closing lines.  If *trace* is true the
        array's byte size is reported on sys.stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        ctype = {1: "unsigned char", 2: "unsigned short"}.get(
            size, "unsigned int")
        file.write("static const ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            indent = " "
            s = indent
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    # line is full: flush it and start the next one
                    file.write(s.rstrip() + "\n")
                    s = indent + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")


def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) for *data*.

    The size is derived from the largest item only: below 256 gives 1,
    below 65536 gives 2, otherwise 4.  Empty *data* is treated as if its
    maximum were 0 and therefore yields 1, so callers such as Array.dump
    and splitbins work on empty input instead of failing inside max().
    """
    maxdata = max(data, default=0)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    An empty t yields ([], [], 0).

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    t = tuple(t)    # so slices can be dict keys

    if trace:
        def dump(t1, t2, shift, nbytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, nbytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)

    # the most we can shift the last valid index and still have something
    # left, i.e. floor(log2(last_index)); 0 for tables of fewer than 2 items
    last_index = len(t) - 1
    maxshift = last_index.bit_length() - 1 if last_index > 0 else 0

    best_size = sys.maxsize  # smallest total size so far
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            chunk = t[i:i+size]
            index = bincache.get(chunk)
            if index is None:
                # first time this chunk is seen: append it to t2
                index = len(t2)
                bincache[chunk] = index
                t2.extend(chunk)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < best_size:
            best = t1, t2, shift
            best_size = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, best_size)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


if __name__ == "__main__":
    maketables(1)