12e5b6d6dSopenharmony_ci#!/usr/bin/python3 -B 22e5b6d6dSopenharmony_ci# -*- coding: utf-8 -*- 32e5b6d6dSopenharmony_ci# © 2016 and later: Unicode, Inc. and others. 42e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html 52e5b6d6dSopenharmony_ci# Copyright (c) 2009-2016 International Business Machines 62e5b6d6dSopenharmony_ci# Corporation and others. All Rights Reserved. 72e5b6d6dSopenharmony_ci# 82e5b6d6dSopenharmony_ci# file name: preparseucd.py 92e5b6d6dSopenharmony_ci# encoding: US-ASCII 102e5b6d6dSopenharmony_ci# tab size: 8 (not used) 112e5b6d6dSopenharmony_ci# indentation:4 122e5b6d6dSopenharmony_ci# 132e5b6d6dSopenharmony_ci# created on: 2011nov03 (forked from ucdcopy.py) 142e5b6d6dSopenharmony_ci# created by: Markus W. Scherer 152e5b6d6dSopenharmony_ci# 162e5b6d6dSopenharmony_ci# Copies Unicode Character Database (UCD) files from a tree 172e5b6d6dSopenharmony_ci# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/ 182e5b6d6dSopenharmony_ci# to ICU's source/data/unidata/ and source/test/testdata/ 192e5b6d6dSopenharmony_ci# and modifies some of the files to make them more compact. 202e5b6d6dSopenharmony_ci# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax. 212e5b6d6dSopenharmony_ci# 222e5b6d6dSopenharmony_ci# Invoke with two command-line parameters: 232e5b6d6dSopenharmony_ci# 1. source folder with UCD & idna files 242e5b6d6dSopenharmony_ci# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools) 252e5b6d6dSopenharmony_ci# 262e5b6d6dSopenharmony_ci# Sample invocation: 272e5b6d6dSopenharmony_ci# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src 282e5b6d6dSopenharmony_ci 292e5b6d6dSopenharmony_ciimport array 302e5b6d6dSopenharmony_ciimport bisect 312e5b6d6dSopenharmony_ciimport codecs 322e5b6d6dSopenharmony_ciimport os 332e5b6d6dSopenharmony_ciimport os.path 342e5b6d6dSopenharmony_ciimport re 352e5b6d6dSopenharmony_ciimport shutil 362e5b6d6dSopenharmony_ciimport sys 372e5b6d6dSopenharmony_ci 382e5b6d6dSopenharmony_ci# Unicode version ---------------------------------------------------------- *** 392e5b6d6dSopenharmony_ci 402e5b6d6dSopenharmony_ci_ucd_version = "?" 412e5b6d6dSopenharmony_ci 422e5b6d6dSopenharmony_ci# ISO 15924 script codes --------------------------------------------------- *** 432e5b6d6dSopenharmony_ci 442e5b6d6dSopenharmony_ci# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html 452e5b6d6dSopenharmony_ci# that are not yet in the UCD. 462e5b6d6dSopenharmony_ci_scripts_only_in_iso15924 = ( 472e5b6d6dSopenharmony_ci "Afak", "Blis", "Cirt", "Cyrs", 482e5b6d6dSopenharmony_ci "Egyd", "Egyh", "Geok", 492e5b6d6dSopenharmony_ci "Hanb", "Hans", "Hant", 502e5b6d6dSopenharmony_ci "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma", 512e5b6d6dSopenharmony_ci "Maya", "Moon", "Nkgb", "Phlv", "Roro", 522e5b6d6dSopenharmony_ci "Sara", "Syre", "Syrj", "Syrn", 532e5b6d6dSopenharmony_ci "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx" 542e5b6d6dSopenharmony_ci) 552e5b6d6dSopenharmony_ci 562e5b6d6dSopenharmony_ci# Properties --------------------------------------------------------------- *** 572e5b6d6dSopenharmony_ci 582e5b6d6dSopenharmony_ci# Properties that we do not want to store in ppucd.txt. 592e5b6d6dSopenharmony_ci# Not a frozenset so that we can add aliases for simpler subsequent testing. 602e5b6d6dSopenharmony_ci_ignored_properties = set(( 612e5b6d6dSopenharmony_ci # Other_Xyz only contribute to Xyz, store only the latter. 622e5b6d6dSopenharmony_ci "OAlpha", 632e5b6d6dSopenharmony_ci "ODI", 642e5b6d6dSopenharmony_ci "OGr_Ext", 652e5b6d6dSopenharmony_ci "OIDC", 662e5b6d6dSopenharmony_ci "OIDS", 672e5b6d6dSopenharmony_ci "OLower", 682e5b6d6dSopenharmony_ci "OMath", 692e5b6d6dSopenharmony_ci "OUpper", 702e5b6d6dSopenharmony_ci # Further properties that just contribute to others. 712e5b6d6dSopenharmony_ci "CE", # Composition_Exclusion just contributes to Full_Composition_Exclusion. 722e5b6d6dSopenharmony_ci "JSN", 732e5b6d6dSopenharmony_ci # These properties just don't seem useful. 742e5b6d6dSopenharmony_ci # They are deprecated since Unicode 6.0. 752e5b6d6dSopenharmony_ci "XO_NFC", 762e5b6d6dSopenharmony_ci "XO_NFD", 772e5b6d6dSopenharmony_ci "XO_NFKC", 782e5b6d6dSopenharmony_ci "XO_NFKD", 792e5b6d6dSopenharmony_ci # ICU does not use Unihan properties. 802e5b6d6dSopenharmony_ci "cjkAccountingNumeric", 812e5b6d6dSopenharmony_ci "cjkOtherNumeric", 822e5b6d6dSopenharmony_ci "cjkPrimaryNumeric", 832e5b6d6dSopenharmony_ci "cjkCompatibilityVariant", 842e5b6d6dSopenharmony_ci "cjkIICore", 852e5b6d6dSopenharmony_ci "cjkIRG_GSource", 862e5b6d6dSopenharmony_ci "cjkIRG_HSource", 872e5b6d6dSopenharmony_ci "cjkIRG_JSource", 882e5b6d6dSopenharmony_ci "cjkIRG_KPSource", 892e5b6d6dSopenharmony_ci "cjkIRG_KSource", 902e5b6d6dSopenharmony_ci "cjkIRG_MSource", 912e5b6d6dSopenharmony_ci "cjkIRG_SSource", 922e5b6d6dSopenharmony_ci "cjkIRG_TSource", 932e5b6d6dSopenharmony_ci "cjkIRG_UKSource", 942e5b6d6dSopenharmony_ci "cjkIRG_USource", 952e5b6d6dSopenharmony_ci "cjkIRG_VSource", 962e5b6d6dSopenharmony_ci "cjkRSUnicode" 972e5b6d6dSopenharmony_ci)) 982e5b6d6dSopenharmony_ci 992e5b6d6dSopenharmony_ci# These properties (short names) map code points to 1002e5b6d6dSopenharmony_ci# strings or other unusual values (property types String or Miscellaneous) 1012e5b6d6dSopenharmony_ci# that cannot be block-compressed (or would be confusing). 1022e5b6d6dSopenharmony_ci_uncompressible_props = frozenset(( 1032e5b6d6dSopenharmony_ci "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC", 1042e5b6d6dSopenharmony_ci "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF", 1052e5b6d6dSopenharmony_ci # scx is block-compressible. 1062e5b6d6dSopenharmony_ci "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc" 1072e5b6d6dSopenharmony_ci)) 1082e5b6d6dSopenharmony_ci 1092e5b6d6dSopenharmony_ci# Dictionary of properties. 1102e5b6d6dSopenharmony_ci# Keyed by normalized property names and aliases. 1112e5b6d6dSopenharmony_ci# Each value is a tuple with 1122e5b6d6dSopenharmony_ci# 0: Type of property (binary, enum, ...) 1132e5b6d6dSopenharmony_ci# 1: List of aliases; short & long name followed by other aliases. 1142e5b6d6dSopenharmony_ci# The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt. 1152e5b6d6dSopenharmony_ci# 2: Set of short property value names. 1162e5b6d6dSopenharmony_ci# 3: Dictionary of property values. 1172e5b6d6dSopenharmony_ci# For Catalog & Enumerated properties, 1182e5b6d6dSopenharmony_ci# maps each value name to a list of aliases. 1192e5b6d6dSopenharmony_ci# Empty for other types of properties. 1202e5b6d6dSopenharmony_ci_properties = {} 1212e5b6d6dSopenharmony_ci 1222e5b6d6dSopenharmony_ci# Dictionary of binary-property values which we store as False/True. 1232e5b6d6dSopenharmony_ci# Same as the values dictionary of one of the binary properties. 1242e5b6d6dSopenharmony_ci_binary_values = {} 1252e5b6d6dSopenharmony_ci 1262e5b6d6dSopenharmony_ci# Dictionary of null values. 1272e5b6d6dSopenharmony_ci# Keyed by short property names. 1282e5b6d6dSopenharmony_ci# These are type-specific values for properties that occur in the data. 1292e5b6d6dSopenharmony_ci# They are overridden by _defaults, block and code point properties. 1302e5b6d6dSopenharmony_ci_null_values = {} 1312e5b6d6dSopenharmony_ci 1322e5b6d6dSopenharmony_ci# Property value names for null values. 1332e5b6d6dSopenharmony_ci# We do not store these in _defaults. 1342e5b6d6dSopenharmony_ci_null_names = frozenset(("<none>", "NaN")) 1352e5b6d6dSopenharmony_ci 1362e5b6d6dSopenharmony_ci# Dictionary of explicit default property values. 1372e5b6d6dSopenharmony_ci# Keyed by short property names. 1382e5b6d6dSopenharmony_ci_defaults = {"gc": "Cn"} 1392e5b6d6dSopenharmony_ci 1402e5b6d6dSopenharmony_ci# _null_values overridden by explicit _defaults. 1412e5b6d6dSopenharmony_ci# Initialized after parsing is done. 1422e5b6d6dSopenharmony_ci_null_or_defaults = {} 1432e5b6d6dSopenharmony_ci 1442e5b6d6dSopenharmony_ci# List of properties with an ICU UProperty enum. 1452e5b6d6dSopenharmony_ci# Each item is an (enum, pname, values) tuple. 1462e5b6d6dSopenharmony_ci# - enum: the ICU enum UProperty constant string 1472e5b6d6dSopenharmony_ci# - pname: the UCD short property name 1482e5b6d6dSopenharmony_ci# - values: list of (enum, vname) pairs per property value 1492e5b6d6dSopenharmony_ci# - enum: the ICU property value's enum constant string 1502e5b6d6dSopenharmony_ci# - vname: the UCD short property value name 1512e5b6d6dSopenharmony_ci_icu_properties = [] 1522e5b6d6dSopenharmony_ci 1532e5b6d6dSopenharmony_ci# Dictionary of short property names mapped to _icu_properties items. 1542e5b6d6dSopenharmony_ci_pname_to_icu_prop = {} 1552e5b6d6dSopenharmony_ci 1562e5b6d6dSopenharmony_ci_non_alnum_re = re.compile("[^a-zA-Z0-9]") 1572e5b6d6dSopenharmony_ci 1582e5b6d6dSopenharmony_cidef NormPropName(pname): 1592e5b6d6dSopenharmony_ci """Returns a normalized form of pname. 1602e5b6d6dSopenharmony_ci Removes non-ASCII-alphanumeric characters and lowercases letters.""" 1612e5b6d6dSopenharmony_ci return _non_alnum_re.sub("", pname).lower() 1622e5b6d6dSopenharmony_ci 1632e5b6d6dSopenharmony_ci 1642e5b6d6dSopenharmony_cidef GetProperty(pname): 1652e5b6d6dSopenharmony_ci """Returns the _properties value for the pname. 1662e5b6d6dSopenharmony_ci Returns null if the property is ignored. 1672e5b6d6dSopenharmony_ci Caches alternate spellings of the property name.""" 1682e5b6d6dSopenharmony_ci # Try the input name. 1692e5b6d6dSopenharmony_ci prop = _properties.get(pname) 1702e5b6d6dSopenharmony_ci if prop != None: return prop 1712e5b6d6dSopenharmony_ci if pname in _ignored_properties: return None 1722e5b6d6dSopenharmony_ci # Try the normalized input name. 1732e5b6d6dSopenharmony_ci norm_name = NormPropName(pname) 1742e5b6d6dSopenharmony_ci prop = _properties.get(norm_name) 1752e5b6d6dSopenharmony_ci if prop != None: 1762e5b6d6dSopenharmony_ci _properties[pname] = prop # Cache prop under this new name spelling. 1772e5b6d6dSopenharmony_ci return prop 1782e5b6d6dSopenharmony_ci elif pname in _ignored_properties: 1792e5b6d6dSopenharmony_ci _ignored_properties.add(pname) # Remember to ignore this new name spelling. 1802e5b6d6dSopenharmony_ci return None 1812e5b6d6dSopenharmony_ci else: 1822e5b6d6dSopenharmony_ci raise NameError("unknown property %s\n" % pname) 1832e5b6d6dSopenharmony_ci 1842e5b6d6dSopenharmony_ci 1852e5b6d6dSopenharmony_cidef GetShortPropertyName(pname): 1862e5b6d6dSopenharmony_ci if pname in _null_values: return pname # pname is already the short name. 1872e5b6d6dSopenharmony_ci prop = GetProperty(pname) 1882e5b6d6dSopenharmony_ci if not prop: return "" # For ignored properties. 1892e5b6d6dSopenharmony_ci return prop[1][0] or prop[1][1] # Long name if no short name. 1902e5b6d6dSopenharmony_ci 1912e5b6d6dSopenharmony_ci 1922e5b6d6dSopenharmony_cidef GetShortPropertyValueName(prop, vname): 1932e5b6d6dSopenharmony_ci if vname in prop[2]: return vname 1942e5b6d6dSopenharmony_ci values = prop[3] 1952e5b6d6dSopenharmony_ci aliases = values.get(vname) 1962e5b6d6dSopenharmony_ci if aliases == None: 1972e5b6d6dSopenharmony_ci norm_name = NormPropName(vname) 1982e5b6d6dSopenharmony_ci aliases = values.get(norm_name) 1992e5b6d6dSopenharmony_ci if aliases == None: 2002e5b6d6dSopenharmony_ci raise NameError("unknown value name %s for property %s\n" % 2012e5b6d6dSopenharmony_ci (vname, prop[1][0])) 2022e5b6d6dSopenharmony_ci values[vname] = aliases 2032e5b6d6dSopenharmony_ci return aliases[0] or aliases[1] # Long name if no short name. 2042e5b6d6dSopenharmony_ci 2052e5b6d6dSopenharmony_ci 2062e5b6d6dSopenharmony_cidef NormalizePropertyValue(prop, vname): 2072e5b6d6dSopenharmony_ci if prop[2]: # Binary/Catalog/Enumerated property. 2082e5b6d6dSopenharmony_ci value = GetShortPropertyValueName(prop, vname) 2092e5b6d6dSopenharmony_ci if prop[0] == "Binary": 2102e5b6d6dSopenharmony_ci value = value == "Y" 2112e5b6d6dSopenharmony_ci if prop[1][0].endswith("ccc"): 2122e5b6d6dSopenharmony_ci value = int(value) 2132e5b6d6dSopenharmony_ci else: 2142e5b6d6dSopenharmony_ci value = vname 2152e5b6d6dSopenharmony_ci return value 2162e5b6d6dSopenharmony_ci 2172e5b6d6dSopenharmony_ci# Character data ----------------------------------------------------------- *** 2182e5b6d6dSopenharmony_ci 2192e5b6d6dSopenharmony_ci# Lists of NamesList h1 and h2 headings. 2202e5b6d6dSopenharmony_ci# Each h1 value is a (start, end, comment) tuple. 2212e5b6d6dSopenharmony_ci# Each h2 value is a (cp, comment) tuple. 2222e5b6d6dSopenharmony_ci_h1 = [] 2232e5b6d6dSopenharmony_ci_h2 = [] 2242e5b6d6dSopenharmony_ci 2252e5b6d6dSopenharmony_ci# List of Unicode blocks. 2262e5b6d6dSopenharmony_ci# Each item is a tuple of start & end code point integers 2272e5b6d6dSopenharmony_ci# and a dictionary of default property values. 2282e5b6d6dSopenharmony_ci_blocks = [] 2292e5b6d6dSopenharmony_ci 2302e5b6d6dSopenharmony_ci# List of ranges with algorithmic names. 2312e5b6d6dSopenharmony_ci# Each value is a list of [start, end, type, prefix] 2322e5b6d6dSopenharmony_ci# where prefix is optional. 2332e5b6d6dSopenharmony_ci_alg_names_ranges = [] 2342e5b6d6dSopenharmony_ci 2352e5b6d6dSopenharmony_ci# List of Unicode character ranges and their properties, 2362e5b6d6dSopenharmony_ci# stored as an inversion map with range_start & props dictionary. 2372e5b6d6dSopenharmony_ci# Starts with one range for all of Unicode without any properties. 2382e5b6d6dSopenharmony_ci# Setting values subdivides ranges. 2392e5b6d6dSopenharmony_ci_starts = array.array('l', [0, 0x110000]) # array of int32_t 2402e5b6d6dSopenharmony_ci_props = [{}, {}] # props for 0 and 110000 2412e5b6d6dSopenharmony_ci 2422e5b6d6dSopenharmony_cidef FindRange(x): 2432e5b6d6dSopenharmony_ci """ Binary search for x in the inversion map. 2442e5b6d6dSopenharmony_ci Returns the smallest i where x < _starts[i]""" 2452e5b6d6dSopenharmony_ci return bisect.bisect(_starts, x) - 1 2462e5b6d6dSopenharmony_ci 2472e5b6d6dSopenharmony_ci 2482e5b6d6dSopenharmony_cidef GetProps(c): 2492e5b6d6dSopenharmony_ci i = FindRange(c) 2502e5b6d6dSopenharmony_ci return _props[i] 2512e5b6d6dSopenharmony_ci 2522e5b6d6dSopenharmony_ci 2532e5b6d6dSopenharmony_cidef UpdateProps(start, end, update): 2542e5b6d6dSopenharmony_ci assert 0 <= start <= end <= 0x10ffff 2552e5b6d6dSopenharmony_ci (need_to_update, do_update, u) = (update[0], update[1], update[2]) 2562e5b6d6dSopenharmony_ci # Find the index i of the range in _starts that contains start. 2572e5b6d6dSopenharmony_ci i = FindRange(start) 2582e5b6d6dSopenharmony_ci limit = end + 1 2592e5b6d6dSopenharmony_ci # Intersect [start, limit[ with ranges in _starts. 2602e5b6d6dSopenharmony_ci c_start = _starts[i] 2612e5b6d6dSopenharmony_ci c_limit = _starts[i + 1] 2622e5b6d6dSopenharmony_ci c_props = _props[i] 2632e5b6d6dSopenharmony_ci # c_start <= start < c_limit 2642e5b6d6dSopenharmony_ci if c_start < start: 2652e5b6d6dSopenharmony_ci update_limit = c_limit if c_limit <= limit else limit 2662e5b6d6dSopenharmony_ci if need_to_update(u, start, update_limit - 1, c_props): 2672e5b6d6dSopenharmony_ci # Split off [c_start, start[ with a copy of c_props. 2682e5b6d6dSopenharmony_ci i += 1 2692e5b6d6dSopenharmony_ci c_props = c_props.copy() 2702e5b6d6dSopenharmony_ci _starts.insert(i, start) 2712e5b6d6dSopenharmony_ci _props.insert(i, c_props) 2722e5b6d6dSopenharmony_ci c_start = start 2732e5b6d6dSopenharmony_ci # Modify all ranges that are fully inside [start, limit[. 2742e5b6d6dSopenharmony_ci while c_limit <= limit: 2752e5b6d6dSopenharmony_ci # start <= c_start < c_limit <= limit 2762e5b6d6dSopenharmony_ci if need_to_update(u, c_start, c_limit - 1, c_props): 2772e5b6d6dSopenharmony_ci do_update(u, c_start, c_limit - 1, c_props) 2782e5b6d6dSopenharmony_ci if c_limit == 0x110000: return 2792e5b6d6dSopenharmony_ci i += 1 2802e5b6d6dSopenharmony_ci c_start = c_limit 2812e5b6d6dSopenharmony_ci c_limit = _starts[i + 1] 2822e5b6d6dSopenharmony_ci c_props = _props[i] 2832e5b6d6dSopenharmony_ci if c_start < limit and need_to_update(u, c_start, limit - 1, c_props): 2842e5b6d6dSopenharmony_ci # Split off [limit, c_limit[ with a copy of c_props. 2852e5b6d6dSopenharmony_ci _starts.insert(i + 1, limit) 2862e5b6d6dSopenharmony_ci _props.insert(i + 1, c_props.copy()) 2872e5b6d6dSopenharmony_ci # Modify [c_start, limit[ c_props. 2882e5b6d6dSopenharmony_ci do_update(u, c_start, limit - 1, c_props) 2892e5b6d6dSopenharmony_ci 2902e5b6d6dSopenharmony_ci 2912e5b6d6dSopenharmony_cidef NeedToSetProps(props, start, end, c_props): 2922e5b6d6dSopenharmony_ci """Returns True if props is not a sub-dict of c_props.""" 2932e5b6d6dSopenharmony_ci for (pname, value) in props.items(): 2942e5b6d6dSopenharmony_ci if pname not in c_props or value != c_props[pname]: return True 2952e5b6d6dSopenharmony_ci return False 2962e5b6d6dSopenharmony_ci 2972e5b6d6dSopenharmony_ci 2982e5b6d6dSopenharmony_cidef DoSetProps(props, start, end, c_props): 2992e5b6d6dSopenharmony_ci c_props.update(props) 3002e5b6d6dSopenharmony_ci 3012e5b6d6dSopenharmony_ci 3022e5b6d6dSopenharmony_cidef SetProps(start, end, props): 3032e5b6d6dSopenharmony_ci UpdateProps(start, end, (NeedToSetProps, DoSetProps, props)) 3042e5b6d6dSopenharmony_ci 3052e5b6d6dSopenharmony_ci 3062e5b6d6dSopenharmony_cidef NeedToSetAlways(nv, start, end, c_props): 3072e5b6d6dSopenharmony_ci return True 3082e5b6d6dSopenharmony_ci 3092e5b6d6dSopenharmony_ci 3102e5b6d6dSopenharmony_ci# For restoring boundaries after merging adjacent same-props ranges. 3112e5b6d6dSopenharmony_cidef AddBoundary(x): 3122e5b6d6dSopenharmony_ci """Ensure that there is a range start/limit at x.""" 3132e5b6d6dSopenharmony_ci assert 0 <= x <= 0x10ffff 3142e5b6d6dSopenharmony_ci i = FindRange(x) 3152e5b6d6dSopenharmony_ci if _starts[i] == x: return 3162e5b6d6dSopenharmony_ci # Split the range at x. 3172e5b6d6dSopenharmony_ci c_start = _starts[i] 3182e5b6d6dSopenharmony_ci c_limit = _starts[i + 1] 3192e5b6d6dSopenharmony_ci c_props = _props[i] 3202e5b6d6dSopenharmony_ci # c_start < x < c_limit 3212e5b6d6dSopenharmony_ci i += 1 3222e5b6d6dSopenharmony_ci _starts.insert(i, x) 3232e5b6d6dSopenharmony_ci _props.insert(i, c_props.copy()) 3242e5b6d6dSopenharmony_ci 3252e5b6d6dSopenharmony_ci 3262e5b6d6dSopenharmony_cidef SetDefaultValue(pname, value): 3272e5b6d6dSopenharmony_ci """Sets the property's default value. Ignores null values.""" 3282e5b6d6dSopenharmony_ci prop = GetProperty(pname) 3292e5b6d6dSopenharmony_ci if prop and value not in _null_names: 3302e5b6d6dSopenharmony_ci value = NormalizePropertyValue(prop, value) 3312e5b6d6dSopenharmony_ci if value != _null_values[prop[1][0]]: 3322e5b6d6dSopenharmony_ci _defaults[prop[1][0]] = value 3332e5b6d6dSopenharmony_ci SetProps(0, 0x10ffff, {prop[1][0]: value}) 3342e5b6d6dSopenharmony_ci 3352e5b6d6dSopenharmony_ci 3362e5b6d6dSopenharmony_cidef SetBinaryPropertyToTrue(pname, start, end): 3372e5b6d6dSopenharmony_ci prop = GetProperty(pname) 3382e5b6d6dSopenharmony_ci if prop: 3392e5b6d6dSopenharmony_ci assert prop[0] == "Binary" 3402e5b6d6dSopenharmony_ci SetProps(start, end, {prop[1][0]: True}) 3412e5b6d6dSopenharmony_ci 3422e5b6d6dSopenharmony_ci 3432e5b6d6dSopenharmony_cidef SetPropValue(prop, vname, start, end): 3442e5b6d6dSopenharmony_ci value = NormalizePropertyValue(prop, vname) 3452e5b6d6dSopenharmony_ci SetProps(start, end, {prop[1][0]: value}) 3462e5b6d6dSopenharmony_ci 3472e5b6d6dSopenharmony_ci 3482e5b6d6dSopenharmony_cidef SetPropertyValue(pname, vname, start, end): 3492e5b6d6dSopenharmony_ci prop = GetProperty(pname) 3502e5b6d6dSopenharmony_ci if prop: SetPropValue(prop, vname, start, end) 3512e5b6d6dSopenharmony_ci 3522e5b6d6dSopenharmony_ci# Parsing ------------------------------------------------------------------ *** 3532e5b6d6dSopenharmony_ci 3542e5b6d6dSopenharmony_ci_stripped_cp_re = re.compile("([0-9a-fA-F]+)$") 3552e5b6d6dSopenharmony_ci_stripped_range_re = re.compile("([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$") 3562e5b6d6dSopenharmony_ci# Default value for all of Unicode. 3572e5b6d6dSopenharmony_ci_missing_re = re.compile("# *@missing: *0000\.\.10FFFF *; *(.+)$") 3582e5b6d6dSopenharmony_ci# Default value for some range. 3592e5b6d6dSopenharmony_ci_missing2_re = re.compile("# *@missing: *(.+)$") 3602e5b6d6dSopenharmony_ci 3612e5b6d6dSopenharmony_cidef ReadUCDLines(in_file, want_ranges=True, want_other=False, 3622e5b6d6dSopenharmony_ci want_comments=False, want_missing=False): 3632e5b6d6dSopenharmony_ci """Parses lines from a semicolon-delimited UCD text file. 3642e5b6d6dSopenharmony_ci Strips comments, ignores empty and all-comment lines. 3652e5b6d6dSopenharmony_ci Returns a tuple (type, line, ...). 3662e5b6d6dSopenharmony_ci """ 3672e5b6d6dSopenharmony_ci for line in in_file: 3682e5b6d6dSopenharmony_ci line = line.strip() 3692e5b6d6dSopenharmony_ci if not line: continue 3702e5b6d6dSopenharmony_ci if line.startswith("#"): # whole-line comment 3712e5b6d6dSopenharmony_ci parse_data = False 3722e5b6d6dSopenharmony_ci if want_missing: 3732e5b6d6dSopenharmony_ci match = _missing_re.match(line) 3742e5b6d6dSopenharmony_ci if match: 3752e5b6d6dSopenharmony_ci fields = match.group(1).split(";") 3762e5b6d6dSopenharmony_ci for i in range(len(fields)): fields[i] = fields[i].strip() 3772e5b6d6dSopenharmony_ci yield ("missing", line, fields) 3782e5b6d6dSopenharmony_ci continue 3792e5b6d6dSopenharmony_ci match = _missing2_re.match(line) 3802e5b6d6dSopenharmony_ci if match: 3812e5b6d6dSopenharmony_ci # Strip the "missing" comment prefix and fall through to 3822e5b6d6dSopenharmony_ci # parse the remainder of the line like regular data. 3832e5b6d6dSopenharmony_ci parse_data = True 3842e5b6d6dSopenharmony_ci line = match.group(1) 3852e5b6d6dSopenharmony_ci if not parse_data: 3862e5b6d6dSopenharmony_ci if want_comments: yield ("comment", line) 3872e5b6d6dSopenharmony_ci continue 3882e5b6d6dSopenharmony_ci comment_start = line.find("#") # inline comment 3892e5b6d6dSopenharmony_ci if comment_start >= 0: 3902e5b6d6dSopenharmony_ci line = line[:comment_start].rstrip() 3912e5b6d6dSopenharmony_ci if not line: continue 3922e5b6d6dSopenharmony_ci fields = line.split(";") 3932e5b6d6dSopenharmony_ci for i in range(len(fields)): fields[i] = fields[i].strip() 3942e5b6d6dSopenharmony_ci if want_ranges: 3952e5b6d6dSopenharmony_ci first = fields[0] 3962e5b6d6dSopenharmony_ci match = _stripped_range_re.match(first) 3972e5b6d6dSopenharmony_ci if match: 3982e5b6d6dSopenharmony_ci start = int(match.group(1), 16) 3992e5b6d6dSopenharmony_ci end = int(match.group(2), 16) 4002e5b6d6dSopenharmony_ci yield ("range", line, start, end, fields) 4012e5b6d6dSopenharmony_ci continue 4022e5b6d6dSopenharmony_ci match = _stripped_cp_re.match(first) 4032e5b6d6dSopenharmony_ci if match: 4042e5b6d6dSopenharmony_ci c = int(match.group(1), 16) 4052e5b6d6dSopenharmony_ci yield ("range", line, c, c, fields) 4062e5b6d6dSopenharmony_ci continue 4072e5b6d6dSopenharmony_ci if want_other: 4082e5b6d6dSopenharmony_ci yield ("other", line, fields) 4092e5b6d6dSopenharmony_ci else: 4102e5b6d6dSopenharmony_ci raise SyntaxError("unable to parse line\n %s\n" % line) 4112e5b6d6dSopenharmony_ci 4122e5b6d6dSopenharmony_ci 4132e5b6d6dSopenharmony_cidef AddBinaryProperty(short_name, long_name): 4142e5b6d6dSopenharmony_ci _null_values[short_name] = False 4152e5b6d6dSopenharmony_ci bin_prop = _properties["Math"] 4162e5b6d6dSopenharmony_ci prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3]) 4172e5b6d6dSopenharmony_ci _properties[short_name] = prop 4182e5b6d6dSopenharmony_ci _properties[long_name] = prop 4192e5b6d6dSopenharmony_ci _properties[NormPropName(short_name)] = prop 4202e5b6d6dSopenharmony_ci _properties[NormPropName(long_name)] = prop 4212e5b6d6dSopenharmony_ci 4222e5b6d6dSopenharmony_ci 4232e5b6d6dSopenharmony_cidef AddSingleNameBinaryProperty(name): 4242e5b6d6dSopenharmony_ci # For some properties, the short name is the same as the long name. 4252e5b6d6dSopenharmony_ci _null_values[name] = False 4262e5b6d6dSopenharmony_ci bin_prop = _properties["Math"] 4272e5b6d6dSopenharmony_ci prop = ("Binary", [name, name], bin_prop[2], bin_prop[3]) 4282e5b6d6dSopenharmony_ci _properties[name] = prop 4292e5b6d6dSopenharmony_ci _properties[NormPropName(name)] = prop 4302e5b6d6dSopenharmony_ci 4312e5b6d6dSopenharmony_ci 4322e5b6d6dSopenharmony_cidef AddPOSIXBinaryProperty(name): 4332e5b6d6dSopenharmony_ci # We only define a long name for ICU-specific (non-UCD) POSIX properties. 4342e5b6d6dSopenharmony_ci _null_values[name] = False 4352e5b6d6dSopenharmony_ci bin_prop = _properties["Math"] 4362e5b6d6dSopenharmony_ci prop = ("Binary", ["", name], bin_prop[2], bin_prop[3]) 4372e5b6d6dSopenharmony_ci _properties[name] = prop 4382e5b6d6dSopenharmony_ci _properties[NormPropName(name)] = prop 4392e5b6d6dSopenharmony_ci # This is to match UProperty UCHAR_POSIX_ALNUM etc. 4402e5b6d6dSopenharmony_ci _properties["posix" + NormPropName(name)] = prop 4412e5b6d6dSopenharmony_ci 4422e5b6d6dSopenharmony_ci 4432e5b6d6dSopenharmony_ci# Match a comment line like 4442e5b6d6dSopenharmony_ci# PropertyAliases-6.1.0.txt 4452e5b6d6dSopenharmony_ci# and extract the Unicode version. 4462e5b6d6dSopenharmony_ci_ucd_version_re = re.compile("# *PropertyAliases" + 4472e5b6d6dSopenharmony_ci "-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" + 4482e5b6d6dSopenharmony_ci "\\.txt") 4492e5b6d6dSopenharmony_ci 4502e5b6d6dSopenharmony_cidef ParsePropertyAliases(in_file): 4512e5b6d6dSopenharmony_ci global _ucd_version 4522e5b6d6dSopenharmony_ci prop_type_nulls = { 4532e5b6d6dSopenharmony_ci "Binary": False, 4542e5b6d6dSopenharmony_ci "Catalog": "??", # Must be specified, e.g., in @missing line. 4552e5b6d6dSopenharmony_ci "Enumerated": "??", # Must be specified. 4562e5b6d6dSopenharmony_ci "Numeric": "NaN", 4572e5b6d6dSopenharmony_ci "String": "", 4582e5b6d6dSopenharmony_ci "Miscellaneous": "" 4592e5b6d6dSopenharmony_ci } 4602e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_ranges=False, 4612e5b6d6dSopenharmony_ci want_other=True, want_comments=True): 4622e5b6d6dSopenharmony_ci if data[0] == "comment": 4632e5b6d6dSopenharmony_ci line = data[1] 4642e5b6d6dSopenharmony_ci match = _ucd_version_re.match(line) 4652e5b6d6dSopenharmony_ci if match: 4662e5b6d6dSopenharmony_ci _ucd_version = match.group(1) 4672e5b6d6dSopenharmony_ci else: 4682e5b6d6dSopenharmony_ci words = line[1:].lstrip().split() 4692e5b6d6dSopenharmony_ci if len(words) == 2 and words[1] == "Properties": 4702e5b6d6dSopenharmony_ci prop_type = words[0] 4712e5b6d6dSopenharmony_ci null_value = prop_type_nulls[prop_type] 4722e5b6d6dSopenharmony_ci else: 4732e5b6d6dSopenharmony_ci # type == "other" 4742e5b6d6dSopenharmony_ci aliases = data[2] 4752e5b6d6dSopenharmony_ci name = aliases[0] 4762e5b6d6dSopenharmony_ci if name in _ignored_properties: 4772e5b6d6dSopenharmony_ci for alias in aliases: 4782e5b6d6dSopenharmony_ci _ignored_properties.add(alias) 4792e5b6d6dSopenharmony_ci _ignored_properties.add(NormPropName(alias)) 4802e5b6d6dSopenharmony_ci else: 4812e5b6d6dSopenharmony_ci if name.endswith("ccc"): 4822e5b6d6dSopenharmony_ci _null_values[name] = 0 4832e5b6d6dSopenharmony_ci else: 4842e5b6d6dSopenharmony_ci _null_values[name] = null_value 4852e5b6d6dSopenharmony_ci prop = (prop_type, aliases, set(), {}) 4862e5b6d6dSopenharmony_ci for alias in aliases: 4872e5b6d6dSopenharmony_ci _properties[alias] = prop 4882e5b6d6dSopenharmony_ci _properties[NormPropName(alias)] = prop 4892e5b6d6dSopenharmony_ci # Add provisional and ICU-specific properties we need. 4902e5b6d6dSopenharmony_ci # We add some in support of runtime API, even if we do not write 4912e5b6d6dSopenharmony_ci # data for them to ppucd.txt (e.g., lccc & tccc). 4922e5b6d6dSopenharmony_ci # We add others just to represent UCD data that contributes to 4932e5b6d6dSopenharmony_ci # some functionality, although Unicode has not "blessed" them 4942e5b6d6dSopenharmony_ci # as separate properties (e.g., Turkic_Case_Folding). 4952e5b6d6dSopenharmony_ci 4962e5b6d6dSopenharmony_ci # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt. 4972e5b6d6dSopenharmony_ci name = "Turkic_Case_Folding" 4982e5b6d6dSopenharmony_ci _null_values[name] = "" 4992e5b6d6dSopenharmony_ci prop = ("String", [name, name], set(), {}) 5002e5b6d6dSopenharmony_ci _properties[name] = prop 5012e5b6d6dSopenharmony_ci _properties[NormPropName(name)] = prop 5022e5b6d6dSopenharmony_ci # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions. 5032e5b6d6dSopenharmony_ci name = "Conditional_Case_Mappings" 5042e5b6d6dSopenharmony_ci _null_values[name] = "" 5052e5b6d6dSopenharmony_ci prop = ("Miscellaneous", [name, name], set(), {}) 5062e5b6d6dSopenharmony_ci _properties[name] = prop 5072e5b6d6dSopenharmony_ci _properties[NormPropName(name)] = prop 5082e5b6d6dSopenharmony_ci # lccc = ccc of first cp in canonical decomposition. 5092e5b6d6dSopenharmony_ci _null_values["lccc"] = 0 5102e5b6d6dSopenharmony_ci ccc_prop = list(_properties["ccc"]) 5112e5b6d6dSopenharmony_ci ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"] 5122e5b6d6dSopenharmony_ci prop = tuple(ccc_prop) 5132e5b6d6dSopenharmony_ci _properties["lccc"] = prop 5142e5b6d6dSopenharmony_ci _properties["Lead_Canonical_Combining_Class"] = prop 5152e5b6d6dSopenharmony_ci _properties["leadcanonicalcombiningclass"] = prop 5162e5b6d6dSopenharmony_ci # tccc = ccc of last cp in canonical decomposition. 5172e5b6d6dSopenharmony_ci _null_values["tccc"] = 0 5182e5b6d6dSopenharmony_ci ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"] 5192e5b6d6dSopenharmony_ci prop = tuple(ccc_prop) 5202e5b6d6dSopenharmony_ci _properties["tccc"] = prop 5212e5b6d6dSopenharmony_ci _properties["Trail_Canonical_Combining_Class"] = prop 5222e5b6d6dSopenharmony_ci _properties["trailcanonicalcombiningclass"] = prop 5232e5b6d6dSopenharmony_ci # Script_Extensions 5242e5b6d6dSopenharmony_ci if "scx" not in _properties: 5252e5b6d6dSopenharmony_ci _null_values["scx"] = "" 5262e5b6d6dSopenharmony_ci prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {}) 5272e5b6d6dSopenharmony_ci _properties["scx"] = prop 5282e5b6d6dSopenharmony_ci _properties["Script_Extensions"] = prop 5292e5b6d6dSopenharmony_ci _properties["scriptextensions"] = prop 5302e5b6d6dSopenharmony_ci # General Category as a bit mask. 5312e5b6d6dSopenharmony_ci _null_values["gcm"] = "??" 5322e5b6d6dSopenharmony_ci gc_prop = _properties["gc"] 5332e5b6d6dSopenharmony_ci prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3]) 5342e5b6d6dSopenharmony_ci _properties["gcm"] = prop 5352e5b6d6dSopenharmony_ci _properties["General_Category_Mask"] = prop 5362e5b6d6dSopenharmony_ci _properties["generalcategorymask"] = prop 5372e5b6d6dSopenharmony_ci # Various binary properties. 5382e5b6d6dSopenharmony_ci AddBinaryProperty("Sensitive", "Case_Sensitive") 5392e5b6d6dSopenharmony_ci AddBinaryProperty("nfdinert", "NFD_Inert") 5402e5b6d6dSopenharmony_ci AddBinaryProperty("nfkdinert", "NFKD_Inert") 5412e5b6d6dSopenharmony_ci AddBinaryProperty("nfcinert", "NFC_Inert") 5422e5b6d6dSopenharmony_ci AddBinaryProperty("nfkcinert", "NFKC_Inert") 5432e5b6d6dSopenharmony_ci AddBinaryProperty("segstart", "Segment_Starter") 5442e5b6d6dSopenharmony_ci # https://www.unicode.org/reports/tr51/#Emoji_Properties 5452e5b6d6dSopenharmony_ci AddBinaryProperty("Emoji", "Emoji") 5462e5b6d6dSopenharmony_ci AddBinaryProperty("EPres", "Emoji_Presentation") 5472e5b6d6dSopenharmony_ci AddBinaryProperty("EMod", "Emoji_Modifier") 5482e5b6d6dSopenharmony_ci AddBinaryProperty("EBase", "Emoji_Modifier_Base") 5492e5b6d6dSopenharmony_ci AddBinaryProperty("EComp", "Emoji_Component") 5502e5b6d6dSopenharmony_ci AddBinaryProperty("ExtPict", "Extended_Pictographic") 5512e5b6d6dSopenharmony_ci # https://www.unicode.org/reports/tr51/#Emoji_Sets 5522e5b6d6dSopenharmony_ci AddSingleNameBinaryProperty("Basic_Emoji") 5532e5b6d6dSopenharmony_ci AddSingleNameBinaryProperty("Emoji_Keycap_Sequence") 5542e5b6d6dSopenharmony_ci AddSingleNameBinaryProperty("RGI_Emoji_Modifier_Sequence") 5552e5b6d6dSopenharmony_ci AddSingleNameBinaryProperty("RGI_Emoji_Flag_Sequence") 5562e5b6d6dSopenharmony_ci AddSingleNameBinaryProperty("RGI_Emoji_Tag_Sequence") 5572e5b6d6dSopenharmony_ci AddSingleNameBinaryProperty("RGI_Emoji_ZWJ_Sequence") 5582e5b6d6dSopenharmony_ci AddSingleNameBinaryProperty("RGI_Emoji") 5592e5b6d6dSopenharmony_ci # C/POSIX character classes that do not have Unicode property [value] aliases. 5602e5b6d6dSopenharmony_ci # See uchar.h. 5612e5b6d6dSopenharmony_ci AddPOSIXBinaryProperty("alnum") 5622e5b6d6dSopenharmony_ci AddPOSIXBinaryProperty("blank") 5632e5b6d6dSopenharmony_ci AddPOSIXBinaryProperty("graph") 5642e5b6d6dSopenharmony_ci AddPOSIXBinaryProperty("print") 5652e5b6d6dSopenharmony_ci AddPOSIXBinaryProperty("xdigit") 5662e5b6d6dSopenharmony_ci 5672e5b6d6dSopenharmony_ci 5682e5b6d6dSopenharmony_cidef ParsePropertyValueAliases(in_file): 5692e5b6d6dSopenharmony_ci global _binary_values 5702e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_ranges=False, 5712e5b6d6dSopenharmony_ci want_other=True, want_missing=True): 5722e5b6d6dSopenharmony_ci if data[0] == "missing": 5732e5b6d6dSopenharmony_ci SetDefaultValue(data[2][0], data[2][1]) 5742e5b6d6dSopenharmony_ci else: 5752e5b6d6dSopenharmony_ci # type == "other" 5762e5b6d6dSopenharmony_ci fields = data[2] 5772e5b6d6dSopenharmony_ci pname = fields[0] 5782e5b6d6dSopenharmony_ci prop = GetProperty(pname) 5792e5b6d6dSopenharmony_ci if prop: 5802e5b6d6dSopenharmony_ci del fields[0] # Only the list of aliases remains. 5812e5b6d6dSopenharmony_ci short_name = fields[0] 5822e5b6d6dSopenharmony_ci if short_name == "n/a": # no short name 5832e5b6d6dSopenharmony_ci fields[0] = "" 5842e5b6d6dSopenharmony_ci short_name = fields[1] 5852e5b6d6dSopenharmony_ci prop[2].add(short_name) 5862e5b6d6dSopenharmony_ci values = prop[3] 5872e5b6d6dSopenharmony_ci for alias in fields: 5882e5b6d6dSopenharmony_ci if alias: 5892e5b6d6dSopenharmony_ci values[alias] = fields 5902e5b6d6dSopenharmony_ci values[NormPropName(alias)] = fields 5912e5b6d6dSopenharmony_ci if prop[0] == "Binary" and not _binary_values: 5922e5b6d6dSopenharmony_ci _binary_values = values 5932e5b6d6dSopenharmony_ci # Some of the @missing lines with non-null default property values 5942e5b6d6dSopenharmony_ci # are in files that we do not parse; 5952e5b6d6dSopenharmony_ci # either because the data for that property is easily 5962e5b6d6dSopenharmony_ci # (i.e., the @missing line would be the only reason to parse such a file) 5972e5b6d6dSopenharmony_ci # or because we compute the property at runtime, 5982e5b6d6dSopenharmony_ci # such as the Hangul_Syllable_Type. 5992e5b6d6dSopenharmony_ci if "dt" not in _defaults: # DerivedDecompositionType.txt 6002e5b6d6dSopenharmony_ci _defaults["dt"] = "None" 6012e5b6d6dSopenharmony_ci if "nt" not in _defaults: # DerivedNumericType.txt 6022e5b6d6dSopenharmony_ci _defaults["nt"] = "None" 6032e5b6d6dSopenharmony_ci if "hst" not in _defaults: # HangulSyllableType.txt 6042e5b6d6dSopenharmony_ci _defaults["hst"] = "NA" 6052e5b6d6dSopenharmony_ci if "gc" not in _defaults: # No @missing line in any .txt file? 6062e5b6d6dSopenharmony_ci _defaults["gc"] = "Cn" 6072e5b6d6dSopenharmony_ci # Copy the gc default value to gcm. 6082e5b6d6dSopenharmony_ci _defaults["gcm"] = _defaults["gc"] 6092e5b6d6dSopenharmony_ci # Add ISO 15924-only script codes. 6102e5b6d6dSopenharmony_ci # Only for the ICU script code API, not necessary for parsing the UCD. 6112e5b6d6dSopenharmony_ci script_prop = _properties["sc"] 6122e5b6d6dSopenharmony_ci short_script_names = script_prop[2] # set 6132e5b6d6dSopenharmony_ci script_values = script_prop[3] # dict 6142e5b6d6dSopenharmony_ci remove_scripts = [] 6152e5b6d6dSopenharmony_ci for script in _scripts_only_in_iso15924: 6162e5b6d6dSopenharmony_ci if script in short_script_names: 6172e5b6d6dSopenharmony_ci remove_scripts.append(script) 6182e5b6d6dSopenharmony_ci else: 6192e5b6d6dSopenharmony_ci short_script_names.add(script) 6202e5b6d6dSopenharmony_ci # Do not invent a Unicode long script name before the UCD adds the script. 6212e5b6d6dSopenharmony_ci script_list = [script, script] # [short, long] 6222e5b6d6dSopenharmony_ci script_values[script] = script_list 6232e5b6d6dSopenharmony_ci # Probably not necessary because 6242e5b6d6dSopenharmony_ci # we will not parse these scripts from the UCD: 6252e5b6d6dSopenharmony_ci script_values[NormPropName(script)] = script_list 6262e5b6d6dSopenharmony_ci if remove_scripts: 6272e5b6d6dSopenharmony_ci raise ValueError( 6282e5b6d6dSopenharmony_ci "remove %s from _scripts_only_in_iso15924" % remove_scripts) 6292e5b6d6dSopenharmony_ci 6302e5b6d6dSopenharmony_ci 6312e5b6d6dSopenharmony_cidef ParseBlocks(in_file): 6322e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_missing=True): 6332e5b6d6dSopenharmony_ci if data[0] == "missing": 6342e5b6d6dSopenharmony_ci SetDefaultValue("blk", data[2][0]) 6352e5b6d6dSopenharmony_ci else: 6362e5b6d6dSopenharmony_ci # type == "range" 6372e5b6d6dSopenharmony_ci (start, end, name) = (data[2], data[3], data[4][1]) 6382e5b6d6dSopenharmony_ci _blocks.append((start, end, {"blk": name})) 6392e5b6d6dSopenharmony_ci SetPropertyValue("blk", name, start, end) 6402e5b6d6dSopenharmony_ci _blocks.sort() 6412e5b6d6dSopenharmony_ci # Check for overlapping blocks. 6422e5b6d6dSopenharmony_ci prev_end = -1 6432e5b6d6dSopenharmony_ci for b in _blocks: 6442e5b6d6dSopenharmony_ci start = b[0] 6452e5b6d6dSopenharmony_ci end = b[1] 6462e5b6d6dSopenharmony_ci if prev_end >= start: 6472e5b6d6dSopenharmony_ci raise ValueError( 6482e5b6d6dSopenharmony_ci "block %04lX..%04lX %s overlaps with another " + 6492e5b6d6dSopenharmony_ci "ending at %04lX\n %s\n" % 6502e5b6d6dSopenharmony_ci (start, end, b[2]["blk"], prev_end)) 6512e5b6d6dSopenharmony_ci prev_end = end 6522e5b6d6dSopenharmony_ci 6532e5b6d6dSopenharmony_ci 6542e5b6d6dSopenharmony_cidef ParseUnicodeData(in_file): 6552e5b6d6dSopenharmony_ci dt_prop = GetProperty("dt") 6562e5b6d6dSopenharmony_ci range_first_line = "" 6572e5b6d6dSopenharmony_ci range_first = -1 6582e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_missing=True): 6592e5b6d6dSopenharmony_ci # type == "range" 6602e5b6d6dSopenharmony_ci (line, c, end, fields) = (data[1], data[2], data[3], data[4]) 6612e5b6d6dSopenharmony_ci assert c == end 6622e5b6d6dSopenharmony_ci name = fields[1] 6632e5b6d6dSopenharmony_ci if name.startswith("<"): 6642e5b6d6dSopenharmony_ci if name.endswith(", First>"): 6652e5b6d6dSopenharmony_ci if range_first >= 0: 6662e5b6d6dSopenharmony_ci raise SyntaxError( 6672e5b6d6dSopenharmony_ci "error: unterminated range started at\n %s\n" % 6682e5b6d6dSopenharmony_ci range_first_line) 6692e5b6d6dSopenharmony_ci range_first = c 6702e5b6d6dSopenharmony_ci range_first_line = line 6712e5b6d6dSopenharmony_ci continue 6722e5b6d6dSopenharmony_ci elif name.endswith(", Last>"): 6732e5b6d6dSopenharmony_ci if range_first < 0: 6742e5b6d6dSopenharmony_ci raise SyntaxError( 6752e5b6d6dSopenharmony_ci "error: range end without start at\n %s\n" % 6762e5b6d6dSopenharmony_ci line) 6772e5b6d6dSopenharmony_ci elif range_first > c: 6782e5b6d6dSopenharmony_ci raise SyntaxError( 6792e5b6d6dSopenharmony_ci "error: range start/end out of order at\n %s\n %s\n" % 6802e5b6d6dSopenharmony_ci (range_first_line, line)) 6812e5b6d6dSopenharmony_ci first_name = range_first_line.split(";")[1][1:-8] 6822e5b6d6dSopenharmony_ci name = name[1:-7] 6832e5b6d6dSopenharmony_ci if first_name != name: 6842e5b6d6dSopenharmony_ci raise SyntaxError( 6852e5b6d6dSopenharmony_ci "error: range start/end name mismatch at\n %s\n %s\n" % 6862e5b6d6dSopenharmony_ci (range_first_line, line)) 6872e5b6d6dSopenharmony_ci end = c 6882e5b6d6dSopenharmony_ci c = range_first 6892e5b6d6dSopenharmony_ci range_first = -1 6902e5b6d6dSopenharmony_ci # Remember algorithmic name ranges. 6912e5b6d6dSopenharmony_ci if "Ideograph" in name: 6922e5b6d6dSopenharmony_ci prefix = "CJK UNIFIED IDEOGRAPH-" 6932e5b6d6dSopenharmony_ci if c == 0x17000 or c == 0x18D00: prefix = "TANGUT IDEOGRAPH-" 6942e5b6d6dSopenharmony_ci _alg_names_ranges.append([c, end, "han", prefix]) 6952e5b6d6dSopenharmony_ci elif name == "Hangul Syllable": 6962e5b6d6dSopenharmony_ci _alg_names_ranges.append([c, end, "hangul"]) 6972e5b6d6dSopenharmony_ci name = "" 6982e5b6d6dSopenharmony_ci else: 6992e5b6d6dSopenharmony_ci # Ignore non-names like <control>. 7002e5b6d6dSopenharmony_ci name = "" 7012e5b6d6dSopenharmony_ci props = {} 7022e5b6d6dSopenharmony_ci if name: props["na"] = name 7032e5b6d6dSopenharmony_ci props["gc"] = fields[2] 7042e5b6d6dSopenharmony_ci ccc = int(fields[3]) 7052e5b6d6dSopenharmony_ci if ccc: props["ccc"] = ccc 7062e5b6d6dSopenharmony_ci props["bc"] = fields[4] 7072e5b6d6dSopenharmony_ci # Decomposition type & mapping. 7082e5b6d6dSopenharmony_ci dm = fields[5] 7092e5b6d6dSopenharmony_ci if dm: 7102e5b6d6dSopenharmony_ci if dm.startswith("<"): 7112e5b6d6dSopenharmony_ci dt_limit = dm.index(">") 7122e5b6d6dSopenharmony_ci dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit]) 7132e5b6d6dSopenharmony_ci dm = dm[dt_limit + 1:].lstrip() 7142e5b6d6dSopenharmony_ci else: 7152e5b6d6dSopenharmony_ci dt = "Can" 7162e5b6d6dSopenharmony_ci props["dt"] = dt 7172e5b6d6dSopenharmony_ci props["dm"] = dm 7182e5b6d6dSopenharmony_ci # Numeric type & value. 7192e5b6d6dSopenharmony_ci decimal = fields[6] 7202e5b6d6dSopenharmony_ci digit = fields[7] 7212e5b6d6dSopenharmony_ci nv = fields[8] 7222e5b6d6dSopenharmony_ci if (decimal and decimal != nv) or (digit and digit != nv): 7232e5b6d6dSopenharmony_ci raise SyntaxError("error: numeric values differ at\n %s\n" % line) 7242e5b6d6dSopenharmony_ci if nv: 7252e5b6d6dSopenharmony_ci # Map improper fractions to proper ones. 7262e5b6d6dSopenharmony_ci # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS 7272e5b6d6dSopenharmony_ci # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS 7282e5b6d6dSopenharmony_ci if nv == "2/12": 7292e5b6d6dSopenharmony_ci nv = "1/6" 7302e5b6d6dSopenharmony_ci elif nv == "3/12": 7312e5b6d6dSopenharmony_ci nv = "1/4" 7322e5b6d6dSopenharmony_ci elif nv == "4/12": 7332e5b6d6dSopenharmony_ci nv = "1/3" 7342e5b6d6dSopenharmony_ci elif nv == "6/12": 7352e5b6d6dSopenharmony_ci nv = "1/2" 7362e5b6d6dSopenharmony_ci elif nv == "8/12": 7372e5b6d6dSopenharmony_ci nv = "2/3" 7382e5b6d6dSopenharmony_ci elif nv == "9/12": 7392e5b6d6dSopenharmony_ci nv = "3/4" 7402e5b6d6dSopenharmony_ci elif nv == "10/12": 7412e5b6d6dSopenharmony_ci nv = "5/6" 7422e5b6d6dSopenharmony_ci props["nv"] = nv 7432e5b6d6dSopenharmony_ci props["nt"] = "De" if decimal else "Di" if digit else "Nu" 7442e5b6d6dSopenharmony_ci if fields[9] == "Y": props["Bidi_M"] = True 7452e5b6d6dSopenharmony_ci # ICU 49 and above does not support Unicode_1_Name any more. 7462e5b6d6dSopenharmony_ci # See ticket #9013. 7472e5b6d6dSopenharmony_ci # na1 = fields[10] 7482e5b6d6dSopenharmony_ci # if na1: props["na1"] = na1 7492e5b6d6dSopenharmony_ci # ISO_Comment is deprecated and has no values. 7502e5b6d6dSopenharmony_ci # isc = fields[11] 7512e5b6d6dSopenharmony_ci # if isc: props["isc"] = isc 7522e5b6d6dSopenharmony_ci # Simple case mappings. 7532e5b6d6dSopenharmony_ci suc = fields[12] 7542e5b6d6dSopenharmony_ci slc = fields[13] 7552e5b6d6dSopenharmony_ci stc = fields[14] 7562e5b6d6dSopenharmony_ci if suc: props["suc"] = suc 7572e5b6d6dSopenharmony_ci if slc: props["slc"] = slc 7582e5b6d6dSopenharmony_ci if stc: props["stc"] = stc 7592e5b6d6dSopenharmony_ci SetProps(c, end, props) 7602e5b6d6dSopenharmony_ci if range_first >= 0: 7612e5b6d6dSopenharmony_ci raise SyntaxError( 7622e5b6d6dSopenharmony_ci "error: unterminated range started at\n %s\n" % 7632e5b6d6dSopenharmony_ci range_first_line) 7642e5b6d6dSopenharmony_ci # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt. 7652e5b6d6dSopenharmony_ci SetPropertyValue("dt", "Can", 0xac00, 0xd7a3) 7662e5b6d6dSopenharmony_ci _alg_names_ranges.sort() 7672e5b6d6dSopenharmony_ci 7682e5b6d6dSopenharmony_ci 7692e5b6d6dSopenharmony_ci_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$") 7702e5b6d6dSopenharmony_ci_names_h2_re = re.compile("@\t\t(.+)") 7712e5b6d6dSopenharmony_ci_names_char_re = re.compile("([0-9a-fA-F]+)\t.+") 7722e5b6d6dSopenharmony_ci 7732e5b6d6dSopenharmony_cidef ParseNamesList(in_file): 7742e5b6d6dSopenharmony_ci pending_h2 = "" 7752e5b6d6dSopenharmony_ci for line in in_file: 7762e5b6d6dSopenharmony_ci line = line.strip() 7772e5b6d6dSopenharmony_ci if not line: continue 7782e5b6d6dSopenharmony_ci match = _names_h1_re.match(line) 7792e5b6d6dSopenharmony_ci if match: 7802e5b6d6dSopenharmony_ci pending_h2 = "" # Drop a pending h2 when we get to an h1. 7812e5b6d6dSopenharmony_ci start = int(match.group(1), 16) 7822e5b6d6dSopenharmony_ci end = int(match.group(3), 16) 7832e5b6d6dSopenharmony_ci comment = match.group(2).replace(u"\xa0", " ") 7842e5b6d6dSopenharmony_ci _h1.append((start, end, comment)) 7852e5b6d6dSopenharmony_ci continue 7862e5b6d6dSopenharmony_ci match = _names_h2_re.match(line) 7872e5b6d6dSopenharmony_ci if match: 7882e5b6d6dSopenharmony_ci pending_h2 = match.group(1).replace(u"\xa0", " ") 7892e5b6d6dSopenharmony_ci continue 7902e5b6d6dSopenharmony_ci if pending_h2: 7912e5b6d6dSopenharmony_ci match = _names_char_re.match(line) 7922e5b6d6dSopenharmony_ci if match: 7932e5b6d6dSopenharmony_ci c = int(match.group(1), 16) 7942e5b6d6dSopenharmony_ci _h2.append((c, pending_h2)) 7952e5b6d6dSopenharmony_ci pending_h2 = "" 7962e5b6d6dSopenharmony_ci _h1.sort() 7972e5b6d6dSopenharmony_ci _h2.sort() 7982e5b6d6dSopenharmony_ci 7992e5b6d6dSopenharmony_ci 8002e5b6d6dSopenharmony_cidef ParseNamedProperties(in_file): 8012e5b6d6dSopenharmony_ci """Parses a .txt file where the first column is a code point range 8022e5b6d6dSopenharmony_ci and the second column is a property name. 8032e5b6d6dSopenharmony_ci Sets binary properties to True, 8042e5b6d6dSopenharmony_ci and other properties to the values in the third column.""" 8052e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_missing=True): 8062e5b6d6dSopenharmony_ci if data[0] == "missing": 8072e5b6d6dSopenharmony_ci SetDefaultValue(data[2][0], data[2][1]) 8082e5b6d6dSopenharmony_ci else: 8092e5b6d6dSopenharmony_ci # type == "range" 8102e5b6d6dSopenharmony_ci if len(data[4]) == 2: 8112e5b6d6dSopenharmony_ci SetBinaryPropertyToTrue(data[4][1], data[2], data[3]) 8122e5b6d6dSopenharmony_ci else: 8132e5b6d6dSopenharmony_ci SetPropertyValue(data[4][1], data[4][2], data[2], data[3]) 8142e5b6d6dSopenharmony_ci 8152e5b6d6dSopenharmony_ci 8162e5b6d6dSopenharmony_cidef ParseOneProperty(in_file, pname): 8172e5b6d6dSopenharmony_ci """Parses a .txt file where the first column is a code point range 8182e5b6d6dSopenharmony_ci and the second column is the value of a known property.""" 8192e5b6d6dSopenharmony_ci prop = GetProperty(pname) 8202e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_missing=True): 8212e5b6d6dSopenharmony_ci if data[0] == "missing": 8222e5b6d6dSopenharmony_ci SetDefaultValue(pname, data[2][0]) 8232e5b6d6dSopenharmony_ci else: 8242e5b6d6dSopenharmony_ci # type == "range" 8252e5b6d6dSopenharmony_ci SetPropValue(prop, data[4][1], data[2], data[3]) 8262e5b6d6dSopenharmony_ci 8272e5b6d6dSopenharmony_ci 8282e5b6d6dSopenharmony_cidef ParseBidiMirroring(in_file): ParseOneProperty(in_file, "bmg") 8292e5b6d6dSopenharmony_cidef ParseDerivedAge(in_file): ParseOneProperty(in_file, "age") 8302e5b6d6dSopenharmony_cidef ParseDerivedBidiClass(in_file): ParseOneProperty(in_file, "bc") 8312e5b6d6dSopenharmony_cidef ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg") 8322e5b6d6dSopenharmony_cidef ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt") 8332e5b6d6dSopenharmony_cidef ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea") 8342e5b6d6dSopenharmony_cidef ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB") 8352e5b6d6dSopenharmony_cidef ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC") 8362e5b6d6dSopenharmony_cidef ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC") 8372e5b6d6dSopenharmony_cidef ParseLineBreak(in_file): ParseOneProperty(in_file, "lb") 8382e5b6d6dSopenharmony_cidef ParseScripts(in_file): ParseOneProperty(in_file, "sc") 8392e5b6d6dSopenharmony_cidef ParseScriptExtensions(in_file): ParseOneProperty(in_file, "scx") 8402e5b6d6dSopenharmony_cidef ParseSentenceBreak(in_file): ParseOneProperty(in_file, "SB") 8412e5b6d6dSopenharmony_cidef ParseVerticalOrientation(in_file): ParseOneProperty(in_file, "vo") 8422e5b6d6dSopenharmony_cidef ParseWordBreak(in_file): ParseOneProperty(in_file, "WB") 8432e5b6d6dSopenharmony_ci 8442e5b6d6dSopenharmony_ci 8452e5b6d6dSopenharmony_cidef DoSetNameAlias(alias, start, end, c_props): 8462e5b6d6dSopenharmony_ci if "Name_Alias" in c_props: 8472e5b6d6dSopenharmony_ci c_props["Name_Alias"] += ',' + alias 8482e5b6d6dSopenharmony_ci else: 8492e5b6d6dSopenharmony_ci c_props["Name_Alias"] = alias 8502e5b6d6dSopenharmony_ci 8512e5b6d6dSopenharmony_ci 8522e5b6d6dSopenharmony_cidef ParseNameAliases(in_file): 8532e5b6d6dSopenharmony_ci """Parses Name_Alias from NameAliases.txt. 8542e5b6d6dSopenharmony_ci A character can have multiple aliases. 8552e5b6d6dSopenharmony_ci 8562e5b6d6dSopenharmony_ci In Unicode 6.0, there are two columns, 8572e5b6d6dSopenharmony_ci with a name correction in the second column. 8582e5b6d6dSopenharmony_ci 8592e5b6d6dSopenharmony_ci In Unicode 6.1, there are three columns. 8602e5b6d6dSopenharmony_ci The second contains an alias, the third its type. 8612e5b6d6dSopenharmony_ci The documented types are: 8622e5b6d6dSopenharmony_ci correction, control, alternate, figment, abbreviation 8632e5b6d6dSopenharmony_ci 8642e5b6d6dSopenharmony_ci This function does not sort the types, assuming they appear in this order.""" 8652e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file): 8662e5b6d6dSopenharmony_ci start = data[2] 8672e5b6d6dSopenharmony_ci end = data[3] 8682e5b6d6dSopenharmony_ci if start != end: 8692e5b6d6dSopenharmony_ci raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" % 8702e5b6d6dSopenharmony_ci (start, end)) 8712e5b6d6dSopenharmony_ci fields = data[4] 8722e5b6d6dSopenharmony_ci if len(fields) == 2: 8732e5b6d6dSopenharmony_ci alias = "correction=" + fields[1] 8742e5b6d6dSopenharmony_ci else: 8752e5b6d6dSopenharmony_ci alias = fields[2] + '=' + fields[1] 8762e5b6d6dSopenharmony_ci update = (NeedToSetAlways, DoSetNameAlias, alias) 8772e5b6d6dSopenharmony_ci UpdateProps(start, end, update) 8782e5b6d6dSopenharmony_ci 8792e5b6d6dSopenharmony_ci 8802e5b6d6dSopenharmony_cidef NeedToSetNumericValue(nv, start, end, c_props): 8812e5b6d6dSopenharmony_ci c_nv = c_props.get("nv") 8822e5b6d6dSopenharmony_ci if c_nv == None: 8832e5b6d6dSopenharmony_ci # DerivedNumericValues.txt adds a Numeric_Value. 8842e5b6d6dSopenharmony_ci assert "nt" not in c_props 8852e5b6d6dSopenharmony_ci return True 8862e5b6d6dSopenharmony_ci if nv != c_nv: 8872e5b6d6dSopenharmony_ci raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " + 8882e5b6d6dSopenharmony_ci "but DerivedNumericValues.txt has nv=%s") % 8892e5b6d6dSopenharmony_ci (c_nv, start, end, nv)) 8902e5b6d6dSopenharmony_ci return False 8912e5b6d6dSopenharmony_ci 8922e5b6d6dSopenharmony_ci 8932e5b6d6dSopenharmony_cidef DoSetNumericValue(nv, start, end, c_props): 8942e5b6d6dSopenharmony_ci c_props.update({"nt": "Nu", "nv": nv}) 8952e5b6d6dSopenharmony_ci 8962e5b6d6dSopenharmony_ci 8972e5b6d6dSopenharmony_cidef ParseDerivedNumericValues(in_file): 8982e5b6d6dSopenharmony_ci """Parses DerivedNumericValues.txt. 8992e5b6d6dSopenharmony_ci For most characters, the numeric type & value were parsed previously 9002e5b6d6dSopenharmony_ci from UnicodeData.txt but that does not show the values for Han characters. 9012e5b6d6dSopenharmony_ci Here we check that values match those from UnicodeData.txt 9022e5b6d6dSopenharmony_ci and add new ones.""" 9032e5b6d6dSopenharmony_ci # Ignore the @missing line which has an incorrect number of fields, 9042e5b6d6dSopenharmony_ci # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1). 9052e5b6d6dSopenharmony_ci # Also, "NaN" is just the Numeric null value anyway. 9062e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file): 9072e5b6d6dSopenharmony_ci # Conditional update to the numeric value in the 4th field. 9082e5b6d6dSopenharmony_ci update = (NeedToSetNumericValue, DoSetNumericValue, data[4][3]) 9092e5b6d6dSopenharmony_ci UpdateProps(data[2], data[3], update) 9102e5b6d6dSopenharmony_ci 9112e5b6d6dSopenharmony_ci 9122e5b6d6dSopenharmony_cidef ParseCaseFolding(in_file): 9132e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_missing=True): 9142e5b6d6dSopenharmony_ci if data[0] == "missing": 9152e5b6d6dSopenharmony_ci assert data[2][0] == "C" # common to scf & cf 9162e5b6d6dSopenharmony_ci SetDefaultValue("scf", data[2][1]) 9172e5b6d6dSopenharmony_ci SetDefaultValue("cf", data[2][1]) 9182e5b6d6dSopenharmony_ci else: 9192e5b6d6dSopenharmony_ci # type == "range" 9202e5b6d6dSopenharmony_ci start = data[2] 9212e5b6d6dSopenharmony_ci end = data[3] 9222e5b6d6dSopenharmony_ci status = data[4][1] 9232e5b6d6dSopenharmony_ci mapping = data[4][2] 9242e5b6d6dSopenharmony_ci assert status in "CSFT" 9252e5b6d6dSopenharmony_ci if status == "C": 9262e5b6d6dSopenharmony_ci SetProps(start, end, {"scf": mapping, "cf": mapping}) 9272e5b6d6dSopenharmony_ci elif status == "S": 9282e5b6d6dSopenharmony_ci SetPropertyValue("scf", mapping, start, end) 9292e5b6d6dSopenharmony_ci elif status == "F": 9302e5b6d6dSopenharmony_ci SetPropertyValue("cf", mapping, start, end) 9312e5b6d6dSopenharmony_ci else: # status == "T" 9322e5b6d6dSopenharmony_ci SetPropertyValue("Turkic_Case_Folding", mapping, start, end) 9332e5b6d6dSopenharmony_ci 9342e5b6d6dSopenharmony_ci 9352e5b6d6dSopenharmony_cidef DoSetConditionalCaseMappings(ccm, start, end, c_props): 9362e5b6d6dSopenharmony_ci if "Conditional_Case_Mappings" in c_props: 9372e5b6d6dSopenharmony_ci c_props["Conditional_Case_Mappings"] += ',' + ccm 9382e5b6d6dSopenharmony_ci else: 9392e5b6d6dSopenharmony_ci c_props["Conditional_Case_Mappings"] = ccm 9402e5b6d6dSopenharmony_ci 9412e5b6d6dSopenharmony_ci 9422e5b6d6dSopenharmony_cidef ParseSpecialCasing(in_file): 9432e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_missing=True): 9442e5b6d6dSopenharmony_ci if data[0] == "missing": 9452e5b6d6dSopenharmony_ci SetDefaultValue("lc", data[2][0]) 9462e5b6d6dSopenharmony_ci SetDefaultValue("tc", data[2][1]) 9472e5b6d6dSopenharmony_ci SetDefaultValue("uc", data[2][2]) 9482e5b6d6dSopenharmony_ci else: 9492e5b6d6dSopenharmony_ci # type == "range" 9502e5b6d6dSopenharmony_ci start = data[2] 9512e5b6d6dSopenharmony_ci end = data[3] 9522e5b6d6dSopenharmony_ci fields = data[4] 9532e5b6d6dSopenharmony_ci if len(fields) < 5 or not fields[4]: 9542e5b6d6dSopenharmony_ci # Unconditional mappings. 9552e5b6d6dSopenharmony_ci SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]}) 9562e5b6d6dSopenharmony_ci else: 9572e5b6d6dSopenharmony_ci # Conditional_Case_Mappings 9582e5b6d6dSopenharmony_ci ccm = (fields[4] + ":lc=" + fields[1] + 9592e5b6d6dSopenharmony_ci "&tc=" + fields[2] + "&uc=" + fields[3]) 9602e5b6d6dSopenharmony_ci update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm) 9612e5b6d6dSopenharmony_ci UpdateProps(start, end, update) 9622e5b6d6dSopenharmony_ci 9632e5b6d6dSopenharmony_ci 9642e5b6d6dSopenharmony_cidef ParseBidiBrackets(in_file): 9652e5b6d6dSopenharmony_ci for data in ReadUCDLines(in_file, want_missing=True): 9662e5b6d6dSopenharmony_ci if data[0] == "missing": 9672e5b6d6dSopenharmony_ci SetDefaultValue("bpt", data[2][1]) 9682e5b6d6dSopenharmony_ci else: 9692e5b6d6dSopenharmony_ci # type == "range" 9702e5b6d6dSopenharmony_ci start = data[2] 9712e5b6d6dSopenharmony_ci end = data[3] 9722e5b6d6dSopenharmony_ci assert start == end 9732e5b6d6dSopenharmony_ci mapping = data[4][1] 9742e5b6d6dSopenharmony_ci bracket_type = data[4][2] 9752e5b6d6dSopenharmony_ci SetProps(start, end, {"bpb": mapping, "bpt": bracket_type}) 9762e5b6d6dSopenharmony_ci 9772e5b6d6dSopenharmony_ci# Postprocessing ----------------------------------------------------------- *** 9782e5b6d6dSopenharmony_ci 9792e5b6d6dSopenharmony_cidef PrintedSize(pname, value): 9802e5b6d6dSopenharmony_ci if isinstance(value, bool): 9812e5b6d6dSopenharmony_ci if value: 9822e5b6d6dSopenharmony_ci return len(pname) + 1 # ";pname" 9832e5b6d6dSopenharmony_ci else: 9842e5b6d6dSopenharmony_ci return len(pname) + 2 # ";-pname" 9852e5b6d6dSopenharmony_ci else: 9862e5b6d6dSopenharmony_ci return len(pname) + len(str(value)) + 2 # ";pname=value" 9872e5b6d6dSopenharmony_ci 9882e5b6d6dSopenharmony_ci 9892e5b6d6dSopenharmony_cidef CompactBlock(b, i): 9902e5b6d6dSopenharmony_ci assert b[0] == _starts[i] 9912e5b6d6dSopenharmony_ci b_props = b[2] # Normally just blk from Blocks.txt. 9922e5b6d6dSopenharmony_ci # b_props["blk"] has not been canonicalized yet. 9932e5b6d6dSopenharmony_ci b_props["blk"] = _props[i]["blk"] 9942e5b6d6dSopenharmony_ci orig_i = i 9952e5b6d6dSopenharmony_ci # Count the number of occurrences of each property's value in this block. 9962e5b6d6dSopenharmony_ci # To minimize the output, count the number of assigned ranges, 9972e5b6d6dSopenharmony_ci # not the number of code points. 9982e5b6d6dSopenharmony_ci num_ranges = 0 9992e5b6d6dSopenharmony_ci prop_counters = {} 10002e5b6d6dSopenharmony_ci if "gc" in b_props: 10012e5b6d6dSopenharmony_ci b_is_unassigned = b_props["gc"] == "Cn" # Unreachable with normal data. 10022e5b6d6dSopenharmony_ci else: 10032e5b6d6dSopenharmony_ci b_is_unassigned = _defaults["gc"] == "Cn" # This is expected to be true. 10042e5b6d6dSopenharmony_ci while True: 10052e5b6d6dSopenharmony_ci start = _starts[i] 10062e5b6d6dSopenharmony_ci if start > b[1]: break 10072e5b6d6dSopenharmony_ci props = _props[i] 10082e5b6d6dSopenharmony_ci if "gc" in props: 10092e5b6d6dSopenharmony_ci is_unassigned = props["gc"] == "Cn" 10102e5b6d6dSopenharmony_ci else: 10112e5b6d6dSopenharmony_ci is_unassigned = b_is_unassigned 10122e5b6d6dSopenharmony_ci if is_unassigned: 10132e5b6d6dSopenharmony_ci # Compact an unassigned range inside the block and 10142e5b6d6dSopenharmony_ci # mark it to be written with "unassigned". 10152e5b6d6dSopenharmony_ci # It falls back to default properties, not block properties, 10162e5b6d6dSopenharmony_ci # except for the blk=Block property. 10172e5b6d6dSopenharmony_ci assert props["blk"] == b_props["blk"] 10182e5b6d6dSopenharmony_ci del props["blk"] 10192e5b6d6dSopenharmony_ci for pname in list(props.keys()): # .keys() is a copy so we can del props[pname]. 10202e5b6d6dSopenharmony_ci if props[pname] == _null_or_defaults[pname]: del props[pname] 10212e5b6d6dSopenharmony_ci # What remains are unusual default values for unassigned code points. 10222e5b6d6dSopenharmony_ci # For example, bc=R or lb=ID. 10232e5b6d6dSopenharmony_ci # See http://www.unicode.org/reports/tr44/#Default_Values_Table 10242e5b6d6dSopenharmony_ci props["unassigned"] = True 10252e5b6d6dSopenharmony_ci else: 10262e5b6d6dSopenharmony_ci for (pname, value) in props.items(): 10272e5b6d6dSopenharmony_ci if pname in prop_counters: 10282e5b6d6dSopenharmony_ci counter = prop_counters[pname] 10292e5b6d6dSopenharmony_ci else: 10302e5b6d6dSopenharmony_ci counter = {_null_or_defaults[pname]: num_ranges} 10312e5b6d6dSopenharmony_ci prop_counters[pname] = counter 10322e5b6d6dSopenharmony_ci if value in counter: 10332e5b6d6dSopenharmony_ci counter[value] += 1 10342e5b6d6dSopenharmony_ci else: 10352e5b6d6dSopenharmony_ci counter[value] = 1 10362e5b6d6dSopenharmony_ci # Also count default values for properties that do not occur in a range. 10372e5b6d6dSopenharmony_ci for pname in prop_counters: 10382e5b6d6dSopenharmony_ci if pname not in props: 10392e5b6d6dSopenharmony_ci counter = prop_counters[pname] 10402e5b6d6dSopenharmony_ci value = _null_or_defaults[pname] 10412e5b6d6dSopenharmony_ci counter[value] += 1 10422e5b6d6dSopenharmony_ci num_ranges += 1 10432e5b6d6dSopenharmony_ci # Invariant: For each counter, the sum of counts must equal num_ranges. 10442e5b6d6dSopenharmony_ci i += 1 10452e5b6d6dSopenharmony_ci # For each property that occurs within this block, 10462e5b6d6dSopenharmony_ci # set the value that reduces the file size the most as a block property value. 10472e5b6d6dSopenharmony_ci # This is usually the most common value. 10482e5b6d6dSopenharmony_ci for (pname, counter) in prop_counters.items(): 10492e5b6d6dSopenharmony_ci default_value = _null_or_defaults[pname] 10502e5b6d6dSopenharmony_ci default_size = PrintedSize(pname, default_value) * counter[default_value] 10512e5b6d6dSopenharmony_ci max_value = None 10522e5b6d6dSopenharmony_ci max_count = 0 10532e5b6d6dSopenharmony_ci max_savings = 0 10542e5b6d6dSopenharmony_ci for (value, count) in counter.items(): 10552e5b6d6dSopenharmony_ci if value != default_value and count > 1: 10562e5b6d6dSopenharmony_ci # Does the file get smaller by setting the block default? 10572e5b6d6dSopenharmony_ci # We save writing the block value as often as it occurs, 10582e5b6d6dSopenharmony_ci # minus once for writing it for the block, 10592e5b6d6dSopenharmony_ci # minus writing the default value instead. 10602e5b6d6dSopenharmony_ci savings = PrintedSize(pname, value) * (count - 1) - default_size 10612e5b6d6dSopenharmony_ci # For two values with the same savings, pick the one that compares lower, 10622e5b6d6dSopenharmony_ci # to make this deterministic (avoid flip-flopping). 10632e5b6d6dSopenharmony_ci if (savings > max_savings or 10642e5b6d6dSopenharmony_ci (savings > 0 and savings == max_savings and value < max_value)): 10652e5b6d6dSopenharmony_ci max_value = value 10662e5b6d6dSopenharmony_ci max_count = count 10672e5b6d6dSopenharmony_ci max_savings = savings 10682e5b6d6dSopenharmony_ci # Do not compress uncompressible properties, 10692e5b6d6dSopenharmony_ci # with an exception for many empty-string values in a block 10702e5b6d6dSopenharmony_ci # (NFKC_CF='' for tags and variation selectors). 10712e5b6d6dSopenharmony_ci if (max_savings > 0 and 10722e5b6d6dSopenharmony_ci ((pname not in _uncompressible_props) or 10732e5b6d6dSopenharmony_ci (max_value == '' and max_count >= 12))): 10742e5b6d6dSopenharmony_ci b_props[pname] = max_value 10752e5b6d6dSopenharmony_ci # For each range and property, remove the default+block value 10762e5b6d6dSopenharmony_ci # but set the default value if that property was not set 10772e5b6d6dSopenharmony_ci # (i.e., it used to inherit the default value). 10782e5b6d6dSopenharmony_ci b_defaults = _null_or_defaults.copy() 10792e5b6d6dSopenharmony_ci b_defaults.update(b_props) 10802e5b6d6dSopenharmony_ci i = orig_i 10812e5b6d6dSopenharmony_ci while True: 10822e5b6d6dSopenharmony_ci start = _starts[i] 10832e5b6d6dSopenharmony_ci if start > b[1]: break 10842e5b6d6dSopenharmony_ci props = _props[i] 10852e5b6d6dSopenharmony_ci if "unassigned" not in props: 10862e5b6d6dSopenharmony_ci # Compact an assigned range inside the block. 10872e5b6d6dSopenharmony_ci for pname in prop_counters: 10882e5b6d6dSopenharmony_ci if pname in props: 10892e5b6d6dSopenharmony_ci if props[pname] == b_defaults[pname]: del props[pname] 10902e5b6d6dSopenharmony_ci elif pname in b_props: 10912e5b6d6dSopenharmony_ci # b_props only has non-default values. 10922e5b6d6dSopenharmony_ci # Set the default value if it used to be inherited. 10932e5b6d6dSopenharmony_ci props[pname] = _null_or_defaults[pname] 10942e5b6d6dSopenharmony_ci # If there is only one assigned range, then move all of its properties 10952e5b6d6dSopenharmony_ci # to the block. 10962e5b6d6dSopenharmony_ci if num_ranges == 1: 10972e5b6d6dSopenharmony_ci b_props.update(props) 10982e5b6d6dSopenharmony_ci props.clear() 10992e5b6d6dSopenharmony_ci i += 1 11002e5b6d6dSopenharmony_ci # Return the _starts index of the first range after this block. 11012e5b6d6dSopenharmony_ci return i 11022e5b6d6dSopenharmony_ci 11032e5b6d6dSopenharmony_ci 11042e5b6d6dSopenharmony_cidef CompactNonBlock(limit, i): 11052e5b6d6dSopenharmony_ci """Remove default property values from between-block ranges.""" 11062e5b6d6dSopenharmony_ci default_is_unassigned = _defaults["gc"] == "Cn" # This is expected to be true. 11072e5b6d6dSopenharmony_ci while True: 11082e5b6d6dSopenharmony_ci start = _starts[i] 11092e5b6d6dSopenharmony_ci if start >= limit: break 11102e5b6d6dSopenharmony_ci props = _props[i] 11112e5b6d6dSopenharmony_ci if "gc" in props: 11122e5b6d6dSopenharmony_ci is_unassigned = props["gc"] == "Cn" 11132e5b6d6dSopenharmony_ci else: 11142e5b6d6dSopenharmony_ci is_unassigned = default_is_unassigned 11152e5b6d6dSopenharmony_ci for pname in list(props.keys()): # .keys() is a copy so we can del props[pname]. 11162e5b6d6dSopenharmony_ci if props[pname] == _null_or_defaults[pname]: del props[pname] 11172e5b6d6dSopenharmony_ci assert "blk" not in props 11182e5b6d6dSopenharmony_ci # If there are no props left, then nothing will be printed. 11192e5b6d6dSopenharmony_ci # Otherwise, add "unassigned" for more obvious output. 11202e5b6d6dSopenharmony_ci if props and is_unassigned: 11212e5b6d6dSopenharmony_ci props["unassigned"] = True 11222e5b6d6dSopenharmony_ci i += 1 11232e5b6d6dSopenharmony_ci # Return the _starts index of the first range after this block. 11242e5b6d6dSopenharmony_ci return i 11252e5b6d6dSopenharmony_ci 11262e5b6d6dSopenharmony_ci 11272e5b6d6dSopenharmony_cidef CompactBlocks(): 11282e5b6d6dSopenharmony_ci """Optimizes block properties. 11292e5b6d6dSopenharmony_ci Sets properties on blocks to the most commonly used values, 11302e5b6d6dSopenharmony_ci and removes default+block values from code point properties.""" 11312e5b6d6dSopenharmony_ci # Ensure that there is a boundary in _starts for each block 11322e5b6d6dSopenharmony_ci # so that the simple mixing method below works. 11332e5b6d6dSopenharmony_ci for b in _blocks: 11342e5b6d6dSopenharmony_ci AddBoundary(b[0]) 11352e5b6d6dSopenharmony_ci limit = b[1] + 1 11362e5b6d6dSopenharmony_ci if limit <= 0x10ffff: AddBoundary(limit) 11372e5b6d6dSopenharmony_ci # Walk through ranges and blocks together. 11382e5b6d6dSopenharmony_ci i = 0 11392e5b6d6dSopenharmony_ci for b in _blocks: 11402e5b6d6dSopenharmony_ci b_start = b[0] 11412e5b6d6dSopenharmony_ci if _starts[i] < b_start: 11422e5b6d6dSopenharmony_ci i = CompactNonBlock(b_start, i) 11432e5b6d6dSopenharmony_ci i = CompactBlock(b, i) 11442e5b6d6dSopenharmony_ci CompactNonBlock(0x110000, i) 11452e5b6d6dSopenharmony_ci 11462e5b6d6dSopenharmony_ci# Output ------------------------------------------------------------------- *** 11472e5b6d6dSopenharmony_ci 11482e5b6d6dSopenharmony_cidef AppendRange(fields, start, end): 11492e5b6d6dSopenharmony_ci if start == end: 11502e5b6d6dSopenharmony_ci fields.append("%04lX" % start) 11512e5b6d6dSopenharmony_ci else: 11522e5b6d6dSopenharmony_ci fields.append("%04lX..%04lX" % (start, end)) 11532e5b6d6dSopenharmony_ci 11542e5b6d6dSopenharmony_ci 11552e5b6d6dSopenharmony_cidef AppendProps(fields, props): 11562e5b6d6dSopenharmony_ci # Sort property names (props keys) by their normalized forms 11572e5b6d6dSopenharmony_ci # and output properties in that order. 11582e5b6d6dSopenharmony_ci for pname in sorted(props, key=NormPropName): 11592e5b6d6dSopenharmony_ci value = props[pname] 11602e5b6d6dSopenharmony_ci if isinstance(value, bool): 11612e5b6d6dSopenharmony_ci if not value: pname = "-" + pname 11622e5b6d6dSopenharmony_ci fields.append(pname) 11632e5b6d6dSopenharmony_ci else: 11642e5b6d6dSopenharmony_ci fields.append("%s=%s" % (pname, value)) 11652e5b6d6dSopenharmony_ci 11662e5b6d6dSopenharmony_ci 11672e5b6d6dSopenharmony_cidef WriteFieldsRangeProps(fields, start, end, props, out_file): 11682e5b6d6dSopenharmony_ci AppendRange(fields, start, end) 11692e5b6d6dSopenharmony_ci AppendProps(fields, props) 11702e5b6d6dSopenharmony_ci out_file.write(";".join(fields)) 11712e5b6d6dSopenharmony_ci out_file.write("\n") 11722e5b6d6dSopenharmony_ci 11732e5b6d6dSopenharmony_ci 11742e5b6d6dSopenharmony_cidef EscapeNonASCII(s): 11752e5b6d6dSopenharmony_ci i = 0 11762e5b6d6dSopenharmony_ci while i < len(s): 11772e5b6d6dSopenharmony_ci c = ord(s[i]) 11782e5b6d6dSopenharmony_ci if c <= 0x7f: 11792e5b6d6dSopenharmony_ci i = i + 1 11802e5b6d6dSopenharmony_ci else: 11812e5b6d6dSopenharmony_ci if c <= 0xffff: 11822e5b6d6dSopenharmony_ci esc = u"\\u%04X" % c 11832e5b6d6dSopenharmony_ci else: 11842e5b6d6dSopenharmony_ci esc = u"\\U%08X" % c 11852e5b6d6dSopenharmony_ci s = s[:i] + esc + s[i+1:] 11862e5b6d6dSopenharmony_ci i = i + len(esc) 11872e5b6d6dSopenharmony_ci return s 11882e5b6d6dSopenharmony_ci 11892e5b6d6dSopenharmony_ci 11902e5b6d6dSopenharmony_cidef WritePreparsedUCD(out_file): 11912e5b6d6dSopenharmony_ci out_file.write("""# Preparsed UCD generated by ICU preparseucd.py 11922e5b6d6dSopenharmony_ci# Copyright (C) 1991 and later: Unicode, Inc. and others. 11932e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html 11942e5b6d6dSopenharmony_ci"""); 11952e5b6d6dSopenharmony_ci out_file.write("ucd;%s\n\n" % _ucd_version) 11962e5b6d6dSopenharmony_ci # Sort property names (props keys) by their normalized forms 11972e5b6d6dSopenharmony_ci # and output properties in that order. 11982e5b6d6dSopenharmony_ci pnames = sorted(_null_values, key=NormPropName) 11992e5b6d6dSopenharmony_ci for pname in pnames: 12002e5b6d6dSopenharmony_ci prop = _properties[pname] 12012e5b6d6dSopenharmony_ci out_file.write(";".join(["property", prop[0]] + prop[1])) 12022e5b6d6dSopenharmony_ci out_file.write("\n") 12032e5b6d6dSopenharmony_ci out_file.write("\n") 12042e5b6d6dSopenharmony_ci out_file.write(";".join(["binary"] + _binary_values["N"])) 12052e5b6d6dSopenharmony_ci out_file.write("\n") 12062e5b6d6dSopenharmony_ci out_file.write(";".join(["binary"] + _binary_values["Y"])) 12072e5b6d6dSopenharmony_ci out_file.write("\n") 12082e5b6d6dSopenharmony_ci for pname in pnames: 12092e5b6d6dSopenharmony_ci prop = _properties[pname] 12102e5b6d6dSopenharmony_ci short_names = prop[2] 12112e5b6d6dSopenharmony_ci if short_names and prop[0] != "Binary": 12122e5b6d6dSopenharmony_ci for name in sorted(short_names): 12132e5b6d6dSopenharmony_ci out_file.write(";".join(["value", prop[1][0]] + prop[3][name])) 12142e5b6d6dSopenharmony_ci out_file.write("\n") 12152e5b6d6dSopenharmony_ci out_file.write("\n") 12162e5b6d6dSopenharmony_ci # Ensure that there is a boundary in _starts for each 12172e5b6d6dSopenharmony_ci # range of data we mix into the output, 12182e5b6d6dSopenharmony_ci # so that the simple mixing method below works. 12192e5b6d6dSopenharmony_ci for b in _blocks: AddBoundary(b[0]) 12202e5b6d6dSopenharmony_ci for r in _alg_names_ranges: AddBoundary(r[0]) 12212e5b6d6dSopenharmony_ci for h in _h1: AddBoundary(h[0]) 12222e5b6d6dSopenharmony_ci for h in _h2: AddBoundary(h[0]) 12232e5b6d6dSopenharmony_ci # Write the preparsed data. ppucd.txt = preparsed UCD 12242e5b6d6dSopenharmony_ci # Syntax: http://site.icu-project.org/design/props/ppucd 12252e5b6d6dSopenharmony_ci WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file) 12262e5b6d6dSopenharmony_ci i_blocks = 0 12272e5b6d6dSopenharmony_ci i_alg = 0 12282e5b6d6dSopenharmony_ci i_h1 = 0 12292e5b6d6dSopenharmony_ci i_h2 = 0 12302e5b6d6dSopenharmony_ci b_end = -1 12312e5b6d6dSopenharmony_ci for i in range(len(_starts) - 1): 12322e5b6d6dSopenharmony_ci start = _starts[i] 12332e5b6d6dSopenharmony_ci end = _starts[i + 1] - 1 12342e5b6d6dSopenharmony_ci # Block with default properties. 12352e5b6d6dSopenharmony_ci if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]: 12362e5b6d6dSopenharmony_ci b = _blocks[i_blocks] 12372e5b6d6dSopenharmony_ci b_end = b[1] 12382e5b6d6dSopenharmony_ci WriteFieldsRangeProps(["\nblock"], b[0], b_end, b[2], out_file) 12392e5b6d6dSopenharmony_ci i_blocks += 1 12402e5b6d6dSopenharmony_ci # NamesList h1 heading (for [most of] a block). 12412e5b6d6dSopenharmony_ci if i_h1 < len(_h1) and start == _h1[i_h1][0]: 12422e5b6d6dSopenharmony_ci h = _h1[i_h1] 12432e5b6d6dSopenharmony_ci out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2]))) 12442e5b6d6dSopenharmony_ci i_h1 += 1 12452e5b6d6dSopenharmony_ci # Algorithmic-names range. 12462e5b6d6dSopenharmony_ci if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]: 12472e5b6d6dSopenharmony_ci r = _alg_names_ranges[i_alg] 12482e5b6d6dSopenharmony_ci fields = ["algnamesrange"] 12492e5b6d6dSopenharmony_ci AppendRange(fields, r[0], r[1]) 12502e5b6d6dSopenharmony_ci fields.extend(r[2:]) 12512e5b6d6dSopenharmony_ci out_file.write(";".join(fields)) 12522e5b6d6dSopenharmony_ci out_file.write("\n") 12532e5b6d6dSopenharmony_ci i_alg += 1 12542e5b6d6dSopenharmony_ci # NamesList h2 heading. 12552e5b6d6dSopenharmony_ci if i_h2 < len(_h2) and start == _h2[i_h2][0]: 12562e5b6d6dSopenharmony_ci out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1])) 12572e5b6d6dSopenharmony_ci i_h2 += 1 12582e5b6d6dSopenharmony_ci # Code point/range data. 12592e5b6d6dSopenharmony_ci props = _props[i] 12602e5b6d6dSopenharmony_ci # Omit ranges with only default+block properties. 12612e5b6d6dSopenharmony_ci if props: 12622e5b6d6dSopenharmony_ci if start > b_end and b_end >= 0: 12632e5b6d6dSopenharmony_ci # First range with values after the last block. 12642e5b6d6dSopenharmony_ci # Separate it visually from the block lines. 12652e5b6d6dSopenharmony_ci out_file.write("\n# No block\n") 12662e5b6d6dSopenharmony_ci b_end = -1 12672e5b6d6dSopenharmony_ci if "unassigned" in props: 12682e5b6d6dSopenharmony_ci # Do not output "unassigned" as a property. 12692e5b6d6dSopenharmony_ci del props["unassigned"] 12702e5b6d6dSopenharmony_ci line_type = "unassigned" 12712e5b6d6dSopenharmony_ci else: 12722e5b6d6dSopenharmony_ci line_type = "cp" 12732e5b6d6dSopenharmony_ci WriteFieldsRangeProps([line_type], start, end, props, out_file) 12742e5b6d6dSopenharmony_ci 12752e5b6d6dSopenharmony_ci# Write Normalizer2 input files -------------------------------------------- *** 12762e5b6d6dSopenharmony_ci# Ported from gennorm/store.c. 12772e5b6d6dSopenharmony_ci 12782e5b6d6dSopenharmony_cidef WriteAllCC(out_file): 12792e5b6d6dSopenharmony_ci out_file.write("# Canonical_Combining_Class (ccc) values\n"); 12802e5b6d6dSopenharmony_ci prev_start = 0 12812e5b6d6dSopenharmony_ci prev_cc = 0 12822e5b6d6dSopenharmony_ci for i in range(len(_starts)): 12832e5b6d6dSopenharmony_ci start = _starts[i] 12842e5b6d6dSopenharmony_ci props = _props[i] 12852e5b6d6dSopenharmony_ci cc = props.get("ccc") 12862e5b6d6dSopenharmony_ci if not cc: cc = 0 12872e5b6d6dSopenharmony_ci if prev_cc != cc: 12882e5b6d6dSopenharmony_ci if prev_cc != 0: 12892e5b6d6dSopenharmony_ci last_code_point = start - 1 12902e5b6d6dSopenharmony_ci if prev_start == last_code_point: 12912e5b6d6dSopenharmony_ci out_file.write("%04X:%d\n" % (last_code_point, prev_cc)) 12922e5b6d6dSopenharmony_ci else: 12932e5b6d6dSopenharmony_ci out_file.write("%04X..%04X:%d\n" % 12942e5b6d6dSopenharmony_ci (prev_start, last_code_point, prev_cc)) 12952e5b6d6dSopenharmony_ci prev_start = start 12962e5b6d6dSopenharmony_ci prev_cc = cc 12972e5b6d6dSopenharmony_ci 12982e5b6d6dSopenharmony_ci 12992e5b6d6dSopenharmony_cidef HasMapping(c): 13002e5b6d6dSopenharmony_ci props = GetProps(c) 13012e5b6d6dSopenharmony_ci dt = props.get("dt") 13022e5b6d6dSopenharmony_ci return dt and dt != "None" 13032e5b6d6dSopenharmony_ci 13042e5b6d6dSopenharmony_ci 13052e5b6d6dSopenharmony_cidef HasOneWayMapping(c): 13062e5b6d6dSopenharmony_ci while True: 13072e5b6d6dSopenharmony_ci props = GetProps(c) 13082e5b6d6dSopenharmony_ci dt = props.get("dt") 13092e5b6d6dSopenharmony_ci if not dt or dt == "None": 13102e5b6d6dSopenharmony_ci return False # no mapping 13112e5b6d6dSopenharmony_ci elif dt == "Can": 13122e5b6d6dSopenharmony_ci # The canonical decomposition is a one-way mapping if 13132e5b6d6dSopenharmony_ci # - it does not map to exactly two code points 13142e5b6d6dSopenharmony_ci # - c has ccc!=0 13152e5b6d6dSopenharmony_ci # - c has the Composition_Exclusion property 13162e5b6d6dSopenharmony_ci # - its starter has a one-way mapping (loop for this) 13172e5b6d6dSopenharmony_ci # - its non-starter decomposes 13182e5b6d6dSopenharmony_ci nfd = props["dm"].split() 13192e5b6d6dSopenharmony_ci if (len(nfd) != 2 or 13202e5b6d6dSopenharmony_ci props.get("ccc") or 13212e5b6d6dSopenharmony_ci props.get("Comp_Ex") or 13222e5b6d6dSopenharmony_ci HasMapping(int(nfd[1], 16))): 13232e5b6d6dSopenharmony_ci return True 13242e5b6d6dSopenharmony_ci c = int(nfd[0], 16) # continue 13252e5b6d6dSopenharmony_ci else: 13262e5b6d6dSopenharmony_ci # c has a compatibility mapping. 13272e5b6d6dSopenharmony_ci return True 13282e5b6d6dSopenharmony_ci 13292e5b6d6dSopenharmony_ci 13302e5b6d6dSopenharmony_ci_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others. 13312e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html 13322e5b6d6dSopenharmony_ci# Copyright (C) 1999-2016, International Business Machines 13332e5b6d6dSopenharmony_ci# Corporation and others. All Rights Reserved. 13342e5b6d6dSopenharmony_ci# 13352e5b6d6dSopenharmony_ci""" 13362e5b6d6dSopenharmony_ci 13372e5b6d6dSopenharmony_cidef WriteNorm2NFCTextFile(path): 13382e5b6d6dSopenharmony_ci global _data_file_copyright 13392e5b6d6dSopenharmony_ci with open(os.path.join(path, "nfc.txt"), "w") as out_file: 13402e5b6d6dSopenharmony_ci out_file.write( 13412e5b6d6dSopenharmony_ci _data_file_copyright + """# file name: nfc.txt 13422e5b6d6dSopenharmony_ci# 13432e5b6d6dSopenharmony_ci# machine-generated by ICU preparseucd.py 13442e5b6d6dSopenharmony_ci# 13452e5b6d6dSopenharmony_ci# Complete data for Unicode NFC normalization. 13462e5b6d6dSopenharmony_ci 13472e5b6d6dSopenharmony_ci* Unicode """ + _ucd_version + """ 13482e5b6d6dSopenharmony_ci 13492e5b6d6dSopenharmony_ci""") 13502e5b6d6dSopenharmony_ci WriteAllCC(out_file) 13512e5b6d6dSopenharmony_ci out_file.write("\n# Canonical decomposition mappings\n") 13522e5b6d6dSopenharmony_ci for i in range(len(_starts) - 1): 13532e5b6d6dSopenharmony_ci start = _starts[i] 13542e5b6d6dSopenharmony_ci end = _starts[i + 1] - 1 13552e5b6d6dSopenharmony_ci props = _props[i] 13562e5b6d6dSopenharmony_ci dm = props.get("dm") 13572e5b6d6dSopenharmony_ci if dm and dm[0] != '<' and props["dt"] == "Can": 13582e5b6d6dSopenharmony_ci assert start == end 13592e5b6d6dSopenharmony_ci # The Comp_Ex=Full_Composition_Exclusion property tells us 13602e5b6d6dSopenharmony_ci # whether the canonical decomposition round-trips. 13612e5b6d6dSopenharmony_ci separator = '>' if props.get("Comp_Ex") else '=' 13622e5b6d6dSopenharmony_ci out_file.write("%04X%s%s\n" % (start, separator, dm)) 13632e5b6d6dSopenharmony_ci 13642e5b6d6dSopenharmony_ci 13652e5b6d6dSopenharmony_cidef WriteNorm2NFKCTextFile(path): 13662e5b6d6dSopenharmony_ci global _data_file_copyright 13672e5b6d6dSopenharmony_ci with open(os.path.join(path, "nfkc.txt"), "w") as out_file: 13682e5b6d6dSopenharmony_ci out_file.write( 13692e5b6d6dSopenharmony_ci _data_file_copyright + """# file name: nfkc.txt 13702e5b6d6dSopenharmony_ci# 13712e5b6d6dSopenharmony_ci# machine-generated by ICU preparseucd.py 13722e5b6d6dSopenharmony_ci# 13732e5b6d6dSopenharmony_ci# Data for Unicode NFKC normalization. 13742e5b6d6dSopenharmony_ci# This file contains only compatibility decomposition mappings, 13752e5b6d6dSopenharmony_ci# plus those canonical decompositions that change from NFC round-trip mappings 13762e5b6d6dSopenharmony_ci# to NFKC one-way mappings. 13772e5b6d6dSopenharmony_ci# Use this file as the second gennorm2 input file after nfc.txt. 13782e5b6d6dSopenharmony_ci 13792e5b6d6dSopenharmony_ci* Unicode """ + _ucd_version + """ 13802e5b6d6dSopenharmony_ci 13812e5b6d6dSopenharmony_ci""") 13822e5b6d6dSopenharmony_ci for i in range(len(_starts) - 1): 13832e5b6d6dSopenharmony_ci start = _starts[i] 13842e5b6d6dSopenharmony_ci end = _starts[i + 1] - 1 13852e5b6d6dSopenharmony_ci props = _props[i] 13862e5b6d6dSopenharmony_ci dm = props.get("dm") 13872e5b6d6dSopenharmony_ci if dm and dm[0] != '<': 13882e5b6d6dSopenharmony_ci assert start == end 13892e5b6d6dSopenharmony_ci if props["dt"] != "Can": 13902e5b6d6dSopenharmony_ci # Compatibility decomposition. 13912e5b6d6dSopenharmony_ci out_file.write("%04X>%s\n" % (start, dm)) 13922e5b6d6dSopenharmony_ci elif not props.get("Comp_Ex") and HasOneWayMapping(start): 13932e5b6d6dSopenharmony_ci # NFC round-trip mapping turns into NFKC one-way mapping. 13942e5b6d6dSopenharmony_ci out_file.write("%04X>%s # NFC round-trip, NFKC one-way\n" % 13952e5b6d6dSopenharmony_ci (start, dm)) 13962e5b6d6dSopenharmony_ci 13972e5b6d6dSopenharmony_ci 13982e5b6d6dSopenharmony_cidef WriteNorm2NFKC_CFTextFile(path): 13992e5b6d6dSopenharmony_ci global _data_file_copyright 14002e5b6d6dSopenharmony_ci with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file: 14012e5b6d6dSopenharmony_ci out_file.write( 14022e5b6d6dSopenharmony_ci _data_file_copyright + """# file name: nfkc_cf.txt 14032e5b6d6dSopenharmony_ci# 14042e5b6d6dSopenharmony_ci# machine-generated by ICU preparseucd.py 14052e5b6d6dSopenharmony_ci# 14062e5b6d6dSopenharmony_ci# This file contains the Unicode NFKC_CF mappings, 14072e5b6d6dSopenharmony_ci# extracted from the UCD file DerivedNormalizationProps.txt, 14082e5b6d6dSopenharmony_ci# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool. 14092e5b6d6dSopenharmony_ci# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt. 14102e5b6d6dSopenharmony_ci 14112e5b6d6dSopenharmony_ci""") 14122e5b6d6dSopenharmony_ci out_file.write("* Unicode " + _ucd_version + "\n\n") 14132e5b6d6dSopenharmony_ci prev_start = 0 14142e5b6d6dSopenharmony_ci prev_end = 0 14152e5b6d6dSopenharmony_ci prev_nfkc_cf = None 14162e5b6d6dSopenharmony_ci for i in range(len(_starts) - 1): 14172e5b6d6dSopenharmony_ci start = _starts[i] 14182e5b6d6dSopenharmony_ci end = _starts[i + 1] - 1 14192e5b6d6dSopenharmony_ci props = _props[i] 14202e5b6d6dSopenharmony_ci nfkc_cf = props.get("NFKC_CF") 14212e5b6d6dSopenharmony_ci # Merge with the previous range if possible, 14222e5b6d6dSopenharmony_ci # or remember this range for merging. 14232e5b6d6dSopenharmony_ci if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start: 14242e5b6d6dSopenharmony_ci prev_end = end 14252e5b6d6dSopenharmony_ci else: 14262e5b6d6dSopenharmony_ci if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'): 14272e5b6d6dSopenharmony_ci if prev_start == prev_end: 14282e5b6d6dSopenharmony_ci out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf)) 14292e5b6d6dSopenharmony_ci else: 14302e5b6d6dSopenharmony_ci out_file.write("%04X..%04X>%s\n" % 14312e5b6d6dSopenharmony_ci (prev_start, prev_end, prev_nfkc_cf)) 14322e5b6d6dSopenharmony_ci prev_start = start 14332e5b6d6dSopenharmony_ci prev_end = end 14342e5b6d6dSopenharmony_ci prev_nfkc_cf = nfkc_cf 14352e5b6d6dSopenharmony_ci 14362e5b6d6dSopenharmony_ci 14372e5b6d6dSopenharmony_cidef WriteNorm2(path): 14382e5b6d6dSopenharmony_ci WriteNorm2NFCTextFile(path) 14392e5b6d6dSopenharmony_ci WriteNorm2NFKCTextFile(path) 14402e5b6d6dSopenharmony_ci WriteNorm2NFKC_CFTextFile(path) 14412e5b6d6dSopenharmony_ci 14422e5b6d6dSopenharmony_ci# UTS #46 Normalizer2 input file ------------------------------------------- *** 14432e5b6d6dSopenharmony_ci 14442e5b6d6dSopenharmony_ci_idna_replacements = [ 14452e5b6d6dSopenharmony_ci # Several versions of avoiding circular FFFD>FFFD mappings, 14462e5b6d6dSopenharmony_ci # depending on the version of the input file. 14472e5b6d6dSopenharmony_ci (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"), 14482e5b6d6dSopenharmony_ci (re.compile(r"\.\.FFFD"), "..FFFC"), 14492e5b6d6dSopenharmony_ci (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"), 14502e5b6d6dSopenharmony_ci # Since we switch between checking and not checking for STD3 character 14512e5b6d6dSopenharmony_ci # restrictions at runtime, checking the non-LDH ASCII characters in code, 14522e5b6d6dSopenharmony_ci # we treat these values here like their regular siblings. 14532e5b6d6dSopenharmony_ci (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"), 14542e5b6d6dSopenharmony_ci (re.compile(r"; disallowed_STD3_mapped +; "), ">"), 14552e5b6d6dSopenharmony_ci # For UTS #46, we do not care about "not valid in IDNA2008". 14562e5b6d6dSopenharmony_ci (re.compile(r"; *; NV8 +"), ""), 14572e5b6d6dSopenharmony_ci # ICU 63+ normalization no longer allows mappings for surrogate code points, 14582e5b6d6dSopenharmony_ci # and the UTS #46 code handles them instead. 14592e5b6d6dSopenharmony_ci (re.compile(r"^D800..DFFF ; disallowed"), r"# D800..DFFF disallowed in code"), 14602e5b6d6dSopenharmony_ci # Normal transformations. 14612e5b6d6dSopenharmony_ci (re.compile(r"; disallowed"), ">FFFD"), 14622e5b6d6dSopenharmony_ci (re.compile(r"; ignored"), ">"), 14632e5b6d6dSopenharmony_ci (re.compile(r"^([^;]+) ; valid"), r"# \1valid"), 14642e5b6d6dSopenharmony_ci (re.compile(r"; mapped +; "), ">"), 14652e5b6d6dSopenharmony_ci (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >") 14662e5b6d6dSopenharmony_ci] 14672e5b6d6dSopenharmony_ci 14682e5b6d6dSopenharmony_cidef IdnaToUTS46TextFile(s, t): 14692e5b6d6dSopenharmony_ci """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.""" 14702e5b6d6dSopenharmony_ci # Different input/output file names. 14712e5b6d6dSopenharmony_ci dest_path = os.path.dirname(t) 14722e5b6d6dSopenharmony_ci t = os.path.join(dest_path, "uts46.txt") 14732e5b6d6dSopenharmony_ci # TODO: With Python 2.7+, combine the two with statements into one. 14742e5b6d6dSopenharmony_ci with open(s, "r") as in_file: 14752e5b6d6dSopenharmony_ci with open(t, "w") as out_file: 14762e5b6d6dSopenharmony_ci out_file.write("# Original file:\n") 14772e5b6d6dSopenharmony_ci for line in in_file: 14782e5b6d6dSopenharmony_ci orig_line = line 14792e5b6d6dSopenharmony_ci if line.startswith("# For documentation"): 14802e5b6d6dSopenharmony_ci out_file.write(line) 14812e5b6d6dSopenharmony_ci out_file.write(r""" 14822e5b6d6dSopenharmony_ci# ================================================ 14832e5b6d6dSopenharmony_ci# This file has been reformatted into syntax for the 14842e5b6d6dSopenharmony_ci# gennorm2 Normalizer2 data generator tool. 14852e5b6d6dSopenharmony_ci# 14862e5b6d6dSopenharmony_ci# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out. 14872e5b6d6dSopenharmony_ci# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax. 14882e5b6d6dSopenharmony_ci# "disallowed" lines map to U+FFFD. 14892e5b6d6dSopenharmony_ci# "ignored" lines map to an empty string. 14902e5b6d6dSopenharmony_ci# 14912e5b6d6dSopenharmony_ci# Characters disallowed under STD3 rules are treated as valid or mapped; 14922e5b6d6dSopenharmony_ci# they are handled in code. 14932e5b6d6dSopenharmony_ci# Deviation characters are also handled in code. 14942e5b6d6dSopenharmony_ci# 14952e5b6d6dSopenharmony_ci# Use this file as the second gennorm2 input file after nfc.txt. 14962e5b6d6dSopenharmony_ci# ================================================ 14972e5b6d6dSopenharmony_ci""") 14982e5b6d6dSopenharmony_ci continue 14992e5b6d6dSopenharmony_ci if line[0] in "#\r\n": 15002e5b6d6dSopenharmony_ci out_file.write(line) 15012e5b6d6dSopenharmony_ci continue 15022e5b6d6dSopenharmony_ci for rep in _idna_replacements: line = rep[0].sub(rep[1], line) 15032e5b6d6dSopenharmony_ci # Align inline comments at column 40. 15042e5b6d6dSopenharmony_ci comment_pos = line.find("#", 1) 15052e5b6d6dSopenharmony_ci if comment_pos < 40: 15062e5b6d6dSopenharmony_ci line = (line[:comment_pos] + ((40 - comment_pos) * ' ') + 15072e5b6d6dSopenharmony_ci line[comment_pos:]) 15082e5b6d6dSopenharmony_ci elif comment_pos > 40: 15092e5b6d6dSopenharmony_ci space_pos = comment_pos 15102e5b6d6dSopenharmony_ci while space_pos > 0 and line[space_pos - 1] == ' ': 15112e5b6d6dSopenharmony_ci space_pos = space_pos - 1 15122e5b6d6dSopenharmony_ci if space_pos < 40: 15132e5b6d6dSopenharmony_ci # Fewer than 40 characters before the comment: 15142e5b6d6dSopenharmony_ci # Align comments at column 40. 15152e5b6d6dSopenharmony_ci line = line[:40] + line[comment_pos:] 15162e5b6d6dSopenharmony_ci else: 15172e5b6d6dSopenharmony_ci # 40 or more characters before the comment: 15182e5b6d6dSopenharmony_ci # Keep one space between contents and comment. 15192e5b6d6dSopenharmony_ci line = line[:space_pos] + " " + line[comment_pos:] 15202e5b6d6dSopenharmony_ci # Write the modified line. 15212e5b6d6dSopenharmony_ci out_file.write(line) 15222e5b6d6dSopenharmony_ci if "..FFFF" in orig_line and "..FFFC" in line: 15232e5b6d6dSopenharmony_ci out_file.write("FFFE..FFFF >FFFD\n"); 15242e5b6d6dSopenharmony_ci return t 15252e5b6d6dSopenharmony_ci 15262e5b6d6dSopenharmony_ci# Preprocessing ------------------------------------------------------------ *** 15272e5b6d6dSopenharmony_ci 15282e5b6d6dSopenharmony_ci_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*") 15292e5b6d6dSopenharmony_ci_code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;") 15302e5b6d6dSopenharmony_ci 15312e5b6d6dSopenharmony_cidef CopyAndStripWithOptionalMerge(s, t, do_merge): 15322e5b6d6dSopenharmony_ci # TODO: We do not seem to need the do_merge argument and logic any more. 15332e5b6d6dSopenharmony_ci with open(s, "r") as in_file, open(t, "w") as out_file: 15342e5b6d6dSopenharmony_ci first = -1 # First code point with first_data. 15352e5b6d6dSopenharmony_ci last = -1 # Last code point with first_data. 15362e5b6d6dSopenharmony_ci first_data = "" # Common data for code points [first..last]. 15372e5b6d6dSopenharmony_ci for line in in_file: 15382e5b6d6dSopenharmony_ci match = _strip_re.match(line) 15392e5b6d6dSopenharmony_ci if match: 15402e5b6d6dSopenharmony_ci line = match.group(1) 15412e5b6d6dSopenharmony_ci else: 15422e5b6d6dSopenharmony_ci line = line.rstrip() 15432e5b6d6dSopenharmony_ci if do_merge: 15442e5b6d6dSopenharmony_ci match = _code_point_re.match(line) 15452e5b6d6dSopenharmony_ci if match: 15462e5b6d6dSopenharmony_ci c = int(match.group(1), 16) 15472e5b6d6dSopenharmony_ci data = line[match.end() - 1:] 15482e5b6d6dSopenharmony_ci else: 15492e5b6d6dSopenharmony_ci c = -1 15502e5b6d6dSopenharmony_ci data = "" 15512e5b6d6dSopenharmony_ci if last >= 0 and (c != (last + 1) or data != first_data): 15522e5b6d6dSopenharmony_ci # output the current range 15532e5b6d6dSopenharmony_ci if first == last: 15542e5b6d6dSopenharmony_ci out_file.write("%04X%s\n" % (first, first_data)) 15552e5b6d6dSopenharmony_ci else: 15562e5b6d6dSopenharmony_ci out_file.write("%04X..%04X%s\n" % (first, last, first_data)) 15572e5b6d6dSopenharmony_ci first = -1 15582e5b6d6dSopenharmony_ci last = -1 15592e5b6d6dSopenharmony_ci first_data = "" 15602e5b6d6dSopenharmony_ci if c < 0: 15612e5b6d6dSopenharmony_ci # no data on this line, output as is 15622e5b6d6dSopenharmony_ci out_file.write(line) 15632e5b6d6dSopenharmony_ci out_file.write("\n") 15642e5b6d6dSopenharmony_ci else: 15652e5b6d6dSopenharmony_ci # data on this line, store for possible range compaction 15662e5b6d6dSopenharmony_ci if last < 0: 15672e5b6d6dSopenharmony_ci # set as the first line in a possible range 15682e5b6d6dSopenharmony_ci first = c 15692e5b6d6dSopenharmony_ci last = c 15702e5b6d6dSopenharmony_ci first_data = data 15712e5b6d6dSopenharmony_ci else: 15722e5b6d6dSopenharmony_ci # must be c == (last + 1) and data == first_data 15732e5b6d6dSopenharmony_ci # because of previous conditions 15742e5b6d6dSopenharmony_ci # continue with the current range 15752e5b6d6dSopenharmony_ci last = c 15762e5b6d6dSopenharmony_ci else: 15772e5b6d6dSopenharmony_ci # Only strip, don't merge: just output the stripped line. 15782e5b6d6dSopenharmony_ci out_file.write(line) 15792e5b6d6dSopenharmony_ci out_file.write("\n") 15802e5b6d6dSopenharmony_ci if do_merge and last >= 0: 15812e5b6d6dSopenharmony_ci # output the last range in the file 15822e5b6d6dSopenharmony_ci if first == last: 15832e5b6d6dSopenharmony_ci out_file.write("%04X%s\n" % (first, first_data)) 15842e5b6d6dSopenharmony_ci else: 15852e5b6d6dSopenharmony_ci out_file.write("%04X..%04X%s\n" % (first, last, first_data)) 15862e5b6d6dSopenharmony_ci first = -1 15872e5b6d6dSopenharmony_ci last = -1 15882e5b6d6dSopenharmony_ci first_data = "" 15892e5b6d6dSopenharmony_ci out_file.flush() 15902e5b6d6dSopenharmony_ci return t 15912e5b6d6dSopenharmony_ci 15922e5b6d6dSopenharmony_ci 15932e5b6d6dSopenharmony_cidef CopyAndStrip(s, t): 15942e5b6d6dSopenharmony_ci """Copies a file and removes comments behind data lines but not in others.""" 15952e5b6d6dSopenharmony_ci return CopyAndStripWithOptionalMerge(s, t, False) 15962e5b6d6dSopenharmony_ci 15972e5b6d6dSopenharmony_ci 15982e5b6d6dSopenharmony_cidef CopyAndStripAndMerge(s, t): 15992e5b6d6dSopenharmony_ci """Copies and strips a file and merges lines. 16002e5b6d6dSopenharmony_ci 16012e5b6d6dSopenharmony_ci Copies a file, removes comments, and 16022e5b6d6dSopenharmony_ci merges lines with adjacent code point ranges and identical per-code point 16032e5b6d6dSopenharmony_ci data lines into one line with range syntax. 16042e5b6d6dSopenharmony_ci """ 16052e5b6d6dSopenharmony_ci return CopyAndStripWithOptionalMerge(s, t, True) 16062e5b6d6dSopenharmony_ci 16072e5b6d6dSopenharmony_ci 16082e5b6d6dSopenharmony_cidef CopyOnly(s, t): 16092e5b6d6dSopenharmony_ci shutil.copy(s, t) 16102e5b6d6dSopenharmony_ci return t 16112e5b6d6dSopenharmony_ci 16122e5b6d6dSopenharmony_ci 16132e5b6d6dSopenharmony_cidef DontCopy(s, t): 16142e5b6d6dSopenharmony_ci return s 16152e5b6d6dSopenharmony_ci 16162e5b6d6dSopenharmony_ci 16172e5b6d6dSopenharmony_ci# Each _files value is a 16182e5b6d6dSopenharmony_ci# (preprocessor, dest_folder, parser, order) tuple 16192e5b6d6dSopenharmony_ci# where all fields except the preprocessor are optional. 16202e5b6d6dSopenharmony_ci# After the initial preprocessing (copy/strip/merge), 16212e5b6d6dSopenharmony_ci# if a parser is specified, then a tuple is added to _files_to_parse 16222e5b6d6dSopenharmony_ci# at index "order" (default order 9). 16232e5b6d6dSopenharmony_ci# An explicit order number is set only for files that must be parsed 16242e5b6d6dSopenharmony_ci# before others. 16252e5b6d6dSopenharmony_ci_files = { 16262e5b6d6dSopenharmony_ci "BidiBrackets.txt": (DontCopy, ParseBidiBrackets), 16272e5b6d6dSopenharmony_ci "BidiMirroring.txt": (DontCopy, ParseBidiMirroring), 16282e5b6d6dSopenharmony_ci "BidiTest.txt": (CopyOnly, "testdata"), 16292e5b6d6dSopenharmony_ci "Blocks.txt": (DontCopy, ParseBlocks), 16302e5b6d6dSopenharmony_ci "CaseFolding.txt": (CopyOnly, ParseCaseFolding), 16312e5b6d6dSopenharmony_ci "DerivedAge.txt": (DontCopy, ParseDerivedAge), 16322e5b6d6dSopenharmony_ci "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass), 16332e5b6d6dSopenharmony_ci "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties), 16342e5b6d6dSopenharmony_ci "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup), 16352e5b6d6dSopenharmony_ci "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType), 16362e5b6d6dSopenharmony_ci "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties), 16372e5b6d6dSopenharmony_ci "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues), 16382e5b6d6dSopenharmony_ci "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth), 16392e5b6d6dSopenharmony_ci "emoji-data.txt": (DontCopy, ParseNamedProperties), 16402e5b6d6dSopenharmony_ci "emoji-sequences.txt": (CopyOnly,), 16412e5b6d6dSopenharmony_ci "emoji-zwj-sequences.txt": (CopyOnly,), 16422e5b6d6dSopenharmony_ci "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty), 16432e5b6d6dSopenharmony_ci "GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"), 16442e5b6d6dSopenharmony_ci "IdnaTestV2.txt": (CopyOnly, "testdata"), 16452e5b6d6dSopenharmony_ci "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory), 16462e5b6d6dSopenharmony_ci "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory), 16472e5b6d6dSopenharmony_ci "LineBreak.txt": (DontCopy, ParseLineBreak), 16482e5b6d6dSopenharmony_ci "LineBreakTest.txt": (CopyOnly, "testdata"), 16492e5b6d6dSopenharmony_ci "NameAliases.txt": (DontCopy, ParseNameAliases), 16502e5b6d6dSopenharmony_ci "NamesList.txt": (DontCopy, ParseNamesList), 16512e5b6d6dSopenharmony_ci "NormalizationCorrections.txt": (CopyOnly,), # Only used in gensprep. 16522e5b6d6dSopenharmony_ci "NormalizationTest.txt": (CopyAndStrip,), 16532e5b6d6dSopenharmony_ci "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0), 16542e5b6d6dSopenharmony_ci "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1), 16552e5b6d6dSopenharmony_ci "PropList.txt": (DontCopy, ParseNamedProperties), 16562e5b6d6dSopenharmony_ci "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak), 16572e5b6d6dSopenharmony_ci "SentenceBreakTest.txt": (CopyOnly, "testdata"), 16582e5b6d6dSopenharmony_ci "Scripts.txt": (DontCopy, ParseScripts), 16592e5b6d6dSopenharmony_ci "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions), 16602e5b6d6dSopenharmony_ci "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing), 16612e5b6d6dSopenharmony_ci "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2), 16622e5b6d6dSopenharmony_ci "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation), 16632e5b6d6dSopenharmony_ci "WordBreakProperty.txt": (DontCopy, ParseWordBreak), 16642e5b6d6dSopenharmony_ci "WordBreakTest.txt": (CopyOnly, "testdata"), 16652e5b6d6dSopenharmony_ci # From www.unicode.org/Public/idna/<version>/ 16662e5b6d6dSopenharmony_ci "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2") 16672e5b6d6dSopenharmony_ci} 16682e5b6d6dSopenharmony_ci 16692e5b6d6dSopenharmony_ci# List of lists of files to be parsed in order. 16702e5b6d6dSopenharmony_ci# Inner lists contain (basename, path, parser) tuples. 16712e5b6d6dSopenharmony_ci_files_to_parse = [[], [], [], [], [], [], [], [], [], []] 16722e5b6d6dSopenharmony_ci 16732e5b6d6dSopenharmony_ci# Get the standard basename from a versioned filename. 16742e5b6d6dSopenharmony_ci# For example, match "UnicodeData-6.1.0d8.txt" 16752e5b6d6dSopenharmony_ci# so we can turn it into "UnicodeData.txt". 16762e5b6d6dSopenharmony_ci_file_version_re = re.compile("([a-zA-Z0-9_-]+)" + 16772e5b6d6dSopenharmony_ci "-[0-9]+(?:\\.[0-9]+)*(?:d[0-9]+)?" + 16782e5b6d6dSopenharmony_ci "(\\.[a-z]+)$") 16792e5b6d6dSopenharmony_ci 16802e5b6d6dSopenharmony_cidef PreprocessFiles(source_files, icu4c_src_root): 16812e5b6d6dSopenharmony_ci unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata") 16822e5b6d6dSopenharmony_ci norm2_path = os.path.join(unidata_path, "norm2") 16832e5b6d6dSopenharmony_ci testdata_path = os.path.join(icu4c_src_root, "source", "test", "testdata") 16842e5b6d6dSopenharmony_ci folder_to_path = { 16852e5b6d6dSopenharmony_ci "unidata": unidata_path, 16862e5b6d6dSopenharmony_ci "norm2": norm2_path, 16872e5b6d6dSopenharmony_ci "testdata": testdata_path 16882e5b6d6dSopenharmony_ci } 16892e5b6d6dSopenharmony_ci files_processed = set() 16902e5b6d6dSopenharmony_ci for source_file in source_files: 16912e5b6d6dSopenharmony_ci (folder, basename) = os.path.split(source_file) 16922e5b6d6dSopenharmony_ci match = _file_version_re.match(basename) 16932e5b6d6dSopenharmony_ci if match: 16942e5b6d6dSopenharmony_ci new_basename = match.group(1) + match.group(2) 16952e5b6d6dSopenharmony_ci if new_basename != basename: 16962e5b6d6dSopenharmony_ci print("Removing version suffix from " + source_file) 16972e5b6d6dSopenharmony_ci # ... so that we can easily compare UCD files. 16982e5b6d6dSopenharmony_ci new_source_file = os.path.join(folder, new_basename) 16992e5b6d6dSopenharmony_ci shutil.move(source_file, new_source_file) 17002e5b6d6dSopenharmony_ci basename = new_basename 17012e5b6d6dSopenharmony_ci source_file = new_source_file 17022e5b6d6dSopenharmony_ci if basename in _files: 17032e5b6d6dSopenharmony_ci print("Preprocessing %s" % basename) 17042e5b6d6dSopenharmony_ci if basename in files_processed: 17052e5b6d6dSopenharmony_ci raise Exception("duplicate file basename %s!" % basename) 17062e5b6d6dSopenharmony_ci files_processed.add(basename) 17072e5b6d6dSopenharmony_ci value = _files[basename] 17082e5b6d6dSopenharmony_ci preprocessor = value[0] 17092e5b6d6dSopenharmony_ci if len(value) >= 2 and isinstance(value[1], (str)): 17102e5b6d6dSopenharmony_ci # The value was [preprocessor, dest_folder, ...], leave [...]. 17112e5b6d6dSopenharmony_ci dest_folder = value[1] 17122e5b6d6dSopenharmony_ci value = value[2:] 17132e5b6d6dSopenharmony_ci else: 17142e5b6d6dSopenharmony_ci # The value was [preprocessor, ...], leave [...]. 17152e5b6d6dSopenharmony_ci dest_folder = "unidata" 17162e5b6d6dSopenharmony_ci value = value[1:] 17172e5b6d6dSopenharmony_ci dest_path = folder_to_path[dest_folder] 17182e5b6d6dSopenharmony_ci if not os.path.exists(dest_path): os.makedirs(dest_path) 17192e5b6d6dSopenharmony_ci dest_basename = basename 17202e5b6d6dSopenharmony_ci # Source GraphemeBreakTest-cldr.txt --> destination GraphemeBreakTest.txt. 17212e5b6d6dSopenharmony_ci if basename.endswith("-cldr.txt"): 17222e5b6d6dSopenharmony_ci dest_basename = basename[:-9] + basename[-4:] 17232e5b6d6dSopenharmony_ci dest_file = os.path.join(dest_path, dest_basename) 17242e5b6d6dSopenharmony_ci parse_file = preprocessor(source_file, dest_file) 17252e5b6d6dSopenharmony_ci if value: 17262e5b6d6dSopenharmony_ci order = 9 if len(value) < 2 else value[1] 17272e5b6d6dSopenharmony_ci _files_to_parse[order].append((basename, parse_file, value[0])) 17282e5b6d6dSopenharmony_ci 17292e5b6d6dSopenharmony_ci# Character names ---------------------------------------------------------- *** 17302e5b6d6dSopenharmony_ci 17312e5b6d6dSopenharmony_ci# TODO: Turn this script into a module that 17322e5b6d6dSopenharmony_ci# a) gives access to the parsed data 17332e5b6d6dSopenharmony_ci# b) has a PreparseUCD(ucd_root, icu4c_src_root) function 17342e5b6d6dSopenharmony_ci# c) has a ParsePreparsedUCD(filename) function 17352e5b6d6dSopenharmony_ci# d) has a WritePreparsedUCD(filename) function 17362e5b6d6dSopenharmony_ci# and then use it from a new script for names. 17372e5b6d6dSopenharmony_ci# Some more API: 17382e5b6d6dSopenharmony_ci# - generator GetRangesAndProps() -> (start, end, props)* 17392e5b6d6dSopenharmony_ci 17402e5b6d6dSopenharmony_cidef IncCounter(counters, key, inc=1): 17412e5b6d6dSopenharmony_ci if key in counters: 17422e5b6d6dSopenharmony_ci counters[key] += inc 17432e5b6d6dSopenharmony_ci else: 17442e5b6d6dSopenharmony_ci counters[key] = inc 17452e5b6d6dSopenharmony_ci 17462e5b6d6dSopenharmony_ci 17472e5b6d6dSopenharmony_ciendings = ( 17482e5b6d6dSopenharmony_ci # List PHASE- before LETTER for BAMUM LETTER PHASE-xyz. 17492e5b6d6dSopenharmony_ci "PHASE-", 17502e5b6d6dSopenharmony_ci "LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ", 17512e5b6d6dSopenharmony_ci "CHOSEONG ", "JUNGSEONG ", "JONGSEONG ", 17522e5b6d6dSopenharmony_ci "SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ", 17532e5b6d6dSopenharmony_ci "ACROPHONIC ", "HIEROGLYPH ", 17542e5b6d6dSopenharmony_ci "DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ", 17552e5b6d6dSopenharmony_ci "PUNCTUATION ", "SIGN ", "SYMBOL ", 17562e5b6d6dSopenharmony_ci "TILE ", "CARD ", "FACE ", 17572e5b6d6dSopenharmony_ci "ACCENT ", "POINT ", 17582e5b6d6dSopenharmony_ci # List SIGN before VOWEL to catch "vowel sign". 17592e5b6d6dSopenharmony_ci "VOWEL ", "TONE ", "RADICAL ", 17602e5b6d6dSopenharmony_ci # For names of math symbols, 17612e5b6d6dSopenharmony_ci # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A 17622e5b6d6dSopenharmony_ci "SCRIPT ", "FRAKTUR ", "MONOSPACE ", 17632e5b6d6dSopenharmony_ci "ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ", 17642e5b6d6dSopenharmony_ci "INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ", 17652e5b6d6dSopenharmony_ci # BRAILLE PATTERN DOTS-xyz 17662e5b6d6dSopenharmony_ci "DOTS-", 17672e5b6d6dSopenharmony_ci "SELECTOR ", "SELECTOR-" 17682e5b6d6dSopenharmony_ci) 17692e5b6d6dSopenharmony_ci 17702e5b6d6dSopenharmony_cidef SplitName(name, tokens): 17712e5b6d6dSopenharmony_ci start = 0 17722e5b6d6dSopenharmony_ci for e in endings: 17732e5b6d6dSopenharmony_ci i = name.find(e) 17742e5b6d6dSopenharmony_ci if i >= 0: 17752e5b6d6dSopenharmony_ci start = i + len(e) 17762e5b6d6dSopenharmony_ci token = name[:start] 17772e5b6d6dSopenharmony_ci IncCounter(tokens, token) 17782e5b6d6dSopenharmony_ci break 17792e5b6d6dSopenharmony_ci for i in range(start, len(name)): 17802e5b6d6dSopenharmony_ci c = name[i] 17812e5b6d6dSopenharmony_ci if c == ' ' or c == '-': 17822e5b6d6dSopenharmony_ci token = name[start:i + 1] 17832e5b6d6dSopenharmony_ci IncCounter(tokens, token) 17842e5b6d6dSopenharmony_ci start = i + 1 17852e5b6d6dSopenharmony_ci IncCounter(tokens, name[start:]) 17862e5b6d6dSopenharmony_ci 17872e5b6d6dSopenharmony_ci 17882e5b6d6dSopenharmony_cidef PrintNameStats(): 17892e5b6d6dSopenharmony_ci # TODO: This name analysis code is out of date. 17902e5b6d6dSopenharmony_ci # It needs to consider the multi-type Name_Alias values. 17912e5b6d6dSopenharmony_ci name_pnames = ("na", "na1", "Name_Alias") 17922e5b6d6dSopenharmony_ci counts = {} 17932e5b6d6dSopenharmony_ci for pname in name_pnames: 17942e5b6d6dSopenharmony_ci counts[pname] = 0 17952e5b6d6dSopenharmony_ci total_lengths = counts.copy() 17962e5b6d6dSopenharmony_ci max_length = 0 17972e5b6d6dSopenharmony_ci max_per_cp = 0 17982e5b6d6dSopenharmony_ci name_chars = set() 17992e5b6d6dSopenharmony_ci num_digits = 0 18002e5b6d6dSopenharmony_ci token_counters = {} 18012e5b6d6dSopenharmony_ci char_counters = {} 18022e5b6d6dSopenharmony_ci for i in range(len(_starts) - 1): 18032e5b6d6dSopenharmony_ci start = _starts[i] 18042e5b6d6dSopenharmony_ci # end = _starts[i + 1] - 1 18052e5b6d6dSopenharmony_ci props = _props[i] 18062e5b6d6dSopenharmony_ci per_cp = 0 18072e5b6d6dSopenharmony_ci for pname in name_pnames: 18082e5b6d6dSopenharmony_ci if pname in props: 18092e5b6d6dSopenharmony_ci counts[pname] += 1 18102e5b6d6dSopenharmony_ci name = props[pname] 18112e5b6d6dSopenharmony_ci total_lengths[pname] += len(name) 18122e5b6d6dSopenharmony_ci name_chars |= set(name) 18132e5b6d6dSopenharmony_ci if len(name) > max_length: max_length = len(name) 18142e5b6d6dSopenharmony_ci per_cp += len(name) + 1 18152e5b6d6dSopenharmony_ci if per_cp > max_per_cp: max_per_cp = per_cp 18162e5b6d6dSopenharmony_ci tokens = SplitName(name, token_counters) 18172e5b6d6dSopenharmony_ci for c in name: 18182e5b6d6dSopenharmony_ci if c in "0123456789": num_digits += 1 18192e5b6d6dSopenharmony_ci IncCounter(char_counters, c) 18202e5b6d6dSopenharmony_ci print 18212e5b6d6dSopenharmony_ci for pname in name_pnames: 18222e5b6d6dSopenharmony_ci print("'%s' character names: %d / %d bytes" % 18232e5b6d6dSopenharmony_ci (pname, counts[pname], total_lengths[pname])) 18242e5b6d6dSopenharmony_ci print("%d total bytes in character names" % sum(total_lengths.itervalues())) 18252e5b6d6dSopenharmony_ci print("%d name-characters: %s" % 18262e5b6d6dSopenharmony_ci (len(name_chars), "".join(sorted(name_chars)))) 18272e5b6d6dSopenharmony_ci print("%d digits 0-9" % num_digits) 18282e5b6d6dSopenharmony_ci count_chars = [(count, c) for (c, count) in char_counters.items()] 18292e5b6d6dSopenharmony_ci count_chars.sort(reverse=True) 18302e5b6d6dSopenharmony_ci for cc in count_chars: 18312e5b6d6dSopenharmony_ci print("name-chars: %6d * '%s'" % cc) 18322e5b6d6dSopenharmony_ci print("max. name length: %d" % max_length) 18332e5b6d6dSopenharmony_ci print("max. length of all (names+NUL) per cp: %d" % max_per_cp) 18342e5b6d6dSopenharmony_ci 18352e5b6d6dSopenharmony_ci token_lengths = sum([len(t) + 1 for t in token_counters]) 18362e5b6d6dSopenharmony_ci print("%d total tokens, %d bytes with NUL" % 18372e5b6d6dSopenharmony_ci (len(token_counters), token_lengths)) 18382e5b6d6dSopenharmony_ci 18392e5b6d6dSopenharmony_ci counts_tokens = [] 18402e5b6d6dSopenharmony_ci for (token, count) in token_counters.items(): 18412e5b6d6dSopenharmony_ci # If we encode a token with a 1-byte code, then we save len(t)-1 bytes each time 18422e5b6d6dSopenharmony_ci # but have to store the token string itself with a length or terminator byte, 18432e5b6d6dSopenharmony_ci # plus a 2-byte entry in an token index table. 18442e5b6d6dSopenharmony_ci savings = count * (len(token) - 1) - (len(token) + 1 + 2) 18452e5b6d6dSopenharmony_ci if savings > 0: 18462e5b6d6dSopenharmony_ci counts_tokens.append((savings, count, token)) 18472e5b6d6dSopenharmony_ci counts_tokens.sort(reverse=True) 18482e5b6d6dSopenharmony_ci print("%d tokens might save space with 1-byte codes" % len(counts_tokens)) 18492e5b6d6dSopenharmony_ci 18502e5b6d6dSopenharmony_ci # Codes=bytes, 40 byte values for name_chars. 18512e5b6d6dSopenharmony_ci # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens. 18522e5b6d6dSopenharmony_ci # Make each 2-byte token the token string index itself, rather than 18532e5b6d6dSopenharmony_ci # and index into a string index table. 18542e5b6d6dSopenharmony_ci # More lead bytes but also more savings. 18552e5b6d6dSopenharmony_ci num_units = 256 18562e5b6d6dSopenharmony_ci max_lead = (token_lengths + 255) / 256 18572e5b6d6dSopenharmony_ci max_token_units = num_units - len(name_chars) 18582e5b6d6dSopenharmony_ci results = [] 18592e5b6d6dSopenharmony_ci for num_lead in range(min(max_lead, max_token_units) + 1): 18602e5b6d6dSopenharmony_ci max1 = max_token_units - num_lead 18612e5b6d6dSopenharmony_ci ct = counts_tokens[:max1] 18622e5b6d6dSopenharmony_ci tokens1 = set([t for (s, c, t) in ct]) 18632e5b6d6dSopenharmony_ci for (token, count) in token_counters.items(): 18642e5b6d6dSopenharmony_ci if token in tokens1: continue 18652e5b6d6dSopenharmony_ci # If we encode a token with a 2-byte code, then we save len(t)-2 bytes each time 18662e5b6d6dSopenharmony_ci # but have to store the token string itself with a length or terminator byte. 18672e5b6d6dSopenharmony_ci savings = count * (len(token) - 2) - (len(token) + 1) 18682e5b6d6dSopenharmony_ci if savings > 0: 18692e5b6d6dSopenharmony_ci ct.append((savings, count, token)) 18702e5b6d6dSopenharmony_ci ct.sort(reverse=True) 18712e5b6d6dSopenharmony_ci # A 2-byte-code-token index cannot be limit_t_lengths or higher. 18722e5b6d6dSopenharmony_ci limit_t_lengths = num_lead * 256 18732e5b6d6dSopenharmony_ci token2_index = 0 18742e5b6d6dSopenharmony_ci for i in range(max1, len(ct)): 18752e5b6d6dSopenharmony_ci if token2_index >= limit_t_lengths: 18762e5b6d6dSopenharmony_ci del ct[i:] 18772e5b6d6dSopenharmony_ci break 18782e5b6d6dSopenharmony_ci token2_index += len(ct[i][2]) + 1 18792e5b6d6dSopenharmony_ci cumul_savings = sum([s for (s, c, t) in ct]) 18802e5b6d6dSopenharmony_ci # print ("%2d 1-byte codes: %4d tokens might save %6d bytes" % 18812e5b6d6dSopenharmony_ci # (max1, len(ct), cumul_savings)) 18822e5b6d6dSopenharmony_ci results.append((cumul_savings, max1, ct)) 18832e5b6d6dSopenharmony_ci best = max(results) # (cumul_savings, max1, ct) 18842e5b6d6dSopenharmony_ci 18852e5b6d6dSopenharmony_ci max1 = best[1] 18862e5b6d6dSopenharmony_ci print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" % 18872e5b6d6dSopenharmony_ci (best[0], max1, max_token_units - max1)) 18882e5b6d6dSopenharmony_ci counts_tokens = best[2] 18892e5b6d6dSopenharmony_ci cumul_savings = 0 18902e5b6d6dSopenharmony_ci for i in range(len(counts_tokens)): 18912e5b6d6dSopenharmony_ci n = 1 if i < max1 else 2 18922e5b6d6dSopenharmony_ci i1 = i + 1 18932e5b6d6dSopenharmony_ci t = counts_tokens[i] 18942e5b6d6dSopenharmony_ci cumul_savings += t[0] 18952e5b6d6dSopenharmony_ci if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens): 18962e5b6d6dSopenharmony_ci print(("%04d. cumul. %6d bytes save %6d bytes from " + 18972e5b6d6dSopenharmony_ci "%5d * %d-byte token for %2d='%s'") % 18982e5b6d6dSopenharmony_ci (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2])) 18992e5b6d6dSopenharmony_ci 19002e5b6d6dSopenharmony_ci# ICU API ------------------------------------------------------------------ *** 19012e5b6d6dSopenharmony_ci 19022e5b6d6dSopenharmony_ci# Sample line to match: 19032e5b6d6dSopenharmony_ci# UCHAR_UNIFIED_IDEOGRAPH=29, 19042e5b6d6dSopenharmony_ci_uchar_re = re.compile( 19052e5b6d6dSopenharmony_ci " *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),") 19062e5b6d6dSopenharmony_ci 19072e5b6d6dSopenharmony_ci# Sample line to match: 19082e5b6d6dSopenharmony_ci# /** Zs @stable ICU 2.0 */ 19092e5b6d6dSopenharmony_ci_gc_comment_re = re.compile(" */\*\* *([A-Z][a-z]) ") 19102e5b6d6dSopenharmony_ci 19112e5b6d6dSopenharmony_ci# Sample line to match: 19122e5b6d6dSopenharmony_ci# U_SPACE_SEPARATOR = 12, 19132e5b6d6dSopenharmony_ci_gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,") 19142e5b6d6dSopenharmony_ci 19152e5b6d6dSopenharmony_ci# Sample line to match: 19162e5b6d6dSopenharmony_ci# /** L @stable ICU 2.0 */ 19172e5b6d6dSopenharmony_ci_bc_comment_re = re.compile(" */\*\* *([A-Z]{1,3}) ") 19182e5b6d6dSopenharmony_ci 19192e5b6d6dSopenharmony_ci# Sample line to match: 19202e5b6d6dSopenharmony_ci# U_LEFT_TO_RIGHT = 0, 19212e5b6d6dSopenharmony_ci_bc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,") 19222e5b6d6dSopenharmony_ci 19232e5b6d6dSopenharmony_ci# Sample line to match: 19242e5b6d6dSopenharmony_ci# UBLOCK_CYRILLIC =9, 19252e5b6d6dSopenharmony_ci_ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,") 19262e5b6d6dSopenharmony_ci 19272e5b6d6dSopenharmony_ci# Sample line to match: 19282e5b6d6dSopenharmony_ci# U_EA_AMBIGUOUS, 19292e5b6d6dSopenharmony_ci_prop_and_value_re = re.compile( 19302e5b6d6dSopenharmony_ci " *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))") 19312e5b6d6dSopenharmony_ci 19322e5b6d6dSopenharmony_ci# Sample line to match if it has matched _prop_and_value_re 19332e5b6d6dSopenharmony_ci# (we want to exclude aliases): 19342e5b6d6dSopenharmony_ci# U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL, 19352e5b6d6dSopenharmony_ci_prop_and_alias_re = re.compile(" *U_[0-9A-Z_]+ *= *U") 19362e5b6d6dSopenharmony_ci 19372e5b6d6dSopenharmony_cidef ParseUCharHeader(icu4c_src_root): 19382e5b6d6dSopenharmony_ci uchar_path = os.path.join(icu4c_src_root, "source", 19392e5b6d6dSopenharmony_ci "common", "unicode", "uchar.h") 19402e5b6d6dSopenharmony_ci with open(uchar_path, "r") as uchar_file: 19412e5b6d6dSopenharmony_ci mode = "" # Mode string (=pname) during context-sensitive parsing. 19422e5b6d6dSopenharmony_ci comment_value = "" # Property value from a comment preceding an enum. 19432e5b6d6dSopenharmony_ci # Note: The enum UProperty is first in uchar.h, before the enums for values. 19442e5b6d6dSopenharmony_ci for line in uchar_file: 19452e5b6d6dSopenharmony_ci # Parse some enums via context-sensitive "modes". 19462e5b6d6dSopenharmony_ci # Necessary because the enum constant names do not contain 19472e5b6d6dSopenharmony_ci # enough information. 19482e5b6d6dSopenharmony_ci if "enum UCharCategory" in line: 19492e5b6d6dSopenharmony_ci mode = "gc" 19502e5b6d6dSopenharmony_ci comment_value = "" 19512e5b6d6dSopenharmony_ci continue 19522e5b6d6dSopenharmony_ci if mode == "gc": 19532e5b6d6dSopenharmony_ci if line.startswith("}"): 19542e5b6d6dSopenharmony_ci mode = "" 19552e5b6d6dSopenharmony_ci continue 19562e5b6d6dSopenharmony_ci match = _gc_comment_re.match(line) 19572e5b6d6dSopenharmony_ci if match: 19582e5b6d6dSopenharmony_ci comment_value = match.group(1) 19592e5b6d6dSopenharmony_ci continue 19602e5b6d6dSopenharmony_ci match = _gc_re.match(line) 19612e5b6d6dSopenharmony_ci if match and comment_value: 19622e5b6d6dSopenharmony_ci gc_enum = match.group(1) 19632e5b6d6dSopenharmony_ci prop = _properties["gc"] 19642e5b6d6dSopenharmony_ci vname = GetShortPropertyValueName(prop, comment_value) 19652e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["gc"][2] 19662e5b6d6dSopenharmony_ci icu_values.append((gc_enum, vname)) 19672e5b6d6dSopenharmony_ci comment_value = "" 19682e5b6d6dSopenharmony_ci continue 19692e5b6d6dSopenharmony_ci if "enum UCharDirection {" in line: 19702e5b6d6dSopenharmony_ci mode = "bc" 19712e5b6d6dSopenharmony_ci comment_value = "" 19722e5b6d6dSopenharmony_ci continue 19732e5b6d6dSopenharmony_ci if mode == "bc": 19742e5b6d6dSopenharmony_ci if line.startswith("}"): 19752e5b6d6dSopenharmony_ci mode = "" 19762e5b6d6dSopenharmony_ci continue 19772e5b6d6dSopenharmony_ci match = _bc_comment_re.match(line) 19782e5b6d6dSopenharmony_ci if match: 19792e5b6d6dSopenharmony_ci comment_value = match.group(1) 19802e5b6d6dSopenharmony_ci continue 19812e5b6d6dSopenharmony_ci match = _bc_re.match(line) 19822e5b6d6dSopenharmony_ci if match and comment_value: 19832e5b6d6dSopenharmony_ci bc_enum = match.group(1) 19842e5b6d6dSopenharmony_ci prop = _properties["bc"] 19852e5b6d6dSopenharmony_ci vname = GetShortPropertyValueName(prop, comment_value) 19862e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["bc"][2] 19872e5b6d6dSopenharmony_ci icu_values.append((bc_enum, vname)) 19882e5b6d6dSopenharmony_ci comment_value = "" 19892e5b6d6dSopenharmony_ci continue 19902e5b6d6dSopenharmony_ci # No mode, parse enum constants whose names contain 19912e5b6d6dSopenharmony_ci # enough information to parse without requiring context. 19922e5b6d6dSopenharmony_ci match = _uchar_re.match(line) 19932e5b6d6dSopenharmony_ci if match: 19942e5b6d6dSopenharmony_ci prop_enum = match.group(1) 19952e5b6d6dSopenharmony_ci if prop_enum.endswith("_LIMIT"): 19962e5b6d6dSopenharmony_ci # Ignore "UCHAR_BINARY_LIMIT=57," etc. 19972e5b6d6dSopenharmony_ci continue 19982e5b6d6dSopenharmony_ci pname = GetShortPropertyName(prop_enum[6:]) 19992e5b6d6dSopenharmony_ci icu_prop = (prop_enum, pname, []) 20002e5b6d6dSopenharmony_ci _icu_properties.append(icu_prop) 20012e5b6d6dSopenharmony_ci _pname_to_icu_prop[pname] = icu_prop 20022e5b6d6dSopenharmony_ci continue 20032e5b6d6dSopenharmony_ci match = _ublock_re.match(line) 20042e5b6d6dSopenharmony_ci if match: 20052e5b6d6dSopenharmony_ci prop_enum = match.group(1) 20062e5b6d6dSopenharmony_ci if prop_enum == "UBLOCK_COUNT": 20072e5b6d6dSopenharmony_ci continue 20082e5b6d6dSopenharmony_ci prop = _properties["blk"] 20092e5b6d6dSopenharmony_ci vname = GetShortPropertyValueName(prop, prop_enum[7:]) 20102e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["blk"][2] 20112e5b6d6dSopenharmony_ci icu_values.append((prop_enum, vname)) 20122e5b6d6dSopenharmony_ci continue 20132e5b6d6dSopenharmony_ci match = _prop_and_value_re.match(line) 20142e5b6d6dSopenharmony_ci if match: 20152e5b6d6dSopenharmony_ci (prop_enum, vname) = match.group(1, 3) 20162e5b6d6dSopenharmony_ci if vname == "COUNT" or _prop_and_alias_re.match(line): 20172e5b6d6dSopenharmony_ci continue 20182e5b6d6dSopenharmony_ci pname = GetShortPropertyName(match.group(2)) 20192e5b6d6dSopenharmony_ci prop = _properties[pname] 20202e5b6d6dSopenharmony_ci vname = GetShortPropertyValueName(prop, vname) 20212e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop[pname][2] 20222e5b6d6dSopenharmony_ci icu_values.append((prop_enum, vname)) 20232e5b6d6dSopenharmony_ci # ccc, lccc, tccc use their numeric values as "enum" values. 20242e5b6d6dSopenharmony_ci # In the UCD data, these numeric values are the first value names, 20252e5b6d6dSopenharmony_ci # followed by the short & long value names. 20262e5b6d6dSopenharmony_ci # List the ccc values in numeric order. 20272e5b6d6dSopenharmony_ci prop = _properties["ccc"] 20282e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["ccc"][2] 20292e5b6d6dSopenharmony_ci for ccc in sorted([int(name) for name in prop[2]]): 20302e5b6d6dSopenharmony_ci icu_values.append((ccc, str(ccc))) 20312e5b6d6dSopenharmony_ci _pname_to_icu_prop["lccc"][2].extend(icu_values) # Copy ccc -> lccc. 20322e5b6d6dSopenharmony_ci _pname_to_icu_prop["tccc"][2].extend(icu_values) # Copy ccc -> tccc. 20332e5b6d6dSopenharmony_ci 20342e5b6d6dSopenharmony_ci # No need to parse predictable General_Category_Mask enum constants. 20352e5b6d6dSopenharmony_ci # Just define them in ASCII order. 20362e5b6d6dSopenharmony_ci prop = _properties["gcm"] 20372e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["gcm"][2] 20382e5b6d6dSopenharmony_ci for vname in sorted(prop[2]): 20392e5b6d6dSopenharmony_ci icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname)) 20402e5b6d6dSopenharmony_ci # Hardcode known values for the normalization quick check properties, 20412e5b6d6dSopenharmony_ci # see unorm2.h for the UNormalizationCheckResult enum. 20422e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["NFC_QC"][2] 20432e5b6d6dSopenharmony_ci icu_values.append(("UNORM_NO", "N")) 20442e5b6d6dSopenharmony_ci icu_values.append(("UNORM_YES", "Y")) 20452e5b6d6dSopenharmony_ci icu_values.append(("UNORM_MAYBE", "M")) 20462e5b6d6dSopenharmony_ci _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values) # Copy NFC -> NFKC. 20472e5b6d6dSopenharmony_ci # No "maybe" values for NF[K]D. 20482e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["NFD_QC"][2] 20492e5b6d6dSopenharmony_ci icu_values.append(("UNORM_NO", "N")) 20502e5b6d6dSopenharmony_ci icu_values.append(("UNORM_YES", "Y")) 20512e5b6d6dSopenharmony_ci _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values) # Copy NFD -> NFKD. 20522e5b6d6dSopenharmony_ci 20532e5b6d6dSopenharmony_ci 20542e5b6d6dSopenharmony_ci# Sample line to match: 20552e5b6d6dSopenharmony_ci# USCRIPT_LOMA = 139,/* Loma */ 20562e5b6d6dSopenharmony_ci_uscript_re = re.compile( 20572e5b6d6dSopenharmony_ci " *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/") 20582e5b6d6dSopenharmony_ci 20592e5b6d6dSopenharmony_cidef ParseUScriptHeader(icu4c_src_root): 20602e5b6d6dSopenharmony_ci uscript_path = os.path.join(icu4c_src_root, "source", 20612e5b6d6dSopenharmony_ci "common", "unicode", "uscript.h") 20622e5b6d6dSopenharmony_ci icu_values = _pname_to_icu_prop["sc"][2] 20632e5b6d6dSopenharmony_ci with open(uscript_path, "r") as uscript_file: 20642e5b6d6dSopenharmony_ci for line in uscript_file: 20652e5b6d6dSopenharmony_ci match = _uscript_re.match(line) 20662e5b6d6dSopenharmony_ci if match: 20672e5b6d6dSopenharmony_ci (script_enum, script_code) = match.group(1, 2) 20682e5b6d6dSopenharmony_ci icu_values.append((script_enum, script_code)) 20692e5b6d6dSopenharmony_ci 20702e5b6d6dSopenharmony_ci 20712e5b6d6dSopenharmony_cidef CheckPNamesData(): 20722e5b6d6dSopenharmony_ci """Checks that every ICU property has a full set of value enum constants, 20732e5b6d6dSopenharmony_ci and that the _icu_properties value names map back to the UCD.""" 20742e5b6d6dSopenharmony_ci missing_enums = [] 20752e5b6d6dSopenharmony_ci for (p_enum, pname, values) in _icu_properties: 20762e5b6d6dSopenharmony_ci prop = _properties[pname] 20772e5b6d6dSopenharmony_ci vnames = set(prop[2]) # Modifiable copy of the set of short value names. 20782e5b6d6dSopenharmony_ci for (v_enum, vname) in values: 20792e5b6d6dSopenharmony_ci if vname not in vnames: 20802e5b6d6dSopenharmony_ci raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" % 20812e5b6d6dSopenharmony_ci (pname, vname, v_enum)) 20822e5b6d6dSopenharmony_ci vnames.remove(vname) 20832e5b6d6dSopenharmony_ci # Exceptions to the all-values check: 20842e5b6d6dSopenharmony_ci # - ICU does not have specific enum values for binary No/Yes. 20852e5b6d6dSopenharmony_ci # - ICU represents Age values via UVersionInfo rather than enum constants. 20862e5b6d6dSopenharmony_ci # - gc: ICU enum UCharCategory only has the single-category values. 20872e5b6d6dSopenharmony_ci # (ICU's gcm property has all of the UCD gc property values.) 20882e5b6d6dSopenharmony_ci if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")): 20892e5b6d6dSopenharmony_ci missing_enums.append((pname, vnames)) 20902e5b6d6dSopenharmony_ci if missing_enums: 20912e5b6d6dSopenharmony_ci raise ValueError( 20922e5b6d6dSopenharmony_ci "missing uchar.h enum constants for some property values: %s" % 20932e5b6d6dSopenharmony_ci missing_enums) 20942e5b6d6dSopenharmony_ci 20952e5b6d6dSopenharmony_ci 20962e5b6d6dSopenharmony_cidef WritePNamesDataHeader(out_path): 20972e5b6d6dSopenharmony_ci with open(out_path, "w") as out_file: 20982e5b6d6dSopenharmony_ci out_file.write("""// © 2016 and later: Unicode, Inc. and others. 20992e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 21002e5b6d6dSopenharmony_ci/** 21012e5b6d6dSopenharmony_ci * Copyright (C) 2002-2016, International Business Machines Corporation and 21022e5b6d6dSopenharmony_ci * others. All Rights Reserved. 21032e5b6d6dSopenharmony_ci * 21042e5b6d6dSopenharmony_ci * machine-generated by: icu/tools/unicode/py/preparseucd.py 21052e5b6d6dSopenharmony_ci */ 21062e5b6d6dSopenharmony_ci 21072e5b6d6dSopenharmony_ci""") 21082e5b6d6dSopenharmony_ci 21092e5b6d6dSopenharmony_ci # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties 21102e5b6d6dSopenharmony_ci # and values in the order of their definition, 21112e5b6d6dSopenharmony_ci # and this function writes them in that order. 21122e5b6d6dSopenharmony_ci # Since the ICU API constants are stable and new values are only 21132e5b6d6dSopenharmony_ci # appended at the end 21142e5b6d6dSopenharmony_ci # (new properties are added at the end of each binary/enum/... range), 21152e5b6d6dSopenharmony_ci # the output is stable as well. 21162e5b6d6dSopenharmony_ci # When a property or value constant is renamed, 21172e5b6d6dSopenharmony_ci # it only changes the name itself in the output; 21182e5b6d6dSopenharmony_ci # it does not move in the output since there is no sorting. 21192e5b6d6dSopenharmony_ci # This minimizes diffs and assists with reviewing and evaluating updates. 21202e5b6d6dSopenharmony_ci 21212e5b6d6dSopenharmony_ci version = _ucd_version.split('.') 21222e5b6d6dSopenharmony_ci while len(version) < 4: version.append("0") 21232e5b6d6dSopenharmony_ci out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version)) 21242e5b6d6dSopenharmony_ci 21252e5b6d6dSopenharmony_ci # Count the maximum number of aliases for any property or value. 21262e5b6d6dSopenharmony_ci # We write the final value at the end. 21272e5b6d6dSopenharmony_ci max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"])) 21282e5b6d6dSopenharmony_ci 21292e5b6d6dSopenharmony_ci # Write an array of "binprop" Value object initializers 21302e5b6d6dSopenharmony_ci # with the value aliases shared among all binary properties. 21312e5b6d6dSopenharmony_ci out_file.write("static const Value VALUES_binprop[2] = {\n") 21322e5b6d6dSopenharmony_ci out_file.write(' Value(0, "%s"),\n' % " ".join(_binary_values["N"])) 21332e5b6d6dSopenharmony_ci out_file.write(' Value(1, "%s"),\n' % " ".join(_binary_values["Y"])) 21342e5b6d6dSopenharmony_ci out_file.write("};\n\n") 21352e5b6d6dSopenharmony_ci 21362e5b6d6dSopenharmony_ci # For each property with named values, write an array of 21372e5b6d6dSopenharmony_ci # Value object initializers with the value enum and the aliases. 21382e5b6d6dSopenharmony_ci for (p_enum, pname, values) in _icu_properties: 21392e5b6d6dSopenharmony_ci prop = _properties[pname] 21402e5b6d6dSopenharmony_ci aliases = prop[1] 21412e5b6d6dSopenharmony_ci if len(aliases) > max_aliases: max_aliases = len(aliases) 21422e5b6d6dSopenharmony_ci if not values: continue 21432e5b6d6dSopenharmony_ci out_file.write("static const Value VALUES_%s[%d] = {\n" % 21442e5b6d6dSopenharmony_ci (pname, len(values))) 21452e5b6d6dSopenharmony_ci for (v_enum, vname) in values: 21462e5b6d6dSopenharmony_ci aliases = _properties[pname][3][vname] 21472e5b6d6dSopenharmony_ci # ccc, lccc, tccc: Omit the numeric strings from the aliases. 21482e5b6d6dSopenharmony_ci # (See the comment about ccc in the PropertyValueAliases.txt header.) 21492e5b6d6dSopenharmony_ci if pname.endswith("ccc"): aliases = aliases[1:] 21502e5b6d6dSopenharmony_ci if len(aliases) > max_aliases: max_aliases = len(aliases) 21512e5b6d6dSopenharmony_ci cast = "(int32_t)" if pname == "gcm" else "" 21522e5b6d6dSopenharmony_ci out_file.write(' Value(%s%s, "%s"),\n' % 21532e5b6d6dSopenharmony_ci (cast, v_enum, " ".join(aliases))) 21542e5b6d6dSopenharmony_ci out_file.write("};\n\n") 21552e5b6d6dSopenharmony_ci 21562e5b6d6dSopenharmony_ci # For each property, write a Property object initializer 21572e5b6d6dSopenharmony_ci # with the property enum, its aliases, and a reference to its values. 21582e5b6d6dSopenharmony_ci out_file.write("static const Property PROPERTIES[%d] = {\n" % 21592e5b6d6dSopenharmony_ci len(_icu_properties)) 21602e5b6d6dSopenharmony_ci for (enum, pname, values) in _icu_properties: 21612e5b6d6dSopenharmony_ci prop = _properties[pname] 21622e5b6d6dSopenharmony_ci aliases = " ".join(prop[1]) 21632e5b6d6dSopenharmony_ci if prop[0] == "Binary": 21642e5b6d6dSopenharmony_ci out_file.write(' Property(%s, "%s"),\n' % (enum, aliases)) 21652e5b6d6dSopenharmony_ci elif values: # Property with named values. 21662e5b6d6dSopenharmony_ci out_file.write(' Property(%s, "%s", VALUES_%s, %d),\n' % 21672e5b6d6dSopenharmony_ci (enum, aliases, pname, len(values))) 21682e5b6d6dSopenharmony_ci else: 21692e5b6d6dSopenharmony_ci out_file.write(' Property(%s, "%s"),\n' % (enum, aliases)) 21702e5b6d6dSopenharmony_ci out_file.write("};\n\n") 21712e5b6d6dSopenharmony_ci 21722e5b6d6dSopenharmony_ci out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases) 21732e5b6d6dSopenharmony_ci 21742e5b6d6dSopenharmony_ci# main() ------------------------------------------------------------------- *** 21752e5b6d6dSopenharmony_ci 21762e5b6d6dSopenharmony_cidef main(): 21772e5b6d6dSopenharmony_ci global _null_or_defaults 21782e5b6d6dSopenharmony_ci only_ppucd = False 21792e5b6d6dSopenharmony_ci if len(sys.argv) == 3: 21802e5b6d6dSopenharmony_ci (ucd_root, icu_src_root) = sys.argv[1:3] 21812e5b6d6dSopenharmony_ci ppucd_path = None 21822e5b6d6dSopenharmony_ci elif len(sys.argv) == 4 and sys.argv[2] == "--only_ppucd": 21832e5b6d6dSopenharmony_ci # For debugging: 21842e5b6d6dSopenharmony_ci # preparseucd.py path/to/UCD/root --only_ppucd path/to/ppucd/outputfile 21852e5b6d6dSopenharmony_ci ucd_root = sys.argv[1] 21862e5b6d6dSopenharmony_ci ppucd_path = sys.argv[3] 21872e5b6d6dSopenharmony_ci only_ppucd = True 21882e5b6d6dSopenharmony_ci icu_src_root = "/tmp/ppucd" 21892e5b6d6dSopenharmony_ci else: 21902e5b6d6dSopenharmony_ci print("Usage: %s path/to/UCD/root path/to/ICU/src/root" % sys.argv[0]) 21912e5b6d6dSopenharmony_ci return 21922e5b6d6dSopenharmony_ci icu4c_src_root = os.path.join(icu_src_root, "icu4c") 21932e5b6d6dSopenharmony_ci icu_tools_root = os.path.join(icu_src_root, "tools") 21942e5b6d6dSopenharmony_ci source_files = [] 21952e5b6d6dSopenharmony_ci for root, dirs, files in os.walk(ucd_root): 21962e5b6d6dSopenharmony_ci for file in files: 21972e5b6d6dSopenharmony_ci source_files.append(os.path.join(root, file)) 21982e5b6d6dSopenharmony_ci PreprocessFiles(source_files, icu4c_src_root) 21992e5b6d6dSopenharmony_ci # Parse the processed files in a particular order. 22002e5b6d6dSopenharmony_ci for files in _files_to_parse: 22012e5b6d6dSopenharmony_ci for (basename, path, parser) in files: 22022e5b6d6dSopenharmony_ci print("Parsing %s" % basename) 22032e5b6d6dSopenharmony_ci value = _files[basename] 22042e5b6d6dSopenharmony_ci # Unicode data files are in UTF-8. 22052e5b6d6dSopenharmony_ci charset = "UTF-8" 22062e5b6d6dSopenharmony_ci if basename == "NamesList.txt": 22072e5b6d6dSopenharmony_ci # The NamesList used to be in Latin-1 before Unicode 6.2. 22082e5b6d6dSopenharmony_ci numeric_ucd_version = [int(field) for field in _ucd_version.split('.')] 22092e5b6d6dSopenharmony_ci if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1" 22102e5b6d6dSopenharmony_ci in_file = codecs.open(path, "r", charset) 22112e5b6d6dSopenharmony_ci with in_file: 22122e5b6d6dSopenharmony_ci parser(in_file) 22132e5b6d6dSopenharmony_ci _null_or_defaults = _null_values.copy() 22142e5b6d6dSopenharmony_ci _null_or_defaults.update(_defaults) 22152e5b6d6dSopenharmony_ci # Every Catalog and Enumerated property must have a default value, 22162e5b6d6dSopenharmony_ci # from a @missing line. "nv" = "null value". 22172e5b6d6dSopenharmony_ci pnv = [pname for (pname, nv) in _null_or_defaults.items() if nv == "??"] 22182e5b6d6dSopenharmony_ci if pnv: 22192e5b6d6dSopenharmony_ci raise Exception("no default values (@missing lines) for " + 22202e5b6d6dSopenharmony_ci "some Catalog or Enumerated properties: %s " % pnv) 22212e5b6d6dSopenharmony_ci unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata") 22222e5b6d6dSopenharmony_ci if not only_ppucd: 22232e5b6d6dSopenharmony_ci # Write Normalizer2 input text files. 22242e5b6d6dSopenharmony_ci # Do this before compacting the data so that we need not handle fallbacks. 22252e5b6d6dSopenharmony_ci norm2_path = os.path.join(unidata_path, "norm2") 22262e5b6d6dSopenharmony_ci if not os.path.exists(norm2_path): os.makedirs(norm2_path) 22272e5b6d6dSopenharmony_ci WriteNorm2(norm2_path) 22282e5b6d6dSopenharmony_ci # Optimize block vs. cp properties. 22292e5b6d6dSopenharmony_ci CompactBlocks() 22302e5b6d6dSopenharmony_ci # Write the ppucd.txt output file. 22312e5b6d6dSopenharmony_ci # Use US-ASCII so that ICU tests can parse it in the platform charset, 22322e5b6d6dSopenharmony_ci # which may be EBCDIC. 22332e5b6d6dSopenharmony_ci # Fix up non-ASCII data (NamesList.txt headings) to fit. 22342e5b6d6dSopenharmony_ci if not ppucd_path: 22352e5b6d6dSopenharmony_ci ppucd_path = os.path.join(unidata_path, "ppucd.txt") 22362e5b6d6dSopenharmony_ci with codecs.open(ppucd_path, "w", "US-ASCII") as out_file: 22372e5b6d6dSopenharmony_ci WritePreparsedUCD(out_file) 22382e5b6d6dSopenharmony_ci out_file.flush() 22392e5b6d6dSopenharmony_ci 22402e5b6d6dSopenharmony_ci # TODO: PrintNameStats() 22412e5b6d6dSopenharmony_ci 22422e5b6d6dSopenharmony_ci if only_ppucd: return 22432e5b6d6dSopenharmony_ci 22442e5b6d6dSopenharmony_ci # ICU data for property & value names API 22452e5b6d6dSopenharmony_ci ParseUCharHeader(icu4c_src_root) 22462e5b6d6dSopenharmony_ci ParseUScriptHeader(icu4c_src_root) 22472e5b6d6dSopenharmony_ci CheckPNamesData() 22482e5b6d6dSopenharmony_ci genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops") 22492e5b6d6dSopenharmony_ci if not os.path.exists(genprops_path): os.makedirs(genprops_path) 22502e5b6d6dSopenharmony_ci out_path = os.path.join(genprops_path, "pnames_data.h") 22512e5b6d6dSopenharmony_ci WritePNamesDataHeader(out_path) 22522e5b6d6dSopenharmony_ci 22532e5b6d6dSopenharmony_ci 22542e5b6d6dSopenharmony_ciif __name__ == "__main__": 22552e5b6d6dSopenharmony_ci main() 2256