#!/usr/bin/python3 -B
# -*- coding: utf-8 -*-
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2009-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  preparseucd.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2011nov03 (forked from ucdcopy.py)
#   created by: Markus W. Scherer
#
# Copies Unicode Character Database (UCD) files from a tree
# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
# to ICU's source/data/unidata/ and source/test/testdata/
# and modifies some of the files to make them more compact.
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
#
# Invoke with two command-line parameters:
# 1. source folder with UCD & idna files
# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools)
#
# Sample invocation:
#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src
282e5b6d6dSopenharmony_ci
import array
import bisect
import codecs
import os
import os.path
import re
import shutil
import sys
372e5b6d6dSopenharmony_ci
# Unicode version ---------------------------------------------------------- ***

# Placeholder until ParsePropertyAliases() extracts the real version number
# from the header comment of PropertyAliases.txt.
_ucd_version = "?"

# ISO 15924 script codes --------------------------------------------------- ***

# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
# that are not yet in the UCD.
_scripts_only_in_iso15924 = (
    "Afak", "Blis", "Cirt", "Cyrs",
    "Egyd", "Egyh", "Geok",
    "Hanb", "Hans", "Hant",
    "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
    "Maya", "Moon", "Nkgb", "Phlv", "Roro",
    "Sara", "Syre", "Syrj", "Syrn",
    "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx"
)
552e5b6d6dSopenharmony_ci
# Properties --------------------------------------------------------------- ***

# Properties that we do not want to store in ppucd.txt.
# Not a frozenset so that we can add aliases for simpler subsequent testing.
_ignored_properties = {
  # Other_Xyz only contribute to Xyz, store only the latter.
  "OAlpha",
  "ODI",
  "OGr_Ext",
  "OIDC",
  "OIDS",
  "OLower",
  "OMath",
  "OUpper",
  # Further properties that just contribute to others.
  "CE",  # Composition_Exclusion just contributes to Full_Composition_Exclusion.
  "JSN",
  # These properties just don't seem useful.
  # They are deprecated since Unicode 6.0.
  "XO_NFC",
  "XO_NFD",
  "XO_NFKC",
  "XO_NFKD",
  # ICU does not use Unihan properties.
  "cjkAccountingNumeric",
  "cjkOtherNumeric",
  "cjkPrimaryNumeric",
  "cjkCompatibilityVariant",
  "cjkIICore",
  "cjkIRG_GSource",
  "cjkIRG_HSource",
  "cjkIRG_JSource",
  "cjkIRG_KPSource",
  "cjkIRG_KSource",
  "cjkIRG_MSource",
  "cjkIRG_SSource",
  "cjkIRG_TSource",
  "cjkIRG_UKSource",
  "cjkIRG_USource",
  "cjkIRG_VSource",
  "cjkRSUnicode",
}
982e5b6d6dSopenharmony_ci
# These properties (short names) map code points to
# strings or other unusual values (property types String or Miscellaneous)
# that cannot be block-compressed (or would be confusing).
_uncompressible_props = frozenset((
  "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
  "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
  # scx is block-compressible.
  "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
))
1082e5b6d6dSopenharmony_ci
# Dictionary of properties.
# Keyed by normalized property names and aliases.
# Each value is a tuple with
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
#    The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 3: Dictionary of property values.
#    For Catalog & Enumerated properties,
#    maps each value name to a list of aliases.
#    Empty for other types of properties.
_properties = {}

# Dictionary of binary-property values which we store as False/True.
# Same as the values dictionary of one of the binary properties.
_binary_values = {}

# Dictionary of null values.
# Keyed by short property names.
# These are type-specific values for properties that occur in the data.
# They are overridden by _defaults, block and code point properties.
_null_values = {}

# Property value names for null values.
# We do not store these in _defaults.
_null_names = frozenset(("<none>", "NaN"))

# Dictionary of explicit default property values.
# Keyed by short property names.
_defaults = {"gc": "Cn"}

# _null_values overridden by explicit _defaults.
# Initialized after parsing is done.
_null_or_defaults = {}

# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
#   - enum: the ICU property value's enum constant string
#   - vname: the UCD short property value name
_icu_properties = []

# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}
1552e5b6d6dSopenharmony_ci
# Matches every character that is not an ASCII letter or digit.
_non_alnum_re = re.compile(r"[^a-zA-Z0-9]")

def NormPropName(pname):
  """Returns a normalized form of pname.

  Removes all characters other than ASCII letters and digits
  and lowercases the remaining letters."""
  return _non_alnum_re.sub("", pname).lower()
1622e5b6d6dSopenharmony_ci
1632e5b6d6dSopenharmony_ci
def GetProperty(pname):
  """Returns the _properties value for the pname.

  Returns None if the property is ignored.
  Caches alternate spellings of the property name.
  Raises NameError if the name is neither a known nor an ignored property."""
  # Try the input name.
  prop = _properties.get(pname)
  if prop is not None: return prop
  if pname in _ignored_properties: return None
  # Try the normalized input name.
  norm_name = NormPropName(pname)
  prop = _properties.get(norm_name)
  if prop is not None:
    _properties[pname] = prop  # Cache prop under this new name spelling.
    return prop
  # The exact spelling was already checked above, so only the normalized
  # spelling can still identify an ignored property here.
  if norm_name in _ignored_properties:
    _ignored_properties.add(pname)  # Remember to ignore this new name spelling.
    return None
  raise NameError("unknown property %s\n" % pname)
1832e5b6d6dSopenharmony_ci
1842e5b6d6dSopenharmony_ci
def GetShortPropertyName(pname):
  """Returns the short name of the property, or its long name if it has none.

  Returns "" for ignored properties.
  Propagates the NameError from GetProperty() for unknown names."""
  if pname in _null_values: return pname  # pname is already the short name.
  prop = GetProperty(pname)
  if not prop: return ""  # For ignored properties.
  return prop[1][0] or prop[1][1]  # Long name if no short name.
1902e5b6d6dSopenharmony_ci
1912e5b6d6dSopenharmony_ci
def GetShortPropertyValueName(prop, vname):
  """Returns the short name for the value vname of the property prop.

  prop is a _properties tuple: prop[2] is its set of short value names
  and prop[3] maps value names to alias lists.
  Returns the long value name if the value has no short name.
  Caches alternate spellings of vname in prop[3].
  Raises NameError if vname is not a value of prop."""
  if vname in prop[2]: return vname
  values = prop[3]
  aliases = values.get(vname)
  if aliases is None:
    norm_name = NormPropName(vname)
    aliases = values.get(norm_name)
    if aliases is None:
      raise NameError("unknown value name %s for property %s\n" %
                      (vname, prop[1][0]))
    values[vname] = aliases  # Cache aliases under this new name spelling.
  return aliases[0] or aliases[1]  # Long name if no short name.
2042e5b6d6dSopenharmony_ci
2052e5b6d6dSopenharmony_ci
def NormalizePropertyValue(prop, vname):
  """Returns the value of the property prop in the form in which we store it.

  If the property has a set of value names (prop[2] is not empty):
  - vname is replaced with its short value name,
  - for a "Binary" property the result is the boolean short name == "Y",
  - if the property's short name ends with "ccc" the result is an int.
  Otherwise vname is returned unchanged."""
  if prop[2]:  # Binary/Catalog/Enumerated property.
    value = GetShortPropertyValueName(prop, vname)
    if prop[0] == "Binary":
      value = value == "Y"
    if prop[1][0].endswith("ccc"):
      value = int(value)
  else:
    value = vname
  return value
2162e5b6d6dSopenharmony_ci
# Character data ----------------------------------------------------------- ***

# Lists of NamesList h1 and h2 headings.
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []

# List of Unicode blocks.
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []

# List of ranges with algorithmic names.
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []
2342e5b6d6dSopenharmony_ci
# List of Unicode character ranges and their properties,
# stored as an inversion map with range_start & props dictionary.
# Starts with one range for all of Unicode without any properties.
# Setting values subdivides ranges.
_starts = array.array('l', [0, 0x110000])  # array of signed long (>= 32 bits)
_props = [{}, {}]  # props for 0 and 110000

def FindRange(x):
  """Binary search for x in the inversion map.

  Returns the index of the range that contains x,
  that is, the largest i where _starts[i] <= x."""
  # bisect() returns the smallest index whose start is greater than x;
  # the range containing x is the one just before that.
  return bisect.bisect(_starts, x) - 1
2462e5b6d6dSopenharmony_ci
2472e5b6d6dSopenharmony_ci
def GetProps(c):
  """Returns the props dictionary of the range that contains code point c.

  The dictionary is the one stored in _props, not a copy."""
  i = FindRange(c)
  return _props[i]
2512e5b6d6dSopenharmony_ci
2522e5b6d6dSopenharmony_ci
def UpdateProps(start, end, update):
  """Applies an update to the props of the code points start..end (inclusive).

  update is a sequence (need_to_update, do_update, u) where
  - need_to_update(u, start, end, c_props) returns whether the range
    start..end with the props dictionary c_props needs to be modified,
  - do_update(u, start, end, c_props) modifies c_props in place,
  - u is passed through to both functions as their first argument.

  Splits ranges of the inversion map (_starts/_props) at start and at end+1
  where necessary, but only if need_to_update() returns True for the part
  that overlaps start..end."""
  assert 0 <= start <= end <= 0x10ffff
  (need_to_update, do_update, u) = (update[0], update[1], update[2])
  # Find the index i of the range in _starts that contains start.
  i = FindRange(start)
  limit = end + 1
  # Intersect [start, limit[ with ranges in _starts.
  c_start = _starts[i]
  c_limit = _starts[i + 1]
  c_props = _props[i]
  # c_start <= start < c_limit
  if c_start < start:
    update_limit = min(c_limit, limit)
    if need_to_update(u, start, update_limit - 1, c_props):
      # Split the range at start: [c_start, start[ keeps the old props,
      # and we continue with a copy of them for [start, c_limit[.
      i += 1
      c_props = c_props.copy()
      _starts.insert(i, start)
      _props.insert(i, c_props)
      c_start = start
  # Modify all ranges that are fully inside [start, limit[.
  while c_limit <= limit:
    # start <= c_start < c_limit <= limit
    if need_to_update(u, c_start, c_limit - 1, c_props):
      do_update(u, c_start, c_limit - 1, c_props)
    if c_limit == 0x110000: return  # Reached the end of Unicode.
    i += 1
    c_start = c_limit
    c_limit = _starts[i + 1]
    c_props = _props[i]
  if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
    # Split off [limit, c_limit[ with a copy of c_props.
    _starts.insert(i + 1, limit)
    _props.insert(i + 1, c_props.copy())
    # Modify [c_start, limit[ c_props.
    do_update(u, c_start, limit - 1, c_props)
2892e5b6d6dSopenharmony_ci
2902e5b6d6dSopenharmony_ci
def NeedToSetProps(props, start, end, c_props):
  """Returns True if props is not a sub-dict of c_props.

  That is, if at least one item of props is missing from c_props
  or has a different value there.
  start and end are not used; SetProps() passes this function to
  UpdateProps(), which calls it with these four arguments."""
  return any(pname not in c_props or value != c_props[pname]
             for (pname, value) in props.items())
2962e5b6d6dSopenharmony_ci
2972e5b6d6dSopenharmony_ci
def DoSetProps(props, start, end, c_props):
  """Copies all items of props into c_props, overwriting existing values.

  start and end are not used; SetProps() passes this function to
  UpdateProps(), which calls it with these four arguments."""
  c_props.update(props)
3002e5b6d6dSopenharmony_ci
3012e5b6d6dSopenharmony_ci
def SetProps(start, end, props):
  """Sets the items of the props dictionary for the code points start..end.

  Ranges whose props already contain all of these items are left alone."""
  UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))
3042e5b6d6dSopenharmony_ci
3052e5b6d6dSopenharmony_ci
def NeedToSetAlways(nv, start, end, c_props):
  """Returns True regardless of the arguments.

  Takes the same four parameters as NeedToSetProps()."""
  return True
3082e5b6d6dSopenharmony_ci
3092e5b6d6dSopenharmony_ci
# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
  """Ensure that there is a range start/limit at x.

  If x is inside a range, that range is split at x and the new upper part
  gets its own copy of the props dictionary."""
  assert 0 <= x <= 0x10ffff
  i = FindRange(x)
  if _starts[i] == x: return
  # _starts[i] < x < _starts[i + 1]: Split the range at x.
  _starts.insert(i + 1, x)
  _props.insert(i + 1, _props[i].copy())
3242e5b6d6dSopenharmony_ci
3252e5b6d6dSopenharmony_ci
def SetDefaultValue(pname, value):
  """Sets the property's default value. Ignores null values.

  Does nothing for ignored properties, for value names in _null_names,
  and if the normalized value equals the property's null value.
  Otherwise records the value in _defaults and sets it for all of Unicode."""
  prop = GetProperty(pname)
  if prop and value not in _null_names:
    value = NormalizePropertyValue(prop, value)
    short_name = prop[1][0]
    if value != _null_values[short_name]:
      _defaults[short_name] = value
      SetProps(0, 0x10ffff, {short_name: value})
3342e5b6d6dSopenharmony_ci
3352e5b6d6dSopenharmony_ci
def SetBinaryPropertyToTrue(pname, start, end):
  """Sets the binary property pname to True for the code points start..end.

  Does nothing if the property is ignored."""
  prop = GetProperty(pname)
  if prop:
    assert prop[0] == "Binary"
    SetProps(start, end, {prop[1][0]: True})
3412e5b6d6dSopenharmony_ci
3422e5b6d6dSopenharmony_ci
def SetPropValue(prop, vname, start, end):
  """Sets the normalized form of the value vname for the code points start..end.

  prop is a _properties tuple; the value is stored under its short name."""
  value = NormalizePropertyValue(prop, vname)
  SetProps(start, end, {prop[1][0]: value})
3462e5b6d6dSopenharmony_ci
3472e5b6d6dSopenharmony_ci
def SetPropertyValue(pname, vname, start, end):
  """Like SetPropValue() but looks up the property by its name pname.

  Does nothing if the property is ignored."""
  prop = GetProperty(pname)
  if prop: SetPropValue(prop, vname, start, end)
3512e5b6d6dSopenharmony_ci
# Parsing ------------------------------------------------------------------ ***

_stripped_cp_re = re.compile(r"([0-9a-fA-F]+)$")
_stripped_range_re = re.compile(r"([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
# Default value for all of Unicode.
_missing_re = re.compile(r"# *@missing: *0000\.\.10FFFF *; *(.+)$")
# Default value for some range.
_missing2_re = re.compile(r"# *@missing: *(.+)$")

def ReadUCDLines(in_file, want_ranges=True, want_other=False,
                 want_comments=False, want_missing=False):
  """Parses lines from a semicolon-delimited UCD text file.

  Strips comments, ignores empty and all-comment lines.
  This is a generator. It yields one tuple (type, line, ...) per line:
  - ("range", line, start, end, fields) if want_ranges and the first field
    is a code point or a start..end range (hex numbers)
  - ("other", line, fields) for other data lines if want_other
  - ("comment", line) for whole-line comments if want_comments
  - ("missing", line, fields) if want_missing, for an @missing comment
    that covers 0000..10FFFF; fields are the ones after the range
  If want_missing, an @missing comment for any other range is parsed
  like a regular data line.
  All fields are stripped of surrounding white space.

  Raises SyntaxError for a data line that is not a range line
  (or if not want_ranges) when want_other is False."""
  for line in in_file:
    line = line.strip()
    if not line: continue
    if line.startswith("#"):  # whole-line comment
      parse_data = False
      if want_missing:
        match = _missing_re.match(line)
        if match:
          fields = [field.strip() for field in match.group(1).split(";")]
          yield ("missing", line, fields)
          continue
        match = _missing2_re.match(line)
        if match:
          # Strip the "missing" comment prefix and fall through to
          # parse the remainder of the line like regular data.
          parse_data = True
          line = match.group(1)
      if not parse_data:
        if want_comments: yield ("comment", line)
        continue
    comment_start = line.find("#")  # inline comment
    if comment_start >= 0:
      line = line[:comment_start].rstrip()
      if not line: continue
    fields = [field.strip() for field in line.split(";")]
    if want_ranges:
      first = fields[0]
      match = _stripped_range_re.match(first)
      if match:
        start = int(match.group(1), 16)
        end = int(match.group(2), 16)
        yield ("range", line, start, end, fields)
        continue
      match = _stripped_cp_re.match(first)
      if match:
        c = int(match.group(1), 16)
        yield ("range", line, c, c, fields)
        continue
    if want_other:
      yield ("other", line, fields)
    else:
      raise SyntaxError("unable to parse line\n  %s\n" % line)
4112e5b6d6dSopenharmony_ci
4122e5b6d6dSopenharmony_ci
def AddBinaryProperty(short_name, long_name):
  """Registers an additional binary property with a short and a long name.

  The new property shares the value names and value aliases of the
  "Math" property, which must already be in _properties.
  Its null value is False."""
  _null_values[short_name] = False
  bin_prop = _properties["Math"]
  prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
  _properties[short_name] = prop
  _properties[long_name] = prop
  _properties[NormPropName(short_name)] = prop
  _properties[NormPropName(long_name)] = prop
4212e5b6d6dSopenharmony_ci
4222e5b6d6dSopenharmony_ci
def AddSingleNameBinaryProperty(name):
  """Registers an additional binary property with only one name.

  The name serves as both the short and the long name.
  The new property shares the value names and value aliases of the
  "Math" property, which must already be in _properties.
  Its null value is False."""
  # For some properties, the short name is the same as the long name.
  _null_values[name] = False
  bin_prop = _properties["Math"]
  prop = ("Binary", [name, name], bin_prop[2], bin_prop[3])
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
4302e5b6d6dSopenharmony_ci
4312e5b6d6dSopenharmony_ci
def AddPOSIXBinaryProperty(name):
  """Registers an ICU-specific binary property for a POSIX character class.

  The property has no short name (""), only the long name.
  It is also registered under "posix" + the normalized name.
  The new property shares the value names and value aliases of the
  "Math" property, which must already be in _properties.
  Its null value is False."""
  # We only define a long name for ICU-specific (non-UCD) POSIX properties.
  _null_values[name] = False
  bin_prop = _properties["Math"]
  prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # This is to match UProperty UCHAR_POSIX_ALNUM etc.
  _properties["posix" + NormPropName(name)] = prop
4412e5b6d6dSopenharmony_ci
4422e5b6d6dSopenharmony_ci
# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
_ucd_version_re = re.compile(r"# *PropertyAliases"
                             r"-([0-9]+(?:\.[0-9]+)*)(?:d[0-9]+)?"
                             r"\.txt")

def ParsePropertyAliases(in_file):
  """Parses the lines of PropertyAliases.txt and registers its properties.

  - Sets _ucd_version from the "# PropertyAliases-x.y.z.txt" comment line.
  - Takes the property type and the type's null value from the most recent
    "# <Type> Properties" heading comment.
  - For an ignored property, adds all of its aliases and their normalized
    forms to _ignored_properties.
  - Adds every other property to _properties, under each alias and its
    normalized form, and its null value to _null_values.
  - Finally adds provisional and ICU-specific properties.

  Raises SyntaxError if a property is listed before any type heading,
  and KeyError for a type heading with an unknown property type."""
  global _ucd_version
  prop_type_nulls = {
    "Binary": False,
    "Catalog": "??",  # Must be specified, e.g., in @missing line.
    "Enumerated": "??",  # Must be specified.
    "Numeric": "NaN",
    "String": "",
    "Miscellaneous": ""
  }
  # Both are set by each "<Type> Properties" heading comment.
  prop_type = None
  null_value = None
  for data in ReadUCDLines(in_file, want_ranges=False,
                           want_other=True, want_comments=True):
    if data[0] == "comment":
      line = data[1]
      match = _ucd_version_re.match(line)
      if match:
        _ucd_version = match.group(1)
      else:
        words = line[1:].lstrip().split()
        if len(words) == 2 and words[1] == "Properties":
          prop_type = words[0]
          null_value = prop_type_nulls[prop_type]
    else:
      # type == "other"
      aliases = data[2]
      name = aliases[0]
      if name in _ignored_properties:
        for alias in aliases:
          _ignored_properties.add(alias)
          _ignored_properties.add(NormPropName(alias))
      else:
        if prop_type is None:
          # Without a heading we know neither the type nor the null value.
          raise SyntaxError(
              "property %s listed before any '<Type> Properties' heading\n" %
              name)
        if name.endswith("ccc"):
          _null_values[name] = 0
        else:
          _null_values[name] = null_value
        prop = (prop_type, aliases, set(), {})
        for alias in aliases:
          _properties[alias] = prop
          _properties[NormPropName(alias)] = prop
  # Add provisional and ICU-specific properties we need.
  # We add some in support of runtime API, even if we do not write
  # data for them to ppucd.txt (e.g., lccc & tccc).
  # We add others just to represent UCD data that contributes to
  # some functionality, although Unicode has not "blessed" them
  # as separate properties (e.g., Turkic_Case_Folding).

  # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
  name = "Turkic_Case_Folding"
  _null_values[name] = ""
  prop = ("String", [name, name], set(), {})
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
  name = "Conditional_Case_Mappings"
  _null_values[name] = ""
  prop = ("Miscellaneous", [name, name], set(), {})
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # lccc = ccc of first cp in canonical decomposition.
  # lccc and tccc share the value names and value aliases of ccc.
  _null_values["lccc"] = 0
  ccc_prop = list(_properties["ccc"])
  ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
  prop = tuple(ccc_prop)
  _properties["lccc"] = prop
  _properties["Lead_Canonical_Combining_Class"] = prop
  _properties["leadcanonicalcombiningclass"] = prop
  # tccc = ccc of last cp in canonical decomposition.
  _null_values["tccc"] = 0
  ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
  prop = tuple(ccc_prop)
  _properties["tccc"] = prop
  _properties["Trail_Canonical_Combining_Class"] = prop
  _properties["trailcanonicalcombiningclass"] = prop
  # Script_Extensions
  if "scx" not in _properties:
    _null_values["scx"] = ""
    prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
    _properties["scx"] = prop
    _properties["Script_Extensions"] = prop
    _properties["scriptextensions"] = prop
  # General Category as a bit mask.
  _null_values["gcm"] = "??"
  gc_prop = _properties["gc"]
  prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
  _properties["gcm"] = prop
  _properties["General_Category_Mask"] = prop
  _properties["generalcategorymask"] = prop
  # Various binary properties.
  AddBinaryProperty("Sensitive", "Case_Sensitive")
  AddBinaryProperty("nfdinert", "NFD_Inert")
  AddBinaryProperty("nfkdinert", "NFKD_Inert")
  AddBinaryProperty("nfcinert", "NFC_Inert")
  AddBinaryProperty("nfkcinert", "NFKC_Inert")
  AddBinaryProperty("segstart", "Segment_Starter")
  # https://www.unicode.org/reports/tr51/#Emoji_Properties
  AddBinaryProperty("Emoji", "Emoji")
  AddBinaryProperty("EPres", "Emoji_Presentation")
  AddBinaryProperty("EMod", "Emoji_Modifier")
  AddBinaryProperty("EBase", "Emoji_Modifier_Base")
  AddBinaryProperty("EComp", "Emoji_Component")
  AddBinaryProperty("ExtPict", "Extended_Pictographic")
  # https://www.unicode.org/reports/tr51/#Emoji_Sets
  AddSingleNameBinaryProperty("Basic_Emoji")
  AddSingleNameBinaryProperty("Emoji_Keycap_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_Modifier_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_Flag_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_Tag_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_ZWJ_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji")
  # C/POSIX character classes that do not have Unicode property [value] aliases.
  # See uchar.h.
  AddPOSIXBinaryProperty("alnum")
  AddPOSIXBinaryProperty("blank")
  AddPOSIXBinaryProperty("graph")
  AddPOSIXBinaryProperty("print")
  AddPOSIXBinaryProperty("xdigit")
5662e5b6d6dSopenharmony_ci
5672e5b6d6dSopenharmony_ci
def ParsePropertyValueAliases(in_file):
  """Registers property value aliases and fills in missing default values.

  For each "missing" line, sets the default value of the named property.
  For each other line, the first field names a property and the remaining
  fields are the aliases of one of its values: the short name is added to
  the property's set of short value names, and every alias (as written and
  normalized via NormPropName()) is mapped to the shared list of aliases.
  The value dict of the first Binary property seen is remembered in
  _binary_values while that global is still empty.

  Afterwards, default values for dt, nt, hst, gc and gcm are supplied,
  and the script codes from _scripts_only_in_iso15924 are added to "sc".

  Raises ValueError if one of those script codes is already a known
  short script name.
  """
  global _binary_values
  for data in ReadUCDLines(in_file, want_ranges=False,
                           want_other=True, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(data[2][0], data[2][1])
      continue
    # type == "other"
    aliases = data[2]
    prop = GetProperty(aliases[0])
    if not prop:
      continue
    del aliases[0]  # Drop the property name; only the value aliases remain.
    short_name = aliases[0]
    if short_name == "n/a":  # no short name
      # Keep the slot but leave it empty, and use the long name instead.
      aliases[0] = ""
      short_name = aliases[1]
    prop[2].add(short_name)
    values = prop[3]
    for alias in aliases:
      if not alias:
        continue
      # All aliases of a value share the very same list object.
      values[alias] = aliases
      values[NormPropName(alias)] = aliases
    if prop[0] == "Binary" and not _binary_values:
      _binary_values = values
  # Some of the @missing lines with non-null default property values
  # are in files that we do not parse:
  # either because we get that property's data from another file
  # (so the @missing line would be the only reason to parse such a file)
  # or because we compute the property at runtime,
  # such as the Hangul_Syllable_Type.
  for (pname, fallback) in (
      ("dt", "None"),  # DerivedDecompositionType.txt
      ("nt", "None"),  # DerivedNumericType.txt
      ("hst", "NA"),   # HangulSyllableType.txt
      ("gc", "Cn")):   # No @missing line in any .txt file?
    _defaults.setdefault(pname, fallback)
  # gcm shares the default value of gc.
  _defaults["gcm"] = _defaults["gc"]
  # Add ISO 15924-only script codes.
  # Only for the ICU script code API, not necessary for parsing the UCD.
  script_prop = _properties["sc"]
  (short_script_names, script_values) = (script_prop[2], script_prop[3])
  now_in_ucd = []
  for script in _scripts_only_in_iso15924:
    if script in short_script_names:
      # The UCD has caught up with this code; report it below.
      now_in_ucd.append(script)
      continue
    short_script_names.add(script)
    # Do not invent a Unicode long script name before the UCD adds the script.
    names = [script, script]  # [short, long]
    script_values[script] = names
    # Probably not necessary because
    # we will not parse these scripts from the UCD:
    script_values[NormPropName(script)] = names
  if now_in_ucd:
    raise ValueError(
        "remove %s from _scripts_only_in_iso15924" % now_in_ucd)
6292e5b6d6dSopenharmony_ci
6302e5b6d6dSopenharmony_ci
def ParseBlocks(in_file):
  """Parses block ranges, records them in _blocks and sets the blk property.

  A "missing" line sets the default value for "blk".
  Every other line is a code point range whose second field is the block
  name; the range is appended to _blocks as (start, end, {"blk": name})
  and the name is set as the blk value for start..end.
  Finally _blocks is sorted by range and checked for overlaps.

  Raises ValueError if a block starts at or before the end of a
  preceding block.
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue("blk", data[2][0])
    else:
      # type == "range"
      (start, end, name) = (data[2], data[3], data[4][1])
      _blocks.append((start, end, {"blk": name}))
      SetPropertyValue("blk", name, start, end)
  # Sort by the range only: comparing whole tuples would fall through to the
  # dict payload (a TypeError) when two blocks have the same start and end,
  # which is exactly the case the overlap check below should report.
  _blocks.sort(key=lambda b: (b[0], b[1]))
  # Check for overlapping blocks.
  prev_end = -1
  for b in _blocks:
    start = b[0]
    end = b[1]
    if prev_end >= start:
      # The parentheses matter: % binds tighter than +, so without them
      # only the second literal would be formatted and the raise itself
      # would fail with a TypeError.
      raise ValueError(
          ("block %04lX..%04lX %s overlaps with another " +
           "ending at %04lX\n") %
          (start, end, b[2]["blk"], prev_end))
    prev_end = end
6522e5b6d6dSopenharmony_ci
6532e5b6d6dSopenharmony_ci
def ParseUnicodeData(in_file):
  """Parses UnicodeData.txt lines and sets the properties they carry.

  Each line describes one code point, except that a "<..., First>" line
  followed by a "<..., Last>" line describes a whole range.
  Fields used: 1=name, 2=gc, 3=ccc, 4=bc, 5=decomposition type & mapping,
  6/7/8=decimal/digit/numeric value, 9=Bidi_M, 12/13/14=suc/slc/stc.
  Ranges of ideographs and Hangul syllables are also remembered in
  _alg_names_ranges.

  Raises SyntaxError for unbalanced, misordered or mismatched First/Last
  pairs and for inconsistent numeric value fields.
  """
  # Twelfths that are stored in lowest terms instead.
  # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
  # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
  reduced_twelfths = {
      "2/12": "1/6",
      "3/12": "1/4",
      "4/12": "1/3",
      "6/12": "1/2",
      "8/12": "2/3",
      "9/12": "3/4",
      "10/12": "5/6",
  }
  dt_prop = GetProperty("dt")
  # State of a "<..., First>" line that still waits for its "<..., Last>".
  open_range_line = ""
  open_range_start = -1
  for data in ReadUCDLines(in_file, want_missing=True):
    # type == "range"
    line = data[1]
    c = data[2]
    end = data[3]
    fields = data[4]
    assert c == end
    name = fields[1]
    if name.startswith("<"):
      if name.endswith(", First>"):
        if open_range_start >= 0:
          raise SyntaxError(
              "error: unterminated range started at\n  %s\n" %
              open_range_line)
        (open_range_start, open_range_line) = (c, line)
        continue
      if name.endswith(", Last>"):
        if open_range_start < 0:
          raise SyntaxError(
              "error: range end without start at\n  %s\n" %
              line)
        if open_range_start > c:
          raise SyntaxError(
              "error: range start/end out of order at\n  %s\n  %s\n" %
              (open_range_line, line))
        # Strip "<" plus ", Last>" here and "<" plus ", First>" there.
        range_name = name[1:-7]
        if open_range_line.split(";")[1][1:-8] != range_name:
          raise SyntaxError(
              "error: range start/end name mismatch at\n  %s\n  %s\n" %
              (open_range_line, line))
        # From here on, this line stands for the whole range.
        (c, end) = (open_range_start, c)
        open_range_start = -1
        # Remember algorithmic name ranges.
        if "Ideograph" in range_name:
          if c in (0x17000, 0x18D00):
            prefix = "TANGUT IDEOGRAPH-"
          else:
            prefix = "CJK UNIFIED IDEOGRAPH-"
          _alg_names_ranges.append([c, end, "han", prefix])
        elif range_name == "Hangul Syllable":
          _alg_names_ranges.append([c, end, "hangul"])
      # Neither range labels nor non-names like <control> become na values.
      name = ""
    props = {}
    if name:
      props["na"] = name
    props["gc"] = fields[2]
    ccc = int(fields[3])
    if ccc:
      props["ccc"] = ccc
    props["bc"] = fields[4]
    # Decomposition type & mapping.
    dm = fields[5]
    if dm:
      if not dm.startswith("<"):
        # No <type> tag means a canonical decomposition.
        dt = "Can"
      else:
        dt_limit = dm.index(">")
        dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
        dm = dm[dt_limit + 1:].lstrip()
      props["dt"] = dt
      props["dm"] = dm
    # Numeric type & value.
    (decimal, digit, nv) = (fields[6], fields[7], fields[8])
    if (decimal and decimal != nv) or (digit and digit != nv):
      raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
    if nv:
      props["nv"] = reduced_twelfths.get(nv, nv)
      if decimal:
        props["nt"] = "De"
      elif digit:
        props["nt"] = "Di"
      else:
        props["nt"] = "Nu"
    if fields[9] == "Y":
      props["Bidi_M"] = True
    # ICU 49 and above does not support Unicode_1_Name any more.
    # See ticket #9013.
    # Field 10 (na1) is therefore not read.
    # ISO_Comment (field 11, isc) is deprecated and has no values.
    # Simple case mappings.
    suc = fields[12]
    slc = fields[13]
    stc = fields[14]
    for (pname, mapping) in (("suc", suc), ("slc", slc), ("stc", stc)):
      if mapping:
        props[pname] = mapping
    SetProps(c, end, props)
  if open_range_start >= 0:
    raise SyntaxError(
        "error: unterminated range started at\n  %s\n" %
        open_range_line)
  # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
  SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
  _alg_names_ranges.sort()
7672e5b6d6dSopenharmony_ci
7682e5b6d6dSopenharmony_ci
# Line patterns used by ParseNamesList().
# These are not raw strings, so each "\t" below is a literal tab character.
# h1 heading: "@@", tab, hex start, tab, heading text, tab, hex end.
_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
# h2 subheading: "@", two tabs, subheading text.
_names_h2_re = re.compile("@\t\t(.+)")
# Character line: hex code point, tab, then at least one more character.
_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")
7722e5b6d6dSopenharmony_ci
def ParseNamesList(in_file):
  """Collects h1 and h2 headings from names-list lines into _h1 and _h2.

  An h1 line yields (start, end, heading text) for _h1.
  An h2 line is held back until the next character line, and then stored
  in _h2 as (code point, subheading text); an h1 line in between drops it.
  U+00A0 in heading texts is replaced with a regular space.
  Both lists are sorted at the end.
  """
  subheading = ""  # h2 text that still waits for its first character line
  for raw_line in in_file:
    line = raw_line.strip()
    if not line:
      continue
    h1_match = _names_h1_re.match(line)
    if h1_match:
      subheading = ""  # Drop a pending h2 when we get to an h1.
      first = int(h1_match.group(1), 16)
      last = int(h1_match.group(3), 16)
      _h1.append((first, last, h1_match.group(2).replace(u"\xa0", " ")))
      continue
    h2_match = _names_h2_re.match(line)
    if h2_match:
      subheading = h2_match.group(1).replace(u"\xa0", " ")
      continue
    if not subheading:
      continue
    char_match = _names_char_re.match(line)
    if char_match:
      # Attach the subheading to the first character that follows it.
      _h2.append((int(char_match.group(1), 16), subheading))
      subheading = ""
  _h1.sort()
  _h2.sort()
7982e5b6d6dSopenharmony_ci
7992e5b6d6dSopenharmony_ci
def ParseNamedProperties(in_file):
  """Parses a .txt file whose lines name the property they set.

  The first column is a code point range and the second a property name.
  With only those two columns, the binary property is set to True;
  otherwise the property is set to the value in the third column.
  A "missing" line sets the default value of the property it names."""
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(data[2][0], data[2][1])
      continue
    # type == "range"
    fields = data[4]
    is_binary = len(fields) == 2
    if is_binary:
      SetBinaryPropertyToTrue(fields[1], data[2], data[3])
    else:
      SetPropertyValue(fields[1], fields[2], data[2], data[3])
8142e5b6d6dSopenharmony_ci
8152e5b6d6dSopenharmony_ci
def ParseOneProperty(in_file, pname):
  """Parses a .txt file that holds the values of the one property pname.

  The first column is a code point range, the second column the value.
  A "missing" line supplies the default value for pname."""
  prop = GetProperty(pname)
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] != "missing":
      # type == "range"
      (start, end, value) = (data[2], data[3], data[4][1])
      SetPropValue(prop, value, start, end)
    else:
      SetDefaultValue(pname, data[2][0])
8262e5b6d6dSopenharmony_ci
8272e5b6d6dSopenharmony_ci
def ParseBidiMirroring(in_file):
  """Parses a one-property file with bmg values."""
  ParseOneProperty(in_file, "bmg")


def ParseDerivedAge(in_file):
  """Parses a one-property file with age values."""
  ParseOneProperty(in_file, "age")


def ParseDerivedBidiClass(in_file):
  """Parses a one-property file with bc values."""
  ParseOneProperty(in_file, "bc")


def ParseDerivedJoiningGroup(in_file):
  """Parses a one-property file with jg values."""
  ParseOneProperty(in_file, "jg")


def ParseDerivedJoiningType(in_file):
  """Parses a one-property file with jt values."""
  ParseOneProperty(in_file, "jt")


def ParseEastAsianWidth(in_file):
  """Parses a one-property file with ea values."""
  ParseOneProperty(in_file, "ea")


def ParseGraphemeBreakProperty(in_file):
  """Parses a one-property file with GCB values."""
  ParseOneProperty(in_file, "GCB")


def ParseIndicPositionalCategory(in_file):
  """Parses a one-property file with InPC values."""
  ParseOneProperty(in_file, "InPC")


def ParseIndicSyllabicCategory(in_file):
  """Parses a one-property file with InSC values."""
  ParseOneProperty(in_file, "InSC")


def ParseLineBreak(in_file):
  """Parses a one-property file with lb values."""
  ParseOneProperty(in_file, "lb")


def ParseScripts(in_file):
  """Parses a one-property file with sc values."""
  ParseOneProperty(in_file, "sc")


def ParseScriptExtensions(in_file):
  """Parses a one-property file with scx values."""
  ParseOneProperty(in_file, "scx")


def ParseSentenceBreak(in_file):
  """Parses a one-property file with SB values."""
  ParseOneProperty(in_file, "SB")


def ParseVerticalOrientation(in_file):
  """Parses a one-property file with vo values."""
  ParseOneProperty(in_file, "vo")


def ParseWordBreak(in_file):
  """Parses a one-property file with WB values."""
  ParseOneProperty(in_file, "WB")
8432e5b6d6dSopenharmony_ci
8442e5b6d6dSopenharmony_ci
def DoSetNameAlias(alias, start, end, c_props):
  """Adds alias to the comma-separated Name_Alias value in c_props.

  start and end are not used here."""
  if "Name_Alias" not in c_props:
    # First alias for this code point.
    c_props["Name_Alias"] = alias
    return
  c_props["Name_Alias"] += ',' + alias
8502e5b6d6dSopenharmony_ci
8512e5b6d6dSopenharmony_ci
def ParseNameAliases(in_file):
  """Parses Name_Alias values from NameAliases.txt.
  One character may have several aliases; each is stored as "type=alias".

  Unicode 6.0 format: two columns,
  where the second column is a name correction.

  Unicode 6.1 format: three columns,
  where the second is an alias and the third is its type.
  The documented types are:
    correction, control, alternate, figment, abbreviation

  The types are not sorted here; they are assumed to appear in this order.

  Raises ValueError if a line applies to a range of code points."""
  for data in ReadUCDLines(in_file):
    (start, end) = (data[2], data[3])
    if start != end:
      raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
                       (start, end))
    fields = data[4]
    # Two columns: the alias is implicitly a correction.
    alias_type = "correction" if len(fields) == 2 else fields[2]
    alias = alias_type + '=' + fields[1]
    UpdateProps(start, end, (NeedToSetAlways, DoSetNameAlias, alias))
8782e5b6d6dSopenharmony_ci
8792e5b6d6dSopenharmony_ci
def NeedToSetNumericValue(nv, start, end, c_props):
  """Tells whether the numeric value nv still has to be set in c_props.

  Returns True if c_props has no "nv" yet, and False if it already has
  the same value.
  start and end are only used for the error message.

  Raises ValueError if c_props has a different "nv" value.
  """
  c_nv = c_props.get("nv")
  # Identity test: None is the "no value" marker returned by .get().
  if c_nv is None:
    # DerivedNumericValues.txt adds a Numeric_Value.
    # Without a value there must not be a numeric type either.
    assert "nt" not in c_props
    return True
  if nv != c_nv:
    raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
                     "but DerivedNumericValues.txt has nv=%s") %
                     (c_nv, start, end, nv))
  return False
8912e5b6d6dSopenharmony_ci
8922e5b6d6dSopenharmony_ci
def DoSetNumericValue(nv, start, end, c_props):
  """Sets nt=Nu and nv in c_props; start and end are not used here."""
  c_props["nt"] = "Nu"
  c_props["nv"] = nv
8952e5b6d6dSopenharmony_ci
8962e5b6d6dSopenharmony_ci
def ParseDerivedNumericValues(in_file):
  """Parses DerivedNumericValues.txt.
  Most numeric types & values are already known from UnicodeData.txt,
  but that file does not show the values for Han characters.
  This checks that the values here agree with those from UnicodeData.txt
  and adds the ones that are new."""
  # The @missing line is not requested: it has an incorrect number of fields,
  # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
  # Also, "NaN" is just the Numeric null value anyway.
  for data in ReadUCDLines(in_file):
    (start, end, fields) = (data[2], data[3], data[4])
    # The numeric value is in the 4th field; it is only set where needed.
    UpdateProps(start, end,
                (NeedToSetNumericValue, DoSetNumericValue, fields[3]))
9102e5b6d6dSopenharmony_ci
9112e5b6d6dSopenharmony_ci
def ParseCaseFolding(in_file):
  """Parses case folding lines and sets scf, cf and Turkic_Case_Folding.

  A "missing" line (which must be for status C) sets the default value
  for both scf and cf.
  Every other line is a code point range with a status in the second
  field and a mapping in the third:
    C sets scf and cf, S sets scf, F sets cf, T sets Turkic_Case_Folding.

  Raises ValueError for any other status.
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      assert data[2][0] == "C"  # common to scf & cf
      SetDefaultValue("scf", data[2][1])
      SetDefaultValue("cf", data[2][1])
    else:
      # type == "range"
      start = data[2]
      end = data[3]
      status = data[4][1]
      mapping = data[4][2]
      if status == "C":
        SetProps(start, end, {"scf": mapping, "cf": mapping})
      elif status == "S":
        SetPropertyValue("scf", mapping, start, end)
      elif status == "F":
        SetPropertyValue("cf", mapping, start, end)
      elif status == "T":
        SetPropertyValue("Turkic_Case_Folding", mapping, start, end)
      else:
        # Reject explicitly: a substring test like `status in "CSFT"` would
        # let "" or "CS" through and treat them as Turkic mappings.
        raise ValueError(
            "unknown case folding status %r for %04lX..%04lX" %
            (status, start, end))
9332e5b6d6dSopenharmony_ci
9342e5b6d6dSopenharmony_ci
def DoSetConditionalCaseMappings(ccm, start, end, c_props):
  """Adds ccm to the comma-separated Conditional_Case_Mappings in c_props.

  start and end are not used here."""
  if "Conditional_Case_Mappings" not in c_props:
    # First conditional mapping for this code point.
    c_props["Conditional_Case_Mappings"] = ccm
    return
  c_props["Conditional_Case_Mappings"] += ',' + ccm
9402e5b6d6dSopenharmony_ci
9412e5b6d6dSopenharmony_ci
def ParseSpecialCasing(in_file):
  """Parses special casing lines and sets lc, tc, uc or conditional mappings.

  A "missing" line supplies the default values for lc, tc and uc.
  Every other line is a code point range followed by the lc, tc and uc
  mappings and an optional condition in the fifth field.
  Without a condition, lc/tc/uc are set directly; with one, the mappings
  are appended to Conditional_Case_Mappings as
  "condition:lc=...&tc=...&uc=...".
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      default_mappings = data[2]
      SetDefaultValue("lc", default_mappings[0])
      SetDefaultValue("tc", default_mappings[1])
      SetDefaultValue("uc", default_mappings[2])
      continue
    # type == "range"
    (start, end, fields) = (data[2], data[3], data[4])
    # A missing fifth field counts the same as an empty one.
    condition = fields[4] if len(fields) >= 5 else ""
    if not condition:
      # Unconditional mappings.
      SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
      continue
    # Conditional_Case_Mappings
    ccm = "".join((condition, ":lc=", fields[1],
                   "&tc=", fields[2], "&uc=", fields[3]))
    UpdateProps(start, end,
                (NeedToSetAlways, DoSetConditionalCaseMappings, ccm))
9622e5b6d6dSopenharmony_ci
9632e5b6d6dSopenharmony_ci
def ParseBidiBrackets(in_file):
  """Parses bidi bracket lines and sets bpb and bpt.

  A "missing" line supplies the default value for bpt from its second field.
  Every other line must be for a single code point; its second field is
  the bpb mapping and its third field the bpt bracket type.
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue("bpt", data[2][1])
      continue
    # type == "range"
    (start, end) = (data[2], data[3])
    assert start == end
    fields = data[4]
    SetProps(start, end, {"bpb": fields[1], "bpt": fields[2]})
9762e5b6d6dSopenharmony_ci
9772e5b6d6dSopenharmony_ci# Postprocessing ----------------------------------------------------------- ***
9782e5b6d6dSopenharmony_ci
def PrintedSize(pname, value):
  """Returns the number of characters needed to write pname with value.

  True is counted as ";pname", False as ";-pname",
  and any other value as ";pname=value" using str(value)."""
  name_length = len(pname)
  if not isinstance(value, bool):
    return name_length + len(str(value)) + 2  # ";pname=value"
  # ";pname" for True, ";-pname" for False
  return name_length + (1 if value else 2)
9872e5b6d6dSopenharmony_ci
9882e5b6d6dSopenharmony_ci
9892e5b6d6dSopenharmony_cidef CompactBlock(b, i):
9902e5b6d6dSopenharmony_ci  assert b[0] == _starts[i]
9912e5b6d6dSopenharmony_ci  b_props = b[2]  # Normally just blk from Blocks.txt.
9922e5b6d6dSopenharmony_ci  # b_props["blk"] has not been canonicalized yet.
9932e5b6d6dSopenharmony_ci  b_props["blk"] = _props[i]["blk"]
9942e5b6d6dSopenharmony_ci  orig_i = i
9952e5b6d6dSopenharmony_ci  # Count the number of occurrences of each property's value in this block.
9962e5b6d6dSopenharmony_ci  # To minimize the output, count the number of assigned ranges,
9972e5b6d6dSopenharmony_ci  # not the number of code points.
9982e5b6d6dSopenharmony_ci  num_ranges = 0
9992e5b6d6dSopenharmony_ci  prop_counters = {}
10002e5b6d6dSopenharmony_ci  if "gc" in b_props:
10012e5b6d6dSopenharmony_ci    b_is_unassigned = b_props["gc"] == "Cn"  # Unreachable with normal data.
10022e5b6d6dSopenharmony_ci  else:
10032e5b6d6dSopenharmony_ci    b_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
10042e5b6d6dSopenharmony_ci  while True:
10052e5b6d6dSopenharmony_ci    start = _starts[i]
10062e5b6d6dSopenharmony_ci    if start > b[1]: break
10072e5b6d6dSopenharmony_ci    props = _props[i]
10082e5b6d6dSopenharmony_ci    if "gc" in props:
10092e5b6d6dSopenharmony_ci      is_unassigned = props["gc"] == "Cn"
10102e5b6d6dSopenharmony_ci    else:
10112e5b6d6dSopenharmony_ci      is_unassigned = b_is_unassigned
10122e5b6d6dSopenharmony_ci    if is_unassigned:
10132e5b6d6dSopenharmony_ci      # Compact an unassigned range inside the block and
10142e5b6d6dSopenharmony_ci      # mark it to be written with "unassigned".
10152e5b6d6dSopenharmony_ci      # It falls back to default properties, not block properties,
10162e5b6d6dSopenharmony_ci      # except for the blk=Block property.
10172e5b6d6dSopenharmony_ci      assert props["blk"] == b_props["blk"]
10182e5b6d6dSopenharmony_ci      del props["blk"]
10192e5b6d6dSopenharmony_ci      for pname in list(props.keys()):  # .keys() is a copy so we can del props[pname].
10202e5b6d6dSopenharmony_ci        if props[pname] == _null_or_defaults[pname]: del props[pname]
10212e5b6d6dSopenharmony_ci      # What remains are unusual default values for unassigned code points.
10222e5b6d6dSopenharmony_ci      # For example, bc=R or lb=ID.
10232e5b6d6dSopenharmony_ci      # See http://www.unicode.org/reports/tr44/#Default_Values_Table
10242e5b6d6dSopenharmony_ci      props["unassigned"] = True
10252e5b6d6dSopenharmony_ci    else:
10262e5b6d6dSopenharmony_ci      for (pname, value) in props.items():
10272e5b6d6dSopenharmony_ci        if pname in prop_counters:
10282e5b6d6dSopenharmony_ci          counter = prop_counters[pname]
10292e5b6d6dSopenharmony_ci        else:
10302e5b6d6dSopenharmony_ci          counter = {_null_or_defaults[pname]: num_ranges}
10312e5b6d6dSopenharmony_ci          prop_counters[pname] = counter
10322e5b6d6dSopenharmony_ci        if value in counter:
10332e5b6d6dSopenharmony_ci          counter[value] += 1
10342e5b6d6dSopenharmony_ci        else:
10352e5b6d6dSopenharmony_ci          counter[value] = 1
10362e5b6d6dSopenharmony_ci      # Also count default values for properties that do not occur in a range.
10372e5b6d6dSopenharmony_ci      for pname in prop_counters:
10382e5b6d6dSopenharmony_ci        if pname not in props:
10392e5b6d6dSopenharmony_ci          counter = prop_counters[pname]
10402e5b6d6dSopenharmony_ci          value = _null_or_defaults[pname]
10412e5b6d6dSopenharmony_ci          counter[value] += 1
10422e5b6d6dSopenharmony_ci      num_ranges += 1
10432e5b6d6dSopenharmony_ci      # Invariant: For each counter, the sum of counts must equal num_ranges.
10442e5b6d6dSopenharmony_ci    i += 1
10452e5b6d6dSopenharmony_ci  # For each property that occurs within this block,
10462e5b6d6dSopenharmony_ci  # set the value that reduces the file size the most as a block property value.
10472e5b6d6dSopenharmony_ci  # This is usually the most common value.
10482e5b6d6dSopenharmony_ci  for (pname, counter) in prop_counters.items():
10492e5b6d6dSopenharmony_ci    default_value = _null_or_defaults[pname]
10502e5b6d6dSopenharmony_ci    default_size = PrintedSize(pname, default_value) * counter[default_value]
10512e5b6d6dSopenharmony_ci    max_value = None
10522e5b6d6dSopenharmony_ci    max_count = 0
10532e5b6d6dSopenharmony_ci    max_savings = 0
10542e5b6d6dSopenharmony_ci    for (value, count) in counter.items():
10552e5b6d6dSopenharmony_ci      if value != default_value and count > 1:
10562e5b6d6dSopenharmony_ci        # Does the file get smaller by setting the block default?
10572e5b6d6dSopenharmony_ci        # We save writing the block value as often as it occurs,
10582e5b6d6dSopenharmony_ci        # minus once for writing it for the block,
10592e5b6d6dSopenharmony_ci        # minus writing the default value instead.
10602e5b6d6dSopenharmony_ci        savings = PrintedSize(pname, value) * (count - 1) - default_size
10612e5b6d6dSopenharmony_ci        # For two values with the same savings, pick the one that compares lower,
10622e5b6d6dSopenharmony_ci        # to make this deterministic (avoid flip-flopping).
10632e5b6d6dSopenharmony_ci        if (savings > max_savings or
10642e5b6d6dSopenharmony_ci            (savings > 0 and savings == max_savings and value < max_value)):
10652e5b6d6dSopenharmony_ci          max_value = value
10662e5b6d6dSopenharmony_ci          max_count = count
10672e5b6d6dSopenharmony_ci          max_savings = savings
10682e5b6d6dSopenharmony_ci    # Do not compress uncompressible properties,
10692e5b6d6dSopenharmony_ci    # with an exception for many empty-string values in a block
10702e5b6d6dSopenharmony_ci    # (NFKC_CF='' for tags and variation selectors).
10712e5b6d6dSopenharmony_ci    if (max_savings > 0 and
10722e5b6d6dSopenharmony_ci        ((pname not in _uncompressible_props) or
10732e5b6d6dSopenharmony_ci          (max_value == '' and max_count >= 12))):
10742e5b6d6dSopenharmony_ci      b_props[pname] = max_value
10752e5b6d6dSopenharmony_ci  # For each range and property, remove the default+block value
10762e5b6d6dSopenharmony_ci  # but set the default value if that property was not set
10772e5b6d6dSopenharmony_ci  # (i.e., it used to inherit the default value).
10782e5b6d6dSopenharmony_ci  b_defaults = _null_or_defaults.copy()
10792e5b6d6dSopenharmony_ci  b_defaults.update(b_props)
10802e5b6d6dSopenharmony_ci  i = orig_i
10812e5b6d6dSopenharmony_ci  while True:
10822e5b6d6dSopenharmony_ci    start = _starts[i]
10832e5b6d6dSopenharmony_ci    if start > b[1]: break
10842e5b6d6dSopenharmony_ci    props = _props[i]
10852e5b6d6dSopenharmony_ci    if "unassigned" not in props:
10862e5b6d6dSopenharmony_ci      # Compact an assigned range inside the block.
10872e5b6d6dSopenharmony_ci      for pname in prop_counters:
10882e5b6d6dSopenharmony_ci        if pname in props:
10892e5b6d6dSopenharmony_ci          if props[pname] == b_defaults[pname]: del props[pname]
10902e5b6d6dSopenharmony_ci        elif pname in b_props:
10912e5b6d6dSopenharmony_ci          # b_props only has non-default values.
10922e5b6d6dSopenharmony_ci          # Set the default value if it used to be inherited.
10932e5b6d6dSopenharmony_ci          props[pname] = _null_or_defaults[pname]
10942e5b6d6dSopenharmony_ci      # If there is only one assigned range, then move all of its properties
10952e5b6d6dSopenharmony_ci      # to the block.
10962e5b6d6dSopenharmony_ci      if num_ranges == 1:
10972e5b6d6dSopenharmony_ci        b_props.update(props)
10982e5b6d6dSopenharmony_ci        props.clear()
10992e5b6d6dSopenharmony_ci    i += 1
11002e5b6d6dSopenharmony_ci  # Return the _starts index of the first range after this block.
11012e5b6d6dSopenharmony_ci  return i
11022e5b6d6dSopenharmony_ci
11032e5b6d6dSopenharmony_ci
def CompactNonBlock(limit, i):
  """Remove default property values from between-block ranges.

  Walks _starts/_props from index i and handles every range whose start
  is below limit. Each property whose value equals its entry in
  _null_or_defaults is deleted from the range's props dict. If a range still
  has properties afterwards and its gc (or the default gc) is "Cn",
  the marker props["unassigned"] = True is added.

  Returns the _starts index of the first range that starts at or after limit.
  """
  fallback_gc = _defaults["gc"]  # This is expected to be "Cn".
  while _starts[i] < limit:
    range_props = _props[i]
    # Look at gc before the loop below may delete it as a default value.
    is_unassigned = range_props.get("gc", fallback_gc) == "Cn"
    # Collect the names first so that we can delete while not iterating.
    redundant = [pname for (pname, value) in range_props.items()
                 if value == _null_or_defaults[pname]]
    for pname in redundant:
      del range_props[pname]
    assert "blk" not in range_props
    # If there are no props left, then nothing will be printed.
    # Otherwise, add "unassigned" for more obvious output.
    if range_props and is_unassigned:
      range_props["unassigned"] = True
    i += 1
  return i
11252e5b6d6dSopenharmony_ci
11262e5b6d6dSopenharmony_ci
def CompactBlocks():
  """Optimizes block properties.

  First adds a boundary at the start of each block in _blocks and just
  after its end (unless that would be beyond U+10FFFF).
  Then walks ranges and blocks together: ranges before a block go through
  CompactNonBlock(), the block itself through CompactBlock(),
  and everything after the last block through CompactNonBlock() again.
  """
  # Make sure that _starts has a boundary on both sides of every block
  # so that the simple mixing method below works.
  for block in _blocks:
    AddBoundary(block[0])
    after_block = block[1] + 1
    if after_block <= 0x10ffff:
      AddBoundary(after_block)
  # Walk through ranges and blocks together.
  index = 0
  for block in _blocks:
    if _starts[index] < block[0]:
      index = CompactNonBlock(block[0], index)
    index = CompactBlock(block, index)
  CompactNonBlock(0x110000, index)
11452e5b6d6dSopenharmony_ci
# Output ------------------------------------------------------------------- ***
11472e5b6d6dSopenharmony_ci
def AppendRange(fields, start, end):
  """Appends the code point or range start..end to the fields list.

  A single code point (start == end) is formatted as at least four
  uppercase hex digits; a real range as two such numbers joined by "..".
  """
  if start != end:
    text = "%04lX..%04lX" % (start, end)
  else:
    text = "%04lX" % start
  fields.append(text)
11532e5b6d6dSopenharmony_ci
11542e5b6d6dSopenharmony_ci
def AppendProps(fields, props):
  """Appends one field per property in props to the fields list.

  Properties are output in the order of their normalized names
  (NormPropName). A bool value True yields just the property name,
  False yields "-" plus the name, and any other value yields "name=value".
  """
  for pname in sorted(props, key=NormPropName):
    value = props[pname]
    if not isinstance(value, bool):
      fields.append("%s=%s" % (pname, value))
    elif value:
      fields.append(pname)
    else:
      fields.append("-" + pname)
11652e5b6d6dSopenharmony_ci
11662e5b6d6dSopenharmony_ci
def WriteFieldsRangeProps(fields, start, end, props, out_file):
  """Writes one semicolon-separated line to out_file.

  The line consists of the given leading fields, followed by the
  start..end range and then the properties from props.
  Note that the fields list is extended in place.
  """
  AppendRange(fields, start, end)
  AppendProps(fields, props)
  out_file.write(";".join(fields) + "\n")
11722e5b6d6dSopenharmony_ci
11732e5b6d6dSopenharmony_ci
def EscapeNonASCII(s):
  r"""Returns a copy of s with all non-ASCII characters escaped.

  Characters up to U+007F are kept as they are.
  Characters up to U+FFFF turn into \uXXXX (four uppercase hex digits),
  and higher code points turn into \UXXXXXXXX (eight uppercase hex digits).
  """
  # Collect the pieces and join them once, rather than rebuilding the
  # whole string via slicing for every escaped character (quadratic).
  pieces = []
  for ch in s:
    c = ord(ch)
    if c <= 0x7f:
      pieces.append(ch)
    elif c <= 0xffff:
      pieces.append("\\u%04X" % c)
    else:
      pieces.append("\\U%08X" % c)
  return "".join(pieces)
11882e5b6d6dSopenharmony_ci
11892e5b6d6dSopenharmony_ci
def WritePreparsedUCD(out_file):
  """Writes the preparsed UCD data (ppucd.txt) to out_file.

  Output order: file header and "ucd" version line, "property" lines,
  the two "binary" lines, "value" lines for non-binary properties,
  the "defaults" line, and then one line per range that still has
  properties, interleaved with "block" lines, NamesList headings
  (as comments) and "algnamesrange" lines at their start code points.

  Side effect: the "unassigned" marker is deleted from each range's props
  after it has been turned into the line type.
  """
  out_file.write("""# Preparsed UCD generated by ICU preparseucd.py
# Copyright (C) 1991 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
""")
  out_file.write("ucd;%s\n\n" % _ucd_version)
  # Sort property names (props keys) by their normalized forms
  # and output properties in that order.
  pnames = sorted(_null_values, key=NormPropName)
  for pname in pnames:
    prop = _properties[pname]
    out_file.write(";".join(["property", prop[0]] + prop[1]) + "\n")
  out_file.write("\n")
  for binary_key in ("N", "Y"):
    out_file.write(";".join(["binary"] + _binary_values[binary_key]) + "\n")
  for pname in pnames:
    prop = _properties[pname]
    short_names = prop[2]
    if not short_names or prop[0] == "Binary":
      continue
    for name in sorted(short_names):
      out_file.write(";".join(["value", prop[1][0]] + prop[3][name]) + "\n")
  out_file.write("\n")
  # Ensure that there is a boundary in _starts for each
  # range of data we mix into the output,
  # so that the simple mixing method below works.
  for mixed_in in (_blocks, _alg_names_ranges, _h1, _h2):
    for item in mixed_in:
      AddBoundary(item[0])
  # Write the preparsed data. ppucd.txt = preparsed UCD
  # Syntax: http://site.icu-project.org/design/props/ppucd
  WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
  next_block = 0
  next_alg = 0
  next_h1 = 0
  next_h2 = 0
  block_end = -1  # Negative while we are not inside/behind a block.
  for index in range(len(_starts) - 1):
    start = _starts[index]
    end = _starts[index + 1] - 1
    # Block with default properties.
    if next_block < len(_blocks) and _blocks[next_block][0] == start:
      block = _blocks[next_block]
      block_end = block[1]
      WriteFieldsRangeProps(
          ["\nblock"], block[0], block_end, block[2], out_file)
      next_block += 1
    # NamesList h1 heading (for [most of] a block).
    if next_h1 < len(_h1) and _h1[next_h1][0] == start:
      heading = _h1[next_h1]
      out_file.write("# %04lX..%04lX %s\n" %
                     (heading[0], heading[1], EscapeNonASCII(heading[2])))
      next_h1 += 1
    # Algorithmic-names range.
    if (next_alg < len(_alg_names_ranges) and
        _alg_names_ranges[next_alg][0] == start):
      alg_range = _alg_names_ranges[next_alg]
      fields = ["algnamesrange"]
      AppendRange(fields, alg_range[0], alg_range[1])
      fields.extend(alg_range[2:])
      out_file.write(";".join(fields) + "\n")
      next_alg += 1
    # NamesList h2 heading.
    if next_h2 < len(_h2) and _h2[next_h2][0] == start:
      out_file.write("# %s\n" % EscapeNonASCII(_h2[next_h2][1]))
      next_h2 += 1
    # Code point/range data.
    props = _props[index]
    # Omit ranges with only default+block properties.
    if not props:
      continue
    if 0 <= block_end < start:
      # First range with values after the last block.
      # Separate it visually from the block lines.
      out_file.write("\n# No block\n")
      block_end = -1
    if "unassigned" in props:
      # Do not output "unassigned" as a property; it selects the line type.
      line_type = "unassigned"
      del props["unassigned"]
    else:
      line_type = "cp"
    WriteFieldsRangeProps([line_type], start, end, props, out_file)
12742e5b6d6dSopenharmony_ci
# Write Normalizer2 input files -------------------------------------------- ***
# Ported from gennorm/store.c.
12772e5b6d6dSopenharmony_ci
def WriteAllCC(out_file):
  """Writes all non-zero Canonical_Combining_Class (ccc) values to out_file.

  Adjacent ranges with the same ccc are merged into one output line
  of the form "XXXX:ccc" or "XXXX..YYYY:ccc". A missing or false "ccc"
  property counts as 0. A run is written when a following range
  has a different ccc value.
  """
  out_file.write("# Canonical_Combining_Class (ccc) values\n")
  run_start = 0
  run_cc = 0
  for index, start in enumerate(_starts):
    cc = _props[index].get("ccc") or 0
    if cc == run_cc:
      continue  # Same value: the current run just gets longer.
    if run_cc != 0:
      run_end = start - 1
      if run_start == run_end:
        out_file.write("%04X:%d\n" % (run_end, run_cc))
      else:
        out_file.write("%04X..%04X:%d\n" % (run_start, run_end, run_cc))
    run_start = start
    run_cc = cc
12972e5b6d6dSopenharmony_ci
12982e5b6d6dSopenharmony_ci
def HasMapping(c):
  """Returns a true value if c has a Decomposition_Type other than "None".

  The result is falsy (not necessarily False) if GetProps(c) has no "dt"
  entry or an empty one.
  """
  dt = GetProps(c).get("dt")
  return dt and dt != "None"
13032e5b6d6dSopenharmony_ci
13042e5b6d6dSopenharmony_ci
def HasOneWayMapping(c):
  """Returns True if c has a decomposition mapping that does not round-trip.

  Returns False if c has no mapping (no "dt" or dt == "None"),
  and True for any non-canonical mapping.
  For a canonical decomposition, follows the chain of first
  code points of the mappings until one of the cases applies.
  """
  while True:
    props = GetProps(c)
    dt = props.get("dt")
    if not dt or dt == "None":
      return False  # no mapping
    if dt != "Can":
      return True  # c has a compatibility mapping.
    # The canonical decomposition is a one-way mapping if
    # - it does not map to exactly two code points
    # - c has ccc!=0
    # - c has the Composition_Exclusion property
    # - its starter has a one-way mapping (loop for this)
    # - its non-starter decomposes
    nfd = props["dm"].split()
    if len(nfd) != 2:
      return True
    if props.get("ccc") or props.get("Comp_Ex"):
      return True
    if HasMapping(int(nfd[1], 16)):
      return True
    # Continue with the starter.
    c = int(nfd[0], 16)
13282e5b6d6dSopenharmony_ci
13292e5b6d6dSopenharmony_ci
13302e5b6d6dSopenharmony_ci_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others.
13312e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html
13322e5b6d6dSopenharmony_ci# Copyright (C) 1999-2016, International Business Machines
13332e5b6d6dSopenharmony_ci# Corporation and others.  All Rights Reserved.
13342e5b6d6dSopenharmony_ci#
13352e5b6d6dSopenharmony_ci"""
13362e5b6d6dSopenharmony_ci
def WriteNorm2NFCTextFile(path):
  """Writes nfc.txt into the folder at path.

  The file gets the copyright header, the Unicode version line,
  all ccc values (WriteAllCC), and one line per code point that has
  a canonical decomposition mapping (dt == "Can" and a dm value that
  does not start with '<').
  """
  with open(os.path.join(path, "nfc.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfc.txt
#
# machine-generated by ICU preparseucd.py
#
# Complete data for Unicode NFC normalization.

* Unicode """ + _ucd_version + """

""")
    WriteAllCC(out_file)
    out_file.write("\n# Canonical decomposition mappings\n")
    for index in range(len(_starts) - 1):
      props = _props[index]
      dm = props.get("dm")
      if not dm or dm[0] == '<' or props["dt"] != "Can":
        continue
      start = _starts[index]
      # A mapping must be for a single code point, not for a real range.
      assert start == _starts[index + 1] - 1
      # The Comp_Ex=Full_Composition_Exclusion property tells us
      # whether the canonical decomposition round-trips.
      if props.get("Comp_Ex"):
        separator = '>'
      else:
        separator = '='
      out_file.write("%04X%s%s\n" % (start, separator, dm))
13632e5b6d6dSopenharmony_ci
13642e5b6d6dSopenharmony_ci
def WriteNorm2NFKCTextFile(path):
  """Writes nfkc.txt into the folder at path.

  After the header, writes one one-way mapping line ("XXXX>dm") for
  each code point with a non-canonical decomposition, and for each
  code point whose canonical decomposition is not excluded from
  composition (no Comp_Ex) but for which HasOneWayMapping() is true.
  """
  with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode """ + _ucd_version + """

""")
    for index in range(len(_starts) - 1):
      props = _props[index]
      dm = props.get("dm")
      if not dm or dm[0] == '<':
        continue
      start = _starts[index]
      # A mapping must be for a single code point, not for a real range.
      assert start == _starts[index + 1] - 1
      if props["dt"] != "Can":
        # Compatibility decomposition.
        out_file.write("%04X>%s\n" % (start, dm))
      elif not props.get("Comp_Ex") and HasOneWayMapping(start):
        # NFC round-trip mapping turns into NFKC one-way mapping.
        out_file.write("%04X>%s  # NFC round-trip, NFKC one-way\n" %
                       (start, dm))
13962e5b6d6dSopenharmony_ci
13972e5b6d6dSopenharmony_ci
def WriteNorm2NFKC_CFTextFile(path):
  """Writes nfkc_cf.txt into the folder at path.

  After the header, writes the NFKC_CF mappings from _props as
  "XXXX>mapping" or "XXXX..YYYY>mapping" lines. Adjacent ranges with
  the same NFKC_CF value are merged into one line. Ranges without an
  NFKC_CF value, and values that start with '<', are not written.
  An empty-string mapping is written (with nothing after the '>').
  """
  with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:

    def WritePendingRange(first, last, mapping):
      """Writes one merged range unless its mapping is to be skipped."""
      if mapping is None:
        return  # No NFKC_CF value for this range.
      if mapping and mapping[0] == '<':
        return  # Presumably a placeholder like <code point>; not a mapping.
      if first == last:
        out_file.write("%04X>%s\n" % (first, mapping))
      else:
        out_file.write("%04X..%04X>%s\n" % (first, last, mapping))

    out_file.write(
        _data_file_copyright + """# file name: nfkc_cf.txt
#
# machine-generated by ICU preparseucd.py
#
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

""")
    out_file.write("* Unicode " + _ucd_version + "\n\n")
    prev_start = 0
    prev_end = 0
    prev_nfkc_cf = None
    for i in range(len(_starts) - 1):
      start = _starts[i]
      end = _starts[i + 1] - 1
      nfkc_cf = _props[i].get("NFKC_CF")
      # Merge with the previous range if possible,
      # or remember this range for merging.
      if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
        prev_end = end
      else:
        WritePendingRange(prev_start, prev_end, prev_nfkc_cf)
        prev_start = start
        prev_end = end
        prev_nfkc_cf = nfkc_cf
    # Flush the last pending range. Without this, a mapping on the final
    # range(s) would be dropped because no later range triggers the write.
    WritePendingRange(prev_start, prev_end, prev_nfkc_cf)
14352e5b6d6dSopenharmony_ci
14362e5b6d6dSopenharmony_ci
def WriteNorm2(path):
  """Writes nfc.txt, nfkc.txt and nfkc_cf.txt, in this order, into path."""
  writers = (
      WriteNorm2NFCTextFile,
      WriteNorm2NFKCTextFile,
      WriteNorm2NFKC_CFTextFile,
  )
  for write_text_file in writers:
    write_text_file(path)
14412e5b6d6dSopenharmony_ci
# UTS #46 Normalizer2 input file ------------------------------------------- ***
14432e5b6d6dSopenharmony_ci
14442e5b6d6dSopenharmony_ci_idna_replacements = [
14452e5b6d6dSopenharmony_ci  # Several versions of avoiding circular FFFD>FFFD mappings,
14462e5b6d6dSopenharmony_ci  # depending on the version of the input file.
14472e5b6d6dSopenharmony_ci  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
14482e5b6d6dSopenharmony_ci  (re.compile(r"\.\.FFFD"), "..FFFC"),
14492e5b6d6dSopenharmony_ci  (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
14502e5b6d6dSopenharmony_ci  # Since we switch between checking and not checking for STD3 character
14512e5b6d6dSopenharmony_ci  # restrictions at runtime, checking the non-LDH ASCII characters in code,
14522e5b6d6dSopenharmony_ci  # we treat these values here like their regular siblings.
14532e5b6d6dSopenharmony_ci  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
14542e5b6d6dSopenharmony_ci  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
14552e5b6d6dSopenharmony_ci  # For UTS #46, we do not care about "not valid in IDNA2008".
14562e5b6d6dSopenharmony_ci  (re.compile(r"; *; NV8 +"), ""),
14572e5b6d6dSopenharmony_ci  # ICU 63+ normalization no longer allows mappings for surrogate code points,
14582e5b6d6dSopenharmony_ci  # and the UTS #46 code handles them instead.
14592e5b6d6dSopenharmony_ci  (re.compile(r"^D800..DFFF    ; disallowed"), r"# D800..DFFF disallowed in code"),
14602e5b6d6dSopenharmony_ci  # Normal transformations.
14612e5b6d6dSopenharmony_ci  (re.compile(r"; disallowed"), ">FFFD"),
14622e5b6d6dSopenharmony_ci  (re.compile(r"; ignored"), ">"),
14632e5b6d6dSopenharmony_ci  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
14642e5b6d6dSopenharmony_ci  (re.compile(r"; mapped +; "), ">"),
14652e5b6d6dSopenharmony_ci  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
14662e5b6d6dSopenharmony_ci]
14672e5b6d6dSopenharmony_ci
def IdnaToUTS46TextFile(s, t):
  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.

  Reads the file at path s and writes uts46.txt into the folder of path t
  (the file name part of t is ignored).
  Comment and empty lines are copied, with an explanation inserted after
  the "# For documentation" line. Data lines are transformed via
  _idna_replacements, and their inline comments are aligned.

  Returns the path of the output file.
  """
  # Different input/output file names.
  dest_path = os.path.dirname(t)
  t = os.path.join(dest_path, "uts46.txt")
  with open(s, "r") as in_file, open(t, "w") as out_file:
    out_file.write("# Original file:\n")
    for line in in_file:
      orig_line = line
      if line.startswith("# For documentation"):
        out_file.write(line)
        out_file.write(r"""
# ================================================
# This file has been reformatted into syntax for the
# gennorm2 Normalizer2 data generator tool.
#
# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
# "disallowed" lines map to U+FFFD.
# "ignored" lines map to an empty string.
#
# Characters disallowed under STD3 rules are treated as valid or mapped;
# they are handled in code.
# Deviation characters are also handled in code.
#
# Use this file as the second gennorm2 input file after nfc.txt.
# ================================================
""")
        continue
      if line[0] in "#\r\n":
        # Whole-line comment or empty line: Copy as is.
        out_file.write(line)
        continue
      for rep in _idna_replacements: line = rep[0].sub(rep[1], line)
      # Align inline comments at column 40.
      # Start searching at index 1 so that a '#' which now begins a
      # commented-out data line is not mistaken for the inline comment.
      comment_pos = line.find("#", 1)
      if comment_pos < 0:
        # No inline comment: Nothing to align.
        # (Without this check, -1 would be used as a string index below
        # and padding would be inserted before the last character.)
        pass
      elif comment_pos < 40:
        line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
                line[comment_pos:])
      elif comment_pos > 40:
        space_pos = comment_pos
        while space_pos > 0 and line[space_pos - 1] == ' ':
          space_pos = space_pos - 1
        if space_pos < 40:
          # Fewer than 40 characters before the comment:
          # Align comments at column 40.
          line = line[:40] + line[comment_pos:]
        else:
          # 40 or more characters before the comment:
          # Keep one space between contents and comment.
          line = line[:space_pos] + " " + line[comment_pos:]
      # Write the modified line.
      out_file.write(line)
      if "..FFFF" in orig_line and "..FFFC" in line:
        # The range was shortened to avoid mapping FFFD to itself.
        # Map the remaining noncharacters separately.
        out_file.write("FFFE..FFFF    >FFFD\n")
  return t
15252e5b6d6dSopenharmony_ci
# Preprocessing ------------------------------------------------------------ ***
15272e5b6d6dSopenharmony_ci
15282e5b6d6dSopenharmony_ci_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
15292e5b6d6dSopenharmony_ci_code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")
15302e5b6d6dSopenharmony_ci
def CopyAndStripWithOptionalMerge(s, t, do_merge):
  """Copies the file at s to t, stripping comments behind data lines.

  Lines that match _strip_re are reduced to their data part;
  other lines only lose their trailing white space.
  If do_merge is true, then consecutive lines for adjacent single
  code points with identical data are merged into one line
  with first..last range syntax.

  Returns t.
  """
  # TODO: We do not seem to need the do_merge argument and logic any more.

  def FormatRange(first, last, data):
    """Returns the output line for code points first..last with data."""
    if first == last:
      return "%04X%s\n" % (first, data)
    return "%04X..%04X%s\n" % (first, last, data)

  with open(s, "r") as in_file, open(t, "w") as out_file:
    # Pending range [first..last] whose code points all have first_data.
    # first and last are negative while there is no pending range.
    first = -1
    last = -1
    first_data = ""
    for line in in_file:
      stripped = _strip_re.match(line)
      line = stripped.group(1) if stripped else line.rstrip()
      if not do_merge:
        # Only strip, don't merge: just output the stripped line.
        out_file.write(line + "\n")
        continue
      cp_match = _code_point_re.match(line)
      if cp_match:
        c = int(cp_match.group(1), 16)
        # The data starts with the semicolon after the code point.
        data = line[cp_match.end() - 1:]
      else:
        c = -1
        data = ""
      if last >= 0 and (c != (last + 1) or data != first_data):
        # This line does not continue the pending range: output that.
        out_file.write(FormatRange(first, last, first_data))
        first = -1
        last = -1
        first_data = ""
      if c < 0:
        # no data on this line, output as is
        out_file.write(line + "\n")
      elif last < 0:
        # set as the first line in a possible range
        first = c
        last = c
        first_data = data
      else:
        # must be c == (last + 1) and data == first_data
        # because of previous conditions
        # continue with the current range
        last = c
    if do_merge and last >= 0:
      # output the last range in the file
      out_file.write(FormatRange(first, last, first_data))
    out_file.flush()
  return t
15912e5b6d6dSopenharmony_ci
15922e5b6d6dSopenharmony_ci
def CopyAndStrip(s, t):
  """Copies a file and removes comments behind data lines but not in others.

  Thin wrapper: calls CopyAndStripWithOptionalMerge(s, t, False),
  that is, with the merge option turned off, and returns its result.

  Args:
    s: Source file, passed through unchanged.
    t: Destination file, passed through unchanged.
  """
  return CopyAndStripWithOptionalMerge(s, t, False)
15962e5b6d6dSopenharmony_ci
15972e5b6d6dSopenharmony_ci
def CopyAndStripAndMerge(s, t):
  """Copies and strips a file and merges lines.

  Copies a file, removes comments, and
  merges lines with adjacent code point ranges and identical per-code point
  data lines into one line with range syntax.

  Thin wrapper: calls CopyAndStripWithOptionalMerge(s, t, True),
  that is, with the merge option turned on, and returns its result.

  Args:
    s: Source file, passed through unchanged.
    t: Destination file, passed through unchanged.
  """
  return CopyAndStripWithOptionalMerge(s, t, True)
16062e5b6d6dSopenharmony_ci
16072e5b6d6dSopenharmony_ci
def CopyOnly(s, t):
  """Preprocessor that copies the source file s to t without changes.

  Returns:
    t, exactly as passed in.
  """
  shutil.copy(src=s, dst=t)
  return t
16112e5b6d6dSopenharmony_ci
16122e5b6d6dSopenharmony_ci
def DontCopy(s, t):
  """Preprocessor that leaves the source file where it is.

  Nothing is written to t. The source path s is returned unchanged.
  """
  del t  # Deliberately unused; kept so all preprocessors share one signature.
  return s
16152e5b6d6dSopenharmony_ci
16162e5b6d6dSopenharmony_ci
# Maps the standard (unversioned) basename of each known input file
# to the instructions for handling it.
#
# Each _files value is a
# (preprocessor, dest_folder, parser, order) tuple
# where all fields except the preprocessor are optional.
# - preprocessor: called by PreprocessFiles() as
#   preprocessor(source_file, dest_file); its return value is the path
#   that is handed to the parser.
# - dest_folder: a string key, one of "unidata" (the default when omitted),
#   "norm2" or "testdata". A string in second position is always taken
#   as the dest_folder.
# - parser: after the initial preprocessing (copy/strip/merge),
#   if a parser is specified, then a tuple is added to _files_to_parse
#   at index "order" (default order 9).
# - order: an explicit order number is set only for files that must be parsed
#   before others.
_files = {
  "BidiBrackets.txt": (DontCopy, ParseBidiBrackets),
  "BidiMirroring.txt": (DontCopy, ParseBidiMirroring),
  "BidiTest.txt": (CopyOnly, "testdata"),
  "Blocks.txt": (DontCopy, ParseBlocks),
  "CaseFolding.txt": (CopyOnly, ParseCaseFolding),
  "DerivedAge.txt": (DontCopy, ParseDerivedAge),
  "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass),
  "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
  "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup),
  "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType),
  "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
  "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
  "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
  "emoji-data.txt": (DontCopy, ParseNamedProperties),
  "emoji-sequences.txt": (CopyOnly,),
  "emoji-zwj-sequences.txt": (CopyOnly,),
  "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
  "GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"),
  "IdnaTestV2.txt": (CopyOnly, "testdata"),
  "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
  "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
  "LineBreak.txt": (DontCopy, ParseLineBreak),
  "LineBreakTest.txt": (CopyOnly, "testdata"),
  "NameAliases.txt": (DontCopy, ParseNameAliases),
  "NamesList.txt": (DontCopy, ParseNamesList),
  "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep.
  "NormalizationTest.txt": (CopyAndStrip,),
  "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0),
  "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1),
  "PropList.txt": (DontCopy, ParseNamedProperties),
  "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak),
  "SentenceBreakTest.txt": (CopyOnly, "testdata"),
  "Scripts.txt": (DontCopy, ParseScripts),
  "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions),
  "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
  "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
  "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation),
  "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
  "WordBreakTest.txt": (CopyOnly, "testdata"),
  # From www.unicode.org/Public/idna/<version>/
  "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
}
16682e5b6d6dSopenharmony_ci
# Parse queue: ten buckets, indexed by parse order 0..9.
# Each bucket is a list of (basename, path, parser) tuples;
# the buckets are meant to be processed in index order.
_files_to_parse = [[] for _ in range(10)]

# Recognizes a versioned file name and captures what is left
# once the version is dropped.
# For example, "UnicodeData-6.1.0d8.txt" yields the groups
# "UnicodeData" and ".txt", which join to "UnicodeData.txt".
_file_version_re = re.compile(
    r"([a-zA-Z0-9_-]+)"                  # group 1: base name
    r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"  # -major[.minor...][dN]
    r"(\.[a-z]+)$")                      # group 2: file extension
16792e5b6d6dSopenharmony_ci
def PreprocessFiles(source_files, icu4c_src_root):
  """Renames, preprocesses and queues the known input files.

  For each path in source_files:
  1. If the basename carries a version suffix (per _file_version_re),
     the file is renamed on disk to the unversioned basename.
  2. If the resulting basename is a key of _files, its preprocessor is called
     with the source path and a destination path below icu4c_src_root.
     The destination folder is created if it does not exist.
  3. If the _files entry names a parser, (basename, path returned by the
     preprocessor, parser) is appended to _files_to_parse[order].
  Files whose basename is not in _files are left alone after step 1.

  Args:
    source_files: Iterable of source file paths.
    icu4c_src_root: Root folder containing source/data/unidata and
      source/test/testdata.

  Raises:
    Exception: if two source files map to the same known basename.
  """
  unidata_dir = os.path.join(icu4c_src_root, "source", "data", "unidata")
  dest_dirs = {
    "unidata": unidata_dir,
    "norm2": os.path.join(unidata_dir, "norm2"),
    "testdata": os.path.join(icu4c_src_root, "source", "test", "testdata"),
  }
  seen = set()
  for source_file in source_files:
    (folder, basename) = os.path.split(source_file)
    versioned = _file_version_re.match(basename)
    if versioned:
      plain_basename = "".join(versioned.group(1, 2))
      if plain_basename != basename:
        print("Removing version suffix from " + source_file)
        # ... so that we can easily compare UCD files.
        renamed = os.path.join(folder, plain_basename)
        shutil.move(source_file, renamed)
        basename, source_file = plain_basename, renamed
    if basename not in _files:
      continue
    print("Preprocessing %s" % basename)
    if basename in seen:
      raise Exception("duplicate file basename %s!" % basename)
    seen.add(basename)
    entry = _files[basename]
    preprocessor = entry[0]
    # Split the entry into the destination folder and the remaining
    # optional (parser, order) fields.
    if len(entry) >= 2 and isinstance(entry[1], str):
      dest_folder, rest = entry[1], entry[2:]
    else:
      dest_folder, rest = "unidata", entry[1:]
    dest_dir = dest_dirs[dest_folder]
    if not os.path.exists(dest_dir):
      os.makedirs(dest_dir)
    # Source GraphemeBreakTest-cldr.txt --> destination GraphemeBreakTest.txt.
    if basename.endswith("-cldr.txt"):
      dest_basename = basename[:-len("-cldr.txt")] + ".txt"
    else:
      dest_basename = basename
    parse_file = preprocessor(source_file,
                              os.path.join(dest_dir, dest_basename))
    if rest:
      order = rest[1] if len(rest) >= 2 else 9
      _files_to_parse[order].append((basename, parse_file, rest[0]))
17282e5b6d6dSopenharmony_ci
17292e5b6d6dSopenharmony_ci# Character names ---------------------------------------------------------- ***
17302e5b6d6dSopenharmony_ci
17312e5b6d6dSopenharmony_ci# TODO: Turn this script into a module that
17322e5b6d6dSopenharmony_ci# a) gives access to the parsed data
17332e5b6d6dSopenharmony_ci# b) has a PreparseUCD(ucd_root, icu4c_src_root) function
17342e5b6d6dSopenharmony_ci# c) has a ParsePreparsedUCD(filename) function
17352e5b6d6dSopenharmony_ci# d) has a WritePreparsedUCD(filename) function
17362e5b6d6dSopenharmony_ci# and then use it from a new script for names.
17372e5b6d6dSopenharmony_ci# Some more API:
17382e5b6d6dSopenharmony_ci# - generator GetRangesAndProps() -> (start, end, props)*
17392e5b6d6dSopenharmony_ci
def IncCounter(counters, key, inc=1):
  """Adds inc to counters[key]; a key not yet present starts at inc.

  Args:
    counters: Dictionary of counts, modified in place.
    key: The counter to bump.
    inc: Amount to add (default 1).
  """
  try:
    counters[key] += inc
  except KeyError:
    counters[key] = inc
17452e5b6d6dSopenharmony_ci
17462e5b6d6dSopenharmony_ci
# Substrings that end the leading token of a character name.
# SplitName() tries them in this order and uses the first one that occurs
# in the name, so the order of the entries matters.
endings = (
  # List PHASE- before LETTER for BAMUM LETTER PHASE-xyz.
  "PHASE-",
  "LETTER ",
  "LIGATURE ",
  "CHARACTER ",
  "SYLLABLE ",
  "CHOSEONG ",
  "JUNGSEONG ",
  "JONGSEONG ",
  "SYLLABICS ",
  "IDEOGRAPH ",
  "IDEOGRAPH-",
  "IDEOGRAM ",
  "MONOGRAM ",
  "ACROPHONIC ",
  "HIEROGLYPH ",
  "DIGIT ",
  "NUMBER ",
  "NUMERAL ",
  "FRACTION ",
  "PUNCTUATION ",
  "SIGN ",
  "SYMBOL ",
  "TILE ",
  "CARD ",
  "FACE ",
  "ACCENT ",
  "POINT ",
  # List SIGN before VOWEL to catch "vowel sign".
  "VOWEL ",
  "TONE ",
  "RADICAL ",
  # For names of math symbols,
  # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A
  "SCRIPT ",
  "FRAKTUR ",
  "MONOSPACE ",
  "ITALIC ",
  "BOLD ",
  "DOUBLE-STRUCK ",
  "SANS-SERIF ",
  "INITIAL ",
  "TAILED ",
  "STRETCHED ",
  "LOOPED ",
  # BRAILLE PATTERN DOTS-xyz
  "DOTS-",
  "SELECTOR ",
  "SELECTOR-",
)
17692e5b6d6dSopenharmony_ci
def SplitName(name, tokens):
  """Splits a character name into tokens and counts each one in tokens.

  The first entry of endings (in table order) that occurs in name closes
  the leading token: everything up to and including that ending is counted
  as one token. The rest of the name is cut after every space or hyphen,
  with the delimiter kept at the end of its token. Whatever follows the
  last delimiter (possibly the empty string) is counted as the final token.

  Args:
    name: The character name string.
    tokens: Dictionary of token counts, updated via IncCounter().
  """
  prefix_end = 0
  for ending in endings:
    found = name.find(ending)
    if found < 0:
      continue
    prefix_end = found + len(ending)
    IncCounter(tokens, name[:prefix_end])
    break
  token_start = prefix_end
  for idx in range(prefix_end, len(name)):
    if name[idx] in (" ", "-"):
      IncCounter(tokens, name[token_start:idx + 1])
      token_start = idx + 1
  IncCounter(tokens, name[token_start:])
17862e5b6d6dSopenharmony_ci
17872e5b6d6dSopenharmony_ci
def PrintNameStats():
  """Prints statistics about character names and a token-compression estimate.

  Reads the module-level _starts and _props tables. For every range except
  the one at the last _starts index, looks at the "na", "na1" and
  "Name_Alias" properties and prints:
  - per property, the number of names and their total length,
  - the set of characters used in names and how often each occurs,
  - the maximum name length and the maximum per-code point total,
  - an estimate of the bytes that could be saved by replacing frequent
    name tokens (as split by SplitName()) with 1-byte or 2-byte codes.

  All output goes to stdout; nothing is returned.
  """
  # TODO: This name analysis code is out of date.
  # It needs to consider the multi-type Name_Alias values.
  name_pnames = ("na", "na1", "Name_Alias")
  counts = {pname: 0 for pname in name_pnames}
  total_lengths = dict(counts)
  max_length = 0
  max_per_cp = 0
  name_chars = set()
  num_digits = 0
  token_counters = {}
  char_counters = {}
  for i in range(len(_starts) - 1):
    props = _props[i]
    per_cp = 0
    for pname in name_pnames:
      if pname not in props:
        continue
      counts[pname] += 1
      name = props[pname]
      total_lengths[pname] += len(name)
      name_chars |= set(name)
      max_length = max(max_length, len(name))
      per_cp += len(name) + 1  # +1 for the terminating NUL.
      max_per_cp = max(max_per_cp, per_cp)
      SplitName(name, token_counters)
      for c in name:
        if c in "0123456789": num_digits += 1
        IncCounter(char_counters, c)
  # Python 3: print is a function; a bare "print" would output nothing.
  print()
  for pname in name_pnames:
    print("'%s' character names: %d / %d bytes" %
          (pname, counts[pname], total_lengths[pname]))
  # Python 3: dict.itervalues() no longer exists.
  print("%d total bytes in character names" % sum(total_lengths.values()))
  print("%d name-characters: %s" %
        (len(name_chars), "".join(sorted(name_chars))))
  print("%d digits 0-9" % num_digits)
  count_chars = sorted(((count, c) for (c, count) in char_counters.items()),
                       reverse=True)
  for cc in count_chars:
    print("name-chars: %6d * '%s'" % cc)
  print("max. name length: %d" % max_length)
  print("max. length of all (names+NUL) per cp: %d" % max_per_cp)

  token_lengths = sum(len(t) + 1 for t in token_counters)
  print("%d total tokens, %d bytes with NUL" %
        (len(token_counters), token_lengths))

  counts_tokens = []
  for (token, count) in token_counters.items():
    # If we encode a token with a 1-byte code, then we save len(t)-1 bytes each time
    # but have to store the token string itself with a length or terminator byte,
    # plus a 2-byte entry in a token index table.
    savings = count * (len(token) - 1) - (len(token) + 1 + 2)
    if savings > 0:
      counts_tokens.append((savings, count, token))
  counts_tokens.sort(reverse=True)
  print("%d tokens might save space with 1-byte codes" % len(counts_tokens))

  # Codes=bytes, 40 byte values for name_chars.
  # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
  # Make each 2-byte token the token string index itself, rather than
  # an index into a string index table.
  # More lead bytes but also more savings.
  num_units = 256
  # Python 3: use floor division; "/" yields a float that range() rejects.
  max_lead = (token_lengths + 255) // 256
  max_token_units = num_units - len(name_chars)
  results = []
  for num_lead in range(min(max_lead, max_token_units) + 1):
    max1 = max_token_units - num_lead
    ct = counts_tokens[:max1]
    tokens1 = set([t for (s, c, t) in ct])
    for (token, count) in token_counters.items():
      if token in tokens1: continue
      # If we encode a token with a 2-byte code, then we save len(t)-2 bytes each time
      # but have to store the token string itself with a length or terminator byte.
      savings = count * (len(token) - 2) - (len(token) + 1)
      if savings > 0:
        ct.append((savings, count, token))
    ct.sort(reverse=True)
    # A 2-byte-code-token index cannot be limit_t_lengths or higher.
    limit_t_lengths = num_lead * 256
    token2_index = 0
    for i in range(max1, len(ct)):
      if token2_index >= limit_t_lengths:
        del ct[i:]
        break
      token2_index += len(ct[i][2]) + 1
    cumul_savings = sum([s for (s, c, t) in ct])
    # print ("%2d 1-byte codes: %4d tokens might save %6d bytes" %
    #        (max1, len(ct), cumul_savings))
    results.append((cumul_savings, max1, ct))
  best = max(results)  # (cumul_savings, max1, ct)

  max1 = best[1]
  print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" %
         (best[0], max1, max_token_units - max1))
  counts_tokens = best[2]
  cumul_savings = 0
  for (i1, t) in enumerate(counts_tokens, 1):
    # The first max1 tokens get 1-byte codes, all others 2-byte codes.
    n = 1 if i1 <= max1 else 2
    cumul_savings += t[0]
    if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens):
      print(("%04d. cumul. %6d bytes save %6d bytes from " +
              "%5d * %d-byte token for %2d='%s'") %
          (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))
18992e5b6d6dSopenharmony_ci
19002e5b6d6dSopenharmony_ci# ICU API ------------------------------------------------------------------ ***
19012e5b6d6dSopenharmony_ci
# Patterns for lines of ICU header files.
# All of them are raw strings so that regex escapes such as \* reach the
# re module unchanged instead of being (invalid) string escape sequences.

# Sample line to match:
#    UCHAR_UNIFIED_IDEOGRAPH=29,
_uchar_re = re.compile(
    r" *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")

# Sample line to match:
#    /** Zs @stable ICU 2.0 */
_gc_comment_re = re.compile(r" */\*\* *([A-Z][a-z]) ")

# Sample line to match:
#    U_SPACE_SEPARATOR         = 12,
_gc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    /** L @stable ICU 2.0 */
_bc_comment_re = re.compile(r" */\*\* *([A-Z]{1,3}) ")

# Sample line to match:
#    U_LEFT_TO_RIGHT               = 0,
_bc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    UBLOCK_CYRILLIC =9,
_ublock_re = re.compile(r" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    U_EA_AMBIGUOUS,
_prop_and_value_re = re.compile(
    r" *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")

# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
#    U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
_prop_and_alias_re = re.compile(r" *U_[0-9A-Z_]+ *= *U")
19362e5b6d6dSopenharmony_ci
def ParseUCharHeader(icu4c_src_root):
  """Collects ICU property and property-value enum constants from uchar.h.

  Reads <icu4c_src_root>/source/common/unicode/uchar.h line by line.
  - Each UCHAR_* property constant (except *_LIMIT) becomes an
    (enum name, short property name, value list) tuple that is appended to
    _icu_properties and registered in _pname_to_icu_prop.
  - Each value constant found is appended to the value list of its property
    as (enum name, short value name).
  After the header has been read, the value lists for ccc/lccc/tccc, gcm
  and the NF*_QC properties are filled in without consulting the header.

  Args:
    icu4c_src_root: Folder that contains source/common/unicode/uchar.h.
  """
  header_path = os.path.join(icu4c_src_root, "source",
                             "common", "unicode", "uchar.h")

  def AddValue(pname, v_enum, ucd_vname):
    # Records v_enum as the ICU constant for the given value of property pname.
    short_vname = GetShortPropertyValueName(_properties[pname], ucd_vname)
    _pname_to_icu_prop[pname][2].append((v_enum, short_vname))

  # Enums that need context-sensitive parsing because the enum constant names
  # do not contain enough information: the value name comes from the
  # comment line that precedes each constant.
  # Maps the mode (=pname) to (comment pattern, constant pattern).
  mode_patterns = {
    "gc": (_gc_comment_re, _gc_re),
    "bc": (_bc_comment_re, _bc_re),
  }

  with open(header_path, "r") as header:
    mode = ""  # Mode string (=pname) while inside one of the enums above.
    comment_value = ""  # Property value from a comment preceding an enum.
    # Note: The enum UProperty is first in uchar.h, before the enums for values.
    for line in header:
      if "enum UCharCategory" in line:
        mode, comment_value = "gc", ""
        continue
      # While in gc mode every line belongs to the gc enum,
      # so the bc enum start is only recognized outside of it.
      if mode != "gc" and "enum UCharDirection {" in line:
        mode, comment_value = "bc", ""
        continue
      if mode:
        if line.startswith("}"):
          mode = ""
          continue
        (comment_re, enum_re) = mode_patterns[mode]
        found = comment_re.match(line)
        if found:
          comment_value = found.group(1)
          continue
        found = enum_re.match(line)
        if found and comment_value:
          AddValue(mode, found.group(1), comment_value)
        # A comment value only applies to the line right after it.
        comment_value = ""
        continue
      # No mode, parse enum constants whose names contain
      # enough information to parse without requiring context.
      found = _uchar_re.match(line)
      if found:
        p_enum = found.group(1)
        # Ignore "UCHAR_BINARY_LIMIT=57," etc.
        if not p_enum.endswith("_LIMIT"):
          pname = GetShortPropertyName(p_enum[6:])
          icu_prop = (p_enum, pname, [])
          _icu_properties.append(icu_prop)
          _pname_to_icu_prop[pname] = icu_prop
        continue
      found = _ublock_re.match(line)
      if found:
        block_enum = found.group(1)
        if block_enum != "UBLOCK_COUNT":
          AddValue("blk", block_enum, block_enum[7:])
        continue
      found = _prop_and_value_re.match(line)
      if found:
        (v_enum, vname) = found.group(1, 3)
        if vname == "COUNT" or _prop_and_alias_re.match(line):
          continue
        AddValue(GetShortPropertyName(found.group(2)), v_enum, vname)

  # ccc, lccc, tccc use their numeric values as "enum" values.
  # In the UCD data, these numeric values are the first value names,
  # followed by the short & long value names.
  # List the ccc values in numeric order.
  ccc_prop = _properties["ccc"]
  ccc_values = _pname_to_icu_prop["ccc"][2]
  ccc_values.extend(
      (n, str(n)) for n in sorted(int(name) for name in ccc_prop[2]))
  for pname in ("lccc", "tccc"):  # Copy ccc -> lccc and ccc -> tccc.
    _pname_to_icu_prop[pname][2].extend(ccc_values)

  # No need to parse predictable General_Category_Mask enum constants.
  # Just define them in ASCII order.
  gcm_prop = _properties["gcm"]
  gcm_values = _pname_to_icu_prop["gcm"][2]
  gcm_values.extend(
      ("U_GC_" + vname.upper() + "_MASK", vname)
      for vname in sorted(gcm_prop[2]))

  # Hardcode known values for the normalization quick check properties,
  # see unorm2.h for the UNormalizationCheckResult enum.
  no_yes = [("UNORM_NO", "N"), ("UNORM_YES", "Y")]
  nfc_values = _pname_to_icu_prop["NFC_QC"][2]
  nfc_values.extend(no_yes + [("UNORM_MAYBE", "M")])
  _pname_to_icu_prop["NFKC_QC"][2].extend(nfc_values)  # Copy NFC -> NFKC.
  # No "maybe" values for NF[K]D.
  nfd_values = _pname_to_icu_prop["NFD_QC"][2]
  nfd_values.extend(no_yes)
  _pname_to_icu_prop["NFKD_QC"][2].extend(nfd_values)  # Copy NFD -> NFKD.
20522e5b6d6dSopenharmony_ci
20532e5b6d6dSopenharmony_ci
# Sample line to match:
#    USCRIPT_LOMA   = 139,/* Loma */
# Group 1 is the enum constant, group 2 the four-letter script code
# from the trailing comment.
# Raw string: the pattern contains \* which is not a valid string escape.
_uscript_re = re.compile(
    r" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
20582e5b6d6dSopenharmony_ci
def ParseUScriptHeader(icu4c_src_root):
  """Collects the USCRIPT_* enum constants from uscript.h.

  Reads <icu4c_src_root>/source/common/unicode/uscript.h and, for every
  line that matches _uscript_re, appends (enum constant, script code)
  to the value list of the "sc" property in _pname_to_icu_prop.

  Args:
    icu4c_src_root: Folder that contains source/common/unicode/uscript.h.
  """
  header_path = os.path.join(icu4c_src_root, "source",
                             "common", "unicode", "uscript.h")
  script_values = _pname_to_icu_prop["sc"][2]
  with open(header_path, "r") as header:
    script_values.extend(
        found.group(1, 2)
        for found in map(_uscript_re.match, header)
        if found)
20692e5b6d6dSopenharmony_ci
20702e5b6d6dSopenharmony_ci
def CheckPNamesData():
  """Cross-checks the ICU value enum constants against the UCD value names.

  Raises ValueError if a value name listed in _icu_properties is not among
  the property's UCD short value names, or if some property is left with
  UCD value names for which no ICU enum constant was found.
  """
  incomplete = []
  for (unused_enum, pname, values) in _icu_properties:
    prop = _properties[pname]
    # Work on a copy of the set of short value names;
    # tick off each one that has an ICU enum constant.
    remaining = set(prop[2])
    for (v_enum, vname) in values:
      if vname in remaining:
        remaining.remove(vname)
        continue
      raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
                       (pname, vname, v_enum))
    if not remaining:
      continue
    # Properties that are allowed to have UCD values without enum constants:
    # - Binary properties: ICU has no specific enum values for No/Yes.
    # - age: ICU uses UVersionInfo rather than enum constants.
    # - gc: ICU enum UCharCategory only has the single-category values.
    #       (ICU's gcm property has all of the UCD gc property values.)
    if prop[0] == "Binary":
      continue
    if pname in ("age", "gc"):
      continue
    incomplete.append((pname, remaining))
  if not incomplete:
    return
  raise ValueError(
      "missing uchar.h enum constants for some property values: %s" %
      incomplete)
20942e5b6d6dSopenharmony_ci
20952e5b6d6dSopenharmony_ci
def WritePNamesDataHeader(out_path):
  """Writes the C++ header with the property & value names data.

  The output consists of, in this order:
  - a copyright/"machine-generated" comment,
  - the UNICODE_VERSION macro (from _ucd_version, padded to four fields),
  - VALUES_binprop with the No/Yes aliases shared by all binary properties,
  - one VALUES_<pname> array of Value initializers for each property in
    _icu_properties that has named values,
  - the PROPERTIES array with one Property initializer per property,
  - MAX_ALIASES, the largest number of aliases of any property or value.

  Args:
    out_path: Path of the header file to write;
        an existing file is overwritten.
  """
  # The text written below contains a non-ASCII character (the copyright
  # sign). Encode explicitly as UTF-8 so that the generated file does not
  # depend on the platform default charset.
  with open(out_path, "w", encoding="utf-8") as out_file:
    out_file.write("""// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 * Copyright (C) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 * machine-generated by: icu/tools/unicode/py/preparseucd.py
 */

""")

    # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties
    # and values in the order of their definition,
    # and this function writes them in that order.
    # Since the ICU API constants are stable and new values are only
    # appended at the end
    # (new properties are added at the end of each binary/enum/... range),
    # the output is stable as well.
    # When a property or value constant is renamed,
    # it only changes the name itself in the output;
    # it does not move in the output since there is no sorting.
    # This minimizes diffs and assists with reviewing and evaluating updates.

    version = _ucd_version.split('.')
    while len(version) < 4: version.append("0")
    out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version))

    # Count the maximum number of aliases for any property or value.
    # We write the final value at the end.
    max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"]))

    # Write an array of "binprop" Value object initializers
    # with the value aliases shared among all binary properties.
    out_file.write("static const Value VALUES_binprop[2] = {\n")
    out_file.write('    Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
    out_file.write('    Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
    out_file.write("};\n\n")

    # For each property with named values, write an array of
    # Value object initializers with the value enum and the aliases.
    for (p_enum, pname, values) in _icu_properties:
      prop = _properties[pname]
      # Property aliases count towards MAX_ALIASES
      # even if the property has no named values.
      max_aliases = max(max_aliases, len(prop[1]))
      if not values: continue
      out_file.write("static const Value VALUES_%s[%d] = {\n" %
                     (pname, len(values)))
      # The gcm value constants are written with an explicit (int32_t) cast.
      cast = "(int32_t)" if pname == "gcm" else ""
      for (v_enum, vname) in values:
        aliases = prop[3][vname]
        # ccc, lccc, tccc: Omit the numeric strings from the aliases.
        # (See the comment about ccc in the PropertyValueAliases.txt header.)
        if pname.endswith("ccc"): aliases = aliases[1:]
        max_aliases = max(max_aliases, len(aliases))
        out_file.write('    Value(%s%s, "%s"),\n' %
                       (cast, v_enum, " ".join(aliases)))
      out_file.write("};\n\n")

    # For each property, write a Property object initializer
    # with the property enum, its aliases, and a reference to its values.
    out_file.write("static const Property PROPERTIES[%d] = {\n" %
                   len(_icu_properties))
    for (enum, pname, values) in _icu_properties:
      prop = _properties[pname]
      aliases = " ".join(prop[1])
      if values and prop[0] != "Binary":
        # Non-binary property with named values.
        out_file.write('    Property(%s, "%s", VALUES_%s, %d),\n' %
                       (enum, aliases, pname, len(values)))
      else:
        # Binary properties and properties without named values
        # get the two-argument form.
        out_file.write('    Property(%s, "%s"),\n' % (enum, aliases))
    out_file.write("};\n\n")

    out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases)
21732e5b6d6dSopenharmony_ci
21742e5b6d6dSopenharmony_ci# main() ------------------------------------------------------------------- ***
21752e5b6d6dSopenharmony_ci
def main():
  """Runs the tool: preprocesses and parses the UCD files, writes the outputs.

  Command line (read from sys.argv):
    preparseucd.py  path/to/UCD/root  path/to/ICU/src/root
  or, for debugging, to write only the ppucd file:
    preparseucd.py  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile

  In the normal mode this writes the Normalizer2 input files and ppucd.txt
  below icu4c/source/data/unidata, and pnames_data.h below
  tools/unicode/c/genprops, relative to the ICU source root.

  Raises:
    SystemExit: with the usage message if the command line is not recognized.
    Exception: if some Catalog or Enumerated property has no default value.
  """
  global _null_or_defaults
  only_ppucd = False
  if len(sys.argv) == 3:
    (ucd_root, icu_src_root) = sys.argv[1:3]
    ppucd_path = None
  elif len(sys.argv) == 4 and sys.argv[2] == "--only_ppucd":
    # For debugging:
    # preparseucd.py  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile
    ucd_root = sys.argv[1]
    ppucd_path = sys.argv[3]
    only_ppucd = True
    icu_src_root = "/tmp/ppucd"
  else:
    # A usage error must not look like success to the calling shell or
    # build script: sys.exit() with a string prints it to stderr
    # and exits with status 1.
    sys.exit("Usage: %s  path/to/UCD/root  path/to/ICU/src/root" % sys.argv[0])
  icu4c_src_root = os.path.join(icu_src_root, "icu4c")
  icu_tools_root = os.path.join(icu_src_root, "tools")
  # Collect the paths of all files anywhere below the UCD root.
  source_files = []
  for root, dirs, names in os.walk(ucd_root):
    for name in names:
      source_files.append(os.path.join(root, name))
  PreprocessFiles(source_files, icu4c_src_root)
  # Parse the processed files in a particular order.
  for file_group in _files_to_parse:
    for (basename, path, parser) in file_group:
      print("Parsing %s" % basename)
      # NOTE(review): value is not used below; the lookup is kept unchanged.
      # Confirm whether it is still needed.
      value = _files[basename]
      # Unicode data files are in UTF-8.
      charset = "UTF-8"
      if basename == "NamesList.txt":
        # The NamesList used to be in Latin-1 before Unicode 6.2.
        numeric_ucd_version = [int(field) for field in _ucd_version.split('.')]
        if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1"
      with codecs.open(path, "r", charset) as in_file:
        parser(in_file)
  _null_or_defaults = _null_values.copy()
  _null_or_defaults.update(_defaults)
  # Every Catalog and Enumerated property must have a default value,
  # from a @missing line. "nv" = "null value".
  pnv = [pname for (pname, nv) in _null_or_defaults.items() if nv == "??"]
  if pnv:
    raise Exception("no default values (@missing lines) for " +
                    "some Catalog or Enumerated properties: %s " % pnv)
  unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
  if not only_ppucd:
    # Write Normalizer2 input text files.
    # Do this before compacting the data so that we need not handle fallbacks.
    norm2_path = os.path.join(unidata_path, "norm2")
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(norm2_path, exist_ok=True)
    WriteNorm2(norm2_path)
  # Optimize block vs. cp properties.
  CompactBlocks()
  # Write the ppucd.txt output file.
  # Use US-ASCII so that ICU tests can parse it in the platform charset,
  # which may be EBCDIC.
  # Fix up non-ASCII data (NamesList.txt headings) to fit.
  if not ppucd_path:
    ppucd_path = os.path.join(unidata_path, "ppucd.txt")
  with codecs.open(ppucd_path, "w", "US-ASCII") as out_file:
    WritePreparsedUCD(out_file)
    out_file.flush()

  # TODO: PrintNameStats()

  if only_ppucd: return

  # ICU data for property & value names API
  ParseUCharHeader(icu4c_src_root)
  ParseUScriptHeader(icu4c_src_root)
  CheckPNamesData()
  genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
  os.makedirs(genprops_path, exist_ok=True)
  out_path = os.path.join(genprops_path, "pnames_data.h")
  WritePNamesDataHeader(out_path)
22522e5b6d6dSopenharmony_ci
22532e5b6d6dSopenharmony_ci
# Run the tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  main()
2256