unicode/py/preparseucd.py

2e5b6d6dSopenharmony_ci#!/usr/bin/python3 -B
2e5b6d6dSopenharmony_ci# -*- coding: utf-8 -*-
2e5b6d6dSopenharmony_ci# © 2016 and later: Unicode, Inc. and others.
2e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html
2e5b6d6dSopenharmony_ci# Copyright (c) 2009-2016 International Business Machines
2e5b6d6dSopenharmony_ci# Corporation and others. All Rights Reserved.
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci#   file name:  preparseucd.py
2e5b6d6dSopenharmony_ci#   encoding:   US-ASCII
2e5b6d6dSopenharmony_ci#   tab size:   8 (not used)
2e5b6d6dSopenharmony_ci#   indentation:4
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci#   created on: 2011nov03 (forked from ucdcopy.py)
2e5b6d6dSopenharmony_ci#   created by: Markus W. Scherer
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci# Copies Unicode Character Database (UCD) files from a tree
2e5b6d6dSopenharmony_ci# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
2e5b6d6dSopenharmony_ci# to ICU's source/data/unidata/ and source/test/testdata/
2e5b6d6dSopenharmony_ci# and modifies some of the files to make them more compact.
2e5b6d6dSopenharmony_ci# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci# Invoke with two command-line parameters:
2e5b6d6dSopenharmony_ci# 1. source folder with UCD & idna files
2e5b6d6dSopenharmony_ci# 2. ICU source root folder (ICU 59+ combined trunk with icu4c, icu4j, tools)
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci# Sample invocation:
2e5b6d6dSopenharmony_ci#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ciimport array
2e5b6d6dSopenharmony_ciimport bisect
2e5b6d6dSopenharmony_ciimport codecs
2e5b6d6dSopenharmony_ciimport os
2e5b6d6dSopenharmony_ciimport os.path
2e5b6d6dSopenharmony_ciimport re
2e5b6d6dSopenharmony_ciimport shutil
2e5b6d6dSopenharmony_ciimport sys
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Unicode version ---------------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
# Unicode version string. Stays "?" until ParsePropertyAliases() finds a
# "# PropertyAliases-x.y.z.txt" header comment and stores its version here.
_ucd_version = "?"
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# ISO 15924 script codes --------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
2e5b6d6dSopenharmony_ci# that are not yet in the UCD.
2e5b6d6dSopenharmony_ci_scripts_only_in_iso15924 = (
2e5b6d6dSopenharmony_ci    "Afak", "Blis", "Cirt", "Cyrs",
2e5b6d6dSopenharmony_ci    "Egyd", "Egyh", "Geok",
2e5b6d6dSopenharmony_ci    "Hanb", "Hans", "Hant",
2e5b6d6dSopenharmony_ci    "Inds", "Jamo", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
2e5b6d6dSopenharmony_ci    "Maya", "Moon", "Nkgb", "Phlv", "Roro",
2e5b6d6dSopenharmony_ci    "Sara", "Syre", "Syrj", "Syrn",
2e5b6d6dSopenharmony_ci    "Teng", "Visp", "Wole", "Zmth", "Zsye", "Zsym", "Zxxx"
2e5b6d6dSopenharmony_ci)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Properties --------------------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Properties that we do not want to store in ppucd.txt.
2e5b6d6dSopenharmony_ci# Not a frozenset so that we can add aliases for simpler subsequent testing.
2e5b6d6dSopenharmony_ci_ignored_properties = set((
2e5b6d6dSopenharmony_ci  # Other_Xyz only contribute to Xyz, store only the latter.
2e5b6d6dSopenharmony_ci  "OAlpha",
2e5b6d6dSopenharmony_ci  "ODI",
2e5b6d6dSopenharmony_ci  "OGr_Ext",
2e5b6d6dSopenharmony_ci  "OIDC",
2e5b6d6dSopenharmony_ci  "OIDS",
2e5b6d6dSopenharmony_ci  "OLower",
2e5b6d6dSopenharmony_ci  "OMath",
2e5b6d6dSopenharmony_ci  "OUpper",
2e5b6d6dSopenharmony_ci  # Further properties that just contribute to others.
2e5b6d6dSopenharmony_ci  "CE",  # Composition_Exclusion just contributes to Full_Composition_Exclusion.
2e5b6d6dSopenharmony_ci  "JSN",
2e5b6d6dSopenharmony_ci  # These properties just don't seem useful.
2e5b6d6dSopenharmony_ci  # They are deprecated since Unicode 6.0.
2e5b6d6dSopenharmony_ci  "XO_NFC",
2e5b6d6dSopenharmony_ci  "XO_NFD",
2e5b6d6dSopenharmony_ci  "XO_NFKC",
2e5b6d6dSopenharmony_ci  "XO_NFKD",
2e5b6d6dSopenharmony_ci  # ICU does not use Unihan properties.
2e5b6d6dSopenharmony_ci  "cjkAccountingNumeric",
2e5b6d6dSopenharmony_ci  "cjkOtherNumeric",
2e5b6d6dSopenharmony_ci  "cjkPrimaryNumeric",
2e5b6d6dSopenharmony_ci  "cjkCompatibilityVariant",
2e5b6d6dSopenharmony_ci  "cjkIICore",
2e5b6d6dSopenharmony_ci  "cjkIRG_GSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_HSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_JSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_KPSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_KSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_MSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_SSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_TSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_UKSource",
2e5b6d6dSopenharmony_ci  "cjkIRG_USource",
2e5b6d6dSopenharmony_ci  "cjkIRG_VSource",
2e5b6d6dSopenharmony_ci  "cjkRSUnicode"
2e5b6d6dSopenharmony_ci))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# These properties (short names) map code points to
2e5b6d6dSopenharmony_ci# strings or other unusual values (property types String or Miscellaneous)
2e5b6d6dSopenharmony_ci# that cannot be block-compressed (or would be confusing).
2e5b6d6dSopenharmony_ci_uncompressible_props = frozenset((
2e5b6d6dSopenharmony_ci  "bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
2e5b6d6dSopenharmony_ci  "isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
2e5b6d6dSopenharmony_ci  # scx is block-compressible.
2e5b6d6dSopenharmony_ci  "scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
2e5b6d6dSopenharmony_ci))
2e5b6d6dSopenharmony_ci
# Dictionary of properties.
# Keyed by property names and aliases, both as spelled in the data files
# and in normalized form (see NormPropName()).
# Each value is a tuple with
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
#    The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 3: Dictionary of property values.
#    For Catalog & Enumerated properties,
#    maps each value name to a list of aliases.
#    Empty for other types of properties.
_properties = {}

# Dictionary of binary-property values which we store as False/True.
# Same as the values dictionary of one of the binary properties.
_binary_values = {}

# Dictionary of null values.
# Keyed by short property names.
# These are type-specific values for properties that occur in the data.
# They are overridden by _defaults, block and code point properties.
_null_values = {}

# Property value names for null values.
# We do not store these in _defaults (see SetDefaultValue()).
_null_names = frozenset(("<none>", "NaN"))

# Dictionary of explicit default property values.
# Keyed by short property names.
_defaults = {"gc": "Cn"}

# _null_values overridden by explicit _defaults.
# Initialized after parsing is done.
_null_or_defaults = {}

# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
#   - enum: the ICU property value's enum constant string
#   - vname: the UCD short property value name
_icu_properties = []

# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}
2e5b6d6dSopenharmony_ci
# Matches any single character that is not an ASCII letter or digit.
_non_alnum_re = re.compile("[^a-zA-Z0-9]")

def NormPropName(pname):
  """Normalizes a property (or property value) name for loose matching.

  Drops every character other than ASCII letters and digits,
  then converts the remaining letters to lowercase."""
  alnum_only = _non_alnum_re.sub("", pname)
  return alnum_only.lower()
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def GetProperty(pname):
  """Returns the _properties value (the property tuple) for the pname.

  Returns None if the property is ignored.
  Caches alternate spellings of the property name,
  for known properties in _properties and
  for ignored properties in _ignored_properties,
  so that a later lookup with the same spelling succeeds immediately.

  Raises NameError if pname, even in normalized form,
  is neither a known nor an ignored property."""
  # Try the input name.
  prop = _properties.get(pname)
  if prop is not None: return prop
  if pname in _ignored_properties: return None
  # Try the normalized input name.
  norm_name = NormPropName(pname)
  prop = _properties.get(norm_name)
  if prop is not None:
    _properties[pname] = prop  # Cache prop under this new name spelling.
    return prop
  # pname itself was already tested above;
  # here the normalized spelling needs to be checked.
  if norm_name in _ignored_properties:
    _ignored_properties.add(pname)  # Remember to ignore this new name spelling.
    return None
  raise NameError("unknown property %s\n" % pname)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def GetShortPropertyName(pname):
  """Returns the short name of the property with the given name or alias.

  Returns "" for ignored properties.
  Returns the long name if the property has no short name."""
  if pname in _null_values:
    # _null_values is keyed by short names: nothing to look up.
    return pname
  prop = GetProperty(pname)
  if not prop:
    return ""
  aliases = prop[1]
  return aliases[0] if aliases[0] else aliases[1]
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def GetShortPropertyValueName(prop, vname):
  """Returns the short name of the property value vname of the property prop.

  vname may be the short name itself, one of the known aliases,
  or a different spelling which normalizes to a known alias.
  A newly seen spelling is cached in the property's values dictionary.
  Returns the long name if the value has no short name.

  Raises NameError if vname is not a value name of this property."""
  if vname in prop[2]:
    return vname
  value_aliases = prop[3]
  names = value_aliases.get(vname)
  if names is None:
    names = value_aliases.get(NormPropName(vname))
    if names is None:
      raise NameError("unknown value name %s for property %s\n" %
                      (vname, prop[1][0]))
    # Remember this spelling for next time.
    value_aliases[vname] = names
  return names[0] if names[0] else names[1]
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def NormalizePropertyValue(prop, vname):
  """Returns the value vname of the property prop in the form that we store.

  If the property has no set of value names, returns vname unchanged.
  Otherwise resolves vname to its short value name, and then
  - converts it to False/True for a Binary property ("Y" means True)
  - converts it to an integer if the property's short name ends with "ccc"."""
  if not prop[2]:
    return vname
  # Binary/Catalog/Enumerated property.
  value = GetShortPropertyValueName(prop, vname)
  if prop[0] == "Binary":
    value = (value == "Y")
  if prop[1][0].endswith("ccc"):
    value = int(value)
  return value
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Character data ----------------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
# Lists of NamesList h1 and h2 headings.
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []

# List of Unicode blocks.
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []

# List of ranges with algorithmic names.
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# List of Unicode character ranges and their properties,
2e5b6d6dSopenharmony_ci# stored as an inversion map with range_start & props dictionary.
2e5b6d6dSopenharmony_ci# Starts with one range for all of Unicode without any properties.
2e5b6d6dSopenharmony_ci# Setting values subdivides ranges.
2e5b6d6dSopenharmony_ci_starts = array.array('l', [0, 0x110000])  # array of int32_t
2e5b6d6dSopenharmony_ci_props = [{}, {}]  # props for 0 and 110000
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_cidef FindRange(x):
2e5b6d6dSopenharmony_ci  """ Binary search for x in the inversion map.
2e5b6d6dSopenharmony_ci  Returns the smallest i where x < _starts[i]"""
2e5b6d6dSopenharmony_ci  return bisect.bisect(_starts, x) - 1
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def GetProps(c):
  """Returns the properties dictionary of the range that contains c.

  The dictionary is the one stored in the inversion map, not a copy."""
  return _props[FindRange(c)]
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def UpdateProps(start, end, update):
  """Applies an update to the inversion map for the code points start..end.

  update is a sequence whose first three items are
  (need_to_update, do_update, u):
  - need_to_update(u, start, end, c_props) is called for each affected
    (sub)range; it returns whether that range needs to be modified.
  - do_update(u, start, end, c_props) is called for each range that
    needs to be modified; it is expected to change c_props in place.
  - u is the payload which is passed through to both functions.

  Ranges in _starts/_props which only partially overlap with start..end
  are split at start and/or at end+1, but only if need_to_update()
  says so, which avoids creating unnecessary boundaries.
  """
  assert 0 <= start <= end <= 0x10ffff
  (need_to_update, do_update, u) = (update[0], update[1], update[2])
  # Find the index i of the range in _starts that contains start.
  i = FindRange(start)
  limit = end + 1  # exclusive end of the update range
  # Intersect [start, limit[ with ranges in _starts.
  c_start = _starts[i]
  c_limit = _starts[i + 1]
  c_props = _props[i]
  # c_start <= start < c_limit
  if c_start < start:
    # The first range begins before start.
    # Only the part [start, min(c_limit, limit)[ is subject to the update.
    update_limit = c_limit if c_limit <= limit else limit
    if need_to_update(u, start, update_limit - 1, c_props):
      # Split off [c_start, start[ with a copy of c_props.
      # The original dictionary stays with [c_start, start[;
      # we continue with the copy for the new range that begins at start.
      i += 1
      c_props = c_props.copy()
      _starts.insert(i, start)
      _props.insert(i, c_props)
      c_start = start
  # Modify all ranges that are fully inside [start, limit[.
  while c_limit <= limit:
    # start <= c_start < c_limit <= limit
    if need_to_update(u, c_start, c_limit - 1, c_props):
      do_update(u, c_start, c_limit - 1, c_props)
    # Stop at the end of Unicode: There is no range after the last one.
    if c_limit == 0x110000: return
    i += 1
    c_start = c_limit
    c_limit = _starts[i + 1]
    c_props = _props[i]
  # Here limit < c_limit: The current range extends beyond the update range.
  if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
    # Split off [limit, c_limit[ with a copy of c_props.
    _starts.insert(i + 1, limit)
    _props.insert(i + 1, c_props.copy())
    # Modify [c_start, limit[ c_props.
    do_update(u, c_start, limit - 1, c_props)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def NeedToSetProps(props, start, end, c_props):
  """Returns True if props is not a sub-dict of c_props.

  That is, returns True if at least one of the props items is missing
  from c_props or has a different value there.
  start & end are not used; they are part of the UpdateProps()
  callback signature."""
  return any(pname not in c_props or value != c_props[pname]
             for (pname, value) in props.items())
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def DoSetProps(props, start, end, c_props):
  """Copies all of the props items into c_props, overwriting existing values.

  start & end are not used; they are part of the UpdateProps()
  callback signature."""
  for (pname, value) in props.items():
    c_props[pname] = value
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def SetProps(start, end, props):
  """Sets the props items for all code points start..end.

  Ranges which already have all of these values are left alone."""
  update = (NeedToSetProps, DoSetProps, props)
  UpdateProps(start, end, update)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def NeedToSetAlways(nv, start, end, c_props):
  """UpdateProps() need_to_update callback which requests an update
  for every range, regardless of the arguments."""
  return True
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
  """Ensures that there is a range start/limit at x.

  Does nothing if a range already starts at x.
  Otherwise splits the range that contains x:
  The lower part keeps its props dictionary,
  the new range starting at x gets a copy of it."""
  assert 0 <= x <= 0x10ffff
  i = FindRange(x)
  if _starts[i] == x: return
  # _starts[i] < x < _starts[i + 1]
  # Split the range at x.
  _starts.insert(i + 1, x)
  _props.insert(i + 1, _props[i].copy())
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def SetDefaultValue(pname, value):
  """Sets the property's default value. Ignores null values.

  Does nothing for ignored properties, for value names in _null_names,
  and when the normalized value equals the property's null value.
  Otherwise records the value in _defaults and
  sets it for all of Unicode."""
  prop = GetProperty(pname)
  if not prop or value in _null_names:
    return
  value = NormalizePropertyValue(prop, value)
  short_name = prop[1][0]
  if value == _null_values[short_name]:
    return
  _defaults[short_name] = value
  SetProps(0, 0x10ffff, {short_name: value})
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def SetBinaryPropertyToTrue(pname, start, end):
  """Sets the binary property pname to True for the code points start..end.

  Does nothing if the property is ignored."""
  prop = GetProperty(pname)
  if not prop:
    return
  assert prop[0] == "Binary"
  short_name = prop[1][0]
  SetProps(start, end, {short_name: True})
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def SetPropValue(prop, vname, start, end):
  """Sets the value vname, in normalized form, of the property prop
  (a _properties tuple) for the code points start..end."""
  normalized = NormalizePropertyValue(prop, vname)
  new_props = {prop[1][0]: normalized}
  SetProps(start, end, new_props)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def SetPropertyValue(pname, vname, start, end):
  """Sets the value vname of the property named pname
  for the code points start..end.

  Does nothing if the property is ignored."""
  prop = GetProperty(pname)
  if not prop:
    return
  SetPropValue(prop, vname, start, end)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Parsing ------------------------------------------------------------------ ***
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci_stripped_cp_re = re.compile("([0-9a-fA-F]+)$")
2e5b6d6dSopenharmony_ci_stripped_range_re = re.compile("([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
2e5b6d6dSopenharmony_ci# Default value for all of Unicode.
2e5b6d6dSopenharmony_ci_missing_re = re.compile("# *@missing: *0000\.\.10FFFF *; *(.+)$")
2e5b6d6dSopenharmony_ci# Default value for some range.
2e5b6d6dSopenharmony_ci_missing2_re = re.compile("# *@missing: *(.+)$")
2e5b6d6dSopenharmony_ci
def ReadUCDLines(in_file, want_ranges=True, want_other=False,
                 want_comments=False, want_missing=False):
  """Generator which parses lines from a semicolon-delimited UCD text file.

  Strips comments, ignores empty lines, and splits each data line
  into a list of fields with surrounding white space removed.
  Yields one tuple per reported line, where line is the stripped text:
  - ("range", line, start, end, fields) if want_ranges and the first field
    is a code point or a start..end range of code points
  - ("other", line, fields) for other data lines if want_other
  - ("comment", line) for whole-line comments if want_comments
  - ("missing", line, fields) if want_missing, for @missing lines
    which cover 0000..10FFFF; fields excludes the range
  If want_missing, then an @missing line for some other range is parsed
  like a regular data line, without its "# @missing:" prefix.

  Raises SyntaxError for a data line which is neither reported
  as a range nor wanted as "other".
  """
  for raw_line in in_file:
    line = raw_line.strip()
    if not line: continue
    if line.startswith("#"):  # whole-line comment
      missing_data = None
      if want_missing:
        match = _missing_re.match(line)
        if match:
          fields = [field.strip() for field in match.group(1).split(";")]
          yield ("missing", line, fields)
          continue
        match = _missing2_re.match(line)
        if match:
          # Keep only the text after the "missing" comment prefix.
          missing_data = match.group(1)
      if missing_data is None:
        if want_comments: yield ("comment", line)
        continue
      # Parse the remainder of the @missing line like regular data.
      line = missing_data
    hash_index = line.find("#")  # inline comment
    if hash_index >= 0:
      line = line[:hash_index].rstrip()
      if not line: continue
    fields = [field.strip() for field in line.split(";")]
    if want_ranges:
      first = fields[0]
      match = _stripped_range_re.match(first)
      if match:
        (start, end) = (int(match.group(1), 16), int(match.group(2), 16))
        yield ("range", line, start, end, fields)
        continue
      match = _stripped_cp_re.match(first)
      if match:
        c = int(match.group(1), 16)
        yield ("range", line, c, c, fields)
        continue
    if want_other:
      yield ("other", line, fields)
    else:
      raise SyntaxError("unable to parse line\n  %s\n" % line)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def AddBinaryProperty(short_name, long_name):
  """Registers an additional binary property with a short & a long name.

  The null value is False. The new property shares the value names and
  value aliases of the "Math" property, which must already be registered.
  The property is stored in _properties under both names and
  under their normalized forms."""
  _null_values[short_name] = False
  template = _properties["Math"]
  prop = ("Binary", [short_name, long_name], template[2], template[3])
  for key in (short_name, long_name,
              NormPropName(short_name), NormPropName(long_name)):
    _properties[key] = prop
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def AddSingleNameBinaryProperty(name):
  """Registers an additional binary property whose short name
  is the same as its long name.

  The null value is False. The new property shares the value names and
  value aliases of the "Math" property, which must already be registered."""
  _null_values[name] = False
  template = _properties["Math"]
  prop = ("Binary", [name, name], template[2], template[3])
  for key in (name, NormPropName(name)):
    _properties[key] = prop
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def AddPOSIXBinaryProperty(name):
  """Registers an ICU-specific (non-UCD) POSIX binary property.

  Such a property only gets a long name; its short name is "".
  The null value is False. The new property shares the value names and
  value aliases of the "Math" property, which must already be registered."""
  _null_values[name] = False
  template = _properties["Math"]
  prop = ("Binary", ["", name], template[2], template[3])
  norm_name = NormPropName(name)
  _properties[name] = prop
  _properties[norm_name] = prop
  # This is to match UProperty UCHAR_POSIX_ALNUM etc.
  _properties["posix" + norm_name] = prop
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Match a comment line like
2e5b6d6dSopenharmony_ci# PropertyAliases-6.1.0.txt
2e5b6d6dSopenharmony_ci# and extract the Unicode version.
2e5b6d6dSopenharmony_ci_ucd_version_re = re.compile("# *PropertyAliases" +
2e5b6d6dSopenharmony_ci                             "-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
2e5b6d6dSopenharmony_ci                             "\\.txt")
2e5b6d6dSopenharmony_ci
def ParsePropertyAliases(in_file):
  """Parses the property aliases file and registers all of its properties.

  - Sets _ucd_version from the header comment that names the file.
  - Takes the property type of the following data lines from each
    "<Type> Properties" comment line.
  - For each data line, either adds all aliases to _ignored_properties
    (if the first alias is already ignored), or sets the property's
    null value in _null_values and stores a new property tuple in
    _properties under every alias and normalized alias.
  - Finally adds provisional and ICU-specific properties.

  A "<Type> Properties" comment must precede the first data line;
  otherwise prop_type and null_value are not yet assigned there.
  Raises KeyError for a property type missing from prop_type_nulls.
  """
  global _ucd_version
  # Null value per type of property.
  prop_type_nulls = {
    "Binary": False,
    "Catalog": "??",  # Must be specified, e.g., in @missing line.
    "Enumerated": "??",  # Must be specified.
    "Numeric": "NaN",
    "String": "",
    "Miscellaneous": ""
  }
  for data in ReadUCDLines(in_file, want_ranges=False,
                           want_other=True, want_comments=True):
    if data[0] == "comment":
      line = data[1]
      match = _ucd_version_re.match(line)
      if match:
        _ucd_version = match.group(1)
      else:
        # Look for a section heading like "# Binary Properties".
        words = line[1:].lstrip().split()
        if len(words) == 2 and words[1] == "Properties":
          prop_type = words[0]
          null_value = prop_type_nulls[prop_type]
    else:
      # type == "other"
      aliases = data[2]
      name = aliases[0]
      if name in _ignored_properties:
        # Ignore all of the spellings of this property.
        for alias in aliases:
          _ignored_properties.add(alias)
          _ignored_properties.add(NormPropName(alias))
      else:
        # Combining classes are stored as integers (see
        # NormalizePropertyValue()), so their null value is the integer 0.
        if name.endswith("ccc"):
          _null_values[name] = 0
        else:
          _null_values[name] = null_value
        # The set and dictionary of values start out empty.
        prop = (prop_type, aliases, set(), {})
        for alias in aliases:
          _properties[alias] = prop
          _properties[NormPropName(alias)] = prop
  # Add provisional and ICU-specific properties we need.
  # We add some in support of runtime API, even if we do not write
  # data for them to ppucd.txt (e.g., lccc & tccc).
  # We add others just to represent UCD data that contributes to
  # some functionality, although Unicode has not "blessed" them
  # as separate properties (e.g., Turkic_Case_Folding).

  # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
  name = "Turkic_Case_Folding"
  _null_values[name] = ""
  prop = ("String", [name, name], set(), {})
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
  name = "Conditional_Case_Mappings"
  _null_values[name] = ""
  prop = ("Miscellaneous", [name, name], set(), {})
  _properties[name] = prop
  _properties[NormPropName(name)] = prop
  # lccc = ccc of first cp in canonical decomposition.
  # lccc & tccc share the type and the values of ccc;
  # only the list of names differs.
  _null_values["lccc"] = 0
  ccc_prop = list(_properties["ccc"])
  ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
  prop = tuple(ccc_prop)
  _properties["lccc"] = prop
  _properties["Lead_Canonical_Combining_Class"] = prop
  _properties["leadcanonicalcombiningclass"] = prop
  # tccc = ccc of last cp in canonical decomposition.
  _null_values["tccc"] = 0
  ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
  prop = tuple(ccc_prop)
  _properties["tccc"] = prop
  _properties["Trail_Canonical_Combining_Class"] = prop
  _properties["trailcanonicalcombiningclass"] = prop
  # Script_Extensions
  # Only added if the parsed file did not define scx.
  if "scx" not in _properties:
    _null_values["scx"] = ""
    prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
    _properties["scx"] = prop
    _properties["Script_Extensions"] = prop
    _properties["scriptextensions"] = prop
  # General Category as a bit mask.
  # Shares the value names and aliases of gc.
  _null_values["gcm"] = "??"
  gc_prop = _properties["gc"]
  prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
  _properties["gcm"] = prop
  _properties["General_Category_Mask"] = prop
  _properties["generalcategorymask"] = prop
  # Various binary properties.
  AddBinaryProperty("Sensitive", "Case_Sensitive")
  AddBinaryProperty("nfdinert", "NFD_Inert")
  AddBinaryProperty("nfkdinert", "NFKD_Inert")
  AddBinaryProperty("nfcinert", "NFC_Inert")
  AddBinaryProperty("nfkcinert", "NFKC_Inert")
  AddBinaryProperty("segstart", "Segment_Starter")
  # https://www.unicode.org/reports/tr51/#Emoji_Properties
  AddBinaryProperty("Emoji", "Emoji")
  AddBinaryProperty("EPres", "Emoji_Presentation")
  AddBinaryProperty("EMod", "Emoji_Modifier")
  AddBinaryProperty("EBase", "Emoji_Modifier_Base")
  AddBinaryProperty("EComp", "Emoji_Component")
  AddBinaryProperty("ExtPict", "Extended_Pictographic")
  # https://www.unicode.org/reports/tr51/#Emoji_Sets
  AddSingleNameBinaryProperty("Basic_Emoji")
  AddSingleNameBinaryProperty("Emoji_Keycap_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_Modifier_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_Flag_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_Tag_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji_ZWJ_Sequence")
  AddSingleNameBinaryProperty("RGI_Emoji")
  # C/POSIX character classes that do not have Unicode property [value] aliases.
  # See uchar.h.
  AddPOSIXBinaryProperty("alnum")
  AddPOSIXBinaryProperty("blank")
  AddPOSIXBinaryProperty("graph")
  AddPOSIXBinaryProperty("print")
  AddPOSIXBinaryProperty("xdigit")
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParsePropertyValueAliases(in_file):
  """Registers property value aliases and default values.

  Reads the lines that ReadUCDLines() yields for in_file
  (presumably PropertyValueAliases.txt).
  A "missing" line sets a default value via SetDefaultValue().
  Any other line lists a property name followed by the aliases of one value;
  the aliases are stored in the property's short-name set (prop[2])
  and in its value dict (prop[3]), under both the literal and the
  NormPropName()-normalized spelling.

  Afterwards fills in defaults for dt, nt, hst, gc and gcm where needed,
  and adds the ISO 15924-only script codes to the "sc" property.

  Raises ValueError if a code from _scripts_only_in_iso15924
  is already a known short script name.
  """
  global _binary_values
  for data in ReadUCDLines(in_file, want_ranges=False,
                           want_other=True, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(data[2][0], data[2][1])
      continue
    # type == "other"
    aliases = data[2]
    prop = GetProperty(aliases[0])
    if not prop:
      continue
    del aliases[0]  # Drop the property name; only the value aliases remain.
    short_name = aliases[0]
    if short_name == "n/a":  # no short name
      aliases[0] = ""
      short_name = aliases[1]
    prop[2].add(short_name)
    values = prop[3]
    for alias in aliases:
      if alias:
        # Every alias maps to the same (shared) list of aliases.
        values[alias] = aliases
        values[NormPropName(alias)] = aliases
    # Remember the value dict of the first Binary property we see.
    if prop[0] == "Binary" and not _binary_values:
      _binary_values = values
  # Some of the @missing lines with non-null default property values
  # are in files that we do not parse;
  # either because the @missing line would be the only reason
  # to parse such a file,
  # or because we compute the property at runtime,
  # such as the Hangul_Syllable_Type.
  fallback_defaults = (
      ("dt", "None"),  # DerivedDecompositionType.txt
      ("nt", "None"),  # DerivedNumericType.txt
      ("hst", "NA"),  # HangulSyllableType.txt
      ("gc", "Cn"),  # No @missing line in any .txt file?
  )
  for (pname, value) in fallback_defaults:
    if pname not in _defaults:
      _defaults[pname] = value
  # Copy the gc default value to gcm.
  _defaults["gcm"] = _defaults["gc"]
  # Add ISO 15924-only script codes.
  # Only for the ICU script code API, not necessary for parsing the UCD.
  script_prop = _properties["sc"]
  short_script_names = script_prop[2]  # set
  script_values = script_prop[3]  # dict
  remove_scripts = []
  for script in _scripts_only_in_iso15924:
    if script in short_script_names:
      # The UCD has caught up with this code; report it below.
      remove_scripts.append(script)
      continue
    short_script_names.add(script)
    # Do not invent a Unicode long script name before the UCD adds the script.
    script_list = [script, script]  # [short, long]
    script_values[script] = script_list
    # Probably not necessary because
    # we will not parse these scripts from the UCD:
    script_values[NormPropName(script)] = script_list
  if remove_scripts:
    raise ValueError(
        "remove %s from _scripts_only_in_iso15924" % remove_scripts)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseBlocks(in_file):
  """Collects the block ranges and sets the blk property.

  Reads the lines that ReadUCDLines() yields for in_file
  (presumably Blocks.txt).
  A "missing" line sets the default blk value.
  Each range line appends (start, end, {"blk": name}) to _blocks
  and sets blk=name on the range via SetPropertyValue().
  Finally _blocks is sorted by code point range.

  Raises ValueError if two blocks overlap.
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue("blk", data[2][0])
    else:
      # type == "range"
      (start, end, name) = (data[2], data[3], data[4][1])
      _blocks.append((start, end, {"blk": name}))
      SetPropertyValue("blk", name, start, end)
  # Sort on (start, end) only: the third tuple item is a dict, and comparing
  # dicts would raise TypeError for two blocks with identical ranges
  # before the overlap check below could report them.
  _blocks.sort(key=lambda b: (b[0], b[1]))
  # Check for overlapping blocks.
  prev_end = -1
  for b in _blocks:
    start = b[0]
    end = b[1]
    if prev_end >= start:
      # Format the complete message at once. Applying % to only the last
      # of several "+"-concatenated literals raises TypeError
      # instead of the intended ValueError.
      raise ValueError(
          "block %04lX..%04lX %s overlaps with another ending at %04lX\n" %
          (start, end, b[2]["blk"], prev_end))
    prev_end = end
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseUnicodeData(in_file):
  """Sets per-character properties from semicolon-separated fields.

  Reads the lines that ReadUCDLines() yields for in_file
  (presumably UnicodeData.txt); each line must be for a single code point.
  Pairs of "<..., First>" and "<..., Last>" lines are merged into one range,
  and ranges with algorithmic names are recorded in _alg_names_ranges.
  For every character or range, the properties
  na, gc, ccc, bc, dt, dm, nv, nt, Bidi_M, suc, slc, stc
  are collected (only where non-empty/non-zero) and passed to SetProps().

  Raises SyntaxError for inconsistent First/Last pairs
  or for disagreeing numeric fields.
  """
  # Map improper fractions to proper ones.
  # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
  # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
  reduced_fractions = {
      "2/12": "1/6",
      "3/12": "1/4",
      "4/12": "1/3",
      "6/12": "1/2",
      "8/12": "2/3",
      "9/12": "3/4",
      "10/12": "5/6",
  }
  dt_prop = GetProperty("dt")
  pending_first_line = ""
  pending_first = -1  # Start of an open First..Last range, or negative.
  for data in ReadUCDLines(in_file, want_missing=True):
    # type == "range"
    (line, c, end, fields) = (data[1], data[2], data[3], data[4])
    assert c == end
    name = fields[1]
    if name.startswith("<"):
      if name.endswith(", First>"):
        if pending_first >= 0:
          raise SyntaxError(
              "error: unterminated range started at\n  %s\n" %
              pending_first_line)
        # Remember the start; the properties are set when we see the end.
        pending_first = c
        pending_first_line = line
        continue
      if name.endswith(", Last>"):
        if pending_first < 0:
          raise SyntaxError(
              "error: range end without start at\n  %s\n" %
              line)
        if pending_first > c:
          raise SyntaxError(
              "error: range start/end out of order at\n  %s\n  %s\n" %
              (pending_first_line, line))
        # Strip "<" and ", First>" / ", Last>" and compare the range names.
        first_name = pending_first_line.split(";")[1][1:-8]
        name = name[1:-7]
        if first_name != name:
          raise SyntaxError(
              "error: range start/end name mismatch at\n  %s\n  %s\n" %
              (pending_first_line, line))
        end = c
        c = pending_first
        pending_first = -1
        # Remember algorithmic name ranges.
        if "Ideograph" in name:
          if c == 0x17000 or c == 0x18D00:
            prefix = "TANGUT IDEOGRAPH-"
          else:
            prefix = "CJK UNIFIED IDEOGRAPH-"
          _alg_names_ranges.append([c, end, "han", prefix])
        elif name == "Hangul Syllable":
          _alg_names_ranges.append([c, end, "hangul"])
      # Neither range names nor non-names like <control> are stored.
      name = ""
    props = {}
    if name:
      props["na"] = name
    props["gc"] = fields[2]
    ccc = int(fields[3])
    if ccc:
      props["ccc"] = ccc
    props["bc"] = fields[4]
    # Decomposition type & mapping.
    dm = fields[5]
    if dm:
      if dm.startswith("<"):
        # "<type> mapping": split off and normalize the type.
        dt_limit = dm.index(">")
        dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
        dm = dm[dt_limit + 1:].lstrip()
      else:
        dt = "Can"
      props["dt"] = dt
      props["dm"] = dm
    # Numeric type & value.
    decimal = fields[6]
    digit = fields[7]
    nv = fields[8]
    if (decimal and decimal != nv) or (digit and digit != nv):
      raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
    if nv:
      nv = reduced_fractions.get(nv, nv)
      props["nv"] = nv
      if decimal:
        props["nt"] = "De"
      elif digit:
        props["nt"] = "Di"
      else:
        props["nt"] = "Nu"
    if fields[9] == "Y":
      props["Bidi_M"] = True
    # ICU 49 and above does not support Unicode_1_Name any more.
    # See ticket #9013.
    # na1 = fields[10]
    # if na1: props["na1"] = na1
    # ISO_Comment is deprecated and has no values.
    # isc = fields[11]
    # if isc: props["isc"] = isc
    # Simple case mappings.
    suc = fields[12]
    slc = fields[13]
    stc = fields[14]
    for (pname, mapping) in (("suc", suc), ("slc", slc), ("stc", stc)):
      if mapping:
        props[pname] = mapping
    SetProps(c, end, props)
  if pending_first >= 0:
    raise SyntaxError(
        "error: unterminated range started at\n  %s\n" %
        pending_first_line)
  # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
  SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
  _alg_names_ranges.sort()
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# Line patterns used by ParseNamesList(). Fields are separated by tabs.
# Block heading: "@@", start code point, heading text, end code point.
_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
# Subheading: "@", two tabs, heading text.
_names_h2_re = re.compile("@\t\t(.+)")
# Character line: hex code point, tab, and at least one more character.
_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")
2e5b6d6dSopenharmony_ci
def ParseNamesList(in_file):
  """Collects block headings and subheadings from in_file.

  Lines are matched against _names_h1_re, _names_h2_re and _names_char_re
  after stripping surrounding whitespace; empty lines are skipped.
  Each block heading is appended to _h1 as (start, end, text).
  Each subheading is appended to _h2 as (code point, text),
  using the code point of the first character line that follows it;
  a subheading that is followed by a block heading first is dropped.
  U+00A0 in heading text is replaced with a space.
  Both lists are sorted at the end.
  """
  pending_h2 = ""
  for raw_line in in_file:
    line = raw_line.strip()
    if not line:
      continue
    h1_match = _names_h1_re.match(line)
    if h1_match:
      pending_h2 = ""  # Drop a pending h2 when we get to an h1.
      (start_hex, heading, end_hex) = h1_match.groups()
      start = int(start_hex, 16)
      end = int(end_hex, 16)
      _h1.append((start, end, heading.replace(u"\xa0", " ")))
      continue
    h2_match = _names_h2_re.match(line)
    if h2_match:
      pending_h2 = h2_match.group(1).replace(u"\xa0", " ")
      continue
    if not pending_h2:
      continue
    # Attach the pending subheading to the first character after it.
    char_match = _names_char_re.match(line)
    if char_match:
      _h2.append((int(char_match.group(1), 16), pending_h2))
      pending_h2 = ""
  _h1.sort()
  _h2.sort()
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseNamedProperties(in_file):
  """Parses a .txt file where the first column is a code point range
  and the second column is a property name.

  A line with only these two columns sets the binary property to True.
  A line with more columns sets the named property
  to the value in the third column.
  An @missing line sets the default value of the property it names."""
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(data[2][0], data[2][1])
      continue
    # type == "range"
    fields = data[4]
    if len(fields) == 2:
      SetBinaryPropertyToTrue(fields[1], data[2], data[3])
    else:
      SetPropertyValue(fields[1], fields[2], data[2], data[3])
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseOneProperty(in_file, pname):
  """Parses a .txt file where the first column is a code point range
  and the second column is the value of a known property.

  pname names that property; it is looked up once with GetProperty().
  An @missing line sets the default value of pname."""
  prop = GetProperty(pname)
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue(pname, data[2][0])
      continue
    # type == "range"
    (start, end, fields) = (data[2], data[3], data[4])
    SetPropValue(prop, fields[1], start, end)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# Parsers for single-property files.
# Each one delegates to ParseOneProperty() with a fixed property name.

def ParseBidiMirroring(in_file):
  ParseOneProperty(in_file, "bmg")


def ParseDerivedAge(in_file):
  ParseOneProperty(in_file, "age")


def ParseDerivedBidiClass(in_file):
  ParseOneProperty(in_file, "bc")


def ParseDerivedJoiningGroup(in_file):
  ParseOneProperty(in_file, "jg")


def ParseDerivedJoiningType(in_file):
  ParseOneProperty(in_file, "jt")


def ParseEastAsianWidth(in_file):
  ParseOneProperty(in_file, "ea")


def ParseGraphemeBreakProperty(in_file):
  ParseOneProperty(in_file, "GCB")


def ParseIndicPositionalCategory(in_file):
  ParseOneProperty(in_file, "InPC")


def ParseIndicSyllabicCategory(in_file):
  ParseOneProperty(in_file, "InSC")


def ParseLineBreak(in_file):
  ParseOneProperty(in_file, "lb")


def ParseScripts(in_file):
  ParseOneProperty(in_file, "sc")


def ParseScriptExtensions(in_file):
  ParseOneProperty(in_file, "scx")


def ParseSentenceBreak(in_file):
  ParseOneProperty(in_file, "SB")


def ParseVerticalOrientation(in_file):
  ParseOneProperty(in_file, "vo")


def ParseWordBreak(in_file):
  ParseOneProperty(in_file, "WB")
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def DoSetNameAlias(alias, start, end, c_props):
  """Adds alias to the comma-separated Name_Alias value in c_props.

  start and end are not used here."""
  try:
    c_props["Name_Alias"] += ',' + alias
  except KeyError:
    # First alias for this character.
    c_props["Name_Alias"] = alias
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseNameAliases(in_file):
  """Parses Name_Alias from NameAliases.txt.
  A character can have multiple aliases.

  In Unicode 6.0, there are two columns,
  with a name correction in the second column.

  In Unicode 6.1, there are three columns.
  The second contains an alias, the third its type.
  The documented types are:
    correction, control, alternate, figment, abbreviation

  Each alias is recorded as "type=alias" and appended to the character's
  Name_Alias value via UpdateProps() and DoSetNameAlias().
  This function does not sort the types, assuming they appear in this order.

  Raises ValueError if a line covers more than one code point."""
  for data in ReadUCDLines(in_file):
    (start, end) = (data[2], data[3])
    if start != end:
      raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
                       (start, end))
    fields = data[4]
    # Two-column (Unicode 6.0) lines are always corrections.
    alias_type = "correction" if len(fields) == 2 else fields[2]
    alias = alias_type + '=' + fields[1]
    UpdateProps(start, end, (NeedToSetAlways, DoSetNameAlias, alias))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def NeedToSetNumericValue(nv, start, end, c_props):
  """Checks whether a numeric value from DerivedNumericValues.txt is new.

  Args:
    nv: The numeric value from DerivedNumericValues.txt.
    start, end: The code point range; only used in the error message.
    c_props: The properties dict collected so far for the range.

  Returns:
    True if c_props has no "nv" yet, so that the value needs to be set;
    False if c_props already has the same value.

  Raises:
    ValueError: c_props has a different "nv" value.
  """
  c_nv = c_props.get("nv")
  if c_nv is None:
    # DerivedNumericValues.txt adds a Numeric_Value.
    # Without a value there must not be a numeric type either.
    assert "nt" not in c_props
    return True
  if nv != c_nv:
    raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
                     "but DerivedNumericValues.txt has nv=%s") %
                     (c_nv, start, end, nv))
  return False
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def DoSetNumericValue(nv, start, end, c_props):
  """Sets nt=Nu and nv=nv in c_props. start and end are not used here."""
  c_props["nt"] = "Nu"
  c_props["nv"] = nv
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseDerivedNumericValues(in_file):
  """Parses DerivedNumericValues.txt.
  For most characters, the numeric type & value were parsed previously
  from UnicodeData.txt but that does not show the values for Han characters.
  Here we check that values match those from UnicodeData.txt
  and add new ones."""
  # Ignore the @missing line which has an incorrect number of fields,
  # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
  # Also, "NaN" is just the Numeric null value anyway.
  for data in ReadUCDLines(in_file):
    (start, end, fields) = (data[2], data[3], data[4])
    # Conditional update to the numeric value in the 4th field.
    nv = fields[3]
    UpdateProps(start, end, (NeedToSetNumericValue, DoSetNumericValue, nv))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseCaseFolding(in_file):
  """Sets the case folding properties from status-tagged mappings.

  Reads the lines that ReadUCDLines() yields for in_file
  (presumably CaseFolding.txt).
  Each range line has a status in its second field and a mapping in its third:
    C sets both scf and cf,
    S sets scf only,
    F sets cf only,
    T sets Turkic_Case_Folding.
  An @missing line must have status C;
  its value becomes the default for both scf and cf.
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      assert data[2][0] == "C"  # common to scf & cf
      SetDefaultValue("scf", data[2][1])
      SetDefaultValue("cf", data[2][1])
    else:
      # type == "range"
      start = data[2]
      end = data[3]
      status = data[4][1]
      mapping = data[4][2]
      # Exact membership: a substring test against "CSFT" would also accept
      # "" or "CS" and then treat the line as a Turkic mapping.
      assert status in ("C", "S", "F", "T")
      if status == "C":
        SetProps(start, end, {"scf": mapping, "cf": mapping})
      elif status == "S":
        SetPropertyValue("scf", mapping, start, end)
      elif status == "F":
        SetPropertyValue("cf", mapping, start, end)
      else:  # status == "T"
        SetPropertyValue("Turkic_Case_Folding", mapping, start, end)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def DoSetConditionalCaseMappings(ccm, start, end, c_props):
  """Adds ccm to the comma-separated Conditional_Case_Mappings in c_props.

  start and end are not used here."""
  try:
    c_props["Conditional_Case_Mappings"] += ',' + ccm
  except KeyError:
    # First conditional mapping for this character.
    c_props["Conditional_Case_Mappings"] = ccm
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseSpecialCasing(in_file):
  """Sets the full case mappings lc, tc and uc.

  Reads the lines that ReadUCDLines() yields for in_file
  (presumably SpecialCasing.txt).
  A line without a (non-empty) fifth field is an unconditional mapping
  and sets lc, tc, uc from the second to fourth fields.
  A line with a condition in the fifth field is appended to the
  Conditional_Case_Mappings value as "condition:lc=...&tc=...&uc=...".
  An @missing line sets the defaults for lc, tc and uc, in this order.
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      defaults = data[2]
      SetDefaultValue("lc", defaults[0])
      SetDefaultValue("tc", defaults[1])
      SetDefaultValue("uc", defaults[2])
      continue
    # type == "range"
    (start, end, fields) = (data[2], data[3], data[4])
    condition = fields[4] if len(fields) >= 5 else ""
    if condition:
      # Conditional_Case_Mappings
      ccm = "".join([condition, ":lc=", fields[1],
                     "&tc=", fields[2], "&uc=", fields[3]])
      UpdateProps(start, end,
                  (NeedToSetAlways, DoSetConditionalCaseMappings, ccm))
    else:
      # Unconditional mappings.
      SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def ParseBidiBrackets(in_file):
  """Sets bpb (from the second field) and bpt (from the third field).

  Reads the lines that ReadUCDLines() yields for in_file
  (presumably BidiBrackets.txt); each line must be for a single code point.
  An @missing line sets the default bpt value from its second field.
  """
  for data in ReadUCDLines(in_file, want_missing=True):
    if data[0] == "missing":
      SetDefaultValue("bpt", data[2][1])
      continue
    # type == "range"
    (start, end) = (data[2], data[3])
    assert start == end
    fields = data[4]
    SetProps(start, end, {"bpb": fields[1], "bpt": fields[2]})
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Postprocessing ----------------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
def PrintedSize(pname, value):
  """Returns the number of characters needed to print pname with value.

  A True value counts as ";pname", a False value as ";-pname",
  and any other value as ";pname=value"."""
  if not isinstance(value, bool):
    return len(pname) + len(str(value)) + 2  # ";pname=value"
  if value:
    overhead = 1  # ";pname"
  else:
    overhead = 2  # ";-pname"
  return len(pname) + overhead
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def CompactBlock(b, i):
  """Moves common property values of the ranges in one block to the block.

  b is a block entry (start, end, props dict), presumably from _blocks;
  i is the index into _starts/_props of the range that starts at b[0].

  Unassigned ranges (gc=Cn) keep only their non-default values
  and are marked with unassigned=True.
  For assigned ranges, the values of each property are counted per range,
  and the value that saves the most printed characters becomes
  a block property; the ranges are then reduced to their differences
  from the default+block values.

  Returns the _starts index of the first range after this block."""
  assert b[0] == _starts[i]
  block_end = b[1]
  b_props = b[2]  # Normally just blk from Blocks.txt.
  # b_props["blk"] has not been canonicalized yet.
  b_props["blk"] = _props[i]["blk"]
  first_i = i
  # Count the number of occurrences of each property's value in this block.
  # To minimize the output, count the number of assigned ranges,
  # not the number of code points.
  num_ranges = 0
  prop_counters = {}
  # Having gc in b_props is unreachable with normal data,
  # and the default gc is expected to be Cn.
  b_is_unassigned = (b_props if "gc" in b_props else _defaults)["gc"] == "Cn"
  while _starts[i] <= block_end:
    props = _props[i]
    if "gc" in props:
      is_unassigned = props["gc"] == "Cn"
    else:
      is_unassigned = b_is_unassigned
    if is_unassigned:
      # Compact an unassigned range inside the block and
      # mark it to be written with "unassigned".
      # It falls back to default properties, not block properties,
      # except for the blk=Block property.
      assert props["blk"] == b_props["blk"]
      del props["blk"]
      # Iterate over a copy of the keys so we can del props[pname].
      for pname in list(props):
        if props[pname] == _null_or_defaults[pname]:
          del props[pname]
      # What remains are unusual default values for unassigned code points.
      # For example, bc=R or lb=ID.
      # See http://www.unicode.org/reports/tr44/#Default_Values_Table
      props["unassigned"] = True
    else:
      for (pname, value) in props.items():
        counter = prop_counters.get(pname)
        if counter is None:
          # All earlier assigned ranges had the default value.
          counter = {_null_or_defaults[pname]: num_ranges}
          prop_counters[pname] = counter
        counter[value] = counter.get(value, 0) + 1
      # Also count default values for properties that do not occur in a range.
      for (pname, counter) in prop_counters.items():
        if pname not in props:
          counter[_null_or_defaults[pname]] += 1
      num_ranges += 1
      # Invariant: For each counter, the sum of counts must equal num_ranges.
    i += 1
  # For each property that occurs within this block,
  # set the value that reduces the file size the most as a block property value.
  # This is usually the most common value.
  for (pname, counter) in prop_counters.items():
    default_value = _null_or_defaults[pname]
    default_size = PrintedSize(pname, default_value) * counter[default_value]
    best_value = None
    best_count = 0
    best_savings = 0
    for (value, count) in counter.items():
      if value == default_value or count <= 1:
        continue
      # Does the file get smaller by setting the block default?
      # We save writing the block value as often as it occurs,
      # minus once for writing it for the block,
      # minus writing the default value instead.
      savings = PrintedSize(pname, value) * (count - 1) - default_size
      # For two values with the same savings, pick the one that compares lower,
      # to make this deterministic (avoid flip-flopping).
      if (savings > best_savings or
          (savings > 0 and savings == best_savings and value < best_value)):
        best_value = value
        best_count = count
        best_savings = savings
    # Do not compress uncompressible properties,
    # with an exception for many empty-string values in a block
    # (NFKC_CF='' for tags and variation selectors).
    if best_savings <= 0:
      continue
    if ((pname not in _uncompressible_props) or
        (best_value == '' and best_count >= 12)):
      b_props[pname] = best_value
  # For each range and property, remove the default+block value
  # but set the default value if that property was not set
  # (i.e., it used to inherit the default value).
  b_defaults = _null_or_defaults.copy()
  b_defaults.update(b_props)
  i = first_i
  while _starts[i] <= block_end:
    props = _props[i]
    if "unassigned" not in props:
      # Compact an assigned range inside the block.
      for pname in prop_counters:
        if pname in props:
          if props[pname] == b_defaults[pname]:
            del props[pname]
        elif pname in b_props:
          # b_props only has non-default values.
          # Set the default value if it used to be inherited.
          props[pname] = _null_or_defaults[pname]
      # If there is only one assigned range, then move all of its properties
      # to the block.
      if num_ranges == 1:
        b_props.update(props)
        props.clear()
    i += 1
  # Return the _starts index of the first range after this block.
  return i
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def CompactNonBlock(limit, i):
  """Remove default property values from between-block ranges.

  Processes the ranges from index i in _starts/_props
  up to (excluding) the first range that starts at or after limit.
  Returns the index of that range."""
  default_is_unassigned = _defaults["gc"] == "Cn"  # This is expected to be true.
  while _starts[i] < limit:
    props = _props[i]
    # Look at gc before default values (possibly including gc) are removed.
    if "gc" in props:
      is_unassigned = props["gc"] == "Cn"
    else:
      is_unassigned = default_is_unassigned
    # Iterate over a copy of the keys so we can del props[pname].
    for pname in list(props):
      if props[pname] == _null_or_defaults[pname]:
        del props[pname]
    assert "blk" not in props
    # If there are no props left, then nothing will be printed.
    # Otherwise, add "unassigned" for more obvious output.
    if props and is_unassigned:
      props["unassigned"] = True
    i += 1
  # Return the _starts index of the first range after this block.
  return i
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def CompactBlocks():
  """Compacts the per-range properties, block by block.

  First makes sure that _starts has a boundary at the start of every block
  and right after its end (unless the block ends at U+10FFFF).
  Then walks the ranges and the blocks in parallel: ranges before a block
  go through CompactNonBlock(), ranges inside it through CompactBlock(),
  and whatever follows the last block through CompactNonBlock() again.
  """
  for block in _blocks:
    first, last = block[0], block[1]
    AddBoundary(first)
    if last < 0x10ffff:
      AddBoundary(last + 1)
  # index always points at the next range in _starts that is still to be done.
  index = 0
  for block in _blocks:
    if _starts[index] < block[0]:
      index = CompactNonBlock(block[0], index)
    index = CompactBlock(block, index)
  CompactNonBlock(0x110000, index)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Output ------------------------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
def AppendRange(fields, start, end):
  """Appends a code point or code point range field to the fields list.

  A single code point (start == end) is formatted as at least four uppercase
  hex digits; otherwise the field is "start..end" in the same format.
  """
  if start != end:
    text = "%04lX..%04lX" % (start, end)
  else:
    text = "%04lX" % start
  fields.append(text)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def AppendProps(fields, props):
  """Appends one field per property to the fields list.

  Properties are emitted in the order of their normalized names
  (NormPropName). A bool value becomes just the property name for True
  and "-" plus the name for False; any other value becomes "name=value".
  """
  for pname in sorted(props, key=NormPropName):
    value = props[pname]
    if not isinstance(value, bool):
      fields.append("%s=%s" % (pname, value))
    elif value:
      fields.append(pname)
    else:
      fields.append("-" + pname)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def WriteFieldsRangeProps(fields, start, end, props, out_file):
  """Writes one semicolon-separated line to out_file.

  The line consists of the given leading fields (the list is extended in
  place), followed by the start..end range and then the properties.
  """
  AppendRange(fields, start, end)
  AppendProps(fields, props)
  line = ";".join(fields)
  out_file.write(line + "\n")
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def EscapeNonASCII(s):
  """Returns s with every non-ASCII character replaced by an escape.

  Code points up to U+FFFF become a backslash-u escape with four uppercase
  hex digits; higher ones become a backslash-U escape with eight.
  ASCII characters are copied unchanged.
  """
  pieces = []
  for ch in s:
    code = ord(ch)
    if code <= 0x7f:
      pieces.append(ch)
    elif code <= 0xffff:
      pieces.append(u"\\u%04X" % code)
    else:
      pieces.append(u"\\U%08X" % code)
  return "".join(pieces)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def WritePreparsedUCD(out_file):
  """Writes the preparsed UCD (ppucd.txt) to out_file.

  Output order: a fixed header and the "ucd" version line, one "property"
  line per property in _null_values (sorted by normalized name), the two
  "binary" lines, the "value" lines of non-binary properties, the "defaults"
  line, and finally the per-range data. Block lines, NamesList headings
  (as comments) and "algnamesrange" lines are mixed in at the range where
  each of them starts.

  Side effects: adds boundaries to _starts via AddBoundary(), and removes
  the "unassigned" marker from range properties as they are written.
  """
  out_file.write("""# Preparsed UCD generated by ICU preparseucd.py
# Copyright (C) 1991 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
""")
  out_file.write("ucd;%s\n\n" % _ucd_version)
  # Property names are output in the order of their normalized forms.
  sorted_pnames = sorted(_null_values, key=NormPropName)
  for pname in sorted_pnames:
    prop = _properties[pname]
    out_file.write(";".join(["property", prop[0]] + prop[1]) + "\n")
  out_file.write("\n")
  for flag in ("N", "Y"):
    out_file.write(";".join(["binary"] + _binary_values[flag]) + "\n")
  for pname in sorted_pnames:
    prop = _properties[pname]
    short_names = prop[2]
    if not short_names or prop[0] == "Binary":
      continue
    for name in sorted(short_names):
      out_file.write(";".join(["value", prop[1][0]] + prop[3][name]) + "\n")
  out_file.write("\n")
  # Every kind of data that gets mixed into the output needs a boundary
  # in _starts at its start, so that the "start ==" tests below can fire.
  for ranges in (_blocks, _alg_names_ranges, _h1, _h2):
    for r in ranges:
      AddBoundary(r[0])
  # Write the preparsed data. ppucd.txt = preparsed UCD
  # Syntax: http://site.icu-project.org/design/props/ppucd
  WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
  # Cursors into the four lists that are merged with the ranges.
  next_block = 0
  next_alg = 0
  next_h1 = 0
  next_h2 = 0
  block_end = -1  # End of the most recent block, or -1 if outside of one.
  for i in range(len(_starts) - 1):
    start = _starts[i]
    end = _starts[i + 1] - 1
    # Block with default properties.
    if next_block < len(_blocks) and _blocks[next_block][0] == start:
      block = _blocks[next_block]
      next_block += 1
      block_end = block[1]
      WriteFieldsRangeProps(["\nblock"], block[0], block_end, block[2],
                            out_file)
    # NamesList h1 heading (for [most of] a block).
    if next_h1 < len(_h1) and _h1[next_h1][0] == start:
      heading = _h1[next_h1]
      next_h1 += 1
      out_file.write("# %04lX..%04lX %s\n" %
                     (heading[0], heading[1], EscapeNonASCII(heading[2])))
    # Algorithmic-names range.
    if next_alg < len(_alg_names_ranges) and _alg_names_ranges[next_alg][0] == start:
      alg = _alg_names_ranges[next_alg]
      next_alg += 1
      alg_fields = ["algnamesrange"]
      AppendRange(alg_fields, alg[0], alg[1])
      alg_fields.extend(alg[2:])
      out_file.write(";".join(alg_fields) + "\n")
    # NamesList h2 heading.
    if next_h2 < len(_h2) and _h2[next_h2][0] == start:
      out_file.write("# %s\n" % EscapeNonASCII(_h2[next_h2][1]))
      next_h2 += 1
    # Code point/range data.
    props = _props[i]
    if not props:
      # Omit ranges with only default+block properties.
      continue
    if block_end >= 0 and start > block_end:
      # First range with values after the last block.
      # Separate it visually from the block lines.
      out_file.write("\n# No block\n")
      block_end = -1
    line_type = "cp"
    if "unassigned" in props:
      # "unassigned" selects the line type; it is not output as a property.
      del props["unassigned"]
      line_type = "unassigned"
    WriteFieldsRangeProps([line_type], start, end, props, out_file)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Write Normalizer2 input files -------------------------------------------- ***
2e5b6d6dSopenharmony_ci# Ported from gennorm/store.c.
2e5b6d6dSopenharmony_ci
def WriteAllCC(out_file):
  """Writes all non-zero Canonical_Combining_Class values to out_file.

  Adjacent ranges with the same ccc are merged into one run. Each run is
  written as "XXXX:cc" for a single code point or "XXXX..YYYY:cc" for a
  range. A missing or falsy "ccc" property counts as 0. A run is only
  written when a later entry in _starts has a different ccc.
  """
  out_file.write("# Canonical_Combining_Class (ccc) values\n")
  run_start = 0
  run_cc = 0
  for i, start in enumerate(_starts):
    cc = _props[i].get("ccc") or 0
    if cc == run_cc:
      continue
    # The ccc value changes here: emit the finished run unless it was ccc=0.
    if run_cc != 0:
      run_end = start - 1
      if run_start == run_end:
        out_file.write("%04X:%d\n" % (run_end, run_cc))
      else:
        out_file.write("%04X..%04X:%d\n" %
                       (run_start, run_end, run_cc))
    run_start = start
    run_cc = cc
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def HasMapping(c):
  """Returns a true value if c has a Decomposition_Type other than "None".

  The result is the value of the "and" expression, so it can be a falsy
  non-bool (a missing or empty dt) rather than False; use it for truth
  testing only.
  """
  dt = GetProps(c).get("dt")
  return dt and dt != "None"
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def HasOneWayMapping(c):
  """Returns True if c has a decomposition mapping that does not round-trip.

  Returns False if c has no mapping (dt missing or "None"), and True for any
  decomposition type other than "Can". A canonical decomposition is one-way if
  - it does not map to exactly two code points
  - c has ccc!=0
  - c has the Composition_Exclusion property (Comp_Ex)
  - its non-starter (second code point) decomposes
  - its starter (first code point) has a one-way mapping;
    this last case is checked by looping with c set to the starter.
  """
  while True:
    props = GetProps(c)
    dt = props.get("dt")
    if not dt or dt == "None":
      return False  # no mapping
    if dt != "Can":
      # c has a compatibility mapping.
      return True
    parts = props["dm"].split()
    if len(parts) != 2:
      return True
    if props.get("ccc") or props.get("Comp_Ex"):
      return True
    if HasMapping(int(parts[1], 16)):
      return True
    # Continue with the starter of the decomposition.
    c = int(parts[0], 16)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# Copyright and license header for generated data files.
# The WriteNorm2...TextFile() functions below put it at the top of their
# output, directly followed by a "# file name: ..." line.
_data_file_copyright = """# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others.  All Rights Reserved.
#
"""
2e5b6d6dSopenharmony_ci
def WriteNorm2NFCTextFile(path):
  """Writes nfc.txt, the gennorm2 input for NFC, into the folder path.

  The file contains the copyright header, the Unicode version, all non-zero
  ccc values (WriteAllCC) and one line per canonical decomposition mapping:
  "XXXX=mapping" normally, or "XXXX>mapping" if the code point has Comp_Ex.
  """
  with open(os.path.join(path, "nfc.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfc.txt
#
# machine-generated by ICU preparseucd.py
#
# Complete data for Unicode NFC normalization.

* Unicode """ + _ucd_version + """

""")
    WriteAllCC(out_file)
    out_file.write("\n# Canonical decomposition mappings\n")
    for i in range(len(_starts) - 1):
      props = _props[i]
      dm = props.get("dm")
      # Only real (not "<...>") mappings of canonical type are relevant here.
      if not dm or dm[0] == '<' or props["dt"] != "Can":
        continue
      start = _starts[i]
      # A decomposition mapping must belong to a single code point.
      assert start == _starts[i + 1] - 1
      # The Comp_Ex=Full_Composition_Exclusion property tells us
      # whether the canonical decomposition round-trips.
      if props.get("Comp_Ex"):
        separator = '>'
      else:
        separator = '='
      out_file.write("%04X%s%s\n" % (start, separator, dm))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def WriteNorm2NFKCTextFile(path):
  """Writes nfkc.txt, the second gennorm2 input for NFKC, into the folder path.

  After the header, writes "XXXX>mapping" for every non-canonical
  decomposition mapping, and additionally for those canonical mappings
  without Comp_Ex for which HasOneWayMapping() is true
  (marked with a trailing comment).
  """
  with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode """ + _ucd_version + """

""")
    for i in range(len(_starts) - 1):
      props = _props[i]
      dm = props.get("dm")
      # Skip ranges without a mapping, and "<...>" placeholder values.
      if not dm or dm[0] == '<':
        continue
      start = _starts[i]
      # A decomposition mapping must belong to a single code point.
      assert start == _starts[i + 1] - 1
      if props["dt"] != "Can":
        # Compatibility decomposition.
        out_file.write("%04X>%s\n" % (start, dm))
        continue
      if props.get("Comp_Ex"):
        # Already a one-way mapping in nfc.txt.
        continue
      if HasOneWayMapping(start):
        # NFC round-trip mapping turns into NFKC one-way mapping.
        out_file.write("%04X>%s  # NFC round-trip, NFKC one-way\n" %
                       (start, dm))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def WriteNorm2NFKC_CFTextFile(path):
  """Writes nfkc_cf.txt, the third gennorm2 input file, into the folder path.

  Adjacent ranges with the same NFKC_CF value are merged and written as
  "XXXX>mapping" or "XXXX..YYYY>mapping". Ranges without an NFKC_CF value,
  and values that start with '<', are not written. An empty mapping string
  is written (as a mapping to nothing).

  NOTE(review): a run is only written when a following range differs from it;
  a run that is still pending when the loop ends is not flushed.
  """
  with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
    out_file.write(
        _data_file_copyright + """# file name: nfkc_cf.txt
#
# machine-generated by ICU preparseucd.py
#
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

""")
    out_file.write("* Unicode " + _ucd_version + "\n\n")
    # The current run of adjacent ranges with the same mapping.
    run_start = 0
    run_end = 0
    run_mapping = None
    for i in range(len(_starts) - 1):
      start = _starts[i]
      end = _starts[i + 1] - 1
      mapping = _props[i].get("NFKC_CF")
      if mapping == run_mapping and start == run_end + 1:
        # Same value and adjacent: extend the current run.
        run_end = end
        continue
      # Write the finished run unless it has no value or a "<...>" value.
      if run_mapping is not None and not (run_mapping and run_mapping[0] == '<'):
        if run_start == run_end:
          out_file.write("%04X>%s\n" % (run_start, run_mapping))
        else:
          out_file.write("%04X..%04X>%s\n" %
                         (run_start, run_end, run_mapping))
      # Start a new run with this range.
      run_start = start
      run_end = end
      run_mapping = mapping
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def WriteNorm2(path):
  """Writes nfc.txt, nfkc.txt and nfkc_cf.txt, in this order, into path."""
  writers = (WriteNorm2NFCTextFile,
             WriteNorm2NFKCTextFile,
             WriteNorm2NFKC_CFTextFile)
  for write_file in writers:
    write_file(path)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# UTS #46 Normalizer2 input file ------------------------------------------- ***
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci_idna_replacements = [
2e5b6d6dSopenharmony_ci  # Several versions of avoiding circular FFFD>FFFD mappings,
2e5b6d6dSopenharmony_ci  # depending on the version of the input file.
2e5b6d6dSopenharmony_ci  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
2e5b6d6dSopenharmony_ci  (re.compile(r"\.\.FFFD"), "..FFFC"),
2e5b6d6dSopenharmony_ci  (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
2e5b6d6dSopenharmony_ci  # Since we switch between checking and not checking for STD3 character
2e5b6d6dSopenharmony_ci  # restrictions at runtime, checking the non-LDH ASCII characters in code,
2e5b6d6dSopenharmony_ci  # we treat these values here like their regular siblings.
2e5b6d6dSopenharmony_ci  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
2e5b6d6dSopenharmony_ci  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
2e5b6d6dSopenharmony_ci  # For UTS #46, we do not care about "not valid in IDNA2008".
2e5b6d6dSopenharmony_ci  (re.compile(r"; *; NV8 +"), ""),
2e5b6d6dSopenharmony_ci  # ICU 63+ normalization no longer allows mappings for surrogate code points,
2e5b6d6dSopenharmony_ci  # and the UTS #46 code handles them instead.
2e5b6d6dSopenharmony_ci  (re.compile(r"^D800..DFFF    ; disallowed"), r"# D800..DFFF disallowed in code"),
2e5b6d6dSopenharmony_ci  # Normal transformations.
2e5b6d6dSopenharmony_ci  (re.compile(r"; disallowed"), ">FFFD"),
2e5b6d6dSopenharmony_ci  (re.compile(r"; ignored"), ">"),
2e5b6d6dSopenharmony_ci  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
2e5b6d6dSopenharmony_ci  (re.compile(r"; mapped +; "), ">"),
2e5b6d6dSopenharmony_ci  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
2e5b6d6dSopenharmony_ci]
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_cidef IdnaToUTS46TextFile(s, t):
2e5b6d6dSopenharmony_ci  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
2e5b6d6dSopenharmony_ci  # Different input/output file names.
2e5b6d6dSopenharmony_ci  dest_path = os.path.dirname(t)
2e5b6d6dSopenharmony_ci  t = os.path.join(dest_path, "uts46.txt")
2e5b6d6dSopenharmony_ci  # TODO: With Python 2.7+, combine the two with statements into one.
2e5b6d6dSopenharmony_ci  with open(s, "r") as in_file:
2e5b6d6dSopenharmony_ci    with open(t, "w") as out_file:
2e5b6d6dSopenharmony_ci      out_file.write("# Original file:\n")
2e5b6d6dSopenharmony_ci      for line in in_file:
2e5b6d6dSopenharmony_ci        orig_line = line
2e5b6d6dSopenharmony_ci        if line.startswith("# For documentation"):
2e5b6d6dSopenharmony_ci          out_file.write(line)
2e5b6d6dSopenharmony_ci          out_file.write(r"""
2e5b6d6dSopenharmony_ci# ================================================
2e5b6d6dSopenharmony_ci# This file has been reformatted into syntax for the
2e5b6d6dSopenharmony_ci# gennorm2 Normalizer2 data generator tool.
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
2e5b6d6dSopenharmony_ci# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
2e5b6d6dSopenharmony_ci# "disallowed" lines map to U+FFFD.
2e5b6d6dSopenharmony_ci# "ignored" lines map to an empty string.
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci# Characters disallowed under STD3 rules are treated as valid or mapped;
2e5b6d6dSopenharmony_ci# they are handled in code.
2e5b6d6dSopenharmony_ci# Deviation characters are also handled in code.
2e5b6d6dSopenharmony_ci#
2e5b6d6dSopenharmony_ci# Use this file as the second gennorm2 input file after nfc.txt.
2e5b6d6dSopenharmony_ci# ================================================
2e5b6d6dSopenharmony_ci""")
2e5b6d6dSopenharmony_ci          continue
2e5b6d6dSopenharmony_ci        if line[0] in "#\r\n":
2e5b6d6dSopenharmony_ci          out_file.write(line)
2e5b6d6dSopenharmony_ci          continue
2e5b6d6dSopenharmony_ci        for rep in _idna_replacements: line = rep[0].sub(rep[1], line)
2e5b6d6dSopenharmony_ci        # Align inline comments at column 40.
2e5b6d6dSopenharmony_ci        comment_pos = line.find("#", 1)
2e5b6d6dSopenharmony_ci        if comment_pos < 40:
2e5b6d6dSopenharmony_ci          line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
2e5b6d6dSopenharmony_ci                  line[comment_pos:])
2e5b6d6dSopenharmony_ci        elif comment_pos > 40:
2e5b6d6dSopenharmony_ci          space_pos = comment_pos
2e5b6d6dSopenharmony_ci          while space_pos > 0 and line[space_pos - 1] == ' ':
2e5b6d6dSopenharmony_ci            space_pos = space_pos - 1
2e5b6d6dSopenharmony_ci          if space_pos < 40:
2e5b6d6dSopenharmony_ci            # Fewer than 40 characters before the comment:
2e5b6d6dSopenharmony_ci            # Align comments at column 40.
2e5b6d6dSopenharmony_ci            line = line[:40] + line[comment_pos:]
2e5b6d6dSopenharmony_ci          else:
2e5b6d6dSopenharmony_ci            # 40 or more characters before the comment:
2e5b6d6dSopenharmony_ci            # Keep one space between contents and comment.
2e5b6d6dSopenharmony_ci            line = line[:space_pos] + " " + line[comment_pos:]
2e5b6d6dSopenharmony_ci        # Write the modified line.
2e5b6d6dSopenharmony_ci        out_file.write(line)
2e5b6d6dSopenharmony_ci        if "..FFFF" in orig_line and "..FFFC" in line:
2e5b6d6dSopenharmony_ci          out_file.write("FFFE..FFFF    >FFFD\n");
2e5b6d6dSopenharmony_ci  return t
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Preprocessing ------------------------------------------------------------ ***
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
2e5b6d6dSopenharmony_ci_code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_cidef CopyAndStripWithOptionalMerge(s, t, do_merge):
2e5b6d6dSopenharmony_ci  # TODO: We do not seem to need the do_merge argument and logic any more.
2e5b6d6dSopenharmony_ci  with open(s, "r") as in_file, open(t, "w") as out_file:
2e5b6d6dSopenharmony_ci    first = -1  # First code point with first_data.
2e5b6d6dSopenharmony_ci    last = -1  # Last code point with first_data.
2e5b6d6dSopenharmony_ci    first_data = ""  # Common data for code points [first..last].
2e5b6d6dSopenharmony_ci    for line in in_file:
2e5b6d6dSopenharmony_ci      match = _strip_re.match(line)
2e5b6d6dSopenharmony_ci      if match:
2e5b6d6dSopenharmony_ci        line = match.group(1)
2e5b6d6dSopenharmony_ci      else:
2e5b6d6dSopenharmony_ci        line = line.rstrip()
2e5b6d6dSopenharmony_ci      if do_merge:
2e5b6d6dSopenharmony_ci        match = _code_point_re.match(line)
2e5b6d6dSopenharmony_ci        if match:
2e5b6d6dSopenharmony_ci          c = int(match.group(1), 16)
2e5b6d6dSopenharmony_ci          data = line[match.end() - 1:]
2e5b6d6dSopenharmony_ci        else:
2e5b6d6dSopenharmony_ci          c = -1
2e5b6d6dSopenharmony_ci          data = ""
2e5b6d6dSopenharmony_ci        if last >= 0 and (c != (last + 1) or data != first_data):
2e5b6d6dSopenharmony_ci          # output the current range
2e5b6d6dSopenharmony_ci          if first == last:
2e5b6d6dSopenharmony_ci            out_file.write("%04X%s\n" % (first, first_data))
2e5b6d6dSopenharmony_ci          else:
2e5b6d6dSopenharmony_ci            out_file.write("%04X..%04X%s\n" % (first, last, first_data))
2e5b6d6dSopenharmony_ci          first = -1
2e5b6d6dSopenharmony_ci          last = -1
2e5b6d6dSopenharmony_ci          first_data = ""
2e5b6d6dSopenharmony_ci        if c < 0:
2e5b6d6dSopenharmony_ci          # no data on this line, output as is
2e5b6d6dSopenharmony_ci          out_file.write(line)
2e5b6d6dSopenharmony_ci          out_file.write("\n")
2e5b6d6dSopenharmony_ci        else:
2e5b6d6dSopenharmony_ci          # data on this line, store for possible range compaction
2e5b6d6dSopenharmony_ci          if last < 0:
2e5b6d6dSopenharmony_ci            # set as the first line in a possible range
2e5b6d6dSopenharmony_ci            first = c
2e5b6d6dSopenharmony_ci            last = c
2e5b6d6dSopenharmony_ci            first_data = data
2e5b6d6dSopenharmony_ci          else:
2e5b6d6dSopenharmony_ci            # must be c == (last + 1) and data == first_data
2e5b6d6dSopenharmony_ci            # because of previous conditions
2e5b6d6dSopenharmony_ci            # continue with the current range
2e5b6d6dSopenharmony_ci            last = c
2e5b6d6dSopenharmony_ci      else:
2e5b6d6dSopenharmony_ci        # Only strip, don't merge: just output the stripped line.
2e5b6d6dSopenharmony_ci        out_file.write(line)
2e5b6d6dSopenharmony_ci        out_file.write("\n")
2e5b6d6dSopenharmony_ci    if do_merge and last >= 0:
2e5b6d6dSopenharmony_ci      # output the last range in the file
2e5b6d6dSopenharmony_ci      if first == last:
2e5b6d6dSopenharmony_ci        out_file.write("%04X%s\n" % (first, first_data))
2e5b6d6dSopenharmony_ci      else:
2e5b6d6dSopenharmony_ci        out_file.write("%04X..%04X%s\n" % (first, last, first_data))
2e5b6d6dSopenharmony_ci      first = -1
2e5b6d6dSopenharmony_ci      last = -1
2e5b6d6dSopenharmony_ci      first_data = ""
2e5b6d6dSopenharmony_ci    out_file.flush()
2e5b6d6dSopenharmony_ci  return t
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def CopyAndStrip(s, t):
  """Copies file s to t, stripping comments behind data lines only.

  Same as CopyAndStripWithOptionalMerge() without merging. Returns t.
  """
  return CopyAndStripWithOptionalMerge(s, t, do_merge=False)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def CopyAndStripAndMerge(s, t):
  """Copies file s to t, stripping comments and merging lines.

  Lines for adjacent code points with identical per-code point data
  are combined into a single line with range syntax.
  Same as CopyAndStripWithOptionalMerge() with merging. Returns t.
  """
  return CopyAndStripWithOptionalMerge(s, t, do_merge=True)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def CopyOnly(s, t):
  """Preprocessor: copies file s to t unchanged, and returns t."""
  destination = t
  shutil.copy(s, destination)
  return destination
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def DontCopy(s, t):
  """Preprocessor: copies nothing, and returns the source path s.

  t is accepted for a uniform preprocessor signature but is not used.
  """
  return s
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# Maps the standard basename of each known input file to its handling.
# Each _files value is a
# (preprocessor, dest_folder, parser, order) tuple
# where all fields except the preprocessor are optional.
# Optional fields are simply omitted, so a value may look like
# (preprocessor,), (preprocessor, dest_folder), (preprocessor, parser)
# or (preprocessor, parser, order).
# After the initial preprocessing (copy/strip/merge),
# if a parser is specified, then a tuple is added to _files_to_parse
# at index "order" (default order 9).
# An explicit order number is set only for files that must be parsed
# before others.
_files = {
  "BidiBrackets.txt": (DontCopy, ParseBidiBrackets),
  "BidiMirroring.txt": (DontCopy, ParseBidiMirroring),
  "BidiTest.txt": (CopyOnly, "testdata"),
  "Blocks.txt": (DontCopy, ParseBlocks),
  "CaseFolding.txt": (CopyOnly, ParseCaseFolding),
  "DerivedAge.txt": (DontCopy, ParseDerivedAge),
  "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass),
  "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
  "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup),
  "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType),
  "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
  "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
  "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
  "emoji-data.txt": (DontCopy, ParseNamedProperties),
  "emoji-sequences.txt": (CopyOnly,),
  "emoji-zwj-sequences.txt": (CopyOnly,),
  "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
  "GraphemeBreakTest-cldr.txt": (CopyOnly, "testdata"),
  "IdnaTestV2.txt": (CopyOnly, "testdata"),
  "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
  "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
  "LineBreak.txt": (DontCopy, ParseLineBreak),
  "LineBreakTest.txt": (CopyOnly, "testdata"),
  "NameAliases.txt": (DontCopy, ParseNameAliases),
  "NamesList.txt": (DontCopy, ParseNamesList),
  "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep.
  "NormalizationTest.txt": (CopyAndStrip,),
  # The next two have the explicit order numbers 0 and 1.
  "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0),
  "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1),
  "PropList.txt": (DontCopy, ParseNamedProperties),
  "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak),
  "SentenceBreakTest.txt": (CopyOnly, "testdata"),
  "Scripts.txt": (DontCopy, ParseScripts),
  "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions),
  "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
  # Explicit order number 2, after the two alias files above.
  "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
  "VerticalOrientation.txt": (DontCopy, ParseVerticalOrientation),
  "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
  "WordBreakTest.txt": (CopyOnly, "testdata"),
  # From www.unicode.org/Public/idna/<version>/
  # IdnaToUTS46TextFile() writes uts46.txt rather than a same-named copy.
  "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
}
2e5b6d6dSopenharmony_ci
# Parse queue: ten buckets (indexes 0..9) that are walked in index order.
# Each bucket collects (basename, path, parser) tuples.
_files_to_parse = [[] for _ in range(10)]

# Recognizes a versioned filename so that the version can be dropped.
# For example, "UnicodeData-6.1.0d8.txt" matches with
# group(1)="UnicodeData" and group(2)=".txt", which together give
# the standard basename "UnicodeData.txt".
_file_version_re = re.compile(
    r"([a-zA-Z0-9_-]+)"
    r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
    r"(\.[a-z]+)$")
2e5b6d6dSopenharmony_ci
def PreprocessFiles(source_files, icu4c_src_root):
  """Preprocesses the known UCD files and queues the ones that need parsing.

  For each path in source_files:
  - A version suffix in the file name (for example "-6.1.0d8") is removed
    by renaming the file on disk.
  - If the resulting basename is a key of _files, the preprocessor from that
    entry is called with the source path and a destination path below
    icu4c_src_root (folder "unidata" unless the entry names another one).
  - If the entry also names a parser, (basename, parse_file, parser) is
    appended to the _files_to_parse bucket given by the entry's order
    (bucket 9 when the entry has no order).

  Raises:
    Exception: if two source files map to the same known basename.
  """
  unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
  dest_dirs = {
    "unidata": unidata_path,
    "norm2": os.path.join(unidata_path, "norm2"),
    "testdata": os.path.join(icu4c_src_root, "source", "test", "testdata")
  }
  seen_basenames = set()
  for source_file in source_files:
    folder, basename = os.path.split(source_file)
    version_match = _file_version_re.match(basename)
    if version_match:
      unversioned = "".join(version_match.group(1, 2))
      if unversioned != basename:
        print("Removing version suffix from " + source_file)
        # ... so that we can easily compare UCD files.
        renamed_file = os.path.join(folder, unversioned)
        shutil.move(source_file, renamed_file)
        basename, source_file = unversioned, renamed_file
    if basename not in _files:
      continue  # Not a file that we know how to handle.
    print("Preprocessing %s" % basename)
    if basename in seen_basenames:
      raise Exception("duplicate file basename %s!" % basename)
    seen_basenames.add(basename)
    entry = _files[basename]
    preprocessor = entry[0]
    # An entry is (preprocessor, [dest_folder,] [parser, [order]]).
    # A string in the second position is the destination folder name.
    if len(entry) >= 2 and isinstance(entry[1], str):
      dest_folder = entry[1]
      parse_info = entry[2:]
    else:
      dest_folder = "unidata"
      parse_info = entry[1:]
    dest_path = dest_dirs[dest_folder]
    if not os.path.exists(dest_path):
      os.makedirs(dest_path)
    # Source GraphemeBreakTest-cldr.txt --> destination GraphemeBreakTest.txt.
    if basename.endswith("-cldr.txt"):
      dest_basename = basename[:-len("-cldr.txt")] + basename[-len(".txt"):]
    else:
      dest_basename = basename
    dest_file = os.path.join(dest_path, dest_basename)
    parse_file = preprocessor(source_file, dest_file)
    if parse_info:
      parser = parse_info[0]
      order = parse_info[1] if len(parse_info) >= 2 else 9
      _files_to_parse[order].append((basename, parse_file, parser))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# Character names ---------------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# TODO: Turn this script into a module that
2e5b6d6dSopenharmony_ci# a) gives access to the parsed data
2e5b6d6dSopenharmony_ci# b) has a PreparseUCD(ucd_root, icu4c_src_root) function
2e5b6d6dSopenharmony_ci# c) has a ParsePreparsedUCD(filename) function
2e5b6d6dSopenharmony_ci# d) has a WritePreparsedUCD(filename) function
2e5b6d6dSopenharmony_ci# and then use it from a new script for names.
2e5b6d6dSopenharmony_ci# Some more API:
2e5b6d6dSopenharmony_ci# - generator GetRangesAndProps() -> (start, end, props)*
2e5b6d6dSopenharmony_ci
def IncCounter(counters, key, inc=1):
  """Adds inc to counters[key]; a key that is not yet present starts at inc."""
  try:
    counters[key] += inc
  except KeyError:
    counters[key] = inc
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# Word endings that close the leading token of a character name.
# SplitName() tries them in this order and uses the first one that occurs
# in the name, so the order of the entries matters.
endings = (
  # PHASE- comes before LETTER because of BAMUM LETTER PHASE-xyz.
  "PHASE-",
  "LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ",
  "CHOSEONG ", "JUNGSEONG ", "JONGSEONG ",
  "SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ",
  "ACROPHONIC ", "HIEROGLYPH ",
  "DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ",
  "PUNCTUATION ", "SIGN ", "SYMBOL ",
  "TILE ", "CARD ", "FACE ",
  "ACCENT ", "POINT ",
  # SIGN comes before VOWEL so that "vowel sign" is caught.
  "VOWEL ", "TONE ", "RADICAL ",
  # Names of math symbols,
  # e.g., MATHEMATICAL BOLD ITALIC CAPITAL A
  "SCRIPT ", "FRAKTUR ", "MONOSPACE ",
  "ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ",
  "INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ",
  # BRAILLE PATTERN DOTS-xyz
  "DOTS-",
  "SELECTOR ", "SELECTOR-"
)

def SplitName(name, tokens):
  """Splits a character name into tokens and counts them in tokens.

  The first token runs from the start of the name through the first entry
  of endings (in list order) that occurs in the name, if there is one.
  The rest of the name is cut after every space or hyphen, and the
  separator stays with the token before it. Whatever follows the last cut
  is counted as the final token.

  Args:
    name: the character name string.
    tokens: dict from token string to occurrence count; modified in place.
  """
  def count(token):
    tokens[token] = tokens.get(token, 0) + 1

  head_end = 0
  for ending in endings:
    pos = name.find(ending)
    if pos >= 0:
      head_end = pos + len(ending)
      count(name[:head_end])
      break
  token_start = head_end
  for offset, ch in enumerate(name[head_end:], head_end):
    if ch in " -":
      count(name[token_start:offset + 1])
      token_start = offset + 1
  count(name[token_start:])
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def PrintNameStats():
  """Prints statistics about character names and token-compression estimates.

  Reads the module-level _starts and _props tables and looks at the
  "na", "na1" and "Name_Alias" values of each range. Prints:
  - per-property name counts and total name lengths,
  - the set of characters used in names and how often each one occurs,
  - the longest name and the largest per-range total of names+terminators,
  - an estimate of how many bytes could be saved by replacing frequent
    name tokens (as split by SplitName) with 1-byte or 2-byte codes.
  """
  # TODO: This name analysis code is out of date.
  # It needs to consider the multi-type Name_Alias values.
  name_pnames = ("na", "na1", "Name_Alias")
  counts = {pname: 0 for pname in name_pnames}
  total_lengths = counts.copy()
  max_length = 0
  max_per_cp = 0
  name_chars = set()
  num_digits = 0
  token_counters = {}
  char_counters = {}
  # Range i spans _starts[i].._starts[i + 1] - 1;
  # only the properties of the range are needed here.
  for i in range(len(_starts) - 1):
    props = _props[i]
    per_cp = 0
    for pname in name_pnames:
      if pname not in props: continue
      counts[pname] += 1
      name = props[pname]
      total_lengths[pname] += len(name)
      name_chars |= set(name)
      if len(name) > max_length: max_length = len(name)
      per_cp += len(name) + 1
      if per_cp > max_per_cp: max_per_cp = per_cp
      SplitName(name, token_counters)
      for c in name:
        if c in "0123456789": num_digits += 1
        IncCounter(char_counters, c)
  # Blank separator line. (A bare "print" is a no-op expression in Python 3.)
  print()
  for pname in name_pnames:
    print("'%s' character names: %d / %d bytes" %
          (pname, counts[pname], total_lengths[pname]))
  print("%d total bytes in character names" % sum(total_lengths.values()))
  print("%d name-characters: %s" %
        (len(name_chars), "".join(sorted(name_chars))))
  print("%d digits 0-9" % num_digits)
  count_chars = [(count, c) for (c, count) in char_counters.items()]
  count_chars.sort(reverse=True)
  for cc in count_chars:
    print("name-chars: %6d * '%s'" % cc)
  print("max. name length: %d" % max_length)
  print("max. length of all (names+NUL) per cp: %d" % max_per_cp)

  token_lengths = sum(len(t) + 1 for t in token_counters)
  print("%d total tokens, %d bytes with NUL" %
        (len(token_counters), token_lengths))

  counts_tokens = []
  for (token, count) in token_counters.items():
    # If we encode a token with a 1-byte code, then we save len(t)-1 bytes each time
    # but have to store the token string itself with a length or terminator byte,
    # plus a 2-byte entry in a token index table.
    savings = count * (len(token) - 1) - (len(token) + 1 + 2)
    if savings > 0:
      counts_tokens.append((savings, count, token))
  counts_tokens.sort(reverse=True)
  print("%d tokens might save space with 1-byte codes" % len(counts_tokens))

  # Codes=bytes, 40 byte values for name_chars.
  # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
  # Make each 2-byte token the token string index itself, rather than
  # an index into a string index table.
  # More lead bytes but also more savings.
  num_units = 256
  # Integer (floor) division: the result is used as a range() bound below.
  max_lead = (token_lengths + 255) // 256
  max_token_units = num_units - len(name_chars)
  results = []
  # Try every split of the available units into 1-byte codes and lead bytes.
  for num_lead in range(min(max_lead, max_token_units) + 1):
    max1 = max_token_units - num_lead
    ct = counts_tokens[:max1]
    tokens1 = {t for (s, c, t) in ct}
    for (token, count) in token_counters.items():
      if token in tokens1: continue
      # If we encode a token with a 2-byte code, then we save len(t)-2 bytes each time
      # but have to store the token string itself with a length or terminator byte.
      savings = count * (len(token) - 2) - (len(token) + 1)
      if savings > 0:
        ct.append((savings, count, token))
    # NOTE(review): this sorts the 1-byte and 2-byte candidates together,
    # so the first max1 entries after sorting are not necessarily the tokens
    # that were selected for 1-byte codes above; confirm this is intended.
    ct.sort(reverse=True)
    # A 2-byte-code-token index cannot be limit_t_lengths or higher.
    limit_t_lengths = num_lead * 256
    token2_index = 0
    for i in range(max1, len(ct)):
      if token2_index >= limit_t_lengths:
        del ct[i:]
        break
      token2_index += len(ct[i][2]) + 1
    cumul_savings = sum(s for (s, c, t) in ct)
    # print ("%2d 1-byte codes: %4d tokens might save %6d bytes" %
    #        (max1, len(ct), cumul_savings))
    results.append((cumul_savings, max1, ct))
  best = max(results)  # (cumul_savings, max1, ct)

  max1 = best[1]
  print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" %
         (best[0], max1, max_token_units - max1))
  counts_tokens = best[2]
  cumul_savings = 0
  for i1, t in enumerate(counts_tokens, 1):
    n = 1 if i1 <= max1 else 2
    cumul_savings += t[0]
    # Print the first 250 lines, then every 100th, and always the last one.
    if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens):
      print(("%04d. cumul. %6d bytes save %6d bytes from " +
              "%5d * %d-byte token for %2d='%s'") %
          (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# ICU API ------------------------------------------------------------------ ***
2e5b6d6dSopenharmony_ci
# Regular expressions for parsing uchar.h.
# All patterns are raw strings so that regex escapes such as \* are not
# (invalid) Python string escape sequences.

# Sample line to match:
#    UCHAR_UNIFIED_IDEOGRAPH=29,
_uchar_re = re.compile(
    r" *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")

# Sample line to match:
#    /** Zs @stable ICU 2.0 */
_gc_comment_re = re.compile(r" */\*\* *([A-Z][a-z]) ")

# Sample line to match:
#    U_SPACE_SEPARATOR         = 12,
_gc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    /** L @stable ICU 2.0 */
_bc_comment_re = re.compile(r" */\*\* *([A-Z]{1,3}) ")

# Sample line to match:
#    U_LEFT_TO_RIGHT               = 0,
_bc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    UBLOCK_CYRILLIC =9,
_ublock_re = re.compile(r" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    U_EA_AMBIGUOUS,
_prop_and_value_re = re.compile(
    r" *(U_(BPT|DT|EA|GCB|HST|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")

# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
#    U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
_prop_and_alias_re = re.compile(r" *U_[0-9A-Z_]+ *= *U")
2e5b6d6dSopenharmony_ci
def ParseUCharHeader(icu4c_src_root):
  """Parses the property and value enum constants from ICU's uchar.h.

  Reads <icu4c_src_root>/source/common/unicode/uchar.h and
  - appends one (property enum, short property name, value list) tuple
    per UCHAR_ property constant to _icu_properties and registers it
    in _pname_to_icu_prop under its short property name;
  - appends (value enum, short value name) tuples to the value lists of
    gc, bc, blk and of the properties listed in _prop_and_value_re.

  After reading the file, adds values that are not parsed from it:
  the numeric ccc values (also copied to lccc and tccc), the gcm mask
  constants, and the normalization quick check values.

  Args:
    icu4c_src_root: path to the icu4c folder of the ICU source tree.
  """
  uchar_path = os.path.join(icu4c_src_root, "source",
                            "common", "unicode", "uchar.h")
  # Decode explicitly as UTF-8 rather than with the locale's default
  # encoding, so that parsing does not depend on the environment.
  with open(uchar_path, "r", encoding="utf-8") as uchar_file:
    mode = ""  # Mode string (=pname) during context-sensitive parsing.
    comment_value = ""  # Property value from a comment preceding an enum.
    # Note: The enum UProperty is first in uchar.h, before the enums for values.
    for line in uchar_file:
      # Parse some enums via context-sensitive "modes".
      # Necessary because the enum constant names do not contain
      # enough information.
      if "enum UCharCategory" in line:
        mode = "gc"
        comment_value = ""
        continue
      if mode == "gc":
        if line.startswith("}"):
          # End of the enum.
          mode = ""
          continue
        match = _gc_comment_re.match(line)
        if match:
          # Remember the value name for the constant on the next line.
          comment_value = match.group(1)
          continue
        match = _gc_re.match(line)
        if match and comment_value:
          gc_enum = match.group(1)
          prop = _properties["gc"]
          vname = GetShortPropertyValueName(prop, comment_value)
          icu_values = _pname_to_icu_prop["gc"][2]
          icu_values.append((gc_enum, vname))
        # A comment value applies only to the line right after it.
        comment_value = ""
        continue
      if "enum UCharDirection {" in line:
        mode = "bc"
        comment_value = ""
        continue
      if mode == "bc":
        if line.startswith("}"):
          # End of the enum.
          mode = ""
          continue
        match = _bc_comment_re.match(line)
        if match:
          # Remember the value name for the constant on the next line.
          comment_value = match.group(1)
          continue
        match = _bc_re.match(line)
        if match and comment_value:
          bc_enum = match.group(1)
          prop = _properties["bc"]
          vname = GetShortPropertyValueName(prop, comment_value)
          icu_values = _pname_to_icu_prop["bc"][2]
          icu_values.append((bc_enum, vname))
        # A comment value applies only to the line right after it.
        comment_value = ""
        continue
      # No mode, parse enum constants whose names contain
      # enough information to parse without requiring context.
      match = _uchar_re.match(line)
      if match:
        prop_enum = match.group(1)
        if prop_enum.endswith("_LIMIT"):
          # Ignore "UCHAR_BINARY_LIMIT=57," etc.
          continue
        # Strip the "UCHAR_" prefix to get the property name.
        pname = GetShortPropertyName(prop_enum[6:])
        icu_prop = (prop_enum, pname, [])
        _icu_properties.append(icu_prop)
        _pname_to_icu_prop[pname] = icu_prop
        continue
      match = _ublock_re.match(line)
      if match:
        prop_enum = match.group(1)
        if prop_enum == "UBLOCK_COUNT":
          continue
        prop = _properties["blk"]
        # Strip the "UBLOCK_" prefix to get the block name.
        vname = GetShortPropertyValueName(prop, prop_enum[7:])
        icu_values = _pname_to_icu_prop["blk"][2]
        icu_values.append((prop_enum, vname))
        continue
      match = _prop_and_value_re.match(line)
      if match:
        (prop_enum, vname) = match.group(1, 3)
        # Skip the U_xyz_COUNT constants and aliases of other constants.
        if vname == "COUNT" or _prop_and_alias_re.match(line):
          continue
        pname = GetShortPropertyName(match.group(2))
        prop = _properties[pname]
        vname = GetShortPropertyValueName(prop, vname)
        icu_values = _pname_to_icu_prop[pname][2]
        icu_values.append((prop_enum, vname))
  # ccc, lccc, tccc use their numeric values as "enum" values.
  # In the UCD data, these numeric values are the first value names,
  # followed by the short & long value names.
  # List the ccc values in numeric order.
  prop = _properties["ccc"]
  icu_values = _pname_to_icu_prop["ccc"][2]
  for ccc in sorted(int(name) for name in prop[2]):
    icu_values.append((ccc, str(ccc)))
  _pname_to_icu_prop["lccc"][2].extend(icu_values)  # Copy ccc -> lccc.
  _pname_to_icu_prop["tccc"][2].extend(icu_values)  # Copy ccc -> tccc.

  # No need to parse predictable General_Category_Mask enum constants.
  # Just define them in ASCII order.
  prop = _properties["gcm"]
  icu_values = _pname_to_icu_prop["gcm"][2]
  for vname in sorted(prop[2]):
    icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname))
  # Hardcode known values for the normalization quick check properties,
  # see unorm2.h for the UNormalizationCheckResult enum.
  icu_values = _pname_to_icu_prop["NFC_QC"][2]
  icu_values.append(("UNORM_NO", "N"))
  icu_values.append(("UNORM_YES", "Y"))
  icu_values.append(("UNORM_MAYBE", "M"))
  _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values)  # Copy NFC -> NFKC.
  # No "maybe" values for NF[K]D.
  icu_values = _pname_to_icu_prop["NFD_QC"][2]
  icu_values.append(("UNORM_NO", "N"))
  icu_values.append(("UNORM_YES", "Y"))
  _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values)  # Copy NFD -> NFKD.
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# Sample line to match:
#    USCRIPT_LOMA   = 139,/* Loma */
# Group 1 is the enum constant, group 2 is the four-letter script code
# from the trailing comment.
# Raw string: \* is a regex escape, not a valid Python string escape.
_uscript_re = re.compile(
    r" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
2e5b6d6dSopenharmony_ci
def ParseUScriptHeader(icu4c_src_root):
  """Parses the script enum constants from ICU's uscript.h.

  Reads <icu4c_src_root>/source/common/unicode/uscript.h and, for each
  line that matches _uscript_re, appends an (enum constant, script code)
  tuple to the value list of the "sc" entry in _pname_to_icu_prop.

  Args:
    icu4c_src_root: path to the icu4c folder of the ICU source tree.
  """
  uscript_path = os.path.join(icu4c_src_root, "source",
                              "common", "unicode", "uscript.h")
  icu_values = _pname_to_icu_prop["sc"][2]
  # Decode explicitly as UTF-8 rather than with the locale's default
  # encoding, so that parsing does not depend on the environment.
  with open(uscript_path, "r", encoding="utf-8") as uscript_file:
    for line in uscript_file:
      match = _uscript_re.match(line)
      if match:
        (script_enum, script_code) = match.group(1, 2)
        icu_values.append((script_enum, script_code))
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def CheckPNamesData():
  """Cross-checks the ICU value enum constants against the UCD value names.

  For every property in _icu_properties, each ICU value name must be one of
  the property's UCD short value names, and (with a few exceptions) every
  UCD value must be covered by an ICU enum constant.

  Raises:
    ValueError: if an ICU value name does not map back to the UCD, or
      if some properties lack enum constants for some of their values.
  """
  incomplete = []
  for (_, pname, values) in _icu_properties:
    prop = _properties[pname]
    # Start from a private copy of all short value names of the property
    # and tick off the ones that have an ICU enum constant.
    uncovered = set(prop[2])
    for (v_enum, vname) in values:
      if vname not in uncovered:
        raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
                         (pname, vname, v_enum))
      uncovered.remove(vname)
    if not uncovered:
      continue
    # Properties that are allowed to have values without enum constants:
    # - Binary properties: ICU has no specific enum values for No/Yes.
    # - age: ICU represents Age values via UVersionInfo, not enum constants.
    # - gc: ICU enum UCharCategory only has the single-category values.
    #       (ICU's gcm property has all of the UCD gc property values.)
    if prop[0] == "Binary" or pname in ("age", "gc"):
      continue
    incomplete.append((pname, uncovered))
  if incomplete:
    raise ValueError(
        "missing uchar.h enum constants for some property values: %s" %
        incomplete)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
def WritePNamesDataHeader(out_path):
  """Writes the C++ header with the property and value name data.

  The output consists of
  - a UNICODE_VERSION macro made from _ucd_version (padded to 4 fields),
  - VALUES_binprop with the value aliases shared by all binary properties,
  - one VALUES_<pname> array per property in _icu_properties that has
    named values,
  - the PROPERTIES array with one initializer per ICU property, and
  - MAX_ALIASES, the largest number of aliases of any property or value.

  Args:
    out_path: path of the header file to write; an existing file is replaced.
  """
  # The header text contains a non-ASCII character (the copyright sign),
  # so write UTF-8 explicitly rather than the locale's default encoding.
  with open(out_path, "w", encoding="utf-8") as out_file:
    out_file.write("""// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 * Copyright (C) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 * machine-generated by: icu/tools/unicode/py/preparseucd.py
 */

""")

    # Note: The uchar.h & uscript.h parsers store the ICU Unicode properties
    # and values in the order of their definition,
    # and this function writes them in that order.
    # Since the ICU API constants are stable and new values are only
    # appended at the end
    # (new properties are added at the end of each binary/enum/... range),
    # the output is stable as well.
    # When a property or value constant is renamed,
    # it only changes the name itself in the output;
    # it does not move in the output since there is no sorting.
    # This minimizes diffs and assists with reviewing and evaluating updates.

    version = _ucd_version.split('.')
    while len(version) < 4: version.append("0")
    out_file.write("#define UNICODE_VERSION { %s }\n\n" % ", ".join(version))

    # Count the maximum number of aliases for any property or value.
    # We write the final value at the end.
    max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"]))

    # Write an array of "binprop" Value object initializers
    # with the value aliases shared among all binary properties.
    out_file.write("static const Value VALUES_binprop[2] = {\n")
    out_file.write('    Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
    out_file.write('    Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
    out_file.write("};\n\n")

    # For each property with named values, write an array of
    # Value object initializers with the value enum and the aliases.
    for (p_enum, pname, values) in _icu_properties:
      prop = _properties[pname]
      # The property's own aliases count toward MAX_ALIASES
      # even if the property has no named values.
      max_aliases = max(max_aliases, len(prop[1]))
      if not values: continue
      out_file.write("static const Value VALUES_%s[%d] = {\n" %
                     (pname, len(values)))
      for (v_enum, vname) in values:
        aliases = prop[3][vname]
        # ccc, lccc, tccc: Omit the numeric strings from the aliases.
        # (See the comment about ccc in the PropertyValueAliases.txt header.)
        if pname.endswith("ccc"): aliases = aliases[1:]
        max_aliases = max(max_aliases, len(aliases))
        cast = "(int32_t)" if pname == "gcm" else ""
        out_file.write('    Value(%s%s, "%s"),\n' %
                       (cast, v_enum, " ".join(aliases)))
      out_file.write("};\n\n")

    # For each property, write a Property object initializer
    # with the property enum, its aliases, and a reference to its values.
    out_file.write("static const Property PROPERTIES[%d] = {\n" %
                   len(_icu_properties))
    for (enum, pname, values) in _icu_properties:
      prop = _properties[pname]
      aliases = " ".join(prop[1])
      if prop[0] != "Binary" and values:  # Property with named values.
        out_file.write('    Property(%s, "%s", VALUES_%s, %d),\n' %
                       (enum, aliases, pname, len(values)))
      else:
        # Binary properties (they share VALUES_binprop)
        # and properties without named values.
        out_file.write('    Property(%s, "%s"),\n' % (enum, aliases))
    out_file.write("};\n\n")

    out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci# main() ------------------------------------------------------------------- ***
2e5b6d6dSopenharmony_ci
def main():
  """Command-line entry point: parses the UCD files and writes ICU data files.

  The arguments are taken from sys.argv, in one of two forms:

    preparseucd.py  path/to/UCD/root  path/to/ICU/src/root
      Normal mode: writes the Normalizer2 input files, ppucd.txt and
      pnames_data.h below the ICU source root.

    preparseucd.py  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile
      Debugging mode: writes only the preparsed UCD file, to the given path.
      "/tmp/ppucd" stands in for the ICU source root in this mode.

  Side effects: rebinds the module-level _null_or_defaults to a copy of
  _null_values updated with _defaults.

  Raises:
    SystemExit: with status 2, after printing a usage message to stderr,
        if the command line matches neither form.
    Exception: if any property is left with the placeholder value "??"
        after all files have been parsed.
  """
  global _null_or_defaults
  argv = sys.argv
  only_ppucd = False
  if len(argv) == 3:
    (ucd_root, icu_src_root) = argv[1:3]
    ppucd_path = None
  elif len(argv) == 4 and argv[2] == "--only_ppucd":
    # For debugging:
    # preparseucd.py  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile
    ucd_root = argv[1]
    ppucd_path = argv[3]
    only_ppucd = True
    icu_src_root = "/tmp/ppucd"
  else:
    # Report misuse on stderr and with a non-zero exit status so that
    # calling scripts can detect it.
    print("Usage: %s  path/to/UCD/root  path/to/ICU/src/root\n"
          "   or: %s  path/to/UCD/root  --only_ppucd  path/to/ppucd/outputfile" %
          (argv[0], argv[0]), file=sys.stderr)
    sys.exit(2)
  icu4c_src_root = os.path.join(icu_src_root, "icu4c")
  icu_tools_root = os.path.join(icu_src_root, "tools")
  # Collect the paths of all files anywhere below the UCD root.
  source_files = [os.path.join(dir_path, file_name)
                  for (dir_path, _, file_names) in os.walk(ucd_root)
                  for file_name in file_names]
  PreprocessFiles(source_files, icu4c_src_root)
  # Parse the processed files in a particular order.
  for files in _files_to_parse:
    for (basename, path, parser) in files:
      print("Parsing %s" % basename)
      # The looked-up value is not used here. The lookup itself is kept so
      # that a basename which is missing from _files still fails at this
      # point, as it always has.
      _files[basename]
      # Unicode data files are in UTF-8.
      charset = "UTF-8"
      if basename == "NamesList.txt":
        # The NamesList used to be in Latin-1 before Unicode 6.2.
        numeric_ucd_version = [int(field) for field in _ucd_version.split('.')]
        if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1"
      with codecs.open(path, "r", charset) as in_file:
        parser(in_file)
  _null_or_defaults = _null_values.copy()
  _null_or_defaults.update(_defaults)
  # Every Catalog and Enumerated property must have a default value,
  # from a @missing line. "nv" = "null value".
  pnv = [pname for (pname, nv) in _null_or_defaults.items() if nv == "??"]
  if pnv:
    raise Exception("no default values (@missing lines) for " +
                    "some Catalog or Enumerated properties: %s " % pnv)
  unidata_path = os.path.join(icu4c_src_root, "source", "data", "unidata")
  if not only_ppucd:
    # Write Normalizer2 input text files.
    # Do this before compacting the data so that we need not handle fallbacks.
    norm2_path = os.path.join(unidata_path, "norm2")
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(norm2_path, exist_ok=True)
    WriteNorm2(norm2_path)
  # Optimize block vs. cp properties.
  CompactBlocks()
  # Write the ppucd.txt output file.
  # Use US-ASCII so that ICU tests can parse it in the platform charset,
  # which may be EBCDIC.
  # Fix up non-ASCII data (NamesList.txt headings) to fit.
  if not ppucd_path:
    ppucd_path = os.path.join(unidata_path, "ppucd.txt")
  # Leaving the with block closes the file, which also flushes it.
  with codecs.open(ppucd_path, "w", "US-ASCII") as out_file:
    WritePreparsedUCD(out_file)

  # TODO: PrintNameStats()

  if only_ppucd: return

  # ICU data for property & value names API
  ParseUCharHeader(icu4c_src_root)
  ParseUScriptHeader(icu4c_src_root)
  CheckPNamesData()
  genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
  os.makedirs(genprops_path, exist_ok=True)
  out_path = os.path.join(genprops_path, "pnames_data.h")
  WritePNamesDataHeader(out_path)
2e5b6d6dSopenharmony_ci
2e5b6d6dSopenharmony_ci
# Script entry point: run the tool only when this file is executed directly,
# so that importing it as a module does not start any processing.
if __name__ == "__main__":
  main()