12e5b6d6dSopenharmony_ci#!/usr/bin/python2.4 -B
22e5b6d6dSopenharmony_ci#
32e5b6d6dSopenharmony_ci# Copyright (C) 2017 and later: Unicode, Inc. and others.
42e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html
52e5b6d6dSopenharmony_ci#
62e5b6d6dSopenharmony_ci# Copyright (c) 2009-2010 International Business Machines
72e5b6d6dSopenharmony_ci# Corporation and others. All Rights Reserved.
82e5b6d6dSopenharmony_ci#
92e5b6d6dSopenharmony_ci#   file name:  ucdcopy.py
102e5b6d6dSopenharmony_ci#   encoding:   US-ASCII
112e5b6d6dSopenharmony_ci#   tab size:   8 (not used)
122e5b6d6dSopenharmony_ci#   indentation:4
132e5b6d6dSopenharmony_ci#
142e5b6d6dSopenharmony_ci#   created on: 2009aug04
152e5b6d6dSopenharmony_ci#   created by: Markus W. Scherer
162e5b6d6dSopenharmony_ci#
172e5b6d6dSopenharmony_ci# Copy Unicode Character Database (ucd) files from a tree
182e5b6d6dSopenharmony_ci# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
192e5b6d6dSopenharmony_ci# to a folder like ICU's source/data/unidata/
202e5b6d6dSopenharmony_ci# and modify some of the files to make them more compact.
212e5b6d6dSopenharmony_ci#
222e5b6d6dSopenharmony_ci# Invoke with two command-line parameters, for the source
232e5b6d6dSopenharmony_ci# and destination folders.
242e5b6d6dSopenharmony_ci
252e5b6d6dSopenharmony_ciimport os
262e5b6d6dSopenharmony_ciimport os.path
272e5b6d6dSopenharmony_ciimport re
282e5b6d6dSopenharmony_ciimport shutil
292e5b6d6dSopenharmony_ciimport sys
302e5b6d6dSopenharmony_ci
# Matches a data line (one that starts with hex digits) which carries a
# trailing "#" comment. Group 1 is the data part, without the comment and
# without the spaces directly in front of the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
# Matches a single hex code point followed by a semicolon at the start of a
# line; group 1 is the code point. Lines that start with a range such as
# "0000..001F;" do not match.
# Raw strings are used so that "\s" reaches the regex engine unchanged
# instead of being an invalid string escape sequence.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")
332e5b6d6dSopenharmony_ci
def _WriteRange(out_file, first, last, data):
  """Writes one line for the code points [first..last] that share data.

  A single code point is written as "XXXX<data>" and several code points
  as "XXXX..YYYY<data>", using at least four uppercase hex digits each.
  """
  if first == last:
    out_file.write("%04X%s\n" % (first, data))
  else:
    out_file.write("%04X..%04X%s\n" % (first, last, data))


def _StripAndCopyLines(in_file, out_file, do_merge):
  """Copies lines between open files; see CopyAndStripWithOptionalMerge."""
  first = -1  # First code point with first_data.
  last = -1  # Last code point with first_data; negative: no pending range.
  first_data = ""  # Common data for code points [first..last].
  for line in in_file:
    match = _strip_re.match(line)
    if match:
      # Data line with a comment: keep only the data.
      line = match.group(1)
    else:
      line = line.rstrip()
    if not do_merge:
      # Only strip, don't merge: just output the stripped line.
      out_file.write(line)
      out_file.write("\n")
      continue
    match = _code_point_re.match(line)
    if match:
      c = int(match.group(1), 16)
      # The data starts at the semicolon that ends the match.
      data = line[match.end() - 1:]
    else:
      c = -1
      data = ""
    if last >= 0 and (c != (last + 1) or data != first_data):
      # This line does not continue the pending range: output the range.
      _WriteRange(out_file, first, last, first_data)
      last = -1
    if c < 0:
      # No single code point on this line, output as is.
      out_file.write(line)
      out_file.write("\n")
    elif last < 0:
      # Set as the first line in a possible range.
      first = c
      last = c
      first_data = data
    else:
      # Here c == (last + 1) and data == first_data because of the
      # previous conditions: continue with the current range.
      last = c
  # last is only ever set in merge mode.
  if last >= 0:
    # Output the last range in the file.
    _WriteRange(out_file, first, last, first_data)


def CopyAndStripWithOptionalMerge(s, t, do_merge):
  """Copies a file, strips comments from data lines, and optionally merges.

  Every line that matches _strip_re (a data line with a trailing comment)
  is reduced to its data part; every other line only loses its trailing
  whitespace. Each output line is terminated with "\n".

  If do_merge is true, consecutive lines for single code points c, c+1, ...
  with identical data (the rest of the line starting at the semicolon)
  are written as one line with range syntax "XXXX..YYYY<data>".
  Lines that do not start with a single code point and a semicolon are
  written unchanged and end any pending range.

  Args:
    s: Path of the source file.
    t: Path of the destination file; it is created or overwritten.
    do_merge: Whether to merge adjacent code points with identical data.
  """
  in_file = open(s, "r")
  # try/finally rather than "with" so that both files are closed even when
  # copying fails, while staying compatible with old Python versions.
  try:
    out_file = open(t, "w")
    try:
      _StripAndCopyLines(in_file, out_file, do_merge)
    finally:
      out_file.close()
  finally:
    in_file.close()
952e5b6d6dSopenharmony_ci
962e5b6d6dSopenharmony_ci
def CopyAndStrip(s, t):
  """Copies s to t and drops the comments that follow data on data lines.

  Comments on other lines are kept, and no lines are merged.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=False)
1002e5b6d6dSopenharmony_ci
1012e5b6d6dSopenharmony_ci
def CopyAndStripAndMerge(s, t):
  """Copies s to t, strips comments, and compacts adjacent code points.

  In addition to removing comments, consecutive lines for adjacent
  code points that carry identical per-code point data are combined
  into a single line that uses range syntax.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=True)
1102e5b6d6dSopenharmony_ci
1112e5b6d6dSopenharmony_ci
# Maps the unversioned name of each known UCD file to its copy action.
# A value is either a function that is called with the source and
# destination file paths, or a tuple of such a function and the name of
# the destination folder.

# Simply copy these files.
_files = dict.fromkeys([
  "BidiMirroring.txt",
  "Blocks.txt",
  "CaseFolding.txt",
  "DerivedAge.txt",
  "DerivedBidiClass.txt",
  "DerivedJoiningGroup.txt",
  "DerivedJoiningType.txt",
  "DerivedNumericValues.txt",
  "NameAliases.txt",
  "NormalizationCorrections.txt",
  "PropertyAliases.txt",
  "PropertyValueAliases.txt",
  "ScriptExtensions.txt",
  "SpecialCasing.txt",
  "UnicodeData.txt",
], shutil.copy)

# Simply copy these test files as well, but into the "testdata" folder.
_files.update(dict.fromkeys([
  "BidiTest.txt",
  "GraphemeBreakTest.txt",
  "LineBreakTest.txt",
  "SentenceBreakTest.txt",
  "WordBreakTest.txt",
], (shutil.copy, "testdata")))

# Copy these files and remove comments behind data lines but not in others.
_files.update(dict.fromkeys([
  "DerivedCoreProperties.txt",
  "DerivedNormalizationProps.txt",
  "GraphemeBreakProperty.txt",
  "NormalizationTest.txt",
  "PropList.txt",
  "Scripts.txt",
  "SentenceBreakProperty.txt",
  "WordBreakProperty.txt",
], CopyAndStrip))

# Also merge lines with adjacent code point ranges.
_files.update(dict.fromkeys([
  "EastAsianWidth.txt",
  "LineBreak.txt",
], CopyAndStripAndMerge))
1492e5b6d6dSopenharmony_ci
# Matches a file name with a version suffix, such as "DerivedAge-5.2.0.txt"
# or "LineBreak-13.0.0d5.txt". Group 1 is the base name and group 2 is the
# extension including its dot, so group 1 + group 2 is the unversioned name.
# Each version field accepts one or more digits so that versions with
# multi-digit fields (10.0.0 and later) are recognized as well.
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")
1532e5b6d6dSopenharmony_ci
def main():
  """Copies all known UCD files from the source tree to the destination.

  Takes the source and destination root folders from sys.argv[1] and
  sys.argv[2]. Walks the source tree, removes the version suffix from
  each file name, and for each name listed in _files runs the registered
  action to create the file in its destination folder ("unidata" unless
  the _files entry names another folder). Unknown files are ignored.

  Exits with status 1 if a command-line argument is missing or if two
  source files map to the same known file name.
  """
  if len(sys.argv) < 3:
    sys.stderr.write("usage: %s source_root dest_root\n" % sys.argv[0])
    sys.exit(1)
  source_root = sys.argv[1]
  dest_root = sys.argv[2]
  # Collect the complete list of source files before copying anything.
  source_files = []
  for root, dirs, names in os.walk(source_root):
    for name in names:
      source_files.append(os.path.join(root, name))
  files_processed = set()
  for source_file in source_files:
    basename = os.path.basename(source_file)
    match = _file_version_re.match(basename)
    if match:
      # Remove the version suffix, e.g. "Name-5.2.0.txt" -> "Name.txt".
      basename = match.group(1) + match.group(2)
      # Single parenthesized argument: works as a print statement
      # and as a print function call.
      print(basename)
    if basename not in _files:
      continue
    if basename in files_processed:
      print("duplicate file basename %s!" % basename)
      sys.exit(1)
    files_processed.add(basename)
    action = _files[basename]
    if isinstance(action, tuple):
      # (function, destination folder name)
      dest_folder = action[1]
      action = action[0]
    else:
      dest_folder = "unidata"
    dest_path = os.path.join(dest_root, dest_folder)
    if not os.path.exists(dest_path):
      os.makedirs(dest_path)
    dest_file = os.path.join(dest_path, basename)
    action(source_file, dest_file)


# Run the copy only when executed as a script, not when imported.
if __name__ == "__main__":
  main()
187