#!/usr/bin/python2.4 -B
#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2009-2010 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: ucdcopy.py
# encoding: US-ASCII
# tab size: 8 (not used)
# indentation:4
#
# created on: 2009aug04
# created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.

import os
import os.path
import re
import shutil
import sys

# Matches a data line (starting with a hex code point) that has a trailing
# comment; group 1 is the data part without the spaces before the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
# Matches a single leading code point up to and including the first ";".
# Lines that already use range syntax (XXXX..YYYY) do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
    """Writes one data line for the code points [first..last] with data."""
    if first == last:
        out_file.write("%04X%s\n" % (first, data))
    else:
        out_file.write("%04X..%04X%s\n" % (first, last, data))


def _CopyLines(in_file, out_file, do_merge):
    """Copies lines from in_file to out_file, stripping and maybe merging.

    Trailing comments on data lines are removed; all other lines only lose
    their trailing whitespace. If do_merge is true, consecutive single code
    point lines with identical data are combined into one range line.
    """
    first = -1  # First code point with first_data.
    last = -1  # Last code point with first_data; -1 if no range is pending.
    first_data = ""  # Common data for code points [first..last].
    for line in in_file:
        match = _strip_re.match(line)
        if match:
            line = match.group(1)
        else:
            line = line.rstrip()
        if not do_merge:
            # Only strip, don't merge: just output the stripped line.
            out_file.write(line)
            out_file.write("\n")
            continue
        match = _code_point_re.match(line)
        if match:
            c = int(match.group(1), 16)
            # Keep everything from the ";" onwards as the per-line data.
            data = line[match.end() - 1:]
        else:
            c = -1
            data = ""
        if last >= 0 and (c != (last + 1) or data != first_data):
            # This line does not extend the pending range: output the range.
            _WriteRange(out_file, first, last, first_data)
            first = -1
            last = -1
            first_data = ""
        if c < 0:
            # No data on this line, output as is.
            out_file.write(line)
            out_file.write("\n")
        elif last < 0:
            # Set as the first line in a possible range.
            first = c
            last = c
            first_data = data
        else:
            # Must be c == (last + 1) and data == first_data because of
            # the previous conditions: continue with the current range.
            last = c
    if last >= 0:
        # Output the last range in the file. (last stays -1 unless merging.)
        _WriteRange(out_file, first, last, first_data)


def CopyAndStripWithOptionalMerge(s, t, do_merge):
    """Copies file s to file t, stripping comments behind data lines.

    Args:
        s: Path of the source file.
        t: Path of the destination file; it is created or overwritten.
        do_merge: If true, also merge lines for adjacent code points that
            have identical data into single lines with range syntax.
    """
    # NOTE(review): both files are opened in text mode with the platform
    # default encoding; confirm this suits the input files when running
    # under Python 3 with a non-UTF-8 locale.
    in_file = open(s, "r")
    try:
        out_file = open(t, "w")
        try:
            _CopyLines(in_file, out_file, do_merge)
        finally:
            out_file.close()
    finally:
        in_file.close()


def CopyAndStrip(s, t):
    """Copies a file and removes comments behind data lines but not in others."""
    CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
    """Copies and strips a file and merges lines.

    Copies a file, removes comments, and
    merges lines with adjacent code point ranges and identical per-code point
    data lines into one line with range syntax.
    """
    CopyAndStripWithOptionalMerge(s, t, True)


# Maps each known file name to its action: either a callable(source, dest),
# or a (callable, destination subfolder) tuple. Files without an explicit
# subfolder go into "unidata".
_files = {
    # Simply copy these files.
    "BidiMirroring.txt": shutil.copy,
    "BidiTest.txt": (shutil.copy, "testdata"),
    "Blocks.txt": shutil.copy,
    "CaseFolding.txt": shutil.copy,
    "DerivedAge.txt": shutil.copy,
    "DerivedBidiClass.txt": shutil.copy,
    "DerivedJoiningGroup.txt": shutil.copy,
    "DerivedJoiningType.txt": shutil.copy,
    "DerivedNumericValues.txt": shutil.copy,
    "GraphemeBreakTest.txt": (shutil.copy, "testdata"),
    "LineBreakTest.txt": (shutil.copy, "testdata"),
    "NameAliases.txt": shutil.copy,
    "NormalizationCorrections.txt": shutil.copy,
    "PropertyAliases.txt": shutil.copy,
    "PropertyValueAliases.txt": shutil.copy,
    "SentenceBreakTest.txt": (shutil.copy, "testdata"),
    "ScriptExtensions.txt": shutil.copy,
    "SpecialCasing.txt": shutil.copy,
    "UnicodeData.txt": shutil.copy,
    "WordBreakTest.txt": (shutil.copy, "testdata"),

    # Copy these files and remove comments behind data lines but not in others.
    "DerivedCoreProperties.txt": CopyAndStrip,
    "DerivedNormalizationProps.txt": CopyAndStrip,
    "GraphemeBreakProperty.txt": CopyAndStrip,
    "NormalizationTest.txt": CopyAndStrip,
    "PropList.txt": CopyAndStrip,
    "Scripts.txt": CopyAndStrip,
    "SentenceBreakProperty.txt": CopyAndStrip,
    "WordBreakProperty.txt": CopyAndStrip,

    # Also merge lines with adjacent code point ranges.
    "EastAsianWidth.txt": CopyAndStripAndMerge,
    "LineBreak.txt": CopyAndStripAndMerge
}

# Matches file names with a version suffix such as "Name-5.2.0d12.txt";
# group 1 is the plain name and group 2 the extension. Version components
# may have several digits (e.g. "Name-13.0.0.txt").
_file_version_re = re.compile("^([a-zA-Z0-9]+)" +
                              "-[0-9]+(?:\\.[0-9]+)*(?:d[0-9]+)?" +
                              "(\\.[a-z]+)$")


def main():
    """Copies the known UCD files from the source tree to the destination.

    Reads the source and destination root folders from sys.argv[1] and
    sys.argv[2]. Every file below the source root whose name (after removing
    a version suffix) is listed in _files is processed with its action and
    written to the "unidata" subfolder of the destination root, or to the
    subfolder named in its _files entry. Exits with status 1 if the
    parameters are missing or if two source files map to the same name.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: %s source_root dest_root" % sys.argv[0])
    source_root = sys.argv[1]
    dest_root = sys.argv[2]
    # Collect all paths first so that files written during processing are
    # never picked up by the walk.
    source_files = []
    for root, dirs, files in os.walk(source_root):
        for name in files:
            source_files.append(os.path.join(root, name))
    files_processed = set()
    for source_file in source_files:
        basename = os.path.basename(source_file)
        match = _file_version_re.match(basename)
        if match:
            # Drop the version suffix, keep the plain name and extension.
            basename = match.group(1) + match.group(2)
            print(basename)
        if basename in _files:
            if basename in files_processed:
                print("duplicate file basename %s!" % basename)
                sys.exit(1)
            files_processed.add(basename)
            action = _files[basename]
            if isinstance(action, tuple):
                dest_folder = action[1]
                action = action[0]
            else:
                dest_folder = "unidata"
            dest_path = os.path.join(dest_root, dest_folder)
            if not os.path.exists(dest_path):
                os.makedirs(dest_path)
            dest_file = os.path.join(dest_path, basename)
            action(source_file, dest_file)


if __name__ == "__main__":
    main()