#!/usr/bin/python3 -B
# -*- coding: utf-8 -*-
#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2013-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# parsescriptmetadata.py
#
# 2013feb15 Markus W. Scherer
#
# ./parsescriptmetadata.py
#   ~/svn.icu/trunk/src/source/common/unicode/uscript.h
#   ~/svn.cldr/trunk/common/properties/scriptMetadata.txt

"""Parses ICU4C uscript.h & CLDR scriptMetadata.txt,
and writes ICU script data initializers."""

import re
import sys

# Matches the tail of a uscript.h enum constant such as
#   USCRIPT_ARABIC = 2, /* Arab */
# capturing the ICU numeric script code (group 1) and the four-letter
# ISO script code from the trailing comment (group 2).
_SCRIPT_NUM_RE = re.compile(r" *= *([0-9]+), */\* *([A-Z][a-z]{3}) *\*/")

# scriptMetadata.txt data lines must have at least this many ";"-separated
# fields, because the last field read is fields[10].
_MIN_METADATA_FIELDS = 11


def _read_iso_to_icu(uscript_path):
    """Parse uscript.h and map ISO script codes to ICU numeric codes.

    Args:
        uscript_path: Path to ICU4C's uscript.h.

    Returns:
        A tuple (iso_to_icu, max_icu_num): a dict from ISO script code
        (e.g. "Arab") to its ICU number, and the largest ICU number seen
        on any matching line (0 if no line matched).
    """
    iso_to_icu = {}
    max_icu_num = 0
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(uscript_path, "r", encoding="utf-8") as uscript_file:
        for line in uscript_file:
            line = line.strip()
            # Skip blank lines and lines starting with "#" (in a C header
            # those are preprocessor directives, never enum constants).
            if not line or line.startswith("#"):
                continue
            match = _SCRIPT_NUM_RE.search(line)
            if match:
                icu_num = int(match.group(1))
                iso_to_icu[match.group(2)] = icu_num
                max_icu_num = max(max_icu_num, icu_num)
    return iso_to_icu, max_icu_num


def _read_script_metadata(smd_path, iso_to_icu, max_icu_num):
    """Parse scriptMetadata.txt into a list indexed by ICU script number.

    Data lines look like
        Arab; 8; 0628; SA; 1; RECOMMENDED; YES; NO; YES; NO; NO
    Text from "#" to the end of a line is ignored, as are blank lines and
    lines with fewer than 11 fields.

    Args:
        smd_path: Path to CLDR's scriptMetadata.txt.
        iso_to_icu: Dict from ISO script code to ICU script number.
        max_icu_num: Largest ICU script number; sizes the result list.

    Returns:
        A list of length max_icu_num + 1. Each slot is either None (no
        metadata for that ICU number) or a tuple of strings
        (iso_code, sample, usage, rtl, lb, cased).

    Raises:
        KeyError: If a data line names an ISO script code that is not a
            key of iso_to_icu.
    """
    icu_data = [None] * (max_icu_num + 1)
    with open(smd_path, "r", encoding="utf-8") as smd_file:
        for line in smd_file:
            # Drop a trailing comment, then surrounding whitespace.
            line = line.partition("#")[0].strip()
            if not line:
                continue
            fields = [field.strip() for field in line.split(";")]
            if len(fields) < _MIN_METADATA_FIELDS:
                continue
            iso_code = fields[0]
            try:
                icu_num = iso_to_icu[iso_code]
            except KeyError:
                # Same exception type as a plain dict lookup, but say which
                # input is out of sync instead of only echoing the key.
                raise KeyError(
                    "script code {!r} in {} has no matching constant "
                    "in uscript.h".format(iso_code, smd_path)) from None
            icu_data[icu_num] = (
                iso_code,
                # sample, usage
                fields[2], fields[5],
                # RTL, LB, cased
                fields[6], fields[7], fields[10])
    return icu_data


def _format_initializer(entry):
    """Return one ICU array initializer line for an icu_data slot.

    Args:
        entry: None, or a tuple (iso_code, sample, usage, rtl, lb, cased)
            of strings.

    Returns:
        The initializer text: a "0," placeholder for an empty slot,
        otherwise the sample code point OR-ed with the usage and any of
        the RTL / LB_LETTERS / CASED flags whose field is "YES", followed
        by the ISO code as a trailing comment.
    """
    # NOTE(review): the leading indent and the spacing before "//" are
    # copied from the listing this file was recovered from, which appears
    # to have collapsed runs of whitespace -- confirm against the layout
    # expected in the generated C++ table.
    if not entry:
        return " 0,"
    iso_code, sample, usage, rtl, lb, cased = entry
    parts = ["0x" + sample, usage]
    if rtl == "YES":
        parts.append("RTL")
    if lb == "YES":
        parts.append("LB_LETTERS")
    if cased == "YES":
        parts.append("CASED")
    return " " + " | ".join(parts) + ", // " + iso_code


def main():
    """Read the two paths from sys.argv and print the initializers.

    Expects sys.argv[1] to be the path to uscript.h and sys.argv[2] the
    path to scriptMetadata.txt. With fewer arguments, prints a usage
    message and returns without doing anything else. Output goes to
    standard output, one line per ICU script number from 0 up to the
    largest number found in uscript.h.
    """
    if len(sys.argv) < 3:
        print("Usage: {} path/to/ICU4C/uscript.h "
              "path/to/CLDR/scriptMetadata.txt".format(sys.argv[0]))
        return
    uscript_path, smd_path = sys.argv[1:3]

    iso_to_icu, max_icu_num = _read_iso_to_icu(uscript_path)
    icu_data = _read_script_metadata(smd_path, iso_to_icu, max_icu_num)

    # Print ICU array initializers with the relevant data.
    for entry in icu_data:
        print(_format_initializer(entry))


if __name__ == "__main__":
    main()