12e5b6d6dSopenharmony_ci#!/usr/bin/python3 -B
22e5b6d6dSopenharmony_ci# -*- coding: utf-8 -*-
32e5b6d6dSopenharmony_ci#
42e5b6d6dSopenharmony_ci# Copyright (C) 2017 and later: Unicode, Inc. and others.
52e5b6d6dSopenharmony_ci# License & terms of use: http://www.unicode.org/copyright.html
62e5b6d6dSopenharmony_ci#
72e5b6d6dSopenharmony_ci# Copyright (c) 2013-2016 International Business Machines
82e5b6d6dSopenharmony_ci# Corporation and others. All Rights Reserved.
92e5b6d6dSopenharmony_ci#
102e5b6d6dSopenharmony_ci# parsescriptmetadata.py
112e5b6d6dSopenharmony_ci#
122e5b6d6dSopenharmony_ci# 2013feb15 Markus W. Scherer
132e5b6d6dSopenharmony_ci#
142e5b6d6dSopenharmony_ci# ./parsescriptmetadata.py
152e5b6d6dSopenharmony_ci#   ~/svn.icu/trunk/src/source/common/unicode/uscript.h
162e5b6d6dSopenharmony_ci#   ~/svn.cldr/trunk/common/properties/scriptMetadata.txt
172e5b6d6dSopenharmony_ci
182e5b6d6dSopenharmony_ci"""Parses ICU4C uscript.h & CLDR scriptMetadata.txt,
192e5b6d6dSopenharmony_ciand writes ICU script data initializers."""
202e5b6d6dSopenharmony_ci
212e5b6d6dSopenharmony_ciimport re
222e5b6d6dSopenharmony_ciimport sys
232e5b6d6dSopenharmony_ci
def main():
  """Parses uscript.h and scriptMetadata.txt, and prints ICU script data initializers.

  Command-line arguments (read from sys.argv):
    sys.argv[1]: path to ICU4C uscript.h
    sys.argv[2]: path to CLDR scriptMetadata.txt

  If fewer than two paths are given, prints a usage message to stderr and
  returns without doing anything else.

  Output (stdout): one C/C++ array initializer line per ICU script number
  from 0 up to the largest number found in uscript.h. Script numbers
  without a scriptMetadata.txt entry are printed as "0,".

  Raises:
    KeyError: if scriptMetadata.txt contains a script code for which no
      matching "= <number>, /* Xxxx */" line was found in uscript.h.
  """
  if len(sys.argv) < 3:
    # Usage goes to stderr so that it never gets mixed into redirected
    # initializer output.
    print("Usage: {}  path/to/ICU4C/uscript.h  "
          "path/to/CLDR/scriptMetadata.txt".format(sys.argv[0]),
          file=sys.stderr)
    return
  uscript_path, smd_path = sys.argv[1:3]

  iso_to_icu = {}
  max_icu_num = 0

  # Parse lines like
  #   USCRIPT_ARABIC       =  2,  /* Arab */
  # and extract the ICU numeric script code and the ISO script code.
  script_num_re = re.compile(r" *= *([0-9]+), */\* *([A-Z][a-z]{3}) *\*/")
  # Read as UTF-8 explicitly: the input is expected to be UTF-8 and may
  # contain non-ASCII characters, so do not depend on the locale's
  # default encoding.
  with open(uscript_path, "r", encoding="utf-8") as uscript_file:
    for line in uscript_file:
      line = line.strip()
      if not line: continue
      if line.startswith("#"): continue  # whole-line comment
      match = script_num_re.search(line)
      if match:
        icu_num = int(match.group(1))
        # If an ISO code occurs more than once, the last occurrence wins.
        iso_to_icu[match.group(2)] = icu_num
        max_icu_num = max(max_icu_num, icu_num)

  # One slot per ICU script number; None means "no metadata seen".
  icu_data = [None] * (max_icu_num + 1)

  # Parse lines like
  #   Arab; 8; 0628; SA; 1; RECOMMENDED; YES; NO; YES; NO; NO
  # and put the data (as strings) into the icu_data list.
  with open(smd_path, "r", encoding="utf-8") as smd_file:
    for line in smd_file:
      # Remove a trailing (or whole-line) "#" comment.
      line = line.partition("#")[0].strip()
      if not line: continue

      fields = [field.strip() for field in line.split(";")]
      # Skip lines that do not have all of the fields used below.
      if len(fields) < 11: continue
      iso_code = fields[0]
      # Deliberately a plain lookup: a script code that is unknown to
      # uscript.h raises KeyError rather than being dropped silently.
      icu_num = iso_to_icu[iso_code]
      icu_data[icu_num] = (iso_code,
          # sample, usage
          fields[2], fields[5],
          # RTL, LB, cased
          fields[6], fields[7], fields[10])

  # Print ICU array initializers with the relevant data.
  for t in icu_data:
    if t:
      (iso_code, sample, usage, rtl, lb, cased) = t
      # The sample field is emitted as a hex literal, followed by the
      # usage name and a flag name for each property that is "YES".
      s = "0x" + sample + " | " + usage
      if rtl == "YES": s += " | RTL"
      if lb == "YES": s += " | LB_LETTERS"
      if cased == "YES": s += " | CASED"
      print("    " + s + ",  // " + iso_code)
    else:
      print("    0,")
832e5b6d6dSopenharmony_ci
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
  main()
86