1425bb815Sopenharmony_ci#!/usr/bin/env python 2425bb815Sopenharmony_ci 3425bb815Sopenharmony_ci# Copyright JS Foundation and other contributors, http://js.foundation 4425bb815Sopenharmony_ci# 5425bb815Sopenharmony_ci# Licensed under the Apache License, Version 2.0 (the "License"); 6425bb815Sopenharmony_ci# you may not use this file except in compliance with the License. 7425bb815Sopenharmony_ci# You may obtain a copy of the License at 8425bb815Sopenharmony_ci# 9425bb815Sopenharmony_ci# http://www.apache.org/licenses/LICENSE-2.0 10425bb815Sopenharmony_ci# 11425bb815Sopenharmony_ci# Unless required by applicable law or agreed to in writing, software 12425bb815Sopenharmony_ci# distributed under the License is distributed on an "AS IS" BASIS 13425bb815Sopenharmony_ci# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14425bb815Sopenharmony_ci# See the License for the specific language governing permissions and 15425bb815Sopenharmony_ci# limitations under the License. 16425bb815Sopenharmony_ci 17425bb815Sopenharmony_cifrom __future__ import print_function 18425bb815Sopenharmony_ci 19425bb815Sopenharmony_ciimport argparse 20425bb815Sopenharmony_ciimport bisect 21425bb815Sopenharmony_ciimport csv 22425bb815Sopenharmony_ciimport itertools 23425bb815Sopenharmony_ciimport os 24425bb815Sopenharmony_ciimport warnings 25425bb815Sopenharmony_ci 26425bb815Sopenharmony_cifrom gen_c_source import LICENSE, format_code 27425bb815Sopenharmony_cifrom settings import PROJECT_DIR 28425bb815Sopenharmony_ci 29425bb815Sopenharmony_ci 30425bb815Sopenharmony_ciRANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h') 31425bb815Sopenharmony_ciCONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h') 32425bb815Sopenharmony_ci 33425bb815Sopenharmony_ci 34425bb815Sopenharmony_ci# common code generation 35425bb815Sopenharmony_ci 36425bb815Sopenharmony_ci 37425bb815Sopenharmony_ciclass UniCodeSource(object): 
class UniCodeSource(object):
    """
    Collect the header and the data tables of a generated C source file
    and write them to disk.
    """

    def __init__(self, filepath):
        """
        :param filepath: Path of the file written by generate().
        """
        self.__filepath = filepath
        self.__header = [LICENSE, ""]
        self.__data = []

    def complete_header(self, completion):
        """
        Append a text block to the header, followed by an empty line.

        :param completion: Text appended to the header.
        """
        self.__header.append(completion)
        self.__header.append("")  # for an extra empty line

    def add_table(self, table, table_name, table_type, table_descr):
        """
        Append a C array definition named lit_<table_name> to the data section.

        :param table: Values of the array; formatted with format_code().
        :param table_name: Name of the array without the "lit_" prefix.
        :param table_type: C element type of the array.
        :param table_descr: Comment text emitted right before the array.
        """
        self.__data.append(table_descr)
        self.__data.append("static const %s lit_%s[] JERRY_ATTR_CONST_DATA =" % (table_type, table_name))
        self.__data.append("{")
        self.__data.append(format_code(table, 1))
        self.__data.append("};")
        self.__data.append("")  # for an extra empty line

    def generate(self):
        """
        Write the header followed by the data section to the target file.
        """
        with open(self.__filepath, 'w') as generated_source:
            generated_source.write("\n".join(self.__header))
            generated_source.write("\n".join(self.__data))


class UnicodeCategorizer(object):
    """
    Sort the code points of a UnicodeData-style file into letters,
    non-letter identifier parts and separators.
    """

    def __init__(self):
        # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
        #                          Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
        # letter:                  Lu Ll Lt Lm Lo Nl
        # non-letter-indent-part:
        #   digit:                 Nd
        #   punctuation mark:      Mn Mc
        #   connector punctuation: Pc
        # separators:              Zs
        self._unicode_categories = {
            'letters_category': ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
            'non_letters_category': ["Nd", "Mn", "Mc", "Pc"],
            'separators_category': ["Zs"]
        }

        self._categories = {
            'letters': [],
            'non_letters': [],
            'separators': []
        }

    def _store_by_category(self, unicode_id, category):
        """
        Store the given unicode_id by its category

        :param unicode_id: Code point to store.
        :param category: General category abbreviation of the code point.
        """
        for target_category in self._categories:
            if category in self._unicode_categories[target_category + '_category']:
                self._categories[target_category].append(unicode_id)

    def read_categories(self, unicode_data_file):
        """
        Read the corresponding unicode values and store them in category lists.

        Only code points in the range [128, 0xFFFF] are considered. A pair of
        rows whose names end with ", First>" and ", Last>" is expanded to every
        code point between them; each code point is stored exactly once.

        :param unicode_data_file: Path of the ';' separated unicode data file.
        :return: List of letters, non_letter and separators.
        """

        # Start of a "<..., First>" range still waiting for its "<..., Last>" row
        range_start_id = None

        with open(unicode_data_file) as unicode_data:
            for line in csv.reader(unicode_data, delimiter=';'):
                unicode_id = int(line[0], 16)

                # Skip supplementary planes and ascii chars
                if unicode_id >= 0x10000 or unicode_id < 128:
                    continue

                name = line[1]
                category = line[2]

                if range_start_id is not None and name.endswith(', Last>'):
                    # The first code point was already stored when its row was read
                    first_id = range_start_id + 1
                else:
                    first_id = unicode_id

                # Only real range markers open a range; other bracketed names
                # (e.g. "<control>") are ordinary single code points.
                range_start_id = unicode_id if name.endswith(', First>') else None

                for code_point in range(first_id, unicode_id + 1):
                    self._store_by_category(code_point, category)

        # This separator char is handled separatly
        separators = self._categories['separators']
        non_breaking_space = 0x00A0
        if non_breaking_space in separators:
            separators.remove(non_breaking_space)

        # These separator chars are not in the unicode data file or not in Zs category
        mongolian_vowel_separator = 0x180E
        medium_mathematical_space = 0x205F
        zero_width_space = 0x200B

        for extra_separator in (mongolian_vowel_separator, medium_mathematical_space, zero_width_space):
            if extra_separator not in separators:
                bisect.insort(separators, extra_separator)

        # https://www.ecma-international.org/ecma-262/5.1/#sec-7.1 format-control characters
        non_letters = self._categories['non_letters']
        zero_width_non_joiner = 0x200C
        zero_width_joiner = 0x200D

        bisect.insort(non_letters, zero_width_non_joiner)
        bisect.insort(non_letters, zero_width_joiner)

        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
def group_ranges(i):
    """
    Collapse a list of integers into (first, last) pairs, one pair per run
    of consecutive values.

    :param i: Increasing list of integers.
    :return: Generator of (first, last) tuples.
    """
    run = []
    offset = None

    for position, value in enumerate(i):
        # Inside a run of consecutive values, value - position stays constant
        delta = value - position

        if run and delta != offset:
            yield run[0], run[-1]
            run = []

        offset = delta
        run.append(value)

    if run:
        yield run[0], run[-1]
def split_list(category_list):
    """
    Split list of ranges into intervals and single char lists.

    An interval is stored as its starting point and its length, where the
    length is (last - first), so it always fits into 8 bits. Ranges longer
    than that are cut into consecutive chunks of at most 256 code points.
    A range or trailing chunk that covers a single code point is stored in
    the single char list instead.

    :param category_list: List of (first, last) ranges with inclusive bounds.
    :return: List of interval starting points, interval lengths and single chars
    """

    interval_sps = []
    interval_lengths = []
    chars = []

    for element in category_list:
        first, last = element[0], element[1]

        # The upper bound is inclusive: stepping up to last + 1 makes sure that
        # the last code point is not lost when (last - first) is a multiple of 256.
        for chunk_start in range(first, last + 1, 256):
            length = min(last - chunk_start, 255)

            if length == 0:
                chars.append(chunk_start)
            else:
                interval_sps.append(chunk_start)
                interval_lengths.append(length)

    return interval_sps, interval_lengths, chars
def generate_ranges(script_args):
    """
    Generate the C source with the unicode letter, non-letter identifier part
    and separator tables and write it to RANGES_C_SOURCE.

    :param script_args: Parsed script arguments; its unicode_data attribute
                        is the path of the unicode data file.
    """
    code_points_by_kind = UnicodeCategorizer().read_categories(script_args.unicode_data)

    # Every entry is an (interval starting points, interval lengths, single chars) triple
    letter_tables, non_letter_tables, separator_tables = [
        split_list(list(group_ranges(code_points))) for code_points in code_points_by_kind
    ]

    c_source = UniCodeSource(RANGES_C_SOURCE)

    banner = "\n".join((
        "/* This file is automatically generated by the %s script" % os.path.basename(__file__),
        " * from %s. Do not edit! */" % os.path.basename(script_args.unicode_data),
        "",
    ))
    c_source.complete_header(banner)

    # (values, array name, C element type, lines of the comment emitted before the array)
    tables = [
        (letter_tables[0], "unicode_letter_interval_sps", "uint16_t",
         ["/**",
          " * Character interval starting points for the unicode letters.",
          " *",
          " * The characters covered by these intervals are from",
          " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl",
          " */"]),
        (letter_tables[1], "unicode_letter_interval_lengths", "uint8_t",
         ["/**",
          " * Character lengths for the unicode letters.",
          " *",
          " * The characters covered by these intervals are from",
          " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl",
          " */"]),
        (letter_tables[2], "unicode_letter_chars", "uint16_t",
         ["/**",
          " * Those unicode letter characters that are not inside any of",
          " * the intervals specified in lit_unicode_letter_interval_sps array.",
          " *",
          " * The characters are from the following Unicode categories:",
          " * Lu, Ll, Lt, Lm, Lo, Nl",
          " */"]),
        (non_letter_tables[0], "unicode_non_letter_ident_part_interval_sps", "uint16_t",
         ["/**",
          " * Character interval starting points for non-letter character",
          " * that can be used as a non-first character of an identifier.",
          " *",
          " * The characters covered by these intervals are from",
          " * the following Unicode categories: Nd, Mn, Mc, Pc",
          " */"]),
        (non_letter_tables[1], "unicode_non_letter_ident_part_interval_lengths", "uint8_t",
         ["/**",
          " * Character interval lengths for non-letter character",
          " * that can be used as a non-first character of an identifier.",
          " *",
          " * The characters covered by these intervals are from",
          " * the following Unicode categories: Nd, Mn, Mc, Pc",
          " */"]),
        (non_letter_tables[2], "unicode_non_letter_ident_part_chars", "uint16_t",
         ["/**",
          " * Those non-letter characters that can be used as a non-first",
          " * character of an identifier and not included in any of the intervals",
          " * specified in lit_unicode_non_letter_ident_part_interval_sps array.",
          " *",
          " * The characters are from the following Unicode categories:",
          " * Nd, Mn, Mc, Pc",
          " */"]),
        (separator_tables[0], "unicode_separator_char_interval_sps", "uint16_t",
         ["/**",
          " * Unicode separator character interval starting points from Unicode category: Zs",
          " */"]),
        (separator_tables[1], "unicode_separator_char_interval_lengths", "uint8_t",
         ["/**",
          " * Unicode separator character interval lengths from Unicode category: Zs",
          " */"]),
        (separator_tables[2], "unicode_separator_chars", "uint16_t",
         ["/**",
          " * Unicode separator characters that are not in the",
          " * lit_unicode_separator_char_intervals array.",
          " *",
          " * Unicode category: Zs",
          " */"]),
    ]

    for values, array_name, element_type, comment_lines in tables:
        c_source.add_table(values, array_name, element_type, "\n".join(comment_lines))

    c_source.generate()
def parse_unicode_sequence(raw_data):
    """
    Turn a space separated list of hexadecimal code points into a string.

    :param raw_data: Code points as hex values without 0x prefix, separated by spaces.
    :return: String built from the characters of the code points, in order.
    """

    # unichr only exists in Python 2, where chr cannot produce non-ascii characters
    try:
        to_character = unichr
    except NameError:
        to_character = chr

    return ''.join(to_character(int(hex_code, 16)) for hex_code in raw_data.split(' ') if hex_code)
def read_case_mappings(unicode_data_file, special_casing_file):
    """
    Collect the lower and upper case conversions of the code points in the
    range [128, 0xFFFF].

    :param unicode_data_file: ';' separated file with the one-to-one mappings
                              (field 12: upper case, field 13: lower case).
    :param special_casing_file: ';' separated file with the mappings that are not one-to-one
                                (field 1: lower case, field 3: upper case, field 4: conditions).
    :return: Lower case mappings and upper case mappings (code point -> mapped string).
    """

    def is_handled(code_point):
        # Supplementary planes and ascii chars are skipped
        return 128 <= code_point < 0x10000

    to_lower = {}
    to_upper = {}

    # One-to-one mappings
    with open(unicode_data_file) as unicode_data:
        for fields in csv.reader(unicode_data, delimiter=';'):
            code_point = int(fields[0], 16)

            if not is_handled(code_point):
                continue

            upper_field, lower_field = fields[12], fields[13]

            if upper_field:
                to_upper[code_point] = parse_unicode_sequence(upper_field)

            if lower_field:
                to_lower[code_point] = parse_unicode_sequence(lower_field)

    # Special cases override the one-to-one mappings
    with open(special_casing_file) as special_casing:
        for fields in csv.reader(special_casing, delimiter=';'):
            # Comment sections and empty lines carry no mapping
            if not fields or fields[0].startswith('#'):
                continue

            # A field with a '#' character holds a trailing comment: treat it as empty
            fields = ['' if '#' in field else field for field in fields]

            code_point = int(fields[0], 16)
            conditions = fields[4]

            # Conditional (context or language dependent) mappings are ignored
            if conditions or not is_handled(code_point):
                continue

            to_lower[code_point] = parse_unicode_sequence(fields[1])
            to_upper[code_point] = parse_unicode_sequence(fields[3])

    return to_lower, to_upper
def extract_ranges(letter_case, reverse_letter_case=None):
    """
    Move runs of consecutive code points that share the same conversion distance
    out of the case mapping table(s).

    :param letter_case: case mappings dictionary which contains the conversions;
                        the extracted entries are deleted from it.
    :param reverse_letter_case: Optional mapping table of the opposite direction. When given, only
                                bidirectional conversions are extracted and their counterparts
                                are deleted from this table as well.
    :return: A flat table of (start point, mapped value of the start point) pairs, and a table
             with the number of code points in each range.
    """

    def is_candidate(code_point):
        # One-way mode accepts any known mapping of at most one character,
        # two-way mode requires that the conversion can be reversed.
        if reverse_letter_case is None:
            return code_point in letter_case and len(letter_case[code_point]) <= 1
        return is_bidirectional_conversion(code_point, letter_case, reverse_letter_case)

    ranges = []
    range_lengths = []
    run_open = False

    for letter_id in sorted(letter_case):
        prev_letter_id = letter_id - 1

        continues_run = (is_candidate(letter_id)
                         and is_candidate(prev_letter_id)
                         and (calculate_conversion_distance(letter_case, letter_id)
                              == calculate_conversion_distance(letter_case, prev_letter_id)))

        if not continues_run:
            run_open = False
        elif run_open:
            range_lengths[-1] += 1
        else:
            run_open = True

            # A new range starts at the previous code point and already covers two code points
            ranges.extend([prev_letter_id, ord(letter_case[prev_letter_id])])
            range_lengths.append(2)

    # Drop every extracted conversion from the mapping table(s)
    for range_index, range_length in enumerate(range_lengths):
        start_point = ranges[2 * range_index]
        mapped_start_point = ranges[2 * range_index + 1]

        for offset in range(range_length):
            del letter_case[start_point + offset]
            if reverse_letter_case is not None:
                del reverse_letter_case[mapped_start_point + offset]

    return ranges, range_lengths
def extract_character_pair_ranges(letter_case, reverse_letter_case):
    """
    Move runs of character pairs out of the case mapping tables. A character pair is
    a bidirectional conversion between a code point and the code point right after it;
    a run consists of pairs that follow each other with a step of two.

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A table with the start points, and another table with the lengths of the ranges
             (two per extracted pair).
    """

    start_points = []
    lengths = []
    run_open = False

    for letter_id in sorted(letter_case):
        is_pair = (is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case)
                   and ord(letter_case[letter_id]) == letter_id + 1)

        if not is_pair:
            run_open = False
            continue

        # The run only goes on if the code point two steps back is reversible as well
        if run_open and is_bidirectional_conversion(letter_id - 2, letter_case, reverse_letter_case):
            lengths[-1] += 2
        else:
            start_points.append(letter_id)
            lengths.append(2)
            run_open = True

    # The tables are only modified after the scan is finished
    for start_point, run_length in zip(start_points, lengths):
        for offset in range(0, run_length, 2):
            del letter_case[start_point + offset]
            del reverse_letter_case[start_point + 1 + offset]

    return start_points, lengths
def extract_character_pairs(letter_case, reverse_letter_case):
    """
    Move every bidirectional conversion out of the case mapping tables.

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A flat table of (code point, mapped code point) pairs.
    """

    pairs = []

    # Iterate over a sorted copy of the keys, the table shrinks during the scan
    for code_point in sorted(letter_case):
        if not is_bidirectional_conversion(code_point, letter_case, reverse_letter_case):
            continue

        counterpart = ord(letter_case[code_point])
        pairs += [code_point, counterpart]

        # Both directions of the conversion leave the tables
        del letter_case[code_point]
        del reverse_letter_case[counterpart]

    return pairs
def extract_special_ranges(letter_case):
    """
    Extract special ranges. It contains start points of one-to-two letter case ranges
    where the second character is always the same.

    Two neighbouring code points belong to the same range when both map to two characters,
    the second mapped character is identical and the first mapped character has the same
    distance from its code point. Any code point that breaks one of these conditions
    closes the current range.

    :param letter_case: case mappings dictionary which contains the conversions. The first
                        range_length code points of every extracted range are deleted from it.

    :return: A table with the start points and their mapped values (three items per range), and
             a table with the lengths of the ranges (number of code points after the start point).
    """

    special_ranges = []
    special_range_lengths = []

    # Must be defined before the first matching pair is found and must be
    # cleared whenever the chain of neighbouring code points is broken.
    in_range = False

    for letter_id in sorted(letter_case):
        mapped_value = letter_case[letter_id]
        prev_letter_id = letter_id - 1

        # An empty string stands for a missing neighbour: it fails the length check below
        prev_mapped_value = letter_case.get(prev_letter_id, '')

        continues_range = (len(mapped_value) == 2
                           and len(prev_mapped_value) == 2
                           and prev_mapped_value[1] == mapped_value[1]
                           and (ord(prev_mapped_value[0]) - prev_letter_id) == (ord(mapped_value[0]) - letter_id))

        if not continues_range:
            in_range = False
            continue

        if in_range:
            special_range_lengths[-1] += 1
        else:
            in_range = True

            special_ranges.extend([prev_letter_id, ord(prev_mapped_value[0]), ord(prev_mapped_value[1])])
            special_range_lengths.append(1)

    # Remove special ranges from the conversion table
    for range_index, range_length in enumerate(special_range_lengths):
        start_point = special_ranges[range_index * 3]

        for incr in range(range_length):
            del letter_case[start_point + incr]

    return special_ranges, special_range_lengths
576425bb815Sopenharmony_ci range_length = special_range_lengths[idx // 3] 577425bb815Sopenharmony_ci letter_id = special_ranges[idx] 578425bb815Sopenharmony_ci 579425bb815Sopenharmony_ci for incr in range(range_length): 580425bb815Sopenharmony_ci del letter_case[special_ranges[idx] + incr] 581425bb815Sopenharmony_ci 582425bb815Sopenharmony_ci return special_ranges, special_range_lengths 583425bb815Sopenharmony_ci 584425bb815Sopenharmony_ci 585425bb815Sopenharmony_cidef extract_conversions(letter_case): 586425bb815Sopenharmony_ci """ 587425bb815Sopenharmony_ci Extract conversions. It provide the full (or remained) case mappings from the table. 588425bb815Sopenharmony_ci The counter table contains the information of how much one-to-one, one-to-two or one-to-three mappings 589425bb815Sopenharmony_ci exists successively in the conversion table. 590425bb815Sopenharmony_ci 591425bb815Sopenharmony_ci :return: A table with conversions, and a table with counters. 592425bb815Sopenharmony_ci """ 593425bb815Sopenharmony_ci 594425bb815Sopenharmony_ci unicodes = [[], [], []] 595425bb815Sopenharmony_ci unicode_lengths = [0, 0, 0] 596425bb815Sopenharmony_ci 597425bb815Sopenharmony_ci # 1 to 1 byte 598425bb815Sopenharmony_ci for letter_id in sorted(letter_case.keys()): 599425bb815Sopenharmony_ci mapped_value = letter_case[letter_id] 600425bb815Sopenharmony_ci 601425bb815Sopenharmony_ci if len(mapped_value) != 1: 602425bb815Sopenharmony_ci continue 603425bb815Sopenharmony_ci 604425bb815Sopenharmony_ci unicodes[0].extend([letter_id, ord(mapped_value)]) 605425bb815Sopenharmony_ci del letter_case[letter_id] 606425bb815Sopenharmony_ci 607425bb815Sopenharmony_ci # 1 to 2 bytes 608425bb815Sopenharmony_ci for letter_id in sorted(letter_case.keys()): 609425bb815Sopenharmony_ci mapped_value = letter_case[letter_id] 610425bb815Sopenharmony_ci 611425bb815Sopenharmony_ci if len(mapped_value) != 2: 612425bb815Sopenharmony_ci continue 613425bb815Sopenharmony_ci 614425bb815Sopenharmony_ci 
def is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
    """
    Check that a character and its mapped value are one-to-one mappings of each other.

    :param letter_id: An integer, representing the unicode code point of the character.
    :param letter_case: case mappings dictionary which contains the forward direction
                        of the conversion.
    :param reverse_letter_case: case mappings dictionary which possibly contains
                                the return direction of the conversion.
    :return: True, if it's a reversible conversion, False otherwise.
    """

    mapped_value = letter_case.get(letter_id)

    # Only a single mapped character can be converted back. Comparing with `!= 1` also
    # rejects an empty mapping, which `ord` below could not handle.
    if mapped_value is None or len(mapped_value) != 1:
        return False

    reverse_mapped_value = reverse_letter_case.get(ord(mapped_value))

    if reverse_mapped_value is None or len(reverse_mapped_value) != 1:
        return False

    # The conversion is bidirectional only if the way back ends at the starting character.
    return ord(reverse_mapped_value) == letter_id
def calculate_conversion_distance(letter_case, letter_id):
    """
    Calculate the distance between the unicode character and its mapped value
    (only needs and works with one-to-one mappings).

    :param letter_case: case mappings dictionary which contains the conversions.
    :param letter_id: An integer, representing the unicode code point of the character.
    :return: The conversion distance (mapped code point minus `letter_id`), or None if the
             character has no mapping or its mapping is not exactly one character long.
    """

    mapped_value = letter_case.get(letter_id)

    # `!= 1` (rather than `> 1`) keeps an empty mapping away from `ord`, which would raise.
    if mapped_value is None or len(mapped_value) != 1:
        return None

    return ord(mapped_value) - letter_id
def generate_conversions(script_args):
    """
    Build the case conversion tables and write them to CONVERSIONS_C_SOURCE.

    :param script_args: parsed command line arguments; the `unicode_data` and `special_casing`
                        attributes are the paths of the input files.
    """

    # Read the lower and upper case mappings of the characters into two dictionaries.
    mappings = read_case_mappings(script_args.unicode_data, script_args.special_casing)
    lower_case, upper_case = mappings[0], mappings[1]

    # Keep this extraction order: the extractors work on the shared dictionaries, and the ones
    # visible in this file delete the entries they consume, so later steps see only the remainder.
    case_ranges = extract_ranges(lower_case, upper_case)
    pair_ranges = extract_character_pair_ranges(lower_case, upper_case)
    pairs = extract_character_pairs(lower_case, upper_case)
    upper_special_ranges = extract_special_ranges(upper_case)
    lower_ranges = extract_ranges(lower_case)
    lower_conversions = extract_conversions(lower_case)
    upper_conversions = extract_conversions(upper_case)

    # Anything still left in the dictionaries was not covered by any of the tables.
    if lower_case:
        warnings.warn('Not all elements extracted from the lowercase table!')
    if upper_case:
        warnings.warn('Not all elements extracted from the uppercase table!')

    # Generate conversions output
    output = UniCodeSource(CONVERSIONS_C_SOURCE)

    script_name = os.path.basename(__file__)
    unicode_file = os.path.basename(script_args.unicode_data)
    spec_casing_file = os.path.basename(script_args.special_casing)

    header_lines = ("/* This file is automatically generated by the %s script" % script_name,
                    " * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
                    "")

    output.complete_header("\n".join(header_lines))

    # (table data, C identifier, C element type, leading comment), in emission order.
    tables = [
        (case_ranges[0], "character_case_ranges", "uint16_t",
         "/* Contains start points of character case ranges (these are bidirectional conversions). */"),
        (case_ranges[1], "character_case_range_lengths", "uint8_t",
         "/* Interval lengths of start points in `character_case_ranges` table. */"),
        (pair_ranges[0], "character_pair_ranges", "uint16_t",
         "/* Contains the start points of bidirectional conversion ranges. */"),
        (pair_ranges[1], "character_pair_range_lengths", "uint8_t",
         "/* Interval lengths of start points in `character_pair_ranges` table. */"),
        (pairs, "character_pairs", "uint16_t",
         "/* Contains lower/upper case bidirectional conversion pairs. */"),
        (upper_special_ranges[0], "upper_case_special_ranges", "uint16_t",
         "/* Contains start points of one-to-two uppercase ranges where the second character\n"
         " * is always the same.\n"
         " */"),
        (upper_special_ranges[1], "upper_case_special_range_lengths", "uint8_t",
         "/* Interval lengths for start points in `upper_case_special_ranges` table. */"),
        (lower_ranges[0], "lower_case_ranges", "uint16_t",
         "/* Contains start points of lowercase ranges. */"),
        (lower_ranges[1], "lower_case_range_lengths", "uint8_t",
         "/* Interval lengths for start points in `lower_case_ranges` table. */"),
        (lower_conversions[0], "lower_case_conversions", "uint16_t",
         "/* The remaining lowercase conversions. The lowercase variant can be one-to-three character long. */"),
        (lower_conversions[1], "lower_case_conversion_counters", "uint8_t",
         "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */"),
        (upper_conversions[0], "upper_case_conversions", "uint16_t",
         "/* The remaining uppercase conversions. The uppercase variant can be one-to-three character long. */"),
        (upper_conversions[1], "upper_case_conversion_counters", "uint8_t",
         "/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */"),
    ]

    for data, name, c_type, comment in tables:
        output.add_table(data, name, c_type, comment)

    output.generate()
# entry point


def main():
    """
    Parse the command line, check that both input files are readable regular files,
    then run the range and the conversion generators.
    """

    arg_parser = argparse.ArgumentParser(
        description='lit-unicode-{conversions,ranges}.inc.h generator',
        epilog='''
            The input files (UnicodeData.txt, SpecialCasing.txt)
            must be retrieved from
            http://www.unicode.org/Public/<VERSION>/ucd/.
            The last known good version is 13.0.0.
            ''')

    # Both inputs are mandatory and are declared the same way.
    for option, description in (('--unicode-data', 'specify the unicode data file'),
                                ('--special-casing', 'specify the special casing file')):
        arg_parser.add_argument(option, metavar='FILE', action='store', required=True, help=description)

    script_args = arg_parser.parse_args()

    # `error` terminates the script, so only the first unusable file is reported.
    for input_file in (script_args.unicode_data, script_args.special_casing):
        usable = os.path.isfile(input_file) and os.access(input_file, os.R_OK)
        if not usable:
            arg_parser.error('The %s file is missing or not readable!' % input_file)

    generate_ranges(script_args)
    generate_conversions(script_args)


if __name__ == "__main__":
    main()