1425bb815Sopenharmony_ci#!/usr/bin/env python
2425bb815Sopenharmony_ci
3425bb815Sopenharmony_ci# Copyright JS Foundation and other contributors, http://js.foundation
4425bb815Sopenharmony_ci#
5425bb815Sopenharmony_ci# Licensed under the Apache License, Version 2.0 (the "License");
6425bb815Sopenharmony_ci# you may not use this file except in compliance with the License.
7425bb815Sopenharmony_ci# You may obtain a copy of the License at
8425bb815Sopenharmony_ci#
9425bb815Sopenharmony_ci#     http://www.apache.org/licenses/LICENSE-2.0
10425bb815Sopenharmony_ci#
11425bb815Sopenharmony_ci# Unless required by applicable law or agreed to in writing, software
12425bb815Sopenharmony_ci# distributed under the License is distributed on an "AS IS" BASIS
13425bb815Sopenharmony_ci# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14425bb815Sopenharmony_ci# See the License for the specific language governing permissions and
15425bb815Sopenharmony_ci# limitations under the License.
16425bb815Sopenharmony_ci
17425bb815Sopenharmony_cifrom __future__ import print_function
18425bb815Sopenharmony_ci
19425bb815Sopenharmony_ciimport argparse
20425bb815Sopenharmony_ciimport bisect
21425bb815Sopenharmony_ciimport csv
22425bb815Sopenharmony_ciimport itertools
23425bb815Sopenharmony_ciimport os
24425bb815Sopenharmony_ciimport warnings
25425bb815Sopenharmony_ci
26425bb815Sopenharmony_cifrom gen_c_source import LICENSE, format_code
27425bb815Sopenharmony_cifrom settings import PROJECT_DIR
28425bb815Sopenharmony_ci
29425bb815Sopenharmony_ci
# Output path of the generated C include that receives the unicode range
# tables (the file written by generate_ranges).
RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
# Output path of a second generated C include; presumably the case conversion
# tables - its use is not visible in this part of the file, verify before relying on it.
CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h')
32425bb815Sopenharmony_ci
33425bb815Sopenharmony_ci
34425bb815Sopenharmony_ci# common code generation
35425bb815Sopenharmony_ci
36425bb815Sopenharmony_ci
class UniCodeSource(object):
    """
    Collect the pieces of a generated C source file and write them out.

    The file consists of a header part (license text plus any completions)
    followed by a data part (the table definitions).
    """

    def __init__(self, filepath):
        """
        :param filepath: Path of the file written by generate().
        """
        self.__filepath = filepath
        # The header always starts with the license followed by an empty line
        self.__header = [LICENSE, ""]
        self.__data = []

    def complete_header(self, completion):
        """
        Append extra text to the header, followed by an empty line.

        :param completion: Text to add after the current header content.
        """
        self.__header += [completion, ""]

    def add_table(self, table, table_name, table_type, table_descr):
        """
        Append the definition of one constant C array to the data part.

        :param table: Values of the array, rendered by format_code.
        :param table_name: Name of the array without the 'lit_' prefix.
        :param table_type: C type of the array elements.
        :param table_descr: Comment text placed above the definition.
        """
        declaration = "static const %s lit_%s[] JERRY_ATTR_CONST_DATA =" % (table_type, table_name)
        self.__data += [table_descr, declaration, "{"]
        # The trailing empty string produces an empty line after the table
        self.__data += [format_code(table, 1), "};", ""]

    def generate(self):
        """
        Write the header part and then the data part to the target file.
        """
        with open(self.__filepath, 'w') as generated_source:
            for section in (self.__header, self.__data):
                generated_source.write("\n".join(section))
59425bb815Sopenharmony_ci
class UnicodeCategorizer(object):
    """
    Sort the non-ASCII code points of the Basic Multilingual Plane found in a
    UnicodeData.txt style file into 'letters', 'non_letters' and 'separators'
    lists according to their general category.
    """

    def __init__(self):
        # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
        #                          Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
        # letter:                  Lu Ll Lt Lm Lo Nl
        # non-letter-ident-part:
        #   digit:                 Nd
        #   combining mark:        Mn Mc
        #   connector punctuation: Pc
        # separators:              Zs
        self._unicode_categories = {
            'letters_category': ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
            'non_letters_category': ["Nd", "Mn", "Mc", "Pc"],
            'separators_category': ["Zs"]
        }

        # Collected code points; the keys match the '<key>_category' entries above
        self._categories = {
            'letters': [],
            'non_letters': [],
            'separators': []
        }

    def _store_by_category(self, unicode_id, category):
        """
        Store the given unicode_id by its category

        :param unicode_id: Code point to store.
        :param category: General category abbreviation of the code point.
        """
        for target_category in self._categories:
            if category in self._unicode_categories[target_category + '_category']:
                self._categories[target_category].append(unicode_id)

    def read_categories(self, unicode_data_file):
        """
        Read the corresponding unicode values and store them in category lists.

        Code point ranges are given in the data file as a pair of lines named
        '<..., First>' and '<..., Last>'; every code point of such a range is
        stored exactly once with the category of the range.

        :param unicode_data_file: Path of the ';' separated unicode data file.
        :return: List of letters, non_letter and separators.
        """

        # Start of a range whose '<..., Last>' line has not been seen yet
        range_start_id = None

        with open(unicode_data_file) as unicode_data:
            for line in csv.reader(unicode_data, delimiter=';'):
                if not line:
                    continue

                unicode_id = int(line[0], 16)

                # Skip supplementary planes and ascii chars
                if unicode_id >= 0x10000 or unicode_id < 128:
                    continue

                name = line[1]
                category = line[2]

                if range_start_id is not None:
                    # This line closes the range. The start point was stored
                    # when the range was opened, so continue right after it.
                    for ranged_id in range(range_start_id + 1, unicode_id + 1):
                        self._store_by_category(ranged_id, category)
                    range_start_id = None
                    continue

                # Only '<..., First>' opens a range; other bracketed names
                # (e.g. '<control>') describe a single code point.
                if name.startswith('<') and name.endswith(', First>'):
                    range_start_id = unicode_id

                self._store_by_category(unicode_id, category)

        # This separator char is handled separately
        separators = self._categories['separators']
        non_breaking_space = 0x00A0
        if non_breaking_space in separators:
            separators.remove(non_breaking_space)

        # These separator chars are not in the unicode data file or not in Zs category
        mongolian_vowel_separator = 0x180E
        medium_mathematical_space = 0x205F
        zero_width_space = 0x200B

        for extra_separator in (mongolian_vowel_separator, medium_mathematical_space, zero_width_space):
            if extra_separator not in separators:
                bisect.insort(separators, extra_separator)

        # https://www.ecma-international.org/ecma-262/5.1/#sec-7.1 format-control characters
        non_letters = self._categories['non_letters']
        zero_width_non_joiner = 0x200C
        zero_width_joiner = 0x200D

        for format_control in (zero_width_non_joiner, zero_width_joiner):
            if format_control not in non_letters:
                bisect.insort(non_letters, format_control)

        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
149425bb815Sopenharmony_ci
150425bb815Sopenharmony_ci
def group_ranges(i):
    """
    Collapse runs of consecutive integers into (first, last) pairs.

    A new pair is started whenever a value is not exactly one greater than
    the value before it.

    :param i: Iterable of integers, expected in increasing order.
    :return: Generator of (first, last) tuples, one per run.
    """
    run_open = False
    run_first = run_last = None

    for value in i:
        if run_open and value == run_last + 1:
            # The value continues the current run
            run_last = value
            continue

        if run_open:
            yield run_first, run_last

        run_open = True
        run_first = run_last = value

    if run_open:
        yield run_first, run_last
160425bb815Sopenharmony_ci
161425bb815Sopenharmony_ci
def split_list(category_list):
    """
    Split list of ranges into intervals and single char lists.

    An interval is described by its starting point and its length, where the
    length is the distance between the last and the first code point (so an
    interval covers length + 1 code points). Lengths are limited to 255 to fit
    into one byte, therefore longer ranges are cut into several intervals of
    256 code points each. A remainder of a single code point is stored in the
    single char list, just like a range that consists of one code point.

    :param category_list: List of (first, last) code point pairs.
    :return: List of interval starting points, interval lengths and single chars
    """

    max_length = 255

    interval_sps = []
    interval_lengths = []
    chars = []

    for element in category_list:
        start, end = element[0], element[1]

        # Cut off full sized intervals while the rest does not fit into one
        while end - start > max_length:
            interval_sps.append(start)
            interval_lengths.append(max_length)
            start += max_length + 1

        if start == end:
            # Either a single char range, or the last code point of a long
            # range whose length is a multiple of 256
            chars.append(start)
        else:
            interval_sps.append(start)
            interval_lengths.append(end - start)

    return interval_sps, interval_lengths, chars
187425bb815Sopenharmony_ci
188425bb815Sopenharmony_ci
def generate_ranges(script_args):
    """
    Generate the C source file that contains the unicode range tables.

    The code points are read from script_args.unicode_data, grouped into
    ranges, split into interval and single char tables, and written to
    RANGES_C_SOURCE.

    :param script_args: Parsed arguments; the unicode_data attribute is the path of the unicode data file.
    """

    def doc_comment(*body):
        # Wrap the given lines into a C documentation comment
        return "\n".join(("/**",) + body + (" */",))

    letters, non_letters, separators = UnicodeCategorizer().read_categories(script_args.unicode_data)

    letter_sps, letter_lengths, letter_chars = split_list(list(group_ranges(letters)))
    non_letter_sps, non_letter_lengths, non_letter_chars = split_list(list(group_ranges(non_letters)))
    separator_sps, separator_lengths, separator_chars = split_list(list(group_ranges(separators)))

    c_source = UniCodeSource(RANGES_C_SOURCE)

    generated_by = "/* This file is automatically generated by the %s script" % os.path.basename(__file__)
    generated_from = " * from %s. Do not edit! */" % os.path.basename(script_args.unicode_data)
    c_source.complete_header("\n".join([generated_by, generated_from, ""]))

    # (values, table name, element type, description) in output order
    tables = [
        (letter_sps,
         "unicode_letter_interval_sps",
         "uint16_t",
         doc_comment(" * Character interval starting points for the unicode letters.",
                     " *",
                     " * The characters covered by these intervals are from",
                     " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl")),
        (letter_lengths,
         "unicode_letter_interval_lengths",
         "uint8_t",
         doc_comment(" * Character lengths for the unicode letters.",
                     " *",
                     " * The characters covered by these intervals are from",
                     " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl")),
        (letter_chars,
         "unicode_letter_chars",
         "uint16_t",
         doc_comment(" * Those unicode letter characters that are not inside any of",
                     " * the intervals specified in lit_unicode_letter_interval_sps array.",
                     " *",
                     " * The characters are from the following Unicode categories:",
                     " * Lu, Ll, Lt, Lm, Lo, Nl")),
        (non_letter_sps,
         "unicode_non_letter_ident_part_interval_sps",
         "uint16_t",
         doc_comment(" * Character interval starting points for non-letter character",
                     " * that can be used as a non-first character of an identifier.",
                     " *",
                     " * The characters covered by these intervals are from",
                     " * the following Unicode categories: Nd, Mn, Mc, Pc")),
        (non_letter_lengths,
         "unicode_non_letter_ident_part_interval_lengths",
         "uint8_t",
         doc_comment(" * Character interval lengths for non-letter character",
                     " * that can be used as a non-first character of an identifier.",
                     " *",
                     " * The characters covered by these intervals are from",
                     " * the following Unicode categories: Nd, Mn, Mc, Pc")),
        (non_letter_chars,
         "unicode_non_letter_ident_part_chars",
         "uint16_t",
         doc_comment(" * Those non-letter characters that can be used as a non-first",
                     " * character of an identifier and not included in any of the intervals",
                     " * specified in lit_unicode_non_letter_ident_part_interval_sps array.",
                     " *",
                     " * The characters are from the following Unicode categories:",
                     " * Nd, Mn, Mc, Pc")),
        (separator_sps,
         "unicode_separator_char_interval_sps",
         "uint16_t",
         doc_comment(" * Unicode separator character interval starting points from Unicode category: Zs")),
        (separator_lengths,
         "unicode_separator_char_interval_lengths",
         "uint8_t",
         doc_comment(" * Unicode separator character interval lengths from Unicode category: Zs")),
        (separator_chars,
         "unicode_separator_chars",
         "uint16_t",
         doc_comment(" * Unicode separator characters that are not in the",
                     " * lit_unicode_separator_char_intervals array.",
                     " *",
                     " * Unicode category: Zs")),
    ]

    for values, table_name, table_type, description in tables:
        c_source.add_table(values, table_name, table_type, description)

    c_source.generate()
295425bb815Sopenharmony_ci
296425bb815Sopenharmony_ci
297425bb815Sopenharmony_ci# functions for unicode conversions
298425bb815Sopenharmony_ci
299425bb815Sopenharmony_ci
def parse_unicode_sequence(raw_data):
    """
    Convert a space separated list of hexadecimal code points into a string.

    :param raw_data: Code points written in hexadecimal (without 0x prefix), separated by spaces.
    :return: String built from the characters of the code points, in the same order.
    """

    # unichr only exists in Python 2, where chr is limited to single bytes
    try:
        to_character = unichr
    except NameError:
        to_character = chr

    # Consecutive, leading or trailing spaces produce empty items, skip those
    code_points = (int(item, 16) for item in raw_data.split(' ') if item != '')

    return ''.join(to_character(code_point) for code_point in code_points)
322425bb815Sopenharmony_ci
323425bb815Sopenharmony_ci
def read_case_mappings(unicode_data_file, special_casing_file):
    """
    Build the lower and upper case mapping tables of the non-ASCII code points of the Basic Multilingual Plane.

    The one-to-one mappings are taken from the unicode data file first, then the unconditional entries
    of the special casing file are written over them.

    :param unicode_data_file: Contains the default case mappings (one-to-one mappings).
    :param special_casing_file: Contains additional informative case mappings that are either not one-to-one
                                or which are context-sensitive.
    :return: Lower and upper case mappings (dictionaries from code point to mapped string).
    """

    def is_out_of_scope(code_point):
        # Supplementary planes and ascii chars are not processed
        return code_point >= 0x10000 or code_point < 128

    to_lower = {}
    to_upper = {}

    # One-to-one mappings: field 12 is the upper case, field 13 is the lower case mapping
    with open(unicode_data_file) as unicode_data:
        for record in csv.reader(unicode_data, delimiter=';'):
            code_point = int(record[0], 16)

            if is_out_of_scope(code_point):
                continue

            upper_field, lower_field = record[12], record[13]

            if upper_field:
                to_upper[code_point] = parse_unicode_sequence(upper_field)

            if lower_field:
                to_lower[code_point] = parse_unicode_sequence(lower_field)

    # Special cases: field 1 is the lower case, field 3 is the upper case mapping, field 4 is the condition
    with open(special_casing_file) as special_casing:
        for record in csv.reader(special_casing, delimiter=';'):
            # Empty lines and comment lines carry no mapping
            if not record or record[0].startswith('#'):
                continue

            # A field that contains a '#' character is a trailing comment, treat it as empty
            fields = ['' if '#' in field else field for field in record]

            code_point = int(fields[0], 16)
            condition_list = fields[4]

            # Context-sensitive (conditional) mappings are ignored
            if is_out_of_scope(code_point) or condition_list:
                continue

            lower_value = parse_unicode_sequence(fields[1])
            upper_value = parse_unicode_sequence(fields[3])

            to_lower[code_point] = lower_value
            to_upper[code_point] = upper_value

    return to_lower, to_upper
385425bb815Sopenharmony_ci
386425bb815Sopenharmony_ci
def extract_ranges(letter_case, reverse_letter_case=None):
    """
    Extract ranges of neighbouring code points that have the same conversion distance
    (the second param is optional, if it is given, a range contains bidirectional conversions only).

    The extracted entries are removed from letter_case, and their mapped values are removed
    from reverse_letter_case when that table is given.

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A flat table with the start points and their mapped value, and another table with
             the lengths of the ranges.
    """

    ranges = []
    range_lengths = []
    extending = False

    for code_point in sorted(letter_case):
        previous = code_point - 1

        if reverse_letter_case is None:
            # One-way conversions: both neighbours must map to a single character
            usable = (len(letter_case[code_point]) <= 1
                      and previous in letter_case
                      and len(letter_case[previous]) <= 1)
        else:
            # Two way conversions: both neighbours must be bidirectional
            usable = (is_bidirectional_conversion(code_point, letter_case, reverse_letter_case)
                      and is_bidirectional_conversion(previous, letter_case, reverse_letter_case))

        if usable:
            # Neighbours belong to the same range only if their distances are equal
            usable = (calculate_conversion_distance(letter_case, code_point)
                      == calculate_conversion_distance(letter_case, previous))

        if not usable:
            extending = False
            continue

        if extending:
            range_lengths[-1] += 1
            continue

        # A new range is opened by the previous code point, so it starts with two elements.
        # Store the start point of the range and its mapped value.
        extending = True
        ranges.extend([previous, ord(letter_case[previous])])
        range_lengths.append(2)

    # Remove all ranges from the case mapping table.
    for position, range_length in enumerate(range_lengths):
        start_point = ranges[2 * position]
        mapped_start_point = ranges[2 * position + 1]

        for offset in range(range_length):
            del letter_case[start_point + offset]
            if reverse_letter_case is not None:
                del reverse_letter_case[mapped_start_point + offset]

    return ranges, range_lengths
453425bb815Sopenharmony_ci
454425bb815Sopenharmony_ci
def extract_character_pair_ranges(letter_case, reverse_letter_case):
    """
    Extract runs of character pairs from the case mapping tables.

    A character pair is a bidirectional conversion where the mapped value is the code point
    right after the character. Pairs that follow each other in steps of two form one range.
    The extracted pairs are removed from both tables.

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A table with the start points, and another table with the lengths of the ranges.
    """

    start_points = []
    lengths = []
    extending = False

    for code_point in sorted(letter_case):
        # Only extract character pairs
        if not is_bidirectional_conversion(code_point, letter_case, reverse_letter_case):
            extending = False
            continue

        if ord(letter_case[code_point]) != code_point + 1:
            extending = False
            continue

        # The range can only go on if the pair two code points earlier is usable as well
        if not is_bidirectional_conversion(code_point - 2, letter_case, reverse_letter_case):
            extending = False

        if extending:
            lengths[-1] += 2
        else:
            start_points.append(code_point)
            lengths.append(2)
            extending = True

    # Remove all found case mapping from the conversion tables after the scanning method
    for start_point, conv_length in zip(start_points, lengths):
        for offset in range(0, conv_length, 2):
            del letter_case[start_point + offset]
            del reverse_letter_case[start_point + 1 + offset]

    return start_points, lengths
501425bb815Sopenharmony_ci
502425bb815Sopenharmony_ci
def extract_character_pairs(letter_case, reverse_letter_case):
    """
    Extract the remaining bidirectional conversions as character pairs.

    Every extracted character is removed from letter_case and its mapped value is removed
    from reverse_letter_case.

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A flat table with the characters and their mapped values.
    """

    character_pairs = []

    # Iterate over a sorted copy of the keys, because entries are deleted during the scan
    for code_point in sorted(letter_case):
        if not is_bidirectional_conversion(code_point, letter_case, reverse_letter_case):
            continue

        counterpart = ord(letter_case[code_point])
        character_pairs += [code_point, counterpart]

        # Remove character pairs from case mapping tables
        del letter_case[code_point]
        del reverse_letter_case[counterpart]

    return character_pairs
524425bb815Sopenharmony_ci
525425bb815Sopenharmony_ci
def extract_special_ranges(letter_case):
    """
    Extract special ranges. These are runs of consecutive code points with one-to-two mappings
    where the second mapped character is always the same and the first mapped character keeps
    a constant distance from the source code point.

    For each range the first `length` code points are removed from `letter_case` in place.

    :param letter_case: case mappings dictionary which contains the conversions.

    :return: A flat table with (start point, first mapped value, second mapped value) triplets,
             and a table with the lengths of the ranges. A length starts at 1 when two neighbouring
             code points form a range and grows by one for every further code point.
    """

    special_ranges = []
    special_range_lengths = []

    # True only while the previous code point (letter_id - 1) belongs to the last opened range.
    # It must start as False, and it must be cleared whenever the run is interrupted; otherwise
    # an unrelated later run would be appended to a stale range.
    in_range = False

    for letter_id in sorted(letter_case):
        mapped_value = letter_case[letter_id]

        if len(mapped_value) != 2:
            in_range = False
            continue

        prev_letter_id = letter_id - 1
        continues_run = False

        if prev_letter_id in letter_case:
            prev_mapped_value = letter_case[prev_letter_id]
            continues_run = (len(prev_mapped_value) == 2
                             and prev_mapped_value[1] == mapped_value[1]
                             and ord(prev_mapped_value[0]) - prev_letter_id == ord(mapped_value[0]) - letter_id)

        if not continues_run:
            in_range = False
            continue

        if in_range:
            # The current range is always the most recently appended one
            special_range_lengths[-1] += 1
        else:
            in_range = True
            special_ranges.extend([prev_letter_id, ord(prev_mapped_value[0]), ord(prev_mapped_value[1])])
            special_range_lengths.append(1)

    # Remove special ranges from the conversion table
    for idx, range_length in enumerate(special_range_lengths):
        start_point = special_ranges[idx * 3]

        for incr in range(range_length):
            del letter_case[start_point + incr]

    return special_ranges, special_range_lengths
583425bb815Sopenharmony_ci
584425bb815Sopenharmony_ci
def extract_conversions(letter_case):
    """
    Extract the remaining one-to-one, one-to-two and one-to-three case mappings from the table.

    Every extracted entry is removed from `letter_case` in place; entries whose mapping has
    any other length are left untouched.

    :param letter_case: case mappings dictionary which contains the conversions.
    :return: A flat table with the conversions (all one-to-one entries first, then the one-to-two,
             then the one-to-three ones, each group ordered by code point), and a table with
             the number of entries in each of the three groups.
    """

    # One bucket per supported mapping length (1, 2 and 3 characters)
    buckets = ([], [], [])

    for code_point in sorted(letter_case):
        mapping = letter_case[code_point]
        size = len(mapping)

        if size not in (1, 2, 3):
            continue

        # An entry is the source code point followed by the code points of its mapping
        bucket = buckets[size - 1]
        bucket.append(code_point)
        bucket.extend(ord(char) for char in mapping)
        del letter_case[code_point]

    # An entry of the N-character bucket occupies N + 1 slots
    counters = [len(bucket) // (size + 1) for size, bucket in enumerate(buckets, 1)]

    return list(itertools.chain.from_iterable(buckets)), counters
630425bb815Sopenharmony_ci
631425bb815Sopenharmony_ci
def is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
    """
    Check that a character and its mapped value are one-to-one mappings of each other.

    :param letter_id: An integer, representing the unicode code point of the character.
    :param letter_case: case mappings dictionary which contains the conversion of the character.
    :param reverse_letter_case: case mappings dictionary which possibly contains
                                the return direction of the conversion.
    :return: True, if it's a reversible conversion, False otherwise.
    """

    if letter_id not in letter_case:
        return False

    # Only one-to-one mappings can be reversible
    forward = letter_case[letter_id]
    if len(forward) > 1:
        return False

    target_id = ord(forward)
    if target_id not in reverse_letter_case:
        return False

    # The way back must be a one-to-one mapping too
    backward = reverse_letter_case[target_id]
    if len(backward) > 1:
        return False

    # Reversible only if the way back ends at the starting character
    return ord(backward) == letter_id
660425bb815Sopenharmony_ci
661425bb815Sopenharmony_ci
def calculate_conversion_distance(letter_case, letter_id):
    """
    Compute how far the mapped value of a character lies from the character itself
    (defined for one-to-one mappings only).

    :param letter_case: case mappings dictionary which contains the conversions.
    :param letter_id: An integer, representing the unicode code point of the character.
    :return: The signed difference between the mapped code point and `letter_id`, or None
             if the character has no mapping or maps to more than one character.
    """

    if letter_id in letter_case:
        mapped_value = letter_case[letter_id]

        if len(mapped_value) <= 1:
            return ord(mapped_value) - letter_id

    return None
676425bb815Sopenharmony_ci
677425bb815Sopenharmony_ci
def generate_conversions(script_args):
    """
    Build the case conversion tables and emit them through a UniCodeSource
    created for CONVERSIONS_C_SOURCE.

    :param script_args: parsed command line arguments; the `unicode_data` and
                        `special_casing` attributes (input file paths) are used.
    """

    # Read the corresponding unicode values of lower and upper case letters and store these in tables
    mappings = read_case_mappings(script_args.unicode_data, script_args.special_casing)
    lower_case, upper_case = mappings[0], mappings[1]

    # The order of the extraction steps is significant: the extractors defined in this file
    # delete what they extract from the tables, so a later step only sees the remainder.
    character_case_ranges = extract_ranges(lower_case, upper_case)
    character_pair_ranges = extract_character_pair_ranges(lower_case, upper_case)
    character_pairs = extract_character_pairs(lower_case, upper_case)
    upper_case_special_ranges = extract_special_ranges(upper_case)
    lower_case_ranges = extract_ranges(lower_case)
    lower_case_conversions = extract_conversions(lower_case)
    upper_case_conversions = extract_conversions(upper_case)

    # Anything still present in the tables was not covered by any extraction step
    for leftover, case_name in ((lower_case, 'lowercase'), (upper_case, 'uppercase')):
        if leftover:
            warnings.warn('Not all elements extracted from the %s table!' % case_name)

    # (data, table name, C element type, leading comment) for each table, in output order
    tables = [
        (character_case_ranges[0], "character_case_ranges", "uint16_t",
         ("/* Contains start points of character case ranges "
          "(these are bidirectional conversions). */")),
        (character_case_ranges[1], "character_case_range_lengths", "uint8_t",
         "/* Interval lengths of start points in `character_case_ranges` table. */"),
        (character_pair_ranges[0], "character_pair_ranges", "uint16_t",
         "/* Contains the start points of bidirectional conversion ranges. */"),
        (character_pair_ranges[1], "character_pair_range_lengths", "uint8_t",
         "/* Interval lengths of start points in `character_pair_ranges` table. */"),
        (character_pairs, "character_pairs", "uint16_t",
         "/* Contains lower/upper case bidirectional conversion pairs. */"),
        (upper_case_special_ranges[0], "upper_case_special_ranges", "uint16_t",
         ("/* Contains start points of one-to-two uppercase ranges where the second character\n"
          " * is always the same.\n"
          " */")),
        (upper_case_special_ranges[1], "upper_case_special_range_lengths", "uint8_t",
         "/* Interval lengths for start points in `upper_case_special_ranges` table. */"),
        (lower_case_ranges[0], "lower_case_ranges", "uint16_t",
         "/* Contains start points of lowercase ranges. */"),
        (lower_case_ranges[1], "lower_case_range_lengths", "uint8_t",
         "/* Interval lengths for start points in `lower_case_ranges` table. */"),
        (lower_case_conversions[0], "lower_case_conversions", "uint16_t",
         ("/* The remaining lowercase conversions. The lowercase variant can "
          "be one-to-three character long. */")),
        (lower_case_conversions[1], "lower_case_conversion_counters", "uint8_t",
         "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */"),
        (upper_case_conversions[0], "upper_case_conversions", "uint16_t",
         ("/* The remaining uppercase conversions. The uppercase variant can "
          "be one-to-three character long. */")),
        (upper_case_conversions[1], "upper_case_conversion_counters", "uint8_t",
         "/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */"),
    ]

    # Generate conversions output
    c_source = UniCodeSource(CONVERSIONS_C_SOURCE)

    unicode_file = os.path.basename(script_args.unicode_data)
    spec_casing_file = os.path.basename(script_args.special_casing)

    header_completion = "\n".join([
        "/* This file is automatically generated by the %s script" % os.path.basename(__file__),
        " * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
        "",
    ])
    c_source.complete_header(header_completion)

    for table, table_name, table_type, table_descr in tables:
        c_source.add_table(table, table_name, table_type, table_descr)

    c_source.generate()
780425bb815Sopenharmony_ci
781425bb815Sopenharmony_ci
782425bb815Sopenharmony_ci# entry point
783425bb815Sopenharmony_ci
784425bb815Sopenharmony_ci
def main():
    """
    Parse the command line, check that both input files exist and are readable,
    then run the range and the conversion table generators.
    """

    parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}.inc.h generator',
                                     epilog='''
                                        The input files (UnicodeData.txt, SpecialCasing.txt)
                                        must be retrieved from
                                        http://www.unicode.org/Public/<VERSION>/ucd/.
                                        The last known good version is 13.0.0.
                                        ''')

    # Both inputs are mandatory file arguments that differ only in name and help text
    for option, description in (('--unicode-data', 'specify the unicode data file'),
                                ('--special-casing', 'specify the special casing file')):
        parser.add_argument(option, metavar='FILE', action='store', required=True, help=description)

    script_args = parser.parse_args()

    # parser.error() reports the problem and terminates the script, so the first bad file wins
    for input_file in (script_args.unicode_data, script_args.special_casing):
        if not (os.path.isfile(input_file) and os.access(input_file, os.R_OK)):
            parser.error('The %s file is missing or not readable!' % input_file)

    generate_ranges(script_args)
    generate_conversions(script_args)
809425bb815Sopenharmony_ci
810425bb815Sopenharmony_ci
# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
813