1#! /usr/bin/python
2
3#                   PCRE2 UNICODE PROPERTY SUPPORT
4#                   ------------------------------
5
# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a re-organization of these scripts in
# December 2021.
10
11
import re
import sys
13
14
15# ---------------------------------------------------------------------------
16#                             DATA LISTS
17# ---------------------------------------------------------------------------
18
19# BIDI classes in the DerivedBidiClass.txt file, with comments.
20
# The list is flat: each (class, comment) pair below contributes two
# consecutive items, the class abbreviation followed by its description.
bidi_classes = [
  item
  for pair in (
    ('AL',  'Arabic letter'),
    ('AN',  'Arabic number'),
    ('B',   'Paragraph separator'),
    ('BN',  'Boundary neutral'),
    ('CS',  'Common separator'),
    ('EN',  'European number'),
    ('ES',  'European separator'),
    ('ET',  'European terminator'),
    ('FSI', 'First strong isolate'),
    ('L',   'Left to right'),
    ('LRE', 'Left to right embedding'),
    ('LRI', 'Left to right isolate'),
    ('LRO', 'Left to right override'),
    ('NSM', 'Non-spacing mark'),
    ('ON',  'Other neutral'),
    ('PDF', 'Pop directional format'),
    ('PDI', 'Pop directional isolate'),
    ('R',   'Right to left'),
    ('RLE', 'Right to left embedding'),
    ('RLI', 'Right to left isolate'),
    ('RLO', 'Right to left override'),
    ('S',   'Segment separator'),
    ('WS',  'White space'),
  )
  for item in pair
]
46
47# Particular category property names, with comments. NOTE: If ever this list
48# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
49# must be edited to keep in step.
50
# The list is flat: each (category, comment) pair below contributes two
# consecutive items, the two-letter category followed by its description.
category_names = [
  item
  for pair in (
    ('Cc', 'Control'),
    ('Cf', 'Format'),
    ('Cn', 'Unassigned'),
    ('Co', 'Private use'),
    ('Cs', 'Surrogate'),
    ('Ll', 'Lower case letter'),
    ('Lm', 'Modifier letter'),
    ('Lo', 'Other letter'),
    ('Lt', 'Title case letter'),
    ('Lu', 'Upper case letter'),
    ('Mc', 'Spacing mark'),
    ('Me', 'Enclosing mark'),
    ('Mn', 'Non-spacing mark'),
    ('Nd', 'Decimal number'),
    ('Nl', 'Letter number'),
    ('No', 'Other number'),
    ('Pc', 'Connector punctuation'),
    ('Pd', 'Dash punctuation'),
    ('Pe', 'Close punctuation'),
    ('Pf', 'Final punctuation'),
    ('Pi', 'Initial punctuation'),
    ('Po', 'Other punctuation'),
    ('Ps', 'Open punctuation'),
    ('Sc', 'Currency symbol'),
    ('Sk', 'Modifier symbol'),
    ('Sm', 'Mathematical symbol'),
    ('So', 'Other symbol'),
    ('Zl', 'Line separator'),
    ('Zp', 'Paragraph separator'),
    ('Zs', 'Space separator'),
  )
  for item in pair
]
83
84# The Extended_Pictographic property is not found in the file where all the
85# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
86# file, but we list it here so that the name has the correct index value.
87
# The list is flat: each (name, comment) pair below contributes two
# consecutive items. The comment string starts with the property's index,
# right-aligned in two columns, optionally followed by a description.
break_properties = [
  item
  for pair in (
    ('CR',                    ' 0'),
    ('LF',                    ' 1'),
    ('Control',               ' 2'),
    ('Extend',                ' 3'),
    ('Prepend',               ' 4'),
    ('SpacingMark',           ' 5'),
    ('L',                     ' 6 Hangul syllable type L'),
    ('V',                     ' 7 Hangul syllable type V'),
    ('T',                     ' 8 Hangul syllable type T'),
    ('LV',                    ' 9 Hangul syllable type LV'),
    ('LVT',                   '10 Hangul syllable type LVT'),
    ('Regional_Indicator',    '11'),
    ('Other',                 '12'),
    ('ZWJ',                   '13'),
    ('Extended_Pictographic', '14'),
  )
  for item in pair
]
105
# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra property names to add.
109
# Data files (looked up under Unicode.tables/) that are scanned for names.
bool_propsfiles = [
  'PropList.txt',
  'DerivedCoreProperties.txt',
  'emoji-data.txt',
]

# A name is dropped when any of these patterns matches at its start.
bool_propsignore = [
  r'^Other_',
  r'^Hyphen$',
]

# Names appended to the list regardless of what the data files contain.
bool_propsextras = [
  'ASCII',
  'Bidi_Mirrored',
]
113
114
115# ---------------------------------------------------------------------------
116#                   GET BOOLEAN PROPERTY NAMES
117# ---------------------------------------------------------------------------
118
119# Get a list of Boolean property names from a number of files.
120
def getbpropslist():
  """Return the sorted list of Boolean property names.

  Every file named in bool_propsfiles is read from the Unicode.tables
  directory. On each line, anything from a '#' onwards is discarded and the
  second ';'-separated field is taken as a property name. A name equal to the
  one most recently seen is skipped, as is any name matched by one of the
  bool_propsignore patterns. The names in bool_propsextras are then added and
  the combined list is sorted.

  If a file cannot be opened, a message is printed and the process exits with
  status 1.
  """
  # Compile the ignore patterns once rather than for every candidate name.
  ignore_res = [re.compile(pat) for pat in bool_propsignore]

  bplist = []
  bplast = ""

  for filename in bool_propsfiles:
    path = 'Unicode.tables/' + filename
    try:
      # Unicode data files are distributed as UTF-8, so decode them explicitly
      # instead of relying on the locale's default encoding.
      file = open(path, 'r', encoding='utf-8')
    except IOError:
      print(f"** Couldn't open {path}\n")
      sys.exit(1)

    # The with-statement guarantees the file is closed even if parsing fails.
    with file:
      for line in file:
        line = re.sub(r'#.*', '', line)
        data = [field.strip() for field in line.split(';')]

        # Lines without a second field carry no property name. Repeats of the
        # previous name are skipped so each run of lines yields one entry.
        if len(data) <= 1 or data[1] == bplast:
          continue
        bplast = data[1]

        if not any(ignore.match(bplast) for ignore in ignore_res):
          bplist.append(bplast)

  bplist.extend(bool_propsextras)
  bplist.sort()
  return bplist
149
# Build the Boolean property list at import time. The item size is the list
# length divided by 32 and rounded up (ceiling division).
bool_properties = getbpropslist()
bool_props_list_item_size = -(-len(bool_properties) // 32)
152
153
154
155# ---------------------------------------------------------------------------
156#                  COLLECTING PROPERTY NAMES AND ALIASES
157# ---------------------------------------------------------------------------
158
script_names = ['Unknown']
abbreviations = {}

def collect_property_names():
  """Fill in script_names and abbreviations from the Unicode data files.

  Three files are read from the Unicode.tables directory:

    Scripts.txt               Each script name is appended to script_names
                              the first time it appears in a run of lines.
    PropertyValueAliases.txt  For every "sc" line, abbreviations[long name]
                              is set to a tuple of the script's short names:
                              empty if the short name is the same as the long
                              one, otherwise one or two entries.
    PropertyAliases.txt       For every property whose long name is in
                              bool_properties, abbreviations[long name] is
                              set to a tuple of one or two short names.

  Both module-level containers are modified in place; nothing is returned.
  """
  names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

  # Unicode data files are distributed as UTF-8, so decode them explicitly
  # instead of relying on the locale's default encoding.
  last_script_name = ""
  with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = names_re.match(line)

      if match_obj is None or match_obj.group(1) == last_script_name:
        continue

      last_script_name = match_obj.group(1)
      script_names.append(last_script_name)

  # A line may end with a comment, so splitting around semicolons is not
  # enough; a regex picks out just the fields.
  value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

  with open("Unicode.tables/PropertyValueAliases.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = value_alias_re.match(line)

      # Only script ("sc") value aliases are of interest here.
      if match_obj is None or match_obj.group(1) != "sc":
        continue

      _, short_name, long_name, extra_name = match_obj.groups()
      if short_name == long_name:
        abbreviations[long_name] = ()
      elif extra_name is None:
        abbreviations[long_name] = (short_name,)
      else:
        abbreviations[long_name] = (short_name, extra_name)

  # We can also collect Boolean property abbreviations into the same
  # dictionary. A set makes the per-line membership test cheap.
  known_bool_properties = set(bool_properties)

  bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
  with open("Unicode.tables/PropertyAliases.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = bin_alias_re.match(line)
      if match_obj is None:
        continue

      short_name, long_name, extra_name = match_obj.groups()
      if long_name not in known_bool_properties:
        continue

      if extra_name is None:
        abbreviations[long_name] = (short_name,)
      else:
        abbreviations[long_name] = (short_name, extra_name)
212
# Run the collection at import time so that script_names and abbreviations
# are populated before the script reordering below reads them.
collect_property_names()
214
215
216
217# ---------------------------------------------------------------------------
218#                      REORDERING SCRIPT NAMES
219# ---------------------------------------------------------------------------
220
script_abbrevs = []

def reorder_scripts():
  """Reorder script_names and build the parallel script_abbrevs list.

  Each script's abbreviation is the first entry of abbreviations[name], or
  the name itself when that tuple is empty. Scripts whose abbreviation is
  listed in Unicode.tables/ScriptExtensions.txt are moved to the front; all
  other scripts follow. The original relative order is kept within each
  group.

  The module-level script_names and script_abbrevs are rebound to the new
  lists. A KeyError is raised if a script name has no entry in abbreviations.
  """
  global script_names
  global script_abbrevs

  # Work on a local list so that calling this function again starts from a
  # clean state instead of appending to the previous result.
  abbrevs = []
  for name in script_names:
    aliases = abbreviations[name]
    abbrevs.append(aliases[0] if aliases else name)

  names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')

  # Unicode data files are distributed as UTF-8, so decode them explicitly
  # instead of relying on the locale's default encoding.
  extended_script_abbrevs = set()
  with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
    for line in f:
      match_obj = names_re.match(line)
      if match_obj is not None:
        # The field is a space-separated list of abbreviations; split() also
        # discards any padding the pattern captured before the '#'.
        extended_script_abbrevs.update(match_obj.group(1).split())

  # Stable partition: scripts with extensions first, then the rest.
  paired = list(zip(script_names, abbrevs))
  ordered = [pair for pair in paired if pair[1] in extended_script_abbrevs]
  ordered += [pair for pair in paired if pair[1] not in extended_script_abbrevs]

  script_names = [name for name, _ in ordered]
  script_abbrevs = [abbrev for _, abbrev in ordered]
260
# Reorder at import time, then size the script list items: the position of
# 'Unknown' in the reordered list, divided by 32 and rounded up (ceiling
# division). Presumably 'Unknown' is the first script without extensions, so
# its index is the number of scripts that have them -- TODO confirm.
reorder_scripts()
script_list_item_size = -(-script_names.index('Unknown') // 32)
263
264
265# ---------------------------------------------------------------------------
266#                         DERIVED LISTS
267# ---------------------------------------------------------------------------
268
269# Create general character property names from the first letters of the
270# particular categories.
271
# category_names alternates name and comment, so every second item starting
# at index 0 is a two-letter category; its first letter is the general
# category.
gcn_set = {name[0] for name in category_names[::2]}
general_category_names = sorted(gcn_set)
275
276
277# ---------------------------------------------------------------------------
278#                           FUNCTIONS
279# ---------------------------------------------------------------------------
280
281import sys
282
283# Open an output file, using the command's argument or a default. Write common
284# preliminary header information.
285
def open_output(default):
  """Open the output file and write the common generated-file header.

  The file name is the command's single argument if one was given, otherwise
  `default`. The header consists of the PCRE banner and copyright notice, a
  note naming the generating script (the last '/'-separated component of
  sys.argv[0]), and the licence text.

  Returns the open file object, positioned after the header. Prints a
  message and exits with status 1 if more than one argument was given or if
  the file cannot be opened for writing.
  """
  argc = len(sys.argv)
  if argc > 2:
    print('** Too many arguments: just give a file name')
    sys.exit(1)

  output_name = sys.argv[1] if argc == 2 else default

  try:
    out = open(output_name, "w")
  except IOError:
    print ("** Couldn't open %s" % output_name)
    sys.exit(1)

  # Keep only what follows the last '/', or the whole name if there is none.
  script_name = sys.argv[0].rpartition('/')[2]

  banner = """\
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
"""

  notice = ("Instead, modify the maint/%s script and run it to generate\n"
  "a new version of this code.\n\n") % script_name

  licence = """\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n"""

  for piece in (banner, notice, licence):
    out.write(piece)
  return out
354
355# End of UcpCommon.py
356