#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------

# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a re-organization of these scripts in
# December 2021.


import re


# ---------------------------------------------------------------------------
#                             DATA LISTS
# ---------------------------------------------------------------------------

# Each list in this section is flat: its entries alternate between a name
# (even index) and a comment string describing that name (odd index). Code
# that wants only the names must therefore step through a list two items at
# a time.

# BIDI classes in the DerivedBidiClass.txt file, with comments.

bidi_classes = [
    'AL', 'Arabic letter',
    'AN', 'Arabic number',
    'B', 'Paragraph separator',
    'BN', 'Boundary neutral',
    'CS', 'Common separator',
    'EN', 'European number',
    'ES', 'European separator',
    'ET', 'European terminator',
    'FSI', 'First strong isolate',
    'L', 'Left to right',
    'LRE', 'Left to right embedding',
    'LRI', 'Left to right isolate',
    'LRO', 'Left to right override',
    'NSM', 'Non-spacing mark',
    'ON', 'Other neutral',
    'PDF', 'Pop directional format',
    'PDI', 'Pop directional isolate',
    'R', 'Right to left',
    'RLE', 'Right to left embedding',
    'RLI', 'Right to left isolate',
    'RLO', 'Right to left override',
    'S', 'Segment separator',
    'WS', 'White space'
    ]

# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.

category_names = [
    'Cc', 'Control',
    'Cf', 'Format',
    'Cn', 'Unassigned',
    'Co', 'Private use',
    'Cs', 'Surrogate',
    'Ll', 'Lower case letter',
    'Lm', 'Modifier letter',
    'Lo', 'Other letter',
    'Lt', 'Title case letter',
    'Lu', 'Upper case letter',
    'Mc', 'Spacing mark',
    'Me', 'Enclosing mark',
    'Mn', 'Non-spacing mark',
    'Nd', 'Decimal number',
    'Nl', 'Letter number',
    'No', 'Other number',
    'Pc', 'Connector punctuation',
    'Pd', 'Dash punctuation',
    'Pe', 'Close punctuation',
    'Pf', 'Final punctuation',
    'Pi', 'Initial punctuation',
    'Po', 'Other punctuation',
    'Ps', 'Open punctuation',
    'Sc', 'Currency symbol',
    'Sk', 'Modifier symbol',
    'Sm', 'Mathematical symbol',
    'So', 'Other symbol',
    'Zl', 'Line separator',
    'Zp', 'Paragraph separator',
    'Zs', 'Space separator'
    ]

# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.
# The number at the start of each comment string is the position of that
# name among the names in this list, so the order must not be disturbed.

break_properties = [
    'CR', ' 0',
    'LF', ' 1',
    'Control', ' 2',
    'Extend', ' 3',
    'Prepend', ' 4',
    'SpacingMark', ' 5',
    'L', ' 6 Hangul syllable type L',
    'V', ' 7 Hangul syllable type V',
    'T', ' 8 Hangul syllable type T',
    'LV', ' 9 Hangul syllable type LV',
    'LVT', '10 Hangul syllable type LVT',
    'Regional_Indicator', '11',
    'Other', '12',
    'ZWJ', '13',
    'Extended_Pictographic', '14'
    ]

# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra property names to add.

bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']


# ---------------------------------------------------------------------------
#                   GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------

# getbpropslist() is called while this module is being imported and uses
# sys.exit() on failure, so sys has to be imported before that call.

import sys


# Get a list of Boolean property names from a number of files.

def getbpropslist():
    """Return the sorted list of Boolean property names.

    Every file named in bool_propsfiles is read from the Unicode.tables
    directory. Comments (from '#' to end of line) are removed and the second
    semicolon-separated field of each remaining line is taken as a property
    name. A name equal to the one on the previous data line is skipped, names
    matching any pattern in bool_propsignore are dropped, and the names in
    bool_propsextras are added before the final sort.

    If a file cannot be opened, a message is printed and the process exits
    with status 1.
    """
    ignore_patterns = [re.compile(pat) for pat in bool_propsignore]
    bplist = []
    bplast = ""

    for filename in bool_propsfiles:
        path = 'Unicode.tables/' + filename
        try:
            # The Unicode data files are UTF-8; do not depend on the locale.
            file = open(path, 'r', encoding='utf-8')
        except IOError:
            print(f"** Couldn't open {path}\n")
            sys.exit(1)

        with file:
            for line in file:
                line = re.sub(r'#.*', '', line)
                data = [field.strip() for field in line.split(';')]
                if len(data) <= 1 or data[1] == bplast:
                    continue
                # bplast is updated even for ignored names so that a run of
                # lines for an ignored property is skipped as a whole.
                bplast = data[1]
                if not any(pat.match(bplast) for pat in ignore_patterns):
                    bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist


bool_properties = getbpropslist()

# Number of 32-item groups needed to hold one item per Boolean property
# (rounded up); presumably the size in 32-bit words of a property bitmap.
bool_props_list_item_size = (len(bool_properties) + 31) // 32



# ---------------------------------------------------------------------------
#              COLLECTING PROPERTY NAMES AND ALIASES
# ---------------------------------------------------------------------------

script_names = ['Unknown']
abbreviations = {}

def collect_property_names():
    """Fill in script_names and abbreviations from the Unicode data files.

    Script names are appended to script_names in the order in which they
    first appear in Scripts.txt (a name repeated on consecutive data lines is
    added once). The abbreviations dictionary is keyed by long name and maps
    to a tuple of alternative names:

      * for scripts ("sc" lines in PropertyValueAliases.txt): an empty tuple
        when the short and long names are identical, otherwise the short
        name, followed by the extra alias when the line has one;
      * for Boolean properties listed in bool_properties (from
        PropertyAliases.txt): the short name, followed by the extra alias
        when the line has one.

    Both globals are modified in place; nothing is returned.
    """
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None or match_obj.group(1) == last_script_name:
                continue

            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is comment in the line
    # so splitting around semicolon is not enough
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = value_alias_re.match(line)

            if match_obj is None or match_obj.group(1) != "sc":
                continue

            _, short_name, long_name, extra_alias = match_obj.groups()
            if short_name == long_name:
                abbreviations[long_name] = ()
            elif extra_alias is None:
                abbreviations[long_name] = (short_name,)
            else:
                abbreviations[long_name] = (short_name, extra_alias)

    # We can also collect Boolean property abbreviations into the same dictionary

    # A set makes the per-line membership test constant time.
    bool_property_set = set(bool_properties)

    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
    with open("Unicode.tables/PropertyAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = bin_alias_re.match(line)
            if match_obj is None:
                continue

            short_name, long_name, extra_alias = match_obj.groups()
            if long_name in bool_property_set:
                if extra_alias is None:
                    abbreviations[long_name] = (short_name,)
                else:
                    abbreviations[long_name] = (short_name, extra_alias)

collect_property_names()



# ---------------------------------------------------------------------------
#                      REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------

script_abbrevs = []

def reorder_scripts():
    """Move the scripts used in ScriptExtensions.txt to the front.

    For every name in script_names the preferred abbreviation is looked up
    in abbreviations (the name itself is used when it has no abbreviation).
    script_names and script_abbrevs are then rebound to new lists in which
    the scripts whose abbreviation occurs in ScriptExtensions.txt come
    first, followed by all the others; the relative order inside each group
    is unchanged and the two lists stay index-aligned.

    Raises KeyError if a script name has no entry in abbreviations.
    """
    global script_names
    global script_abbrevs

    # Built locally rather than appended to the global list, so that the
    # result depends only on script_names and abbreviations.
    all_abbrevs = []
    for name in script_names:
        abbrevs = abbreviations[name]
        all_abbrevs.append(abbrevs[0] if abbrevs else name)

    extended_script_abbrevs = set()
    with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
        names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')

        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None:
                continue

            extended_script_abbrevs.update(match_obj.group(1).split(" "))

    # Stable partition: extended scripts first, then the rest.
    pairs = list(zip(script_names, all_abbrevs))
    ordered = [pair for pair in pairs if pair[1] in extended_script_abbrevs]
    ordered += [pair for pair in pairs if pair[1] not in extended_script_abbrevs]

    script_names = [name for name, _ in ordered]
    script_abbrevs = [abbrev for _, abbrev in ordered]

reorder_scripts()

# 'Unknown' is the first entry of the original list, so after reordering its
# index is the number of scripts that precede it (the extended scripts,
# provided 'Unknown' itself is not one of them). The value is that count
# divided by 32 and rounded up.
script_list_item_size = (script_names.index('Unknown') + 31) // 32


# ---------------------------------------------------------------------------
#                         DERIVED LISTS
# ---------------------------------------------------------------------------

# Create general character property names from the first letters of the
# particular categories. Only the names (even indexes) of category_names are
# used; the comment strings in between are skipped.

gcn_set = {name[0] for name in category_names[::2]}
general_category_names = sorted(gcn_set)


# ---------------------------------------------------------------------------
#                           FUNCTIONS
# ---------------------------------------------------------------------------

# Open an output file, using the command's argument or a default. Write common
# preliminary header information.

def open_output(default):
    """Open the output file and write the standard generated-file header.

    The file name is taken from the single optional command-line argument
    (sys.argv[1]); when no argument is given, `default` is used. The header
    is the start of a C comment block containing the PCRE banner, a notice
    naming the generating script (the part of sys.argv[0] after the last
    '/'), and the licence text.

    If more than one argument is given, or the file cannot be opened, a
    message is printed and the process exits with status 1.

    Returns the file object, still open for writing; the caller is
    responsible for closing it.
    """
    arg_count = len(sys.argv)
    if arg_count > 2:
        print('** Too many arguments: just give a file name')
        sys.exit(1)

    output_name = sys.argv[1] if arg_count == 2 else default
    try:
        out = open(output_name, "w")
    except IOError:
        print("** Couldn't open %s" % output_name)
        sys.exit(1)

    # Keep only what follows the last '/', or the whole string if there is
    # no '/' at all.
    script_name = sys.argv[0].rpartition('/')[2]

    banner = """\
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
"""

    provenance = ("Instead, modify the maint/%s script and run it to generate\n"
                  "a new version of this code.\n\n" % script_name)

    licence = """\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n"""

    out.write(banner + provenance + licence)
    return out

# End of UcpCommon.py