#!/usr/bin/env python3
"""
Utility for parsing HTML5 entity definitions available from:

    http://dev.w3.org/html5/spec/entities.json

Written by Ezio Melotti and Iuliia Proskurnia.

Usage:
    without arguments  print a diff between html.entities.html5 and the
                       freshly downloaded definitions
    --create           print the newly generated html5 dict
    --patch            rewrite the html5 dict in Lib/html/entities.py
"""

import os
import sys
import json
from urllib.request import urlopen
from html.entities import html5

entities_url = 'http://dev.w3.org/html5/spec/entities.json'


def get_json(url):
    """Download the JSON file at *url* and return the decoded object.

    The response body is decoded as UTF-8 before being parsed.
    """
    with urlopen(url) as f:
        data = f.read().decode('utf-8')
    return json.loads(data)


def create_dict(entities):
    """Create the html5 dict from the decoded JSON object.

    *entities* maps entity names that start with '&' to dicts holding a
    'characters' entry.  The returned dict maps each name, with its
    leading '&' removed, to that 'characters' value.
    """
    return {name.lstrip('&'): value['characters']
            for name, value in entities.items()}


def compare_dicts(old, new):
    """Compare the old and new dicts and print the differences.

    Three sections are printed, each only when it is non-empty: keys
    present only in *new* (added), keys present only in *old* (removed)
    and keys present in both whose values differ (modified).
    """
    added = new.keys() - old.keys()
    if added:
        print('{} entitie(s) have been added:'.format(len(added)))
        for name in sorted(added):
            print(' {!r}: {!r}'.format(name, new[name]))
    removed = old.keys() - new.keys()
    if removed:
        print('{} entitie(s) have been removed:'.format(len(removed)))
        for name in sorted(removed):
            print(' {!r}: {!r}'.format(name, old[name]))
    # Names are unique, so sorting the tuples orders them by name.
    changed = sorted(
        (name, old[name], new[name])
        for name in old.keys() & new.keys()
        if old[name] != new[name]
    )
    if changed:
        print('{} entitie(s) have been modified:'.format(len(changed)))
        for item in changed:
            print(' {!r}: {!r} -> {!r}'.format(*item))


def write_items(entities, file=sys.stdout):
    """Write the items of the dictionary in the specified file.

    The output is a Python assignment of the form ``html5 = {...}`` with
    one entry per line; values are written with ascii() so the output
    contains only ASCII characters.
    """
    # The keys in the generated dictionary should be sorted in a
    # case-insensitive way, however, when two keys are equal ignoring
    # case, the uppercase version should come first so that the result
    # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Sorting on (lowercased key, key) does this in one pass: the second
    # element breaks ties case-sensitively, and uppercase chars sort
    # before lowercase ones.
    keys = sorted(entities, key=lambda key: (key.lower(), key))
    print('html5 = {', file=file)
    for name in keys:
        # NOTE(review): the leading whitespace of this literal may have been
        # collapsed when the source was extracted -- confirm it matches the
        # indentation used in Lib/html/entities.py.
        print(' {!r}: {!a},'.format(name, entities[name]), file=file)
    print('}', file=file)


def _patch_entities_file(new_html5, fname):
    """Replace the ``html5 = {...}`` dict in *fname* with *new_html5*.

    The new content is written to a temporary sibling file which then
    replaces *fname* in a single step, so *fname* never goes missing.
    """
    temp_fname = fname + '.temp'
    # Use an explicit encoding so the result does not depend on the locale.
    with open(fname, encoding='utf-8') as f1, \
            open(temp_fname, 'w', encoding='utf-8') as f2:
        skip = False
        for line in f1:
            if line.startswith('html5 = {'):
                write_items(new_html5, file=f2)
                skip = True
                continue
            if skip:
                # skip the old items until the }
                if line.startswith('}'):
                    skip = False
                continue
            f2.write(line)
    os.replace(temp_fname, fname)


def _main():
    """Run the command line interface described in the module docstring."""
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in sys.argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}. Do not edit manually.'.format(__file__))
        write_items(new_html5)
    elif '--patch' in sys.argv:
        _patch_entities_file(new_html5, 'Lib/html/entities.py')
    elif html5 == new_html5:
        print('The current dictionary is updated.')
    else:
        compare_dicts(html5, new_html5)
        print('Run "./python {0} --patch" to update Lib/html/entities.py '
              'or "./python {0} --create" to see the generated '
              'dictionary.'.format(__file__))


if __name__ == '__main__':
    _main()