17db96d56Sopenharmony_ci#!/usr/bin/env python3
"""
Utility for parsing HTML5 entity definitions available from:

    http://dev.w3.org/html5/spec/entities.json

Written by Ezio Melotti and Iuliia Proskurnia.

"""
107db96d56Sopenharmony_ci
117db96d56Sopenharmony_ciimport os
127db96d56Sopenharmony_ciimport sys
137db96d56Sopenharmony_ciimport json
147db96d56Sopenharmony_cifrom urllib.request import urlopen
157db96d56Sopenharmony_cifrom html.entities import html5
167db96d56Sopenharmony_ci
# Location of the HTML named character reference definitions, published as
# JSON alongside the WHATWG HTML Standard.  HTTPS is used because the
# downloaded data can end up written into Lib/html/entities.py (--patch).
# This replaces the legacy http://dev.w3.org/html5/spec/entities.json address.
entities_url = 'https://html.spec.whatwg.org/entities.json'
187db96d56Sopenharmony_ci
def get_json(url):
    """Fetch the resource at *url* and return it parsed as JSON.

    The response body is read in full, decoded as UTF-8 and then handed
    to the json module; the decoded Python object is returned.
    """
    with urlopen(url) as response:
        raw = response.read()
    text = raw.decode('utf-8')
    return json.loads(text)
247db96d56Sopenharmony_ci
def create_dict(entities):
    """Build the html5 mapping from the decoded json object.

    Each key of *entities* has its leading '&' characters stripped and is
    mapped to the 'characters' item of the corresponding value.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
317db96d56Sopenharmony_ci
def compare_dicts(old, new):
    """Print a report describing how *new* differs from *old*.

    Up to three sections are printed, each only when it is non-empty:
    names found only in *new*, names found only in *old*, and names found
    in both whose values differ.  Entries inside a section are listed in
    sorted order.  Nothing is printed when there are no differences.
    """
    old_names, new_names = old.keys(), new.keys()

    only_in_new = new_names - old_names
    if only_in_new:
        print('{} entitie(s) have been added:'.format(len(only_in_new)))
        for key in sorted(only_in_new):
            print('  {!r}: {!r}'.format(key, new[key]))

    only_in_old = old_names - new_names
    if only_in_old:
        print('{} entitie(s) have been removed:'.format(len(only_in_old)))
        for key in sorted(only_in_old):
            print('  {!r}: {!r}'.format(key, old[key]))

    # (name, old value, new value) for every shared name whose value changed
    modified = {
        (key, old[key], new[key])
        for key in old_names & new_names
        if old[key] != new[key]
    }
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for key, before, after in sorted(modified):
            print('  {!r}: {!r} -> {!r}'.format(key, before, after))
527db96d56Sopenharmony_ci
def write_items(entities, file=sys.stdout):
    """Write *entities* to *file* as the source of an ``html5`` dict literal.

    One ``name: value`` line is written per item; names use repr() and
    values use ascii(), so non-ASCII characters in values are escaped.
    """
    # The names are ordered case-insensitively, but when two names differ
    # only by case the uppercase one must come first, giving e.g.
    # ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...].
    # Sorting on the pair (lowercased name, original name) does this in a
    # single pass: the lowercased form decides the main order and the
    # original spelling breaks ties, with uppercase sorting before
    # lowercase.
    ordered = sorted(entities, key=lambda name: (name.lower(), name))
    print('html5 = {', file=file)
    for name in ordered:
        line = '    {!r}: {!a},'.format(name, entities[name])
        print(line, file=file)
    print('}', file=file)
697db96d56Sopenharmony_ci
707db96d56Sopenharmony_ci
if __name__ == '__main__':
    # Command-line behaviour:
    #   (no option)  print a diff between html.entities.html5 and the
    #                freshly downloaded definitions
    #   --create     print the newly generated html5 dict
    #   --patch      rewrite the html5 dict inside Lib/html/entities.py
    # The definitions are downloaded up front in every mode.
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in sys.argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}.  Do not edit manually.'.format(__file__))
        write_items(new_html5)
    elif '--patch' in sys.argv:
        # The path is relative, so it is resolved against the current
        # working directory.
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        # Use an explicit encoding so the result does not depend on the
        # locale of the machine running the script.
        with open(fname, encoding='utf-8') as f1, \
                open(temp_fname, 'w', encoding='utf-8') as f2:
            skip = False
            for line in f1:
                if line.startswith('html5 = {'):
                    # emit the regenerated dict in place of the old one
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # os.replace() overwrites the destination in a single step, so the
        # target file is never missing (unlike remove() followed by rename()).
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            # The file patched by --patch is entities.py, not entities.html.
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))
106