17db96d56Sopenharmony_ci#!/usr/bin/env python3
27db96d56Sopenharmony_ci""" Utility for parsing HTML entity definitions available from:
37db96d56Sopenharmony_ci
47db96d56Sopenharmony_ci      http://www.w3.org/ as e.g.
57db96d56Sopenharmony_ci      http://www.w3.org/TR/REC-html40/HTMLlat1.ent
67db96d56Sopenharmony_ci
    Input is read from the file named by the first command-line argument
    (stdin if omitted); output is written to the file named by the second
    argument (stdout if omitted) in the form of a Python snippet defining a
    dictionary "entitydefs" mapping each literal entity name to a character
    or numeric entity.
107db96d56Sopenharmony_ci
117db96d56Sopenharmony_ci    Marc-Andre Lemburg, mal@lemburg.com, 1999.
127db96d56Sopenharmony_ci    Use as you like. NO WARRANTIES.
137db96d56Sopenharmony_ci
147db96d56Sopenharmony_ci"""
157db96d56Sopenharmony_ciimport re,sys
167db96d56Sopenharmony_ci
# Matches one entity declaration of the form
#     <!ENTITY name CDATA "value" -- comment -->
# and captures the name, the quoted value and the comment text (which may
# span several lines; trailing spaces before "-->" are not captured).
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text,pos=0,endpos=None):
    """Collect all entity declarations found in text[pos:endpos].

    text is the content of an entity definition file, pos is the index
    at which scanning starts and endpos the index at which it stops
    (defaults to the end of text).

    Returns a dictionary mapping each entity name to a tuple
    (charcode, comment) as captured by entityRE.  If a name is declared
    more than once, the last declaration wins.
    """
    # NOTE: the start position passed by the caller is honoured; it used
    # to be reset to 0 unconditionally, which made the parameter useless.
    if endpos is None:
        endpos = len(text)
    d = {}
    # finditer() yields successive non-overlapping matches, each search
    # resuming where the previous match ended.
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d
337db96d56Sopenharmony_ci
def writefile(f,defs):
    """Write defs to f as Python source defining a dict "entitydefs".

    f is any object with a write() method.  defs maps entity names to
    (charcode, comment) tuples.  Entries are emitted sorted by name, one
    per line, with the comment (whitespace-normalised) appended as a
    trailing "#" comment.

    Numeric references ("&#NNN;") below 256 are written as a one
    character octal-escaped string literal; everything else is written
    as the repr() of the original value.
    """
    f.write("entitydefs = {\n")
    for entity in sorted(defs):
        raw, note = defs[entity]
        # Default rendering; replaced below for small numeric references.
        literal = repr(raw)
        if raw.startswith('&#'):
            # Strip the leading "&#" and the trailing ";".
            number = int(raw[2:-1])
            if number < 256:
                literal = r"'\%o'" % number
        # Collapse runs of whitespace (including newlines) to one space.
        tidy = ' '.join(note.split())
        f.write("    '%s':\t%s,  \t# %s\n" % (entity, literal, tidy))
    f.write('\n}\n')
507db96d56Sopenharmony_ci
if __name__ == '__main__':
    # Usage: parseentities.py [infile [outfile]]
    # A missing infile means stdin, a missing outfile means stdout.
    args = sys.argv[1:]

    if args:
        with open(args[0]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    if len(args) > 1:
        with open(args[1],'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)
65