#!/usr/bin/env python3
""" Utility for parsing HTML entity definitions available from:

      http://www.w3.org/ as e.g.
      http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from stdin, output is written to stdout in form of a
    Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.

    An optional first command-line argument names a file to read instead
    of stdin; an optional second argument names a file to write instead
    of stdout.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
import re
import sys

# Matches one declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# and captures the entity name, the quoted value and the comment text
# (which may span several lines).
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text,pos=0,endpos=None):
    """Collect the entity declarations found in *text*.

    Only the slice text[pos:endpos] is scanned; *endpos* defaults to the
    end of *text*.

    Returns a dictionary mapping each entity name to a
    (charcode, comment) tuple, where charcode is the quoted value of the
    declaration (e.g. '&#160;') and comment is the raw comment text.
    If a name is declared more than once, the last declaration wins.
    """
    if endpos is None:
        endpos = len(text)
    d = {}
    # The caller's pos is honoured as the scan start; finditer yields the
    # successive non-overlapping matches inside [pos, endpos).
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d


def writefile(f,defs):
    """Write *defs* to *f* as Python source defining "entitydefs".

    *f* is any object with a write() method and *defs* is a dictionary
    as returned by parse().  Entries are emitted sorted by entity name,
    one per line, each followed by its comment.

    Numeric values ('&#NNN;') below 256 are written as a one-character
    string literal using an octal escape; every other value is written
    as the repr() of the original text.

    Raises ValueError if a value starts with '&#' but the text between
    that prefix and the final character is not a decimal number.
    """
    f.write("entitydefs = {\n")
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Drop the leading '&#' and the trailing ';'.
            code = int(charcode[2:-1])
            if code < 256:
                # Raw string: the backslash must reach the output so the
                # generated source contains an escape such as '\240'.
                charcode = r"'\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Collapse newlines and runs of blanks so the comment fits on
        # a single output line.
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, charcode, comment))
    f.write('\n}\n')


if __name__ == '__main__':
    # Input: file named by the first argument, else stdin.
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    # Output: file named by the second argument, else stdout.
    if len(sys.argv) > 2:
        with open(sys.argv[2],'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)