17db96d56Sopenharmony_ci#
27db96d56Sopenharmony_ci# genmap_ja_codecs.py: Japanese Codecs Map Generator
37db96d56Sopenharmony_ci#
47db96d56Sopenharmony_ci# Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
57db96d56Sopenharmony_ci# Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
67db96d56Sopenharmony_ci#
77db96d56Sopenharmony_ciimport os
87db96d56Sopenharmony_ci
97db96d56Sopenharmony_cifrom genmap_support import *
107db96d56Sopenharmony_ci
# Pairs of two-element bounds that main() hands to
# DecodeMapWriter.update_decode_map(); C1/C2 presumably describe the first
# and second byte of a double-byte code -- confirm against genmap_support.
JISX0208_C1, JISX0208_C2 = (0x21, 0x74), (0x21, 0x7e)
JISX0212_C1, JISX0212_C2 = (0x22, 0x6d), (0x21, 0x7e)
JISX0213_C1, JISX0213_C2 = (0x21, 0x7e), (0x21, 0x7e)
# patches between shift-jis and cp932
CP932P0_C1, CP932P0_C2 = (0x81, 0x81), (0x5f, 0xca)
# CP932 P1
CP932P1_C1, CP932P1_C2 = (0x87, 0x87), (0x40, 0x9c)
# CP932 P2
CP932P2_C1, CP932P2_C2 = (0xed, 0xfc), (0x40, 0xfc)

# Source URL of each mapping table; main() passes these to
# open_mapping_file() together with the expected local path.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
287db96d56Sopenharmony_ci
297db96d56Sopenharmony_ci
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into nested decode maps.

    Every data line consists of whitespace-separated fields; the first is
    ``<level>-<hex code>`` and the second is either ``U+<hex>`` (a single
    code point) or ``U+<4 hex>+<4 hex>`` (a body followed by a modifier).
    Text after ``#`` is ignored and lines with fewer than two fields are
    skipped.

    Args:
        fo: iterable yielding the text lines of the mapping table.

    Returns:
        A 5-tuple ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``.
        The first four have the shape ``{code >> 8: {code & 0xff: value}}``:
        ``decmap3``/``decmap4`` hold code points below U+10000 for levels 3
        and 4, ``decmap3_2``/``decmap4_2`` hold code points in
        U+20000..U+2FFFF with 0x20000 subtracted.  ``decmap3_pair`` has the
        shape ``{body: {code >> 8: {code & 0xff: modifier}}}``.

    Raises:
        ValueError: if a single code point belongs to an unsupported level
            or plane, if a pair appears on a level other than 3, or if a
            numeric field cannot be parsed.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # int() instead of eval(): the fields come from a data file and
        # must never be executed as Python expressions.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])

        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_maps.get(level)
            else:
                m = None
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # The target map is validated before it is touched, so an
        # unsupported level/plane yields ValueError rather than failing on
        # None.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
737db96d56Sopenharmony_ci
747db96d56Sopenharmony_ci
def _add_encodings(encmap, decmap, flag=0):
    """Invert *decmap* into *encmap*, OR-ing *flag* into each native code.

    *decmap* has the shape ``{c1: {c2: code}}``; for every entry,
    ``encmap[code >> 8][code & 0xff]`` is set to ``flag | c1 << 8 | c2``,
    overwriting any value already stored for that code point.
    """
    for c1, m in decmap.items():
        for c2, code in m.items():
            encmap.setdefault(code >> 8, {})[code & 0xff] = flag | c1 << 8 | c2


def main():
    """Generate ``mappings_jp.h`` and ``mappings_jisx0213_pair.h``.

    Loads the JIS X 0208, JIS X 0212, CP932 and JIS X 0213 mapping tables
    from ``python-mappings/``, derives the encode maps from the decode maps
    and writes everything through the ``genmap_support`` writers into the
    current working directory.

    Raises:
        SystemExit: if the JIS X 0213 table does not map 0x2124 to U+FF0C.
        ValueError: if a code shared by JIS X 0208 and JIS X 0213 plane 1
            maps to different code points in the two tables.
    """
    # NOTE(review): the objects returned by open_mapping_file() are never
    # closed here; presumably acceptable for a one-shot script -- confirm.
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    # .get() so that a table lacking the entry altogether still produces
    # the explanatory message instead of a bare KeyError.
    if jis3decmap.get(0x21, {}).get(0x24) != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    _add_encodings(sjisencmap, sjisdecmap)

    # For CP932 the first native code seen for a code point wins.
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {}).setdefault(code & 0xff, c1 << 8 | c2)

    # Drop every CP932 encoding that the Shift-JIS map already provides
    # with the same value; iterate over copies because entries are deleted.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if sjisencmap.get(c1, {}).get(c2) == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    _add_encodings(jisx0208_0212encmap, jisx0208decmap)

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            row = jisx0208_0212encmap.setdefault(code >> 8, {})
            if (code & 0xff) in row:
                print("OOPS!!!", (code))
            row[code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Iterate over copies: entries duplicated by JIS X 0208 are removed
    # from jis3decmap while walking it.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            in_jisx0208 = c1 in jisx0208decmap and c2 in jisx0208decmap[c1]
            if code in jis3_pairdecmap:
                # The code point is also the body of a pair.  setdefault()
                # guarantees the row exists regardless of the order in
                # which the table lists its entries.
                jisx0213bmpencmap.setdefault(code >> 8, {})
                jisx0213bmpencmap[code >> 8][code & 0xff] = (0,) # pair
                jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
            elif not in_jisx0208:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
            elif jisx0208decmap[c1][c2] == code:
                del jis3decmap[c1][c2]
                if not jis3decmap[c1]:
                    del jis3decmap[c1]
            else:
                raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")

    # Level-4 entries are stored with 0x8000 set and overwrite any level-3
    # entry for the same code point.
    _add_encodings(jisx0213bmpencmap, jis4decmap, 0x8000)

    jisx0213empencmap = {}
    _add_encodings(jisx0213empencmap, jis3_2_decmap)
    _add_encodings(jisx0213empencmap, jis4_2_decmap, 0x8000)

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Sorted so the emitted table is ordered by (body, modifier).
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")
2497db96d56Sopenharmony_ci
# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
252