17db96d56Sopenharmony_ci#
27db96d56Sopenharmony_ci# genmap_schinese.py: Simplified Chinese Codecs Map Generator
37db96d56Sopenharmony_ci#
47db96d56Sopenharmony_ci# Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
57db96d56Sopenharmony_ci# Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
67db96d56Sopenharmony_ci#
77db96d56Sopenharmony_ciimport os
87db96d56Sopenharmony_ciimport re
97db96d56Sopenharmony_ci
107db96d56Sopenharmony_cifrom genmap_support import *
117db96d56Sopenharmony_ci
127db96d56Sopenharmony_ci
# (low, high) byte bounds for the lead byte (C1) and trail byte (C2) of each
# table.  main() hands these pairs to DecodeMapWriter.update_decode_map();
# presumably both ends are inclusive -- confirm against genmap_support.

# GB2312
GB2312_C1 = (0x21, 0x7e)
GB2312_C2 = (0x21, 0x7e)

# GBK extension, written in two parts
GBKL1_C1 = (0x81, 0xa8)
GBKL1_C2 = (0x40, 0xfe)
GBKL2_C1 = (0xa9, 0xfe)
GBKL2_C2 = (0x40, 0xa0)

# GB18030 extension, parts 1 to 5
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)

# Source URLs passed to open_mapping_file() alongside the local file paths.
MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'
337db96d56Sopenharmony_ci
# Matches one assignment element: a 4-hex-digit code point in ``u`` and a
# space-separated list of hex byte values in ``b``.
re_gb18030ass = re.compile(r'<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Parse the GB18030 XML mapping read from *fo*.

    Args:
        fo: Object whose ``read()`` returns the whole mapping as text.

    Returns:
        A ``(decmap, unilinear)`` tuple.  ``decmap`` maps first byte ->
        second byte -> code point for every two-byte assignment in the
        file.  ``unilinear`` is the sorted list of code points below
        0x10000, surrogates excluded, for which the file gives no
        assignment of one or two bytes.

    Raises:
        KeyError: If a code point with an assignment of at most two bytes
            appears more than once or lies in the surrogate area.
    """
    decmap = {}
    # Start from the whole BMP minus the surrogate area (0xd800-0xdfff) and
    # strike out code points as short assignments are found.
    unmapped = set(range(0xd800)) | set(range(0xe000, 0x10000))
    for uni, native in re_gb18030ass.findall(fo.read()):
        # int(..., 16) instead of eval(): the text comes from an external
        # file and must never be executed as code.
        uni = int(uni, 16)
        native = [int(byte, 16) for byte in native.split()]
        if len(native) <= 2:
            unmapped.remove(uni)
        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
            decmap.setdefault(native[0], {})[native[1]] = uni
    return decmap, sorted(unmapped)
537db96d56Sopenharmony_ci
def _collapse_runs(codepoints):
    """Collapse *codepoints* into ``[first, last, base]`` runs.

    Consecutive values are merged into one run; ``base`` is the position in
    *codepoints* of the run's first value.  No sentinel entry is used, so a
    run that starts at code point 0 is kept rather than silently dropped.
    """
    runs = []
    for position, uni in enumerate(codepoints):
        if runs and uni == runs[-1][1] + 1:
            runs[-1][1] = uni
        else:
            runs.append([uni, uni, position])
    return runs


def main():
    """Generate ``mappings_cn.h`` from the GB2312, CP936 and GB18030 mappings.

    Loads the three mapping files, removes from each larger table the
    entries already provided by a smaller one, builds the reverse (encode)
    tables, and writes all tables plus the GB18030 linear BMP ranges to
    ``mappings_cn.h`` in the current directory.
    """
    print("Loading Mapping File...")
    gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
    cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
    gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)

    gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
    gbkdecmap = loadmap(cp936map)
    gb2312decmap = loadmap(gb2312map)

    # Keep in the GB18030 table only the pairs CP936 does not define.
    for c1, m in gbkdecmap.items():
        for c2 in m:
            del gb18030decmap[c1][c2]
            if not gb18030decmap[c1]:
                del gb18030decmap[c1]

    # Drop from the GBK table every entry GB2312 maps to the same code point;
    # the GB2312 bytes are OR-ed with 0x80 to form the GBK lookup key.
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
            if gbkdecmap[gbkc1][gbkc2] == code:
                del gbkdecmap[gbkc1][gbkc2]
                if not gbkdecmap[gbkc1]:
                    del gbkdecmap[gbkc1]

    # Encode tables are keyed by the code point's high byte, then low byte.
    # GBK entries go in first, so a GB2312 entry for the same code point
    # overwrites the GBK one.
    gb2312_gbkencmap, gb18030encmap = {}, {}
    for c1, m in gbkdecmap.items():
        for c2, code in m.items():
            gb2312_gbkencmap.setdefault(code >> 8, {})
            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2 # MSB set
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gb2312_gbkencmap.setdefault(code >> 8, {})
            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2 # MSB unset
    for c1, m in gb18030decmap.items():
        for c2, code in m.items():
            gb18030encmap.setdefault(code >> 8, {})
            gb18030encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    with open('mappings_cn.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating GB2312 decode map...")
        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
        writer.update_decode_map(GB2312_C1, GB2312_C2)
        writer.generate()

        print("Generating GBK decode map...")
        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
        writer.generate()

        print("Generating GB2312 && GBK encode map...")
        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
        writer.generate()

        print("Generating GB18030 extension decode map...")
        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
        # Name the five extension parts directly instead of building the
        # constant names as strings and eval()-ing them.
        gb18030ext_parts = (
            (GB18030EXTP1_C1, GB18030EXTP1_C2),
            (GB18030EXTP2_C1, GB18030EXTP2_C2),
            (GB18030EXTP3_C1, GB18030EXTP3_C2),
            (GB18030EXTP4_C1, GB18030EXTP4_C2),
            (GB18030EXTP5_C1, GB18030EXTP5_C2),
        )
        for c1range, c2range in gb18030ext_parts:
            writer.update_decode_map(c1range, c2range)

        writer.generate()

        print("Generating GB18030 extension encode map...")
        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
        writer.generate()

        print("Generating GB18030 Unicode BMP Mapping Ranges...")
        fp.write("""
static const struct _gb18030_to_unibmp_ranges {
    Py_UCS4   first, last;
    DBCHAR       base;
} gb18030_to_unibmp_ranges[] = {
""")

        ranges = _collapse_runs(gb18030unilinear)

        filler = BufferedFiller()
        for first, last, base in ranges:
            filler.write('{', str(first), ',', str(last), ',', str(base), '},')

        # Terminating entry: its base is one past the last linear index used.
        if ranges:
            first, last, base = ranges[-1]
            end_base = base + last - first + 1
        else:
            end_base = 0
        filler.write('{', '0,', '0,', str(end_base), '}', '};')
        filler.printout(fp)

    print("Done!")
1467db96d56Sopenharmony_ci
1477db96d56Sopenharmony_ci
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()
150