#
# genmap_schinese.py: Simplified Chinese Codecs Map Generator
#
# Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
#
import os
import re

from genmap_support import (
    BufferedFiller,
    DecodeMapWriter,
    EncodeMapWriter,
    loadmap,
    open_mapping_file,
    print_autogen,
)


# (first, last) inclusive byte ranges for the lead byte (C1) and trail
# byte (C2) of each table region handed to DecodeMapWriter.
GB2312_C1 = (0x21, 0x7e)
GB2312_C2 = (0x21, 0x7e)
GBKL1_C1 = (0x81, 0xa8)
GBKL1_C2 = (0x40, 0xfe)
GBKL2_C1 = (0xa9, 0xfe)
GBKL2_C2 = (0x40, 0xa0)
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)

# The five GB18030 extension regions, in the order they are registered
# with the decode-map writer.
GB18030EXT_RANGES = (
    (GB18030EXTP1_C1, GB18030EXTP1_C2),
    (GB18030EXTP2_C1, GB18030EXTP2_C2),
    (GB18030EXTP3_C1, GB18030EXTP3_C2),
    (GB18030EXTP4_C1, GB18030EXTP4_C2),
    (GB18030EXTP5_C1, GB18030EXTP5_C2),
)

MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'

# Matches one <a u="XXXX" b="HH HH ..."/> assignment element of the XML
# mapping: a 4-digit BMP code point and its space-separated byte sequence.
re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Parse the GB18030 XML mapping readable from *fo*.

    Returns a ``(m, gbuni)`` pair:

    * ``m`` is a nested dict ``{lead_byte: {trail_byte: code_point}}``
      holding every two-byte assignment found in the file.
    * ``gbuni`` is the sorted list of BMP code points (surrogates
      excluded) that have no one- or two-byte assignment in the file.

    Raises KeyError if a one- or two-byte assignment names a surrogate
    code point, or a code point that was already removed.
    """
    m = {}
    # Start from every non-surrogate BMP code point and strike out the
    # ones that have a short (1- or 2-byte) encoding.
    gbuni = {i for i in range(65536) if i < 0xd800 or i > 0xdfff}
    for uni, native in re_gb18030ass.findall(fo.read()):
        # The regex only admits hex digits, so int(..., 16) parses these
        # directly; no need to eval() text taken from the data file.
        uni = int(uni, 16)
        native = [int(u, 16) for u in native.split()]
        if len(native) <= 2:
            gbuni.remove(uni)
        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
            m.setdefault(native[0], {})[native[1]] = uni
    return m, sorted(gbuni)


def _add_reverse_entries(encmap, decmap):
    """Add the inverse of the two-level *decmap* to the two-level *encmap*.

    *decmap* maps ``c1 -> c2 -> code``; each entry is stored in *encmap*
    as ``code >> 8 -> code & 0xff -> (c1 << 8 | c2)``.  Existing entries
    for the same code are overwritten.
    """
    for c1, m in decmap.items():
        for c2, code in m.items():
            encmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2


def main():
    """Load the mapping files and write the C tables to ``mappings_cn.h``."""
    print("Loading Mapping File...")
    gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
    cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
    gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)

    gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
    gbkdecmap = loadmap(cp936map)
    gb2312decmap = loadmap(gb2312map)

    # Drop from the GB18030 table every cell already present in GBK, so
    # only the extension remains.  A GBK cell missing from the GB18030
    # data raises KeyError.
    for c1, m in gbkdecmap.items():
        for c2 in m:
            del gb18030decmap[c1][c2]
        if not gb18030decmap[c1]:
            del gb18030decmap[c1]

    # Drop from the GBK table every cell that merely repeats GB2312 (GBK
    # cell = GB2312 cell with the high bit set on both bytes and the same
    # code point), so only the GBK extension remains.
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
            if gbkdecmap[gbkc1][gbkc2] == code:
                del gbkdecmap[gbkc1][gbkc2]
                if not gbkdecmap[gbkc1]:
                    del gbkdecmap[gbkc1]

    gb2312_gbkencmap, gb18030encmap = {}, {}
    # Order matters: GB2312 entries are added last and therefore win over
    # a GBK entry for the same code point.
    _add_reverse_entries(gb2312_gbkencmap, gbkdecmap)     # MSB set
    _add_reverse_entries(gb2312_gbkencmap, gb2312decmap)  # MSB unset
    _add_reverse_entries(gb18030encmap, gb18030decmap)

    with open('mappings_cn.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating GB2312 decode map...")
        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
        writer.update_decode_map(GB2312_C1, GB2312_C2)
        writer.generate()

        print("Generating GBK decode map...")
        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
        writer.generate()

        print("Generating GB2312 && GBK encode map...")
        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
        writer.generate()

        print("Generating GB18030 extension decode map...")
        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
        for c1range, c2range in GB18030EXT_RANGES:
            writer.update_decode_map(c1range, c2range)

        writer.generate()

        print("Generating GB18030 extension encode map...")
        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
        writer.generate()

        print("Generating GB18030 Unicode BMP Mapping Ranges...")
        fp.write("""
static const struct _gb18030_to_unibmp_ranges {
    Py_UCS4 first, last;
    DBCHAR base;
} gb18030_to_unibmp_ranges[] = {
""")

        # Collapse the sorted code points into [first, last, base] runs of
        # consecutive values; base is the run's starting index in the
        # linear list.  The leading sentinel keeps ranges[-1] valid on the
        # first iteration and is skipped on output.
        ranges = [[-1, -1, -1]]
        for gblinnum, uni in enumerate(gb18030unilinear):
            if uni == ranges[-1][1] + 1:
                ranges[-1][1] = uni
            else:
                ranges.append([uni, uni, gblinnum])

        filler = BufferedFiller()
        for first, last, base in ranges[1:]:
            filler.write('{', str(first), ',', str(last), ',', str(base), '},')

        # Terminator entry: first = last = 0, base = one past the last
        # linear index covered by the final run.
        filler.write('{', '0,', '0,', str(
            ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')
        filler.printout(fp)

    print("Done!")


if __name__ == '__main__':
    main()