1/* 2 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings. 3 * 4 * Written by Hye-Shik Chang <perky@FreeBSD.org> 5 */ 6 7#define USING_IMPORTED_MAPS 8#define USING_BINARY_PAIR_SEARCH 9#define EXTERN_JISX0213_PAIR 10#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE 11#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE 12 13#include "cjkcodecs.h" 14#include "alg_jisx0201.h" 15#include "emu_jisx0213_2000.h" 16#include "mappings_jisx0213_pair.h" 17 18/* STATE 19 20 state->c[0-3] 21 22 00000000 23 ||^^^^^| 24 |+-----+---- G0-3 Character Set 25 +----------- Is G0-3 double byte? 26 27 state->c[4] 28 29 00000000 30 || 31 |+---- Locked-Shift? 32 +----- ESC Throughout 33*/ 34 35#define ESC 0x1B 36#define SO 0x0E 37#define SI 0x0F 38#define LF 0x0A 39 40#define MAX_ESCSEQLEN 16 41 42#define CHARSET_ISO8859_1 'A' 43#define CHARSET_ASCII 'B' 44#define CHARSET_ISO8859_7 'F' 45#define CHARSET_JISX0201_K 'I' 46#define CHARSET_JISX0201_R 'J' 47 48#define CHARSET_GB2312 ('A'|CHARSET_DBCS) 49#define CHARSET_JISX0208 ('B'|CHARSET_DBCS) 50#define CHARSET_KSX1001 ('C'|CHARSET_DBCS) 51#define CHARSET_JISX0212 ('D'|CHARSET_DBCS) 52#define CHARSET_GB2312_8565 ('E'|CHARSET_DBCS) 53#define CHARSET_CNS11643_1 ('G'|CHARSET_DBCS) 54#define CHARSET_CNS11643_2 ('H'|CHARSET_DBCS) 55#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS) 56#define CHARSET_JISX0213_2 ('P'|CHARSET_DBCS) 57#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS) 58#define CHARSET_JISX0208_O ('@'|CHARSET_DBCS) 59 60#define CHARSET_DBCS 0x80 61#define ESCMARK(mark) ((mark) & 0x7f) 62 63#define IS_ESCEND(c) (((c) >= 'A' && (c) <= 'Z') || (c) == '@') 64#define IS_ISO2022ESC(c2) \ 65 ((c2) == '(' || (c2) == ')' || (c2) == '$' || \ 66 (c2) == '.' || (c2) == '&') 67 /* this is not a complete list of ISO-2022 escape sequence headers. 68 * but, it's enough to implement CJK instances of iso-2022. 
*/ 69 70#define MAP_UNMAPPABLE 0xFFFF 71#define MAP_MULTIPLE_AVAIL 0xFFFE /* for JIS X 0213 */ 72 73#define F_SHIFTED 0x01 74#define F_ESCTHROUGHOUT 0x02 75 76#define STATE_SETG(dn, v) do { ((state)->c[dn]) = (v); } while (0) 77#define STATE_GETG(dn) ((state)->c[dn]) 78 79#define STATE_G0 STATE_GETG(0) 80#define STATE_G1 STATE_GETG(1) 81#define STATE_G2 STATE_GETG(2) 82#define STATE_G3 STATE_GETG(3) 83#define STATE_SETG0(v) STATE_SETG(0, v) 84#define STATE_SETG1(v) STATE_SETG(1, v) 85#define STATE_SETG2(v) STATE_SETG(2, v) 86#define STATE_SETG3(v) STATE_SETG(3, v) 87 88#define STATE_SETFLAG(f) do { ((state)->c[4]) |= (f); } while (0) 89#define STATE_GETFLAG(f) ((state)->c[4] & (f)) 90#define STATE_CLEARFLAG(f) do { ((state)->c[4]) &= ~(f); } while (0) 91#define STATE_CLEARFLAGS() do { ((state)->c[4]) = 0; } while (0) 92 93#define ISO2022_CONFIG ((const struct iso2022_config *)config) 94#define CONFIG_ISSET(flag) (ISO2022_CONFIG->flags & (flag)) 95#define CONFIG_DESIGNATIONS (ISO2022_CONFIG->designations) 96 97/* iso2022_config.flags */ 98#define NO_SHIFT 0x01 99#define USE_G2 0x02 100#define USE_JISX0208_EXT 0x04 101 102/*-*- internal data structures -*-*/ 103 104typedef int (*iso2022_init_func)(void); 105typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data); 106typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length); 107 108struct iso2022_designation { 109 unsigned char mark; 110 unsigned char plane; 111 unsigned char width; 112 iso2022_init_func initializer; 113 iso2022_decode_func decoder; 114 iso2022_encode_func encoder; 115}; 116 117struct iso2022_config { 118 int flags; 119 const struct iso2022_designation *designations; /* non-ascii desigs */ 120}; 121 122/*-*- iso-2022 codec implementation -*-*/ 123 124CODEC_INIT(iso2022) 125{ 126 const struct iso2022_designation *desig; 127 for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) 128 if (desig->initializer != NULL && desig->initializer() != 0) 129 return -1; 130 return 0; 
131} 132 133ENCODER_INIT(iso2022) 134{ 135 STATE_CLEARFLAGS(); 136 STATE_SETG0(CHARSET_ASCII); 137 STATE_SETG1(CHARSET_ASCII); 138 return 0; 139} 140 141ENCODER_RESET(iso2022) 142{ 143 if (STATE_GETFLAG(F_SHIFTED)) { 144 WRITEBYTE1(SI); 145 NEXT_OUT(1); 146 STATE_CLEARFLAG(F_SHIFTED); 147 } 148 if (STATE_G0 != CHARSET_ASCII) { 149 WRITEBYTE3(ESC, '(', 'B'); 150 NEXT_OUT(3); 151 STATE_SETG0(CHARSET_ASCII); 152 } 153 return 0; 154} 155 156ENCODER(iso2022) 157{ 158 while (*inpos < inlen) { 159 const struct iso2022_designation *dsg; 160 DBCHAR encoded; 161 Py_UCS4 c = INCHAR1; 162 Py_ssize_t insize; 163 164 if (c < 0x80) { 165 if (STATE_G0 != CHARSET_ASCII) { 166 WRITEBYTE3(ESC, '(', 'B'); 167 STATE_SETG0(CHARSET_ASCII); 168 NEXT_OUT(3); 169 } 170 if (STATE_GETFLAG(F_SHIFTED)) { 171 WRITEBYTE1(SI); 172 STATE_CLEARFLAG(F_SHIFTED); 173 NEXT_OUT(1); 174 } 175 WRITEBYTE1((unsigned char)c); 176 NEXT(1, 1); 177 continue; 178 } 179 180 insize = 1; 181 182 encoded = MAP_UNMAPPABLE; 183 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { 184 Py_ssize_t length = 1; 185 encoded = dsg->encoder(&c, &length); 186 if (encoded == MAP_MULTIPLE_AVAIL) { 187 /* this implementation won't work for pair 188 * of non-bmp characters. 
*/ 189 if (inlen - *inpos < 2) { 190 if (!(flags & MBENC_FLUSH)) 191 return MBERR_TOOFEW; 192 length = -1; 193 } 194 else 195 length = 2; 196 encoded = dsg->encoder(&c, &length); 197 if (encoded != MAP_UNMAPPABLE) { 198 insize = length; 199 break; 200 } 201 } 202 else if (encoded != MAP_UNMAPPABLE) 203 break; 204 } 205 206 if (!dsg->mark) 207 return 1; 208 assert(dsg->width == 1 || dsg->width == 2); 209 210 switch (dsg->plane) { 211 case 0: /* G0 */ 212 if (STATE_GETFLAG(F_SHIFTED)) { 213 WRITEBYTE1(SI); 214 STATE_CLEARFLAG(F_SHIFTED); 215 NEXT_OUT(1); 216 } 217 if (STATE_G0 != dsg->mark) { 218 if (dsg->width == 1) { 219 WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark)); 220 STATE_SETG0(dsg->mark); 221 NEXT_OUT(3); 222 } 223 else if (dsg->mark == CHARSET_JISX0208) { 224 WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark)); 225 STATE_SETG0(dsg->mark); 226 NEXT_OUT(3); 227 } 228 else { 229 WRITEBYTE4(ESC, '$', '(', 230 ESCMARK(dsg->mark)); 231 STATE_SETG0(dsg->mark); 232 NEXT_OUT(4); 233 } 234 } 235 break; 236 case 1: /* G1 */ 237 if (STATE_G1 != dsg->mark) { 238 if (dsg->width == 1) { 239 WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark)); 240 STATE_SETG1(dsg->mark); 241 NEXT_OUT(3); 242 } 243 else { 244 WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark)); 245 STATE_SETG1(dsg->mark); 246 NEXT_OUT(4); 247 } 248 } 249 if (!STATE_GETFLAG(F_SHIFTED)) { 250 WRITEBYTE1(SO); 251 STATE_SETFLAG(F_SHIFTED); 252 NEXT_OUT(1); 253 } 254 break; 255 default: /* G2 and G3 is not supported: no encoding in 256 * CJKCodecs are using them yet */ 257 return MBERR_INTERNAL; 258 } 259 260 if (dsg->width == 1) { 261 WRITEBYTE1((unsigned char)encoded); 262 NEXT_OUT(1); 263 } 264 else { 265 WRITEBYTE2(encoded >> 8, encoded & 0xff); 266 NEXT_OUT(2); 267 } 268 NEXT_INCHAR(insize); 269 } 270 271 return 0; 272} 273 274DECODER_INIT(iso2022) 275{ 276 STATE_CLEARFLAGS(); 277 STATE_SETG0(CHARSET_ASCII); 278 STATE_SETG1(CHARSET_ASCII); 279 STATE_SETG2(CHARSET_ASCII); 280 return 0; 281} 282 283DECODER_RESET(iso2022) 284{ 285 
STATE_SETG0(CHARSET_ASCII); 286 STATE_CLEARFLAG(F_SHIFTED); 287 return 0; 288} 289 290static Py_ssize_t 291iso2022processesc(const void *config, MultibyteCodec_State *state, 292 const unsigned char **inbuf, Py_ssize_t *inleft) 293{ 294 unsigned char charset, designation; 295 Py_ssize_t i, esclen = 0; 296 297 for (i = 1;i < MAX_ESCSEQLEN;i++) { 298 if (i >= *inleft) 299 return MBERR_TOOFEW; 300 if (IS_ESCEND((*inbuf)[i])) { 301 esclen = i + 1; 302 break; 303 } 304 else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft && 305 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') { 306 i += 2; 307 } 308 } 309 310 switch (esclen) { 311 case 0: 312 return 1; /* unterminated escape sequence */ 313 case 3: 314 if (INBYTE2 == '$') { 315 charset = INBYTE3 | CHARSET_DBCS; 316 designation = 0; 317 } 318 else { 319 charset = INBYTE3; 320 if (INBYTE2 == '(') 321 designation = 0; 322 else if (INBYTE2 == ')') 323 designation = 1; 324 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.') 325 designation = 2; 326 else 327 return 3; 328 } 329 break; 330 case 4: 331 if (INBYTE2 != '$') 332 return 4; 333 334 charset = INBYTE4 | CHARSET_DBCS; 335 if (INBYTE3 == '(') 336 designation = 0; 337 else if (INBYTE3 == ')') 338 designation = 1; 339 else 340 return 4; 341 break; 342 case 6: /* designation with prefix */ 343 if (CONFIG_ISSET(USE_JISX0208_EXT) && 344 (*inbuf)[3] == ESC && (*inbuf)[4] == '$' && 345 (*inbuf)[5] == 'B') { 346 charset = 'B' | CHARSET_DBCS; 347 designation = 0; 348 } 349 else 350 return 6; 351 break; 352 default: 353 return esclen; 354 } 355 356 /* raise error when the charset is not designated for this encoding */ 357 if (charset != CHARSET_ASCII) { 358 const struct iso2022_designation *dsg; 359 360 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { 361 if (dsg->mark == charset) 362 break; 363 } 364 if (!dsg->mark) 365 return esclen; 366 } 367 368 STATE_SETG(designation, charset); 369 *inleft -= esclen; 370 (*inbuf) += esclen; 371 return 0; 372} 373 374#define ISO8859_7_DECODE(c, 
writer) \ 375 if ((c) < 0xa0) { \ 376 OUTCHAR(c); \ 377 } else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) { \ 378 OUTCHAR(c); \ 379 } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \ 380 (0xbffffd77L & (1L << ((c)-0xb4))))) { \ 381 OUTCHAR(0x02d0 + (c)); \ 382 } else if ((c) == 0xa1) { \ 383 OUTCHAR(0x2018); \ 384 } else if ((c) == 0xa2) { \ 385 OUTCHAR(0x2019); \ 386 } else if ((c) == 0xaf) { \ 387 OUTCHAR(0x2015); \ 388 } 389 390static Py_ssize_t 391iso2022processg2(const void *config, MultibyteCodec_State *state, 392 const unsigned char **inbuf, Py_ssize_t *inleft, 393 _PyUnicodeWriter *writer) 394{ 395 /* not written to use encoder, decoder functions because only few 396 * encodings use G2 designations in CJKCodecs */ 397 if (STATE_G2 == CHARSET_ISO8859_1) { 398 if (INBYTE3 < 0x80) 399 OUTCHAR(INBYTE3 + 0x80); 400 else 401 return 3; 402 } 403 else if (STATE_G2 == CHARSET_ISO8859_7) { 404 ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer) 405 else 406 return 3; 407 } 408 else if (STATE_G2 == CHARSET_ASCII) { 409 if (INBYTE3 & 0x80) 410 return 3; 411 else 412 OUTCHAR(INBYTE3); 413 } 414 else 415 return MBERR_INTERNAL; 416 417 (*inbuf) += 3; 418 *inleft -= 3; 419 return 0; 420} 421 422DECODER(iso2022) 423{ 424 const struct iso2022_designation *dsgcache = NULL; 425 426 while (inleft > 0) { 427 unsigned char c = INBYTE1; 428 Py_ssize_t err; 429 430 if (STATE_GETFLAG(F_ESCTHROUGHOUT)) { 431 /* ESC throughout mode: 432 * for non-iso2022 escape sequences */ 433 OUTCHAR(c); /* assume as ISO-8859-1 */ 434 NEXT_IN(1); 435 if (IS_ESCEND(c)) { 436 STATE_CLEARFLAG(F_ESCTHROUGHOUT); 437 } 438 continue; 439 } 440 441 switch (c) { 442 case ESC: 443 REQUIRE_INBUF(2); 444 if (IS_ISO2022ESC(INBYTE2)) { 445 err = iso2022processesc(config, state, 446 inbuf, &inleft); 447 if (err != 0) 448 return err; 449 } 450 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */ 451 REQUIRE_INBUF(3); 452 err = iso2022processg2(config, state, 453 inbuf, &inleft, writer); 454 if (err 
!= 0) 455 return err; 456 } 457 else { 458 OUTCHAR(ESC); 459 STATE_SETFLAG(F_ESCTHROUGHOUT); 460 NEXT_IN(1); 461 } 462 break; 463 case SI: 464 if (CONFIG_ISSET(NO_SHIFT)) 465 goto bypass; 466 STATE_CLEARFLAG(F_SHIFTED); 467 NEXT_IN(1); 468 break; 469 case SO: 470 if (CONFIG_ISSET(NO_SHIFT)) 471 goto bypass; 472 STATE_SETFLAG(F_SHIFTED); 473 NEXT_IN(1); 474 break; 475 case LF: 476 STATE_CLEARFLAG(F_SHIFTED); 477 OUTCHAR(LF); 478 NEXT_IN(1); 479 break; 480 default: 481 if (c < 0x20) /* C0 */ 482 goto bypass; 483 else if (c >= 0x80) 484 return 1; 485 else { 486 const struct iso2022_designation *dsg; 487 unsigned char charset; 488 Py_UCS4 decoded; 489 490 if (STATE_GETFLAG(F_SHIFTED)) 491 charset = STATE_G1; 492 else 493 charset = STATE_G0; 494 495 if (charset == CHARSET_ASCII) { 496bypass: 497 OUTCHAR(c); 498 NEXT_IN(1); 499 break; 500 } 501 502 if (dsgcache != NULL && 503 dsgcache->mark == charset) 504 dsg = dsgcache; 505 else { 506 for (dsg = CONFIG_DESIGNATIONS; 507 dsg->mark != charset 508#ifdef Py_DEBUG 509 && dsg->mark != '\0' 510#endif 511 ; dsg++) 512 { 513 /* noop */ 514 } 515 assert(dsg->mark != '\0'); 516 dsgcache = dsg; 517 } 518 519 REQUIRE_INBUF(dsg->width); 520 decoded = dsg->decoder(*inbuf); 521 if (decoded == MAP_UNMAPPABLE) 522 return dsg->width; 523 524 if (decoded < 0x10000) { 525 OUTCHAR(decoded); 526 } 527 else if (decoded < 0x30000) { 528 OUTCHAR(decoded); 529 } 530 else { /* JIS X 0213 pairs */ 531 OUTCHAR2(decoded >> 16, decoded & 0xffff); 532 } 533 NEXT_IN(dsg->width); 534 } 535 break; 536 } 537 } 538 return 0; 539} 540 541/*-*- mapping table holders -*-*/ 542 543#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL; 544#define DECMAP(enc) static const decode_map *enc##_decmap = NULL; 545 546/* kr */ 547ENCMAP(cp949) 548DECMAP(ksx1001) 549 550/* jp */ 551ENCMAP(jisxcommon) 552DECMAP(jisx0208) 553DECMAP(jisx0212) 554ENCMAP(jisx0213_bmp) 555DECMAP(jisx0213_1_bmp) 556DECMAP(jisx0213_2_bmp) 557ENCMAP(jisx0213_emp) 
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)

/* cn */
ENCMAP(gbcommon)
DECMAP(gb2312)

/* tw */

/*-*- mapping access functions -*-*/

/* Import the maps used for KS X 1001 (cp949 encode map, ksx1001 decode
 * map) on first call.  Returns 0 on success, -1 if an import fails; the
 * `initialized` latch is only set after a successful import. */
static int
ksx1001_init(void)
{
    static int initialized = 0;

    if (!initialized && (
                    IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
                    IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
        return -1;
    initialized = 1;
    return 0;
}

/* Decode the two bytes at data[0..1] through the ksx1001 map; returns the
 * code point or MAP_UNMAPPABLE. */
static Py_UCS4
ksx1001_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    if (TRYMAP_DEC(ksx1001, u, data[0], data[1]))
        return u;
    else
        return MAP_UNMAPPABLE;
}

/* Encode one BMP character through the cp949 map.  Only codes with the
 * 0x8000 bit clear are accepted; everything else is MAP_UNMAPPABLE. */
static DBCHAR
ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    assert(*length == 1);
    if (*data < 0x10000) {
        if (TRYMAP_ENC(cp949, coded, *data)) {
            if (!(coded & 0x8000))
                return coded;
        }
    }
    return MAP_UNMAPPABLE;
}

/* Import the maps used for JIS X 0208 (jisxcommon encode map, jisx0208
 * decode map) on first call.  Returns 0 on success, -1 on failure. */
static int
jisx0208_init(void)
{
    static int initialized = 0;

    if (!initialized && (
                    IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
                    IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
        return -1;
    initialized = 1;
    return 0;
}

/* Decode a JIS X 0208 byte pair.  0x2140 is special-cased to U+FF3C
 * before the map is consulted.  Returns MAP_UNMAPPABLE when unmapped. */
static Py_UCS4
jisx0208_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
        return 0xff3c;
    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
        return u;
    else
        return MAP_UNMAPPABLE;
}

/* Encode one BMP character as JIS X 0208.  U+FF3C is special-cased to
 * 0x2140; otherwise the jisxcommon map is used and only codes with the
 * 0x8000 bit clear are accepted. */
static DBCHAR
jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    assert(*length == 1);
    if (*data < 0x10000) {
        if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
            return 0x2140;
        else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
            if (!(coded & 0x8000))
                return coded;
        }
    }
    return MAP_UNMAPPABLE;
}

/* Import the maps used for JIS X 0212 (jisxcommon encode map, jisx0212
 * decode map) on first call.  Returns 0 on success, -1 on failure. */
static int
jisx0212_init(void)
{
    static int initialized = 0;

    if (!initialized && (
                    IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
                    IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
        return -1;
    initialized = 1;
    return 0;
}

/* Decode a JIS X 0212 byte pair; returns the code point or
 * MAP_UNMAPPABLE. */
static Py_UCS4
jisx0212_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    if (TRYMAP_DEC(jisx0212, u, data[0], data[1]))
        return u;
    else
        return MAP_UNMAPPABLE;
}

/* Encode one BMP character as JIS X 0212.  This is the mirror image of
 * jisx0208_encoder: only jisxcommon codes WITH the 0x8000 bit set are
 * accepted, and the bit is stripped from the returned code. */
static DBCHAR
jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    assert(*length == 1);
    if (*data < 0x10000) {
        if (TRYMAP_ENC(jisxcommon, coded, *data)) {
            if (coded & 0x8000)
                return coded & 0x7fff;
        }
    }
    return MAP_UNMAPPABLE;
}

/* Import every map the JIS X 0213 encoders/decoders use, on first call.
 * jisx0208_init() is run first because the plane-1 decoders and the
 * shared encoder also consult the jisx0208 / jisxcommon maps.
 * Returns 0 on success, -1 on failure. */
static int
jisx0213_init(void)
{
    static int initialized = 0;

    if (!initialized && (
                    jisx0208_init() ||
                    IMPORT_MAP(jp, jisx0213_bmp,
                               &jisx0213_bmp_encmap, NULL) ||
                    IMPORT_MAP(jp, jisx0213_1_bmp,
                               NULL, &jisx0213_1_bmp_decmap) ||
                    IMPORT_MAP(jp, jisx0213_2_bmp,
                               NULL, &jisx0213_2_bmp_decmap) ||
                    IMPORT_MAP(jp, jisx0213_emp,
                               &jisx0213_emp_encmap, NULL) ||
                    IMPORT_MAP(jp, jisx0213_1_emp,
                               NULL, &jisx0213_1_emp_decmap) ||
                    IMPORT_MAP(jp, jisx0213_2_emp,
                               NULL, &jisx0213_2_emp_decmap) ||
                    IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
                               &jisx0213_pair_decmap)))
        return -1;
    initialized = 1;
    return 0;
}

/* The two JIS X 0213:2000 decoders below have no `config` parameter, so
 * a fixed value is supplied by macro for the duration of their
 * definitions.
 * NOTE(review): presumably the EMULATE_JISX0213_2000_* macros from
 * emu_jisx0213_2000.h test `config` -- confirm against that header. */
#define config ((void *)2000)
/* Decode a JIS X 0213:2000 plane-1 byte pair.  Tries, in order: the
 * 2000-edition emulation macro, the 0x2140 special case, then the
 * jisx0208, jisx0213_1_bmp, jisx0213_1_emp (result OR-ed with 0x20000)
 * and jisx0213_pair maps.  Returns MAP_UNMAPPABLE if nothing matches. */
static Py_UCS4
jisx0213_2000_1_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
    else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
        return 0xff3c;
    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
        u |= 0x20000;
    else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
        ;
    else
        return MAP_UNMAPPABLE;
    return u;
}

/* Decode a JIS X 0213:2000 plane-2 byte pair via the jisx0213_2_bmp map,
 * then jisx0213_2_emp (result OR-ed with 0x20000).
 * NOTE(review): unlike the plane-1 decoder, the emulation macro here is
 * followed by a plain `if`, not `else if`, so whatever the macro assigns
 * may be overwritten or discarded by the chain below -- confirm against
 * the macro definition whether that is intended. */
static Py_UCS4
jisx0213_2000_2_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1])
    if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
        u |= 0x20000;
    else
        return MAP_UNMAPPABLE;
    return u;
}
#undef config

/* Decode a JIS X 0213:2004 plane-1 byte pair.  Same lookup order as the
 * 2000 variant, without the emulation step. */
static Py_UCS4
jisx0213_2004_1_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
        return 0xff3c;
    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
        u |= 0x20000;
    else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
        ;
    else
        return MAP_UNMAPPABLE;
    return u;
}

/* Decode a JIS X 0213:2004 plane-2 byte pair via jisx0213_2_bmp, then
 * jisx0213_2_emp (result OR-ed with 0x20000). */
static Py_UCS4
jisx0213_2004_2_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
        u |= 0x20000;
    else
        return MAP_UNMAPPABLE;
    return u;
}

/* Shared JIS X 0213 encoder behind all the jisx0213_* encoder wrappers.
 *
 * *length selects the mode:
 *    1  encode data[0] alone; may return MAP_MULTIPLE_AVAIL to ask the
 *       caller to retry with the following character as well
 *    2  look up the pair (data[0], data[1]); if there is no such pair,
 *       fall through to the -1 case
 *   -1  look up data[0] as the first half of a pair with nothing after
 *       it; *length is reset to 1
 * Any other value yields MAP_UNMAPPABLE.
 *
 * The wrappers pass (void *)2000 or NULL as `config`.
 * NOTE(review): `config` is not referenced directly in this body;
 * presumably the EMULATE_* macros read it -- confirm in
 * emu_jisx0213_2000.h.
 *
 * The returned code may carry the 0x8000 bit; the wrappers use that bit
 * to tell plane 1 from plane 2.
 */
static DBCHAR
jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config)
{
    DBCHAR coded;

    switch (*length) {
    case 1: /* first character */
        if (*data >= 0x10000) {
            /* only code points 0x20000..0x2ffff are looked up */
            if ((*data) >> 16 == 0x20000 >> 16) {
                EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
                else if (TRYMAP_ENC(jisx0213_emp, coded, (*data) & 0xffff))
                    return coded;
            }
            return MAP_UNMAPPABLE;
        }

        EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
        else if (TRYMAP_ENC(jisx0213_bmp, coded, *data)) {
            if (coded == MULTIC)
                return MAP_MULTIPLE_AVAIL;
        }
        else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
            /* jisxcommon codes with the high bit set are rejected here */
            if (coded & 0x8000)
                return MAP_UNMAPPABLE;
        }
        else
            return MAP_UNMAPPABLE;
        return coded;

    case 2: /* second character of unicode pair */
        /* both characters are truncated to 16 bits for the pair lookup */
        coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
        if (coded != DBCINV)
            return coded;
        /* fall through */

    case -1: /* flush unterminated */
        *length = 1;
        coded = find_pairencmap((ucs2_t)data[0], 0,
                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
        if (coded == DBCINV)
            return MAP_UNMAPPABLE;
        else
            return coded;
        break;

    default:
        return MAP_UNMAPPABLE;
    }
}

/* JIS X 0213:2000 plane-1 encoder: passes through the two sentinel
 * results, rejects codes with the 0x8000 bit set, returns the rest. */
static DBCHAR
jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
        return coded;
    else if (coded & 0x8000)
        return MAP_UNMAPPABLE;
    else
        return coded;
}

/* JIS X 0213:2000 plane-1 encoder that succeeds only for genuine
 * two-character pairs.  With *length == 1 it returns MAP_MULTIPLE_AVAIL
 * when a pair may follow and MAP_UNMAPPABLE otherwise.  With
 * *length == 2 it returns the code only if jisx0213_encoder left *length
 * at 2, i.e. the pair lookup itself matched. */
static DBCHAR
jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    Py_ssize_t ilength = *length;

    coded = jisx0213_encoder(data, length, (void *)2000);
    switch (ilength) {
    case 1:
        if (coded == MAP_MULTIPLE_AVAIL)
            return MAP_MULTIPLE_AVAIL;
        else
            return MAP_UNMAPPABLE;
    case 2:
        if (*length != 2)
            return MAP_UNMAPPABLE;
        else
            return coded;
    default:
        return MAP_UNMAPPABLE;
    }
}

/* JIS X 0213:2000 plane-2 encoder: passes through the two sentinel
 * results, accepts only codes with the 0x8000 bit set and strips it. */
static DBCHAR
jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
        return coded;
    else if (coded & 0x8000)
        return coded & 0x7fff;
    else
        return MAP_UNMAPPABLE;
}

/* JIS X 0213:2004 plane-1 encoder: same filtering as the 2000 variant,
 * with a NULL config. */
static DBCHAR
jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded = jisx0213_encoder(data, length, NULL);
    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
        return coded;
    else if (coded & 0x8000)
        return MAP_UNMAPPABLE;
    else
        return coded;
}

/* JIS X 0213:2004 pair-only plane-1 encoder: same contract as
 * jisx0213_2000_1_encoder_paironly, with a NULL config. */
static DBCHAR
jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    Py_ssize_t ilength = *length;

    coded = jisx0213_encoder(data, length, NULL);
    switch (ilength) {
    case 1:
        if (coded == MAP_MULTIPLE_AVAIL)
            return MAP_MULTIPLE_AVAIL;
        else
            return MAP_UNMAPPABLE;
    case 2:
        if (*length != 2)
            return MAP_UNMAPPABLE;
        else
            return coded;
    default:
        return MAP_UNMAPPABLE;
    }
}

/* JIS X 0213:2004 plane-2 encoder: same filtering as the 2000 variant,
 * with a NULL config. */
static DBCHAR
jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded = jisx0213_encoder(data, length, NULL);
    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
        return coded;
    else if (coded & 0x8000)
        return coded & 0x7fff;
    else
        return MAP_UNMAPPABLE;
}

/* Decode one JIS X 0201 Roman byte.  The macro (alg_jisx0201.h) opens an
 * if-chain that the `else` below completes. */
static Py_UCS4
jisx0201_r_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    JISX0201_R_DECODE_CHAR(*data, u)
    else
        return MAP_UNMAPPABLE;
    return u;
}

/* Encode one character as JIS X 0201 Roman; `length` is not examined. */
static DBCHAR
jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    JISX0201_R_ENCODE(*data, coded)
    else
        return MAP_UNMAPPABLE;
    return coded;
}

/* Decode one JIS X 0201 Katakana byte.  Bit 0x80 of the input byte is
 * toggled before it is handed to the decode macro. */
static Py_UCS4
jisx0201_k_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
    else
        return MAP_UNMAPPABLE;
    return u;
}

/* Encode one character as JIS X 0201 Katakana; 0x80 is subtracted from
 * the macro's result before returning.  `length` is not examined. */
static DBCHAR
jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    JISX0201_K_ENCODE(*data, coded)
    else
        return MAP_UNMAPPABLE;
    return coded - 0x80;
}

/* Import the maps used for GB 2312 (gbcommon encode map, gb2312 decode
 * map) on first call.  Returns 0 on success, -1 on failure. */
static int
gb2312_init(void)
{
    static int initialized = 0;

    if (!initialized && (
                    IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
                    IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
        return -1;
    initialized = 1;
    return 0;
}

/* Decode a GB 2312 byte pair; returns the code point or
 * MAP_UNMAPPABLE. */
static Py_UCS4
gb2312_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    if (TRYMAP_DEC(gb2312, u, data[0], data[1]))
        return u;
    else
        return MAP_UNMAPPABLE;
}

/* Encode one BMP character through the gbcommon map.  Only codes with the
 * 0x8000 bit clear are accepted. */
static DBCHAR
gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    assert(*length == 1);
    if (*data < 0x10000) {
        if (TRYMAP_ENC(gbcommon, coded, *data)) {
            if (!(coded & 0x8000))
                return coded;
        }
    }
    return MAP_UNMAPPABLE;
}


/* Placeholder decoder for designations that are handled elsewhere (the
 * G2 ISO-8859 sets are decoded by iso2022processg2): always unmappable. */
static Py_UCS4
dummy_decoder(const unsigned char *data)
{
    return MAP_UNMAPPABLE;
}

/* Placeholder encoder: always unmappable, so the encoder loop never
 * selects a designation that uses it. */
static DBCHAR
dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    return MAP_UNMAPPABLE;
}

/*-*- registry tables -*-*/

/* Initializers for struct iso2022_designation, in field order:
 *   { mark, plane, width, initializer, decoder, encoder }
 * REGISTRY_CNS11643_1 and REGISTRY_CNS11643_2 name cns11643_* functions
 * that are not defined in this file; no designation table here uses
 * them. */
#define REGISTRY_KSX1001_G0     { CHARSET_KSX1001, 0, 2,                \
                  ksx1001_init,                                         \
                  ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1     { CHARSET_KSX1001, 1, 2,                \
                  ksx1001_init,                                         \
                  ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_JISX0201_R     { CHARSET_JISX0201_R, 0, 1,             \
                  NULL,                                                 \
                  jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K     { CHARSET_JISX0201_K, 0, 1,             \
                  NULL,                                                 \
                  jisx0201_k_decoder, jisx0201_k_encoder }
#define REGISTRY_JISX0208       { CHARSET_JISX0208, 0, 2,               \
                  jisx0208_init,                                        \
                  jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O     { CHARSET_JISX0208_O, 0, 2,             \
                  jisx0208_init,                                        \
                  jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212       { CHARSET_JISX0212, 0, 2,               \
                  jisx0212_init,                                        \
                  jisx0212_decoder, jisx0212_encoder }
#define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2,       \
                  jisx0213_init,                                        \
                  jisx0213_2000_1_decoder,                              \
                  jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \
                  jisx0213_init,                                        \
                  jisx0213_2000_1_decoder,                              \
                  jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2,            \
                  jisx0213_init,                                        \
                  jisx0213_2000_2_decoder,                              \
                  jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2,       \
                  jisx0213_init,                                        \
                  jisx0213_2004_1_decoder,                              \
                  jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \
                  jisx0213_init,                                        \
                  jisx0213_2004_1_decoder,                              \
                  jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2,            \
                  jisx0213_init,                                        \
                  jisx0213_2004_2_decoder,                              \
                  jisx0213_2004_2_encoder }
#define REGISTRY_GB2312         { CHARSET_GB2312, 0, 2,                 \
                  gb2312_init,                                          \
                  gb2312_decoder, gb2312_encoder }
#define REGISTRY_CNS11643_1     { CHARSET_CNS11643_1, 1, 2,             \
                  cns11643_init,                                        \
                  cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2     { CHARSET_CNS11643_2, 2, 2,             \
                  cns11643_init,                                        \
                  cns11643_2_decoder, cns11643_2_encoder }
#define REGISTRY_ISO8859_1      { CHARSET_ISO8859_1, 2, 1,              \
                  NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7      { CHARSET_ISO8859_7, 2, 1,              \
                  NULL, dummy_decoder, dummy_encoder }
/* Table terminator: mark == 0 ends every scan over a designation table. */
#define REGISTRY_SENTINEL       { 0, }
/* Define iso2022_<var>_config from flag bits `attrs` and the
 * previously-defined iso2022_<var>_designations table. */
#define CONFIGDEF(var, attrs)                                           \
    static const struct iso2022_config iso2022_##var##_config = {       \
        attrs, iso2022_##var##_designations                             \
    };

/* Designation tables.  Order matters: the encoder offers each character
 * to the designations in table order and uses the first that maps it. */

static const struct iso2022_designation iso2022_kr_designations[] = {
    REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
};
CONFIGDEF(kr, 0)

static const struct iso2022_designation iso2022_jp_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    REGISTRY_SENTINEL
};
CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)

static const struct iso2022_designation iso2022_jp_1_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)

static const struct iso2022_designation iso2022_jp_2_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
    REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)

/* The pair-only entry comes first so that a two-character JIS X 0213
 * sequence is tried before the single-character JIS X 0208 mapping. */
static const struct iso2022_designation iso2022_jp_2004_designations[] = {
    REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
    REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)

static const struct iso2022_designation iso2022_jp_3_designations[] = {
    REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
    REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)

static const struct iso2022_designation iso2022_jp_ext_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)


BEGIN_MAPPINGS_LIST
  /* no mapping table here */
END_MAPPINGS_LIST

/* One codec-list entry: the codec name "iso2022_<variation>", its config,
 * the shared init function and the stateful method set. */
#define ISO2022_CODEC(variation) {              \
    "iso2022_" #variation,                      \
    &iso2022_##variation##_config,              \
    iso2022_codec_init,                         \
    _STATEFUL_METHODS(iso2022)                  \
},

BEGIN_CODECS_LIST
  ISO2022_CODEC(kr)
  ISO2022_CODEC(jp)
  ISO2022_CODEC(jp_1)
  ISO2022_CODEC(jp_2)
  ISO2022_CODEC(jp_2004)
  ISO2022_CODEC(jp_3)
  ISO2022_CODEC(jp_ext)
END_CODECS_LIST

I_AM_A_MODULE_FOR(iso2022)