1/*
2 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7#define USING_IMPORTED_MAPS
8#define USING_BINARY_PAIR_SEARCH
9#define EXTERN_JISX0213_PAIR
10#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
11#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
12
13#include "cjkcodecs.h"
14#include "alg_jisx0201.h"
15#include "emu_jisx0213_2000.h"
16#include "mappings_jisx0213_pair.h"
17
18/* STATE
19
20   state->c[0-3]
21
22    00000000
23    ||^^^^^|
24    |+-----+----  G0-3 Character Set
25    +-----------  Is G0-3 double byte?
26
27   state->c[4]
28
29    00000000
30          ||
31          |+----  Locked-Shift?
32          +-----  ESC Throughout
33*/
34
/* Control bytes that the codec treats specially. */
#define ESC                     0x1B    /* introduces an escape sequence */
#define SO                      0x0E    /* decoder sets F_SHIFTED on this byte */
#define SI                      0x0F    /* decoder clears F_SHIFTED on this byte */
#define LF                      0x0A    /* decoder also clears F_SHIFTED here */

/* Upper bound on the number of bytes scanned for an escape terminator. */
#define MAX_ESCSEQLEN           16

/* Bit 7 of a charset mark flags a double-byte character set; the low
 * seven bits are the final byte of the designating escape sequence. */
#define CHARSET_DBCS            0x80
#define ESCMARK(mark)           ((mark) & 0x7f)

/* Single-byte character sets. */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte character sets. */
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)

/* A byte in '@'..'Z' ends an escape sequence. */
#define IS_ESCEND(c)    ((c) == '@' || ((c) >= 'A' && (c) <= 'Z'))
/* Second bytes (after ESC) that start a sequence this codec parses. */
#define IS_ISO2022ESC(c2) \
        ((c2) == '$' || (c2) == '(' || (c2) == ')' || \
         (c2) == '.' || (c2) == '&')
    /* this is not a complete list of ISO-2022 escape sequence headers.
     * but, it's enough to implement CJK instances of iso-2022. */

/* Sentinel values returned by the per-charset encoder/decoder hooks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */
/* Bits kept in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* state->c[0..3] hold the charset marks designated to G0..G3.
 * All of these macros expect a pointer named `state` in scope. */
#define STATE_GETG(dn)          ((state)->c[dn])
#define STATE_SETG(dn, v)       do { ((state)->c[dn]) = (v); } while (0)

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)

#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Flag accessors for state->c[4]. */
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_SETFLAG(f)        do { ((state)->c[4]) |= (f); } while (0)
#define STATE_CLEARFLAG(f)      do { ((state)->c[4]) &= ~(f); } while (0)
#define STATE_CLEARFLAGS()      do { ((state)->c[4]) = 0; } while (0)

/* Accessors for the per-encoding configuration; they expect a pointer
 * named `config` in scope and view it as a struct iso2022_config. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04
101
102/*-*- internal data structures -*-*/
103
104typedef int (*iso2022_init_func)(void);
105typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data);
106typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length);
107
108struct iso2022_designation {
109    unsigned char mark;
110    unsigned char plane;
111    unsigned char width;
112    iso2022_init_func initializer;
113    iso2022_decode_func decoder;
114    iso2022_encode_func encoder;
115};
116
/* Configuration of one concrete ISO-2022 encoding. */
struct iso2022_config {
    int flags;  /* bitwise OR of NO_SHIFT, USE_G2, USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs */
};
121
122/*-*- iso-2022 codec implementation -*-*/
123
124CODEC_INIT(iso2022)
125{
126    const struct iso2022_designation *desig;
127    for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
128        if (desig->initializer != NULL && desig->initializer() != 0)
129            return -1;
130    return 0;
131}
132
133ENCODER_INIT(iso2022)
134{
135    STATE_CLEARFLAGS();
136    STATE_SETG0(CHARSET_ASCII);
137    STATE_SETG1(CHARSET_ASCII);
138    return 0;
139}
140
141ENCODER_RESET(iso2022)
142{
143    if (STATE_GETFLAG(F_SHIFTED)) {
144        WRITEBYTE1(SI);
145        NEXT_OUT(1);
146        STATE_CLEARFLAG(F_SHIFTED);
147    }
148    if (STATE_G0 != CHARSET_ASCII) {
149        WRITEBYTE3(ESC, '(', 'B');
150        NEXT_OUT(3);
151        STATE_SETG0(CHARSET_ASCII);
152    }
153    return 0;
154}
155
156ENCODER(iso2022)
157{
158    while (*inpos < inlen) {
159        const struct iso2022_designation *dsg;
160        DBCHAR encoded;
161        Py_UCS4 c = INCHAR1;
162        Py_ssize_t insize;
163
164        if (c < 0x80) {
165            if (STATE_G0 != CHARSET_ASCII) {
166                WRITEBYTE3(ESC, '(', 'B');
167                STATE_SETG0(CHARSET_ASCII);
168                NEXT_OUT(3);
169            }
170            if (STATE_GETFLAG(F_SHIFTED)) {
171                WRITEBYTE1(SI);
172                STATE_CLEARFLAG(F_SHIFTED);
173                NEXT_OUT(1);
174            }
175            WRITEBYTE1((unsigned char)c);
176            NEXT(1, 1);
177            continue;
178        }
179
180        insize = 1;
181
182        encoded = MAP_UNMAPPABLE;
183        for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
184            Py_ssize_t length = 1;
185            encoded = dsg->encoder(&c, &length);
186            if (encoded == MAP_MULTIPLE_AVAIL) {
187                /* this implementation won't work for pair
188                 * of non-bmp characters. */
189                if (inlen - *inpos < 2) {
190                    if (!(flags & MBENC_FLUSH))
191                        return MBERR_TOOFEW;
192                    length = -1;
193                }
194                else
195                    length = 2;
196                encoded = dsg->encoder(&c, &length);
197                if (encoded != MAP_UNMAPPABLE) {
198                    insize = length;
199                    break;
200                }
201            }
202            else if (encoded != MAP_UNMAPPABLE)
203                break;
204        }
205
206        if (!dsg->mark)
207            return 1;
208        assert(dsg->width == 1 || dsg->width == 2);
209
210        switch (dsg->plane) {
211        case 0: /* G0 */
212            if (STATE_GETFLAG(F_SHIFTED)) {
213                WRITEBYTE1(SI);
214                STATE_CLEARFLAG(F_SHIFTED);
215                NEXT_OUT(1);
216            }
217            if (STATE_G0 != dsg->mark) {
218                if (dsg->width == 1) {
219                    WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
220                    STATE_SETG0(dsg->mark);
221                    NEXT_OUT(3);
222                }
223                else if (dsg->mark == CHARSET_JISX0208) {
224                    WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
225                    STATE_SETG0(dsg->mark);
226                    NEXT_OUT(3);
227                }
228                else {
229                    WRITEBYTE4(ESC, '$', '(',
230                        ESCMARK(dsg->mark));
231                    STATE_SETG0(dsg->mark);
232                    NEXT_OUT(4);
233                }
234            }
235            break;
236        case 1: /* G1 */
237            if (STATE_G1 != dsg->mark) {
238                if (dsg->width == 1) {
239                    WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
240                    STATE_SETG1(dsg->mark);
241                    NEXT_OUT(3);
242                }
243                else {
244                    WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
245                    STATE_SETG1(dsg->mark);
246                    NEXT_OUT(4);
247                }
248            }
249            if (!STATE_GETFLAG(F_SHIFTED)) {
250                WRITEBYTE1(SO);
251                STATE_SETFLAG(F_SHIFTED);
252                NEXT_OUT(1);
253            }
254            break;
255        default: /* G2 and G3 is not supported: no encoding in
256                  * CJKCodecs are using them yet */
257            return MBERR_INTERNAL;
258        }
259
260        if (dsg->width == 1) {
261            WRITEBYTE1((unsigned char)encoded);
262            NEXT_OUT(1);
263        }
264        else {
265            WRITEBYTE2(encoded >> 8, encoded & 0xff);
266            NEXT_OUT(2);
267        }
268        NEXT_INCHAR(insize);
269    }
270
271    return 0;
272}
273
274DECODER_INIT(iso2022)
275{
276    STATE_CLEARFLAGS();
277    STATE_SETG0(CHARSET_ASCII);
278    STATE_SETG1(CHARSET_ASCII);
279    STATE_SETG2(CHARSET_ASCII);
280    return 0;
281}
282
283DECODER_RESET(iso2022)
284{
285    STATE_SETG0(CHARSET_ASCII);
286    STATE_CLEARFLAG(F_SHIFTED);
287    return 0;
288}
289
290static Py_ssize_t
291iso2022processesc(const void *config, MultibyteCodec_State *state,
292                  const unsigned char **inbuf, Py_ssize_t *inleft)
293{
294    unsigned char charset, designation;
295    Py_ssize_t i, esclen = 0;
296
297    for (i = 1;i < MAX_ESCSEQLEN;i++) {
298        if (i >= *inleft)
299            return MBERR_TOOFEW;
300        if (IS_ESCEND((*inbuf)[i])) {
301            esclen = i + 1;
302            break;
303        }
304        else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
305                 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
306            i += 2;
307        }
308    }
309
310    switch (esclen) {
311    case 0:
312        return 1; /* unterminated escape sequence */
313    case 3:
314        if (INBYTE2 == '$') {
315            charset = INBYTE3 | CHARSET_DBCS;
316            designation = 0;
317        }
318        else {
319            charset = INBYTE3;
320            if (INBYTE2 == '(')
321                designation = 0;
322            else if (INBYTE2 == ')')
323                designation = 1;
324            else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
325                designation = 2;
326            else
327                return 3;
328        }
329        break;
330    case 4:
331        if (INBYTE2 != '$')
332            return 4;
333
334        charset = INBYTE4 | CHARSET_DBCS;
335        if (INBYTE3 == '(')
336            designation = 0;
337        else if (INBYTE3 == ')')
338            designation = 1;
339        else
340            return 4;
341        break;
342    case 6: /* designation with prefix */
343        if (CONFIG_ISSET(USE_JISX0208_EXT) &&
344            (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
345            (*inbuf)[5] == 'B') {
346            charset = 'B' | CHARSET_DBCS;
347            designation = 0;
348        }
349        else
350            return 6;
351        break;
352    default:
353        return esclen;
354    }
355
356    /* raise error when the charset is not designated for this encoding */
357    if (charset != CHARSET_ASCII) {
358        const struct iso2022_designation *dsg;
359
360        for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
361            if (dsg->mark == charset)
362                break;
363        }
364        if (!dsg->mark)
365            return esclen;
366    }
367
368    STATE_SETG(designation, charset);
369    *inleft -= esclen;
370    (*inbuf) += esclen;
371    return 0;
372}
373
/*
 * Decode one ISO-8859-7 byte value `c` and emit it with OUTCHAR.
 *
 * Expands to an if / else-if chain WITHOUT a final else: the caller must
 * append its own `else` branch, which is taken for unassigned bytes.
 *   - c < 0xa0                       -> emitted unchanged
 *   - 0xa0..0xbf, bit set in mask 1  -> emitted unchanged
 *   - 0xb4..0xfe, >= 0xd4 or bit set in mask 2 -> emitted as 0x02d0 + c
 *   - 0xa1, 0xa2, 0xaf               -> U+2018, U+2019, U+2015
 * The masks are tested with unsigned long constants because the shift
 * count can reach 31, which is undefined for a signed 32-bit long.
 * `writer` is not referenced by the expansion itself.
 */
#define ISO8859_7_DECODE(c, writer)                                   \
    if ((c) < 0xa0) {                                                 \
        OUTCHAR(c);                                                   \
    } else if ((c) < 0xc0 &&                                          \
               (0x288f3bc9UL & (1UL << ((c)-0xa0)))) {                \
        OUTCHAR(c);                                                   \
    } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||          \
             (0xbffffd77UL & (1UL << ((c)-0xb4))))) {                 \
        OUTCHAR(0x02d0 + (c));                                        \
    } else if ((c) == 0xa1) {                                         \
        OUTCHAR(0x2018);                                              \
    } else if ((c) == 0xa2) {                                         \
        OUTCHAR(0x2019);                                              \
    } else if ((c) == 0xaf) {                                         \
        OUTCHAR(0x2015);                                              \
    }
389
390static Py_ssize_t
391iso2022processg2(const void *config, MultibyteCodec_State *state,
392                 const unsigned char **inbuf, Py_ssize_t *inleft,
393                 _PyUnicodeWriter *writer)
394{
395    /* not written to use encoder, decoder functions because only few
396     * encodings use G2 designations in CJKCodecs */
397    if (STATE_G2 == CHARSET_ISO8859_1) {
398        if (INBYTE3 < 0x80)
399            OUTCHAR(INBYTE3 + 0x80);
400        else
401            return 3;
402    }
403    else if (STATE_G2 == CHARSET_ISO8859_7) {
404        ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
405        else
406            return 3;
407    }
408    else if (STATE_G2 == CHARSET_ASCII) {
409        if (INBYTE3 & 0x80)
410            return 3;
411        else
412            OUTCHAR(INBYTE3);
413    }
414    else
415        return MBERR_INTERNAL;
416
417    (*inbuf) += 3;
418    *inleft -= 3;
419    return 0;
420}
421
422DECODER(iso2022)
423{
424    const struct iso2022_designation *dsgcache = NULL;
425
426    while (inleft > 0) {
427        unsigned char c = INBYTE1;
428        Py_ssize_t err;
429
430        if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
431            /* ESC throughout mode:
432             * for non-iso2022 escape sequences */
433            OUTCHAR(c); /* assume as ISO-8859-1 */
434            NEXT_IN(1);
435            if (IS_ESCEND(c)) {
436                STATE_CLEARFLAG(F_ESCTHROUGHOUT);
437            }
438            continue;
439        }
440
441        switch (c) {
442        case ESC:
443            REQUIRE_INBUF(2);
444            if (IS_ISO2022ESC(INBYTE2)) {
445                err = iso2022processesc(config, state,
446                                        inbuf, &inleft);
447                if (err != 0)
448                    return err;
449            }
450            else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
451                REQUIRE_INBUF(3);
452                err = iso2022processg2(config, state,
453                                       inbuf, &inleft, writer);
454                if (err != 0)
455                    return err;
456            }
457            else {
458                OUTCHAR(ESC);
459                STATE_SETFLAG(F_ESCTHROUGHOUT);
460                NEXT_IN(1);
461            }
462            break;
463        case SI:
464            if (CONFIG_ISSET(NO_SHIFT))
465                goto bypass;
466            STATE_CLEARFLAG(F_SHIFTED);
467            NEXT_IN(1);
468            break;
469        case SO:
470            if (CONFIG_ISSET(NO_SHIFT))
471                goto bypass;
472            STATE_SETFLAG(F_SHIFTED);
473            NEXT_IN(1);
474            break;
475        case LF:
476            STATE_CLEARFLAG(F_SHIFTED);
477            OUTCHAR(LF);
478            NEXT_IN(1);
479            break;
480        default:
481            if (c < 0x20) /* C0 */
482                goto bypass;
483            else if (c >= 0x80)
484                return 1;
485            else {
486                const struct iso2022_designation *dsg;
487                unsigned char charset;
488                Py_UCS4 decoded;
489
490                if (STATE_GETFLAG(F_SHIFTED))
491                    charset = STATE_G1;
492                else
493                    charset = STATE_G0;
494
495                if (charset == CHARSET_ASCII) {
496bypass:
497                    OUTCHAR(c);
498                    NEXT_IN(1);
499                    break;
500                }
501
502                if (dsgcache != NULL &&
503                    dsgcache->mark == charset)
504                        dsg = dsgcache;
505                else {
506                    for (dsg = CONFIG_DESIGNATIONS;
507                         dsg->mark != charset
508#ifdef Py_DEBUG
509                            && dsg->mark != '\0'
510#endif
511                         ; dsg++)
512                    {
513                        /* noop */
514                    }
515                    assert(dsg->mark != '\0');
516                    dsgcache = dsg;
517                }
518
519                REQUIRE_INBUF(dsg->width);
520                decoded = dsg->decoder(*inbuf);
521                if (decoded == MAP_UNMAPPABLE)
522                    return dsg->width;
523
524                if (decoded < 0x10000) {
525                    OUTCHAR(decoded);
526                }
527                else if (decoded < 0x30000) {
528                    OUTCHAR(decoded);
529                }
530                else { /* JIS X 0213 pairs */
531                    OUTCHAR2(decoded >> 16, decoded & 0xffff);
532                }
533                NEXT_IN(dsg->width);
534            }
535            break;
536        }
537    }
538    return 0;
539}
540
541/*-*- mapping table holders -*-*/
542
543#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
544#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;
545
546/* kr */
547ENCMAP(cp949)
548DECMAP(ksx1001)
549
550/* jp */
551ENCMAP(jisxcommon)
552DECMAP(jisx0208)
553DECMAP(jisx0212)
554ENCMAP(jisx0213_bmp)
555DECMAP(jisx0213_1_bmp)
556DECMAP(jisx0213_2_bmp)
557ENCMAP(jisx0213_emp)
558DECMAP(jisx0213_1_emp)
559DECMAP(jisx0213_2_emp)
560
561/* cn */
562ENCMAP(gbcommon)
563DECMAP(gb2312)
564
565/* tw */
566
567/*-*- mapping access functions -*-*/
568
569static int
570ksx1001_init(void)
571{
572    static int initialized = 0;
573
574    if (!initialized && (
575                    IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
576                    IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
577        return -1;
578    initialized = 1;
579    return 0;
580}
581
582static Py_UCS4
583ksx1001_decoder(const unsigned char *data)
584{
585    Py_UCS4 u;
586    if (TRYMAP_DEC(ksx1001, u, data[0], data[1]))
587        return u;
588    else
589        return MAP_UNMAPPABLE;
590}
591
592static DBCHAR
593ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length)
594{
595    DBCHAR coded;
596    assert(*length == 1);
597    if (*data < 0x10000) {
598        if (TRYMAP_ENC(cp949, coded, *data)) {
599            if (!(coded & 0x8000))
600                return coded;
601        }
602    }
603    return MAP_UNMAPPABLE;
604}
605
606static int
607jisx0208_init(void)
608{
609    static int initialized = 0;
610
611    if (!initialized && (
612                    IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
613                    IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
614        return -1;
615    initialized = 1;
616    return 0;
617}
618
619static Py_UCS4
620jisx0208_decoder(const unsigned char *data)
621{
622    Py_UCS4 u;
623    if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
624        return 0xff3c;
625    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
626        return u;
627    else
628        return MAP_UNMAPPABLE;
629}
630
631static DBCHAR
632jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length)
633{
634    DBCHAR coded;
635    assert(*length == 1);
636    if (*data < 0x10000) {
637        if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
638            return 0x2140;
639        else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
640            if (!(coded & 0x8000))
641                return coded;
642        }
643    }
644    return MAP_UNMAPPABLE;
645}
646
647static int
648jisx0212_init(void)
649{
650    static int initialized = 0;
651
652    if (!initialized && (
653                    IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
654                    IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
655        return -1;
656    initialized = 1;
657    return 0;
658}
659
660static Py_UCS4
661jisx0212_decoder(const unsigned char *data)
662{
663    Py_UCS4 u;
664    if (TRYMAP_DEC(jisx0212, u, data[0], data[1]))
665        return u;
666    else
667        return MAP_UNMAPPABLE;
668}
669
670static DBCHAR
671jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length)
672{
673    DBCHAR coded;
674    assert(*length == 1);
675    if (*data < 0x10000) {
676        if (TRYMAP_ENC(jisxcommon, coded, *data)) {
677            if (coded & 0x8000)
678                return coded & 0x7fff;
679        }
680    }
681    return MAP_UNMAPPABLE;
682}
683
684static int
685jisx0213_init(void)
686{
687    static int initialized = 0;
688
689    if (!initialized && (
690                    jisx0208_init() ||
691                    IMPORT_MAP(jp, jisx0213_bmp,
692                               &jisx0213_bmp_encmap, NULL) ||
693                    IMPORT_MAP(jp, jisx0213_1_bmp,
694                               NULL, &jisx0213_1_bmp_decmap) ||
695                    IMPORT_MAP(jp, jisx0213_2_bmp,
696                               NULL, &jisx0213_2_bmp_decmap) ||
697                    IMPORT_MAP(jp, jisx0213_emp,
698                               &jisx0213_emp_encmap, NULL) ||
699                    IMPORT_MAP(jp, jisx0213_1_emp,
700                               NULL, &jisx0213_1_emp_decmap) ||
701                    IMPORT_MAP(jp, jisx0213_2_emp,
702                               NULL, &jisx0213_2_emp_decmap) ||
703                    IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
704                               &jisx0213_pair_decmap)))
705        return -1;
706    initialized = 1;
707    return 0;
708}
709
710#define config ((void *)2000)
711static Py_UCS4
712jisx0213_2000_1_decoder(const unsigned char *data)
713{
714    Py_UCS4 u;
715    EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
716    else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
717        return 0xff3c;
718    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
719        ;
720    else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
721        ;
722    else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
723        u |= 0x20000;
724    else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
725        ;
726    else
727        return MAP_UNMAPPABLE;
728    return u;
729}
730
731static Py_UCS4
732jisx0213_2000_2_decoder(const unsigned char *data)
733{
734    Py_UCS4 u;
735    EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1])
736    if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
737        ;
738    else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
739        u |= 0x20000;
740    else
741        return MAP_UNMAPPABLE;
742    return u;
743}
744#undef config
745
746static Py_UCS4
747jisx0213_2004_1_decoder(const unsigned char *data)
748{
749    Py_UCS4 u;
750    if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
751        return 0xff3c;
752    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
753        ;
754    else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
755        ;
756    else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
757        u |= 0x20000;
758    else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
759        ;
760    else
761        return MAP_UNMAPPABLE;
762    return u;
763}
764
765static Py_UCS4
766jisx0213_2004_2_decoder(const unsigned char *data)
767{
768    Py_UCS4 u;
769    if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
770        ;
771    else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
772        u |= 0x20000;
773    else
774        return MAP_UNMAPPABLE;
775    return u;
776}
777
778static DBCHAR
779jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config)
780{
781    DBCHAR coded;
782
783    switch (*length) {
784    case 1: /* first character */
785        if (*data >= 0x10000) {
786            if ((*data) >> 16 == 0x20000 >> 16) {
787                EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
788                else if (TRYMAP_ENC(jisx0213_emp, coded, (*data) & 0xffff))
789                    return coded;
790            }
791            return MAP_UNMAPPABLE;
792        }
793
794        EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
795        else if (TRYMAP_ENC(jisx0213_bmp, coded, *data)) {
796            if (coded == MULTIC)
797                return MAP_MULTIPLE_AVAIL;
798        }
799        else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
800            if (coded & 0x8000)
801                return MAP_UNMAPPABLE;
802        }
803        else
804            return MAP_UNMAPPABLE;
805        return coded;
806
807    case 2: /* second character of unicode pair */
808        coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
809                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
810        if (coded != DBCINV)
811            return coded;
812        /* fall through */
813
814    case -1: /* flush unterminated */
815        *length = 1;
816        coded = find_pairencmap((ucs2_t)data[0], 0,
817                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
818        if (coded == DBCINV)
819            return MAP_UNMAPPABLE;
820        else
821            return coded;
822        break;
823
824    default:
825        return MAP_UNMAPPABLE;
826    }
827}
828
829static DBCHAR
830jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
831{
832    DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
833    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
834        return coded;
835    else if (coded & 0x8000)
836        return MAP_UNMAPPABLE;
837    else
838        return coded;
839}
840
841static DBCHAR
842jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
843{
844    DBCHAR coded;
845    Py_ssize_t ilength = *length;
846
847    coded = jisx0213_encoder(data, length, (void *)2000);
848    switch (ilength) {
849    case 1:
850        if (coded == MAP_MULTIPLE_AVAIL)
851            return MAP_MULTIPLE_AVAIL;
852        else
853            return MAP_UNMAPPABLE;
854    case 2:
855        if (*length != 2)
856            return MAP_UNMAPPABLE;
857        else
858            return coded;
859    default:
860        return MAP_UNMAPPABLE;
861    }
862}
863
864static DBCHAR
865jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
866{
867    DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
868    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
869        return coded;
870    else if (coded & 0x8000)
871        return coded & 0x7fff;
872    else
873        return MAP_UNMAPPABLE;
874}
875
876static DBCHAR
877jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
878{
879    DBCHAR coded = jisx0213_encoder(data, length, NULL);
880    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
881        return coded;
882    else if (coded & 0x8000)
883        return MAP_UNMAPPABLE;
884    else
885        return coded;
886}
887
888static DBCHAR
889jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
890{
891    DBCHAR coded;
892    Py_ssize_t ilength = *length;
893
894    coded = jisx0213_encoder(data, length, NULL);
895    switch (ilength) {
896    case 1:
897        if (coded == MAP_MULTIPLE_AVAIL)
898            return MAP_MULTIPLE_AVAIL;
899        else
900            return MAP_UNMAPPABLE;
901    case 2:
902        if (*length != 2)
903            return MAP_UNMAPPABLE;
904        else
905            return coded;
906    default:
907        return MAP_UNMAPPABLE;
908    }
909}
910
911static DBCHAR
912jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
913{
914    DBCHAR coded = jisx0213_encoder(data, length, NULL);
915    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
916        return coded;
917    else if (coded & 0x8000)
918        return coded & 0x7fff;
919    else
920        return MAP_UNMAPPABLE;
921}
922
923static Py_UCS4
924jisx0201_r_decoder(const unsigned char *data)
925{
926    Py_UCS4 u;
927    JISX0201_R_DECODE_CHAR(*data, u)
928    else
929        return MAP_UNMAPPABLE;
930    return u;
931}
932
933static DBCHAR
934jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length)
935{
936    DBCHAR coded;
937    JISX0201_R_ENCODE(*data, coded)
938    else
939        return MAP_UNMAPPABLE;
940    return coded;
941}
942
943static Py_UCS4
944jisx0201_k_decoder(const unsigned char *data)
945{
946    Py_UCS4 u;
947    JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
948    else
949        return MAP_UNMAPPABLE;
950    return u;
951}
952
953static DBCHAR
954jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length)
955{
956    DBCHAR coded;
957    JISX0201_K_ENCODE(*data, coded)
958    else
959        return MAP_UNMAPPABLE;
960    return coded - 0x80;
961}
962
963static int
964gb2312_init(void)
965{
966    static int initialized = 0;
967
968    if (!initialized && (
969                    IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
970                    IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
971        return -1;
972    initialized = 1;
973    return 0;
974}
975
976static Py_UCS4
977gb2312_decoder(const unsigned char *data)
978{
979    Py_UCS4 u;
980    if (TRYMAP_DEC(gb2312, u, data[0], data[1]))
981        return u;
982    else
983        return MAP_UNMAPPABLE;
984}
985
986static DBCHAR
987gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length)
988{
989    DBCHAR coded;
990    assert(*length == 1);
991    if (*data < 0x10000) {
992        if (TRYMAP_ENC(gbcommon, coded, *data)) {
993            if (!(coded & 0x8000))
994                return coded;
995        }
996    }
997    return MAP_UNMAPPABLE;
998}
999
1000
1001static Py_UCS4
1002dummy_decoder(const unsigned char *data)
1003{
1004    return MAP_UNMAPPABLE;
1005}
1006
1007static DBCHAR
1008dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length)
1009{
1010    return MAP_UNMAPPABLE;
1011}
1012
1013/*-*- registry tables -*-*/
1014
1015#define REGISTRY_KSX1001_G0     { CHARSET_KSX1001, 0, 2,                \
1016                  ksx1001_init,                                         \
1017                  ksx1001_decoder, ksx1001_encoder }
1018#define REGISTRY_KSX1001_G1     { CHARSET_KSX1001, 1, 2,                \
1019                  ksx1001_init,                                         \
1020                  ksx1001_decoder, ksx1001_encoder }
1021#define REGISTRY_JISX0201_R     { CHARSET_JISX0201_R, 0, 1,             \
1022                  NULL,                                                 \
1023                  jisx0201_r_decoder, jisx0201_r_encoder }
1024#define REGISTRY_JISX0201_K     { CHARSET_JISX0201_K, 0, 1,             \
1025                  NULL,                                                 \
1026                  jisx0201_k_decoder, jisx0201_k_encoder }
1027#define REGISTRY_JISX0208       { CHARSET_JISX0208, 0, 2,               \
1028                  jisx0208_init,                                        \
1029                  jisx0208_decoder, jisx0208_encoder }
1030#define REGISTRY_JISX0208_O     { CHARSET_JISX0208_O, 0, 2,             \
1031                  jisx0208_init,                                        \
1032                  jisx0208_decoder, jisx0208_encoder }
1033#define REGISTRY_JISX0212       { CHARSET_JISX0212, 0, 2,               \
1034                  jisx0212_init,                                        \
1035                  jisx0212_decoder, jisx0212_encoder }
/*
 * Designation entries for JIS X 0213, in 2000 and 2004 flavours.  All of
 * them share jisx0213_init.
 *
 *  - The *_1 entries use an edition-specific mark
 *    (CHARSET_JISX0213_2000_1 / CHARSET_JISX0213_2004_1).
 *  - The *_1_PAIRONLY entries are identical to the matching *_1 entry
 *    except that the encoder is the *_encoder_paironly function.
 *  - The *_2 entries of both editions share the mark CHARSET_JISX0213_2
 *    but name edition-specific decoder/encoder functions.
 */
#define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2,       \
                  jisx0213_init,                                        \
                  jisx0213_2000_1_decoder,                              \
                  jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \
                  jisx0213_init,                                        \
                  jisx0213_2000_1_decoder,                              \
                  jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2,            \
                  jisx0213_init,                                        \
                  jisx0213_2000_2_decoder,                              \
                  jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2,       \
                  jisx0213_init,                                        \
                  jisx0213_2004_1_decoder,                              \
                  jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \
                  jisx0213_init,                                        \
                  jisx0213_2004_1_decoder,                              \
                  jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2,            \
                  jisx0213_init,                                        \
                  jisx0213_2004_2_decoder,                              \
                  jisx0213_2004_2_encoder }
/* Designation entry for GB 2312 (mark CHARSET_GB2312, values 0 and 2). */
#define REGISTRY_GB2312         { CHARSET_GB2312, 0, 2,                 \
                  gb2312_init,                                          \
                  gb2312_decoder, gb2312_encoder }
/*
 * Designation entries for CNS 11643 (_1 and _2 marks).  The second value
 * is 1 for CNS11643_1 and 2 for CNS11643_2; both share cns11643_init.
 * NOTE(review): none of the designation tables visible in this part of the
 * file references these two macros -- confirm whether they are still used.
 */
#define REGISTRY_CNS11643_1     { CHARSET_CNS11643_1, 1, 2,             \
                  cns11643_init,                                        \
                  cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2     { CHARSET_CNS11643_2, 2, 2,             \
                  cns11643_init,                                        \
                  cns11643_2_decoder, cns11643_2_encoder }
/*
 * Designation entries for ISO 8859-1 and ISO 8859-7.  Both use 2 as the
 * second value and 1 as the third, have no init function, and point at
 * dummy_decoder/dummy_encoder, which always return MAP_UNMAPPABLE.
 */
#define REGISTRY_ISO8859_1      { CHARSET_ISO8859_1, 2, 1,              \
                  NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7      { CHARSET_ISO8859_7, 2, 1,              \
                  NULL, dummy_decoder, dummy_encoder }
/* All-zero entry; every designation table below ends with it. */
#define REGISTRY_SENTINEL       { 0, }
/*
 * CONFIGDEF(var, attrs) defines `iso2022_<var>_config`, a static const
 * struct iso2022_config initialized with `attrs` followed by the table
 * `iso2022_<var>_designations`, which must already be declared.
 * The expansion carries its own terminating semicolon, so invocations are
 * written without one.
 */
#define CONFIGDEF(var, attrs)                                           \
    static const struct iso2022_config iso2022_##var##_config = {       \
        attrs, iso2022_##var##_designations                             \
    };
1078
1079static const struct iso2022_designation iso2022_kr_designations[] = {
1080    REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
1081};
1082CONFIGDEF(kr, 0)
1083
1084static const struct iso2022_designation iso2022_jp_designations[] = {
1085    REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1086    REGISTRY_SENTINEL
1087};
1088CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
1089
1090static const struct iso2022_designation iso2022_jp_1_designations[] = {
1091    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1092    REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1093};
1094CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
1095
1096static const struct iso2022_designation iso2022_jp_2_designations[] = {
1097    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
1098    REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1099    REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
1100};
1101CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
1102
1103static const struct iso2022_designation iso2022_jp_2004_designations[] = {
1104    REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
1105    REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
1106};
1107CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
1108
1109static const struct iso2022_designation iso2022_jp_3_designations[] = {
1110    REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
1111    REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
1112};
1113CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
1114
1115static const struct iso2022_designation iso2022_jp_ext_designations[] = {
1116    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1117    REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1118};
1119CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1120
1121
/*
 * The mapping list of this module is intentionally empty: nothing is
 * declared between BEGIN_MAPPINGS_LIST and END_MAPPINGS_LIST.
 * NOTE(review): USING_IMPORTED_MAPS is defined at the top of the file, so
 * the tables are presumably imported from other codec modules -- confirm.
 */
BEGIN_MAPPINGS_LIST
  /* no mapping table here */
END_MAPPINGS_LIST
1125
/*
 * ISO2022_CODEC(variation) builds one entry of the codecs list:
 *   - the name string "iso2022_<variation>" (literal + stringized argument),
 *   - the address of iso2022_<variation>_config,
 *   - iso2022_codec_init,
 *   - whatever _STATEFUL_METHODS(iso2022) expands to.
 * The expansion ends with a comma, so entries are listed back to back
 * without separators.
 */
#define ISO2022_CODEC(variation) {              \
    "iso2022_" #variation,                      \
    &iso2022_##variation##_config,              \
    iso2022_codec_init,                         \
    _STATEFUL_METHODS(iso2022)                  \
},
1132
/*
 * Codecs provided by this module.  Each entry names a variation whose
 * iso2022_<variation>_config object is defined by a CONFIGDEF invocation
 * earlier in the file.
 */
BEGIN_CODECS_LIST
  ISO2022_CODEC(kr)
  ISO2022_CODEC(jp)
  ISO2022_CODEC(jp_1)
  ISO2022_CODEC(jp_2)
  ISO2022_CODEC(jp_2004)
  ISO2022_CODEC(jp_3)
  ISO2022_CODEC(jp_ext)
END_CODECS_LIST
1142
/*
 * NOTE(review): I_AM_A_MODULE_FOR is defined outside this file (presumably
 * in cjkcodecs.h) and appears to emit the module boilerplate for the
 * "iso2022" codec collection -- confirm against its definition.
 */
I_AM_A_MODULE_FOR(iso2022)
1144