1/*
2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7#include "cjkcodecs.h"
8#include "mappings_cn.h"
9
/*
 * On AIX, `hz` is predefined as 100.  Drop that definition here so it
 * cannot conflict with the name of the hz codec.
 */
#if defined(_AIX)
#  undef hz
#endif
17
/* GBK and GB2312 map a few code points differently; they are listed below:
19 *
20 *              gb2312                          gbk
21 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
22 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
23 * A844         undefined                       U+2015 HORIZONTAL BAR
24 */
25
/*
 * GBK_DECODE(dc1, dc2, writer) -- decode one two-byte GBK sequence.
 *
 * dc1 and dc2 are the first and second byte of the sequence.  The three
 * sequences that GBK maps differently from GB2312 (A1AA, A844, A1A4) are
 * handled first.  Anything else is looked up in the gb2312 table with the
 * high bit of both bytes flipped off, and then in the gbkext table with
 * the bytes as given.
 *
 * The expansion is deliberately an open if / else-if chain: the call site
 * appends the final `else` that handles an unmapped sequence, so the macro
 * must not be wrapped in do { } while (0).  A local variable named
 * `decoded` has to be in scope at the call site.  The `writer` parameter
 * is not referenced in this expansion; output goes through OUTCHAR
 * (NOTE(review): OUTCHAR presumably picks the writer up from the
 * enclosing scope -- confirm in cjkcodecs.h).
 *
 * Every use of dc1/dc2 is parenthesized so that an expression argument
 * such as `a | b` is not re-associated by the `^ 0x80` applied to it.
 */
#define GBK_DECODE(dc1, dc2, writer)                                \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) {                           \
        OUTCHAR(0x2014);                                            \
    }                                                               \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) {                      \
        OUTCHAR(0x2015);                                            \
    }                                                               \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) {                      \
        OUTCHAR(0x00b7);                                            \
    }                                                               \
    else if (TRYMAP_DEC(gb2312, decoded,                            \
                        (dc1) ^ 0x80, (dc2) ^ 0x80)) {              \
        OUTCHAR(decoded);                                           \
    }                                                               \
    else if (TRYMAP_DEC(gbkext, decoded, (dc1), (dc2))) {           \
        OUTCHAR(decoded);                                           \
    }
42
/*
 * GBK_ENCODE(code, assi) -- map one code point to its GBK double-byte code.
 *
 * `code` is the code point to encode; `assi` is the lvalue that receives
 * the result.  U+00B7, U+2014 and U+2015 get fixed GBK values.  U+30FB is
 * never looked up.  Every other code point is tried against the gbcommon
 * table.
 *
 * The expansion is an open if / else-if chain: the call site supplies the
 * trailing `else` that runs when nothing matched.
 */
#define GBK_ENCODE(code, assi)                                      \
    if ((code) == 0x00b7) {                                         \
        (assi) = 0xa1a4;                                            \
    }                                                               \
    else if ((code) == 0x2014) {                                    \
        (assi) = 0xa1aa;                                            \
    }                                                               \
    else if ((code) == 0x2015) {                                    \
        (assi) = 0xa844;                                            \
    }                                                               \
    else if ((code) != 0x30fb &&                                    \
             TRYMAP_ENC(gbcommon, assi, code)) {                    \
        /* assi was filled in by the table lookup */                \
    }
53
/*
 * Codecs in this file use the first byte of MultibyteCodec_State.c[8]
 * to store a 0 or 1 state value.  CN_STATE_OFFSET is the index of that
 * byte.  In the HZ codec below, 0 is ASCII mode and 1 is GB mode.
 */
#define CN_STATE_OFFSET 0
59
60/*
61 * GB2312 codec
62 */
63
64ENCODER(gb2312)
65{
66    while (*inpos < inlen) {
67        Py_UCS4 c = INCHAR1;
68        DBCHAR code;
69
70        if (c < 0x80) {
71            WRITEBYTE1((unsigned char)c);
72            NEXT(1, 1);
73            continue;
74        }
75
76        if (c > 0xFFFF)
77            return 1;
78
79        REQUIRE_OUTBUF(2);
80        if (TRYMAP_ENC(gbcommon, code, c))
81            ;
82        else
83            return 1;
84
85        if (code & 0x8000) /* MSB set: GBK */
86            return 1;
87
88        OUTBYTE1((code >> 8) | 0x80);
89        OUTBYTE2((code & 0xFF) | 0x80);
90        NEXT(1, 2);
91    }
92
93    return 0;
94}
95
96DECODER(gb2312)
97{
98    while (inleft > 0) {
99        unsigned char c = **inbuf;
100        Py_UCS4 decoded;
101
102        if (c < 0x80) {
103            OUTCHAR(c);
104            NEXT_IN(1);
105            continue;
106        }
107
108        REQUIRE_INBUF(2);
109        if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
110            OUTCHAR(decoded);
111            NEXT_IN(2);
112        }
113        else
114            return 1;
115    }
116
117    return 0;
118}
119
120
121/*
122 * GBK codec
123 */
124
125ENCODER(gbk)
126{
127    while (*inpos < inlen) {
128        Py_UCS4 c = INCHAR1;
129        DBCHAR code;
130
131        if (c < 0x80) {
132            WRITEBYTE1((unsigned char)c);
133            NEXT(1, 1);
134            continue;
135        }
136
137        if (c > 0xFFFF)
138            return 1;
139
140        REQUIRE_OUTBUF(2);
141
142        GBK_ENCODE(c, code)
143        else
144            return 1;
145
146        OUTBYTE1((code >> 8) | 0x80);
147        if (code & 0x8000)
148            OUTBYTE2((code & 0xFF)); /* MSB set: GBK */
149        else
150            OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
151        NEXT(1, 2);
152    }
153
154    return 0;
155}
156
157DECODER(gbk)
158{
159    while (inleft > 0) {
160        unsigned char c = INBYTE1;
161        Py_UCS4 decoded;
162
163        if (c < 0x80) {
164            OUTCHAR(c);
165            NEXT_IN(1);
166            continue;
167        }
168
169        REQUIRE_INBUF(2);
170
171        GBK_DECODE(c, INBYTE2, writer)
172        else
173            return 1;
174
175        NEXT_IN(2);
176    }
177
178    return 0;
179}
180
181
182/*
183 * GB18030 codec
184 */
185
186ENCODER(gb18030)
187{
188    while (*inpos < inlen) {
189        Py_UCS4 c = INCHAR1;
190        DBCHAR code;
191
192        if (c < 0x80) {
193            WRITEBYTE1(c);
194            NEXT(1, 1);
195            continue;
196        }
197
198        if (c >= 0x10000) {
199            Py_UCS4 tc = c - 0x10000;
200            assert (c <= 0x10FFFF);
201
202            REQUIRE_OUTBUF(4);
203
204            OUTBYTE4((unsigned char)(tc % 10) + 0x30);
205            tc /= 10;
206            OUTBYTE3((unsigned char)(tc % 126) + 0x81);
207            tc /= 126;
208            OUTBYTE2((unsigned char)(tc % 10) + 0x30);
209            tc /= 10;
210            OUTBYTE1((unsigned char)(tc + 0x90));
211
212            NEXT(1, 4);
213            continue;
214        }
215
216        REQUIRE_OUTBUF(2);
217
218        GBK_ENCODE(c, code)
219        else if (TRYMAP_ENC(gb18030ext, code, c))
220            ;
221        else {
222            const struct _gb18030_to_unibmp_ranges *utrrange;
223
224            REQUIRE_OUTBUF(4);
225
226            for (utrrange = gb18030_to_unibmp_ranges;
227                 utrrange->first != 0;
228                 utrrange++)
229                if (utrrange->first <= c &&
230                    c <= utrrange->last) {
231                    Py_UCS4 tc;
232
233                    tc = c - utrrange->first +
234                         utrrange->base;
235
236                    OUTBYTE4((unsigned char)(tc % 10) + 0x30);
237                    tc /= 10;
238                    OUTBYTE3((unsigned char)(tc % 126) + 0x81);
239                    tc /= 126;
240                    OUTBYTE2((unsigned char)(tc % 10) + 0x30);
241                    tc /= 10;
242                    OUTBYTE1((unsigned char)tc + 0x81);
243
244                    NEXT(1, 4);
245                    break;
246                }
247
248            if (utrrange->first == 0)
249                return 1;
250            continue;
251        }
252
253        OUTBYTE1((code >> 8) | 0x80);
254        if (code & 0x8000)
255            OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */
256        else
257            OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
258
259        NEXT(1, 2);
260    }
261
262    return 0;
263}
264
265DECODER(gb18030)
266{
267    while (inleft > 0) {
268        unsigned char c = INBYTE1, c2;
269        Py_UCS4 decoded;
270
271        if (c < 0x80) {
272            OUTCHAR(c);
273            NEXT_IN(1);
274            continue;
275        }
276
277        REQUIRE_INBUF(2);
278
279        c2 = INBYTE2;
280        if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
281            const struct _gb18030_to_unibmp_ranges *utr;
282            unsigned char c3, c4;
283            Py_UCS4 lseq;
284
285            REQUIRE_INBUF(4);
286            c3 = INBYTE3;
287            c4 = INBYTE4;
288            if (c  < 0x81 || c  > 0xFE ||
289                c3 < 0x81 || c3 > 0xFE ||
290                c4 < 0x30 || c4 > 0x39)
291                return 1;
292            c -= 0x81;  c2 -= 0x30;
293            c3 -= 0x81; c4 -= 0x30;
294
295            if (c < 4) { /* U+0080 - U+FFFF */
296                lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
297                    (Py_UCS4)c3 * 10 + c4;
298                if (lseq < 39420) {
299                    for (utr = gb18030_to_unibmp_ranges;
300                         lseq >= (utr + 1)->base;
301                         utr++) ;
302                    OUTCHAR(utr->first - utr->base + lseq);
303                    NEXT_IN(4);
304                    continue;
305                }
306            }
307            else if (c >= 15) { /* U+10000 - U+10FFFF */
308                lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
309                    * 1260 + (Py_UCS4)c3 * 10 + c4;
310                if (lseq <= 0x10FFFF) {
311                    OUTCHAR(lseq);
312                    NEXT_IN(4);
313                    continue;
314                }
315            }
316            return 1;
317        }
318
319        GBK_DECODE(c, c2, writer)
320        else if (TRYMAP_DEC(gb18030ext, decoded, c, c2))
321            OUTCHAR(decoded);
322        else
323            return 1;
324
325        NEXT_IN(2);
326    }
327
328    return 0;
329}
330
331
332/*
333 * HZ codec
334 */
335
/* Start the HZ encoder with its state byte cleared (0).  Always returns 0. */
ENCODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
341
342ENCODER_RESET(hz)
343{
344    if (state->c[CN_STATE_OFFSET] != 0) {
345        WRITEBYTE2('~', '}');
346        state->c[CN_STATE_OFFSET] = 0;
347        NEXT_OUT(2);
348    }
349    return 0;
350}
351
352ENCODER(hz)
353{
354    while (*inpos < inlen) {
355        Py_UCS4 c = INCHAR1;
356        DBCHAR code;
357
358        if (c < 0x80) {
359            if (state->c[CN_STATE_OFFSET]) {
360                WRITEBYTE2('~', '}');
361                NEXT_OUT(2);
362                state->c[CN_STATE_OFFSET] = 0;
363            }
364            WRITEBYTE1((unsigned char)c);
365            NEXT(1, 1);
366            if (c == '~') {
367                WRITEBYTE1('~');
368                NEXT_OUT(1);
369            }
370            continue;
371        }
372
373        if (c > 0xFFFF)
374            return 1;
375
376        if (TRYMAP_ENC(gbcommon, code, c))
377            ;
378        else
379            return 1;
380
381        if (code & 0x8000) /* MSB set: GBK */
382            return 1;
383
384        if (state->c[CN_STATE_OFFSET] == 0) {
385            WRITEBYTE4('~', '{', code >> 8, code & 0xff);
386            NEXT(1, 4);
387            state->c[CN_STATE_OFFSET] = 1;
388        }
389        else {
390            WRITEBYTE2(code >> 8, code & 0xff);
391            NEXT(1, 2);
392        }
393    }
394
395    return 0;
396}
397
/* Start the HZ decoder in ASCII mode (state byte 0).  Always returns 0. */
DECODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
403
/* Put the HZ decoder back into ASCII mode (state byte 0).  Always returns 0. */
DECODER_RESET(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
409
410DECODER(hz)
411{
412    while (inleft > 0) {
413        unsigned char c = INBYTE1;
414        Py_UCS4 decoded;
415
416        if (c == '~') {
417            unsigned char c2 = INBYTE2;
418
419            REQUIRE_INBUF(2);
420            if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
421                OUTCHAR('~');
422            else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
423                state->c[CN_STATE_OFFSET] = 1; /* set GB */
424            else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
425                ; /* line-continuation */
426            else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
427                state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
428            else
429                return 1;
430            NEXT_IN(2);
431            continue;
432        }
433
434        if (c & 0x80)
435            return 1;
436
437        if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
438            OUTCHAR(c);
439            NEXT_IN(1);
440        }
441        else { /* GB mode */
442            REQUIRE_INBUF(2);
443            if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) {
444                OUTCHAR(decoded);
445                NEXT_IN(2);
446            }
447            else
448                return 1;
449        }
450    }
451
452    return 0;
453}
454
455
/*
 * Mapping tables referenced by the codecs in this file.  Within this
 * file, gb2312 and gbkext are only used for decoding (TRYMAP_DEC),
 * gbcommon only for encoding (TRYMAP_ENC), and gb18030ext for both.
 * NOTE(review): the list macros are not defined in this file, presumably
 * in cjkcodecs.h -- confirm their expansion there.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
462
/*
 * Codecs defined in this file.  gb2312, gbk and gb18030 have only an
 * encoder and a decoder and never touch `state`; hz additionally has
 * init/reset functions and keeps its mode in state->c[CN_STATE_OFFSET].
 * NOTE(review): the list macros are not defined in this file, presumably
 * in cjkcodecs.h -- confirm their expansion there.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
469
/*
 * NOTE(review): presumably expands to the module boilerplate for the "cn"
 * codec collection; the macro is not defined in this file -- confirm in
 * cjkcodecs.h.
 */
I_AM_A_MODULE_FOR(cn)
471