1/* 2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings 3 * 4 * Written by Hye-Shik Chang <perky@FreeBSD.org> 5 */ 6 7#include "cjkcodecs.h" 8#include "mappings_cn.h" 9 10/** 11 * hz is predefined as 100 on AIX. So we undefine it to avoid 12 * conflict against hz codec's. 13 */ 14#ifdef _AIX 15#undef hz 16#endif 17 18/* GBK and GB2312 map differently in few code points that are listed below: 19 * 20 * gb2312 gbk 21 * A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT 22 * A1AA U+2015 HORIZONTAL BAR U+2014 EM DASH 23 * A844 undefined U+2015 HORIZONTAL BAR 24 */ 25 26#define GBK_DECODE(dc1, dc2, writer) \ 27 if ((dc1) == 0xa1 && (dc2) == 0xaa) { \ 28 OUTCHAR(0x2014); \ 29 } \ 30 else if ((dc1) == 0xa8 && (dc2) == 0x44) { \ 31 OUTCHAR(0x2015); \ 32 } \ 33 else if ((dc1) == 0xa1 && (dc2) == 0xa4) { \ 34 OUTCHAR(0x00b7); \ 35 } \ 36 else if (TRYMAP_DEC(gb2312, decoded, dc1 ^ 0x80, dc2 ^ 0x80)) { \ 37 OUTCHAR(decoded); \ 38 } \ 39 else if (TRYMAP_DEC(gbkext, decoded, dc1, dc2)) { \ 40 OUTCHAR(decoded); \ 41 } 42 43#define GBK_ENCODE(code, assi) \ 44 if ((code) == 0x2014) { \ 45 (assi) = 0xa1aa; \ 46 } else if ((code) == 0x2015) { \ 47 (assi) = 0xa844; \ 48 } else if ((code) == 0x00b7) { \ 49 (assi) = 0xa1a4; \ 50 } else if ((code) != 0x30fb && TRYMAP_ENC(gbcommon, assi, code)) { \ 51 ; \ 52 } 53 54/* 55 * codecs in this file use the first byte of MultibyteCodec_State.c[8] 56 * to store a 0 or 1 state value 57 */ 58#define CN_STATE_OFFSET 0 59 60/* 61 * GB2312 codec 62 */ 63 64ENCODER(gb2312) 65{ 66 while (*inpos < inlen) { 67 Py_UCS4 c = INCHAR1; 68 DBCHAR code; 69 70 if (c < 0x80) { 71 WRITEBYTE1((unsigned char)c); 72 NEXT(1, 1); 73 continue; 74 } 75 76 if (c > 0xFFFF) 77 return 1; 78 79 REQUIRE_OUTBUF(2); 80 if (TRYMAP_ENC(gbcommon, code, c)) 81 ; 82 else 83 return 1; 84 85 if (code & 0x8000) /* MSB set: GBK */ 86 return 1; 87 88 OUTBYTE1((code >> 8) | 0x80); 89 OUTBYTE2((code & 0xFF) | 0x80); 90 NEXT(1, 2); 91 } 92 93 return 0; 94} 95 96DECODER(gb2312) 97{ 98 while (inleft > 0) { 99 unsigned char c = **inbuf; 100 Py_UCS4 decoded; 101 102 if (c < 0x80) { 103 OUTCHAR(c); 104 NEXT_IN(1); 105 continue; 106 } 107 108 REQUIRE_INBUF(2); 109 if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) { 110 OUTCHAR(decoded); 111 NEXT_IN(2); 112 } 113 else 114 return 1; 115 } 116 117 return 0; 118} 119 120 121/* 122 * GBK codec 123 */ 124 125ENCODER(gbk) 126{ 127 while (*inpos < inlen) { 128 Py_UCS4 c = INCHAR1; 129 DBCHAR code; 130 131 if (c < 0x80) { 132 WRITEBYTE1((unsigned char)c); 133 NEXT(1, 1); 134 continue; 135 } 136 137 if (c > 0xFFFF) 138 return 1; 139 140 REQUIRE_OUTBUF(2); 141 142 GBK_ENCODE(c, code) 143 else 144 return 1; 145 146 OUTBYTE1((code >> 8) | 0x80); 147 if (code & 0x8000) 148 OUTBYTE2((code & 0xFF)); /* MSB set: GBK */ 149 else 150 OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */ 151 NEXT(1, 2); 152 } 153 154 return 0; 155} 156 157DECODER(gbk) 158{ 159 while (inleft > 0) { 160 unsigned char c = INBYTE1; 161 Py_UCS4 decoded; 162 163 if (c < 0x80) { 164 OUTCHAR(c); 165 NEXT_IN(1); 166 continue; 167 } 168 169 REQUIRE_INBUF(2); 170 171 GBK_DECODE(c, INBYTE2, writer) 172 else 173 return 1; 174 175 NEXT_IN(2); 176 } 177 178 return 0; 179} 180 181 182/* 183 * GB18030 codec 184 */ 185 186ENCODER(gb18030) 187{ 188 while (*inpos < inlen) { 189 Py_UCS4 c = INCHAR1; 190 DBCHAR code; 191 192 if (c < 0x80) { 193 WRITEBYTE1(c); 194 NEXT(1, 1); 195 continue; 196 } 197 198 if (c >= 0x10000) { 199 Py_UCS4 tc = c - 0x10000; 200 assert (c <= 0x10FFFF); 201 202 REQUIRE_OUTBUF(4); 203 204 OUTBYTE4((unsigned char)(tc % 10) + 0x30); 205 tc /= 10; 206 OUTBYTE3((unsigned char)(tc % 126) + 0x81); 207 tc /= 126; 208 OUTBYTE2((unsigned char)(tc % 10) + 0x30); 209 tc /= 10; 210 OUTBYTE1((unsigned char)(tc + 0x90)); 211 212 NEXT(1, 4); 213 continue; 214 } 215 216 REQUIRE_OUTBUF(2); 217 218 GBK_ENCODE(c, code) 219 else if (TRYMAP_ENC(gb18030ext, code, c)) 220 ; 221 else { 222 const struct _gb18030_to_unibmp_ranges *utrrange; 223 224 REQUIRE_OUTBUF(4); 225 226 for (utrrange = gb18030_to_unibmp_ranges; 227 utrrange->first != 0; 228 utrrange++) 229 if (utrrange->first <= c && 230 c <= utrrange->last) { 231 Py_UCS4 tc; 232 233 tc = c - utrrange->first + 234 utrrange->base; 235 236 OUTBYTE4((unsigned char)(tc % 10) + 0x30); 237 tc /= 10; 238 OUTBYTE3((unsigned char)(tc % 126) + 0x81); 239 tc /= 126; 240 OUTBYTE2((unsigned char)(tc % 10) + 0x30); 241 tc /= 10; 242 OUTBYTE1((unsigned char)tc + 0x81); 243 244 NEXT(1, 4); 245 break; 246 } 247 248 if (utrrange->first == 0) 249 return 1; 250 continue; 251 } 252 253 OUTBYTE1((code >> 8) | 0x80); 254 if (code & 0x8000) 255 OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */ 256 else 257 OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */ 258 259 NEXT(1, 2); 260 } 261 262 return 0; 263} 264 265DECODER(gb18030) 266{ 267 while (inleft > 0) { 268 unsigned char c = INBYTE1, c2; 269 Py_UCS4 decoded; 270 271 if (c < 0x80) { 272 OUTCHAR(c); 273 NEXT_IN(1); 274 continue; 275 } 276 277 REQUIRE_INBUF(2); 278 279 c2 = INBYTE2; 280 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */ 281 const struct _gb18030_to_unibmp_ranges *utr; 282 unsigned char c3, c4; 283 Py_UCS4 lseq; 284 285 REQUIRE_INBUF(4); 286 c3 = INBYTE3; 287 c4 = INBYTE4; 288 if (c < 0x81 || c > 0xFE || 289 c3 < 0x81 || c3 > 0xFE || 290 c4 < 0x30 || c4 > 0x39) 291 return 1; 292 c -= 0x81; c2 -= 0x30; 293 c3 -= 0x81; c4 -= 0x30; 294 295 if (c < 4) { /* U+0080 - U+FFFF */ 296 lseq = ((Py_UCS4)c * 10 + c2) * 1260 + 297 (Py_UCS4)c3 * 10 + c4; 298 if (lseq < 39420) { 299 for (utr = gb18030_to_unibmp_ranges; 300 lseq >= (utr + 1)->base; 301 utr++) ; 302 OUTCHAR(utr->first - utr->base + lseq); 303 NEXT_IN(4); 304 continue; 305 } 306 } 307 else if (c >= 15) { /* U+10000 - U+10FFFF */ 308 lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2) 309 * 1260 + (Py_UCS4)c3 * 10 + c4; 310 if (lseq <= 0x10FFFF) { 311 OUTCHAR(lseq); 312 NEXT_IN(4); 313 continue; 314 } 315 } 316 return 1; 317 } 318 319 GBK_DECODE(c, c2, writer) 320 else if (TRYMAP_DEC(gb18030ext, decoded, c, c2)) 321 OUTCHAR(decoded); 322 else 323 return 1; 324 325 NEXT_IN(2); 326 } 327 328 return 0; 329} 330 331 332/* 333 * HZ codec 334 */ 335 336ENCODER_INIT(hz) 337{ 338 state->c[CN_STATE_OFFSET] = 0; 339 return 0; 340} 341 342ENCODER_RESET(hz) 343{ 344 if (state->c[CN_STATE_OFFSET] != 0) { 345 WRITEBYTE2('~', '}'); 346 state->c[CN_STATE_OFFSET] = 0; 347 NEXT_OUT(2); 348 } 349 return 0; 350} 351 352ENCODER(hz) 353{ 354 while (*inpos < inlen) { 355 Py_UCS4 c = INCHAR1; 356 DBCHAR code; 357 358 if (c < 0x80) { 359 if (state->c[CN_STATE_OFFSET]) { 360 WRITEBYTE2('~', '}'); 361 NEXT_OUT(2); 362 state->c[CN_STATE_OFFSET] = 0; 363 } 364 WRITEBYTE1((unsigned char)c); 365 NEXT(1, 1); 366 if (c == '~') { 367 WRITEBYTE1('~'); 368 NEXT_OUT(1); 369 } 370 continue; 371 } 372 373 if (c > 0xFFFF) 374 return 1; 375 376 if (TRYMAP_ENC(gbcommon, code, c)) 377 ; 378 else 379 return 1; 380 381 if (code & 0x8000) /* MSB set: GBK */ 382 return 1; 383 384 if (state->c[CN_STATE_OFFSET] == 0) { 385 WRITEBYTE4('~', '{', code >> 8, code & 0xff); 386 NEXT(1, 4); 387 state->c[CN_STATE_OFFSET] = 1; 388 } 389 else { 390 WRITEBYTE2(code >> 8, code & 0xff); 391 NEXT(1, 2); 392 } 393 } 394 395 return 0; 396} 397 398DECODER_INIT(hz) 399{ 400 state->c[CN_STATE_OFFSET] = 0; 401 return 0; 402} 403 404DECODER_RESET(hz) 405{ 406 state->c[CN_STATE_OFFSET] = 0; 407 return 0; 408} 409 410DECODER(hz) 411{ 412 while (inleft > 0) { 413 unsigned char c = INBYTE1; 414 Py_UCS4 decoded; 415 416 if (c == '~') { 417 unsigned char c2 = INBYTE2; 418 419 REQUIRE_INBUF(2); 420 if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0) 421 OUTCHAR('~'); 422 else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0) 423 state->c[CN_STATE_OFFSET] = 1; /* set GB */ 424 else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0) 425 ; /* line-continuation */ 426 else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1) 427 state->c[CN_STATE_OFFSET] = 0; /* set ASCII */ 428 else 429 return 1; 430 NEXT_IN(2); 431 continue; 432 } 433 434 if (c & 0x80) 435 return 1; 436 437 if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */ 438 OUTCHAR(c); 439 NEXT_IN(1); 440 } 441 else { /* GB mode */ 442 REQUIRE_INBUF(2); 443 if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) { 444 OUTCHAR(decoded); 445 NEXT_IN(2); 446 } 447 else 448 return 1; 449 } 450 } 451 452 return 0; 453} 454 455 456BEGIN_MAPPINGS_LIST 457 MAPPING_DECONLY(gb2312) 458 MAPPING_DECONLY(gbkext) 459 MAPPING_ENCONLY(gbcommon) 460 MAPPING_ENCDEC(gb18030ext) 461END_MAPPINGS_LIST 462 463BEGIN_CODECS_LIST 464 CODEC_STATELESS(gb2312) 465 CODEC_STATELESS(gbk) 466 CODEC_STATELESS(gb18030) 467 CODEC_STATEFUL(hz) 468END_CODECS_LIST 469 470I_AM_A_MODULE_FOR(cn) 471