1/*
2 * cjkcodecs.h: common header for cjkcodecs
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7#ifndef _CJKCODECS_H_
8#define _CJKCODECS_H_
9
10#define PY_SSIZE_T_CLEAN
11#include "Python.h"
12#include "multibytecodec.h"
13
14
/* a unicode "undefined" code point: the value _TRYMAP_DEC treats as
 * "no mapping" when it reads a decode table entry */
#define UNIINV  0xFFFE

/* internal-use DBCS code points which aren't used by any charsets */
/* NOCHAR: the value _TRYMAP_ENC treats as "no mapping" in an encode table */
#define NOCHAR  0xFFFF
/* MULTIC: NOTE(review): not referenced in this header; presumably marks an
 * entry that needs a multi-character (pair) lookup -- confirm in the codecs */
#define MULTIC  0xFFFE
/* DBCINV: returned by find_pairencmap() when no pair entry matches */
#define DBCINV  0xFFFD

/* shorter macros to save source size of mapping tables */
#define U UNIINV
#define N NOCHAR
#define M MULTIC
#define D DBCINV
28
/*
 * One row of a decode table.  `map` holds the entries for the values
 * bottom..top inclusive: _TRYMAP_DEC reads map[val - bottom] and treats a
 * NULL `map`, an out-of-range val, or a UNIINV entry as "no mapping".
 * TRYMAP_DEC picks the row with its c1 argument and looks up c2 in it.
 */
struct dbcs_index {
    const ucs2_t *map;
    unsigned char bottom, top;
};
typedef struct dbcs_index decode_map;
34
/*
 * Same layout as struct dbcs_index, but the entries are Py_UCS4 instead of
 * ucs2_t.  Not referenced elsewhere in this header.
 * NOTE(review): presumably for charsets that decode to code points which do
 * not fit in ucs2_t -- confirm against the codecs that use it.
 */
struct widedbcs_index {
    const Py_UCS4 *map;
    unsigned char bottom, top;
};
typedef struct widedbcs_index widedecode_map;
40
/*
 * One row of an encode table.  TRYMAP_ENC picks the row with (uni) >> 8 and
 * looks up (uni) & 0xff in it: _TRYMAP_ENC reads map[val - bottom] for
 * val in bottom..top and treats a NULL `map`, an out-of-range val, or a
 * NOCHAR entry as "no mapping".
 */
struct unim_index {
    const DBCHAR *map;
    unsigned char bottom, top;
};
typedef struct unim_index encode_map;
46
/*
 * Same layout as struct unim_index, but `map` points at raw bytes rather
 * than DBCHAR entries.  Not referenced elsewhere in this header.
 * NOTE(review): how the bytes are grouped per entry is defined by the codec
 * that uses this type -- confirm there.
 */
struct unim_index_bytebased {
    const unsigned char *map;
    unsigned char bottom, top;
};
51
/*
 * A named pair of encode/decode tables.  register_maps() publishes each
 * entry of mapping_list as a capsule attribute called "__map_<charset>",
 * and importmap() reads encmap/decmap back out of such a capsule.
 * Either table pointer may be NULL (see MAPPING_ENCONLY / MAPPING_DECONLY).
 * An entry whose charset is the empty string terminates the list.
 */
struct dbcs_map {
    const char *charset;
    const struct unim_index *encmap;
    const struct dbcs_index *decmap;
};
57
/*
 * Entry of a table searched by find_pairencmap().  `uniseq` is the key that
 * find_pairencmap() builds as (body << 16 | modifier); `code` is the value
 * it returns on an exact key match.
 */
struct pair_encodemap {
    Py_UCS4 uniseq;
    DBCHAR code;
};
62
/*
 * Tentative definitions so the functions in this header can refer to the
 * lists before they are given values.  END_CODECS_LIST and
 * END_MAPPINGS_LIST define the same names again with initializers that
 * point at _codec_list / _mapping_list.
 */
static const MultibyteCodec *codec_list;
static const struct dbcs_map *mapping_list;
65
/*
 * Function-header macros.  Each expands to the signature only (no body, no
 * semicolon) of a static function named <encoding>_<role>.  The parameter
 * names are significant: the helper macros in this header (NEXT_IN,
 * REQUIRE_OUTBUF, INCHAR1, OUTCHAR, ...) refer to inbuf, inleft, inpos,
 * outbuf, outleft, kind, data and writer by name.
 */

/* <encoding>_codec_init(config) -> int */
#define CODEC_INIT(encoding)                                            \
    static int encoding##_codec_init(const void *config)

/* <encoding>_encode_init(state, config) -> int */
#define ENCODER_INIT(encoding)                                          \
    static int encoding##_encode_init(                                  \
        MultibyteCodec_State *state, const void *config)
/* <encoding>_encode(...) -> Py_ssize_t; reads characters through
 * kind/data/*inpos and writes bytes through *outbuf/outleft */
#define ENCODER(encoding)                                               \
    static Py_ssize_t encoding##_encode(                                \
        MultibyteCodec_State *state, const void *config,                \
        int kind, const void *data,                                     \
        Py_ssize_t *inpos, Py_ssize_t inlen,                            \
        unsigned char **outbuf, Py_ssize_t outleft, int flags)
/* <encoding>_encode_reset(state, config, outbuf, outleft) -> Py_ssize_t */
#define ENCODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_encode_reset(                          \
        MultibyteCodec_State *state, const void *config,                \
        unsigned char **outbuf, Py_ssize_t outleft)

/* <encoding>_decode_init(state, config) -> int */
#define DECODER_INIT(encoding)                                          \
    static int encoding##_decode_init(                                  \
        MultibyteCodec_State *state, const void *config)
/* <encoding>_decode(...) -> Py_ssize_t; reads bytes through
 * *inbuf/inleft and appends characters to writer */
#define DECODER(encoding)                                               \
    static Py_ssize_t encoding##_decode(                                \
        MultibyteCodec_State *state, const void *config,                \
        const unsigned char **inbuf, Py_ssize_t inleft,                 \
        _PyUnicodeWriter *writer)
/* <encoding>_decode_reset(state, config) -> Py_ssize_t */
#define DECODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_decode_reset(                          \
        MultibyteCodec_State *state, const void *config)
94
/*
 * Cursor-advance helpers.  They operate on the variables named in the
 * ENCODER/DECODER signatures and evaluate their argument more than once
 * where it appears twice in the expansion.
 */

/* Consume i input bytes: advance *inbuf and shrink inleft. */
#define NEXT_IN(i)                              \
    do {                                        \
        (*inbuf) += (i);                        \
        (inleft) -= (i);                        \
    } while (0)
/* Consume i input characters: advance *inpos. */
#define NEXT_INCHAR(i)                          \
    do {                                        \
        (*inpos) += (i);                        \
    } while (0)
/* Commit o output bytes: advance *outbuf and shrink outleft. */
#define NEXT_OUT(o)                             \
    do {                                        \
        (*outbuf) += (o);                       \
        (outleft) -= (o);                       \
    } while (0)
/* Consume i input characters and commit o output bytes. */
#define NEXT(i, o)                              \
    do {                                        \
        NEXT_INCHAR(i);                         \
        NEXT_OUT(o);                            \
    } while (0)
114
/* Return MBERR_TOOFEW from the enclosing function unless at least n input
 * bytes remain (inleft >= n). */
#define REQUIRE_INBUF(n)                        \
    do {                                        \
        if (inleft < (n))                       \
            return MBERR_TOOFEW;                \
    } while (0)

/* Return MBERR_TOOSMALL from the enclosing function unless at least n
 * bytes of output space remain (outleft >= n). */
#define REQUIRE_OUTBUF(n)                       \
    do {                                        \
        if (outleft < (n))                      \
            return MBERR_TOOSMALL;              \
    } while (0)
126
/* The 1st..4th byte at the current input position (*inbuf).  These do no
 * bounds checking themselves; pair them with REQUIRE_INBUF. */
#define INBYTE1 ((*inbuf)[0])
#define INBYTE2 ((*inbuf)[1])
#define INBYTE3 ((*inbuf)[2])
#define INBYTE4 ((*inbuf)[3])

/* The character at the current input position (*inpos) and the one after
 * it, read from data with PyUnicode_READ.  No bounds checking is done. */
#define INCHAR1 (PyUnicode_READ(kind, data, *inpos))
#define INCHAR2 (PyUnicode_READ(kind, data, *inpos + 1))
134
/* Append the code point c to writer with _PyUnicodeWriter_WriteChar();
 * return MBERR_EXCEPTION from the enclosing function if that fails. */
#define OUTCHAR(c)                                                         \
    do {                                                                   \
        if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0)                   \
            return MBERR_EXCEPTION;                                         \
    } while (0)
140
/*
 * Append the two code points c1 and c2 to writer.
 *
 * Both arguments are captured once in locals, so expressions with side
 * effects are evaluated exactly once.  The writer is prepared for two more
 * characters whose maximum is the larger of the two; on failure the
 * enclosing function returns MBERR_EXCEPTION.
 */
#define OUTCHAR2(c1, c2)                                                   \
    do {                                                                   \
        Py_UCS4 _c1 = (c1);                                                \
        Py_UCS4 _c2 = (c2);                                                \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0)     \
            return MBERR_EXCEPTION;                                        \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);     \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
        writer->pos += 2;                                                  \
    } while (0)
151
/* Store byte c at offset i from the current output position (*outbuf).
 * The assert checks that c fits in an unsigned char.  This neither checks
 * outleft nor advances *outbuf. */
#define OUTBYTEI(c, i)                     \
    do {                                   \
        assert((unsigned char)(c) == (c)); \
        ((*outbuf)[i]) = (c);              \
    } while (0)

/* Store c as the 1st..4th byte at the current output position. */
#define OUTBYTE1(c) OUTBYTEI(c, 0)
#define OUTBYTE2(c) OUTBYTEI(c, 1)
#define OUTBYTE3(c) OUTBYTEI(c, 2)
#define OUTBYTE4(c) OUTBYTEI(c, 3)
162
/*
 * WRITEBYTEn: check that n bytes of output space remain (returning
 * MBERR_TOOSMALL from the enclosing function otherwise), then store the n
 * bytes at the current output position.  The output cursor is not
 * advanced here; NEXT_OUT / NEXT do that.
 */
#define WRITEBYTE1(c1)              \
    do {                            \
        REQUIRE_OUTBUF(1);          \
        OUTBYTE1(c1);               \
    } while (0)
#define WRITEBYTE2(c1, c2)          \
    do {                            \
        REQUIRE_OUTBUF(2);          \
        OUTBYTE1(c1);               \
        OUTBYTE2(c2);               \
    } while (0)
#define WRITEBYTE3(c1, c2, c3)      \
    do {                            \
        REQUIRE_OUTBUF(3);          \
        OUTBYTE1(c1);               \
        OUTBYTE2(c2);               \
        OUTBYTE3(c3);               \
    } while (0)
#define WRITEBYTE4(c1, c2, c3, c4)  \
    do {                            \
        REQUIRE_OUTBUF(4);          \
        OUTBYTE1(c1);               \
        OUTBYTE2(c2);               \
        OUTBYTE3(c3);               \
        OUTBYTE4(c4);               \
    } while (0)
189
/*
 * Table lookup expressions.  Each evaluates to non-zero when the row has a
 * map, val lies in bottom..top, and the entry is not the "no mapping"
 * sentinel.  The entry is assigned to `assi` before it is compared with
 * the sentinel, so `assi` is overwritten even when the sentinel is found;
 * it is left untouched when the row is NULL or val is out of range.
 * `m` and `val` are evaluated several times.
 */

/* Look up val in encode row m; sentinel is NOCHAR. */
#define _TRYMAP_ENC(m, assi, val)                               \
    ((m)->map != NULL && (val) >= (m)->bottom &&                \
        (val)<= (m)->top && ((assi) = (m)->map[(val) -          \
        (m)->bottom]) != NOCHAR)
/* Encode code point uni with <charset>_encmap: the row is selected by
 * uni >> 8 and the entry by uni & 0xff. */
#define TRYMAP_ENC(charset, assi, uni)                     \
    _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)

/* Look up val in decode row m; sentinel is UNIINV. */
#define _TRYMAP_DEC(m, assi, val)                             \
    ((m)->map != NULL &&                                        \
     (val) >= (m)->bottom &&                                    \
     (val)<= (m)->top &&                                        \
     ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)
/* Decode the pair (c1, c2) with <charset>_decmap: the row is selected by
 * c1 and the entry by c2. */
#define TRYMAP_DEC(charset, assi, c1, c2)                     \
    _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
204
/*
 * Builders for the static table of mappings exported by a module:
 *
 *     BEGIN_MAPPINGS_LIST
 *       MAPPING_ENCDEC(name) / MAPPING_ENCONLY(name) / MAPPING_DECONLY(name)
 *     END_MAPPINGS_LIST
 *
 * Each entry stringizes `name` as the charset and refers to name##_encmap
 * and/or name##_decmap.  END_MAPPINGS_LIST appends the terminating entry
 * with an empty charset (the stop condition in register_maps()) and points
 * mapping_list at the array.
 */
#define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = {
#define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL},
#define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap},
#define MAPPING_ENCDEC(enc) {#enc, (void*)enc##_encmap, (void*)enc##_decmap},
#define END_MAPPINGS_LIST                               \
    {"", NULL, NULL} };                                 \
    static const struct dbcs_map *mapping_list =        \
        (const struct dbcs_map *)_mapping_list;
213
/*
 * Builders for the static table of codecs provided by a module:
 *
 *     BEGIN_CODECS_LIST
 *       CODEC_STATELESS(name) / CODEC_STATEFUL(name) /
 *       CODEC_STATELESS_WINIT(name)
 *     END_CODECS_LIST
 *
 * Every entry starts with the stringized encoding name followed by two
 * further fields and then six method slots in the order
 * encode, encode_init, encode_reset, decode, decode_init, decode_reset.
 * NOTE(review): the meaning of the 2nd and 3rd fields comes from the
 * MultibyteCodec declaration in multibytecodec.h; the 3rd receives
 * name##_codec_init in the _WINIT variant -- confirm the field names there.
 */
#define BEGIN_CODECS_LIST static const MultibyteCodec _codec_list[] = {
/* All six methods supplied by the codec. */
#define _STATEFUL_METHODS(enc)          \
    enc##_encode,                       \
    enc##_encode_init,                  \
    enc##_encode_reset,                 \
    enc##_decode,                       \
    enc##_decode_init,                  \
    enc##_decode_reset,
/* Only encode/decode supplied; the init and reset slots are NULL. */
#define _STATELESS_METHODS(enc)         \
    enc##_encode, NULL, NULL,           \
    enc##_decode, NULL, NULL,
#define CODEC_STATEFUL(enc) {           \
    #enc, NULL, NULL,                   \
    _STATEFUL_METHODS(enc)              \
},
#define CODEC_STATELESS(enc) {          \
    #enc, NULL, NULL,                   \
    _STATELESS_METHODS(enc)             \
},
/* Stateless codec that also registers enc##_codec_init (see CODEC_INIT). */
#define CODEC_STATELESS_WINIT(enc) {    \
    #enc, NULL,                         \
    enc##_codec_init,                   \
    _STATELESS_METHODS(enc)             \
},
/* Append the terminating entry with an empty encoding name (the stop
 * condition in getcodec()) and point codec_list at the array. */
#define END_CODECS_LIST                                 \
    {"", NULL,} };                                      \
    static const MultibyteCodec *codec_list =           \
        (const MultibyteCodec *)_codec_list;
242
243
244
245static PyObject *
246getmultibytecodec(void)
247{
248    PyObject *mod = PyImport_ImportModuleNoBlock("_multibytecodec");
249    if (mod == NULL) {
250        return NULL;
251    }
252
253    PyObject *cofunc = PyObject_GetAttrString(mod, "__create_codec");
254    Py_DECREF(mod);
255    return cofunc;
256}
257
258static PyObject *
259getcodec(PyObject *self, PyObject *encoding)
260{
261    PyObject *codecobj, *r, *cofunc;
262    const MultibyteCodec *codec;
263    const char *enc;
264
265    if (!PyUnicode_Check(encoding)) {
266        PyErr_SetString(PyExc_TypeError,
267                        "encoding name must be a string.");
268        return NULL;
269    }
270    enc = PyUnicode_AsUTF8(encoding);
271    if (enc == NULL)
272        return NULL;
273
274    cofunc = getmultibytecodec();
275    if (cofunc == NULL)
276        return NULL;
277
278    for (codec = codec_list; codec->encoding[0]; codec++)
279        if (strcmp(codec->encoding, enc) == 0)
280            break;
281
282    if (codec->encoding[0] == '\0') {
283        PyErr_SetString(PyExc_LookupError,
284                        "no such codec is supported.");
285        return NULL;
286    }
287
288    codecobj = PyCapsule_New((void *)codec, PyMultibyteCodec_CAPSULE_NAME, NULL);
289    if (codecobj == NULL)
290        return NULL;
291
292    r = PyObject_CallOneArg(cofunc, codecobj);
293    Py_DECREF(codecobj);
294    Py_DECREF(cofunc);
295
296    return r;
297}
298
299
300static int
301register_maps(PyObject *module)
302{
303    const struct dbcs_map *h;
304
305    for (h = mapping_list; h->charset[0] != '\0'; h++) {
306        char mhname[256] = "__map_";
307        strcpy(mhname + sizeof("__map_") - 1, h->charset);
308
309        PyObject *capsule = PyCapsule_New((void *)h,
310                                          PyMultibyteCodec_CAPSULE_NAME, NULL);
311        if (capsule == NULL) {
312            return -1;
313        }
314        if (PyModule_AddObject(module, mhname, capsule) < 0) {
315            Py_DECREF(capsule);
316            return -1;
317        }
318    }
319    return 0;
320}
321
322#ifdef USING_BINARY_PAIR_SEARCH
323static DBCHAR
324find_pairencmap(ucs2_t body, ucs2_t modifier,
325                const struct pair_encodemap *haystack, int haystacksize)
326{
327    int pos, min, max;
328    Py_UCS4 value = body << 16 | modifier;
329
330    min = 0;
331    max = haystacksize;
332
333    for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) {
334        if (value < haystack[pos].uniseq) {
335            if (max != pos) {
336                max = pos;
337                continue;
338            }
339        }
340        else if (value > haystack[pos].uniseq) {
341            if (min != pos) {
342                min = pos;
343                continue;
344            }
345        }
346        break;
347    }
348
349    if (value == haystack[pos].uniseq) {
350        return haystack[pos].code;
351    }
352    return DBCINV;
353}
354#endif
355
356#ifdef USING_IMPORTED_MAPS
/* Convenience wrapper around importmap(): fetch the attribute
 * "__map_<charset>" from module "_codecs_<locale>" and store its encode /
 * decode table pointers through encmap / decmap (either may be NULL).
 * Evaluates to importmap()'s result: 0 on success, -1 on failure. */
#define IMPORT_MAP(locale, charset, encmap, decmap) \
    importmap("_codecs_" #locale, "__map_" #charset, \
              (const void**)encmap, (const void**)decmap)
360
361static int
362importmap(const char *modname, const char *symbol,
363          const void **encmap, const void **decmap)
364{
365    PyObject *o, *mod;
366
367    mod = PyImport_ImportModule(modname);
368    if (mod == NULL)
369        return -1;
370
371    o = PyObject_GetAttrString(mod, symbol);
372    if (o == NULL)
373        goto errorexit;
374    else if (!PyCapsule_IsValid(o, PyMultibyteCodec_CAPSULE_NAME)) {
375        PyErr_SetString(PyExc_ValueError,
376                        "map data must be a Capsule.");
377        goto errorexit;
378    }
379    else {
380        struct dbcs_map *map;
381        map = PyCapsule_GetPointer(o, PyMultibyteCodec_CAPSULE_NAME);
382        if (encmap != NULL)
383            *encmap = map->encmap;
384        if (decmap != NULL)
385            *decmap = map->decmap;
386        Py_DECREF(o);
387    }
388
389    Py_DECREF(mod);
390    return 0;
391
392errorexit:
393    Py_DECREF(mod);
394    return -1;
395}
396#endif
397
398static int
399_cjk_exec(PyObject *module)
400{
401    return register_maps(module);
402}
403
404
405static struct PyMethodDef _cjk_methods[] = {
406    {"getcodec", (PyCFunction)getcodec, METH_O, ""},
407    {NULL, NULL},
408};
409
410static PyModuleDef_Slot _cjk_slots[] = {
411    {Py_mod_exec, _cjk_exec},
412    {0, NULL}
413};
414
/*
 * Emit the module definition and init function for the extension module
 * "_codecs_<loc>".  The definition uses _cjk_methods and _cjk_slots with
 * m_size 0, and PyInit__codecs_<loc>() returns the result of
 * PyModuleDef_Init() on that definition.
 */
#define I_AM_A_MODULE_FOR(loc)                                          \
    static struct PyModuleDef _cjk_module = {                           \
        PyModuleDef_HEAD_INIT,                                          \
        .m_name = "_codecs_"#loc,                                       \
        .m_size = 0,                                                    \
        .m_methods = _cjk_methods,                                      \
        .m_slots = _cjk_slots,                                          \
    };                                                                  \
                                                                        \
    PyMODINIT_FUNC                                                      \
    PyInit__codecs_##loc(void)                                          \
    {                                                                   \
        return PyModuleDef_Init(&_cjk_module);                          \
    }
429
430#endif
431