1"use strict"; 2 3// Description of supported double byte encodings and aliases. 4// Tables are not require()-d until they are needed to speed up library load. 5// require()-s are direct to support Browserify. 6 7module.exports = { 8 9 // == Japanese/ShiftJIS ==================================================== 10 // All japanese encodings are based on JIS X set of standards: 11 // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF. 12 // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. 13 // Has several variations in 1978, 1983, 1990 and 1997. 14 // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead. 15 // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233. 16 // 2 planes, first is superset of 0208, second - revised 0212. 17 // Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx) 18 19 // Byte encodings are: 20 // * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte 21 // encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC. 22 // Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI. 23 // * EUC-JP: Up to 3 bytes per character. Used mostly on *nixes. 24 // 0x00-0x7F - lower part of 0201 25 // 0x8E, 0xA1-0xDF - upper part of 0201 26 // (0xA1-0xFE)x2 - 0208 plane (94x94). 27 // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94). 28 // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon. 29 // Used as-is in ISO2022 family. 30 // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, 31 // 0201-1976 Roman, 0208-1978, 0208-1983. 32 // * ISO2022-JP-1: Adds esc seq for 0212-1990. 33 // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7. 34 // * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2. 35 // * ISO2022-JP-2004: Adds 0213-2004 Plane 1. 36 // 37 // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes. 38 // 39 // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html 40 41 'shiftjis': { 42 type: '_dbcs', 43 table: function() { return require('./tables/shiftjis.json') }, 44 encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, 45 encodeSkipVals: [{from: 0xED40, to: 0xF940}], 46 }, 47 'csshiftjis': 'shiftjis', 48 'mskanji': 'shiftjis', 49 'sjis': 'shiftjis', 50 'windows31j': 'shiftjis', 51 'ms31j': 'shiftjis', 52 'xsjis': 'shiftjis', 53 'windows932': 'shiftjis', 54 'ms932': 'shiftjis', 55 '932': 'shiftjis', 56 'cp932': 'shiftjis', 57 58 'eucjp': { 59 type: '_dbcs', 60 table: function() { return require('./tables/eucjp.json') }, 61 encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, 62 }, 63 64 // TODO: KDDI extension to Shift_JIS 65 // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes. 66 // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars. 67 68 69 // == Chinese/GBK ========================================================== 70 // http://en.wikipedia.org/wiki/GBK 71 // We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder 72 73 // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936 74 'gb2312': 'cp936', 75 'gb231280': 'cp936', 76 'gb23121980': 'cp936', 77 'csgb2312': 'cp936', 78 'csiso58gb231280': 'cp936', 79 'euccn': 'cp936', 80 81 // Microsoft's CP936 is a subset and approximation of GBK. 82 'windows936': 'cp936', 83 'ms936': 'cp936', 84 '936': 'cp936', 85 'cp936': { 86 type: '_dbcs', 87 table: function() { return require('./tables/cp936.json') }, 88 }, 89 90 // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other. 91 'gbk': { 92 type: '_dbcs', 93 table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, 94 }, 95 'xgbk': 'gbk', 96 'isoir58': 'gbk', 97 98 // GB18030 is an algorithmic extension of GBK. 99 // Main source: https://www.w3.org/TR/encoding/#gbk-encoder 100 // http://icu-project.org/docs/papers/gb18030.html 101 // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml 102 // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0 103 'gb18030': { 104 type: '_dbcs', 105 table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, 106 gb18030: function() { return require('./tables/gb18030-ranges.json') }, 107 encodeSkipVals: [0x80], 108 encodeAdd: {'€': 0xA2E3}, 109 }, 110 111 'chinese': 'gb18030', 112 113 114 // == Korean =============================================================== 115 // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same. 116 'windows949': 'cp949', 117 'ms949': 'cp949', 118 '949': 'cp949', 119 'cp949': { 120 type: '_dbcs', 121 table: function() { return require('./tables/cp949.json') }, 122 }, 123 124 'cseuckr': 'cp949', 125 'csksc56011987': 'cp949', 126 'euckr': 'cp949', 127 'isoir149': 'cp949', 128 'korean': 'cp949', 129 'ksc56011987': 'cp949', 130 'ksc56011989': 'cp949', 131 'ksc5601': 'cp949', 132 133 134 // == Big5/Taiwan/Hong Kong ================================================ 135 // There are lots of tables for Big5 and cp950. Please see the following links for history: 136 // http://moztw.org/docs/big5/ http://www.haible.de/bruno/charsets/conversion-tables/Big5.html 137 // Variations, in roughly number of defined chars: 138 // * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT 139 // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/ 140 // * Big5-2003 (Taiwan standard) almost superset of cp950. 141 // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers. 142 // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. 143 // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years. 144 // Plus, it has 4 combining sequences. 145 // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299 146 // because big5-hkscs is the only encoding to include astral characters in non-algorithmic way. 147 // Implementations are not consistent within browsers; sometimes labeled as just big5. 148 // MS Internet Explorer switches from big5 to big5-hkscs when a patch applied. 149 // Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31 150 // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s. 151 // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt 152 // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt 153 // 154 // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder 155 // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong. 156 157 'windows950': 'cp950', 158 'ms950': 'cp950', 159 '950': 'cp950', 160 'cp950': { 161 type: '_dbcs', 162 table: function() { return require('./tables/cp950.json') }, 163 }, 164 165 // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus. 166 'big5': 'big5hkscs', 167 'big5hkscs': { 168 type: '_dbcs', 169 table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) }, 170 encodeSkipVals: [ 171 // Although Encoding Standard says we should avoid encoding to HKSCS area (See Step 1 of 172 // https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU. 173 // But if a single unicode point can be encoded both as HKSCS and regular Big5, we prefer the latter. 174 0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe, 175 0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca, 176 0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62, 177 0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef, 178 0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed, 179 180 // Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345 181 0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce, 182 ], 183 }, 184 185 'cnbig5': 'big5hkscs', 186 'csbig5': 'big5hkscs', 187 'xxbig5': 'big5hkscs', 188}; 189