11cb0ef41Sopenharmony_ci"use strict"; 21cb0ef41Sopenharmony_ci 31cb0ef41Sopenharmony_ci// Description of supported double byte encodings and aliases. 41cb0ef41Sopenharmony_ci// Tables are not require()-d until they are needed to speed up library load. 51cb0ef41Sopenharmony_ci// require()-s are direct to support Browserify. 61cb0ef41Sopenharmony_ci 71cb0ef41Sopenharmony_cimodule.exports = { 81cb0ef41Sopenharmony_ci 91cb0ef41Sopenharmony_ci // == Japanese/ShiftJIS ==================================================== 101cb0ef41Sopenharmony_ci // All japanese encodings are based on JIS X set of standards: 111cb0ef41Sopenharmony_ci // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF. 121cb0ef41Sopenharmony_ci // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. 131cb0ef41Sopenharmony_ci // Has several variations in 1978, 1983, 1990 and 1997. 141cb0ef41Sopenharmony_ci // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead. 151cb0ef41Sopenharmony_ci // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233. 161cb0ef41Sopenharmony_ci // 2 planes, first is superset of 0208, second - revised 0212. 171cb0ef41Sopenharmony_ci // Introduced in 2000, revised 2004. Some characters are in Unicode Plane 2 (0x2xxxx) 181cb0ef41Sopenharmony_ci 191cb0ef41Sopenharmony_ci // Byte encodings are: 201cb0ef41Sopenharmony_ci // * Shift_JIS: Compatible with 0201, uses not defined chars in top half as lead bytes for double-byte 211cb0ef41Sopenharmony_ci // encoding of 0208. Lead byte ranges: 0x81-0x9F, 0xE0-0xEF; Trail byte ranges: 0x40-0x7E, 0x80-0x9E, 0x9F-0xFC. 221cb0ef41Sopenharmony_ci // Windows CP932 is a superset of Shift_JIS. Some companies added more chars, notably KDDI. 231cb0ef41Sopenharmony_ci // * EUC-JP: Up to 3 bytes per character. Used mostly on *nixes. 241cb0ef41Sopenharmony_ci // 0x00-0x7F - lower part of 0201 251cb0ef41Sopenharmony_ci // 0x8E, 0xA1-0xDF - upper part of 0201 261cb0ef41Sopenharmony_ci // (0xA1-0xFE)x2 - 0208 plane (94x94). 271cb0ef41Sopenharmony_ci // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94). 281cb0ef41Sopenharmony_ci // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon. 291cb0ef41Sopenharmony_ci // Used as-is in ISO2022 family. 301cb0ef41Sopenharmony_ci // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, 311cb0ef41Sopenharmony_ci // 0201-1976 Roman, 0208-1978, 0208-1983. 321cb0ef41Sopenharmony_ci // * ISO2022-JP-1: Adds esc seq for 0212-1990. 331cb0ef41Sopenharmony_ci // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7. 341cb0ef41Sopenharmony_ci // * ISO2022-JP-3: Adds esc seq for 0201-1976 Kana set, 0213-2000 Planes 1, 2. 351cb0ef41Sopenharmony_ci // * ISO2022-JP-2004: Adds 0213-2004 Plane 1. 361cb0ef41Sopenharmony_ci // 371cb0ef41Sopenharmony_ci // After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes. 381cb0ef41Sopenharmony_ci // 391cb0ef41Sopenharmony_ci // Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html 401cb0ef41Sopenharmony_ci 411cb0ef41Sopenharmony_ci 'shiftjis': { 421cb0ef41Sopenharmony_ci type: '_dbcs', 431cb0ef41Sopenharmony_ci table: function() { return require('./tables/shiftjis.json') }, 441cb0ef41Sopenharmony_ci encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, 451cb0ef41Sopenharmony_ci encodeSkipVals: [{from: 0xED40, to: 0xF940}], 461cb0ef41Sopenharmony_ci }, 471cb0ef41Sopenharmony_ci 'csshiftjis': 'shiftjis', 481cb0ef41Sopenharmony_ci 'mskanji': 'shiftjis', 491cb0ef41Sopenharmony_ci 'sjis': 'shiftjis', 501cb0ef41Sopenharmony_ci 'windows31j': 'shiftjis', 511cb0ef41Sopenharmony_ci 'ms31j': 'shiftjis', 521cb0ef41Sopenharmony_ci 'xsjis': 'shiftjis', 531cb0ef41Sopenharmony_ci 'windows932': 'shiftjis', 541cb0ef41Sopenharmony_ci 'ms932': 'shiftjis', 551cb0ef41Sopenharmony_ci '932': 'shiftjis', 561cb0ef41Sopenharmony_ci 'cp932': 'shiftjis', 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_ci 'eucjp': { 591cb0ef41Sopenharmony_ci type: '_dbcs', 601cb0ef41Sopenharmony_ci table: function() { return require('./tables/eucjp.json') }, 611cb0ef41Sopenharmony_ci encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, 621cb0ef41Sopenharmony_ci }, 631cb0ef41Sopenharmony_ci 641cb0ef41Sopenharmony_ci // TODO: KDDI extension to Shift_JIS 651cb0ef41Sopenharmony_ci // TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes. 661cb0ef41Sopenharmony_ci // TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars. 671cb0ef41Sopenharmony_ci 681cb0ef41Sopenharmony_ci 691cb0ef41Sopenharmony_ci // == Chinese/GBK ========================================================== 701cb0ef41Sopenharmony_ci // http://en.wikipedia.org/wiki/GBK 711cb0ef41Sopenharmony_ci // We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder 721cb0ef41Sopenharmony_ci 731cb0ef41Sopenharmony_ci // Oldest GB2312 (1981, ~7600 chars) is a subset of CP936 741cb0ef41Sopenharmony_ci 'gb2312': 'cp936', 751cb0ef41Sopenharmony_ci 'gb231280': 'cp936', 761cb0ef41Sopenharmony_ci 'gb23121980': 'cp936', 771cb0ef41Sopenharmony_ci 'csgb2312': 'cp936', 781cb0ef41Sopenharmony_ci 'csiso58gb231280': 'cp936', 791cb0ef41Sopenharmony_ci 'euccn': 'cp936', 801cb0ef41Sopenharmony_ci 811cb0ef41Sopenharmony_ci // Microsoft's CP936 is a subset and approximation of GBK. 821cb0ef41Sopenharmony_ci 'windows936': 'cp936', 831cb0ef41Sopenharmony_ci 'ms936': 'cp936', 841cb0ef41Sopenharmony_ci '936': 'cp936', 851cb0ef41Sopenharmony_ci 'cp936': { 861cb0ef41Sopenharmony_ci type: '_dbcs', 871cb0ef41Sopenharmony_ci table: function() { return require('./tables/cp936.json') }, 881cb0ef41Sopenharmony_ci }, 891cb0ef41Sopenharmony_ci 901cb0ef41Sopenharmony_ci // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other. 911cb0ef41Sopenharmony_ci 'gbk': { 921cb0ef41Sopenharmony_ci type: '_dbcs', 931cb0ef41Sopenharmony_ci table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, 941cb0ef41Sopenharmony_ci }, 951cb0ef41Sopenharmony_ci 'xgbk': 'gbk', 961cb0ef41Sopenharmony_ci 'isoir58': 'gbk', 971cb0ef41Sopenharmony_ci 981cb0ef41Sopenharmony_ci // GB18030 is an algorithmic extension of GBK. 991cb0ef41Sopenharmony_ci // Main source: https://www.w3.org/TR/encoding/#gbk-encoder 1001cb0ef41Sopenharmony_ci // http://icu-project.org/docs/papers/gb18030.html 1011cb0ef41Sopenharmony_ci // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml 1021cb0ef41Sopenharmony_ci // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0 1031cb0ef41Sopenharmony_ci 'gb18030': { 1041cb0ef41Sopenharmony_ci type: '_dbcs', 1051cb0ef41Sopenharmony_ci table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, 1061cb0ef41Sopenharmony_ci gb18030: function() { return require('./tables/gb18030-ranges.json') }, 1071cb0ef41Sopenharmony_ci encodeSkipVals: [0x80], 1081cb0ef41Sopenharmony_ci encodeAdd: {'€': 0xA2E3}, 1091cb0ef41Sopenharmony_ci }, 1101cb0ef41Sopenharmony_ci 1111cb0ef41Sopenharmony_ci 'chinese': 'gb18030', 1121cb0ef41Sopenharmony_ci 1131cb0ef41Sopenharmony_ci 1141cb0ef41Sopenharmony_ci // == Korean =============================================================== 1151cb0ef41Sopenharmony_ci // EUC-KR, KS_C_5601 and KS X 1001 are exactly the same. 1161cb0ef41Sopenharmony_ci 'windows949': 'cp949', 1171cb0ef41Sopenharmony_ci 'ms949': 'cp949', 1181cb0ef41Sopenharmony_ci '949': 'cp949', 1191cb0ef41Sopenharmony_ci 'cp949': { 1201cb0ef41Sopenharmony_ci type: '_dbcs', 1211cb0ef41Sopenharmony_ci table: function() { return require('./tables/cp949.json') }, 1221cb0ef41Sopenharmony_ci }, 1231cb0ef41Sopenharmony_ci 1241cb0ef41Sopenharmony_ci 'cseuckr': 'cp949', 1251cb0ef41Sopenharmony_ci 'csksc56011987': 'cp949', 1261cb0ef41Sopenharmony_ci 'euckr': 'cp949', 1271cb0ef41Sopenharmony_ci 'isoir149': 'cp949', 1281cb0ef41Sopenharmony_ci 'korean': 'cp949', 1291cb0ef41Sopenharmony_ci 'ksc56011987': 'cp949', 1301cb0ef41Sopenharmony_ci 'ksc56011989': 'cp949', 1311cb0ef41Sopenharmony_ci 'ksc5601': 'cp949', 1321cb0ef41Sopenharmony_ci 1331cb0ef41Sopenharmony_ci 1341cb0ef41Sopenharmony_ci // == Big5/Taiwan/Hong Kong ================================================ 1351cb0ef41Sopenharmony_ci // There are lots of tables for Big5 and cp950. Please see the following links for history: 1361cb0ef41Sopenharmony_ci // http://moztw.org/docs/big5/ http://www.haible.de/bruno/charsets/conversion-tables/Big5.html 1371cb0ef41Sopenharmony_ci // Variations, in roughly number of defined chars: 1381cb0ef41Sopenharmony_ci // * Windows CP 950: Microsoft variant of Big5. Canonical: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT 1391cb0ef41Sopenharmony_ci // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/ 1401cb0ef41Sopenharmony_ci // * Big5-2003 (Taiwan standard) almost superset of cp950. 1411cb0ef41Sopenharmony_ci // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers. 1421cb0ef41Sopenharmony_ci // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. 1431cb0ef41Sopenharmony_ci // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years. 1441cb0ef41Sopenharmony_ci // Plus, it has 4 combining sequences. 1451cb0ef41Sopenharmony_ci // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299 1461cb0ef41Sopenharmony_ci // because big5-hkscs is the only encoding to include astral characters in non-algorithmic way. 1471cb0ef41Sopenharmony_ci // Implementations are not consistent within browsers; sometimes labeled as just big5. 1481cb0ef41Sopenharmony_ci // MS Internet Explorer switches from big5 to big5-hkscs when a patch applied. 1491cb0ef41Sopenharmony_ci // Great discussion & recap of what's going on https://bugzilla.mozilla.org/show_bug.cgi?id=912470#c31 1501cb0ef41Sopenharmony_ci // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s. 1511cb0ef41Sopenharmony_ci // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt 1521cb0ef41Sopenharmony_ci // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt 1531cb0ef41Sopenharmony_ci // 1541cb0ef41Sopenharmony_ci // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder 1551cb0ef41Sopenharmony_ci // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong. 1561cb0ef41Sopenharmony_ci 1571cb0ef41Sopenharmony_ci 'windows950': 'cp950', 1581cb0ef41Sopenharmony_ci 'ms950': 'cp950', 1591cb0ef41Sopenharmony_ci '950': 'cp950', 1601cb0ef41Sopenharmony_ci 'cp950': { 1611cb0ef41Sopenharmony_ci type: '_dbcs', 1621cb0ef41Sopenharmony_ci table: function() { return require('./tables/cp950.json') }, 1631cb0ef41Sopenharmony_ci }, 1641cb0ef41Sopenharmony_ci 1651cb0ef41Sopenharmony_ci // Big5 has many variations and is an extension of cp950. We use Encoding Standard's as a consensus. 1661cb0ef41Sopenharmony_ci 'big5': 'big5hkscs', 1671cb0ef41Sopenharmony_ci 'big5hkscs': { 1681cb0ef41Sopenharmony_ci type: '_dbcs', 1691cb0ef41Sopenharmony_ci table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) }, 1701cb0ef41Sopenharmony_ci encodeSkipVals: [ 1711cb0ef41Sopenharmony_ci // Although Encoding Standard says we should avoid encoding to HKSCS area (See Step 1 of 1721cb0ef41Sopenharmony_ci // https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU. 1731cb0ef41Sopenharmony_ci // But if a single unicode point can be encoded both as HKSCS and regular Big5, we prefer the latter. 1741cb0ef41Sopenharmony_ci 0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe, 1751cb0ef41Sopenharmony_ci 0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca, 1761cb0ef41Sopenharmony_ci 0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62, 1771cb0ef41Sopenharmony_ci 0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef, 1781cb0ef41Sopenharmony_ci 0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed, 1791cb0ef41Sopenharmony_ci 1801cb0ef41Sopenharmony_ci // Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345 1811cb0ef41Sopenharmony_ci 0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce, 1821cb0ef41Sopenharmony_ci ], 1831cb0ef41Sopenharmony_ci }, 1841cb0ef41Sopenharmony_ci 1851cb0ef41Sopenharmony_ci 'cnbig5': 'big5hkscs', 1861cb0ef41Sopenharmony_ci 'csbig5': 'big5hkscs', 1871cb0ef41Sopenharmony_ci 'xxbig5': 'big5hkscs', 1881cb0ef41Sopenharmony_ci}; 189