"use strict";
var Buffer = require("safer-buffer").Buffer;

// Multibyte codec. In this scheme, a character is represented by 1 or more bytes.
// Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences.
// To save memory and loading time, we read table files only when requested.

exports._dbcs = DBCSCodec;

// Sentinel values stored in the decode/encode tables (all negative, so that any value >= 0 is a real code).
var UNASSIGNED = -1,                      // No mapping for this byte / character.
    GB18030_CODE = -2,                    // End of a GB18030 4-byte sequence; the code point is computed from ranges.
    SEQ_START  = -10,                     // Values <= SEQ_START (and > NODE_START) index a sequence table.
    NODE_START = -1000,                   // Values <= NODE_START index the next trie node.
    UNASSIGNED_NODE = new Array(0x100),   // Template trie node / encode bucket with every slot unassigned.
    DEF_CHAR = -1;                        // Key used inside sequence objects for the "sequence ends here" code.

for (var i = 0; i < 0x100; i++)
    UNASSIGNED_NODE[i] = UNASSIGNED;


// Class DBCSCodec reads and initializes mapping tables.
//   codecOptions.encodingName   - name used in error messages.
//   codecOptions.table          - function returning the array of mapping chunks (required).
//   codecOptions.gb18030        - optional function returning GB18030 ranges ({uChars, gbChars}).
//   codecOptions.encodeSkipVals - optional list of multibyte codes (numbers or {from, to} ranges) that are
//                                 decoded but never produced by the encoder.
//   codecOptions.encodeAdd      - optional object mapping a unicode char to an extra encoded value.
//   iconv                       - object providing defaultCharUnicode and defaultCharSingleByte.
// Throws Error when codecOptions or codecOptions.table is missing, or when the tables are inconsistent.
function DBCSCodec(codecOptions, iconv) {
    // Validate before touching any property, so that a missing options object produces the intended message.
    if (!codecOptions)
        throw new Error("DBCS codec is called without the data.");
    this.encodingName = codecOptions.encodingName;
    if (!codecOptions.table)
        throw new Error("Encoding '" + this.encodingName + "' has no data.");

    // Load tables.
    var mappingTable = codecOptions.table();


    // Decode tables: MBCS -> Unicode.

    // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256.
    // Trie root is decodeTables[0].
    // Values: >=  0 -> unicode character code. can be > 0xFFFF
    //         == UNASSIGNED -> unknown/unassigned sequence.
    //         == GB18030_CODE -> this is the end of a GB18030 4-byte sequence.
    //         <= NODE_START -> index of the next node in our trie to process next byte.
    //         <= SEQ_START  -> index of the start of a character code sequence, in decodeTableSeq
    //                          (checked after NODE_START, i.e. NODE_START < value <= SEQ_START).
    this.decodeTables = [];
    this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node.

    // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
    this.decodeTableSeq = [];

    // Actual mapping tables consist of chunks. Use them to fill up decode tables.
    for (var i = 0; i < mappingTable.length; i++)
        this._addDecodeChunk(mappingTable[i]);

    // Load & create GB18030 tables when needed.
    if (typeof codecOptions.gb18030 === 'function') {
        this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges.

        // Add GB18030 common decode nodes. They are shared by every 4-byte prefix that has no node of its own.
        var commonThirdByteNodeIdx = this.decodeTables.length;
        this.decodeTables.push(UNASSIGNED_NODE.slice(0));

        var commonFourthByteNodeIdx = this.decodeTables.length;
        this.decodeTables.push(UNASSIGNED_NODE.slice(0));

        // Fill out the tree: bytes 1 and 3 are 0x81..0xFE, bytes 2 and 4 are 0x30..0x39.
        var firstByteNode = this.decodeTables[0];
        for (var i = 0x81; i <= 0xFE; i++) {
            var secondByteNode = this.decodeTables[NODE_START - firstByteNode[i]];
            for (var j = 0x30; j <= 0x39; j++) {
                if (secondByteNode[j] === UNASSIGNED) {
                    secondByteNode[j] = NODE_START - commonThirdByteNodeIdx;
                } else if (secondByteNode[j] > NODE_START) {
                    // Slot already holds a character/sequence, not a node reference.
                    throw new Error("gb18030 decode tables conflict at byte 2");
                }

                var thirdByteNode = this.decodeTables[NODE_START - secondByteNode[j]];
                for (var k = 0x81; k <= 0xFE; k++) {
                    if (thirdByteNode[k] === UNASSIGNED) {
                        thirdByteNode[k] = NODE_START - commonFourthByteNodeIdx;
                    } else if (thirdByteNode[k] === NODE_START - commonFourthByteNodeIdx) {
                        // Shared node was already linked and filled on an earlier pass.
                        continue;
                    } else if (thirdByteNode[k] > NODE_START) {
                        throw new Error("gb18030 decode tables conflict at byte 3");
                    }

                    var fourthByteNode = this.decodeTables[NODE_START - thirdByteNode[k]];
                    for (var l = 0x30; l <= 0x39; l++) {
                        if (fourthByteNode[l] === UNASSIGNED)
                            fourthByteNode[l] = GB18030_CODE;
                    }
                }
            }
        }
    }

    this.defaultCharUnicode = iconv.defaultCharUnicode;


    // Encode tables: Unicode -> DBCS.

    // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance.
    // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null.
    // Values: >=  0 -> it is a normal char. Write the value (if < 0x100 then 1 byte, if < 0x10000 then 2 bytes, etc.).
    //         == UNASSIGNED -> no conversion found. Output a default char.
    //         <= SEQ_START  -> it's an index in encodeTableSeq, see below. The character starts a sequence.
    this.encodeTable = [];

    // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of
    // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key
    // means end of sequence (needed when one sequence is a strict subsequence of another).
    // Objects are kept separately from encodeTable to increase performance.
    this.encodeTableSeq = [];

    // Some chars can be decoded, but need not be encoded.
    var skipEncodeChars = {};
    if (codecOptions.encodeSkipVals)
        for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) {
            var val = codecOptions.encodeSkipVals[i];
            if (typeof val === 'number')
                skipEncodeChars[val] = true;
            else
                for (var j = val.from; j <= val.to; j++)
                    skipEncodeChars[j] = true;
        }

    // Use decode trie to recursively fill out encode tables.
    this._fillEncodeTable(0, 0, skipEncodeChars);

    // Add more encoding pairs when needed.
    if (codecOptions.encodeAdd) {
        for (var uChar in codecOptions.encodeAdd)
            if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar))
                this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]);
    }

    // Pick the byte written for unencodable characters: the configured default, then the encoding's own '?',
    // then plain ASCII '?'. The bucket is indexed by char code (indexing it with the string '?' yields undefined,
    // which would defeat both this fallback and the next one).
    this.defCharSB  = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]["?".charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0);
}

DBCSCodec.prototype.encoder = DBCSEncoder;
DBCSCodec.prototype.decoder = DBCSDecoder;

// Decoder helpers

// Returns the trie node (256-element array) that holds the LAST byte of multibyte address `addr`,
// creating intermediate nodes for the leading bytes as needed.
// Throws Error if a leading byte is already mapped to a character instead of a node.
DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
    // Split the address into bytes, least significant first. Work on a copy so that the original
    // address is still available for the error message below.
    var bytes = [];
    for (var rest = addr; rest > 0; rest >>>= 8)
        bytes.push(rest & 0xFF);
    if (bytes.length == 0)
        bytes.push(0);

    var node = this.decodeTables[0];
    for (var i = bytes.length-1; i > 0; i--) { // Traverse nodes deeper into the trie (all bytes but the last).
        var val = node[bytes[i]];

        if (val == UNASSIGNED) { // Create new node.
            node[bytes[i]] = NODE_START - this.decodeTables.length;
            this.decodeTables.push(node = UNASSIGNED_NODE.slice(0));
        }
        else if (val <= NODE_START) { // Existing node.
            node = this.decodeTables[NODE_START - val];
        }
        else
            throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16));
    }
    return node;
}


// Writes one mapping chunk into the decode trie.
// chunk[0] is the starting multibyte code as a hex string; each following element is either a string of
// characters assigned to consecutive codes, or a number N meaning "N more codes, each one unicode value
// higher than the previous one".
// Throws Error on a malformed surrogate pair, an element of unsupported type, or a chunk that runs past its node.
DBCSCodec.prototype._addDecodeChunk = function(chunk) {
    // First element of chunk is the hex mbcs code where we start.
    var curAddr = parseInt(chunk[0], 16);

    // Choose the decoding node where we'll write our chars.
    var writeTable = this._getDecodeTrieNode(curAddr);
    curAddr = curAddr & 0xFF; // From here on curAddr is the index inside that node.

    // Write all other elements of the chunk to the table.
    for (var k = 1; k < chunk.length; k++) {
        var part = chunk[k];
        if (typeof part === "string") { // String, write as-is.
            for (var l = 0; l < part.length;) {
                var code = part.charCodeAt(l++);
                if (0xD800 <= code && code < 0xDC00) { // Decode surrogate
                    var codeTrail = part.charCodeAt(l++);
                    if (0xDC00 <= codeTrail && codeTrail < 0xE000)
                        writeTable[curAddr++] = 0x10000 + (code - 0xD800) * 0x400 + (codeTrail - 0xDC00);
                    else
                        throw new Error("Incorrect surrogate pair in "  + this.encodingName + " at chunk " + chunk[0]);
                }
                else if (0x0FF0 < code && code <= 0x0FFF) { // Character sequence (our own encoding used)
                    // Marker 0xFFF means 2 following chars, 0xFFE means 3, and so on.
                    var len = 0xFFF - code + 2;
                    var seq = [];
                    for (var m = 0; m < len; m++)
                        seq.push(part.charCodeAt(l++)); // Simple variation: don't support surrogates or subsequences in seq.

                    writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length;
                    this.decodeTableSeq.push(seq);
                }
                else
                    writeTable[curAddr++] = code; // Basic char
            }
        }
        else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character.
            var charCode = writeTable[curAddr - 1] + 1;
            for (var l = 0; l < part; l++)
                writeTable[curAddr++] = charCode++;
        }
        else
            throw new Error("Incorrect type '" + typeof part + "' given in "  + this.encodingName + " at chunk " + chunk[0]);
    }
    if (curAddr > 0xFF)
        throw new Error("Incorrect chunk in "  + this.encodingName + " at addr " + chunk[0] + ": too long" + curAddr);
}

// Encoder helpers

// Returns the 256-entry encode bucket covering unicode code `uCode`, creating it on first use.
DBCSCodec.prototype._getEncodeBucket = function(uCode) {
    var high = uCode >> 8; // This could be > 0xFF because of astral characters.
    if (this.encodeTable[high] === undefined)
        this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand.
    return this.encodeTable[high];
}

// Maps unicode code `uCode` to `dbcsCode`. An existing plain mapping is kept (first one wins).
DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;
    if (bucket[low] <= SEQ_START)
        this.encodeTableSeq[SEQ_START-bucket[low]][DEF_CHAR] = dbcsCode; // There's already a sequence, set a single-char subsequence of it.
    else if (bucket[low] == UNASSIGNED)
        bucket[low] = dbcsCode;
}

// Maps the sequence of unicode codes `seq` (array, length >= 2) to the single code `dbcsCode`.
// The tree root for seq[0] lives in encodeTableSeq; each following char is a key one level deeper.
DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

    // Get the root of character tree according to first character of the sequence.
    var uCode = seq[0];
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;

    var node;
    if (bucket[low] <= SEQ_START) {
        // There's already a sequence with - use it.
        node = this.encodeTableSeq[SEQ_START-bucket[low]];
    }
    else {
        // There was no sequence object - allocate a new one.
        node = {};
        if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence.
        bucket[low] = SEQ_START - this.encodeTableSeq.length;
        this.encodeTableSeq.push(node);
    }

    // Traverse the character tree, allocating new nodes as needed.
    for (var j = 1; j < seq.length-1; j++) {
        uCode = seq[j]; // Descend by the j-th character of the sequence (not by the first one).
        var oldVal = node[uCode];
        if (typeof oldVal === 'object')
            node = oldVal;
        else {
            node = node[uCode] = {};
            if (oldVal !== undefined)
                node[DEF_CHAR] = oldVal; // A shorter sequence ended here - keep it as the default of the new node.
        }
    }

    // Set the leaf to given dbcsCode.
    uCode = seq[seq.length-1];
    if (typeof node[uCode] === 'object')
        node[uCode][DEF_CHAR] = dbcsCode; // A longer sequence already passes through here - don't discard it.
    else
        node[uCode] = dbcsCode;
}

// Walks decode trie node `nodeIdx` (whose multibyte prefix is `prefix`) and registers the reverse mapping
// for everything found, recursing into child nodes. Codes listed in `skipEncodeChars` are not registered.
// Returns true when the node or any of its descendants contained at least one mapping.
DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
    var node = this.decodeTables[nodeIdx];
    var hasValues = false;
    var subNodeEmpty = {}; // Child node indexes already found to be empty (shared nodes are visited only once).
    for (var i = 0; i < 0x100; i++) {
        var uCode = node[i];
        var mbCode = prefix + i;
        if (skipEncodeChars[mbCode])
            continue;

        if (uCode >= 0) {
            this._setEncodeChar(uCode, mbCode);
            hasValues = true;
        } else if (uCode <= NODE_START) {
            var subNodeIdx = NODE_START - uCode;
            if (!subNodeEmpty[subNodeIdx]) {  // Skip empty subtrees (they are too large in gb18030).
                var newPrefix = (mbCode << 8) >>> 0;  // NOTE: '>>> 0' keeps 32-bit num positive.
                if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars))
                    hasValues = true;
                else
                    subNodeEmpty[subNodeIdx] = true;
            }
        } else if (uCode <= SEQ_START) {
            this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode);
            hasValues = true;
        }
    }
    return hasValues;
}



// == Encoder ==================================================================

// Streaming encoder. `options` is not read here; `codec` is the DBCSCodec instance supplying the tables.
function DBCSEncoder(options, codec) {
    // Encoder state
    this.leadSurrogate = -1;   // Lead surrogate seen at the end of the previous write(), or -1.
    this.seqObj = undefined;   // Sequence tree node we are in the middle of, or undefined.

    // Static data
    this.encodeTable = codec.encodeTable;
    this.encodeTableSeq = codec.encodeTableSeq;
    this.defaultCharSingleByte = codec.defCharSB;
    this.gb18030 = codec.gb18030;
}

// Encodes string `str` and returns a Buffer with the bytes that can be produced so far.
// A trailing lead surrogate or unfinished sequence is kept in the encoder state for the next write()/end().
DBCSEncoder.prototype.write = function(str) {
    // State carried over from the previous call (a pending sequence and/or a pending lead surrogate) can emit
    // up to two extra codes that are not accounted for by str.length, hence the "+ 2". Without the slack those
    // bytes would be silently dropped, because out-of-range writes to a Buffer are ignored.
    var newBuf = Buffer.alloc((str.length + 2) * (this.gb18030 ? 4 : 3)),
        leadSurrogate = this.leadSurrogate,
        seqObj = this.seqObj, nextChar = -1,
        i = 0, j = 0,
        uCode, dbcsCode;

    while (true) {
        // 0. Get next character.
        if (nextChar === -1) {
            if (i == str.length) break;
            uCode = str.charCodeAt(i++);
        }
        else {
            uCode = nextChar;
            nextChar = -1;
        }

        // 1. Handle surrogates.
        if (0xD800 <= uCode && uCode < 0xE000) { // Char is one of surrogates.
            if (uCode < 0xDC00) { // We've got lead surrogate.
                if (leadSurrogate === -1) {
                    leadSurrogate = uCode;
                    continue;
                } else {
                    leadSurrogate = uCode;
                    // Double lead surrogate found.
                    uCode = UNASSIGNED;
                }
            } else { // We've got trail surrogate.
                if (leadSurrogate !== -1) {
                    uCode = 0x10000 + (leadSurrogate - 0xD800) * 0x400 + (uCode - 0xDC00);
                    leadSurrogate = -1;
                } else {
                    // Incomplete surrogate pair - only trail surrogate found.
                    uCode = UNASSIGNED;
                }

            }
        }
        else if (leadSurrogate !== -1) {
            // Incomplete surrogate pair - only lead surrogate found.
            nextChar = uCode; uCode = UNASSIGNED; // Write an error, then current char.
            leadSurrogate = -1;
        }

        // 2. Convert uCode character.
        dbcsCode = UNASSIGNED;
        if (seqObj !== undefined && uCode != UNASSIGNED) { // We are in the middle of the sequence
            var resCode = seqObj[uCode];
            if (typeof resCode === 'object') { // Sequence continues.
                seqObj = resCode;
                continue;

            } else if (typeof resCode == 'number') { // Sequence finished. Write it.
                dbcsCode = resCode;

            } else if (resCode == undefined) { // Current character is not part of the sequence.

                // Try default character for this sequence
                resCode = seqObj[DEF_CHAR];
                if (resCode !== undefined) {
                    dbcsCode = resCode; // Found. Write it.
                    nextChar = uCode; // Current character will be written too in the next iteration.

                } else {
                    // TODO: What if we have no default? (resCode == undefined)
                    // Then, we should write first char of the sequence as-is and try the rest recursively.
                    // Didn't do it for now because no encoding has this situation yet.
                    // Currently, just skip the sequence and write current char.
                }
            }
            seqObj = undefined;
        }
        else if (uCode >= 0) {  // Regular character
            var subtable = this.encodeTable[uCode >> 8];
            if (subtable !== undefined)
                dbcsCode = subtable[uCode & 0xFF];

            if (dbcsCode <= SEQ_START) { // Sequence start
                seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
                continue;
            }

            if (dbcsCode == UNASSIGNED && this.gb18030) {
                // Use GB18030 algorithm to find character(s) to write.
                var idx = findIdx(this.gb18030.uChars, uCode);
                if (idx != -1) {
                    // Linear pointer inside the range, split into the four bytes (radixes 126/10/126/10).
                    dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]);
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); dbcsCode = dbcsCode % 12600;
                    newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); dbcsCode = dbcsCode % 1260;
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); dbcsCode = dbcsCode % 10;
                    newBuf[j++] = 0x30 + dbcsCode;
                    continue;
                }
            }
        }

        // 3. Write dbcsCode character.
        if (dbcsCode === UNASSIGNED)
            dbcsCode = this.defaultCharSingleByte;

        if (dbcsCode < 0x100) {
            newBuf[j++] = dbcsCode;
        }
        else if (dbcsCode < 0x10000) {
            newBuf[j++] = dbcsCode >> 8;   // high byte
            newBuf[j++] = dbcsCode & 0xFF; // low byte
        }
        else if (dbcsCode < 0x1000000) {
            newBuf[j++] = dbcsCode >> 16;
            newBuf[j++] = (dbcsCode >> 8) & 0xFF;
            newBuf[j++] = dbcsCode & 0xFF;
        } else {
            newBuf[j++] = dbcsCode >>> 24;
            newBuf[j++] = (dbcsCode >>> 16) & 0xFF;
            newBuf[j++] = (dbcsCode >>> 8) & 0xFF;
            newBuf[j++] = dbcsCode & 0xFF;
        }
    }

    this.seqObj = seqObj;
    this.leadSurrogate = leadSurrogate;
    return newBuf.slice(0, j);
}

// Flushes the encoder state. Returns undefined when nothing is pending, otherwise a Buffer holding the
// default code of an unfinished sequence and/or the default char for a dangling lead surrogate.
DBCSEncoder.prototype.end = function() {
    if (this.leadSurrogate === -1 && this.seqObj === undefined)
        return; // All clean. Most often case.

    var newBuf = Buffer.alloc(10), j = 0;

    if (this.seqObj) { // We're in the sequence.
        var dbcsCode = this.seqObj[DEF_CHAR];
        if (dbcsCode !== undefined) { // Write beginning of the sequence.
            if (dbcsCode < 0x100) {
                newBuf[j++] = dbcsCode;
            }
            else {
                newBuf[j++] = dbcsCode >> 8;   // high byte
                newBuf[j++] = dbcsCode & 0xFF; // low byte
            }
        } else {
            // See todo above.
        }
        this.seqObj = undefined;
    }

    if (this.leadSurrogate !== -1) {
        // Incomplete surrogate pair - only lead surrogate found.
        newBuf[j++] = this.defaultCharSingleByte;
        this.leadSurrogate = -1;
    }

    return newBuf.slice(0, j);
}

// Export for testing
DBCSEncoder.prototype.findIdx = findIdx;


// == Decoder ==================================================================

// Streaming decoder. `options` is not read here; `codec` is the DBCSCodec instance supplying the tables.
function DBCSDecoder(options, codec) {
    // Decoder state
    this.nodeIdx = 0;      // Trie node reached by the bytes of the unfinished character.
    this.prevBytes = [];   // Bytes of the unfinished character carried over from previous write() calls.

    // Static data
    this.decodeTables = codec.decodeTables;
    this.decodeTableSeq = codec.decodeTableSeq;
    this.defaultCharUnicode = codec.defaultCharUnicode;
    this.gb18030 = codec.gb18030;
}

// Decodes the bytes in `buf` (Buffer or array of byte values) and returns the resulting string.
// Bytes of an unfinished trailing character are remembered for the next write()/end().
// Throws Error if a decode table contains a value of no known kind.
DBCSDecoder.prototype.write = function(buf) {
    var prevBytes = this.prevBytes, prevOffset = prevBytes.length,
        // Characters may complete (or be re-parsed) using bytes carried over from the previous call, so those
        // bytes must be counted too. Otherwise e.g. an astral character split across two chunks would not fit
        // and its trail surrogate would be silently dropped.
        newBuf = Buffer.alloc((buf.length + prevOffset) * 2),
        nodeIdx = this.nodeIdx,
        seqStart = -prevOffset, // idx of the start of current parsed sequence (negative = inside prevBytes).
        uCode;

    for (var i = 0, j = 0; i < buf.length; i++) {
        // 'i' can become negative after an error rewinds into the carried-over bytes.
        var curByte = (i >= 0) ? buf[i] : prevBytes[i + prevOffset];

        // Lookup in current trie node.
        uCode = this.decodeTables[nodeIdx][curByte];

        if (uCode >= 0) {
            // Normal character, just use it.
        }
        else if (uCode === UNASSIGNED) { // Unknown char.
            // TODO: Callback with seq.
            uCode = this.defaultCharUnicode.charCodeAt(0);
            i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again.
        }
        else if (uCode === GB18030_CODE) {
            // Rebuild the linear pointer from the four bytes; some of them may live in prevBytes.
            var ptr;
            if (i >= 3) {
                ptr = (buf[i-3]-0x81)*12600 + (buf[i-2]-0x30)*1260 + (buf[i-1]-0x81)*10 + (curByte-0x30);
            } else {
                ptr = (prevBytes[i-3+prevOffset]-0x81)*12600 +
                      (((i-2 >= 0) ? buf[i-2] : prevBytes[i-2+prevOffset])-0x30)*1260 +
                      (((i-1 >= 0) ? buf[i-1] : prevBytes[i-1+prevOffset])-0x81)*10 +
                      (curByte-0x30);
            }
            var idx = findIdx(this.gb18030.gbChars, ptr);
            uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx];
        }
        else if (uCode <= NODE_START) { // Go to next trie node.
            nodeIdx = NODE_START - uCode;
            continue;
        }
        else if (uCode <= SEQ_START) { // Output a sequence of chars.
            var seq = this.decodeTableSeq[SEQ_START - uCode];
            for (var k = 0; k < seq.length - 1; k++) {
                uCode = seq[k];
                newBuf[j++] = uCode & 0xFF;
                newBuf[j++] = uCode >> 8;
            }
            uCode = seq[seq.length-1]; // The last one is written by the common code below.
        }
        else
            throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

        // Write the character to buffer, handling higher planes using surrogate pair.
        if (uCode >= 0x10000) {
            uCode -= 0x10000;
            var uCodeLead = 0xD800 | (uCode >> 10);
            newBuf[j++] = uCodeLead & 0xFF;
            newBuf[j++] = uCodeLead >> 8;

            uCode = 0xDC00 | (uCode & 0x3FF);
        }
        newBuf[j++] = uCode & 0xFF;
        newBuf[j++] = uCode >> 8;

        // Reset trie node.
        nodeIdx = 0; seqStart = i+1;
    }

    this.nodeIdx = nodeIdx;
    // Remember the bytes of the unfinished character: either a tail of buf, or a tail of the old
    // prevBytes followed by all of buf.
    this.prevBytes = (seqStart >= 0)
        ? Array.prototype.slice.call(buf, seqStart)
        : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf));

    return newBuf.slice(0, j).toString('ucs2');
}

// Flushes the decoder: every leftover byte that cannot start a complete character becomes the default
// character; whatever follows it is parsed as usual. Returns the resulting string (possibly empty).
DBCSDecoder.prototype.end = function() {
    var ret = '';

    // Try to parse all remaining chars.
    while (this.prevBytes.length > 0) {
        // Skip 1 character in the buffer.
        ret += this.defaultCharUnicode;
        var bytesArr = this.prevBytes.slice(1);

        // Parse remaining as usual.
        this.prevBytes = [];
        this.nodeIdx = 0;
        if (bytesArr.length > 0)
            ret += this.write(bytesArr);
    }

    this.prevBytes = [];
    this.nodeIdx = 0;
    return ret;
}

// Binary search for GB18030. Returns largest i such that table[i] <= val.
// findIdx(table, val): `table` is an array of numbers sorted in ascending order.
// Returns the largest index i with table[i] <= val, or -1 when there is no such index
// (val is smaller than the first element, or the table is empty).
function findIdx(table, val) {
    // An empty table has no matching index; without this guard the search below would report index 0.
    if (table.length === 0 || table[0] > val)
        return -1;

    var l = 0, r = table.length;
    while (l < r-1) { // always table[l] <= val < table[r]  (table[table.length] is treated as +infinity)
        var mid = l + ((r-l+1) >> 1);
        if (table[mid] <= val)
            l = mid;
        else
            r = mid;
    }
    return l;
}