1"use strict"; 2var Buffer = require("safer-buffer").Buffer; 3 4// Multibyte codec. In this scheme, a character is represented by 1 or more bytes. 5// Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences. 6// To save memory and loading time, we read table files only when requested. 7 8exports._dbcs = DBCSCodec; 9 10var UNASSIGNED = -1, 11 GB18030_CODE = -2, 12 SEQ_START = -10, 13 NODE_START = -1000, 14 UNASSIGNED_NODE = new Array(0x100), 15 DEF_CHAR = -1; 16 17for (var i = 0; i < 0x100; i++) 18 UNASSIGNED_NODE[i] = UNASSIGNED; 19 20 21// Class DBCSCodec reads and initializes mapping tables. 22function DBCSCodec(codecOptions, iconv) { 23 this.encodingName = codecOptions.encodingName; 24 if (!codecOptions) 25 throw new Error("DBCS codec is called without the data.") 26 if (!codecOptions.table) 27 throw new Error("Encoding '" + this.encodingName + "' has no data."); 28 29 // Load tables. 30 var mappingTable = codecOptions.table(); 31 32 33 // Decode tables: MBCS -> Unicode. 34 35 // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. 36 // Trie root is decodeTables[0]. 37 // Values: >= 0 -> unicode character code. can be > 0xFFFF 38 // == UNASSIGNED -> unknown/unassigned sequence. 39 // == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. 40 // <= NODE_START -> index of the next node in our trie to process next byte. 41 // <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. 42 this.decodeTables = []; 43 this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. 44 45 // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. 46 this.decodeTableSeq = []; 47 48 // Actual mapping tables consist of chunks. Use them to fill up decode tables. 49 for (var i = 0; i < mappingTable.length; i++) 50 this._addDecodeChunk(mappingTable[i]); 51 52 // Load & create GB18030 tables when needed. 53 if (typeof codecOptions.gb18030 === 'function') { 54 this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. 55 56 // Add GB18030 common decode nodes. 57 var commonThirdByteNodeIdx = this.decodeTables.length; 58 this.decodeTables.push(UNASSIGNED_NODE.slice(0)); 59 60 var commonFourthByteNodeIdx = this.decodeTables.length; 61 this.decodeTables.push(UNASSIGNED_NODE.slice(0)); 62 63 // Fill out the tree 64 var firstByteNode = this.decodeTables[0]; 65 for (var i = 0x81; i <= 0xFE; i++) { 66 var secondByteNode = this.decodeTables[NODE_START - firstByteNode[i]]; 67 for (var j = 0x30; j <= 0x39; j++) { 68 if (secondByteNode[j] === UNASSIGNED) { 69 secondByteNode[j] = NODE_START - commonThirdByteNodeIdx; 70 } else if (secondByteNode[j] > NODE_START) { 71 throw new Error("gb18030 decode tables conflict at byte 2"); 72 } 73 74 var thirdByteNode = this.decodeTables[NODE_START - secondByteNode[j]]; 75 for (var k = 0x81; k <= 0xFE; k++) { 76 if (thirdByteNode[k] === UNASSIGNED) { 77 thirdByteNode[k] = NODE_START - commonFourthByteNodeIdx; 78 } else if (thirdByteNode[k] === NODE_START - commonFourthByteNodeIdx) { 79 continue; 80 } else if (thirdByteNode[k] > NODE_START) { 81 throw new Error("gb18030 decode tables conflict at byte 3"); 82 } 83 84 var fourthByteNode = this.decodeTables[NODE_START - thirdByteNode[k]]; 85 for (var l = 0x30; l <= 0x39; l++) { 86 if (fourthByteNode[l] === UNASSIGNED) 87 fourthByteNode[l] = GB18030_CODE; 88 } 89 } 90 } 91 } 92 } 93 94 this.defaultCharUnicode = iconv.defaultCharUnicode; 95 96 97 // Encode tables: Unicode -> DBCS. 98 99 // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. 100 // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. 101 // Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). 102 // == UNASSIGNED -> no conversion found. Output a default char. 103 // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. 104 this.encodeTable = []; 105 106 // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of 107 // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key 108 // means end of sequence (needed when one sequence is a strict subsequence of another). 109 // Objects are kept separately from encodeTable to increase performance. 110 this.encodeTableSeq = []; 111 112 // Some chars can be decoded, but need not be encoded. 113 var skipEncodeChars = {}; 114 if (codecOptions.encodeSkipVals) 115 for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) { 116 var val = codecOptions.encodeSkipVals[i]; 117 if (typeof val === 'number') 118 skipEncodeChars[val] = true; 119 else 120 for (var j = val.from; j <= val.to; j++) 121 skipEncodeChars[j] = true; 122 } 123 124 // Use decode trie to recursively fill out encode tables. 125 this._fillEncodeTable(0, 0, skipEncodeChars); 126 127 // Add more encoding pairs when needed. 128 if (codecOptions.encodeAdd) { 129 for (var uChar in codecOptions.encodeAdd) 130 if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar)) 131 this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); 132 } 133 134 this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; 135 if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]['?']; 136 if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0); 137} 138 139DBCSCodec.prototype.encoder = DBCSEncoder; 140DBCSCodec.prototype.decoder = DBCSDecoder; 141 142// Decoder helpers 143DBCSCodec.prototype._getDecodeTrieNode = function(addr) { 144 var bytes = []; 145 for (; addr > 0; addr >>>= 8) 146 bytes.push(addr & 0xFF); 147 if (bytes.length == 0) 148 bytes.push(0); 149 150 var node = this.decodeTables[0]; 151 for (var i = bytes.length-1; i > 0; i--) { // Traverse nodes deeper into the trie. 152 var val = node[bytes[i]]; 153 154 if (val == UNASSIGNED) { // Create new node. 155 node[bytes[i]] = NODE_START - this.decodeTables.length; 156 this.decodeTables.push(node = UNASSIGNED_NODE.slice(0)); 157 } 158 else if (val <= NODE_START) { // Existing node. 159 node = this.decodeTables[NODE_START - val]; 160 } 161 else 162 throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16)); 163 } 164 return node; 165} 166 167 168DBCSCodec.prototype._addDecodeChunk = function(chunk) { 169 // First element of chunk is the hex mbcs code where we start. 170 var curAddr = parseInt(chunk[0], 16); 171 172 // Choose the decoding node where we'll write our chars. 173 var writeTable = this._getDecodeTrieNode(curAddr); 174 curAddr = curAddr & 0xFF; 175 176 // Write all other elements of the chunk to the table. 177 for (var k = 1; k < chunk.length; k++) { 178 var part = chunk[k]; 179 if (typeof part === "string") { // String, write as-is. 180 for (var l = 0; l < part.length;) { 181 var code = part.charCodeAt(l++); 182 if (0xD800 <= code && code < 0xDC00) { // Decode surrogate 183 var codeTrail = part.charCodeAt(l++); 184 if (0xDC00 <= codeTrail && codeTrail < 0xE000) 185 writeTable[curAddr++] = 0x10000 + (code - 0xD800) * 0x400 + (codeTrail - 0xDC00); 186 else 187 throw new Error("Incorrect surrogate pair in " + this.encodingName + " at chunk " + chunk[0]); 188 } 189 else if (0x0FF0 < code && code <= 0x0FFF) { // Character sequence (our own encoding used) 190 var len = 0xFFF - code + 2; 191 var seq = []; 192 for (var m = 0; m < len; m++) 193 seq.push(part.charCodeAt(l++)); // Simple variation: don't support surrogates or subsequences in seq. 194 195 writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length; 196 this.decodeTableSeq.push(seq); 197 } 198 else 199 writeTable[curAddr++] = code; // Basic char 200 } 201 } 202 else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character. 203 var charCode = writeTable[curAddr - 1] + 1; 204 for (var l = 0; l < part; l++) 205 writeTable[curAddr++] = charCode++; 206 } 207 else 208 throw new Error("Incorrect type '" + typeof part + "' given in " + this.encodingName + " at chunk " + chunk[0]); 209 } 210 if (curAddr > 0xFF) 211 throw new Error("Incorrect chunk in " + this.encodingName + " at addr " + chunk[0] + ": too long" + curAddr); 212} 213 214// Encoder helpers 215DBCSCodec.prototype._getEncodeBucket = function(uCode) { 216 var high = uCode >> 8; // This could be > 0xFF because of astral characters. 217 if (this.encodeTable[high] === undefined) 218 this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand. 219 return this.encodeTable[high]; 220} 221 222DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) { 223 var bucket = this._getEncodeBucket(uCode); 224 var low = uCode & 0xFF; 225 if (bucket[low] <= SEQ_START) 226 this.encodeTableSeq[SEQ_START-bucket[low]][DEF_CHAR] = dbcsCode; // There's already a sequence, set a single-char subsequence of it. 227 else if (bucket[low] == UNASSIGNED) 228 bucket[low] = dbcsCode; 229} 230 231DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) { 232 233 // Get the root of character tree according to first character of the sequence. 234 var uCode = seq[0]; 235 var bucket = this._getEncodeBucket(uCode); 236 var low = uCode & 0xFF; 237 238 var node; 239 if (bucket[low] <= SEQ_START) { 240 // There's already a sequence with - use it. 241 node = this.encodeTableSeq[SEQ_START-bucket[low]]; 242 } 243 else { 244 // There was no sequence object - allocate a new one. 245 node = {}; 246 if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence. 247 bucket[low] = SEQ_START - this.encodeTableSeq.length; 248 this.encodeTableSeq.push(node); 249 } 250 251 // Traverse the character tree, allocating new nodes as needed. 252 for (var j = 1; j < seq.length-1; j++) { 253 var oldVal = node[uCode]; 254 if (typeof oldVal === 'object') 255 node = oldVal; 256 else { 257 node = node[uCode] = {} 258 if (oldVal !== undefined) 259 node[DEF_CHAR] = oldVal 260 } 261 } 262 263 // Set the leaf to given dbcsCode. 264 uCode = seq[seq.length-1]; 265 node[uCode] = dbcsCode; 266} 267 268DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) { 269 var node = this.decodeTables[nodeIdx]; 270 var hasValues = false; 271 var subNodeEmpty = {}; 272 for (var i = 0; i < 0x100; i++) { 273 var uCode = node[i]; 274 var mbCode = prefix + i; 275 if (skipEncodeChars[mbCode]) 276 continue; 277 278 if (uCode >= 0) { 279 this._setEncodeChar(uCode, mbCode); 280 hasValues = true; 281 } else if (uCode <= NODE_START) { 282 var subNodeIdx = NODE_START - uCode; 283 if (!subNodeEmpty[subNodeIdx]) { // Skip empty subtrees (they are too large in gb18030). 284 var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive. 285 if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars)) 286 hasValues = true; 287 else 288 subNodeEmpty[subNodeIdx] = true; 289 } 290 } else if (uCode <= SEQ_START) { 291 this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode); 292 hasValues = true; 293 } 294 } 295 return hasValues; 296} 297 298 299 300// == Encoder ================================================================== 301 302function DBCSEncoder(options, codec) { 303 // Encoder state 304 this.leadSurrogate = -1; 305 this.seqObj = undefined; 306 307 // Static data 308 this.encodeTable = codec.encodeTable; 309 this.encodeTableSeq = codec.encodeTableSeq; 310 this.defaultCharSingleByte = codec.defCharSB; 311 this.gb18030 = codec.gb18030; 312} 313 314DBCSEncoder.prototype.write = function(str) { 315 var newBuf = Buffer.alloc(str.length * (this.gb18030 ? 4 : 3)), 316 leadSurrogate = this.leadSurrogate, 317 seqObj = this.seqObj, nextChar = -1, 318 i = 0, j = 0; 319 320 while (true) { 321 // 0. Get next character. 322 if (nextChar === -1) { 323 if (i == str.length) break; 324 var uCode = str.charCodeAt(i++); 325 } 326 else { 327 var uCode = nextChar; 328 nextChar = -1; 329 } 330 331 // 1. Handle surrogates. 332 if (0xD800 <= uCode && uCode < 0xE000) { // Char is one of surrogates. 333 if (uCode < 0xDC00) { // We've got lead surrogate. 334 if (leadSurrogate === -1) { 335 leadSurrogate = uCode; 336 continue; 337 } else { 338 leadSurrogate = uCode; 339 // Double lead surrogate found. 340 uCode = UNASSIGNED; 341 } 342 } else { // We've got trail surrogate. 343 if (leadSurrogate !== -1) { 344 uCode = 0x10000 + (leadSurrogate - 0xD800) * 0x400 + (uCode - 0xDC00); 345 leadSurrogate = -1; 346 } else { 347 // Incomplete surrogate pair - only trail surrogate found. 348 uCode = UNASSIGNED; 349 } 350 351 } 352 } 353 else if (leadSurrogate !== -1) { 354 // Incomplete surrogate pair - only lead surrogate found. 355 nextChar = uCode; uCode = UNASSIGNED; // Write an error, then current char. 356 leadSurrogate = -1; 357 } 358 359 // 2. Convert uCode character. 360 var dbcsCode = UNASSIGNED; 361 if (seqObj !== undefined && uCode != UNASSIGNED) { // We are in the middle of the sequence 362 var resCode = seqObj[uCode]; 363 if (typeof resCode === 'object') { // Sequence continues. 364 seqObj = resCode; 365 continue; 366 367 } else if (typeof resCode == 'number') { // Sequence finished. Write it. 368 dbcsCode = resCode; 369 370 } else if (resCode == undefined) { // Current character is not part of the sequence. 371 372 // Try default character for this sequence 373 resCode = seqObj[DEF_CHAR]; 374 if (resCode !== undefined) { 375 dbcsCode = resCode; // Found. Write it. 376 nextChar = uCode; // Current character will be written too in the next iteration. 377 378 } else { 379 // TODO: What if we have no default? (resCode == undefined) 380 // Then, we should write first char of the sequence as-is and try the rest recursively. 381 // Didn't do it for now because no encoding has this situation yet. 382 // Currently, just skip the sequence and write current char. 383 } 384 } 385 seqObj = undefined; 386 } 387 else if (uCode >= 0) { // Regular character 388 var subtable = this.encodeTable[uCode >> 8]; 389 if (subtable !== undefined) 390 dbcsCode = subtable[uCode & 0xFF]; 391 392 if (dbcsCode <= SEQ_START) { // Sequence start 393 seqObj = this.encodeTableSeq[SEQ_START-dbcsCode]; 394 continue; 395 } 396 397 if (dbcsCode == UNASSIGNED && this.gb18030) { 398 // Use GB18030 algorithm to find character(s) to write. 399 var idx = findIdx(this.gb18030.uChars, uCode); 400 if (idx != -1) { 401 var dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); 402 newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); dbcsCode = dbcsCode % 12600; 403 newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); dbcsCode = dbcsCode % 1260; 404 newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); dbcsCode = dbcsCode % 10; 405 newBuf[j++] = 0x30 + dbcsCode; 406 continue; 407 } 408 } 409 } 410 411 // 3. Write dbcsCode character. 412 if (dbcsCode === UNASSIGNED) 413 dbcsCode = this.defaultCharSingleByte; 414 415 if (dbcsCode < 0x100) { 416 newBuf[j++] = dbcsCode; 417 } 418 else if (dbcsCode < 0x10000) { 419 newBuf[j++] = dbcsCode >> 8; // high byte 420 newBuf[j++] = dbcsCode & 0xFF; // low byte 421 } 422 else if (dbcsCode < 0x1000000) { 423 newBuf[j++] = dbcsCode >> 16; 424 newBuf[j++] = (dbcsCode >> 8) & 0xFF; 425 newBuf[j++] = dbcsCode & 0xFF; 426 } else { 427 newBuf[j++] = dbcsCode >>> 24; 428 newBuf[j++] = (dbcsCode >>> 16) & 0xFF; 429 newBuf[j++] = (dbcsCode >>> 8) & 0xFF; 430 newBuf[j++] = dbcsCode & 0xFF; 431 } 432 } 433 434 this.seqObj = seqObj; 435 this.leadSurrogate = leadSurrogate; 436 return newBuf.slice(0, j); 437} 438 439DBCSEncoder.prototype.end = function() { 440 if (this.leadSurrogate === -1 && this.seqObj === undefined) 441 return; // All clean. Most often case. 442 443 var newBuf = Buffer.alloc(10), j = 0; 444 445 if (this.seqObj) { // We're in the sequence. 446 var dbcsCode = this.seqObj[DEF_CHAR]; 447 if (dbcsCode !== undefined) { // Write beginning of the sequence. 448 if (dbcsCode < 0x100) { 449 newBuf[j++] = dbcsCode; 450 } 451 else { 452 newBuf[j++] = dbcsCode >> 8; // high byte 453 newBuf[j++] = dbcsCode & 0xFF; // low byte 454 } 455 } else { 456 // See todo above. 457 } 458 this.seqObj = undefined; 459 } 460 461 if (this.leadSurrogate !== -1) { 462 // Incomplete surrogate pair - only lead surrogate found. 463 newBuf[j++] = this.defaultCharSingleByte; 464 this.leadSurrogate = -1; 465 } 466 467 return newBuf.slice(0, j); 468} 469 470// Export for testing 471DBCSEncoder.prototype.findIdx = findIdx; 472 473 474// == Decoder ================================================================== 475 476function DBCSDecoder(options, codec) { 477 // Decoder state 478 this.nodeIdx = 0; 479 this.prevBytes = []; 480 481 // Static data 482 this.decodeTables = codec.decodeTables; 483 this.decodeTableSeq = codec.decodeTableSeq; 484 this.defaultCharUnicode = codec.defaultCharUnicode; 485 this.gb18030 = codec.gb18030; 486} 487 488DBCSDecoder.prototype.write = function(buf) { 489 var newBuf = Buffer.alloc(buf.length*2), 490 nodeIdx = this.nodeIdx, 491 prevBytes = this.prevBytes, prevOffset = this.prevBytes.length, 492 seqStart = -this.prevBytes.length, // idx of the start of current parsed sequence. 493 uCode; 494 495 for (var i = 0, j = 0; i < buf.length; i++) { 496 var curByte = (i >= 0) ? buf[i] : prevBytes[i + prevOffset]; 497 498 // Lookup in current trie node. 499 var uCode = this.decodeTables[nodeIdx][curByte]; 500 501 if (uCode >= 0) { 502 // Normal character, just use it. 503 } 504 else if (uCode === UNASSIGNED) { // Unknown char. 505 // TODO: Callback with seq. 506 uCode = this.defaultCharUnicode.charCodeAt(0); 507 i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again. 508 } 509 else if (uCode === GB18030_CODE) { 510 if (i >= 3) { 511 var ptr = (buf[i-3]-0x81)*12600 + (buf[i-2]-0x30)*1260 + (buf[i-1]-0x81)*10 + (curByte-0x30); 512 } else { 513 var ptr = (prevBytes[i-3+prevOffset]-0x81)*12600 + 514 (((i-2 >= 0) ? buf[i-2] : prevBytes[i-2+prevOffset])-0x30)*1260 + 515 (((i-1 >= 0) ? buf[i-1] : prevBytes[i-1+prevOffset])-0x81)*10 + 516 (curByte-0x30); 517 } 518 var idx = findIdx(this.gb18030.gbChars, ptr); 519 uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; 520 } 521 else if (uCode <= NODE_START) { // Go to next trie node. 522 nodeIdx = NODE_START - uCode; 523 continue; 524 } 525 else if (uCode <= SEQ_START) { // Output a sequence of chars. 526 var seq = this.decodeTableSeq[SEQ_START - uCode]; 527 for (var k = 0; k < seq.length - 1; k++) { 528 uCode = seq[k]; 529 newBuf[j++] = uCode & 0xFF; 530 newBuf[j++] = uCode >> 8; 531 } 532 uCode = seq[seq.length-1]; 533 } 534 else 535 throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte); 536 537 // Write the character to buffer, handling higher planes using surrogate pair. 538 if (uCode >= 0x10000) { 539 uCode -= 0x10000; 540 var uCodeLead = 0xD800 | (uCode >> 10); 541 newBuf[j++] = uCodeLead & 0xFF; 542 newBuf[j++] = uCodeLead >> 8; 543 544 uCode = 0xDC00 | (uCode & 0x3FF); 545 } 546 newBuf[j++] = uCode & 0xFF; 547 newBuf[j++] = uCode >> 8; 548 549 // Reset trie node. 550 nodeIdx = 0; seqStart = i+1; 551 } 552 553 this.nodeIdx = nodeIdx; 554 this.prevBytes = (seqStart >= 0) 555 ? Array.prototype.slice.call(buf, seqStart) 556 : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf)); 557 558 return newBuf.slice(0, j).toString('ucs2'); 559} 560 561DBCSDecoder.prototype.end = function() { 562 var ret = ''; 563 564 // Try to parse all remaining chars. 565 while (this.prevBytes.length > 0) { 566 // Skip 1 character in the buffer. 567 ret += this.defaultCharUnicode; 568 var bytesArr = this.prevBytes.slice(1); 569 570 // Parse remaining as usual. 571 this.prevBytes = []; 572 this.nodeIdx = 0; 573 if (bytesArr.length > 0) 574 ret += this.write(bytesArr); 575 } 576 577 this.prevBytes = []; 578 this.nodeIdx = 0; 579 return ret; 580} 581 582// Binary search for GB18030. Returns largest i such that table[i] <= val. 583function findIdx(table, val) { 584 if (table[0] > val) 585 return -1; 586 587 var l = 0, r = table.length; 588 while (l < r-1) { // always table[l] <= val < table[r] 589 var mid = l + ((r-l+1) >> 1); 590 if (table[mid] <= val) 591 l = mid; 592 else 593 r = mid; 594 } 595 return l; 596} 597 598