1"use strict"; 2var Buffer = require("safer-buffer").Buffer; 3 4// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js 5 6// == UTF16-BE codec. ========================================================== 7 8exports.utf16be = Utf16BECodec; 9function Utf16BECodec() { 10} 11 12Utf16BECodec.prototype.encoder = Utf16BEEncoder; 13Utf16BECodec.prototype.decoder = Utf16BEDecoder; 14Utf16BECodec.prototype.bomAware = true; 15 16 17// -- Encoding 18 19function Utf16BEEncoder() { 20} 21 22Utf16BEEncoder.prototype.write = function(str) { 23 var buf = Buffer.from(str, 'ucs2'); 24 for (var i = 0; i < buf.length; i += 2) { 25 var tmp = buf[i]; buf[i] = buf[i+1]; buf[i+1] = tmp; 26 } 27 return buf; 28} 29 30Utf16BEEncoder.prototype.end = function() { 31} 32 33 34// -- Decoding 35 36function Utf16BEDecoder() { 37 this.overflowByte = -1; 38} 39 40Utf16BEDecoder.prototype.write = function(buf) { 41 if (buf.length == 0) 42 return ''; 43 44 var buf2 = Buffer.alloc(buf.length + 1), 45 i = 0, j = 0; 46 47 if (this.overflowByte !== -1) { 48 buf2[0] = buf[0]; 49 buf2[1] = this.overflowByte; 50 i = 1; j = 2; 51 } 52 53 for (; i < buf.length-1; i += 2, j+= 2) { 54 buf2[j] = buf[i+1]; 55 buf2[j+1] = buf[i]; 56 } 57 58 this.overflowByte = (i == buf.length-1) ? buf[buf.length-1] : -1; 59 60 return buf2.slice(0, j).toString('ucs2'); 61} 62 63Utf16BEDecoder.prototype.end = function() { 64 this.overflowByte = -1; 65} 66 67 68// == UTF-16 codec ============================================================= 69// Decoder chooses automatically from UTF-16LE and UTF-16BE using BOM and space-based heuristic. 70// Defaults to UTF-16LE, as it's prevalent and default in Node. 71// http://en.wikipedia.org/wiki/UTF-16 and http://encoding.spec.whatwg.org/#utf-16le 72// Decoder default can be changed: iconv.decode(buf, 'utf16', {defaultEncoding: 'utf-16be'}); 73 74// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false). 75 76exports.utf16 = Utf16Codec; 77function Utf16Codec(codecOptions, iconv) { 78 this.iconv = iconv; 79} 80 81Utf16Codec.prototype.encoder = Utf16Encoder; 82Utf16Codec.prototype.decoder = Utf16Decoder; 83 84 85// -- Encoding (pass-through) 86 87function Utf16Encoder(options, codec) { 88 options = options || {}; 89 if (options.addBOM === undefined) 90 options.addBOM = true; 91 this.encoder = codec.iconv.getEncoder('utf-16le', options); 92} 93 94Utf16Encoder.prototype.write = function(str) { 95 return this.encoder.write(str); 96} 97 98Utf16Encoder.prototype.end = function() { 99 return this.encoder.end(); 100} 101 102 103// -- Decoding 104 105function Utf16Decoder(options, codec) { 106 this.decoder = null; 107 this.initialBufs = []; 108 this.initialBufsLen = 0; 109 110 this.options = options || {}; 111 this.iconv = codec.iconv; 112} 113 114Utf16Decoder.prototype.write = function(buf) { 115 if (!this.decoder) { 116 // Codec is not chosen yet. Accumulate initial bytes. 117 this.initialBufs.push(buf); 118 this.initialBufsLen += buf.length; 119 120 if (this.initialBufsLen < 16) // We need more bytes to use space heuristic (see below) 121 return ''; 122 123 // We have enough bytes -> detect endianness. 124 var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding); 125 this.decoder = this.iconv.getDecoder(encoding, this.options); 126 127 var resStr = ''; 128 for (var i = 0; i < this.initialBufs.length; i++) 129 resStr += this.decoder.write(this.initialBufs[i]); 130 131 this.initialBufs.length = this.initialBufsLen = 0; 132 return resStr; 133 } 134 135 return this.decoder.write(buf); 136} 137 138Utf16Decoder.prototype.end = function() { 139 if (!this.decoder) { 140 var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding); 141 this.decoder = this.iconv.getDecoder(encoding, this.options); 142 143 var resStr = ''; 144 for (var i = 0; i < this.initialBufs.length; i++) 145 resStr += this.decoder.write(this.initialBufs[i]); 146 147 var trail = this.decoder.end(); 148 if (trail) 149 resStr += trail; 150 151 this.initialBufs.length = this.initialBufsLen = 0; 152 return resStr; 153 } 154 return this.decoder.end(); 155} 156 157function detectEncoding(bufs, defaultEncoding) { 158 var b = []; 159 var charsProcessed = 0; 160 var asciiCharsLE = 0, asciiCharsBE = 0; // Number of ASCII chars when decoded as LE or BE. 161 162 outer_loop: 163 for (var i = 0; i < bufs.length; i++) { 164 var buf = bufs[i]; 165 for (var j = 0; j < buf.length; j++) { 166 b.push(buf[j]); 167 if (b.length === 2) { 168 if (charsProcessed === 0) { 169 // Check BOM first. 170 if (b[0] === 0xFF && b[1] === 0xFE) return 'utf-16le'; 171 if (b[0] === 0xFE && b[1] === 0xFF) return 'utf-16be'; 172 } 173 174 if (b[0] === 0 && b[1] !== 0) asciiCharsBE++; 175 if (b[0] !== 0 && b[1] === 0) asciiCharsLE++; 176 177 b.length = 0; 178 charsProcessed++; 179 180 if (charsProcessed >= 100) { 181 break outer_loop; 182 } 183 } 184 } 185 } 186 187 // Make decisions. 188 // Most of the time, the content has ASCII chars (U+00**), but the opposite (U+**00) is uncommon. 189 // So, we count ASCII as if it was LE or BE, and decide from that. 190 if (asciiCharsBE > asciiCharsLE) return 'utf-16be'; 191 if (asciiCharsBE < asciiCharsLE) return 'utf-16le'; 192 193 // Couldn't decide (likely all zeros or not enough data). 194 return defaultEncoding || 'utf-16le'; 195} 196 197 198