1'use strict'; 2 3var Buffer = require('safer-buffer').Buffer; 4 5// == UTF32-LE/BE codec. ========================================================== 6 7exports._utf32 = Utf32Codec; 8 9function Utf32Codec(codecOptions, iconv) { 10 this.iconv = iconv; 11 this.bomAware = true; 12 this.isLE = codecOptions.isLE; 13} 14 15exports.utf32le = { type: '_utf32', isLE: true }; 16exports.utf32be = { type: '_utf32', isLE: false }; 17 18// Aliases 19exports.ucs4le = 'utf32le'; 20exports.ucs4be = 'utf32be'; 21 22Utf32Codec.prototype.encoder = Utf32Encoder; 23Utf32Codec.prototype.decoder = Utf32Decoder; 24 25// -- Encoding 26 27function Utf32Encoder(options, codec) { 28 this.isLE = codec.isLE; 29 this.highSurrogate = 0; 30} 31 32Utf32Encoder.prototype.write = function(str) { 33 var src = Buffer.from(str, 'ucs2'); 34 var dst = Buffer.alloc(src.length * 2); 35 var write32 = this.isLE ? dst.writeUInt32LE : dst.writeUInt32BE; 36 var offset = 0; 37 38 for (var i = 0; i < src.length; i += 2) { 39 var code = src.readUInt16LE(i); 40 var isHighSurrogate = (0xD800 <= code && code < 0xDC00); 41 var isLowSurrogate = (0xDC00 <= code && code < 0xE000); 42 43 if (this.highSurrogate) { 44 if (isHighSurrogate || !isLowSurrogate) { 45 // There shouldn't be two high surrogates in a row, nor a high surrogate which isn't followed by a low 46 // surrogate. If this happens, keep the pending high surrogate as a stand-alone semi-invalid character 47 // (technically wrong, but expected by some applications, like Windows file names). 48 write32.call(dst, this.highSurrogate, offset); 49 offset += 4; 50 } 51 else { 52 // Create 32-bit value from high and low surrogates; 53 var codepoint = (((this.highSurrogate - 0xD800) << 10) | (code - 0xDC00)) + 0x10000; 54 55 write32.call(dst, codepoint, offset); 56 offset += 4; 57 this.highSurrogate = 0; 58 59 continue; 60 } 61 } 62 63 if (isHighSurrogate) 64 this.highSurrogate = code; 65 else { 66 // Even if the current character is a low surrogate, with no previous high surrogate, we'll 67 // encode it as a semi-invalid stand-alone character for the same reasons expressed above for 68 // unpaired high surrogates. 69 write32.call(dst, code, offset); 70 offset += 4; 71 this.highSurrogate = 0; 72 } 73 } 74 75 if (offset < dst.length) 76 dst = dst.slice(0, offset); 77 78 return dst; 79}; 80 81Utf32Encoder.prototype.end = function() { 82 // Treat any leftover high surrogate as a semi-valid independent character. 83 if (!this.highSurrogate) 84 return; 85 86 var buf = Buffer.alloc(4); 87 88 if (this.isLE) 89 buf.writeUInt32LE(this.highSurrogate, 0); 90 else 91 buf.writeUInt32BE(this.highSurrogate, 0); 92 93 this.highSurrogate = 0; 94 95 return buf; 96}; 97 98// -- Decoding 99 100function Utf32Decoder(options, codec) { 101 this.isLE = codec.isLE; 102 this.badChar = codec.iconv.defaultCharUnicode.charCodeAt(0); 103 this.overflow = []; 104} 105 106Utf32Decoder.prototype.write = function(src) { 107 if (src.length === 0) 108 return ''; 109 110 var i = 0; 111 var codepoint = 0; 112 var dst = Buffer.alloc(src.length + 4); 113 var offset = 0; 114 var isLE = this.isLE; 115 var overflow = this.overflow; 116 var badChar = this.badChar; 117 118 if (overflow.length > 0) { 119 for (; i < src.length && overflow.length < 4; i++) 120 overflow.push(src[i]); 121 122 if (overflow.length === 4) { 123 // NOTE: codepoint is a signed int32 and can be negative. 124 // NOTE: We copied this block from below to help V8 optimize it (it works with array, not buffer). 125 if (isLE) { 126 codepoint = overflow[i] | (overflow[i+1] << 8) | (overflow[i+2] << 16) | (overflow[i+3] << 24); 127 } else { 128 codepoint = overflow[i+3] | (overflow[i+2] << 8) | (overflow[i+1] << 16) | (overflow[i] << 24); 129 } 130 overflow.length = 0; 131 132 offset = _writeCodepoint(dst, offset, codepoint, badChar); 133 } 134 } 135 136 // Main loop. Should be as optimized as possible. 137 for (; i < src.length - 3; i += 4) { 138 // NOTE: codepoint is a signed int32 and can be negative. 139 if (isLE) { 140 codepoint = src[i] | (src[i+1] << 8) | (src[i+2] << 16) | (src[i+3] << 24); 141 } else { 142 codepoint = src[i+3] | (src[i+2] << 8) | (src[i+1] << 16) | (src[i] << 24); 143 } 144 offset = _writeCodepoint(dst, offset, codepoint, badChar); 145 } 146 147 // Keep overflowing bytes. 148 for (; i < src.length; i++) { 149 overflow.push(src[i]); 150 } 151 152 return dst.slice(0, offset).toString('ucs2'); 153}; 154 155function _writeCodepoint(dst, offset, codepoint, badChar) { 156 // NOTE: codepoint is signed int32 and can be negative. We keep it that way to help V8 with optimizations. 157 if (codepoint < 0 || codepoint > 0x10FFFF) { 158 // Not a valid Unicode codepoint 159 codepoint = badChar; 160 } 161 162 // Ephemeral Planes: Write high surrogate. 163 if (codepoint >= 0x10000) { 164 codepoint -= 0x10000; 165 166 var high = 0xD800 | (codepoint >> 10); 167 dst[offset++] = high & 0xff; 168 dst[offset++] = high >> 8; 169 170 // Low surrogate is written below. 171 var codepoint = 0xDC00 | (codepoint & 0x3FF); 172 } 173 174 // Write BMP char or low surrogate. 175 dst[offset++] = codepoint & 0xff; 176 dst[offset++] = codepoint >> 8; 177 178 return offset; 179}; 180 181Utf32Decoder.prototype.end = function() { 182 this.overflow.length = 0; 183}; 184 185// == UTF-32 Auto codec ============================================================= 186// Decoder chooses automatically from UTF-32LE and UTF-32BE using BOM and space-based heuristic. 187// Defaults to UTF-32LE. http://en.wikipedia.org/wiki/UTF-32 188// Encoder/decoder default can be changed: iconv.decode(buf, 'utf32', {defaultEncoding: 'utf-32be'}); 189 190// Encoder prepends BOM (which can be overridden with (addBOM: false}). 191 192exports.utf32 = Utf32AutoCodec; 193exports.ucs4 = 'utf32'; 194 195function Utf32AutoCodec(options, iconv) { 196 this.iconv = iconv; 197} 198 199Utf32AutoCodec.prototype.encoder = Utf32AutoEncoder; 200Utf32AutoCodec.prototype.decoder = Utf32AutoDecoder; 201 202// -- Encoding 203 204function Utf32AutoEncoder(options, codec) { 205 options = options || {}; 206 207 if (options.addBOM === undefined) 208 options.addBOM = true; 209 210 this.encoder = codec.iconv.getEncoder(options.defaultEncoding || 'utf-32le', options); 211} 212 213Utf32AutoEncoder.prototype.write = function(str) { 214 return this.encoder.write(str); 215}; 216 217Utf32AutoEncoder.prototype.end = function() { 218 return this.encoder.end(); 219}; 220 221// -- Decoding 222 223function Utf32AutoDecoder(options, codec) { 224 this.decoder = null; 225 this.initialBufs = []; 226 this.initialBufsLen = 0; 227 this.options = options || {}; 228 this.iconv = codec.iconv; 229} 230 231Utf32AutoDecoder.prototype.write = function(buf) { 232 if (!this.decoder) { 233 // Codec is not chosen yet. Accumulate initial bytes. 234 this.initialBufs.push(buf); 235 this.initialBufsLen += buf.length; 236 237 if (this.initialBufsLen < 32) // We need more bytes to use space heuristic (see below) 238 return ''; 239 240 // We have enough bytes -> detect endianness. 241 var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding); 242 this.decoder = this.iconv.getDecoder(encoding, this.options); 243 244 var resStr = ''; 245 for (var i = 0; i < this.initialBufs.length; i++) 246 resStr += this.decoder.write(this.initialBufs[i]); 247 248 this.initialBufs.length = this.initialBufsLen = 0; 249 return resStr; 250 } 251 252 return this.decoder.write(buf); 253}; 254 255Utf32AutoDecoder.prototype.end = function() { 256 if (!this.decoder) { 257 var encoding = detectEncoding(this.initialBufs, this.options.defaultEncoding); 258 this.decoder = this.iconv.getDecoder(encoding, this.options); 259 260 var resStr = ''; 261 for (var i = 0; i < this.initialBufs.length; i++) 262 resStr += this.decoder.write(this.initialBufs[i]); 263 264 var trail = this.decoder.end(); 265 if (trail) 266 resStr += trail; 267 268 this.initialBufs.length = this.initialBufsLen = 0; 269 return resStr; 270 } 271 272 return this.decoder.end(); 273}; 274 275function detectEncoding(bufs, defaultEncoding) { 276 var b = []; 277 var charsProcessed = 0; 278 var invalidLE = 0, invalidBE = 0; // Number of invalid chars when decoded as LE or BE. 279 var bmpCharsLE = 0, bmpCharsBE = 0; // Number of BMP chars when decoded as LE or BE. 280 281 outer_loop: 282 for (var i = 0; i < bufs.length; i++) { 283 var buf = bufs[i]; 284 for (var j = 0; j < buf.length; j++) { 285 b.push(buf[j]); 286 if (b.length === 4) { 287 if (charsProcessed === 0) { 288 // Check BOM first. 289 if (b[0] === 0xFF && b[1] === 0xFE && b[2] === 0 && b[3] === 0) { 290 return 'utf-32le'; 291 } 292 if (b[0] === 0 && b[1] === 0 && b[2] === 0xFE && b[3] === 0xFF) { 293 return 'utf-32be'; 294 } 295 } 296 297 if (b[0] !== 0 || b[1] > 0x10) invalidBE++; 298 if (b[3] !== 0 || b[2] > 0x10) invalidLE++; 299 300 if (b[0] === 0 && b[1] === 0 && (b[2] !== 0 || b[3] !== 0)) bmpCharsBE++; 301 if ((b[0] !== 0 || b[1] !== 0) && b[2] === 0 && b[3] === 0) bmpCharsLE++; 302 303 b.length = 0; 304 charsProcessed++; 305 306 if (charsProcessed >= 100) { 307 break outer_loop; 308 } 309 } 310 } 311 } 312 313 // Make decisions. 314 if (bmpCharsBE - invalidBE > bmpCharsLE - invalidLE) return 'utf-32be'; 315 if (bmpCharsBE - invalidBE < bmpCharsLE - invalidLE) return 'utf-32le'; 316 317 // Couldn't decide (likely all zeros or not enough data). 318 return defaultEncoding || 'utf-32le'; 319} 320