1'use strict';
2
3// An implementation of the WHATWG Encoding Standard
4// https://encoding.spec.whatwg.org
5
6const {
7  Boolean,
8  ObjectCreate,
9  ObjectDefineProperties,
10  ObjectGetOwnPropertyDescriptors,
11  ObjectSetPrototypeOf,
12  ObjectValues,
13  SafeMap,
14  StringPrototypeSlice,
15  Symbol,
16  SymbolToStringTag,
17  Uint32Array,
18  Uint8Array,
19} = primordials;
20
21const {
22  ERR_ENCODING_NOT_SUPPORTED,
23  ERR_INVALID_ARG_TYPE,
24  ERR_INVALID_THIS,
25  ERR_NO_ICU,
26} = require('internal/errors').codes;
// Symbol keys under which the encoder/decoder classes in this file keep
// their per-instance state.
// kHandle: the ICU converter handle, or the StringDecoder in the non-ICU build.
const kHandle = Symbol('handle');
// kFlags: bitmask of CONVERTER_FLAGS_* values.
const kFlags = Symbol('flags');
// kEncoding: the encoding name resolved by getEncodingFromLabel().
const kEncoding = Symbol('encoding');
// kDecoder / kEncoder: brand markers set to `true` by the constructors and
// checked by validateDecoder() / validateEncoder().
const kDecoder = Symbol('decoder');
const kEncoder = Symbol('encoder');
// kFatal, kUTF8FastPath, kIgnoreBOM: state used by the ICU-backed decoder to
// decide on, and parameterize, its decodeUTF8() fast path.
const kFatal = Symbol('kFatal');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kIgnoreBOM = Symbol('kIgnoreBOM');
35
36const {
37  getConstructorOf,
38  customInspectSymbol: inspect,
39  kEmptyObject,
40  kEnumerableProperty,
41} = require('internal/util');
42
43const {
44  isAnyArrayBuffer,
45  isArrayBufferView,
46  isUint8Array,
47} = require('internal/util/types');
48
49const {
50  validateString,
51  validateObject,
52} = require('internal/validators');
53
54const {
55  encodeInto,
56  encodeUtf8String,
57  decodeUTF8,
58} = internalBinding('buffer');
59
60let Buffer;
61function lazyBuffer() {
62  if (Buffer === undefined)
63    Buffer = require('buffer').Buffer;
64  return Buffer;
65}
66
/**
 * Brand check for TextEncoder receivers.
 * @param {any} obj Value used as `this`.
 * @throws {ERR_INVALID_THIS} When `obj` is nullish or does not carry the
 *   kEncoder marker set to `true`.
 */
function validateEncoder(obj) {
  const isEncoder = obj?.[kEncoder] === true;
  if (!isEncoder) {
    throw new ERR_INVALID_THIS('TextEncoder');
  }
}
71
/**
 * Brand check for TextDecoder receivers.
 * @param {any} obj Value used as `this`.
 * @throws {ERR_INVALID_THIS} When `obj` is nullish or does not carry the
 *   kDecoder marker set to `true`.
 */
function validateDecoder(obj) {
  const isDecoder = obj?.[kDecoder] === true;
  if (!isDecoder) {
    throw new ERR_INVALID_THIS('TextDecoder');
  }
}
76
// Bit flags used by the TextDecoder implementations below.
// FLUSH is set for a decode() call that is not streaming.
const CONVERTER_FLAGS_FLUSH = 0x1;
// FATAL is set when the constructor options have a truthy `fatal`.
const CONVERTER_FLAGS_FATAL = 0x2;
// IGNORE_BOM is set when the constructor options have a truthy `ignoreBOM`.
const CONVERTER_FLAGS_IGNORE_BOM = 0x4;

// Shared zero-length input used as the default `input` of decode().
const empty = new Uint8Array(0);
82
// Lookup table from an encoding label to the encoding name that
// getEncodingFromLabel() returns for it. Keys are all lower case;
// getEncodingFromLabel() lower-cases and trims a label before retrying a
// failed lookup.
const encodings = new SafeMap([
  ['unicode-1-1-utf-8', 'utf-8'],
  ['utf8', 'utf-8'],
  ['utf-8', 'utf-8'],
  ['866', 'ibm866'],
  ['cp866', 'ibm866'],
  ['csibm866', 'ibm866'],
  ['ibm866', 'ibm866'],
  ['csisolatin2', 'iso-8859-2'],
  ['iso-8859-2', 'iso-8859-2'],
  ['iso-ir-101', 'iso-8859-2'],
  ['iso8859-2', 'iso-8859-2'],
  ['iso88592', 'iso-8859-2'],
  ['iso_8859-2', 'iso-8859-2'],
  ['iso_8859-2:1987', 'iso-8859-2'],
  ['l2', 'iso-8859-2'],
  ['latin2', 'iso-8859-2'],
  ['csisolatin3', 'iso-8859-3'],
  ['iso-8859-3', 'iso-8859-3'],
  ['iso-ir-109', 'iso-8859-3'],
  ['iso8859-3', 'iso-8859-3'],
  ['iso88593', 'iso-8859-3'],
  ['iso_8859-3', 'iso-8859-3'],
  ['iso_8859-3:1988', 'iso-8859-3'],
  ['l3', 'iso-8859-3'],
  ['latin3', 'iso-8859-3'],
  ['csisolatin4', 'iso-8859-4'],
  ['iso-8859-4', 'iso-8859-4'],
  ['iso-ir-110', 'iso-8859-4'],
  ['iso8859-4', 'iso-8859-4'],
  ['iso88594', 'iso-8859-4'],
  ['iso_8859-4', 'iso-8859-4'],
  ['iso_8859-4:1988', 'iso-8859-4'],
  ['l4', 'iso-8859-4'],
  ['latin4', 'iso-8859-4'],
  ['csisolatincyrillic', 'iso-8859-5'],
  ['cyrillic', 'iso-8859-5'],
  ['iso-8859-5', 'iso-8859-5'],
  ['iso-ir-144', 'iso-8859-5'],
  ['iso8859-5', 'iso-8859-5'],
  ['iso88595', 'iso-8859-5'],
  ['iso_8859-5', 'iso-8859-5'],
  ['iso_8859-5:1988', 'iso-8859-5'],
  ['arabic', 'iso-8859-6'],
  ['asmo-708', 'iso-8859-6'],
  ['csiso88596e', 'iso-8859-6'],
  ['csiso88596i', 'iso-8859-6'],
  ['csisolatinarabic', 'iso-8859-6'],
  ['ecma-114', 'iso-8859-6'],
  ['iso-8859-6', 'iso-8859-6'],
  ['iso-8859-6-e', 'iso-8859-6'],
  ['iso-8859-6-i', 'iso-8859-6'],
  ['iso-ir-127', 'iso-8859-6'],
  ['iso8859-6', 'iso-8859-6'],
  ['iso88596', 'iso-8859-6'],
  ['iso_8859-6', 'iso-8859-6'],
  ['iso_8859-6:1987', 'iso-8859-6'],
  ['csisolatingreek', 'iso-8859-7'],
  ['ecma-118', 'iso-8859-7'],
  ['elot_928', 'iso-8859-7'],
  ['greek', 'iso-8859-7'],
  ['greek8', 'iso-8859-7'],
  ['iso-8859-7', 'iso-8859-7'],
  ['iso-ir-126', 'iso-8859-7'],
  ['iso8859-7', 'iso-8859-7'],
  ['iso88597', 'iso-8859-7'],
  ['iso_8859-7', 'iso-8859-7'],
  ['iso_8859-7:1987', 'iso-8859-7'],
  ['sun_eu_greek', 'iso-8859-7'],
  ['csiso88598e', 'iso-8859-8'],
  ['csisolatinhebrew', 'iso-8859-8'],
  ['hebrew', 'iso-8859-8'],
  ['iso-8859-8', 'iso-8859-8'],
  ['iso-8859-8-e', 'iso-8859-8'],
  ['iso-ir-138', 'iso-8859-8'],
  ['iso8859-8', 'iso-8859-8'],
  ['iso88598', 'iso-8859-8'],
  ['iso_8859-8', 'iso-8859-8'],
  ['iso_8859-8:1988', 'iso-8859-8'],
  ['visual', 'iso-8859-8'],
  ['csiso88598i', 'iso-8859-8-i'],
  ['iso-8859-8-i', 'iso-8859-8-i'],
  ['logical', 'iso-8859-8-i'],
  ['csisolatin6', 'iso-8859-10'],
  ['iso-8859-10', 'iso-8859-10'],
  ['iso-ir-157', 'iso-8859-10'],
  ['iso8859-10', 'iso-8859-10'],
  ['iso885910', 'iso-8859-10'],
  ['l6', 'iso-8859-10'],
  ['latin6', 'iso-8859-10'],
  ['iso-8859-13', 'iso-8859-13'],
  ['iso8859-13', 'iso-8859-13'],
  ['iso885913', 'iso-8859-13'],
  ['iso-8859-14', 'iso-8859-14'],
  ['iso8859-14', 'iso-8859-14'],
  ['iso885914', 'iso-8859-14'],
  ['csisolatin9', 'iso-8859-15'],
  ['iso-8859-15', 'iso-8859-15'],
  ['iso8859-15', 'iso-8859-15'],
  ['iso885915', 'iso-8859-15'],
  ['iso_8859-15', 'iso-8859-15'],
  ['l9', 'iso-8859-15'],
  ['cskoi8r', 'koi8-r'],
  ['koi', 'koi8-r'],
  ['koi8', 'koi8-r'],
  ['koi8-r', 'koi8-r'],
  ['koi8_r', 'koi8-r'],
  ['koi8-ru', 'koi8-u'],
  ['koi8-u', 'koi8-u'],
  ['csmacintosh', 'macintosh'],
  ['mac', 'macintosh'],
  ['macintosh', 'macintosh'],
  ['x-mac-roman', 'macintosh'],
  ['dos-874', 'windows-874'],
  ['iso-8859-11', 'windows-874'],
  ['iso8859-11', 'windows-874'],
  ['iso885911', 'windows-874'],
  ['tis-620', 'windows-874'],
  ['windows-874', 'windows-874'],
  ['cp1250', 'windows-1250'],
  ['windows-1250', 'windows-1250'],
  ['x-cp1250', 'windows-1250'],
  ['cp1251', 'windows-1251'],
  ['windows-1251', 'windows-1251'],
  ['x-cp1251', 'windows-1251'],
  ['ansi_x3.4-1968', 'windows-1252'],
  ['ascii', 'windows-1252'],
  ['cp1252', 'windows-1252'],
  ['cp819', 'windows-1252'],
  ['csisolatin1', 'windows-1252'],
  ['ibm819', 'windows-1252'],
  ['iso-8859-1', 'windows-1252'],
  ['iso-ir-100', 'windows-1252'],
  ['iso8859-1', 'windows-1252'],
  ['iso88591', 'windows-1252'],
  ['iso_8859-1', 'windows-1252'],
  ['iso_8859-1:1987', 'windows-1252'],
  ['l1', 'windows-1252'],
  ['latin1', 'windows-1252'],
  ['us-ascii', 'windows-1252'],
  ['windows-1252', 'windows-1252'],
  ['x-cp1252', 'windows-1252'],
  ['cp1253', 'windows-1253'],
  ['windows-1253', 'windows-1253'],
  ['x-cp1253', 'windows-1253'],
  ['cp1254', 'windows-1254'],
  ['csisolatin5', 'windows-1254'],
  ['iso-8859-9', 'windows-1254'],
  ['iso-ir-148', 'windows-1254'],
  ['iso8859-9', 'windows-1254'],
  ['iso88599', 'windows-1254'],
  ['iso_8859-9', 'windows-1254'],
  ['iso_8859-9:1989', 'windows-1254'],
  ['l5', 'windows-1254'],
  ['latin5', 'windows-1254'],
  ['windows-1254', 'windows-1254'],
  ['x-cp1254', 'windows-1254'],
  ['cp1255', 'windows-1255'],
  ['windows-1255', 'windows-1255'],
  ['x-cp1255', 'windows-1255'],
  ['cp1256', 'windows-1256'],
  ['windows-1256', 'windows-1256'],
  ['x-cp1256', 'windows-1256'],
  ['cp1257', 'windows-1257'],
  ['windows-1257', 'windows-1257'],
  ['x-cp1257', 'windows-1257'],
  ['cp1258', 'windows-1258'],
  ['windows-1258', 'windows-1258'],
  ['x-cp1258', 'windows-1258'],
  ['x-mac-cyrillic', 'x-mac-cyrillic'],
  ['x-mac-ukrainian', 'x-mac-cyrillic'],
  ['chinese', 'gbk'],
  ['csgb2312', 'gbk'],
  ['csiso58gb231280', 'gbk'],
  ['gb2312', 'gbk'],
  ['gb_2312', 'gbk'],
  ['gb_2312-80', 'gbk'],
  ['gbk', 'gbk'],
  ['iso-ir-58', 'gbk'],
  ['x-gbk', 'gbk'],
  ['gb18030', 'gb18030'],
  ['big5', 'big5'],
  ['big5-hkscs', 'big5'],
  ['cn-big5', 'big5'],
  ['csbig5', 'big5'],
  ['x-x-big5', 'big5'],
  ['cseucpkdfmtjapanese', 'euc-jp'],
  ['euc-jp', 'euc-jp'],
  ['x-euc-jp', 'euc-jp'],
  ['csiso2022jp', 'iso-2022-jp'],
  ['iso-2022-jp', 'iso-2022-jp'],
  ['csshiftjis', 'shift_jis'],
  ['ms932', 'shift_jis'],
  ['ms_kanji', 'shift_jis'],
  ['shift-jis', 'shift_jis'],
  ['shift_jis', 'shift_jis'],
  ['sjis', 'shift_jis'],
  ['windows-31j', 'shift_jis'],
  ['x-sjis', 'shift_jis'],
  ['cseuckr', 'euc-kr'],
  ['csksc56011987', 'euc-kr'],
  ['euc-kr', 'euc-kr'],
  ['iso-ir-149', 'euc-kr'],
  ['korean', 'euc-kr'],
  ['ks_c_5601-1987', 'euc-kr'],
  ['ks_c_5601-1989', 'euc-kr'],
  ['ksc5601', 'euc-kr'],
  ['ksc_5601', 'euc-kr'],
  ['windows-949', 'euc-kr'],
  ['utf-16be', 'utf-16be'],
  ['utf-16le', 'utf-16le'],
  ['utf-16', 'utf-16le'],
]);
296
/**
 * Strips leading and trailing ASCII whitespace (tab, line feed, form feed,
 * carriage return and space) from `label`.
 *
 * String.prototype.trim is deliberately not used: it would also remove
 * non-ASCII whitespace, which must be kept.
 * @param {string} label
 * @returns {string}
 */
function trimAsciiWhitespace(label) {
  const isAsciiWhitespace = (ch) => {
    switch (ch) {
      case '\u0009':
      case '\u000a':
      case '\u000c':
      case '\u000d':
      case '\u0020':
        return true;
      default:
        return false;
    }
  };

  let start = 0;
  let end = label.length;
  // Advance past leading whitespace.
  while (start < end && isAsciiWhitespace(label[start])) {
    start += 1;
  }
  // Back up over trailing whitespace, never crossing `start`.
  while (end > start && isAsciiWhitespace(label[end - 1])) {
    end -= 1;
  }
  return StringPrototypeSlice(label, start, end);
}
320
/**
 * Lower-cases only the ASCII letters A-Z of `label`.
 *
 * String.prototype.toLowerCase applies Unicode case mappings, which fold
 * characters such as U+212A KELVIN SIGN to the ASCII letter "k" and would
 * therefore let non-ASCII labels match entries of the `encodings` table.
 * @param {string} label
 * @returns {string}
 */
function asciiLowerCase(label) {
  return label.replace(/[A-Z]/g, (ch) => ch.toLowerCase());
}

/**
 * Resolves an encoding label to its entry in the `encodings` table.
 *
 * The label is first looked up verbatim; on a miss it is ASCII-lower-cased,
 * stripped of leading/trailing ASCII whitespace and looked up again.
 * @param {string} label
 * @returns {string | undefined} The mapped encoding name, or `undefined`
 *   when the label is not in the table.
 */
function getEncodingFromLabel(label) {
  const enc = encodings.get(label);
  if (enc !== undefined) return enc;
  return encodings.get(trimAsciiWhitespace(asciiLowerCase(label)));
}
326
// Shared two-slot scratch array handed to the encodeInto() binding by
// TextEncoder.prototype.encodeInto(); afterwards slot 0 is reported as `read`
// and slot 1 as `written`.
const encodeIntoResults = new Uint32Array(2);
328
/**
 * TextEncoder: encodes strings through the `encodeUtf8String` and
 * `encodeInto` bindings. Every member brand-checks its receiver with
 * validateEncoder().
 */
class TextEncoder {
  constructor() {
    // Brand marker looked up by validateEncoder().
    this[kEncoder] = true;
  }

  /**
   * @returns {string} Always 'utf-8'.
   */
  get encoding() {
    validateEncoder(this);
    return 'utf-8';
  }

  /**
   * Encodes the string form of `input`.
   * @param {any} [input] Coerced to a string with a template literal.
   * @returns {any} Whatever the encodeUtf8String binding returns.
   */
  encode(input = '') {
    validateEncoder(this);
    const text = `${input}`;
    return encodeUtf8String(text);
  }

  /**
   * Encodes `src` into the caller-supplied `dest`.
   * @param {string} src
   * @param {Uint8Array} dest
   * @returns {{ read: number, written: number }} The two counters taken from
   *   the shared encodeIntoResults array after the binding call.
   * @throws {ERR_INVALID_ARG_TYPE} When `dest` is falsy or not a Uint8Array.
   */
  encodeInto(src, dest) {
    validateEncoder(this);
    validateString(src, 'src');
    const destIsUint8Array = dest && isUint8Array(dest);
    if (!destIsUint8Array) {
      throw new ERR_INVALID_ARG_TYPE('dest', 'Uint8Array', dest);
    }
    encodeInto(src, dest, encodeIntoResults);
    const read = encodeIntoResults[0];
    const written = encodeIntoResults[1];
    return { read, written };
  }

  /**
   * Custom inspection hook.
   * @param {number} [depth] A negative number returns the instance itself.
   * @param {object} [opts] Forwarded unchanged to util.inspect.
   * @returns {any}
   */
  [inspect](depth, opts) {
    validateEncoder(this);
    const depthExhausted = typeof depth === 'number' && depth < 0;
    if (depthExhausted) {
      return this;
    }
    const ownCtor = getConstructorOf(this);
    // The summary object gets a prototype whose `constructor` names the class.
    const summary = ObjectCreate({
      constructor: ownCtor === null ? TextEncoder : ownCtor,
    });
    summary.encoding = this.encoding;
    // Lazy to avoid circular dependency
    return require('internal/util/inspect').inspect(summary, opts);
  }
}
366
// Apply kEnumerableProperty to the public TextEncoder members and define the
// Symbol.toStringTag value 'TextEncoder' on the prototype.
ObjectDefineProperties(
  TextEncoder.prototype, {
    'encode': kEnumerableProperty,
    'encodeInto': kEnumerableProperty,
    'encoding': kEnumerableProperty,
    [SymbolToStringTag]: { __proto__: null, configurable: true, value: 'TextEncoder' },
  });
374
// Pick the TextDecoder implementation once at load time: the ICU-backed class
// when the `config` binding reports `hasIntl`, otherwise the
// StringDecoder-based fallback.
const TextDecoder =
  internalBinding('config').hasIntl ?
    makeTextDecoderICU() :
    makeTextDecoderJS();
379
/**
 * Builds the TextDecoder class backed by the `icu` internal binding.
 *
 * UTF-8 decoders start on a fast path that calls decodeUTF8() directly and
 * only create an ICU converter once a streaming decode is requested; all
 * other encodings create their converter in the constructor.
 * @returns {Function} The TextDecoder class.
 */
function makeTextDecoderICU() {
  const {
    decode: _decode,
    getConverter,
  } = internalBinding('icu');

  class TextDecoder {
    /**
     * @param {string} [encoding] Encoding label; coerced to a string.
     * @param {object | null} [options] `fatal` and `ignoreBOM` are read.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} When the label is unknown or no
     *   converter can be obtained for it.
     */
    constructor(encoding = 'utf-8', options = kEmptyObject) {
      encoding = `${encoding}`;
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      const enc = getEncodingFromLabel(encoding);
      if (enc === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);

      let flags = 0;
      if (options !== null) {
        flags |= options.fatal ? CONVERTER_FLAGS_FATAL : 0;
        flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
      }

      this[kDecoder] = true;
      this[kFlags] = flags;
      this[kEncoding] = enc;
      this[kIgnoreBOM] = Boolean(options?.ignoreBOM);
      this[kFatal] = Boolean(options?.fatal);
      // Only support fast path for UTF-8.
      this[kUTF8FastPath] = enc === 'utf-8';
      this[kHandle] = undefined;

      if (!this[kUTF8FastPath]) {
        this.#prepareConverter();
      }
    }

    // Creates the ICU converter on first use; later calls are no-ops.
    #prepareConverter() {
      if (this[kHandle] !== undefined) return;
      const handle = getConverter(this[kEncoding], this[kFlags]);
      if (handle === undefined)
        throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
      this[kHandle] = handle;
    }

    /**
     * @param {ArrayBuffer | ArrayBufferView} [input] Bytes to decode.
     * @param {object | null} [options] Only `stream` is read.
     * @returns {string}
     */
    decode(input = empty, options = kEmptyObject) {
      validateDecoder(this);

      // Once a streaming decode has been requested the fast path is turned
      // off for the remaining lifetime of this decoder.
      this[kUTF8FastPath] &&= !(options?.stream);

      if (this[kUTF8FastPath]) {
        // NOTE(review): this branch returns before validateObject() below, so
        // `options` is only validated on the converter path.
        return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
      }

      this.#prepareConverter();

      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      // Flush unless the caller asked for streaming. A null options bag means
      // "all defaults" (stream === false) and therefore flushes too, matching
      // the fast path above and the non-ICU implementation.
      const flags = options?.stream ? 0 : CONVERTER_FLAGS_FLUSH;

      return _decode(this[kHandle], input, flags, this.encoding);
    }
  }

  return TextDecoder;
}
454
/**
 * Builds the TextDecoder class used when ICU is unavailable. Decoding is
 * delegated to `StringDecoder`, so only 'utf-8' and 'utf-16le' are accepted
 * and the `fatal` option is rejected.
 * @returns {Function} The TextDecoder class.
 */
function makeTextDecoderJS() {
  let StringDecoder;
  // Loads 'string_decoder' on first use and caches the class.
  function lazyStringDecoder() {
    if (StringDecoder !== undefined)
      return StringDecoder;
    StringDecoder = require('string_decoder').StringDecoder;
    return StringDecoder;
  }

  // Per-instance marker: has the start of the current stream been examined
  // for a BOM yet?
  const kBOMSeen = Symbol('BOM seen');

  function hasConverter(encoding) {
    switch (encoding) {
      case 'utf-8':
      case 'utf-16le':
        return true;
      default:
        return false;
    }
  }

  // Wraps `input` in a Buffer. Failures while wrapping fall back to the
  // shared empty input; anything that is neither an ArrayBuffer nor a view
  // is rejected.
  function toBuffer(input) {
    const isBuffer = isAnyArrayBuffer(input);
    if (!isBuffer && !isArrayBufferView(input)) {
      throw new ERR_INVALID_ARG_TYPE('input',
                                     ['ArrayBuffer', 'ArrayBufferView'],
                                     input);
    }
    try {
      if (isBuffer) {
        return lazyBuffer().from(input);
      }
      return lazyBuffer().from(input.buffer, input.byteOffset,
                               input.byteLength);
    } catch {
      return empty;
    }
  }

  class TextDecoder {
    /**
     * @param {string} [encoding] Encoding label; coerced to a string.
     * @param {object | null} [options] `fatal` and `ignoreBOM` are read.
     * @throws {ERR_ENCODING_NOT_SUPPORTED} Unknown label, or an encoding
     *   other than 'utf-8' / 'utf-16le'.
     * @throws {ERR_NO_ICU} When `options.fatal` is truthy.
     */
    constructor(encoding = 'utf-8', options = kEmptyObject) {
      const label = `${encoding}`;
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      const canonical = getEncodingFromLabel(label);
      const supported = canonical !== undefined && hasConverter(canonical);
      if (!supported) {
        throw new ERR_ENCODING_NOT_SUPPORTED(label);
      }

      let initialFlags = 0;
      if (options !== null) {
        if (options.fatal) {
          throw new ERR_NO_ICU('"fatal" option');
        }
        if (options.ignoreBOM) {
          initialFlags |= CONVERTER_FLAGS_IGNORE_BOM;
        }
      }

      this[kDecoder] = true;
      // StringDecoder will normalize WHATWG encoding to Node.js encoding.
      this[kHandle] = new (lazyStringDecoder())(canonical);
      this[kFlags] = initialFlags;
      this[kEncoding] = canonical;
      this[kBOMSeen] = false;
    }

    /**
     * @param {ArrayBuffer | ArrayBufferView} [input] Bytes to decode.
     * @param {object | null} [options] Only `stream` is read.
     * @returns {string}
     * @throws {ERR_INVALID_ARG_TYPE} When `input` is neither an ArrayBuffer
     *   nor an ArrayBufferView.
     */
    decode(input = empty, options = kEmptyObject) {
      validateDecoder(this);
      const bytes = toBuffer(input);
      validateObject(options, 'options', {
        nullable: true,
        allowArray: true,
        allowFunction: true,
      });

      // The previous call flushed, so this call begins a new stream and the
      // BOM check has to run again.
      const previousCallFlushed =
        (this[kFlags] & CONVERTER_FLAGS_FLUSH) !== 0;
      if (previousCallFlushed) {
        this[kBOMSeen] = false;
      }

      // Record in kFlags whether this call flushes.
      const streaming = options !== null && options.stream;
      if (streaming) {
        this[kFlags] &= ~CONVERTER_FLAGS_FLUSH;
      } else {
        this[kFlags] |= CONVERTER_FLAGS_FLUSH;
      }

      const flushing = (this[kFlags] & CONVERTER_FLAGS_FLUSH) !== 0;
      const decoder = this[kHandle];
      let text = flushing ? decoder.end(bytes) : decoder.write(bytes);

      const shouldCheckBOM =
        text.length > 0 &&
        !this[kBOMSeen] &&
        (this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM) === 0;
      if (shouldCheckBOM) {
        // If the very first result in the stream is a BOM, and we are not
        // explicitly told to ignore it, then we discard it.
        if (text[0] === '\ufeff') {
          text = StringPrototypeSlice(text, 1);
        }
        this[kBOMSeen] = true;
      }

      return text;
    }
  }

  return TextDecoder;
}
555
// Property descriptors shared by both TextDecoder implementations: the
// `encoding`, `fatal` and `ignoreBOM` getters plus the custom inspect hook.
// Every member brand-checks its receiver with validateDecoder().
const sharedProperties = ObjectGetOwnPropertyDescriptors({
  // The encoding name stored by the constructor.
  get encoding() {
    validateDecoder(this);
    return this[kEncoding];
  },

  // True when CONVERTER_FLAGS_FATAL is set in the instance flags.
  get fatal() {
    validateDecoder(this);
    return (this[kFlags] & CONVERTER_FLAGS_FATAL) === CONVERTER_FLAGS_FATAL;
  },

  // True when CONVERTER_FLAGS_IGNORE_BOM is set in the instance flags.
  get ignoreBOM() {
    validateDecoder(this);
    return (this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM) ===
              CONVERTER_FLAGS_IGNORE_BOM;
  },

  // Custom inspection hook. A negative numeric `depth` returns the instance
  // itself; otherwise the result is "<constructor name> <inspected summary>".
  [inspect](depth, opts) {
    validateDecoder(this);
    if (typeof depth === 'number' && depth < 0)
      return this;
    const constructor = getConstructorOf(this) || TextDecoder;
    const obj = ObjectCreate({ constructor });
    obj.encoding = this.encoding;
    obj.fatal = this.fatal;
    obj.ignoreBOM = this.ignoreBOM;
    // `opts` is absent when the hook is invoked directly rather than through
    // util.inspect, so guard the lookup instead of throwing a TypeError.
    if (opts?.showHidden) {
      obj[kFlags] = this[kFlags];
      obj[kHandle] = this[kHandle];
    }
    // Lazy to avoid circular dependency
    const { inspect } = require('internal/util/inspect');
    return `${constructor.name} ${inspect(obj)}`;
  },
});
const propertiesValues = ObjectValues(sharedProperties);
for (let i = 0; i < propertiesValues.length; i++) {
  // We want to use null-prototype objects to not rely on globally mutable
  // %Object.prototype%.
  ObjectSetPrototypeOf(propertiesValues[i], null);
}
// The inspect hook is the only shared member made non-enumerable.
sharedProperties[inspect].enumerable = false;
599
// Install the shared descriptors on whichever TextDecoder implementation was
// selected, apply kEnumerableProperty to `decode`, and define the
// Symbol.toStringTag value 'TextDecoder'.
ObjectDefineProperties(TextDecoder.prototype, {
  decode: kEnumerableProperty,
  ...sharedProperties,
  [SymbolToStringTag]: {
    __proto__: null,
    configurable: true,
    value: 'TextDecoder',
  },
});
609
// Module interface: the label resolver plus the two classes.
module.exports = {
  getEncodingFromLabel,
  TextDecoder,
  TextEncoder,
};
615