1/**
2 * @fileoverview A UTF8 decoder.
3 */
4goog.module('protobuf.binary.textencoding');
5
6const {checkElementIndex} = goog.require('protobuf.internal.checks');
7
8/**
9 * Combines an array of codePoints into a string.
10 * @param {!Array<number>} codePoints
11 * @return {string}
12 */
13function codePointsToString(codePoints) {
14  // Performance: http://jsperf.com/string-fromcharcode-test/13
15  let s = '', i = 0;
16  const length = codePoints.length;
17  const BATCH_SIZE = 10000;
18  while (i < length) {
19    const end = Math.min(i + BATCH_SIZE, length);
20    s += String.fromCharCode.apply(null, codePoints.slice(i, end));
21    i = end;
22  }
23  return s;
24}
25
26/**
27 * Decodes raw bytes into a string.
28 * Supports codepoints from U+0000 up to U+10FFFF.
29 * (http://en.wikipedia.org/wiki/UTF-8).
30 * @param {!DataView} bytes
31 * @return {string}
32 */
33function decode(bytes) {
34  let cursor = 0;
35  const codePoints = [];
36
37  while (cursor < bytes.byteLength) {
38    const c = bytes.getUint8(cursor++);
39    if (c < 0x80) {  // Regular 7-bit ASCII.
40      codePoints.push(c);
41    } else if (c < 0xC0) {
42      // UTF-8 continuation mark. We are out of sync. This
43      // might happen if we attempted to read a character
44      // with more than four bytes.
45      continue;
46    } else if (c < 0xE0) {  // UTF-8 with two bytes.
47      checkElementIndex(cursor, bytes.byteLength);
48      const c2 = bytes.getUint8(cursor++);
49      codePoints.push(((c & 0x1F) << 6) | (c2 & 0x3F));
50    } else if (c < 0xF0) {  // UTF-8 with three bytes.
51      checkElementIndex(cursor + 1, bytes.byteLength);
52      const c2 = bytes.getUint8(cursor++);
53      const c3 = bytes.getUint8(cursor++);
54      codePoints.push(((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
55    } else if (c < 0xF8) {  // UTF-8 with 4 bytes.
56      checkElementIndex(cursor + 2, bytes.byteLength);
57      const c2 = bytes.getUint8(cursor++);
58      const c3 = bytes.getUint8(cursor++);
59      const c4 = bytes.getUint8(cursor++);
60      // Characters written on 4 bytes have 21 bits for a codepoint.
61      // We can't fit that on 16bit characters, so we use surrogates.
62      let codepoint = ((c & 0x07) << 18) | ((c2 & 0x3F) << 12) |
63          ((c3 & 0x3F) << 6) | (c4 & 0x3F);
64      // Surrogates formula from wikipedia.
65      // 1. Subtract 0x10000 from codepoint
66      codepoint -= 0x10000;
67      // 2. Split this into the high 10-bit value and the low 10-bit value
68      // 3. Add 0xD800 to the high value to form the high surrogate
69      // 4. Add 0xDC00 to the low value to form the low surrogate:
70      const low = (codepoint & 0x3FF) + 0xDC00;
71      const high = ((codepoint >> 10) & 0x3FF) + 0xD800;
72      codePoints.push(high, low);
73    }
74  }
75  return codePointsToString(codePoints);
76}
77
78/**
79 * Writes a UTF16 JavaScript string to the buffer encoded as UTF8.
80 * @param {string} value The string to write.
81 * @return {!Uint8Array} An array containing the encoded bytes.
82 */
83function encode(value) {
84  const buffer = [];
85
86  for (let i = 0; i < value.length; i++) {
87    const c1 = value.charCodeAt(i);
88
89    if (c1 < 0x80) {
90      buffer.push(c1);
91    } else if (c1 < 0x800) {
92      buffer.push((c1 >> 6) | 0xC0);
93      buffer.push((c1 & 0x3F) | 0x80);
94    } else if (c1 < 0xD800 || c1 >= 0xE000) {
95      buffer.push((c1 >> 12) | 0xE0);
96      buffer.push(((c1 >> 6) & 0x3F) | 0x80);
97      buffer.push((c1 & 0x3F) | 0x80);
98    } else {
99      // surrogate pair
100      i++;
101      checkElementIndex(i, value.length);
102      const c2 = value.charCodeAt(i);
103      const paired = 0x10000 + (((c1 & 0x3FF) << 10) | (c2 & 0x3FF));
104      buffer.push((paired >> 18) | 0xF0);
105      buffer.push(((paired >> 12) & 0x3F) | 0x80);
106      buffer.push(((paired >> 6) & 0x3F) | 0x80);
107      buffer.push((paired & 0x3F) | 0x80);
108    }
109  }
110  return new Uint8Array(buffer);
111}
112
113exports = {
114  decode,
115  encode,
116};
117