/**
 * @fileoverview A UTF-8 encoder and decoder: converts between JavaScript
 * (UTF-16) strings and UTF-8 encoded bytes.
 */
goog.module('protobuf.binary.textencoding');

// Bounds-check helper used before reading multi-unit sequences.
const {checkElementIndex} = goog.require('protobuf.internal.checks');

/**
 * Combines an array of character codes into a string.
 *
 * Each entry is handed to String.fromCharCode, so entries are treated as
 * UTF-16 code units: characters outside the BMP must already be split into a
 * high/low surrogate pair by the caller.
 *
 * @param {!Array<number>} codePoints The UTF-16 code units to join.
 * @return {string} The resulting string ('' for an empty array).
 */
function codePointsToString(codePoints) {
  // Performance: http://jsperf.com/string-fromcharcode-test/13
  // Converting in fixed-size batches keeps the number of arguments passed to
  // a single fromCharCode call bounded, however long the input is.
  const BATCH_SIZE = 10000;
  const length = codePoints.length;
  let s = '';
  let i = 0;
  while (i < length) {
    const end = Math.min(i + BATCH_SIZE, length);
    s += String.fromCharCode.apply(null, codePoints.slice(i, end));
    i = end;
  }
  return s;
}

/**
 * Decodes raw UTF-8 bytes into a string.
 * Supports codepoints from U+0000 up to U+10FFFF
 * (http://en.wikipedia.org/wiki/UTF-8).
 *
 * The decoder is lenient: stray continuation bytes (0x80-0xBF) and lead bytes
 * of 0xF8 and above produce no output and decoding resumes at the next byte.
 * Trailing bytes of a multi-byte sequence are masked with 0x3F but not
 * checked for the 10xxxxxx continuation pattern, and overlong encodings are
 * not rejected.
 *
 * @param {!DataView} bytes The UTF-8 encoded bytes.
 * @return {string} The decoded string.
 */
function decode(bytes) {
  let cursor = 0;
  // Holds UTF-16 code units; 4-byte sequences contribute a surrogate pair.
  const codePoints = [];

  while (cursor < bytes.byteLength) {
    const c = bytes.getUint8(cursor++);
    if (c < 0x80) {  // Regular 7-bit ASCII.
      codePoints.push(c);
    } else if (c < 0xC0) {
      // UTF-8 continuation mark. We are out of sync. This
      // might happen if we attempted to read a character
      // with more than four bytes.
      continue;
    } else if (c < 0xE0) {  // UTF-8 with two bytes.
      // The index of the last byte of the sequence must be inside the view.
      checkElementIndex(cursor, bytes.byteLength);
      const c2 = bytes.getUint8(cursor++);
      codePoints.push(((c & 0x1F) << 6) | (c2 & 0x3F));
    } else if (c < 0xF0) {  // UTF-8 with three bytes.
      checkElementIndex(cursor + 1, bytes.byteLength);
      const c2 = bytes.getUint8(cursor++);
      const c3 = bytes.getUint8(cursor++);
      codePoints.push(((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
    } else if (c < 0xF8) {  // UTF-8 with 4 bytes.
      checkElementIndex(cursor + 2, bytes.byteLength);
      const c2 = bytes.getUint8(cursor++);
      const c3 = bytes.getUint8(cursor++);
      const c4 = bytes.getUint8(cursor++);
      // Characters written on 4 bytes have 21 bits for a codepoint.
      // We can't fit that on 16bit characters, so we use surrogates.
      let codepoint = ((c & 0x07) << 18) | ((c2 & 0x3F) << 12) |
          ((c3 & 0x3F) << 6) | (c4 & 0x3F);
      // Surrogates formula from wikipedia.
      // 1. Subtract 0x10000 from codepoint
      codepoint -= 0x10000;
      // 2. Split this into the high 10-bit value and the low 10-bit value
      // 3. Add 0xD800 to the high value to form the high surrogate
      // 4. Add 0xDC00 to the low value to form the low surrogate:
      const low = (codepoint & 0x3FF) + 0xDC00;
      const high = ((codepoint >> 10) & 0x3FF) + 0xD800;
      codePoints.push(high, low);
    }
    // Lead bytes >= 0xF8 match no branch above and are dropped.
  }
  return codePointsToString(codePoints);
}

/**
 * Encodes a UTF-16 JavaScript string as UTF-8.
 *
 * Code units in the surrogate range (0xD800-0xDFFF) are combined with the
 * code unit that follows them into a single 4-byte sequence; the pairing is
 * not validated beyond checking that a following code unit exists.
 *
 * @param {string} value The string to encode.
 * @return {!Uint8Array} An array containing the encoded bytes.
 */
function encode(value) {
  const buffer = [];

  for (let i = 0; i < value.length; i++) {
    const c1 = value.charCodeAt(i);

    if (c1 < 0x80) {
      // One byte: 0xxxxxxx.
      buffer.push(c1);
    } else if (c1 < 0x800) {
      // Two bytes: 110xxxxx 10xxxxxx.
      buffer.push((c1 >> 6) | 0xC0, (c1 & 0x3F) | 0x80);
    } else if (c1 < 0xD800 || c1 >= 0xE000) {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
      buffer.push(
          (c1 >> 12) | 0xE0, ((c1 >> 6) & 0x3F) | 0x80, (c1 & 0x3F) | 0x80);
    } else {
      // Surrogate pair: consume the next code unit as well.
      i++;
      checkElementIndex(i, value.length);
      const c2 = value.charCodeAt(i);
      // Rebuild the codepoint from the two 10-bit halves.
      const paired = 0x10000 + (((c1 & 0x3FF) << 10) | (c2 & 0x3FF));
      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      buffer.push(
          (paired >> 18) | 0xF0, ((paired >> 12) & 0x3F) | 0x80,
          ((paired >> 6) & 0x3F) | 0x80, (paired & 0x3F) | 0x80);
    }
  }
  return new Uint8Array(buffer);
}

// Public API of the module.
exports = {
  decode,
  encode,
};