/**
 * @fileoverview A UTF-8 encoder and decoder.
 */
goog.module('protobuf.binary.textencoding');

const {checkElementIndex} = goog.require('protobuf.internal.checks');

/**
 * Combines an array of character codes into a string.
 *
 * The values are handed to String.fromCharCode, so each entry is interpreted
 * as a single UTF-16 code unit (supplementary characters must already be
 * split into a surrogate pair by the caller).
 *
 * @param {!Array<number>} codePoints
 * @return {string}
 */
function codePointsToString(codePoints) {
  // Performance: http://jsperf.com/string-fromcharcode-test/13
  let s = '', i = 0;
  const length = codePoints.length;
  // Convert in fixed-size batches so that a single apply() call never
  // receives more than BATCH_SIZE arguments.
  const BATCH_SIZE = 10000;
  while (i < length) {
    const end = Math.min(i + BATCH_SIZE, length);
    s += String.fromCharCode.apply(null, codePoints.slice(i, end));
    i = end;
  }
  return s;
}

/**
 * Decodes raw bytes into a string.
 * Supports codepoints from U+0000 up to U+10FFFF.
 * (http://en.wikipedia.org/wiki/UTF-8).
 *
 * Stray continuation bytes (0x80-0xBF) are skipped, and lead bytes >= 0xF8
 * match no branch and are therefore dropped as well.
 *
 * NOTE(review): continuation bytes are masked with 0x3F but never checked for
 * the 0b10xxxxxx prefix, and overlong or > U+10FFFF four-byte sequences are
 * not rejected - confirm that callers only pass well-formed UTF-8.
 * NOTE(review): checkElementIndex presumably throws when a multi-byte
 * sequence is truncated - confirm against protobuf.internal.checks.
 *
 * @param {!DataView} bytes
 * @return {string}
 */
function decode(bytes) {
  let cursor = 0;
  const codePoints = [];

  while (cursor < bytes.byteLength) {
    const c = bytes.getUint8(cursor++);
    if (c < 0x80) {  // Regular 7-bit ASCII.
      codePoints.push(c);
    } else if (c < 0xC0) {
      // UTF-8 continuation mark. We are out of sync. This
      // might happen if we attempted to read a character
      // with more than four bytes.
      continue;
    } else if (c < 0xE0) {  // UTF-8 with two bytes.
      // cursor already points at the second byte of the sequence.
      checkElementIndex(cursor, bytes.byteLength);
      const c2 = bytes.getUint8(cursor++);
      codePoints.push(((c & 0x1F) << 6) | (c2 & 0x3F));
    } else if (c < 0xF0) {  // UTF-8 with three bytes.
      // cursor + 1 is the index of the last byte of the sequence.
      checkElementIndex(cursor + 1, bytes.byteLength);
      const c2 = bytes.getUint8(cursor++);
      const c3 = bytes.getUint8(cursor++);
      codePoints.push(((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
    } else if (c < 0xF8) {  // UTF-8 with 4 bytes.
      // cursor + 2 is the index of the last byte of the sequence.
      checkElementIndex(cursor + 2, bytes.byteLength);
      const c2 = bytes.getUint8(cursor++);
      const c3 = bytes.getUint8(cursor++);
      const c4 = bytes.getUint8(cursor++);
      // Characters written on 4 bytes have 21 bits for a codepoint.
      // We can't fit that on 16bit characters, so we use surrogates.
      let codepoint = ((c & 0x07) << 18) | ((c2 & 0x3F) << 12) |
          ((c3 & 0x3F) << 6) | (c4 & 0x3F);
      // Surrogates formula from wikipedia.
      // 1. Subtract 0x10000 from codepoint
      codepoint -= 0x10000;
      // 2. Split this into the high 10-bit value and the low 10-bit value
      // 3. Add 0xD800 to the high value to form the high surrogate
      // 4. Add 0xDC00 to the low value to form the low surrogate:
      const low = (codepoint & 0x3FF) + 0xDC00;
      const high = ((codepoint >> 10) & 0x3FF) + 0xD800;
      codePoints.push(high, low);
    }
  }
  return codePointsToString(codePoints);
}

/**
 * Encodes a UTF-16 JavaScript string as UTF-8 bytes.
 *
 * NOTE(review): any code unit in 0xD800-0xDFFF is treated as the first half
 * of a surrogate pair and the following code unit is consumed as the second
 * half without checking that either is a valid high/low surrogate - confirm
 * that callers never pass strings with unpaired surrogates.
 *
 * @param {string} value The string to encode.
 * @return {!Uint8Array} An array containing the encoded bytes.
 */
function encode(value) {
  const buffer = [];

  for (let i = 0; i < value.length; i++) {
    const c1 = value.charCodeAt(i);

    if (c1 < 0x80) {
      // One byte: 0xxxxxxx.
      buffer.push(c1);
    } else if (c1 < 0x800) {
      // Two bytes: 110xxxxx 10xxxxxx.
      buffer.push((c1 >> 6) | 0xC0);
      buffer.push((c1 & 0x3F) | 0x80);
    } else if (c1 < 0xD800 || c1 >= 0xE000) {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
      buffer.push((c1 >> 12) | 0xE0);
      buffer.push(((c1 >> 6) & 0x3F) | 0x80);
      buffer.push((c1 & 0x3F) | 0x80);
    } else {
      // surrogate pair
      i++;
      // The second half of the pair must exist within the string.
      checkElementIndex(i, value.length);
      const c2 = value.charCodeAt(i);
      // Recombine the two 10-bit halves into the supplementary code point.
      const paired = 0x10000 + (((c1 & 0x3FF) << 10) | (c2 & 0x3FF));
      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      buffer.push((paired >> 18) | 0xF0);
      buffer.push(((paired >> 12) & 0x3F) | 0x80);
      buffer.push(((paired >> 6) & 0x3F) | 0x80);
      buffer.push((paired & 0x3F) | 0x80);
    }
  }
  return new Uint8Array(buffer);
}

exports = {
  decode,
  encode,
};