1cb93a386Sopenharmony_ci// Copyright 2018 Google LLC.
2cb93a386Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3cb93a386Sopenharmony_ci
4cb93a386Sopenharmony_ci#include "include/private/SkTFitsIn.h"
5cb93a386Sopenharmony_ci#include "src/utils/SkUTF.h"
6cb93a386Sopenharmony_ci
7cb93a386Sopenharmony_ci#include <climits>
8cb93a386Sopenharmony_ci
/** Left-shifts a signed 32-bit value by doing the shift on its unsigned bit pattern, so bits
    may move into (or through) the sign bit without a signed-overflow shift.
    @param value  the value whose bits are shifted.
    @param shift  number of bit positions to shift by.
    @returns the shifted bit pattern converted back to int32_t.
*/
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
12cb93a386Sopenharmony_ci
/** @returns true iff the lowest bit of x is clear, i.e. x is a multiple of 2. */
template <typename T> static constexpr bool is_align2(T x) { return (x & 1) == 0; }
14cb93a386Sopenharmony_ci
/** @returns true iff the two lowest bits of x are clear, i.e. x is a multiple of 4. */
template <typename T> static constexpr bool is_align4(T x) { return (x & 3) == 0; }
16cb93a386Sopenharmony_ci
/** @returns true iff c is a UTF-16 high (leading) surrogate, i.e. lies in 0xD800..0xDBFF. */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
18cb93a386Sopenharmony_ci
/** @returns true iff c is a UTF-16 low (trailing) surrogate, i.e. lies in 0xDC00..0xDFFF. */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
20cb93a386Sopenharmony_ci
/** Classifies a single byte of a UTF-8 stream.
    @returns   -1  iff invalid UTF8 byte,
                0  iff UTF8 continuation byte,
                1  iff ASCII byte,
                2  iff leading byte of 2-byte sequence,
                3  iff leading byte of 3-byte sequence, and
                4  iff leading byte of 4-byte sequence.
      I.e.: if return value > 0, then gives length of sequence.

    The ranges are spelled out as plain comparisons instead of a packed shift table, so the
    result does not depend on how the compiler shifts negative signed integers.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;   // 0xxxxxxx
    }
    if (c < 0xC0) {
        return 0;   // 10xxxxxx
    }
    if (c < 0xC2 || c > 0xF4) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    if (c < 0xE0) {
        return 2;   // 0xC2..0xDF
    }
    if (c < 0xF0) {
        return 3;   // 0xE0..0xEF
    }
    return 4;       // 0xF0..0xF4
}
/** @returns true iff `type` (a result of utf8_byte_type()) is positive, i.e. it denotes a byte
    that can start a sequence; the value is then the sequence length.
*/
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
43cb93a386Sopenharmony_ci
/** @returns true iff c is a UTF-8 continuation byte, i.e. has the bit pattern 10xxxxxx
    (0x80..0xBF).
*/
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
45cb93a386Sopenharmony_ci
46cb93a386Sopenharmony_ci////////////////////////////////////////////////////////////////////////////////
47cb93a386Sopenharmony_ci
48cb93a386Sopenharmony_ciint SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
49cb93a386Sopenharmony_ci    if (!utf8) {
50cb93a386Sopenharmony_ci        return -1;
51cb93a386Sopenharmony_ci    }
52cb93a386Sopenharmony_ci    int count = 0;
53cb93a386Sopenharmony_ci    const char* stop = utf8 + byteLength;
54cb93a386Sopenharmony_ci    while (utf8 < stop) {
55cb93a386Sopenharmony_ci        int type = utf8_byte_type(*(const uint8_t*)utf8);
56cb93a386Sopenharmony_ci        if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
57cb93a386Sopenharmony_ci            return -1;  // Sequence extends beyond end.
58cb93a386Sopenharmony_ci        }
59cb93a386Sopenharmony_ci        while(type-- > 1) {
60cb93a386Sopenharmony_ci            ++utf8;
61cb93a386Sopenharmony_ci            if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
62cb93a386Sopenharmony_ci                return -1;
63cb93a386Sopenharmony_ci            }
64cb93a386Sopenharmony_ci        }
65cb93a386Sopenharmony_ci        ++utf8;
66cb93a386Sopenharmony_ci        ++count;
67cb93a386Sopenharmony_ci    }
68cb93a386Sopenharmony_ci    return count;
69cb93a386Sopenharmony_ci}
70cb93a386Sopenharmony_ci
71cb93a386Sopenharmony_ciint SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
72cb93a386Sopenharmony_ci    if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
73cb93a386Sopenharmony_ci        return -1;
74cb93a386Sopenharmony_ci    }
75cb93a386Sopenharmony_ci    const uint16_t* src = (const uint16_t*)utf16;
76cb93a386Sopenharmony_ci    const uint16_t* stop = src + (byteLength >> 1);
77cb93a386Sopenharmony_ci    int count = 0;
78cb93a386Sopenharmony_ci    while (src < stop) {
79cb93a386Sopenharmony_ci        unsigned c = *src++;
80cb93a386Sopenharmony_ci        if (utf16_is_low_surrogate(c)) {
81cb93a386Sopenharmony_ci            return -1;
82cb93a386Sopenharmony_ci        }
83cb93a386Sopenharmony_ci        if (utf16_is_high_surrogate(c)) {
84cb93a386Sopenharmony_ci            if (src >= stop) {
85cb93a386Sopenharmony_ci                return -1;
86cb93a386Sopenharmony_ci            }
87cb93a386Sopenharmony_ci            c = *src++;
88cb93a386Sopenharmony_ci            if (!utf16_is_low_surrogate(c)) {
89cb93a386Sopenharmony_ci                return -1;
90cb93a386Sopenharmony_ci            }
91cb93a386Sopenharmony_ci        }
92cb93a386Sopenharmony_ci        count += 1;
93cb93a386Sopenharmony_ci    }
94cb93a386Sopenharmony_ci    return count;
95cb93a386Sopenharmony_ci}
96cb93a386Sopenharmony_ci
97cb93a386Sopenharmony_ciint SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
98cb93a386Sopenharmony_ci    if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) {
99cb93a386Sopenharmony_ci        return -1;
100cb93a386Sopenharmony_ci    }
101cb93a386Sopenharmony_ci    const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
102cb93a386Sopenharmony_ci    const uint32_t* ptr = (const uint32_t*)utf32;
103cb93a386Sopenharmony_ci    const uint32_t* stop = ptr + (byteLength >> 2);
104cb93a386Sopenharmony_ci    while (ptr < stop) {
105cb93a386Sopenharmony_ci        if (*ptr & kInvalidUnicharMask) {
106cb93a386Sopenharmony_ci            return -1;
107cb93a386Sopenharmony_ci        }
108cb93a386Sopenharmony_ci        ptr += 1;
109cb93a386Sopenharmony_ci    }
110cb93a386Sopenharmony_ci    return (int)(byteLength >> 2);
111cb93a386Sopenharmony_ci}
112cb93a386Sopenharmony_ci
113cb93a386Sopenharmony_citemplate <typename T>
114cb93a386Sopenharmony_cistatic SkUnichar next_fail(const T** ptr, const T* end) {
115cb93a386Sopenharmony_ci    *ptr = end;
116cb93a386Sopenharmony_ci    return -1;
117cb93a386Sopenharmony_ci}
118cb93a386Sopenharmony_ci
119cb93a386Sopenharmony_ciSkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
120cb93a386Sopenharmony_ci    if (!ptr || !end ) {
121cb93a386Sopenharmony_ci        return -1;
122cb93a386Sopenharmony_ci    }
123cb93a386Sopenharmony_ci    const uint8_t*  p = (const uint8_t*)*ptr;
124cb93a386Sopenharmony_ci    if (!p || p >= (const uint8_t*)end) {
125cb93a386Sopenharmony_ci        return next_fail(ptr, end);
126cb93a386Sopenharmony_ci    }
127cb93a386Sopenharmony_ci    int             c = *p;
128cb93a386Sopenharmony_ci    int             hic = c << 24;
129cb93a386Sopenharmony_ci
130cb93a386Sopenharmony_ci    if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
131cb93a386Sopenharmony_ci        return next_fail(ptr, end);
132cb93a386Sopenharmony_ci    }
133cb93a386Sopenharmony_ci    if (hic < 0) {
134cb93a386Sopenharmony_ci        uint32_t mask = (uint32_t)~0x3F;
135cb93a386Sopenharmony_ci        hic = left_shift(hic, 1);
136cb93a386Sopenharmony_ci        do {
137cb93a386Sopenharmony_ci            ++p;
138cb93a386Sopenharmony_ci            if (p >= (const uint8_t*)end) {
139cb93a386Sopenharmony_ci                return next_fail(ptr, end);
140cb93a386Sopenharmony_ci            }
141cb93a386Sopenharmony_ci            // check before reading off end of array.
142cb93a386Sopenharmony_ci            uint8_t nextByte = *p;
143cb93a386Sopenharmony_ci            if (!utf8_byte_is_continuation(nextByte)) {
144cb93a386Sopenharmony_ci                return next_fail(ptr, end);
145cb93a386Sopenharmony_ci            }
146cb93a386Sopenharmony_ci            c = (c << 6) | (nextByte & 0x3F);
147cb93a386Sopenharmony_ci            mask <<= 5;
148cb93a386Sopenharmony_ci        } while ((hic = left_shift(hic, 1)) < 0);
149cb93a386Sopenharmony_ci        c &= ~mask;
150cb93a386Sopenharmony_ci    }
151cb93a386Sopenharmony_ci    *ptr = (char*)p + 1;
152cb93a386Sopenharmony_ci    return c;
153cb93a386Sopenharmony_ci}
154cb93a386Sopenharmony_ci
155cb93a386Sopenharmony_ciSkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
156cb93a386Sopenharmony_ci    if (!ptr || !end ) {
157cb93a386Sopenharmony_ci        return -1;
158cb93a386Sopenharmony_ci    }
159cb93a386Sopenharmony_ci    const uint16_t* src = *ptr;
160cb93a386Sopenharmony_ci    if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
161cb93a386Sopenharmony_ci        return next_fail(ptr, end);
162cb93a386Sopenharmony_ci    }
163cb93a386Sopenharmony_ci    uint16_t c = *src++;
164cb93a386Sopenharmony_ci    SkUnichar result = c;
165cb93a386Sopenharmony_ci    if (utf16_is_low_surrogate(c)) {
166cb93a386Sopenharmony_ci        return next_fail(ptr, end);  // srcPtr should never point at low surrogate.
167cb93a386Sopenharmony_ci    }
168cb93a386Sopenharmony_ci    if (utf16_is_high_surrogate(c)) {
169cb93a386Sopenharmony_ci        if (src + 1 > end) {
170cb93a386Sopenharmony_ci            return next_fail(ptr, end);  // Truncated string.
171cb93a386Sopenharmony_ci        }
172cb93a386Sopenharmony_ci        uint16_t low = *src++;
173cb93a386Sopenharmony_ci        if (!utf16_is_low_surrogate(low)) {
174cb93a386Sopenharmony_ci            return next_fail(ptr, end);
175cb93a386Sopenharmony_ci        }
176cb93a386Sopenharmony_ci        /*
177cb93a386Sopenharmony_ci        [paraphrased from wikipedia]
178cb93a386Sopenharmony_ci        Take the high surrogate and subtract 0xD800, then multiply by 0x400.
179cb93a386Sopenharmony_ci        Take the low surrogate and subtract 0xDC00.  Add these two results
180cb93a386Sopenharmony_ci        together, and finally add 0x10000 to get the final decoded codepoint.
181cb93a386Sopenharmony_ci
182cb93a386Sopenharmony_ci        unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
183cb93a386Sopenharmony_ci        unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
184cb93a386Sopenharmony_ci        unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
185cb93a386Sopenharmony_ci        unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
186cb93a386Sopenharmony_ci        */
187cb93a386Sopenharmony_ci        result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
188cb93a386Sopenharmony_ci    }
189cb93a386Sopenharmony_ci    *ptr = src;
190cb93a386Sopenharmony_ci    return result;
191cb93a386Sopenharmony_ci}
192cb93a386Sopenharmony_ci
193cb93a386Sopenharmony_ciSkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
194cb93a386Sopenharmony_ci    if (!ptr || !end ) {
195cb93a386Sopenharmony_ci        return -1;
196cb93a386Sopenharmony_ci    }
197cb93a386Sopenharmony_ci    const int32_t* s = *ptr;
198cb93a386Sopenharmony_ci    if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
199cb93a386Sopenharmony_ci        return next_fail(ptr, end);
200cb93a386Sopenharmony_ci    }
201cb93a386Sopenharmony_ci    int32_t value = *s;
202cb93a386Sopenharmony_ci    const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
203cb93a386Sopenharmony_ci    if (value & kInvalidUnicharMask) {
204cb93a386Sopenharmony_ci        return next_fail(ptr, end);
205cb93a386Sopenharmony_ci    }
206cb93a386Sopenharmony_ci    *ptr = s + 1;
207cb93a386Sopenharmony_ci    return value;
208cb93a386Sopenharmony_ci}
209cb93a386Sopenharmony_ci
210cb93a386Sopenharmony_cisize_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
211cb93a386Sopenharmony_ci    if ((uint32_t)uni > 0x10FFFF) {
212cb93a386Sopenharmony_ci        return 0;
213cb93a386Sopenharmony_ci    }
214cb93a386Sopenharmony_ci    if (uni <= 127) {
215cb93a386Sopenharmony_ci        if (utf8) {
216cb93a386Sopenharmony_ci            *utf8 = (char)uni;
217cb93a386Sopenharmony_ci        }
218cb93a386Sopenharmony_ci        return 1;
219cb93a386Sopenharmony_ci    }
220cb93a386Sopenharmony_ci    char    tmp[4];
221cb93a386Sopenharmony_ci    char*   p = tmp;
222cb93a386Sopenharmony_ci    size_t  count = 1;
223cb93a386Sopenharmony_ci    while (uni > 0x7F >> count) {
224cb93a386Sopenharmony_ci        *p++ = (char)(0x80 | (uni & 0x3F));
225cb93a386Sopenharmony_ci        uni >>= 6;
226cb93a386Sopenharmony_ci        count += 1;
227cb93a386Sopenharmony_ci    }
228cb93a386Sopenharmony_ci    if (utf8) {
229cb93a386Sopenharmony_ci        p = tmp;
230cb93a386Sopenharmony_ci        utf8 += count;
231cb93a386Sopenharmony_ci        while (p < tmp + count - 1) {
232cb93a386Sopenharmony_ci            *--utf8 = *p++;
233cb93a386Sopenharmony_ci        }
234cb93a386Sopenharmony_ci        *--utf8 = (char)(~(0xFF >> count) | uni);
235cb93a386Sopenharmony_ci    }
236cb93a386Sopenharmony_ci    return count;
237cb93a386Sopenharmony_ci}
238cb93a386Sopenharmony_ci
239cb93a386Sopenharmony_cisize_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
240cb93a386Sopenharmony_ci    if ((uint32_t)uni > 0x10FFFF) {
241cb93a386Sopenharmony_ci        return 0;
242cb93a386Sopenharmony_ci    }
243cb93a386Sopenharmony_ci    int extra = (uni > 0xFFFF);
244cb93a386Sopenharmony_ci    if (utf16) {
245cb93a386Sopenharmony_ci        if (extra) {
246cb93a386Sopenharmony_ci            utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
247cb93a386Sopenharmony_ci            utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
248cb93a386Sopenharmony_ci        } else {
249cb93a386Sopenharmony_ci            utf16[0] = (uint16_t)uni;
250cb93a386Sopenharmony_ci        }
251cb93a386Sopenharmony_ci    }
252cb93a386Sopenharmony_ci    return 1 + extra;
253cb93a386Sopenharmony_ci}
254cb93a386Sopenharmony_ci
255cb93a386Sopenharmony_ciint SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) {
256cb93a386Sopenharmony_ci    if (!dst) {
257cb93a386Sopenharmony_ci        dstCapacity = 0;
258cb93a386Sopenharmony_ci    }
259cb93a386Sopenharmony_ci
260cb93a386Sopenharmony_ci    int dstLength = 0;
261cb93a386Sopenharmony_ci    uint16_t* endDst = dst + dstCapacity;
262cb93a386Sopenharmony_ci    const char* endSrc = src + srcByteLength;
263cb93a386Sopenharmony_ci    while (src < endSrc) {
264cb93a386Sopenharmony_ci        SkUnichar uni = NextUTF8(&src, endSrc);
265cb93a386Sopenharmony_ci        if (uni < 0) {
266cb93a386Sopenharmony_ci            return -1;
267cb93a386Sopenharmony_ci        }
268cb93a386Sopenharmony_ci
269cb93a386Sopenharmony_ci        uint16_t utf16[2];
270cb93a386Sopenharmony_ci        size_t count = ToUTF16(uni, utf16);
271cb93a386Sopenharmony_ci        if (count == 0) {
272cb93a386Sopenharmony_ci            return -1;
273cb93a386Sopenharmony_ci        }
274cb93a386Sopenharmony_ci        dstLength += count;
275cb93a386Sopenharmony_ci
276cb93a386Sopenharmony_ci        if (dst) {
277cb93a386Sopenharmony_ci            uint16_t* elems = utf16;
278cb93a386Sopenharmony_ci            while (dst < endDst && count > 0) {
279cb93a386Sopenharmony_ci                *dst++ = *elems++;
280cb93a386Sopenharmony_ci                count -= 1;
281cb93a386Sopenharmony_ci            }
282cb93a386Sopenharmony_ci        }
283cb93a386Sopenharmony_ci    }
284cb93a386Sopenharmony_ci    return dstLength;
285cb93a386Sopenharmony_ci}
286cb93a386Sopenharmony_ci
287cb93a386Sopenharmony_ciint SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) {
288cb93a386Sopenharmony_ci    if (!dst) {
289cb93a386Sopenharmony_ci        dstCapacity = 0;
290cb93a386Sopenharmony_ci    }
291cb93a386Sopenharmony_ci
292cb93a386Sopenharmony_ci    int dstLength = 0;
293cb93a386Sopenharmony_ci    const char* endDst = dst + dstCapacity;
294cb93a386Sopenharmony_ci    const uint16_t* endSrc = src + srcLength;
295cb93a386Sopenharmony_ci    while (src < endSrc) {
296cb93a386Sopenharmony_ci        SkUnichar uni = NextUTF16(&src, endSrc);
297cb93a386Sopenharmony_ci        if (uni < 0) {
298cb93a386Sopenharmony_ci            return -1;
299cb93a386Sopenharmony_ci        }
300cb93a386Sopenharmony_ci
301cb93a386Sopenharmony_ci        char utf8[SkUTF::kMaxBytesInUTF8Sequence];
302cb93a386Sopenharmony_ci        size_t count = ToUTF8(uni, utf8);
303cb93a386Sopenharmony_ci        if (count == 0) {
304cb93a386Sopenharmony_ci            return -1;
305cb93a386Sopenharmony_ci        }
306cb93a386Sopenharmony_ci        dstLength += count;
307cb93a386Sopenharmony_ci
308cb93a386Sopenharmony_ci        if (dst) {
309cb93a386Sopenharmony_ci            const char* elems = utf8;
310cb93a386Sopenharmony_ci            while (dst < endDst && count > 0) {
311cb93a386Sopenharmony_ci                *dst++ = *elems++;
312cb93a386Sopenharmony_ci                count -= 1;
313cb93a386Sopenharmony_ci            }
314cb93a386Sopenharmony_ci        }
315cb93a386Sopenharmony_ci    }
316cb93a386Sopenharmony_ci    return dstLength;
317cb93a386Sopenharmony_ci}
318