1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf.h"
17 
18 #include <cstddef>
19 #include <cstring>
20 
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24 
25 namespace panda::utf {
26 
// Largest value representable by a single 16-bit unit.
constexpr size_t MAX_U16 = std::numeric_limits<uint16_t>::max();

// Named small integers used in this file as byte counts, array indices and shift amounts.
constexpr size_t CONST_2 = 2U;
constexpr size_t CONST_3 = 3U;
constexpr size_t CONST_4 = 4U;
constexpr size_t CONST_6 = 6U;
constexpr size_t CONST_12 = 12U;
33 
34 struct MUtf8Char {
35     size_t n;
36     std::array<uint8_t, CONST_4> ch;
37 };
38 
39 /*
40  * MUtf-8
41  *
42  * U+0000 => C0 80
43  *
44  * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
45  *    code point   code point   code point
46  * 1  7            U+0000       U+007F      0xxxxxxx
47  * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
48  * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
49  * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
50  * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
51  */
52 
53 /*
54  * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
55  * In case of invalid sequence return first byte of it.
56  */
ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)57 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)
58 {
59     // TODO(d.kovalneko): make the function safe
60     Span<const uint8_t> sp(data, max_bytes);
61     uint8_t d0 = sp[0];
62     if ((d0 & MASK1) == 0) {
63         return {d0, 1};
64     }
65 
66     if (max_bytes < CONST_2) {
67         return {d0, 1};
68     }
69     uint8_t d1 = sp[1];
70     if ((d0 & MASK2) == 0) {
71         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
72     }
73 
74     if (max_bytes < CONST_3) {
75         return {d0, 1};
76     }
77     uint8_t d2 = sp[CONST_2];
78     if ((d0 & MASK3) == 0) {
79         return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
80                 CONST_3};
81     }
82 
83     if (max_bytes < CONST_4) {
84         return {d0, 1};
85     }
86     uint8_t d3 = sp[CONST_3];
87     uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
88                           ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
89 
90     uint32_t pair = 0;
91     pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
92     pair <<= PAIR_ELEMENT_WIDTH;
93     pair |= (code_point & MASK_10BIT) + U16_TAIL;
94 
95     return {pair, CONST_4};
96 }
97 
CombineTwoU16(uint16_t d0, uint16_t d1)98 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
99 {
100     uint32_t codePoint = d0 - HI_SURROGATE_MIN;
101     codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
102     codePoint |= d1 - LO_SURROGATE_MIN;
103     codePoint += LO_SUPPLEMENTS_MIN;
104     return codePoint;
105 }
106 
ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)107 constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
108 {
109     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
110     // means that is a single code point, it needs to be represented by three MUTF8 code.
111     if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
112         auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
113         auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
114         auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
115         return {CONST_3, {ch0, ch1, ch2}};
116     }
117 
118     if (d0 == 0) {
119         return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}};
120     }
121     if (d0 <= MUTF8_1B_MAX) {
122         return {1, {static_cast<uint8_t>(d0)}};
123     }
124     if (d0 <= MUTF8_2B_MAX) {
125         auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6));
126         auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT));
127         return {CONST_2, {ch0, ch1}};
128     }
129     if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
130         auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
131         auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
132         auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
133         return {CONST_3, {ch0, ch1, ch2}};
134     }
135 
136     uint32_t codePoint = CombineTwoU16(d0, d1);
137 
138     auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST);
139     auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1);
140     auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1);
141     auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
142 
143     return {CONST_4, {ch0, ch1, ch2, ch3}};
144 }
145 
IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)146 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)
147 {
148     while (*mutf8_in != '\0') {    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
149         if (*mutf8_in >= MASK1) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
150             return false;
151         }
152         mutf8_in += 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
153     }
154     return true;
155 }
156 
ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, size_t start)157 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
158                                  size_t start)
159 {
160     size_t mutf8_pos = 0;
161     if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) {
162         return 0;
163     }
164     size_t end = start + utf16_len;
165     for (size_t i = start; i < end; ++i) {
166         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
167         uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0;
168         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
169         MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code);
170         if (mutf8_pos + ch.n > mutf8_len) {
171             break;
172         }
173         for (size_t c = 0; c < ch.n; ++c) {
174             mutf8_out[mutf8_pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
175         }
176         if (ch.n == CONST_4) {  // Two UTF-16 chars are used
177             ++i;
178         }
179     }
180     return mutf8_pos;
181 }
182 
ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)183 void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)
184 {
185     size_t in_pos = 0;
186     while (in_pos < mutf8_len) {
187         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
188         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
189 
190         if (p_hi != 0) {
191             *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
192         }
193         *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194 
195         mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
196         in_pos += nbytes;
197     }
198 }
199 
ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, size_t start)200 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
201                                  size_t start)
202 {
203     size_t in_pos = 0;
204     size_t out_pos = 0;
205     while (in_pos < mutf8_len) {
206         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
207         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
208 
209         mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
210         in_pos += nbytes;
211         if (start > 0) {
212             start -= nbytes;
213             continue;
214         }
215 
216         if (p_hi != 0) {
217             ASSERT(utf16_len >= 1);
218             if (out_pos++ >= utf16_len - 1) {  // check for place for two uint16
219                 --out_pos;
220                 break;
221             }
222             *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
223         }
224         if (out_pos++ >= utf16_len) {
225             --out_pos;
226             break;
227         }
228         *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
229     }
230     return out_pos;
231 }
232 
CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)233 int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
234 {
235     uint32_t c1;
236     uint32_t c2;
237     uint32_t n1;
238     uint32_t n2;
239 
240     do {
241         c1 = *mutf8_1;
242         c2 = *mutf8_2;
243 
244         if (c1 == 0 && c2 == 0) {
245             return 0;
246         }
247 
248         if (c1 == 0 && c2 != 0) {
249             return -1;
250         }
251 
252         if (c1 != 0 && c2 == 0) {
253             return 1;
254         }
255 
256         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1);
257         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2);
258 
259         mutf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
260         mutf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
261     } while (c1 == c2);
262 
263     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
264     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
265 
266     auto result = static_cast<int>(c1p1 - c2p1);
267     if (result != 0) {
268         return result;
269     }
270 
271     return c1p2 - c2p2;
272 }
273 
274 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)275 int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)
276 {
277     uint32_t c1;
278     uint32_t c2;
279     uint32_t n1;
280     uint32_t n2;
281 
282     uint32_t utf8_1_index = 0;
283     uint32_t utf8_2_index = 0;
284 
285     do {
286         if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) {
287             return 0;
288         }
289 
290         if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) {
291             return -1;
292         }
293 
294         if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) {
295             return 1;
296         }
297 
298         c1 = *utf8_1;
299         c2 = *utf8_2;
300 
301         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1);
302         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2);
303 
304         utf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
305         utf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
306         utf8_1_index += n1;
307         utf8_2_index += n2;
308     } while (c1 == c2);
309 
310     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
311     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
312 
313     auto result = static_cast<int>(c1p1 - c2p1);
314     if (result != 0) {
315         return result;
316     }
317 
318     return c1p2 - c2p2;
319 }
320 
Mutf8Size(const uint8_t *mutf8)321 size_t Mutf8Size(const uint8_t *mutf8)
322 {
323     return strlen(Mutf8AsCString(mutf8));
324 }
325 
MUtf8ToUtf16Size(const uint8_t *mutf8)326 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
327 {
328     // TODO(d.kovalenko): make it faster
329     size_t res = 0;
330     while (*mutf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
331         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
332         res += pair > MAX_U16 ? CONST_2 : 1;
333         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
334     }
335     return res;
336 }
337 
MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)338 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)
339 {
340     size_t pos = 0;
341     size_t res = 0;
342     while (pos != mutf8_len) {
343         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos);
344         if (nbytes == 0) {
345             nbytes = 1;
346         }
347         res += pair > MAX_U16 ? CONST_2 : 1;
348         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
349         pos += nbytes;
350     }
351     return res;
352 }
353 
Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)354 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
355 {
356     size_t res = 1;  // zero byte
357     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
358     // means that is a single code point, it needs to be represented by three MUTF8 code.
359     if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360         mutf16[0] <= LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
361         res += CONST_3;
362         return res;
363     }
364 
365     for (uint32_t i = 0; i < length; ++i) {
366         // NOLINTNEXTLINE(bugprone-branch-clone)
367         if (mutf16[i] == 0) {                    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
368             res += CONST_2;                      // special case for U+0000 => C0 80
369         } else if (mutf16[i] <= MUTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
370             res += 1;
371         } else if (mutf16[i] <= MUTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
372             res += CONST_2;
373             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
374         } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) {
375             res += CONST_3;
376         } else {
377             res += CONST_4;
378             ++i;
379         }
380     }
381     return res;
382 }
383 
IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)384 bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)
385 {
386     if (utf8_1.size() != utf8_2.size()) {
387         return false;
388     }
389 
390     return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0;
391 }
392 
IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)393 bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
394 {
395     return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0;
396 }
397 
IsValidModifiedUTF8(const uint8_t *elems)398 bool IsValidModifiedUTF8(const uint8_t *elems)
399 {
400     ASSERT(elems);
401 
402     while (*elems != '\0') {
403         // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
404         switch (*elems & 0xf0) {
405             case 0x00:
406             case 0x10:  // NOLINT(readability-magic-numbers)
407             case 0x20:  // NOLINT(readability-magic-numbers)
408             case 0x30:  // NOLINT(readability-magic-numbers)
409             case 0x40:  // NOLINT(readability-magic-numbers)
410             case 0x50:  // NOLINT(readability-magic-numbers)
411             case 0x60:  // NOLINT(readability-magic-numbers)
412             case 0x70:  // NOLINT(readability-magic-numbers)
413                 // pattern 0xxx
414                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
415                 ++elems;
416                 break;
417             case 0x80:  // NOLINT(readability-magic-numbers)
418             case 0x90:  // NOLINT(readability-magic-numbers)
419             case 0xa0:  // NOLINT(readability-magic-numbers)
420             case 0xb0:  // NOLINT(readability-magic-numbers)
421                 // pattern 10xx is illegal start
422                 return false;
423 
424             case 0xf0:  // NOLINT(readability-magic-numbers)
425                 // pattern 1111 0xxx starts four byte section
426                 if ((*elems & 0x08) == 0) {  // NOLINT(hicpp-signed-bitwise)
427                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
428                     ++elems;
429                     if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
430                         return false;
431                     }
432                 } else {
433                     return false;
434                 }
435                 // no need break
436                 [[fallthrough]];
437 
438             case 0xe0:  // NOLINT(readability-magic-numbers)
439                 // pattern 1110
440                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
441                 ++elems;
442                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
443                     return false;
444                 }
445                 // no need break
446                 [[fallthrough]];
447 
448             case 0xc0:  // NOLINT(readability-magic-numbers)
449             case 0xd0:  // NOLINT(readability-magic-numbers)
450                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
451                 ++elems;
452                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
453                     return false;
454                 }
455                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
456                 ++elems;
457                 break;
458             default:
459                 break;
460         }
461     }
462     return true;
463 }
464 
465 }  // namespace panda::utf
466