1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf.h"
17 
18 #include <cstddef>
19 #include <cstring>
20 
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24 
// Constant subtracted when a surrogate pair is folded into a code point. It equals
// (0xd800 << 10) + 0xdc00 - 0x10000, i.e. the lead/trail bases minus the supplementary-plane base.
static constexpr uint32_t U16_SURROGATE_OFFSET = (0xd800U << 10U) + 0xdc00U - 0x10000U;
// Folds a lead and a trail UTF-16 surrogate into the code point they encode:
// (lead << 10) + trail - U16_SURROGATE_OFFSET, evaluated in uint32_t.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<uint32_t>(lead) << 10U) + static_cast<uint32_t>(trail) - U16_SURROGATE_OFFSET)
30 
31 namespace ark::utf {
32 
33 /*
34  * MUtf-8
35  *
36  * U+0000 => C0 80
37  *
38  * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
39  *    code point   code point   code point
40  * 1  7            U+0000       U+007F      0xxxxxxx
41  * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
42  * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
43  * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
44  * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
45  */
46 
47 /*
48  * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
49  * In case of invalid sequence return first byte of it.
50  */
ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes)51 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes)
52 {
53     // NOTE(d.kovalneko): make the function safe
54     Span<const uint8_t> sp(data, maxBytes);
55     uint8_t d0 = sp[0];
56     if ((d0 & MASK1) == 0) {
57         return {d0, 1};
58     }
59 
60     if (maxBytes < CONST_2) {
61         return {d0, 1};
62     }
63     uint8_t d1 = sp[1];
64     if ((d0 & MASK2) == 0) {
65         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
66     }
67 
68     if (maxBytes < CONST_3) {
69         return {d0, 1};
70     }
71     uint8_t d2 = sp[CONST_2];
72     if ((d0 & MASK3) == 0) {
73         return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
74                 CONST_3};
75     }
76 
77     if (maxBytes < CONST_4) {
78         return {d0, 1};
79     }
80     uint8_t d3 = sp[CONST_3];
81     uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
82                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
83 
84     uint32_t pair = 0;
85     pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
86     pair <<= PAIR_ELEMENT_WIDTH;
87     pair |= (codePoint & MASK_10BIT) + U16_TAIL;
88 
89     return {pair, CONST_4};
90 }
91 
CombineTwoU16(uint16_t d0, uint16_t d1)92 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
93 {
94     uint32_t codePoint = d0 - DECODE_LEAD_LOW;
95     codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
96     codePoint |= d1 - DECODE_TRAIL_LOW;  // NOLINT(hicpp-signed-bitwise
97     codePoint += DECODE_SECOND_FACTOR;
98     return codePoint;
99 }
100 
IsMUtf8OnlySingleBytes(const uint8_t *mutf8In)101 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8In)
102 {
103     while (*mutf8In != '\0') {    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104         if (*mutf8In >= MASK1) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
105             return false;
106         }
107         mutf8In += 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
108     }
109     return true;
110 }
111 
ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len, size_t mutf8Len, size_t start)112 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len, size_t mutf8Len,
113                                  size_t start)
114 {
115     return ConvertRegionUtf16ToUtf8(utf16In, mutf8Out, utf16Len, mutf8Len, start, true);
116 }
117 
ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out)118 void ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out)
119 {
120     size_t inPos = 0;
121     while (inPos < mutf8Len) {
122         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
123         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
124 
125         if (p_hi != 0) {
126             *utf16Out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
127         }
128         *utf16Out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
129 
130         mutf8In += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
131         inPos += nbytes;
132     }
133 }
134 
ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)135 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len, size_t utf16Len,
136                                  size_t start)
137 {
138     size_t inPos = 0;
139     size_t outPos = 0;
140     while (inPos < mutf8Len) {
141         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
142         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
143 
144         mutf8In += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145         inPos += nbytes;
146         if (start > 0) {
147             start -= nbytes;
148             continue;
149         }
150 
151         if (p_hi != 0) {
152             if (outPos++ >= utf16Len - 1) {  // check for place for two uint16
153                 --outPos;
154                 break;
155             }
156             *utf16Out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
157         }
158         if (outPos++ >= utf16Len) {
159             --outPos;
160             break;
161         }
162         *utf16Out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
163     }
164     return outPos;
165 }
166 
CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82)167 int CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82)
168 {
169     uint32_t c1;
170     uint32_t c2;
171     uint32_t n1;
172     uint32_t n2;
173 
174     do {
175         c1 = *mutf81;
176         c2 = *mutf82;
177 
178         if (c1 == 0 && c2 == 0) {
179             return 0;
180         }
181 
182         if (c1 == 0 && c2 != 0) {
183             return -1;
184         }
185 
186         if (c1 != 0 && c2 == 0) {
187             return 1;
188         }
189 
190         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf81);
191         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf82);
192 
193         mutf81 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194         mutf82 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195     } while (c1 == c2);
196 
197     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
198     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
199 
200     auto result = static_cast<int>(c1p1 - c2p1);
201     if (result != 0) {
202         return result;
203     }
204 
205     return c1p2 - c2p2;
206 }
207 
208 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82, size_t utf82Length)209 int CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82, size_t utf82Length)
210 {
211     uint32_t c1;
212     uint32_t c2;
213     uint32_t n1;
214     uint32_t n2;
215 
216     uint32_t utf81Index = 0;
217     uint32_t utf82Index = 0;
218 
219     do {
220         if (utf81Index == utf81Length && utf82Index == utf82Length) {
221             return 0;
222         }
223 
224         if (utf81Index == utf81Length && utf82Index < utf82Length) {
225             return -1;
226         }
227 
228         if (utf81Index < utf81Length && utf82Index == utf82Length) {
229             return 1;
230         }
231 
232         c1 = *utf81;
233         c2 = *utf82;
234 
235         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf81);
236         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf82);
237 
238         utf81 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
239         utf82 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240         utf81Index += n1;
241         utf82Index += n2;
242     } while (c1 == c2);
243 
244     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
245     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
246 
247     auto result = static_cast<int>(c1p1 - c2p1);
248     if (result != 0) {
249         return result;
250     }
251 
252     return c1p2 - c2p2;
253 }
254 
Mutf8Size(const uint8_t *mutf8)255 size_t Mutf8Size(const uint8_t *mutf8)
256 {
257     return strlen(Mutf8AsCString(mutf8));
258 }
259 
MUtf8ToUtf16Size(const uint8_t *mutf8)260 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
261 {
262     // NOTE(d.kovalenko): make it faster
263     size_t res = 0;
264     while (*mutf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
266         res += pair > MAX_U16 ? CONST_2 : 1;
267         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
268     }
269     return res;
270 }
271 
MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len)272 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len)
273 {
274     size_t pos = 0;
275     size_t res = 0;
276     while (pos != mutf8Len) {
277         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
278         if (nbytes == 0) {
279             nbytes = 1;
280         }
281         res += pair > MAX_U16 ? CONST_2 : 1;
282         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
283         pos += nbytes;
284     }
285     return res;
286 }
287 
IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82)288 bool IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82)
289 {
290     if (utf81.size() != utf82.size()) {
291         return false;
292     }
293 
294     return memcmp(utf81.data(), utf82.data(), utf81.size()) == 0;
295 }
296 
IsEqual(const uint8_t *mutf81, const uint8_t *mutf82)297 bool IsEqual(const uint8_t *mutf81, const uint8_t *mutf82)
298 {
299     return strcmp(Mutf8AsCString(mutf81), Mutf8AsCString(mutf82)) == 0;
300 }
301 
IsValidModifiedUTF8(const uint8_t *elems)302 bool IsValidModifiedUTF8(const uint8_t *elems)
303 {
304     ASSERT(elems);
305 
306     while (*elems != '\0') {
307         // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
308         switch (*elems & 0xf0) {
309             case 0x00:
310             case 0x10:  // NOLINT(readability-magic-numbers)
311             case 0x20:  // NOLINT(readability-magic-numbers)
312             case 0x30:  // NOLINT(readability-magic-numbers)
313             case 0x40:  // NOLINT(readability-magic-numbers)
314             case 0x50:  // NOLINT(readability-magic-numbers)
315             case 0x60:  // NOLINT(readability-magic-numbers)
316             case 0x70:  // NOLINT(readability-magic-numbers)
317                 // pattern 0xxx
318                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
319                 ++elems;
320                 break;
321             case 0x80:  // NOLINT(readability-magic-numbers)
322             case 0x90:  // NOLINT(readability-magic-numbers)
323             case 0xa0:  // NOLINT(readability-magic-numbers)
324             case 0xb0:  // NOLINT(readability-magic-numbers)
325                 // pattern 10xx is illegal start
326                 return false;
327 
328             case 0xf0:  // NOLINT(readability-magic-numbers)
329                 // pattern 1111 0xxx starts four byte section
330                 if ((*elems & 0x08) == 0) {  // NOLINT(hicpp-signed-bitwise)
331                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
332                     ++elems;
333                     if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
334                         return false;
335                     }
336                 } else {
337                     return false;
338                 }
339                 // no need break
340                 [[fallthrough]];
341 
342             case 0xe0:  // NOLINT(readability-magic-numbers)
343                 // pattern 1110
344                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
345                 ++elems;
346                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
347                     return false;
348                 }
349                 // no need break
350                 [[fallthrough]];
351 
352             case 0xc0:  // NOLINT(readability-magic-numbers)
353             case 0xd0:  // NOLINT(readability-magic-numbers)
354                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
355                 ++elems;
356                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
357                     return false;
358                 }
359                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360                 ++elems;
361                 break;
362             default:
363                 UNREACHABLE();
364                 break;
365         }
366     }
367     return true;
368 }
369 
UTF16Decode(uint16_t lead, uint16_t trail)370 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
371 {
372     ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
373            (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
374     uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
375     return cp;
376 }
377 
IsValidUTF8(const std::vector<uint8_t> &data)378 bool IsValidUTF8(const std::vector<uint8_t> &data)
379 {
380     uint32_t length = data.size();
381     switch (length) {
382         case UtfLength::ONE:
383             if (data.at(0) >= BIT_MASK_1) {
384                 return false;
385             }
386             break;
387         case UtfLength::TWO:
388             if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
389                 return false;
390             }
391             break;
392         case UtfLength::THREE:
393             if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
394                 return false;
395             }
396             break;
397         case UtfLength::FOUR:
398             if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
399                 return false;
400             }
401             break;
402         default:
403             UNREACHABLE();
404             break;
405     }
406 
407     for (uint32_t i = 1; i < length; i++) {
408         if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
409             return false;
410         }
411     }
412     return true;
413 }
414 
ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)415 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
416 {
417     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
418     // means that is a single code point, it needs to be represented by three UTF8 code.
419     if (d1 == 0 && d0 >= DECODE_LEAD_LOW && d0 <= DECODE_TRAIL_HIGH) {
420         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
421         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
422         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
423         return {UtfLength::THREE, {ch0, ch1, ch2}};
424     }
425 
426     if (d0 == 0) {
427         if (modify) {
428             // special case for \u0000 ==> C080 - 1100'0000 1000'0000
429             return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
430         }
431         // For print string, just skip '\u0000'
432         return {0, {0x00U}};
433     }
434     if (d0 <= UTF8_1B_MAX) {
435         return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
436     }
437     if (d0 <= UTF8_2B_MAX) {
438         auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
439         auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & MASK_6BIT));
440         return {UtfLength::TWO, {ch0, ch1}};
441     }
442     if (d0 < DECODE_LEAD_LOW || d0 > DECODE_LEAD_HIGH) {
443         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
444         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
445         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
446         return {UtfLength::THREE, {ch0, ch1, ch2}};
447     }
448     if (d1 < DECODE_TRAIL_LOW || d1 > DECODE_TRAIL_HIGH) {
449         // Bad sequence
450         UNREACHABLE();
451     }
452 
453     uint32_t codePoint = CombineTwoU16(d0, d1);
454 
455     auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
456     auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & MASK_6BIT) | MASK1);
457     auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & MASK_6BIT) | MASK1);
458     auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
459 
460     return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
461 }
462 
Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)463 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
464 {
465     size_t res = 1;  // zero byte
466     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
467     // means that is a single code point, it needs to be represented by three UTF8 code.
468     if (length == 1 && utf16[0] >= DECODE_LEAD_LOW &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
469         utf16[0] <= DECODE_TRAIL_HIGH) {               // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
470         res += UtfLength::THREE;
471         return res;
472     }
473 
474     for (uint32_t i = 0; i < length; ++i) {
475         if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
476             if (modify) {
477                 res += UtfLength::TWO;  // special case for U+0000 => C0 80
478             }
479         } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480             res += 1;
481         } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
482             res += UtfLength::TWO;
483             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
484         } else if (utf16[i] < DECODE_LEAD_LOW || utf16[i] > DECODE_LEAD_HIGH) {
485             res += UtfLength::THREE;
486         } else {
487             if (i < length - 1 &&
488                 utf16[i + 1] >= DECODE_TRAIL_LOW &&   // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
489                 utf16[i + 1] <= DECODE_TRAIL_HIGH) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
490                 res += UtfLength::FOUR;
491                 ++i;
492             } else {
493                 res += UtfLength::THREE;
494             }
495         }
496     }
497     return res;
498 }
499 
Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)500 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
501 {
502     return Utf16ToUtf8Size(mutf16, length, true);
503 }
504 
ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, size_t start, bool modify)505 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
506                                 size_t start, bool modify)
507 {
508     size_t utf8Pos = 0;
509     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
510         return 0;
511     }
512     size_t end = start + utf16Len;
513     for (size_t i = start; i < end; ++i) {
514         uint16_t next16Code = 0;
515         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
516         if ((i + 1) != end && IsAvailableNextUtf16Code(utf16In[i + 1])) {
517             next16Code = utf16In[i + 1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
518         }
519         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
520         Utf8Char ch = ConvertUtf16ToUtf8(utf16In[i], next16Code, modify);
521         if (utf8Pos + ch.n > utf8Len) {
522             break;
523         }
524         for (size_t c = 0; c < ch.n; ++c) {
525             utf8Out[utf8Pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
526         }
527         if (ch.n == UtfLength::FOUR) {  // Two UTF-16 chars are used
528             ++i;
529         }
530     }
531     return utf8Pos;
532 }
533 
ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)534 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
535 {
536     uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
537     if ((d0 & MASK1) == 0) {
538         return {d0, 1};
539     }
540 
541     uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
542     if ((d0 & MASK2) == 0) {
543         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), UtfLength::TWO};
544     }
545 
546     uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
547     if ((d0 & MASK3) == 0) {
548         return {((d0 & MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
549                 UtfLength::THREE};
550     }
551 
552     uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
553     uint32_t codePoint = ((d0 & MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & MASK_6BIT) << UtfOffset::TWELVE) |
554                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
555 
556     uint32_t pair = 0;
557     if (combine) {
558         uint32_t lead = ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD);
559         uint32_t tail = ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
560         pair = U16_GET_SUPPLEMENTARY(lead, tail);  // NOLINT(hicpp-signed-bitwise)
561     } else {
562         pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) << PAIR_ELEMENT_WIDTH;
563         pair |= ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
564     }
565 
566     return {pair, UtfLength::FOUR};
567 }
568 
Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)569 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
570 {
571     return MUtf8ToUtf16Size(utf8, utf8Len);
572 }
573 
ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, size_t start)574 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
575                                 size_t start)
576 {
577     return ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
578 }
579 
IsUTF16SurrogatePair(const uint16_t lead)580 bool IsUTF16SurrogatePair(const uint16_t lead)
581 {
582     return lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH;
583 }
584 
/**
 * Lookup table that maps every integer in [0..99] to the two UTF-16 codes of its decimal digits,
 * packed into one utf::BidigitsCode: the low 16 bits hold the tens digit, the high 16 bits hold
 * the ones digit.
 *
 * Example: 0  -> 0x00300030 ("00")
 *          1  -> 0x00310030 ("01")
 *          10 -> 0x00300031 ("10")
 *          ...
 *          99 -> 0x00390039 ("99")
 */
using BidigitsCode = uint32_t;
static constexpr size_t BIDIGITS_CODE_TAB_SIZE = 100U;

// Builds the table at compile time instead of spelling out all one hundred entries.
static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> MakeBidigitsCodeTab()
{
    constexpr BidigitsCode ZERO_CODE = 0x0030U;  // UTF-16 code of '0'
    constexpr size_t RADIX = 10U;
    constexpr BidigitsCode ONES_SHIFT = 16U;

    std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> tab {};
    for (size_t value = 0; value < BIDIGITS_CODE_TAB_SIZE; ++value) {
        const BidigitsCode tensCode = ZERO_CODE + static_cast<BidigitsCode>(value / RADIX);
        const BidigitsCode onesCode = ZERO_CODE + static_cast<BidigitsCode>(value % RADIX);
        tab[value] = (onesCode << ONES_SHIFT) | tensCode;
    }
    return tab;
}

static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> BIDIGITS_CODE_TAB = MakeBidigitsCodeTab();
610 
UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative)611 void UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative)
612 {
613     ASSERT(outUtf16Buf != nullptr && nDigits != 0);
614 
615     constexpr uint64_t POW10_1 = 10U;
616     constexpr uint64_t POW10_2 = 100U;
617 
618     Span<uint16_t> outSpan(outUtf16Buf, nDigits);
619     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
620     auto *out = reinterpret_cast<uint32_t *>(outUtf16Buf + nDigits);
621     int i = 0;
622     while (v >= POW10_2) {
623         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
624         out[--i] = BIDIGITS_CODE_TAB[v % POW10_2];
625         v /= POW10_2;
626     }
627     if (v >= POW10_1) {
628         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
629         out[--i] = BIDIGITS_CODE_TAB[v];
630     } else {
631         outSpan[negative ? 1U : 0] = v + '0';
632     }
633     if (negative) {
634         outSpan[0] = '-';
635     }
636 }
637 
// UTF-16 codes used by IsWhiteSpaceChar (named after their hexadecimal value).
static constexpr uint16_t C_SPACE = 0x0020;  // space
static constexpr uint16_t C_0009 = 0x0009;   // horizontal tab, first of the control whitespace run
static constexpr uint16_t C_000D = 0x000D;   // carriage return, last of the control whitespace run
static constexpr uint16_t C_000E = 0x000E;   // first code of the common non-whitespace range
static constexpr uint16_t C_00A0 = 0x00A0;   // no-break space
static constexpr uint16_t C_1680 = 0x1680;   // Ogham space mark
static constexpr uint16_t C_2000 = 0x2000;   // en quad, first of the typographic space run
static constexpr uint16_t C_200A = 0x200A;   // hair space, last of the typographic space run
static constexpr uint16_t C_2028 = 0x2028;   // line separator
static constexpr uint16_t C_2029 = 0x2029;   // paragraph separator
static constexpr uint16_t C_202F = 0x202F;   // narrow no-break space
static constexpr uint16_t C_205F = 0x205F;   // medium mathematical space
static constexpr uint16_t C_3000 = 0x3000;   // ideographic space
static constexpr uint16_t C_FEFF = 0xFEFF;   // byte order mark

/**
 * Tells whether the UTF-16 code `c` is treated as whitespace.
 *
 * Whitespace codes are: 0x0009-0x000D, 0x0020, 0x00A0, 0x1680, 0x2000-0x200A, 0x2028, 0x2029,
 * 0x202F, 0x205F, 0x3000 and 0xFEFF.
 *
 * @param c  UTF-16 code to classify
 * @return true for the codes listed above, false for everything else
 */
bool IsWhiteSpaceChar(uint16_t c)
{
    if (c == C_SPACE) {
        return true;
    }
    // [0x000E, 0x009F] -- common non-whitespace characters, rejected before any further test
    if (C_000E <= c && c < C_00A0) {
        return false;
    }

    // 0x0009 .. 0x000D -- tab, line feed, vertical tab, formfeed, carriage return
    const bool isControlSpace = C_0009 <= c && c <= C_000D;
    // 0x2000 .. 0x200A -- en quad through hair space
    const bool isTypographicSpace = C_2000 <= c && c <= C_200A;
    if (isControlSpace || isTypographicSpace) {
        return true;
    }

    switch (c) {
        case C_00A0:  // no-break space
        case C_1680:  // Ogham space mark
        case C_2028:  // line separator
        case C_2029:  // paragraph separator
        case C_202F:  // narrow no-break space
        case C_205F:  // medium mathematical space
        case C_3000:  // ideographic space
        case C_FEFF:  // byte order mark
            return true;
        default:
            return false;
    }
}
724 
725 }  // namespace ark::utf
726