1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf.h"
17
18 #include <cstddef>
19 #include <cstring>
20
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24
// Constant part of the surrogate-pair decoding formula
//     codePoint = ((lead - 0xd800) << 10) + (trail - 0xdc00) + 0x10000
// folded into a single value, so that U16_GET_SUPPLEMENTARY needs only one shift, one addition
// and one subtraction.
// NOLINTNEXTLINE(hicpp-signed-bitwise)
static constexpr uint32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;
// Combines `lead` and `trail` into one 32-bit value according to the formula above.
// Both arguments are cast to uint32_t first; their ranges are not validated here.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<uint32_t>(lead) << 10UL) + static_cast<uint32_t>(trail) - U16_SURROGATE_OFFSET)
30
31 namespace ark::utf {
32
33 /*
34 * MUtf-8
35 *
36 * U+0000 => C0 80
37 *
38 * N Bits for First Last Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6
39 * code point code point code point
40 * 1 7 U+0000 U+007F 0xxxxxxx
41 * 2 11 U+0080 U+07FF 110xxxxx 10xxxxxx
42 * 3 16 U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
43 * 6 21 U+10000 U+10FFFF 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx
 * For U+10000 -- U+10FFFF the six bytes encode (value - 0x10000) as a surrogate pair, three bytes per surrogate.
45 */
46
47 /*
48 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
49 * In case of invalid sequence return first byte of it.
50 */
ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes)51 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes)
52 {
53 // NOTE(d.kovalneko): make the function safe
54 Span<const uint8_t> sp(data, maxBytes);
55 uint8_t d0 = sp[0];
56 if ((d0 & MASK1) == 0) {
57 return {d0, 1};
58 }
59
60 if (maxBytes < CONST_2) {
61 return {d0, 1};
62 }
63 uint8_t d1 = sp[1];
64 if ((d0 & MASK2) == 0) {
65 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
66 }
67
68 if (maxBytes < CONST_3) {
69 return {d0, 1};
70 }
71 uint8_t d2 = sp[CONST_2];
72 if ((d0 & MASK3) == 0) {
73 return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
74 CONST_3};
75 }
76
77 if (maxBytes < CONST_4) {
78 return {d0, 1};
79 }
80 uint8_t d3 = sp[CONST_3];
81 uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
82 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
83
84 uint32_t pair = 0;
85 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
86 pair <<= PAIR_ELEMENT_WIDTH;
87 pair |= (codePoint & MASK_10BIT) + U16_TAIL;
88
89 return {pair, CONST_4};
90 }
91
CombineTwoU16(uint16_t d0, uint16_t d1)92 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
93 {
94 uint32_t codePoint = d0 - DECODE_LEAD_LOW;
95 codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
96 codePoint |= d1 - DECODE_TRAIL_LOW; // NOLINT(hicpp-signed-bitwise
97 codePoint += DECODE_SECOND_FACTOR;
98 return codePoint;
99 }
100
IsMUtf8OnlySingleBytes(const uint8_t *mutf8In)101 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8In)
102 {
103 while (*mutf8In != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104 if (*mutf8In >= MASK1) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
105 return false;
106 }
107 mutf8In += 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
108 }
109 return true;
110 }
111
ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len, size_t mutf8Len, size_t start)112 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len, size_t mutf8Len,
113 size_t start)
114 {
115 return ConvertRegionUtf16ToUtf8(utf16In, mutf8Out, utf16Len, mutf8Len, start, true);
116 }
117
ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out)118 void ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out)
119 {
120 size_t inPos = 0;
121 while (inPos < mutf8Len) {
122 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
123 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
124
125 if (p_hi != 0) {
126 *utf16Out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
127 }
128 *utf16Out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
129
130 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
131 inPos += nbytes;
132 }
133 }
134
ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)135 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len, size_t utf16Len,
136 size_t start)
137 {
138 size_t inPos = 0;
139 size_t outPos = 0;
140 while (inPos < mutf8Len) {
141 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
142 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
143
144 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145 inPos += nbytes;
146 if (start > 0) {
147 start -= nbytes;
148 continue;
149 }
150
151 if (p_hi != 0) {
152 if (outPos++ >= utf16Len - 1) { // check for place for two uint16
153 --outPos;
154 break;
155 }
156 *utf16Out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
157 }
158 if (outPos++ >= utf16Len) {
159 --outPos;
160 break;
161 }
162 *utf16Out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
163 }
164 return outPos;
165 }
166
CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82)167 int CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82)
168 {
169 uint32_t c1;
170 uint32_t c2;
171 uint32_t n1;
172 uint32_t n2;
173
174 do {
175 c1 = *mutf81;
176 c2 = *mutf82;
177
178 if (c1 == 0 && c2 == 0) {
179 return 0;
180 }
181
182 if (c1 == 0 && c2 != 0) {
183 return -1;
184 }
185
186 if (c1 != 0 && c2 == 0) {
187 return 1;
188 }
189
190 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf81);
191 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf82);
192
193 mutf81 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194 mutf82 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195 } while (c1 == c2);
196
197 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
198 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
199
200 auto result = static_cast<int>(c1p1 - c2p1);
201 if (result != 0) {
202 return result;
203 }
204
205 return c1p2 - c2p2;
206 }
207
208 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82, size_t utf82Length)209 int CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82, size_t utf82Length)
210 {
211 uint32_t c1;
212 uint32_t c2;
213 uint32_t n1;
214 uint32_t n2;
215
216 uint32_t utf81Index = 0;
217 uint32_t utf82Index = 0;
218
219 do {
220 if (utf81Index == utf81Length && utf82Index == utf82Length) {
221 return 0;
222 }
223
224 if (utf81Index == utf81Length && utf82Index < utf82Length) {
225 return -1;
226 }
227
228 if (utf81Index < utf81Length && utf82Index == utf82Length) {
229 return 1;
230 }
231
232 c1 = *utf81;
233 c2 = *utf82;
234
235 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf81);
236 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf82);
237
238 utf81 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
239 utf82 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240 utf81Index += n1;
241 utf82Index += n2;
242 } while (c1 == c2);
243
244 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
245 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
246
247 auto result = static_cast<int>(c1p1 - c2p1);
248 if (result != 0) {
249 return result;
250 }
251
252 return c1p2 - c2p2;
253 }
254
Mutf8Size(const uint8_t *mutf8)255 size_t Mutf8Size(const uint8_t *mutf8)
256 {
257 return strlen(Mutf8AsCString(mutf8));
258 }
259
MUtf8ToUtf16Size(const uint8_t *mutf8)260 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
261 {
262 // NOTE(d.kovalenko): make it faster
263 size_t res = 0;
264 while (*mutf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
266 res += pair > MAX_U16 ? CONST_2 : 1;
267 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
268 }
269 return res;
270 }
271
MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len)272 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len)
273 {
274 size_t pos = 0;
275 size_t res = 0;
276 while (pos != mutf8Len) {
277 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
278 if (nbytes == 0) {
279 nbytes = 1;
280 }
281 res += pair > MAX_U16 ? CONST_2 : 1;
282 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
283 pos += nbytes;
284 }
285 return res;
286 }
287
IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82)288 bool IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82)
289 {
290 if (utf81.size() != utf82.size()) {
291 return false;
292 }
293
294 return memcmp(utf81.data(), utf82.data(), utf81.size()) == 0;
295 }
296
IsEqual(const uint8_t *mutf81, const uint8_t *mutf82)297 bool IsEqual(const uint8_t *mutf81, const uint8_t *mutf82)
298 {
299 return strcmp(Mutf8AsCString(mutf81), Mutf8AsCString(mutf82)) == 0;
300 }
301
IsValidModifiedUTF8(const uint8_t *elems)302 bool IsValidModifiedUTF8(const uint8_t *elems)
303 {
304 ASSERT(elems);
305
306 while (*elems != '\0') {
307 // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
308 switch (*elems & 0xf0) {
309 case 0x00:
310 case 0x10: // NOLINT(readability-magic-numbers)
311 case 0x20: // NOLINT(readability-magic-numbers)
312 case 0x30: // NOLINT(readability-magic-numbers)
313 case 0x40: // NOLINT(readability-magic-numbers)
314 case 0x50: // NOLINT(readability-magic-numbers)
315 case 0x60: // NOLINT(readability-magic-numbers)
316 case 0x70: // NOLINT(readability-magic-numbers)
317 // pattern 0xxx
318 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
319 ++elems;
320 break;
321 case 0x80: // NOLINT(readability-magic-numbers)
322 case 0x90: // NOLINT(readability-magic-numbers)
323 case 0xa0: // NOLINT(readability-magic-numbers)
324 case 0xb0: // NOLINT(readability-magic-numbers)
325 // pattern 10xx is illegal start
326 return false;
327
328 case 0xf0: // NOLINT(readability-magic-numbers)
329 // pattern 1111 0xxx starts four byte section
330 if ((*elems & 0x08) == 0) { // NOLINT(hicpp-signed-bitwise)
331 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
332 ++elems;
333 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
334 return false;
335 }
336 } else {
337 return false;
338 }
339 // no need break
340 [[fallthrough]];
341
342 case 0xe0: // NOLINT(readability-magic-numbers)
343 // pattern 1110
344 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
345 ++elems;
346 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
347 return false;
348 }
349 // no need break
350 [[fallthrough]];
351
352 case 0xc0: // NOLINT(readability-magic-numbers)
353 case 0xd0: // NOLINT(readability-magic-numbers)
354 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
355 ++elems;
356 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
357 return false;
358 }
359 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360 ++elems;
361 break;
362 default:
363 UNREACHABLE();
364 break;
365 }
366 }
367 return true;
368 }
369
UTF16Decode(uint16_t lead, uint16_t trail)370 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
371 {
372 ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
373 (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
374 uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
375 return cp;
376 }
377
IsValidUTF8(const std::vector<uint8_t> &data)378 bool IsValidUTF8(const std::vector<uint8_t> &data)
379 {
380 uint32_t length = data.size();
381 switch (length) {
382 case UtfLength::ONE:
383 if (data.at(0) >= BIT_MASK_1) {
384 return false;
385 }
386 break;
387 case UtfLength::TWO:
388 if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
389 return false;
390 }
391 break;
392 case UtfLength::THREE:
393 if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
394 return false;
395 }
396 break;
397 case UtfLength::FOUR:
398 if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
399 return false;
400 }
401 break;
402 default:
403 UNREACHABLE();
404 break;
405 }
406
407 for (uint32_t i = 1; i < length; i++) {
408 if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
409 return false;
410 }
411 }
412 return true;
413 }
414
ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)415 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
416 {
417 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
418 // means that is a single code point, it needs to be represented by three UTF8 code.
419 if (d1 == 0 && d0 >= DECODE_LEAD_LOW && d0 <= DECODE_TRAIL_HIGH) {
420 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
421 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
422 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
423 return {UtfLength::THREE, {ch0, ch1, ch2}};
424 }
425
426 if (d0 == 0) {
427 if (modify) {
428 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
429 return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
430 }
431 // For print string, just skip '\u0000'
432 return {0, {0x00U}};
433 }
434 if (d0 <= UTF8_1B_MAX) {
435 return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
436 }
437 if (d0 <= UTF8_2B_MAX) {
438 auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
439 auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & MASK_6BIT));
440 return {UtfLength::TWO, {ch0, ch1}};
441 }
442 if (d0 < DECODE_LEAD_LOW || d0 > DECODE_LEAD_HIGH) {
443 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
444 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
445 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
446 return {UtfLength::THREE, {ch0, ch1, ch2}};
447 }
448 if (d1 < DECODE_TRAIL_LOW || d1 > DECODE_TRAIL_HIGH) {
449 // Bad sequence
450 UNREACHABLE();
451 }
452
453 uint32_t codePoint = CombineTwoU16(d0, d1);
454
455 auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
456 auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & MASK_6BIT) | MASK1);
457 auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & MASK_6BIT) | MASK1);
458 auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
459
460 return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
461 }
462
Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)463 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
464 {
465 size_t res = 1; // zero byte
466 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
467 // means that is a single code point, it needs to be represented by three UTF8 code.
468 if (length == 1 && utf16[0] >= DECODE_LEAD_LOW && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
469 utf16[0] <= DECODE_TRAIL_HIGH) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
470 res += UtfLength::THREE;
471 return res;
472 }
473
474 for (uint32_t i = 0; i < length; ++i) {
475 if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
476 if (modify) {
477 res += UtfLength::TWO; // special case for U+0000 => C0 80
478 }
479 } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480 res += 1;
481 } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
482 res += UtfLength::TWO;
483 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
484 } else if (utf16[i] < DECODE_LEAD_LOW || utf16[i] > DECODE_LEAD_HIGH) {
485 res += UtfLength::THREE;
486 } else {
487 if (i < length - 1 &&
488 utf16[i + 1] >= DECODE_TRAIL_LOW && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
489 utf16[i + 1] <= DECODE_TRAIL_HIGH) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
490 res += UtfLength::FOUR;
491 ++i;
492 } else {
493 res += UtfLength::THREE;
494 }
495 }
496 }
497 return res;
498 }
499
Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)500 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
501 {
502 return Utf16ToUtf8Size(mutf16, length, true);
503 }
504
ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, size_t start, bool modify)505 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
506 size_t start, bool modify)
507 {
508 size_t utf8Pos = 0;
509 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
510 return 0;
511 }
512 size_t end = start + utf16Len;
513 for (size_t i = start; i < end; ++i) {
514 uint16_t next16Code = 0;
515 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
516 if ((i + 1) != end && IsAvailableNextUtf16Code(utf16In[i + 1])) {
517 next16Code = utf16In[i + 1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
518 }
519 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
520 Utf8Char ch = ConvertUtf16ToUtf8(utf16In[i], next16Code, modify);
521 if (utf8Pos + ch.n > utf8Len) {
522 break;
523 }
524 for (size_t c = 0; c < ch.n; ++c) {
525 utf8Out[utf8Pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
526 }
527 if (ch.n == UtfLength::FOUR) { // Two UTF-16 chars are used
528 ++i;
529 }
530 }
531 return utf8Pos;
532 }
533
ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)534 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
535 {
536 uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
537 if ((d0 & MASK1) == 0) {
538 return {d0, 1};
539 }
540
541 uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
542 if ((d0 & MASK2) == 0) {
543 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), UtfLength::TWO};
544 }
545
546 uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
547 if ((d0 & MASK3) == 0) {
548 return {((d0 & MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
549 UtfLength::THREE};
550 }
551
552 uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
553 uint32_t codePoint = ((d0 & MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & MASK_6BIT) << UtfOffset::TWELVE) |
554 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
555
556 uint32_t pair = 0;
557 if (combine) {
558 uint32_t lead = ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD);
559 uint32_t tail = ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
560 pair = U16_GET_SUPPLEMENTARY(lead, tail); // NOLINT(hicpp-signed-bitwise)
561 } else {
562 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) << PAIR_ELEMENT_WIDTH;
563 pair |= ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
564 }
565
566 return {pair, UtfLength::FOUR};
567 }
568
Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)569 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
570 {
571 return MUtf8ToUtf16Size(utf8, utf8Len);
572 }
573
ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, size_t start)574 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
575 size_t start)
576 {
577 return ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
578 }
579
IsUTF16SurrogatePair(const uint16_t lead)580 bool IsUTF16SurrogatePair(const uint16_t lead)
581 {
582 return lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH;
583 }
584
/**
 * The table below is to translate integer numbers from [0..99] range to pairs of corresponding utf16 codes.
 * The pairs are packed into utf::BidigitsCode type: the low 16 bits hold the utf16 code of the tens digit,
 * the high 16 bits hold the utf16 code of the units digit.
 *
 * Example: 0 -> 0x00300030 ("00")
 *          1 -> 0x00310030 ("01")
 *          ...
 *          99 -> 0x00390039 ("99")
 */
using BidigitsCode = uint32_t;
// Number of table entries: one per value in [0..99].
static constexpr size_t BIDIGITS_CODE_TAB_SIZE = 100U;

// Indexed by the two-digit value itself.
static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> BIDIGITS_CODE_TAB = {
    0x00300030, 0x00310030, 0x00320030, 0x00330030, 0x00340030, 0x00350030, 0x00360030, 0x00370030, 0x00380030,
    0x00390030, 0x00300031, 0x00310031, 0x00320031, 0x00330031, 0x00340031, 0x00350031, 0x00360031, 0x00370031,
    0x00380031, 0x00390031, 0x00300032, 0x00310032, 0x00320032, 0x00330032, 0x00340032, 0x00350032, 0x00360032,
    0x00370032, 0x00380032, 0x00390032, 0x00300033, 0x00310033, 0x00320033, 0x00330033, 0x00340033, 0x00350033,
    0x00360033, 0x00370033, 0x00380033, 0x00390033, 0x00300034, 0x00310034, 0x00320034, 0x00330034, 0x00340034,
    0x00350034, 0x00360034, 0x00370034, 0x00380034, 0x00390034, 0x00300035, 0x00310035, 0x00320035, 0x00330035,
    0x00340035, 0x00350035, 0x00360035, 0x00370035, 0x00380035, 0x00390035, 0x00300036, 0x00310036, 0x00320036,
    0x00330036, 0x00340036, 0x00350036, 0x00360036, 0x00370036, 0x00380036, 0x00390036, 0x00300037, 0x00310037,
    0x00320037, 0x00330037, 0x00340037, 0x00350037, 0x00360037, 0x00370037, 0x00380037, 0x00390037, 0x00300038,
    0x00310038, 0x00320038, 0x00330038, 0x00340038, 0x00350038, 0x00360038, 0x00370038, 0x00380038, 0x00390038,
    0x00300039, 0x00310039, 0x00320039, 0x00330039, 0x00340039, 0x00350039, 0x00360039, 0x00370039, 0x00380039,
    0x00390039};
610
UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative)611 void UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative)
612 {
613 ASSERT(outUtf16Buf != nullptr && nDigits != 0);
614
615 constexpr uint64_t POW10_1 = 10U;
616 constexpr uint64_t POW10_2 = 100U;
617
618 Span<uint16_t> outSpan(outUtf16Buf, nDigits);
619 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
620 auto *out = reinterpret_cast<uint32_t *>(outUtf16Buf + nDigits);
621 int i = 0;
622 while (v >= POW10_2) {
623 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
624 out[--i] = BIDIGITS_CODE_TAB[v % POW10_2];
625 v /= POW10_2;
626 }
627 if (v >= POW10_1) {
628 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
629 out[--i] = BIDIGITS_CODE_TAB[v];
630 } else {
631 outSpan[negative ? 1U : 0] = v + '0';
632 }
633 if (negative) {
634 outSpan[0] = '-';
635 }
636 }
637
// Code points referenced by IsWhiteSpaceChar; each name spells the value it holds.
static constexpr uint16_t C_SPACE = 0x0020;
static constexpr uint16_t C_0009 = 0x0009;
static constexpr uint16_t C_000D = 0x000D;
static constexpr uint16_t C_000E = 0x000E;
static constexpr uint16_t C_00A0 = 0x00A0;
static constexpr uint16_t C_1680 = 0x1680;
static constexpr uint16_t C_2000 = 0x2000;
static constexpr uint16_t C_200A = 0x200A;
static constexpr uint16_t C_2028 = 0x2028;
static constexpr uint16_t C_2029 = 0x2029;
static constexpr uint16_t C_202F = 0x202F;
static constexpr uint16_t C_205F = 0x205F;
static constexpr uint16_t C_3000 = 0x3000;
static constexpr uint16_t C_FEFF = 0xFEFF;

/*
 * Return true if the UTF-16 code unit `c` is one of:
 *   0x0009 - 0x000D  tab, line feed, vertical tab, formfeed, carriage return
 *   0x0020           space
 *   0x00A0           no-break space
 *   0x1680           Ogham space mark
 *   0x2000 - 0x200A  en quad .. hair space
 *   0x2028, 0x2029   line separator, paragraph separator
 *   0x202F           narrow no-break space
 *   0x205F           medium mathematical space
 *   0x3000           ideographic space
 *   0xFEFF           byte order mark
 * The most common cases (space, and the non-whitespace range 0x000E - 0x009F) are tested first.
 */
bool IsWhiteSpaceChar(uint16_t c)
{
    if (c == C_SPACE) {
        return true;
    }
    // [0x000E, 0x009F] -- common non-whitespace characters
    if (c >= C_000E && c < C_00A0) {
        return false;
    }
    // Below 0x000E only 0x0009 - 0x000D are whitespace.
    if (c <= C_000D) {
        return c >= C_0009;
    }
    // From here on c >= 0x00A0.
    // 0x2000 - 0x200A: en quad .. hair space
    if (c >= C_2000 && c <= C_200A) {
        return true;
    }
    switch (c) {
        case C_00A0:  // no-break space
        case C_1680:  // Ogham space mark
        case C_2028:  // line separator
        case C_2029:  // paragraph separator
        case C_202F:  // narrow no-break space
        case C_205F:  // medium mathematical space
        case C_3000:  // ideographic space
        case C_FEFF:  // byte order mark
            return true;
        default:
            return false;
    }
}
724
725 } // namespace ark::utf
726