1 /**
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf.h"
17
18 #include <cstddef>
19 #include <cstring>
20
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24
25 namespace panda::utf {
26
// Largest value representable by a single 16-bit UTF-16 code unit. Decoded values above it are
// counted as two UTF-16 code units by the MUtf8ToUtf16Size functions below.
constexpr size_t MAX_U16 = 0xffff;
// Named small integers used throughout this file as byte counts, byte indices,
// shift amounts and multipliers of DATA_WIDTH (presumably to avoid magic-number lint warnings).
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;
33
// Result of encoding one UTF-16 code unit (or surrogate pair) as MUTF-8.
struct MUtf8Char {
    // Number of leading bytes of `ch` that hold the encoded sequence.
    size_t n;
    // Storage for the encoded bytes; only the first `n` entries are meaningful.
    std::array<uint8_t, CONST_4> ch;
};
38
/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for    First       Last        Byte 1    Byte 2    Byte 3    Byte 4    Byte 5    Byte 6
 *    code point  code point  code point
 * 1  7           U+0000      U+007F      0xxxxxxx
 * 2  11          U+0080      U+07FF      110xxxxx  10xxxxxx
 * 3  16          U+0800      U+FFFF      1110xxxx  10xxxxxx  10xxxxxx
 * 6  21          U+10000     U+10FFFF    11101101  1010xxxx  10xxxxxx  11101101  1011xxxx  10xxxxxx
 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
 *
 * NOTE(review): ConvertUtf16ToMUtf8 in this file emits a 4-byte sequence for a surrogate pair rather than
 * the 6-byte form shown in the last row -- confirm which form is intended.
 */
52
53 /*
54 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
55 * In case of invalid sequence return first byte of it.
56 */
ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)57 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)
58 {
59 // TODO(d.kovalneko): make the function safe
60 Span<const uint8_t> sp(data, max_bytes);
61 uint8_t d0 = sp[0];
62 if ((d0 & MASK1) == 0) {
63 return {d0, 1};
64 }
65
66 if (max_bytes < CONST_2) {
67 return {d0, 1};
68 }
69 uint8_t d1 = sp[1];
70 if ((d0 & MASK2) == 0) {
71 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
72 }
73
74 if (max_bytes < CONST_3) {
75 return {d0, 1};
76 }
77 uint8_t d2 = sp[CONST_2];
78 if ((d0 & MASK3) == 0) {
79 return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
80 CONST_3};
81 }
82
83 if (max_bytes < CONST_4) {
84 return {d0, 1};
85 }
86 uint8_t d3 = sp[CONST_3];
87 uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
88 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
89
90 uint32_t pair = 0;
91 pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
92 pair <<= PAIR_ELEMENT_WIDTH;
93 pair |= (code_point & MASK_10BIT) + U16_TAIL;
94
95 return {pair, CONST_4};
96 }
97
CombineTwoU16(uint16_t d0, uint16_t d1)98 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
99 {
100 uint32_t codePoint = d0 - HI_SURROGATE_MIN;
101 codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
102 codePoint |= d1 - LO_SURROGATE_MIN;
103 codePoint += LO_SUPPLEMENTS_MIN;
104 return codePoint;
105 }
106
ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)107 constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
108 {
109 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
110 // means that is a single code point, it needs to be represented by three MUTF8 code.
111 if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
112 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
113 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
114 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
115 return {CONST_3, {ch0, ch1, ch2}};
116 }
117
118 if (d0 == 0) {
119 return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}};
120 }
121 if (d0 <= MUTF8_1B_MAX) {
122 return {1, {static_cast<uint8_t>(d0)}};
123 }
124 if (d0 <= MUTF8_2B_MAX) {
125 auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6));
126 auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT));
127 return {CONST_2, {ch0, ch1}};
128 }
129 if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
130 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
131 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
132 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
133 return {CONST_3, {ch0, ch1, ch2}};
134 }
135
136 uint32_t codePoint = CombineTwoU16(d0, d1);
137
138 auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST);
139 auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1);
140 auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1);
141 auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
142
143 return {CONST_4, {ch0, ch1, ch2, ch3}};
144 }
145
IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)146 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)
147 {
148 while (*mutf8_in != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
149 if (*mutf8_in >= MASK1) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
150 return false;
151 }
152 mutf8_in += 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
153 }
154 return true;
155 }
156
ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, size_t start)157 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
158 size_t start)
159 {
160 size_t mutf8_pos = 0;
161 if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) {
162 return 0;
163 }
164 size_t end = start + utf16_len;
165 for (size_t i = start; i < end; ++i) {
166 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
167 uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0;
168 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
169 MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code);
170 if (mutf8_pos + ch.n > mutf8_len) {
171 break;
172 }
173 for (size_t c = 0; c < ch.n; ++c) {
174 mutf8_out[mutf8_pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
175 }
176 if (ch.n == CONST_4) { // Two UTF-16 chars are used
177 ++i;
178 }
179 }
180 return mutf8_pos;
181 }
182
ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)183 void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)
184 {
185 size_t in_pos = 0;
186 while (in_pos < mutf8_len) {
187 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
188 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
189
190 if (p_hi != 0) {
191 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
192 }
193 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194
195 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
196 in_pos += nbytes;
197 }
198 }
199
ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, size_t start)200 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
201 size_t start)
202 {
203 size_t in_pos = 0;
204 size_t out_pos = 0;
205 while (in_pos < mutf8_len) {
206 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
207 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
208
209 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
210 in_pos += nbytes;
211 if (start > 0) {
212 start -= nbytes;
213 continue;
214 }
215
216 if (p_hi != 0) {
217 ASSERT(utf16_len >= 1);
218 if (out_pos++ >= utf16_len - 1) { // check for place for two uint16
219 --out_pos;
220 break;
221 }
222 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
223 }
224 if (out_pos++ >= utf16_len) {
225 --out_pos;
226 break;
227 }
228 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
229 }
230 return out_pos;
231 }
232
CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)233 int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
234 {
235 uint32_t c1;
236 uint32_t c2;
237 uint32_t n1;
238 uint32_t n2;
239
240 do {
241 c1 = *mutf8_1;
242 c2 = *mutf8_2;
243
244 if (c1 == 0 && c2 == 0) {
245 return 0;
246 }
247
248 if (c1 == 0 && c2 != 0) {
249 return -1;
250 }
251
252 if (c1 != 0 && c2 == 0) {
253 return 1;
254 }
255
256 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1);
257 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2);
258
259 mutf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
260 mutf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
261 } while (c1 == c2);
262
263 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
264 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
265
266 auto result = static_cast<int>(c1p1 - c2p1);
267 if (result != 0) {
268 return result;
269 }
270
271 return c1p2 - c2p2;
272 }
273
274 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)275 int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)
276 {
277 uint32_t c1;
278 uint32_t c2;
279 uint32_t n1;
280 uint32_t n2;
281
282 uint32_t utf8_1_index = 0;
283 uint32_t utf8_2_index = 0;
284
285 do {
286 if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) {
287 return 0;
288 }
289
290 if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) {
291 return -1;
292 }
293
294 if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) {
295 return 1;
296 }
297
298 c1 = *utf8_1;
299 c2 = *utf8_2;
300
301 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1);
302 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2);
303
304 utf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
305 utf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
306 utf8_1_index += n1;
307 utf8_2_index += n2;
308 } while (c1 == c2);
309
310 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
311 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
312
313 auto result = static_cast<int>(c1p1 - c2p1);
314 if (result != 0) {
315 return result;
316 }
317
318 return c1p2 - c2p2;
319 }
320
Mutf8Size(const uint8_t *mutf8)321 size_t Mutf8Size(const uint8_t *mutf8)
322 {
323 return strlen(Mutf8AsCString(mutf8));
324 }
325
MUtf8ToUtf16Size(const uint8_t *mutf8)326 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
327 {
328 // TODO(d.kovalenko): make it faster
329 size_t res = 0;
330 while (*mutf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
331 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
332 res += pair > MAX_U16 ? CONST_2 : 1;
333 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
334 }
335 return res;
336 }
337
MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)338 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)
339 {
340 size_t pos = 0;
341 size_t res = 0;
342 while (pos != mutf8_len) {
343 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos);
344 if (nbytes == 0) {
345 nbytes = 1;
346 }
347 res += pair > MAX_U16 ? CONST_2 : 1;
348 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
349 pos += nbytes;
350 }
351 return res;
352 }
353
Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)354 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
355 {
356 size_t res = 1; // zero byte
357 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
358 // means that is a single code point, it needs to be represented by three MUTF8 code.
359 if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360 mutf16[0] <= LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
361 res += CONST_3;
362 return res;
363 }
364
365 for (uint32_t i = 0; i < length; ++i) {
366 // NOLINTNEXTLINE(bugprone-branch-clone)
367 if (mutf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
368 res += CONST_2; // special case for U+0000 => C0 80
369 } else if (mutf16[i] <= MUTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
370 res += 1;
371 } else if (mutf16[i] <= MUTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
372 res += CONST_2;
373 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
374 } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) {
375 res += CONST_3;
376 } else {
377 res += CONST_4;
378 ++i;
379 }
380 }
381 return res;
382 }
383
IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)384 bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)
385 {
386 if (utf8_1.size() != utf8_2.size()) {
387 return false;
388 }
389
390 return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0;
391 }
392
IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)393 bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
394 {
395 return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0;
396 }
397
IsValidModifiedUTF8(const uint8_t *elems)398 bool IsValidModifiedUTF8(const uint8_t *elems)
399 {
400 ASSERT(elems);
401
402 while (*elems != '\0') {
403 // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
404 switch (*elems & 0xf0) {
405 case 0x00:
406 case 0x10: // NOLINT(readability-magic-numbers)
407 case 0x20: // NOLINT(readability-magic-numbers)
408 case 0x30: // NOLINT(readability-magic-numbers)
409 case 0x40: // NOLINT(readability-magic-numbers)
410 case 0x50: // NOLINT(readability-magic-numbers)
411 case 0x60: // NOLINT(readability-magic-numbers)
412 case 0x70: // NOLINT(readability-magic-numbers)
413 // pattern 0xxx
414 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
415 ++elems;
416 break;
417 case 0x80: // NOLINT(readability-magic-numbers)
418 case 0x90: // NOLINT(readability-magic-numbers)
419 case 0xa0: // NOLINT(readability-magic-numbers)
420 case 0xb0: // NOLINT(readability-magic-numbers)
421 // pattern 10xx is illegal start
422 return false;
423
424 case 0xf0: // NOLINT(readability-magic-numbers)
425 // pattern 1111 0xxx starts four byte section
426 if ((*elems & 0x08) == 0) { // NOLINT(hicpp-signed-bitwise)
427 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
428 ++elems;
429 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
430 return false;
431 }
432 } else {
433 return false;
434 }
435 // no need break
436 [[fallthrough]];
437
438 case 0xe0: // NOLINT(readability-magic-numbers)
439 // pattern 1110
440 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
441 ++elems;
442 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
443 return false;
444 }
445 // no need break
446 [[fallthrough]];
447
448 case 0xc0: // NOLINT(readability-magic-numbers)
449 case 0xd0: // NOLINT(readability-magic-numbers)
450 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
451 ++elems;
452 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
453 return false;
454 }
455 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
456 ++elems;
457 break;
458 default:
459 break;
460 }
461 }
462 return true;
463 }
464
465 } // namespace panda::utf
466