1/**
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "utf.h"
17
18#include <cstddef>
19#include <cstring>
20
21#include <limits>
22#include <tuple>
23#include <utility>
24
25namespace panda::utf {
26
// Largest value that fits into a single 16-bit UTF-16 code unit.
constexpr size_t MAX_U16 = std::numeric_limits<uint16_t>::max();

// Small named integers used throughout this file as byte counts, indices and shift amounts.
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;

// Result of encoding one code point: the first `n` entries of `ch` hold the encoded bytes.
struct MUtf8Char {
    // Number of meaningful bytes stored in `ch`.
    size_t n;
    // Storage for the encoded bytes; sized for the longest sequence produced in this file.
    std::array<uint8_t, CONST_4> ch;
};
38
39/*
40 * MUtf-8
41 *
42 * U+0000 => C0 80
43 *
44 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
45 *    code point   code point   code point
46 * 1  7            U+0000       U+007F      0xxxxxxx
47 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
48 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
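 * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
 * For U+10000 -- U+10FFFF the 6-byte form encodes (value - 0x10000).
 * NOTE: ConvertUtf16ToMUtf8 in this file does not emit the 6-byte form; for a paired surrogate it
 * writes a 4-byte sequence whose lead byte is built from MUTF8_4B_FIRST.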
51 */
52
53/*
54 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
55 * In case of invalid sequence return first byte of it.
56 */
57std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)
58{
59    // TODO(d.kovalneko): make the function safe
60    Span<const uint8_t> sp(data, max_bytes);
61    uint8_t d0 = sp[0];
62    if ((d0 & MASK1) == 0) {
63        return {d0, 1};
64    }
65
66    if (max_bytes < CONST_2) {
67        return {d0, 1};
68    }
69    uint8_t d1 = sp[1];
70    if ((d0 & MASK2) == 0) {
71        return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
72    }
73
74    if (max_bytes < CONST_3) {
75        return {d0, 1};
76    }
77    uint8_t d2 = sp[CONST_2];
78    if ((d0 & MASK3) == 0) {
79        return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
80                CONST_3};
81    }
82
83    if (max_bytes < CONST_4) {
84        return {d0, 1};
85    }
86    uint8_t d3 = sp[CONST_3];
87    uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
88                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
89
90    uint32_t pair = 0;
91    pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
92    pair <<= PAIR_ELEMENT_WIDTH;
93    pair |= (code_point & MASK_10BIT) + U16_TAIL;
94
95    return {pair, CONST_4};
96}
97
98static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
99{
100    uint32_t codePoint = d0 - HI_SURROGATE_MIN;
101    codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
102    codePoint |= d1 - LO_SURROGATE_MIN;
103    codePoint += LO_SUPPLEMENTS_MIN;
104    return codePoint;
105}
106
107constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
108{
109    // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
110    // means that is a single code point, it needs to be represented by three MUTF8 code.
111    if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
112        auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
113        auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
114        auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
115        return {CONST_3, {ch0, ch1, ch2}};
116    }
117
118    if (d0 == 0) {
119        return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}};
120    }
121    if (d0 <= MUTF8_1B_MAX) {
122        return {1, {static_cast<uint8_t>(d0)}};
123    }
124    if (d0 <= MUTF8_2B_MAX) {
125        auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6));
126        auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT));
127        return {CONST_2, {ch0, ch1}};
128    }
129    if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
130        auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
131        auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
132        auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
133        return {CONST_3, {ch0, ch1, ch2}};
134    }
135
136    uint32_t codePoint = CombineTwoU16(d0, d1);
137
138    auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST);
139    auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1);
140    auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1);
141    auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
142
143    return {CONST_4, {ch0, ch1, ch2, ch3}};
144}
145
146bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)
147{
148    while (*mutf8_in != '\0') {    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
149        if (*mutf8_in >= MASK1) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
150            return false;
151        }
152        mutf8_in += 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
153    }
154    return true;
155}
156
157size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
158                                 size_t start)
159{
160    size_t mutf8_pos = 0;
161    if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) {
162        return 0;
163    }
164    size_t end = start + utf16_len;
165    for (size_t i = start; i < end; ++i) {
166        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
167        uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0;
168        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
169        MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code);
170        if (mutf8_pos + ch.n > mutf8_len) {
171            break;
172        }
173        for (size_t c = 0; c < ch.n; ++c) {
174            mutf8_out[mutf8_pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
175        }
176        if (ch.n == CONST_4) {  // Two UTF-16 chars are used
177            ++i;
178        }
179    }
180    return mutf8_pos;
181}
182
183void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)
184{
185    size_t in_pos = 0;
186    while (in_pos < mutf8_len) {
187        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
188        auto [p_hi, p_lo] = SplitUtf16Pair(pair);
189
190        if (p_hi != 0) {
191            *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
192        }
193        *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194
195        mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
196        in_pos += nbytes;
197    }
198}
199
200size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
201                                 size_t start)
202{
203    size_t in_pos = 0;
204    size_t out_pos = 0;
205    while (in_pos < mutf8_len) {
206        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
207        auto [p_hi, p_lo] = SplitUtf16Pair(pair);
208
209        mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
210        in_pos += nbytes;
211        if (start > 0) {
212            start -= nbytes;
213            continue;
214        }
215
216        if (p_hi != 0) {
217            ASSERT(utf16_len >= 1);
218            if (out_pos++ >= utf16_len - 1) {  // check for place for two uint16
219                --out_pos;
220                break;
221            }
222            *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
223        }
224        if (out_pos++ >= utf16_len) {
225            --out_pos;
226            break;
227        }
228        *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
229    }
230    return out_pos;
231}
232
233int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
234{
235    uint32_t c1;
236    uint32_t c2;
237    uint32_t n1;
238    uint32_t n2;
239
240    do {
241        c1 = *mutf8_1;
242        c2 = *mutf8_2;
243
244        if (c1 == 0 && c2 == 0) {
245            return 0;
246        }
247
248        if (c1 == 0 && c2 != 0) {
249            return -1;
250        }
251
252        if (c1 != 0 && c2 == 0) {
253            return 1;
254        }
255
256        std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1);
257        std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2);
258
259        mutf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
260        mutf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
261    } while (c1 == c2);
262
263    auto [c1p1, c1p2] = SplitUtf16Pair(c1);
264    auto [c2p1, c2p2] = SplitUtf16Pair(c2);
265
266    auto result = static_cast<int>(c1p1 - c2p1);
267    if (result != 0) {
268        return result;
269    }
270
271    return c1p2 - c2p2;
272}
273
274// compare plain utf8, which allows 0 inside a string
275int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)
276{
277    uint32_t c1;
278    uint32_t c2;
279    uint32_t n1;
280    uint32_t n2;
281
282    uint32_t utf8_1_index = 0;
283    uint32_t utf8_2_index = 0;
284
285    do {
286        if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) {
287            return 0;
288        }
289
290        if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) {
291            return -1;
292        }
293
294        if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) {
295            return 1;
296        }
297
298        c1 = *utf8_1;
299        c2 = *utf8_2;
300
301        std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1);
302        std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2);
303
304        utf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
305        utf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
306        utf8_1_index += n1;
307        utf8_2_index += n2;
308    } while (c1 == c2);
309
310    auto [c1p1, c1p2] = SplitUtf16Pair(c1);
311    auto [c2p1, c2p2] = SplitUtf16Pair(c2);
312
313    auto result = static_cast<int>(c1p1 - c2p1);
314    if (result != 0) {
315        return result;
316    }
317
318    return c1p2 - c2p2;
319}
320
321size_t Mutf8Size(const uint8_t *mutf8)
322{
323    return strlen(Mutf8AsCString(mutf8));
324}
325
326size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
327{
328    // TODO(d.kovalenko): make it faster
329    size_t res = 0;
330    while (*mutf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
331        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
332        res += pair > MAX_U16 ? CONST_2 : 1;
333        mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
334    }
335    return res;
336}
337
338size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)
339{
340    size_t pos = 0;
341    size_t res = 0;
342    while (pos != mutf8_len) {
343        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos);
344        if (nbytes == 0) {
345            nbytes = 1;
346        }
347        res += pair > MAX_U16 ? CONST_2 : 1;
348        mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
349        pos += nbytes;
350    }
351    return res;
352}
353
354size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
355{
356    size_t res = 1;  // zero byte
357    // when utf16 data length is only 1 and code in 0xd800-0xdfff,
358    // means that is a single code point, it needs to be represented by three MUTF8 code.
359    if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360        mutf16[0] <= LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
361        res += CONST_3;
362        return res;
363    }
364
365    for (uint32_t i = 0; i < length; ++i) {
366        // NOLINTNEXTLINE(bugprone-branch-clone)
367        if (mutf16[i] == 0) {                    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
368            res += CONST_2;                      // special case for U+0000 => C0 80
369        } else if (mutf16[i] <= MUTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
370            res += 1;
371        } else if (mutf16[i] <= MUTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
372            res += CONST_2;
373            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
374        } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) {
375            res += CONST_3;
376        } else {
377            res += CONST_4;
378            ++i;
379        }
380    }
381    return res;
382}
383
384bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)
385{
386    if (utf8_1.size() != utf8_2.size()) {
387        return false;
388    }
389
390    return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0;
391}
392
393bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
394{
395    return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0;
396}
397
398bool IsValidModifiedUTF8(const uint8_t *elems)
399{
400    ASSERT(elems);
401
402    while (*elems != '\0') {
403        // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
404        switch (*elems & 0xf0) {
405            case 0x00:
406            case 0x10:  // NOLINT(readability-magic-numbers)
407            case 0x20:  // NOLINT(readability-magic-numbers)
408            case 0x30:  // NOLINT(readability-magic-numbers)
409            case 0x40:  // NOLINT(readability-magic-numbers)
410            case 0x50:  // NOLINT(readability-magic-numbers)
411            case 0x60:  // NOLINT(readability-magic-numbers)
412            case 0x70:  // NOLINT(readability-magic-numbers)
413                // pattern 0xxx
414                // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
415                ++elems;
416                break;
417            case 0x80:  // NOLINT(readability-magic-numbers)
418            case 0x90:  // NOLINT(readability-magic-numbers)
419            case 0xa0:  // NOLINT(readability-magic-numbers)
420            case 0xb0:  // NOLINT(readability-magic-numbers)
421                // pattern 10xx is illegal start
422                return false;
423
424            case 0xf0:  // NOLINT(readability-magic-numbers)
425                // pattern 1111 0xxx starts four byte section
426                if ((*elems & 0x08) == 0) {  // NOLINT(hicpp-signed-bitwise)
427                    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
428                    ++elems;
429                    if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
430                        return false;
431                    }
432                } else {
433                    return false;
434                }
435                // no need break
436                [[fallthrough]];
437
438            case 0xe0:  // NOLINT(readability-magic-numbers)
439                // pattern 1110
440                // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
441                ++elems;
442                if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
443                    return false;
444                }
445                // no need break
446                [[fallthrough]];
447
448            case 0xc0:  // NOLINT(readability-magic-numbers)
449            case 0xd0:  // NOLINT(readability-magic-numbers)
450                // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
451                ++elems;
452                if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
453                    return false;
454                }
455                // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
456                ++elems;
457                break;
458            default:
459                break;
460        }
461    }
462    return true;
463}
464
465}  // namespace panda::utf
466