/**
 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15b1994897Sopenharmony_ci
16b1994897Sopenharmony_ci#ifndef LIBPANDABASE_UTILS_UTF_H
17b1994897Sopenharmony_ci#define LIBPANDABASE_UTILS_UTF_H
18b1994897Sopenharmony_ci
#include <cstddef>
#include <cstdint>
#include <utility>

#include "utils/hash.h"
#include "utils/span.h"
24b1994897Sopenharmony_ci
25b1994897Sopenharmony_cinamespace panda::utf {
26b1994897Sopenharmony_ci
/*
 * https://en.wikipedia.org/wiki/UTF-8
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 4  21           U+10000      U+10FFFF    11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
 */
// Shared constants for the MUTF-8 / UTF-16 helpers declared in this header.
// They are `inline constexpr` so that every translation unit including the header refers to the same entity
// instead of getting its own internal-linkage copy.

// Single-bit masks. Going by the encoding table above these are the bits that tell lead bytes apart;
// NOTE(review): how the converters actually combine them is not visible in this header - confirm in the .cpp.
inline constexpr size_t MASK1 = 0x80;  // top bit: 0 only in the one-byte form 0xxxxxxx
inline constexpr size_t MASK2 = 0x20;  // 0 in a two-byte lead (110xxxxx), 1 in three- and four-byte leads
inline constexpr size_t MASK3 = 0x10;  // 0 in a three-byte lead (1110xxxx), 1 in a four-byte lead (11110xxx)

// Masks selecting the low N bits of a value.
inline constexpr size_t MASK_4BIT = 0x0f;
inline constexpr size_t MASK_5BIT = 0x1f;
inline constexpr size_t MASK_6BIT = 0x3f;
inline constexpr size_t MASK_10BIT = 0x03ff;
inline constexpr size_t MASK_16BIT = 0xffff;

inline constexpr size_t DATA_WIDTH = 6;           // payload bits in one continuation byte (10xxxxxx)
inline constexpr size_t PAIR_ELEMENT_WIDTH = 16;  // width in bits of one UTF-16 code unit

// UTF-16 surrogate ranges: high (leading) units first, low (trailing) units second.
inline constexpr size_t HI_SURROGATE_MIN = 0xd800;
inline constexpr size_t HI_SURROGATE_MAX = 0xdbff;
inline constexpr size_t LO_SURROGATE_MIN = 0xdc00;
inline constexpr size_t LO_SURROGATE_MAX = 0xdfff;

// First code point outside the 16-bit range (U+10000, first row of the four-byte form in the table).
inline constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000;

// Surrogate construction bases. U16_LEAD equals HI_SURROGATE_MIN - (LO_SUPPLEMENTS_MIN >> 10), so adding
// (code_point >> 10) to it needs no separate subtraction of 0x10000; U16_TAIL equals LO_SURROGATE_MIN.
// NOTE(review): the formula is inferred from the values - confirm against the encoder.
inline constexpr size_t U16_LEAD = 0xd7c0;
inline constexpr size_t U16_TAIL = 0xdc00;

// Largest code point that fits the one-byte form.
inline constexpr uint8_t MUTF8_1B_MAX = 0x7f;

// Two-byte form: largest code point and the fixed prefix bits of each byte (110xxxxx 10xxxxxx).
inline constexpr uint16_t MUTF8_2B_MAX = 0x7ff;
inline constexpr uint8_t MUTF8_2B_FIRST = 0xc0;
inline constexpr uint8_t MUTF8_2B_SECOND = 0x80;

// Three-byte form: fixed prefix bits of each byte (1110xxxx 10xxxxxx 10xxxxxx).
inline constexpr uint8_t MUTF8_3B_FIRST = 0xe0;
inline constexpr uint8_t MUTF8_3B_SECOND = 0x80;
inline constexpr uint8_t MUTF8_3B_THIRD = 0x80;

// Four-byte form: fixed prefix bits of the lead byte (11110xxx).
inline constexpr uint8_t MUTF8_4B_FIRST = 0xf0;
71b1994897Sopenharmony_ci
72b1994897Sopenharmony_ciWEAK_FOR_LTO_START
73b1994897Sopenharmony_ci
74b1994897Sopenharmony_cistd::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = 4);
75b1994897Sopenharmony_ci
76b1994897Sopenharmony_cibool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in);
77b1994897Sopenharmony_ci
78b1994897Sopenharmony_civoid ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out);
79b1994897Sopenharmony_ci
80b1994897Sopenharmony_cisize_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
81b1994897Sopenharmony_ci                                 size_t start);
82b1994897Sopenharmony_ci
83b1994897Sopenharmony_cisize_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
84b1994897Sopenharmony_ci                                 size_t start);
85b1994897Sopenharmony_ci
86b1994897Sopenharmony_ciint CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2);
87b1994897Sopenharmony_ci
88b1994897Sopenharmony_ciint CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length);
89b1994897Sopenharmony_ci
90b1994897Sopenharmony_cibool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2);
91b1994897Sopenharmony_ci
92b1994897Sopenharmony_cibool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2);
93b1994897Sopenharmony_ci
94b1994897Sopenharmony_cisize_t MUtf8ToUtf16Size(const uint8_t *mutf8);
95b1994897Sopenharmony_ci
96b1994897Sopenharmony_cisize_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len);
97b1994897Sopenharmony_ci
98b1994897Sopenharmony_cisize_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length);
99b1994897Sopenharmony_ci
100b1994897Sopenharmony_cisize_t Mutf8Size(const uint8_t *mutf8);
101b1994897Sopenharmony_ci
102b1994897Sopenharmony_cibool IsValidModifiedUTF8(const uint8_t *elems);
103b1994897Sopenharmony_ci
104b1994897Sopenharmony_ciWEAK_FOR_LTO_END
105b1994897Sopenharmony_ci
/**
 * Reinterprets a C string pointer as a pointer to unsigned bytes.
 * Only the pointer type changes: no bytes are copied, inspected or validated, and a null pointer stays null.
 */
inline const uint8_t *CStringAsMutf8(const char *str)
{
    return reinterpret_cast<const uint8_t *>(str);
}
110b1994897Sopenharmony_ci
/**
 * Reinterprets a pointer to unsigned bytes as a C string pointer.
 * Only the pointer type changes: no bytes are copied, inspected or validated, and a null pointer stays null.
 */
inline const char *Mutf8AsCString(const uint8_t *mutf8)
{
    return reinterpret_cast<const char *>(mutf8);
}
115b1994897Sopenharmony_ci
116b1994897Sopenharmony_ciinline constexpr bool IsAvailableNextUtf16Code(uint16_t val)
117b1994897Sopenharmony_ci{
118b1994897Sopenharmony_ci    return val >= HI_SURROGATE_MIN && val <= LO_SURROGATE_MAX;
119b1994897Sopenharmony_ci}
120b1994897Sopenharmony_ci
121b1994897Sopenharmony_cistruct Mutf8Hash {
122b1994897Sopenharmony_ci    uint32_t operator()(const uint8_t *data) const
123b1994897Sopenharmony_ci    {
124b1994897Sopenharmony_ci        return GetHash32String(data);
125b1994897Sopenharmony_ci    }
126b1994897Sopenharmony_ci};
127b1994897Sopenharmony_ci
128b1994897Sopenharmony_cistruct Mutf8Equal {
129b1994897Sopenharmony_ci    bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const
130b1994897Sopenharmony_ci    {
131b1994897Sopenharmony_ci        return IsEqual(mutf8_1, mutf8_2);
132b1994897Sopenharmony_ci    }
133b1994897Sopenharmony_ci};
134b1994897Sopenharmony_ci
135b1994897Sopenharmony_cistruct Mutf8Less {
136b1994897Sopenharmony_ci    bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const
137b1994897Sopenharmony_ci    {
138b1994897Sopenharmony_ci        return CompareMUtf8ToMUtf8(mutf8_1, mutf8_2) < 0;
139b1994897Sopenharmony_ci    }
140b1994897Sopenharmony_ci};
141b1994897Sopenharmony_ci
/**
 * Splits a 32-bit value into its two 16-bit halves.
 *
 * @param pair  packed value; NOTE(review): presumably the first member of the pair returned by
 *              ConvertMUtf8ToUtf16Pair() - confirm.
 * @return {upper 16 bits, lower 16 bits} of `pair`.
 */
static inline std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t P1_MASK = 0xffff;
    constexpr uint32_t P2_SHIFT = 16;
    // Both expressions fit in 16 bits; the casts make the narrowing explicit instead of leaving it to the
    // converting constructor of std::pair.
    const auto upper = static_cast<uint16_t>(pair >> P2_SHIFT);
    const auto lower = static_cast<uint16_t>(pair & P1_MASK);
    return {upper, lower};
}
148b1994897Sopenharmony_ci
149b1994897Sopenharmony_ci}  // namespace panda::utf
150b1994897Sopenharmony_ci
151b1994897Sopenharmony_ci#endif  // LIBPANDABASE_UTILS_UTF_H
152