1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef PANDA_LIBPANDABASE_UTILS_UTF_H_
17 #define PANDA_LIBPANDABASE_UTILS_UTF_H_
18 
19 #include <cstdint>
20 #include <cstddef>
21 
22 #include "utils/hash.h"
23 #include "utils/span.h"
24 
25 namespace ark::utf {
26 
27 /*
28  * https://en.wikipedia.org/wiki/UTF-8
29  *
30  * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4
31  *    code point   code point   code point
32  * 1  7            U+0000       U+007F      0xxxxxxx
33  * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
34  * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
35  * 4  21           U+10000      U+10FFFF    11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
36  */
/*
 * Single-bit tests on the first byte of an encoded sequence (compare the table above):
 * 0x80 is the top bit of a byte (clear only in the 1-byte form 0xxxxxxx), 0x20 is bit 5
 * (clear in 110xxxxx, set in 1110xxxx) and 0x10 is bit 4 (clear in 1110xxxx, set in 11110xxx).
 * NOTE(review): how the decoder combines these tests is not visible in this header - confirm.
 */
37 constexpr size_t MASK1 = 0x80;
38 constexpr size_t MASK2 = 0x20;
39 constexpr size_t MASK3 = 0x10;
40 
/* Masks that keep the low 4, 5, 6, 10 and 16 bits of a value respectively. */
41 constexpr size_t MASK_4BIT = 0x0f;
42 constexpr size_t MASK_5BIT = 0x1f;
43 constexpr size_t MASK_6BIT = 0x3f;
44 constexpr size_t MASK_10BIT = 0x03ff;
45 constexpr size_t MASK_16BIT = 0xffff;
46 
/*
 * DATA_WIDTH matches the 6 payload bits of a 10xxxxxx continuation byte in the table above.
 * PAIR_ELEMENT_WIDTH is 16; presumably the bit width of one half of a UTF-16 pair packed into
 * a uint32_t (SplitUtf16Pair in this header also splits at bit 16) - confirm.
 */
47 constexpr size_t DATA_WIDTH = 6;
48 constexpr size_t PAIR_ELEMENT_WIDTH = 16;
49 
/*
 * Numerically, U16_LEAD == 0xd800 - (0x10000 >> 10) and U16_TAIL == 0xdc00.
 * Presumably the bases added when splitting a supplementary code point into a surrogate
 * pair - confirm against the encoder.
 */
50 constexpr size_t U16_LEAD = 0xd7c0;
51 constexpr size_t U16_TAIL = 0xdc00;
52 
/*
 * Bounds of the UTF-16 surrogate ranges: lead (high) surrogates are 0xD800..0xDBFF and
 * trail (low) surrogates are 0xDC00..0xDFFF. IsAvailableNextUtf16Code in this header tests
 * against the outer bounds DECODE_LEAD_LOW..DECODE_TRAIL_HIGH.
 * DECODE_FIRST_FACTOR is 2^10 and DECODE_SECOND_FACTOR is 0x10000; presumably the multiplier
 * and offset UTF16Decode uses to rebuild a code point from a surrogate pair - confirm.
 */
53 constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
54 constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
55 constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
56 constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
57 constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;
58 constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
59 
/*
 * Byte masks with the top 1, 2, 3, 4 and 5 bits set. These are wide enough to isolate the
 * prefix bits of each byte form listed in the table above (0xxxxxxx, 10xxxxxx, 110xxxxx,
 * 1110xxxx, 11110xxx). NOTE(review): their users are not visible in this header - confirm.
 */
60 constexpr uint8_t BIT_MASK_1 = 0x80;
61 constexpr uint8_t BIT_MASK_2 = 0xC0;
62 constexpr uint8_t BIT_MASK_3 = 0xE0;
63 constexpr uint8_t BIT_MASK_4 = 0xF0;
64 constexpr uint8_t BIT_MASK_5 = 0xF8;
65 
/* Largest code point of the 1-byte form (U+007F in the table above). */
66 constexpr uint8_t UTF8_1B_MAX = 0x7f;
67 
/*
 * 2-byte form: UTF8_2B_MAX is its largest code point (U+07FF), UTF8_2B_FIRST is the 110xxxxx
 * prefix and UTF8_2B_SECOND is the 10xxxxxx continuation prefix.
 * NOTE(review): UTF8_2B_THIRD is 0x3f, i.e. a low-6-bit mask rather than a prefix, despite the
 * name - presumably used to extract continuation payload bits; confirm.
 */
68 constexpr uint16_t UTF8_2B_MAX = 0x7ff;
69 constexpr uint8_t UTF8_2B_FIRST = 0xc0;
70 constexpr uint8_t UTF8_2B_SECOND = 0x80;
71 constexpr uint8_t UTF8_2B_THIRD = 0x3f;
72 
/* 3-byte form: 1110xxxx lead prefix followed by two 10xxxxxx continuation prefixes. */
73 constexpr uint8_t UTF8_3B_FIRST = 0xe0;
74 constexpr uint8_t UTF8_3B_SECOND = 0x80;
75 constexpr uint8_t UTF8_3B_THIRD = 0x80;
76 
/* 4-byte form: 11110xxx lead prefix. */
77 constexpr uint8_t UTF8_4B_FIRST = 0xf0;
78 
/*
 * Named small integers stored in a uint8_t. Both enums are unscoped, so their enumerators
 * (ONE, TWO, SIX, ...) are injected directly into namespace ark::utf.
 * UtfLength: presumably the byte length of an encoded sequence (1..4, matching column N of the
 * table above) - confirm.
 * UtfOffset: presumably bit-shift amounts (6, 10, 12, 18) used when packing or unpacking
 * code point bits - confirm.
 */
79 enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 };
80 enum UtfOffset : uint8_t { SIX = 6, TEN = 10, TWELVE = 12, EIGHTEEN = 18 };
81 
/*
 * Capacity of Utf8Char::ch; 4 equals the longest byte sequence in the table above.
 */
82 constexpr size_t MAX_BYTES = 4;
/*
 * Fixed-capacity holder for one encoded character; this is the return type of
 * ConvertUtf16ToUtf8 and ConvertUtf16ToMUtf8.
 *   n  - a count; presumably the number of leading bytes of `ch` that are valid - confirm.
 *   ch - storage for up to MAX_BYTES bytes.
 * Neither member has a default initializer, so a default-constructed Utf8Char holds
 * indeterminate values.
 * NOTE(review): std::array is used here but <array> is not among this header's direct
 * includes; it presumably arrives through utils/hash.h or utils/span.h - confirm.
 */
83 struct Utf8Char {
84     size_t n;
85     std::array<uint8_t, MAX_BYTES> ch;
86 };
87 
/* Largest value representable in 16 bits (same value as MASK_16BIT). */
88 constexpr size_t MAX_U16 = 0xffff;
/*
 * Named literals 2, 3, 4, 6 and 12.
 * NOTE(review): their users are not visible in this header; presumably byte counts and shift
 * amounts in the conversion routines - confirm.
 */
89 constexpr size_t CONST_2 = 2;
90 constexpr size_t CONST_3 = 3;
91 constexpr size_t CONST_4 = 4;
92 constexpr size_t CONST_6 = 6;
93 constexpr size_t CONST_12 = 12;
94 
/*
 * Out-of-line API. Only the declarations are visible here; the definitions, as well as the
 * WEAK_FOR_LTO_START / WEAK_FOR_LTO_END and PANDA_PUBLIC_API macros, come from elsewhere.
 * The notes below restate what the signatures show and mark everything else as an assumption.
 * NOTE(review): std::pair and std::vector appear in these signatures, but <utility> and
 * <vector> are not among this header's direct includes - confirm they arrive transitively.
 */
95 WEAK_FOR_LTO_START
96 
/*
 * Takes a byte pointer and a byte limit (default 4) and returns a (uint32_t, size_t) pair.
 * Presumably: the decoded UTF-16 unit(s) packed into 32 bits and the number of input bytes
 * consumed - confirm.
 */
97 PANDA_PUBLIC_API std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes = 4);
98 
/* Boolean query over a byte string passed without a length (presumably NUL-terminated - confirm). */
99 PANDA_PUBLIC_API bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8In);
100 
/*
 * Writes to the caller-provided utf16Out. No output capacity is passed, so the caller is
 * responsible for sizing the buffer (presumably with MUtf8ToUtf16Size - confirm).
 */
101 PANDA_PUBLIC_API void ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out);
102 
/*
 * Region variants: take input and output lengths plus a `start` value and return a size_t.
 * Presumably `start` is an offset into the input and the result is the number of output
 * units written - confirm.
 */
103 PANDA_PUBLIC_API size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len,
104                                                   size_t utf16Len, size_t start);
105 
106 PANDA_PUBLIC_API size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len,
107                                                   size_t mutf8Len, size_t start);
108 
/*
 * Three-way comparisons returning int. Mutf8Less in this header treats a negative result of
 * CompareMUtf8ToMUtf8 as "first argument orders before the second".
 */
109 PANDA_PUBLIC_API int CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82);
110 
111 PANDA_PUBLIC_API int CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82,
112                                        size_t utf82Length);
113 
/*
 * Equality tests: one overload over explicit spans, one over bare pointers (no length passed;
 * presumably NUL-terminated - confirm). Mutf8Equal in this header uses the pointer overload.
 */
114 PANDA_PUBLIC_API bool IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82);
115 
116 PANDA_PUBLIC_API bool IsEqual(const uint8_t *mutf81, const uint8_t *mutf82);
117 
/*
 * Size queries returning size_t. Presumably the number of output units the matching
 * conversion would produce; units and terminator handling are not visible here - confirm.
 */
118 PANDA_PUBLIC_API size_t MUtf8ToUtf16Size(const uint8_t *mutf8);
119 
120 PANDA_PUBLIC_API size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len);
121 
122 PANDA_PUBLIC_API size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length);
123 
124 PANDA_PUBLIC_API size_t Mutf8Size(const uint8_t *mutf8);
125 
/* Validation of a byte string passed without a length (presumably NUL-terminated - confirm). */
126 PANDA_PUBLIC_API bool IsValidModifiedUTF8(const uint8_t *elems);
127 
/* Combines a lead and a trail UTF-16 unit into one 32-bit value (presumably a code point - confirm). */
128 PANDA_PUBLIC_API uint32_t UTF16Decode(uint16_t lead, uint16_t trail);
129 
/* Validation over a byte vector taken by const reference. */
130 PANDA_PUBLIC_API bool IsValidUTF8(const std::vector<uint8_t> &data);
131 
/*
 * Encodes from two UTF-16 units into a Utf8Char. ConvertUtf16ToMUtf8 in this header forwards
 * here with modify == true; presumably `modify` selects the Modified UTF-8 encoding - confirm.
 */
132 PANDA_PUBLIC_API Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify);
133 
/* `modify` defaults to true in both functions below. */
134 PANDA_PUBLIC_API size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true);
135 
136 PANDA_PUBLIC_API size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len,
137                                                  size_t utf8Len, size_t start, bool modify = true);
138 
/*
 * Same return shape as ConvertMUtf8ToUtf16Pair; `combine` defaults to false.
 * NOTE(review): the meaning of `combine` is not visible in this header - confirm.
 */
139 PANDA_PUBLIC_API std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);
140 
141 PANDA_PUBLIC_API size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);
142 
143 PANDA_PUBLIC_API size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len,
144                                                  size_t utf16Len, size_t start);
145 
/*
 * Boolean query on a single unit named `lead`.
 * NOTE(review): only one unit is passed although the name mentions a pair - confirm what is tested.
 */
146 PANDA_PUBLIC_API bool IsUTF16SurrogatePair(uint16_t lead);
147 
/*
 * Writes into the caller-provided outUtf16Buf; presumably the decimal digits of `v`
 * (nDigits of them, with a sign when `negative` is set) - confirm.
 */
148 PANDA_PUBLIC_API void UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative);
149 
/* Boolean classification of a single UTF-16 unit. */
150 PANDA_PUBLIC_API bool IsWhiteSpaceChar(uint16_t c);
151 
152 WEAK_FOR_LTO_END
153 
/**
 * Views a C string as a sequence of unsigned bytes.
 * Nothing is copied or validated: the returned pointer has the same address as `str`.
 */
inline const uint8_t *CStringAsMutf8(const char *str)
{
    const auto *bytes = reinterpret_cast<const uint8_t *>(str);
    return bytes;
}
158 
/**
 * Views a sequence of unsigned bytes as a C string.
 * Nothing is copied or validated: the returned pointer has the same address as `mutf8`.
 */
inline const char *Mutf8AsCString(const uint8_t *mutf8)
{
    const auto *text = reinterpret_cast<const char *>(mutf8);
    return text;
}
163 
IsAvailableNextUtf16Code(uint16_t val)164 inline constexpr bool IsAvailableNextUtf16Code(uint16_t val)
165 {
166     return val >= DECODE_LEAD_LOW && val <= DECODE_TRAIL_HIGH;
167 }
168 
ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)169 inline Utf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
170 {
171     return ConvertUtf16ToUtf8(d0, d1, true);
172 }
173 
174 struct Mutf8Hash {
operator ()ark::utf::Mutf8Hash175     uint32_t operator()(const uint8_t *data) const
176     {
177         return GetHash32String(data);
178     }
179 };
180 
181 struct Mutf8Equal {
operator ()ark::utf::Mutf8Equal182     bool operator()(const uint8_t *mutf81, const uint8_t *mutf82) const
183     {
184         return IsEqual(mutf81, mutf82);
185     }
186 };
187 
188 struct Mutf8Less {
operator ()ark::utf::Mutf8Less189     bool operator()(const uint8_t *mutf81, const uint8_t *mutf82) const
190     {
191         return CompareMUtf8ToMUtf8(mutf81, mutf82) < 0;
192     }
193 };
194 
/**
 * Splits a 32-bit value into its two 16-bit halves.
 *
 * @param pair packed value; bits 31..16 form the first element, bits 15..0 the second.
 * @return {upper 16 bits, lower 16 bits}.
 *
 * The narrowing to uint16_t is spelled out with static_cast instead of happening silently
 * inside std::pair's converting constructor, and the function is constexpr so it can be used
 * in constant expressions. It is declared `inline` like the other helpers in this header.
 */
inline constexpr std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t LOW_HALF_MASK = 0xffffU;
    constexpr uint32_t HIGH_HALF_SHIFT = 16U;
    // A 32-bit value shifted right by 16 already fits in 16 bits, so the cast loses nothing.
    const auto upper = static_cast<uint16_t>(pair >> HIGH_HALF_SHIFT);
    const auto lower = static_cast<uint16_t>(pair & LOW_HALF_MASK);
    return std::pair<uint16_t, uint16_t>(upper, lower);
}
201 
202 }  // namespace ark::utf
203 
204 #endif  // PANDA_LIBPANDABASE_UTILS_UTF_H_
205