1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/base/utf_helper.h"
17
18 #include "ecmascript/log_wrapper.h"
19
// Bias removed when a surrogate pair is folded into a code point: the lead base (0xd800) shifted into
// position, plus the trail base (0xdc00), minus the first supplementary code point (0x10000).
static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;

// Folds a UTF-16 lead/trail surrogate pair into the supplementary code point it encodes.
// This is a constexpr function instead of a function-like macro so that the arguments are type-checked and
// evaluated exactly once; the call syntax is the same as before.
// The inputs are not validated: the result is only meaningful for a lead in [0xd800, 0xdbff] and a trail in
// [0xdc00, 0xdfff].
static constexpr int32_t U16_GET_SUPPLEMENTARY(uint32_t lead, uint32_t trail)
{
    return (static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET;
}
25
26 namespace panda::ecmascript::base::utf_helper {
27
UTF16Decode(uint16_t lead, uint16_t trail)28 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
29 {
30 ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
31 (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
32 uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
33 return cp;
34 }
35
IsUTF16HighSurrogate(uint16_t ch)36 bool IsUTF16HighSurrogate(uint16_t ch)
37 {
38 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
39 }
40
IsUTF16LowSurrogate(uint16_t ch)41 bool IsUTF16LowSurrogate(uint16_t ch)
42 {
43 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
44 }
45
46 // Methods for decode utf16 to unicode
DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8)47 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8)
48 {
49 uint16_t high = utf16[*index];
50 if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
51 return high;
52 }
53 uint16_t low = utf16[*index + 1];
54 if (!IsUTF16LowSurrogate(low) || cesu8) {
55 return high;
56 }
57 (*index)++;
58 return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
59 }
60
HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)61 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
62 {
63 uint16_t first = utf16[*index];
64 // A valid surrogate pair should always start with a High Surrogate
65 if (IsUTF16LowSurrogate(first)) {
66 return UTF16_REPLACEMENT_CHARACTER;
67 }
68 if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
69 if (*index == len - 1) {
70 // A High surrogate not paired with another surrogate
71 return UTF16_REPLACEMENT_CHARACTER;
72 }
73 uint16_t second = utf16[*index + 1];
74 if (!IsUTF16LowSurrogate(second)) {
75 // A High surrogate not followed by a low surrogate
76 return UTF16_REPLACEMENT_CHARACTER;
77 }
78 // A valid surrogate pair, decode normally
79 (*index)++;
80 return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
81 }
82 // A unicode not fallen into the range of representing by surrogate pair, return as it is
83 return first;
84 }
85
UTF8Length(uint32_t codepoint)86 inline size_t UTF8Length(uint32_t codepoint)
87 {
88 if (codepoint <= UTF8_1B_MAX) {
89 return UtfLength::ONE;
90 }
91 if (codepoint <= UTF8_2B_MAX) {
92 return UtfLength::TWO;
93 }
94 if (codepoint <= UTF8_3B_MAX) {
95 return UtfLength::THREE;
96 }
97 return UtfLength::FOUR;
98 }
99
100 // Methods for encode unicode to unicode
EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)101 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
102 {
103 size_t size = UTF8Length(codepoint);
104 if (index + size > len) {
105 return 0;
106 }
107 for (size_t j = size - 1; j > 0; j--) {
108 uint8_t cont = ((codepoint | byteMark) & byteMask);
109 utf8[index + j] = cont;
110 codepoint >>= UTF8_OFFSET;
111 }
112 utf8[index] = codepoint | firstByteMark[size];
113 return size;
114 }
115
IsValidUTF8(const std::vector<uint8_t> &data)116 bool IsValidUTF8(const std::vector<uint8_t> &data)
117 {
118 uint32_t length = data.size();
119 switch (length) {
120 case UtfLength::ONE:
121 if (data.at(0) >= BIT_MASK_1) {
122 return false;
123 }
124 break;
125 case UtfLength::TWO:
126 if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
127 return false;
128 }
129 if (data.at(0) < UTF8_2B_FIRST_MIN) {
130 return false;
131 }
132 break;
133 case UtfLength::THREE:
134 if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
135 return false;
136 }
137 if (data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN) {
138 return false;
139 }
140 // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs, corresponds to %ED%A0%80~%ED%BF%BF
141 if (data.at(0) == UTF8_3B_RESERVED_FIRST && data.at(1) >= UTF8_3B_RESERVED_SECOND_MIN &&
142 data.at(1) <= UTF8_3B_RESERVED_SECOND_MAX) {
143 return false;
144 }
145 break;
146 case UtfLength::FOUR:
147 if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
148 return false;
149 }
150 if (data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN) {
151 return false;
152 }
153 // max four length binary: 11110(100) 10(001111) 10(111111) 10(111111), max data[0] is 0xF4, data[1] is 0x8F
154 if (data.at(0) > UTF8_4B_FIRST_MAX ||
155 (data.at(0) == UTF8_4B_FIRST_MAX && data.at(1) > UTF8_4B_SECOND_MAX)) {
156 return false;
157 }
158 break;
159 default:
160 LOG_ECMA(FATAL) << "this branch is unreachable";
161 UNREACHABLE();
162 break;
163 }
164
165 for (uint32_t i = 1; i < length; i++) {
166 if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
167 return false;
168 }
169 }
170 return true;
171 }
172
ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)173 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)
174 {
175 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
176 // means that is a single code point, it needs to be represented by three UTF8 code.
177 if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
178 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
179 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
180 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
181 return {UtfLength::THREE, {ch0, ch1, ch2}};
182 }
183
184 if (d0 == 0) {
185 if (isWriteBuffer) {
186 return {1, {0x00U}};
187 }
188 if (modify) {
189 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
190 return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
191 }
192 // For print string, just skip '\u0000'
193 return {0, {0x00U}};
194 }
195 if (d0 <= UTF8_1B_MAX) {
196 return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
197 }
198 if (d0 <= UTF8_2B_MAX) {
199 auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
200 auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
201 return {UtfLength::TWO, {ch0, ch1}};
202 }
203 if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
204 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
205 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
206 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
207 return {UtfLength::THREE, {ch0, ch1, ch2}};
208 }
209 if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
210 // Bad sequence
211 LOG_ECMA(FATAL) << "this branch is unreachable";
212 UNREACHABLE();
213 }
214
215 uint32_t codePoint = CombineTwoU16(d0, d1);
216
217 auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
218 auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
219 auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
220 auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
221 return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
222 }
223
Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8)224 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8)
225 {
226 size_t res = 1; // zero byte
227 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
228 // means that is a single code point, it needs to be represented by three UTF8 code.
229 if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
230 utf16[0] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
231 res += UtfLength::THREE;
232 return res;
233 }
234
235 for (uint32_t i = 0; i < length; ++i) {
236 if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
237 if (isGetBufferSize) {
238 res += UtfLength::ONE;
239 } else if (modify) {
240 res += UtfLength::TWO; // special case for U+0000 => C0 80
241 }
242 } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
243 res += 1;
244 } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
245 res += UtfLength::TWO;
246 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
247 } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
248 res += UtfLength::THREE;
249 } else {
250 if (!cesu8 && i < length - 1 &&
251 utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
252 utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
253 res += UtfLength::FOUR;
254 ++i;
255 } else {
256 res += UtfLength::THREE;
257 }
258 }
259 }
260 return res;
261 }
262
ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, size_t start, bool modify, bool isWriteBuffer, bool cesu8)263 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
264 size_t start, bool modify, bool isWriteBuffer, bool cesu8)
265 {
266 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
267 return 0;
268 }
269 size_t utf8Pos = 0;
270 size_t end = start + utf16Len;
271 for (size_t i = start; i < end; ++i) {
272 uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8);
273 if (codepoint == 0) {
274 if (isWriteBuffer) {
275 utf8Out[utf8Pos++] = 0x00U;
276 continue;
277 }
278 if (modify) {
279 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
280 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
281 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
282 }
283 continue;
284 }
285 utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
286 }
287 return utf8Pos;
288 }
289
DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, size_t start, bool modify, bool isWriteBuffer)290 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
291 size_t start, bool modify, bool isWriteBuffer)
292 {
293 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
294 return 0;
295 }
296 size_t utf8Pos = 0;
297 size_t end = start + utf16Len;
298 for (size_t i = start; i < end; ++i) {
299 uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
300 if (codepoint == 0) {
301 if (isWriteBuffer) {
302 utf8Out[utf8Pos++] = 0x00U;
303 continue;
304 }
305 if (modify) {
306 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
307 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
308 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
309 }
310 continue;
311 }
312 utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
313 }
314 return utf8Pos;
315 }
316
ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)317 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
318 {
319 uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
320 if ((d0 & utf::MASK1) == 0) {
321 return {d0, 1};
322 }
323
324 uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
325 if ((d0 & utf::MASK2) == 0) {
326 return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
327 }
328
329 uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
330 if ((d0 & utf::MASK3) == 0) {
331 return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
332 (d2 & utf::MASK_6BIT),
333 UtfLength::THREE};
334 }
335
336 uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
337 uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
338 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
339
340 uint32_t pair = 0;
341 if (combine) {
342 uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
343 uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
344 pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail)); // NOLINTNEXTLINE(hicpp-signed-bitwise)
345 } else {
346 pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
347 pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
348 }
349
350 return {pair, UtfLength::FOUR};
351 }
352
353 // drop the tail bytes if the remain length can't fill the length it represents.
FixUtf8Len(const uint8_t* utf8, size_t utf8Len)354 static inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
355 {
356 size_t trimSize = 0;
357 if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
358 // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one.
359 trimSize = 1;
360 }
361 if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
362 // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two.
363 trimSize = CONST_2;
364 }
365 if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
366 // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three.
367 trimSize = CONST_3;
368 }
369 return utf8Len - trimSize;
370 }
371
Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)372 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
373 {
374 size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
375 size_t in_pos = 0;
376 size_t res = 0;
377 while (in_pos < safeUtf8Len) {
378 uint8_t src = utf8[in_pos];
379 switch (src & 0xF0) {
380 case 0xF0: {
381 const uint8_t c2 = utf8[++in_pos];
382 const uint8_t c3 = utf8[++in_pos];
383 const uint8_t c4 = utf8[++in_pos];
384 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
385 ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
386 if (codePoint >= SURROGATE_RAIR_START) {
387 res += CONST_2;
388 } else {
389 res++;
390 }
391 in_pos++;
392 break;
393 }
394 case 0xE0: {
395 in_pos += CONST_3;
396 res++;
397 break;
398 }
399 case 0xD0:
400 case 0xC0: {
401 in_pos += CONST_2;
402 res++;
403 break;
404 }
405 default:
406 do {
407 in_pos++;
408 res++;
409 } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
410 break;
411 }
412 }
413 // The remain chars should be treated as single byte char.
414 res += utf8Len - in_pos;
415 return res;
416 }
417
ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)418 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
419 {
420 size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
421 size_t in_pos = 0;
422 size_t out_pos = 0;
423 while (in_pos < safeUtf8Len && out_pos < utf16Len) {
424 uint8_t src = utf8In[in_pos];
425 switch (src & 0xF0) {
426 case 0xF0: {
427 const uint8_t c2 = utf8In[++in_pos];
428 const uint8_t c3 = utf8In[++in_pos];
429 const uint8_t c4 = utf8In[++in_pos];
430 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
431 ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
432 if (codePoint >= SURROGATE_RAIR_START) {
433 ASSERT(utf16Len >= 1);
434 if (out_pos >= utf16Len - 1) {
435 return out_pos;
436 }
437 codePoint -= SURROGATE_RAIR_START;
438 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
439 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
440 } else {
441 utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
442 }
443 in_pos++;
444 break;
445 }
446 case 0xE0: {
447 const uint8_t c2 = utf8In[++in_pos];
448 const uint8_t c3 = utf8In[++in_pos];
449 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
450 ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
451 in_pos++;
452 break;
453 }
454 case 0xD0:
455 case 0xC0: {
456 const uint8_t c2 = utf8In[++in_pos];
457 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
458 in_pos++;
459 break;
460 }
461 default:
462 do {
463 utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
464 } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
465 break;
466 }
467 }
468 // The remain chars should be treated as single byte char.
469 while (in_pos < utf8Len && out_pos < utf16Len) {
470 utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
471 }
472 return out_pos;
473 }
474
ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)475 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
476 {
477 if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) {
478 return 0;
479 }
480 size_t latin1Pos = 0;
481 size_t end = utf16Len;
482 for (size_t i = 0; i < end; ++i) {
483 if (latin1Pos == latin1Len) {
484 break;
485 }
486 uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
487 uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit);
488 latin1Out[latin1Pos++] = latin1Code;
489 }
490 return latin1Pos;
491 }
492
ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen)493 std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen)
494 {
495 if (maxLen == 0) {
496 return {INVALID_UTF8, 0};
497 }
498 Span<const uint8_t> sp(utf8, maxLen);
499 // one byte
500 uint8_t d0 = sp[0];
501 if ((d0 & BIT_MASK_1) == 0) {
502 return {d0, UtfLength::ONE};
503 }
504 if (maxLen < UtfLength::TWO) {
505 return {INVALID_UTF8, 0};
506 }
507 // two bytes
508 uint8_t d1 = sp[UtfLength::ONE];
509 if ((d0 & BIT_MASK_3) == BIT_MASK_2) {
510 if ((d1 & BIT_MASK_2) == BIT_MASK_1) {
511 return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
512 } else {
513 return {INVALID_UTF8, 0};
514 }
515 }
516 if (maxLen < UtfLength::THREE) {
517 return {INVALID_UTF8, 0};
518 }
519 // three bytes
520 uint8_t d2 = sp[UtfLength::TWO];
521 if ((d0 & BIT_MASK_4) == BIT_MASK_3) {
522 if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) {
523 return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) |
524 ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d2 & utf::MASK_6BIT), UtfLength::THREE};
525 } else {
526 return {INVALID_UTF8, 0};
527 }
528 }
529 if (maxLen < UtfLength::FOUR) {
530 return {INVALID_UTF8, 0};
531 }
532 // four bytes
533 uint8_t d3 = sp[UtfLength::THREE];
534 if ((d0 & BIT_MASK_5) == BIT_MASK_4) {
535 if (((d1 & BIT_MASK_2) == BIT_MASK_1) &&
536 ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) {
537 return {((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
538 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT), UtfLength::FOUR};
539 } else {
540 return {INVALID_UTF8, 0};
541 }
542 }
543 return {INVALID_UTF8, 0};
544 }
545 } // namespace panda::ecmascript::base::utf_helper
546