1/* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "ecmascript/base/utf_helper.h" 17 18#include "ecmascript/log_wrapper.h" 19 20// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) 21static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000; 22// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) 23#define U16_GET_SUPPLEMENTARY(lead, trail) \ 24 ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET) 25 26namespace panda::ecmascript::base::utf_helper { 27 28uint32_t UTF16Decode(uint16_t lead, uint16_t trail) 29{ 30 ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) && 31 (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH)); 32 uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 33 return cp; 34} 35 36bool IsUTF16HighSurrogate(uint16_t ch) 37{ 38 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH; 39} 40 41bool IsUTF16LowSurrogate(uint16_t ch) 42{ 43 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH; 44} 45 46// Methods for decode utf16 to unicode 47uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8) 48{ 49 uint16_t high = utf16[*index]; 50 if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) { 51 return high; 52 } 53 uint16_t low = utf16[*index + 1]; 54 if (!IsUTF16LowSurrogate(low) || cesu8) { 55 return high; 56 } 57 (*index)++; 58 return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 59} 60 61uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index) 62{ 63 uint16_t first = utf16[*index]; 64 // A valid surrogate pair should always start with a High Surrogate 65 if (IsUTF16LowSurrogate(first)) { 66 return UTF16_REPLACEMENT_CHARACTER; 67 } 68 if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) { 69 if (*index == len - 1) { 70 // A High surrogate not paired with another surrogate 71 return UTF16_REPLACEMENT_CHARACTER; 72 } 73 uint16_t second = utf16[*index + 1]; 74 if (!IsUTF16LowSurrogate(second)) { 75 // A High surrogate not followed by a low surrogate 76 return UTF16_REPLACEMENT_CHARACTER; 77 } 78 // A valid surrogate pair, decode normally 79 (*index)++; 80 return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 81 } 82 // A unicode not fallen into the range of representing by surrogate pair, return as it is 83 return first; 84} 85 86inline size_t UTF8Length(uint32_t codepoint) 87{ 88 if (codepoint <= UTF8_1B_MAX) { 89 return UtfLength::ONE; 90 } 91 if (codepoint <= UTF8_2B_MAX) { 92 return UtfLength::TWO; 93 } 94 if (codepoint <= UTF8_3B_MAX) { 95 return UtfLength::THREE; 96 } 97 return UtfLength::FOUR; 98} 99 100// Methods for encode unicode to unicode 101size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index) 102{ 103 size_t size = UTF8Length(codepoint); 104 if (index + size > len) { 105 return 0; 106 } 107 for (size_t j = size - 1; j > 0; j--) { 108 uint8_t cont = ((codepoint | byteMark) & byteMask); 109 utf8[index + j] = cont; 110 codepoint >>= UTF8_OFFSET; 111 } 112 utf8[index] = codepoint | firstByteMark[size]; 113 return size; 114} 115 116bool IsValidUTF8(const std::vector<uint8_t> &data) 117{ 118 uint32_t length = data.size(); 119 switch (length) { 120 case UtfLength::ONE: 121 if (data.at(0) >= BIT_MASK_1) { 122 return false; 123 } 124 break; 125 case UtfLength::TWO: 126 if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) { 127 return false; 128 } 129 if (data.at(0) < UTF8_2B_FIRST_MIN) { 130 return false; 131 } 132 break; 133 case UtfLength::THREE: 134 if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) { 135 return false; 136 } 137 if (data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN) { 138 return false; 139 } 140 // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs, corresponds to %ED%A0%80~%ED%BF%BF 141 if (data.at(0) == UTF8_3B_RESERVED_FIRST && data.at(1) >= UTF8_3B_RESERVED_SECOND_MIN && 142 data.at(1) <= UTF8_3B_RESERVED_SECOND_MAX) { 143 return false; 144 } 145 break; 146 case UtfLength::FOUR: 147 if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) { 148 return false; 149 } 150 if (data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN) { 151 return false; 152 } 153 // max four length binary: 11110(100) 10(001111) 10(111111) 10(111111), max data[0] is 0xF4, data[1] is 0x8F 154 if (data.at(0) > UTF8_4B_FIRST_MAX || 155 (data.at(0) == UTF8_4B_FIRST_MAX && data.at(1) > UTF8_4B_SECOND_MAX)) { 156 return false; 157 } 158 break; 159 default: 160 LOG_ECMA(FATAL) << "this branch is unreachable"; 161 UNREACHABLE(); 162 break; 163 } 164 165 for (uint32_t i = 1; i < length; i++) { 166 if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) { 167 return false; 168 } 169 } 170 return true; 171} 172 173Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer) 174{ 175 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0, 176 // means that is a single code point, it needs to be represented by three UTF8 code. 177 if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) { 178 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE)); 179 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); 180 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); 181 return {UtfLength::THREE, {ch0, ch1, ch2}}; 182 } 183 184 if (d0 == 0) { 185 if (isWriteBuffer) { 186 return {1, {0x00U}}; 187 } 188 if (modify) { 189 // special case for \u0000 ==> C080 - 1100'0000 1000'0000 190 return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}}; 191 } 192 // For print string, just skip '\u0000' 193 return {0, {0x00U}}; 194 } 195 if (d0 <= UTF8_1B_MAX) { 196 return {UtfLength::ONE, {static_cast<uint8_t>(d0)}}; 197 } 198 if (d0 <= UTF8_2B_MAX) { 199 auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX)); 200 auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT)); 201 return {UtfLength::TWO, {ch0, ch1}}; 202 } 203 if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) { 204 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE)); 205 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); 206 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); 207 return {UtfLength::THREE, {ch0, ch1, ch2}}; 208 } 209 if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) { 210 // Bad sequence 211 LOG_ECMA(FATAL) << "this branch is unreachable"; 212 UNREACHABLE(); 213 } 214 215 uint32_t codePoint = CombineTwoU16(d0, d1); 216 217 auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST); 218 auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1); 219 auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1); 220 auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1); 221 return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}}; 222} 223 224size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8) 225{ 226 size_t res = 1; // zero byte 227 // when utf16 data length is only 1 and code in 0xd800-0xdfff, 228 // means that is a single code point, it needs to be represented by three UTF8 code. 229 if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 230 utf16[0] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 231 res += UtfLength::THREE; 232 return res; 233 } 234 235 for (uint32_t i = 0; i < length; ++i) { 236 if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 237 if (isGetBufferSize) { 238 res += UtfLength::ONE; 239 } else if (modify) { 240 res += UtfLength::TWO; // special case for U+0000 => C0 80 241 } 242 } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 243 res += 1; 244 } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 245 res += UtfLength::TWO; 246 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 247 } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) { 248 res += UtfLength::THREE; 249 } else { 250 if (!cesu8 && i < length - 1 && 251 utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 252 utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 253 res += UtfLength::FOUR; 254 ++i; 255 } else { 256 res += UtfLength::THREE; 257 } 258 } 259 } 260 return res; 261} 262 263size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, 264 size_t start, bool modify, bool isWriteBuffer, bool cesu8) 265{ 266 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { 267 return 0; 268 } 269 size_t utf8Pos = 0; 270 size_t end = start + utf16Len; 271 for (size_t i = start; i < end; ++i) { 272 uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8); 273 if (codepoint == 0) { 274 if (isWriteBuffer) { 275 utf8Out[utf8Pos++] = 0x00U; 276 continue; 277 } 278 if (modify) { 279 // special case for \u0000 ==> C080 - 1100'0000 1000'0000 280 utf8Out[utf8Pos++] = UTF8_2B_FIRST; 281 utf8Out[utf8Pos++] = UTF8_2B_SECOND; 282 } 283 continue; 284 } 285 utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos); 286 } 287 return utf8Pos; 288} 289 290size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, 291 size_t start, bool modify, bool isWriteBuffer) 292{ 293 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { 294 return 0; 295 } 296 size_t utf8Pos = 0; 297 size_t end = start + utf16Len; 298 for (size_t i = start; i < end; ++i) { 299 uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i); 300 if (codepoint == 0) { 301 if (isWriteBuffer) { 302 utf8Out[utf8Pos++] = 0x00U; 303 continue; 304 } 305 if (modify) { 306 // special case for \u0000 ==> C080 - 1100'0000 1000'0000 307 utf8Out[utf8Pos++] = UTF8_2B_FIRST; 308 utf8Out[utf8Pos++] = UTF8_2B_SECOND; 309 } 310 continue; 311 } 312 utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos); 313 } 314 return utf8Pos; 315} 316 317std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine) 318{ 319 uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 320 if ((d0 & utf::MASK1) == 0) { 321 return {d0, 1}; 322 } 323 324 uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 325 if ((d0 & utf::MASK2) == 0) { 326 return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO}; 327 } 328 329 uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 330 if ((d0 & utf::MASK3) == 0) { 331 return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | 332 (d2 & utf::MASK_6BIT), 333 UtfLength::THREE}; 334 } 335 336 uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 337 uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) | 338 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT); 339 340 uint32_t pair = 0; 341 if (combine) { 342 uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD); 343 uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; 344 pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail)); // NOLINTNEXTLINE(hicpp-signed-bitwise) 345 } else { 346 pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH; 347 pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; 348 } 349 350 return {pair, UtfLength::FOUR}; 351} 352 353// drop the tail bytes if the remain length can't fill the length it represents. 354static inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len) 355{ 356 size_t trimSize = 0; 357 if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) { 358 // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one. 359 trimSize = 1; 360 } 361 if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) { 362 // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two. 363 trimSize = CONST_2; 364 } 365 if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) { 366 // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three. 367 trimSize = CONST_3; 368 } 369 return utf8Len - trimSize; 370} 371 372size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len) 373{ 374 size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len); 375 size_t in_pos = 0; 376 size_t res = 0; 377 while (in_pos < safeUtf8Len) { 378 uint8_t src = utf8[in_pos]; 379 switch (src & 0xF0) { 380 case 0xF0: { 381 const uint8_t c2 = utf8[++in_pos]; 382 const uint8_t c3 = utf8[++in_pos]; 383 const uint8_t c4 = utf8[++in_pos]; 384 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) | 385 ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS); 386 if (codePoint >= SURROGATE_RAIR_START) { 387 res += CONST_2; 388 } else { 389 res++; 390 } 391 in_pos++; 392 break; 393 } 394 case 0xE0: { 395 in_pos += CONST_3; 396 res++; 397 break; 398 } 399 case 0xD0: 400 case 0xC0: { 401 in_pos += CONST_2; 402 res++; 403 break; 404 } 405 default: 406 do { 407 in_pos++; 408 res++; 409 } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80); 410 break; 411 } 412 } 413 // The remain chars should be treated as single byte char. 414 res += utf8Len - in_pos; 415 return res; 416} 417 418size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len) 419{ 420 size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len); 421 size_t in_pos = 0; 422 size_t out_pos = 0; 423 while (in_pos < safeUtf8Len && out_pos < utf16Len) { 424 uint8_t src = utf8In[in_pos]; 425 switch (src & 0xF0) { 426 case 0xF0: { 427 const uint8_t c2 = utf8In[++in_pos]; 428 const uint8_t c3 = utf8In[++in_pos]; 429 const uint8_t c4 = utf8In[++in_pos]; 430 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) | 431 ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS); 432 if (codePoint >= SURROGATE_RAIR_START) { 433 ASSERT(utf16Len >= 1); 434 if (out_pos >= utf16Len - 1) { 435 return out_pos; 436 } 437 codePoint -= SURROGATE_RAIR_START; 438 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START); 439 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START); 440 } else { 441 utf16Out[out_pos++] = static_cast<uint16_t>(codePoint); 442 } 443 in_pos++; 444 break; 445 } 446 case 0xE0: { 447 const uint8_t c2 = utf8In[++in_pos]; 448 const uint8_t c3 = utf8In[++in_pos]; 449 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) | 450 ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS)); 451 in_pos++; 452 break; 453 } 454 case 0xD0: 455 case 0xC0: { 456 const uint8_t c2 = utf8In[++in_pos]; 457 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS)); 458 in_pos++; 459 break; 460 } 461 default: 462 do { 463 utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]); 464 } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80); 465 break; 466 } 467 } 468 // The remain chars should be treated as single byte char. 469 while (in_pos < utf8Len && out_pos < utf16Len) { 470 utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]); 471 } 472 return out_pos; 473} 474 475size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len) 476{ 477 if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) { 478 return 0; 479 } 480 size_t latin1Pos = 0; 481 size_t end = utf16Len; 482 for (size_t i = 0; i < end; ++i) { 483 if (latin1Pos == latin1Len) { 484 break; 485 } 486 uint32_t codepoint = DecodeUTF16(utf16In, end, &i); 487 uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit); 488 latin1Out[latin1Pos++] = latin1Code; 489 } 490 return latin1Pos; 491} 492 493std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen) 494{ 495 if (maxLen == 0) { 496 return {INVALID_UTF8, 0}; 497 } 498 Span<const uint8_t> sp(utf8, maxLen); 499 // one byte 500 uint8_t d0 = sp[0]; 501 if ((d0 & BIT_MASK_1) == 0) { 502 return {d0, UtfLength::ONE}; 503 } 504 if (maxLen < UtfLength::TWO) { 505 return {INVALID_UTF8, 0}; 506 } 507 // two bytes 508 uint8_t d1 = sp[UtfLength::ONE]; 509 if ((d0 & BIT_MASK_3) == BIT_MASK_2) { 510 if ((d1 & BIT_MASK_2) == BIT_MASK_1) { 511 return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO}; 512 } else { 513 return {INVALID_UTF8, 0}; 514 } 515 } 516 if (maxLen < UtfLength::THREE) { 517 return {INVALID_UTF8, 0}; 518 } 519 // three bytes 520 uint8_t d2 = sp[UtfLength::TWO]; 521 if ((d0 & BIT_MASK_4) == BIT_MASK_3) { 522 if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) { 523 return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | 524 ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d2 & utf::MASK_6BIT), UtfLength::THREE}; 525 } else { 526 return {INVALID_UTF8, 0}; 527 } 528 } 529 if (maxLen < UtfLength::FOUR) { 530 return {INVALID_UTF8, 0}; 531 } 532 // four bytes 533 uint8_t d3 = sp[UtfLength::THREE]; 534 if ((d0 & BIT_MASK_5) == BIT_MASK_4) { 535 if (((d1 & BIT_MASK_2) == BIT_MASK_1) && 536 ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) { 537 return {((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) | 538 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT), UtfLength::FOUR}; 539 } else { 540 return {INVALID_UTF8, 0}; 541 } 542 } 543 return {INVALID_UTF8, 0}; 544} 545} // namespace panda::ecmascript::base::utf_helper 546