14514f5e3Sopenharmony_ci/* 24514f5e3Sopenharmony_ci * Copyright (c) 2021 Huawei Device Co., Ltd. 34514f5e3Sopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License"); 44514f5e3Sopenharmony_ci * you may not use this file except in compliance with the License. 54514f5e3Sopenharmony_ci * You may obtain a copy of the License at 64514f5e3Sopenharmony_ci * 74514f5e3Sopenharmony_ci * http://www.apache.org/licenses/LICENSE-2.0 84514f5e3Sopenharmony_ci * 94514f5e3Sopenharmony_ci * Unless required by applicable law or agreed to in writing, software 104514f5e3Sopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS, 114514f5e3Sopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 124514f5e3Sopenharmony_ci * See the License for the specific language governing permissions and 134514f5e3Sopenharmony_ci * limitations under the License. 144514f5e3Sopenharmony_ci */ 154514f5e3Sopenharmony_ci 164514f5e3Sopenharmony_ci#include "ecmascript/base/utf_helper.h" 174514f5e3Sopenharmony_ci 184514f5e3Sopenharmony_ci#include "ecmascript/log_wrapper.h" 194514f5e3Sopenharmony_ci 204514f5e3Sopenharmony_ci// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) 214514f5e3Sopenharmony_cistatic constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000; 224514f5e3Sopenharmony_ci// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) 234514f5e3Sopenharmony_ci#define U16_GET_SUPPLEMENTARY(lead, trail) \ 244514f5e3Sopenharmony_ci ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET) 254514f5e3Sopenharmony_ci 264514f5e3Sopenharmony_cinamespace panda::ecmascript::base::utf_helper { 274514f5e3Sopenharmony_ci 284514f5e3Sopenharmony_ciuint32_t UTF16Decode(uint16_t lead, uint16_t trail) 294514f5e3Sopenharmony_ci{ 304514f5e3Sopenharmony_ci ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) && 314514f5e3Sopenharmony_ci (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH)); 324514f5e3Sopenharmony_ci uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 334514f5e3Sopenharmony_ci return cp; 344514f5e3Sopenharmony_ci} 354514f5e3Sopenharmony_ci 364514f5e3Sopenharmony_cibool IsUTF16HighSurrogate(uint16_t ch) 374514f5e3Sopenharmony_ci{ 384514f5e3Sopenharmony_ci return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH; 394514f5e3Sopenharmony_ci} 404514f5e3Sopenharmony_ci 414514f5e3Sopenharmony_cibool IsUTF16LowSurrogate(uint16_t ch) 424514f5e3Sopenharmony_ci{ 434514f5e3Sopenharmony_ci return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH; 444514f5e3Sopenharmony_ci} 454514f5e3Sopenharmony_ci 464514f5e3Sopenharmony_ci// Methods for decode utf16 to unicode 474514f5e3Sopenharmony_ciuint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8) 484514f5e3Sopenharmony_ci{ 494514f5e3Sopenharmony_ci uint16_t high = utf16[*index]; 504514f5e3Sopenharmony_ci if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) { 514514f5e3Sopenharmony_ci return high; 524514f5e3Sopenharmony_ci } 534514f5e3Sopenharmony_ci uint16_t low = utf16[*index + 1]; 544514f5e3Sopenharmony_ci if (!IsUTF16LowSurrogate(low) || cesu8) { 554514f5e3Sopenharmony_ci return high; 564514f5e3Sopenharmony_ci } 574514f5e3Sopenharmony_ci (*index)++; 584514f5e3Sopenharmony_ci return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 594514f5e3Sopenharmony_ci} 604514f5e3Sopenharmony_ci 614514f5e3Sopenharmony_ciuint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index) 624514f5e3Sopenharmony_ci{ 634514f5e3Sopenharmony_ci uint16_t first = utf16[*index]; 644514f5e3Sopenharmony_ci // A valid surrogate pair should always start with a High Surrogate 654514f5e3Sopenharmony_ci if (IsUTF16LowSurrogate(first)) { 664514f5e3Sopenharmony_ci return UTF16_REPLACEMENT_CHARACTER; 674514f5e3Sopenharmony_ci } 684514f5e3Sopenharmony_ci if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) { 694514f5e3Sopenharmony_ci if (*index == len - 1) { 704514f5e3Sopenharmony_ci // A High surrogate not paired with another surrogate 714514f5e3Sopenharmony_ci return UTF16_REPLACEMENT_CHARACTER; 724514f5e3Sopenharmony_ci } 734514f5e3Sopenharmony_ci uint16_t second = utf16[*index + 1]; 744514f5e3Sopenharmony_ci if (!IsUTF16LowSurrogate(second)) { 754514f5e3Sopenharmony_ci // A High surrogate not followed by a low surrogate 764514f5e3Sopenharmony_ci return UTF16_REPLACEMENT_CHARACTER; 774514f5e3Sopenharmony_ci } 784514f5e3Sopenharmony_ci // A valid surrogate pair, decode normally 794514f5e3Sopenharmony_ci (*index)++; 804514f5e3Sopenharmony_ci return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 814514f5e3Sopenharmony_ci } 824514f5e3Sopenharmony_ci // A unicode not fallen into the range of representing by surrogate pair, return as it is 834514f5e3Sopenharmony_ci return first; 844514f5e3Sopenharmony_ci} 854514f5e3Sopenharmony_ci 864514f5e3Sopenharmony_ciinline size_t UTF8Length(uint32_t codepoint) 874514f5e3Sopenharmony_ci{ 884514f5e3Sopenharmony_ci if (codepoint <= UTF8_1B_MAX) { 894514f5e3Sopenharmony_ci return UtfLength::ONE; 904514f5e3Sopenharmony_ci } 914514f5e3Sopenharmony_ci if (codepoint <= UTF8_2B_MAX) { 924514f5e3Sopenharmony_ci return UtfLength::TWO; 934514f5e3Sopenharmony_ci } 944514f5e3Sopenharmony_ci if (codepoint <= UTF8_3B_MAX) { 954514f5e3Sopenharmony_ci return UtfLength::THREE; 964514f5e3Sopenharmony_ci } 974514f5e3Sopenharmony_ci return UtfLength::FOUR; 984514f5e3Sopenharmony_ci} 994514f5e3Sopenharmony_ci 1004514f5e3Sopenharmony_ci// Methods for encode unicode to unicode 1014514f5e3Sopenharmony_cisize_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index) 1024514f5e3Sopenharmony_ci{ 1034514f5e3Sopenharmony_ci size_t size = UTF8Length(codepoint); 1044514f5e3Sopenharmony_ci if (index + size > len) { 1054514f5e3Sopenharmony_ci return 0; 1064514f5e3Sopenharmony_ci } 1074514f5e3Sopenharmony_ci for (size_t j = size - 1; j > 0; j--) { 1084514f5e3Sopenharmony_ci uint8_t cont = ((codepoint | byteMark) & byteMask); 1094514f5e3Sopenharmony_ci utf8[index + j] = cont; 1104514f5e3Sopenharmony_ci codepoint >>= UTF8_OFFSET; 1114514f5e3Sopenharmony_ci } 1124514f5e3Sopenharmony_ci utf8[index] = codepoint | firstByteMark[size]; 1134514f5e3Sopenharmony_ci return size; 1144514f5e3Sopenharmony_ci} 1154514f5e3Sopenharmony_ci 1164514f5e3Sopenharmony_cibool IsValidUTF8(const std::vector<uint8_t> &data) 1174514f5e3Sopenharmony_ci{ 1184514f5e3Sopenharmony_ci uint32_t length = data.size(); 1194514f5e3Sopenharmony_ci switch (length) { 1204514f5e3Sopenharmony_ci case UtfLength::ONE: 1214514f5e3Sopenharmony_ci if (data.at(0) >= BIT_MASK_1) { 1224514f5e3Sopenharmony_ci return false; 1234514f5e3Sopenharmony_ci } 1244514f5e3Sopenharmony_ci break; 1254514f5e3Sopenharmony_ci case UtfLength::TWO: 1264514f5e3Sopenharmony_ci if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) { 1274514f5e3Sopenharmony_ci return false; 1284514f5e3Sopenharmony_ci } 1294514f5e3Sopenharmony_ci if (data.at(0) < UTF8_2B_FIRST_MIN) { 1304514f5e3Sopenharmony_ci return false; 1314514f5e3Sopenharmony_ci } 1324514f5e3Sopenharmony_ci break; 1334514f5e3Sopenharmony_ci case UtfLength::THREE: 1344514f5e3Sopenharmony_ci if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) { 1354514f5e3Sopenharmony_ci return false; 1364514f5e3Sopenharmony_ci } 1374514f5e3Sopenharmony_ci if (data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN) { 1384514f5e3Sopenharmony_ci return false; 1394514f5e3Sopenharmony_ci } 1404514f5e3Sopenharmony_ci // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs, corresponds to %ED%A0%80~%ED%BF%BF 1414514f5e3Sopenharmony_ci if (data.at(0) == UTF8_3B_RESERVED_FIRST && data.at(1) >= UTF8_3B_RESERVED_SECOND_MIN && 1424514f5e3Sopenharmony_ci data.at(1) <= UTF8_3B_RESERVED_SECOND_MAX) { 1434514f5e3Sopenharmony_ci return false; 1444514f5e3Sopenharmony_ci } 1454514f5e3Sopenharmony_ci break; 1464514f5e3Sopenharmony_ci case UtfLength::FOUR: 1474514f5e3Sopenharmony_ci if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) { 1484514f5e3Sopenharmony_ci return false; 1494514f5e3Sopenharmony_ci } 1504514f5e3Sopenharmony_ci if (data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN) { 1514514f5e3Sopenharmony_ci return false; 1524514f5e3Sopenharmony_ci } 1534514f5e3Sopenharmony_ci // max four length binary: 11110(100) 10(001111) 10(111111) 10(111111), max data[0] is 0xF4, data[1] is 0x8F 1544514f5e3Sopenharmony_ci if (data.at(0) > UTF8_4B_FIRST_MAX || 1554514f5e3Sopenharmony_ci (data.at(0) == UTF8_4B_FIRST_MAX && data.at(1) > UTF8_4B_SECOND_MAX)) { 1564514f5e3Sopenharmony_ci return false; 1574514f5e3Sopenharmony_ci } 1584514f5e3Sopenharmony_ci break; 1594514f5e3Sopenharmony_ci default: 1604514f5e3Sopenharmony_ci LOG_ECMA(FATAL) << "this branch is unreachable"; 1614514f5e3Sopenharmony_ci UNREACHABLE(); 1624514f5e3Sopenharmony_ci break; 1634514f5e3Sopenharmony_ci } 1644514f5e3Sopenharmony_ci 1654514f5e3Sopenharmony_ci for (uint32_t i = 1; i < length; i++) { 1664514f5e3Sopenharmony_ci if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) { 1674514f5e3Sopenharmony_ci return false; 1684514f5e3Sopenharmony_ci } 1694514f5e3Sopenharmony_ci } 1704514f5e3Sopenharmony_ci return true; 1714514f5e3Sopenharmony_ci} 1724514f5e3Sopenharmony_ci 1734514f5e3Sopenharmony_ciUtf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer) 1744514f5e3Sopenharmony_ci{ 1754514f5e3Sopenharmony_ci // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0, 1764514f5e3Sopenharmony_ci // means that is a single code point, it needs to be represented by three UTF8 code. 1774514f5e3Sopenharmony_ci if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) { 1784514f5e3Sopenharmony_ci auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE)); 1794514f5e3Sopenharmony_ci auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); 1804514f5e3Sopenharmony_ci auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); 1814514f5e3Sopenharmony_ci return {UtfLength::THREE, {ch0, ch1, ch2}}; 1824514f5e3Sopenharmony_ci } 1834514f5e3Sopenharmony_ci 1844514f5e3Sopenharmony_ci if (d0 == 0) { 1854514f5e3Sopenharmony_ci if (isWriteBuffer) { 1864514f5e3Sopenharmony_ci return {1, {0x00U}}; 1874514f5e3Sopenharmony_ci } 1884514f5e3Sopenharmony_ci if (modify) { 1894514f5e3Sopenharmony_ci // special case for \u0000 ==> C080 - 1100'0000 1000'0000 1904514f5e3Sopenharmony_ci return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}}; 1914514f5e3Sopenharmony_ci } 1924514f5e3Sopenharmony_ci // For print string, just skip '\u0000' 1934514f5e3Sopenharmony_ci return {0, {0x00U}}; 1944514f5e3Sopenharmony_ci } 1954514f5e3Sopenharmony_ci if (d0 <= UTF8_1B_MAX) { 1964514f5e3Sopenharmony_ci return {UtfLength::ONE, {static_cast<uint8_t>(d0)}}; 1974514f5e3Sopenharmony_ci } 1984514f5e3Sopenharmony_ci if (d0 <= UTF8_2B_MAX) { 1994514f5e3Sopenharmony_ci auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX)); 2004514f5e3Sopenharmony_ci auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT)); 2014514f5e3Sopenharmony_ci return {UtfLength::TWO, {ch0, ch1}}; 2024514f5e3Sopenharmony_ci } 2034514f5e3Sopenharmony_ci if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) { 2044514f5e3Sopenharmony_ci auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE)); 2054514f5e3Sopenharmony_ci auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); 2064514f5e3Sopenharmony_ci auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); 2074514f5e3Sopenharmony_ci return {UtfLength::THREE, {ch0, ch1, ch2}}; 2084514f5e3Sopenharmony_ci } 2094514f5e3Sopenharmony_ci if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) { 2104514f5e3Sopenharmony_ci // Bad sequence 2114514f5e3Sopenharmony_ci LOG_ECMA(FATAL) << "this branch is unreachable"; 2124514f5e3Sopenharmony_ci UNREACHABLE(); 2134514f5e3Sopenharmony_ci } 2144514f5e3Sopenharmony_ci 2154514f5e3Sopenharmony_ci uint32_t codePoint = CombineTwoU16(d0, d1); 2164514f5e3Sopenharmony_ci 2174514f5e3Sopenharmony_ci auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST); 2184514f5e3Sopenharmony_ci auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1); 2194514f5e3Sopenharmony_ci auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1); 2204514f5e3Sopenharmony_ci auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1); 2214514f5e3Sopenharmony_ci return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}}; 2224514f5e3Sopenharmony_ci} 2234514f5e3Sopenharmony_ci 2244514f5e3Sopenharmony_cisize_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8) 2254514f5e3Sopenharmony_ci{ 2264514f5e3Sopenharmony_ci size_t res = 1; // zero byte 2274514f5e3Sopenharmony_ci // when utf16 data length is only 1 and code in 0xd800-0xdfff, 2284514f5e3Sopenharmony_ci // means that is a single code point, it needs to be represented by three UTF8 code. 2294514f5e3Sopenharmony_ci if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2304514f5e3Sopenharmony_ci utf16[0] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2314514f5e3Sopenharmony_ci res += UtfLength::THREE; 2324514f5e3Sopenharmony_ci return res; 2334514f5e3Sopenharmony_ci } 2344514f5e3Sopenharmony_ci 2354514f5e3Sopenharmony_ci for (uint32_t i = 0; i < length; ++i) { 2364514f5e3Sopenharmony_ci if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2374514f5e3Sopenharmony_ci if (isGetBufferSize) { 2384514f5e3Sopenharmony_ci res += UtfLength::ONE; 2394514f5e3Sopenharmony_ci } else if (modify) { 2404514f5e3Sopenharmony_ci res += UtfLength::TWO; // special case for U+0000 => C0 80 2414514f5e3Sopenharmony_ci } 2424514f5e3Sopenharmony_ci } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2434514f5e3Sopenharmony_ci res += 1; 2444514f5e3Sopenharmony_ci } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2454514f5e3Sopenharmony_ci res += UtfLength::TWO; 2464514f5e3Sopenharmony_ci // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2474514f5e3Sopenharmony_ci } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) { 2484514f5e3Sopenharmony_ci res += UtfLength::THREE; 2494514f5e3Sopenharmony_ci } else { 2504514f5e3Sopenharmony_ci if (!cesu8 && i < length - 1 && 2514514f5e3Sopenharmony_ci utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2524514f5e3Sopenharmony_ci utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 2534514f5e3Sopenharmony_ci res += UtfLength::FOUR; 2544514f5e3Sopenharmony_ci ++i; 2554514f5e3Sopenharmony_ci } else { 2564514f5e3Sopenharmony_ci res += UtfLength::THREE; 2574514f5e3Sopenharmony_ci } 2584514f5e3Sopenharmony_ci } 2594514f5e3Sopenharmony_ci } 2604514f5e3Sopenharmony_ci return res; 2614514f5e3Sopenharmony_ci} 2624514f5e3Sopenharmony_ci 2634514f5e3Sopenharmony_cisize_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, 2644514f5e3Sopenharmony_ci size_t start, bool modify, bool isWriteBuffer, bool cesu8) 2654514f5e3Sopenharmony_ci{ 2664514f5e3Sopenharmony_ci if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { 2674514f5e3Sopenharmony_ci return 0; 2684514f5e3Sopenharmony_ci } 2694514f5e3Sopenharmony_ci size_t utf8Pos = 0; 2704514f5e3Sopenharmony_ci size_t end = start + utf16Len; 2714514f5e3Sopenharmony_ci for (size_t i = start; i < end; ++i) { 2724514f5e3Sopenharmony_ci uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8); 2734514f5e3Sopenharmony_ci if (codepoint == 0) { 2744514f5e3Sopenharmony_ci if (isWriteBuffer) { 2754514f5e3Sopenharmony_ci utf8Out[utf8Pos++] = 0x00U; 2764514f5e3Sopenharmony_ci continue; 2774514f5e3Sopenharmony_ci } 2784514f5e3Sopenharmony_ci if (modify) { 2794514f5e3Sopenharmony_ci // special case for \u0000 ==> C080 - 1100'0000 1000'0000 2804514f5e3Sopenharmony_ci utf8Out[utf8Pos++] = UTF8_2B_FIRST; 2814514f5e3Sopenharmony_ci utf8Out[utf8Pos++] = UTF8_2B_SECOND; 2824514f5e3Sopenharmony_ci } 2834514f5e3Sopenharmony_ci continue; 2844514f5e3Sopenharmony_ci } 2854514f5e3Sopenharmony_ci utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos); 2864514f5e3Sopenharmony_ci } 2874514f5e3Sopenharmony_ci return utf8Pos; 2884514f5e3Sopenharmony_ci} 2894514f5e3Sopenharmony_ci 2904514f5e3Sopenharmony_cisize_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, 2914514f5e3Sopenharmony_ci size_t start, bool modify, bool isWriteBuffer) 2924514f5e3Sopenharmony_ci{ 2934514f5e3Sopenharmony_ci if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { 2944514f5e3Sopenharmony_ci return 0; 2954514f5e3Sopenharmony_ci } 2964514f5e3Sopenharmony_ci size_t utf8Pos = 0; 2974514f5e3Sopenharmony_ci size_t end = start + utf16Len; 2984514f5e3Sopenharmony_ci for (size_t i = start; i < end; ++i) { 2994514f5e3Sopenharmony_ci uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i); 3004514f5e3Sopenharmony_ci if (codepoint == 0) { 3014514f5e3Sopenharmony_ci if (isWriteBuffer) { 3024514f5e3Sopenharmony_ci utf8Out[utf8Pos++] = 0x00U; 3034514f5e3Sopenharmony_ci continue; 3044514f5e3Sopenharmony_ci } 3054514f5e3Sopenharmony_ci if (modify) { 3064514f5e3Sopenharmony_ci // special case for \u0000 ==> C080 - 1100'0000 1000'0000 3074514f5e3Sopenharmony_ci utf8Out[utf8Pos++] = UTF8_2B_FIRST; 3084514f5e3Sopenharmony_ci utf8Out[utf8Pos++] = UTF8_2B_SECOND; 3094514f5e3Sopenharmony_ci } 3104514f5e3Sopenharmony_ci continue; 3114514f5e3Sopenharmony_ci } 3124514f5e3Sopenharmony_ci utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos); 3134514f5e3Sopenharmony_ci } 3144514f5e3Sopenharmony_ci return utf8Pos; 3154514f5e3Sopenharmony_ci} 3164514f5e3Sopenharmony_ci 3174514f5e3Sopenharmony_cistd::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine) 3184514f5e3Sopenharmony_ci{ 3194514f5e3Sopenharmony_ci uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 3204514f5e3Sopenharmony_ci if ((d0 & utf::MASK1) == 0) { 3214514f5e3Sopenharmony_ci return {d0, 1}; 3224514f5e3Sopenharmony_ci } 3234514f5e3Sopenharmony_ci 3244514f5e3Sopenharmony_ci uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 3254514f5e3Sopenharmony_ci if ((d0 & utf::MASK2) == 0) { 3264514f5e3Sopenharmony_ci return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO}; 3274514f5e3Sopenharmony_ci } 3284514f5e3Sopenharmony_ci 3294514f5e3Sopenharmony_ci uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 3304514f5e3Sopenharmony_ci if ((d0 & utf::MASK3) == 0) { 3314514f5e3Sopenharmony_ci return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | 3324514f5e3Sopenharmony_ci (d2 & utf::MASK_6BIT), 3334514f5e3Sopenharmony_ci UtfLength::THREE}; 3344514f5e3Sopenharmony_ci } 3354514f5e3Sopenharmony_ci 3364514f5e3Sopenharmony_ci uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 3374514f5e3Sopenharmony_ci uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) | 3384514f5e3Sopenharmony_ci ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT); 3394514f5e3Sopenharmony_ci 3404514f5e3Sopenharmony_ci uint32_t pair = 0; 3414514f5e3Sopenharmony_ci if (combine) { 3424514f5e3Sopenharmony_ci uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD); 3434514f5e3Sopenharmony_ci uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; 3444514f5e3Sopenharmony_ci pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail)); // NOLINTNEXTLINE(hicpp-signed-bitwise) 3454514f5e3Sopenharmony_ci } else { 3464514f5e3Sopenharmony_ci pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH; 3474514f5e3Sopenharmony_ci pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; 3484514f5e3Sopenharmony_ci } 3494514f5e3Sopenharmony_ci 3504514f5e3Sopenharmony_ci return {pair, UtfLength::FOUR}; 3514514f5e3Sopenharmony_ci} 3524514f5e3Sopenharmony_ci 3534514f5e3Sopenharmony_ci// drop the tail bytes if the remain length can't fill the length it represents. 3544514f5e3Sopenharmony_cistatic inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len) 3554514f5e3Sopenharmony_ci{ 3564514f5e3Sopenharmony_ci size_t trimSize = 0; 3574514f5e3Sopenharmony_ci if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) { 3584514f5e3Sopenharmony_ci // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one. 3594514f5e3Sopenharmony_ci trimSize = 1; 3604514f5e3Sopenharmony_ci } 3614514f5e3Sopenharmony_ci if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) { 3624514f5e3Sopenharmony_ci // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two. 3634514f5e3Sopenharmony_ci trimSize = CONST_2; 3644514f5e3Sopenharmony_ci } 3654514f5e3Sopenharmony_ci if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) { 3664514f5e3Sopenharmony_ci // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three. 3674514f5e3Sopenharmony_ci trimSize = CONST_3; 3684514f5e3Sopenharmony_ci } 3694514f5e3Sopenharmony_ci return utf8Len - trimSize; 3704514f5e3Sopenharmony_ci} 3714514f5e3Sopenharmony_ci 3724514f5e3Sopenharmony_cisize_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len) 3734514f5e3Sopenharmony_ci{ 3744514f5e3Sopenharmony_ci size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len); 3754514f5e3Sopenharmony_ci size_t in_pos = 0; 3764514f5e3Sopenharmony_ci size_t res = 0; 3774514f5e3Sopenharmony_ci while (in_pos < safeUtf8Len) { 3784514f5e3Sopenharmony_ci uint8_t src = utf8[in_pos]; 3794514f5e3Sopenharmony_ci switch (src & 0xF0) { 3804514f5e3Sopenharmony_ci case 0xF0: { 3814514f5e3Sopenharmony_ci const uint8_t c2 = utf8[++in_pos]; 3824514f5e3Sopenharmony_ci const uint8_t c3 = utf8[++in_pos]; 3834514f5e3Sopenharmony_ci const uint8_t c4 = utf8[++in_pos]; 3844514f5e3Sopenharmony_ci uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) | 3854514f5e3Sopenharmony_ci ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS); 3864514f5e3Sopenharmony_ci if (codePoint >= SURROGATE_RAIR_START) { 3874514f5e3Sopenharmony_ci res += CONST_2; 3884514f5e3Sopenharmony_ci } else { 3894514f5e3Sopenharmony_ci res++; 3904514f5e3Sopenharmony_ci } 3914514f5e3Sopenharmony_ci in_pos++; 3924514f5e3Sopenharmony_ci break; 3934514f5e3Sopenharmony_ci } 3944514f5e3Sopenharmony_ci case 0xE0: { 3954514f5e3Sopenharmony_ci in_pos += CONST_3; 3964514f5e3Sopenharmony_ci res++; 3974514f5e3Sopenharmony_ci break; 3984514f5e3Sopenharmony_ci } 3994514f5e3Sopenharmony_ci case 0xD0: 4004514f5e3Sopenharmony_ci case 0xC0: { 4014514f5e3Sopenharmony_ci in_pos += CONST_2; 4024514f5e3Sopenharmony_ci res++; 4034514f5e3Sopenharmony_ci break; 4044514f5e3Sopenharmony_ci } 4054514f5e3Sopenharmony_ci default: 4064514f5e3Sopenharmony_ci do { 4074514f5e3Sopenharmony_ci in_pos++; 4084514f5e3Sopenharmony_ci res++; 4094514f5e3Sopenharmony_ci } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80); 4104514f5e3Sopenharmony_ci break; 4114514f5e3Sopenharmony_ci } 4124514f5e3Sopenharmony_ci } 4134514f5e3Sopenharmony_ci // The remain chars should be treated as single byte char. 4144514f5e3Sopenharmony_ci res += utf8Len - in_pos; 4154514f5e3Sopenharmony_ci return res; 4164514f5e3Sopenharmony_ci} 4174514f5e3Sopenharmony_ci 4184514f5e3Sopenharmony_cisize_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len) 4194514f5e3Sopenharmony_ci{ 4204514f5e3Sopenharmony_ci size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len); 4214514f5e3Sopenharmony_ci size_t in_pos = 0; 4224514f5e3Sopenharmony_ci size_t out_pos = 0; 4234514f5e3Sopenharmony_ci while (in_pos < safeUtf8Len && out_pos < utf16Len) { 4244514f5e3Sopenharmony_ci uint8_t src = utf8In[in_pos]; 4254514f5e3Sopenharmony_ci switch (src & 0xF0) { 4264514f5e3Sopenharmony_ci case 0xF0: { 4274514f5e3Sopenharmony_ci const uint8_t c2 = utf8In[++in_pos]; 4284514f5e3Sopenharmony_ci const uint8_t c3 = utf8In[++in_pos]; 4294514f5e3Sopenharmony_ci const uint8_t c4 = utf8In[++in_pos]; 4304514f5e3Sopenharmony_ci uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) | 4314514f5e3Sopenharmony_ci ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS); 4324514f5e3Sopenharmony_ci if (codePoint >= SURROGATE_RAIR_START) { 4334514f5e3Sopenharmony_ci ASSERT(utf16Len >= 1); 4344514f5e3Sopenharmony_ci if (out_pos >= utf16Len - 1) { 4354514f5e3Sopenharmony_ci return out_pos; 4364514f5e3Sopenharmony_ci } 4374514f5e3Sopenharmony_ci codePoint -= SURROGATE_RAIR_START; 4384514f5e3Sopenharmony_ci utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START); 4394514f5e3Sopenharmony_ci utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START); 4404514f5e3Sopenharmony_ci } else { 4414514f5e3Sopenharmony_ci utf16Out[out_pos++] = static_cast<uint16_t>(codePoint); 4424514f5e3Sopenharmony_ci } 4434514f5e3Sopenharmony_ci in_pos++; 4444514f5e3Sopenharmony_ci break; 4454514f5e3Sopenharmony_ci } 4464514f5e3Sopenharmony_ci case 0xE0: { 4474514f5e3Sopenharmony_ci const uint8_t c2 = utf8In[++in_pos]; 4484514f5e3Sopenharmony_ci const uint8_t c3 = utf8In[++in_pos]; 4494514f5e3Sopenharmony_ci utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) | 4504514f5e3Sopenharmony_ci ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS)); 4514514f5e3Sopenharmony_ci in_pos++; 4524514f5e3Sopenharmony_ci break; 4534514f5e3Sopenharmony_ci } 4544514f5e3Sopenharmony_ci case 0xD0: 4554514f5e3Sopenharmony_ci case 0xC0: { 4564514f5e3Sopenharmony_ci const uint8_t c2 = utf8In[++in_pos]; 4574514f5e3Sopenharmony_ci utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS)); 4584514f5e3Sopenharmony_ci in_pos++; 4594514f5e3Sopenharmony_ci break; 4604514f5e3Sopenharmony_ci } 4614514f5e3Sopenharmony_ci default: 4624514f5e3Sopenharmony_ci do { 4634514f5e3Sopenharmony_ci utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]); 4644514f5e3Sopenharmony_ci } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80); 4654514f5e3Sopenharmony_ci break; 4664514f5e3Sopenharmony_ci } 4674514f5e3Sopenharmony_ci } 4684514f5e3Sopenharmony_ci // The remain chars should be treated as single byte char. 4694514f5e3Sopenharmony_ci while (in_pos < utf8Len && out_pos < utf16Len) { 4704514f5e3Sopenharmony_ci utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]); 4714514f5e3Sopenharmony_ci } 4724514f5e3Sopenharmony_ci return out_pos; 4734514f5e3Sopenharmony_ci} 4744514f5e3Sopenharmony_ci 4754514f5e3Sopenharmony_cisize_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len) 4764514f5e3Sopenharmony_ci{ 4774514f5e3Sopenharmony_ci if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) { 4784514f5e3Sopenharmony_ci return 0; 4794514f5e3Sopenharmony_ci } 4804514f5e3Sopenharmony_ci size_t latin1Pos = 0; 4814514f5e3Sopenharmony_ci size_t end = utf16Len; 4824514f5e3Sopenharmony_ci for (size_t i = 0; i < end; ++i) { 4834514f5e3Sopenharmony_ci if (latin1Pos == latin1Len) { 4844514f5e3Sopenharmony_ci break; 4854514f5e3Sopenharmony_ci } 4864514f5e3Sopenharmony_ci uint32_t codepoint = DecodeUTF16(utf16In, end, &i); 4874514f5e3Sopenharmony_ci uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit); 4884514f5e3Sopenharmony_ci latin1Out[latin1Pos++] = latin1Code; 4894514f5e3Sopenharmony_ci } 4904514f5e3Sopenharmony_ci return latin1Pos; 4914514f5e3Sopenharmony_ci} 4924514f5e3Sopenharmony_ci 4934514f5e3Sopenharmony_cistd::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen) 4944514f5e3Sopenharmony_ci{ 4954514f5e3Sopenharmony_ci if (maxLen == 0) { 4964514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 4974514f5e3Sopenharmony_ci } 4984514f5e3Sopenharmony_ci Span<const uint8_t> sp(utf8, maxLen); 4994514f5e3Sopenharmony_ci // one byte 5004514f5e3Sopenharmony_ci uint8_t d0 = sp[0]; 5014514f5e3Sopenharmony_ci if ((d0 & BIT_MASK_1) == 0) { 5024514f5e3Sopenharmony_ci return {d0, UtfLength::ONE}; 5034514f5e3Sopenharmony_ci } 5044514f5e3Sopenharmony_ci if (maxLen < UtfLength::TWO) { 5054514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 5064514f5e3Sopenharmony_ci } 5074514f5e3Sopenharmony_ci // two bytes 5084514f5e3Sopenharmony_ci uint8_t d1 = sp[UtfLength::ONE]; 5094514f5e3Sopenharmony_ci if ((d0 & BIT_MASK_3) == BIT_MASK_2) { 5104514f5e3Sopenharmony_ci if ((d1 & BIT_MASK_2) == BIT_MASK_1) { 5114514f5e3Sopenharmony_ci return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO}; 5124514f5e3Sopenharmony_ci } else { 5134514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 5144514f5e3Sopenharmony_ci } 5154514f5e3Sopenharmony_ci } 5164514f5e3Sopenharmony_ci if (maxLen < UtfLength::THREE) { 5174514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 5184514f5e3Sopenharmony_ci } 5194514f5e3Sopenharmony_ci // three bytes 5204514f5e3Sopenharmony_ci uint8_t d2 = sp[UtfLength::TWO]; 5214514f5e3Sopenharmony_ci if ((d0 & BIT_MASK_4) == BIT_MASK_3) { 5224514f5e3Sopenharmony_ci if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) { 5234514f5e3Sopenharmony_ci return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | 5244514f5e3Sopenharmony_ci ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d2 & utf::MASK_6BIT), UtfLength::THREE}; 5254514f5e3Sopenharmony_ci } else { 5264514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 5274514f5e3Sopenharmony_ci } 5284514f5e3Sopenharmony_ci } 5294514f5e3Sopenharmony_ci if (maxLen < UtfLength::FOUR) { 5304514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 5314514f5e3Sopenharmony_ci } 5324514f5e3Sopenharmony_ci // four bytes 5334514f5e3Sopenharmony_ci uint8_t d3 = sp[UtfLength::THREE]; 5344514f5e3Sopenharmony_ci if ((d0 & BIT_MASK_5) == BIT_MASK_4) { 5354514f5e3Sopenharmony_ci if (((d1 & BIT_MASK_2) == BIT_MASK_1) && 5364514f5e3Sopenharmony_ci ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) { 5374514f5e3Sopenharmony_ci return {((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) | 5384514f5e3Sopenharmony_ci ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT), UtfLength::FOUR}; 5394514f5e3Sopenharmony_ci } else { 5404514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 5414514f5e3Sopenharmony_ci } 5424514f5e3Sopenharmony_ci } 5434514f5e3Sopenharmony_ci return {INVALID_UTF8, 0}; 5444514f5e3Sopenharmony_ci} 5454514f5e3Sopenharmony_ci} // namespace panda::ecmascript::base::utf_helper 546