123b3eb3cSopenharmony_ci/** 223b3eb3cSopenharmony_ci * Copyright (c) 2024 Huawei Device Co., Ltd. 323b3eb3cSopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License"); 423b3eb3cSopenharmony_ci * you may not use this file except in compliance with the License. 523b3eb3cSopenharmony_ci * You may obtain a copy of the License at 623b3eb3cSopenharmony_ci * 723b3eb3cSopenharmony_ci * http://www.apache.org/licenses/LICENSE-2.0 823b3eb3cSopenharmony_ci * 923b3eb3cSopenharmony_ci * Unless required by applicable law or agreed to in writing, software 1023b3eb3cSopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS, 1123b3eb3cSopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1223b3eb3cSopenharmony_ci * See the License for the specific language governing permissions and 1323b3eb3cSopenharmony_ci * limitations under the License. 1423b3eb3cSopenharmony_ci */ 1523b3eb3cSopenharmony_ci 1623b3eb3cSopenharmony_ci#include "utf.h" 1723b3eb3cSopenharmony_ci#include <memory> 1823b3eb3cSopenharmony_ci 1923b3eb3cSopenharmony_cinamespace OHOS::Ace { 2023b3eb3cSopenharmony_ci 2123b3eb3cSopenharmony_ci/* 2223b3eb3cSopenharmony_ci * MUtf-8 2323b3eb3cSopenharmony_ci * 2423b3eb3cSopenharmony_ci * U+0000 => C0 80 2523b3eb3cSopenharmony_ci * 2623b3eb3cSopenharmony_ci * N Bits for First Last Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6 2723b3eb3cSopenharmony_ci * code point code point code point 2823b3eb3cSopenharmony_ci * 1 7 U+0000 U+007F 0xxxxxxx 2923b3eb3cSopenharmony_ci * 2 11 U+0080 U+07FF 110xxxxx 10xxxxxx 3023b3eb3cSopenharmony_ci * 3 16 U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx 3123b3eb3cSopenharmony_ci * 6 21 U+10000 U+10FFFF 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx 3223b3eb3cSopenharmony_ci * for U+10000 -- U+10FFFF encodes the following (value - 0x10000) 3323b3eb3cSopenharmony_ci */ 3423b3eb3cSopenharmony_ci 3523b3eb3cSopenharmony_ci/* 3623b3eb3cSopenharmony_ci * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size]. 3723b3eb3cSopenharmony_ci * In case of invalid sequence return first byte of it. 3823b3eb3cSopenharmony_ci */ 3923b3eb3cSopenharmony_cisize_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len) 4023b3eb3cSopenharmony_ci{ 4123b3eb3cSopenharmony_ci size_t pos = 0; 4223b3eb3cSopenharmony_ci size_t res = 0; 4323b3eb3cSopenharmony_ci while (pos != mutf8Len) { 4423b3eb3cSopenharmony_ci auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos); 4523b3eb3cSopenharmony_ci if (nbytes == 0) { 4623b3eb3cSopenharmony_ci nbytes = 1; 4723b3eb3cSopenharmony_ci } 4823b3eb3cSopenharmony_ci res += pair > MAX_U16 ? CONST_2 : 1; 4923b3eb3cSopenharmony_ci mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 5023b3eb3cSopenharmony_ci pos += nbytes; 5123b3eb3cSopenharmony_ci } 5223b3eb3cSopenharmony_ci return res; 5323b3eb3cSopenharmony_ci} 5423b3eb3cSopenharmony_ci 5523b3eb3cSopenharmony_cistd::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes) 5623b3eb3cSopenharmony_ci{ 5723b3eb3cSopenharmony_ci uint8_t d0 = *data; 5823b3eb3cSopenharmony_ci if ((d0 & MASK1) == 0) { 5923b3eb3cSopenharmony_ci return { d0, 1 }; 6023b3eb3cSopenharmony_ci } 6123b3eb3cSopenharmony_ci 6223b3eb3cSopenharmony_ci if (maxBytes < CONST_2) { 6323b3eb3cSopenharmony_ci return { d0, 1 }; 6423b3eb3cSopenharmony_ci } 6523b3eb3cSopenharmony_ci uint8_t d1 = *(data + 1); 6623b3eb3cSopenharmony_ci if ((d0 & MASK2) == 0) { 6723b3eb3cSopenharmony_ci return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 }; 6823b3eb3cSopenharmony_ci } 6923b3eb3cSopenharmony_ci 7023b3eb3cSopenharmony_ci if (maxBytes < CONST_3) { 7123b3eb3cSopenharmony_ci return { d0, 1 }; 7223b3eb3cSopenharmony_ci } 7323b3eb3cSopenharmony_ci uint8_t d2 = *(data + CONST_2); 7423b3eb3cSopenharmony_ci if ((d0 & MASK3) == 0) { 7523b3eb3cSopenharmony_ci return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT), 7623b3eb3cSopenharmony_ci CONST_3 }; 7723b3eb3cSopenharmony_ci } 7823b3eb3cSopenharmony_ci 7923b3eb3cSopenharmony_ci if (maxBytes < CONST_4) { 8023b3eb3cSopenharmony_ci return { d0, 1 }; 8123b3eb3cSopenharmony_ci } 8223b3eb3cSopenharmony_ci uint8_t d3 = *(data + CONST_3); 8323b3eb3cSopenharmony_ci uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) | 8423b3eb3cSopenharmony_ci ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT); 8523b3eb3cSopenharmony_ci 8623b3eb3cSopenharmony_ci uint32_t pair = 0; 8723b3eb3cSopenharmony_ci pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT; 8823b3eb3cSopenharmony_ci pair <<= PAIR_ELEMENT_WIDTH; 8923b3eb3cSopenharmony_ci pair |= (codePoint & MASK_10BIT) + U16_TAIL; 9023b3eb3cSopenharmony_ci 9123b3eb3cSopenharmony_ci return { pair, CONST_4 }; 9223b3eb3cSopenharmony_ci} 9323b3eb3cSopenharmony_ci 9423b3eb3cSopenharmony_cisize_t ConvertRegionUtf8ToUtf16( 9523b3eb3cSopenharmony_ci const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start) 9623b3eb3cSopenharmony_ci{ 9723b3eb3cSopenharmony_ci size_t inPos = 0; 9823b3eb3cSopenharmony_ci size_t outPos = 0; 9923b3eb3cSopenharmony_ci while (inPos < mutf8Len) { 10023b3eb3cSopenharmony_ci auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos); 10123b3eb3cSopenharmony_ci auto [pHi, pLo] = SplitUtf16Pair(pair); 10223b3eb3cSopenharmony_ci 10323b3eb3cSopenharmony_ci mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 10423b3eb3cSopenharmony_ci inPos += nbytes; 10523b3eb3cSopenharmony_ci if (start > 0) { 10623b3eb3cSopenharmony_ci start -= nbytes; 10723b3eb3cSopenharmony_ci continue; 10823b3eb3cSopenharmony_ci } 10923b3eb3cSopenharmony_ci 11023b3eb3cSopenharmony_ci if (pHi != 0) { 11123b3eb3cSopenharmony_ci if (outPos++ >= utf16Len - 1) { // check for place for two uint16 11223b3eb3cSopenharmony_ci --outPos; 11323b3eb3cSopenharmony_ci break; 11423b3eb3cSopenharmony_ci } 11523b3eb3cSopenharmony_ci *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 11623b3eb3cSopenharmony_ci } 11723b3eb3cSopenharmony_ci if (outPos++ >= utf16Len) { 11823b3eb3cSopenharmony_ci --outPos; 11923b3eb3cSopenharmony_ci break; 12023b3eb3cSopenharmony_ci } 12123b3eb3cSopenharmony_ci *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 12223b3eb3cSopenharmony_ci } 12323b3eb3cSopenharmony_ci return outPos; 12423b3eb3cSopenharmony_ci} 12523b3eb3cSopenharmony_ci 12623b3eb3cSopenharmony_cibool IsUTF16HighSurrogate(uint16_t ch) 12723b3eb3cSopenharmony_ci{ 12823b3eb3cSopenharmony_ci return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH; 12923b3eb3cSopenharmony_ci} 13023b3eb3cSopenharmony_ci 13123b3eb3cSopenharmony_cibool IsUTF16LowSurrogate(uint16_t ch) 13223b3eb3cSopenharmony_ci{ 13323b3eb3cSopenharmony_ci return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH; 13423b3eb3cSopenharmony_ci} 13523b3eb3cSopenharmony_ci 13623b3eb3cSopenharmony_cisize_t UTF8Length(uint32_t codePoint) 13723b3eb3cSopenharmony_ci{ 13823b3eb3cSopenharmony_ci if (codePoint <= UTF8_1B_MAX) { 13923b3eb3cSopenharmony_ci return UtfLength::ONE; 14023b3eb3cSopenharmony_ci } 14123b3eb3cSopenharmony_ci if (codePoint <= UTF8_2B_MAX) { 14223b3eb3cSopenharmony_ci return UtfLength::TWO; 14323b3eb3cSopenharmony_ci } 14423b3eb3cSopenharmony_ci if (codePoint <= UTF8_3B_MAX) { 14523b3eb3cSopenharmony_ci return UtfLength::THREE; 14623b3eb3cSopenharmony_ci } 14723b3eb3cSopenharmony_ci return UtfLength::FOUR; 14823b3eb3cSopenharmony_ci} 14923b3eb3cSopenharmony_ci 15023b3eb3cSopenharmony_ci// Methods for encode unicode to unicode 15123b3eb3cSopenharmony_cisize_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index) 15223b3eb3cSopenharmony_ci{ 15323b3eb3cSopenharmony_ci size_t size = UTF8Length(codePoint); 15423b3eb3cSopenharmony_ci if (index + size > len) { 15523b3eb3cSopenharmony_ci return 0; 15623b3eb3cSopenharmony_ci } 15723b3eb3cSopenharmony_ci for (size_t j = size - 1; j > 0; j--) { 15823b3eb3cSopenharmony_ci uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK); 15923b3eb3cSopenharmony_ci utf8[index + j] = cont; 16023b3eb3cSopenharmony_ci codePoint >>= UTF8_OFFSET; 16123b3eb3cSopenharmony_ci } 16223b3eb3cSopenharmony_ci utf8[index] = codePoint | FIRST_BYTE_MARK[size]; 16323b3eb3cSopenharmony_ci return size; 16423b3eb3cSopenharmony_ci} 16523b3eb3cSopenharmony_ci 16623b3eb3cSopenharmony_ciuint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index) 16723b3eb3cSopenharmony_ci{ 16823b3eb3cSopenharmony_ci uint16_t first = utf16[*index]; 16923b3eb3cSopenharmony_ci // A valid surrogate pair should always start with a High Surrogate 17023b3eb3cSopenharmony_ci if (IsUTF16LowSurrogate(first)) { 17123b3eb3cSopenharmony_ci return UTF16_REPLACEMENT_CHARACTER; 17223b3eb3cSopenharmony_ci } 17323b3eb3cSopenharmony_ci if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) { 17423b3eb3cSopenharmony_ci if (*index == len - 1) { 17523b3eb3cSopenharmony_ci // A High surrogate not paired with another surrogate 17623b3eb3cSopenharmony_ci return UTF16_REPLACEMENT_CHARACTER; 17723b3eb3cSopenharmony_ci } 17823b3eb3cSopenharmony_ci uint16_t second = utf16[*index + 1]; 17923b3eb3cSopenharmony_ci if (!IsUTF16LowSurrogate(second)) { 18023b3eb3cSopenharmony_ci // A High surrogate not followed by a low surrogate 18123b3eb3cSopenharmony_ci return UTF16_REPLACEMENT_CHARACTER; 18223b3eb3cSopenharmony_ci } 18323b3eb3cSopenharmony_ci // A valid surrogate pair, decode normally 18423b3eb3cSopenharmony_ci (*index)++; 18523b3eb3cSopenharmony_ci return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 18623b3eb3cSopenharmony_ci } 18723b3eb3cSopenharmony_ci // A unicode not fallen into the range of representing by surrogate pair, return as it is 18823b3eb3cSopenharmony_ci return first; 18923b3eb3cSopenharmony_ci} 19023b3eb3cSopenharmony_ci 19123b3eb3cSopenharmony_cisize_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len, 19223b3eb3cSopenharmony_ci size_t start) 19323b3eb3cSopenharmony_ci{ 19423b3eb3cSopenharmony_ci if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { 19523b3eb3cSopenharmony_ci return 0; 19623b3eb3cSopenharmony_ci } 19723b3eb3cSopenharmony_ci size_t utf8Pos = 0; 19823b3eb3cSopenharmony_ci size_t end = start + utf16Len; 19923b3eb3cSopenharmony_ci for (size_t i = start; i < end; ++i) { 20023b3eb3cSopenharmony_ci uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i); 20123b3eb3cSopenharmony_ci if (codePoint == 0) { 20223b3eb3cSopenharmony_ci continue; 20323b3eb3cSopenharmony_ci } 20423b3eb3cSopenharmony_ci utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos); 20523b3eb3cSopenharmony_ci } 20623b3eb3cSopenharmony_ci return utf8Pos; 20723b3eb3cSopenharmony_ci} 20823b3eb3cSopenharmony_ci 20923b3eb3cSopenharmony_cibool IsUTF8(std::string& data) 21023b3eb3cSopenharmony_ci{ 21123b3eb3cSopenharmony_ci if (data.empty()) { 21223b3eb3cSopenharmony_ci return false; 21323b3eb3cSopenharmony_ci } 21423b3eb3cSopenharmony_ci 21523b3eb3cSopenharmony_ci bool hasZeroByte = false; 21623b3eb3cSopenharmony_ci bool hasMultiByteUTF8 = false; 21723b3eb3cSopenharmony_ci 21823b3eb3cSopenharmony_ci for (size_t i = 0; i < data.size(); ++i) { 21923b3eb3cSopenharmony_ci unsigned char c = data[i]; 22023b3eb3cSopenharmony_ci 22123b3eb3cSopenharmony_ci // Check for UTF-16LE byte order mark (BOM) 22223b3eb3cSopenharmony_ci if (i == 0 && data.size() >= INDEX_TWO && data[INDEX_ONE] == UTF16LE_ZERO_BYTE && 22323b3eb3cSopenharmony_ci (c == UTF16LE_BOM_FF || c == UTF16LE_BOM_FE)) { 22423b3eb3cSopenharmony_ci return false; 22523b3eb3cSopenharmony_ci } 22623b3eb3cSopenharmony_ci 22723b3eb3cSopenharmony_ci // Check for zero bytes, which are common in UTF-16LE 22823b3eb3cSopenharmony_ci if (c == UTF16LE_ZERO_BYTE) { 22923b3eb3cSopenharmony_ci hasZeroByte = true; 23023b3eb3cSopenharmony_ci } 23123b3eb3cSopenharmony_ci 23223b3eb3cSopenharmony_ci // Check for multi-byte UTF-8 sequences 23323b3eb3cSopenharmony_ci if ((c & UTF8_HIGH_BIT) != 0) { // High bit is set, indicating a non-ASCII character 23423b3eb3cSopenharmony_ci if ((c & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN && i + INDEX_ONE < data.size() && 23523b3eb3cSopenharmony_ci (data[i + INDEX_ONE ] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) { 23623b3eb3cSopenharmony_ci // Two-byte UTF-8 character 23723b3eb3cSopenharmony_ci hasMultiByteUTF8 = true; 23823b3eb3cSopenharmony_ci i += INDEX_ONE; // Skip the next byte 23923b3eb3cSopenharmony_ci } else if ((c & UTF8_THREE_BYTE_MASK) == UTF8_THREE_BYTE_PATTERN && i + INDEX_TWO < data.size() && 24023b3eb3cSopenharmony_ci (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER && 24123b3eb3cSopenharmony_ci (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) { 24223b3eb3cSopenharmony_ci // Three-byte UTF-8 character 24323b3eb3cSopenharmony_ci hasMultiByteUTF8 = true; 24423b3eb3cSopenharmony_ci i += INDEX_TWO; // Skip the next two bytes 24523b3eb3cSopenharmony_ci } else if ((c & UTF8_FOUR_BYTE_MASK) == UTF8_FOUR_BYTE_PATTERN && i + INDEX_THREE < data.size() && 24623b3eb3cSopenharmony_ci (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER && 24723b3eb3cSopenharmony_ci (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER && 24823b3eb3cSopenharmony_ci (data[i + INDEX_THREE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) { 24923b3eb3cSopenharmony_ci // Four-byte UTF-8 character 25023b3eb3cSopenharmony_ci hasMultiByteUTF8 = true; 25123b3eb3cSopenharmony_ci i += INDEX_THREE; // Skip the next three bytes 25223b3eb3cSopenharmony_ci } 25323b3eb3cSopenharmony_ci } 25423b3eb3cSopenharmony_ci } 25523b3eb3cSopenharmony_ci 25623b3eb3cSopenharmony_ci if (hasZeroByte && !hasMultiByteUTF8) { 25723b3eb3cSopenharmony_ci // If we found zero bytes and no multi-byte UTF-8 sequences, it's likely UTF-16LE 25823b3eb3cSopenharmony_ci return false; 25923b3eb3cSopenharmony_ci } else if (hasMultiByteUTF8) { 26023b3eb3cSopenharmony_ci // If we found multi-byte UTF-8 sequences, it's likely UTF-8 26123b3eb3cSopenharmony_ci return true; 26223b3eb3cSopenharmony_ci } else { 26323b3eb3cSopenharmony_ci // If all characters are ASCII, it's either pure ASCII or we don't have enough data to determine the encoding 26423b3eb3cSopenharmony_ci return false; 26523b3eb3cSopenharmony_ci } 26623b3eb3cSopenharmony_ci} 26723b3eb3cSopenharmony_ci 26823b3eb3cSopenharmony_civoid ConvertIllegalStr(std::string& str) 26923b3eb3cSopenharmony_ci{ 27023b3eb3cSopenharmony_ci if (IsUTF8(str)) { 27123b3eb3cSopenharmony_ci uint8_t* buf8 = reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str())); 27223b3eb3cSopenharmony_ci size_t utf8Len = str.size(); 27323b3eb3cSopenharmony_ci auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len); 27423b3eb3cSopenharmony_ci std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len); 27523b3eb3cSopenharmony_ci auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0); 27623b3eb3cSopenharmony_ci if (resultLen == utf16Len) { 27723b3eb3cSopenharmony_ci DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0); 27823b3eb3cSopenharmony_ci } 27923b3eb3cSopenharmony_ci } 28023b3eb3cSopenharmony_ci} 28123b3eb3cSopenharmony_ci 28223b3eb3cSopenharmony_ci} // namespace OHOS::Ace 283