1/** 2 * Copyright (c) 2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "utf.h" 17#include <memory> 18 19namespace OHOS::Ace { 20 21/* 22 * MUtf-8 23 * 24 * U+0000 => C0 80 25 * 26 * N Bits for First Last Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6 27 * code point code point code point 28 * 1 7 U+0000 U+007F 0xxxxxxx 29 * 2 11 U+0080 U+07FF 110xxxxx 10xxxxxx 30 * 3 16 U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx 31 * 6 21 U+10000 U+10FFFF 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx 32 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000) 33 */ 34 35/* 36 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size]. 37 * In case of invalid sequence return first byte of it. 38 */ 39size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len) 40{ 41 size_t pos = 0; 42 size_t res = 0; 43 while (pos != mutf8Len) { 44 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos); 45 if (nbytes == 0) { 46 nbytes = 1; 47 } 48 res += pair > MAX_U16 ? CONST_2 : 1; 49 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 50 pos += nbytes; 51 } 52 return res; 53} 54 55std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes) 56{ 57 uint8_t d0 = *data; 58 if ((d0 & MASK1) == 0) { 59 return { d0, 1 }; 60 } 61 62 if (maxBytes < CONST_2) { 63 return { d0, 1 }; 64 } 65 uint8_t d1 = *(data + 1); 66 if ((d0 & MASK2) == 0) { 67 return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 }; 68 } 69 70 if (maxBytes < CONST_3) { 71 return { d0, 1 }; 72 } 73 uint8_t d2 = *(data + CONST_2); 74 if ((d0 & MASK3) == 0) { 75 return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT), 76 CONST_3 }; 77 } 78 79 if (maxBytes < CONST_4) { 80 return { d0, 1 }; 81 } 82 uint8_t d3 = *(data + CONST_3); 83 uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) | 84 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT); 85 86 uint32_t pair = 0; 87 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT; 88 pair <<= PAIR_ELEMENT_WIDTH; 89 pair |= (codePoint & MASK_10BIT) + U16_TAIL; 90 91 return { pair, CONST_4 }; 92} 93 94size_t ConvertRegionUtf8ToUtf16( 95 const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start) 96{ 97 size_t inPos = 0; 98 size_t outPos = 0; 99 while (inPos < mutf8Len) { 100 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos); 101 auto [pHi, pLo] = SplitUtf16Pair(pair); 102 103 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 104 inPos += nbytes; 105 if (start > 0) { 106 start -= nbytes; 107 continue; 108 } 109 110 if (pHi != 0) { 111 if (outPos++ >= utf16Len - 1) { // check for place for two uint16 112 --outPos; 113 break; 114 } 115 *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 116 } 117 if (outPos++ >= utf16Len) { 118 --outPos; 119 break; 120 } 121 *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 122 } 123 return outPos; 124} 125 126bool IsUTF16HighSurrogate(uint16_t ch) 127{ 128 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH; 129} 130 131bool IsUTF16LowSurrogate(uint16_t ch) 132{ 133 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH; 134} 135 136size_t UTF8Length(uint32_t codePoint) 137{ 138 if (codePoint <= UTF8_1B_MAX) { 139 return UtfLength::ONE; 140 } 141 if (codePoint <= UTF8_2B_MAX) { 142 return UtfLength::TWO; 143 } 144 if (codePoint <= UTF8_3B_MAX) { 145 return UtfLength::THREE; 146 } 147 return UtfLength::FOUR; 148} 149 150// Methods for encode unicode to unicode 151size_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index) 152{ 153 size_t size = UTF8Length(codePoint); 154 if (index + size > len) { 155 return 0; 156 } 157 for (size_t j = size - 1; j > 0; j--) { 158 uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK); 159 utf8[index + j] = cont; 160 codePoint >>= UTF8_OFFSET; 161 } 162 utf8[index] = codePoint | FIRST_BYTE_MARK[size]; 163 return size; 164} 165 166uint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index) 167{ 168 uint16_t first = utf16[*index]; 169 // A valid surrogate pair should always start with a High Surrogate 170 if (IsUTF16LowSurrogate(first)) { 171 return UTF16_REPLACEMENT_CHARACTER; 172 } 173 if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) { 174 if (*index == len - 1) { 175 // A High surrogate not paired with another surrogate 176 return UTF16_REPLACEMENT_CHARACTER; 177 } 178 uint16_t second = utf16[*index + 1]; 179 if (!IsUTF16LowSurrogate(second)) { 180 // A High surrogate not followed by a low surrogate 181 return UTF16_REPLACEMENT_CHARACTER; 182 } 183 // A valid surrogate pair, decode normally 184 (*index)++; 185 return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 186 } 187 // A unicode not fallen into the range of representing by surrogate pair, return as it is 188 return first; 189} 190 191size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len, 192 size_t start) 193{ 194 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { 195 return 0; 196 } 197 size_t utf8Pos = 0; 198 size_t end = start + utf16Len; 199 for (size_t i = start; i < end; ++i) { 200 uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i); 201 if (codePoint == 0) { 202 continue; 203 } 204 utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos); 205 } 206 return utf8Pos; 207} 208 209bool IsUTF8(std::string& data) 210{ 211 if (data.empty()) { 212 return false; 213 } 214 215 bool hasZeroByte = false; 216 bool hasMultiByteUTF8 = false; 217 218 for (size_t i = 0; i < data.size(); ++i) { 219 unsigned char c = data[i]; 220 221 // Check for UTF-16LE byte order mark (BOM) 222 if (i == 0 && data.size() >= INDEX_TWO && data[INDEX_ONE] == UTF16LE_ZERO_BYTE && 223 (c == UTF16LE_BOM_FF || c == UTF16LE_BOM_FE)) { 224 return false; 225 } 226 227 // Check for zero bytes, which are common in UTF-16LE 228 if (c == UTF16LE_ZERO_BYTE) { 229 hasZeroByte = true; 230 } 231 232 // Check for multi-byte UTF-8 sequences 233 if ((c & UTF8_HIGH_BIT) != 0) { // High bit is set, indicating a non-ASCII character 234 if ((c & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN && i + INDEX_ONE < data.size() && 235 (data[i + INDEX_ONE ] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) { 236 // Two-byte UTF-8 character 237 hasMultiByteUTF8 = true; 238 i += INDEX_ONE; // Skip the next byte 239 } else if ((c & UTF8_THREE_BYTE_MASK) == UTF8_THREE_BYTE_PATTERN && i + INDEX_TWO < data.size() && 240 (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER && 241 (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) { 242 // Three-byte UTF-8 character 243 hasMultiByteUTF8 = true; 244 i += INDEX_TWO; // Skip the next two bytes 245 } else if ((c & UTF8_FOUR_BYTE_MASK) == UTF8_FOUR_BYTE_PATTERN && i + INDEX_THREE < data.size() && 246 (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER && 247 (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER && 248 (data[i + INDEX_THREE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) { 249 // Four-byte UTF-8 character 250 hasMultiByteUTF8 = true; 251 i += INDEX_THREE; // Skip the next three bytes 252 } 253 } 254 } 255 256 if (hasZeroByte && !hasMultiByteUTF8) { 257 // If we found zero bytes and no multi-byte UTF-8 sequences, it's likely UTF-16LE 258 return false; 259 } else if (hasMultiByteUTF8) { 260 // If we found multi-byte UTF-8 sequences, it's likely UTF-8 261 return true; 262 } else { 263 // If all characters are ASCII, it's either pure ASCII or we don't have enough data to determine the encoding 264 return false; 265 } 266} 267 268void ConvertIllegalStr(std::string& str) 269{ 270 if (IsUTF8(str)) { 271 uint8_t* buf8 = reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str())); 272 size_t utf8Len = str.size(); 273 auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len); 274 std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len); 275 auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0); 276 if (resultLen == utf16Len) { 277 DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0); 278 } 279 } 280} 281 282} // namespace OHOS::Ace 283