1/** 2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "utf.h" 17 18#include <cstddef> 19#include <cstring> 20 21#include <limits> 22#include <tuple> 23#include <utility> 24 25namespace panda::utf { 26 27constexpr size_t MAX_U16 = 0xffff; 28constexpr size_t CONST_2 = 2; 29constexpr size_t CONST_3 = 3; 30constexpr size_t CONST_4 = 4; 31constexpr size_t CONST_6 = 6; 32constexpr size_t CONST_12 = 12; 33 34struct MUtf8Char { 35 size_t n; 36 std::array<uint8_t, CONST_4> ch; 37}; 38 39/* 40 * MUtf-8 41 * 42 * U+0000 => C0 80 43 * 44 * N Bits for First Last Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6 45 * code point code point code point 46 * 1 7 U+0000 U+007F 0xxxxxxx 47 * 2 11 U+0080 U+07FF 110xxxxx 10xxxxxx 48 * 3 16 U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx 49 * 6 21 U+10000 U+10FFFF 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx 50 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000) 51 */ 52 53/* 54 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size]. 55 * In case of invalid sequence return first byte of it. 56 */ 57std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes) 58{ 59 // TODO(d.kovalneko): make the function safe 60 Span<const uint8_t> sp(data, max_bytes); 61 uint8_t d0 = sp[0]; 62 if ((d0 & MASK1) == 0) { 63 return {d0, 1}; 64 } 65 66 if (max_bytes < CONST_2) { 67 return {d0, 1}; 68 } 69 uint8_t d1 = sp[1]; 70 if ((d0 & MASK2) == 0) { 71 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2}; 72 } 73 74 if (max_bytes < CONST_3) { 75 return {d0, 1}; 76 } 77 uint8_t d2 = sp[CONST_2]; 78 if ((d0 & MASK3) == 0) { 79 return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT), 80 CONST_3}; 81 } 82 83 if (max_bytes < CONST_4) { 84 return {d0, 1}; 85 } 86 uint8_t d3 = sp[CONST_3]; 87 uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) | 88 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT); 89 90 uint32_t pair = 0; 91 pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT; 92 pair <<= PAIR_ELEMENT_WIDTH; 93 pair |= (code_point & MASK_10BIT) + U16_TAIL; 94 95 return {pair, CONST_4}; 96} 97 98static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) 99{ 100 uint32_t codePoint = d0 - HI_SURROGATE_MIN; 101 codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH); 102 codePoint |= d1 - LO_SURROGATE_MIN; 103 codePoint += LO_SUPPLEMENTS_MIN; 104 return codePoint; 105} 106 107constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1) 108{ 109 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0, 110 // means that is a single code point, it needs to be represented by three MUTF8 code. 111 if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) { 112 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12)); 113 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT)); 114 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT)); 115 return {CONST_3, {ch0, ch1, ch2}}; 116 } 117 118 if (d0 == 0) { 119 return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}}; 120 } 121 if (d0 <= MUTF8_1B_MAX) { 122 return {1, {static_cast<uint8_t>(d0)}}; 123 } 124 if (d0 <= MUTF8_2B_MAX) { 125 auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6)); 126 auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT)); 127 return {CONST_2, {ch0, ch1}}; 128 } 129 if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) { 130 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12)); 131 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT)); 132 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT)); 133 return {CONST_3, {ch0, ch1, ch2}}; 134 } 135 136 uint32_t codePoint = CombineTwoU16(d0, d1); 137 138 auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST); 139 auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1); 140 auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1); 141 auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1); 142 143 return {CONST_4, {ch0, ch1, ch2, ch3}}; 144} 145 146bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in) 147{ 148 while (*mutf8_in != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 149 if (*mutf8_in >= MASK1) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 150 return false; 151 } 152 mutf8_in += 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 153 } 154 return true; 155} 156 157size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, 158 size_t start) 159{ 160 size_t mutf8_pos = 0; 161 if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) { 162 return 0; 163 } 164 size_t end = start + utf16_len; 165 for (size_t i = start; i < end; ++i) { 166 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 167 uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0; 168 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 169 MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code); 170 if (mutf8_pos + ch.n > mutf8_len) { 171 break; 172 } 173 for (size_t c = 0; c < ch.n; ++c) { 174 mutf8_out[mutf8_pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 175 } 176 if (ch.n == CONST_4) { // Two UTF-16 chars are used 177 ++i; 178 } 179 } 180 return mutf8_pos; 181} 182 183void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out) 184{ 185 size_t in_pos = 0; 186 while (in_pos < mutf8_len) { 187 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos); 188 auto [p_hi, p_lo] = SplitUtf16Pair(pair); 189 190 if (p_hi != 0) { 191 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 192 } 193 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 194 195 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 196 in_pos += nbytes; 197 } 198} 199 200size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, 201 size_t start) 202{ 203 size_t in_pos = 0; 204 size_t out_pos = 0; 205 while (in_pos < mutf8_len) { 206 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos); 207 auto [p_hi, p_lo] = SplitUtf16Pair(pair); 208 209 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 210 in_pos += nbytes; 211 if (start > 0) { 212 start -= nbytes; 213 continue; 214 } 215 216 if (p_hi != 0) { 217 ASSERT(utf16_len >= 1); 218 if (out_pos++ >= utf16_len - 1) { // check for place for two uint16 219 --out_pos; 220 break; 221 } 222 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 223 } 224 if (out_pos++ >= utf16_len) { 225 --out_pos; 226 break; 227 } 228 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 229 } 230 return out_pos; 231} 232 233int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2) 234{ 235 uint32_t c1; 236 uint32_t c2; 237 uint32_t n1; 238 uint32_t n2; 239 240 do { 241 c1 = *mutf8_1; 242 c2 = *mutf8_2; 243 244 if (c1 == 0 && c2 == 0) { 245 return 0; 246 } 247 248 if (c1 == 0 && c2 != 0) { 249 return -1; 250 } 251 252 if (c1 != 0 && c2 == 0) { 253 return 1; 254 } 255 256 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1); 257 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2); 258 259 mutf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 260 mutf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 261 } while (c1 == c2); 262 263 auto [c1p1, c1p2] = SplitUtf16Pair(c1); 264 auto [c2p1, c2p2] = SplitUtf16Pair(c2); 265 266 auto result = static_cast<int>(c1p1 - c2p1); 267 if (result != 0) { 268 return result; 269 } 270 271 return c1p2 - c2p2; 272} 273 274// compare plain utf8, which allows 0 inside a string 275int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length) 276{ 277 uint32_t c1; 278 uint32_t c2; 279 uint32_t n1; 280 uint32_t n2; 281 282 uint32_t utf8_1_index = 0; 283 uint32_t utf8_2_index = 0; 284 285 do { 286 if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) { 287 return 0; 288 } 289 290 if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) { 291 return -1; 292 } 293 294 if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) { 295 return 1; 296 } 297 298 c1 = *utf8_1; 299 c2 = *utf8_2; 300 301 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1); 302 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2); 303 304 utf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 305 utf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 306 utf8_1_index += n1; 307 utf8_2_index += n2; 308 } while (c1 == c2); 309 310 auto [c1p1, c1p2] = SplitUtf16Pair(c1); 311 auto [c2p1, c2p2] = SplitUtf16Pair(c2); 312 313 auto result = static_cast<int>(c1p1 - c2p1); 314 if (result != 0) { 315 return result; 316 } 317 318 return c1p2 - c2p2; 319} 320 321size_t Mutf8Size(const uint8_t *mutf8) 322{ 323 return strlen(Mutf8AsCString(mutf8)); 324} 325 326size_t MUtf8ToUtf16Size(const uint8_t *mutf8) 327{ 328 // TODO(d.kovalenko): make it faster 329 size_t res = 0; 330 while (*mutf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 331 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8); 332 res += pair > MAX_U16 ? CONST_2 : 1; 333 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 334 } 335 return res; 336} 337 338size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len) 339{ 340 size_t pos = 0; 341 size_t res = 0; 342 while (pos != mutf8_len) { 343 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos); 344 if (nbytes == 0) { 345 nbytes = 1; 346 } 347 res += pair > MAX_U16 ? CONST_2 : 1; 348 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 349 pos += nbytes; 350 } 351 return res; 352} 353 354size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length) 355{ 356 size_t res = 1; // zero byte 357 // when utf16 data length is only 1 and code in 0xd800-0xdfff, 358 // means that is a single code point, it needs to be represented by three MUTF8 code. 359 if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 360 mutf16[0] <= LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 361 res += CONST_3; 362 return res; 363 } 364 365 for (uint32_t i = 0; i < length; ++i) { 366 // NOLINTNEXTLINE(bugprone-branch-clone) 367 if (mutf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 368 res += CONST_2; // special case for U+0000 => C0 80 369 } else if (mutf16[i] <= MUTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 370 res += 1; 371 } else if (mutf16[i] <= MUTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 372 res += CONST_2; 373 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 374 } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) { 375 res += CONST_3; 376 } else { 377 res += CONST_4; 378 ++i; 379 } 380 } 381 return res; 382} 383 384bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2) 385{ 386 if (utf8_1.size() != utf8_2.size()) { 387 return false; 388 } 389 390 return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0; 391} 392 393bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2) 394{ 395 return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0; 396} 397 398bool IsValidModifiedUTF8(const uint8_t *elems) 399{ 400 ASSERT(elems); 401 402 while (*elems != '\0') { 403 // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers) 404 switch (*elems & 0xf0) { 405 case 0x00: 406 case 0x10: // NOLINT(readability-magic-numbers) 407 case 0x20: // NOLINT(readability-magic-numbers) 408 case 0x30: // NOLINT(readability-magic-numbers) 409 case 0x40: // NOLINT(readability-magic-numbers) 410 case 0x50: // NOLINT(readability-magic-numbers) 411 case 0x60: // NOLINT(readability-magic-numbers) 412 case 0x70: // NOLINT(readability-magic-numbers) 413 // pattern 0xxx 414 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 415 ++elems; 416 break; 417 case 0x80: // NOLINT(readability-magic-numbers) 418 case 0x90: // NOLINT(readability-magic-numbers) 419 case 0xa0: // NOLINT(readability-magic-numbers) 420 case 0xb0: // NOLINT(readability-magic-numbers) 421 // pattern 10xx is illegal start 422 return false; 423 424 case 0xf0: // NOLINT(readability-magic-numbers) 425 // pattern 1111 0xxx starts four byte section 426 if ((*elems & 0x08) == 0) { // NOLINT(hicpp-signed-bitwise) 427 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 428 ++elems; 429 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers) 430 return false; 431 } 432 } else { 433 return false; 434 } 435 // no need break 436 [[fallthrough]]; 437 438 case 0xe0: // NOLINT(readability-magic-numbers) 439 // pattern 1110 440 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 441 ++elems; 442 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers) 443 return false; 444 } 445 // no need break 446 [[fallthrough]]; 447 448 case 0xc0: // NOLINT(readability-magic-numbers) 449 case 0xd0: // NOLINT(readability-magic-numbers) 450 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 451 ++elems; 452 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers) 453 return false; 454 } 455 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 456 ++elems; 457 break; 458 default: 459 break; 460 } 461 } 462 return true; 463} 464 465} // namespace panda::utf 466