1b1994897Sopenharmony_ci/** 2b1994897Sopenharmony_ci * Copyright (c) 2021-2022 Huawei Device Co., Ltd. 3b1994897Sopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License"); 4b1994897Sopenharmony_ci * you may not use this file except in compliance with the License. 5b1994897Sopenharmony_ci * You may obtain a copy of the License at 6b1994897Sopenharmony_ci * 7b1994897Sopenharmony_ci * http://www.apache.org/licenses/LICENSE-2.0 8b1994897Sopenharmony_ci * 9b1994897Sopenharmony_ci * Unless required by applicable law or agreed to in writing, software 10b1994897Sopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS, 11b1994897Sopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12b1994897Sopenharmony_ci * See the License for the specific language governing permissions and 13b1994897Sopenharmony_ci * limitations under the License. 14b1994897Sopenharmony_ci */ 15b1994897Sopenharmony_ci 16b1994897Sopenharmony_ci#ifndef LIBPANDABASE_UTILS_UTF_H 17b1994897Sopenharmony_ci#define LIBPANDABASE_UTILS_UTF_H 18b1994897Sopenharmony_ci 19b1994897Sopenharmony_ci#include <cstdint> 20b1994897Sopenharmony_ci#include <cstddef> 21b1994897Sopenharmony_ci 22b1994897Sopenharmony_ci#include "utils/hash.h" 23b1994897Sopenharmony_ci#include "utils/span.h" 24b1994897Sopenharmony_ci 25b1994897Sopenharmony_cinamespace panda::utf { 26b1994897Sopenharmony_ci 27b1994897Sopenharmony_ci/* 28b1994897Sopenharmony_ci * https://en.wikipedia.org/wiki/UTF-8 29b1994897Sopenharmony_ci * 30b1994897Sopenharmony_ci * N Bits for First Last Byte 1 Byte 2 Byte 3 Byte 4 31b1994897Sopenharmony_ci * code point code point code point 32b1994897Sopenharmony_ci * 1 7 U+0000 U+007F 0xxxxxxx 33b1994897Sopenharmony_ci * 2 11 U+0080 U+07FF 110xxxxx 10xxxxxx 34b1994897Sopenharmony_ci * 3 16 U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx 35b1994897Sopenharmony_ci * 4 21 U+10000 U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 36b1994897Sopenharmony_ci */ 37b1994897Sopenharmony_ciconstexpr size_t MASK1 = 0x80; 38b1994897Sopenharmony_ciconstexpr size_t MASK2 = 0x20; 39b1994897Sopenharmony_ciconstexpr size_t MASK3 = 0x10; 40b1994897Sopenharmony_ci 41b1994897Sopenharmony_ciconstexpr size_t MASK_4BIT = 0x0f; 42b1994897Sopenharmony_ciconstexpr size_t MASK_5BIT = 0x1f; 43b1994897Sopenharmony_ciconstexpr size_t MASK_6BIT = 0x3f; 44b1994897Sopenharmony_ciconstexpr size_t MASK_10BIT = 0x03ff; 45b1994897Sopenharmony_ciconstexpr size_t MASK_16BIT = 0xffff; 46b1994897Sopenharmony_ci 47b1994897Sopenharmony_ciconstexpr size_t DATA_WIDTH = 6; 48b1994897Sopenharmony_ciconstexpr size_t PAIR_ELEMENT_WIDTH = 16; 49b1994897Sopenharmony_ci 50b1994897Sopenharmony_ciconstexpr size_t HI_SURROGATE_MIN = 0xd800; 51b1994897Sopenharmony_ciconstexpr size_t HI_SURROGATE_MAX = 0xdbff; 52b1994897Sopenharmony_ciconstexpr size_t LO_SURROGATE_MIN = 0xdc00; 53b1994897Sopenharmony_ciconstexpr size_t LO_SURROGATE_MAX = 0xdfff; 54b1994897Sopenharmony_ci 55b1994897Sopenharmony_ciconstexpr size_t LO_SUPPLEMENTS_MIN = 0x10000; 56b1994897Sopenharmony_ci 57b1994897Sopenharmony_ciconstexpr size_t U16_LEAD = 0xd7c0; 58b1994897Sopenharmony_ciconstexpr size_t U16_TAIL = 0xdc00; 59b1994897Sopenharmony_ci 60b1994897Sopenharmony_ciconstexpr uint8_t MUTF8_1B_MAX = 0x7f; 61b1994897Sopenharmony_ci 62b1994897Sopenharmony_ciconstexpr uint16_t MUTF8_2B_MAX = 0x7ff; 63b1994897Sopenharmony_ciconstexpr uint8_t MUTF8_2B_FIRST = 0xc0; 64b1994897Sopenharmony_ciconstexpr uint8_t MUTF8_2B_SECOND = 0x80; 65b1994897Sopenharmony_ci 66b1994897Sopenharmony_ciconstexpr uint8_t MUTF8_3B_FIRST = 0xe0; 67b1994897Sopenharmony_ciconstexpr uint8_t MUTF8_3B_SECOND = 0x80; 68b1994897Sopenharmony_ciconstexpr uint8_t MUTF8_3B_THIRD = 0x80; 69b1994897Sopenharmony_ci 70b1994897Sopenharmony_ciconstexpr uint8_t MUTF8_4B_FIRST = 0xf0; 71b1994897Sopenharmony_ci 72b1994897Sopenharmony_ciWEAK_FOR_LTO_START 73b1994897Sopenharmony_ci 74b1994897Sopenharmony_cistd::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = 4); 75b1994897Sopenharmony_ci 76b1994897Sopenharmony_cibool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in); 77b1994897Sopenharmony_ci 78b1994897Sopenharmony_civoid ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out); 79b1994897Sopenharmony_ci 80b1994897Sopenharmony_cisize_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, 81b1994897Sopenharmony_ci size_t start); 82b1994897Sopenharmony_ci 83b1994897Sopenharmony_cisize_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, 84b1994897Sopenharmony_ci size_t start); 85b1994897Sopenharmony_ci 86b1994897Sopenharmony_ciint CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2); 87b1994897Sopenharmony_ci 88b1994897Sopenharmony_ciint CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length); 89b1994897Sopenharmony_ci 90b1994897Sopenharmony_cibool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2); 91b1994897Sopenharmony_ci 92b1994897Sopenharmony_cibool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2); 93b1994897Sopenharmony_ci 94b1994897Sopenharmony_cisize_t MUtf8ToUtf16Size(const uint8_t *mutf8); 95b1994897Sopenharmony_ci 96b1994897Sopenharmony_cisize_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len); 97b1994897Sopenharmony_ci 98b1994897Sopenharmony_cisize_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length); 99b1994897Sopenharmony_ci 100b1994897Sopenharmony_cisize_t Mutf8Size(const uint8_t *mutf8); 101b1994897Sopenharmony_ci 102b1994897Sopenharmony_cibool IsValidModifiedUTF8(const uint8_t *elems); 103b1994897Sopenharmony_ci 104b1994897Sopenharmony_ciWEAK_FOR_LTO_END 105b1994897Sopenharmony_ci 106b1994897Sopenharmony_ciinline const uint8_t *CStringAsMutf8(const char *str) 107b1994897Sopenharmony_ci{ 108b1994897Sopenharmony_ci return reinterpret_cast<const uint8_t *>(str); 109b1994897Sopenharmony_ci} 110b1994897Sopenharmony_ci 111b1994897Sopenharmony_ciinline const char *Mutf8AsCString(const uint8_t *mutf8) 112b1994897Sopenharmony_ci{ 113b1994897Sopenharmony_ci return reinterpret_cast<const char *>(mutf8); 114b1994897Sopenharmony_ci} 115b1994897Sopenharmony_ci 116b1994897Sopenharmony_ciinline constexpr bool IsAvailableNextUtf16Code(uint16_t val) 117b1994897Sopenharmony_ci{ 118b1994897Sopenharmony_ci return val >= HI_SURROGATE_MIN && val <= LO_SURROGATE_MAX; 119b1994897Sopenharmony_ci} 120b1994897Sopenharmony_ci 121b1994897Sopenharmony_cistruct Mutf8Hash { 122b1994897Sopenharmony_ci uint32_t operator()(const uint8_t *data) const 123b1994897Sopenharmony_ci { 124b1994897Sopenharmony_ci return GetHash32String(data); 125b1994897Sopenharmony_ci } 126b1994897Sopenharmony_ci}; 127b1994897Sopenharmony_ci 128b1994897Sopenharmony_cistruct Mutf8Equal { 129b1994897Sopenharmony_ci bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const 130b1994897Sopenharmony_ci { 131b1994897Sopenharmony_ci return IsEqual(mutf8_1, mutf8_2); 132b1994897Sopenharmony_ci } 133b1994897Sopenharmony_ci}; 134b1994897Sopenharmony_ci 135b1994897Sopenharmony_cistruct Mutf8Less { 136b1994897Sopenharmony_ci bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const 137b1994897Sopenharmony_ci { 138b1994897Sopenharmony_ci return CompareMUtf8ToMUtf8(mutf8_1, mutf8_2) < 0; 139b1994897Sopenharmony_ci } 140b1994897Sopenharmony_ci}; 141b1994897Sopenharmony_ci 142b1994897Sopenharmony_cistatic inline std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair) 143b1994897Sopenharmony_ci{ 144b1994897Sopenharmony_ci constexpr size_t P1_MASK = 0xffff; 145b1994897Sopenharmony_ci constexpr size_t P2_SHIFT = 16; 146b1994897Sopenharmony_ci return {pair >> P2_SHIFT, pair & P1_MASK}; 147b1994897Sopenharmony_ci} 148b1994897Sopenharmony_ci 149b1994897Sopenharmony_ci} // namespace panda::utf 150b1994897Sopenharmony_ci 151b1994897Sopenharmony_ci#endif // LIBPANDABASE_UTILS_UTF_H 152