1/** 2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#ifndef LIBPANDABASE_UTILS_UTF_H 17#define LIBPANDABASE_UTILS_UTF_H 18 19#include <cstdint> 20#include <cstddef> 21 22#include "utils/hash.h" 23#include "utils/span.h" 24 25namespace panda::utf { 26 27/* 28 * https://en.wikipedia.org/wiki/UTF-8 29 * 30 * N Bits for First Last Byte 1 Byte 2 Byte 3 Byte 4 31 * code point code point code point 32 * 1 7 U+0000 U+007F 0xxxxxxx 33 * 2 11 U+0080 U+07FF 110xxxxx 10xxxxxx 34 * 3 16 U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx 35 * 4 21 U+10000 U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 36 */ 37constexpr size_t MASK1 = 0x80; 38constexpr size_t MASK2 = 0x20; 39constexpr size_t MASK3 = 0x10; 40 41constexpr size_t MASK_4BIT = 0x0f; 42constexpr size_t MASK_5BIT = 0x1f; 43constexpr size_t MASK_6BIT = 0x3f; 44constexpr size_t MASK_10BIT = 0x03ff; 45constexpr size_t MASK_16BIT = 0xffff; 46 47constexpr size_t DATA_WIDTH = 6; 48constexpr size_t PAIR_ELEMENT_WIDTH = 16; 49 50constexpr size_t HI_SURROGATE_MIN = 0xd800; 51constexpr size_t HI_SURROGATE_MAX = 0xdbff; 52constexpr size_t LO_SURROGATE_MIN = 0xdc00; 53constexpr size_t LO_SURROGATE_MAX = 0xdfff; 54 55constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000; 56 57constexpr size_t U16_LEAD = 0xd7c0; 58constexpr size_t U16_TAIL = 0xdc00; 59 60constexpr uint8_t MUTF8_1B_MAX = 0x7f; 61 62constexpr uint16_t MUTF8_2B_MAX = 0x7ff; 63constexpr uint8_t MUTF8_2B_FIRST = 0xc0; 64constexpr uint8_t MUTF8_2B_SECOND = 0x80; 65 66constexpr uint8_t MUTF8_3B_FIRST = 0xe0; 67constexpr uint8_t MUTF8_3B_SECOND = 0x80; 68constexpr uint8_t MUTF8_3B_THIRD = 0x80; 69 70constexpr uint8_t MUTF8_4B_FIRST = 0xf0; 71 72WEAK_FOR_LTO_START 73 74std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = 4); 75 76bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in); 77 78void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out); 79 80size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, 81 size_t start); 82 83size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, 84 size_t start); 85 86int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2); 87 88int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length); 89 90bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2); 91 92bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2); 93 94size_t MUtf8ToUtf16Size(const uint8_t *mutf8); 95 96size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len); 97 98size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length); 99 100size_t Mutf8Size(const uint8_t *mutf8); 101 102bool IsValidModifiedUTF8(const uint8_t *elems); 103 104WEAK_FOR_LTO_END 105 106inline const uint8_t *CStringAsMutf8(const char *str) 107{ 108 return reinterpret_cast<const uint8_t *>(str); 109} 110 111inline const char *Mutf8AsCString(const uint8_t *mutf8) 112{ 113 return reinterpret_cast<const char *>(mutf8); 114} 115 116inline constexpr bool IsAvailableNextUtf16Code(uint16_t val) 117{ 118 return val >= HI_SURROGATE_MIN && val <= LO_SURROGATE_MAX; 119} 120 121struct Mutf8Hash { 122 uint32_t operator()(const uint8_t *data) const 123 { 124 return GetHash32String(data); 125 } 126}; 127 128struct Mutf8Equal { 129 bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const 130 { 131 return IsEqual(mutf8_1, mutf8_2); 132 } 133}; 134 135struct Mutf8Less { 136 bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const 137 { 138 return CompareMUtf8ToMUtf8(mutf8_1, mutf8_2) < 0; 139 } 140}; 141 142static inline std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair) 143{ 144 constexpr size_t P1_MASK = 0xffff; 145 constexpr size_t P2_SHIFT = 16; 146 return {pair >> P2_SHIFT, pair & P1_MASK}; 147} 148 149} // namespace panda::utf 150 151#endif // LIBPANDABASE_UTILS_UTF_H 152