/**
 * Copyright (c) 2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H
#define FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H

#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>

namespace OHOS::Ace {

/*
 * https://en.wikipedia.org/wiki/UTF-8
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 4  21           U+10000      U+10FFFF    11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
 */

// Single-bit masks (bit 7, bit 5 and bit 4 of a byte).
// NOTE(review): presumably tested against a lead byte to tell 1-, 2-, 3- and 4-byte sequences apart
// (see the table above) - confirm against the implementation file.
constexpr size_t MASK1 = 0x80;
constexpr size_t MASK2 = 0x20;
constexpr size_t MASK3 = 0x10;

// Low-bit payload masks; the number in the name is the count of low bits that are kept.
constexpr size_t MASK_4BIT = 0x0f;
constexpr size_t MASK_5BIT = 0x1f;
constexpr size_t MASK_6BIT = 0x3f;
constexpr size_t MASK_10BIT = 0x03ff;
constexpr size_t MASK_16BIT = 0xffff;

// 6 is the number of payload bits carried by a continuation byte (10xxxxxx in the table above).
constexpr size_t DATA_WIDTH = 6;
// 16 equals the shift SplitUtf16Pair() uses to separate the two halves of a packed pair.
constexpr size_t PAIR_ELEMENT_WIDTH = 16;

// UTF-16 surrogate ranges: high (lead) units are 0xd800-0xdbff, low (trail) units are 0xdc00-0xdfff.
constexpr size_t HI_SURROGATE_MIN = 0xd800;
constexpr size_t HI_SURROGATE_MAX = 0xdbff;
constexpr size_t LO_SURROGATE_MIN = 0xdc00;
constexpr size_t LO_SURROGATE_MAX = 0xdfff;

// First code point outside the Basic Multilingual Plane (U+10000).
constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000;

// 0xd7c0 == HI_SURROGATE_MIN - (LO_SUPPLEMENTS_MIN >> 10); 0xdc00 == LO_SURROGATE_MIN.
// NOTE(review): presumably the bases added to (codePoint >> 10) and (codePoint & MASK_10BIT) when a
// surrogate pair is built - confirm against the implementation file.
constexpr size_t U16_LEAD = 0xd7c0;
constexpr size_t U16_TAIL = 0xdc00;

// Limits and marker bytes for the "MUTF8" encoder/decoder.
// NOTE(review): "MUTF8" presumably stands for Modified UTF-8 - confirm.
constexpr uint8_t MUTF8_1B_MAX = 0x7f;

constexpr uint16_t MUTF8_2B_MAX = 0x7ff;
constexpr uint8_t MUTF8_2B_FIRST = 0xc0;
constexpr uint8_t MUTF8_2B_SECOND = 0x80;

constexpr uint8_t MUTF8_3B_FIRST = 0xe0;
constexpr uint8_t MUTF8_3B_SECOND = 0x80;
constexpr uint8_t MUTF8_3B_THIRD = 0x80;

constexpr uint8_t MUTF8_4B_FIRST = 0xf0;

// Largest value representable in one 16-bit unit, plus named small integers.
// NOTE(review): the CONST_n values are presumably used as byte counts and shift amounts - confirm.
constexpr size_t MAX_U16 = 0xffff;
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;

// Surrogate ranges and arithmetic factors for decoding UTF-16.
// The LOW/HIGH values repeat the HI_/LO_SURROGATE_* ranges above with 16-bit types.
constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;    // 1 << 10
constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000; // 1 << 16
constexpr uint32_t UTF8_OFFSET = 6;
constexpr uint32_t UTF16_OFFSET = 10;
constexpr uint16_t SURROGATE_MASK = 0xF800;
constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER

// Limits and marker bytes for standard UTF-8 (see the table above for the lead-byte prefixes).
constexpr uint8_t UTF8_1B_MAX = 0x7f;

constexpr uint16_t UTF8_2B_MAX = 0x7ff;
constexpr uint8_t UTF8_2B_FIRST = 0xc0;
constexpr uint8_t UTF8_2B_SECOND = 0x80;
constexpr uint8_t UTF8_2B_THIRD = 0x3f;
constexpr uint8_t UTF8_2B_FIRST_MIN = 0xc2; // the minimum for 2 bytes is 128, which is 0xc280

constexpr uint16_t UTF8_3B_MAX = 0xffff;
constexpr uint8_t UTF8_3B_FIRST = 0xe0;
constexpr uint8_t UTF8_3B_SECOND = 0x80;
constexpr uint8_t UTF8_3B_THIRD = 0x80;
constexpr uint8_t UTF8_3B_SECOND_MIN = 0xa0; // the minimum for 3 bytes is 2048, which is 0xe0a080

constexpr uint8_t UTF8_4B_FIRST = 0xf0;
constexpr uint8_t UTF8_4B_SECOND_MIN = 0x90; // the minimum for 4 bytes is 65536, which is 0xf0908080

constexpr uint8_t BYTE_MASK = 0xbf;
constexpr uint8_t BYTE_MARK = 0x80;

// Mask/pattern pairs for classifying UTF-8 bytes. Per the table above, a lead byte b of an
// N-byte sequence satisfies (b & <N>_BYTE_MASK) == <N>_BYTE_PATTERN.
enum UTF8BytePatterns {
    UTF8_TWO_BYTE_MASK = 0xE0,
    UTF8_TWO_BYTE_PATTERN = 0xC0,
    UTF8_THREE_BYTE_MASK = 0xF0,
    UTF8_THREE_BYTE_PATTERN = 0xE0,
    UTF8_FOUR_BYTE_MASK = 0xF8,
    UTF8_FOUR_BYTE_PATTERN = 0xF0,
    UTF8_MULTIBYTE_FOLLOWER = 0x80,
    UTF8_HIGH_BIT = 0x80
};

// Byte values relevant to UTF-16LE detection: 0xFF 0xFE is the UTF-16LE byte order mark.
enum UTF16LEPatterns {
    UTF16LE_BOM_FF = 0xFF,
    UTF16LE_BOM_FE = 0xFE,
    UTF16LE_ZERO_BYTE = 0x00
};

// Named offsets 1..3.
// NOTE(review): presumably the positions of the bytes that follow a lead byte - confirm.
enum INDEX {
    INDEX_ONE = 1,
    INDEX_TWO = 2,
    INDEX_THREE = 3
};

// Encoded length in bytes of one UTF-8 sequence; stored in a single byte.
enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 };

// Lead-byte prefix bits; entries 2, 3 and 4 equal the 2-, 3- and 4-byte lead prefixes in the table above.
// NOTE(review): presumably indexed by the byte length of the sequence being written - confirm.
constexpr unsigned char FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

// The functions declared below are defined outside this header; the notes describe only what the
// signatures show.

// Decodes from `data`, looking at no more than `maxBytes` bytes (default 4).
// NOTE(review): the result is presumably {packed UTF-16 unit(s) accepted by SplitUtf16Pair(),
// number of input bytes consumed} - confirm against the definition.
std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes = 4);

// NOTE(review): presumably returns how many UTF-16 units are needed for the `mutf8Len` bytes at
// `mutf8` - confirm against the definition.
size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len);

// Converts from the `utf8In` buffer (`utf8Len` bytes) into the `utf16Out` buffer (`utf16Len` units).
// NOTE(review): the meaning of `start` and of the return value is not visible here; the return is
// presumably the number of units written - confirm against the definition.
size_t ConvertRegionUtf8ToUtf16(
    const uint8_t* utf8In, uint16_t* utf16Out, size_t utf8Len, size_t utf16Len, size_t start);

// Converts from the `utf16In` buffer (`utf16Len` units) into the `utf8Out` buffer (`utf8Len` bytes).
// NOTE(review): the meaning of `start` and of the return value is not visible here; the return is
// presumably the number of bytes written - confirm against the definition.
size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
    size_t start);

// Takes `str` by mutable reference, so it can rewrite the string in place.
// NOTE(review): what counts as "illegal" and how it is replaced is not visible here - confirm.
void ConvertIllegalStr(std::string& str);

// NOTE(review): presumably reports whether `data` is well-formed UTF-8. The parameter is a
// non-const reference, so the definition may modify it - confirm before passing shared strings.
bool IsUTF8(std::string& data);

/**
 * Splits a packed 32-bit value into its two 16-bit halves.
 *
 * @param pair Packed value: one 16-bit unit in bits 31..16 and one in bits 15..0.
 * @return { upper 16 bits, lower 16 bits } of `pair`.
 */
constexpr std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t LOW_HALF_MASK = 0xffff;
    constexpr uint32_t HIGH_HALF_SHIFT = 16;
    // Explicit casts: both operands are guaranteed to fit in 16 bits after the shift / mask.
    return { static_cast<uint16_t>(pair >> HIGH_HALF_SHIFT), static_cast<uint16_t>(pair & LOW_HALF_MASK) };
}

} // namespace OHOS::Ace

#endif // FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H