/**
 * Copyright (c) 2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H
#define FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H

#include <cstddef>
#include <cstdint>
#include <utility>
#include <string>

namespace OHOS::Ace {

/*
 * https://en.wikipedia.org/wiki/UTF-8
 *
 * N  Bits for     First        Last         Byte 1    Byte 2    Byte 3    Byte 4
 *    code point   code point   code point
 * 1  7            U+0000       U+007F       0xxxxxxx
 * 2  11           U+0080       U+07FF       110xxxxx  10xxxxxx
 * 3  16           U+0800       U+FFFF       1110xxxx  10xxxxxx  10xxxxxx
 * 4  21           U+10000      U+10FFFF     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
 */

// Single-bit masks (bit 7, bit 5, bit 4).
// NOTE(review): presumably used to inspect the marker bits of a UTF-8 lead byte; the users live in the
// implementation file, which is not part of this header.
constexpr size_t MASK1 = 0x80;
constexpr size_t MASK2 = 0x20;
constexpr size_t MASK3 = 0x10;

// Low-bit masks selecting the 4, 5, 6, 10 and 16 least significant bits of a value.
constexpr size_t MASK_4BIT = 0x0f;
constexpr size_t MASK_5BIT = 0x1f;
constexpr size_t MASK_6BIT = 0x3f;
constexpr size_t MASK_10BIT = 0x03ff;
constexpr size_t MASK_16BIT = 0xffff;

// Bit widths: 6 matches the payload bits of a 10xxxxxx continuation byte (see the table above),
// 16 is the width of one UTF-16 code unit.
constexpr size_t DATA_WIDTH = 6;
constexpr size_t PAIR_ELEMENT_WIDTH = 16;

// UTF-16 surrogate ranges: high (lead) surrogates D800..DBFF, low (trail) surrogates DC00..DFFF.
constexpr size_t HI_SURROGATE_MIN = 0xd800;
constexpr size_t HI_SURROGATE_MAX = 0xdbff;
constexpr size_t LO_SURROGATE_MIN = 0xdc00;
constexpr size_t LO_SURROGATE_MAX = 0xdfff;

// First code point outside the Basic Multilingual Plane (see the 4-byte row of the table above).
constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000;

// Surrogate-pair construction bases. 0xd7c0 == 0xd800 - (0x10000 >> 10).
// NOTE(review): presumably lead = U16_LEAD + (cp >> 10) and tail = U16_TAIL + (cp & 0x3ff); confirm
// against the implementation file.
constexpr size_t U16_LEAD = 0xd7c0;
constexpr size_t U16_TAIL = 0xdc00;

// MUTF-8 limits and byte markers, grouped by encoded length.
constexpr uint8_t MUTF8_1B_MAX = 0x7f;

constexpr uint16_t MUTF8_2B_MAX = 0x7ff;
constexpr uint8_t MUTF8_2B_FIRST = 0xc0;
constexpr uint8_t MUTF8_2B_SECOND = 0x80;

constexpr uint8_t MUTF8_3B_FIRST = 0xe0;
constexpr uint8_t MUTF8_3B_SECOND = 0x80;
constexpr uint8_t MUTF8_3B_THIRD = 0x80;

constexpr uint8_t MUTF8_4B_FIRST = 0xf0;

// Largest value representable in a single 16-bit code unit, plus small named integers.
constexpr size_t MAX_U16 = 0xffff;
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;

// UTF-16 decoding constants: surrogate bounds, 0x400 == 1 << 10 (payload size of one surrogate),
// 0x10000 (first supplementary code point) and the per-unit bit offsets.
constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;
constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
constexpr uint32_t UTF8_OFFSET = 6;
constexpr uint32_t UTF16_OFFSET = 10;
constexpr uint16_t SURROGATE_MASK = 0xF800;
constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD;

// UTF-8 limits and byte markers, grouped by encoded length.
constexpr uint8_t UTF8_1B_MAX = 0x7f;

constexpr uint16_t UTF8_2B_MAX = 0x7ff;
constexpr uint8_t UTF8_2B_FIRST = 0xc0;
constexpr uint8_t UTF8_2B_SECOND = 0x80;
constexpr uint8_t UTF8_2B_THIRD = 0x3f;
constexpr uint8_t UTF8_2B_FIRST_MIN = 0xc2; // the minimum for 2 bytes is 128, which is 0xc280

constexpr uint16_t UTF8_3B_MAX = 0xffff;
constexpr uint8_t UTF8_3B_FIRST = 0xe0;
constexpr uint8_t UTF8_3B_SECOND = 0x80;
constexpr uint8_t UTF8_3B_THIRD = 0x80;
constexpr uint8_t UTF8_3B_SECOND_MIN = 0xa0; // the minimum for 3 bytes is 2048, which is 0xe0a080

constexpr uint8_t UTF8_4B_FIRST = 0xf0;
constexpr uint8_t UTF8_4B_SECOND_MIN = 0x90; // the minimum for 4 bytes is 65536, which is 0xf0908080

constexpr uint8_t BYTE_MASK = 0xbf;
constexpr uint8_t BYTE_MARK = 0x80;

// Mask/pattern pairs for classifying a UTF-8 byte: (byte & *_MASK) == *_PATTERN matches the
// corresponding lead-byte layout from the table above.
enum UTF8BytePatterns {
    UTF8_TWO_BYTE_MASK = 0xE0,
    UTF8_TWO_BYTE_PATTERN = 0xC0,
    UTF8_THREE_BYTE_MASK = 0xF0,
    UTF8_THREE_BYTE_PATTERN = 0xE0,
    UTF8_FOUR_BYTE_MASK = 0xF8,
    UTF8_FOUR_BYTE_PATTERN = 0xF0,
    UTF8_MULTIBYTE_FOLLOWER = 0x80,
    UTF8_HIGH_BIT = 0x80
};

// Byte values relevant to UTF-16LE detection (the two BOM bytes and the zero byte).
enum UTF16LEPatterns {
    UTF16LE_BOM_FF = 0xFF,
    UTF16LE_BOM_FE = 0xFE,
    UTF16LE_ZERO_BYTE = 0x00
};

// Named offsets 1..3.
enum INDEX {
    INDEX_ONE = 1,
    INDEX_TWO = 2,
    INDEX_THREE = 3
};

// Encoded length of one UTF-8 sequence, in bytes.
enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 };

// Lead-byte marker bits indexed by encoded sequence length (index 0 and 1 carry no marker).
constexpr unsigned char FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

// Declared here, defined in the implementation file.
// NOTE(review): presumably decodes one MUTF-8 sequence at `data` (reading at most `maxBytes` bytes) and
// returns {packed UTF-16 unit(s), bytes consumed}; confirm against the definition.
std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes = 4);

// NOTE(review): presumably returns the number of UTF-16 code units needed for `mutf8Len` bytes of
// MUTF-8 input; confirm against the definition.
size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len);

// NOTE(review): presumably converts a region of `utf8In` into `utf16Out` (capacity `utf16Len` units)
// and returns the number of units written; the exact meaning of `start` must be checked in the
// definition.
size_t ConvertRegionUtf8ToUtf16(
    const uint8_t* utf8In, uint16_t* utf16Out, size_t utf8Len, size_t utf16Len, size_t start);

// NOTE(review): presumably the UTF-16 -> UTF-8 counterpart of ConvertRegionUtf8ToUtf16; confirm the
// return value and the meaning of `start` against the definition.
size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
    size_t start);

// Takes the string by mutable reference.
// NOTE(review): presumably rewrites ill-formed byte sequences in place; confirm against the definition.
void ConvertIllegalStr(std::string& str);

// NOTE(review): presumably reports whether `data` is well-formed UTF-8; the parameter is a non-const
// reference in the existing interface, so it is kept as is.
bool IsUTF8(std::string& data);

/**
 * Splits a packed 32-bit value into its two 16-bit halves.
 *
 * @param pair 32-bit value holding two 16-bit units.
 * @return { upper 16 bits of pair, lower 16 bits of pair }.
 */
inline std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t P1_MASK = 0xffff;
    constexpr uint32_t P2_SHIFT = 16;
    // Explicit casts document the intended narrowing to one UTF-16 code unit each.
    return { static_cast<uint16_t>(pair >> P2_SHIFT), static_cast<uint16_t>(pair & P1_MASK) };
}

} // namespace OHOS::Ace

#endif // FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H