123b3eb3cSopenharmony_ci/**
223b3eb3cSopenharmony_ci * Copyright (c) 2024 Huawei Device Co., Ltd.
323b3eb3cSopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License");
423b3eb3cSopenharmony_ci * you may not use this file except in compliance with the License.
523b3eb3cSopenharmony_ci * You may obtain a copy of the License at
623b3eb3cSopenharmony_ci *
723b3eb3cSopenharmony_ci * http://www.apache.org/licenses/LICENSE-2.0
823b3eb3cSopenharmony_ci *
923b3eb3cSopenharmony_ci * Unless required by applicable law or agreed to in writing, software
1023b3eb3cSopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS,
1123b3eb3cSopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1223b3eb3cSopenharmony_ci * See the License for the specific language governing permissions and
1323b3eb3cSopenharmony_ci * limitations under the License.
1423b3eb3cSopenharmony_ci */
1523b3eb3cSopenharmony_ci
1623b3eb3cSopenharmony_ci#ifndef FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H
1723b3eb3cSopenharmony_ci#define FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H
1823b3eb3cSopenharmony_ci
1923b3eb3cSopenharmony_ci#include <cstddef>
2023b3eb3cSopenharmony_ci#include <cstdint>
2123b3eb3cSopenharmony_ci#include <utility>
2223b3eb3cSopenharmony_ci#include <string>
2323b3eb3cSopenharmony_ci
2423b3eb3cSopenharmony_cinamespace OHOS::Ace {
2523b3eb3cSopenharmony_ci
2623b3eb3cSopenharmony_ci/*
2723b3eb3cSopenharmony_ci * https://en.wikipedia.org/wiki/UTF-8
2823b3eb3cSopenharmony_ci *
2923b3eb3cSopenharmony_ci * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4
3023b3eb3cSopenharmony_ci *    code point   code point   code point
3123b3eb3cSopenharmony_ci * 1  7            U+0000       U+007F      0xxxxxxx
3223b3eb3cSopenharmony_ci * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
3323b3eb3cSopenharmony_ci * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
3423b3eb3cSopenharmony_ci * 4  21           U+10000      U+10FFFF    11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
3523b3eb3cSopenharmony_ci */
// Single-bit probes. Per the table above, bit 7 (MASK1) is clear only in a 1-byte sequence, bit 5 (MASK2) is
// clear in a 2-byte lead byte (110xxxxx) and bit 4 (MASK3) is clear in a 3-byte lead byte (1110xxxx).
// NOTE(review): the decoder that tests these bits is not part of this header -- confirm the usage there.
constexpr size_t MASK1 = 0b1000'0000;
constexpr size_t MASK2 = 0b0010'0000;
constexpr size_t MASK3 = 0b0001'0000;

// Low-bit masks: each keeps the number of least-significant bits given in its name.
constexpr size_t MASK_4BIT = 0b1111;
constexpr size_t MASK_5BIT = 0b1'1111;
constexpr size_t MASK_6BIT = 0b11'1111;
constexpr size_t MASK_10BIT = 0b11'1111'1111;
constexpr size_t MASK_16BIT = 0xFFFF;

// Bit widths: 6 equals the number of payload bits in a 10xxxxxx continuation byte (see the table above);
// 16 equals the width of one UTF-16 code unit.
constexpr size_t DATA_WIDTH = 6;
constexpr size_t PAIR_ELEMENT_WIDTH = 16;

// UTF-16 surrogate ranges: high (lead) units are U+D800..U+DBFF, low (trail) units are U+DC00..U+DFFF.
constexpr size_t HI_SURROGATE_MIN = 0xD800;
constexpr size_t HI_SURROGATE_MAX = 0xDBFF;
constexpr size_t LO_SURROGATE_MIN = 0xDC00;
constexpr size_t LO_SURROGATE_MAX = 0xDFFF;

// First code point beyond the 16-bit range; the table above lists it as the start of the 4-byte encodings.
constexpr size_t LO_SUPPLEMENTS_MIN = 0x1'0000;

// 0xD7C0 == HI_SURROGATE_MIN - (LO_SUPPLEMENTS_MIN >> 10) and 0xDC00 == LO_SURROGATE_MIN.
// NOTE(review): presumably the bases for building a surrogate pair as U16_LEAD + (cp >> 10) and
// U16_TAIL + (cp & MASK_10BIT) -- confirm against the encoder.
constexpr size_t U16_LEAD = 0xD7C0;
constexpr size_t U16_TAIL = 0xDC00;
5823b3eb3cSopenharmony_ci
// Largest value that fits in 7 bits (0xxxxxxx in the UTF-8 table at the top of this namespace).
constexpr uint8_t MUTF8_1B_MAX = 0b0111'1111;

// Largest value that fits in 11 bits, followed by the 110xxxxx lead-byte and 10xxxxxx continuation-byte markers.
constexpr uint16_t MUTF8_2B_MAX = 0x7FF;
constexpr uint8_t MUTF8_2B_FIRST = 0b1100'0000;
constexpr uint8_t MUTF8_2B_SECOND = 0b1000'0000;

// 1110xxxx lead-byte marker and the 10xxxxxx marker for both continuation bytes of a 3-byte sequence.
constexpr uint8_t MUTF8_3B_FIRST = 0b1110'0000;
constexpr uint8_t MUTF8_3B_SECOND = 0b1000'0000;
constexpr uint8_t MUTF8_3B_THIRD = 0b1000'0000;

// 11110xxx lead-byte marker of a 4-byte sequence.
constexpr uint8_t MUTF8_4B_FIRST = 0b1111'0000;

// Largest value representable in 16 bits.
constexpr size_t MAX_U16 = 0xFFFF;
// Named small integers.
// NOTE(review): their users are outside this header; 6 and 12 look like shifts by one and two 6-bit
// continuation groups -- confirm at the call sites.
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;
7723b3eb3cSopenharmony_ci
// Bounds of the UTF-16 lead (U+D800..U+DBFF) and trail (U+DC00..U+DFFF) surrogate ranges.
constexpr uint16_t DECODE_LEAD_LOW = 0xd800;
constexpr uint16_t DECODE_LEAD_HIGH = 0xdbff;
constexpr uint16_t DECODE_TRAIL_LOW = 0xdc00;
constexpr uint16_t DECODE_TRAIL_HIGH = 0xdfff;
// 0x400 and 0x10000 respectively.
// NOTE(review): presumably used as (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR +
// (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR to rebuild a code point -- confirm in the decoder.
constexpr uint32_t DECODE_FIRST_FACTOR = 1U << 10;
constexpr uint32_t DECODE_SECOND_FACTOR = 1U << 16;
// 6 equals the payload bits of a UTF-8 continuation byte; 10 equals the payload bits of one surrogate.
constexpr uint32_t UTF8_OFFSET = 6;
constexpr uint32_t UTF16_OFFSET = 10;
// Keeps the top five bits of a 16-bit unit; (unit & SURROGATE_MASK) == 0xD800 holds exactly for 0xD800..0xDFFF.
constexpr uint16_t SURROGATE_MASK = 0b1111'1000'0000'0000;
// U+FFFD REPLACEMENT CHARACTER.
constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xfffd;
8823b3eb3cSopenharmony_ci
// Largest value that fits in 7 bits (0xxxxxxx in the UTF-8 table at the top of this namespace).
constexpr uint8_t UTF8_1B_MAX = 0b0111'1111;

// 2-byte form: largest 11-bit value, 110xxxxx lead marker, 10xxxxxx continuation marker, and a 6-bit mask.
constexpr uint16_t UTF8_2B_MAX = 0x7FF;
constexpr uint8_t UTF8_2B_FIRST = 0b1100'0000;
constexpr uint8_t UTF8_2B_SECOND = 0b1000'0000;
constexpr uint8_t UTF8_2B_THIRD = 0b0011'1111;
// The smallest 2-byte value is 128, encoded as 0xc2 0x80, so a valid 2-byte lead is never below 0xc2.
constexpr uint8_t UTF8_2B_FIRST_MIN = 0b1100'0010;

// 3-byte form: largest 16-bit value, 1110xxxx lead marker and the two 10xxxxxx continuation markers.
constexpr uint16_t UTF8_3B_MAX = 0xFFFF;
constexpr uint8_t UTF8_3B_FIRST = 0b1110'0000;
constexpr uint8_t UTF8_3B_SECOND = 0b1000'0000;
constexpr uint8_t UTF8_3B_THIRD = 0b1000'0000;
// The smallest 3-byte value is 2048, encoded as 0xe0 0xa0 0x80: after lead 0xe0 the second byte is >= 0xa0.
constexpr uint8_t UTF8_3B_SECOND_MIN = 0b1010'0000;

// 4-byte form: 11110xxx lead marker.
constexpr uint8_t UTF8_4B_FIRST = 0b1111'0000;
// The smallest 4-byte value is 65536, encoded as 0xf0 0x90 0x80 0x80: after lead 0xf0 the second byte is >= 0x90.
constexpr uint8_t UTF8_4B_SECOND_MIN = 0b1001'0000;

// (value | BYTE_MARK) & BYTE_MASK forces the top two bits to 10, i.e. the 10xxxxxx continuation form.
// NOTE(review): that combined usage is an assumption -- the encoder is not in this header.
constexpr uint8_t BYTE_MASK = 0b1011'1111;
constexpr uint8_t BYTE_MARK = 0b1000'0000;
10823b3eb3cSopenharmony_ci
// Mask/pattern pairs for recognising UTF-8 bytes: (byte & X_MASK) == X_PATTERN selects the lead byte of an
// X-byte sequence as laid out in the UTF-8 table at the top of this namespace.
enum UTF8BytePatterns {
    UTF8_TWO_BYTE_MASK = 0b1110'0000,      // keeps the top 3 bits
    UTF8_TWO_BYTE_PATTERN = 0b1100'0000,   // 110xxxxx
    UTF8_THREE_BYTE_MASK = 0b1111'0000,    // keeps the top 4 bits
    UTF8_THREE_BYTE_PATTERN = 0b1110'0000, // 1110xxxx
    UTF8_FOUR_BYTE_MASK = 0b1111'1000,     // keeps the top 5 bits
    UTF8_FOUR_BYTE_PATTERN = 0b1111'0000,  // 11110xxx
    UTF8_MULTIBYTE_FOLLOWER = 0b1000'0000, // 10xxxxxx marker of a continuation byte
    UTF8_HIGH_BIT = 0b1000'0000            // bit 7; same value as UTF8_MULTIBYTE_FOLLOWER
};
11923b3eb3cSopenharmony_ci
// Byte values relevant to UTF-16LE detection: 0xff followed by 0xfe is the little-endian byte-order mark.
// NOTE(review): how UTF16LE_ZERO_BYTE is used is not visible in this header.
enum UTF16LEPatterns {
    UTF16LE_BOM_FF = 0xff,   // first BOM byte
    UTF16LE_BOM_FE = 0xfe,   // second BOM byte
    UTF16LE_ZERO_BYTE = 0x00 // a zero byte
};
12523b3eb3cSopenharmony_ci
// Named offsets 1..3; later enumerators take the previous value plus one.
// NOTE(review): presumably the positions of the 2nd..4th byte of a multi-byte sequence -- confirm at the users.
enum INDEX {
    INDEX_ONE = 1,
    INDEX_TWO,
    INDEX_THREE
};
13123b3eb3cSopenharmony_ci
// Sequence lengths 1..4 stored in a single byte; later enumerators take the previous value plus one.
enum UtfLength : uint8_t {
    ONE = 1,
    TWO,
    THREE,
    FOUR
};
13323b3eb3cSopenharmony_ci
// Lead-byte marker bits, one entry per index 0..6.
// NOTE(review): presumably indexed by the number of bytes in the encoded sequence and OR-ed into the first
// byte -- confirm in the encoder. Entries 5 and 6 carry the 111110xx / 1111110x patterns, which lie outside
// the 1..4 byte forms listed in the UTF-8 table at the top of this namespace.
const unsigned char FIRST_BYTE_MARK[7] = {
    0b0000'0000, // [0]
    0b0000'0000, // [1] 0xxxxxxx needs no marker bits
    0b1100'0000, // [2] 110xxxxx
    0b1110'0000, // [3] 1110xxxx
    0b1111'0000, // [4] 11110xxx
    0b1111'1000, // [5] 111110xx
    0b1111'1100  // [6] 1111110x
};
13523b3eb3cSopenharmony_ci
// The functions below are only declared in this header; their definitions live elsewhere, so every behavioural
// remark here is an assumption drawn from names and signatures and should be confirmed against the implementation.

// NOTE(review): presumably decodes one MUTF-8 sequence at `data` and returns { packed UTF-16 value, number of
// bytes consumed }; the 32-bit first element has the shape that SplitUtf16Pair takes as input.
// `maxBytes` defaults to 4 -- presumably an upper bound on how many bytes may be read from `data`.
std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes = 4);

// NOTE(review): presumably returns the number of UTF-16 code units needed for `mutf8Len` bytes at `mutf8`.
size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len);

// NOTE(review): presumably converts UTF-8 input (`utf8In`, `utf8Len`) into the caller-provided UTF-16 buffer
// (`utf16Out`, `utf16Len`) and returns the number of units written; the meaning of `start` (input offset?)
// is not visible here.
size_t ConvertRegionUtf8ToUtf16(
    const uint8_t* utf8In, uint16_t* utf16Out, size_t utf8Len, size_t utf16Len, size_t start);

// NOTE(review): presumably the reverse direction -- UTF-16 input (`utf16In`, `utf16Len`) into the UTF-8 buffer
// (`utf8Out`, `utf8Len`), returning the number of bytes written; the meaning of `start` is not visible here.
size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
    size_t start);

// Takes `str` by non-const reference, so the definition is able to modify the caller's string.
// NOTE(review): presumably rewrites ill-formed byte sequences in place -- confirm what they are replaced with.
void ConvertIllegalStr(std::string& str);

// NOTE(review): presumably reports whether `data` is well-formed UTF-8. The parameter is a non-const reference
// although the name suggests a read-only check; if the definition does not mutate it, const std::string& would
// describe the contract better (requires changing the definition too).
bool IsUTF8(std::string& data);
14923b3eb3cSopenharmony_ci
/**
 * Splits a packed 32-bit value into its two 16-bit halves.
 *
 * Declared constexpr (which also makes it implicitly inline) so it can be evaluated at compile time next to
 * the constants in this header. The truncation to uint16_t is written as explicit casts instead of relying
 * on the implicit narrowing inside std::pair's converting constructor.
 *
 * @param pair 32-bit value holding one 16-bit unit in bits 31..16 and another in bits 15..0.
 * @return { bits 31..16 of pair, bits 15..0 of pair }.
 */
constexpr std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t LOW_HALF_MASK = 0xffff;
    constexpr uint32_t HIGH_HALF_SHIFT = 16;
    return { static_cast<uint16_t>(pair >> HIGH_HALF_SHIFT), static_cast<uint16_t>(pair & LOW_HALF_MASK) };
}
15623b3eb3cSopenharmony_ci
15723b3eb3cSopenharmony_ci} // namespace OHOS::Ace
15823b3eb3cSopenharmony_ci
15923b3eb3cSopenharmony_ci#endif