123b3eb3cSopenharmony_ci/**
223b3eb3cSopenharmony_ci * Copyright (c) 2024 Huawei Device Co., Ltd.
323b3eb3cSopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License");
423b3eb3cSopenharmony_ci * you may not use this file except in compliance with the License.
523b3eb3cSopenharmony_ci * You may obtain a copy of the License at
623b3eb3cSopenharmony_ci *
723b3eb3cSopenharmony_ci * http://www.apache.org/licenses/LICENSE-2.0
823b3eb3cSopenharmony_ci *
923b3eb3cSopenharmony_ci * Unless required by applicable law or agreed to in writing, software
1023b3eb3cSopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS,
1123b3eb3cSopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1223b3eb3cSopenharmony_ci * See the License for the specific language governing permissions and
1323b3eb3cSopenharmony_ci * limitations under the License.
1423b3eb3cSopenharmony_ci */
1523b3eb3cSopenharmony_ci
1623b3eb3cSopenharmony_ci#include "utf.h"
1723b3eb3cSopenharmony_ci#include <memory>
1823b3eb3cSopenharmony_ci
1923b3eb3cSopenharmony_cinamespace OHOS::Ace {
2023b3eb3cSopenharmony_ci
2123b3eb3cSopenharmony_ci/*
2223b3eb3cSopenharmony_ci * MUtf-8
2323b3eb3cSopenharmony_ci *
2423b3eb3cSopenharmony_ci * U+0000 => C0 80
2523b3eb3cSopenharmony_ci *
2623b3eb3cSopenharmony_ci * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
2723b3eb3cSopenharmony_ci *    code point   code point   code point
2823b3eb3cSopenharmony_ci * 1  7            U+0000       U+007F      0xxxxxxx
2923b3eb3cSopenharmony_ci * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
3023b3eb3cSopenharmony_ci * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
3123b3eb3cSopenharmony_ci * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
3223b3eb3cSopenharmony_ci * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
3323b3eb3cSopenharmony_ci */
3423b3eb3cSopenharmony_ci
3523b3eb3cSopenharmony_ci/*
3623b3eb3cSopenharmony_ci * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
3723b3eb3cSopenharmony_ci * In case of invalid sequence return first byte of it.
3823b3eb3cSopenharmony_ci */
3923b3eb3cSopenharmony_cisize_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len)
4023b3eb3cSopenharmony_ci{
4123b3eb3cSopenharmony_ci    size_t pos = 0;
4223b3eb3cSopenharmony_ci    size_t res = 0;
4323b3eb3cSopenharmony_ci    while (pos != mutf8Len) {
4423b3eb3cSopenharmony_ci        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
4523b3eb3cSopenharmony_ci        if (nbytes == 0) {
4623b3eb3cSopenharmony_ci            nbytes = 1;
4723b3eb3cSopenharmony_ci        }
4823b3eb3cSopenharmony_ci        res += pair > MAX_U16 ? CONST_2 : 1;
4923b3eb3cSopenharmony_ci        mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
5023b3eb3cSopenharmony_ci        pos += nbytes;
5123b3eb3cSopenharmony_ci    }
5223b3eb3cSopenharmony_ci    return res;
5323b3eb3cSopenharmony_ci}
5423b3eb3cSopenharmony_ci
5523b3eb3cSopenharmony_cistd::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes)
5623b3eb3cSopenharmony_ci{
5723b3eb3cSopenharmony_ci    uint8_t d0 = *data;
5823b3eb3cSopenharmony_ci    if ((d0 & MASK1) == 0) {
5923b3eb3cSopenharmony_ci        return { d0, 1 };
6023b3eb3cSopenharmony_ci    }
6123b3eb3cSopenharmony_ci
6223b3eb3cSopenharmony_ci    if (maxBytes < CONST_2) {
6323b3eb3cSopenharmony_ci        return { d0, 1 };
6423b3eb3cSopenharmony_ci    }
6523b3eb3cSopenharmony_ci    uint8_t d1 = *(data + 1);
6623b3eb3cSopenharmony_ci    if ((d0 & MASK2) == 0) {
6723b3eb3cSopenharmony_ci        return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 };
6823b3eb3cSopenharmony_ci    }
6923b3eb3cSopenharmony_ci
7023b3eb3cSopenharmony_ci    if (maxBytes < CONST_3) {
7123b3eb3cSopenharmony_ci        return { d0, 1 };
7223b3eb3cSopenharmony_ci    }
7323b3eb3cSopenharmony_ci    uint8_t d2 = *(data + CONST_2);
7423b3eb3cSopenharmony_ci    if ((d0 & MASK3) == 0) {
7523b3eb3cSopenharmony_ci        return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
7623b3eb3cSopenharmony_ci            CONST_3 };
7723b3eb3cSopenharmony_ci    }
7823b3eb3cSopenharmony_ci
7923b3eb3cSopenharmony_ci    if (maxBytes < CONST_4) {
8023b3eb3cSopenharmony_ci        return { d0, 1 };
8123b3eb3cSopenharmony_ci    }
8223b3eb3cSopenharmony_ci    uint8_t d3 = *(data + CONST_3);
8323b3eb3cSopenharmony_ci    uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
8423b3eb3cSopenharmony_ci                         ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
8523b3eb3cSopenharmony_ci
8623b3eb3cSopenharmony_ci    uint32_t pair = 0;
8723b3eb3cSopenharmony_ci    pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
8823b3eb3cSopenharmony_ci    pair <<= PAIR_ELEMENT_WIDTH;
8923b3eb3cSopenharmony_ci    pair |= (codePoint & MASK_10BIT) + U16_TAIL;
9023b3eb3cSopenharmony_ci
9123b3eb3cSopenharmony_ci    return { pair, CONST_4 };
9223b3eb3cSopenharmony_ci}
9323b3eb3cSopenharmony_ci
9423b3eb3cSopenharmony_cisize_t ConvertRegionUtf8ToUtf16(
9523b3eb3cSopenharmony_ci    const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)
9623b3eb3cSopenharmony_ci{
9723b3eb3cSopenharmony_ci    size_t inPos = 0;
9823b3eb3cSopenharmony_ci    size_t outPos = 0;
9923b3eb3cSopenharmony_ci    while (inPos < mutf8Len) {
10023b3eb3cSopenharmony_ci        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
10123b3eb3cSopenharmony_ci        auto [pHi, pLo] = SplitUtf16Pair(pair);
10223b3eb3cSopenharmony_ci
10323b3eb3cSopenharmony_ci        mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
10423b3eb3cSopenharmony_ci        inPos += nbytes;
10523b3eb3cSopenharmony_ci        if (start > 0) {
10623b3eb3cSopenharmony_ci            start -= nbytes;
10723b3eb3cSopenharmony_ci            continue;
10823b3eb3cSopenharmony_ci        }
10923b3eb3cSopenharmony_ci
11023b3eb3cSopenharmony_ci        if (pHi != 0) {
11123b3eb3cSopenharmony_ci            if (outPos++ >= utf16Len - 1) { // check for place for two uint16
11223b3eb3cSopenharmony_ci                --outPos;
11323b3eb3cSopenharmony_ci                break;
11423b3eb3cSopenharmony_ci            }
11523b3eb3cSopenharmony_ci            *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
11623b3eb3cSopenharmony_ci        }
11723b3eb3cSopenharmony_ci        if (outPos++ >= utf16Len) {
11823b3eb3cSopenharmony_ci            --outPos;
11923b3eb3cSopenharmony_ci            break;
12023b3eb3cSopenharmony_ci        }
12123b3eb3cSopenharmony_ci        *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
12223b3eb3cSopenharmony_ci    }
12323b3eb3cSopenharmony_ci    return outPos;
12423b3eb3cSopenharmony_ci}
12523b3eb3cSopenharmony_ci
12623b3eb3cSopenharmony_cibool IsUTF16HighSurrogate(uint16_t ch)
12723b3eb3cSopenharmony_ci{
12823b3eb3cSopenharmony_ci    return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
12923b3eb3cSopenharmony_ci}
13023b3eb3cSopenharmony_ci
13123b3eb3cSopenharmony_cibool IsUTF16LowSurrogate(uint16_t ch)
13223b3eb3cSopenharmony_ci{
13323b3eb3cSopenharmony_ci    return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
13423b3eb3cSopenharmony_ci}
13523b3eb3cSopenharmony_ci
13623b3eb3cSopenharmony_cisize_t UTF8Length(uint32_t codePoint)
13723b3eb3cSopenharmony_ci{
13823b3eb3cSopenharmony_ci    if (codePoint <= UTF8_1B_MAX) {
13923b3eb3cSopenharmony_ci        return UtfLength::ONE;
14023b3eb3cSopenharmony_ci    }
14123b3eb3cSopenharmony_ci    if (codePoint <= UTF8_2B_MAX) {
14223b3eb3cSopenharmony_ci        return UtfLength::TWO;
14323b3eb3cSopenharmony_ci    }
14423b3eb3cSopenharmony_ci    if (codePoint <= UTF8_3B_MAX) {
14523b3eb3cSopenharmony_ci        return UtfLength::THREE;
14623b3eb3cSopenharmony_ci    }
14723b3eb3cSopenharmony_ci    return UtfLength::FOUR;
14823b3eb3cSopenharmony_ci}
14923b3eb3cSopenharmony_ci
15023b3eb3cSopenharmony_ci// Methods for encode unicode to unicode
15123b3eb3cSopenharmony_cisize_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index)
15223b3eb3cSopenharmony_ci{
15323b3eb3cSopenharmony_ci    size_t size = UTF8Length(codePoint);
15423b3eb3cSopenharmony_ci    if (index + size > len) {
15523b3eb3cSopenharmony_ci        return 0;
15623b3eb3cSopenharmony_ci    }
15723b3eb3cSopenharmony_ci    for (size_t j = size - 1; j > 0; j--) {
15823b3eb3cSopenharmony_ci        uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK);
15923b3eb3cSopenharmony_ci        utf8[index + j] = cont;
16023b3eb3cSopenharmony_ci        codePoint >>= UTF8_OFFSET;
16123b3eb3cSopenharmony_ci    }
16223b3eb3cSopenharmony_ci    utf8[index] = codePoint | FIRST_BYTE_MARK[size];
16323b3eb3cSopenharmony_ci    return size;
16423b3eb3cSopenharmony_ci}
16523b3eb3cSopenharmony_ci
16623b3eb3cSopenharmony_ciuint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index)
16723b3eb3cSopenharmony_ci{
16823b3eb3cSopenharmony_ci    uint16_t first = utf16[*index];
16923b3eb3cSopenharmony_ci    // A valid surrogate pair should always start with a High Surrogate
17023b3eb3cSopenharmony_ci    if (IsUTF16LowSurrogate(first)) {
17123b3eb3cSopenharmony_ci        return UTF16_REPLACEMENT_CHARACTER;
17223b3eb3cSopenharmony_ci    }
17323b3eb3cSopenharmony_ci    if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
17423b3eb3cSopenharmony_ci        if (*index == len - 1) {
17523b3eb3cSopenharmony_ci            // A High surrogate not paired with another surrogate
17623b3eb3cSopenharmony_ci            return UTF16_REPLACEMENT_CHARACTER;
17723b3eb3cSopenharmony_ci        }
17823b3eb3cSopenharmony_ci        uint16_t second = utf16[*index + 1];
17923b3eb3cSopenharmony_ci        if (!IsUTF16LowSurrogate(second)) {
18023b3eb3cSopenharmony_ci            // A High surrogate not followed by a low surrogate
18123b3eb3cSopenharmony_ci            return UTF16_REPLACEMENT_CHARACTER;
18223b3eb3cSopenharmony_ci        }
18323b3eb3cSopenharmony_ci        // A valid surrogate pair, decode normally
18423b3eb3cSopenharmony_ci        (*index)++;
18523b3eb3cSopenharmony_ci        return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
18623b3eb3cSopenharmony_ci    }
18723b3eb3cSopenharmony_ci    // A unicode not fallen into the range of representing by surrogate pair, return as it is
18823b3eb3cSopenharmony_ci    return first;
18923b3eb3cSopenharmony_ci}
19023b3eb3cSopenharmony_ci
19123b3eb3cSopenharmony_cisize_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
19223b3eb3cSopenharmony_ci    size_t start)
19323b3eb3cSopenharmony_ci{
19423b3eb3cSopenharmony_ci    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
19523b3eb3cSopenharmony_ci        return 0;
19623b3eb3cSopenharmony_ci    }
19723b3eb3cSopenharmony_ci    size_t utf8Pos = 0;
19823b3eb3cSopenharmony_ci    size_t end = start + utf16Len;
19923b3eb3cSopenharmony_ci    for (size_t i = start; i < end; ++i) {
20023b3eb3cSopenharmony_ci        uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
20123b3eb3cSopenharmony_ci        if (codePoint == 0) {
20223b3eb3cSopenharmony_ci            continue;
20323b3eb3cSopenharmony_ci        }
20423b3eb3cSopenharmony_ci        utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos);
20523b3eb3cSopenharmony_ci    }
20623b3eb3cSopenharmony_ci    return utf8Pos;
20723b3eb3cSopenharmony_ci}
20823b3eb3cSopenharmony_ci
20923b3eb3cSopenharmony_cibool IsUTF8(std::string& data)
21023b3eb3cSopenharmony_ci{
21123b3eb3cSopenharmony_ci    if (data.empty()) {
21223b3eb3cSopenharmony_ci        return false;
21323b3eb3cSopenharmony_ci    }
21423b3eb3cSopenharmony_ci
21523b3eb3cSopenharmony_ci    bool hasZeroByte = false;
21623b3eb3cSopenharmony_ci    bool hasMultiByteUTF8 = false;
21723b3eb3cSopenharmony_ci
21823b3eb3cSopenharmony_ci    for (size_t i = 0; i < data.size(); ++i) {
21923b3eb3cSopenharmony_ci        unsigned char c = data[i];
22023b3eb3cSopenharmony_ci
22123b3eb3cSopenharmony_ci        // Check for UTF-16LE byte order mark (BOM)
22223b3eb3cSopenharmony_ci        if (i == 0 && data.size() >= INDEX_TWO && data[INDEX_ONE] == UTF16LE_ZERO_BYTE &&
22323b3eb3cSopenharmony_ci            (c == UTF16LE_BOM_FF || c == UTF16LE_BOM_FE)) {
22423b3eb3cSopenharmony_ci            return false;
22523b3eb3cSopenharmony_ci        }
22623b3eb3cSopenharmony_ci
22723b3eb3cSopenharmony_ci        // Check for zero bytes, which are common in UTF-16LE
22823b3eb3cSopenharmony_ci        if (c == UTF16LE_ZERO_BYTE) {
22923b3eb3cSopenharmony_ci            hasZeroByte = true;
23023b3eb3cSopenharmony_ci        }
23123b3eb3cSopenharmony_ci
23223b3eb3cSopenharmony_ci        // Check for multi-byte UTF-8 sequences
23323b3eb3cSopenharmony_ci        if ((c & UTF8_HIGH_BIT) != 0) { // High bit is set, indicating a non-ASCII character
23423b3eb3cSopenharmony_ci            if ((c & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN && i + INDEX_ONE < data.size() &&
23523b3eb3cSopenharmony_ci                (data[i + INDEX_ONE ] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
23623b3eb3cSopenharmony_ci                // Two-byte UTF-8 character
23723b3eb3cSopenharmony_ci                hasMultiByteUTF8 = true;
23823b3eb3cSopenharmony_ci                i += INDEX_ONE; // Skip the next byte
23923b3eb3cSopenharmony_ci            } else if ((c & UTF8_THREE_BYTE_MASK) == UTF8_THREE_BYTE_PATTERN && i + INDEX_TWO < data.size() &&
24023b3eb3cSopenharmony_ci                       (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
24123b3eb3cSopenharmony_ci                       (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
24223b3eb3cSopenharmony_ci                // Three-byte UTF-8 character
24323b3eb3cSopenharmony_ci                hasMultiByteUTF8 = true;
24423b3eb3cSopenharmony_ci                i += INDEX_TWO; // Skip the next two bytes
24523b3eb3cSopenharmony_ci            } else if ((c & UTF8_FOUR_BYTE_MASK) == UTF8_FOUR_BYTE_PATTERN && i + INDEX_THREE < data.size() &&
24623b3eb3cSopenharmony_ci                       (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
24723b3eb3cSopenharmony_ci                       (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
24823b3eb3cSopenharmony_ci                       (data[i + INDEX_THREE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
24923b3eb3cSopenharmony_ci                // Four-byte UTF-8 character
25023b3eb3cSopenharmony_ci                hasMultiByteUTF8 = true;
25123b3eb3cSopenharmony_ci                i += INDEX_THREE; // Skip the next three bytes
25223b3eb3cSopenharmony_ci            }
25323b3eb3cSopenharmony_ci        }
25423b3eb3cSopenharmony_ci    }
25523b3eb3cSopenharmony_ci
25623b3eb3cSopenharmony_ci    if (hasZeroByte && !hasMultiByteUTF8) {
25723b3eb3cSopenharmony_ci        // If we found zero bytes and no multi-byte UTF-8 sequences, it's likely UTF-16LE
25823b3eb3cSopenharmony_ci        return false;
25923b3eb3cSopenharmony_ci    } else if (hasMultiByteUTF8) {
26023b3eb3cSopenharmony_ci        // If we found multi-byte UTF-8 sequences, it's likely UTF-8
26123b3eb3cSopenharmony_ci        return true;
26223b3eb3cSopenharmony_ci    } else {
26323b3eb3cSopenharmony_ci        // If all characters are ASCII, it's either pure ASCII or we don't have enough data to determine the encoding
26423b3eb3cSopenharmony_ci        return false;
26523b3eb3cSopenharmony_ci    }
26623b3eb3cSopenharmony_ci}
26723b3eb3cSopenharmony_ci
26823b3eb3cSopenharmony_civoid ConvertIllegalStr(std::string& str)
26923b3eb3cSopenharmony_ci{
27023b3eb3cSopenharmony_ci    if (IsUTF8(str)) {
27123b3eb3cSopenharmony_ci        uint8_t* buf8 =  reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str()));
27223b3eb3cSopenharmony_ci        size_t utf8Len = str.size();
27323b3eb3cSopenharmony_ci        auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len);
27423b3eb3cSopenharmony_ci        std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len);
27523b3eb3cSopenharmony_ci        auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0);
27623b3eb3cSopenharmony_ci        if (resultLen == utf16Len) {
27723b3eb3cSopenharmony_ci            DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0);
27823b3eb3cSopenharmony_ci        }
27923b3eb3cSopenharmony_ci    }
28023b3eb3cSopenharmony_ci}
28123b3eb3cSopenharmony_ci
28223b3eb3cSopenharmony_ci} // namespace OHOS::Ace
283