14514f5e3Sopenharmony_ci/*
24514f5e3Sopenharmony_ci * Copyright (c) 2021 Huawei Device Co., Ltd.
34514f5e3Sopenharmony_ci * Licensed under the Apache License, Version 2.0 (the "License");
44514f5e3Sopenharmony_ci * you may not use this file except in compliance with the License.
54514f5e3Sopenharmony_ci * You may obtain a copy of the License at
64514f5e3Sopenharmony_ci *
74514f5e3Sopenharmony_ci *     http://www.apache.org/licenses/LICENSE-2.0
84514f5e3Sopenharmony_ci *
94514f5e3Sopenharmony_ci * Unless required by applicable law or agreed to in writing, software
104514f5e3Sopenharmony_ci * distributed under the License is distributed on an "AS IS" BASIS,
114514f5e3Sopenharmony_ci * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
124514f5e3Sopenharmony_ci * See the License for the specific language governing permissions and
134514f5e3Sopenharmony_ci * limitations under the License.
144514f5e3Sopenharmony_ci */
154514f5e3Sopenharmony_ci
164514f5e3Sopenharmony_ci#include "ecmascript/base/utf_helper.h"
174514f5e3Sopenharmony_ci
184514f5e3Sopenharmony_ci#include "ecmascript/log_wrapper.h"
194514f5e3Sopenharmony_ci
// Constant part of the surrogate-pair formula
//     cp = ((lead - 0xd800) << 10) + (trail - 0xdc00) + 0x10000
// folded into a single value, so a pair can be combined with one shift, one subtraction and one addition.
static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;
// Combines a lead/trail surrogate pair into the supplementary code point it encodes (as int32_t).
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<int32_t>(lead) << 10UL) - U16_SURROGATE_OFFSET + static_cast<int32_t>(trail))
254514f5e3Sopenharmony_ci
264514f5e3Sopenharmony_cinamespace panda::ecmascript::base::utf_helper {
274514f5e3Sopenharmony_ci
284514f5e3Sopenharmony_ciuint32_t UTF16Decode(uint16_t lead, uint16_t trail)
294514f5e3Sopenharmony_ci{
304514f5e3Sopenharmony_ci    ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
314514f5e3Sopenharmony_ci           (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
324514f5e3Sopenharmony_ci    uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
334514f5e3Sopenharmony_ci    return cp;
344514f5e3Sopenharmony_ci}
354514f5e3Sopenharmony_ci
364514f5e3Sopenharmony_cibool IsUTF16HighSurrogate(uint16_t ch)
374514f5e3Sopenharmony_ci{
384514f5e3Sopenharmony_ci    return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
394514f5e3Sopenharmony_ci}
404514f5e3Sopenharmony_ci
414514f5e3Sopenharmony_cibool IsUTF16LowSurrogate(uint16_t ch)
424514f5e3Sopenharmony_ci{
434514f5e3Sopenharmony_ci    return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
444514f5e3Sopenharmony_ci}
454514f5e3Sopenharmony_ci
464514f5e3Sopenharmony_ci// Methods for decode utf16 to unicode
474514f5e3Sopenharmony_ciuint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8)
484514f5e3Sopenharmony_ci{
494514f5e3Sopenharmony_ci    uint16_t high = utf16[*index];
504514f5e3Sopenharmony_ci    if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
514514f5e3Sopenharmony_ci        return high;
524514f5e3Sopenharmony_ci    }
534514f5e3Sopenharmony_ci    uint16_t low = utf16[*index + 1];
544514f5e3Sopenharmony_ci    if (!IsUTF16LowSurrogate(low) || cesu8) {
554514f5e3Sopenharmony_ci        return high;
564514f5e3Sopenharmony_ci    }
574514f5e3Sopenharmony_ci    (*index)++;
584514f5e3Sopenharmony_ci    return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
594514f5e3Sopenharmony_ci}
604514f5e3Sopenharmony_ci
614514f5e3Sopenharmony_ciuint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
624514f5e3Sopenharmony_ci{
634514f5e3Sopenharmony_ci    uint16_t first = utf16[*index];
644514f5e3Sopenharmony_ci    // A valid surrogate pair should always start with a High Surrogate
654514f5e3Sopenharmony_ci    if (IsUTF16LowSurrogate(first)) {
664514f5e3Sopenharmony_ci        return UTF16_REPLACEMENT_CHARACTER;
674514f5e3Sopenharmony_ci    }
684514f5e3Sopenharmony_ci    if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
694514f5e3Sopenharmony_ci        if (*index == len - 1) {
704514f5e3Sopenharmony_ci            // A High surrogate not paired with another surrogate
714514f5e3Sopenharmony_ci            return UTF16_REPLACEMENT_CHARACTER;
724514f5e3Sopenharmony_ci        }
734514f5e3Sopenharmony_ci        uint16_t second = utf16[*index + 1];
744514f5e3Sopenharmony_ci        if (!IsUTF16LowSurrogate(second)) {
754514f5e3Sopenharmony_ci            // A High surrogate not followed by a low surrogate
764514f5e3Sopenharmony_ci            return UTF16_REPLACEMENT_CHARACTER;
774514f5e3Sopenharmony_ci        }
784514f5e3Sopenharmony_ci        // A valid surrogate pair, decode normally
794514f5e3Sopenharmony_ci        (*index)++;
804514f5e3Sopenharmony_ci        return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
814514f5e3Sopenharmony_ci    }
824514f5e3Sopenharmony_ci    // A unicode not fallen into the range of representing by surrogate pair, return as it is
834514f5e3Sopenharmony_ci    return first;
844514f5e3Sopenharmony_ci}
854514f5e3Sopenharmony_ci
864514f5e3Sopenharmony_ciinline size_t UTF8Length(uint32_t codepoint)
874514f5e3Sopenharmony_ci{
884514f5e3Sopenharmony_ci    if (codepoint <= UTF8_1B_MAX) {
894514f5e3Sopenharmony_ci        return UtfLength::ONE;
904514f5e3Sopenharmony_ci    }
914514f5e3Sopenharmony_ci    if (codepoint <= UTF8_2B_MAX) {
924514f5e3Sopenharmony_ci        return UtfLength::TWO;
934514f5e3Sopenharmony_ci    }
944514f5e3Sopenharmony_ci    if (codepoint <= UTF8_3B_MAX) {
954514f5e3Sopenharmony_ci        return UtfLength::THREE;
964514f5e3Sopenharmony_ci    }
974514f5e3Sopenharmony_ci    return UtfLength::FOUR;
984514f5e3Sopenharmony_ci}
994514f5e3Sopenharmony_ci
1004514f5e3Sopenharmony_ci// Methods for encode unicode to unicode
1014514f5e3Sopenharmony_cisize_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
1024514f5e3Sopenharmony_ci{
1034514f5e3Sopenharmony_ci    size_t size = UTF8Length(codepoint);
1044514f5e3Sopenharmony_ci    if (index + size > len) {
1054514f5e3Sopenharmony_ci        return 0;
1064514f5e3Sopenharmony_ci    }
1074514f5e3Sopenharmony_ci    for (size_t j = size - 1; j > 0; j--) {
1084514f5e3Sopenharmony_ci        uint8_t cont = ((codepoint | byteMark) & byteMask);
1094514f5e3Sopenharmony_ci        utf8[index + j] = cont;
1104514f5e3Sopenharmony_ci        codepoint >>= UTF8_OFFSET;
1114514f5e3Sopenharmony_ci    }
1124514f5e3Sopenharmony_ci    utf8[index] = codepoint | firstByteMark[size];
1134514f5e3Sopenharmony_ci    return size;
1144514f5e3Sopenharmony_ci}
1154514f5e3Sopenharmony_ci
1164514f5e3Sopenharmony_cibool IsValidUTF8(const std::vector<uint8_t> &data)
1174514f5e3Sopenharmony_ci{
1184514f5e3Sopenharmony_ci    uint32_t length = data.size();
1194514f5e3Sopenharmony_ci    switch (length) {
1204514f5e3Sopenharmony_ci        case UtfLength::ONE:
1214514f5e3Sopenharmony_ci            if (data.at(0) >= BIT_MASK_1) {
1224514f5e3Sopenharmony_ci                return false;
1234514f5e3Sopenharmony_ci            }
1244514f5e3Sopenharmony_ci            break;
1254514f5e3Sopenharmony_ci        case UtfLength::TWO:
1264514f5e3Sopenharmony_ci            if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
1274514f5e3Sopenharmony_ci                return false;
1284514f5e3Sopenharmony_ci            }
1294514f5e3Sopenharmony_ci            if (data.at(0) < UTF8_2B_FIRST_MIN) {
1304514f5e3Sopenharmony_ci                return false;
1314514f5e3Sopenharmony_ci            }
1324514f5e3Sopenharmony_ci            break;
1334514f5e3Sopenharmony_ci        case UtfLength::THREE:
1344514f5e3Sopenharmony_ci            if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
1354514f5e3Sopenharmony_ci                return false;
1364514f5e3Sopenharmony_ci            }
1374514f5e3Sopenharmony_ci            if (data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN) {
1384514f5e3Sopenharmony_ci                return false;
1394514f5e3Sopenharmony_ci            }
1404514f5e3Sopenharmony_ci            // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs, corresponds to %ED%A0%80~%ED%BF%BF
1414514f5e3Sopenharmony_ci            if (data.at(0) == UTF8_3B_RESERVED_FIRST && data.at(1) >= UTF8_3B_RESERVED_SECOND_MIN &&
1424514f5e3Sopenharmony_ci                data.at(1) <= UTF8_3B_RESERVED_SECOND_MAX) {
1434514f5e3Sopenharmony_ci                return false;
1444514f5e3Sopenharmony_ci            }
1454514f5e3Sopenharmony_ci            break;
1464514f5e3Sopenharmony_ci        case UtfLength::FOUR:
1474514f5e3Sopenharmony_ci            if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
1484514f5e3Sopenharmony_ci                return false;
1494514f5e3Sopenharmony_ci            }
1504514f5e3Sopenharmony_ci            if (data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN) {
1514514f5e3Sopenharmony_ci                return false;
1524514f5e3Sopenharmony_ci            }
1534514f5e3Sopenharmony_ci            // max four length binary: 11110(100) 10(001111) 10(111111) 10(111111), max data[0] is 0xF4, data[1] is 0x8F
1544514f5e3Sopenharmony_ci            if (data.at(0) > UTF8_4B_FIRST_MAX ||
1554514f5e3Sopenharmony_ci               (data.at(0) == UTF8_4B_FIRST_MAX && data.at(1) > UTF8_4B_SECOND_MAX)) {
1564514f5e3Sopenharmony_ci                return false;
1574514f5e3Sopenharmony_ci            }
1584514f5e3Sopenharmony_ci            break;
1594514f5e3Sopenharmony_ci        default:
1604514f5e3Sopenharmony_ci            LOG_ECMA(FATAL) << "this branch is unreachable";
1614514f5e3Sopenharmony_ci            UNREACHABLE();
1624514f5e3Sopenharmony_ci            break;
1634514f5e3Sopenharmony_ci    }
1644514f5e3Sopenharmony_ci
1654514f5e3Sopenharmony_ci    for (uint32_t i = 1; i < length; i++) {
1664514f5e3Sopenharmony_ci        if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
1674514f5e3Sopenharmony_ci            return false;
1684514f5e3Sopenharmony_ci        }
1694514f5e3Sopenharmony_ci    }
1704514f5e3Sopenharmony_ci    return true;
1714514f5e3Sopenharmony_ci}
1724514f5e3Sopenharmony_ci
1734514f5e3Sopenharmony_ciUtf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)
1744514f5e3Sopenharmony_ci{
1754514f5e3Sopenharmony_ci    // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
1764514f5e3Sopenharmony_ci    // means that is a single code point, it needs to be represented by three UTF8 code.
1774514f5e3Sopenharmony_ci    if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
1784514f5e3Sopenharmony_ci        auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
1794514f5e3Sopenharmony_ci        auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
1804514f5e3Sopenharmony_ci        auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
1814514f5e3Sopenharmony_ci        return {UtfLength::THREE, {ch0, ch1, ch2}};
1824514f5e3Sopenharmony_ci    }
1834514f5e3Sopenharmony_ci
1844514f5e3Sopenharmony_ci    if (d0 == 0) {
1854514f5e3Sopenharmony_ci        if (isWriteBuffer) {
1864514f5e3Sopenharmony_ci            return {1, {0x00U}};
1874514f5e3Sopenharmony_ci        }
1884514f5e3Sopenharmony_ci        if (modify) {
1894514f5e3Sopenharmony_ci            // special case for \u0000 ==> C080 - 1100'0000 1000'0000
1904514f5e3Sopenharmony_ci            return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
1914514f5e3Sopenharmony_ci        }
1924514f5e3Sopenharmony_ci        // For print string, just skip '\u0000'
1934514f5e3Sopenharmony_ci        return {0, {0x00U}};
1944514f5e3Sopenharmony_ci    }
1954514f5e3Sopenharmony_ci    if (d0 <= UTF8_1B_MAX) {
1964514f5e3Sopenharmony_ci        return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
1974514f5e3Sopenharmony_ci    }
1984514f5e3Sopenharmony_ci    if (d0 <= UTF8_2B_MAX) {
1994514f5e3Sopenharmony_ci        auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
2004514f5e3Sopenharmony_ci        auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
2014514f5e3Sopenharmony_ci        return {UtfLength::TWO, {ch0, ch1}};
2024514f5e3Sopenharmony_ci    }
2034514f5e3Sopenharmony_ci    if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
2044514f5e3Sopenharmony_ci        auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
2054514f5e3Sopenharmony_ci        auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
2064514f5e3Sopenharmony_ci        auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
2074514f5e3Sopenharmony_ci        return {UtfLength::THREE, {ch0, ch1, ch2}};
2084514f5e3Sopenharmony_ci    }
2094514f5e3Sopenharmony_ci    if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
2104514f5e3Sopenharmony_ci        // Bad sequence
2114514f5e3Sopenharmony_ci        LOG_ECMA(FATAL) << "this branch is unreachable";
2124514f5e3Sopenharmony_ci        UNREACHABLE();
2134514f5e3Sopenharmony_ci    }
2144514f5e3Sopenharmony_ci
2154514f5e3Sopenharmony_ci    uint32_t codePoint = CombineTwoU16(d0, d1);
2164514f5e3Sopenharmony_ci
2174514f5e3Sopenharmony_ci    auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
2184514f5e3Sopenharmony_ci    auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
2194514f5e3Sopenharmony_ci    auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
2204514f5e3Sopenharmony_ci    auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
2214514f5e3Sopenharmony_ci    return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
2224514f5e3Sopenharmony_ci}
2234514f5e3Sopenharmony_ci
2244514f5e3Sopenharmony_cisize_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8)
2254514f5e3Sopenharmony_ci{
2264514f5e3Sopenharmony_ci    size_t res = 1;  // zero byte
2274514f5e3Sopenharmony_ci    // when utf16 data length is only 1 and code in 0xd800-0xdfff,
2284514f5e3Sopenharmony_ci    // means that is a single code point, it needs to be represented by three UTF8 code.
2294514f5e3Sopenharmony_ci    if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2304514f5e3Sopenharmony_ci        utf16[0] <= utf::LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2314514f5e3Sopenharmony_ci        res += UtfLength::THREE;
2324514f5e3Sopenharmony_ci        return res;
2334514f5e3Sopenharmony_ci    }
2344514f5e3Sopenharmony_ci
2354514f5e3Sopenharmony_ci    for (uint32_t i = 0; i < length; ++i) {
2364514f5e3Sopenharmony_ci        if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2374514f5e3Sopenharmony_ci            if (isGetBufferSize) {
2384514f5e3Sopenharmony_ci                res += UtfLength::ONE;
2394514f5e3Sopenharmony_ci            } else if (modify) {
2404514f5e3Sopenharmony_ci                res += UtfLength::TWO;  // special case for U+0000 => C0 80
2414514f5e3Sopenharmony_ci            }
2424514f5e3Sopenharmony_ci        } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2434514f5e3Sopenharmony_ci            res += 1;
2444514f5e3Sopenharmony_ci        } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2454514f5e3Sopenharmony_ci            res += UtfLength::TWO;
2464514f5e3Sopenharmony_ci            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2474514f5e3Sopenharmony_ci        } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
2484514f5e3Sopenharmony_ci            res += UtfLength::THREE;
2494514f5e3Sopenharmony_ci        } else {
2504514f5e3Sopenharmony_ci            if (!cesu8 && i < length - 1 &&
2514514f5e3Sopenharmony_ci                utf16[i + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2524514f5e3Sopenharmony_ci                utf16[i + 1] <= utf::LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
2534514f5e3Sopenharmony_ci                res += UtfLength::FOUR;
2544514f5e3Sopenharmony_ci                ++i;
2554514f5e3Sopenharmony_ci            } else {
2564514f5e3Sopenharmony_ci                res += UtfLength::THREE;
2574514f5e3Sopenharmony_ci            }
2584514f5e3Sopenharmony_ci        }
2594514f5e3Sopenharmony_ci    }
2604514f5e3Sopenharmony_ci    return res;
2614514f5e3Sopenharmony_ci}
2624514f5e3Sopenharmony_ci
2634514f5e3Sopenharmony_cisize_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
2644514f5e3Sopenharmony_ci                                size_t start, bool modify, bool isWriteBuffer, bool cesu8)
2654514f5e3Sopenharmony_ci{
2664514f5e3Sopenharmony_ci    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
2674514f5e3Sopenharmony_ci        return 0;
2684514f5e3Sopenharmony_ci    }
2694514f5e3Sopenharmony_ci    size_t utf8Pos = 0;
2704514f5e3Sopenharmony_ci    size_t end = start + utf16Len;
2714514f5e3Sopenharmony_ci    for (size_t i = start; i < end; ++i) {
2724514f5e3Sopenharmony_ci        uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8);
2734514f5e3Sopenharmony_ci        if (codepoint == 0) {
2744514f5e3Sopenharmony_ci            if (isWriteBuffer) {
2754514f5e3Sopenharmony_ci                utf8Out[utf8Pos++] = 0x00U;
2764514f5e3Sopenharmony_ci                continue;
2774514f5e3Sopenharmony_ci            }
2784514f5e3Sopenharmony_ci            if (modify) {
2794514f5e3Sopenharmony_ci                // special case for \u0000 ==> C080 - 1100'0000 1000'0000
2804514f5e3Sopenharmony_ci                utf8Out[utf8Pos++] = UTF8_2B_FIRST;
2814514f5e3Sopenharmony_ci                utf8Out[utf8Pos++] = UTF8_2B_SECOND;
2824514f5e3Sopenharmony_ci            }
2834514f5e3Sopenharmony_ci            continue;
2844514f5e3Sopenharmony_ci        }
2854514f5e3Sopenharmony_ci        utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
2864514f5e3Sopenharmony_ci    }
2874514f5e3Sopenharmony_ci    return utf8Pos;
2884514f5e3Sopenharmony_ci}
2894514f5e3Sopenharmony_ci
2904514f5e3Sopenharmony_cisize_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
2914514f5e3Sopenharmony_ci                                        size_t start, bool modify, bool isWriteBuffer)
2924514f5e3Sopenharmony_ci{
2934514f5e3Sopenharmony_ci    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
2944514f5e3Sopenharmony_ci        return 0;
2954514f5e3Sopenharmony_ci    }
2964514f5e3Sopenharmony_ci    size_t utf8Pos = 0;
2974514f5e3Sopenharmony_ci    size_t end = start + utf16Len;
2984514f5e3Sopenharmony_ci    for (size_t i = start; i < end; ++i) {
2994514f5e3Sopenharmony_ci        uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
3004514f5e3Sopenharmony_ci        if (codepoint == 0) {
3014514f5e3Sopenharmony_ci            if (isWriteBuffer) {
3024514f5e3Sopenharmony_ci                utf8Out[utf8Pos++] = 0x00U;
3034514f5e3Sopenharmony_ci                continue;
3044514f5e3Sopenharmony_ci            }
3054514f5e3Sopenharmony_ci            if (modify) {
3064514f5e3Sopenharmony_ci                // special case for \u0000 ==> C080 - 1100'0000 1000'0000
3074514f5e3Sopenharmony_ci                utf8Out[utf8Pos++] = UTF8_2B_FIRST;
3084514f5e3Sopenharmony_ci                utf8Out[utf8Pos++] = UTF8_2B_SECOND;
3094514f5e3Sopenharmony_ci            }
3104514f5e3Sopenharmony_ci            continue;
3114514f5e3Sopenharmony_ci        }
3124514f5e3Sopenharmony_ci        utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
3134514f5e3Sopenharmony_ci    }
3144514f5e3Sopenharmony_ci    return utf8Pos;
3154514f5e3Sopenharmony_ci}
3164514f5e3Sopenharmony_ci
3174514f5e3Sopenharmony_cistd::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
3184514f5e3Sopenharmony_ci{
3194514f5e3Sopenharmony_ci    uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
3204514f5e3Sopenharmony_ci    if ((d0 & utf::MASK1) == 0) {
3214514f5e3Sopenharmony_ci        return {d0, 1};
3224514f5e3Sopenharmony_ci    }
3234514f5e3Sopenharmony_ci
3244514f5e3Sopenharmony_ci    uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
3254514f5e3Sopenharmony_ci    if ((d0 & utf::MASK2) == 0) {
3264514f5e3Sopenharmony_ci        return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
3274514f5e3Sopenharmony_ci    }
3284514f5e3Sopenharmony_ci
3294514f5e3Sopenharmony_ci    uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
3304514f5e3Sopenharmony_ci    if ((d0 & utf::MASK3) == 0) {
3314514f5e3Sopenharmony_ci        return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
3324514f5e3Sopenharmony_ci                    (d2 & utf::MASK_6BIT),
3334514f5e3Sopenharmony_ci                UtfLength::THREE};
3344514f5e3Sopenharmony_ci    }
3354514f5e3Sopenharmony_ci
3364514f5e3Sopenharmony_ci    uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
3374514f5e3Sopenharmony_ci    uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
3384514f5e3Sopenharmony_ci                         ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
3394514f5e3Sopenharmony_ci
3404514f5e3Sopenharmony_ci    uint32_t pair = 0;
3414514f5e3Sopenharmony_ci    if (combine) {
3424514f5e3Sopenharmony_ci        uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
3434514f5e3Sopenharmony_ci        uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
3444514f5e3Sopenharmony_ci        pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
3454514f5e3Sopenharmony_ci    } else {
3464514f5e3Sopenharmony_ci        pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
3474514f5e3Sopenharmony_ci        pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
3484514f5e3Sopenharmony_ci    }
3494514f5e3Sopenharmony_ci
3504514f5e3Sopenharmony_ci    return {pair, UtfLength::FOUR};
3514514f5e3Sopenharmony_ci}
3524514f5e3Sopenharmony_ci
3534514f5e3Sopenharmony_ci// drop the tail bytes if the remain length can't fill the length it represents.
3544514f5e3Sopenharmony_cistatic inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
3554514f5e3Sopenharmony_ci{
3564514f5e3Sopenharmony_ci    size_t trimSize = 0;
3574514f5e3Sopenharmony_ci    if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
3584514f5e3Sopenharmony_ci        // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one.
3594514f5e3Sopenharmony_ci        trimSize = 1;
3604514f5e3Sopenharmony_ci    }
3614514f5e3Sopenharmony_ci    if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
3624514f5e3Sopenharmony_ci        // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two.
3634514f5e3Sopenharmony_ci        trimSize = CONST_2;
3644514f5e3Sopenharmony_ci    }
3654514f5e3Sopenharmony_ci    if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
3664514f5e3Sopenharmony_ci        // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three.
3674514f5e3Sopenharmony_ci        trimSize = CONST_3;
3684514f5e3Sopenharmony_ci    }
3694514f5e3Sopenharmony_ci    return utf8Len - trimSize;
3704514f5e3Sopenharmony_ci}
3714514f5e3Sopenharmony_ci
3724514f5e3Sopenharmony_cisize_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
3734514f5e3Sopenharmony_ci{
3744514f5e3Sopenharmony_ci    size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
3754514f5e3Sopenharmony_ci    size_t in_pos = 0;
3764514f5e3Sopenharmony_ci    size_t res = 0;
3774514f5e3Sopenharmony_ci    while (in_pos < safeUtf8Len) {
3784514f5e3Sopenharmony_ci        uint8_t src = utf8[in_pos];
3794514f5e3Sopenharmony_ci        switch (src & 0xF0) {
3804514f5e3Sopenharmony_ci            case 0xF0: {
3814514f5e3Sopenharmony_ci                const uint8_t c2 = utf8[++in_pos];
3824514f5e3Sopenharmony_ci                const uint8_t c3 = utf8[++in_pos];
3834514f5e3Sopenharmony_ci                const uint8_t c4 = utf8[++in_pos];
3844514f5e3Sopenharmony_ci                uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
3854514f5e3Sopenharmony_ci                    ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
3864514f5e3Sopenharmony_ci                if (codePoint >= SURROGATE_RAIR_START) {
3874514f5e3Sopenharmony_ci                    res += CONST_2;
3884514f5e3Sopenharmony_ci                } else {
3894514f5e3Sopenharmony_ci                    res++;
3904514f5e3Sopenharmony_ci                }
3914514f5e3Sopenharmony_ci                in_pos++;
3924514f5e3Sopenharmony_ci                break;
3934514f5e3Sopenharmony_ci            }
3944514f5e3Sopenharmony_ci            case 0xE0: {
3954514f5e3Sopenharmony_ci                in_pos += CONST_3;
3964514f5e3Sopenharmony_ci                res++;
3974514f5e3Sopenharmony_ci                break;
3984514f5e3Sopenharmony_ci            }
3994514f5e3Sopenharmony_ci            case 0xD0:
4004514f5e3Sopenharmony_ci            case 0xC0: {
4014514f5e3Sopenharmony_ci                in_pos += CONST_2;
4024514f5e3Sopenharmony_ci                res++;
4034514f5e3Sopenharmony_ci                break;
4044514f5e3Sopenharmony_ci            }
4054514f5e3Sopenharmony_ci            default:
4064514f5e3Sopenharmony_ci                do {
4074514f5e3Sopenharmony_ci                    in_pos++;
4084514f5e3Sopenharmony_ci                    res++;
4094514f5e3Sopenharmony_ci                } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
4104514f5e3Sopenharmony_ci                break;
4114514f5e3Sopenharmony_ci        }
4124514f5e3Sopenharmony_ci    }
4134514f5e3Sopenharmony_ci    // The remain chars should be treated as single byte char.
4144514f5e3Sopenharmony_ci    res += utf8Len - in_pos;
4154514f5e3Sopenharmony_ci    return res;
4164514f5e3Sopenharmony_ci}
4174514f5e3Sopenharmony_ci
4184514f5e3Sopenharmony_cisize_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
4194514f5e3Sopenharmony_ci{
4204514f5e3Sopenharmony_ci    size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
4214514f5e3Sopenharmony_ci    size_t in_pos = 0;
4224514f5e3Sopenharmony_ci    size_t out_pos = 0;
4234514f5e3Sopenharmony_ci    while (in_pos < safeUtf8Len && out_pos < utf16Len) {
4244514f5e3Sopenharmony_ci        uint8_t src = utf8In[in_pos];
4254514f5e3Sopenharmony_ci        switch (src & 0xF0) {
4264514f5e3Sopenharmony_ci            case 0xF0: {
4274514f5e3Sopenharmony_ci                const uint8_t c2 = utf8In[++in_pos];
4284514f5e3Sopenharmony_ci                const uint8_t c3 = utf8In[++in_pos];
4294514f5e3Sopenharmony_ci                const uint8_t c4 = utf8In[++in_pos];
4304514f5e3Sopenharmony_ci                uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
4314514f5e3Sopenharmony_ci                    ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
4324514f5e3Sopenharmony_ci                if (codePoint >= SURROGATE_RAIR_START) {
4334514f5e3Sopenharmony_ci                    ASSERT(utf16Len >= 1);
4344514f5e3Sopenharmony_ci                    if (out_pos >= utf16Len - 1) {
4354514f5e3Sopenharmony_ci                        return out_pos;
4364514f5e3Sopenharmony_ci                    }
4374514f5e3Sopenharmony_ci                    codePoint -= SURROGATE_RAIR_START;
4384514f5e3Sopenharmony_ci                    utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
4394514f5e3Sopenharmony_ci                    utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
4404514f5e3Sopenharmony_ci                } else {
4414514f5e3Sopenharmony_ci                    utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
4424514f5e3Sopenharmony_ci                }
4434514f5e3Sopenharmony_ci                in_pos++;
4444514f5e3Sopenharmony_ci                break;
4454514f5e3Sopenharmony_ci            }
4464514f5e3Sopenharmony_ci            case 0xE0: {
4474514f5e3Sopenharmony_ci                const uint8_t c2 = utf8In[++in_pos];
4484514f5e3Sopenharmony_ci                const uint8_t c3 = utf8In[++in_pos];
4494514f5e3Sopenharmony_ci                utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
4504514f5e3Sopenharmony_ci                    ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
4514514f5e3Sopenharmony_ci                in_pos++;
4524514f5e3Sopenharmony_ci                break;
4534514f5e3Sopenharmony_ci            }
4544514f5e3Sopenharmony_ci            case 0xD0:
4554514f5e3Sopenharmony_ci            case 0xC0: {
4564514f5e3Sopenharmony_ci                const uint8_t c2 = utf8In[++in_pos];
4574514f5e3Sopenharmony_ci                utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
4584514f5e3Sopenharmony_ci                in_pos++;
4594514f5e3Sopenharmony_ci                break;
4604514f5e3Sopenharmony_ci            }
4614514f5e3Sopenharmony_ci            default:
4624514f5e3Sopenharmony_ci                do {
4634514f5e3Sopenharmony_ci                    utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
4644514f5e3Sopenharmony_ci                } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
4654514f5e3Sopenharmony_ci                break;
4664514f5e3Sopenharmony_ci        }
4674514f5e3Sopenharmony_ci    }
4684514f5e3Sopenharmony_ci    // The remain chars should be treated as single byte char.
4694514f5e3Sopenharmony_ci    while (in_pos < utf8Len && out_pos < utf16Len) {
4704514f5e3Sopenharmony_ci        utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
4714514f5e3Sopenharmony_ci    }
4724514f5e3Sopenharmony_ci    return out_pos;
4734514f5e3Sopenharmony_ci}
4744514f5e3Sopenharmony_ci
4754514f5e3Sopenharmony_cisize_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
4764514f5e3Sopenharmony_ci{
4774514f5e3Sopenharmony_ci    if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) {
4784514f5e3Sopenharmony_ci        return 0;
4794514f5e3Sopenharmony_ci    }
4804514f5e3Sopenharmony_ci    size_t latin1Pos = 0;
4814514f5e3Sopenharmony_ci    size_t end = utf16Len;
4824514f5e3Sopenharmony_ci    for (size_t i = 0; i < end; ++i) {
4834514f5e3Sopenharmony_ci        if (latin1Pos == latin1Len) {
4844514f5e3Sopenharmony_ci            break;
4854514f5e3Sopenharmony_ci        }
4864514f5e3Sopenharmony_ci        uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
4874514f5e3Sopenharmony_ci        uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit);
4884514f5e3Sopenharmony_ci        latin1Out[latin1Pos++] = latin1Code;
4894514f5e3Sopenharmony_ci    }
4904514f5e3Sopenharmony_ci    return latin1Pos;
4914514f5e3Sopenharmony_ci}
4924514f5e3Sopenharmony_ci
4934514f5e3Sopenharmony_cistd::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen)
4944514f5e3Sopenharmony_ci{
4954514f5e3Sopenharmony_ci    if (maxLen == 0) {
4964514f5e3Sopenharmony_ci        return {INVALID_UTF8, 0};
4974514f5e3Sopenharmony_ci    }
4984514f5e3Sopenharmony_ci    Span<const uint8_t> sp(utf8, maxLen);
4994514f5e3Sopenharmony_ci    // one byte
5004514f5e3Sopenharmony_ci    uint8_t d0 = sp[0];
5014514f5e3Sopenharmony_ci    if ((d0 & BIT_MASK_1) == 0) {
5024514f5e3Sopenharmony_ci        return {d0, UtfLength::ONE};
5034514f5e3Sopenharmony_ci    }
5044514f5e3Sopenharmony_ci    if (maxLen < UtfLength::TWO) {
5054514f5e3Sopenharmony_ci        return {INVALID_UTF8, 0};
5064514f5e3Sopenharmony_ci    }
5074514f5e3Sopenharmony_ci    // two bytes
5084514f5e3Sopenharmony_ci    uint8_t d1 = sp[UtfLength::ONE];
5094514f5e3Sopenharmony_ci    if ((d0 & BIT_MASK_3) == BIT_MASK_2) {
5104514f5e3Sopenharmony_ci        if ((d1 & BIT_MASK_2) == BIT_MASK_1) {
5114514f5e3Sopenharmony_ci            return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
5124514f5e3Sopenharmony_ci        } else {
5134514f5e3Sopenharmony_ci            return {INVALID_UTF8, 0};
5144514f5e3Sopenharmony_ci        }
5154514f5e3Sopenharmony_ci    }
5164514f5e3Sopenharmony_ci    if (maxLen < UtfLength::THREE) {
5174514f5e3Sopenharmony_ci        return {INVALID_UTF8, 0};
5184514f5e3Sopenharmony_ci    }
5194514f5e3Sopenharmony_ci    // three bytes
5204514f5e3Sopenharmony_ci    uint8_t d2 = sp[UtfLength::TWO];
5214514f5e3Sopenharmony_ci    if ((d0 & BIT_MASK_4) == BIT_MASK_3) {
5224514f5e3Sopenharmony_ci        if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) {
5234514f5e3Sopenharmony_ci            return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) |
5244514f5e3Sopenharmony_ci                ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d2 & utf::MASK_6BIT), UtfLength::THREE};
5254514f5e3Sopenharmony_ci        } else {
5264514f5e3Sopenharmony_ci            return {INVALID_UTF8, 0};
5274514f5e3Sopenharmony_ci        }
5284514f5e3Sopenharmony_ci    }
5294514f5e3Sopenharmony_ci    if (maxLen < UtfLength::FOUR) {
5304514f5e3Sopenharmony_ci        return {INVALID_UTF8, 0};
5314514f5e3Sopenharmony_ci    }
5324514f5e3Sopenharmony_ci    // four bytes
5334514f5e3Sopenharmony_ci    uint8_t d3 = sp[UtfLength::THREE];
5344514f5e3Sopenharmony_ci    if ((d0 & BIT_MASK_5) == BIT_MASK_4) {
5354514f5e3Sopenharmony_ci        if (((d1 & BIT_MASK_2) == BIT_MASK_1) &&
5364514f5e3Sopenharmony_ci            ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) {
5374514f5e3Sopenharmony_ci            return {((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
5384514f5e3Sopenharmony_ci                ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT), UtfLength::FOUR};
5394514f5e3Sopenharmony_ci        } else {
5404514f5e3Sopenharmony_ci            return {INVALID_UTF8, 0};
5414514f5e3Sopenharmony_ci        }
5424514f5e3Sopenharmony_ci    }
5434514f5e3Sopenharmony_ci    return {INVALID_UTF8, 0};
5444514f5e3Sopenharmony_ci}
5454514f5e3Sopenharmony_ci}  // namespace panda::ecmascript::base::utf_helper
546