1/**
2 * Copyright (c) 2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "utf.h"
17#include <memory>
18
19namespace OHOS::Ace {
20
21/*
22 * MUtf-8
23 *
24 * U+0000 => C0 80
25 *
26 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
27 *    code point   code point   code point
28 * 1  7            U+0000       U+007F      0xxxxxxx
29 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
30 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
31 * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
32 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
33 */
34
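/*
 * Note: this comment describes ConvertMUtf8ToUtf16Pair (defined further below).
 * It converts one mutf8 sequence to a utf16 pair and returns the pair: [utf16 code point, mutf8 size].
 * If the buffer ends before the sequence is complete, it returns the first byte of the sequence with a size of 1.
 */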
39size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len)
40{
41    size_t pos = 0;
42    size_t res = 0;
43    while (pos != mutf8Len) {
44        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
45        if (nbytes == 0) {
46            nbytes = 1;
47        }
48        res += pair > MAX_U16 ? CONST_2 : 1;
49        mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
50        pos += nbytes;
51    }
52    return res;
53}
54
55std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes)
56{
57    uint8_t d0 = *data;
58    if ((d0 & MASK1) == 0) {
59        return { d0, 1 };
60    }
61
62    if (maxBytes < CONST_2) {
63        return { d0, 1 };
64    }
65    uint8_t d1 = *(data + 1);
66    if ((d0 & MASK2) == 0) {
67        return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 };
68    }
69
70    if (maxBytes < CONST_3) {
71        return { d0, 1 };
72    }
73    uint8_t d2 = *(data + CONST_2);
74    if ((d0 & MASK3) == 0) {
75        return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
76            CONST_3 };
77    }
78
79    if (maxBytes < CONST_4) {
80        return { d0, 1 };
81    }
82    uint8_t d3 = *(data + CONST_3);
83    uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
84                         ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
85
86    uint32_t pair = 0;
87    pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
88    pair <<= PAIR_ELEMENT_WIDTH;
89    pair |= (codePoint & MASK_10BIT) + U16_TAIL;
90
91    return { pair, CONST_4 };
92}
93
94size_t ConvertRegionUtf8ToUtf16(
95    const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)
96{
97    size_t inPos = 0;
98    size_t outPos = 0;
99    while (inPos < mutf8Len) {
100        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
101        auto [pHi, pLo] = SplitUtf16Pair(pair);
102
103        mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104        inPos += nbytes;
105        if (start > 0) {
106            start -= nbytes;
107            continue;
108        }
109
110        if (pHi != 0) {
111            if (outPos++ >= utf16Len - 1) { // check for place for two uint16
112                --outPos;
113                break;
114            }
115            *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
116        }
117        if (outPos++ >= utf16Len) {
118            --outPos;
119            break;
120        }
121        *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
122    }
123    return outPos;
124}
125
126bool IsUTF16HighSurrogate(uint16_t ch)
127{
128    return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
129}
130
131bool IsUTF16LowSurrogate(uint16_t ch)
132{
133    return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
134}
135
136size_t UTF8Length(uint32_t codePoint)
137{
138    if (codePoint <= UTF8_1B_MAX) {
139        return UtfLength::ONE;
140    }
141    if (codePoint <= UTF8_2B_MAX) {
142        return UtfLength::TWO;
143    }
144    if (codePoint <= UTF8_3B_MAX) {
145        return UtfLength::THREE;
146    }
147    return UtfLength::FOUR;
148}
149
// Methods for converting between Unicode encodings (UTF-16 to UTF-8)
151size_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index)
152{
153    size_t size = UTF8Length(codePoint);
154    if (index + size > len) {
155        return 0;
156    }
157    for (size_t j = size - 1; j > 0; j--) {
158        uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK);
159        utf8[index + j] = cont;
160        codePoint >>= UTF8_OFFSET;
161    }
162    utf8[index] = codePoint | FIRST_BYTE_MARK[size];
163    return size;
164}
165
166uint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index)
167{
168    uint16_t first = utf16[*index];
169    // A valid surrogate pair should always start with a High Surrogate
170    if (IsUTF16LowSurrogate(first)) {
171        return UTF16_REPLACEMENT_CHARACTER;
172    }
173    if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
174        if (*index == len - 1) {
175            // A High surrogate not paired with another surrogate
176            return UTF16_REPLACEMENT_CHARACTER;
177        }
178        uint16_t second = utf16[*index + 1];
179        if (!IsUTF16LowSurrogate(second)) {
180            // A High surrogate not followed by a low surrogate
181            return UTF16_REPLACEMENT_CHARACTER;
182        }
183        // A valid surrogate pair, decode normally
184        (*index)++;
185        return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
186    }
187    // A unicode not fallen into the range of representing by surrogate pair, return as it is
188    return first;
189}
190
191size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
192    size_t start)
193{
194    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
195        return 0;
196    }
197    size_t utf8Pos = 0;
198    size_t end = start + utf16Len;
199    for (size_t i = start; i < end; ++i) {
200        uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
201        if (codePoint == 0) {
202            continue;
203        }
204        utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos);
205    }
206    return utf8Pos;
207}
208
209bool IsUTF8(std::string& data)
210{
211    if (data.empty()) {
212        return false;
213    }
214
215    bool hasZeroByte = false;
216    bool hasMultiByteUTF8 = false;
217
218    for (size_t i = 0; i < data.size(); ++i) {
219        unsigned char c = data[i];
220
221        // Check for UTF-16LE byte order mark (BOM)
222        if (i == 0 && data.size() >= INDEX_TWO && data[INDEX_ONE] == UTF16LE_ZERO_BYTE &&
223            (c == UTF16LE_BOM_FF || c == UTF16LE_BOM_FE)) {
224            return false;
225        }
226
227        // Check for zero bytes, which are common in UTF-16LE
228        if (c == UTF16LE_ZERO_BYTE) {
229            hasZeroByte = true;
230        }
231
232        // Check for multi-byte UTF-8 sequences
233        if ((c & UTF8_HIGH_BIT) != 0) { // High bit is set, indicating a non-ASCII character
234            if ((c & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN && i + INDEX_ONE < data.size() &&
235                (data[i + INDEX_ONE ] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
236                // Two-byte UTF-8 character
237                hasMultiByteUTF8 = true;
238                i += INDEX_ONE; // Skip the next byte
239            } else if ((c & UTF8_THREE_BYTE_MASK) == UTF8_THREE_BYTE_PATTERN && i + INDEX_TWO < data.size() &&
240                       (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
241                       (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
242                // Three-byte UTF-8 character
243                hasMultiByteUTF8 = true;
244                i += INDEX_TWO; // Skip the next two bytes
245            } else if ((c & UTF8_FOUR_BYTE_MASK) == UTF8_FOUR_BYTE_PATTERN && i + INDEX_THREE < data.size() &&
246                       (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
247                       (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
248                       (data[i + INDEX_THREE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
249                // Four-byte UTF-8 character
250                hasMultiByteUTF8 = true;
251                i += INDEX_THREE; // Skip the next three bytes
252            }
253        }
254    }
255
256    if (hasZeroByte && !hasMultiByteUTF8) {
257        // If we found zero bytes and no multi-byte UTF-8 sequences, it's likely UTF-16LE
258        return false;
259    } else if (hasMultiByteUTF8) {
260        // If we found multi-byte UTF-8 sequences, it's likely UTF-8
261        return true;
262    } else {
263        // If all characters are ASCII, it's either pure ASCII or we don't have enough data to determine the encoding
264        return false;
265    }
266}
267
268void ConvertIllegalStr(std::string& str)
269{
270    if (IsUTF8(str)) {
271        uint8_t* buf8 =  reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str()));
272        size_t utf8Len = str.size();
273        auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len);
274        std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len);
275        auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0);
276        if (resultLen == utf16Len) {
277            DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0);
278        }
279    }
280}
281
282} // namespace OHOS::Ace
283