1/*
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef ECMASCRIPT_BASE_STRING_HELP_H
17#define ECMASCRIPT_BASE_STRING_HELP_H
18
#include <algorithm>
#include <cerrno>
#include <codecvt>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <locale>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
26
27#include "ecmascript/base/utf_helper.h"
28#include "ecmascript/mem/c_containers.h"
29#include "ecmascript/mem/c_string.h"
30
31#include "securec.h"
32#include "unicode/unistr.h"
33
34namespace panda::ecmascript::base {
// UTF-16 code units of the white space and line terminator code points.
// The entries are kept in ascending order; StringHelper::IsNonspace relies on that to stop early.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
static constexpr uint16_t SPACE_OR_LINE_TERMINAL[] = {
    0x0009, 0x000A, 0x000B, 0x000C, 0x000D,                  // TAB, LF, VT, FF, CR
    0x0020, 0x00A0, 0x1680,                                  // SPACE, NO-BREAK SPACE, OGHAM SPACE MARK
    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,          // EN QUAD .. SIX-PER-EM SPACE
    0x2006, 0x2007, 0x2008, 0x2009, 0x200A,                  // .. HAIR SPACE
    0x2028, 0x2029,                                          // LINE SEPARATOR, PARAGRAPH SEPARATOR
    0x202F, 0x205F, 0x3000, 0xFEFF,                          // NNBSP, MMSP, IDEOGRAPHIC SPACE, BOM
};
// Lead-byte boundaries used by StringHelper::UnicodeFromUtf8 to classify the first byte of a sequence.
static constexpr int UICODE_FROM_UTF8[] = {
    0x80,        // [0]      bytes below this value are returned as-is (single-byte code points)
    0xc0, 0xdf,  // [1]-[2]  lead byte followed by 1 continuation byte
    0xe0, 0xef,  // [3]-[4]  lead byte followed by 2 continuation bytes
    0xf0, 0xf7,  // [5]-[6]  lead byte followed by 3 continuation bytes
    0xf8, 0xfb,  // [7]-[8]  lead byte followed by 4 continuation bytes
    0xfc, 0xfd,  // [9]-[10] lead byte followed by 5 continuation bytes
};
// Smallest code point accepted for a sequence with N continuation bytes, indexed by N - 1.
// StringHelper::FromUtf8 rejects decoded values below this bound.
static constexpr int UTF8_MIN_CODE[] = {
    0x80,
    0x800,
    0x10000,
    0x00200000,
    0x04000000,
};
// Mask selecting the payload bits of the lead byte for N continuation bytes, indexed by N - 1.
static constexpr char UTF8_FIRST_CODE[] = {
    0x1f,
    0xf,
    0x7,
    0x3,
    0x1,
};
50class StringHelper {
51public:
// Value returned by UnicodeFromUtf8 / FromUtf8 when the bytes cannot be decoded.
static constexpr int INVALID_UNICODE_FROM_UTF8{-1};
53
54    static inline CString ReplaceAll(CString str, const CString &oldValue,
55                                     const CString &newValue)
56    {
57        if (oldValue.empty() || oldValue == newValue) {
58            return str;
59        }
60        CString::size_type pos(0);
61        while ((pos = str.find(oldValue, pos)) != CString::npos) {
62            str.replace(pos, oldValue.length(), newValue);
63            pos += newValue.length();
64        }
65        return str;
66    }
67
68    static inline CString Replace(CString str, const CString &oldValue,
69                                  const CString &newValue)
70    {
71        if (oldValue.empty() || oldValue == newValue) {
72            return str;
73        }
74        CString::size_type pos(0);
75        if ((pos = str.find(oldValue, pos)) != CString::npos) {
76            str.replace(pos, oldValue.length(), newValue);
77        }
78        return str;
79    }
80
// Copies `dataLen` UTF-16 code units starting at `utf16Data` into a std::u16string.
// The units are copied verbatim (embedded zeros included); no validation is performed.
static inline std::u16string Utf16ToU16String(const uint16_t *utf16Data, uint32_t dataLen)
{
    return std::u16string(reinterpret_cast<const char16_t *>(utf16Data), dataLen);
}
87
// Copies `dataLen` bytes starting at `utf8Data` into a std::string.
// The bytes are copied verbatim (embedded zeros included); no UTF-8 validation is performed.
static inline std::string Utf8ToString(const uint8_t *utf8Data, uint32_t dataLen)
{
    return std::string(reinterpret_cast<const char *>(utf8Data), dataLen);
}
94
// Decodes `dataLen` bytes of UTF-8 starting at `utf8Data` into UTF-16 using
// std::wstring_convert with std::codecvt_utf8_utf16.
static inline std::u16string Utf8ToU16String(const uint8_t *utf8Data, uint32_t dataLen)
{
    const char *firstByte = reinterpret_cast<const char *>(utf8Data);
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
    return converter.from_bytes(firstByte, firstByte + dataLen);
}
102
// Encodes a wide string as UTF-8 via std::wstring_convert with std::codecvt_utf8<wchar_t>.
static inline std::string WstringToString(const std::wstring &wstr)
{
    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
    return converter.to_bytes(wstr);
}
107
// Decodes a UTF-8 string into a wide string via std::wstring_convert with std::codecvt_utf8<wchar_t>.
static inline std::wstring StringToWstring(const std::string &str)
{
    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
    return converter.from_bytes(str);
}
112
// Encodes a UTF-16 string as UTF-8 via std::wstring_convert with std::codecvt_utf8_utf16.
static inline std::string U16stringToString(const std::u16string &u16str)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
    return converter.to_bytes(u16str);
}
117
// Decodes a UTF-8 string into UTF-16 via std::wstring_convert with std::codecvt_utf8_utf16.
static inline std::u16string StringToU16string(const std::string &str)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
    return converter.from_bytes(str);
}
122
// Returns the index of the first occurrence of `searchStr` in `thisStr` at or after `pos`,
// or std::string::npos when there is none. `pos` is converted to the string's unsigned size
// type, so a negative value becomes a very large offset.
static inline size_t Find(const std::string &thisStr, const std::string &searchStr, int32_t pos)
{
    return thisStr.find(searchStr, pos);
}
128
// UTF-16 overload: returns the index (in code units) of the first occurrence of `searchStr`
// in `thisStr` at or after `pos`, or std::u16string::npos when there is none. `pos` is
// converted to the string's unsigned size type, so a negative value becomes a very large offset.
static inline size_t Find(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos)
{
    return thisStr.find(searchStr, pos);
}
134
// Returns the index (in code units) of the last occurrence of `searchStr` in `thisStr` that
// starts at or before `pos`, or std::u16string::npos when there is none. `pos` is converted to
// the string's unsigned size type, so a negative value searches from the end of the string.
static inline size_t RFind(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos)
{
    return thisStr.rfind(searchStr, pos);
}
140
141    static inline std::string ToUpper(const std::u16string &str)
142    {
143        std::u16string tmpStr = str;
144        const char16_t *constChar16tData = tmpStr.data();
145        icu::UnicodeString uString(constChar16tData);
146        icu::UnicodeString up = uString.toUpper();
147        std::string res;
148        up.toUTF8String(res);
149        return res;
150    }
151
152    static inline std::string ToLocaleUpper(const std::u16string &str, const icu::Locale &locale)
153    {
154        std::u16string tmpStr = str;
155        const char16_t *constChar16tData = tmpStr.data();
156        icu::UnicodeString uString(constChar16tData);
157        icu::UnicodeString up = uString.toUpper(locale);
158        std::string res;
159        up.toUTF8String(res);
160        return res;
161    }
162
163    static inline std::string ToLower(const std::u16string &str)
164    {
165        const char16_t *constChar16tData = str.data();
166        icu::UnicodeString uString(constChar16tData, str.length());
167        std::string res;
168        uString.toLower().toUTF8String(res);
169        return res;
170    }
171
172    static inline std::string ToLocaleLower(const std::u16string &str, const icu::Locale &locale)
173    {
174        std::u16string tmpStr = str;
175        const char16_t *constChar16tData = tmpStr.data();
176        icu::UnicodeString uString(constChar16tData);
177        icu::UnicodeString low = uString.toLower(locale);
178        std::string res;
179        low.toUTF8String(res);
180        return res;
181    }
182
183    static inline size_t FindFromU16ToUpper(const std::u16string &thisStr, uint16_t *u16Data)
184    {
185        std::u16string tmpStr = Utf16ToU16String(u16Data, 1);
186        const char16_t *constChar16tData = tmpStr.data();
187        icu::UnicodeString uString(constChar16tData);
188        icu::UnicodeString up = uString.toUpper();
189        std::string res;
190        up.toUTF8String(res);
191        std::u16string searchStr = StringToU16string(res);
192        size_t idx = Find(thisStr, searchStr, 0);
193        return idx;
194    }
195
// Upper-cases the single byte at `u8Data` with std::toupper and returns the index of its first
// occurrence in `thisStr`, or std::string::npos if absent. Only one byte is read from `u8Data`.
static inline size_t FindFromU8ToUpper(const std::string &thisStr, uint8_t *u8Data)
{
    // std::toupper must be given a value representable as unsigned char.
    const auto rawByte = static_cast<unsigned char>(*u8Data);
    const std::string needle(1, static_cast<char>(std::toupper(rawByte)));
    return thisStr.find(needle, 0);
}
203
204    static int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
205    {
206        int c = *p++;
207        if (c < UICODE_FROM_UTF8[0]) {
208            *pp = p;
209            return c;
210        }
211        int l = 0;
212        if (c >= UICODE_FROM_UTF8[1] && c <= UICODE_FROM_UTF8[2]) { // 1 - 2: 0000 0080 - 0000 07FF
213            l = 1; // 1: 0000 0080 - 0000 07FF Unicode
214        } else if (c >= UICODE_FROM_UTF8[3] && c <= UICODE_FROM_UTF8[4]) { // 3 - 4: 0000 0800 - 0000 FFFF
215            l = 2; // 2: 0000 0800 - 0000 FFFF Unicode
216        } else if (c >= UICODE_FROM_UTF8[5] && c <= UICODE_FROM_UTF8[6]) { // 5 - 6: 0001 0000 - 0010 FFFF
217            l = 3; // 3: 0001 0000 - 0010 FFFF Unicode
218        } else if (c >= UICODE_FROM_UTF8[7] && c <= UICODE_FROM_UTF8[8]) { // 7 - 8: 0020 0000 - 03FF FFFF
219            l = 4; // 4: 0020 0000 - 03FF FFFF Unicode
220        } else if (c == UICODE_FROM_UTF8[9] || c == UICODE_FROM_UTF8[10]) { // 9 - 10: 0400 0000 - 7FFF FFFF
221            l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode
222        } else {
223            return INVALID_UNICODE_FROM_UTF8;
224        }
225        /* check that we have enough characters */
226        if ((l + 1) > maxLen) {
227            return INVALID_UNICODE_FROM_UTF8;
228        }
229        return FromUtf8(c, l, p, pp);
230    }
231
232    static int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
233    {
234        uint32_t b;
235        c &= UTF8_FIRST_CODE[l - 1];
236        for (int i = 0; i < l; i++) {
237            b = *p++;
238            if (b < utf_helper::UTF8_2B_SECOND || b >= utf_helper::UTF8_2B_FIRST) {
239                return INVALID_UNICODE_FROM_UTF8;
240            }
241            c = (c << 6) | (b & utf_helper::UTF8_2B_THIRD); // 6: Maximum Unicode range
242        }
243        if (c < UTF8_MIN_CODE[l - 1]) {
244            return INVALID_UNICODE_FROM_UTF8;
245        }
246        *pp = p;
247        return c;
248    }
249
// Appends `str2` to the end of `str1`, modifying `str1` in place.
static inline void InplaceAppend(std::u16string &str1, const std::u16string &str2)
{
    str1 += str2;
}
254
// Returns a new string consisting of `str1` followed by `str2`; neither argument is modified.
static inline std::u16string Append(const std::u16string &str1, const std::u16string &str2)
{
    return str1 + str2;
}
260
// Decodes the UTF-8 bytes in `data` and returns the FIRST decoded code point only
// (despite the name, no string is returned). Returns 0 when `data` decodes to nothing.
static inline uint32_t Utf8ToU32String(const std::vector<uint8_t> &data)
{
    std::string str(data.begin(), data.end());
    std::u32string u32str = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>{}.from_bytes(str);
    if (u32str.empty()) {
        return 0;
    }
    // Convert the value instead of reading the char32_t storage through a uint32_t pointer,
    // which would violate the strict-aliasing rules.
    return static_cast<uint32_t>(u32str.front());
}
268
269    static inline std::string Utf32ToString(uint32_t u32Data)
270    {
271        UChar32 charData = static_cast<int32_t>(u32Data);
272        icu::UnicodeString uString(charData);
273        std::string res;
274        uString.toUTF8String(res);
275        return res;
276    }
277
// Returns line number `lineNumber` (1-based) of `srcStr`, without its line separator.
// If `srcStr` contains at least one real line feed, lines are separated by '\n'. Otherwise the
// two-character escape sequence backslash + 'n' is used as the separator.
// Returns "" when `lineNumber` is less than 1, when the text has fewer lines than requested,
// or when the requested line is empty.
static inline std::string GetSpecifiedLine(const std::string &srcStr, int lineNumber)
{
    if (lineNumber < 1) {
        return "";
    }
    const bool hasLineFeed = srcStr.find('\n') != std::string::npos;
    const std::string separator = hasLineFeed ? "\n" : "\\n";

    // Skip the (lineNumber - 1) separators that precede the requested line.
    size_t lineStart = 0;
    for (int skipped = 1; skipped < lineNumber; ++skipped) {
        const size_t sepPos = srcStr.find(separator, lineStart);
        if (sepPos == std::string::npos) {
            return "";
        }
        lineStart = sepPos + separator.length();
    }

    const size_t lineEnd = srcStr.find(separator, lineStart);
    if (lineEnd == std::string::npos) {
        // Last line: it runs to the end of the text.
        return srcStr.substr(lineStart);
    }
    // find() never reports a position before lineStart, so the length cannot be negative.
    // It is 0 for an empty line, which is a valid result and must not be asserted against.
    return srcStr.substr(lineStart, lineEnd - lineStart);
}
316
317    static inline bool IsNonspace(uint16_t c)
318    {
319        uint32_t len = sizeof(SPACE_OR_LINE_TERMINAL) / sizeof(SPACE_OR_LINE_TERMINAL[0]);
320        for (uint32_t i = 0; i < len; i++) {
321            if (c == SPACE_OR_LINE_TERMINAL[i]) {
322                return true;
323            }
324            if (c < SPACE_OR_LINE_TERMINAL[i]) {
325                return false;
326            }
327        }
328        return false;
329    }
330
331    template<typename T>
332    static inline uint32_t GetStart(Span<T> &data, uint32_t length)
333    {
334        uint32_t start = 0;
335        while (start < length && IsNonspace(data[start])) {
336            start++;
337        }
338        return start;
339    }
340
341    template<typename T>
342    static inline int32_t GetEnd(Span<T> &data, int32_t start, uint32_t length)
343    {
344        if (length == 0U) {
345            return 0;
346        }
347        int32_t end = static_cast<int32_t>(length - 1);
348        while (end >= start && IsNonspace(data[end])) {
349            end--;
350        }
351        return end;
352    }
353
354    static bool Utf8CharInRange(uint8_t value, char start, char end)
355    {
356        ASSERT(start <= end);
357        return (value >= static_cast<uint8_t>(start)) && (value <= static_cast<uint8_t>(end));
358    }
359
360    static inline std::string Vformat(const char *fmt, va_list args)
361    {
362        static constexpr size_t SIZE = 1024;
363
364        std::string result;
365        result.resize(SIZE);
366
367        bool is_truncated = true;
368        while (is_truncated) {
369            va_list copy_args;
370            va_copy(copy_args, args);
371            int r = vsnprintf_truncated_s(result.data(), result.size() + 1, fmt, copy_args);
372            va_end(copy_args);
373
374            if (r < 0) {
375                return "";
376            }
377
378            is_truncated = static_cast<size_t>(r) == result.size();
379            result.resize(result.size() * 2U);
380        }
381
382        result.erase(std::find(result.begin(), result.end(), '\0'), result.end());
383
384        return result;
385    }
386
// Splits `str` into tokens separated by any character contained in `delimiter`.
// Runs of consecutive delimiter characters count as one separator, and delimiters at the start
// or end of `str` are ignored, so no empty token is ever produced.
// With an empty `delimiter`, a non-empty `str` is returned as a single token.
static std::vector<std::string> SplitString(const std::string &str, const std::string &delimiter)
{
    std::vector<std::string> value;
    // Start at the first non-delimiter character; previously a leading delimiter made the
    // function return an empty vector.
    std::size_t tokenBegin = str.find_first_not_of(delimiter);
    while (tokenBegin != std::string::npos) {
        const std::size_t tokenEnd = str.find_first_of(delimiter, tokenBegin);
        // When tokenEnd is npos, substr() clamps the count and returns the rest of the string.
        value.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
        // find_first_not_of() yields npos when tokenEnd is npos, which ends the loop.
        tokenBegin = str.find_first_not_of(delimiter, tokenEnd);
    }
    return value;
}
407
// Returns true when `str` ends with `suffix`. An empty `suffix` matches every string.
static bool EndsWith(const std::string &str, const std::string &suffix)
{
    const std::size_t suffixLen = suffix.length();
    if (str.length() < suffixLen) {
        return false;
    }
    // Compare in place rather than allocating a temporary substring.
    return str.compare(str.length() - suffixLen, suffixLen, suffix) == 0;
}
416
// Parses `content` as a base-10 unsigned integer and stores it in `*result`.
// Returns true only when the whole string is a number that fits into uint32_t.
// Returns false when either pointer is null, when no digits were consumed, when characters
// remain after the number, or when the value is out of range.
// `*result` is left untouched for null input and out-of-range values. For a number followed by
// trailing characters the parsed prefix is still stored, as it was before.
static bool StrToUInt32(const char *content, uint32_t *result)
{
    if (content == nullptr || result == nullptr) {
        return false;
    }
    const int DEC = 10;
    char *endPtr = nullptr;
    errno = 0; // strtoul reports overflow only through errno
    const unsigned long parsed = std::strtoul(content, &endPtr, DEC);
    // unsigned long may be wider than 32 bits, so check the range explicitly instead of
    // silently truncating the value.
    if (errno == ERANGE || parsed > std::numeric_limits<uint32_t>::max()) {
        return false;
    }
    *result = static_cast<uint32_t>(parsed);
    if (endPtr == content || *endPtr != '\0') {
        return false;
    }
    return true;
}
427
428    static bool StringStartWith(const CString& str, const CString& startStr)
429    {
430        size_t startStrLen = startStr.length();
431        return ((str.length() >= startStrLen) && (str.compare(0, startStrLen, startStr) == 0));
432    }
433
434    static bool StringEndWith(const CString& str, const CString& endStr)
435    {
436        size_t endStrLen = endStr.length();
437        size_t len = str.length();
438        return ((len >= endStrLen) && (str.compare(len - endStrLen, endStrLen, endStr) == 0));
439    }
440
441    static void SplitString(const CString& str, CVector<CString>& out, size_t startPos, size_t times = 0, char c = '/')
442    {
443        size_t left = startPos;
444        size_t pos = 0;
445        size_t index = 0;
446        while ((pos = str.find(c, left)) != CString::npos) {
447            if (times != 0 && index >= times) {
448                return;
449            }
450            out.emplace_back(str.substr(left, pos - left));
451            left = pos + 1;
452            index++;
453        }
454
455        if ((times == 0 || index < times) && left < str.length()) {
456            out.emplace_back(str.substr(left));
457        }
458    }
459
460    static CString JoinString(const CVector<CString>& strs, size_t startIndex, size_t endIndex, char c = '/')
461    {
462        CString out;
463        for (size_t index = startIndex; index < strs.size() && index <= endIndex; ++index) {
464            if (!strs[index].empty()) {
465                out.append(strs[index]) += c;
466            }
467        }
468        if (!out.empty()) {
469            out.pop_back();
470        }
471        return out;
472    }
473};
474}  // namespace panda::ecmascript::base
475#endif  // ECMASCRIPT_BASE_STRING_HELP_H
476