1/**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17#define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
19#include "macros.h"
20#include "utils/arena_containers.h"
21
22#include <cstddef>
23#include <limits>
24#include <memory>
25#include <string>
26#include <string_view>
27
28namespace ark::es2panda::util {
29class StringView {
30public:
31    explicit StringView() noexcept = default;
32    explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
33    // NOLINTNEXTLINE(google-explicit-constructor)
34    StringView(std::string_view sv) noexcept : sv_(sv) {}
35    // NOLINTNEXTLINE(google-explicit-constructor)
36    StringView(const char *str) noexcept : sv_(str == nullptr ? "" : str) {}
37    DEFAULT_COPY_SEMANTIC(StringView);
38    DEFAULT_MOVE_SEMANTIC(StringView);
39    ~StringView() = default;
40
41    bool operator==(const StringView &rhs) const noexcept
42    {
43        return sv_ == rhs.sv_;
44    }
45
46    bool operator!=(const StringView &rhs) const noexcept
47    {
48        return sv_ != rhs.sv_;
49    }
50
51    bool operator<(const StringView &rhs) const noexcept
52    {
53        return sv_ < rhs.sv_;
54    }
55
56    bool operator>(const StringView &rhs) const noexcept
57    {
58        return sv_ > rhs.sv_;
59    }
60
61    int Compare(const StringView &other) const noexcept
62    {
63        return sv_.compare(other.sv_);
64    }
65
66    int Compare(const std::string_view &other) const noexcept
67    {
68        return sv_.compare(other);
69    }
70
71    bool Is(const char *str) const noexcept
72    {
73        return sv_ == str;
74    }
75
76    bool Is(const std::string_view &str) const noexcept
77    {
78        return sv_ == str;
79    }
80
81    size_t Length() const noexcept
82    {
83        return sv_.length();
84    }
85
86    bool Empty() const noexcept
87    {
88        return sv_.empty();
89    }
90
91    const std::string_view &Utf8() const noexcept
92    {
93        return sv_;
94    }
95
96    explicit operator std::string() const noexcept
97    {
98        return std::string {sv_};
99    }
100
101    const char *Bytes() const noexcept
102    {
103        return sv_.data();
104    }
105
106    StringView Substr(size_t begin, size_t end) const noexcept
107    {
108        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
109        return StringView(std::string_view(sv_.data() + begin, end - begin));
110    }
111
112    static bool IsHighSurrogate(char32_t cp)
113    {
114        return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
115    }
116
117    static bool IsLowSurrogate(char32_t cp)
118    {
119        return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
120    }
121
122    std::string Mutf8() const noexcept;
123    static char32_t DecodeSurrogates(char32_t high, char32_t low);
124    static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
125
126    template <void ENCODER(std::string *, char32_t)>
127    std::string EscapeSymbol() const;
128
129    template <typename T>
130    static void Utf8Encode(T *str, char32_t cu);
131    template <typename T>
132    static void Mutf8Encode(T *str, char32_t cu);
133
134    bool IsConvertibleToChar() const;
135
136    class Iterator {
137    public:
138        static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
139
140        explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
141        DEFAULT_COPY_SEMANTIC(Iterator);
142        DEFAULT_MOVE_SEMANTIC(Iterator);
143        ~Iterator() = default;
144
145        inline size_t Index() const
146        {
147            return static_cast<size_t>(iter_ - sv_.begin());
148        }
149
150        inline char32_t Next()
151        {
152            return DecodeCP<true>(nullptr);
153        }
154
155        inline char32_t Peek() const
156        {
157            return HasNext() ? *iter_ : INVALID_CP;
158        }
159
160        inline char32_t PeekCp() const
161        {
162            return DecodeCP<false>(nullptr);
163        }
164
165        inline char32_t PeekCp(size_t *cpSize) const
166        {
167            return DecodeCP<false, true>(cpSize);
168        }
169
170        inline void Forward(size_t offset)
171        {
172            iter_ += offset;
173        }
174
175        inline void Backward(size_t offset)
176        {
177            iter_ -= offset;
178        }
179
180        inline void Reset(size_t offset)
181        {
182            iter_ = sv_.begin() + offset;
183        }
184
185        inline void Rewind(std::string_view::const_iterator pos)
186        {
187            iter_ = pos;
188        }
189
190        inline std::string_view::const_iterator Save() const
191        {
192            return iter_;
193        }
194
195        inline bool HasNext() const
196        {
197            return iter_ != sv_.end();
198        }
199
200        void SkipCp();
201
202    private:
203        template <bool MOVE_ITER, bool SET_CP_SIZE = false>
204        char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
205
206        std::string_view sv_;
207        mutable std::string_view::const_iterator iter_;
208    };
209
210    class Constants {
211    public:
212        static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
213        static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
214        static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
215
216        static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
217        static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
218        static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
219
220        static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
221        static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
222
223        static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
224        static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
225        static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
226
227        static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
228        static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
229        static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
230
231        static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
232        static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
233
234        static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
235        static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
236        static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
237        static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
238        static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
239        static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
240    };
241
242private:
243    friend class Iterator;
244    std::string_view sv_;
245};
246
247class UString {
248public:
249    UString() = default;
250    explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
251    explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
252    {
253        Alloc();
254        *str_ = str;
255    }
256
257    explicit UString(const std::string_view &str, ArenaAllocator *allocator) : UString(allocator)
258    {
259        Alloc();
260        *str_ = str;
261    }
262
263    explicit UString(const util::StringView &str, ArenaAllocator *allocator) : UString(str.Utf8(), allocator) {}
264
265    DEFAULT_COPY_SEMANTIC(UString);
266    DEFAULT_MOVE_SEMANTIC(UString);
267    ~UString() = default;
268
269    util::StringView View() const
270    {
271        if (str_ == nullptr) {
272            return util::StringView();
273        }
274
275        return util::StringView(str_);
276    }
277
278    util::StringView View()
279    {
280        if (str_ == nullptr) {
281            return util::StringView();
282        }
283
284        return util::StringView(str_);
285    }
286
287    void Append(char32_t ch) noexcept
288    {
289        if (str_ == nullptr) {
290            Alloc();
291        }
292
293        StringView::Utf8Encode<ArenaString>(str_, ch);
294    }
295
296    void Append(const StringView &other) noexcept
297    {
298        if (str_ == nullptr) {
299            Alloc();
300        }
301
302        *str_ += other.Utf8();
303    }
304
305    void Append(const char *other) noexcept
306    {
307        if (str_ == nullptr) {
308            Alloc();
309        }
310        *str_ += other;
311    }
312
313    void Append(const std::string &other) noexcept
314    {
315        if (str_ == nullptr) {
316            Alloc();
317        }
318        *str_ += other;
319    }
320
321private:
322    void Alloc()
323    {
324        str_ = allocator_->New<ArenaString>(allocator_->Adapter());
325    }
326
327protected:
328    // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
329    ArenaString *str_ {};
330    ArenaAllocator *allocator_ {};
331    // NOLINTEND(misc-non-private-member-variables-in-classes)
332};
333
334template <bool MOVE_ITER, bool SET_CP_SIZE>
335char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
336{
337    if (!HasNext()) {
338        return INVALID_CP;
339    }
340
341    const auto *iterNext = iter_;
342
343    char32_t cu0 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344    char32_t res {};
345
346    if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
347        res = cu0;
348    } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
349        char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
350        res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
351    } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
352        char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
353        char32_t cu2 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
354        res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
355              ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
356    } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
357               (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
358        char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
359        char32_t cu2 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360        char32_t cu3 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
361        res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
362              ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
363              ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
364    } else {
365        res = INVALID_CP;
366    }
367
368    // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
369    if constexpr (MOVE_ITER) {
370        iter_ = iterNext;
371        return res;
372    }
373
374    // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
375    if constexpr (SET_CP_SIZE) {
376        *cpSize = iterNext - iter_;
377    }
378
379    return res;
380}
381
382template <void ENCODER(std::string *, char32_t)>
383std::string StringView::EscapeSymbol() const
384{
385    std::string str;
386    str.reserve(Length());
387
388    auto skipNewLine = [](auto &iter) {
389        if (iter.HasNext()) {
390            iter.Forward(1);
391
392            if (iter.Peek() != '\n') {
393                iter.Backward(1);
394            }
395        }
396    };
397
398    Iterator iter(*this);
399    while (iter.HasNext()) {
400        auto cp = iter.Next();
401
402        switch (cp) {
403            case '\r': {
404                skipNewLine(iter);
405                [[fallthrough]];
406            }
407            case '\n': {
408                str += "\\n";
409                break;
410            }
411            case '\b': {
412                str += "\\b";
413                break;
414            }
415            case '\t': {
416                str += "\\t";
417                break;
418            }
419            case '\f': {
420                str += "\\f";
421                break;
422            }
423            case '"': {
424                str += "\\\"";
425                break;
426            }
427            case '\\': {
428                str += "\\\\";
429                break;
430            }
431            default: {
432                ENCODER(&str, cp);
433            }
434        }
435    }
436
437    return str;
438}
439
440template <typename T>
441void StringView::Utf8Encode(T *str, char32_t cu)
442{
443    if (cu < Constants::UTF8_1BYTE_LIMIT) {
444        str->push_back(static_cast<char>(cu));
445    } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
446        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
447                                         Constants::UTF8_2BYTE_HEADER));
448        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
449    } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
450        str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
451                                         Constants::UTF8_3BYTE_HEADER));
452        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
453                                         Constants::UTF8_CONT_HEADER));
454        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
455    } else {
456        str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
457                                         Constants::UTF8_4BYTE_HEADER));
458        str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
459                                         Constants::UTF8_CONT_HEADER));
460        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
461                                         Constants::UTF8_CONT_HEADER));
462        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
463    }
464}
465
466template <typename T>
467void StringView::Mutf8Encode(T *str, char32_t cu)
468{
469    if (cu == 0) {
470        str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
471        str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
472    } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
473        str->push_back(static_cast<char>(cu));
474    } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
475        str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
476        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
477    } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
478        str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
479        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
480                                         Constants::UTF8_CONT_HEADER));
481        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
482    } else {
483        auto [cu1, cu2] = EncodeSurrogate(cu);
484        Mutf8Encode(str, cu1);
485        Mutf8Encode(str, cu2);
486    }
487}
488}  // namespace ark::es2panda::util
489
490// NOLINTNEXTLINE(cert-dcl58-cpp)
491namespace std {
492
493template <>
494// NOLINTNEXTLINE(altera-struct-pack-align)
495struct hash<ark::es2panda::util::StringView> {
496    std::size_t operator()(const ark::es2panda::util::StringView &str) const
497    {
498        return std::hash<std::string_view> {}(str.Utf8());
499    }
500};
501
502ostream &operator<<(ostream &os, const ark::es2panda::util::StringView &us);
503
504}  // namespace std
505
506#endif
507