1/*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17#define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
19#include <macros.h>
20#include <utils/arena_containers.h>
21
22#include <cstddef>
23#include <limits>
24#include <memory>
25#include <string>
26#include <string_view>
27
28namespace panda::es2panda::util {
29
30class StringView {
31public:
32    explicit StringView() noexcept = default;
33    explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34    // NOLINTNEXTLINE(google-explicit-constructor)
35    StringView(const std::string_view &sv) noexcept : sv_(sv) {}
36    // NOLINTNEXTLINE(google-explicit-constructor)
37    StringView(const char *str) noexcept : sv_(str) {}
38    DEFAULT_COPY_SEMANTIC(StringView);
39    DEFAULT_MOVE_SEMANTIC(StringView);
40    ~StringView() = default;
41
42    bool operator==(const StringView &rhs) const noexcept
43    {
44        return sv_ == rhs.sv_;
45    }
46
47    bool operator!=(const StringView &rhs) const noexcept
48    {
49        return sv_ != rhs.sv_;
50    }
51
52    bool operator<(const StringView &rhs) const noexcept
53    {
54        return sv_ < rhs.sv_;
55    }
56
57    bool operator>(const StringView &rhs) const noexcept
58    {
59        return sv_ > rhs.sv_;
60    }
61
62    int Compare(const StringView &other) const noexcept
63    {
64        return sv_.compare(other.sv_);
65    }
66
67    int Compare(const std::string_view &other) const noexcept
68    {
69        return sv_.compare(other);
70    }
71
72    bool Is(const char *str) const noexcept
73    {
74        return sv_ == str;
75    }
76
77    bool Is(const std::string_view &str) const noexcept
78    {
79        return sv_ == str;
80    }
81
82    size_t Length() const noexcept
83    {
84        return sv_.length();
85    }
86
87    bool Empty() const noexcept
88    {
89        return sv_.empty();
90    }
91
92    const std::string_view &Utf8() const noexcept
93    {
94        return sv_;
95    }
96
97    explicit operator std::string() const noexcept
98    {
99        return std::string {sv_};
100    }
101
102    const char *Bytes() const noexcept
103    {
104        return sv_.data();
105    }
106
107    StringView Substr(size_t begin, size_t end) const noexcept
108    {
109        return StringView(std::string_view(sv_.data() + begin, end - begin));
110    }
111
112    constexpr size_t Find(const char *str) const
113    {
114        return sv_.find(str);
115    }
116
117    static bool IsHighSurrogate(char32_t cp)
118    {
119        return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
120    }
121
122    static bool IsLowSurrogate(char32_t cp)
123    {
124        return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
125    }
126
127    std::string Mutf8() const noexcept;
128    static char32_t DecodeSurrogates(char32_t high, char32_t low);
129    static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
130
131    template <void encoder(std::string *, char32_t)>
132    std::string EscapeSymbol() const;
133
134    template <typename T>
135    static void Utf8Encode(T *str, char32_t cu);
136    template <typename T>
137    static void Mutf8Encode(T *str, char32_t cu);
138
139    class Iterator {
140    public:
141        static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
142
143        explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
144        DEFAULT_COPY_SEMANTIC(Iterator);
145        DEFAULT_MOVE_SEMANTIC(Iterator);
146        ~Iterator() = default;
147
148        inline size_t Index() const
149        {
150            return static_cast<size_t>(iter_ - sv_.begin());
151        }
152
153        inline char32_t Next()
154        {
155            return DecodeCP<true>(nullptr);
156        }
157
158        inline char32_t Peek() const
159        {
160            return HasNext() ? *iter_ : INVALID_CP;
161        }
162
163        inline char32_t PeekCp() const
164        {
165            return DecodeCP<false>(nullptr);
166        }
167
168        inline char32_t PeekCp(size_t *cpSize) const
169        {
170            return DecodeCP<false, true>(cpSize);
171        }
172
173        inline void Forward(size_t offset) const
174        {
175            iter_ += offset;
176        }
177
178        inline void Backward(size_t offset) const
179        {
180            iter_ -= offset;
181        }
182
183        inline void Reset(size_t offset)
184        {
185            iter_ = sv_.begin() + offset;
186        }
187
188        inline void Rewind(std::string_view::const_iterator pos) const
189        {
190            iter_ = pos;
191        }
192
193        inline std::string_view::const_iterator Save() const
194        {
195            return iter_;
196        }
197
198        inline bool HasNext() const
199        {
200            return iter_ != sv_.end();
201        }
202
203        bool HasExpectedNumberOfBytes(size_t count) const
204        {
205            for (size_t i = 0; i < count; ++i) {
206                if (!HasNext()) {
207                    return false;
208                }
209                ++iter_;
210            }
211            iter_ -= count;
212            return true;
213        }
214
215        void SkipCp() const;
216
217    private:
218        template <bool moveIter, bool setCpSize = false>
219        char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
220
221        std::string_view sv_;
222        mutable std::string_view::const_iterator iter_;
223    };
224
225private:
226    class Constants {
227    public:
228        static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
229        static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
230        static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
231
232        static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
233        static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
234        static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
235
236        static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
237        static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
238
239        static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
240        static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
241        static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
242
243        static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
244        static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
245        static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
246
247        static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
248        static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
249
250        static constexpr size_t UTF8_NEXT_ONE_BYTE = 1;
251        static constexpr size_t UTF8_NEXT_TWO_BYTE = 2;
252        static constexpr size_t UTF8_NEXT_THREE_BYTE = 3;
253        static constexpr size_t UTF8_NEXT_FOUR_BYTE = 4;
254
255        static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
256        static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
257        static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
258        static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
259        static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
260        static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
261    };
262
263    friend class Iterator;
264    std::string_view sv_;
265};
266
267class UString {
268public:
269    UString() = default;
270    explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
271    explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
272    {
273        Alloc();
274        *str_ = str;
275    }
276
277    DEFAULT_COPY_SEMANTIC(UString);
278    DEFAULT_MOVE_SEMANTIC(UString);
279    ~UString() = default;
280
281    util::StringView View() const
282    {
283        if (!str_) {
284            return util::StringView();
285        }
286
287        return util::StringView(str_);
288    }
289
290    void Append(char32_t ch) noexcept
291    {
292        if (!str_) {
293            Alloc();
294        }
295
296        StringView::Utf8Encode<ArenaString>(str_, ch);
297    }
298
299    void Append(const StringView &other) noexcept
300    {
301        if (!str_) {
302            Alloc();
303        }
304
305        *str_ += other.Utf8();
306    }
307
308    void Append(const char *other) noexcept
309    {
310        if (!str_) {
311            Alloc();
312        }
313        *str_ += other;
314    }
315
316private:
317    void Alloc()
318    {
319        str_ = allocator_->New<ArenaString>(allocator_->Adapter());
320        CHECK_NOT_NULL(str_);
321    }
322
323protected:
324    ArenaString *str_ {};
325    ArenaAllocator *allocator_ {};
326};
327
328template <bool moveIter, bool setCpSize>
329char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
330{
331    if (!HasNext()) {
332        return INVALID_CP;
333    }
334
335    const auto *iterNext = iter_;
336
337    char32_t cu0 = static_cast<uint8_t>(*iterNext++);
338    char32_t res {};
339
340    if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
341        res = cu0;
342    } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
343        // Should be 2 bytes decoded in UTF-8
344        if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) {
345            return INVALID_CP;
346        }
347        char32_t cu1 = static_cast<uint8_t>(*iterNext++);
348        res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
349    } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
350        // Should be 3 bytes decoded in UTF-8
351        if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) {
352            return INVALID_CP;
353        }
354        char32_t cu1 = static_cast<uint8_t>(*iterNext++);
355        char32_t cu2 = static_cast<uint8_t>(*iterNext++);
356        res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
357              ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
358    } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
359               (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
360        // Should be 4 bytes decoded in UTF-8
361        if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_FOUR_BYTE)) {
362            return INVALID_CP;
363        }
364        char32_t cu1 = static_cast<uint8_t>(*iterNext++);
365        char32_t cu2 = static_cast<uint8_t>(*iterNext++);
366        char32_t cu3 = static_cast<uint8_t>(*iterNext++);
367        res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
368              ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
369              ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
370    } else {
371        res = INVALID_CP;
372    }
373
374    // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
375    if constexpr (moveIter) {
376        iter_ = iterNext;
377        return res;
378    }
379
380    // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
381    if constexpr (setCpSize) {
382        *cpSize = iterNext - iter_;
383    }
384
385    return res;
386}
387
388template <void encoder(std::string *, char32_t)>
389std::string StringView::EscapeSymbol() const
390{
391    std::string str;
392    str.reserve(Length());
393
394    Iterator iter(*this);
395
396    while (iter.HasNext()) {
397        auto cp = iter.Next();
398
399        switch (cp) {
400            case '\r': {
401                if (iter.HasNext()) {
402                    iter.Forward(1);
403
404                    if (iter.Peek() != '\n') {
405                        iter.Backward(1);
406                    }
407                }
408
409                [[fallthrough]];
410            }
411            case '\n': {
412                str += "\\n";
413                break;
414            }
415            case '\b': {
416                str += "\\b";
417                break;
418            }
419            case '\t': {
420                str += "\\t";
421                break;
422            }
423            case '\f': {
424                str += "\\f";
425                break;
426            }
427            case '"': {
428                str += "\\\"";
429                break;
430            }
431            case '\\': {
432                str += "\\\\";
433                break;
434            }
435            default: {
436                encoder(&str, cp);
437            }
438        }
439    }
440
441    return str;
442}
443
444template <typename T>
445void StringView::Utf8Encode(T *str, char32_t cu)
446{
447    if (cu < Constants::UTF8_1BYTE_LIMIT) {
448        str->push_back(static_cast<char>(cu));
449    } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
450        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
451                                         Constants::UTF8_2BYTE_HEADER));
452        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
453    } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
454        str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
455                                         Constants::UTF8_3BYTE_HEADER));
456        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
457                                         Constants::UTF8_CONT_HEADER));
458        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
459    } else {
460        str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
461                                         Constants::UTF8_4BYTE_HEADER));
462        str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
463                                         Constants::UTF8_CONT_HEADER));
464        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
465                                         Constants::UTF8_CONT_HEADER));
466        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
467    }
468}
469
470template <typename T>
471void StringView::Mutf8Encode(T *str, char32_t cu)
472{
473    if (cu == 0) {
474        str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
475        str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
476    } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
477        str->push_back(static_cast<char>(cu));
478    } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
479        str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
480        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
481    } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
482        str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
483        str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
484                                         Constants::UTF8_CONT_HEADER));
485        str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
486    } else {
487        auto [cu1, cu2] = EncodeSurrogate(cu);
488        Mutf8Encode(str, cu1);
489        Mutf8Encode(str, cu2);
490    }
491}
492
493}  // namespace panda::es2panda::util
494
495// NOLINTNEXTLINE(cert-dcl58-cpp)
496namespace std {
497
498template <>
499// NOLINTNEXTLINE(altera-struct-pack-align)
500struct hash<panda::es2panda::util::StringView> {
501    std::size_t operator()(const panda::es2panda::util::StringView &str) const
502    {
503        return std::hash<std::string_view> {}(str.Utf8());
504    }
505};
506
507ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
508
509}  // namespace std
510
// DCOUT is a debug-only output stream.
// Without NDEBUG it is std::cout. With NDEBUG, `false && std::cout << ...` short-circuits, so the
// whole insertion chain is still compiled but never evaluated.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
516
517#endif
518