1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18 
#include <macros.h>
#include <utils/arena_containers.h>

#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
27 
28 namespace panda::es2panda::util {
29 
30 class StringView {
31 public:
32     explicit StringView() noexcept = default;
33     explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34     // NOLINTNEXTLINE(google-explicit-constructor)
sv_(sv)35     StringView(const std::string_view &sv) noexcept : sv_(sv) {}
36     // NOLINTNEXTLINE(google-explicit-constructor)
sv_(str)37     StringView(const char *str) noexcept : sv_(str) {}
38     DEFAULT_COPY_SEMANTIC(StringView);
39     DEFAULT_MOVE_SEMANTIC(StringView);
40     ~StringView() = default;
41 
42     bool operator==(const StringView &rhs) const noexcept
43     {
44         return sv_ == rhs.sv_;
45     }
46 
47     bool operator!=(const StringView &rhs) const noexcept
48     {
49         return sv_ != rhs.sv_;
50     }
51 
52     bool operator<(const StringView &rhs) const noexcept
53     {
54         return sv_ < rhs.sv_;
55     }
56 
57     bool operator>(const StringView &rhs) const noexcept
58     {
59         return sv_ > rhs.sv_;
60     }
61 
62     int Compare(const StringView &other) const noexcept
63     {
64         return sv_.compare(other.sv_);
65     }
66 
67     int Compare(const std::string_view &other) const noexcept
68     {
69         return sv_.compare(other);
70     }
71 
72     bool Is(const char *str) const noexcept
73     {
74         return sv_ == str;
75     }
76 
77     bool Is(const std::string_view &str) const noexcept
78     {
79         return sv_ == str;
80     }
81 
82     size_t Length() const noexcept
83     {
84         return sv_.length();
85     }
86 
87     bool Empty() const noexcept
88     {
89         return sv_.empty();
90     }
91 
92     const std::string_view &Utf8() const noexcept
93     {
94         return sv_;
95     }
96 
97     explicit operator std::string() const noexcept
98     {
99         return std::string {sv_};
100     }
101 
102     const char *Bytes() const noexcept
103     {
104         return sv_.data();
105     }
106 
107     StringView Substr(size_t begin, size_t end) const noexcept
108     {
109         return StringView(std::string_view(sv_.data() + begin, end - begin));
110     }
111 
Find(const char *str) const112     constexpr size_t Find(const char *str) const
113     {
114         return sv_.find(str);
115     }
116 
IsHighSurrogate(char32_t cp)117     static bool IsHighSurrogate(char32_t cp)
118     {
119         return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
120     }
121 
IsLowSurrogate(char32_t cp)122     static bool IsLowSurrogate(char32_t cp)
123     {
124         return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
125     }
126 
127     std::string Mutf8() const noexcept;
128     static char32_t DecodeSurrogates(char32_t high, char32_t low);
129     static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
130 
131     template <void encoder(std::string *, char32_t)>
132     std::string EscapeSymbol() const;
133 
134     template <typename T>
135     static void Utf8Encode(T *str, char32_t cu);
136     template <typename T>
137     static void Mutf8Encode(T *str, char32_t cu);
138 
139     class Iterator {
140     public:
141         static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
142 
iter_(sv_.begin())143         explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
144         DEFAULT_COPY_SEMANTIC(Iterator);
145         DEFAULT_MOVE_SEMANTIC(Iterator);
146         ~Iterator() = default;
147 
Index() const148         inline size_t Index() const
149         {
150             return static_cast<size_t>(iter_ - sv_.begin());
151         }
152 
Next()153         inline char32_t Next()
154         {
155             return DecodeCP<true>(nullptr);
156         }
157 
Peek() const158         inline char32_t Peek() const
159         {
160             return HasNext() ? *iter_ : INVALID_CP;
161         }
162 
PeekCp() const163         inline char32_t PeekCp() const
164         {
165             return DecodeCP<false>(nullptr);
166         }
167 
PeekCp(size_t *cpSize) const168         inline char32_t PeekCp(size_t *cpSize) const
169         {
170             return DecodeCP<false, true>(cpSize);
171         }
172 
Forward(size_t offset) const173         inline void Forward(size_t offset) const
174         {
175             iter_ += offset;
176         }
177 
Backward(size_t offset) const178         inline void Backward(size_t offset) const
179         {
180             iter_ -= offset;
181         }
182 
Reset(size_t offset)183         inline void Reset(size_t offset)
184         {
185             iter_ = sv_.begin() + offset;
186         }
187 
Rewind(std::string_view::const_iterator pos) const188         inline void Rewind(std::string_view::const_iterator pos) const
189         {
190             iter_ = pos;
191         }
192 
Save() const193         inline std::string_view::const_iterator Save() const
194         {
195             return iter_;
196         }
197 
HasNext() const198         inline bool HasNext() const
199         {
200             return iter_ != sv_.end();
201         }
202 
HasExpectedNumberOfBytes(size_t count) const203         bool HasExpectedNumberOfBytes(size_t count) const
204         {
205             for (size_t i = 0; i < count; ++i) {
206                 if (!HasNext()) {
207                     return false;
208                 }
209                 ++iter_;
210             }
211             iter_ -= count;
212             return true;
213         }
214 
215         void SkipCp() const;
216 
217     private:
218         template <bool moveIter, bool setCpSize = false>
219         char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
220 
221         std::string_view sv_;
222         mutable std::string_view::const_iterator iter_;
223     };
224 
225 private:
226     class Constants {
227     public:
228         static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
229         static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
230         static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
231 
232         static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
233         static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
234         static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
235 
236         static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
237         static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
238 
239         static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
240         static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
241         static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
242 
243         static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
244         static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
245         static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
246 
247         static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
248         static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
249 
250         static constexpr size_t UTF8_NEXT_ONE_BYTE = 1;
251         static constexpr size_t UTF8_NEXT_TWO_BYTE = 2;
252         static constexpr size_t UTF8_NEXT_THREE_BYTE = 3;
253         static constexpr size_t UTF8_NEXT_FOUR_BYTE = 4;
254 
255         static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
256         static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
257         static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
258         static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
259         static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
260         static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
261     };
262 
263     friend class Iterator;
264     std::string_view sv_;
265 };
266 
267 class UString {
268 public:
269     UString() = default;
UString(ArenaAllocator *allocator)270     explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string &str, ArenaAllocator *allocator)271     explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
272     {
273         Alloc();
274         *str_ = str;
275     }
276 
277     DEFAULT_COPY_SEMANTIC(UString);
278     DEFAULT_MOVE_SEMANTIC(UString);
279     ~UString() = default;
280 
View() const281     util::StringView View() const
282     {
283         if (!str_) {
284             return util::StringView();
285         }
286 
287         return util::StringView(str_);
288     }
289 
290     void Append(char32_t ch) noexcept
291     {
292         if (!str_) {
293             Alloc();
294         }
295 
296         StringView::Utf8Encode<ArenaString>(str_, ch);
297     }
298 
299     void Append(const StringView &other) noexcept
300     {
301         if (!str_) {
302             Alloc();
303         }
304 
305         *str_ += other.Utf8();
306     }
307 
308     void Append(const char *other) noexcept
309     {
310         if (!str_) {
311             Alloc();
312         }
313         *str_ += other;
314     }
315 
316 private:
Alloc()317     void Alloc()
318     {
319         str_ = allocator_->New<ArenaString>(allocator_->Adapter());
320         CHECK_NOT_NULL(str_);
321     }
322 
323 protected:
324     ArenaString *str_ {};
325     ArenaAllocator *allocator_ {};
326 };
327 
328 template <bool moveIter, bool setCpSize>
DecodeCP([[maybe_unused]] size_t *cpSize) const329 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
330 {
331     if (!HasNext()) {
332         return INVALID_CP;
333     }
334 
335     const auto *iterNext = iter_;
336 
337     char32_t cu0 = static_cast<uint8_t>(*iterNext++);
338     char32_t res {};
339 
340     if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
341         res = cu0;
342     } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
343         // Should be 2 bytes decoded in UTF-8
344         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) {
345             return INVALID_CP;
346         }
347         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
348         res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
349     } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
350         // Should be 3 bytes decoded in UTF-8
351         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) {
352             return INVALID_CP;
353         }
354         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
355         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
356         res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
357               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
358     } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
359                (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
360         // Should be 4 bytes decoded in UTF-8
361         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_FOUR_BYTE)) {
362             return INVALID_CP;
363         }
364         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
365         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
366         char32_t cu3 = static_cast<uint8_t>(*iterNext++);
367         res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
368               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
369               ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
370     } else {
371         res = INVALID_CP;
372     }
373 
374     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
375     if constexpr (moveIter) {
376         iter_ = iterNext;
377         return res;
378     }
379 
380     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
381     if constexpr (setCpSize) {
382         *cpSize = iterNext - iter_;
383     }
384 
385     return res;
386 }
387 
388 template <void encoder(std::string *, char32_t)>
EscapeSymbol() const389 std::string StringView::EscapeSymbol() const
390 {
391     std::string str;
392     str.reserve(Length());
393 
394     Iterator iter(*this);
395 
396     while (iter.HasNext()) {
397         auto cp = iter.Next();
398 
399         switch (cp) {
400             case '\r': {
401                 if (iter.HasNext()) {
402                     iter.Forward(1);
403 
404                     if (iter.Peek() != '\n') {
405                         iter.Backward(1);
406                     }
407                 }
408 
409                 [[fallthrough]];
410             }
411             case '\n': {
412                 str += "\\n";
413                 break;
414             }
415             case '\b': {
416                 str += "\\b";
417                 break;
418             }
419             case '\t': {
420                 str += "\\t";
421                 break;
422             }
423             case '\f': {
424                 str += "\\f";
425                 break;
426             }
427             case '"': {
428                 str += "\\\"";
429                 break;
430             }
431             case '\\': {
432                 str += "\\\\";
433                 break;
434             }
435             default: {
436                 encoder(&str, cp);
437             }
438         }
439     }
440 
441     return str;
442 }
443 
444 template <typename T>
Utf8Encode(T *str, char32_t cu)445 void StringView::Utf8Encode(T *str, char32_t cu)
446 {
447     if (cu < Constants::UTF8_1BYTE_LIMIT) {
448         str->push_back(static_cast<char>(cu));
449     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
450         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
451                                          Constants::UTF8_2BYTE_HEADER));
452         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
453     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
454         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
455                                          Constants::UTF8_3BYTE_HEADER));
456         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
457                                          Constants::UTF8_CONT_HEADER));
458         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
459     } else {
460         str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
461                                          Constants::UTF8_4BYTE_HEADER));
462         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
463                                          Constants::UTF8_CONT_HEADER));
464         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
465                                          Constants::UTF8_CONT_HEADER));
466         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
467     }
468 }
469 
470 template <typename T>
Mutf8Encode(T *str, char32_t cu)471 void StringView::Mutf8Encode(T *str, char32_t cu)
472 {
473     if (cu == 0) {
474         str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
475         str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
476     } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
477         str->push_back(static_cast<char>(cu));
478     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
479         str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
480         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
481     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
482         str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
483         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
484                                          Constants::UTF8_CONT_HEADER));
485         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
486     } else {
487         auto [cu1, cu2] = EncodeSurrogate(cu);
488         Mutf8Encode(str, cu1);
489         Mutf8Encode(str, cu2);
490     }
491 }
492 
493 }  // namespace panda::es2panda::util
494 
495 // NOLINTNEXTLINE(cert-dcl58-cpp)
496 namespace std {
497 
498 template <>
499 // NOLINTNEXTLINE(altera-struct-pack-align)
500 struct hash<panda::es2panda::util::StringView> {
operator ()panda::es2panda::std::hash501     std::size_t operator()(const panda::es2panda::util::StringView &str) const
502     {
503         return std::hash<std::string_view> {}(str.Utf8());
504     }
505 };
506 
507 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
508 
509 }  // namespace std
510 
// DCOUT: output stream for debug-only printing, used as `DCOUT << value;`.
// When NDEBUG is not defined it is simply std::cout.
#ifndef NDEBUG
#define DCOUT std::cout
#else
// With NDEBUG defined, `DCOUT << value` expands to `false && std::cout << value`. Because `<<` binds
// tighter than `&&`, the whole insertion chain becomes the right-hand operand of `false && ...` and is
// never evaluated. The expansion must stay unparenthesized for this to work.
#define DCOUT false && std::cout
#endif  // NDEBUG
516 
517 #endif
518