1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
19 #include <macros.h>
20 #include <utils/arena_containers.h>
21
22 #include <cstddef>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <string_view>
27
28 namespace panda::es2panda::util {
29
30 class StringView {
31 public:
32 explicit StringView() noexcept = default;
33 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34 // NOLINTNEXTLINE(google-explicit-constructor)
sv_(sv)35 StringView(const std::string_view &sv) noexcept : sv_(sv) {}
36 // NOLINTNEXTLINE(google-explicit-constructor)
sv_(str)37 StringView(const char *str) noexcept : sv_(str) {}
38 DEFAULT_COPY_SEMANTIC(StringView);
39 DEFAULT_MOVE_SEMANTIC(StringView);
40 ~StringView() = default;
41
42 bool operator==(const StringView &rhs) const noexcept
43 {
44 return sv_ == rhs.sv_;
45 }
46
47 bool operator!=(const StringView &rhs) const noexcept
48 {
49 return sv_ != rhs.sv_;
50 }
51
52 bool operator<(const StringView &rhs) const noexcept
53 {
54 return sv_ < rhs.sv_;
55 }
56
57 bool operator>(const StringView &rhs) const noexcept
58 {
59 return sv_ > rhs.sv_;
60 }
61
62 int Compare(const StringView &other) const noexcept
63 {
64 return sv_.compare(other.sv_);
65 }
66
67 int Compare(const std::string_view &other) const noexcept
68 {
69 return sv_.compare(other);
70 }
71
72 bool Is(const char *str) const noexcept
73 {
74 return sv_ == str;
75 }
76
77 bool Is(const std::string_view &str) const noexcept
78 {
79 return sv_ == str;
80 }
81
82 size_t Length() const noexcept
83 {
84 return sv_.length();
85 }
86
87 bool Empty() const noexcept
88 {
89 return sv_.empty();
90 }
91
92 const std::string_view &Utf8() const noexcept
93 {
94 return sv_;
95 }
96
97 explicit operator std::string() const noexcept
98 {
99 return std::string {sv_};
100 }
101
102 const char *Bytes() const noexcept
103 {
104 return sv_.data();
105 }
106
107 StringView Substr(size_t begin, size_t end) const noexcept
108 {
109 return StringView(std::string_view(sv_.data() + begin, end - begin));
110 }
111
Find(const char *str) const112 constexpr size_t Find(const char *str) const
113 {
114 return sv_.find(str);
115 }
116
IsHighSurrogate(char32_t cp)117 static bool IsHighSurrogate(char32_t cp)
118 {
119 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
120 }
121
IsLowSurrogate(char32_t cp)122 static bool IsLowSurrogate(char32_t cp)
123 {
124 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
125 }
126
127 std::string Mutf8() const noexcept;
128 static char32_t DecodeSurrogates(char32_t high, char32_t low);
129 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
130
131 template <void encoder(std::string *, char32_t)>
132 std::string EscapeSymbol() const;
133
134 template <typename T>
135 static void Utf8Encode(T *str, char32_t cu);
136 template <typename T>
137 static void Mutf8Encode(T *str, char32_t cu);
138
139 class Iterator {
140 public:
141 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
142
iter_(sv_.begin())143 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
144 DEFAULT_COPY_SEMANTIC(Iterator);
145 DEFAULT_MOVE_SEMANTIC(Iterator);
146 ~Iterator() = default;
147
Index() const148 inline size_t Index() const
149 {
150 return static_cast<size_t>(iter_ - sv_.begin());
151 }
152
Next()153 inline char32_t Next()
154 {
155 return DecodeCP<true>(nullptr);
156 }
157
Peek() const158 inline char32_t Peek() const
159 {
160 return HasNext() ? *iter_ : INVALID_CP;
161 }
162
PeekCp() const163 inline char32_t PeekCp() const
164 {
165 return DecodeCP<false>(nullptr);
166 }
167
PeekCp(size_t *cpSize) const168 inline char32_t PeekCp(size_t *cpSize) const
169 {
170 return DecodeCP<false, true>(cpSize);
171 }
172
Forward(size_t offset) const173 inline void Forward(size_t offset) const
174 {
175 iter_ += offset;
176 }
177
Backward(size_t offset) const178 inline void Backward(size_t offset) const
179 {
180 iter_ -= offset;
181 }
182
Reset(size_t offset)183 inline void Reset(size_t offset)
184 {
185 iter_ = sv_.begin() + offset;
186 }
187
Rewind(std::string_view::const_iterator pos) const188 inline void Rewind(std::string_view::const_iterator pos) const
189 {
190 iter_ = pos;
191 }
192
Save() const193 inline std::string_view::const_iterator Save() const
194 {
195 return iter_;
196 }
197
HasNext() const198 inline bool HasNext() const
199 {
200 return iter_ != sv_.end();
201 }
202
HasExpectedNumberOfBytes(size_t count) const203 bool HasExpectedNumberOfBytes(size_t count) const
204 {
205 for (size_t i = 0; i < count; ++i) {
206 if (!HasNext()) {
207 return false;
208 }
209 ++iter_;
210 }
211 iter_ -= count;
212 return true;
213 }
214
215 void SkipCp() const;
216
217 private:
218 template <bool moveIter, bool setCpSize = false>
219 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
220
221 std::string_view sv_;
222 mutable std::string_view::const_iterator iter_;
223 };
224
225 private:
226 class Constants {
227 public:
228 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
229 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
230 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
231
232 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
233 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
234 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
235
236 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
237 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
238
239 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
240 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
241 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
242
243 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
244 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
245 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
246
247 static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
248 static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
249
250 static constexpr size_t UTF8_NEXT_ONE_BYTE = 1;
251 static constexpr size_t UTF8_NEXT_TWO_BYTE = 2;
252 static constexpr size_t UTF8_NEXT_THREE_BYTE = 3;
253 static constexpr size_t UTF8_NEXT_FOUR_BYTE = 4;
254
255 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
256 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
257 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
258 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
259 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
260 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
261 };
262
263 friend class Iterator;
264 std::string_view sv_;
265 };
266
267 class UString {
268 public:
269 UString() = default;
UString(ArenaAllocator *allocator)270 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string &str, ArenaAllocator *allocator)271 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
272 {
273 Alloc();
274 *str_ = str;
275 }
276
277 DEFAULT_COPY_SEMANTIC(UString);
278 DEFAULT_MOVE_SEMANTIC(UString);
279 ~UString() = default;
280
View() const281 util::StringView View() const
282 {
283 if (!str_) {
284 return util::StringView();
285 }
286
287 return util::StringView(str_);
288 }
289
290 void Append(char32_t ch) noexcept
291 {
292 if (!str_) {
293 Alloc();
294 }
295
296 StringView::Utf8Encode<ArenaString>(str_, ch);
297 }
298
299 void Append(const StringView &other) noexcept
300 {
301 if (!str_) {
302 Alloc();
303 }
304
305 *str_ += other.Utf8();
306 }
307
308 void Append(const char *other) noexcept
309 {
310 if (!str_) {
311 Alloc();
312 }
313 *str_ += other;
314 }
315
316 private:
Alloc()317 void Alloc()
318 {
319 str_ = allocator_->New<ArenaString>(allocator_->Adapter());
320 CHECK_NOT_NULL(str_);
321 }
322
323 protected:
324 ArenaString *str_ {};
325 ArenaAllocator *allocator_ {};
326 };
327
328 template <bool moveIter, bool setCpSize>
DecodeCP([[maybe_unused]] size_t *cpSize) const329 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
330 {
331 if (!HasNext()) {
332 return INVALID_CP;
333 }
334
335 const auto *iterNext = iter_;
336
337 char32_t cu0 = static_cast<uint8_t>(*iterNext++);
338 char32_t res {};
339
340 if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
341 res = cu0;
342 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
343 // Should be 2 bytes decoded in UTF-8
344 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) {
345 return INVALID_CP;
346 }
347 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
348 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
349 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
350 // Should be 3 bytes decoded in UTF-8
351 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) {
352 return INVALID_CP;
353 }
354 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
355 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
356 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
357 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
358 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
359 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
360 // Should be 4 bytes decoded in UTF-8
361 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_FOUR_BYTE)) {
362 return INVALID_CP;
363 }
364 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
365 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
366 char32_t cu3 = static_cast<uint8_t>(*iterNext++);
367 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
368 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
369 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
370 } else {
371 res = INVALID_CP;
372 }
373
374 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
375 if constexpr (moveIter) {
376 iter_ = iterNext;
377 return res;
378 }
379
380 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
381 if constexpr (setCpSize) {
382 *cpSize = iterNext - iter_;
383 }
384
385 return res;
386 }
387
388 template <void encoder(std::string *, char32_t)>
EscapeSymbol() const389 std::string StringView::EscapeSymbol() const
390 {
391 std::string str;
392 str.reserve(Length());
393
394 Iterator iter(*this);
395
396 while (iter.HasNext()) {
397 auto cp = iter.Next();
398
399 switch (cp) {
400 case '\r': {
401 if (iter.HasNext()) {
402 iter.Forward(1);
403
404 if (iter.Peek() != '\n') {
405 iter.Backward(1);
406 }
407 }
408
409 [[fallthrough]];
410 }
411 case '\n': {
412 str += "\\n";
413 break;
414 }
415 case '\b': {
416 str += "\\b";
417 break;
418 }
419 case '\t': {
420 str += "\\t";
421 break;
422 }
423 case '\f': {
424 str += "\\f";
425 break;
426 }
427 case '"': {
428 str += "\\\"";
429 break;
430 }
431 case '\\': {
432 str += "\\\\";
433 break;
434 }
435 default: {
436 encoder(&str, cp);
437 }
438 }
439 }
440
441 return str;
442 }
443
444 template <typename T>
Utf8Encode(T *str, char32_t cu)445 void StringView::Utf8Encode(T *str, char32_t cu)
446 {
447 if (cu < Constants::UTF8_1BYTE_LIMIT) {
448 str->push_back(static_cast<char>(cu));
449 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
450 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
451 Constants::UTF8_2BYTE_HEADER));
452 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
453 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
454 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
455 Constants::UTF8_3BYTE_HEADER));
456 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
457 Constants::UTF8_CONT_HEADER));
458 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
459 } else {
460 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
461 Constants::UTF8_4BYTE_HEADER));
462 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
463 Constants::UTF8_CONT_HEADER));
464 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
465 Constants::UTF8_CONT_HEADER));
466 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
467 }
468 }
469
470 template <typename T>
Mutf8Encode(T *str, char32_t cu)471 void StringView::Mutf8Encode(T *str, char32_t cu)
472 {
473 if (cu == 0) {
474 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
475 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
476 } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
477 str->push_back(static_cast<char>(cu));
478 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
479 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
480 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
481 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
482 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
483 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
484 Constants::UTF8_CONT_HEADER));
485 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
486 } else {
487 auto [cu1, cu2] = EncodeSurrogate(cu);
488 Mutf8Encode(str, cu1);
489 Mutf8Encode(str, cu2);
490 }
491 }
492
493 } // namespace panda::es2panda::util
494
495 // NOLINTNEXTLINE(cert-dcl58-cpp)
496 namespace std {
497
498 template <>
499 // NOLINTNEXTLINE(altera-struct-pack-align)
500 struct hash<panda::es2panda::util::StringView> {
operator ()panda::es2panda::std::hash501 std::size_t operator()(const panda::es2panda::util::StringView &str) const
502 {
503 return std::hash<std::string_view> {}(str.Utf8());
504 }
505 };
506
507 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
508
509 } // namespace std
510
// Debug output stream: expands to std::cout in debug builds. When NDEBUG is defined the
// `false &&` prefix short-circuits, so the streamed expression is never evaluated.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
516
517 #endif
518