1/* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H 17#define ES2PANDA_UTIL_INCLUDE_USTRING_H 18 19#include <macros.h> 20#include <utils/arena_containers.h> 21 22#include <cstddef> 23#include <limits> 24#include <memory> 25#include <string> 26#include <string_view> 27 28namespace panda::es2panda::util { 29 30class StringView { 31public: 32 explicit StringView() noexcept = default; 33 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {} 34 // NOLINTNEXTLINE(google-explicit-constructor) 35 StringView(const std::string_view &sv) noexcept : sv_(sv) {} 36 // NOLINTNEXTLINE(google-explicit-constructor) 37 StringView(const char *str) noexcept : sv_(str) {} 38 DEFAULT_COPY_SEMANTIC(StringView); 39 DEFAULT_MOVE_SEMANTIC(StringView); 40 ~StringView() = default; 41 42 bool operator==(const StringView &rhs) const noexcept 43 { 44 return sv_ == rhs.sv_; 45 } 46 47 bool operator!=(const StringView &rhs) const noexcept 48 { 49 return sv_ != rhs.sv_; 50 } 51 52 bool operator<(const StringView &rhs) const noexcept 53 { 54 return sv_ < rhs.sv_; 55 } 56 57 bool operator>(const StringView &rhs) const noexcept 58 { 59 return sv_ > rhs.sv_; 60 } 61 62 int Compare(const StringView &other) const noexcept 63 { 64 return sv_.compare(other.sv_); 65 } 66 67 int Compare(const std::string_view &other) const noexcept 68 { 69 return sv_.compare(other); 70 } 71 72 bool Is(const char *str) const noexcept 73 { 74 return sv_ == str; 75 } 76 77 bool Is(const std::string_view &str) const noexcept 78 { 79 return sv_ == str; 80 } 81 82 size_t Length() const noexcept 83 { 84 return sv_.length(); 85 } 86 87 bool Empty() const noexcept 88 { 89 return sv_.empty(); 90 } 91 92 const std::string_view &Utf8() const noexcept 93 { 94 return sv_; 95 } 96 97 explicit operator std::string() const noexcept 98 { 99 return std::string {sv_}; 100 } 101 102 const char *Bytes() const noexcept 103 { 104 return sv_.data(); 105 } 106 107 StringView Substr(size_t begin, size_t end) const noexcept 108 { 109 return StringView(std::string_view(sv_.data() + begin, end - begin)); 110 } 111 112 constexpr size_t Find(const char *str) const 113 { 114 return sv_.find(str); 115 } 116 117 static bool IsHighSurrogate(char32_t cp) 118 { 119 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX); 120 } 121 122 static bool IsLowSurrogate(char32_t cp) 123 { 124 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX); 125 } 126 127 std::string Mutf8() const noexcept; 128 static char32_t DecodeSurrogates(char32_t high, char32_t low); 129 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp); 130 131 template <void encoder(std::string *, char32_t)> 132 std::string EscapeSymbol() const; 133 134 template <typename T> 135 static void Utf8Encode(T *str, char32_t cu); 136 template <typename T> 137 static void Mutf8Encode(T *str, char32_t cu); 138 139 class Iterator { 140 public: 141 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max(); 142 143 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {} 144 DEFAULT_COPY_SEMANTIC(Iterator); 145 DEFAULT_MOVE_SEMANTIC(Iterator); 146 ~Iterator() = default; 147 148 inline size_t Index() const 149 { 150 return static_cast<size_t>(iter_ - sv_.begin()); 151 } 152 153 inline char32_t Next() 154 { 155 return DecodeCP<true>(nullptr); 156 } 157 158 inline char32_t Peek() const 159 { 160 return HasNext() ? *iter_ : INVALID_CP; 161 } 162 163 inline char32_t PeekCp() const 164 { 165 return DecodeCP<false>(nullptr); 166 } 167 168 inline char32_t PeekCp(size_t *cpSize) const 169 { 170 return DecodeCP<false, true>(cpSize); 171 } 172 173 inline void Forward(size_t offset) const 174 { 175 iter_ += offset; 176 } 177 178 inline void Backward(size_t offset) const 179 { 180 iter_ -= offset; 181 } 182 183 inline void Reset(size_t offset) 184 { 185 iter_ = sv_.begin() + offset; 186 } 187 188 inline void Rewind(std::string_view::const_iterator pos) const 189 { 190 iter_ = pos; 191 } 192 193 inline std::string_view::const_iterator Save() const 194 { 195 return iter_; 196 } 197 198 inline bool HasNext() const 199 { 200 return iter_ != sv_.end(); 201 } 202 203 bool HasExpectedNumberOfBytes(size_t count) const 204 { 205 for (size_t i = 0; i < count; ++i) { 206 if (!HasNext()) { 207 return false; 208 } 209 ++iter_; 210 } 211 iter_ -= count; 212 return true; 213 } 214 215 void SkipCp() const; 216 217 private: 218 template <bool moveIter, bool setCpSize = false> 219 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const; 220 221 std::string_view sv_; 222 mutable std::string_view::const_iterator iter_; 223 }; 224 225private: 226 class Constants { 227 public: 228 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80; 229 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800; 230 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000; 231 232 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F; 233 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F; 234 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07; 235 236 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8; 237 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4; 238 239 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0; 240 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0; 241 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0; 242 243 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U; 244 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U; 245 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U; 246 247 static constexpr uint16_t UTF8_CONT_MASK = 0x3F; 248 static constexpr uint16_t UTF8_CONT_HEADER = 0x80; 249 250 static constexpr size_t UTF8_NEXT_ONE_BYTE = 1; 251 static constexpr size_t UTF8_NEXT_TWO_BYTE = 2; 252 static constexpr size_t UTF8_NEXT_THREE_BYTE = 3; 253 static constexpr size_t UTF8_NEXT_FOUR_BYTE = 4; 254 255 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800; 256 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00; 257 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00; 258 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000; 259 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff; 260 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT; 261 }; 262 263 friend class Iterator; 264 std::string_view sv_; 265}; 266 267class UString { 268public: 269 UString() = default; 270 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {} 271 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator) 272 { 273 Alloc(); 274 *str_ = str; 275 } 276 277 DEFAULT_COPY_SEMANTIC(UString); 278 DEFAULT_MOVE_SEMANTIC(UString); 279 ~UString() = default; 280 281 util::StringView View() const 282 { 283 if (!str_) { 284 return util::StringView(); 285 } 286 287 return util::StringView(str_); 288 } 289 290 void Append(char32_t ch) noexcept 291 { 292 if (!str_) { 293 Alloc(); 294 } 295 296 StringView::Utf8Encode<ArenaString>(str_, ch); 297 } 298 299 void Append(const StringView &other) noexcept 300 { 301 if (!str_) { 302 Alloc(); 303 } 304 305 *str_ += other.Utf8(); 306 } 307 308 void Append(const char *other) noexcept 309 { 310 if (!str_) { 311 Alloc(); 312 } 313 *str_ += other; 314 } 315 316private: 317 void Alloc() 318 { 319 str_ = allocator_->New<ArenaString>(allocator_->Adapter()); 320 CHECK_NOT_NULL(str_); 321 } 322 323protected: 324 ArenaString *str_ {}; 325 ArenaAllocator *allocator_ {}; 326}; 327 328template <bool moveIter, bool setCpSize> 329char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const 330{ 331 if (!HasNext()) { 332 return INVALID_CP; 333 } 334 335 const auto *iterNext = iter_; 336 337 char32_t cu0 = static_cast<uint8_t>(*iterNext++); 338 char32_t res {}; 339 340 if (cu0 < Constants::UTF8_1BYTE_LIMIT) { 341 res = cu0; 342 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) { 343 // Should be 2 bytes decoded in UTF-8 344 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) { 345 return INVALID_CP; 346 } 347 char32_t cu1 = static_cast<uint8_t>(*iterNext++); 348 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK); 349 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) { 350 // Should be 3 bytes decoded in UTF-8 351 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) { 352 return INVALID_CP; 353 } 354 char32_t cu1 = static_cast<uint8_t>(*iterNext++); 355 char32_t cu2 = static_cast<uint8_t>(*iterNext++); 356 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) | 357 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK); 358 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) && 359 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) { 360 // Should be 4 bytes decoded in UTF-8 361 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_FOUR_BYTE)) { 362 return INVALID_CP; 363 } 364 char32_t cu1 = static_cast<uint8_t>(*iterNext++); 365 char32_t cu2 = static_cast<uint8_t>(*iterNext++); 366 char32_t cu3 = static_cast<uint8_t>(*iterNext++); 367 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) | 368 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) | 369 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK); 370 } else { 371 res = INVALID_CP; 372 } 373 374 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 375 if constexpr (moveIter) { 376 iter_ = iterNext; 377 return res; 378 } 379 380 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 381 if constexpr (setCpSize) { 382 *cpSize = iterNext - iter_; 383 } 384 385 return res; 386} 387 388template <void encoder(std::string *, char32_t)> 389std::string StringView::EscapeSymbol() const 390{ 391 std::string str; 392 str.reserve(Length()); 393 394 Iterator iter(*this); 395 396 while (iter.HasNext()) { 397 auto cp = iter.Next(); 398 399 switch (cp) { 400 case '\r': { 401 if (iter.HasNext()) { 402 iter.Forward(1); 403 404 if (iter.Peek() != '\n') { 405 iter.Backward(1); 406 } 407 } 408 409 [[fallthrough]]; 410 } 411 case '\n': { 412 str += "\\n"; 413 break; 414 } 415 case '\b': { 416 str += "\\b"; 417 break; 418 } 419 case '\t': { 420 str += "\\t"; 421 break; 422 } 423 case '\f': { 424 str += "\\f"; 425 break; 426 } 427 case '"': { 428 str += "\\\""; 429 break; 430 } 431 case '\\': { 432 str += "\\\\"; 433 break; 434 } 435 default: { 436 encoder(&str, cp); 437 } 438 } 439 } 440 441 return str; 442} 443 444template <typename T> 445void StringView::Utf8Encode(T *str, char32_t cu) 446{ 447 if (cu < Constants::UTF8_1BYTE_LIMIT) { 448 str->push_back(static_cast<char>(cu)); 449 } else if (cu < Constants::UTF8_2BYTE_LIMIT) { 450 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) | 451 Constants::UTF8_2BYTE_HEADER)); 452 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 453 } else if (cu < Constants::UTF8_3BYTE_LIMIT) { 454 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) | 455 Constants::UTF8_3BYTE_HEADER)); 456 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 457 Constants::UTF8_CONT_HEADER)); 458 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 459 } else { 460 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) | 461 Constants::UTF8_4BYTE_HEADER)); 462 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 463 Constants::UTF8_CONT_HEADER)); 464 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 465 Constants::UTF8_CONT_HEADER)); 466 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 467 } 468} 469 470template <typename T> 471void StringView::Mutf8Encode(T *str, char32_t cu) 472{ 473 if (cu == 0) { 474 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER)); 475 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER)); 476 } else if (cu < Constants::UTF8_1BYTE_LIMIT) { 477 str->push_back(static_cast<char>(cu)); 478 } else if (cu < Constants::UTF8_2BYTE_LIMIT) { 479 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER)); 480 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 481 } else if (cu < Constants::UTF8_3BYTE_LIMIT) { 482 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER)); 483 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 484 Constants::UTF8_CONT_HEADER)); 485 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 486 } else { 487 auto [cu1, cu2] = EncodeSurrogate(cu); 488 Mutf8Encode(str, cu1); 489 Mutf8Encode(str, cu2); 490 } 491} 492 493} // namespace panda::es2panda::util 494 495// NOLINTNEXTLINE(cert-dcl58-cpp) 496namespace std { 497 498template <> 499// NOLINTNEXTLINE(altera-struct-pack-align) 500struct hash<panda::es2panda::util::StringView> { 501 std::size_t operator()(const panda::es2panda::util::StringView &str) const 502 { 503 return std::hash<std::string_view> {}(str.Utf8()); 504 } 505}; 506 507ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us); 508 509} // namespace std 510 511#ifndef NDEBUG 512#define DCOUT std::cout 513#else 514#define DCOUT false && std::cout 515#endif // NDEBUG 516 517#endif 518