1/** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H 17#define ES2PANDA_UTIL_INCLUDE_USTRING_H 18 19#include "macros.h" 20#include "utils/arena_containers.h" 21 22#include <cstddef> 23#include <limits> 24#include <memory> 25#include <string> 26#include <string_view> 27 28namespace ark::es2panda::util { 29class StringView { 30public: 31 explicit StringView() noexcept = default; 32 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {} 33 // NOLINTNEXTLINE(google-explicit-constructor) 34 StringView(std::string_view sv) noexcept : sv_(sv) {} 35 // NOLINTNEXTLINE(google-explicit-constructor) 36 StringView(const char *str) noexcept : sv_(str == nullptr ? "" : str) {} 37 DEFAULT_COPY_SEMANTIC(StringView); 38 DEFAULT_MOVE_SEMANTIC(StringView); 39 ~StringView() = default; 40 41 bool operator==(const StringView &rhs) const noexcept 42 { 43 return sv_ == rhs.sv_; 44 } 45 46 bool operator!=(const StringView &rhs) const noexcept 47 { 48 return sv_ != rhs.sv_; 49 } 50 51 bool operator<(const StringView &rhs) const noexcept 52 { 53 return sv_ < rhs.sv_; 54 } 55 56 bool operator>(const StringView &rhs) const noexcept 57 { 58 return sv_ > rhs.sv_; 59 } 60 61 int Compare(const StringView &other) const noexcept 62 { 63 return sv_.compare(other.sv_); 64 } 65 66 int Compare(const std::string_view &other) const noexcept 67 { 68 return sv_.compare(other); 69 } 70 71 bool Is(const char *str) const noexcept 72 { 73 return sv_ == str; 74 } 75 76 bool Is(const std::string_view &str) const noexcept 77 { 78 return sv_ == str; 79 } 80 81 size_t Length() const noexcept 82 { 83 return sv_.length(); 84 } 85 86 bool Empty() const noexcept 87 { 88 return sv_.empty(); 89 } 90 91 const std::string_view &Utf8() const noexcept 92 { 93 return sv_; 94 } 95 96 explicit operator std::string() const noexcept 97 { 98 return std::string {sv_}; 99 } 100 101 const char *Bytes() const noexcept 102 { 103 return sv_.data(); 104 } 105 106 StringView Substr(size_t begin, size_t end) const noexcept 107 { 108 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 109 return StringView(std::string_view(sv_.data() + begin, end - begin)); 110 } 111 112 static bool IsHighSurrogate(char32_t cp) 113 { 114 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX); 115 } 116 117 static bool IsLowSurrogate(char32_t cp) 118 { 119 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX); 120 } 121 122 std::string Mutf8() const noexcept; 123 static char32_t DecodeSurrogates(char32_t high, char32_t low); 124 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp); 125 126 template <void ENCODER(std::string *, char32_t)> 127 std::string EscapeSymbol() const; 128 129 template <typename T> 130 static void Utf8Encode(T *str, char32_t cu); 131 template <typename T> 132 static void Mutf8Encode(T *str, char32_t cu); 133 134 bool IsConvertibleToChar() const; 135 136 class Iterator { 137 public: 138 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max(); 139 140 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {} 141 DEFAULT_COPY_SEMANTIC(Iterator); 142 DEFAULT_MOVE_SEMANTIC(Iterator); 143 ~Iterator() = default; 144 145 inline size_t Index() const 146 { 147 return static_cast<size_t>(iter_ - sv_.begin()); 148 } 149 150 inline char32_t Next() 151 { 152 return DecodeCP<true>(nullptr); 153 } 154 155 inline char32_t Peek() const 156 { 157 return HasNext() ? *iter_ : INVALID_CP; 158 } 159 160 inline char32_t PeekCp() const 161 { 162 return DecodeCP<false>(nullptr); 163 } 164 165 inline char32_t PeekCp(size_t *cpSize) const 166 { 167 return DecodeCP<false, true>(cpSize); 168 } 169 170 inline void Forward(size_t offset) 171 { 172 iter_ += offset; 173 } 174 175 inline void Backward(size_t offset) 176 { 177 iter_ -= offset; 178 } 179 180 inline void Reset(size_t offset) 181 { 182 iter_ = sv_.begin() + offset; 183 } 184 185 inline void Rewind(std::string_view::const_iterator pos) 186 { 187 iter_ = pos; 188 } 189 190 inline std::string_view::const_iterator Save() const 191 { 192 return iter_; 193 } 194 195 inline bool HasNext() const 196 { 197 return iter_ != sv_.end(); 198 } 199 200 void SkipCp(); 201 202 private: 203 template <bool MOVE_ITER, bool SET_CP_SIZE = false> 204 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const; 205 206 std::string_view sv_; 207 mutable std::string_view::const_iterator iter_; 208 }; 209 210 class Constants { 211 public: 212 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80; 213 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800; 214 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000; 215 216 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F; 217 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F; 218 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07; 219 220 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8; 221 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4; 222 223 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0; 224 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0; 225 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0; 226 227 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U; 228 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U; 229 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U; 230 231 static constexpr uint16_t UTF8_CONT_MASK = 0x3F; 232 static constexpr uint16_t UTF8_CONT_HEADER = 0x80; 233 234 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800; 235 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00; 236 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00; 237 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000; 238 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff; 239 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT; 240 }; 241 242private: 243 friend class Iterator; 244 std::string_view sv_; 245}; 246 247class UString { 248public: 249 UString() = default; 250 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {} 251 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator) 252 { 253 Alloc(); 254 *str_ = str; 255 } 256 257 explicit UString(const std::string_view &str, ArenaAllocator *allocator) : UString(allocator) 258 { 259 Alloc(); 260 *str_ = str; 261 } 262 263 explicit UString(const util::StringView &str, ArenaAllocator *allocator) : UString(str.Utf8(), allocator) {} 264 265 DEFAULT_COPY_SEMANTIC(UString); 266 DEFAULT_MOVE_SEMANTIC(UString); 267 ~UString() = default; 268 269 util::StringView View() const 270 { 271 if (str_ == nullptr) { 272 return util::StringView(); 273 } 274 275 return util::StringView(str_); 276 } 277 278 util::StringView View() 279 { 280 if (str_ == nullptr) { 281 return util::StringView(); 282 } 283 284 return util::StringView(str_); 285 } 286 287 void Append(char32_t ch) noexcept 288 { 289 if (str_ == nullptr) { 290 Alloc(); 291 } 292 293 StringView::Utf8Encode<ArenaString>(str_, ch); 294 } 295 296 void Append(const StringView &other) noexcept 297 { 298 if (str_ == nullptr) { 299 Alloc(); 300 } 301 302 *str_ += other.Utf8(); 303 } 304 305 void Append(const char *other) noexcept 306 { 307 if (str_ == nullptr) { 308 Alloc(); 309 } 310 *str_ += other; 311 } 312 313 void Append(const std::string &other) noexcept 314 { 315 if (str_ == nullptr) { 316 Alloc(); 317 } 318 *str_ += other; 319 } 320 321private: 322 void Alloc() 323 { 324 str_ = allocator_->New<ArenaString>(allocator_->Adapter()); 325 } 326 327protected: 328 // NOLINTBEGIN(misc-non-private-member-variables-in-classes) 329 ArenaString *str_ {}; 330 ArenaAllocator *allocator_ {}; 331 // NOLINTEND(misc-non-private-member-variables-in-classes) 332}; 333 334template <bool MOVE_ITER, bool SET_CP_SIZE> 335char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const 336{ 337 if (!HasNext()) { 338 return INVALID_CP; 339 } 340 341 const auto *iterNext = iter_; 342 343 char32_t cu0 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 344 char32_t res {}; 345 346 if (cu0 < Constants::UTF8_1BYTE_LIMIT) { 347 res = cu0; 348 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) { 349 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 350 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK); 351 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) { 352 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 353 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 354 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) | 355 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK); 356 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) && 357 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) { 358 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 359 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 360 char32_t cu3 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 361 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) | 362 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) | 363 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK); 364 } else { 365 res = INVALID_CP; 366 } 367 368 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 369 if constexpr (MOVE_ITER) { 370 iter_ = iterNext; 371 return res; 372 } 373 374 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 375 if constexpr (SET_CP_SIZE) { 376 *cpSize = iterNext - iter_; 377 } 378 379 return res; 380} 381 382template <void ENCODER(std::string *, char32_t)> 383std::string StringView::EscapeSymbol() const 384{ 385 std::string str; 386 str.reserve(Length()); 387 388 auto skipNewLine = [](auto &iter) { 389 if (iter.HasNext()) { 390 iter.Forward(1); 391 392 if (iter.Peek() != '\n') { 393 iter.Backward(1); 394 } 395 } 396 }; 397 398 Iterator iter(*this); 399 while (iter.HasNext()) { 400 auto cp = iter.Next(); 401 402 switch (cp) { 403 case '\r': { 404 skipNewLine(iter); 405 [[fallthrough]]; 406 } 407 case '\n': { 408 str += "\\n"; 409 break; 410 } 411 case '\b': { 412 str += "\\b"; 413 break; 414 } 415 case '\t': { 416 str += "\\t"; 417 break; 418 } 419 case '\f': { 420 str += "\\f"; 421 break; 422 } 423 case '"': { 424 str += "\\\""; 425 break; 426 } 427 case '\\': { 428 str += "\\\\"; 429 break; 430 } 431 default: { 432 ENCODER(&str, cp); 433 } 434 } 435 } 436 437 return str; 438} 439 440template <typename T> 441void StringView::Utf8Encode(T *str, char32_t cu) 442{ 443 if (cu < Constants::UTF8_1BYTE_LIMIT) { 444 str->push_back(static_cast<char>(cu)); 445 } else if (cu < Constants::UTF8_2BYTE_LIMIT) { 446 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) | 447 Constants::UTF8_2BYTE_HEADER)); 448 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 449 } else if (cu < Constants::UTF8_3BYTE_LIMIT) { 450 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) | 451 Constants::UTF8_3BYTE_HEADER)); 452 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 453 Constants::UTF8_CONT_HEADER)); 454 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 455 } else { 456 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) | 457 Constants::UTF8_4BYTE_HEADER)); 458 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 459 Constants::UTF8_CONT_HEADER)); 460 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 461 Constants::UTF8_CONT_HEADER)); 462 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 463 } 464} 465 466template <typename T> 467void StringView::Mutf8Encode(T *str, char32_t cu) 468{ 469 if (cu == 0) { 470 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER)); 471 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER)); 472 } else if (cu < Constants::UTF8_1BYTE_LIMIT) { 473 str->push_back(static_cast<char>(cu)); 474 } else if (cu < Constants::UTF8_2BYTE_LIMIT) { 475 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER)); 476 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 477 } else if (cu < Constants::UTF8_3BYTE_LIMIT) { 478 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER)); 479 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) | 480 Constants::UTF8_CONT_HEADER)); 481 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER)); 482 } else { 483 auto [cu1, cu2] = EncodeSurrogate(cu); 484 Mutf8Encode(str, cu1); 485 Mutf8Encode(str, cu2); 486 } 487} 488} // namespace ark::es2panda::util 489 490// NOLINTNEXTLINE(cert-dcl58-cpp) 491namespace std { 492 493template <> 494// NOLINTNEXTLINE(altera-struct-pack-align) 495struct hash<ark::es2panda::util::StringView> { 496 std::size_t operator()(const ark::es2panda::util::StringView &str) const 497 { 498 return std::hash<std::string_view> {}(str.Utf8()); 499 } 500}; 501 502ostream &operator<<(ostream &os, const ark::es2panda::util::StringView &us); 503 504} // namespace std 505 506#endif 507