1/** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#ifndef ES2PANDA_PARSER_CORE_LEXER_H 17#define ES2PANDA_PARSER_CORE_LEXER_H 18 19#include <ios> 20#include "lexer/regexp/regexp.h" 21#include "lexer/token/letters.h" 22#include "lexer/token/token.h" 23#include "util/enumbitops.h" 24 25namespace ark::es2panda::parser { 26class ParserContext; 27class ETSNolintParser; 28} // namespace ark::es2panda::parser 29 30namespace ark::es2panda::lexer { 31class Keywords; 32 33using ENUMBITOPS_OPERATORS; 34 35enum class NextTokenFlags : uint32_t { 36 NONE = 0U, 37 KEYWORD_TO_IDENT = 1U << 0U, 38 NUMERIC_SEPARATOR_ALLOWED = 1U << 1U, 39 BIGINT_ALLOWED = 1U << 2U, 40}; 41 42class LexerPosition { 43public: 44 explicit LexerPosition(const util::StringView &source); 45 DEFAULT_COPY_SEMANTIC(LexerPosition); 46 DEFAULT_MOVE_SEMANTIC(LexerPosition); 47 ~LexerPosition() = default; 48 49 util::StringView::Iterator &Iterator() 50 { 51 return iterator_; 52 } 53 54 const util::StringView::Iterator &Iterator() const 55 { 56 return iterator_; 57 } 58 59 size_t Line() const 60 { 61 return line_; 62 } 63 64 Token &GetToken() 65 { 66 return token_; 67 } 68 69 const Token &GetToken() const 70 { 71 return token_; 72 } 73 74 size_t &NextTokenLine() 75 { 76 return nextTokenLine_; 77 } 78 79private: 80 friend class Lexer; 81 82 Token token_ {}; 83 util::StringView::Iterator iterator_; 84 size_t line_ {}; 85 size_t nextTokenLine_ {}; 86}; 87 88class LexerTemplateString { 89public: 90 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {} 91 DEFAULT_COPY_SEMANTIC(LexerTemplateString); 92 DEFAULT_MOVE_SEMANTIC(LexerTemplateString); 93 ~LexerTemplateString() = default; 94 95 // NOLINTBEGIN(misc-non-private-member-variables-in-classes) 96 util::UString str; 97 size_t end {}; 98 bool scanExpression {}; 99 // NOLINTEND(misc-non-private-member-variables-in-classes) 100}; 101 102class TemplateLiteralParserContext; 103 104class Lexer { 105public: 106 explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true); 107 NO_COPY_SEMANTIC(Lexer); 108 NO_MOVE_SEMANTIC(Lexer); 109 virtual ~Lexer() = default; 110 111 // NOLINTNEXTLINE(google-default-arguments) 112 virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE); 113 virtual void ScanAsteriskPunctuator(); 114 115 Token &GetToken(); 116 const Token &GetToken() const; 117 size_t Line() const; 118 119 bool TryEatTokenType(lexer::TokenType type) 120 { 121 auto token = GetToken(); 122 if (token.Type() == type) { 123 NextToken(); 124 return true; 125 } 126 return false; 127 } 128 129 std::optional<Token> TryEatTokenKeyword(lexer::TokenType type) 130 { 131 auto token = GetToken(); 132 if (token.KeywordType() == type) { 133 NextToken(); 134 return token; 135 } 136 return std::nullopt; 137 } 138 139 LexerPosition Save() const; 140 void Rewind(const LexerPosition &pos); 141 void BackwardToken(TokenType type, size_t offset); 142 void ForwardToken(TokenType type, size_t offset); 143 144 char32_t Lookahead(); 145 bool CheckArrow(); 146 147 RegExp ScanRegExp(); 148 template <char32_t END> 149 void ScanString(); 150 void ResetTokenEnd(); 151 LexerTemplateString ScanTemplateString(); 152 void ScanTemplateStringEnd(); 153 void PushTemplateContext(TemplateLiteralParserContext *ctx); 154 [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const 155 { 156 ThrowError("Unexpected strict mode reserved keyword"); 157 } 158 159 enum class ConversionResult : uint8_t { 160 SUCCESS, 161 INVALID_ARGUMENT, 162 OUT_OF_RANGE, 163 }; 164 165 template <typename Tret, typename Ret = Tret, typename... Base> 166 static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str, 167 ConversionResult &result, Base... base) noexcept 168 { 169 Ret ret {}; 170 char *endPtr; 171 // NOLINTBEGIN(cppcoreguidelines-special-member-functions) 172 struct SaveErrno { 173 explicit SaveErrno() : errno_(errno) 174 { 175 errno = 0; 176 } 177 ~SaveErrno() 178 { 179 if (errno == 0) { 180 errno = errno_; 181 } 182 } 183 184 private: 185 decltype(errno) errno_; 186 } const savedErrno; 187 // NOLINTEND(cppcoreguidelines-special-member-functions) 188 189 const Tret tmp = converter(str, &endPtr, base...); 190 191 bool outOfRange = false; 192 if constexpr (std::is_same_v<Ret, int>) { 193 outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) || 194 tmp > static_cast<Tret>(std::numeric_limits<int>::max()); 195 } 196 197 if (endPtr == str) { 198 result = ConversionResult::INVALID_ARGUMENT; 199 } else if (errno == ERANGE || outOfRange) { 200 result = ConversionResult::OUT_OF_RANGE; 201 } else { 202 result = ConversionResult::SUCCESS; 203 ret = tmp; 204 } 205 206 return ret; 207 } 208 209 util::StringView SourceView(size_t begin, size_t end) const; 210 211protected: 212 void NextToken(Keywords *kws); 213 ArenaAllocator *Allocator(); 214 bool IsLineTerminatorOrEos() const; 215 void ScanRegExpPattern(); 216 RegExpFlags ScanRegExpFlags(); 217 218 [[noreturn]] void ThrowError(std::string_view message) const; 219 [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const; 220 221 void SetTokenStart(); 222 void SetTokenEnd(); 223 224 inline util::StringView::Iterator &Iterator() 225 { 226 return pos_.iterator_; 227 } 228 229 inline const util::StringView::Iterator &Iterator() const 230 { 231 return pos_.iterator_; 232 } 233 234 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const; 235 236 void SkipWhiteSpaces(); 237 void SkipSingleLineComment(); 238 239 bool ScanPunctuator(); 240 void ScanQuestionPunctuator(); 241 void ScanLessThanPunctuator(); 242 void ScanGreaterThanPunctuator(); 243 virtual void ScanEqualsPunctuator(); 244 virtual void ScanExclamationPunctuator(); 245 void ScanAmpersandPunctuator(); 246 void ScanVLinePunctuator(); 247 void ScanCircumflexPunctuator(); 248 void ScanPlusPunctuator(); 249 void ScanMinusPunctuator(); 250 void ScanSlashPunctuator(); 251 void ScanPercentPunctuator(); 252 void ScanDotPunctuator(); 253 void ScanColonPunctuator(); 254 virtual bool ScanDollarPunctuator(); 255 void ScanAtPunctuator(); 256 257 virtual void SkipMultiLineComment(); 258 virtual void ScanHashMark(); 259 virtual void ScanBackTick(); 260 261 virtual bool ScanCharLiteral() 262 { 263 return false; 264 } 265 266 char32_t ScanUnicodeEscapeSequence(); 267 template <int N, bool IN_AS = false> 268 char32_t ScanHexEscape(); 269 char32_t ScanUnicodeCodePointEscape(); 270 271 void ScanStringUnicodePart(util::UString *str); 272 char32_t ScanUnicodeCharacter(); 273 274 void ScanDecimalNumbers(); 275 276 virtual void ScanNumberLeadingZero() 277 { 278 ScanNumberLeadingZeroImpl<double>(); 279 } 280 281 template <typename RadixType, typename RadixLimit = void *> 282 bool ScanNumberLeadingZeroImpl(); 283 void ScanNumberLeadingZeroImplNonAllowedCases(); 284 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit> 285 bool ScanNumberRadix(bool allowNumericSeparator = true); 286 void ScanNumber(bool allowBigInt = true); 287 std::tuple<size_t, bool, NumberFlags> ScanCharLex(bool allowBigInt, bool parseExponent, NumberFlags flags); 288 size_t ScanSignOfNumber(); 289 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit> 290 bool ScanTooLargeNumber(RadixType number); 291 virtual void ConvertNumber(const std::string &utf8, NumberFlags flags); 292 void ScanDecimalLiteral(); 293 void ScanDecimalDigits(bool allowNumericSeparator); 294 virtual void CheckNumberLiteralEnd(); 295 void CheckOctal(); 296 297 inline static uint32_t HexValue(char32_t ch); 298 inline static bool IsDecimalDigit(uint32_t cp); 299 inline static bool IsHexDigit(char32_t ch); 300 inline static bool IsBinaryDigit(char32_t ch); 301 inline static bool IsOctalDigit(char32_t ch); 302 303 friend class KeywordsUtil; 304 friend class TemplateLiteralParserContext; 305 friend class parser::ETSNolintParser; 306 307 LexerPosition &Pos(); 308 const LexerPosition &Pos() const; 309 310private: 311 TemplateLiteralParserContext *tlCtx_ {}; 312 ArenaAllocator *allocator_; 313 Keywords *kws_ {}; 314 const parser::ParserContext *parserContext_; 315 util::StringView source_; 316 LexerPosition pos_; 317}; 318 319class TemplateLiteralParserContext { 320public: 321 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {} 322 NO_MOVE_SEMANTIC(TemplateLiteralParserContext); 323 NO_COPY_SEMANTIC(TemplateLiteralParserContext); 324 325 ~TemplateLiteralParserContext() 326 { 327 lexer_->tlCtx_ = prev_; 328 } 329 330 void ConsumeLeftBrace() 331 { 332 braceDepth_++; 333 } 334 335 bool ConsumeRightBrace() 336 { 337 braceDepth_--; 338 339 return braceDepth_ == 0; 340 } 341 342private: 343 Lexer *lexer_; 344 TemplateLiteralParserContext *prev_ {}; 345 size_t braceDepth_ {1}; 346}; 347 348template <char32_t END> 349void Lexer::ScanString() 350{ 351 util::UString str(Allocator()); 352 GetToken().type_ = TokenType::LITERAL_STRING; 353 GetToken().keywordType_ = TokenType::LITERAL_STRING; 354 355 const auto startPos = Iterator().Index(); 356 auto escapeEnd = startPos; 357 358 do { 359 char32_t cp = Iterator().Peek(); 360 361 switch (cp) { 362 case util::StringView::Iterator::INVALID_CP: { 363 ThrowError("Unterminated string"); 364 break; 365 } 366 case LEX_CHAR_CR: 367 case LEX_CHAR_LF: { 368 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 369 if constexpr (END != LEX_CHAR_BACK_TICK) { 370 ThrowError("Newline is not allowed in strings"); 371 } 372 373 GetToken().flags_ |= TokenFlags::HAS_ESCAPE; 374 str.Append(SourceView(escapeEnd, Iterator().Index())); 375 376 if (cp == LEX_CHAR_CR) { 377 Iterator().Forward(1); 378 379 if (Iterator().Peek() != LEX_CHAR_LF) { 380 Iterator().Backward(1); 381 } 382 } 383 384 pos_.line_++; 385 str.Append(LEX_CHAR_LF); 386 Iterator().Forward(1); 387 escapeEnd = Iterator().Index(); 388 continue; 389 } 390 case LEX_CHAR_BACKSLASH: { 391 GetToken().flags_ |= TokenFlags::HAS_ESCAPE; 392 str.Append(SourceView(escapeEnd, Iterator().Index())); 393 394 Iterator().Forward(1); 395 ScanStringUnicodePart(&str); 396 escapeEnd = Iterator().Index(); 397 continue; 398 } 399 case LEX_CHAR_BACK_TICK: 400 case LEX_CHAR_SINGLE_QUOTE: 401 case LEX_CHAR_DOUBLE_QUOTE: { 402 if (END == cp) { 403 break; 404 } 405 406 Iterator().Forward(1); 407 continue; 408 } 409 case LEX_CHAR_DOLLAR_SIGN: { 410 Iterator().Forward(1); 411 412 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 413 if constexpr (END == LEX_CHAR_BACK_TICK) { 414 if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) { 415 Iterator().Backward(1); 416 break; 417 } 418 } 419 420 continue; 421 } 422 default: { 423 Iterator().SkipCp(); 424 continue; 425 } 426 } 427 428 if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) { 429 str.Append(SourceView(escapeEnd, Iterator().Index())); 430 GetToken().src_ = str.View(); 431 } else { 432 GetToken().src_ = SourceView(startPos, Iterator().Index()); 433 } 434 435 break; 436 } while (true); 437 438 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 439 if constexpr (END != LEX_CHAR_BACK_TICK) { 440 Iterator().Forward(1); 441 } 442} 443 444template <int N, bool IN_AS> 445char32_t Lexer::ScanHexEscape() 446{ 447 char32_t code = 0; 448 449 for (size_t i = 0; i < N; ++i) { 450 const auto cp = Iterator().Peek(); 451 if (IN_AS && cp == LEX_CHAR_BACK_TICK) { 452 break; 453 } 454 455 Iterator().Forward(1); 456 457 if (!IsHexDigit(cp)) { 458 ThrowError("Invalid unicode escape sequence"); 459 } 460 461 constexpr auto MULTIPLIER = 16; 462 code = code * MULTIPLIER + HexValue(cp); 463 } 464 465 return code; 466} 467 468template <typename RadixType, typename RadixLimit> 469bool Lexer::ScanNumberLeadingZeroImpl() 470{ 471 GetToken().type_ = TokenType::LITERAL_NUMBER; 472 GetToken().keywordType_ = TokenType::LITERAL_NUMBER; 473 474 switch (Iterator().Peek()) { 475 case LEX_CHAR_LOWERCASE_X: 476 case LEX_CHAR_UPPERCASE_X: { 477 Iterator().Forward(1); 478 constexpr auto RADIX = 16; 479 if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>()) { 480 return false; 481 } 482 CheckNumberLiteralEnd(); 483 return true; 484 } 485 case LEX_CHAR_LOWERCASE_B: 486 case LEX_CHAR_UPPERCASE_B: { 487 Iterator().Forward(1); 488 constexpr auto RADIX = 2; 489 if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>()) { 490 return false; 491 } 492 CheckNumberLiteralEnd(); 493 return true; 494 } 495 case LEX_CHAR_LOWERCASE_O: 496 case LEX_CHAR_UPPERCASE_O: { 497 Iterator().Forward(1); 498 constexpr auto RADIX = 8; 499 if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>()) { 500 return false; 501 } 502 CheckOctal(); 503 CheckNumberLiteralEnd(); 504 return true; 505 } 506 default: { 507 ScanNumberLeadingZeroImplNonAllowedCases(); 508 break; 509 } 510 } 511 512 ScanNumber(); 513 return true; 514} 515 516template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit> 517bool Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number) 518{ 519 if constexpr (std::is_arithmetic_v<RadixLimit>) { 520 if (number > std::numeric_limits<RadixLimit>::max() / RADIX) { 521 return false; 522 } 523 } 524 return true; 525} 526 527template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit> 528bool Lexer::ScanNumberRadix(bool allowNumericSeparator) 529{ 530 RadixType number {}; 531 532 auto cp = Iterator().Peek(); 533 if (!RANGE_CHECK(cp)) { 534 ThrowError("Invalid digit"); 535 } 536 537 bool allowNumericOnNext = true; 538 539 do { 540 cp = Iterator().Peek(); 541 if (RANGE_CHECK(cp)) { 542 auto digit = HexValue(cp); 543 544 if (!ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number)) { 545 return false; 546 } 547 548 number = number * RADIX + digit; 549 Iterator().Forward(1); 550 allowNumericOnNext = true; 551 continue; 552 } 553 554 if (cp == LEX_CHAR_UNDERSCORE) { 555 if (!allowNumericSeparator || !allowNumericOnNext) { 556 ThrowError("Invalid numeric separator"); 557 } 558 559 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE; 560 Iterator().Forward(1); 561 allowNumericOnNext = false; 562 continue; 563 } 564 565 if (!allowNumericOnNext) { 566 Iterator().Backward(1); 567 ThrowError("Numeric separators are not allowed at the end of numeric literals"); 568 } 569 570 break; 571 } while (true); 572 573 GetToken().number_ = lexer::Number(number); 574 return true; 575} 576 577inline uint32_t Lexer::HexValue(char32_t ch) 578{ 579 constexpr uint32_t HEX_MASK = 0xF; 580 constexpr uint32_t DEC_OFFSET = 10; 581 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK); 582} 583 584inline bool Lexer::IsDecimalDigit(uint32_t cp) 585{ 586 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9); 587} 588 589inline bool Lexer::IsHexDigit(char32_t ch) 590{ 591 return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0); 592} 593 594inline bool Lexer::IsBinaryDigit(char32_t ch) 595{ 596 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1; 597} 598 599inline bool Lexer::IsOctalDigit(char32_t ch) 600{ 601 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7); 602} 603} // namespace ark::es2panda::lexer 604 605template <> 606struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type { 607}; 608 609#endif 610