1/** 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#ifndef ES2PANDA_PARSER_CORE_LEXER_H 17#define ES2PANDA_PARSER_CORE_LEXER_H 18 19#include <lexer/regexp/regexp.h> 20#include <lexer/token/letters.h> 21#include <lexer/token/token.h> 22#include <util/enumbitops.h> 23 24namespace panda::es2panda::parser { 25class ParserContext; 26} // namespace panda::es2panda::parser 27 28namespace panda::es2panda::lexer { 29 30enum class LexerNextTokenFlags : uint8_t { 31 NONE = 0, 32 KEYWORD_TO_IDENT = 1 << 0, 33 NUMERIC_SEPARATOR_ALLOWED = 1 << 1, 34 BIGINT_ALLOWED = 1 << 2, 35}; 36 37DEFINE_BITOPS(LexerNextTokenFlags) 38 39class LexerPosition { 40public: 41 explicit LexerPosition(const util::StringView &source); 42 DEFAULT_COPY_SEMANTIC(LexerPosition); 43 DEFAULT_MOVE_SEMANTIC(LexerPosition); 44 ~LexerPosition() = default; 45 46 Token token {}; 47 util::StringView::Iterator iterator; 48 size_t line {}; 49 size_t nextTokenLine {}; 50}; 51 52class LexerTemplateString { 53public: 54 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {} 55 DEFAULT_COPY_SEMANTIC(LexerTemplateString); 56 DEFAULT_MOVE_SEMANTIC(LexerTemplateString); 57 ~LexerTemplateString() = default; 58 59 util::UString str; 60 size_t end {}; 61 bool scanExpression {}; 62}; 63 64class TemplateLiteralParserContext; 65 66class Lexer { 67public: 68 explicit Lexer(const parser::ParserContext *parserContext); 69 NO_COPY_SEMANTIC(Lexer); 70 NO_MOVE_SEMANTIC(Lexer); 71 ~Lexer() = default; 72 73 void NextToken(LexerNextTokenFlags flags = LexerNextTokenFlags::NONE); 74 75 Token &GetToken(); 76 const Token &GetToken() const; 77 size_t Line() const; 78 79 LexerPosition Save() const; 80 void Rewind(const LexerPosition &pos); 81 void BackwardToken(TokenType type, size_t offset); 82 void ForwardToken(TokenType type, size_t offset); 83 84 char32_t Lookahead(); 85 bool CheckArrow(); 86 87 RegExp ScanRegExp(); 88 template <char32_t end> 89 void ScanString(); 90 void ResetTokenEnd(); 91 LexerTemplateString ScanTemplateString(); 92 void ScanTemplateStringEnd(); 93 void PushTemplateContext(TemplateLiteralParserContext *ctx); 94 void AssignTokenTaggedTemplate(); 95 96private: 97 ArenaAllocator *Allocator(); 98 bool IsLineTerminatorOrEos() const; 99 void ScanRegExpPattern(); 100 bool GetRegExpFlag(char32_t cp, RegExpFlags &flag); 101 RegExpFlags ScanRegExpFlags(); 102 103 void ThrowError(std::string_view message); 104 105 void SetTokenStart(); 106 void SetTokenEnd(); 107 bool CheckTokenIsTaggedTemplate() const; 108 109 inline util::StringView::Iterator &Iterator() 110 { 111 return pos_.iterator; 112 } 113 114 inline const util::StringView::Iterator &Iterator() const 115 { 116 return pos_.iterator; 117 } 118 119 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const; 120 util::StringView SourceView(size_t begin, size_t end) const; 121 122 void SkipWhiteSpaces(); 123 void SkipSingleLineComment(); 124 void SkipMultiLineComment(); 125 template <TokenType keyword_type> 126 void CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags); 127 void CheckKeywordEscape(TokenType type); 128 void CheckAwaitKeyword(); 129 void CheckArgumentsKeyword(); 130 void CheckEnumKeyword(); 131 void CheckLetKeyword(); 132 void CheckYieldKeyword(); 133 void CheckFutureReservedKeyword(TokenType keywordType); 134 135 bool ScanPunctuator(); 136 void ScanQuestionPunctuator(); 137 void ScanLessThanPunctuator(); 138 void ScanGreaterThanPunctuator(); 139 void ScanEqualsPunctuator(); 140 void ScanExclamationPunctuator(); 141 void ScanAmpersandPunctuator(); 142 void ScanVLinePunctuator(); 143 void ScanCircumflexPunctuator(); 144 void ScanPlusPunctuator(); 145 void ScanMinusPunctuator(); 146 void ScanAsterixPunctuator(); 147 void ScanSlashPunctuator(); 148 void ScanPercentPunctuator(); 149 void ScanDotPunctuator(); 150 151 char32_t ScanUnicodeEscapeSequence(); 152 template <int N> 153 char32_t ScanHexEscape(); 154 char32_t ScanUnicodeCodePointEscape(); 155 156 void ScanStringUnicodePart(util::UString *str); 157 158 void ScanNumberLeadingZero(); 159 void ScanDecimalNumbers(bool allowNumericSeparator); 160 template <bool rangeCheck(char32_t), int radix> 161 void ScanNumberRadix(bool allowNumericSeparator = true); 162 void ScanNumber(bool allowNumericSeparator = true, bool allowBigInt = true); 163 void ConvertNumber(size_t exponentSignPos); 164 void ScanDecimalLiteral(); 165 void ScanDecimalDigits(bool allowNumericSeparator); 166 void CheckNumberLiteralEnd(); 167 void AssignTokenEscapeError(); 168 169 inline static uint32_t HexValue(char32_t ch); 170 inline static bool IsDecimalDigit(uint32_t cp); 171 inline static bool IsHexDigit(char32_t ch); 172 inline static bool IsBinaryDigit(char32_t ch); 173 inline static bool IsOctalDigit(char32_t ch); 174 175 friend class KeywordsUtil; 176 friend class TemplateLiteralParserContext; 177 TemplateLiteralParserContext *tlCtx_ {}; 178 ArenaAllocator *allocator_; 179 const parser::ParserContext *parserContext_; 180 util::StringView source_; 181 LexerPosition pos_; 182 bool isUnderscore_ = false; 183}; 184 185class TemplateLiteralParserContext { 186public: 187 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {} 188 NO_MOVE_SEMANTIC(TemplateLiteralParserContext); 189 NO_COPY_SEMANTIC(TemplateLiteralParserContext); 190 191 ~TemplateLiteralParserContext() 192 { 193 lexer_->tlCtx_ = prev_; 194 } 195 196 void ConsumeLeftBrace() 197 { 198 braceDepth_++; 199 } 200 201 bool ConsumeRightBrace() 202 { 203 braceDepth_--; 204 205 return braceDepth_ == 0; 206 } 207 208private: 209 Lexer *lexer_; 210 TemplateLiteralParserContext *prev_ {}; 211 size_t braceDepth_ {1}; 212}; 213 214template <char32_t end> 215void Lexer::ScanString() 216{ 217 util::UString str(Allocator()); 218 GetToken().type_ = TokenType::LITERAL_STRING; 219 220 const auto startPos = Iterator().Index(); 221 auto escapeEnd = startPos; 222 223 do { 224 char32_t cp = Iterator().Peek(); 225 226 switch (cp) { 227 case util::StringView::Iterator::INVALID_CP: { 228 ThrowError("Unterminated string"); 229 break; 230 } 231 case LEX_CHAR_CR: 232 case LEX_CHAR_LF: { 233 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 234 if constexpr (end != LEX_CHAR_BACK_TICK) { 235 ThrowError("Newline is not allowed in strings"); 236 } 237 238 GetToken().flags_ |= TokenFlags::HAS_ESCAPE; 239 str.Append(SourceView(escapeEnd, Iterator().Index())); 240 241 if (cp == LEX_CHAR_CR) { 242 Iterator().Forward(1); 243 244 if (Iterator().Peek() != LEX_CHAR_LF) { 245 Iterator().Backward(1); 246 } 247 } 248 249 pos_.line++; 250 str.Append(LEX_CHAR_LF); 251 Iterator().Forward(1); 252 escapeEnd = Iterator().Index(); 253 continue; 254 } 255 case LEX_CHAR_BACKSLASH: { 256 GetToken().flags_ |= TokenFlags::HAS_ESCAPE; 257 str.Append(SourceView(escapeEnd, Iterator().Index())); 258 259 Iterator().Forward(1); 260 ScanStringUnicodePart(&str); 261 escapeEnd = Iterator().Index(); 262 continue; 263 } 264 case LEX_CHAR_BACK_TICK: 265 case LEX_CHAR_SINGLE_QUOTE: 266 case LEX_CHAR_DOUBLE_QUOTE: { 267 if (end == cp) { 268 break; 269 } 270 271 Iterator().Forward(1); 272 continue; 273 } 274 case LEX_CHAR_DOLLAR_SIGN: { 275 Iterator().Forward(1); 276 277 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 278 if constexpr (end == LEX_CHAR_BACK_TICK) { 279 if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) { 280 Iterator().Backward(1); 281 break; 282 } 283 } 284 285 continue; 286 } 287 default: { 288 Iterator().SkipCp(); 289 continue; 290 } 291 } 292 293 if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) { 294 str.Append(SourceView(escapeEnd, Iterator().Index())); 295 GetToken().src_ = str.View(); 296 } else { 297 GetToken().src_ = SourceView(startPos, Iterator().Index()); 298 } 299 300 break; 301 } while (true); 302 303 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon) 304 if constexpr (end != LEX_CHAR_BACK_TICK) { 305 Iterator().Forward(1); 306 } 307} 308 309template <int N> 310char32_t Lexer::ScanHexEscape() 311{ 312 char32_t code = 0; 313 314 for (size_t i = 0; i < N; ++i) { 315 const auto cp = Iterator().Peek(); 316 Iterator().Forward(1); 317 318 if (!IsHexDigit(cp)) { 319 // Should not throw error in tagged template in ES2021 320 if (CheckTokenIsTaggedTemplate()) { 321 AssignTokenEscapeError(); 322 } else { 323 ThrowError("Invalid unicode escape sequence"); 324 } 325 } 326 327 constexpr auto MULTIPLIER = 16; 328 code = code * MULTIPLIER + HexValue(cp); 329 } 330 331 return code; 332} 333 334template <bool rangeCheck(char32_t), int radix> 335void Lexer::ScanNumberRadix(bool allowNumericSeparator) 336{ 337 double number = 0.0; 338 339 auto cp = Iterator().Peek(); 340 if (!rangeCheck(cp)) { 341 ThrowError("Invalid digit"); 342 } 343 344 bool allowNumericOnNext = true; 345 346 do { 347 cp = Iterator().Peek(); 348 if (rangeCheck(cp)) { 349 number = number * radix + HexValue(cp); 350 Iterator().Forward(1); 351 allowNumericOnNext = true; 352 continue; 353 } 354 355 if (cp == LEX_CHAR_UNDERSCORE) { 356 if (!allowNumericSeparator || !allowNumericOnNext) { 357 ThrowError("Invalid numeric separator"); 358 } 359 360 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE; 361 Iterator().Forward(1); 362 allowNumericOnNext = false; 363 continue; 364 } 365 366 if (!allowNumericOnNext) { 367 Iterator().Backward(1); 368 ThrowError("Numeric separators are not allowed at the end of numeric literals"); 369 } 370 371 break; 372 } while (true); 373 374 GetToken().number_ = number; 375} 376 377template <TokenType keyword_type> 378void Lexer::CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags) 379{ 380 // NOLINTNEXTLINE 381 if constexpr (keyword_type == TokenType::KEYW_AWAIT) { 382 CheckAwaitKeyword(); 383 return; 384 } 385 386 if constexpr (keyword_type == TokenType::KEYW_ARGUMENTS) { 387 CheckArgumentsKeyword(); 388 } 389 390 // NOLINTNEXTLINE 391 if constexpr (keyword_type == TokenType::KEYW_ENUM) { 392 CheckEnumKeyword(); 393 return; 394 } 395 396 // NOLINTNEXTLINE 397 if constexpr (keyword_type == TokenType::KEYW_YIELD) { 398 CheckYieldKeyword(); 399 return; 400 } 401 402 // NOLINTNEXTLINE 403 if constexpr (keyword_type == TokenType::KEYW_LET) { 404 CheckLetKeyword(); 405 return; 406 } 407 408 // NOLINTNEXTLINE 409 if constexpr (keyword_type <= TokenType::KEYW_ASYNC) { 410 CheckKeywordEscape(type); 411 return; 412 } 413 414 // NOLINTNEXTLINE 415 if constexpr (keyword_type >= TokenType::KEYW_PUBLIC) { 416 // NOLINTNEXTLINE 417 CheckFutureReservedKeyword(keyword_type); 418 return; 419 } 420 421 GetToken().type_ = TokenType::LITERAL_IDENT; 422} 423 424inline uint32_t Lexer::HexValue(char32_t ch) 425{ 426 constexpr uint32_t HEX_MASK = 0xF; 427 constexpr uint32_t DEC_OFFSET = 10; 428 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK); 429} 430 431inline bool Lexer::IsDecimalDigit(uint32_t cp) 432{ 433 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9); 434} 435 436inline bool Lexer::IsHexDigit(char32_t ch) 437{ 438 return ch < LEX_ASCII_MAX_BITS && std::isxdigit(static_cast<unsigned char>(ch)); 439} 440 441inline bool Lexer::IsBinaryDigit(char32_t ch) 442{ 443 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1; 444} 445 446inline bool Lexer::IsOctalDigit(char32_t ch) 447{ 448 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7); 449} 450 451} // namespace panda::es2panda::lexer 452 453#endif 454