1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18 
19 #include <ios>
20 #include "lexer/regexp/regexp.h"
21 #include "lexer/token/letters.h"
22 #include "lexer/token/token.h"
23 #include "util/enumbitops.h"
24 
25 namespace ark::es2panda::parser {
26 class ParserContext;
27 class ETSNolintParser;
28 }  // namespace ark::es2panda::parser
29 
30 namespace ark::es2panda::lexer {
31 class Keywords;
32 
33 using ENUMBITOPS_OPERATORS;
34 
// Option bits for Lexer::NextToken(NextTokenFlags). Every enumerator owns a distinct bit of the
// underlying uint32_t, so values can be combined as a mask.
// NOTE(review): the per-flag meanings below are read off the enumerator names - confirm in NextToken.
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,                       // no option set
    KEYWORD_TO_IDENT = 0x1U,           // bit 0: presumably lets a keyword be consumed as an identifier
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,  // bit 1: presumably permits '_' separators in numeric literals
    BIGINT_ALLOWED = 0x4U,             // bit 2: presumably permits bigint literals
};
41 
42 class LexerPosition {
43 public:
44     explicit LexerPosition(const util::StringView &source);
45     DEFAULT_COPY_SEMANTIC(LexerPosition);
46     DEFAULT_MOVE_SEMANTIC(LexerPosition);
47     ~LexerPosition() = default;
48 
Iterator()49     util::StringView::Iterator &Iterator()
50     {
51         return iterator_;
52     }
53 
Iterator() const54     const util::StringView::Iterator &Iterator() const
55     {
56         return iterator_;
57     }
58 
Line() const59     size_t Line() const
60     {
61         return line_;
62     }
63 
GetToken()64     Token &GetToken()
65     {
66         return token_;
67     }
68 
GetToken() const69     const Token &GetToken() const
70     {
71         return token_;
72     }
73 
NextTokenLine()74     size_t &NextTokenLine()
75     {
76         return nextTokenLine_;
77     }
78 
79 private:
80     friend class Lexer;
81 
82     Token token_ {};
83     util::StringView::Iterator iterator_;
84     size_t line_ {};
85     size_t nextTokenLine_ {};
86 };
87 
88 class LexerTemplateString {
89 public:
LexerTemplateString(ArenaAllocator *allocator)90     explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
91     DEFAULT_COPY_SEMANTIC(LexerTemplateString);
92     DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
93     ~LexerTemplateString() = default;
94 
95     // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
96     util::UString str;
97     size_t end {};
98     bool scanExpression {};
99     // NOLINTEND(misc-non-private-member-variables-in-classes)
100 };
101 
102 class TemplateLiteralParserContext;
103 
104 class Lexer {
105 public:
106     explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true);
107     NO_COPY_SEMANTIC(Lexer);
108     NO_MOVE_SEMANTIC(Lexer);
109     virtual ~Lexer() = default;
110 
111     // NOLINTNEXTLINE(google-default-arguments)
112     virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
113     virtual void ScanAsteriskPunctuator();
114 
115     Token &GetToken();
116     const Token &GetToken() const;
117     size_t Line() const;
118 
TryEatTokenType(lexer::TokenType type)119     bool TryEatTokenType(lexer::TokenType type)
120     {
121         auto token = GetToken();
122         if (token.Type() == type) {
123             NextToken();
124             return true;
125         }
126         return false;
127     }
128 
TryEatTokenKeyword(lexer::TokenType type)129     std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
130     {
131         auto token = GetToken();
132         if (token.KeywordType() == type) {
133             NextToken();
134             return token;
135         }
136         return std::nullopt;
137     }
138 
139     LexerPosition Save() const;
140     void Rewind(const LexerPosition &pos);
141     void BackwardToken(TokenType type, size_t offset);
142     void ForwardToken(TokenType type, size_t offset);
143 
144     char32_t Lookahead();
145     bool CheckArrow();
146 
147     RegExp ScanRegExp();
148     template <char32_t END>
149     void ScanString();
150     void ResetTokenEnd();
151     LexerTemplateString ScanTemplateString();
152     void ScanTemplateStringEnd();
153     void PushTemplateContext(TemplateLiteralParserContext *ctx);
ThrowUnexpectedStrictModeReservedKeyword() const154     [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const
155     {
156         ThrowError("Unexpected strict mode reserved keyword");
157     }
158 
159     enum class ConversionResult : uint8_t {
160         SUCCESS,
161         INVALID_ARGUMENT,
162         OUT_OF_RANGE,
163     };
164 
165     template <typename Tret, typename Ret = Tret, typename... Base>
166     static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
167                             ConversionResult &result, Base... base) noexcept
168     {
169         Ret ret {};
170         char *endPtr;
171         // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
172         struct SaveErrno {
SaveErrnoark::es2panda::ark::es2panda::lexer::Lexer::SaveErrno173             explicit SaveErrno() : errno_(errno)
174             {
175                 errno = 0;
176             }
~SaveErrnoark::es2panda::ark::es2panda::lexer::Lexer::SaveErrno177             ~SaveErrno()
178             {
179                 if (errno == 0) {
180                     errno = errno_;
181                 }
182             }
183 
184         private:
185             decltype(errno) errno_;
186         } const savedErrno;
187         // NOLINTEND(cppcoreguidelines-special-member-functions)
188 
189         const Tret tmp = converter(str, &endPtr, base...);
190 
191         bool outOfRange = false;
onstexpr(std::is_same_v<Ret, int>)192         if constexpr (std::is_same_v<Ret, int>) {
193             outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) ||
194                          tmp > static_cast<Tret>(std::numeric_limits<int>::max());
195         }
196 
197         if (endPtr == str) {
198             result = ConversionResult::INVALID_ARGUMENT;
199         } else if (errno == ERANGE || outOfRange) {
200             result = ConversionResult::OUT_OF_RANGE;
201         } else {
202             result = ConversionResult::SUCCESS;
203             ret = tmp;
204         }
205 
206         return ret;
207     }
208 
209     util::StringView SourceView(size_t begin, size_t end) const;
210 
211 protected:
212     void NextToken(Keywords *kws);
213     ArenaAllocator *Allocator();
214     bool IsLineTerminatorOrEos() const;
215     void ScanRegExpPattern();
216     RegExpFlags ScanRegExpFlags();
217 
218     [[noreturn]] void ThrowError(std::string_view message) const;
219     [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const;
220 
221     void SetTokenStart();
222     void SetTokenEnd();
223 
Iterator()224     inline util::StringView::Iterator &Iterator()
225     {
226         return pos_.iterator_;
227     }
228 
Iterator() const229     inline const util::StringView::Iterator &Iterator() const
230     {
231         return pos_.iterator_;
232     }
233 
234     util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
235 
236     void SkipWhiteSpaces();
237     void SkipSingleLineComment();
238 
239     bool ScanPunctuator();
240     void ScanQuestionPunctuator();
241     void ScanLessThanPunctuator();
242     void ScanGreaterThanPunctuator();
243     virtual void ScanEqualsPunctuator();
244     virtual void ScanExclamationPunctuator();
245     void ScanAmpersandPunctuator();
246     void ScanVLinePunctuator();
247     void ScanCircumflexPunctuator();
248     void ScanPlusPunctuator();
249     void ScanMinusPunctuator();
250     void ScanSlashPunctuator();
251     void ScanPercentPunctuator();
252     void ScanDotPunctuator();
253     void ScanColonPunctuator();
254     virtual bool ScanDollarPunctuator();
255     void ScanAtPunctuator();
256 
257     virtual void SkipMultiLineComment();
258     virtual void ScanHashMark();
259     virtual void ScanBackTick();
260 
ScanCharLiteral()261     virtual bool ScanCharLiteral()
262     {
263         return false;
264     }
265 
266     char32_t ScanUnicodeEscapeSequence();
267     template <int N, bool IN_AS = false>
268     char32_t ScanHexEscape();
269     char32_t ScanUnicodeCodePointEscape();
270 
271     void ScanStringUnicodePart(util::UString *str);
272     char32_t ScanUnicodeCharacter();
273 
274     void ScanDecimalNumbers();
275 
ScanNumberLeadingZero()276     virtual void ScanNumberLeadingZero()
277     {
278         ScanNumberLeadingZeroImpl<double>();
279     }
280 
281     template <typename RadixType, typename RadixLimit = void *>
282     bool ScanNumberLeadingZeroImpl();
283     void ScanNumberLeadingZeroImplNonAllowedCases();
284     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
285     bool ScanNumberRadix(bool allowNumericSeparator = true);
286     void ScanNumber(bool allowBigInt = true);
287     std::tuple<size_t, bool, NumberFlags> ScanCharLex(bool allowBigInt, bool parseExponent, NumberFlags flags);
288     size_t ScanSignOfNumber();
289     template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
290     bool ScanTooLargeNumber(RadixType number);
291     virtual void ConvertNumber(const std::string &utf8, NumberFlags flags);
292     void ScanDecimalLiteral();
293     void ScanDecimalDigits(bool allowNumericSeparator);
294     virtual void CheckNumberLiteralEnd();
295     void CheckOctal();
296 
297     inline static uint32_t HexValue(char32_t ch);
298     inline static bool IsDecimalDigit(uint32_t cp);
299     inline static bool IsHexDigit(char32_t ch);
300     inline static bool IsBinaryDigit(char32_t ch);
301     inline static bool IsOctalDigit(char32_t ch);
302 
303     friend class KeywordsUtil;
304     friend class TemplateLiteralParserContext;
305     friend class parser::ETSNolintParser;
306 
307     LexerPosition &Pos();
308     const LexerPosition &Pos() const;
309 
310 private:
311     TemplateLiteralParserContext *tlCtx_ {};
312     ArenaAllocator *allocator_;
313     Keywords *kws_ {};
314     const parser::ParserContext *parserContext_;
315     util::StringView source_;
316     LexerPosition pos_;
317 };
318 
319 class TemplateLiteralParserContext {
320 public:
TemplateLiteralParserContext(Lexer *lexer)321     explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
322     NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
323     NO_COPY_SEMANTIC(TemplateLiteralParserContext);
324 
~TemplateLiteralParserContext()325     ~TemplateLiteralParserContext()
326     {
327         lexer_->tlCtx_ = prev_;
328     }
329 
ConsumeLeftBrace()330     void ConsumeLeftBrace()
331     {
332         braceDepth_++;
333     }
334 
ConsumeRightBrace()335     bool ConsumeRightBrace()
336     {
337         braceDepth_--;
338 
339         return braceDepth_ == 0;
340     }
341 
342 private:
343     Lexer *lexer_;
344     TemplateLiteralParserContext *prev_ {};
345     size_t braceDepth_ {1};
346 };
347 
348 template <char32_t END>
ScanString()349 void Lexer::ScanString()
350 {
351     util::UString str(Allocator());
352     GetToken().type_ = TokenType::LITERAL_STRING;
353     GetToken().keywordType_ = TokenType::LITERAL_STRING;
354 
355     const auto startPos = Iterator().Index();
356     auto escapeEnd = startPos;
357 
358     do {
359         char32_t cp = Iterator().Peek();
360 
361         switch (cp) {
362             case util::StringView::Iterator::INVALID_CP: {
363                 ThrowError("Unterminated string");
364                 break;
365             }
366             case LEX_CHAR_CR:
367             case LEX_CHAR_LF: {
368                 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
369                 if constexpr (END != LEX_CHAR_BACK_TICK) {
370                     ThrowError("Newline is not allowed in strings");
371                 }
372 
373                 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
374                 str.Append(SourceView(escapeEnd, Iterator().Index()));
375 
376                 if (cp == LEX_CHAR_CR) {
377                     Iterator().Forward(1);
378 
379                     if (Iterator().Peek() != LEX_CHAR_LF) {
380                         Iterator().Backward(1);
381                     }
382                 }
383 
384                 pos_.line_++;
385                 str.Append(LEX_CHAR_LF);
386                 Iterator().Forward(1);
387                 escapeEnd = Iterator().Index();
388                 continue;
389             }
390             case LEX_CHAR_BACKSLASH: {
391                 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
392                 str.Append(SourceView(escapeEnd, Iterator().Index()));
393 
394                 Iterator().Forward(1);
395                 ScanStringUnicodePart(&str);
396                 escapeEnd = Iterator().Index();
397                 continue;
398             }
399             case LEX_CHAR_BACK_TICK:
400             case LEX_CHAR_SINGLE_QUOTE:
401             case LEX_CHAR_DOUBLE_QUOTE: {
402                 if (END == cp) {
403                     break;
404                 }
405 
406                 Iterator().Forward(1);
407                 continue;
408             }
409             case LEX_CHAR_DOLLAR_SIGN: {
410                 Iterator().Forward(1);
411 
412                 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
413                 if constexpr (END == LEX_CHAR_BACK_TICK) {
414                     if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
415                         Iterator().Backward(1);
416                         break;
417                     }
418                 }
419 
420                 continue;
421             }
422             default: {
423                 Iterator().SkipCp();
424                 continue;
425             }
426         }
427 
428         if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
429             str.Append(SourceView(escapeEnd, Iterator().Index()));
430             GetToken().src_ = str.View();
431         } else {
432             GetToken().src_ = SourceView(startPos, Iterator().Index());
433         }
434 
435         break;
436     } while (true);
437 
438     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
439     if constexpr (END != LEX_CHAR_BACK_TICK) {
440         Iterator().Forward(1);
441     }
442 }
443 
444 template <int N, bool IN_AS>
ScanHexEscape()445 char32_t Lexer::ScanHexEscape()
446 {
447     char32_t code = 0;
448 
449     for (size_t i = 0; i < N; ++i) {
450         const auto cp = Iterator().Peek();
451         if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
452             break;
453         }
454 
455         Iterator().Forward(1);
456 
457         if (!IsHexDigit(cp)) {
458             ThrowError("Invalid unicode escape sequence");
459         }
460 
461         constexpr auto MULTIPLIER = 16;
462         code = code * MULTIPLIER + HexValue(cp);
463     }
464 
465     return code;
466 }
467 
468 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl()469 bool Lexer::ScanNumberLeadingZeroImpl()
470 {
471     GetToken().type_ = TokenType::LITERAL_NUMBER;
472     GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
473 
474     switch (Iterator().Peek()) {
475         case LEX_CHAR_LOWERCASE_X:
476         case LEX_CHAR_UPPERCASE_X: {
477             Iterator().Forward(1);
478             constexpr auto RADIX = 16;
479             if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>()) {
480                 return false;
481             }
482             CheckNumberLiteralEnd();
483             return true;
484         }
485         case LEX_CHAR_LOWERCASE_B:
486         case LEX_CHAR_UPPERCASE_B: {
487             Iterator().Forward(1);
488             constexpr auto RADIX = 2;
489             if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>()) {
490                 return false;
491             }
492             CheckNumberLiteralEnd();
493             return true;
494         }
495         case LEX_CHAR_LOWERCASE_O:
496         case LEX_CHAR_UPPERCASE_O: {
497             Iterator().Forward(1);
498             constexpr auto RADIX = 8;
499             if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>()) {
500                 return false;
501             }
502             CheckOctal();
503             CheckNumberLiteralEnd();
504             return true;
505         }
506         default: {
507             ScanNumberLeadingZeroImplNonAllowedCases();
508             break;
509         }
510     }
511 
512     ScanNumber();
513     return true;
514 }
515 
516 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanTooLargeNumber([[maybe_unused]] RadixType number)517 bool Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number)
518 {
519     if constexpr (std::is_arithmetic_v<RadixLimit>) {
520         if (number > std::numeric_limits<RadixLimit>::max() / RADIX) {
521             return false;
522         }
523     }
524     return true;
525 }
526 
527 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool allowNumericSeparator)528 bool Lexer::ScanNumberRadix(bool allowNumericSeparator)
529 {
530     RadixType number {};
531 
532     auto cp = Iterator().Peek();
533     if (!RANGE_CHECK(cp)) {
534         ThrowError("Invalid digit");
535     }
536 
537     bool allowNumericOnNext = true;
538 
539     do {
540         cp = Iterator().Peek();
541         if (RANGE_CHECK(cp)) {
542             auto digit = HexValue(cp);
543 
544             if (!ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number)) {
545                 return false;
546             }
547 
548             number = number * RADIX + digit;
549             Iterator().Forward(1);
550             allowNumericOnNext = true;
551             continue;
552         }
553 
554         if (cp == LEX_CHAR_UNDERSCORE) {
555             if (!allowNumericSeparator || !allowNumericOnNext) {
556                 ThrowError("Invalid numeric separator");
557             }
558 
559             GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
560             Iterator().Forward(1);
561             allowNumericOnNext = false;
562             continue;
563         }
564 
565         if (!allowNumericOnNext) {
566             Iterator().Backward(1);
567             ThrowError("Numeric separators are not allowed at the end of numeric literals");
568         }
569 
570         break;
571     } while (true);
572 
573     GetToken().number_ = lexer::Number(number);
574     return true;
575 }
576 
HexValue(char32_t ch)577 inline uint32_t Lexer::HexValue(char32_t ch)
578 {
579     constexpr uint32_t HEX_MASK = 0xF;
580     constexpr uint32_t DEC_OFFSET = 10;
581     return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
582 }
583 
IsDecimalDigit(uint32_t cp)584 inline bool Lexer::IsDecimalDigit(uint32_t cp)
585 {
586     return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
587 }
588 
IsHexDigit(char32_t ch)589 inline bool Lexer::IsHexDigit(char32_t ch)
590 {
591     return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
592 }
593 
IsBinaryDigit(char32_t ch)594 inline bool Lexer::IsBinaryDigit(char32_t ch)
595 {
596     return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
597 }
598 
IsOctalDigit(char32_t ch)599 inline bool Lexer::IsOctalDigit(char32_t ch)
600 {
601     return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
602 }
603 }  // namespace ark::es2panda::lexer
604 
605 template <>
606 struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
607 };
608 
609 #endif
610