1/**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef ES2PANDA_PARSER_CORE_LEXER_H
17#define ES2PANDA_PARSER_CORE_LEXER_H
18
19#include <ios>
20#include "lexer/regexp/regexp.h"
21#include "lexer/token/letters.h"
22#include "lexer/token/token.h"
23#include "util/enumbitops.h"
24
25namespace ark::es2panda::parser {
26class ParserContext;
27class ETSNolintParser;
28}  // namespace ark::es2panda::parser
29
30namespace ark::es2panda::lexer {
31class Keywords;
32
33using ENUMBITOPS_OPERATORS;
34
// Bit flags accepted by Lexer::NextToken(); every enumerator other than NONE occupies one distinct bit.
// NOTE(review): the meaning of each flag is only suggested by its name here - confirm against the
// NextToken() implementation before relying on it.
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,
    KEYWORD_TO_IDENT = 0x1U,           // bit 0
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,  // bit 1
    BIGINT_ALLOWED = 0x4U,             // bit 2
};
41
42class LexerPosition {
43public:
44    explicit LexerPosition(const util::StringView &source);
45    DEFAULT_COPY_SEMANTIC(LexerPosition);
46    DEFAULT_MOVE_SEMANTIC(LexerPosition);
47    ~LexerPosition() = default;
48
49    util::StringView::Iterator &Iterator()
50    {
51        return iterator_;
52    }
53
54    const util::StringView::Iterator &Iterator() const
55    {
56        return iterator_;
57    }
58
59    size_t Line() const
60    {
61        return line_;
62    }
63
64    Token &GetToken()
65    {
66        return token_;
67    }
68
69    const Token &GetToken() const
70    {
71        return token_;
72    }
73
74    size_t &NextTokenLine()
75    {
76        return nextTokenLine_;
77    }
78
79private:
80    friend class Lexer;
81
82    Token token_ {};
83    util::StringView::Iterator iterator_;
84    size_t line_ {};
85    size_t nextTokenLine_ {};
86};
87
88class LexerTemplateString {
89public:
90    explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
91    DEFAULT_COPY_SEMANTIC(LexerTemplateString);
92    DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
93    ~LexerTemplateString() = default;
94
95    // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
96    util::UString str;
97    size_t end {};
98    bool scanExpression {};
99    // NOLINTEND(misc-non-private-member-variables-in-classes)
100};
101
102class TemplateLiteralParserContext;
103
104class Lexer {
105public:
106    explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true);
107    NO_COPY_SEMANTIC(Lexer);
108    NO_MOVE_SEMANTIC(Lexer);
109    virtual ~Lexer() = default;
110
111    // NOLINTNEXTLINE(google-default-arguments)
112    virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
113    virtual void ScanAsteriskPunctuator();
114
115    Token &GetToken();
116    const Token &GetToken() const;
117    size_t Line() const;
118
119    bool TryEatTokenType(lexer::TokenType type)
120    {
121        auto token = GetToken();
122        if (token.Type() == type) {
123            NextToken();
124            return true;
125        }
126        return false;
127    }
128
129    std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
130    {
131        auto token = GetToken();
132        if (token.KeywordType() == type) {
133            NextToken();
134            return token;
135        }
136        return std::nullopt;
137    }
138
139    LexerPosition Save() const;
140    void Rewind(const LexerPosition &pos);
141    void BackwardToken(TokenType type, size_t offset);
142    void ForwardToken(TokenType type, size_t offset);
143
144    char32_t Lookahead();
145    bool CheckArrow();
146
147    RegExp ScanRegExp();
148    template <char32_t END>
149    void ScanString();
150    void ResetTokenEnd();
151    LexerTemplateString ScanTemplateString();
152    void ScanTemplateStringEnd();
153    void PushTemplateContext(TemplateLiteralParserContext *ctx);
154    [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const
155    {
156        ThrowError("Unexpected strict mode reserved keyword");
157    }
158
159    enum class ConversionResult : uint8_t {
160        SUCCESS,
161        INVALID_ARGUMENT,
162        OUT_OF_RANGE,
163    };
164
165    template <typename Tret, typename Ret = Tret, typename... Base>
166    static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
167                            ConversionResult &result, Base... base) noexcept
168    {
169        Ret ret {};
170        char *endPtr;
171        // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
172        struct SaveErrno {
173            explicit SaveErrno() : errno_(errno)
174            {
175                errno = 0;
176            }
177            ~SaveErrno()
178            {
179                if (errno == 0) {
180                    errno = errno_;
181                }
182            }
183
184        private:
185            decltype(errno) errno_;
186        } const savedErrno;
187        // NOLINTEND(cppcoreguidelines-special-member-functions)
188
189        const Tret tmp = converter(str, &endPtr, base...);
190
191        bool outOfRange = false;
192        if constexpr (std::is_same_v<Ret, int>) {
193            outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) ||
194                         tmp > static_cast<Tret>(std::numeric_limits<int>::max());
195        }
196
197        if (endPtr == str) {
198            result = ConversionResult::INVALID_ARGUMENT;
199        } else if (errno == ERANGE || outOfRange) {
200            result = ConversionResult::OUT_OF_RANGE;
201        } else {
202            result = ConversionResult::SUCCESS;
203            ret = tmp;
204        }
205
206        return ret;
207    }
208
209    util::StringView SourceView(size_t begin, size_t end) const;
210
211protected:
212    void NextToken(Keywords *kws);
213    ArenaAllocator *Allocator();
214    bool IsLineTerminatorOrEos() const;
215    void ScanRegExpPattern();
216    RegExpFlags ScanRegExpFlags();
217
218    [[noreturn]] void ThrowError(std::string_view message) const;
219    [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const;
220
221    void SetTokenStart();
222    void SetTokenEnd();
223
224    inline util::StringView::Iterator &Iterator()
225    {
226        return pos_.iterator_;
227    }
228
229    inline const util::StringView::Iterator &Iterator() const
230    {
231        return pos_.iterator_;
232    }
233
234    util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
235
236    void SkipWhiteSpaces();
237    void SkipSingleLineComment();
238
239    bool ScanPunctuator();
240    void ScanQuestionPunctuator();
241    void ScanLessThanPunctuator();
242    void ScanGreaterThanPunctuator();
243    virtual void ScanEqualsPunctuator();
244    virtual void ScanExclamationPunctuator();
245    void ScanAmpersandPunctuator();
246    void ScanVLinePunctuator();
247    void ScanCircumflexPunctuator();
248    void ScanPlusPunctuator();
249    void ScanMinusPunctuator();
250    void ScanSlashPunctuator();
251    void ScanPercentPunctuator();
252    void ScanDotPunctuator();
253    void ScanColonPunctuator();
254    virtual bool ScanDollarPunctuator();
255    void ScanAtPunctuator();
256
257    virtual void SkipMultiLineComment();
258    virtual void ScanHashMark();
259    virtual void ScanBackTick();
260
261    virtual bool ScanCharLiteral()
262    {
263        return false;
264    }
265
266    char32_t ScanUnicodeEscapeSequence();
267    template <int N, bool IN_AS = false>
268    char32_t ScanHexEscape();
269    char32_t ScanUnicodeCodePointEscape();
270
271    void ScanStringUnicodePart(util::UString *str);
272    char32_t ScanUnicodeCharacter();
273
274    void ScanDecimalNumbers();
275
276    virtual void ScanNumberLeadingZero()
277    {
278        ScanNumberLeadingZeroImpl<double>();
279    }
280
281    template <typename RadixType, typename RadixLimit = void *>
282    bool ScanNumberLeadingZeroImpl();
283    void ScanNumberLeadingZeroImplNonAllowedCases();
284    template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
285    bool ScanNumberRadix(bool allowNumericSeparator = true);
286    void ScanNumber(bool allowBigInt = true);
287    std::tuple<size_t, bool, NumberFlags> ScanCharLex(bool allowBigInt, bool parseExponent, NumberFlags flags);
288    size_t ScanSignOfNumber();
289    template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
290    bool ScanTooLargeNumber(RadixType number);
291    virtual void ConvertNumber(const std::string &utf8, NumberFlags flags);
292    void ScanDecimalLiteral();
293    void ScanDecimalDigits(bool allowNumericSeparator);
294    virtual void CheckNumberLiteralEnd();
295    void CheckOctal();
296
297    inline static uint32_t HexValue(char32_t ch);
298    inline static bool IsDecimalDigit(uint32_t cp);
299    inline static bool IsHexDigit(char32_t ch);
300    inline static bool IsBinaryDigit(char32_t ch);
301    inline static bool IsOctalDigit(char32_t ch);
302
303    friend class KeywordsUtil;
304    friend class TemplateLiteralParserContext;
305    friend class parser::ETSNolintParser;
306
307    LexerPosition &Pos();
308    const LexerPosition &Pos() const;
309
310private:
311    TemplateLiteralParserContext *tlCtx_ {};
312    ArenaAllocator *allocator_;
313    Keywords *kws_ {};
314    const parser::ParserContext *parserContext_;
315    util::StringView source_;
316    LexerPosition pos_;
317};
318
319class TemplateLiteralParserContext {
320public:
321    explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
322    NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
323    NO_COPY_SEMANTIC(TemplateLiteralParserContext);
324
325    ~TemplateLiteralParserContext()
326    {
327        lexer_->tlCtx_ = prev_;
328    }
329
330    void ConsumeLeftBrace()
331    {
332        braceDepth_++;
333    }
334
335    bool ConsumeRightBrace()
336    {
337        braceDepth_--;
338
339        return braceDepth_ == 0;
340    }
341
342private:
343    Lexer *lexer_;
344    TemplateLiteralParserContext *prev_ {};
345    size_t braceDepth_ {1};
346};
347
348template <char32_t END>
349void Lexer::ScanString()
350{
351    util::UString str(Allocator());
352    GetToken().type_ = TokenType::LITERAL_STRING;
353    GetToken().keywordType_ = TokenType::LITERAL_STRING;
354
355    const auto startPos = Iterator().Index();
356    auto escapeEnd = startPos;
357
358    do {
359        char32_t cp = Iterator().Peek();
360
361        switch (cp) {
362            case util::StringView::Iterator::INVALID_CP: {
363                ThrowError("Unterminated string");
364                break;
365            }
366            case LEX_CHAR_CR:
367            case LEX_CHAR_LF: {
368                // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
369                if constexpr (END != LEX_CHAR_BACK_TICK) {
370                    ThrowError("Newline is not allowed in strings");
371                }
372
373                GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
374                str.Append(SourceView(escapeEnd, Iterator().Index()));
375
376                if (cp == LEX_CHAR_CR) {
377                    Iterator().Forward(1);
378
379                    if (Iterator().Peek() != LEX_CHAR_LF) {
380                        Iterator().Backward(1);
381                    }
382                }
383
384                pos_.line_++;
385                str.Append(LEX_CHAR_LF);
386                Iterator().Forward(1);
387                escapeEnd = Iterator().Index();
388                continue;
389            }
390            case LEX_CHAR_BACKSLASH: {
391                GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
392                str.Append(SourceView(escapeEnd, Iterator().Index()));
393
394                Iterator().Forward(1);
395                ScanStringUnicodePart(&str);
396                escapeEnd = Iterator().Index();
397                continue;
398            }
399            case LEX_CHAR_BACK_TICK:
400            case LEX_CHAR_SINGLE_QUOTE:
401            case LEX_CHAR_DOUBLE_QUOTE: {
402                if (END == cp) {
403                    break;
404                }
405
406                Iterator().Forward(1);
407                continue;
408            }
409            case LEX_CHAR_DOLLAR_SIGN: {
410                Iterator().Forward(1);
411
412                // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
413                if constexpr (END == LEX_CHAR_BACK_TICK) {
414                    if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
415                        Iterator().Backward(1);
416                        break;
417                    }
418                }
419
420                continue;
421            }
422            default: {
423                Iterator().SkipCp();
424                continue;
425            }
426        }
427
428        if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
429            str.Append(SourceView(escapeEnd, Iterator().Index()));
430            GetToken().src_ = str.View();
431        } else {
432            GetToken().src_ = SourceView(startPos, Iterator().Index());
433        }
434
435        break;
436    } while (true);
437
438    // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
439    if constexpr (END != LEX_CHAR_BACK_TICK) {
440        Iterator().Forward(1);
441    }
442}
443
444template <int N, bool IN_AS>
445char32_t Lexer::ScanHexEscape()
446{
447    char32_t code = 0;
448
449    for (size_t i = 0; i < N; ++i) {
450        const auto cp = Iterator().Peek();
451        if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
452            break;
453        }
454
455        Iterator().Forward(1);
456
457        if (!IsHexDigit(cp)) {
458            ThrowError("Invalid unicode escape sequence");
459        }
460
461        constexpr auto MULTIPLIER = 16;
462        code = code * MULTIPLIER + HexValue(cp);
463    }
464
465    return code;
466}
467
468template <typename RadixType, typename RadixLimit>
469bool Lexer::ScanNumberLeadingZeroImpl()
470{
471    GetToken().type_ = TokenType::LITERAL_NUMBER;
472    GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
473
474    switch (Iterator().Peek()) {
475        case LEX_CHAR_LOWERCASE_X:
476        case LEX_CHAR_UPPERCASE_X: {
477            Iterator().Forward(1);
478            constexpr auto RADIX = 16;
479            if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>()) {
480                return false;
481            }
482            CheckNumberLiteralEnd();
483            return true;
484        }
485        case LEX_CHAR_LOWERCASE_B:
486        case LEX_CHAR_UPPERCASE_B: {
487            Iterator().Forward(1);
488            constexpr auto RADIX = 2;
489            if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>()) {
490                return false;
491            }
492            CheckNumberLiteralEnd();
493            return true;
494        }
495        case LEX_CHAR_LOWERCASE_O:
496        case LEX_CHAR_UPPERCASE_O: {
497            Iterator().Forward(1);
498            constexpr auto RADIX = 8;
499            if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>()) {
500                return false;
501            }
502            CheckOctal();
503            CheckNumberLiteralEnd();
504            return true;
505        }
506        default: {
507            ScanNumberLeadingZeroImplNonAllowedCases();
508            break;
509        }
510    }
511
512    ScanNumber();
513    return true;
514}
515
516template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
517bool Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number)
518{
519    if constexpr (std::is_arithmetic_v<RadixLimit>) {
520        if (number > std::numeric_limits<RadixLimit>::max() / RADIX) {
521            return false;
522        }
523    }
524    return true;
525}
526
527template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
528bool Lexer::ScanNumberRadix(bool allowNumericSeparator)
529{
530    RadixType number {};
531
532    auto cp = Iterator().Peek();
533    if (!RANGE_CHECK(cp)) {
534        ThrowError("Invalid digit");
535    }
536
537    bool allowNumericOnNext = true;
538
539    do {
540        cp = Iterator().Peek();
541        if (RANGE_CHECK(cp)) {
542            auto digit = HexValue(cp);
543
544            if (!ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number)) {
545                return false;
546            }
547
548            number = number * RADIX + digit;
549            Iterator().Forward(1);
550            allowNumericOnNext = true;
551            continue;
552        }
553
554        if (cp == LEX_CHAR_UNDERSCORE) {
555            if (!allowNumericSeparator || !allowNumericOnNext) {
556                ThrowError("Invalid numeric separator");
557            }
558
559            GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
560            Iterator().Forward(1);
561            allowNumericOnNext = false;
562            continue;
563        }
564
565        if (!allowNumericOnNext) {
566            Iterator().Backward(1);
567            ThrowError("Numeric separators are not allowed at the end of numeric literals");
568        }
569
570        break;
571    } while (true);
572
573    GetToken().number_ = lexer::Number(number);
574    return true;
575}
576
577inline uint32_t Lexer::HexValue(char32_t ch)
578{
579    constexpr uint32_t HEX_MASK = 0xF;
580    constexpr uint32_t DEC_OFFSET = 10;
581    return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
582}
583
584inline bool Lexer::IsDecimalDigit(uint32_t cp)
585{
586    return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
587}
588
589inline bool Lexer::IsHexDigit(char32_t ch)
590{
591    return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
592}
593
594inline bool Lexer::IsBinaryDigit(char32_t ch)
595{
596    return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
597}
598
599inline bool Lexer::IsOctalDigit(char32_t ch)
600{
601    return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
602}
603}  // namespace ark::es2panda::lexer
604
605template <>
606struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
607};
608
609#endif
610