1/**
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef ES2PANDA_PARSER_CORE_LEXER_H
17#define ES2PANDA_PARSER_CORE_LEXER_H
18
19#include <lexer/regexp/regexp.h>
20#include <lexer/token/letters.h>
21#include <lexer/token/token.h>
22#include <util/enumbitops.h>
23
// Forward declaration only: within this header ParserContext is referred to solely
// through a pointer (the Lexer constructor argument and the parserContext_ member),
// so its full definition is not required here.
namespace panda::es2panda::parser {
class ParserContext;
}  // namespace panda::es2panda::parser
27
28namespace panda::es2panda::lexer {
29
/**
 * Option bits accepted by Lexer::NextToken(). Each enumerator occupies its own bit
 * of the uint8_t underlying type; NONE is the value NextToken() uses by default.
 */
enum class LexerNextTokenFlags : uint8_t {
    NONE = 0U,
    KEYWORD_TO_IDENT = 1U << 0U,           // bit 0
    NUMERIC_SEPARATOR_ALLOWED = 1U << 1U,  // bit 1
    BIGINT_ALLOWED = 1U << 2U,             // bit 2
};
36
// Project macro applied to LexerNextTokenFlags; presumably generates the bitwise
// operators for the scoped enum (likely from util/enumbitops.h) — TODO confirm.
DEFINE_BITOPS(LexerNextTokenFlags)
38
39class LexerPosition {
40public:
41    explicit LexerPosition(const util::StringView &source);
42    DEFAULT_COPY_SEMANTIC(LexerPosition);
43    DEFAULT_MOVE_SEMANTIC(LexerPosition);
44    ~LexerPosition() = default;
45
46    Token token {};
47    util::StringView::Iterator iterator;
48    size_t line {};
49    size_t nextTokenLine {};
50};
51
52class LexerTemplateString {
53public:
54    explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
55    DEFAULT_COPY_SEMANTIC(LexerTemplateString);
56    DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
57    ~LexerTemplateString() = default;
58
59    util::UString str;
60    size_t end {};
61    bool scanExpression {};
62};
63
64class TemplateLiteralParserContext;
65
66class Lexer {
67public:
68    explicit Lexer(const parser::ParserContext *parserContext);
69    NO_COPY_SEMANTIC(Lexer);
70    NO_MOVE_SEMANTIC(Lexer);
71    ~Lexer() = default;
72
73    void NextToken(LexerNextTokenFlags flags = LexerNextTokenFlags::NONE);
74
75    Token &GetToken();
76    const Token &GetToken() const;
77    size_t Line() const;
78
79    LexerPosition Save() const;
80    void Rewind(const LexerPosition &pos);
81    void BackwardToken(TokenType type, size_t offset);
82    void ForwardToken(TokenType type, size_t offset);
83
84    char32_t Lookahead();
85    bool CheckArrow();
86
87    RegExp ScanRegExp();
88    template <char32_t end>
89    void ScanString();
90    void ResetTokenEnd();
91    LexerTemplateString ScanTemplateString();
92    void ScanTemplateStringEnd();
93    void PushTemplateContext(TemplateLiteralParserContext *ctx);
94    void AssignTokenTaggedTemplate();
95
96private:
97    ArenaAllocator *Allocator();
98    bool IsLineTerminatorOrEos() const;
99    void ScanRegExpPattern();
100    bool GetRegExpFlag(char32_t cp, RegExpFlags &flag);
101    RegExpFlags ScanRegExpFlags();
102
103    void ThrowError(std::string_view message);
104
105    void SetTokenStart();
106    void SetTokenEnd();
107    bool CheckTokenIsTaggedTemplate() const;
108
109    inline util::StringView::Iterator &Iterator()
110    {
111        return pos_.iterator;
112    }
113
114    inline const util::StringView::Iterator &Iterator() const
115    {
116        return pos_.iterator;
117    }
118
119    util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
120    util::StringView SourceView(size_t begin, size_t end) const;
121
122    void SkipWhiteSpaces();
123    void SkipSingleLineComment();
124    void SkipMultiLineComment();
125    template <TokenType keyword_type>
126    void CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags);
127    void CheckKeywordEscape(TokenType type);
128    void CheckAwaitKeyword();
129    void CheckArgumentsKeyword();
130    void CheckEnumKeyword();
131    void CheckLetKeyword();
132    void CheckYieldKeyword();
133    void CheckFutureReservedKeyword(TokenType keywordType);
134
135    bool ScanPunctuator();
136    void ScanQuestionPunctuator();
137    void ScanLessThanPunctuator();
138    void ScanGreaterThanPunctuator();
139    void ScanEqualsPunctuator();
140    void ScanExclamationPunctuator();
141    void ScanAmpersandPunctuator();
142    void ScanVLinePunctuator();
143    void ScanCircumflexPunctuator();
144    void ScanPlusPunctuator();
145    void ScanMinusPunctuator();
146    void ScanAsterixPunctuator();
147    void ScanSlashPunctuator();
148    void ScanPercentPunctuator();
149    void ScanDotPunctuator();
150
151    char32_t ScanUnicodeEscapeSequence();
152    template <int N>
153    char32_t ScanHexEscape();
154    char32_t ScanUnicodeCodePointEscape();
155
156    void ScanStringUnicodePart(util::UString *str);
157
158    void ScanNumberLeadingZero();
159    void ScanDecimalNumbers(bool allowNumericSeparator);
160    template <bool rangeCheck(char32_t), int radix>
161    void ScanNumberRadix(bool allowNumericSeparator = true);
162    void ScanNumber(bool allowNumericSeparator = true, bool allowBigInt = true);
163    void ConvertNumber(size_t exponentSignPos);
164    void ScanDecimalLiteral();
165    void ScanDecimalDigits(bool allowNumericSeparator);
166    void CheckNumberLiteralEnd();
167    void AssignTokenEscapeError();
168
169    inline static uint32_t HexValue(char32_t ch);
170    inline static bool IsDecimalDigit(uint32_t cp);
171    inline static bool IsHexDigit(char32_t ch);
172    inline static bool IsBinaryDigit(char32_t ch);
173    inline static bool IsOctalDigit(char32_t ch);
174
175    friend class KeywordsUtil;
176    friend class TemplateLiteralParserContext;
177    TemplateLiteralParserContext *tlCtx_ {};
178    ArenaAllocator *allocator_;
179    const parser::ParserContext *parserContext_;
180    util::StringView source_;
181    LexerPosition pos_;
182    bool isUnderscore_ = false;
183};
184
185class TemplateLiteralParserContext {
186public:
187    explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
188    NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
189    NO_COPY_SEMANTIC(TemplateLiteralParserContext);
190
191    ~TemplateLiteralParserContext()
192    {
193        lexer_->tlCtx_ = prev_;
194    }
195
196    void ConsumeLeftBrace()
197    {
198        braceDepth_++;
199    }
200
201    bool ConsumeRightBrace()
202    {
203        braceDepth_--;
204
205        return braceDepth_ == 0;
206    }
207
208private:
209    Lexer *lexer_;
210    TemplateLiteralParserContext *prev_ {};
211    size_t braceDepth_ {1};
212};
213
214template <char32_t end>
215void Lexer::ScanString()
216{
217    util::UString str(Allocator());
218    GetToken().type_ = TokenType::LITERAL_STRING;
219
220    const auto startPos = Iterator().Index();
221    auto escapeEnd = startPos;
222
223    do {
224        char32_t cp = Iterator().Peek();
225
226        switch (cp) {
227            case util::StringView::Iterator::INVALID_CP: {
228                ThrowError("Unterminated string");
229                break;
230            }
231            case LEX_CHAR_CR:
232            case LEX_CHAR_LF: {
233                // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
234                if constexpr (end != LEX_CHAR_BACK_TICK) {
235                    ThrowError("Newline is not allowed in strings");
236                }
237
238                GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
239                str.Append(SourceView(escapeEnd, Iterator().Index()));
240
241                if (cp == LEX_CHAR_CR) {
242                    Iterator().Forward(1);
243
244                    if (Iterator().Peek() != LEX_CHAR_LF) {
245                        Iterator().Backward(1);
246                    }
247                }
248
249                pos_.line++;
250                str.Append(LEX_CHAR_LF);
251                Iterator().Forward(1);
252                escapeEnd = Iterator().Index();
253                continue;
254            }
255            case LEX_CHAR_BACKSLASH: {
256                GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
257                str.Append(SourceView(escapeEnd, Iterator().Index()));
258
259                Iterator().Forward(1);
260                ScanStringUnicodePart(&str);
261                escapeEnd = Iterator().Index();
262                continue;
263            }
264            case LEX_CHAR_BACK_TICK:
265            case LEX_CHAR_SINGLE_QUOTE:
266            case LEX_CHAR_DOUBLE_QUOTE: {
267                if (end == cp) {
268                    break;
269                }
270
271                Iterator().Forward(1);
272                continue;
273            }
274            case LEX_CHAR_DOLLAR_SIGN: {
275                Iterator().Forward(1);
276
277                // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
278                if constexpr (end == LEX_CHAR_BACK_TICK) {
279                    if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
280                        Iterator().Backward(1);
281                        break;
282                    }
283                }
284
285                continue;
286            }
287            default: {
288                Iterator().SkipCp();
289                continue;
290            }
291        }
292
293        if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
294            str.Append(SourceView(escapeEnd, Iterator().Index()));
295            GetToken().src_ = str.View();
296        } else {
297            GetToken().src_ = SourceView(startPos, Iterator().Index());
298        }
299
300        break;
301    } while (true);
302
303    // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
304    if constexpr (end != LEX_CHAR_BACK_TICK) {
305        Iterator().Forward(1);
306    }
307}
308
309template <int N>
310char32_t Lexer::ScanHexEscape()
311{
312    char32_t code = 0;
313
314    for (size_t i = 0; i < N; ++i) {
315        const auto cp = Iterator().Peek();
316        Iterator().Forward(1);
317
318        if (!IsHexDigit(cp)) {
319            // Should not throw error in tagged template in ES2021
320            if (CheckTokenIsTaggedTemplate()) {
321                AssignTokenEscapeError();
322            } else {
323                ThrowError("Invalid unicode escape sequence");
324            }
325        }
326
327        constexpr auto MULTIPLIER = 16;
328        code = code * MULTIPLIER + HexValue(cp);
329    }
330
331    return code;
332}
333
334template <bool rangeCheck(char32_t), int radix>
335void Lexer::ScanNumberRadix(bool allowNumericSeparator)
336{
337    double number = 0.0;
338
339    auto cp = Iterator().Peek();
340    if (!rangeCheck(cp)) {
341        ThrowError("Invalid digit");
342    }
343
344    bool allowNumericOnNext = true;
345
346    do {
347        cp = Iterator().Peek();
348        if (rangeCheck(cp)) {
349            number = number * radix + HexValue(cp);
350            Iterator().Forward(1);
351            allowNumericOnNext = true;
352            continue;
353        }
354
355        if (cp == LEX_CHAR_UNDERSCORE) {
356            if (!allowNumericSeparator || !allowNumericOnNext) {
357                ThrowError("Invalid numeric separator");
358            }
359
360            GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
361            Iterator().Forward(1);
362            allowNumericOnNext = false;
363            continue;
364        }
365
366        if (!allowNumericOnNext) {
367            Iterator().Backward(1);
368            ThrowError("Numeric separators are not allowed at the end of numeric literals");
369        }
370
371        break;
372    } while (true);
373
374    GetToken().number_ = number;
375}
376
377template <TokenType keyword_type>
378void Lexer::CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags)
379{
380    // NOLINTNEXTLINE
381    if constexpr (keyword_type == TokenType::KEYW_AWAIT) {
382        CheckAwaitKeyword();
383        return;
384    }
385
386    if constexpr (keyword_type == TokenType::KEYW_ARGUMENTS) {
387        CheckArgumentsKeyword();
388    }
389
390    // NOLINTNEXTLINE
391    if constexpr (keyword_type == TokenType::KEYW_ENUM) {
392        CheckEnumKeyword();
393        return;
394    }
395
396    // NOLINTNEXTLINE
397    if constexpr (keyword_type == TokenType::KEYW_YIELD) {
398        CheckYieldKeyword();
399        return;
400    }
401
402    // NOLINTNEXTLINE
403    if constexpr (keyword_type == TokenType::KEYW_LET) {
404        CheckLetKeyword();
405        return;
406    }
407
408    // NOLINTNEXTLINE
409    if constexpr (keyword_type <= TokenType::KEYW_ASYNC) {
410        CheckKeywordEscape(type);
411        return;
412    }
413
414    // NOLINTNEXTLINE
415    if constexpr (keyword_type >= TokenType::KEYW_PUBLIC) {
416        // NOLINTNEXTLINE
417        CheckFutureReservedKeyword(keyword_type);
418        return;
419    }
420
421    GetToken().type_ = TokenType::LITERAL_IDENT;
422}
423
424inline uint32_t Lexer::HexValue(char32_t ch)
425{
426    constexpr uint32_t HEX_MASK = 0xF;
427    constexpr uint32_t DEC_OFFSET = 10;
428    return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
429}
430
431inline bool Lexer::IsDecimalDigit(uint32_t cp)
432{
433    return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
434}
435
436inline bool Lexer::IsHexDigit(char32_t ch)
437{
438    return ch < LEX_ASCII_MAX_BITS && std::isxdigit(static_cast<unsigned char>(ch));
439}
440
441inline bool Lexer::IsBinaryDigit(char32_t ch)
442{
443    return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
444}
445
446inline bool Lexer::IsOctalDigit(char32_t ch)
447{
448    return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
449}
450
451}  // namespace panda::es2panda::lexer
452
453#endif
454