1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18
19 #include <ios>
20 #include "lexer/regexp/regexp.h"
21 #include "lexer/token/letters.h"
22 #include "lexer/token/token.h"
23 #include "util/enumbitops.h"
24
25 namespace ark::es2panda::parser {
26 class ParserContext;
27 class ETSNolintParser;
28 } // namespace ark::es2panda::parser
29
30 namespace ark::es2panda::lexer {
31 class Keywords;
32
33 using ENUMBITOPS_OPERATORS;
34
/**
 * Option bits accepted by Lexer::NextToken(NextTokenFlags).
 *
 * Every enumerator other than NONE occupies a distinct bit of the uint32_t
 * underlying type.
 * NOTE(review): the exact effect of each flag is decided by the lexer
 * implementation, which is not visible in this header - confirm there.
 */
enum class NextTokenFlags : uint32_t {
    NONE = 0x0U,
    KEYWORD_TO_IDENT = 0x1U,
    NUMERIC_SEPARATOR_ALLOWED = 0x2U,
    BIGINT_ALLOWED = 0x4U,
};
41
42 class LexerPosition {
43 public:
44 explicit LexerPosition(const util::StringView &source);
45 DEFAULT_COPY_SEMANTIC(LexerPosition);
46 DEFAULT_MOVE_SEMANTIC(LexerPosition);
47 ~LexerPosition() = default;
48
Iterator()49 util::StringView::Iterator &Iterator()
50 {
51 return iterator_;
52 }
53
Iterator() const54 const util::StringView::Iterator &Iterator() const
55 {
56 return iterator_;
57 }
58
Line() const59 size_t Line() const
60 {
61 return line_;
62 }
63
GetToken()64 Token &GetToken()
65 {
66 return token_;
67 }
68
GetToken() const69 const Token &GetToken() const
70 {
71 return token_;
72 }
73
NextTokenLine()74 size_t &NextTokenLine()
75 {
76 return nextTokenLine_;
77 }
78
79 private:
80 friend class Lexer;
81
82 Token token_ {};
83 util::StringView::Iterator iterator_;
84 size_t line_ {};
85 size_t nextTokenLine_ {};
86 };
87
88 class LexerTemplateString {
89 public:
LexerTemplateString(ArenaAllocator *allocator)90 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
91 DEFAULT_COPY_SEMANTIC(LexerTemplateString);
92 DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
93 ~LexerTemplateString() = default;
94
95 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
96 util::UString str;
97 size_t end {};
98 bool scanExpression {};
99 // NOLINTEND(misc-non-private-member-variables-in-classes)
100 };
101
102 class TemplateLiteralParserContext;
103
104 class Lexer {
105 public:
106 explicit Lexer(const parser::ParserContext *parserContext, bool startLexer = true);
107 NO_COPY_SEMANTIC(Lexer);
108 NO_MOVE_SEMANTIC(Lexer);
109 virtual ~Lexer() = default;
110
111 // NOLINTNEXTLINE(google-default-arguments)
112 virtual void NextToken(NextTokenFlags flags = NextTokenFlags::NONE);
113 virtual void ScanAsteriskPunctuator();
114
115 Token &GetToken();
116 const Token &GetToken() const;
117 size_t Line() const;
118
TryEatTokenType(lexer::TokenType type)119 bool TryEatTokenType(lexer::TokenType type)
120 {
121 auto token = GetToken();
122 if (token.Type() == type) {
123 NextToken();
124 return true;
125 }
126 return false;
127 }
128
TryEatTokenKeyword(lexer::TokenType type)129 std::optional<Token> TryEatTokenKeyword(lexer::TokenType type)
130 {
131 auto token = GetToken();
132 if (token.KeywordType() == type) {
133 NextToken();
134 return token;
135 }
136 return std::nullopt;
137 }
138
139 LexerPosition Save() const;
140 void Rewind(const LexerPosition &pos);
141 void BackwardToken(TokenType type, size_t offset);
142 void ForwardToken(TokenType type, size_t offset);
143
144 char32_t Lookahead();
145 bool CheckArrow();
146
147 RegExp ScanRegExp();
148 template <char32_t END>
149 void ScanString();
150 void ResetTokenEnd();
151 LexerTemplateString ScanTemplateString();
152 void ScanTemplateStringEnd();
153 void PushTemplateContext(TemplateLiteralParserContext *ctx);
ThrowUnexpectedStrictModeReservedKeyword() const154 [[noreturn]] void ThrowUnexpectedStrictModeReservedKeyword() const
155 {
156 ThrowError("Unexpected strict mode reserved keyword");
157 }
158
159 enum class ConversionResult : uint8_t {
160 SUCCESS,
161 INVALID_ARGUMENT,
162 OUT_OF_RANGE,
163 };
164
165 template <typename Tret, typename Ret = Tret, typename... Base>
166 static Ret StrToNumeric(Tret (*converter)(const char *, char **, Base...), const char *str,
167 ConversionResult &result, Base... base) noexcept
168 {
169 Ret ret {};
170 char *endPtr;
171 // NOLINTBEGIN(cppcoreguidelines-special-member-functions)
172 struct SaveErrno {
SaveErrnoark::es2panda::ark::es2panda::lexer::Lexer::SaveErrno173 explicit SaveErrno() : errno_(errno)
174 {
175 errno = 0;
176 }
~SaveErrnoark::es2panda::ark::es2panda::lexer::Lexer::SaveErrno177 ~SaveErrno()
178 {
179 if (errno == 0) {
180 errno = errno_;
181 }
182 }
183
184 private:
185 decltype(errno) errno_;
186 } const savedErrno;
187 // NOLINTEND(cppcoreguidelines-special-member-functions)
188
189 const Tret tmp = converter(str, &endPtr, base...);
190
191 bool outOfRange = false;
onstexpr(std::is_same_v<Ret, int>)192 if constexpr (std::is_same_v<Ret, int>) {
193 outOfRange = tmp < static_cast<Tret>(std::numeric_limits<int>::min()) ||
194 tmp > static_cast<Tret>(std::numeric_limits<int>::max());
195 }
196
197 if (endPtr == str) {
198 result = ConversionResult::INVALID_ARGUMENT;
199 } else if (errno == ERANGE || outOfRange) {
200 result = ConversionResult::OUT_OF_RANGE;
201 } else {
202 result = ConversionResult::SUCCESS;
203 ret = tmp;
204 }
205
206 return ret;
207 }
208
209 util::StringView SourceView(size_t begin, size_t end) const;
210
211 protected:
212 void NextToken(Keywords *kws);
213 ArenaAllocator *Allocator();
214 bool IsLineTerminatorOrEos() const;
215 void ScanRegExpPattern();
216 RegExpFlags ScanRegExpFlags();
217
218 [[noreturn]] void ThrowError(std::string_view message) const;
219 [[noreturn]] void ThrowUnexpectedToken(lexer::TokenType tokenType) const;
220
221 void SetTokenStart();
222 void SetTokenEnd();
223
Iterator()224 inline util::StringView::Iterator &Iterator()
225 {
226 return pos_.iterator_;
227 }
228
Iterator() const229 inline const util::StringView::Iterator &Iterator() const
230 {
231 return pos_.iterator_;
232 }
233
234 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
235
236 void SkipWhiteSpaces();
237 void SkipSingleLineComment();
238
239 bool ScanPunctuator();
240 void ScanQuestionPunctuator();
241 void ScanLessThanPunctuator();
242 void ScanGreaterThanPunctuator();
243 virtual void ScanEqualsPunctuator();
244 virtual void ScanExclamationPunctuator();
245 void ScanAmpersandPunctuator();
246 void ScanVLinePunctuator();
247 void ScanCircumflexPunctuator();
248 void ScanPlusPunctuator();
249 void ScanMinusPunctuator();
250 void ScanSlashPunctuator();
251 void ScanPercentPunctuator();
252 void ScanDotPunctuator();
253 void ScanColonPunctuator();
254 virtual bool ScanDollarPunctuator();
255 void ScanAtPunctuator();
256
257 virtual void SkipMultiLineComment();
258 virtual void ScanHashMark();
259 virtual void ScanBackTick();
260
ScanCharLiteral()261 virtual bool ScanCharLiteral()
262 {
263 return false;
264 }
265
266 char32_t ScanUnicodeEscapeSequence();
267 template <int N, bool IN_AS = false>
268 char32_t ScanHexEscape();
269 char32_t ScanUnicodeCodePointEscape();
270
271 void ScanStringUnicodePart(util::UString *str);
272 char32_t ScanUnicodeCharacter();
273
274 void ScanDecimalNumbers();
275
ScanNumberLeadingZero()276 virtual void ScanNumberLeadingZero()
277 {
278 ScanNumberLeadingZeroImpl<double>();
279 }
280
281 template <typename RadixType, typename RadixLimit = void *>
282 bool ScanNumberLeadingZeroImpl();
283 void ScanNumberLeadingZeroImplNonAllowedCases();
284 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
285 bool ScanNumberRadix(bool allowNumericSeparator = true);
286 void ScanNumber(bool allowBigInt = true);
287 std::tuple<size_t, bool, NumberFlags> ScanCharLex(bool allowBigInt, bool parseExponent, NumberFlags flags);
288 size_t ScanSignOfNumber();
289 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
290 bool ScanTooLargeNumber(RadixType number);
291 virtual void ConvertNumber(const std::string &utf8, NumberFlags flags);
292 void ScanDecimalLiteral();
293 void ScanDecimalDigits(bool allowNumericSeparator);
294 virtual void CheckNumberLiteralEnd();
295 void CheckOctal();
296
297 inline static uint32_t HexValue(char32_t ch);
298 inline static bool IsDecimalDigit(uint32_t cp);
299 inline static bool IsHexDigit(char32_t ch);
300 inline static bool IsBinaryDigit(char32_t ch);
301 inline static bool IsOctalDigit(char32_t ch);
302
303 friend class KeywordsUtil;
304 friend class TemplateLiteralParserContext;
305 friend class parser::ETSNolintParser;
306
307 LexerPosition &Pos();
308 const LexerPosition &Pos() const;
309
310 private:
311 TemplateLiteralParserContext *tlCtx_ {};
312 ArenaAllocator *allocator_;
313 Keywords *kws_ {};
314 const parser::ParserContext *parserContext_;
315 util::StringView source_;
316 LexerPosition pos_;
317 };
318
319 class TemplateLiteralParserContext {
320 public:
TemplateLiteralParserContext(Lexer *lexer)321 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
322 NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
323 NO_COPY_SEMANTIC(TemplateLiteralParserContext);
324
~TemplateLiteralParserContext()325 ~TemplateLiteralParserContext()
326 {
327 lexer_->tlCtx_ = prev_;
328 }
329
ConsumeLeftBrace()330 void ConsumeLeftBrace()
331 {
332 braceDepth_++;
333 }
334
ConsumeRightBrace()335 bool ConsumeRightBrace()
336 {
337 braceDepth_--;
338
339 return braceDepth_ == 0;
340 }
341
342 private:
343 Lexer *lexer_;
344 TemplateLiteralParserContext *prev_ {};
345 size_t braceDepth_ {1};
346 };
347
348 template <char32_t END>
ScanString()349 void Lexer::ScanString()
350 {
351 util::UString str(Allocator());
352 GetToken().type_ = TokenType::LITERAL_STRING;
353 GetToken().keywordType_ = TokenType::LITERAL_STRING;
354
355 const auto startPos = Iterator().Index();
356 auto escapeEnd = startPos;
357
358 do {
359 char32_t cp = Iterator().Peek();
360
361 switch (cp) {
362 case util::StringView::Iterator::INVALID_CP: {
363 ThrowError("Unterminated string");
364 break;
365 }
366 case LEX_CHAR_CR:
367 case LEX_CHAR_LF: {
368 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
369 if constexpr (END != LEX_CHAR_BACK_TICK) {
370 ThrowError("Newline is not allowed in strings");
371 }
372
373 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
374 str.Append(SourceView(escapeEnd, Iterator().Index()));
375
376 if (cp == LEX_CHAR_CR) {
377 Iterator().Forward(1);
378
379 if (Iterator().Peek() != LEX_CHAR_LF) {
380 Iterator().Backward(1);
381 }
382 }
383
384 pos_.line_++;
385 str.Append(LEX_CHAR_LF);
386 Iterator().Forward(1);
387 escapeEnd = Iterator().Index();
388 continue;
389 }
390 case LEX_CHAR_BACKSLASH: {
391 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
392 str.Append(SourceView(escapeEnd, Iterator().Index()));
393
394 Iterator().Forward(1);
395 ScanStringUnicodePart(&str);
396 escapeEnd = Iterator().Index();
397 continue;
398 }
399 case LEX_CHAR_BACK_TICK:
400 case LEX_CHAR_SINGLE_QUOTE:
401 case LEX_CHAR_DOUBLE_QUOTE: {
402 if (END == cp) {
403 break;
404 }
405
406 Iterator().Forward(1);
407 continue;
408 }
409 case LEX_CHAR_DOLLAR_SIGN: {
410 Iterator().Forward(1);
411
412 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
413 if constexpr (END == LEX_CHAR_BACK_TICK) {
414 if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
415 Iterator().Backward(1);
416 break;
417 }
418 }
419
420 continue;
421 }
422 default: {
423 Iterator().SkipCp();
424 continue;
425 }
426 }
427
428 if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
429 str.Append(SourceView(escapeEnd, Iterator().Index()));
430 GetToken().src_ = str.View();
431 } else {
432 GetToken().src_ = SourceView(startPos, Iterator().Index());
433 }
434
435 break;
436 } while (true);
437
438 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
439 if constexpr (END != LEX_CHAR_BACK_TICK) {
440 Iterator().Forward(1);
441 }
442 }
443
444 template <int N, bool IN_AS>
ScanHexEscape()445 char32_t Lexer::ScanHexEscape()
446 {
447 char32_t code = 0;
448
449 for (size_t i = 0; i < N; ++i) {
450 const auto cp = Iterator().Peek();
451 if (IN_AS && cp == LEX_CHAR_BACK_TICK) {
452 break;
453 }
454
455 Iterator().Forward(1);
456
457 if (!IsHexDigit(cp)) {
458 ThrowError("Invalid unicode escape sequence");
459 }
460
461 constexpr auto MULTIPLIER = 16;
462 code = code * MULTIPLIER + HexValue(cp);
463 }
464
465 return code;
466 }
467
468 template <typename RadixType, typename RadixLimit>
ScanNumberLeadingZeroImpl()469 bool Lexer::ScanNumberLeadingZeroImpl()
470 {
471 GetToken().type_ = TokenType::LITERAL_NUMBER;
472 GetToken().keywordType_ = TokenType::LITERAL_NUMBER;
473
474 switch (Iterator().Peek()) {
475 case LEX_CHAR_LOWERCASE_X:
476 case LEX_CHAR_UPPERCASE_X: {
477 Iterator().Forward(1);
478 constexpr auto RADIX = 16;
479 if (!ScanNumberRadix<IsHexDigit, RADIX, RadixType, RadixLimit>()) {
480 return false;
481 }
482 CheckNumberLiteralEnd();
483 return true;
484 }
485 case LEX_CHAR_LOWERCASE_B:
486 case LEX_CHAR_UPPERCASE_B: {
487 Iterator().Forward(1);
488 constexpr auto RADIX = 2;
489 if (!ScanNumberRadix<IsBinaryDigit, RADIX, RadixType, RadixLimit>()) {
490 return false;
491 }
492 CheckNumberLiteralEnd();
493 return true;
494 }
495 case LEX_CHAR_LOWERCASE_O:
496 case LEX_CHAR_UPPERCASE_O: {
497 Iterator().Forward(1);
498 constexpr auto RADIX = 8;
499 if (!ScanNumberRadix<IsOctalDigit, RADIX, RadixType, RadixLimit>()) {
500 return false;
501 }
502 CheckOctal();
503 CheckNumberLiteralEnd();
504 return true;
505 }
506 default: {
507 ScanNumberLeadingZeroImplNonAllowedCases();
508 break;
509 }
510 }
511
512 ScanNumber();
513 return true;
514 }
515
516 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanTooLargeNumber([[maybe_unused]] RadixType number)517 bool Lexer::ScanTooLargeNumber([[maybe_unused]] RadixType number)
518 {
519 if constexpr (std::is_arithmetic_v<RadixLimit>) {
520 if (number > std::numeric_limits<RadixLimit>::max() / RADIX) {
521 return false;
522 }
523 }
524 return true;
525 }
526
527 template <bool RANGE_CHECK(char32_t), int RADIX, typename RadixType, typename RadixLimit>
ScanNumberRadix(bool allowNumericSeparator)528 bool Lexer::ScanNumberRadix(bool allowNumericSeparator)
529 {
530 RadixType number {};
531
532 auto cp = Iterator().Peek();
533 if (!RANGE_CHECK(cp)) {
534 ThrowError("Invalid digit");
535 }
536
537 bool allowNumericOnNext = true;
538
539 do {
540 cp = Iterator().Peek();
541 if (RANGE_CHECK(cp)) {
542 auto digit = HexValue(cp);
543
544 if (!ScanTooLargeNumber<RANGE_CHECK, RADIX, RadixType, RadixLimit>(number)) {
545 return false;
546 }
547
548 number = number * RADIX + digit;
549 Iterator().Forward(1);
550 allowNumericOnNext = true;
551 continue;
552 }
553
554 if (cp == LEX_CHAR_UNDERSCORE) {
555 if (!allowNumericSeparator || !allowNumericOnNext) {
556 ThrowError("Invalid numeric separator");
557 }
558
559 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
560 Iterator().Forward(1);
561 allowNumericOnNext = false;
562 continue;
563 }
564
565 if (!allowNumericOnNext) {
566 Iterator().Backward(1);
567 ThrowError("Numeric separators are not allowed at the end of numeric literals");
568 }
569
570 break;
571 } while (true);
572
573 GetToken().number_ = lexer::Number(number);
574 return true;
575 }
576
HexValue(char32_t ch)577 inline uint32_t Lexer::HexValue(char32_t ch)
578 {
579 constexpr uint32_t HEX_MASK = 0xF;
580 constexpr uint32_t DEC_OFFSET = 10;
581 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
582 }
583
IsDecimalDigit(uint32_t cp)584 inline bool Lexer::IsDecimalDigit(uint32_t cp)
585 {
586 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
587 }
588
IsHexDigit(char32_t ch)589 inline bool Lexer::IsHexDigit(char32_t ch)
590 {
591 return ch < LEX_ASCII_MAX_BITS && (std::isxdigit(static_cast<unsigned char>(ch)) != 0);
592 }
593
IsBinaryDigit(char32_t ch)594 inline bool Lexer::IsBinaryDigit(char32_t ch)
595 {
596 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
597 }
598
IsOctalDigit(char32_t ch)599 inline bool Lexer::IsOctalDigit(char32_t ch)
600 {
601 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
602 }
603 } // namespace ark::es2panda::lexer
604
605 template <>
606 struct enumbitops::IsAllowedType<ark::es2panda::lexer::NextTokenFlags> : std::true_type {
607 };
608
609 #endif
610