1 /**
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_PARSER_CORE_LEXER_H
17 #define ES2PANDA_PARSER_CORE_LEXER_H
18
19 #include <lexer/regexp/regexp.h>
20 #include <lexer/token/letters.h>
21 #include <lexer/token/token.h>
22 #include <util/enumbitops.h>
23
24 namespace panda::es2panda::parser {
25 class ParserContext;
26 } // namespace panda::es2panda::parser
27
28 namespace panda::es2panda::lexer {
29
/**
 * Options that can be passed to Lexer::NextToken().
 *
 * Every enumerator other than NONE occupies its own bit, so several options
 * can be stored in one value. The enum is one byte wide (uint8_t).
 */
enum class LexerNextTokenFlags : uint8_t {
    NONE = 0U,                             // no option set (default argument of NextToken)
    KEYWORD_TO_IDENT = 1U << 0U,           // bit 0
    NUMERIC_SEPARATOR_ALLOWED = 1U << 1U,  // bit 1
    BIGINT_ALLOWED = 1U << 2U,             // bit 2
};
36
37 DEFINE_BITOPS(LexerNextTokenFlags)
38
39 class LexerPosition {
40 public:
41 explicit LexerPosition(const util::StringView &source);
42 DEFAULT_COPY_SEMANTIC(LexerPosition);
43 DEFAULT_MOVE_SEMANTIC(LexerPosition);
44 ~LexerPosition() = default;
45
46 Token token {};
47 util::StringView::Iterator iterator;
48 size_t line {};
49 size_t nextTokenLine {};
50 };
51
52 class LexerTemplateString {
53 public:
LexerTemplateString(ArenaAllocator *allocator)54 explicit LexerTemplateString(ArenaAllocator *allocator) : str(allocator) {}
55 DEFAULT_COPY_SEMANTIC(LexerTemplateString);
56 DEFAULT_MOVE_SEMANTIC(LexerTemplateString);
57 ~LexerTemplateString() = default;
58
59 util::UString str;
60 size_t end {};
61 bool scanExpression {};
62 };
63
64 class TemplateLiteralParserContext;
65
66 class Lexer {
67 public:
68 explicit Lexer(const parser::ParserContext *parserContext);
69 NO_COPY_SEMANTIC(Lexer);
70 NO_MOVE_SEMANTIC(Lexer);
71 ~Lexer() = default;
72
73 void NextToken(LexerNextTokenFlags flags = LexerNextTokenFlags::NONE);
74
75 Token &GetToken();
76 const Token &GetToken() const;
77 size_t Line() const;
78
79 LexerPosition Save() const;
80 void Rewind(const LexerPosition &pos);
81 void BackwardToken(TokenType type, size_t offset);
82 void ForwardToken(TokenType type, size_t offset);
83
84 char32_t Lookahead();
85 bool CheckArrow();
86
87 RegExp ScanRegExp();
88 template <char32_t end>
89 void ScanString();
90 void ResetTokenEnd();
91 LexerTemplateString ScanTemplateString();
92 void ScanTemplateStringEnd();
93 void PushTemplateContext(TemplateLiteralParserContext *ctx);
94 void AssignTokenTaggedTemplate();
95
96 private:
97 ArenaAllocator *Allocator();
98 bool IsLineTerminatorOrEos() const;
99 void ScanRegExpPattern();
100 bool GetRegExpFlag(char32_t cp, RegExpFlags &flag);
101 RegExpFlags ScanRegExpFlags();
102
103 void ThrowError(std::string_view message);
104
105 void SetTokenStart();
106 void SetTokenEnd();
107 bool CheckTokenIsTaggedTemplate() const;
108
Iterator()109 inline util::StringView::Iterator &Iterator()
110 {
111 return pos_.iterator;
112 }
113
Iterator() const114 inline const util::StringView::Iterator &Iterator() const
115 {
116 return pos_.iterator;
117 }
118
119 util::StringView SourceView(const util::StringView::Iterator &begin, const util::StringView::Iterator &end) const;
120 util::StringView SourceView(size_t begin, size_t end) const;
121
122 void SkipWhiteSpaces();
123 void SkipSingleLineComment();
124 void SkipMultiLineComment();
125 template <TokenType keyword_type>
126 void CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags);
127 void CheckKeywordEscape(TokenType type);
128 void CheckAwaitKeyword();
129 void CheckArgumentsKeyword();
130 void CheckEnumKeyword();
131 void CheckLetKeyword();
132 void CheckYieldKeyword();
133 void CheckFutureReservedKeyword(TokenType keywordType);
134
135 bool ScanPunctuator();
136 void ScanQuestionPunctuator();
137 void ScanLessThanPunctuator();
138 void ScanGreaterThanPunctuator();
139 void ScanEqualsPunctuator();
140 void ScanExclamationPunctuator();
141 void ScanAmpersandPunctuator();
142 void ScanVLinePunctuator();
143 void ScanCircumflexPunctuator();
144 void ScanPlusPunctuator();
145 void ScanMinusPunctuator();
146 void ScanAsterixPunctuator();
147 void ScanSlashPunctuator();
148 void ScanPercentPunctuator();
149 void ScanDotPunctuator();
150
151 char32_t ScanUnicodeEscapeSequence();
152 template <int N>
153 char32_t ScanHexEscape();
154 char32_t ScanUnicodeCodePointEscape();
155
156 void ScanStringUnicodePart(util::UString *str);
157
158 void ScanNumberLeadingZero();
159 void ScanDecimalNumbers(bool allowNumericSeparator);
160 template <bool rangeCheck(char32_t), int radix>
161 void ScanNumberRadix(bool allowNumericSeparator = true);
162 void ScanNumber(bool allowNumericSeparator = true, bool allowBigInt = true);
163 void ConvertNumber(size_t exponentSignPos);
164 void ScanDecimalLiteral();
165 void ScanDecimalDigits(bool allowNumericSeparator);
166 void CheckNumberLiteralEnd();
167 void AssignTokenEscapeError();
168
169 inline static uint32_t HexValue(char32_t ch);
170 inline static bool IsDecimalDigit(uint32_t cp);
171 inline static bool IsHexDigit(char32_t ch);
172 inline static bool IsBinaryDigit(char32_t ch);
173 inline static bool IsOctalDigit(char32_t ch);
174
175 friend class KeywordsUtil;
176 friend class TemplateLiteralParserContext;
177 TemplateLiteralParserContext *tlCtx_ {};
178 ArenaAllocator *allocator_;
179 const parser::ParserContext *parserContext_;
180 util::StringView source_;
181 LexerPosition pos_;
182 bool isUnderscore_ = false;
183 };
184
185 class TemplateLiteralParserContext {
186 public:
TemplateLiteralParserContext(Lexer *lexer)187 explicit TemplateLiteralParserContext(Lexer *lexer) : lexer_(lexer), prev_(lexer_->tlCtx_) {}
188 NO_MOVE_SEMANTIC(TemplateLiteralParserContext);
189 NO_COPY_SEMANTIC(TemplateLiteralParserContext);
190
~TemplateLiteralParserContext()191 ~TemplateLiteralParserContext()
192 {
193 lexer_->tlCtx_ = prev_;
194 }
195
ConsumeLeftBrace()196 void ConsumeLeftBrace()
197 {
198 braceDepth_++;
199 }
200
ConsumeRightBrace()201 bool ConsumeRightBrace()
202 {
203 braceDepth_--;
204
205 return braceDepth_ == 0;
206 }
207
208 private:
209 Lexer *lexer_;
210 TemplateLiteralParserContext *prev_ {};
211 size_t braceDepth_ {1};
212 };
213
214 template <char32_t end>
ScanString()215 void Lexer::ScanString()
216 {
217 util::UString str(Allocator());
218 GetToken().type_ = TokenType::LITERAL_STRING;
219
220 const auto startPos = Iterator().Index();
221 auto escapeEnd = startPos;
222
223 do {
224 char32_t cp = Iterator().Peek();
225
226 switch (cp) {
227 case util::StringView::Iterator::INVALID_CP: {
228 ThrowError("Unterminated string");
229 break;
230 }
231 case LEX_CHAR_CR:
232 case LEX_CHAR_LF: {
233 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
234 if constexpr (end != LEX_CHAR_BACK_TICK) {
235 ThrowError("Newline is not allowed in strings");
236 }
237
238 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
239 str.Append(SourceView(escapeEnd, Iterator().Index()));
240
241 if (cp == LEX_CHAR_CR) {
242 Iterator().Forward(1);
243
244 if (Iterator().Peek() != LEX_CHAR_LF) {
245 Iterator().Backward(1);
246 }
247 }
248
249 pos_.line++;
250 str.Append(LEX_CHAR_LF);
251 Iterator().Forward(1);
252 escapeEnd = Iterator().Index();
253 continue;
254 }
255 case LEX_CHAR_BACKSLASH: {
256 GetToken().flags_ |= TokenFlags::HAS_ESCAPE;
257 str.Append(SourceView(escapeEnd, Iterator().Index()));
258
259 Iterator().Forward(1);
260 ScanStringUnicodePart(&str);
261 escapeEnd = Iterator().Index();
262 continue;
263 }
264 case LEX_CHAR_BACK_TICK:
265 case LEX_CHAR_SINGLE_QUOTE:
266 case LEX_CHAR_DOUBLE_QUOTE: {
267 if (end == cp) {
268 break;
269 }
270
271 Iterator().Forward(1);
272 continue;
273 }
274 case LEX_CHAR_DOLLAR_SIGN: {
275 Iterator().Forward(1);
276
277 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
278 if constexpr (end == LEX_CHAR_BACK_TICK) {
279 if (Iterator().Peek() == LEX_CHAR_LEFT_BRACE) {
280 Iterator().Backward(1);
281 break;
282 }
283 }
284
285 continue;
286 }
287 default: {
288 Iterator().SkipCp();
289 continue;
290 }
291 }
292
293 if (GetToken().flags_ & TokenFlags::HAS_ESCAPE) {
294 str.Append(SourceView(escapeEnd, Iterator().Index()));
295 GetToken().src_ = str.View();
296 } else {
297 GetToken().src_ = SourceView(startPos, Iterator().Index());
298 }
299
300 break;
301 } while (true);
302
303 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
304 if constexpr (end != LEX_CHAR_BACK_TICK) {
305 Iterator().Forward(1);
306 }
307 }
308
309 template <int N>
ScanHexEscape()310 char32_t Lexer::ScanHexEscape()
311 {
312 char32_t code = 0;
313
314 for (size_t i = 0; i < N; ++i) {
315 const auto cp = Iterator().Peek();
316 Iterator().Forward(1);
317
318 if (!IsHexDigit(cp)) {
319 // Should not throw error in tagged template in ES2021
320 if (CheckTokenIsTaggedTemplate()) {
321 AssignTokenEscapeError();
322 } else {
323 ThrowError("Invalid unicode escape sequence");
324 }
325 }
326
327 constexpr auto MULTIPLIER = 16;
328 code = code * MULTIPLIER + HexValue(cp);
329 }
330
331 return code;
332 }
333
334 template <bool rangeCheck(char32_t), int radix>
ScanNumberRadix(bool allowNumericSeparator)335 void Lexer::ScanNumberRadix(bool allowNumericSeparator)
336 {
337 double number = 0.0;
338
339 auto cp = Iterator().Peek();
340 if (!rangeCheck(cp)) {
341 ThrowError("Invalid digit");
342 }
343
344 bool allowNumericOnNext = true;
345
346 do {
347 cp = Iterator().Peek();
348 if (rangeCheck(cp)) {
349 number = number * radix + HexValue(cp);
350 Iterator().Forward(1);
351 allowNumericOnNext = true;
352 continue;
353 }
354
355 if (cp == LEX_CHAR_UNDERSCORE) {
356 if (!allowNumericSeparator || !allowNumericOnNext) {
357 ThrowError("Invalid numeric separator");
358 }
359
360 GetToken().flags_ |= TokenFlags::NUMBER_HAS_UNDERSCORE;
361 Iterator().Forward(1);
362 allowNumericOnNext = false;
363 continue;
364 }
365
366 if (!allowNumericOnNext) {
367 Iterator().Backward(1);
368 ThrowError("Numeric separators are not allowed at the end of numeric literals");
369 }
370
371 break;
372 } while (true);
373
374 GetToken().number_ = number;
375 }
376
377 template <TokenType keyword_type>
CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags)378 void Lexer::CheckKeyword([[maybe_unused]] TokenType type, [[maybe_unused]] LexerNextTokenFlags flags)
379 {
380 // NOLINTNEXTLINE
381 if constexpr (keyword_type == TokenType::KEYW_AWAIT) {
382 CheckAwaitKeyword();
383 return;
384 }
385
386 if constexpr (keyword_type == TokenType::KEYW_ARGUMENTS) {
387 CheckArgumentsKeyword();
388 }
389
390 // NOLINTNEXTLINE
391 if constexpr (keyword_type == TokenType::KEYW_ENUM) {
392 CheckEnumKeyword();
393 return;
394 }
395
396 // NOLINTNEXTLINE
397 if constexpr (keyword_type == TokenType::KEYW_YIELD) {
398 CheckYieldKeyword();
399 return;
400 }
401
402 // NOLINTNEXTLINE
403 if constexpr (keyword_type == TokenType::KEYW_LET) {
404 CheckLetKeyword();
405 return;
406 }
407
408 // NOLINTNEXTLINE
409 if constexpr (keyword_type <= TokenType::KEYW_ASYNC) {
410 CheckKeywordEscape(type);
411 return;
412 }
413
414 // NOLINTNEXTLINE
415 if constexpr (keyword_type >= TokenType::KEYW_PUBLIC) {
416 // NOLINTNEXTLINE
417 CheckFutureReservedKeyword(keyword_type);
418 return;
419 }
420
421 GetToken().type_ = TokenType::LITERAL_IDENT;
422 }
423
HexValue(char32_t ch)424 inline uint32_t Lexer::HexValue(char32_t ch)
425 {
426 constexpr uint32_t HEX_MASK = 0xF;
427 constexpr uint32_t DEC_OFFSET = 10;
428 return ch < LEX_CHAR_UPPERCASE_A ? ch - LEX_CHAR_0 : ((ch - LEX_CHAR_UPPERCASE_A + DEC_OFFSET) & HEX_MASK);
429 }
430
IsDecimalDigit(uint32_t cp)431 inline bool Lexer::IsDecimalDigit(uint32_t cp)
432 {
433 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
434 }
435
IsHexDigit(char32_t ch)436 inline bool Lexer::IsHexDigit(char32_t ch)
437 {
438 return ch < LEX_ASCII_MAX_BITS && std::isxdigit(static_cast<unsigned char>(ch));
439 }
440
IsBinaryDigit(char32_t ch)441 inline bool Lexer::IsBinaryDigit(char32_t ch)
442 {
443 return ch == LEX_CHAR_0 || ch == LEX_CHAR_1;
444 }
445
IsOctalDigit(char32_t ch)446 inline bool Lexer::IsOctalDigit(char32_t ch)
447 {
448 return (ch >= LEX_CHAR_0 && ch <= LEX_CHAR_7);
449 }
450
451 } // namespace panda::es2panda::lexer
452
453 #endif
454