1/**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "keywordsUtil.h"
17
18#include "generated/keywords.h"
19#include "lexer/lexer.h"
20#include "unicode/uchar.h"
21#include "util/enumbitops.h"
22
23namespace ark::es2panda::lexer {
/**
 * Bit flags describing how a 7-bit ASCII character may take part in an
 * identifier. The flags are combined with operator| and tested with operator&.
 */
enum class AsciiFlags : uint32_t {
    NONE = 0U,                /* not usable inside an identifier */
    ID_START = 1U << 0U,      /* may be the first character of an identifier */
    ID_CONTINUE = 1U << 1U,   /* may follow the first character of an identifier */
};

/**
 * Returns the union of two flag sets.
 */
constexpr AsciiFlags operator|(AsciiFlags a, AsciiFlags b)
{
    using Utype = std::underlying_type_t<AsciiFlags>;
    return static_cast<AsciiFlags>(static_cast<Utype>(a) | static_cast<Utype>(b));
}

/**
 * Returns the intersection of two flag sets as a raw integer, so the result
 * can be compared against 0 directly.
 *
 * Declared constexpr (which implies inline) to match operator| and to allow
 * flag tests inside constant expressions and static_asserts.
 */
constexpr std::underlying_type_t<AsciiFlags> operator&(AsciiFlags a, AsciiFlags b)
{
    using Utype = std::underlying_type_t<AsciiFlags>;
    /* NOLINTNEXTLINE(hicpp-signed-bitwise) */
    return static_cast<Utype>(static_cast<Utype>(a) & static_cast<Utype>(b));
}
42
43constexpr std::array<AsciiFlags, 128> ASCII_FLAGS = {{
44    AsciiFlags::NONE,                               /* NUL */
45    AsciiFlags::NONE,                               /* SOH */
46    AsciiFlags::NONE,                               /* STX */
47    AsciiFlags::NONE,                               /* ETX */
48    AsciiFlags::NONE,                               /* EOT */
49    AsciiFlags::NONE,                               /* ENQ */
50    AsciiFlags::NONE,                               /* ACK */
51    AsciiFlags::NONE,                               /* BEL */
52    AsciiFlags::NONE,                               /* BS */
53    AsciiFlags::NONE,                               /* TAB */
54    AsciiFlags::NONE,                               /* LF */
55    AsciiFlags::NONE,                               /* VT */
56    AsciiFlags::NONE,                               /* FF */
57    AsciiFlags::NONE,                               /* CR */
58    AsciiFlags::NONE,                               /* SO */
59    AsciiFlags::NONE,                               /* SI */
60    AsciiFlags::NONE,                               /* DLE */
61    AsciiFlags::NONE,                               /* DC1 */
62    AsciiFlags::NONE,                               /* DC2 */
63    AsciiFlags::NONE,                               /* DC3 */
64    AsciiFlags::NONE,                               /* DC4 */
65    AsciiFlags::NONE,                               /* NAK */
66    AsciiFlags::NONE,                               /* SYN */
67    AsciiFlags::NONE,                               /* ETB */
68    AsciiFlags::NONE,                               /* CAN */
69    AsciiFlags::NONE,                               /* EM */
70    AsciiFlags::NONE,                               /* SUB */
71    AsciiFlags::NONE,                               /* ESC */
72    AsciiFlags::NONE,                               /* FS */
73    AsciiFlags::NONE,                               /* GS */
74    AsciiFlags::NONE,                               /* RS */
75    AsciiFlags::NONE,                               /* US */
76    AsciiFlags::NONE,                               /* Space */
77    AsciiFlags::NONE,                               /* ! */
78    AsciiFlags::NONE,                               /* " */
79    AsciiFlags::NONE,                               /* # */
80    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* $ */
81    AsciiFlags::NONE,                               /* % */
82    AsciiFlags::NONE,                               /* & */
83    AsciiFlags::NONE,                               /* ' */
84    AsciiFlags::NONE,                               /* ( */
85    AsciiFlags::NONE,                               /* ) */
86    AsciiFlags::NONE,                               /* * */
87    AsciiFlags::NONE,                               /* + */
88    AsciiFlags::NONE,                               /* , */
89    AsciiFlags::NONE,                               /* - */
90    AsciiFlags::NONE,                               /* . */
91    AsciiFlags::NONE,                               /* / */
92    AsciiFlags::ID_CONTINUE,                        /* 0 */
93    AsciiFlags::ID_CONTINUE,                        /* 1 */
94    AsciiFlags::ID_CONTINUE,                        /* 2 */
95    AsciiFlags::ID_CONTINUE,                        /* 3 */
96    AsciiFlags::ID_CONTINUE,                        /* 4 */
97    AsciiFlags::ID_CONTINUE,                        /* 5 */
98    AsciiFlags::ID_CONTINUE,                        /* 6 */
99    AsciiFlags::ID_CONTINUE,                        /* 7 */
100    AsciiFlags::ID_CONTINUE,                        /* 8 */
101    AsciiFlags::ID_CONTINUE,                        /* 9 */
102    AsciiFlags::NONE,                               /* : */
103    AsciiFlags::NONE,                               /* ; */
104    AsciiFlags::NONE,                               /* < */
105    AsciiFlags::NONE,                               /* = */
106    AsciiFlags::NONE,                               /* > */
107    AsciiFlags::NONE,                               /* ? */
108    AsciiFlags::NONE,                               /* @ */
109    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* A */
110    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* B */
111    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* C */
112    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* D */
113    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* E */
114    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* F */
115    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* G */
116    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* H */
117    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* I */
118    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* J */
119    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* K */
120    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* L */
121    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* M */
122    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* N */
123    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* O */
124    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* P */
125    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Q */
126    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* R */
127    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* S */
128    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* T */
129    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* U */
130    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* V */
131    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* W */
132    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* X */
133    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Y */
134    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Z */
135    AsciiFlags::NONE,                               /* [ */
136    AsciiFlags::NONE,                               /* \ */
137    AsciiFlags::NONE,                               /* ] */
138    AsciiFlags::NONE,                               /* ^ */
139    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* _ */
140    AsciiFlags::NONE,                               /* ` */
141    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* a */
142    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* b */
143    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* c */
144    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* d */
145    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* e */
146    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* f */
147    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* g */
148    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* h */
149    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* i */
150    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* j */
151    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* k */
152    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* l */
153    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* m */
154    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* n */
155    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* o */
156    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* p */
157    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* q */
158    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* r */
159    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* s */
160    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* t */
161    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* u */
162    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* v */
163    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* w */
164    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* x */
165    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* y */
166    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* z */
167    AsciiFlags::NONE,                               /* { */
168    AsciiFlags::NONE,                               /* | */
169    AsciiFlags::NONE,                               /* } */
170    AsciiFlags::NONE,                               /* ~ */
171    AsciiFlags::NONE                                /* DEL */
172}};
173
174bool KeywordsUtil::IsIdentifierStart(char32_t cp)
175{
176    if (cp < LEX_ASCII_MAX_BITS) {
177        return (ASCII_FLAGS[cp] & AsciiFlags::ID_START) != 0;
178    }
179
180    auto uchar = static_cast<UChar32>(cp);
181    return u_hasBinaryProperty(uchar, UCHAR_ID_START);
182}
183
184bool KeywordsUtil::IsIdentifierPart(char32_t cp)
185{
186    if (cp < LEX_ASCII_MAX_BITS) {
187        return (ASCII_FLAGS[cp] & AsciiFlags::ID_CONTINUE) != 0;
188    }
189
190    // u_isIDPart or Other_ID_Continue characters or ZWJ/ZWNJ.
191    auto uchar = static_cast<UChar32>(cp);
192    return (u_hasBinaryProperty(uchar, UCHAR_ID_CONTINUE) || cp == LEX_CHAR_ZWNJ || cp == LEX_CHAR_ZWJ);
193}
194
195void KeywordsUtil::ScanIdentifierStart(const Keywords *kws, char32_t cp)
196{
197    if (!KeywordsUtil::IsIdentifierStart(cp)) {
198        lexer_->ThrowError("Expected an identifier");
199    }
200
201    cp_ = cp;
202    const auto map = kws->KeywordMap(cp);
203    ScanIdContinueMaybeKeyword(kws, map);
204}
205
206void KeywordsUtil::ScanIdContinue()
207{
208    util::UString ident(lexer_->Allocator());
209    size_t startPos = lexer_->GetToken().Start().index;
210
211    if (HasEscape()) {
212        ident.Append(cp_);
213        startPos = Iterator().Index();
214    }
215
216    auto escapeEnd = startPos;
217
218    while (true) {
219        if (Iterator().Peek() == LEX_CHAR_BACKSLASH) {
220            ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
221
222            auto cp = ScanUnicodeEscapeSequence();
223            if (!IsIdentifierPart(cp)) {
224                lexer_->ThrowError("Invalid identifier part");
225            }
226
227            escapeEnd = Iterator().Index();
228            ident.Append(cp);
229            continue;
230        }
231
232        size_t cpSize {};
233        auto cp = Iterator().PeekCp(&cpSize);
234        if (!IsIdentifierPart(cp)) {
235            break;
236        }
237
238        Iterator().Forward(cpSize);
239    }
240
241    lexer_->GetToken().type_ = TokenType::LITERAL_IDENT;
242    lexer_->GetToken().keywordType_ = TokenType::LITERAL_IDENT;
243
244    if (HasEscape()) {
245        ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
246        lexer_->GetToken().src_ = ident.View();
247    } else {
248        lexer_->GetToken().src_ = lexer_->SourceView(startPos, Iterator().Index());
249    }
250}
251
252void KeywordsUtil::ScanIdContinueMaybeKeyword(const Keywords *kws, Span<const KeywordString> map)
253{
254    ScanIdContinue();
255
256    if (!HasEscape() || map.empty()) {
257        return;
258    }
259
260    const auto &str = lexer_->GetToken().Ident().Utf8();
261
262    size_t start = 0;
263    size_t end = map.size();
264    size_t middle = end / 2;
265
266    while (true) {
267        const auto &kw = map[middle];
268
269        int relation = str.compare(kw.Str());
270        if (relation == 0) {
271            kws->HandlePotentialEscapedKeyword(kw);
272            return;
273        }
274
275        if (relation > 0) {
276            start = middle + 1;
277        } else {
278            end = middle;
279        }
280
281        middle = (start + end) / 2U;
282
283        if (start >= end) {
284            return;
285        }
286    }
287}
288
289char32_t KeywordsUtil::ScanUnicodeEscapeSequence()
290{
291    ASSERT(Iterator().Peek() == LEX_CHAR_BACKSLASH);
292
293    lexer_->GetToken().flags_ |= lexer::TokenFlags::HAS_ESCAPE;
294
295    Iterator().Forward(1);
296
297    if (Iterator().Peek() != LEX_CHAR_LOWERCASE_U) {
298        return util::StringView::Iterator::INVALID_CP;
299    }
300
301    return lexer_->ScanUnicodeEscapeSequence();
302}
303}  // namespace ark::es2panda::lexer
304