1/**
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "keywordsUtil.h"
17
18#include <gen/keywords.h>
19#include <lexer/lexer.h>
20#include <unicode/uchar.h>
21#include <util/enumbitops.h>
22
23namespace panda::es2panda::lexer {
24
/**
 * Bit flags describing how a single ASCII character may take part in an identifier.
 * The enumerators are distinct bits so that they can be combined with operator|.
 */
enum class AsciiFlags : uint8_t {
    /* No identifier role. */
    NONE = 0x00,
    /* The character may begin an identifier. */
    ID_START = 0x01,
    /* The character may appear after the first character of an identifier. */
    ID_CONTINUE = 0x02,
};
30
31constexpr AsciiFlags operator|(AsciiFlags a, AsciiFlags b)
32{
33    using utype = std::underlying_type_t<AsciiFlags>;
34    return static_cast<AsciiFlags>(static_cast<utype>(a) | static_cast<utype>(b));
35}
36
37inline std::underlying_type_t<AsciiFlags> operator&(AsciiFlags a, AsciiFlags b)
38{
39    using utype = std::underlying_type_t<AsciiFlags>;
40    /* NOLINTNEXTLINE(hicpp-signed-bitwise) */
41    return static_cast<utype>(static_cast<utype>(a) & static_cast<utype>(b));
42}
43
44constexpr std::array<AsciiFlags, 128> ASCII_FLAGS = {{
45    AsciiFlags::NONE,                               /* NUL */
46    AsciiFlags::NONE,                               /* SOH */
47    AsciiFlags::NONE,                               /* STX */
48    AsciiFlags::NONE,                               /* ETX */
49    AsciiFlags::NONE,                               /* EOT */
50    AsciiFlags::NONE,                               /* ENQ */
51    AsciiFlags::NONE,                               /* ACK */
52    AsciiFlags::NONE,                               /* BEL */
53    AsciiFlags::NONE,                               /* BS */
54    AsciiFlags::NONE,                               /* TAB */
55    AsciiFlags::NONE,                               /* LF */
56    AsciiFlags::NONE,                               /* VT */
57    AsciiFlags::NONE,                               /* FF */
58    AsciiFlags::NONE,                               /* CR */
59    AsciiFlags::NONE,                               /* SO */
60    AsciiFlags::NONE,                               /* SI */
61    AsciiFlags::NONE,                               /* DLE */
62    AsciiFlags::NONE,                               /* DC1 */
63    AsciiFlags::NONE,                               /* DC2 */
64    AsciiFlags::NONE,                               /* DC3 */
65    AsciiFlags::NONE,                               /* DC4 */
66    AsciiFlags::NONE,                               /* NAK */
67    AsciiFlags::NONE,                               /* SYN */
68    AsciiFlags::NONE,                               /* ETB */
69    AsciiFlags::NONE,                               /* CAN */
70    AsciiFlags::NONE,                               /* EM */
71    AsciiFlags::NONE,                               /* SUB */
72    AsciiFlags::NONE,                               /* ESC */
73    AsciiFlags::NONE,                               /* FS */
74    AsciiFlags::NONE,                               /* GS */
75    AsciiFlags::NONE,                               /* RS */
76    AsciiFlags::NONE,                               /* US */
77    AsciiFlags::NONE,                               /* Space */
78    AsciiFlags::NONE,                               /* ! */
79    AsciiFlags::NONE,                               /* " */
80    AsciiFlags::NONE,                               /* # */
81    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* $ */
82    AsciiFlags::NONE,                               /* % */
83    AsciiFlags::NONE,                               /* & */
84    AsciiFlags::NONE,                               /* ' */
85    AsciiFlags::NONE,                               /* ( */
86    AsciiFlags::NONE,                               /* ) */
87    AsciiFlags::NONE,                               /* * */
88    AsciiFlags::NONE,                               /* + */
89    AsciiFlags::NONE,                               /* , */
90    AsciiFlags::NONE,                               /* - */
91    AsciiFlags::NONE,                               /* . */
92    AsciiFlags::NONE,                               /* / */
93    AsciiFlags::ID_CONTINUE,                        /* 0 */
94    AsciiFlags::ID_CONTINUE,                        /* 1 */
95    AsciiFlags::ID_CONTINUE,                        /* 2 */
96    AsciiFlags::ID_CONTINUE,                        /* 3 */
97    AsciiFlags::ID_CONTINUE,                        /* 4 */
98    AsciiFlags::ID_CONTINUE,                        /* 5 */
99    AsciiFlags::ID_CONTINUE,                        /* 6 */
100    AsciiFlags::ID_CONTINUE,                        /* 7 */
101    AsciiFlags::ID_CONTINUE,                        /* 8 */
102    AsciiFlags::ID_CONTINUE,                        /* 9 */
103    AsciiFlags::NONE,                               /* : */
104    AsciiFlags::NONE,                               /* ; */
105    AsciiFlags::NONE,                               /* < */
106    AsciiFlags::NONE,                               /* = */
107    AsciiFlags::NONE,                               /* > */
108    AsciiFlags::NONE,                               /* ? */
109    AsciiFlags::NONE,                               /* @ */
110    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* A */
111    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* B */
112    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* C */
113    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* D */
114    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* E */
115    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* F */
116    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* G */
117    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* H */
118    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* I */
119    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* J */
120    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* K */
121    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* L */
122    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* M */
123    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* N */
124    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* O */
125    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* P */
126    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Q */
127    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* R */
128    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* S */
129    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* T */
130    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* U */
131    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* V */
132    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* W */
133    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* X */
134    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Y */
135    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Z */
136    AsciiFlags::NONE,                               /* [ */
137    AsciiFlags::NONE,                               /* \ */
138    AsciiFlags::NONE,                               /* ] */
139    AsciiFlags::NONE,                               /* ^ */
140    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* _ */
141    AsciiFlags::NONE,                               /* ` */
142    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* a */
143    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* b */
144    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* c */
145    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* d */
146    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* e */
147    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* f */
148    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* g */
149    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* h */
150    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* i */
151    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* j */
152    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* k */
153    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* l */
154    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* m */
155    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* n */
156    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* o */
157    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* p */
158    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* q */
159    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* r */
160    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* s */
161    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* t */
162    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* u */
163    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* v */
164    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* w */
165    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* x */
166    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* y */
167    AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* z */
168    AsciiFlags::NONE,                               /* { */
169    AsciiFlags::NONE,                               /* | */
170    AsciiFlags::NONE,                               /* } */
171    AsciiFlags::NONE,                               /* ~ */
172    AsciiFlags::NONE                                /* DEL */
173}};
174
175bool KeywordsUtil::IsIdentifierStart(char32_t cp)
176{
177    if (cp < LEX_ASCII_MAX_BITS) {
178        return (ASCII_FLAGS[cp] & AsciiFlags::ID_START) != 0;
179    }
180    // Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes
181    auto uchar = static_cast<UChar32>(cp);
182    return u_hasBinaryProperty(uchar, UCHAR_ID_START);
183}
184
185bool KeywordsUtil::IsIdentifierPart(char32_t cp)
186{
187    if (cp < LEX_ASCII_MAX_BITS) {
188        return (ASCII_FLAGS[cp] & AsciiFlags::ID_CONTINUE) != 0;
189    }
190
191    /**
192     * u_isIDPart or Other_ID_Continue characters or ZWJ/ZWNJ.
193     * Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes
194     */
195    auto uchar = static_cast<UChar32>(cp);
196    return (u_hasBinaryProperty(uchar, UCHAR_ID_CONTINUE) || cp == LEX_CHAR_ZWNJ || cp == LEX_CHAR_ZWJ);
197}
198
199void KeywordsUtil::ScanIdentifierStart(char32_t cp)
200{
201    if (!KeywordsUtil::IsIdentifierStart(cp)) {
202        lexer_->ThrowError("Expected an identifier");
203    }
204
205    cp_ = cp;
206    const auto map = KeywordsMap::Map(cp);
207    ScanIdContinueMaybeKeyword(map);
208}
209
210void KeywordsUtil::ScanIdContinue()
211{
212    util::UString ident(lexer_->Allocator());
213    size_t startPos = lexer_->GetToken().Start().index;
214
215    if (HasEscape()) {
216        ident.Append(cp_);
217        startPos = Iterator().Index();
218    }
219
220    auto escapeEnd = startPos;
221
222    do {
223        if (Iterator().Peek() == LEX_CHAR_BACKSLASH) {
224            ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
225
226            auto cp = ScanUnicodeEscapeSequence();
227            if (!IsIdentifierPart(cp)) {
228                lexer_->ThrowError("Invalid identifier part");
229            }
230
231            escapeEnd = Iterator().Index();
232            ident.Append(cp);
233            continue;
234        }
235
236        size_t cpSize {};
237        auto cp = Iterator().PeekCp(&cpSize);
238        if (!IsIdentifierPart(cp)) {
239            break;
240        }
241
242        Iterator().Forward(cpSize);
243    } while (true);
244
245    lexer_->GetToken().type_ = TokenType::LITERAL_IDENT;
246    lexer_->GetToken().keywordType_ = TokenType::EOS;
247
248    if (HasEscape()) {
249        ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index()));
250        lexer_->GetToken().src_ = ident.View();
251    } else {
252        lexer_->GetToken().src_ = lexer_->SourceView(startPos, Iterator().Index());
253    }
254}
255
256void KeywordsUtil::ScanIdContinueMaybeKeyword(Span<const KeywordString> map)
257{
258    ScanIdContinue();
259
260    if (!HasEscape() || map.empty()) {
261        return;
262    }
263
264    const auto &str = lexer_->GetToken().Ident().Utf8();
265
266    int start = 0;
267    int end = static_cast<int>(map.size());
268    int middle = end / 2;
269
270    while (true) {
271        const auto &kws = map[middle];
272
273        int relation = str.compare(kws.str);
274        if (relation == 0) {
275            Keywords::SetKeyword(this, kws);
276        }
277
278        if (relation > 0) {
279            start = middle + 1;
280        } else {
281            end = middle;
282        }
283
284        middle = (start + end) / 2;
285
286        if (start >= end) {
287            return;
288        }
289    }
290}
291
292char32_t KeywordsUtil::ScanUnicodeEscapeSequence()
293{
294    ASSERT(Iterator().Peek() == LEX_CHAR_BACKSLASH);
295
296    lexer_->GetToken().flags_ |= lexer::TokenFlags::HAS_ESCAPE;
297
298    Iterator().Forward(1);
299
300    if (Iterator().Peek() != LEX_CHAR_LOWERCASE_U) {
301        return util::StringView::Iterator::INVALID_CP;
302    }
303
304    return lexer_->ScanUnicodeEscapeSequence();
305}
306
307}  // namespace panda::es2panda::lexer
308