1/** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "keywordsUtil.h" 17 18#include "generated/keywords.h" 19#include "lexer/lexer.h" 20#include "unicode/uchar.h" 21#include "util/enumbitops.h" 22 23namespace ark::es2panda::lexer { 24enum class AsciiFlags : uint32_t { 25 NONE = 0U, 26 ID_START = 1U << 0U, 27 ID_CONTINUE = 1U << 1U, 28}; 29 30constexpr AsciiFlags operator|(AsciiFlags a, AsciiFlags b) 31{ 32 using Utype = std::underlying_type_t<AsciiFlags>; 33 return static_cast<AsciiFlags>(static_cast<Utype>(a) | static_cast<Utype>(b)); 34} 35 36inline std::underlying_type_t<AsciiFlags> operator&(AsciiFlags a, AsciiFlags b) 37{ 38 using Utype = std::underlying_type_t<AsciiFlags>; 39 /* NOLINTNEXTLINE(hicpp-signed-bitwise) */ 40 return static_cast<Utype>(static_cast<Utype>(a) & static_cast<Utype>(b)); 41} 42 43constexpr std::array<AsciiFlags, 128> ASCII_FLAGS = {{ 44 AsciiFlags::NONE, /* NUL */ 45 AsciiFlags::NONE, /* SOH */ 46 AsciiFlags::NONE, /* STX */ 47 AsciiFlags::NONE, /* ETX */ 48 AsciiFlags::NONE, /* EOT */ 49 AsciiFlags::NONE, /* ENQ */ 50 AsciiFlags::NONE, /* ACK */ 51 AsciiFlags::NONE, /* BEL */ 52 AsciiFlags::NONE, /* BS */ 53 AsciiFlags::NONE, /* TAB */ 54 AsciiFlags::NONE, /* LF */ 55 AsciiFlags::NONE, /* VT */ 56 AsciiFlags::NONE, /* FF */ 57 AsciiFlags::NONE, /* CR */ 58 AsciiFlags::NONE, /* SO */ 59 AsciiFlags::NONE, /* SI */ 60 AsciiFlags::NONE, /* DLE */ 61 AsciiFlags::NONE, /* DC1 */ 62 AsciiFlags::NONE, /* DC2 */ 63 AsciiFlags::NONE, /* DC3 */ 64 AsciiFlags::NONE, /* DC4 */ 65 AsciiFlags::NONE, /* NAK */ 66 AsciiFlags::NONE, /* SYN */ 67 AsciiFlags::NONE, /* ETB */ 68 AsciiFlags::NONE, /* CAN */ 69 AsciiFlags::NONE, /* EM */ 70 AsciiFlags::NONE, /* SUB */ 71 AsciiFlags::NONE, /* ESC */ 72 AsciiFlags::NONE, /* FS */ 73 AsciiFlags::NONE, /* GS */ 74 AsciiFlags::NONE, /* RS */ 75 AsciiFlags::NONE, /* US */ 76 AsciiFlags::NONE, /* Space */ 77 AsciiFlags::NONE, /* ! */ 78 AsciiFlags::NONE, /* " */ 79 AsciiFlags::NONE, /* # */ 80 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* $ */ 81 AsciiFlags::NONE, /* % */ 82 AsciiFlags::NONE, /* & */ 83 AsciiFlags::NONE, /* ' */ 84 AsciiFlags::NONE, /* ( */ 85 AsciiFlags::NONE, /* ) */ 86 AsciiFlags::NONE, /* * */ 87 AsciiFlags::NONE, /* + */ 88 AsciiFlags::NONE, /* , */ 89 AsciiFlags::NONE, /* - */ 90 AsciiFlags::NONE, /* . */ 91 AsciiFlags::NONE, /* / */ 92 AsciiFlags::ID_CONTINUE, /* 0 */ 93 AsciiFlags::ID_CONTINUE, /* 1 */ 94 AsciiFlags::ID_CONTINUE, /* 2 */ 95 AsciiFlags::ID_CONTINUE, /* 3 */ 96 AsciiFlags::ID_CONTINUE, /* 4 */ 97 AsciiFlags::ID_CONTINUE, /* 5 */ 98 AsciiFlags::ID_CONTINUE, /* 6 */ 99 AsciiFlags::ID_CONTINUE, /* 7 */ 100 AsciiFlags::ID_CONTINUE, /* 8 */ 101 AsciiFlags::ID_CONTINUE, /* 9 */ 102 AsciiFlags::NONE, /* : */ 103 AsciiFlags::NONE, /* ; */ 104 AsciiFlags::NONE, /* < */ 105 AsciiFlags::NONE, /* = */ 106 AsciiFlags::NONE, /* > */ 107 AsciiFlags::NONE, /* ? */ 108 AsciiFlags::NONE, /* @ */ 109 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* A */ 110 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* B */ 111 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* C */ 112 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* D */ 113 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* E */ 114 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* F */ 115 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* G */ 116 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* H */ 117 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* I */ 118 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* J */ 119 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* K */ 120 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* L */ 121 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* M */ 122 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* N */ 123 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* O */ 124 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* P */ 125 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Q */ 126 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* R */ 127 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* S */ 128 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* T */ 129 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* U */ 130 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* V */ 131 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* W */ 132 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* X */ 133 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Y */ 134 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Z */ 135 AsciiFlags::NONE, /* [ */ 136 AsciiFlags::NONE, /* \ */ 137 AsciiFlags::NONE, /* ] */ 138 AsciiFlags::NONE, /* ^ */ 139 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* _ */ 140 AsciiFlags::NONE, /* ` */ 141 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* a */ 142 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* b */ 143 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* c */ 144 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* d */ 145 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* e */ 146 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* f */ 147 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* g */ 148 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* h */ 149 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* i */ 150 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* j */ 151 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* k */ 152 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* l */ 153 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* m */ 154 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* n */ 155 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* o */ 156 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* p */ 157 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* q */ 158 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* r */ 159 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* s */ 160 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* t */ 161 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* u */ 162 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* v */ 163 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* w */ 164 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* x */ 165 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* y */ 166 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* z */ 167 AsciiFlags::NONE, /* { */ 168 AsciiFlags::NONE, /* | */ 169 AsciiFlags::NONE, /* } */ 170 AsciiFlags::NONE, /* ~ */ 171 AsciiFlags::NONE /* DEL */ 172}}; 173 174bool KeywordsUtil::IsIdentifierStart(char32_t cp) 175{ 176 if (cp < LEX_ASCII_MAX_BITS) { 177 return (ASCII_FLAGS[cp] & AsciiFlags::ID_START) != 0; 178 } 179 180 auto uchar = static_cast<UChar32>(cp); 181 return u_hasBinaryProperty(uchar, UCHAR_ID_START); 182} 183 184bool KeywordsUtil::IsIdentifierPart(char32_t cp) 185{ 186 if (cp < LEX_ASCII_MAX_BITS) { 187 return (ASCII_FLAGS[cp] & AsciiFlags::ID_CONTINUE) != 0; 188 } 189 190 // u_isIDPart or Other_ID_Continue characters or ZWJ/ZWNJ. 191 auto uchar = static_cast<UChar32>(cp); 192 return (u_hasBinaryProperty(uchar, UCHAR_ID_CONTINUE) || cp == LEX_CHAR_ZWNJ || cp == LEX_CHAR_ZWJ); 193} 194 195void KeywordsUtil::ScanIdentifierStart(const Keywords *kws, char32_t cp) 196{ 197 if (!KeywordsUtil::IsIdentifierStart(cp)) { 198 lexer_->ThrowError("Expected an identifier"); 199 } 200 201 cp_ = cp; 202 const auto map = kws->KeywordMap(cp); 203 ScanIdContinueMaybeKeyword(kws, map); 204} 205 206void KeywordsUtil::ScanIdContinue() 207{ 208 util::UString ident(lexer_->Allocator()); 209 size_t startPos = lexer_->GetToken().Start().index; 210 211 if (HasEscape()) { 212 ident.Append(cp_); 213 startPos = Iterator().Index(); 214 } 215 216 auto escapeEnd = startPos; 217 218 while (true) { 219 if (Iterator().Peek() == LEX_CHAR_BACKSLASH) { 220 ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index())); 221 222 auto cp = ScanUnicodeEscapeSequence(); 223 if (!IsIdentifierPart(cp)) { 224 lexer_->ThrowError("Invalid identifier part"); 225 } 226 227 escapeEnd = Iterator().Index(); 228 ident.Append(cp); 229 continue; 230 } 231 232 size_t cpSize {}; 233 auto cp = Iterator().PeekCp(&cpSize); 234 if (!IsIdentifierPart(cp)) { 235 break; 236 } 237 238 Iterator().Forward(cpSize); 239 } 240 241 lexer_->GetToken().type_ = TokenType::LITERAL_IDENT; 242 lexer_->GetToken().keywordType_ = TokenType::LITERAL_IDENT; 243 244 if (HasEscape()) { 245 ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index())); 246 lexer_->GetToken().src_ = ident.View(); 247 } else { 248 lexer_->GetToken().src_ = lexer_->SourceView(startPos, Iterator().Index()); 249 } 250} 251 252void KeywordsUtil::ScanIdContinueMaybeKeyword(const Keywords *kws, Span<const KeywordString> map) 253{ 254 ScanIdContinue(); 255 256 if (!HasEscape() || map.empty()) { 257 return; 258 } 259 260 const auto &str = lexer_->GetToken().Ident().Utf8(); 261 262 size_t start = 0; 263 size_t end = map.size(); 264 size_t middle = end / 2; 265 266 while (true) { 267 const auto &kw = map[middle]; 268 269 int relation = str.compare(kw.Str()); 270 if (relation == 0) { 271 kws->HandlePotentialEscapedKeyword(kw); 272 return; 273 } 274 275 if (relation > 0) { 276 start = middle + 1; 277 } else { 278 end = middle; 279 } 280 281 middle = (start + end) / 2U; 282 283 if (start >= end) { 284 return; 285 } 286 } 287} 288 289char32_t KeywordsUtil::ScanUnicodeEscapeSequence() 290{ 291 ASSERT(Iterator().Peek() == LEX_CHAR_BACKSLASH); 292 293 lexer_->GetToken().flags_ |= lexer::TokenFlags::HAS_ESCAPE; 294 295 Iterator().Forward(1); 296 297 if (Iterator().Peek() != LEX_CHAR_LOWERCASE_U) { 298 return util::StringView::Iterator::INVALID_CP; 299 } 300 301 return lexer_->ScanUnicodeEscapeSequence(); 302} 303} // namespace ark::es2panda::lexer 304