1/** 2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "keywordsUtil.h" 17 18#include <gen/keywords.h> 19#include <lexer/lexer.h> 20#include <unicode/uchar.h> 21#include <util/enumbitops.h> 22 23namespace panda::es2panda::lexer { 24 25enum class AsciiFlags : uint8_t { 26 NONE = 0, 27 ID_START = 1 << 0, 28 ID_CONTINUE = 1 << 1, 29}; 30 31constexpr AsciiFlags operator|(AsciiFlags a, AsciiFlags b) 32{ 33 using utype = std::underlying_type_t<AsciiFlags>; 34 return static_cast<AsciiFlags>(static_cast<utype>(a) | static_cast<utype>(b)); 35} 36 37inline std::underlying_type_t<AsciiFlags> operator&(AsciiFlags a, AsciiFlags b) 38{ 39 using utype = std::underlying_type_t<AsciiFlags>; 40 /* NOLINTNEXTLINE(hicpp-signed-bitwise) */ 41 return static_cast<utype>(static_cast<utype>(a) & static_cast<utype>(b)); 42} 43 44constexpr std::array<AsciiFlags, 128> ASCII_FLAGS = {{ 45 AsciiFlags::NONE, /* NUL */ 46 AsciiFlags::NONE, /* SOH */ 47 AsciiFlags::NONE, /* STX */ 48 AsciiFlags::NONE, /* ETX */ 49 AsciiFlags::NONE, /* EOT */ 50 AsciiFlags::NONE, /* ENQ */ 51 AsciiFlags::NONE, /* ACK */ 52 AsciiFlags::NONE, /* BEL */ 53 AsciiFlags::NONE, /* BS */ 54 AsciiFlags::NONE, /* TAB */ 55 AsciiFlags::NONE, /* LF */ 56 AsciiFlags::NONE, /* VT */ 57 AsciiFlags::NONE, /* FF */ 58 AsciiFlags::NONE, /* CR */ 59 AsciiFlags::NONE, /* SO */ 60 AsciiFlags::NONE, /* SI */ 61 AsciiFlags::NONE, /* DLE */ 62 AsciiFlags::NONE, /* DC1 */ 63 AsciiFlags::NONE, /* DC2 */ 64 AsciiFlags::NONE, /* DC3 */ 65 AsciiFlags::NONE, /* DC4 */ 66 AsciiFlags::NONE, /* NAK */ 67 AsciiFlags::NONE, /* SYN */ 68 AsciiFlags::NONE, /* ETB */ 69 AsciiFlags::NONE, /* CAN */ 70 AsciiFlags::NONE, /* EM */ 71 AsciiFlags::NONE, /* SUB */ 72 AsciiFlags::NONE, /* ESC */ 73 AsciiFlags::NONE, /* FS */ 74 AsciiFlags::NONE, /* GS */ 75 AsciiFlags::NONE, /* RS */ 76 AsciiFlags::NONE, /* US */ 77 AsciiFlags::NONE, /* Space */ 78 AsciiFlags::NONE, /* ! */ 79 AsciiFlags::NONE, /* " */ 80 AsciiFlags::NONE, /* # */ 81 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* $ */ 82 AsciiFlags::NONE, /* % */ 83 AsciiFlags::NONE, /* & */ 84 AsciiFlags::NONE, /* ' */ 85 AsciiFlags::NONE, /* ( */ 86 AsciiFlags::NONE, /* ) */ 87 AsciiFlags::NONE, /* * */ 88 AsciiFlags::NONE, /* + */ 89 AsciiFlags::NONE, /* , */ 90 AsciiFlags::NONE, /* - */ 91 AsciiFlags::NONE, /* . */ 92 AsciiFlags::NONE, /* / */ 93 AsciiFlags::ID_CONTINUE, /* 0 */ 94 AsciiFlags::ID_CONTINUE, /* 1 */ 95 AsciiFlags::ID_CONTINUE, /* 2 */ 96 AsciiFlags::ID_CONTINUE, /* 3 */ 97 AsciiFlags::ID_CONTINUE, /* 4 */ 98 AsciiFlags::ID_CONTINUE, /* 5 */ 99 AsciiFlags::ID_CONTINUE, /* 6 */ 100 AsciiFlags::ID_CONTINUE, /* 7 */ 101 AsciiFlags::ID_CONTINUE, /* 8 */ 102 AsciiFlags::ID_CONTINUE, /* 9 */ 103 AsciiFlags::NONE, /* : */ 104 AsciiFlags::NONE, /* ; */ 105 AsciiFlags::NONE, /* < */ 106 AsciiFlags::NONE, /* = */ 107 AsciiFlags::NONE, /* > */ 108 AsciiFlags::NONE, /* ? */ 109 AsciiFlags::NONE, /* @ */ 110 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* A */ 111 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* B */ 112 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* C */ 113 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* D */ 114 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* E */ 115 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* F */ 116 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* G */ 117 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* H */ 118 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* I */ 119 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* J */ 120 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* K */ 121 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* L */ 122 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* M */ 123 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* N */ 124 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* O */ 125 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* P */ 126 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Q */ 127 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* R */ 128 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* S */ 129 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* T */ 130 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* U */ 131 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* V */ 132 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* W */ 133 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* X */ 134 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Y */ 135 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* Z */ 136 AsciiFlags::NONE, /* [ */ 137 AsciiFlags::NONE, /* \ */ 138 AsciiFlags::NONE, /* ] */ 139 AsciiFlags::NONE, /* ^ */ 140 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* _ */ 141 AsciiFlags::NONE, /* ` */ 142 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* a */ 143 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* b */ 144 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* c */ 145 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* d */ 146 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* e */ 147 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* f */ 148 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* g */ 149 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* h */ 150 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* i */ 151 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* j */ 152 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* k */ 153 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* l */ 154 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* m */ 155 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* n */ 156 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* o */ 157 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* p */ 158 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* q */ 159 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* r */ 160 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* s */ 161 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* t */ 162 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* u */ 163 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* v */ 164 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* w */ 165 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* x */ 166 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* y */ 167 AsciiFlags::ID_START | AsciiFlags::ID_CONTINUE, /* z */ 168 AsciiFlags::NONE, /* { */ 169 AsciiFlags::NONE, /* | */ 170 AsciiFlags::NONE, /* } */ 171 AsciiFlags::NONE, /* ~ */ 172 AsciiFlags::NONE /* DEL */ 173}}; 174 175bool KeywordsUtil::IsIdentifierStart(char32_t cp) 176{ 177 if (cp < LEX_ASCII_MAX_BITS) { 178 return (ASCII_FLAGS[cp] & AsciiFlags::ID_START) != 0; 179 } 180 // Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes 181 auto uchar = static_cast<UChar32>(cp); 182 return u_hasBinaryProperty(uchar, UCHAR_ID_START); 183} 184 185bool KeywordsUtil::IsIdentifierPart(char32_t cp) 186{ 187 if (cp < LEX_ASCII_MAX_BITS) { 188 return (ASCII_FLAGS[cp] & AsciiFlags::ID_CONTINUE) != 0; 189 } 190 191 /** 192 * u_isIDPart or Other_ID_Continue characters or ZWJ/ZWNJ. 193 * Unicode {xxxxx} may consist of 4 bytes information and cannot be forcibly converted to 2 bytes 194 */ 195 auto uchar = static_cast<UChar32>(cp); 196 return (u_hasBinaryProperty(uchar, UCHAR_ID_CONTINUE) || cp == LEX_CHAR_ZWNJ || cp == LEX_CHAR_ZWJ); 197} 198 199void KeywordsUtil::ScanIdentifierStart(char32_t cp) 200{ 201 if (!KeywordsUtil::IsIdentifierStart(cp)) { 202 lexer_->ThrowError("Expected an identifier"); 203 } 204 205 cp_ = cp; 206 const auto map = KeywordsMap::Map(cp); 207 ScanIdContinueMaybeKeyword(map); 208} 209 210void KeywordsUtil::ScanIdContinue() 211{ 212 util::UString ident(lexer_->Allocator()); 213 size_t startPos = lexer_->GetToken().Start().index; 214 215 if (HasEscape()) { 216 ident.Append(cp_); 217 startPos = Iterator().Index(); 218 } 219 220 auto escapeEnd = startPos; 221 222 do { 223 if (Iterator().Peek() == LEX_CHAR_BACKSLASH) { 224 ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index())); 225 226 auto cp = ScanUnicodeEscapeSequence(); 227 if (!IsIdentifierPart(cp)) { 228 lexer_->ThrowError("Invalid identifier part"); 229 } 230 231 escapeEnd = Iterator().Index(); 232 ident.Append(cp); 233 continue; 234 } 235 236 size_t cpSize {}; 237 auto cp = Iterator().PeekCp(&cpSize); 238 if (!IsIdentifierPart(cp)) { 239 break; 240 } 241 242 Iterator().Forward(cpSize); 243 } while (true); 244 245 lexer_->GetToken().type_ = TokenType::LITERAL_IDENT; 246 lexer_->GetToken().keywordType_ = TokenType::EOS; 247 248 if (HasEscape()) { 249 ident.Append(lexer_->SourceView(escapeEnd, Iterator().Index())); 250 lexer_->GetToken().src_ = ident.View(); 251 } else { 252 lexer_->GetToken().src_ = lexer_->SourceView(startPos, Iterator().Index()); 253 } 254} 255 256void KeywordsUtil::ScanIdContinueMaybeKeyword(Span<const KeywordString> map) 257{ 258 ScanIdContinue(); 259 260 if (!HasEscape() || map.empty()) { 261 return; 262 } 263 264 const auto &str = lexer_->GetToken().Ident().Utf8(); 265 266 int start = 0; 267 int end = static_cast<int>(map.size()); 268 int middle = end / 2; 269 270 while (true) { 271 const auto &kws = map[middle]; 272 273 int relation = str.compare(kws.str); 274 if (relation == 0) { 275 Keywords::SetKeyword(this, kws); 276 } 277 278 if (relation > 0) { 279 start = middle + 1; 280 } else { 281 end = middle; 282 } 283 284 middle = (start + end) / 2; 285 286 if (start >= end) { 287 return; 288 } 289 } 290} 291 292char32_t KeywordsUtil::ScanUnicodeEscapeSequence() 293{ 294 ASSERT(Iterator().Peek() == LEX_CHAR_BACKSLASH); 295 296 lexer_->GetToken().flags_ |= lexer::TokenFlags::HAS_ESCAPE; 297 298 Iterator().Forward(1); 299 300 if (Iterator().Peek() != LEX_CHAR_LOWERCASE_U) { 301 return util::StringView::Iterator::INVALID_CP; 302 } 303 304 return lexer_->ScanUnicodeEscapeSequence(); 305} 306 307} // namespace panda::es2panda::lexer 308