1/* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "ecmascript/regexp/regexp_parser.h" 17 18#include "ecmascript/base/string_helper.h" 19#include "libpandabase/utils/utils.h" 20#define _NO_DEBUG_ 21 22namespace panda::ecmascript { 23static constexpr uint32_t CACHE_SIZE = 128; 24static constexpr uint32_t CHAR_MAXS = 128; 25static constexpr uint32_t ID_START_TABLE_ASCII[4] = { 26 /* $ A-Z _ a-z */ 27 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE 28}; 29static RangeSet g_rangeD(0x30, 0x39); // NOLINTNEXTLINE(fuchsia-statically-constructed-objects) 30// NOLINTNEXTLINE(fuchsia-statically-constructed-objects) 31static RangeSet g_rangeS({ 32 std::pair<uint32_t, uint32_t>(0x0009, 0x000D), // NOLINTNEXTLINE(readability-magic-numbers) 33 std::pair<uint32_t, uint32_t>(0x0020, 0x0020), // NOLINTNEXTLINE(readability-magic-numbers) 34 std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0), // NOLINTNEXTLINE(readability-magic-numbers) 35 std::pair<uint32_t, uint32_t>(0x1680, 0x1680), // NOLINTNEXTLINE(readability-magic-numbers) 36 std::pair<uint32_t, uint32_t>(0x2000, 0x200A), // NOLINTNEXTLINE(readability-magic-numbers) 37 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ 38 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ 39 std::pair<uint32_t, uint32_t>(0x2028, 0x2029), // NOLINTNEXTLINE(readability-magic-numbers) 40 std::pair<uint32_t, uint32_t>(0x202F, 0x202F), // NOLINTNEXTLINE(readability-magic-numbers) 41 std::pair<uint32_t, uint32_t>(0x205F, 0x205F), // NOLINTNEXTLINE(readability-magic-numbers) 42 std::pair<uint32_t, uint32_t>(0x3000, 0x3000), // NOLINTNEXTLINE(readability-magic-numbers) 43 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ 44 std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF), // NOLINTNEXTLINE(readability-magic-numbers) 45}); 46 47// NOLINTNEXTLINE(fuchsia-statically-constructed-objects) 48static RangeSet g_rangeW({ 49 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers) 50 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers) 51 std::pair<uint32_t, uint32_t>(0x005F, 0x005F), // NOLINTNEXTLINE(readability-magic-numbers) 52 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers) 53}); 54 55// NOLINTNEXTLINE(fuchsia-statically-constructed-objects) 56static RangeSet g_regexpIdentifyStart({ 57 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers) 58 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers) 59 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers) 60}); 61 62// NOLINTNEXTLINE(fuchsia-statically-constructed-objects) 63static RangeSet g_regexpIdentifyContinue({ 64 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers) 65 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers) 66 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers) 67 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers) 68}); 69 70void RegExpParser::Parse() 71{ 72 // dynbuffer head init [size,capture_count,statck_count,flags,prefilter] 73 buffer_.EmitU32(0); 74 buffer_.EmitU32(0); 75 buffer_.EmitU32(0); 76 buffer_.EmitU32(0); 77 buffer_.EmitU32(0); 78 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 79 PrintF("Parse Pattern------\n"); 80 // Pattern[U, N]:: 81 // Disjunction[?U, ?N] 82 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 83 Advance(); 84 SaveStartOpCode saveStartOp; 85 int captureIndex = captureCount_++; 86 saveStartOp.EmitOpCode(&buffer_, captureIndex); 87 ParseDisjunction(false); 88 if (isError_) { 89 return; 90 } 91 if (c0_ != KEY_EOF) { 92 ParseError("extraneous characters at the end"); 93 return; 94 } 95 SaveEndOpCode saveEndOp; 96 saveEndOp.EmitOpCode(&buffer_, captureIndex); 97 MatchEndOpCode matchEndOp; 98 matchEndOp.EmitOpCode(&buffer_, 0); 99 100 uint32_t ptr = RegExpParser::OP_START_OFFSET; 101 ptr += static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_SAVE_START)->GetSize()); 102 uint8_t opCode = buffer_.GetU8(ptr); 103 uint16_t expectedChar = 0; 104 if (opCode == RegExpOpCode::OP_CHAR && !IsIgnoreCase()) { 105 expectedChar = buffer_.GetU16(ptr + 1); 106 if (expectedChar > UINT8_MAX) { 107 expectedChar = 0; 108 } 109 } 110 111 // dynbuffer head assignments 112 buffer_.PutU32(0, buffer_.size_); 113 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_); 114 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_); 115 buffer_.PutU32(FLAGS_OFFSET, flags_); 116 buffer_.PutU32(PREFILTER_OFFSET, expectedChar); 117#ifndef _NO_DEBUG_ 118 RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_); 119#endif 120} 121 122void RegExpParser::ParseDisjunction(bool isBackward) 123{ 124 // check stack overflow because infinite recursion may occur 125 DoParserStackOverflowCheck("invalid regular expression."); 126 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 127 PrintF("Parse Disjunction------\n"); 128 if (c0_ == ')') { 129 isEmpty_ = true; 130 return; 131 } 132 size_t start = buffer_.size_; 133 ParseAlternative(isBackward); 134 if (isError_) { 135 return; 136 } 137 uint32_t para = 0; 138 do { 139 if (c0_ == '|') { 140 SplitNextOpCode splitOp; 141 uint32_t len = buffer_.size_ - start; 142 GotoOpCode gotoOp; 143 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize()); 144 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize(); 145 gotoOp.UpdateOpPara(&buffer_, pos, para); 146 Advance(); 147 ParseAlternative(isBackward); 148 para = buffer_.size_ - pos - gotoOp.GetSize(); 149 if (c0_ != '|') { 150 uint16_t cnt = 0; 151 uint32_t opCharSize = 152 static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_CHAR)->GetSize()); 153 uint32_t opSplitSize = 154 static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_SPLIT_NEXT)->GetSize()); 155 std::vector<uint16_t> chars; 156 std::vector<uint32_t> offsets; 157 std::set<uint16_t> checkSet; 158 uint32_t ptr = start; 159 bool isSparseable = true; 160 do { 161 uint8_t opCode = buffer_.GetU8(ptr); 162 uint32_t offset = 0; 163 uint32_t branch = ptr; 164 bool isLastBranch = false; 165 if (opCode == RegExpOpCode::OP_SPLIT_NEXT) { 166 offset = buffer_.GetU32(ptr + 1); 167 branch = ptr + offset + opSplitSize; 168 } else { 169 isLastBranch = true; 170 } 171 uint8_t opCodeChar = buffer_.GetU8(branch); 172 if (opCodeChar == RegExpOpCode::OP_CHAR) { 173 chars.push_back(buffer_.GetU16(branch + 1)); 174 offsets.push_back(offset); 175 if (checkSet.find(chars[cnt]) != checkSet.end()) { 176 isSparseable = false; 177 break; 178 } 179 checkSet.insert(chars[cnt]); 180 } else { 181 isSparseable = false; 182 break; 183 } 184 cnt++; 185 if (isLastBranch) { 186 break; 187 } 188 ptr += opSplitSize; 189 } while (true); 190 191 if (isSparseable) { 192 uint32_t sparseLen = SPARSE_HEAD_OFFSET + static_cast<uint32_t>(cnt) * SPARSE_MAX_OFFSET; 193 uint32_t splitsLen = static_cast<uint32_t>(cnt - 1) * opSplitSize; 194 ptr = start; 195 buffer_.Insert(start, sparseLen - splitsLen); 196 pos += sparseLen - splitsLen; 197 buffer_.PutU8(ptr, RegExpOpCode::OP_SPARSE); 198 buffer_.PutU16(ptr + 1, cnt); 199 ptr += SPARSE_HEAD_OFFSET; 200 ASSERT(chars.size() > 0); 201 for (int32_t i = static_cast<int32_t>(chars.size() - 1); i >= 0; i--) { 202 buffer_.PutU16(ptr, chars[i]); 203 // 2: cnt = count of splits + 1, for invert index should be extra - 1, so -1-1=-2 204 offsets[i] += opCharSize - opSplitSize * std::max(0, cnt - i -2); 205 buffer_.PutU32(ptr + SPARSE_OFF_OFFSET, offsets[i]); 206 ptr += SPARSE_MAX_OFFSET; 207 } 208 } 209 bool isEnd = false; 210 do { 211 uint32_t paraTmp = buffer_.GetU32(pos + 1); 212 if (paraTmp == 0) { 213 isEnd = true; 214 } 215 buffer_.PutU32(pos + 1, para); 216 para += paraTmp + gotoOp.GetSize(); 217 pos -= paraTmp + gotoOp.GetSize(); 218 } while (!isEnd); 219 } 220 if (isError_) { 221 return; 222 } 223 } 224 } while (c0_ != KEY_EOF && c0_ != ')'); 225} 226 227uint32_t RegExpParser::ParseOctalLiteral() 228{ 229 // For compatibility with some other browsers (not all), we parse 230 // up to three octal digits with a value below 256. 231 // ES#prod-annexB-LegacyOctalEscapeSequence 232 uint32_t value = c0_ - '0'; 233 Advance(); 234 if (c0_ >= '0' && c0_ <= '7') { 235 value = value * OCTAL_VALUE + c0_ - '0'; 236 Advance(); 237 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') { 238 value = value * OCTAL_VALUE + c0_ - '0'; 239 Advance(); 240 } 241 } 242 return value; 243} 244 245bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value) 246{ 247 uint32_t x = 0; 248 int d = static_cast<int>(HexValue(c0_)); 249 if (d < 0) { 250 return false; 251 } 252 while (d >= 0) { 253 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) { 254 LOG_FULL(FATAL) << "value overflow"; 255 return false; 256 } 257 x = x * HEX_VALUE + static_cast<uint32_t>(d); 258 if (x > maxValue) { 259 return false; 260 } 261 Advance(); 262 d = static_cast<int>(HexValue(c0_)); 263 } 264 *value = x; 265 return true; 266} 267 268// This parses RegExpUnicodeEscapeSequence as described in ECMA262. 269bool RegExpParser::ParseUnicodeEscape(uint32_t *value) 270{ 271 // Accept both \uxxxx and \u{xxxxxx} (if allowed). 272 // In the latter case, the number of hex digits between { } is arbitrary. 273 // \ and u have already been read. 274 if (c0_ == '{' && IsUtf16()) { 275 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 276 Advance(); 277 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINTNEXTLINE(readability-magic-numbers) 278 if (c0_ == '}') { 279 Advance(); 280 return true; 281 } 282 } 283 pc_ = start; 284 Advance(); 285 return false; 286 } 287 // \u but no {, or \u{...} escapes not allowed. 288 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value); 289 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') { 290 // Attempt to read trail surrogate. 291 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 292 if (*pc_ == 'u') { 293 Advance(UNICODE_HEX_ADVANCE); 294 uint32_t trail = 0; 295 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) { 296 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINTNEXTLINE(hicpp-signed-bitwise) 297 return true; 298 } 299 } 300 pc_ = start; 301 Advance(); 302 } 303 return result; 304} 305 306bool RegExpParser::ParseHexEscape(int length, uint32_t *value) 307{ 308 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 309 uint32_t val = 0; 310 for (int i = 0; i < length; ++i) { 311 uint32_t c = c0_; 312 int d = static_cast<int>(HexValue(c)); 313 if (d < 0) { 314 pc_ = start; 315 Advance(); 316 return false; 317 } 318 val = val * HEX_VALUE + static_cast<uint32_t>(d); 319 Advance(); 320 } 321 *value = val; 322 return true; 323} 324 325// NOLINTNEXTLINE(readability-function-size) 326void RegExpParser::ParseAlternative(bool isBackward) 327{ 328 size_t start = buffer_.size_; 329 while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') { 330 if (isError_) { 331 return; 332 } 333 size_t atomBcStart = buffer_.GetSize(); 334 int captureIndex = 0; 335 bool isAtom = false; 336 switch (c0_) { 337 case '^': { 338 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 339 PrintF("Assertion %c line start \n", c0_); 340 LineStartOpCode lineStartOp; 341 lineStartOp.EmitOpCode(&buffer_, 0); 342 Advance(); 343 break; 344 } 345 case '$': { 346 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 347 PrintF("Assertion %c line end \n", c0_); 348 LineEndOpCode lineEndOp; 349 lineEndOp.EmitOpCode(&buffer_, 0); 350 Advance(); 351 break; 352 } 353 case '\\': { 354 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 355 PrintF("Escape %c \n", c0_); 356 Advance(); 357 switch (c0_) { 358 case 'b': { 359 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 360 PrintF("Assertion %c \n", c0_); 361 WordBoundaryOpCode wordBoundaryOp; 362 wordBoundaryOp.EmitOpCode(&buffer_, 0); 363 Advance(); 364 break; 365 } 366 case 'B': { 367 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 368 PrintF("Assertion %c \n", c0_); 369 NotWordBoundaryOpCode notWordBoundaryOp; 370 notWordBoundaryOp.EmitOpCode(&buffer_, 0); 371 Advance(); 372 break; 373 } 374 default: { 375 isAtom = true; 376 int atomValue = ParseAtomEscape(isBackward); 377 if (atomValue != -1) { 378 PrevOpCode prevOp; 379 if (isBackward) { 380 prevOp.EmitOpCode(&buffer_, 0); 381 } 382 if (IsIgnoreCase()) { 383 if (!IsUtf16()) { 384 atomValue = Canonicalize(atomValue, false); 385 } else { 386 icu::UnicodeSet set(atomValue, atomValue); 387 set.closeOver(USET_CASE_INSENSITIVE); 388 set.removeAllStrings(); 389 uint32_t size = static_cast<uint32_t>(set.size()); 390 RangeOpCode rangeOp; 391 RangeSet rangeResult; 392 for (uint32_t idx = 0; idx < size; idx++) { 393 int32_t uc = set.charAt(idx); 394 RangeSet curRange(uc); 395 rangeResult.Insert(curRange); 396 } 397 rangeOp.InsertOpCode(&buffer_, rangeResult); 398 break; 399 } 400 } 401 if (atomValue <= UINT16_MAX) { 402 CharOpCode charOp; 403 charOp.EmitOpCode(&buffer_, atomValue); 404 } else { 405 Char32OpCode charOp; 406 charOp.EmitOpCode(&buffer_, atomValue); 407 } 408 if (isBackward) { 409 prevOp.EmitOpCode(&buffer_, 0); 410 } 411 } 412 break; 413 } 414 } 415 break; 416 } 417 case '(': { 418 Advance(); 419 isAtom = ParseAssertionCapture(&captureIndex, isBackward); 420 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 421 Advance(); 422 break; 423 } 424 case '.': { 425 PrevOpCode prevOp; 426 if (isBackward) { 427 prevOp.EmitOpCode(&buffer_, 0); 428 } 429 if (IsDotAll()) { 430 AllOpCode allOp; 431 allOp.EmitOpCode(&buffer_, 0); 432 } else { 433 DotsOpCode dotsOp; 434 dotsOp.EmitOpCode(&buffer_, 0); 435 } 436 if (isBackward) { 437 prevOp.EmitOpCode(&buffer_, 0); 438 } 439 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 440 PrintF("Atom %c match any \n", c0_); 441 isAtom = true; 442 Advance(); 443 break; 444 } 445 case '[': { 446 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 447 PrintF("Atom %c match range \n", c0_); 448 isAtom = true; 449 PrevOpCode prevOp; 450 Advance(); 451 if (isBackward) { 452 prevOp.EmitOpCode(&buffer_, 0); 453 } 454 bool isInvert = false; 455 if (c0_ == '^') { 456 isInvert = true; 457 Advance(); 458 } 459 RangeSet rangeResult; 460 if (!ParseClassRanges(&rangeResult)) { 461 break; 462 } 463 if (isInvert) { 464 rangeResult.Invert(IsUtf16()); 465 } 466 uint32_t highValue = rangeResult.HighestValue(); 467 if (highValue <= UINT16_MAX) { 468 RangeOpCode rangeOp; 469 rangeOp.InsertOpCode(&buffer_, rangeResult); 470 } else { 471 Range32OpCode rangeOp; 472 rangeOp.InsertOpCode(&buffer_, rangeResult); 473 } 474 475 if (isBackward) { 476 prevOp.EmitOpCode(&buffer_, 0); 477 } 478 break; 479 } 480 case '*': 481 case '+': 482 case '?': 483 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 484 ParseError("nothing to repeat"); 485 return; 486 case '{': { 487 uint8_t *begin = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 488 int dummy; 489 if (ParserIntervalQuantifier(&dummy, &dummy)) { 490 ParseError("nothing to repeat"); 491 return; 492 } 493 pc_ = begin; 494 Advance(); 495 } 496 [[fallthrough]]; 497 case '}': 498 case ']': 499 if (IsUtf16()) { 500 ParseError("syntax error"); 501 return; 502 } 503 [[fallthrough]]; 504 default: { 505 // PatternCharacter 506 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 507 PrintF("PatternCharacter %c\n", c0_); 508 isAtom = true; 509 { 510 PrevOpCode prevOp; 511 if (isBackward) { 512 prevOp.EmitOpCode(&buffer_, 0); 513 } 514 uint32_t matchedChar = c0_; 515 if (c0_ > (INT8_MAX + 1)) { 516 Prev(); 517 UChar32 c; 518 int32_t length = end_ - pc_ + 1; 519 // NOLINTNEXTLINE(hicpp-signed-bitwise) 520 auto unicodeChar = base::utf_helper::ConvertUtf8ToUnicodeChar(pc_, length); 521 c = unicodeChar.first; 522 matchedChar = static_cast<uint32_t>(c); 523 pc_ += unicodeChar.second; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 524 } 525 if (IsIgnoreCase()) { 526 matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16())); 527 } 528 if (matchedChar > UINT16_MAX) { 529 Char32OpCode charOp; 530 charOp.EmitOpCode(&buffer_, matchedChar); 531 } else { 532 CharOpCode charOp; 533 charOp.EmitOpCode(&buffer_, matchedChar); 534 } 535 if (isBackward) { 536 prevOp.EmitOpCode(&buffer_, 0); 537 } 538 } 539 Advance(); 540 break; 541 } 542 } 543 if (isAtom && !isError_) { 544 ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1); 545 } 546 if (isBackward) { 547 size_t end = buffer_.GetSize(); 548 size_t termSize = end - atomBcStart; 549 size_t moveSize = end - start; 550 buffer_.Expand(end + termSize); 551 if (memmove_s(buffer_.buf_ + start + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 552 termSize, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 553 moveSize, 554 buffer_.buf_ + start, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 555 moveSize) != EOK) { 556 LOG_FULL(FATAL) << "memmove_s failed"; 557 UNREACHABLE(); 558 } 559 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 560 if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) { 561 LOG_FULL(FATAL) << "memcpy_s failed"; 562 UNREACHABLE(); 563 } 564 } 565 } 566} 567 568int RegExpParser::FindGroupName(const CString &name) 569{ 570 size_t len = 0; 571 size_t nameLen = name.size(); 572 const char *p = reinterpret_cast<char *>(groupNames_.buf_); 573 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 574 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_; 575 int captureIndex = 1; 576 while (p < bufEnd) { 577 len = strlen(p); 578 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) { 579 return captureIndex; 580 } 581 p += len + 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 582 captureIndex++; 583 } 584 return -1; 585} 586 587bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward) 588{ 589 bool isAtom = false; 590 do { 591 if (c0_ == '?') { 592 Advance(); 593 switch (c0_) { 594 // (?=Disjunction[?U, ?N]) 595 case '=': { 596 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 597 PrintF("Assertion(?= Disjunction)\n"); 598 Advance(); 599 uint32_t start = buffer_.size_; 600 ParseDisjunction(isBackward); 601 MatchOpCode matchOp; 602 matchOp.EmitOpCode(&buffer_, 0); 603 MatchAheadOpCode matchAheadOp; 604 uint32_t len = buffer_.size_ - start; 605 matchAheadOp.InsertOpCode(&buffer_, start, len); 606 break; 607 } 608 // (?!Disjunction[?U, ?N]) 609 case '!': { 610 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 611 PrintF("Assertion(?! Disjunction)\n"); 612 uint32_t start = buffer_.size_; 613 Advance(); 614 ParseDisjunction(isBackward); 615 MatchOpCode matchOp; 616 matchOp.EmitOpCode(&buffer_, 0); 617 NegativeMatchAheadOpCode matchAheadOp; 618 uint32_t len = buffer_.size_ - start; 619 matchAheadOp.InsertOpCode(&buffer_, start, len); 620 break; 621 } 622 case '<': { 623 Advance(); 624 // (?<=Disjunction[?U, ?N]) 625 if (c0_ == '=') { 626 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 627 PrintF("Assertion(?<= Disjunction)\n"); 628 Advance(); 629 uint32_t start = buffer_.size_; 630 ParseDisjunction(true); 631 MatchOpCode matchOp; 632 matchOp.EmitOpCode(&buffer_, 0); 633 MatchAheadOpCode matchAheadOp; 634 uint32_t len = buffer_.size_ - start; 635 matchAheadOp.InsertOpCode(&buffer_, start, len); 636 // (?<!Disjunction[?U, ?N]) 637 } else if (c0_ == '!') { 638 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 639 PrintF("Assertion(?<! Disjunction)\n"); 640 Advance(); 641 uint32_t start = buffer_.size_; 642 ParseDisjunction(true); 643 MatchOpCode matchOp; 644 matchOp.EmitOpCode(&buffer_, 0); 645 NegativeMatchAheadOpCode matchAheadOp; 646 uint32_t len = buffer_.size_ - start; 647 matchAheadOp.InsertOpCode(&buffer_, start, len); 648 } else { 649 Prev(); 650 CString name; 651 auto **pp = const_cast<const uint8_t **>(&pc_); 652 if (!ParseGroupSpecifier(pp, name)) { 653 ParseError("GroupName Syntax error."); 654 return false; 655 } 656 if (FindGroupName(name) > 0) { 657 ParseError("Duplicate GroupName error."); 658 return false; 659 } 660 groupNames_.EmitStr(name.c_str()); 661 newGroupNames_.push_back(name); 662 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 663 PrintF("group name %s", name.c_str()); 664 Advance(); 665 goto parseCapture; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) 666 } 667 break; 668 } 669 // (?:Disjunction[?U, ?N]) 670 case ':': 671 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 672 PrintF("Atom(?<: Disjunction)\n"); 673 isAtom = true; 674 Advance(); 675 ParseDisjunction(isBackward); 676 break; 677 default: 678 Advance(); 679 ParseError("? Syntax error."); 680 return false; 681 } 682 if (isError_) { 683 return false; 684 } 685 } else { 686 groupNames_.EmitChar(0); 687 parseCapture: 688 isAtom = true; 689 *captureIndex = captureCount_++; 690 SaveEndOpCode saveEndOp; 691 SaveStartOpCode saveStartOp; 692 if (isBackward) { 693 saveEndOp.EmitOpCode(&buffer_, *captureIndex); 694 } else { 695 saveStartOp.EmitOpCode(&buffer_, *captureIndex); 696 } 697 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 698 PrintF("capture start %d \n", *captureIndex); 699 ParseDisjunction(isBackward); 700 if (isError_) { 701 return false; 702 } 703 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 704 PrintF("capture end %d \n", *captureIndex); 705 if (isBackward) { 706 saveStartOp.EmitOpCode(&buffer_, *captureIndex); 707 } else { 708 saveEndOp.EmitOpCode(&buffer_, *captureIndex); 709 } 710 } 711 } while (c0_ != ')' && c0_ != KEY_EOF); 712 if (c0_ != ')') { 713 ParseError("capture syntax error"); 714 return false; 715 } 716 return isAtom; 717} 718 719int RegExpParser::ParseDecimalDigits() 720{ 721 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 722 PrintF("Parse DecimalDigits------\n"); 723 uint32_t result = 0; 724 bool overflow = false; 725 while (true) { 726 if (c0_ < '0' || c0_ > '9') { 727 break; 728 } 729 if (!overflow) { 730 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) { 731 overflow = true; 732 } else { 733 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0'; 734 } 735 } 736 Advance(); 737 } 738 if (overflow) { 739 return INT32_MAX; 740 } 741 return result; 742} 743 744bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax) 745{ 746 // Quantifier:: 747 // QuantifierPrefix 748 // QuantifierPrefix? 749 // QuantifierPrefix:: 750 // * 751 // + 752 // ? 753 // {DecimalDigits} 754 // {DecimalDigits,} 755 // {DecimalDigits,DecimalDigits} 756 Advance(); 757 *pmin = ParseDecimalDigits(); 758 *pmax = *pmin; 759 switch (c0_) { 760 case ',': { 761 Advance(); 762 if (c0_ == '}') { 763 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 764 PrintF("QuantifierPrefix{DecimalDigits,}\n"); 765 *pmax = INT32_MAX; 766 Advance(); 767 } else { 768 *pmax = ParseDecimalDigits(); 769 if (c0_ == '}') { 770 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 771 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n"); 772 Advance(); 773 } else { 774 return false; 775 } 776 } 777 break; 778 } 779 case '}': 780 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 781 PrintF("QuantifierPrefix{DecimalDigits}\n"); 782 Advance(); 783 break; 784 default: 785 Advance(); 786 return false; 787 } 788 return true; 789} 790 791void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd) 792{ 793 int min = -1; 794 int max = -1; 795 bool isGreedy = true; 796 switch (c0_) { 797 case '*': 798 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 799 PrintF("QuantifierPrefix %c\n", c0_); 800 min = 0; 801 max = INT32_MAX; 802 Advance(); 803 break; 804 case '+': 805 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 806 PrintF("QuantifierPrefix %c\n", c0_); 807 min = 1; 808 max = INT32_MAX; 809 Advance(); 810 break; 811 case '?': 812 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 813 PrintF("QuantifierPrefix %c\n", c0_); 814 Advance(); 815 min = 0; 816 max = 1; 817 break; 818 case '{': { 819 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 820 if (!ParserIntervalQuantifier(&min, &max)) { 821 pc_ = start; 822 Advance(); // back to '{' 823 return; 824 } 825 if (min > max) { 826 ParseError("Invalid repetition count"); 827 return; 828 } 829 break; 830 } 831 default: 832 break; 833 } 834 if (c0_ == '?') { 835 isGreedy = false; 836 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 837 PrintF("Quantifier::QuantifierPrefix?\n"); 838 Advance(); 839 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') { 840 ParseError("nothing to repeat"); 841 return; 842 } 843 844 if (max == 0) { 845 buffer_.size_ = atomBcStart; // Drop all unnecessary bytecode 846 } else if (min != -1 && max != -1 && !isEmpty_) { 847 bool isLoopOp = false; 848 size_t checkCharPara = SIZE_MAX; 849 850 if (captureStart != 0) { 851 SaveResetOpCode saveResetOp; 852 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd); 853 } 854 855 // zero advance check 856 uint8_t firstOp = buffer_.GetU8(atomBcStart); 857 if (max == INT32_MAX && firstOp != RegExpOpCode::OP_CHAR && firstOp != RegExpOpCode::OP_CHAR32 && 858 firstOp != RegExpOpCode::OP_RANGE && firstOp != RegExpOpCode::OP_RANGE32 && 859 firstOp != RegExpOpCode::OP_ALL && firstOp != RegExpOpCode::OP_DOTS && 860 firstOp != RegExpOpCode::OP_SPARSE) { 861 stackCount_++; 862 PushCharOpCode pushCharOp; 863 pushCharOp.InsertOpCode(&buffer_, atomBcStart); 864 CheckCharOpCode checkCharOp; 865 checkCharPara = buffer_.GetSize() + 1; 866 // NOLINTNEXTLINE(readability-magic-numbers) 867 checkCharOp.EmitOpCode(&buffer_, 0); 868 } 869 870 if (min <= 1 && max == INT32_MAX) { 871 if (checkCharPara != SIZE_MAX) { 872 buffer_.PutU32(checkCharPara, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_SPLIT_NEXT)->GetSize()); 873 } 874 if (isGreedy) { 875 SplitFirstOpCode splitOp; 876 splitOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - splitOp.GetSize()); 877 } else { 878 SplitNextOpCode splitOp; 879 splitOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - splitOp.GetSize()); 880 } 881 } else if (max > 1) { 882 if (checkCharPara != SIZE_MAX) { 883 buffer_.PutU32(checkCharPara, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize()); 884 } 885 if (isGreedy) { 886 LoopGreedyOpCode loopOp; 887 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max); 888 isLoopOp = true; 889 } else { 890 LoopOpCode loopOp; 891 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max); 892 isLoopOp = true; 893 } 894 } 895 896 if (min == 0) { 897 if (isGreedy) { 898 SplitNextOpCode splitNextOp; 899 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart); 900 } else { 901 SplitFirstOpCode splitFirstOp; 902 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart); 903 } 904 } 905 if (isLoopOp) { 906 stackCount_++; 907 PushOpCode pushOp; 908 pushOp.InsertOpCode(&buffer_, atomBcStart); 909 PopOpCode popOp; 910 popOp.EmitOpCode(&buffer_); 911 } 912 } 913 isEmpty_ = false; 914} 915 916bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name) 917{ 918 const uint8_t *p = *pp; 919 uint32_t c = 0; 920 char buffer[CACHE_SIZE] = {0}; 921 char *q = buffer; 922 while (true) { 923 if (p <= end_) { 924 c = *p; 925 } else { 926 c = KEY_EOF; 927 } 928 if (c == '\\') { 929 p++; 930 if (*p != 'u') { 931 return false; 932 } 933 if (!ParseUnicodeEscape(&c)) { 934 return false; 935 } 936 } else if (c == '>') { 937 break; 938 } else if (c > CACHE_SIZE && c != KEY_EOF) { 939 c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p)); 940 } else if (c != KEY_EOF) { 941 p++; 942 } else { 943 return false; 944 } 945 if (q == buffer) { 946 if (!IsIdentFirst(c)) { 947 return false; 948 } 949 } else { 950 if (!u_isIDPart(c)) { 951 return false; 952 } 953 } 954 if (q != nullptr) { 955 *q++ = c; 956 } 957 } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 958 p++; 959 *pp = p; 960 name = buffer; 961 return true; 962} 963 964int RegExpParser::ParseCaptureCount(const char *groupName) 965{ 966 const uint8_t *p = nullptr; 967 int captureIndex = 1; 968 CString name; 969 hasNamedCaptures_ = 0; 970 for (p = base_; p < end_; p++) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 971 switch (*p) { 972 case '(': { 973 if (p[1] == '?') { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 974 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 975 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' && 976 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 977 p[CAPTURE_CONUT_ADVANCE] != '=') { 978 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 979 hasNamedCaptures_ = 1; 980 p += CAPTURE_CONUT_ADVANCE; 981 if (groupName != nullptr) { 982 if (ParseGroupSpecifier(&p, name)) { 983 if (strcmp(name.c_str(), groupName) == 0) { 984 return captureIndex; 985 } 986 } 987 } 988 captureIndex++; 989 } 990 } else { 991 captureIndex++; 992 } 993 break; 994 } 995 case '\\': 996 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 997 break; 998 case '[': { 999 while (p < end_ && *p != ']') { 1000 if (*p == '\\') { 1001 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 1002 } 1003 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 1004 } 1005 break; 1006 } 1007 default: 1008 break; 1009 } 1010 } 1011 return captureIndex; 1012} 1013 1014// NOLINTNEXTLINE(readability-function-size) 1015int RegExpParser::ParseAtomEscape(bool isBackward) 1016{ 1017 // AtomEscape[U, N]:: 1018 // DecimalEscape 1019 // CharacterClassEscape[?U] 1020 // CharacterEscape[?U] 1021 // [+N]kGroupName[?U] 1022 int result = -1; 1023 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1024 PrintF("Parse AtomEscape------\n"); 1025 PrevOpCode prevOp; 1026 switch (c0_) { 1027 case KEY_EOF: 1028 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1029 ParseError("unexpected end"); 1030 break; 1031 // DecimalEscape 1032 case '1': 1033 case '2': 1034 case '3': 1035 case '4': 1036 case '5': 1037 case '6': 1038 case '7': 1039 case '8': 1040 case '9': { 1041 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1042 PrintF("NonZeroDigit %c\n", c0_); 1043 int capture = ParseDecimalDigits(); 1044 if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) { 1045 ParseError("invalid backreference count"); 1046 break; 1047 } 1048 if (isBackward) { 1049 BackwardBackReferenceOpCode backReferenceOp; 1050 backReferenceOp.EmitOpCode(&buffer_, capture); 1051 } else { 1052 BackReferenceOpCode backReferenceOp; 1053 backReferenceOp.EmitOpCode(&buffer_, capture); 1054 } 1055 break; 1056 } 1057 // CharacterClassEscape 1058 case 'd': { 1059 // [0-9] 1060 RangeOpCode rangeOp; 1061 if (isBackward) { 1062 prevOp.EmitOpCode(&buffer_, 0); 1063 } 1064 rangeOp.InsertOpCode(&buffer_, g_rangeD); 1065 goto parseLookBehind; 1066 } 1067 case 'D': { 1068 // [^0-9] 1069 RangeSet atomRange(g_rangeD); 1070 atomRange.Invert(IsUtf16()); 1071 Range32OpCode rangeOp; 1072 if (isBackward) { 1073 prevOp.EmitOpCode(&buffer_, 0); 1074 } 1075 rangeOp.InsertOpCode(&buffer_, atomRange); 1076 goto parseLookBehind; 1077 } 1078 case 's': { 1079 // [\f\n\r\t\v] 1080 RangeOpCode rangeOp; 1081 if (isBackward) { 1082 prevOp.EmitOpCode(&buffer_, 0); 1083 } 1084 rangeOp.InsertOpCode(&buffer_, g_rangeS); 1085 goto parseLookBehind; 1086 } 1087 case 'S': { 1088 RangeSet atomRange(g_rangeS); 1089 Range32OpCode rangeOp; 1090 atomRange.Invert(IsUtf16()); 1091 if (isBackward) { 1092 prevOp.EmitOpCode(&buffer_, 0); 1093 } 1094 rangeOp.InsertOpCode(&buffer_, atomRange); 1095 goto parseLookBehind; 1096 } 1097 case 'w': { 1098 // [A-Za-z0-9] 1099 RangeOpCode rangeOp; 1100 if (isBackward) { 1101 prevOp.EmitOpCode(&buffer_, 0); 1102 } 1103 rangeOp.InsertOpCode(&buffer_, g_rangeW); 1104 goto parseLookBehind; 1105 } 1106 case 'W': { 1107 // [^A-Za-z0-9] 1108 RangeSet atomRange(g_rangeW); 1109 atomRange.Invert(IsUtf16()); 1110 Range32OpCode rangeOp; 1111 if (isBackward) { 1112 prevOp.EmitOpCode(&buffer_, 0); 1113 } 1114 rangeOp.InsertOpCode(&buffer_, atomRange); 1115 goto parseLookBehind; 1116 } 1117 case 'P': 1118 case 'p': { 1119 //CharacterClassStrings 1120 RangeSet atomRange; 1121 Range32OpCode rangeOp; 1122 ParseClassEscape(&atomRange); 1123 if (isBackward) { 1124 prevOp.EmitOpCode(&buffer_, 0); 1125 } 1126 rangeOp.InsertOpCode(&buffer_, atomRange); 1127 break; 1128 } 1129 // [+N]kGroupName[?U] 1130 case 'k': { 1131 Advance(); 1132 if (c0_ != '<') { 1133 if (!IsUtf16() || HasNamedCaptures()) { 1134 ParseError("expecting group name."); 1135 break; 1136 } 1137 } 1138 Advance(); 1139 Prev(); 1140 CString name; 1141 auto **pp = const_cast<const uint8_t **>(&pc_); 1142 if (!ParseGroupSpecifier(pp, name)) { 1143 ParseError("GroupName Syntax error."); 1144 break; 1145 } 1146 int postion = FindGroupName(name); 1147 if (postion < 0) { 1148 postion = ParseCaptureCount(name.c_str()); 1149 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) { 1150 ParseError("group name not defined"); 1151 break; 1152 } 1153 } 1154 if (isBackward) { 1155 BackwardBackReferenceOpCode backReferenceOp; 1156 backReferenceOp.EmitOpCode(&buffer_, postion); 1157 } else { 1158 BackReferenceOpCode backReferenceOp; 1159 backReferenceOp.EmitOpCode(&buffer_, postion); 1160 } 1161 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1162 Advance(); 1163 break; 1164 } 1165 parseLookBehind: { 1166 if (isBackward) { 1167 prevOp.EmitOpCode(&buffer_, 0); 1168 } 1169 Advance(); 1170 break; 1171 } 1172 default: 1173 result = ParseCharacterEscape(); 1174 break; 1175 } 1176 return result; 1177} 1178 1179int RegExpParser::RecountCaptures() 1180{ 1181 if (totalCaptureCount_ < 0) { 1182 const char *name = reinterpret_cast<const char*>(groupNames_.buf_); 1183 totalCaptureCount_ = ParseCaptureCount(name); 1184 } 1185 return totalCaptureCount_; 1186} 1187bool RegExpParser::HasNamedCaptures() 1188{ 1189 if (hasNamedCaptures_ < 0) { 1190 RecountCaptures(); 1191 } 1192 return false; 1193} 1194 1195int RegExpParser::ParseCharacterEscape() 1196{ 1197 // CharacterEscape[U]:: 1198 // ControlEscape 1199 // c ControlLetter 1200 // 0 [lookahead ? DecimalDigit] 1201 // HexEscapeSequence 1202 // RegExpUnicodeEscapeSequence[?U] 1203 // IdentityEscape[?U] 1204 uint32_t result = 0; 1205 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1206 switch (c0_) { 1207 // ControlEscape 1208 case 'f': 1209 result = '\f'; 1210 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1211 PrintF("ControlEscape %c\n", c0_); 1212 Advance(); 1213 break; 1214 case 'n': 1215 result = '\n'; 1216 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1217 PrintF("ControlEscape %c\n", c0_); 1218 Advance(); 1219 break; 1220 case 'r': 1221 result = '\r'; 1222 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1223 PrintF("ControlEscape %c\n", c0_); 1224 Advance(); 1225 break; 1226 case 't': 1227 result = '\t'; 1228 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1229 PrintF("ControlEscape %c\n", c0_); 1230 Advance(); 1231 break; 1232 case 'v': 1233 result = '\v'; 1234 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1235 PrintF("ControlEscape %c\n", c0_); 1236 Advance(); 1237 break; 1238 // c ControlLetter 1239 case 'c': { 1240 Advance(); 1241 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) { 1242 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1243 PrintF("ControlLetter %c\n", c0_); 1244 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINTNEXTLINE(readability-magic-numbers) 1245 Advance(); 1246 } else { 1247 if (!IsUtf16()) { 1248 pc_--; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 1249 result = '\\'; 1250 } else { 1251 ParseError("Invalid control letter"); 1252 return -1; 1253 } 1254 } 1255 break; 1256 } 1257 case '0': { 1258 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1259 PrintF("CharacterEscape 0 [lookahead ? DecimalDigit]\n"); 1260 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINTNEXTLINE(readability-magic-numbers) 1261 Advance(); 1262 result = 0; 1263 break; 1264 } 1265 [[fallthrough]]; 1266 } 1267 case '1': 1268 case '2': 1269 case '3': 1270 case '4': 1271 case '5': 1272 case '6': 1273 case '7': { 1274 if (IsUtf16()) { 1275 // With /u, decimal escape is not interpreted as octal character code. 1276 ParseError("Invalid class escape"); 1277 return 0; 1278 } 1279 result = ParseOctalLiteral(); 1280 break; 1281 } 1282 // ParseHexEscapeSequence 1283 // ParseRegExpUnicodeEscapeSequence 1284 case 'x': { 1285 Advance(); 1286 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) { 1287 return result; 1288 } 1289 if (IsUtf16()) { 1290 ParseError("Invalid class escape"); 1291 return -1; 1292 } 1293 result = 'x'; 1294 break; 1295 } 1296 case 'u': { 1297 Advance(); 1298 if (ParseUnicodeEscape(&result)) { 1299 return result; 1300 } 1301 if (IsUtf16()) { 1302 // With /u, invalid escapes are not treated as identity escapes. 1303 ParseError("Invalid unicode escape"); 1304 return 0; 1305 } 1306 // If \u is not followed by a two-digit hexadecimal, treat it 1307 // as an identity escape. 1308 result = 'u'; 1309 break; 1310 } 1311 // IdentityEscape[?U] 1312 case '$': 1313 case '(': 1314 case ')': 1315 case '*': 1316 case '+': 1317 case '.': 1318 case '/': 1319 case '?': 1320 case '[': 1321 case '\\': 1322 case ']': 1323 case '^': 1324 case '{': 1325 case '|': 1326 case '}': 1327 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1328 PrintF("IdentityEscape %c\n", c0_); 1329 result = c0_; 1330 Advance(); 1331 break; 1332 default: { 1333 if (IsUtf16()) { 1334 ParseError("Invalid unicode escape"); 1335 return 0; 1336 } 1337 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1338 PrintF("SourceCharacter %c\n", c0_); 1339 result = c0_; 1340 if (result < CHAR_MAXS) { 1341 Advance(); 1342 } else { 1343 Prev(); 1344 const uint8_t *p = pc_; 1345 result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p)); 1346 int offset = static_cast<int>(p - pc_); 1347 Advance(offset + 1); 1348 } 1349 break; 1350 } 1351 } 1352 return static_cast<int>(result); 1353} 1354 1355bool RegExpParser::ParseClassRanges(RangeSet *result) 1356{ 1357 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1358 PrintF("Parse ClassRanges------\n"); 1359 while (c0_ != ']') { 1360 RangeSet s1; 1361 bool needInter = false; 1362 uint32_t c1 = ParseClassAtom(&s1); 1363 if (c1 == UINT32_MAX) { 1364 ParseError("invalid class range"); 1365 return false; 1366 } 1367 needInter = NeedIntersection(c1); 1368 int next_c0 = *pc_; 1369 if (c0_ == '-' && next_c0 != ']') { 1370 if (c1 == CLASS_RANGE_BASE) { 1371 if (IsUtf16()) { 1372 ParseError("invalid class range"); 1373 return false; 1374 } 1375 result->Insert(s1); 1376 continue; 1377 } 1378 Advance(); 1379 RangeSet s2; 1380 uint32_t c2 = ParseClassAtom(&s2); 1381 if (c2 == UINT32_MAX) { 1382 ParseError("invalid class range"); 1383 return false; 1384 } 1385 if (c2 == CLASS_RANGE_BASE) { 1386 if (IsUtf16()) { 1387 ParseError("invalid class range"); 1388 return false; 1389 } 1390 result->Insert(s2); 1391 continue; 1392 } 1393 if (c1 < INT8_MAX) { 1394 if (c1 > c2) { 1395 ParseError("invalid class range"); 1396 return false; 1397 } 1398 } 1399 needInter = NeedIntersection(c2); 1400 result->Insert(c1, c2); 1401 if (IsIgnoreCase() && needInter) { 1402 ProcessIntersection(result); 1403 } 1404 } else { 1405 result->Insert(s1); 1406 if (!(IsIgnoreCase() && needInter)) { 1407 continue; 1408 } 1409 if (c1 <= 'z' && c1 >= 'a') { 1410 result->Insert(RangeSet(c1 - 'a' + 'A')); 1411 } else { 1412 result->Insert(RangeSet(c1 - 'A' + 'a')); 1413 } 1414 } 1415 } 1416 Advance(); 1417 return true; 1418} 1419 1420uint32_t RegExpParser::ParseClassAtom(RangeSet *atom) 1421{ 1422 uint32_t ret = UINT32_MAX; 1423 switch (c0_) { 1424 case '\\': { 1425 Advance(); 1426 ret = static_cast<uint32_t>(ParseClassEscape(atom)); 1427 break; 1428 } 1429 case KEY_EOF: 1430 break; 1431 case 0: { 1432 if (pc_ >= end_) { 1433 return UINT32_MAX; 1434 } 1435 [[fallthrough]]; 1436 } 1437 default: { 1438 uint32_t value = c0_; 1439 size_t u16_size = 0; 1440 if (c0_ > INT8_MAX) { // NOLINTNEXTLINE(readability-magic-numbers) 1441 pc_ -= 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 1442 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true); 1443 value = u16_result.first; 1444 u16_size = u16_result.second; 1445 Advance(u16_size + 1); 1446 } else { 1447 Advance(); 1448 } 1449 atom->Insert(RangeSet(value)); 1450 ret = value; 1451 break; 1452 } 1453 } 1454 return ret; 1455} 1456 1457int RegExpParser::ParseClassEscape(RangeSet *atom) 1458{ 1459 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1460 PrintF("Parse ClassEscape------\n"); 1461 int result = -1; 1462 switch (c0_) { 1463 case 'b': 1464 Advance(); 1465 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1466 PrintF("ClassEscape %c", 'b'); 1467 result = '\b'; 1468 atom->Insert(RangeSet(static_cast<uint32_t>('\b'))); 1469 break; 1470 case '-': 1471 Advance(); 1472 result = '-'; 1473 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1474 PrintF("ClassEscape %c", '-'); 1475 atom->Insert(RangeSet(static_cast<uint32_t>('-'))); 1476 break; 1477 // CharacterClassEscape 1478 case 'd': 1479 case 'D': 1480 result = CLASS_RANGE_BASE; 1481 atom->Insert(g_rangeD); 1482 if (c0_ == 'D') { 1483 atom->Invert(IsUtf16()); 1484 } 1485 Advance(); 1486 break; 1487 case 's': 1488 case 'S': 1489 result = CLASS_RANGE_BASE; 1490 atom->Insert(g_rangeS); 1491 if (c0_ == 'S') { 1492 atom->Invert(IsUtf16()); 1493 } 1494 Advance(); 1495 break; 1496 case 'w': 1497 case 'W': 1498 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1499 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_); 1500 result = CLASS_RANGE_BASE; 1501 atom->Insert(g_rangeW); 1502 if (c0_ == 'W') { 1503 atom->Invert(IsUtf16()); 1504 } 1505 Advance(); 1506 break; 1507 case 'P': 1508 case 'p': { 1509 bool negate = (c0_ == 'P'); 1510 CString propertyName; 1511 CString valueName; 1512 if (!ParseUnicodePropertyValueCharacters(propertyName, valueName) || 1513 !ParseUnicodePropertyClassRange(propertyName, valueName, atom, negate)) { 1514 char *originExpression = (char *)base_; 1515 CString expression(originExpression); 1516 CString msg = "Invalid regular expression :" + expression; 1517 ParseError(msg.c_str()); 1518 } 1519 result = CLASS_RANGE_BASE; 1520 break; 1521 } 1522 default: 1523 result = ParseCharacterEscape(); 1524 int value = result; 1525 if (IsIgnoreCase()) { 1526 value = Canonicalize(value, IsUtf16()); 1527 } 1528 atom->Insert(RangeSet(static_cast<uint32_t>(value))); 1529 break; 1530 } 1531 return result; 1532} 1533 1534bool RegExpParser::ParseUnicodePropertyValueCharacters(CString &propertyName, CString &valueName) 1535{ 1536 Advance(); 1537 if (c0_ == '{') { 1538 if (!GetUnicodePropertyName(propertyName)) { 1539 return false; 1540 } 1541 1542 if (!GetUnicodePropertyValueName(valueName)) { 1543 return false; 1544 } 1545 } else { 1546 return false; 1547 } 1548 Advance(); 1549 return true; 1550} 1551 1552bool RegExpParser::GetUnicodePropertyName(CString &propertyName) 1553{ 1554 Advance(); 1555 while (c0_ != '}' && c0_ != '=') { 1556 if (IsUnicodePropertyValueCharacter(c0_)) { 1557 propertyName += c0_; 1558 } else { 1559 return false; 1560 } 1561 Advance(); 1562 } 1563 return true; 1564} 1565 1566bool RegExpParser::GetUnicodePropertyValueName(CString &valueName) 1567{ 1568 if (c0_ == '=') { 1569 Advance(); 1570 while (c0_ != '}') { 1571 if (IsUnicodePropertyValueCharacter(c0_)) { 1572 valueName += c0_; 1573 } else { 1574 return false; 1575 } 1576 Advance(); 1577 } 1578 } 1579 return true; 1580} 1581 1582// NOLINTNEXTLINE(cert-dcl50-cpp) 1583void RegExpParser::PrintF(const char *fmt, ...) 1584{ 1585#ifndef _NO_DEBUG_ 1586 va_list args; 1587 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,) 1588 va_start(args, fmt); 1589 vprintf(fmt, args); 1590 va_end(args); 1591#else 1592 (void)fmt; 1593#endif 1594} 1595 1596void RegExpParser::ParseError(const char *errorMessage) 1597{ 1598 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1599 PrintF("error: "); 1600 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1601 PrintF(errorMessage); 1602 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) 1603 PrintF("\n"); 1604 SetIsError(); 1605 size_t length = strlen(errorMessage) + 1; 1606 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) { 1607 LOG_FULL(FATAL) << "memcpy_s failed"; 1608 UNREACHABLE(); 1609 } 1610} 1611 1612int RegExpParser::IsIdentFirst(uint32_t c) 1613{ 1614 if (c < CACHE_SIZE) { 1615 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31 1616 } else { 1617 auto uchar = static_cast<UChar32>(c); 1618 return static_cast<int>(u_isIDStart(uchar)); 1619 } 1620} 1621 1622int RegExpParser::Canonicalize(int c, bool isUnicode) 1623{ 1624 if (c < TMP_BUF_SIZE) { // NOLINTNEXTLINE(readability-magic-numbers) 1625 if (c >= 'a' && c <= 'z') { 1626 c = c - 'a' + 'A'; 1627 } 1628 } else { 1629 int cur = c; 1630 if (isUnicode) { 1631 c = u_tolower(static_cast<UChar32>(c)); 1632 if (c >= 'a' && c <= 'z') { 1633 c = cur; 1634 } 1635 } else { 1636 c = u_toupper(static_cast<UChar32>(c)); 1637 if (c >= 'A' && c <= 'Z') { 1638 c = cur; 1639 } 1640 } 1641 } 1642 return c; 1643} 1644 1645bool RegExpParser::NeedIntersection(uint32_t c) 1646{ 1647 return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A'); 1648} 1649 1650void RegExpParser::DoParserStackOverflowCheck(const char *errorMessage) 1651{ 1652 if (UNLIKELY(thread_->GetCurrentStackPosition() < thread_->GetStackLimit())) { 1653 LOG_ECMA(ERROR) << "Stack overflow! current:" << thread_->GetCurrentStackPosition() << 1654 " limit:" << thread_->GetStackLimit(); 1655 ParseError(errorMessage); 1656 return; 1657 } 1658} 1659 1660bool RegExpParser::ParseUnicodePropertyClassRange(CString &propertyName, CString &valueName, 1661 RangeSet *atom, bool negate) 1662{ 1663 const char *name = propertyName.c_str(); 1664 if (valueName.size() == 0) { 1665 if (MatchUnicodeProperty(UCHAR_GENERAL_CATEGORY_MASK, name, atom, negate)) { 1666 return true; 1667 } 1668 if (MatchSepcialUnicodeProperty(propertyName, negate, atom)) { 1669 return true; 1670 } 1671 UProperty property = u_getPropertyEnum(name); 1672 if (!IsSupportedBinaryProperty(property)) { 1673 return false; 1674 } 1675 if (!IsExactPropertyAlias(name, property)) { 1676 return false; 1677 } 1678 if (negate && IsBinaryPropertyOfStrings(property)) { 1679 return false; 1680 } 1681 return MatchUnicodeProperty(property, negate ? "N" : "Y", atom, false); 1682 } else { 1683 UProperty property = u_getPropertyEnum(propertyName.c_str()); 1684 if (property == UCHAR_GENERAL_CATEGORY) { 1685 property = UCHAR_GENERAL_CATEGORY_MASK; 1686 } else if (property != UCHAR_SCRIPT && property != UCHAR_SCRIPT_EXTENSIONS) { 1687 return false; 1688 } 1689 return MatchUnicodeProperty(property, valueName.c_str(), atom, negate); 1690 } 1691} 1692 1693bool RegExpParser::MatchUnicodeProperty(UProperty property, const char* propertyName, RangeSet *atom, bool negate) 1694{ 1695 UProperty propertyForMatch = property; 1696 if (propertyForMatch == UCHAR_SCRIPT_EXTENSIONS) { 1697 propertyForMatch = UCHAR_SCRIPT; 1698 } 1699 int32_t propertyValue = u_getPropertyValueEnum(propertyForMatch, propertyName); 1700 if (propertyValue == UCHAR_INVALID_CODE) { 1701 return false; 1702 } 1703 if (!IsExactPropertyValueAlis(propertyName, propertyForMatch, propertyValue)) { 1704 return false; 1705 } 1706 UErrorCode ec = U_ZERO_ERROR; 1707 icu::UnicodeSet set; 1708 set.applyIntPropertyValue(property, propertyValue, ec); 1709 bool success = ec == U_ZERO_ERROR && !set.isEmpty(); 1710 if (success) { 1711 const bool caseFolding = IsIgnoreCase(); 1712 if (negate) { 1713 set.complement(); 1714 } 1715 if (caseFolding) { 1716 set.closeOver(USET_CASE_INSENSITIVE); 1717 } 1718 set.removeAllStrings(); 1719 for (int i = 0; i < set.getRangeCount(); i++) { 1720 atom->Insert(set.getRangeStart(i), set.getRangeEnd(i)); 1721 } 1722 } 1723 return success; 1724} 1725 1726bool RegExpParser::IsExactPropertyValueAlis(const char *valueName, UProperty property, int32_t propertyValue) 1727{ 1728 const char *shortName = u_getPropertyValueName(property, propertyValue, U_SHORT_PROPERTY_NAME); 1729 if (shortName != nullptr && strcmp(valueName, shortName) == 0) { 1730 return true; 1731 } 1732 int i = 0; 1733 bool flag = true; 1734 while (flag) { 1735 const char *longName = u_getPropertyValueName(property, propertyValue, 1736 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1737 if (longName == nullptr) { 1738 flag = false; 1739 break; 1740 } 1741 if (strcmp(valueName, longName) == 0) { 1742 return true; 1743 } 1744 i++; 1745 } 1746 return false; 1747} 1748 1749bool RegExpParser::IsExactPropertyAlias(const char* propertyName, UProperty property) 1750{ 1751 const char* shortName = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); 1752 if (shortName != nullptr && strcmp(propertyName, shortName) == 0) { 1753 return true; 1754 } 1755 int i = 0; 1756 bool flag = true; 1757 while (flag) { 1758 const char* longName = u_getPropertyName(property, 1759 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1760 if (longName == nullptr) { 1761 flag = false; 1762 break; 1763 } 1764 if (strcmp(propertyName, longName) == 0) { 1765 return true; 1766 } 1767 i++; 1768 } 1769 return false; 1770} 1771 1772bool RegExpParser::MatchSepcialUnicodeProperty(CString &name, bool negate, RangeSet *atom) 1773{ 1774 if (name == "Any") { 1775 if (!negate) { 1776 atom->Insert(0, 0x10FFFF); 1777 } 1778 } else if (name == "ASCII") { 1779 if (negate) { 1780 atom->Insert(0x80, 0x10FFFF); 1781 } else { 1782 atom->Insert(0x0, 0x7F); 1783 } 1784 } else if (name == "Assigned") { 1785 return MatchUnicodeProperty(UCHAR_GENERAL_CATEGORY, "Unassigned", atom, !negate); 1786 } else { 1787 return false; 1788 } 1789 return true; 1790} 1791 1792bool RegExpParser::IsSupportedBinaryProperty(UProperty property) 1793{ 1794 switch (property) { 1795 case UCHAR_ALPHABETIC: 1796 case UCHAR_ASCII_HEX_DIGIT: 1797 case UCHAR_BIDI_CONTROL: 1798 case UCHAR_BIDI_MIRRORED: 1799 case UCHAR_DASH: 1800 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: 1801 case UCHAR_DEPRECATED: 1802 case UCHAR_DIACRITIC: 1803 case UCHAR_JOIN_CONTROL: 1804 case UCHAR_IDS_TRINARY_OPERATOR: 1805 case UCHAR_IDS_BINARY_OPERATOR: 1806 case UCHAR_IDEOGRAPHIC: 1807 case UCHAR_S_TERM: 1808 case UCHAR_ID_START: 1809 case UCHAR_ID_CONTINUE: 1810 case UCHAR_HEX_DIGIT: 1811 case UCHAR_GRAPHEME_EXTEND: 1812 case UCHAR_GRAPHEME_BASE: 1813 case UCHAR_EXTENDER: 1814 case UCHAR_LOGICAL_ORDER_EXCEPTION: 1815 case UCHAR_LOWERCASE: 1816 case UCHAR_MATH: 1817 case UCHAR_NONCHARACTER_CODE_POINT: 1818 case UCHAR_QUOTATION_MARK: 1819 case UCHAR_RADICAL: 1820 case UCHAR_SOFT_DOTTED: 1821 case UCHAR_TERMINAL_PUNCTUATION: 1822 case UCHAR_UNIFIED_IDEOGRAPH: 1823 case UCHAR_UPPERCASE: 1824 case UCHAR_WHITE_SPACE: 1825 case UCHAR_XID_CONTINUE: 1826 case UCHAR_XID_START: 1827 case UCHAR_VARIATION_SELECTOR: 1828 case UCHAR_PATTERN_SYNTAX: 1829 case UCHAR_PATTERN_WHITE_SPACE: 1830 case UCHAR_CASED: 1831 case UCHAR_CASE_IGNORABLE: 1832 case UCHAR_CHANGES_WHEN_LOWERCASED: 1833 case UCHAR_CHANGES_WHEN_UPPERCASED: 1834 case UCHAR_CHANGES_WHEN_TITLECASED: 1835 case UCHAR_CHANGES_WHEN_CASEFOLDED: 1836 case UCHAR_CHANGES_WHEN_CASEMAPPED: 1837 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: 1838 case UCHAR_REGIONAL_INDICATOR: 1839 case UCHAR_EMOJI: 1840 case UCHAR_EMOJI_PRESENTATION: 1841 case UCHAR_EMOJI_MODIFIER: 1842 case UCHAR_EMOJI_MODIFIER_BASE: 1843 case UCHAR_EMOJI_COMPONENT: 1844 case UCHAR_EXTENDED_PICTOGRAPHIC: 1845 return true; 1846 case UCHAR_BASIC_EMOJI: 1847 case UCHAR_EMOJI_KEYCAP_SEQUENCE: 1848 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: 1849 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: 1850 case UCHAR_RGI_EMOJI_TAG_SEQUENCE: 1851 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: 1852 case UCHAR_RGI_EMOJI: 1853 return false; 1854 default: 1855 break; 1856 } 1857 return false; 1858} 1859 1860bool RegExpParser::IsBinaryPropertyOfStrings(UProperty property) 1861{ 1862 switch (property) { 1863 case UCHAR_BASIC_EMOJI: 1864 case UCHAR_EMOJI_KEYCAP_SEQUENCE: 1865 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE: 1866 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE: 1867 case UCHAR_RGI_EMOJI_TAG_SEQUENCE: 1868 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE: 1869 case UCHAR_RGI_EMOJI: 1870 return true; 1871 default: 1872 break; 1873 } 1874 return false; 1875} 1876} // namespace panda::ecmascript 1877