1// Copyright 2019 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/regexp-compiler.h" 6 7#include "src/execution/isolate.h" 8#include "src/regexp/regexp.h" 9#include "src/strings/unicode-inl.h" 10#include "src/zone/zone-list-inl.h" 11 12#ifdef V8_INTL_SUPPORT 13#include "src/base/strings.h" 14#include "src/regexp/special-case.h" 15#include "unicode/locid.h" 16#include "unicode/uniset.h" 17#include "unicode/utypes.h" 18#endif // V8_INTL_SUPPORT 19 20namespace v8 { 21namespace internal { 22 23using namespace regexp_compiler_constants; // NOLINT(build/namespaces) 24 25constexpr base::uc32 kMaxCodePoint = 0x10ffff; 26constexpr int kMaxUtf16CodeUnit = 0xffff; 27constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff; 28constexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar; 29 30// ------------------------------------------------------------------- 31// Tree to graph conversion 32 33RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 34 RegExpNode* on_success) { 35 ZoneList<TextElement>* elms = 36 compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone()); 37 elms->Add(TextElement::Atom(this), compiler->zone()); 38 return compiler->zone()->New<TextNode>(elms, compiler->read_backward(), 39 on_success); 40} 41 42RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 43 RegExpNode* on_success) { 44 return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(), 45 on_success); 46} 47 48namespace { 49 50bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 51 const int* special_class, int length) { 52 length--; // Remove final marker. 53 54 DCHECK_EQ(kRangeEndMarker, special_class[length]); 55 DCHECK_NE(0, ranges->length()); 56 DCHECK_NE(0, length); 57 DCHECK_NE(0, special_class[0]); 58 59 if (ranges->length() != (length >> 1) + 1) return false; 60 61 CharacterRange range = ranges->at(0); 62 if (range.from() != 0) return false; 63 64 for (int i = 0; i < length; i += 2) { 65 if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) { 66 return false; 67 } 68 range = ranges->at((i >> 1) + 1); 69 if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) { 70 return false; 71 } 72 } 73 74 return range.to() == kMaxCodePoint; 75} 76 77bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class, 78 int length) { 79 length--; // Remove final marker. 80 81 DCHECK_EQ(kRangeEndMarker, special_class[length]); 82 if (ranges->length() * 2 != length) return false; 83 84 for (int i = 0; i < length; i += 2) { 85 CharacterRange range = ranges->at(i >> 1); 86 if (range.from() != static_cast<base::uc32>(special_class[i]) || 87 range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) { 88 return false; 89 } 90 } 91 return true; 92} 93 94} // namespace 95 96bool RegExpCharacterClass::is_standard(Zone* zone) { 97 // TODO(lrn): Remove need for this function, by not throwing away information 98 // along the way. 99 if (is_negated()) { 100 return false; 101 } 102 if (set_.is_standard()) { 103 return true; 104 } 105 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 106 set_.set_standard_set_type(StandardCharacterSet::kWhitespace); 107 return true; 108 } 109 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 110 set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace); 111 return true; 112 } 113 if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges, 114 kLineTerminatorRangeCount)) { 115 set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator); 116 return true; 117 } 118 if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges, 119 kLineTerminatorRangeCount)) { 120 set_.set_standard_set_type(StandardCharacterSet::kLineTerminator); 121 return true; 122 } 123 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 124 set_.set_standard_set_type(StandardCharacterSet::kWord); 125 return true; 126 } 127 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 128 set_.set_standard_set_type(StandardCharacterSet::kNotWord); 129 return true; 130 } 131 return false; 132} 133 134UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) { 135 // The unicode range splitter categorizes given character ranges into: 136 // - Code points from the BMP representable by one code unit. 137 // - Code points outside the BMP that need to be split into surrogate pairs. 138 // - Lone lead surrogates. 139 // - Lone trail surrogates. 140 // Lone surrogates are valid code points, even though no actual characters. 141 // They require special matching to make sure we do not split surrogate pairs. 142 143 for (int i = 0; i < base->length(); i++) AddRange(base->at(i)); 144} 145 146void UnicodeRangeSplitter::AddRange(CharacterRange range) { 147 static constexpr base::uc32 kBmp1Start = 0; 148 static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1; 149 static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1; 150 static constexpr base::uc32 kBmp2End = kNonBmpStart - 1; 151 152 // Ends are all inclusive. 153 STATIC_ASSERT(kBmp1Start == 0); 154 STATIC_ASSERT(kBmp1Start < kBmp1End); 155 STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart); 156 STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd); 157 STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); 158 STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd); 159 STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start); 160 STATIC_ASSERT(kBmp2Start < kBmp2End); 161 STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart); 162 STATIC_ASSERT(kNonBmpStart < kNonBmpEnd); 163 164 static constexpr base::uc32 kStarts[] = { 165 kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart, 166 kBmp2Start, kNonBmpStart, 167 }; 168 169 static constexpr base::uc32 kEnds[] = { 170 kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd, 171 }; 172 173 CharacterRangeVector* const kTargets[] = { 174 &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_, 175 }; 176 177 static constexpr int kCount = arraysize(kStarts); 178 STATIC_ASSERT(kCount == arraysize(kEnds)); 179 STATIC_ASSERT(kCount == arraysize(kTargets)); 180 181 for (int i = 0; i < kCount; i++) { 182 if (kStarts[i] > range.to()) break; 183 const base::uc32 from = std::max(kStarts[i], range.from()); 184 const base::uc32 to = std::min(kEnds[i], range.to()); 185 if (from > to) continue; 186 kTargets[i]->emplace_back(CharacterRange::Range(from, to)); 187 } 188} 189 190namespace { 191 192// Translates between new and old V8-isms (SmallVector, ZoneList). 193ZoneList<CharacterRange>* ToCanonicalZoneList( 194 const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) { 195 if (v->empty()) return nullptr; 196 197 ZoneList<CharacterRange>* result = 198 zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone); 199 for (size_t i = 0; i < v->size(); i++) { 200 result->Add(v->at(i), zone); 201 } 202 203 CharacterRange::Canonicalize(result); 204 return result; 205} 206 207void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, 208 RegExpNode* on_success, UnicodeRangeSplitter* splitter) { 209 ZoneList<CharacterRange>* bmp = 210 ToCanonicalZoneList(splitter->bmp(), compiler->zone()); 211 if (bmp == nullptr) return; 212 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( 213 compiler->zone(), bmp, compiler->read_backward(), on_success))); 214} 215 216using UC16Range = uint32_t; // {from, to} packed into one uint32_t. 217constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) { 218 return (static_cast<uint32_t>(from) << 16) | to; 219} 220constexpr base::uc16 ExtractFrom(UC16Range r) { 221 return static_cast<base::uc16>(r >> 16); 222} 223constexpr base::uc16 ExtractTo(UC16Range r) { 224 return static_cast<base::uc16>(r); 225} 226 227void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, 228 RegExpNode* on_success, 229 UnicodeRangeSplitter* splitter) { 230 DCHECK(!compiler->one_byte()); 231 Zone* const zone = compiler->zone(); 232 ZoneList<CharacterRange>* non_bmp = 233 ToCanonicalZoneList(splitter->non_bmp(), zone); 234 if (non_bmp == nullptr) return; 235 236 // Translate each 32-bit code point range into the corresponding 16-bit code 237 // unit representation consisting of the lead- and trail surrogate. 238 // 239 // The generated alternatives are grouped by the leading surrogate to avoid 240 // emitting excessive code. For example, for 241 // 242 // { \ud800[\udc00-\udc01] 243 // , \ud800[\udc05-\udc06] 244 // } 245 // 246 // there's no need to emit matching code for the leading surrogate \ud800 247 // twice. We also create a dedicated grouping for full trailing ranges, i.e. 248 // [dc00-dfff]. 249 ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading( 250 zone); 251 ZoneList<CharacterRange>* leading_with_full_trailing_range = 252 zone->New<ZoneList<CharacterRange>>(1, zone); 253 const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l, 254 base::uc16 from_t, base::uc16 to_t) { 255 const UC16Range leading_range = ToUC16Range(from_l, to_l); 256 if (grouped_by_leading.count(leading_range) == 0) { 257 if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) { 258 leading_with_full_trailing_range->Add( 259 CharacterRange::Range(from_l, to_l), zone); 260 return; 261 } 262 grouped_by_leading[leading_range] = 263 zone->New<ZoneList<CharacterRange>>(2, zone); 264 } 265 grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t), 266 zone); 267 }; 268 269 // First, create the grouped ranges. 270 CharacterRange::Canonicalize(non_bmp); 271 for (int i = 0; i < non_bmp->length(); i++) { 272 // Match surrogate pair. 273 // E.g. [\u10005-\u11005] becomes 274 // \ud800[\udc05-\udfff]| 275 // [\ud801-\ud803][\udc00-\udfff]| 276 // \ud804[\udc00-\udc05] 277 base::uc32 from = non_bmp->at(i).from(); 278 base::uc32 to = non_bmp->at(i).to(); 279 base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from); 280 base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from); 281 base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to); 282 base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to); 283 284 if (from_l == to_l) { 285 // The lead surrogate is the same. 286 AddRange(from_l, to_l, from_t, to_t); 287 continue; 288 } 289 290 if (from_t != kTrailSurrogateStart) { 291 // Add [from_l][from_t-\udfff]. 292 AddRange(from_l, from_l, from_t, kTrailSurrogateEnd); 293 from_l++; 294 } 295 if (to_t != kTrailSurrogateEnd) { 296 // Add [to_l][\udc00-to_t]. 297 AddRange(to_l, to_l, kTrailSurrogateStart, to_t); 298 to_l--; 299 } 300 if (from_l <= to_l) { 301 // Add [from_l-to_l][\udc00-\udfff]. 302 AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd); 303 } 304 } 305 306 // Create the actual TextNode now that ranges are fully grouped. 307 if (!leading_with_full_trailing_range->is_empty()) { 308 CharacterRange::Canonicalize(leading_with_full_trailing_range); 309 result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( 310 zone, leading_with_full_trailing_range, 311 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 312 compiler->read_backward(), on_success))); 313 } 314 for (const auto& it : grouped_by_leading) { 315 CharacterRange leading_range = 316 CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first)); 317 ZoneList<CharacterRange>* trailing_ranges = it.second; 318 CharacterRange::Canonicalize(trailing_ranges); 319 result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( 320 zone, leading_range, trailing_ranges, compiler->read_backward(), 321 on_success))); 322 } 323} 324 325RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( 326 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, 327 ZoneList<CharacterRange>* match, RegExpNode* on_success, 328 bool read_backward) { 329 Zone* zone = compiler->zone(); 330 RegExpNode* match_node = TextNode::CreateForCharacterRanges( 331 zone, match, read_backward, on_success); 332 int stack_register = compiler->UnicodeLookaroundStackRegister(); 333 int position_register = compiler->UnicodeLookaroundPositionRegister(); 334 RegExpLookaround::Builder lookaround(false, match_node, stack_register, 335 position_register); 336 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 337 zone, lookbehind, !read_backward, lookaround.on_match_success()); 338 return lookaround.ForMatch(negative_match); 339} 340 341RegExpNode* MatchAndNegativeLookaroundInReadDirection( 342 RegExpCompiler* compiler, ZoneList<CharacterRange>* match, 343 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, 344 bool read_backward) { 345 Zone* zone = compiler->zone(); 346 int stack_register = compiler->UnicodeLookaroundStackRegister(); 347 int position_register = compiler->UnicodeLookaroundPositionRegister(); 348 RegExpLookaround::Builder lookaround(false, on_success, stack_register, 349 position_register); 350 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 351 zone, lookahead, read_backward, lookaround.on_match_success()); 352 return TextNode::CreateForCharacterRanges( 353 zone, match, read_backward, lookaround.ForMatch(negative_match)); 354} 355 356void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 357 RegExpNode* on_success, 358 UnicodeRangeSplitter* splitter) { 359 ZoneList<CharacterRange>* lead_surrogates = 360 ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone()); 361 if (lead_surrogates == nullptr) return; 362 Zone* zone = compiler->zone(); 363 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 364 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 365 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 366 367 RegExpNode* match; 368 if (compiler->read_backward()) { 369 // Reading backward. Assert that reading forward, there is no trail 370 // surrogate, and then backward match the lead surrogate. 371 match = NegativeLookaroundAgainstReadDirectionAndMatch( 372 compiler, trail_surrogates, lead_surrogates, on_success, true); 373 } else { 374 // Reading forward. Forward match the lead surrogate and assert that 375 // no trail surrogate follows. 376 match = MatchAndNegativeLookaroundInReadDirection( 377 compiler, lead_surrogates, trail_surrogates, on_success, false); 378 } 379 result->AddAlternative(GuardedAlternative(match)); 380} 381 382void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 383 RegExpNode* on_success, 384 UnicodeRangeSplitter* splitter) { 385 ZoneList<CharacterRange>* trail_surrogates = 386 ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone()); 387 if (trail_surrogates == nullptr) return; 388 Zone* zone = compiler->zone(); 389 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 390 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 391 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 392 393 RegExpNode* match; 394 if (compiler->read_backward()) { 395 // Reading backward. Backward match the trail surrogate and assert that no 396 // lead surrogate precedes it. 397 match = MatchAndNegativeLookaroundInReadDirection( 398 compiler, trail_surrogates, lead_surrogates, on_success, true); 399 } else { 400 // Reading forward. Assert that reading backward, there is no lead 401 // surrogate, and then forward match the trail surrogate. 402 match = NegativeLookaroundAgainstReadDirectionAndMatch( 403 compiler, lead_surrogates, trail_surrogates, on_success, false); 404 } 405 result->AddAlternative(GuardedAlternative(match)); 406} 407 408RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, 409 RegExpNode* on_success) { 410 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. 411 DCHECK(!compiler->read_backward()); 412 Zone* zone = compiler->zone(); 413 // Advance any character. If the character happens to be a lead surrogate and 414 // we advanced into the middle of a surrogate pair, it will work out, as 415 // nothing will match from there. We will have to advance again, consuming 416 // the associated trail surrogate. 417 ZoneList<CharacterRange>* range = 418 CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit)); 419 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); 420} 421 422void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) { 423#ifdef V8_INTL_SUPPORT 424 DCHECK(CharacterRange::IsCanonical(ranges)); 425 426 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. 427 // See also https://crbug.com/v8/6727. 428 // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range, 429 // which we use frequently internally. But large ranges can also easily be 430 // created by the user. We might want to have a more general caching mechanism 431 // for such ranges. 432 if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return; 433 434 // Use ICU to compute the case fold closure over the ranges. 435 icu::UnicodeSet set; 436 for (int i = 0; i < ranges->length(); i++) { 437 set.add(ranges->at(i).from(), ranges->at(i).to()); 438 } 439 // Clear the ranges list without freeing the backing store. 440 ranges->Rewind(0); 441 set.closeOver(USET_CASE_INSENSITIVE); 442 // Full case mapping map single characters to multiple characters. 443 // Those are represented as strings in the set. Remove them so that 444 // we end up with only simple and common case mappings. 445 set.removeAllStrings(); 446 for (int i = 0; i < set.getRangeCount(); i++) { 447 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 448 zone); 449 } 450 // No errors and everything we collected have been ranges. 451 CharacterRange::Canonicalize(ranges); 452#endif // V8_INTL_SUPPORT 453} 454 455} // namespace 456 457RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 458 RegExpNode* on_success) { 459 set_.Canonicalize(); 460 Zone* const zone = compiler->zone(); 461 ZoneList<CharacterRange>* ranges = this->ranges(zone); 462 463 if (NeedsUnicodeCaseEquivalents(compiler->flags())) { 464 AddUnicodeCaseEquivalents(ranges, zone); 465 } 466 467 if (!IsUnicode(compiler->flags()) || compiler->one_byte() || 468 contains_split_surrogate()) { 469 return zone->New<TextNode>(this, compiler->read_backward(), on_success); 470 } 471 472 if (is_negated()) { 473 ZoneList<CharacterRange>* negated = 474 zone->New<ZoneList<CharacterRange>>(2, zone); 475 CharacterRange::Negate(ranges, negated, zone); 476 ranges = negated; 477 } 478 479 if (ranges->length() == 0) { 480 // The empty character class is used as a 'fail' node. 481 RegExpCharacterClass* fail = zone->New<RegExpCharacterClass>(zone, ranges); 482 return zone->New<TextNode>(fail, compiler->read_backward(), on_success); 483 } 484 485 if (set_.is_standard() && 486 standard_type() == StandardCharacterSet::kEverything) { 487 return UnanchoredAdvance(compiler, on_success); 488 } 489 490 // Split ranges in order to handle surrogates correctly: 491 // - Surrogate pairs: translate the 32-bit code point into two uc16 code 492 // units (irregexp operates only on code units). 493 // - Lone surrogates: these require lookarounds to ensure we don't match in 494 // the middle of a surrogate pair. 495 ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 496 UnicodeRangeSplitter splitter(ranges); 497 AddBmpCharacters(compiler, result, on_success, &splitter); 498 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 499 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 500 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 501 502 static constexpr int kMaxRangesToInline = 32; // Arbitrary. 503 if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline(); 504 505 return result; 506} 507 508namespace { 509 510int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 511 RegExpAtom* atom1 = (*a)->AsAtom(); 512 RegExpAtom* atom2 = (*b)->AsAtom(); 513 base::uc16 character1 = atom1->data().at(0); 514 base::uc16 character2 = atom2->data().at(0); 515 if (character1 < character2) return -1; 516 if (character1 > character2) return 1; 517 return 0; 518} 519 520#ifdef V8_INTL_SUPPORT 521 522int CompareCaseInsensitive(const icu::UnicodeString& a, 523 const icu::UnicodeString& b) { 524 return a.caseCompare(b, U_FOLD_CASE_DEFAULT); 525} 526 527int CompareFirstCharCaseInsensitive(RegExpTree* const* a, 528 RegExpTree* const* b) { 529 RegExpAtom* atom1 = (*a)->AsAtom(); 530 RegExpAtom* atom2 = (*b)->AsAtom(); 531 return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)}, 532 icu::UnicodeString{atom2->data().at(0)}); 533} 534 535bool Equals(bool ignore_case, const icu::UnicodeString& a, 536 const icu::UnicodeString& b) { 537 if (a == b) return true; 538 if (ignore_case) return CompareCaseInsensitive(a, b) == 0; 539 return false; // Case-sensitive equality already checked above. 540} 541 542bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a, 543 const RegExpAtom* b) { 544 return Equals(ignore_case, a->data().at(index), b->data().at(index)); 545} 546 547#else 548 549unibrow::uchar Canonical( 550 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 551 unibrow::uchar c) { 552 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 553 int length = canonicalize->get(c, '\0', chars); 554 DCHECK_LE(length, 1); 555 unibrow::uchar canonical = c; 556 if (length == 1) canonical = chars[0]; 557 return canonical; 558} 559 560int CompareCaseInsensitive( 561 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 562 unibrow::uchar a, unibrow::uchar b) { 563 if (a == b) return 0; 564 if (a >= 'a' || b >= 'a') { 565 a = Canonical(canonicalize, a); 566 b = Canonical(canonicalize, b); 567 } 568 return static_cast<int>(a) - static_cast<int>(b); 569} 570 571int CompareFirstCharCaseInsensitive( 572 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 573 RegExpTree* const* a, RegExpTree* const* b) { 574 RegExpAtom* atom1 = (*a)->AsAtom(); 575 RegExpAtom* atom2 = (*b)->AsAtom(); 576 return CompareCaseInsensitive(canonicalize, atom1->data().at(0), 577 atom2->data().at(0)); 578} 579 580bool Equals(bool ignore_case, 581 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 582 unibrow::uchar a, unibrow::uchar b) { 583 if (a == b) return true; 584 if (ignore_case) { 585 return CompareCaseInsensitive(canonicalize, a, b) == 0; 586 } 587 return false; // Case-sensitive equality already checked above. 588} 589 590bool CharAtEquals(bool ignore_case, 591 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 592 int index, const RegExpAtom* a, const RegExpAtom* b) { 593 return Equals(ignore_case, canonicalize, a->data().at(index), 594 b->data().at(index)); 595} 596 597#endif // V8_INTL_SUPPORT 598 599} // namespace 600 601// We can stable sort runs of atoms, since the order does not matter if they 602// start with different characters. 603// Returns true if any consecutive atoms were found. 604bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 605 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 606 int length = alternatives->length(); 607 bool found_consecutive_atoms = false; 608 for (int i = 0; i < length; i++) { 609 while (i < length) { 610 RegExpTree* alternative = alternatives->at(i); 611 if (alternative->IsAtom()) break; 612 i++; 613 } 614 // i is length or it is the index of an atom. 615 if (i == length) break; 616 int first_atom = i; 617 i++; 618 while (i < length) { 619 RegExpTree* alternative = alternatives->at(i); 620 if (!alternative->IsAtom()) break; 621 i++; 622 } 623 // Sort atoms to get ones with common prefixes together. 624 // This step is more tricky if we are in a case-independent regexp, 625 // because it would change /is|I/ to /I|is/, and order matters when 626 // the regexp parts don't match only disjoint starting points. To fix 627 // this we have a version of CompareFirstChar that uses case- 628 // independent character classes for comparison. 629 DCHECK_LT(first_atom, alternatives->length()); 630 DCHECK_LE(i, alternatives->length()); 631 DCHECK_LE(first_atom, i); 632 if (IsIgnoreCase(compiler->flags())) { 633#ifdef V8_INTL_SUPPORT 634 alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom, 635 i - first_atom); 636#else 637 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 638 compiler->isolate()->regexp_macro_assembler_canonicalize(); 639 auto compare_closure = [canonicalize](RegExpTree* const* a, 640 RegExpTree* const* b) { 641 return CompareFirstCharCaseInsensitive(canonicalize, a, b); 642 }; 643 alternatives->StableSort(compare_closure, first_atom, i - first_atom); 644#endif // V8_INTL_SUPPORT 645 } else { 646 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 647 } 648 if (i - first_atom > 1) found_consecutive_atoms = true; 649 } 650 return found_consecutive_atoms; 651} 652 653// Optimizes ab|ac|az to a(?:b|c|d). 654void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 655 Zone* zone = compiler->zone(); 656 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 657 int length = alternatives->length(); 658 const bool ignore_case = IsIgnoreCase(compiler->flags()); 659 660 int write_posn = 0; 661 int i = 0; 662 while (i < length) { 663 RegExpTree* alternative = alternatives->at(i); 664 if (!alternative->IsAtom()) { 665 alternatives->at(write_posn++) = alternatives->at(i); 666 i++; 667 continue; 668 } 669 RegExpAtom* const atom = alternative->AsAtom(); 670#ifdef V8_INTL_SUPPORT 671 icu::UnicodeString common_prefix(atom->data().at(0)); 672#else 673 unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize = 674 compiler->isolate()->regexp_macro_assembler_canonicalize(); 675 unibrow::uchar common_prefix = atom->data().at(0); 676 if (ignore_case) { 677 common_prefix = Canonical(canonicalize, common_prefix); 678 } 679#endif // V8_INTL_SUPPORT 680 int first_with_prefix = i; 681 int prefix_length = atom->length(); 682 i++; 683 while (i < length) { 684 alternative = alternatives->at(i); 685 if (!alternative->IsAtom()) break; 686 RegExpAtom* const alt_atom = alternative->AsAtom(); 687#ifdef V8_INTL_SUPPORT 688 icu::UnicodeString new_prefix(alt_atom->data().at(0)); 689 if (!Equals(ignore_case, new_prefix, common_prefix)) break; 690#else 691 unibrow::uchar new_prefix = alt_atom->data().at(0); 692 if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break; 693#endif // V8_INTL_SUPPORT 694 prefix_length = std::min(prefix_length, alt_atom->length()); 695 i++; 696 } 697 if (i > first_with_prefix + 2) { 698 // Found worthwhile run of alternatives with common prefix of at least one 699 // character. The sorting function above did not sort on more than one 700 // character for reasons of correctness, but there may still be a longer 701 // common prefix if the terms were similar or presorted in the input. 702 // Find out how long the common prefix is. 703 int run_length = i - first_with_prefix; 704 RegExpAtom* const alt_atom = 705 alternatives->at(first_with_prefix)->AsAtom(); 706 for (int j = 1; j < run_length && prefix_length > 1; j++) { 707 RegExpAtom* old_atom = 708 alternatives->at(j + first_with_prefix)->AsAtom(); 709 for (int k = 1; k < prefix_length; k++) { 710#ifdef V8_INTL_SUPPORT 711 if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) { 712#else 713 if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) { 714#endif // V8_INTL_SUPPORT 715 prefix_length = k; 716 break; 717 } 718 } 719 } 720 RegExpAtom* prefix = 721 zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length)); 722 ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone); 723 pair->Add(prefix, zone); 724 ZoneList<RegExpTree*>* suffixes = 725 zone->New<ZoneList<RegExpTree*>>(run_length, zone); 726 for (int j = 0; j < run_length; j++) { 727 RegExpAtom* old_atom = 728 alternatives->at(j + first_with_prefix)->AsAtom(); 729 int len = old_atom->length(); 730 if (len == prefix_length) { 731 suffixes->Add(zone->New<RegExpEmpty>(), zone); 732 } else { 733 RegExpTree* suffix = zone->New<RegExpAtom>( 734 old_atom->data().SubVector(prefix_length, old_atom->length())); 735 suffixes->Add(suffix, zone); 736 } 737 } 738 pair->Add(zone->New<RegExpDisjunction>(suffixes), zone); 739 alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair); 740 } else { 741 // Just copy any non-worthwhile alternatives. 742 for (int j = first_with_prefix; j < i; j++) { 743 alternatives->at(write_posn++) = alternatives->at(j); 744 } 745 } 746 } 747 alternatives->Rewind(write_posn); // Trim end of array. 748} 749 750// Optimizes b|c|z to [bcz]. 751void RegExpDisjunction::FixSingleCharacterDisjunctions( 752 RegExpCompiler* compiler) { 753 Zone* zone = compiler->zone(); 754 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 755 int length = alternatives->length(); 756 757 int write_posn = 0; 758 int i = 0; 759 while (i < length) { 760 RegExpTree* alternative = alternatives->at(i); 761 if (!alternative->IsAtom()) { 762 alternatives->at(write_posn++) = alternatives->at(i); 763 i++; 764 continue; 765 } 766 RegExpAtom* const atom = alternative->AsAtom(); 767 if (atom->length() != 1) { 768 alternatives->at(write_posn++) = alternatives->at(i); 769 i++; 770 continue; 771 } 772 const RegExpFlags flags = compiler->flags(); 773 DCHECK_IMPLIES(IsUnicode(flags), 774 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); 775 bool contains_trail_surrogate = 776 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); 777 int first_in_run = i; 778 i++; 779 // Find a run of single-character atom alternatives that have identical 780 // flags (case independence and unicode-ness). 781 while (i < length) { 782 alternative = alternatives->at(i); 783 if (!alternative->IsAtom()) break; 784 RegExpAtom* const alt_atom = alternative->AsAtom(); 785 if (alt_atom->length() != 1) break; 786 DCHECK_IMPLIES(IsUnicode(flags), 787 !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0))); 788 contains_trail_surrogate |= 789 unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0)); 790 i++; 791 } 792 if (i > first_in_run + 1) { 793 // Found non-trivial run of single-character alternatives. 794 int run_length = i - first_in_run; 795 ZoneList<CharacterRange>* ranges = 796 zone->New<ZoneList<CharacterRange>>(2, zone); 797 for (int j = 0; j < run_length; j++) { 798 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 799 DCHECK_EQ(old_atom->length(), 1); 800 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 801 } 802 RegExpCharacterClass::CharacterClassFlags character_class_flags; 803 if (IsUnicode(flags) && contains_trail_surrogate) { 804 character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; 805 } 806 alternatives->at(write_posn++) = 807 zone->New<RegExpCharacterClass>(zone, ranges, character_class_flags); 808 } else { 809 // Just copy any trivial alternatives. 810 for (int j = first_in_run; j < i; j++) { 811 alternatives->at(write_posn++) = alternatives->at(j); 812 } 813 } 814 } 815 alternatives->Rewind(write_posn); // Trim end of array. 816} 817 818RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 819 RegExpNode* on_success) { 820 compiler->ToNodeMaybeCheckForStackOverflow(); 821 822 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 823 824 if (alternatives->length() > 2) { 825 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 826 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 827 FixSingleCharacterDisjunctions(compiler); 828 if (alternatives->length() == 1) { 829 return alternatives->at(0)->ToNode(compiler, on_success); 830 } 831 } 832 833 int length = alternatives->length(); 834 835 ChoiceNode* result = 836 compiler->zone()->New<ChoiceNode>(length, compiler->zone()); 837 for (int i = 0; i < length; i++) { 838 GuardedAlternative alternative( 839 alternatives->at(i)->ToNode(compiler, on_success)); 840 result->AddAlternative(alternative); 841 } 842 return result; 843} 844 845RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 846 RegExpNode* on_success) { 847 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success); 848} 849 850namespace { 851// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and 852// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) 853RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, 854 RegExpNode* on_success, 855 RegExpAssertion::Type type, 856 RegExpFlags flags) { 857 CHECK(NeedsUnicodeCaseEquivalents(flags)); 858 Zone* zone = compiler->zone(); 859 ZoneList<CharacterRange>* word_range = 860 zone->New<ZoneList<CharacterRange>>(2, zone); 861 CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true, 862 zone); 863 int stack_register = compiler->UnicodeLookaroundStackRegister(); 864 int position_register = compiler->UnicodeLookaroundPositionRegister(); 865 ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 866 // Add two choices. The (non-)boundary could start with a word or 867 // a non-word-character. 868 for (int i = 0; i < 2; i++) { 869 bool lookbehind_for_word = i == 0; 870 bool lookahead_for_word = 871 (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word; 872 // Look to the left. 873 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, 874 stack_register, position_register); 875 RegExpNode* backward = TextNode::CreateForCharacterRanges( 876 zone, word_range, true, lookbehind.on_match_success()); 877 // Look to the right. 878 RegExpLookaround::Builder lookahead(lookahead_for_word, 879 lookbehind.ForMatch(backward), 880 stack_register, position_register); 881 RegExpNode* forward = TextNode::CreateForCharacterRanges( 882 zone, word_range, false, lookahead.on_match_success()); 883 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); 884 } 885 return result; 886} 887} // anonymous namespace 888 889RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 890 RegExpNode* on_success) { 891 NodeInfo info; 892 Zone* zone = compiler->zone(); 893 894 switch (assertion_type()) { 895 case Type::START_OF_LINE: 896 return AssertionNode::AfterNewline(on_success); 897 case Type::START_OF_INPUT: 898 return AssertionNode::AtStart(on_success); 899 case Type::BOUNDARY: 900 return NeedsUnicodeCaseEquivalents(compiler->flags()) 901 ? BoundaryAssertionAsLookaround( 902 compiler, on_success, Type::BOUNDARY, compiler->flags()) 903 : AssertionNode::AtBoundary(on_success); 904 case Type::NON_BOUNDARY: 905 return NeedsUnicodeCaseEquivalents(compiler->flags()) 906 ? BoundaryAssertionAsLookaround(compiler, on_success, 907 Type::NON_BOUNDARY, 908 compiler->flags()) 909 : AssertionNode::AtNonBoundary(on_success); 910 case Type::END_OF_INPUT: 911 return AssertionNode::AtEnd(on_success); 912 case Type::END_OF_LINE: { 913 // Compile $ in multiline regexps as an alternation with a positive 914 // lookahead in one side and an end-of-input on the other side. 915 // We need two registers for the lookahead. 916 int stack_pointer_register = compiler->AllocateRegister(); 917 int position_register = compiler->AllocateRegister(); 918 // The ChoiceNode to distinguish between a newline and end-of-input. 919 ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 920 // Create a newline atom. 921 ZoneList<CharacterRange>* newline_ranges = 922 zone->New<ZoneList<CharacterRange>>(3, zone); 923 CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator, 924 newline_ranges, false, zone); 925 RegExpCharacterClass* newline_atom = zone->New<RegExpCharacterClass>( 926 StandardCharacterSet::kLineTerminator); 927 TextNode* newline_matcher = 928 zone->New<TextNode>(newline_atom, false, 929 ActionNode::PositiveSubmatchSuccess( 930 stack_pointer_register, position_register, 931 0, // No captures inside. 932 -1, // Ignored if no captures. 933 on_success)); 934 // Create an end-of-input matcher. 935 RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch( 936 stack_pointer_register, position_register, newline_matcher); 937 // Add the two alternatives to the ChoiceNode. 938 GuardedAlternative eol_alternative(end_of_line); 939 result->AddAlternative(eol_alternative); 940 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 941 result->AddAlternative(end_alternative); 942 return result; 943 } 944 default: 945 UNREACHABLE(); 946 } 947} 948 949RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 950 RegExpNode* on_success) { 951 return compiler->zone()->New<BackReferenceNode>( 952 RegExpCapture::StartRegister(index()), 953 RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(), 954 on_success); 955} 956 957RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 958 RegExpNode* on_success) { 959 return on_success; 960} 961 962RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler, 963 RegExpNode* on_success) { 964 return body_->ToNode(compiler, on_success); 965} 966 967RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, 968 int stack_pointer_register, 969 int position_register, 970 int capture_register_count, 971 int capture_register_start) 972 : is_positive_(is_positive), 973 on_success_(on_success), 974 stack_pointer_register_(stack_pointer_register), 975 position_register_(position_register) { 976 if (is_positive_) { 977 on_match_success_ = ActionNode::PositiveSubmatchSuccess( 978 stack_pointer_register, position_register, capture_register_count, 979 capture_register_start, on_success_); 980 } else { 981 Zone* zone = on_success_->zone(); 982 on_match_success_ = zone->New<NegativeSubmatchSuccess>( 983 stack_pointer_register, position_register, capture_register_count, 984 capture_register_start, zone); 985 } 986} 987 988RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { 989 if (is_positive_) { 990 return ActionNode::BeginPositiveSubmatch(stack_pointer_register_, 991 position_register_, match); 992 } else { 993 Zone* zone = on_success_->zone(); 994 // We use a ChoiceNode to represent the negative lookaround. The first 995 // alternative is the negative match. On success, the end node backtracks. 996 // On failure, the second alternative is tried and leads to success. 997 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the 998 // first exit when calculating quick checks. 999 ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>( 1000 GuardedAlternative(match), GuardedAlternative(on_success_), zone); 1001 return ActionNode::BeginNegativeSubmatch(stack_pointer_register_, 1002 position_register_, choice_node); 1003 } 1004} 1005 1006RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 1007 RegExpNode* on_success) { 1008 int stack_pointer_register = compiler->AllocateRegister(); 1009 int position_register = compiler->AllocateRegister(); 1010 1011 const int registers_per_capture = 2; 1012 const int register_of_first_capture = 2; 1013 int register_count = capture_count_ * registers_per_capture; 1014 int register_start = 1015 register_of_first_capture + capture_from_ * registers_per_capture; 1016 1017 RegExpNode* result; 1018 bool was_reading_backward = compiler->read_backward(); 1019 compiler->set_read_backward(type() == LOOKBEHIND); 1020 Builder builder(is_positive(), on_success, stack_pointer_register, 1021 position_register, register_count, register_start); 1022 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); 1023 result = builder.ForMatch(match); 1024 compiler->set_read_backward(was_reading_backward); 1025 return result; 1026} 1027 1028RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 1029 RegExpNode* on_success) { 1030 return ToNode(body(), index(), compiler, on_success); 1031} 1032 1033RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index, 1034 RegExpCompiler* compiler, 1035 RegExpNode* on_success) { 1036 DCHECK_NOT_NULL(body); 1037 int start_reg = RegExpCapture::StartRegister(index); 1038 int end_reg = RegExpCapture::EndRegister(index); 1039 if (compiler->read_backward()) std::swap(start_reg, end_reg); 1040 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 1041 RegExpNode* body_node = body->ToNode(compiler, store_end); 1042 return ActionNode::StorePosition(start_reg, true, body_node); 1043} 1044 1045namespace { 1046 1047class AssertionSequenceRewriter final { 1048 public: 1049 // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass 1050 // instead of sprinkling rewrites into the AST->Node conversion process. 1051 static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) { 1052 AssertionSequenceRewriter rewriter(terms, zone); 1053 1054 static constexpr int kNoIndex = -1; 1055 int from = kNoIndex; 1056 1057 for (int i = 0; i < terms->length(); i++) { 1058 RegExpTree* t = terms->at(i); 1059 if (from == kNoIndex && t->IsAssertion()) { 1060 from = i; // Start a sequence. 1061 } else if (from != kNoIndex && !t->IsAssertion()) { 1062 // Terminate and process the sequence. 1063 if (i - from > 1) rewriter.Rewrite(from, i); 1064 from = kNoIndex; 1065 } 1066 } 1067 1068 if (from != kNoIndex && terms->length() - from > 1) { 1069 rewriter.Rewrite(from, terms->length()); 1070 } 1071 } 1072 1073 // All assertions are zero width. A consecutive sequence of assertions is 1074 // order-independent. There's two ways we can optimize here: 1075 // 1. fold all identical assertions. 1076 // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire 1077 // sequence fails. 1078 void Rewrite(int from, int to) { 1079 DCHECK_GT(to, from + 1); 1080 1081 // Bitfield of all seen assertions. 1082 uint32_t seen_assertions = 0; 1083 STATIC_ASSERT(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) < 1084 kUInt32Size * kBitsPerByte); 1085 1086 for (int i = from; i < to; i++) { 1087 RegExpAssertion* t = terms_->at(i)->AsAssertion(); 1088 const uint32_t bit = 1 << static_cast<int>(t->assertion_type()); 1089 1090 if (seen_assertions & bit) { 1091 // Fold duplicates. 1092 terms_->Set(i, zone_->New<RegExpEmpty>()); 1093 } 1094 1095 seen_assertions |= bit; 1096 } 1097 1098 // Collapse failures. 1099 const uint32_t always_fails_mask = 1100 1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) | 1101 1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY); 1102 if ((seen_assertions & always_fails_mask) == always_fails_mask) { 1103 ReplaceSequenceWithFailure(from, to); 1104 } 1105 } 1106 1107 void ReplaceSequenceWithFailure(int from, int to) { 1108 // Replace the entire sequence with a single node that always fails. 1109 // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the 1110 // negated '*' (everything) range serves the purpose. 1111 ZoneList<CharacterRange>* ranges = 1112 zone_->New<ZoneList<CharacterRange>>(0, zone_); 1113 RegExpCharacterClass* cc = zone_->New<RegExpCharacterClass>(zone_, ranges); 1114 terms_->Set(from, cc); 1115 1116 // Zero out the rest. 1117 RegExpEmpty* empty = zone_->New<RegExpEmpty>(); 1118 for (int i = from + 1; i < to; i++) terms_->Set(i, empty); 1119 } 1120 1121 private: 1122 AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone) 1123 : zone_(zone), terms_(terms) {} 1124 1125 Zone* zone_; 1126 ZoneList<RegExpTree*>* terms_; 1127}; 1128 1129} // namespace 1130 1131RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 1132 RegExpNode* on_success) { 1133 compiler->ToNodeMaybeCheckForStackOverflow(); 1134 1135 ZoneList<RegExpTree*>* children = nodes(); 1136 1137 AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone()); 1138 1139 RegExpNode* current = on_success; 1140 if (compiler->read_backward()) { 1141 for (int i = 0; i < children->length(); i++) { 1142 current = children->at(i)->ToNode(compiler, current); 1143 } 1144 } else { 1145 for (int i = children->length() - 1; i >= 0; i--) { 1146 current = children->at(i)->ToNode(compiler, current); 1147 } 1148 } 1149 return current; 1150} 1151 1152namespace { 1153 1154void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges, 1155 Zone* zone) { 1156 elmc--; 1157 DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 1158 for (int i = 0; i < elmc; i += 2) { 1159 DCHECK(elmv[i] < elmv[i + 1]); 1160 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); 1161 } 1162} 1163 1164void AddClassNegated(const int* elmv, int elmc, 1165 ZoneList<CharacterRange>* ranges, Zone* zone) { 1166 elmc--; 1167 DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 1168 DCHECK_NE(0x0000, elmv[0]); 1169 DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]); 1170 base::uc16 last = 0x0000; 1171 for (int i = 0; i < elmc; i += 2) { 1172 DCHECK(last <= elmv[i] - 1); 1173 DCHECK(elmv[i] < elmv[i + 1]); 1174 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 1175 last = elmv[i + 1]; 1176 } 1177 ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone); 1178} 1179 1180} // namespace 1181 1182void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set, 1183 ZoneList<CharacterRange>* ranges, 1184 bool add_unicode_case_equivalents, 1185 Zone* zone) { 1186 if (add_unicode_case_equivalents && 1187 (standard_character_set == StandardCharacterSet::kWord || 1188 standard_character_set == StandardCharacterSet::kNotWord)) { 1189 // See #sec-runtime-semantics-wordcharacters-abstract-operation 1190 // In case of unicode and ignore_case, we need to create the closure over 1191 // case equivalent characters before negating. 1192 ZoneList<CharacterRange>* new_ranges = 1193 zone->New<ZoneList<CharacterRange>>(2, zone); 1194 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); 1195 AddUnicodeCaseEquivalents(new_ranges, zone); 1196 if (standard_character_set == StandardCharacterSet::kNotWord) { 1197 ZoneList<CharacterRange>* negated = 1198 zone->New<ZoneList<CharacterRange>>(2, zone); 1199 CharacterRange::Negate(new_ranges, negated, zone); 1200 new_ranges = negated; 1201 } 1202 ranges->AddAll(*new_ranges, zone); 1203 return; 1204 } 1205 1206 switch (standard_character_set) { 1207 case StandardCharacterSet::kWhitespace: 1208 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 1209 break; 1210 case StandardCharacterSet::kNotWhitespace: 1211 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 1212 break; 1213 case StandardCharacterSet::kWord: 1214 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 1215 break; 1216 case StandardCharacterSet::kNotWord: 1217 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 1218 break; 1219 case StandardCharacterSet::kDigit: 1220 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 1221 break; 1222 case StandardCharacterSet::kNotDigit: 1223 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 1224 break; 1225 // This is the set of characters matched by the $ and ^ symbols 1226 // in multiline mode. 1227 case StandardCharacterSet::kLineTerminator: 1228 AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); 1229 break; 1230 case StandardCharacterSet::kNotLineTerminator: 1231 AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, 1232 zone); 1233 break; 1234 // This is not a character range as defined by the spec but a 1235 // convenient shorthand for a character class that matches any 1236 // character. 1237 case StandardCharacterSet::kEverything: 1238 ranges->Add(CharacterRange::Everything(), zone); 1239 break; 1240 } 1241} 1242 1243// static 1244void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 1245 ZoneList<CharacterRange>* ranges, 1246 bool is_one_byte) { 1247 CharacterRange::Canonicalize(ranges); 1248 int range_count = ranges->length(); 1249#ifdef V8_INTL_SUPPORT 1250 icu::UnicodeSet others; 1251 for (int i = 0; i < range_count; i++) { 1252 CharacterRange range = ranges->at(i); 1253 base::uc32 from = range.from(); 1254 if (from > kMaxUtf16CodeUnit) continue; 1255 base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU}); 1256 // Nothing to be done for surrogates. 1257 if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; 1258 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 1259 if (from > kMaxOneByteCharCode) continue; 1260 if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode; 1261 } 1262 others.add(from, to); 1263 } 1264 1265 // Compute the set of additional characters that should be added, 1266 // using UnicodeSet::closeOver. ECMA 262 defines slightly different 1267 // case-folding rules than Unicode, so some characters that are 1268 // added by closeOver do not match anything other than themselves in 1269 // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the 1270 // same case-insensitive character as 's' or 'S' according to 1271 // Unicode, but does not match any other character in JS. To handle 1272 // this case, we add such characters to the IgnoreSet and filter 1273 // them out. We filter twice: once before calling closeOver (to 1274 // prevent 'ſ' from adding 's'), and once after calling closeOver 1275 // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for 1276 // more information. 1277 icu::UnicodeSet already_added(others); 1278 others.removeAll(RegExpCaseFolding::IgnoreSet()); 1279 others.closeOver(USET_CASE_INSENSITIVE); 1280 others.removeAll(RegExpCaseFolding::IgnoreSet()); 1281 others.removeAll(already_added); 1282 1283 // Add others to the ranges 1284 for (int32_t i = 0; i < others.getRangeCount(); i++) { 1285 UChar32 from = others.getRangeStart(i); 1286 UChar32 to = others.getRangeEnd(i); 1287 if (from == to) { 1288 ranges->Add(CharacterRange::Singleton(from), zone); 1289 } else { 1290 ranges->Add(CharacterRange::Range(from, to), zone); 1291 } 1292 } 1293#else 1294 for (int i = 0; i < range_count; i++) { 1295 CharacterRange range = ranges->at(i); 1296 base::uc32 bottom = range.from(); 1297 if (bottom > kMaxUtf16CodeUnit) continue; 1298 base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU}); 1299 // Nothing to be done for surrogates. 1300 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; 1301 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 1302 if (bottom > kMaxOneByteCharCode) continue; 1303 if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode; 1304 } 1305 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1306 if (top == bottom) { 1307 // If this is a singleton we just expand the one character. 1308 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 1309 for (int i = 0; i < length; i++) { 1310 base::uc32 chr = chars[i]; 1311 if (chr != bottom) { 1312 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 1313 } 1314 } 1315 } else { 1316 // If this is a range we expand the characters block by block, expanding 1317 // contiguous subranges (blocks) one at a time. The approach is as 1318 // follows. For a given start character we look up the remainder of the 1319 // block that contains it (represented by the end point), for instance we 1320 // find 'z' if the character is 'c'. A block is characterized by the 1321 // property that all characters uncanonicalize in the same way, except 1322 // that each entry in the result is incremented by the distance from the 1323 // first element. So a-z is a block because 'a' uncanonicalizes to ['a', 1324 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once 1325 // we've found the end point we look up its uncanonicalization and 1326 // produce a range for each element. For instance for [c-f] we look up 1327 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if 1328 // it is not already contained in the input, so [c-f] will be skipped but 1329 // [C-F] will be added. If this range is not completely contained in a 1330 // block we do this for all the blocks covered by the range (handling 1331 // characters that is not in a block as a "singleton block"). 1332 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1333 base::uc32 pos = bottom; 1334 while (pos <= top) { 1335 int length = 1336 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); 1337 base::uc32 block_end; 1338 if (length == 0) { 1339 block_end = pos; 1340 } else { 1341 DCHECK_EQ(1, length); 1342 block_end = equivalents[0]; 1343 } 1344 int end = (block_end > top) ? top : block_end; 1345 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', 1346 equivalents); 1347 for (int i = 0; i < length; i++) { 1348 base::uc32 c = equivalents[i]; 1349 base::uc32 range_from = c - (block_end - pos); 1350 base::uc32 range_to = c - (block_end - end); 1351 if (!(bottom <= range_from && range_to <= top)) { 1352 ranges->Add(CharacterRange::Range(range_from, range_to), zone); 1353 } 1354 } 1355 pos = end + 1; 1356 } 1357 } 1358 } 1359#endif // V8_INTL_SUPPORT 1360} 1361 1362bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 1363 DCHECK_NOT_NULL(ranges); 1364 int n = ranges->length(); 1365 if (n <= 1) return true; 1366 base::uc32 max = ranges->at(0).to(); 1367 for (int i = 1; i < n; i++) { 1368 CharacterRange next_range = ranges->at(i); 1369 if (next_range.from() <= max + 1) return false; 1370 max = next_range.to(); 1371 } 1372 return true; 1373} 1374 1375ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 1376 if (ranges_ == nullptr) { 1377 ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone); 1378 CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false, 1379 zone); 1380 } 1381 return ranges_; 1382} 1383 1384namespace { 1385 1386// Move a number of elements in a zonelist to another position 1387// in the same list. Handles overlapping source and target areas. 1388void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) { 1389 // Ranges are potentially overlapping. 1390 if (from < to) { 1391 for (int i = count - 1; i >= 0; i--) { 1392 list->at(to + i) = list->at(from + i); 1393 } 1394 } else { 1395 for (int i = 0; i < count; i++) { 1396 list->at(to + i) = list->at(from + i); 1397 } 1398 } 1399} 1400 1401int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count, 1402 CharacterRange insert) { 1403 // Inserts a range into list[0..count[, which must be sorted 1404 // by from value and non-overlapping and non-adjacent, using at most 1405 // list[0..count] for the result. Returns the number of resulting 1406 // canonicalized ranges. Inserting a range may collapse existing ranges into 1407 // fewer ranges, so the return value can be anything in the range 1..count+1. 1408 base::uc32 from = insert.from(); 1409 base::uc32 to = insert.to(); 1410 int start_pos = 0; 1411 int end_pos = count; 1412 for (int i = count - 1; i >= 0; i--) { 1413 CharacterRange current = list->at(i); 1414 if (current.from() > to + 1) { 1415 end_pos = i; 1416 } else if (current.to() + 1 < from) { 1417 start_pos = i + 1; 1418 break; 1419 } 1420 } 1421 1422 // Inserted range overlaps, or is adjacent to, ranges at positions 1423 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 1424 // not affected by the insertion. 1425 // If start_pos == end_pos, the range must be inserted before start_pos. 1426 // if start_pos < end_pos, the entire range from start_pos to end_pos 1427 // must be merged with the insert range. 1428 1429 if (start_pos == end_pos) { 1430 // Insert between existing ranges at position start_pos. 1431 if (start_pos < count) { 1432 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 1433 } 1434 list->at(start_pos) = insert; 1435 return count + 1; 1436 } 1437 if (start_pos + 1 == end_pos) { 1438 // Replace single existing range at position start_pos. 1439 CharacterRange to_replace = list->at(start_pos); 1440 int new_from = std::min(to_replace.from(), from); 1441 int new_to = std::max(to_replace.to(), to); 1442 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 1443 return count; 1444 } 1445 // Replace a number of existing ranges from start_pos to end_pos - 1. 1446 // Move the remaining ranges down. 1447 1448 int new_from = std::min(list->at(start_pos).from(), from); 1449 int new_to = std::max(list->at(end_pos - 1).to(), to); 1450 if (end_pos < count) { 1451 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 1452 } 1453 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 1454 return count - (end_pos - start_pos) + 1; 1455} 1456 1457} // namespace 1458 1459void CharacterSet::Canonicalize() { 1460 // Special/default classes are always considered canonical. The result 1461 // of calling ranges() will be sorted. 1462 if (ranges_ == nullptr) return; 1463 CharacterRange::Canonicalize(ranges_); 1464} 1465 1466// static 1467void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 1468 if (character_ranges->length() <= 1) return; 1469 // Check whether ranges are already canonical (increasing, non-overlapping, 1470 // non-adjacent). 1471 int n = character_ranges->length(); 1472 base::uc32 max = character_ranges->at(0).to(); 1473 int i = 1; 1474 while (i < n) { 1475 CharacterRange current = character_ranges->at(i); 1476 if (current.from() <= max + 1) { 1477 break; 1478 } 1479 max = current.to(); 1480 i++; 1481 } 1482 // Canonical until the i'th range. If that's all of them, we are done. 1483 if (i == n) return; 1484 1485 // The ranges at index i and forward are not canonicalized. Make them so by 1486 // doing the equivalent of insertion sort (inserting each into the previous 1487 // list, in order). 1488 // Notice that inserting a range can reduce the number of ranges in the 1489 // result due to combining of adjacent and overlapping ranges. 1490 int read = i; // Range to insert. 1491 int num_canonical = i; // Length of canonicalized part of list. 1492 do { 1493 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical, 1494 character_ranges->at(read)); 1495 read++; 1496 } while (read < n); 1497 character_ranges->Rewind(num_canonical); 1498 1499 DCHECK(CharacterRange::IsCanonical(character_ranges)); 1500} 1501 1502// static 1503void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 1504 ZoneList<CharacterRange>* negated_ranges, 1505 Zone* zone) { 1506 DCHECK(CharacterRange::IsCanonical(ranges)); 1507 DCHECK_EQ(0, negated_ranges->length()); 1508 int range_count = ranges->length(); 1509 base::uc32 from = 0; 1510 int i = 0; 1511 if (range_count > 0 && ranges->at(0).from() == 0) { 1512 from = ranges->at(0).to() + 1; 1513 i = 1; 1514 } 1515 while (i < range_count) { 1516 CharacterRange range = ranges->at(i); 1517 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); 1518 from = range.to() + 1; 1519 i++; 1520 } 1521 if (from < kMaxCodePoint) { 1522 negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone); 1523 } 1524} 1525 1526// static 1527void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) { 1528 DCHECK(IsCanonical(ranges)); 1529 1530 // Drop all ranges that don't contain one-byte code units, and clamp the last 1531 // range s.t. it likewise only contains one-byte code units. Note this relies 1532 // on `ranges` being canonicalized, i.e. sorted and non-overlapping. 1533 1534 static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU; 1535 int n = ranges->length(); 1536 for (; n > 0; n--) { 1537 CharacterRange& r = ranges->at(n - 1); 1538 if (r.from() <= max_char) { 1539 r.to_ = std::min(r.to_, max_char); 1540 break; 1541 } 1542 } 1543 1544 ranges->Rewind(n); 1545} 1546 1547namespace { 1548 1549// Scoped object to keep track of how much we unroll quantifier loops in the 1550// regexp graph generator. 1551class RegExpExpansionLimiter { 1552 public: 1553 static const int kMaxExpansionFactor = 6; 1554 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 1555 : compiler_(compiler), 1556 saved_expansion_factor_(compiler->current_expansion_factor()), 1557 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 1558 DCHECK_LT(0, factor); 1559 if (ok_to_expand_) { 1560 if (factor > kMaxExpansionFactor) { 1561 // Avoid integer overflow of the current expansion factor. 1562 ok_to_expand_ = false; 1563 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 1564 } else { 1565 int new_factor = saved_expansion_factor_ * factor; 1566 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 1567 compiler->set_current_expansion_factor(new_factor); 1568 } 1569 } 1570 } 1571 1572 ~RegExpExpansionLimiter() { 1573 compiler_->set_current_expansion_factor(saved_expansion_factor_); 1574 } 1575 1576 bool ok_to_expand() { return ok_to_expand_; } 1577 1578 private: 1579 RegExpCompiler* compiler_; 1580 int saved_expansion_factor_; 1581 bool ok_to_expand_; 1582 1583 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 1584}; 1585 1586} // namespace 1587 1588RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy, 1589 RegExpTree* body, RegExpCompiler* compiler, 1590 RegExpNode* on_success, 1591 bool not_at_start) { 1592 // x{f, t} becomes this: 1593 // 1594 // (r++)<-. 1595 // | ` 1596 // | (x) 1597 // v ^ 1598 // (r=0)-->(?)---/ [if r < t] 1599 // | 1600 // [if r >= f] \----> ... 1601 // 1602 1603 // 15.10.2.5 RepeatMatcher algorithm. 1604 // The parser has already eliminated the case where max is 0. In the case 1605 // where max_match is zero the parser has removed the quantifier if min was 1606 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 1607 1608 // If we know that we cannot match zero length then things are a little 1609 // simpler since we don't need to make the special zero length match check 1610 // from step 2.1. If the min and max are small we can unroll a little in 1611 // this case. 1612 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 1613 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 1614 if (max == 0) return on_success; // This can happen due to recursion. 1615 bool body_can_be_empty = (body->min_match() == 0); 1616 int body_start_reg = RegExpCompiler::kNoRegister; 1617 Interval capture_registers = body->CaptureRegisters(); 1618 bool needs_capture_clearing = !capture_registers.is_empty(); 1619 Zone* zone = compiler->zone(); 1620 1621 if (body_can_be_empty) { 1622 body_start_reg = compiler->AllocateRegister(); 1623 } else if (compiler->optimize() && !needs_capture_clearing) { 1624 // Only unroll if there are no captures and the body can't be 1625 // empty. 1626 { 1627 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0)); 1628 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 1629 int new_max = (max == kInfinity) ? max : max - min; 1630 // Recurse once to get the loop or optional matches after the fixed 1631 // ones. 1632 RegExpNode* answer = 1633 ToNode(0, new_max, is_greedy, body, compiler, on_success, true); 1634 // Unroll the forced matches from 0 to min. This can cause chains of 1635 // TextNodes (which the parser does not generate). These should be 1636 // combined if it turns out they hinder good code generation. 1637 for (int i = 0; i < min; i++) { 1638 answer = body->ToNode(compiler, answer); 1639 } 1640 return answer; 1641 } 1642 } 1643 if (max <= kMaxUnrolledMaxMatches && min == 0) { 1644 DCHECK_LT(0, max); // Due to the 'if' above. 1645 RegExpExpansionLimiter limiter(compiler, max); 1646 if (limiter.ok_to_expand()) { 1647 // Unroll the optional matches up to max. 1648 RegExpNode* answer = on_success; 1649 for (int i = 0; i < max; i++) { 1650 ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone); 1651 if (is_greedy) { 1652 alternation->AddAlternative( 1653 GuardedAlternative(body->ToNode(compiler, answer))); 1654 alternation->AddAlternative(GuardedAlternative(on_success)); 1655 } else { 1656 alternation->AddAlternative(GuardedAlternative(on_success)); 1657 alternation->AddAlternative( 1658 GuardedAlternative(body->ToNode(compiler, answer))); 1659 } 1660 answer = alternation; 1661 if (not_at_start && !compiler->read_backward()) { 1662 alternation->set_not_at_start(); 1663 } 1664 } 1665 return answer; 1666 } 1667 } 1668 } 1669 bool has_min = min > 0; 1670 bool has_max = max < RegExpTree::kInfinity; 1671 bool needs_counter = has_min || has_max; 1672 int reg_ctr = needs_counter ? compiler->AllocateRegister() 1673 : RegExpCompiler::kNoRegister; 1674 LoopChoiceNode* center = zone->New<LoopChoiceNode>( 1675 body->min_match() == 0, compiler->read_backward(), min, zone); 1676 if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 1677 RegExpNode* loop_return = 1678 needs_counter ? static_cast<RegExpNode*>( 1679 ActionNode::IncrementRegister(reg_ctr, center)) 1680 : static_cast<RegExpNode*>(center); 1681 if (body_can_be_empty) { 1682 // If the body can be empty we need to check if it was and then 1683 // backtrack. 1684 loop_return = 1685 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return); 1686 } 1687 RegExpNode* body_node = body->ToNode(compiler, loop_return); 1688 if (body_can_be_empty) { 1689 // If the body can be empty we need to store the start position 1690 // so we can bail out if it was empty. 1691 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 1692 } 1693 if (needs_capture_clearing) { 1694 // Before entering the body of this loop we need to clear captures. 1695 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 1696 } 1697 GuardedAlternative body_alt(body_node); 1698 if (has_max) { 1699 Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max); 1700 body_alt.AddGuard(body_guard, zone); 1701 } 1702 GuardedAlternative rest_alt(on_success); 1703 if (has_min) { 1704 Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min); 1705 rest_alt.AddGuard(rest_guard, zone); 1706 } 1707 if (is_greedy) { 1708 center->AddLoopAlternative(body_alt); 1709 center->AddContinueAlternative(rest_alt); 1710 } else { 1711 center->AddContinueAlternative(rest_alt); 1712 center->AddLoopAlternative(body_alt); 1713 } 1714 if (needs_counter) { 1715 return ActionNode::SetRegisterForLoop(reg_ctr, 0, center); 1716 } else { 1717 return center; 1718 } 1719} 1720 1721} // namespace internal 1722} // namespace v8 1723