// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/experimental/experimental-compiler.h"

#include "src/base/strings.h"
#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"

namespace v8 {
namespace internal {

namespace {

// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr base::uc32 kMaxSupportedCodepoint = 0xFFFFu;
#ifdef DEBUG
constexpr base::uc32 kMaxCodePoint = 0x10ffff;
#endif  // DEBUG

class CanBeHandledVisitor final : private RegExpVisitor {
  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
 public:
  static bool Check(RegExpTree* tree, RegExpFlags flags, int capture_count) {
    if (!AreSuitableFlags(flags)) return false;
    CanBeHandledVisitor visitor;
    tree->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  CanBeHandledVisitor() = default;

  static bool AreSuitableFlags(RegExpFlags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr RegExpFlags kAllowedFlags =
        RegExpFlag::kGlobal | RegExpFlag::kSticky | RegExpFlag::kMultiline |
        RegExpFlag::kDotAll | RegExpFlag::kLinear;
    // We support Unicode iff kUnicode is among the supported flags.
    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
                  IsUnicode(kAllowedFlags));
    return (flags & ~kAllowedFlags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Finite but large values of `min()` and `max()` are bad for the
    // breadth-first engine because finite (optional) repetition is dealt with
    // by replicating the bytecode of the body of the quantifier. The number
    // of replications grows exponentially in how deeply quantifiers are
    // nested. `replication_factor_` keeps track of how often the current node
    // will have to be replicated in the generated bytecode, and we don't
    // allow this to exceed some small value.
    static constexpr int kMaxReplicationFactor = 16;

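    // To illustrate (the numbers follow the rules below): /(a{4}){4}/
    // requires 4 * 4 = 16 copies of the bytecode for /a/ and is still
    // accepted, whereas /((a{3}){3}){3}/ would require 3 * 3 * 3 = 27 copies
    // and is rejected.
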
    // First we rule out values for min and max that are too big even before
    // taking into account the ambient replication_factor_. This also guards
    // against overflows in `local_replication` or `replication_factor_`.
    if (node->min() > kMaxReplicationFactor ||
        (node->max() != RegExpTree::kInfinity &&
         node->max() > kMaxReplicationFactor)) {
      result_ = false;
      return nullptr;
    }

    // Save the current replication factor so that it can be restored if we
    // return with `result_ == true`.
    int before_replication_factor = replication_factor_;

    int local_replication;
    if (node->max() == RegExpTree::kInfinity) {
      local_replication = node->min() + 1;
    } else {
      local_replication = node->max();
    }

    replication_factor_ *= local_replication;
    if (replication_factor_ > kMaxReplicationFactor) {
      result_ = false;
      return nullptr;
    }

    switch (node->quantifier_type()) {
      case RegExpQuantifier::GREEDY:
      case RegExpQuantifier::NON_GREEDY:
        break;
      case RegExpQuantifier::POSSESSIVE:
        // TODO(mbid, v8:10765): It's not clear to me whether this can be
        // supported in breadth-first mode. Re2 doesn't support it.
        result_ = false;
        return nullptr;
    }

    node->body()->Accept(this, nullptr);
    replication_factor_ = before_replication_factor;
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible
    // I think. See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

 private:
  // See comment in `VisitQuantifier`:
  int replication_factor_ = 1;

  bool result_ = true;
};

}  // namespace

bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              RegExpFlags flags,
                                              int capture_count) {
  return CanBeHandledVisitor::Check(tree, flags, capture_count);
}

namespace {

// A label in bytecode which starts with no known address. The address *must*
// be bound with `Bind` before the label goes out of scope.
// Implemented as a linked list through the `payload.pc` of FORK and JMP
// instructions.
struct Label {
 public:
  Label() = default;
  ~Label() {
    DCHECK_EQ(state_, BOUND);
    DCHECK_GE(bound_index_, 0);
  }

  // Don't copy, don't move. Moving could be implemented, but it's not
  // needed anywhere.
  Label(const Label&) = delete;
  Label& operator=(const Label&) = delete;

 private:
  friend class BytecodeAssembler;

  // UNBOUND implies unbound_patch_list_begin_.
  // BOUND implies bound_index_.
  enum { UNBOUND, BOUND } state_ = UNBOUND;
  union {
    int unbound_patch_list_begin_ = -1;
    int bound_index_;
  };
};

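// To illustrate the patch list with (purely illustrative) indices: if FORKs
// at indices 3 and 7 both target the same unbound label, then
// unbound_patch_list_begin_ == 7, code_[7].payload.pc == 3, and
// code_[3].payload.pc == -1 marks the end of the list. `Bind` below walks
// this chain and overwrites each payload.pc with the label's final address.
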
class BytecodeAssembler {
 public:
  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed
  // from the `tree` size we're going to compile?
  explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}

  ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }

  void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }

  void Assertion(RegExpAssertion::Type t) {
    code_.Add(RegExpInstruction::Assertion(t), zone_);
  }

  void ClearRegister(int32_t register_index) {
    code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
  }

  void ConsumeRange(base::uc16 from, base::uc16 to) {
    code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
  }

  void ConsumeAnyChar() {
    code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
  }

  void Fork(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
  }

  void Jmp(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
  }

  void SetRegisterToCp(int32_t register_index) {
    code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
  }

  void Bind(Label& target) {
    DCHECK_EQ(target.state_, Label::UNBOUND);

    int index = code_.length();

    // Patch every FORK/JMP on the label's pending list to point at the now
    // known address, then mark the label as bound.
    while (target.unbound_patch_list_begin_ != -1) {
      RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
      DCHECK(inst.opcode == RegExpInstruction::FORK ||
             inst.opcode == RegExpInstruction::JMP);

      target.unbound_patch_list_begin_ = inst.payload.pc;
      inst.payload.pc = index;
    }

    target.state_ = Label::BOUND;
    target.bound_index_ = index;
  }

  void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }

 private:
  void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
    RegExpInstruction result;
    result.opcode = op;

    if (target.state_ == Label::BOUND) {
      result.payload.pc = target.bound_index_;
    } else {
      DCHECK_EQ(target.state_, Label::UNBOUND);
      int new_list_begin = code_.length();
      DCHECK_GE(new_list_begin, 0);

      // Prepend this instruction to the label's patch list; its payload.pc
      // temporarily stores the previous list head.
      result.payload.pc = target.unbound_patch_list_begin_;

      target.unbound_patch_list_begin_ = new_list_begin;
    }

    code_.Add(result, zone_);
  }

  Zone* zone_;
  ZoneList<RegExpInstruction> code_;
};

class CompileVisitor : private RegExpVisitor {
 public:
  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
                                             RegExpFlags flags, Zone* zone) {
    CompileVisitor compiler(zone);

    if (!IsSticky(flags) && !tree->IsAnchoredAtStart()) {
      // The match is not anchored, i.e. may start at any input position, so
      // we emit a preamble corresponding to /.*?/. This skips an arbitrary
      // prefix in the input non-greedily.
      compiler.CompileNonGreedyStar(
          [&]() { compiler.assembler_.ConsumeAnyChar(); });
    }

    compiler.assembler_.SetRegisterToCp(0);
    tree->Accept(&compiler, nullptr);
    compiler.assembler_.SetRegisterToCp(1);
    compiler.assembler_.Accept();

    return std::move(compiler.assembler_).IntoCode();
  }

 private:
  explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}

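  // To illustrate how the visitors below fit together: an unanchored /a/
  // compiles to roughly the following bytecode (the mnemonics are
  // illustrative rather than the exact RegExpInstruction encoding; FORK's
  // operand is the pc of the spawned, lower-priority thread):
  //
  //   0: FORK 2                  // preamble /.*?/
  //   1: JMP 4                   // prefer to start matching right here
  //   2: CONSUME_ANY_CHAR        // ...or skip one more input character
  //   3: FORK 2
  //   4: SET_REGISTER_TO_CP 0    // record the start of the match
  //   5: CONSUME_RANGE 'a' 'a'
  //   6: SET_REGISTER_TO_CP 1    // record the end of the match
  //   7: ACCEPT
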
  // Generate a disjunction of code fragments compiled by a function
  // `gen_alt`. `gen_alt` is called repeatedly with argument `int i = 0, 1,
  // ..., alt_num - 1` and should build code corresponding to the ith
  // alternative.
  template <class F>
  void CompileDisjunction(int alt_num, F&& gen_alt) {
    // An alternative a1 | ... | an is compiled into
    //
    //     FORK tail1
    //     <a1>
    //     JMP end
    //   tail1:
    //     FORK tail2
    //     <a2>
    //     JMP end
    //   tail2:
    //     ...
    //     ...
    //   tail{n - 1}:
    //     <an>
    //   end:
    //
    // By the semantics of the FORK instruction (see the definition and
    // semantics of FORK), a forked thread has lower priority than the thread
    // that spawned it. This means that with the code we're generating here,
    // the thread matching the alternative a1 has indeed the highest priority,
    // followed by the thread for a2 and so on.

    if (alt_num == 0) {
      // The empty disjunction. This can never match.
      assembler_.Fail();
      return;
    }

    Label end;

    for (int i = 0; i != alt_num - 1; ++i) {
      Label tail;
      assembler_.Fork(tail);
      gen_alt(i);
      assembler_.Jmp(end);
      assembler_.Bind(tail);
    }

    gen_alt(alt_num - 1);

    assembler_.Bind(end);
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    ZoneList<RegExpTree*>& alts = *node->alternatives();
    CompileDisjunction(alts.length(),
                       [&](int i) { alts[i]->Accept(this, nullptr); });
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
    }
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    assembler_.Assertion(node->assertion_type());
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    // A character class is compiled as a disjunction over its
    // `CharacterRange`s.
    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
    CharacterRange::Canonicalize(ranges);
    if (node->is_negated()) {
      // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
      // union of k intervals is a union of at most k + 1 intervals.
      ZoneList<CharacterRange>* negated =
          zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
      CharacterRange::Negate(ranges, negated, zone_);
      DCHECK_LE(negated->length(), ranges->length() + 1);
      ranges = negated;
    }

    CompileDisjunction(ranges->length(), [&](int i) {
      // We don't support UTF-16 for now, so we can only deal with ranges
      // that can be specified by (complements of) ranges with base::uc16
      // bounds.
      STATIC_ASSERT(kMaxSupportedCodepoint <=
                    std::numeric_limits<base::uc16>::max());

      base::uc32 from = (*ranges)[i].from();
      DCHECK_LE(from, kMaxSupportedCodepoint);
      base::uc16 from_uc16 = static_cast<base::uc16>(from);

      base::uc32 to = (*ranges)[i].to();
      DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == kMaxCodePoint);
      base::uc16 to_uc16 =
          static_cast<base::uc16>(std::min(to, kMaxSupportedCodepoint));

      assembler_.ConsumeRange(from_uc16, to_uc16);
    });
    return nullptr;
  }

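  // For example, the class [a-cx] canonicalizes to the ranges a-c and x-x,
  // which compile to the two-alternative disjunction (a sketch):
  //
  //     FORK tail
  //     CONSUME_RANGE 'a' 'c'
  //     JMP end
  //   tail:
  //     CONSUME_RANGE 'x' 'x'
  //   end:
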
  void* VisitAtom(RegExpAtom* node, void*) override {
    for (base::uc16 c : node->data()) {
      assembler_.ConsumeRange(c, c);
    }
    return nullptr;
  }

  void ClearRegisters(Interval indices) {
    if (indices.is_empty()) return;
    DCHECK_EQ(indices.from() % 2, 0);
    DCHECK_EQ(indices.to() % 2, 1);
    for (int i = indices.from(); i <= indices.to(); i += 2) {
      // It suffices to clear the register containing the `begin` of a
      // capture because this indicates that the capture is undefined,
      // regardless of the value in the `end` register.
      assembler_.ClearRegister(i);
    }
  }

  // Emit bytecode corresponding to /<emit_body>*/.
  template <class F>
  void CompileGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //   begin:
    //     FORK end
    //     <body>
    //     JMP begin
    //   end:
    //     ...
    //
    // This is greedy because a forked thread has lower priority than the
    // thread that spawned it.
    Label begin;
    Label end;

    assembler_.Bind(begin);
    assembler_.Fork(end);
    emit_body();
    assembler_.Jmp(begin);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>*?/.
  template <class F>
  void CompileNonGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //     FORK body
    //     JMP end
    //   body:
    //     <body>
    //     FORK body
    //   end:
    //     ...

    Label body;
    Label end;

    assembler_.Fork(body);
    assembler_.Jmp(end);

    assembler_.Bind(body);
    emit_body();
    assembler_.Fork(body);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
  template <class F>
  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK end
    //     <body>
    //     FORK end
    //     <body>
    //     ...
    //     ...
    //     FORK end
    //     <body>
    //   end:
    //     ...

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      assembler_.Fork(end);
      emit_body();
    }
    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
  template <class F>
  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK body0
    //     JMP end
    //   body0:
    //     <body>
    //     FORK body1
    //     JMP end
    //   body1:
    //     <body>
    //     ...
    //     ...
    //   body{max_repetition_num - 1}:
    //     <body>
    //   end:
    //     ...

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      Label body;
      assembler_.Fork(body);
      assembler_.Jmp(end);

      assembler_.Bind(body);
      emit_body();
    }
    assembler_.Bind(end);
  }

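  // VisitQuantifier below combines the helpers above: /a{2,4}/ (greedy), for
  // example, is emitted as node->min() == 2 mandatory copies of the body
  // followed by CompileGreedyRepetition(emit_body, 4 - 2), i.e. roughly
  // (a sketch):
  //
  //     CONSUME_RANGE 'a' 'a'
  //     CONSUME_RANGE 'a' 'a'
  //     FORK end
  //     CONSUME_RANGE 'a' 'a'
  //     FORK end
  //     CONSUME_RANGE 'a' 'a'
  //   end:
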
  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Emit the body, but clear registers occurring in the body first.
    //
    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
    // and b) clear them. For example, we don't have to capture anything for
    // the first 4 repetitions if node->min() >= 5, and then we don't have to
    // clear registers in the first node->min() repetitions. And if
    // node->min() == 0, we don't have to clear registers before the first
    // optional repetition.
    Interval body_registers = node->body()->CaptureRegisters();
    auto emit_body = [&]() {
      ClearRegisters(body_registers);
      node->body()->Accept(this, nullptr);
    };

    // First repeat the body `min()` times.
    for (int i = 0; i != node->min(); ++i) emit_body();

    switch (node->quantifier_type()) {
      case RegExpQuantifier::POSSESSIVE:
        UNREACHABLE();
      case RegExpQuantifier::GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileGreedyRepetition(emit_body, node->max() - node->min());
        }
        break;
      }
      case RegExpQuantifier::NON_GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileNonGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
        }
      }
    }
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    // Capture i is recorded in registers 2i (start) and 2i + 1 (end).
    int index = node->index();
    int start_register = RegExpCapture::StartRegister(index);
    int end_register = RegExpCapture::EndRegister(index);
    assembler_.SetRegisterToCp(start_register);
    node->body()->Accept(this, nullptr);
    assembler_.SetRegisterToCp(end_register);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid,v8:10765): Support this case.
    UNREACHABLE();
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    UNREACHABLE();
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& text_el : *node->elements()) {
      text_el.tree()->Accept(this, nullptr);
    }
    return nullptr;
  }

 private:
  Zone* zone_;
  BytecodeAssembler assembler_;
};

}  // namespace

ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, RegExpFlags flags, Zone* zone) {
  return CompileVisitor::Compile(tree, flags, zone);
}

}  // namespace internal
}  // namespace v8