1// Copyright 2019 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/regexp-compiler.h" 6 7#include "src/base/safe_conversions.h" 8#include "src/execution/isolate.h" 9#include "src/objects/fixed-array-inl.h" 10#include "src/regexp/regexp-macro-assembler-arch.h" 11#include "src/strings/unicode-inl.h" 12#include "src/zone/zone-list-inl.h" 13 14#ifdef V8_INTL_SUPPORT 15#include "src/regexp/special-case.h" 16#include "unicode/locid.h" 17#include "unicode/uniset.h" 18#include "unicode/utypes.h" 19#endif // V8_INTL_SUPPORT 20 21namespace v8 { 22namespace internal { 23 24using namespace regexp_compiler_constants; // NOLINT(build/namespaces) 25 26// ------------------------------------------------------------------- 27// Implementation of the Irregexp regular expression engine. 28// 29// The Irregexp regular expression engine is intended to be a complete 30// implementation of ECMAScript regular expressions. It generates either 31// bytecodes or native code. 32 33// The Irregexp regexp engine is structured in three steps. 34// 1) The parser generates an abstract syntax tree. See ast.cc. 35// 2) From the AST a node network is created. The nodes are all 36// subclasses of RegExpNode. The nodes represent states when 37// executing a regular expression. Several optimizations are 38// performed on the node network. 39// 3) From the nodes we generate either byte codes or native code 40// that can actually execute the regular expression (perform 41// the search). The code generation step is described in more 42// detail below. 43 44// Code generation. 45// 46// The nodes are divided into four main categories. 47// * Choice nodes 48// These represent places where the regular expression can 49// match in more than one way. For example on entry to an 50// alternation (foo|bar) or a repetition (*, +, ? or {}). 51// * Action nodes 52// These represent places where some action should be 53// performed. Examples include recording the current position 54// in the input string to a register (in order to implement 55// captures) or other actions on register for example in order 56// to implement the counters needed for {} repetitions. 57// * Matching nodes 58// These attempt to match some element part of the input string. 59// Examples of elements include character classes, plain strings 60// or back references. 61// * End nodes 62// These are used to implement the actions required on finding 63// a successful match or failing to find a match. 64// 65// The code generated (whether as byte codes or native code) maintains 66// some state as it runs. This consists of the following elements: 67// 68// * The capture registers. Used for string captures. 69// * Other registers. Used for counters etc. 70// * The current position. 71// * The stack of backtracking information. Used when a matching node 72// fails to find a match and needs to try an alternative. 73// 74// Conceptual regular expression execution model: 75// 76// There is a simple conceptual model of regular expression execution 77// which will be presented first. The actual code generated is a more 78// efficient simulation of the simple conceptual model: 79// 80// * Choice nodes are implemented as follows: 81// For each choice except the last { 82// push current position 83// push backtrack code location 84// <generate code to test for choice> 85// backtrack code location: 86// pop current position 87// } 88// <generate code to test for last choice> 89// 90// * Actions nodes are generated as follows 91// <push affected registers on backtrack stack> 92// <generate code to perform action> 93// push backtrack code location 94// <generate code to test for following nodes> 95// backtrack code location: 96// <pop affected registers to restore their state> 97// <pop backtrack location from stack and go to it> 98// 99// * Matching nodes are generated as follows: 100// if input string matches at current position 101// update current position 102// <generate code to test for following nodes> 103// else 104// <pop backtrack location from stack and go to it> 105// 106// Thus it can be seen that the current position is saved and restored 107// by the choice nodes, whereas the registers are saved and restored by 108// by the action nodes that manipulate them. 109// 110// The other interesting aspect of this model is that nodes are generated 111// at the point where they are needed by a recursive call to Emit(). If 112// the node has already been code generated then the Emit() call will 113// generate a jump to the previously generated code instead. In order to 114// limit recursion it is possible for the Emit() function to put the node 115// on a work list for later generation and instead generate a jump. The 116// destination of the jump is resolved later when the code is generated. 117// 118// Actual regular expression code generation. 119// 120// Code generation is actually more complicated than the above. In order 121// to improve the efficiency of the generated code some optimizations are 122// performed 123// 124// * Choice nodes have 1-character lookahead. 125// A choice node looks at the following character and eliminates some of 126// the choices immediately based on that character. This is not yet 127// implemented. 128// * Simple greedy loops store reduced backtracking information. 129// A quantifier like /.*foo/m will greedily match the whole input. It will 130// then need to backtrack to a point where it can match "foo". The naive 131// implementation of this would push each character position onto the 132// backtracking stack, then pop them off one by one. This would use space 133// proportional to the length of the input string. However since the "." 134// can only match in one way and always has a constant length (in this case 135// of 1) it suffices to store the current position on the top of the stack 136// once. Matching now becomes merely incrementing the current position and 137// backtracking becomes decrementing the current position and checking the 138// result against the stored current position. This is faster and saves 139// space. 140// * The current state is virtualized. 141// This is used to defer expensive operations until it is clear that they 142// are needed and to generate code for a node more than once, allowing 143// specialized an efficient versions of the code to be created. This is 144// explained in the section below. 145// 146// Execution state virtualization. 147// 148// Instead of emitting code, nodes that manipulate the state can record their 149// manipulation in an object called the Trace. The Trace object can record a 150// current position offset, an optional backtrack code location on the top of 151// the virtualized backtrack stack and some register changes. When a node is 152// to be emitted it can flush the Trace or update it. Flushing the Trace 153// will emit code to bring the actual state into line with the virtual state. 154// Avoiding flushing the state can postpone some work (e.g. updates of capture 155// registers). Postponing work can save time when executing the regular 156// expression since it may be found that the work never has to be done as a 157// failure to match can occur. In addition it is much faster to jump to a 158// known backtrack code location than it is to pop an unknown backtrack 159// location from the stack and jump there. 160// 161// The virtual state found in the Trace affects code generation. For example 162// the virtual state contains the difference between the actual current 163// position and the virtual current position, and matching code needs to use 164// this offset to attempt a match in the correct location of the input 165// string. Therefore code generated for a non-trivial trace is specialized 166// to that trace. The code generator therefore has the ability to generate 167// code for each node several times. In order to limit the size of the 168// generated code there is an arbitrary limit on how many specialized sets of 169// code may be generated for a given node. If the limit is reached, the 170// trace is flushed and a generic version of the code for a node is emitted. 171// This is subsequently used for that node. The code emitted for non-generic 172// trace is not recorded in the node and so it cannot currently be reused in 173// the event that code generation is requested for an identical trace. 174 175namespace { 176 177constexpr base::uc32 MaxCodeUnit(const bool one_byte) { 178 STATIC_ASSERT(String::kMaxOneByteCharCodeU <= 179 std::numeric_limits<uint16_t>::max()); 180 STATIC_ASSERT(String::kMaxUtf16CodeUnitU <= 181 std::numeric_limits<uint16_t>::max()); 182 return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU; 183} 184 185constexpr uint32_t CharMask(const bool one_byte) { 186 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1)); 187 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1)); 188 return MaxCodeUnit(one_byte); 189} 190 191} // namespace 192 193void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); } 194 195void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 196 text->AddElement(TextElement::Atom(this), zone); 197} 198 199void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 200 text->AddElement(TextElement::CharClass(this), zone); 201} 202 203void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 204 for (int i = 0; i < elements()->length(); i++) 205 text->AddElement(elements()->at(i), zone); 206} 207 208TextElement TextElement::Atom(RegExpAtom* atom) { 209 return TextElement(ATOM, atom); 210} 211 212TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 213 return TextElement(CHAR_CLASS, char_class); 214} 215 216int TextElement::length() const { 217 switch (text_type()) { 218 case ATOM: 219 return atom()->length(); 220 221 case CHAR_CLASS: 222 return 1; 223 } 224 UNREACHABLE(); 225} 226 227class RecursionCheck { 228 public: 229 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 230 compiler->IncrementRecursionDepth(); 231 } 232 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 233 234 private: 235 RegExpCompiler* compiler_; 236}; 237 238// Attempts to compile the regexp using an Irregexp code generator. Returns 239// a fixed array or a null handle depending on whether it succeeded. 240RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 241 RegExpFlags flags, bool one_byte) 242 : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)), 243 unicode_lookaround_stack_register_(kNoRegister), 244 unicode_lookaround_position_register_(kNoRegister), 245 work_list_(nullptr), 246 recursion_depth_(0), 247 flags_(flags), 248 one_byte_(one_byte), 249 reg_exp_too_big_(false), 250 limiting_recursion_(false), 251 optimize_(FLAG_regexp_optimization), 252 read_backward_(false), 253 current_expansion_factor_(1), 254 frequency_collator_(), 255 isolate_(isolate), 256 zone_(zone) { 257 accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone); 258 DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1); 259} 260 261RegExpCompiler::CompilationResult RegExpCompiler::Assemble( 262 Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, 263 int capture_count, Handle<String> pattern) { 264 macro_assembler_ = macro_assembler; 265 266 ZoneVector<RegExpNode*> work_list(zone()); 267 work_list_ = &work_list; 268 Label fail; 269 macro_assembler_->PushBacktrack(&fail); 270 Trace new_trace; 271 start->Emit(this, &new_trace); 272 macro_assembler_->BindJumpTarget(&fail); 273 macro_assembler_->Fail(); 274 while (!work_list.empty()) { 275 RegExpNode* node = work_list.back(); 276 work_list.pop_back(); 277 node->set_on_work_list(false); 278 if (!node->label()->is_bound()) node->Emit(this, &new_trace); 279 } 280 if (reg_exp_too_big_) { 281 if (FLAG_correctness_fuzzer_suppressions) { 282 FATAL("Aborting on excess zone allocation"); 283 } 284 macro_assembler_->AbortedCodeGeneration(); 285 return CompilationResult::RegExpTooBig(); 286 } 287 288 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 289 isolate->IncreaseTotalRegexpCodeGenerated(code); 290 work_list_ = nullptr; 291 292 return {code, next_register_}; 293} 294 295bool Trace::DeferredAction::Mentions(int that) { 296 if (action_type() == ActionNode::CLEAR_CAPTURES) { 297 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 298 return range.Contains(that); 299 } else { 300 return reg() == that; 301 } 302} 303 304bool Trace::mentions_reg(int reg) { 305 for (DeferredAction* action = actions_; action != nullptr; 306 action = action->next()) { 307 if (action->Mentions(reg)) return true; 308 } 309 return false; 310} 311 312bool Trace::GetStoredPosition(int reg, int* cp_offset) { 313 DCHECK_EQ(0, *cp_offset); 314 for (DeferredAction* action = actions_; action != nullptr; 315 action = action->next()) { 316 if (action->Mentions(reg)) { 317 if (action->action_type() == ActionNode::STORE_POSITION) { 318 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 319 return true; 320 } else { 321 return false; 322 } 323 } 324 } 325 return false; 326} 327 328// A (dynamically-sized) set of unsigned integers that behaves especially well 329// on small integers (< kFirstLimit). May do zone-allocation. 330class DynamicBitSet : public ZoneObject { 331 public: 332 V8_EXPORT_PRIVATE bool Get(unsigned value) const { 333 if (value < kFirstLimit) { 334 return (first_ & (1 << value)) != 0; 335 } else if (remaining_ == nullptr) { 336 return false; 337 } else { 338 return remaining_->Contains(value); 339 } 340 } 341 342 // Destructively set a value in this set. 343 void Set(unsigned value, Zone* zone) { 344 if (value < kFirstLimit) { 345 first_ |= (1 << value); 346 } else { 347 if (remaining_ == nullptr) 348 remaining_ = zone->New<ZoneList<unsigned>>(1, zone); 349 if (remaining_->is_empty() || !remaining_->Contains(value)) 350 remaining_->Add(value, zone); 351 } 352 } 353 354 private: 355 static constexpr unsigned kFirstLimit = 32; 356 357 uint32_t first_ = 0; 358 ZoneList<unsigned>* remaining_ = nullptr; 359}; 360 361int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers, 362 Zone* zone) { 363 int max_register = RegExpCompiler::kNoRegister; 364 for (DeferredAction* action = actions_; action != nullptr; 365 action = action->next()) { 366 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 367 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 368 for (int i = range.from(); i <= range.to(); i++) 369 affected_registers->Set(i, zone); 370 if (range.to() > max_register) max_register = range.to(); 371 } else { 372 affected_registers->Set(action->reg(), zone); 373 if (action->reg() > max_register) max_register = action->reg(); 374 } 375 } 376 return max_register; 377} 378 379void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 380 int max_register, 381 const DynamicBitSet& registers_to_pop, 382 const DynamicBitSet& registers_to_clear) { 383 for (int reg = max_register; reg >= 0; reg--) { 384 if (registers_to_pop.Get(reg)) { 385 assembler->PopRegister(reg); 386 } else if (registers_to_clear.Get(reg)) { 387 int clear_to = reg; 388 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 389 reg--; 390 } 391 assembler->ClearRegisters(reg, clear_to); 392 } 393 } 394} 395 396void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 397 int max_register, 398 const DynamicBitSet& affected_registers, 399 DynamicBitSet* registers_to_pop, 400 DynamicBitSet* registers_to_clear, 401 Zone* zone) { 402 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 403 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 404 405 // Count pushes performed to force a stack limit check occasionally. 406 int pushes = 0; 407 408 for (int reg = 0; reg <= max_register; reg++) { 409 if (!affected_registers.Get(reg)) continue; 410 411 // The chronologically first deferred action in the trace 412 // is used to infer the action needed to restore a register 413 // to its previous state (or not, if it's safe to ignore it). 414 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 415 DeferredActionUndoType undo_action = IGNORE; 416 417 int value = 0; 418 bool absolute = false; 419 bool clear = false; 420 static const int kNoStore = kMinInt; 421 int store_position = kNoStore; 422 // This is a little tricky because we are scanning the actions in reverse 423 // historical order (newest first). 424 for (DeferredAction* action = actions_; action != nullptr; 425 action = action->next()) { 426 if (action->Mentions(reg)) { 427 switch (action->action_type()) { 428 case ActionNode::SET_REGISTER_FOR_LOOP: { 429 Trace::DeferredSetRegisterForLoop* psr = 430 static_cast<Trace::DeferredSetRegisterForLoop*>(action); 431 if (!absolute) { 432 value += psr->value(); 433 absolute = true; 434 } 435 // SET_REGISTER_FOR_LOOP is only used for newly introduced loop 436 // counters. They can have a significant previous value if they 437 // occur in a loop. TODO(lrn): Propagate this information, so 438 // we can set undo_action to IGNORE if we know there is no value to 439 // restore. 440 undo_action = RESTORE; 441 DCHECK_EQ(store_position, kNoStore); 442 DCHECK(!clear); 443 break; 444 } 445 case ActionNode::INCREMENT_REGISTER: 446 if (!absolute) { 447 value++; 448 } 449 DCHECK_EQ(store_position, kNoStore); 450 DCHECK(!clear); 451 undo_action = RESTORE; 452 break; 453 case ActionNode::STORE_POSITION: { 454 Trace::DeferredCapture* pc = 455 static_cast<Trace::DeferredCapture*>(action); 456 if (!clear && store_position == kNoStore) { 457 store_position = pc->cp_offset(); 458 } 459 460 // For captures we know that stores and clears alternate. 461 // Other register, are never cleared, and if the occur 462 // inside a loop, they might be assigned more than once. 463 if (reg <= 1) { 464 // Registers zero and one, aka "capture zero", is 465 // always set correctly if we succeed. There is no 466 // need to undo a setting on backtrack, because we 467 // will set it again or fail. 468 undo_action = IGNORE; 469 } else { 470 undo_action = pc->is_capture() ? CLEAR : RESTORE; 471 } 472 DCHECK(!absolute); 473 DCHECK_EQ(value, 0); 474 break; 475 } 476 case ActionNode::CLEAR_CAPTURES: { 477 // Since we're scanning in reverse order, if we've already 478 // set the position we have to ignore historically earlier 479 // clearing operations. 480 if (store_position == kNoStore) { 481 clear = true; 482 } 483 undo_action = RESTORE; 484 DCHECK(!absolute); 485 DCHECK_EQ(value, 0); 486 break; 487 } 488 default: 489 UNREACHABLE(); 490 } 491 } 492 } 493 // Prepare for the undo-action (e.g., push if it's going to be popped). 494 if (undo_action == RESTORE) { 495 pushes++; 496 RegExpMacroAssembler::StackCheckFlag stack_check = 497 RegExpMacroAssembler::kNoStackLimitCheck; 498 if (pushes == push_limit) { 499 stack_check = RegExpMacroAssembler::kCheckStackLimit; 500 pushes = 0; 501 } 502 503 assembler->PushRegister(reg, stack_check); 504 registers_to_pop->Set(reg, zone); 505 } else if (undo_action == CLEAR) { 506 registers_to_clear->Set(reg, zone); 507 } 508 // Perform the chronologically last action (or accumulated increment) 509 // for the register. 510 if (store_position != kNoStore) { 511 assembler->WriteCurrentPositionToRegister(reg, store_position); 512 } else if (clear) { 513 assembler->ClearRegisters(reg, reg); 514 } else if (absolute) { 515 assembler->SetRegister(reg, value); 516 } else if (value != 0) { 517 assembler->AdvanceRegister(reg, value); 518 } 519 } 520} 521 522// This is called as we come into a loop choice node and some other tricky 523// nodes. It normalizes the state of the code generator to ensure we can 524// generate generic code. 525void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 526 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 527 528 DCHECK(!is_trivial()); 529 530 if (actions_ == nullptr && backtrack() == nullptr) { 531 // Here we just have some deferred cp advances to fix and we are back to 532 // a normal situation. We may also have to forget some information gained 533 // through a quick check that was already performed. 534 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 535 // Create a new trivial state and generate the node with that. 536 Trace new_state; 537 successor->Emit(compiler, &new_state); 538 return; 539 } 540 541 // Generate deferred actions here along with code to undo them again. 542 DynamicBitSet affected_registers; 543 544 if (backtrack() != nullptr) { 545 // Here we have a concrete backtrack location. These are set up by choice 546 // nodes and so they indicate that we have a deferred save of the current 547 // position which we may need to emit here. 548 assembler->PushCurrentPosition(); 549 } 550 551 int max_register = 552 FindAffectedRegisters(&affected_registers, compiler->zone()); 553 DynamicBitSet registers_to_pop; 554 DynamicBitSet registers_to_clear; 555 PerformDeferredActions(assembler, max_register, affected_registers, 556 ®isters_to_pop, ®isters_to_clear, 557 compiler->zone()); 558 if (cp_offset_ != 0) { 559 assembler->AdvanceCurrentPosition(cp_offset_); 560 } 561 562 // Create a new trivial state and generate the node with that. 563 Label undo; 564 assembler->PushBacktrack(&undo); 565 if (successor->KeepRecursing(compiler)) { 566 Trace new_state; 567 successor->Emit(compiler, &new_state); 568 } else { 569 compiler->AddWork(successor); 570 assembler->GoTo(successor->label()); 571 } 572 573 // On backtrack we need to restore state. 574 assembler->BindJumpTarget(&undo); 575 RestoreAffectedRegisters(assembler, max_register, registers_to_pop, 576 registers_to_clear); 577 if (backtrack() == nullptr) { 578 assembler->Backtrack(); 579 } else { 580 assembler->PopCurrentPosition(); 581 assembler->GoTo(backtrack()); 582 } 583} 584 585void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 586 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 587 588 // Omit flushing the trace. We discard the entire stack frame anyway. 589 590 if (!label()->is_bound()) { 591 // We are completely independent of the trace, since we ignore it, 592 // so this code can be used as the generic version. 593 assembler->Bind(label()); 594 } 595 596 // Throw away everything on the backtrack stack since the start 597 // of the negative submatch and restore the character position. 598 assembler->ReadCurrentPositionFromRegister(current_position_register_); 599 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 600 if (clear_capture_count_ > 0) { 601 // Clear any captures that might have been performed during the success 602 // of the body of the negative look-ahead. 603 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 604 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 605 } 606 // Now that we have unwound the stack we find at the top of the stack the 607 // backtrack that the BeginNegativeSubmatch node got. 608 assembler->Backtrack(); 609} 610 611void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 612 if (!trace->is_trivial()) { 613 trace->Flush(compiler, this); 614 return; 615 } 616 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 617 if (!label()->is_bound()) { 618 assembler->Bind(label()); 619 } 620 switch (action_) { 621 case ACCEPT: 622 assembler->Succeed(); 623 return; 624 case BACKTRACK: 625 assembler->GoTo(trace->backtrack()); 626 return; 627 case NEGATIVE_SUBMATCH_SUCCESS: 628 // This case is handled in a different virtual method. 629 UNREACHABLE(); 630 } 631 UNIMPLEMENTED(); 632} 633 634void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 635 if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone); 636 guards_->Add(guard, zone); 637} 638 639ActionNode* ActionNode::SetRegisterForLoop(int reg, int val, 640 RegExpNode* on_success) { 641 ActionNode* result = 642 on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success); 643 result->data_.u_store_register.reg = reg; 644 result->data_.u_store_register.value = val; 645 return result; 646} 647 648ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 649 ActionNode* result = 650 on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success); 651 result->data_.u_increment_register.reg = reg; 652 return result; 653} 654 655ActionNode* ActionNode::StorePosition(int reg, bool is_capture, 656 RegExpNode* on_success) { 657 ActionNode* result = 658 on_success->zone()->New<ActionNode>(STORE_POSITION, on_success); 659 result->data_.u_position_register.reg = reg; 660 result->data_.u_position_register.is_capture = is_capture; 661 return result; 662} 663 664ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) { 665 ActionNode* result = 666 on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success); 667 result->data_.u_clear_captures.range_from = range.from(); 668 result->data_.u_clear_captures.range_to = range.to(); 669 return result; 670} 671 672ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg, 673 RegExpNode* on_success) { 674 ActionNode* result = 675 on_success->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success); 676 result->data_.u_submatch.stack_pointer_register = stack_reg; 677 result->data_.u_submatch.current_position_register = position_reg; 678 return result; 679} 680 681ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg, 682 RegExpNode* on_success) { 683 ActionNode* result = 684 on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success); 685 result->data_.u_submatch.stack_pointer_register = stack_reg; 686 result->data_.u_submatch.current_position_register = position_reg; 687 return result; 688} 689 690ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg, 691 int clear_register_count, 692 int clear_register_from, 693 RegExpNode* on_success) { 694 ActionNode* result = on_success->zone()->New<ActionNode>( 695 POSITIVE_SUBMATCH_SUCCESS, on_success); 696 result->data_.u_submatch.stack_pointer_register = stack_reg; 697 result->data_.u_submatch.current_position_register = position_reg; 698 result->data_.u_submatch.clear_register_count = clear_register_count; 699 result->data_.u_submatch.clear_register_from = clear_register_from; 700 return result; 701} 702 703ActionNode* ActionNode::EmptyMatchCheck(int start_register, 704 int repetition_register, 705 int repetition_limit, 706 RegExpNode* on_success) { 707 ActionNode* result = 708 on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success); 709 result->data_.u_empty_match_check.start_register = start_register; 710 result->data_.u_empty_match_check.repetition_register = repetition_register; 711 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 712 return result; 713} 714 715#define DEFINE_ACCEPT(Type) \ 716 void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); } 717FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 718#undef DEFINE_ACCEPT 719 720// ------------------------------------------------------------------- 721// Emit code. 722 723void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 724 Guard* guard, Trace* trace) { 725 switch (guard->op()) { 726 case Guard::LT: 727 DCHECK(!trace->mentions_reg(guard->reg())); 728 macro_assembler->IfRegisterGE(guard->reg(), guard->value(), 729 trace->backtrack()); 730 break; 731 case Guard::GEQ: 732 DCHECK(!trace->mentions_reg(guard->reg())); 733 macro_assembler->IfRegisterLT(guard->reg(), guard->value(), 734 trace->backtrack()); 735 break; 736 } 737} 738 739namespace { 740 741#ifdef DEBUG 742bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) { 743 STATIC_ASSERT(sizeof(unibrow::uchar) == 4); 744 for (int i = 0; i < length; i++) { 745 if (chars[i] > String::kMaxUtf16CodeUnit) return false; 746 } 747 return true; 748} 749#endif // DEBUG 750 751// Returns the number of characters in the equivalence class, omitting those 752// that cannot occur in the source string because it is Latin1. 753int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character, 754 bool one_byte_subject, unibrow::uchar* letters, 755 int letter_length) { 756#ifdef V8_INTL_SUPPORT 757 if (RegExpCaseFolding::IgnoreSet().contains(character)) { 758 letters[0] = character; 759 DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1)); 760 return 1; 761 } 762 bool in_special_add_set = 763 RegExpCaseFolding::SpecialAddSet().contains(character); 764 765 icu::UnicodeSet set; 766 set.add(character); 767 set = set.closeOver(USET_CASE_INSENSITIVE); 768 769 UChar32 canon = 0; 770 if (in_special_add_set) { 771 canon = RegExpCaseFolding::Canonicalize(character); 772 } 773 774 int32_t range_count = set.getRangeCount(); 775 int items = 0; 776 for (int32_t i = 0; i < range_count; i++) { 777 UChar32 start = set.getRangeStart(i); 778 UChar32 end = set.getRangeEnd(i); 779 CHECK(end - start + items <= letter_length); 780 for (UChar32 cu = start; cu <= end; cu++) { 781 if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; 782 if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) { 783 continue; 784 } 785 letters[items++] = static_cast<unibrow::uchar>(cu); 786 } 787 } 788 DCHECK(ContainsOnlyUtf16CodeUnits(letters, items)); 789 return items; 790#else 791 int length = 792 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 793 // Unibrow returns 0 or 1 for characters where case independence is 794 // trivial. 795 if (length == 0) { 796 letters[0] = character; 797 length = 1; 798 } 799 800 if (one_byte_subject) { 801 int new_length = 0; 802 for (int i = 0; i < length; i++) { 803 if (letters[i] <= String::kMaxOneByteCharCode) { 804 letters[new_length++] = letters[i]; 805 } 806 } 807 length = new_length; 808 } 809 810 DCHECK(ContainsOnlyUtf16CodeUnits(letters, length)); 811 return length; 812#endif // V8_INTL_SUPPORT 813} 814 815inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler, 816 base::uc16 c, Label* on_failure, int cp_offset, 817 bool check, bool preloaded) { 818 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 819 bool bound_checked = false; 820 if (!preloaded) { 821 assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 822 bound_checked = true; 823 } 824 assembler->CheckNotCharacter(c, on_failure); 825 return bound_checked; 826} 827 828// Only emits non-letters (things that don't have case). Only used for case 829// independent matches. 830inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, 831 base::uc16 c, Label* on_failure, int cp_offset, 832 bool check, bool preloaded) { 833 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 834 bool one_byte = compiler->one_byte(); 835 unibrow::uchar chars[4]; 836 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); 837 if (length < 1) { 838 // This can't match. Must be an one-byte subject and a non-one-byte 839 // character. We do not need to do anything since the one-byte pass 840 // already handled this. 841 return false; // Bounds not checked. 842 } 843 bool checked = false; 844 // We handle the length > 1 case in a later pass. 845 if (length == 1) { 846 if (one_byte && c > String::kMaxOneByteCharCodeU) { 847 // Can't match - see above. 848 return false; // Bounds not checked. 849 } 850 if (!preloaded) { 851 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 852 checked = check; 853 } 854 macro_assembler->CheckNotCharacter(c, on_failure); 855 } 856 return checked; 857} 858 859bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 860 bool one_byte, base::uc16 c1, base::uc16 c2, 861 Label* on_failure) { 862 const uint32_t char_mask = CharMask(one_byte); 863 base::uc16 exor = c1 ^ c2; 864 // Check whether exor has only one bit set. 865 if (((exor - 1) & exor) == 0) { 866 // If c1 and c2 differ only by one bit. 867 // Ecma262UnCanonicalize always gives the highest number last. 868 DCHECK(c2 > c1); 869 base::uc16 mask = char_mask ^ exor; 870 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 871 return true; 872 } 873 DCHECK(c2 > c1); 874 base::uc16 diff = c2 - c1; 875 if (((diff - 1) & diff) == 0 && c1 >= diff) { 876 // If the characters differ by 2^n but don't differ by one bit then 877 // subtract the difference from the found character, then do the or 878 // trick. We avoid the theoretical case where negative numbers are 879 // involved in order to simplify code generation. 880 base::uc16 mask = char_mask ^ diff; 881 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask, 882 on_failure); 883 return true; 884 } 885 return false; 886} 887 888// Only emits letters (things that have case). Only used for case independent 889// matches. 890inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, 891 base::uc16 c, Label* on_failure, int cp_offset, 892 bool check, bool preloaded) { 893 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 894 bool one_byte = compiler->one_byte(); 895 unibrow::uchar chars[4]; 896 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); 897 if (length <= 1) return false; 898 // We may not need to check against the end of the input string 899 // if this character lies before a character that matched. 900 if (!preloaded) { 901 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 902 } 903 Label ok; 904 switch (length) { 905 case 2: { 906 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 907 chars[1], on_failure)) { 908 } else { 909 macro_assembler->CheckCharacter(chars[0], &ok); 910 macro_assembler->CheckNotCharacter(chars[1], on_failure); 911 macro_assembler->Bind(&ok); 912 } 913 break; 914 } 915 case 4: 916 macro_assembler->CheckCharacter(chars[3], &ok); 917 V8_FALLTHROUGH; 918 case 3: 919 macro_assembler->CheckCharacter(chars[0], &ok); 920 macro_assembler->CheckCharacter(chars[1], &ok); 921 macro_assembler->CheckNotCharacter(chars[2], on_failure); 922 macro_assembler->Bind(&ok); 923 break; 924 default: 925 UNREACHABLE(); 926 } 927 return true; 928} 929 930void EmitBoundaryTest(RegExpMacroAssembler* masm, int border, 931 Label* fall_through, Label* above_or_equal, 932 Label* below) { 933 if (below != fall_through) { 934 masm->CheckCharacterLT(border, below); 935 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 936 } else { 937 masm->CheckCharacterGT(border - 1, above_or_equal); 938 } 939} 940 941void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last, 942 Label* fall_through, Label* in_range, 943 Label* out_of_range) { 944 if (in_range == fall_through) { 945 if (first == last) { 946 masm->CheckNotCharacter(first, out_of_range); 947 } else { 948 masm->CheckCharacterNotInRange(first, last, out_of_range); 949 } 950 } else { 951 if (first == last) { 952 masm->CheckCharacter(first, in_range); 953 } else { 954 masm->CheckCharacterInRange(first, last, in_range); 955 } 956 if (out_of_range != fall_through) masm->GoTo(out_of_range); 957 } 958} 959 960// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 961// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 962void EmitUseLookupTable(RegExpMacroAssembler* masm, 963 ZoneList<base::uc32>* ranges, uint32_t start_index, 964 uint32_t end_index, base::uc32 min_char, 965 Label* fall_through, Label* even_label, 966 Label* odd_label) { 967 static const uint32_t kSize = RegExpMacroAssembler::kTableSize; 968 static const uint32_t kMask = RegExpMacroAssembler::kTableMask; 969 970 base::uc32 base = (min_char & ~kMask); 971 USE(base); 972 973 // Assert that everything is on one kTableSize page. 974 for (uint32_t i = start_index; i <= end_index; i++) { 975 DCHECK_EQ(ranges->at(i) & ~kMask, base); 976 } 977 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 978 979 char templ[kSize]; 980 Label* on_bit_set; 981 Label* on_bit_clear; 982 int bit; 983 if (even_label == fall_through) { 984 on_bit_set = odd_label; 985 on_bit_clear = even_label; 986 bit = 1; 987 } else { 988 on_bit_set = even_label; 989 on_bit_clear = odd_label; 990 bit = 0; 991 } 992 for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; 993 i++) { 994 templ[i] = bit; 995 } 996 uint32_t j = 0; 997 bit ^= 1; 998 for (uint32_t i = start_index; i < end_index; i++) { 999 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1000 templ[j] = bit; 1001 } 1002 bit ^= 1; 1003 } 1004 for (uint32_t i = j; i < kSize; i++) { 1005 templ[i] = bit; 1006 } 1007 Factory* factory = masm->isolate()->factory(); 1008 // TODO(erikcorry): Cache these. 1009 Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld); 1010 for (uint32_t i = 0; i < kSize; i++) { 1011 ba->set(i, templ[i]); 1012 } 1013 masm->CheckBitInTable(ba, on_bit_set); 1014 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1015} 1016 1017void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges, 1018 uint32_t start_index, uint32_t end_index, uint32_t cut_index, 1019 Label* even_label, Label* odd_label) { 1020 bool odd = (((cut_index - start_index) & 1) == 1); 1021 Label* in_range_label = odd ? odd_label : even_label; 1022 Label dummy; 1023 EmitDoubleBoundaryTest(masm, ranges->at(cut_index), 1024 ranges->at(cut_index + 1) - 1, &dummy, in_range_label, 1025 &dummy); 1026 DCHECK(!dummy.is_linked()); 1027 // Cut out the single range by rewriting the array. This creates a new 1028 // range that is a merger of the two ranges on either side of the one we 1029 // are cutting out. The oddity of the labels is preserved. 1030 for (uint32_t j = cut_index; j > start_index; j--) { 1031 ranges->at(j) = ranges->at(j - 1); 1032 } 1033 for (uint32_t j = cut_index + 1; j < end_index; j++) { 1034 ranges->at(j) = ranges->at(j + 1); 1035 } 1036} 1037 1038// Unicode case. Split the search space into kSize spaces that are handled 1039// with recursion. 1040void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index, 1041 uint32_t end_index, uint32_t* new_start_index, 1042 uint32_t* new_end_index, base::uc32* border) { 1043 static const uint32_t kSize = RegExpMacroAssembler::kTableSize; 1044 static const uint32_t kMask = RegExpMacroAssembler::kTableMask; 1045 1046 base::uc32 first = ranges->at(start_index); 1047 base::uc32 last = ranges->at(end_index) - 1; 1048 1049 *new_start_index = start_index; 1050 *border = (ranges->at(start_index) & ~kMask) + kSize; 1051 while (*new_start_index < end_index) { 1052 if (ranges->at(*new_start_index) > *border) break; 1053 (*new_start_index)++; 1054 } 1055 // new_start_index is the index of the first edge that is beyond the 1056 // current kSize space. 1057 1058 // For very large search spaces we do a binary chop search of the non-Latin1 1059 // space instead of just going to the end of the current kSize space. The 1060 // heuristics are complicated a little by the fact that any 128-character 1061 // encoding space can be quickly tested with a table lookup, so we don't 1062 // wish to do binary chop search at a smaller granularity than that. A 1063 // 128-character space can take up a lot of space in the ranges array if, 1064 // for example, we only want to match every second character (eg. the lower 1065 // case characters on some Unicode pages). 1066 uint32_t binary_chop_index = (end_index + start_index) / 2; 1067 // The first test ensures that we get to the code that handles the Latin1 1068 // range with a single not-taken branch, speeding up this important 1069 // character range (even non-Latin1 charset-based text has spaces and 1070 // punctuation). 1071 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1072 end_index - start_index > (*new_start_index - start_index) * 2 && 1073 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1074 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1075 uint32_t scan_forward_for_section_border = binary_chop_index; 1076 uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1077 1078 while (scan_forward_for_section_border < end_index) { 1079 if (ranges->at(scan_forward_for_section_border) > new_border) { 1080 *new_start_index = scan_forward_for_section_border; 1081 *border = new_border; 1082 break; 1083 } 1084 scan_forward_for_section_border++; 1085 } 1086 } 1087 1088 DCHECK(*new_start_index > start_index); 1089 *new_end_index = *new_start_index - 1; 1090 if (ranges->at(*new_end_index) == *border) { 1091 (*new_end_index)--; 1092 } 1093 if (*border >= ranges->at(end_index)) { 1094 *border = ranges->at(end_index); 1095 *new_start_index = end_index; // Won't be used. 1096 *new_end_index = end_index - 1; 1097 } 1098} 1099 1100// Gets a series of segment boundaries representing a character class. If the 1101// character is in the range between an even and an odd boundary (counting from 1102// start_index) then go to even_label, otherwise go to odd_label. We already 1103// know that the character is in the range of min_char to max_char inclusive. 1104// Either label can be nullptr indicating backtracking. Either label can also 1105// be equal to the fall_through label. 1106void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges, 1107 uint32_t start_index, uint32_t end_index, 1108 base::uc32 min_char, base::uc32 max_char, 1109 Label* fall_through, Label* even_label, 1110 Label* odd_label) { 1111 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); 1112 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); 1113 1114 base::uc32 first = ranges->at(start_index); 1115 base::uc32 last = ranges->at(end_index) - 1; 1116 1117 DCHECK_LT(min_char, first); 1118 1119 // Just need to test if the character is before or on-or-after 1120 // a particular character. 1121 if (start_index == end_index) { 1122 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 1123 return; 1124 } 1125 1126 // Another almost trivial case: There is one interval in the middle that is 1127 // different from the end intervals. 1128 if (start_index + 1 == end_index) { 1129 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label, 1130 odd_label); 1131 return; 1132 } 1133 1134 // It's not worth using table lookup if there are very few intervals in the 1135 // character class. 1136 if (end_index - start_index <= 6) { 1137 // It is faster to test for individual characters, so we look for those 1138 // first, then try arbitrary ranges in the second round. 1139 static uint32_t kNoCutIndex = -1; 1140 uint32_t cut = kNoCutIndex; 1141 for (uint32_t i = start_index; i < end_index; i++) { 1142 if (ranges->at(i) == ranges->at(i + 1) - 1) { 1143 cut = i; 1144 break; 1145 } 1146 } 1147 if (cut == kNoCutIndex) cut = start_index; 1148 CutOutRange(masm, ranges, start_index, end_index, cut, even_label, 1149 odd_label); 1150 DCHECK_GE(end_index - start_index, 2); 1151 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char, 1152 max_char, fall_through, even_label, odd_label); 1153 return; 1154 } 1155 1156 // If there are a lot of intervals in the regexp, then we will use tables to 1157 // determine whether the character is inside or outside the character class. 1158 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 1159 1160 if ((max_char >> kBits) == (min_char >> kBits)) { 1161 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char, 1162 fall_through, even_label, odd_label); 1163 return; 1164 } 1165 1166 if ((min_char >> kBits) != first >> kBits) { 1167 masm->CheckCharacterLT(first, odd_label); 1168 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char, 1169 fall_through, odd_label, even_label); 1170 return; 1171 } 1172 1173 uint32_t new_start_index = 0; 1174 uint32_t new_end_index = 0; 1175 base::uc32 border = 0; 1176 1177 SplitSearchSpace(ranges, start_index, end_index, &new_start_index, 1178 &new_end_index, &border); 1179 1180 Label handle_rest; 1181 Label* above = &handle_rest; 1182 if (border == last + 1) { 1183 // We didn't find any section that started after the limit, so everything 1184 // above the border is one of the terminal labels. 1185 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 1186 DCHECK(new_end_index == end_index - 1); 1187 } 1188 1189 DCHECK_LE(start_index, new_end_index); 1190 DCHECK_LE(new_start_index, end_index); 1191 DCHECK_LT(start_index, new_start_index); 1192 DCHECK_LT(new_end_index, end_index); 1193 DCHECK(new_end_index + 1 == new_start_index || 1194 (new_end_index + 2 == new_start_index && 1195 border == ranges->at(new_end_index + 1))); 1196 DCHECK_LT(min_char, border - 1); 1197 DCHECK_LT(border, max_char); 1198 DCHECK_LT(ranges->at(new_end_index), border); 1199 DCHECK(border < ranges->at(new_start_index) || 1200 (border == ranges->at(new_start_index) && 1201 new_start_index == end_index && new_end_index == end_index - 1 && 1202 border == last + 1)); 1203 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 1204 1205 masm->CheckCharacterGT(border - 1, above); 1206 Label dummy; 1207 GenerateBranches(masm, ranges, start_index, new_end_index, min_char, 1208 border - 1, &dummy, even_label, odd_label); 1209 if (handle_rest.is_linked()) { 1210 masm->Bind(&handle_rest); 1211 bool flip = (new_start_index & 1) != (start_index & 1); 1212 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char, 1213 &dummy, flip ? odd_label : even_label, 1214 flip ? even_label : odd_label); 1215 } 1216} 1217 1218void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1219 RegExpCharacterClass* cc, bool one_byte, Label* on_failure, 1220 int cp_offset, bool check_offset, bool preloaded, 1221 Zone* zone) { 1222 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 1223 CharacterRange::Canonicalize(ranges); 1224 1225 // Now that all processing (like case-insensitivity) is done, clamp the 1226 // ranges to the set of ranges that may actually occur in the subject string. 1227 if (one_byte) CharacterRange::ClampToOneByte(ranges); 1228 1229 const int ranges_length = ranges->length(); 1230 if (ranges_length == 0) { 1231 if (!cc->is_negated()) { 1232 macro_assembler->GoTo(on_failure); 1233 } 1234 if (check_offset) { 1235 macro_assembler->CheckPosition(cp_offset, on_failure); 1236 } 1237 return; 1238 } 1239 1240 const base::uc32 max_char = MaxCodeUnit(one_byte); 1241 if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) { 1242 if (cc->is_negated()) { 1243 macro_assembler->GoTo(on_failure); 1244 } else { 1245 // This is a common case hit by non-anchored expressions. 1246 if (check_offset) { 1247 macro_assembler->CheckPosition(cp_offset, on_failure); 1248 } 1249 } 1250 return; 1251 } 1252 1253 if (!preloaded) { 1254 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1255 } 1256 1257 if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass( 1258 cc->standard_type(), on_failure)) { 1259 return; 1260 } 1261 1262 static constexpr int kMaxRangesForInlineBranchGeneration = 16; 1263 if (ranges_length > kMaxRangesForInlineBranchGeneration) { 1264 // For large range sets, emit a more compact instruction sequence to avoid 1265 // a potentially problematic increase in code size. 1266 // Note the flipped logic below (we check InRange if negated, NotInRange if 1267 // not negated); this is necessary since the method falls through on 1268 // failure whereas we want to fall through on success. 1269 if (cc->is_negated()) { 1270 if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) { 1271 return; 1272 } 1273 } else { 1274 if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) { 1275 return; 1276 } 1277 } 1278 } 1279 1280 // Generate a flat list of range boundaries for consumption by 1281 // GenerateBranches. See the comment on that function for how the list should 1282 // be structured 1283 ZoneList<base::uc32>* range_boundaries = 1284 zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone); 1285 1286 bool zeroth_entry_is_failure = !cc->is_negated(); 1287 1288 for (int i = 0; i < ranges_length; i++) { 1289 CharacterRange& range = ranges->at(i); 1290 if (range.from() == 0) { 1291 DCHECK_EQ(i, 0); 1292 zeroth_entry_is_failure = !zeroth_entry_is_failure; 1293 } else { 1294 range_boundaries->Add(range.from(), zone); 1295 } 1296 // `+ 1` to convert from inclusive to exclusive `to`. 1297 // [from, to] == [from, to+1[. 1298 range_boundaries->Add(range.to() + 1, zone); 1299 } 1300 int end_index = range_boundaries->length() - 1; 1301 if (range_boundaries->at(end_index) > max_char) { 1302 end_index--; 1303 } 1304 1305 Label fall_through; 1306 GenerateBranches(macro_assembler, range_boundaries, 1307 0, // start_index. 1308 end_index, 1309 0, // min_char. 1310 max_char, &fall_through, 1311 zeroth_entry_is_failure ? &fall_through : on_failure, 1312 zeroth_entry_is_failure ? on_failure : &fall_through); 1313 macro_assembler->Bind(&fall_through); 1314} 1315 1316} // namespace 1317 1318RegExpNode::~RegExpNode() = default; 1319 1320RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 1321 Trace* trace) { 1322 // If we are generating a greedy loop then don't stop and don't reuse code. 1323 if (trace->stop_node() != nullptr) { 1324 return CONTINUE; 1325 } 1326 1327 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1328 if (trace->is_trivial()) { 1329 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { 1330 // If a generic version is already scheduled to be generated or we have 1331 // recursed too deeply then just generate a jump to that code. 1332 macro_assembler->GoTo(&label_); 1333 // This will queue it up for generation of a generic version if it hasn't 1334 // already been queued. 1335 compiler->AddWork(this); 1336 return DONE; 1337 } 1338 // Generate generic version of the node and bind the label for later use. 1339 macro_assembler->Bind(&label_); 1340 return CONTINUE; 1341 } 1342 1343 // We are being asked to make a non-generic version. Keep track of how many 1344 // non-generic versions we generate so as not to overdo it. 1345 trace_count_++; 1346 if (KeepRecursing(compiler) && compiler->optimize() && 1347 trace_count_ < kMaxCopiesCodeGenerated) { 1348 return CONTINUE; 1349 } 1350 1351 // If we get here code has been generated for this node too many times or 1352 // recursion is too deep. Time to switch to a generic version. The code for 1353 // generic versions above can handle deep recursion properly. 1354 bool was_limiting = compiler->limiting_recursion(); 1355 compiler->set_limiting_recursion(true); 1356 trace->Flush(compiler, this); 1357 compiler->set_limiting_recursion(was_limiting); 1358 return DONE; 1359} 1360 1361bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { 1362 return !compiler->limiting_recursion() && 1363 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; 1364} 1365 1366void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 1367 BoyerMooreLookahead* bm, bool not_at_start) { 1368 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) { 1369 // Anything may follow a positive submatch success, thus we need to accept 1370 // all characters from this position onwards. 1371 bm->SetRest(offset); 1372 } else { 1373 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 1374 } 1375 SaveBMInfo(bm, not_at_start, offset); 1376} 1377 1378void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details, 1379 RegExpCompiler* compiler, int filled_in, 1380 bool not_at_start) { 1381 if (action_type_ == SET_REGISTER_FOR_LOOP) { 1382 on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler, 1383 filled_in, not_at_start); 1384 } else { 1385 on_success()->GetQuickCheckDetails(details, compiler, filled_in, 1386 not_at_start); 1387 } 1388} 1389 1390void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 1391 BoyerMooreLookahead* bm, bool not_at_start) { 1392 // Match the behaviour of EatsAtLeast on this node. 1393 if (assertion_type() == AT_START && not_at_start) return; 1394 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 1395 SaveBMInfo(bm, not_at_start, offset); 1396} 1397 1398void NegativeLookaroundChoiceNode::GetQuickCheckDetails( 1399 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, 1400 bool not_at_start) { 1401 RegExpNode* node = continue_node(); 1402 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 1403} 1404 1405namespace { 1406 1407// Takes the left-most 1-bit and smears it out, setting all bits to its right. 1408inline uint32_t SmearBitsRight(uint32_t v) { 1409 v |= v >> 1; 1410 v |= v >> 2; 1411 v |= v >> 4; 1412 v |= v >> 8; 1413 v |= v >> 16; 1414 return v; 1415} 1416 1417} // namespace 1418 1419bool QuickCheckDetails::Rationalize(bool asc) { 1420 bool found_useful_op = false; 1421 const uint32_t char_mask = CharMask(asc); 1422 mask_ = 0; 1423 value_ = 0; 1424 int char_shift = 0; 1425 for (int i = 0; i < characters_; i++) { 1426 Position* pos = &positions_[i]; 1427 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 1428 found_useful_op = true; 1429 } 1430 mask_ |= (pos->mask & char_mask) << char_shift; 1431 value_ |= (pos->value & char_mask) << char_shift; 1432 char_shift += asc ? 8 : 16; 1433 } 1434 return found_useful_op; 1435} 1436 1437int RegExpNode::EatsAtLeast(bool not_at_start) { 1438 return not_at_start ? eats_at_least_.eats_at_least_from_not_start 1439 : eats_at_least_.eats_at_least_from_possibly_start; 1440} 1441 1442EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() { 1443 // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it 1444 // implies that the following node must be a LoopChoiceNode. If we need to 1445 // set registers to constant values for other reasons, we could introduce a 1446 // new action type SET_REGISTER that doesn't imply anything about its 1447 // successor. 1448 UNREACHABLE(); 1449} 1450 1451void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 1452 RegExpCompiler* compiler, 1453 int characters_filled_in, 1454 bool not_at_start) { 1455 // See comment in RegExpNode::EatsAtLeastFromLoopEntry. 1456 UNREACHABLE(); 1457} 1458 1459EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() { 1460 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue. 1461 1462 if (read_backward()) { 1463 // The eats_at_least value is not used if reading backward. The 1464 // EatsAtLeastPropagator should've zeroed it as well. 1465 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0); 1466 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0); 1467 return {}; 1468 } 1469 1470 // Figure out how much the loop body itself eats, not including anything in 1471 // the continuation case. In general, the nodes in the loop body should report 1472 // that they eat at least the number eaten by the continuation node, since any 1473 // successful match in the loop body must also include the continuation node. 1474 // However, in some cases involving positive lookaround, the loop body under- 1475 // reports its appetite, so use saturated math here to avoid negative numbers. 1476 uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>( 1477 loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true)); 1478 uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>( 1479 loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true)); 1480 1481 // Limit the number of loop iterations to avoid overflow in subsequent steps. 1482 int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations()); 1483 1484 EatsAtLeastInfo result; 1485 result.eats_at_least_from_not_start = 1486 base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start + 1487 continue_node_->EatsAtLeast(true)); 1488 if (loop_iterations > 0 && loop_body_from_possibly_start > 0) { 1489 // First loop iteration eats at least one, so all subsequent iterations 1490 // and the after-loop chunk are guaranteed to not be at the start. 1491 result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>( 1492 loop_body_from_possibly_start + 1493 (loop_iterations - 1) * loop_body_from_not_start + 1494 continue_node_->EatsAtLeast(true)); 1495 } else { 1496 // Loop body might eat nothing, so only continue node contributes. 1497 result.eats_at_least_from_possibly_start = 1498 continue_node_->EatsAtLeast(false); 1499 } 1500 return result; 1501} 1502 1503bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 1504 Trace* bounds_check_trace, Trace* trace, 1505 bool preload_has_checked_bounds, 1506 Label* on_possible_success, 1507 QuickCheckDetails* details, 1508 bool fall_through_on_failure, 1509 ChoiceNode* predecessor) { 1510 DCHECK_NOT_NULL(predecessor); 1511 if (details->characters() == 0) return false; 1512 GetQuickCheckDetails(details, compiler, 0, 1513 trace->at_start() == Trace::FALSE_VALUE); 1514 if (details->cannot_match()) return false; 1515 if (!details->Rationalize(compiler->one_byte())) return false; 1516 DCHECK(details->characters() == 1 || 1517 compiler->macro_assembler()->CanReadUnaligned()); 1518 uint32_t mask = details->mask(); 1519 uint32_t value = details->value(); 1520 1521 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1522 1523 if (trace->characters_preloaded() != details->characters()) { 1524 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 1525 // The bounds check is performed using the minimum number of characters 1526 // any choice would eat, so if the bounds check fails, then none of the 1527 // choices can succeed, so we can just immediately backtrack, rather 1528 // than go to the next choice. The number of characters preloaded may be 1529 // less than the number used for the bounds check. 1530 int eats_at_least = predecessor->EatsAtLeast( 1531 bounds_check_trace->at_start() == Trace::FALSE_VALUE); 1532 DCHECK_GE(eats_at_least, details->characters()); 1533 assembler->LoadCurrentCharacter( 1534 trace->cp_offset(), bounds_check_trace->backtrack(), 1535 !preload_has_checked_bounds, details->characters(), eats_at_least); 1536 } 1537 1538 bool need_mask = true; 1539 1540 if (details->characters() == 1) { 1541 // If number of characters preloaded is 1 then we used a byte or 16 bit 1542 // load so the value is already masked down. 1543 const uint32_t char_mask = CharMask(compiler->one_byte()); 1544 if ((mask & char_mask) == char_mask) need_mask = false; 1545 mask &= char_mask; 1546 } else { 1547 // For 2-character preloads in one-byte mode or 1-character preloads in 1548 // two-byte mode we also use a 16 bit load with zero extend. 1549 static const uint32_t kTwoByteMask = 0xFFFF; 1550 static const uint32_t kFourByteMask = 0xFFFFFFFF; 1551 if (details->characters() == 2 && compiler->one_byte()) { 1552 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 1553 } else if (details->characters() == 1 && !compiler->one_byte()) { 1554 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 1555 } else { 1556 if (mask == kFourByteMask) need_mask = false; 1557 } 1558 } 1559 1560 if (fall_through_on_failure) { 1561 if (need_mask) { 1562 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 1563 } else { 1564 assembler->CheckCharacter(value, on_possible_success); 1565 } 1566 } else { 1567 if (need_mask) { 1568 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 1569 } else { 1570 assembler->CheckNotCharacter(value, trace->backtrack()); 1571 } 1572 } 1573 return true; 1574} 1575 1576// Here is the meat of GetQuickCheckDetails (see also the comment on the 1577// super-class in the .h file). 1578// 1579// We iterate along the text object, building up for each character a 1580// mask and value that can be used to test for a quick failure to match. 1581// The masks and values for the positions will be combined into a single 1582// machine word for the current character width in order to be used in 1583// generating a quick check. 1584void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 1585 RegExpCompiler* compiler, 1586 int characters_filled_in, 1587 bool not_at_start) { 1588 // Do not collect any quick check details if the text node reads backward, 1589 // since it reads in the opposite direction than we use for quick checks. 1590 if (read_backward()) return; 1591 Isolate* isolate = compiler->macro_assembler()->isolate(); 1592 DCHECK(characters_filled_in < details->characters()); 1593 int characters = details->characters(); 1594 const uint32_t char_mask = CharMask(compiler->one_byte()); 1595 for (int k = 0; k < elements()->length(); k++) { 1596 TextElement elm = elements()->at(k); 1597 if (elm.text_type() == TextElement::ATOM) { 1598 base::Vector<const base::uc16> quarks = elm.atom()->data(); 1599 for (int i = 0; i < characters && i < quarks.length(); i++) { 1600 QuickCheckDetails::Position* pos = 1601 details->positions(characters_filled_in); 1602 base::uc16 c = quarks[i]; 1603 if (IsIgnoreCase(compiler->flags())) { 1604 unibrow::uchar chars[4]; 1605 int length = GetCaseIndependentLetters( 1606 isolate, c, compiler->one_byte(), chars, 4); 1607 if (length == 0) { 1608 // This can happen because all case variants are non-Latin1, but we 1609 // know the input is Latin1. 1610 details->set_cannot_match(); 1611 pos->determines_perfectly = false; 1612 return; 1613 } 1614 if (length == 1) { 1615 // This letter has no case equivalents, so it's nice and simple 1616 // and the mask-compare will determine definitely whether we have 1617 // a match at this character position. 1618 pos->mask = char_mask; 1619 pos->value = chars[0]; 1620 pos->determines_perfectly = true; 1621 } else { 1622 uint32_t common_bits = char_mask; 1623 uint32_t bits = chars[0]; 1624 for (int j = 1; j < length; j++) { 1625 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 1626 common_bits ^= differing_bits; 1627 bits &= common_bits; 1628 } 1629 // If length is 2 and common bits has only one zero in it then 1630 // our mask and compare instruction will determine definitely 1631 // whether we have a match at this character position. Otherwise 1632 // it can only be an approximate check. 1633 uint32_t one_zero = (common_bits | ~char_mask); 1634 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 1635 pos->determines_perfectly = true; 1636 } 1637 pos->mask = common_bits; 1638 pos->value = bits; 1639 } 1640 } else { 1641 // Don't ignore case. Nice simple case where the mask-compare will 1642 // determine definitely whether we have a match at this character 1643 // position. 1644 if (c > char_mask) { 1645 details->set_cannot_match(); 1646 pos->determines_perfectly = false; 1647 return; 1648 } 1649 pos->mask = char_mask; 1650 pos->value = c; 1651 pos->determines_perfectly = true; 1652 } 1653 characters_filled_in++; 1654 DCHECK(characters_filled_in <= details->characters()); 1655 if (characters_filled_in == details->characters()) { 1656 return; 1657 } 1658 } 1659 } else { 1660 QuickCheckDetails::Position* pos = 1661 details->positions(characters_filled_in); 1662 RegExpCharacterClass* tree = elm.char_class(); 1663 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 1664 if (tree->is_negated() || ranges->is_empty()) { 1665 // A quick check uses multi-character mask and compare. There is no 1666 // useful way to incorporate a negative char class into this scheme 1667 // so we just conservatively create a mask and value that will always 1668 // succeed. 1669 // Likewise for empty ranges (empty ranges can occur e.g. when 1670 // compiling for one-byte subjects and impossible (non-one-byte) ranges 1671 // have been removed). 1672 pos->mask = 0; 1673 pos->value = 0; 1674 } else { 1675 int first_range = 0; 1676 while (ranges->at(first_range).from() > char_mask) { 1677 first_range++; 1678 if (first_range == ranges->length()) { 1679 details->set_cannot_match(); 1680 pos->determines_perfectly = false; 1681 return; 1682 } 1683 } 1684 CharacterRange range = ranges->at(first_range); 1685 const base::uc32 first_from = range.from(); 1686 const base::uc32 first_to = 1687 (range.to() > char_mask) ? char_mask : range.to(); 1688 const uint32_t differing_bits = (first_from ^ first_to); 1689 // A mask and compare is only perfect if the differing bits form a 1690 // number like 00011111 with one single block of trailing 1s. 1691 if ((differing_bits & (differing_bits + 1)) == 0 && 1692 first_from + differing_bits == first_to) { 1693 pos->determines_perfectly = true; 1694 } 1695 uint32_t common_bits = ~SmearBitsRight(differing_bits); 1696 uint32_t bits = (first_from & common_bits); 1697 for (int i = first_range + 1; i < ranges->length(); i++) { 1698 range = ranges->at(i); 1699 const base::uc32 from = range.from(); 1700 if (from > char_mask) continue; 1701 const base::uc32 to = 1702 (range.to() > char_mask) ? char_mask : range.to(); 1703 // Here we are combining more ranges into the mask and compare 1704 // value. With each new range the mask becomes more sparse and 1705 // so the chances of a false positive rise. A character class 1706 // with multiple ranges is assumed never to be equivalent to a 1707 // mask and compare operation. 1708 pos->determines_perfectly = false; 1709 uint32_t new_common_bits = (from ^ to); 1710 new_common_bits = ~SmearBitsRight(new_common_bits); 1711 common_bits &= new_common_bits; 1712 bits &= new_common_bits; 1713 uint32_t new_differing_bits = (from & common_bits) ^ bits; 1714 common_bits ^= new_differing_bits; 1715 bits &= common_bits; 1716 } 1717 pos->mask = common_bits; 1718 pos->value = bits; 1719 } 1720 characters_filled_in++; 1721 DCHECK(characters_filled_in <= details->characters()); 1722 if (characters_filled_in == details->characters()) return; 1723 } 1724 } 1725 DCHECK(characters_filled_in != details->characters()); 1726 if (!details->cannot_match()) { 1727 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in, 1728 true); 1729 } 1730} 1731 1732void QuickCheckDetails::Clear() { 1733 for (int i = 0; i < characters_; i++) { 1734 positions_[i].mask = 0; 1735 positions_[i].value = 0; 1736 positions_[i].determines_perfectly = false; 1737 } 1738 characters_ = 0; 1739} 1740 1741void QuickCheckDetails::Advance(int by, bool one_byte) { 1742 if (by >= characters_ || by < 0) { 1743 DCHECK_IMPLIES(by < 0, characters_ == 0); 1744 Clear(); 1745 return; 1746 } 1747 DCHECK_LE(characters_ - by, 4); 1748 DCHECK_LE(characters_, 4); 1749 for (int i = 0; i < characters_ - by; i++) { 1750 positions_[i] = positions_[by + i]; 1751 } 1752 for (int i = characters_ - by; i < characters_; i++) { 1753 positions_[i].mask = 0; 1754 positions_[i].value = 0; 1755 positions_[i].determines_perfectly = false; 1756 } 1757 characters_ -= by; 1758 // We could change mask_ and value_ here but we would never advance unless 1759 // they had already been used in a check and they won't be used again because 1760 // it would gain us nothing. So there's no point. 1761} 1762 1763void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 1764 DCHECK(characters_ == other->characters_); 1765 if (other->cannot_match_) { 1766 return; 1767 } 1768 if (cannot_match_) { 1769 *this = *other; 1770 return; 1771 } 1772 for (int i = from_index; i < characters_; i++) { 1773 QuickCheckDetails::Position* pos = positions(i); 1774 QuickCheckDetails::Position* other_pos = other->positions(i); 1775 if (pos->mask != other_pos->mask || pos->value != other_pos->value || 1776 !other_pos->determines_perfectly) { 1777 // Our mask-compare operation will be approximate unless we have the 1778 // exact same operation on both sides of the alternation. 1779 pos->determines_perfectly = false; 1780 } 1781 pos->mask &= other_pos->mask; 1782 pos->value &= pos->mask; 1783 other_pos->value &= pos->mask; 1784 uint32_t differing_bits = (pos->value ^ other_pos->value); 1785 pos->mask &= ~differing_bits; 1786 pos->value &= pos->mask; 1787 } 1788} 1789 1790class VisitMarker { 1791 public: 1792 explicit VisitMarker(NodeInfo* info) : info_(info) { 1793 DCHECK(!info->visited); 1794 info->visited = true; 1795 } 1796 ~VisitMarker() { info_->visited = false; } 1797 1798 private: 1799 NodeInfo* info_; 1800}; 1801 1802// Temporarily sets traversed_loop_initialization_node_. 1803class LoopInitializationMarker { 1804 public: 1805 explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) { 1806 DCHECK(!node_->traversed_loop_initialization_node_); 1807 node_->traversed_loop_initialization_node_ = true; 1808 } 1809 ~LoopInitializationMarker() { 1810 DCHECK(node_->traversed_loop_initialization_node_); 1811 node_->traversed_loop_initialization_node_ = false; 1812 } 1813 LoopInitializationMarker(const LoopInitializationMarker&) = delete; 1814 LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete; 1815 1816 private: 1817 LoopChoiceNode* node_; 1818}; 1819 1820// Temporarily decrements min_loop_iterations_. 1821class IterationDecrementer { 1822 public: 1823 explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) { 1824 DCHECK_GT(node_->min_loop_iterations_, 0); 1825 --node_->min_loop_iterations_; 1826 } 1827 ~IterationDecrementer() { ++node_->min_loop_iterations_; } 1828 IterationDecrementer(const IterationDecrementer&) = delete; 1829 IterationDecrementer& operator=(const IterationDecrementer&) = delete; 1830 1831 private: 1832 LoopChoiceNode* node_; 1833}; 1834 1835RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) { 1836 if (info()->replacement_calculated) return replacement(); 1837 if (depth < 0) return this; 1838 DCHECK(!info()->visited); 1839 VisitMarker marker(info()); 1840 return FilterSuccessor(depth - 1, flags); 1841} 1842 1843RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) { 1844 RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags); 1845 if (next == nullptr) return set_replacement(nullptr); 1846 on_success_ = next; 1847 return set_replacement(this); 1848} 1849 1850// We need to check for the following characters: 0x39C 0x3BC 0x178. 1851bool RangeContainsLatin1Equivalents(CharacterRange range) { 1852 // TODO(dcarney): this could be a lot more efficient. 1853 return range.Contains(0x039C) || range.Contains(0x03BC) || 1854 range.Contains(0x0178); 1855} 1856 1857namespace { 1858 1859bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 1860 for (int i = 0; i < ranges->length(); i++) { 1861 // TODO(dcarney): this could be a lot more efficient. 1862 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 1863 } 1864 return false; 1865} 1866 1867} // namespace 1868 1869RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) { 1870 if (info()->replacement_calculated) return replacement(); 1871 if (depth < 0) return this; 1872 DCHECK(!info()->visited); 1873 VisitMarker marker(info()); 1874 int element_count = elements()->length(); 1875 for (int i = 0; i < element_count; i++) { 1876 TextElement elm = elements()->at(i); 1877 if (elm.text_type() == TextElement::ATOM) { 1878 base::Vector<const base::uc16> quarks = elm.atom()->data(); 1879 for (int j = 0; j < quarks.length(); j++) { 1880 base::uc16 c = quarks[j]; 1881 if (IsIgnoreCase(flags)) { 1882 c = unibrow::Latin1::TryConvertToLatin1(c); 1883 } 1884 if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); 1885 // Replace quark in case we converted to Latin-1. 1886 base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin()); 1887 writable_quarks[j] = c; 1888 } 1889 } else { 1890 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 1891 RegExpCharacterClass* cc = elm.char_class(); 1892 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 1893 CharacterRange::Canonicalize(ranges); 1894 // Now they are in order so we only need to look at the first. 1895 int range_count = ranges->length(); 1896 if (cc->is_negated()) { 1897 if (range_count != 0 && ranges->at(0).from() == 0 && 1898 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 1899 // This will be handled in a later filter. 1900 if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) { 1901 continue; 1902 } 1903 return set_replacement(nullptr); 1904 } 1905 } else { 1906 if (range_count == 0 || 1907 ranges->at(0).from() > String::kMaxOneByteCharCode) { 1908 // This will be handled in a later filter. 1909 if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) { 1910 continue; 1911 } 1912 return set_replacement(nullptr); 1913 } 1914 } 1915 } 1916 } 1917 return FilterSuccessor(depth - 1, flags); 1918} 1919 1920RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) { 1921 if (info()->replacement_calculated) return replacement(); 1922 if (depth < 0) return this; 1923 if (info()->visited) return this; 1924 { 1925 VisitMarker marker(info()); 1926 1927 RegExpNode* continue_replacement = 1928 continue_node_->FilterOneByte(depth - 1, flags); 1929 // If we can't continue after the loop then there is no sense in doing the 1930 // loop. 1931 if (continue_replacement == nullptr) return set_replacement(nullptr); 1932 } 1933 1934 return ChoiceNode::FilterOneByte(depth - 1, flags); 1935} 1936 1937RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) { 1938 if (info()->replacement_calculated) return replacement(); 1939 if (depth < 0) return this; 1940 if (info()->visited) return this; 1941 VisitMarker marker(info()); 1942 int choice_count = alternatives_->length(); 1943 1944 for (int i = 0; i < choice_count; i++) { 1945 GuardedAlternative alternative = alternatives_->at(i); 1946 if (alternative.guards() != nullptr && 1947 alternative.guards()->length() != 0) { 1948 set_replacement(this); 1949 return this; 1950 } 1951 } 1952 1953 int surviving = 0; 1954 RegExpNode* survivor = nullptr; 1955 for (int i = 0; i < choice_count; i++) { 1956 GuardedAlternative alternative = alternatives_->at(i); 1957 RegExpNode* replacement = 1958 alternative.node()->FilterOneByte(depth - 1, flags); 1959 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 1960 if (replacement != nullptr) { 1961 alternatives_->at(i).set_node(replacement); 1962 surviving++; 1963 survivor = replacement; 1964 } 1965 } 1966 if (surviving < 2) return set_replacement(survivor); 1967 1968 set_replacement(this); 1969 if (surviving == choice_count) { 1970 return this; 1971 } 1972 // Only some of the nodes survived the filtering. We need to rebuild the 1973 // alternatives list. 1974 ZoneList<GuardedAlternative>* new_alternatives = 1975 zone()->New<ZoneList<GuardedAlternative>>(surviving, zone()); 1976 for (int i = 0; i < choice_count; i++) { 1977 RegExpNode* replacement = 1978 alternatives_->at(i).node()->FilterOneByte(depth - 1, flags); 1979 if (replacement != nullptr) { 1980 alternatives_->at(i).set_node(replacement); 1981 new_alternatives->Add(alternatives_->at(i), zone()); 1982 } 1983 } 1984 alternatives_ = new_alternatives; 1985 return this; 1986} 1987 1988RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, 1989 RegExpFlags flags) { 1990 if (info()->replacement_calculated) return replacement(); 1991 if (depth < 0) return this; 1992 if (info()->visited) return this; 1993 VisitMarker marker(info()); 1994 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1995 // afterwards. 1996 RegExpNode* node = continue_node(); 1997 RegExpNode* replacement = node->FilterOneByte(depth - 1, flags); 1998 if (replacement == nullptr) return set_replacement(nullptr); 1999 alternatives_->at(kContinueIndex).set_node(replacement); 2000 2001 RegExpNode* neg_node = lookaround_node(); 2002 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags); 2003 // If the negative lookahead is always going to fail then 2004 // we don't need to check it. 2005 if (neg_replacement == nullptr) return set_replacement(replacement); 2006 alternatives_->at(kLookaroundIndex).set_node(neg_replacement); 2007 return set_replacement(this); 2008} 2009 2010void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2011 RegExpCompiler* compiler, 2012 int characters_filled_in, 2013 bool not_at_start) { 2014 if (body_can_be_zero_length_ || info()->visited) return; 2015 not_at_start = not_at_start || this->not_at_start(); 2016 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue. 2017 if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 && 2018 loop_node_->EatsAtLeast(not_at_start) > 2019 continue_node_->EatsAtLeast(true)) { 2020 // Loop body is guaranteed to execute at least once, and consume characters 2021 // when it does, meaning the only possible quick checks from this point 2022 // begin with the loop body. We may recursively visit this LoopChoiceNode, 2023 // but we temporarily decrease its minimum iteration counter so we know when 2024 // to check the continue case. 2025 IterationDecrementer next_iteration(this); 2026 loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in, 2027 not_at_start); 2028 } else { 2029 // Might not consume anything in the loop body, so treat it like a normal 2030 // ChoiceNode (and don't recursively visit this node again). 2031 VisitMarker marker(info()); 2032 ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in, 2033 not_at_start); 2034 } 2035} 2036 2037void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry( 2038 QuickCheckDetails* details, RegExpCompiler* compiler, 2039 int characters_filled_in, bool not_at_start) { 2040 if (traversed_loop_initialization_node_) { 2041 // We already entered this loop once, exited via its continuation node, and 2042 // followed an outer loop's back-edge to before the loop entry point. We 2043 // could try to reset the minimum iteration count to its starting value at 2044 // this point, but that seems like more trouble than it's worth. It's safe 2045 // to keep going with the current (possibly reduced) minimum iteration 2046 // count. 2047 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start); 2048 } else { 2049 // We are entering a loop via its counter initialization action, meaning we 2050 // are guaranteed to run the loop body at least some minimum number of times 2051 // before running the continuation node. Set a flag so that this node knows 2052 // (now and any times we visit it again recursively) that it was entered 2053 // from the top. 2054 LoopInitializationMarker marker(this); 2055 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start); 2056 } 2057} 2058 2059void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2060 BoyerMooreLookahead* bm, bool not_at_start) { 2061 if (body_can_be_zero_length_ || budget <= 0) { 2062 bm->SetRest(offset); 2063 SaveBMInfo(bm, not_at_start, offset); 2064 return; 2065 } 2066 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2067 SaveBMInfo(bm, not_at_start, offset); 2068} 2069 2070void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2071 RegExpCompiler* compiler, 2072 int characters_filled_in, 2073 bool not_at_start) { 2074 not_at_start = (not_at_start || not_at_start_); 2075 int choice_count = alternatives_->length(); 2076 DCHECK_LT(0, choice_count); 2077 alternatives_->at(0).node()->GetQuickCheckDetails( 2078 details, compiler, characters_filled_in, not_at_start); 2079 for (int i = 1; i < choice_count; i++) { 2080 QuickCheckDetails new_details(details->characters()); 2081 RegExpNode* node = alternatives_->at(i).node(); 2082 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in, 2083 not_at_start); 2084 // Here we merge the quick match details of the two branches. 2085 details->Merge(&new_details, characters_filled_in); 2086 } 2087} 2088 2089namespace { 2090 2091// Check for [0-9A-Z_a-z]. 2092void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word, 2093 Label* non_word, bool fall_through_on_word) { 2094 if (assembler->CheckSpecialCharacterClass( 2095 fall_through_on_word ? StandardCharacterSet::kWord 2096 : StandardCharacterSet::kNotWord, 2097 fall_through_on_word ? non_word : word)) { 2098 // Optimized implementation available. 2099 return; 2100 } 2101 assembler->CheckCharacterGT('z', non_word); 2102 assembler->CheckCharacterLT('0', non_word); 2103 assembler->CheckCharacterGT('a' - 1, word); 2104 assembler->CheckCharacterLT('9' + 1, word); 2105 assembler->CheckCharacterLT('A', non_word); 2106 assembler->CheckCharacterLT('Z' + 1, word); 2107 if (fall_through_on_word) { 2108 assembler->CheckNotCharacter('_', non_word); 2109 } else { 2110 assembler->CheckCharacter('_', word); 2111 } 2112} 2113 2114// Emit the code to check for a ^ in multiline mode (1-character lookbehind 2115// that matches newline or the start of input). 2116void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) { 2117 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2118 2119 // We will load the previous character into the current character register. 2120 Trace new_trace(*trace); 2121 new_trace.InvalidateCurrentCharacter(); 2122 2123 // A positive (> 0) cp_offset means we've already successfully matched a 2124 // non-empty-width part of the pattern, and thus cannot be at or before the 2125 // start of the subject string. We can thus skip both at-start and 2126 // bounds-checks when loading the one-character lookbehind. 2127 const bool may_be_at_or_before_subject_string_start = 2128 new_trace.cp_offset() <= 0; 2129 2130 Label ok; 2131 if (may_be_at_or_before_subject_string_start) { 2132 // The start of input counts as a newline in this context, so skip to ok if 2133 // we are at the start. 2134 assembler->CheckAtStart(new_trace.cp_offset(), &ok); 2135 } 2136 2137 // If we've already checked that we are not at the start of input, it's okay 2138 // to load the previous character without bounds checks. 2139 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start; 2140 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2141 new_trace.backtrack(), can_skip_bounds_check); 2142 if (!assembler->CheckSpecialCharacterClass( 2143 StandardCharacterSet::kLineTerminator, new_trace.backtrack())) { 2144 // Newline means \n, \r, 0x2028 or 0x2029. 2145 if (!compiler->one_byte()) { 2146 assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok); 2147 } 2148 assembler->CheckCharacter('\n', &ok); 2149 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2150 } 2151 assembler->Bind(&ok); 2152 on_success->Emit(compiler, &new_trace); 2153} 2154 2155} // namespace 2156 2157// Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2158void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 2159 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2160 Isolate* isolate = assembler->isolate(); 2161 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 2162 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 2163 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 2164 if (lookahead == nullptr) { 2165 int eats_at_least = 2166 std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start)); 2167 if (eats_at_least >= 1) { 2168 BoyerMooreLookahead* bm = 2169 zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone()); 2170 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 2171 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE; 2172 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 2173 } 2174 } else { 2175 if (lookahead->at(0)->is_non_word()) 2176 next_is_word_character = Trace::FALSE_VALUE; 2177 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 2178 } 2179 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 2180 if (next_is_word_character == Trace::UNKNOWN) { 2181 Label before_non_word; 2182 Label before_word; 2183 if (trace->characters_preloaded() != 1) { 2184 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2185 } 2186 // Fall through on non-word. 2187 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2188 // Next character is not a word character. 2189 assembler->Bind(&before_non_word); 2190 Label ok; 2191 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 2192 assembler->GoTo(&ok); 2193 2194 assembler->Bind(&before_word); 2195 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 2196 assembler->Bind(&ok); 2197 } else if (next_is_word_character == Trace::TRUE_VALUE) { 2198 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 2199 } else { 2200 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 2201 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 2202 } 2203} 2204 2205void AssertionNode::BacktrackIfPrevious( 2206 RegExpCompiler* compiler, Trace* trace, 2207 AssertionNode::IfPrevious backtrack_if_previous) { 2208 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2209 Trace new_trace(*trace); 2210 new_trace.InvalidateCurrentCharacter(); 2211 2212 Label fall_through; 2213 Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack() 2214 : &fall_through; 2215 Label* word = backtrack_if_previous == kIsNonWord ? &fall_through 2216 : new_trace.backtrack(); 2217 2218 // A positive (> 0) cp_offset means we've already successfully matched a 2219 // non-empty-width part of the pattern, and thus cannot be at or before the 2220 // start of the subject string. We can thus skip both at-start and 2221 // bounds-checks when loading the one-character lookbehind. 2222 const bool may_be_at_or_before_subject_string_start = 2223 new_trace.cp_offset() <= 0; 2224 2225 if (may_be_at_or_before_subject_string_start) { 2226 // The start of input counts as a non-word character, so the question is 2227 // decided if we are at the start. 2228 assembler->CheckAtStart(new_trace.cp_offset(), non_word); 2229 } 2230 2231 // If we've already checked that we are not at the start of input, it's okay 2232 // to load the previous character without bounds checks. 2233 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start; 2234 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word, 2235 can_skip_bounds_check); 2236 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 2237 2238 assembler->Bind(&fall_through); 2239 on_success()->Emit(compiler, &new_trace); 2240} 2241 2242void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2243 RegExpCompiler* compiler, 2244 int filled_in, bool not_at_start) { 2245 if (assertion_type_ == AT_START && not_at_start) { 2246 details->set_cannot_match(); 2247 return; 2248 } 2249 return on_success()->GetQuickCheckDetails(details, compiler, filled_in, 2250 not_at_start); 2251} 2252 2253void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2254 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2255 switch (assertion_type_) { 2256 case AT_END: { 2257 Label ok; 2258 assembler->CheckPosition(trace->cp_offset(), &ok); 2259 assembler->GoTo(trace->backtrack()); 2260 assembler->Bind(&ok); 2261 break; 2262 } 2263 case AT_START: { 2264 if (trace->at_start() == Trace::FALSE_VALUE) { 2265 assembler->GoTo(trace->backtrack()); 2266 return; 2267 } 2268 if (trace->at_start() == Trace::UNKNOWN) { 2269 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); 2270 Trace at_start_trace = *trace; 2271 at_start_trace.set_at_start(Trace::TRUE_VALUE); 2272 on_success()->Emit(compiler, &at_start_trace); 2273 return; 2274 } 2275 } break; 2276 case AFTER_NEWLINE: 2277 EmitHat(compiler, on_success(), trace); 2278 return; 2279 case AT_BOUNDARY: 2280 case AT_NON_BOUNDARY: { 2281 EmitBoundaryCheck(compiler, trace); 2282 return; 2283 } 2284 } 2285 on_success()->Emit(compiler, trace); 2286} 2287 2288namespace { 2289 2290bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2291 if (quick_check == nullptr) return false; 2292 if (offset >= quick_check->characters()) return false; 2293 return quick_check->positions(offset)->determines_perfectly; 2294} 2295 2296void UpdateBoundsCheck(int index, int* checked_up_to) { 2297 if (index > *checked_up_to) { 2298 *checked_up_to = index; 2299 } 2300} 2301 2302} // namespace 2303 2304// We call this repeatedly to generate code for each pass over the text node. 2305// The passes are in increasing order of difficulty because we hope one 2306// of the first passes will fail in which case we are saved the work of the 2307// later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2308// we will check the '%' in the first pass, the case independent 'a' in the 2309// second pass and the character class in the last pass. 2310// 2311// The passes are done from right to left, so for example to test for /bar/ 2312// we will first test for an 'r' with offset 2, then an 'a' with offset 1 2313// and then a 'b' with offset 0. This means we can avoid the end-of-input 2314// bounds check most of the time. In the example we only need to check for 2315// end-of-input when loading the putative 'r'. 2316// 2317// A slight complication involves the fact that the first character may already 2318// be fetched into a register by the previous node. In this case we want to 2319// do the test for that character first. We do this in separate passes. The 2320// 'preloaded' argument indicates that we are doing such a 'pass'. If such a 2321// pass has been performed then subsequent passes will have true in 2322// first_element_checked to indicate that that character does not need to be 2323// checked again. 2324// 2325// In addition to all this we are passed a Trace, which can 2326// contain an AlternativeGeneration object. In this AlternativeGeneration 2327// object we can see details of any quick check that was already passed in 2328// order to get to the code we are now generating. The quick check can involve 2329// loading characters, which means we do not need to recheck the bounds 2330// up to the limit the quick check already checked. In addition the quick 2331// check can have involved a mask and compare operation which may simplify 2332// or obviate the need for further checks at some character positions. 2333void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, 2334 bool preloaded, Trace* trace, 2335 bool first_element_checked, int* checked_up_to) { 2336 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2337 Isolate* isolate = assembler->isolate(); 2338 bool one_byte = compiler->one_byte(); 2339 Label* backtrack = trace->backtrack(); 2340 QuickCheckDetails* quick_check = trace->quick_check_performed(); 2341 int element_count = elements()->length(); 2342 int backward_offset = read_backward() ? -Length() : 0; 2343 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 2344 TextElement elm = elements()->at(i); 2345 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 2346 if (elm.text_type() == TextElement::ATOM) { 2347 if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue; 2348 base::Vector<const base::uc16> quarks = elm.atom()->data(); 2349 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 2350 if (first_element_checked && i == 0 && j == 0) continue; 2351 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 2352 base::uc16 quark = quarks[j]; 2353 if (IsIgnoreCase(compiler->flags())) { 2354 // Everywhere else we assume that a non-Latin-1 character cannot match 2355 // a Latin-1 character. Avoid the cases where this is assumption is 2356 // invalid by using the Latin1 equivalent instead. 2357 quark = unibrow::Latin1::TryConvertToLatin1(quark); 2358 } 2359 bool needs_bounds_check = 2360 *checked_up_to < cp_offset + j || read_backward(); 2361 bool bounds_checked = false; 2362 switch (pass) { 2363 case NON_LATIN1_MATCH: 2364 DCHECK(one_byte); 2365 if (quark > String::kMaxOneByteCharCode) { 2366 assembler->GoTo(backtrack); 2367 return; 2368 } 2369 break; 2370 case NON_LETTER_CHARACTER_MATCH: 2371 bounds_checked = 2372 EmitAtomNonLetter(isolate, compiler, quark, backtrack, 2373 cp_offset + j, needs_bounds_check, preloaded); 2374 break; 2375 case SIMPLE_CHARACTER_MATCH: 2376 bounds_checked = EmitSimpleCharacter(isolate, compiler, quark, 2377 backtrack, cp_offset + j, 2378 needs_bounds_check, preloaded); 2379 break; 2380 case CASE_CHARACTER_MATCH: 2381 bounds_checked = 2382 EmitAtomLetter(isolate, compiler, quark, backtrack, 2383 cp_offset + j, needs_bounds_check, preloaded); 2384 break; 2385 default: 2386 break; 2387 } 2388 if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 2389 } 2390 } else { 2391 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 2392 if (pass == CHARACTER_CLASS_MATCH) { 2393 if (first_element_checked && i == 0) continue; 2394 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 2395 RegExpCharacterClass* cc = elm.char_class(); 2396 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 2397 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, 2398 bounds_check, preloaded, zone()); 2399 UpdateBoundsCheck(cp_offset, checked_up_to); 2400 } 2401 } 2402 } 2403} 2404 2405int TextNode::Length() { 2406 TextElement elm = elements()->last(); 2407 DCHECK_LE(0, elm.cp_offset()); 2408 return elm.cp_offset() + elm.length(); 2409} 2410 2411bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) { 2412 if (ignore_case) { 2413 return pass == SIMPLE_CHARACTER_MATCH; 2414 } else { 2415 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 2416 } 2417} 2418 2419TextNode* TextNode::CreateForCharacterRanges(Zone* zone, 2420 ZoneList<CharacterRange>* ranges, 2421 bool read_backward, 2422 RegExpNode* on_success) { 2423 DCHECK_NOT_NULL(ranges); 2424 // TODO(jgruber): There's no fundamental need to create this 2425 // RegExpCharacterClass; we could refactor to avoid the allocation. 2426 return zone->New<TextNode>(zone->New<RegExpCharacterClass>(zone, ranges), 2427 read_backward, on_success); 2428} 2429 2430TextNode* TextNode::CreateForSurrogatePair( 2431 Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges, 2432 bool read_backward, RegExpNode* on_success) { 2433 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); 2434 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone); 2435 elms->Add(TextElement::CharClass( 2436 zone->New<RegExpCharacterClass>(zone, lead_ranges)), 2437 zone); 2438 elms->Add(TextElement::CharClass( 2439 zone->New<RegExpCharacterClass>(zone, trail_ranges)), 2440 zone); 2441 return zone->New<TextNode>(elms, read_backward, on_success); 2442} 2443 2444TextNode* TextNode::CreateForSurrogatePair( 2445 Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail, 2446 bool read_backward, RegExpNode* on_success) { 2447 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); 2448 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone); 2449 elms->Add(TextElement::CharClass( 2450 zone->New<RegExpCharacterClass>(zone, lead_ranges)), 2451 zone); 2452 elms->Add(TextElement::CharClass( 2453 zone->New<RegExpCharacterClass>(zone, trail_ranges)), 2454 zone); 2455 return zone->New<TextNode>(elms, read_backward, on_success); 2456} 2457 2458// This generates the code to match a text node. A text node can contain 2459// straight character sequences (possibly to be matched in a case-independent 2460// way) and character classes. For efficiency we do not do this in a single 2461// pass from left to right. Instead we pass over the text node several times, 2462// emitting code for some character positions every time. See the comment on 2463// TextEmitPass for details. 2464void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2465 LimitResult limit_result = LimitVersions(compiler, trace); 2466 if (limit_result == DONE) return; 2467 DCHECK(limit_result == CONTINUE); 2468 2469 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 2470 compiler->SetRegExpTooBig(); 2471 return; 2472 } 2473 2474 if (compiler->one_byte()) { 2475 int dummy = 0; 2476 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 2477 } 2478 2479 bool first_elt_done = false; 2480 int bound_checked_to = trace->cp_offset() - 1; 2481 bound_checked_to += trace->bound_checked_up_to(); 2482 2483 // If a character is preloaded into the current character register then 2484 // check that now. 2485 if (trace->characters_preloaded() == 1) { 2486 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2487 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace, 2488 false, &bound_checked_to); 2489 } 2490 first_elt_done = true; 2491 } 2492 2493 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2494 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace, 2495 first_elt_done, &bound_checked_to); 2496 } 2497 2498 Trace successor_trace(*trace); 2499 // If we advance backward, we may end up at the start. 2500 successor_trace.AdvanceCurrentPositionInTrace( 2501 read_backward() ? -Length() : Length(), compiler); 2502 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN 2503 : Trace::FALSE_VALUE); 2504 RecursionCheck rc(compiler); 2505 on_success()->Emit(compiler, &successor_trace); 2506} 2507 2508void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; } 2509 2510void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 2511 // We don't have an instruction for shifting the current character register 2512 // down or for using a shifted value for anything so lets just forget that 2513 // we preloaded any characters into it. 2514 characters_preloaded_ = 0; 2515 // Adjust the offsets of the quick check performed information. This 2516 // information is used to find out what we already determined about the 2517 // characters by means of mask and compare. 2518 quick_check_performed_.Advance(by, compiler->one_byte()); 2519 cp_offset_ += by; 2520 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 2521 compiler->SetRegExpTooBig(); 2522 cp_offset_ = 0; 2523 } 2524 bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by); 2525} 2526 2527void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte, 2528 RegExpFlags flags) { 2529 if (!IsIgnoreCase(flags)) return; 2530#ifdef V8_INTL_SUPPORT 2531 if (NeedsUnicodeCaseEquivalents(flags)) return; 2532#endif 2533 2534 int element_count = elements()->length(); 2535 for (int i = 0; i < element_count; i++) { 2536 TextElement elm = elements()->at(i); 2537 if (elm.text_type() == TextElement::CHAR_CLASS) { 2538 RegExpCharacterClass* cc = elm.char_class(); 2539 // None of the standard character classes is different in the case 2540 // independent case and it slows us down if we don't know that. 2541 if (cc->is_standard(zone())) continue; 2542 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2543 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); 2544 } 2545 } 2546} 2547 2548int TextNode::GreedyLoopTextLength() { return Length(); } 2549 2550RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 2551 RegExpCompiler* compiler) { 2552 if (read_backward()) return nullptr; 2553 if (elements()->length() != 1) return nullptr; 2554 TextElement elm = elements()->at(0); 2555 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr; 2556 RegExpCharacterClass* node = elm.char_class(); 2557 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 2558 CharacterRange::Canonicalize(ranges); 2559 if (node->is_negated()) { 2560 return ranges->length() == 0 ? on_success() : nullptr; 2561 } 2562 if (ranges->length() != 1) return nullptr; 2563 const base::uc32 max_char = MaxCodeUnit(compiler->one_byte()); 2564 return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr; 2565} 2566 2567// Finds the fixed match length of a sequence of nodes that goes from 2568// this alternative and back to this choice node. If there are variable 2569// length nodes or other complications in the way then return a sentinel 2570// value indicating that a greedy loop cannot be constructed. 2571int ChoiceNode::GreedyLoopTextLengthForAlternative( 2572 GuardedAlternative* alternative) { 2573 int length = 0; 2574 RegExpNode* node = alternative->node(); 2575 // Later we will generate code for all these text nodes using recursion 2576 // so we have to limit the max number. 2577 int recursion_depth = 0; 2578 while (node != this) { 2579 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 2580 return kNodeIsTooComplexForGreedyLoops; 2581 } 2582 int node_length = node->GreedyLoopTextLength(); 2583 if (node_length == kNodeIsTooComplexForGreedyLoops) { 2584 return kNodeIsTooComplexForGreedyLoops; 2585 } 2586 length += node_length; 2587 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 2588 node = seq_node->on_success(); 2589 } 2590 if (read_backward()) { 2591 length = -length; 2592 } 2593 // Check that we can jump by the whole text length. If not, return sentinel 2594 // to indicate the we can't construct a greedy loop. 2595 if (length < RegExpMacroAssembler::kMinCPOffset || 2596 length > RegExpMacroAssembler::kMaxCPOffset) { 2597 return kNodeIsTooComplexForGreedyLoops; 2598 } 2599 return length; 2600} 2601 2602void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 2603 DCHECK_NULL(loop_node_); 2604 AddAlternative(alt); 2605 loop_node_ = alt.node(); 2606} 2607 2608void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 2609 DCHECK_NULL(continue_node_); 2610 AddAlternative(alt); 2611 continue_node_ = alt.node(); 2612} 2613 2614void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2615 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2616 if (trace->stop_node() == this) { 2617 // Back edge of greedy optimized loop node graph. 2618 int text_length = 2619 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 2620 DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length); 2621 // Update the counter-based backtracking info on the stack. This is an 2622 // optimization for greedy loops (see below). 2623 DCHECK(trace->cp_offset() == text_length); 2624 macro_assembler->AdvanceCurrentPosition(text_length); 2625 macro_assembler->GoTo(trace->loop_label()); 2626 return; 2627 } 2628 DCHECK_NULL(trace->stop_node()); 2629 if (!trace->is_trivial()) { 2630 trace->Flush(compiler, this); 2631 return; 2632 } 2633 ChoiceNode::Emit(compiler, trace); 2634} 2635 2636int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 2637 int eats_at_least) { 2638 int preload_characters = std::min(4, eats_at_least); 2639 DCHECK_LE(preload_characters, 4); 2640 if (compiler->macro_assembler()->CanReadUnaligned()) { 2641 bool one_byte = compiler->one_byte(); 2642 if (one_byte) { 2643 // We can't preload 3 characters because there is no machine instruction 2644 // to do that. We can't just load 4 because we could be reading 2645 // beyond the end of the string, which could cause a memory fault. 2646 if (preload_characters == 3) preload_characters = 2; 2647 } else { 2648 if (preload_characters > 2) preload_characters = 2; 2649 } 2650 } else { 2651 if (preload_characters > 1) preload_characters = 1; 2652 } 2653 return preload_characters; 2654} 2655 2656// This class is used when generating the alternatives in a choice node. It 2657// records the way the alternative is being code generated. 2658class AlternativeGeneration : public Malloced { 2659 public: 2660 AlternativeGeneration() 2661 : possible_success(), 2662 expects_preload(false), 2663 after(), 2664 quick_check_details() {} 2665 Label possible_success; 2666 bool expects_preload; 2667 Label after; 2668 QuickCheckDetails quick_check_details; 2669}; 2670 2671// Creates a list of AlternativeGenerations. If the list has a reasonable 2672// size then it is on the stack, otherwise the excess is on the heap. 2673class AlternativeGenerationList { 2674 public: 2675 AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) { 2676 for (int i = 0; i < count && i < kAFew; i++) { 2677 alt_gens_.Add(a_few_alt_gens_ + i, zone); 2678 } 2679 for (int i = kAFew; i < count; i++) { 2680 alt_gens_.Add(new AlternativeGeneration(), zone); 2681 } 2682 } 2683 ~AlternativeGenerationList() { 2684 for (int i = kAFew; i < alt_gens_.length(); i++) { 2685 delete alt_gens_[i]; 2686 alt_gens_[i] = nullptr; 2687 } 2688 } 2689 2690 AlternativeGeneration* at(int i) { return alt_gens_[i]; } 2691 2692 private: 2693 static const int kAFew = 10; 2694 ZoneList<AlternativeGeneration*> alt_gens_; 2695 AlternativeGeneration a_few_alt_gens_[kAFew]; 2696}; 2697 2698void BoyerMoorePositionInfo::Set(int character) { 2699 SetInterval(Interval(character, character)); 2700} 2701 2702namespace { 2703 2704ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges, 2705 int ranges_length, Interval new_range) { 2706 DCHECK_EQ(1, ranges_length & 1); 2707 DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]); 2708 if (containment == kLatticeUnknown) return containment; 2709 bool inside = false; 2710 int last = 0; 2711 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 2712 // Consider the range from last to ranges[i]. 2713 // We haven't got to the new range yet. 2714 if (ranges[i] <= new_range.from()) continue; 2715 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 2716 // inclusive, but the values in ranges are not. 2717 if (last <= new_range.from() && new_range.to() < ranges[i]) { 2718 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 2719 } 2720 return kLatticeUnknown; 2721 } 2722 return containment; 2723} 2724 2725int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) { 2726 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 2727 2 * kInt64Size * kBitsPerByte); 2728 2729 // Slight fiddling is needed here, since the bitset is of length 128 while 2730 // CountTrailingZeros requires an integral type and std::bitset can only 2731 // convert to unsigned long long. So we handle the most- and least-significant 2732 // bits separately. 2733 2734 { 2735 static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0}); 2736 BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask; 2737 STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong()))); 2738 uint64_t lsb = masked_bitset.to_ullong(); 2739 if (lsb != 0) return base::bits::CountTrailingZeros(lsb); 2740 } 2741 2742 { 2743 BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64; 2744 uint64_t msb = masked_bitset.to_ullong(); 2745 if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb); 2746 } 2747 2748 return -1; 2749} 2750 2751} // namespace 2752 2753void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 2754 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 2755 2756 if (interval.size() >= kMapSize) { 2757 map_count_ = kMapSize; 2758 map_.set(); 2759 return; 2760 } 2761 2762 for (int i = interval.from(); i <= interval.to(); i++) { 2763 int mod_character = (i & kMask); 2764 if (!map_[mod_character]) { 2765 map_count_++; 2766 map_.set(mod_character); 2767 } 2768 if (map_count_ == kMapSize) return; 2769 } 2770} 2771 2772void BoyerMoorePositionInfo::SetAll() { 2773 w_ = kLatticeUnknown; 2774 if (map_count_ != kMapSize) { 2775 map_count_ = kMapSize; 2776 map_.set(); 2777 } 2778} 2779 2780BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler, 2781 Zone* zone) 2782 : length_(length), 2783 compiler_(compiler), 2784 max_char_(MaxCodeUnit(compiler->one_byte())) { 2785 bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone); 2786 for (int i = 0; i < length; i++) { 2787 bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone); 2788 } 2789} 2790 2791// Find the longest range of lookahead that has the fewest number of different 2792// characters that can occur at a given position. Since we are optimizing two 2793// different parameters at once this is a tradeoff. 2794bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 2795 int biggest_points = 0; 2796 // If more than 32 characters out of 128 can occur it is unlikely that we can 2797 // be lucky enough to step forwards much of the time. 2798 const int kMaxMax = 32; 2799 for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax; 2800 max_number_of_chars *= 2) { 2801 biggest_points = 2802 FindBestInterval(max_number_of_chars, biggest_points, from, to); 2803 } 2804 if (biggest_points == 0) return false; 2805 return true; 2806} 2807 2808// Find the highest-points range between 0 and length_ where the character 2809// information is not too vague. 'Too vague' means that there are more than 2810// max_number_of_chars that can occur at this position. Calculates the number 2811// of points as the product of width-of-the-range and 2812// probability-of-finding-one-of-the-characters, where the probability is 2813// calculated using the frequency distribution of the sample subject string. 2814int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars, 2815 int old_biggest_points, int* from, 2816 int* to) { 2817 int biggest_points = old_biggest_points; 2818 static const int kSize = RegExpMacroAssembler::kTableSize; 2819 for (int i = 0; i < length_;) { 2820 while (i < length_ && Count(i) > max_number_of_chars) i++; 2821 if (i == length_) break; 2822 int remembered_from = i; 2823 2824 BoyerMoorePositionInfo::Bitset union_bitset; 2825 for (; i < length_ && Count(i) <= max_number_of_chars; i++) { 2826 union_bitset |= bitmaps_->at(i)->raw_bitset(); 2827 } 2828 2829 int frequency = 0; 2830 2831 // Iterate only over set bits. 2832 int j; 2833 while ((j = BitsetFirstSetBit(union_bitset)) != -1) { 2834 DCHECK(union_bitset[j]); // Sanity check. 2835 // Add 1 to the frequency to give a small per-character boost for 2836 // the cases where our sampling is not good enough and many 2837 // characters have a frequency of zero. This means the frequency 2838 // can theoretically be up to 2*kSize though we treat it mostly as 2839 // a fraction of kSize. 2840 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 2841 union_bitset.reset(j); 2842 } 2843 2844 // We use the probability of skipping times the distance we are skipping to 2845 // judge the effectiveness of this. Actually we have a cut-off: By 2846 // dividing by 2 we switch off the skipping if the probability of skipping 2847 // is less than 50%. This is because the multibyte mask-and-compare 2848 // skipping in quickcheck is more likely to do well on this case. 2849 bool in_quickcheck_range = 2850 ((i - remembered_from < 4) || 2851 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 2852 // Called 'probability' but it is only a rough estimate and can actually 2853 // be outside the 0-kSize range. 2854 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 2855 int points = (i - remembered_from) * probability; 2856 if (points > biggest_points) { 2857 *from = remembered_from; 2858 *to = i - 1; 2859 biggest_points = points; 2860 } 2861 } 2862 return biggest_points; 2863} 2864 2865// Take all the characters that will not prevent a successful match if they 2866// occur in the subject string in the range between min_lookahead and 2867// max_lookahead (inclusive) measured from the current position. If the 2868// character at max_lookahead offset is not one of these characters, then we 2869// can safely skip forwards by the number of characters in the range. 2870int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead, 2871 Handle<ByteArray> boolean_skip_table) { 2872 const int kSkipArrayEntry = 0; 2873 const int kDontSkipArrayEntry = 1; 2874 2875 std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry, 2876 boolean_skip_table->length()); 2877 2878 for (int i = max_lookahead; i >= min_lookahead; i--) { 2879 BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset(); 2880 2881 // Iterate only over set bits. 2882 int j; 2883 while ((j = BitsetFirstSetBit(bitset)) != -1) { 2884 DCHECK(bitset[j]); // Sanity check. 2885 boolean_skip_table->set(j, kDontSkipArrayEntry); 2886 bitset.reset(j); 2887 } 2888 } 2889 2890 const int skip = max_lookahead + 1 - min_lookahead; 2891 return skip; 2892} 2893 2894// See comment above on the implementation of GetSkipTable. 2895void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 2896 const int kSize = RegExpMacroAssembler::kTableSize; 2897 2898 int min_lookahead = 0; 2899 int max_lookahead = 0; 2900 2901 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 2902 2903 // Check if we only have a single non-empty position info, and that info 2904 // contains precisely one character. 2905 bool found_single_character = false; 2906 int single_character = 0; 2907 for (int i = max_lookahead; i >= min_lookahead; i--) { 2908 BoyerMoorePositionInfo* map = bitmaps_->at(i); 2909 if (map->map_count() == 0) continue; 2910 2911 if (found_single_character || map->map_count() > 1) { 2912 found_single_character = false; 2913 break; 2914 } 2915 2916 DCHECK(!found_single_character); 2917 DCHECK_EQ(map->map_count(), 1); 2918 2919 found_single_character = true; 2920 single_character = BitsetFirstSetBit(map->raw_bitset()); 2921 2922 DCHECK_NE(single_character, -1); 2923 } 2924 2925 int lookahead_width = max_lookahead + 1 - min_lookahead; 2926 2927 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 2928 // The mask-compare can probably handle this better. 2929 return; 2930 } 2931 2932 if (found_single_character) { 2933 Label cont, again; 2934 masm->Bind(&again); 2935 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 2936 if (max_char_ > kSize) { 2937 masm->CheckCharacterAfterAnd(single_character, 2938 RegExpMacroAssembler::kTableMask, &cont); 2939 } else { 2940 masm->CheckCharacter(single_character, &cont); 2941 } 2942 masm->AdvanceCurrentPosition(lookahead_width); 2943 masm->GoTo(&again); 2944 masm->Bind(&cont); 2945 return; 2946 } 2947 2948 Factory* factory = masm->isolate()->factory(); 2949 Handle<ByteArray> boolean_skip_table = 2950 factory->NewByteArray(kSize, AllocationType::kOld); 2951 int skip_distance = 2952 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table); 2953 DCHECK_NE(0, skip_distance); 2954 2955 Label cont, again; 2956 masm->Bind(&again); 2957 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 2958 masm->CheckBitInTable(boolean_skip_table, &cont); 2959 masm->AdvanceCurrentPosition(skip_distance); 2960 masm->GoTo(&again); 2961 masm->Bind(&cont); 2962} 2963 2964/* Code generation for choice nodes. 2965 * 2966 * We generate quick checks that do a mask and compare to eliminate a 2967 * choice. If the quick check succeeds then it jumps to the continuation to 2968 * do slow checks and check subsequent nodes. If it fails (the common case) 2969 * it falls through to the next choice. 2970 * 2971 * Here is the desired flow graph. Nodes directly below each other imply 2972 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 2973 * 3 doesn't have a quick check so we have to call the slow check. 2974 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 2975 * regexp continuation is generated directly after the Sn node, up to the 2976 * next GoTo if we decide to reuse some already generated code. Some 2977 * nodes expect preload_characters to be preloaded into the current 2978 * character register. R nodes do this preloading. Vertices are marked 2979 * F for failures and S for success (possible success in the case of quick 2980 * nodes). L, V, < and > are used as arrow heads. 2981 * 2982 * ----------> R 2983 * | 2984 * V 2985 * Q1 -----> S1 2986 * | S / 2987 * F| / 2988 * | F/ 2989 * | / 2990 * | R 2991 * | / 2992 * V L 2993 * Q2 -----> S2 2994 * | S / 2995 * F| / 2996 * | F/ 2997 * | / 2998 * | R 2999 * | / 3000 * V L 3001 * S3 3002 * | 3003 * F| 3004 * | 3005 * R 3006 * | 3007 * backtrack V 3008 * <----------Q4 3009 * \ F | 3010 * \ |S 3011 * \ F V 3012 * \-----S4 3013 * 3014 * For greedy loops we push the current position, then generate the code that 3015 * eats the input specially in EmitGreedyLoop. The other choice (the 3016 * continuation) is generated by the normal code in EmitChoices, and steps back 3017 * in the input to the starting position when it fails to match. The loop code 3018 * looks like this (U is the unwind code that steps back in the greedy loop). 3019 * 3020 * _____ 3021 * / \ 3022 * V | 3023 * ----------> S1 | 3024 * /| | 3025 * / |S | 3026 * F/ \_____/ 3027 * / 3028 * |<----- 3029 * | \ 3030 * V |S 3031 * Q2 ---> U----->backtrack 3032 * | F / 3033 * S| / 3034 * V F / 3035 * S2--/ 3036 */ 3037 3038GreedyLoopState::GreedyLoopState(bool not_at_start) { 3039 counter_backtrack_trace_.set_backtrack(&label_); 3040 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); 3041} 3042 3043void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3044#ifdef DEBUG 3045 int choice_count = alternatives_->length(); 3046 for (int i = 0; i < choice_count - 1; i++) { 3047 GuardedAlternative alternative = alternatives_->at(i); 3048 ZoneList<Guard*>* guards = alternative.guards(); 3049 int guard_count = (guards == nullptr) ? 0 : guards->length(); 3050 for (int j = 0; j < guard_count; j++) { 3051 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3052 } 3053 } 3054#endif 3055} 3056 3057void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, 3058 PreloadState* state) { 3059 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3060 // Save some time by looking at most one machine word ahead. 3061 state->eats_at_least_ = 3062 EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE); 3063 } 3064 state->preload_characters_ = 3065 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3066 3067 state->preload_is_current_ = 3068 (current_trace->characters_preloaded() == state->preload_characters_); 3069 state->preload_has_checked_bounds_ = state->preload_is_current_; 3070} 3071 3072void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3073 int choice_count = alternatives_->length(); 3074 3075 if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) { 3076 alternatives_->at(0).node()->Emit(compiler, trace); 3077 return; 3078 } 3079 3080 AssertGuardsMentionRegisters(trace); 3081 3082 LimitResult limit_result = LimitVersions(compiler, trace); 3083 if (limit_result == DONE) return; 3084 DCHECK(limit_result == CONTINUE); 3085 3086 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3087 // other choice nodes we only flush if we are out of code size budget. 3088 if (trace->flush_budget() == 0 && trace->actions() != nullptr) { 3089 trace->Flush(compiler, this); 3090 return; 3091 } 3092 3093 RecursionCheck rc(compiler); 3094 3095 PreloadState preload; 3096 preload.init(); 3097 GreedyLoopState greedy_loop_state(not_at_start()); 3098 3099 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); 3100 AlternativeGenerationList alt_gens(choice_count, zone()); 3101 3102 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 3103 trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload, 3104 &greedy_loop_state, text_length); 3105 } else { 3106 // TODO(erikcorry): Delete this. We don't need this label, but it makes us 3107 // match the traces produced pre-cleanup. 3108 Label second_choice; 3109 compiler->macro_assembler()->Bind(&second_choice); 3110 3111 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 3112 3113 EmitChoices(compiler, &alt_gens, 0, trace, &preload); 3114 } 3115 3116 // At this point we need to generate slow checks for the alternatives where 3117 // the quick check was inlined. We can recognize these because the associated 3118 // label was bound. 3119 int new_flush_budget = trace->flush_budget() / choice_count; 3120 for (int i = 0; i < choice_count; i++) { 3121 AlternativeGeneration* alt_gen = alt_gens.at(i); 3122 Trace new_trace(*trace); 3123 // If there are actions to be flushed we have to limit how many times 3124 // they are flushed. Take the budget of the parent trace and distribute 3125 // it fairly amongst the children. 3126 if (new_trace.actions() != nullptr) { 3127 new_trace.set_flush_budget(new_flush_budget); 3128 } 3129 bool next_expects_preload = 3130 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 3131 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i), 3132 alt_gen, preload.preload_characters_, 3133 next_expects_preload); 3134 } 3135} 3136 3137Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace, 3138 AlternativeGenerationList* alt_gens, 3139 PreloadState* preload, 3140 GreedyLoopState* greedy_loop_state, 3141 int text_length) { 3142 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3143 // Here we have special handling for greedy loops containing only text nodes 3144 // and other simple nodes. These are handled by pushing the current 3145 // position on the stack and then incrementing the current position each 3146 // time around the switch. On backtrack we decrement the current position 3147 // and check it against the pushed value. This avoids pushing backtrack 3148 // information for each iteration of the loop, which could take up a lot of 3149 // space. 3150 DCHECK(trace->stop_node() == nullptr); 3151 macro_assembler->PushCurrentPosition(); 3152 Label greedy_match_failed; 3153 Trace greedy_match_trace; 3154 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); 3155 greedy_match_trace.set_backtrack(&greedy_match_failed); 3156 Label loop_label; 3157 macro_assembler->Bind(&loop_label); 3158 greedy_match_trace.set_stop_node(this); 3159 greedy_match_trace.set_loop_label(&loop_label); 3160 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 3161 macro_assembler->Bind(&greedy_match_failed); 3162 3163 Label second_choice; // For use in greedy matches. 3164 macro_assembler->Bind(&second_choice); 3165 3166 Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); 3167 3168 EmitChoices(compiler, alt_gens, 1, new_trace, preload); 3169 3170 macro_assembler->Bind(greedy_loop_state->label()); 3171 // If we have unwound to the bottom then backtrack. 3172 macro_assembler->CheckGreedyLoop(trace->backtrack()); 3173 // Otherwise try the second priority at an earlier position. 3174 macro_assembler->AdvanceCurrentPosition(-text_length); 3175 macro_assembler->GoTo(&second_choice); 3176 return new_trace; 3177} 3178 3179int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 3180 Trace* trace) { 3181 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 3182 if (alternatives_->length() != 2) return eats_at_least; 3183 3184 GuardedAlternative alt1 = alternatives_->at(1); 3185 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) { 3186 return eats_at_least; 3187 } 3188 RegExpNode* eats_anything_node = alt1.node(); 3189 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 3190 return eats_at_least; 3191 } 3192 3193 // Really we should be creating a new trace when we execute this function, 3194 // but there is no need, because the code it generates cannot backtrack, and 3195 // we always arrive here with a trivial trace (since it's the entry to a 3196 // loop. That also implies that there are no preloaded characters, which is 3197 // good, because it means we won't be violating any assumptions by 3198 // overwriting those characters with new load instructions. 3199 DCHECK(trace->is_trivial()); 3200 3201 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3202 Isolate* isolate = macro_assembler->isolate(); 3203 // At this point we know that we are at a non-greedy loop that will eat 3204 // any character one at a time. Any non-anchored regexp has such a 3205 // loop prepended to it in order to find where it starts. We look for 3206 // a pattern of the form ...abc... where we can look 6 characters ahead 3207 // and step forwards 3 if the character is not one of abc. Abc need 3208 // not be atoms, they can be any reasonably limited character class or 3209 // small alternation. 3210 BoyerMooreLookahead* bm = bm_info(false); 3211 if (bm == nullptr) { 3212 eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false)); 3213 if (eats_at_least >= 1) { 3214 bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone()); 3215 GuardedAlternative alt0 = alternatives_->at(0); 3216 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 3217 } 3218 } 3219 if (bm != nullptr) { 3220 bm->EmitSkipInstructions(macro_assembler); 3221 } 3222 return eats_at_least; 3223} 3224 3225void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 3226 AlternativeGenerationList* alt_gens, 3227 int first_choice, Trace* trace, 3228 PreloadState* preload) { 3229 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3230 SetUpPreLoad(compiler, trace, preload); 3231 3232 // For now we just call all choices one after the other. The idea ultimately 3233 // is to use the Dispatch table to try only the relevant ones. 3234 int choice_count = alternatives_->length(); 3235 3236 int new_flush_budget = trace->flush_budget() / choice_count; 3237 3238 for (int i = first_choice; i < choice_count; i++) { 3239 bool is_last = i == choice_count - 1; 3240 bool fall_through_on_failure = !is_last; 3241 GuardedAlternative alternative = alternatives_->at(i); 3242 AlternativeGeneration* alt_gen = alt_gens->at(i); 3243 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 3244 ZoneList<Guard*>* guards = alternative.guards(); 3245 int guard_count = (guards == nullptr) ? 0 : guards->length(); 3246 Trace new_trace(*trace); 3247 new_trace.set_characters_preloaded( 3248 preload->preload_is_current_ ? preload->preload_characters_ : 0); 3249 if (preload->preload_has_checked_bounds_) { 3250 new_trace.set_bound_checked_up_to(preload->preload_characters_); 3251 } 3252 new_trace.quick_check_performed()->Clear(); 3253 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 3254 if (!is_last) { 3255 new_trace.set_backtrack(&alt_gen->after); 3256 } 3257 alt_gen->expects_preload = preload->preload_is_current_; 3258 bool generate_full_check_inline = false; 3259 if (compiler->optimize() && 3260 try_to_emit_quick_check_for_alternative(i == 0) && 3261 alternative.node()->EmitQuickCheck( 3262 compiler, trace, &new_trace, preload->preload_has_checked_bounds_, 3263 &alt_gen->possible_success, &alt_gen->quick_check_details, 3264 fall_through_on_failure, this)) { 3265 // Quick check was generated for this choice. 3266 preload->preload_is_current_ = true; 3267 preload->preload_has_checked_bounds_ = true; 3268 // If we generated the quick check to fall through on possible success, 3269 // we now need to generate the full check inline. 3270 if (!fall_through_on_failure) { 3271 macro_assembler->Bind(&alt_gen->possible_success); 3272 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 3273 new_trace.set_characters_preloaded(preload->preload_characters_); 3274 new_trace.set_bound_checked_up_to(preload->preload_characters_); 3275 generate_full_check_inline = true; 3276 } 3277 } else if (alt_gen->quick_check_details.cannot_match()) { 3278 if (!fall_through_on_failure) { 3279 macro_assembler->GoTo(trace->backtrack()); 3280 } 3281 continue; 3282 } else { 3283 // No quick check was generated. Put the full code here. 3284 // If this is not the first choice then there could be slow checks from 3285 // previous cases that go here when they fail. There's no reason to 3286 // insist that they preload characters since the slow check we are about 3287 // to generate probably can't use it. 3288 if (i != first_choice) { 3289 alt_gen->expects_preload = false; 3290 new_trace.InvalidateCurrentCharacter(); 3291 } 3292 generate_full_check_inline = true; 3293 } 3294 if (generate_full_check_inline) { 3295 if (new_trace.actions() != nullptr) { 3296 new_trace.set_flush_budget(new_flush_budget); 3297 } 3298 for (int j = 0; j < guard_count; j++) { 3299 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 3300 } 3301 alternative.node()->Emit(compiler, &new_trace); 3302 preload->preload_is_current_ = false; 3303 } 3304 macro_assembler->Bind(&alt_gen->after); 3305 } 3306} 3307 3308void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 3309 Trace* trace, 3310 GuardedAlternative alternative, 3311 AlternativeGeneration* alt_gen, 3312 int preload_characters, 3313 bool next_expects_preload) { 3314 if (!alt_gen->possible_success.is_linked()) return; 3315 3316 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3317 macro_assembler->Bind(&alt_gen->possible_success); 3318 Trace out_of_line_trace(*trace); 3319 out_of_line_trace.set_characters_preloaded(preload_characters); 3320 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 3321 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 3322 ZoneList<Guard*>* guards = alternative.guards(); 3323 int guard_count = (guards == nullptr) ? 0 : guards->length(); 3324 if (next_expects_preload) { 3325 Label reload_current_char; 3326 out_of_line_trace.set_backtrack(&reload_current_char); 3327 for (int j = 0; j < guard_count; j++) { 3328 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3329 } 3330 alternative.node()->Emit(compiler, &out_of_line_trace); 3331 macro_assembler->Bind(&reload_current_char); 3332 // Reload the current character, since the next quick check expects that. 3333 // We don't need to check bounds here because we only get into this 3334 // code through a quick check which already did the checked load. 3335 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false, 3336 preload_characters); 3337 macro_assembler->GoTo(&(alt_gen->after)); 3338 } else { 3339 out_of_line_trace.set_backtrack(&(alt_gen->after)); 3340 for (int j = 0; j < guard_count; j++) { 3341 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3342 } 3343 alternative.node()->Emit(compiler, &out_of_line_trace); 3344 } 3345} 3346 3347void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3348 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3349 LimitResult limit_result = LimitVersions(compiler, trace); 3350 if (limit_result == DONE) return; 3351 DCHECK(limit_result == CONTINUE); 3352 3353 RecursionCheck rc(compiler); 3354 3355 switch (action_type_) { 3356 case STORE_POSITION: { 3357 Trace::DeferredCapture new_capture(data_.u_position_register.reg, 3358 data_.u_position_register.is_capture, 3359 trace); 3360 Trace new_trace = *trace; 3361 new_trace.add_action(&new_capture); 3362 on_success()->Emit(compiler, &new_trace); 3363 break; 3364 } 3365 case INCREMENT_REGISTER: { 3366 Trace::DeferredIncrementRegister new_increment( 3367 data_.u_increment_register.reg); 3368 Trace new_trace = *trace; 3369 new_trace.add_action(&new_increment); 3370 on_success()->Emit(compiler, &new_trace); 3371 break; 3372 } 3373 case SET_REGISTER_FOR_LOOP: { 3374 Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg, 3375 data_.u_store_register.value); 3376 Trace new_trace = *trace; 3377 new_trace.add_action(&new_set); 3378 on_success()->Emit(compiler, &new_trace); 3379 break; 3380 } 3381 case CLEAR_CAPTURES: { 3382 Trace::DeferredClearCaptures new_capture(Interval( 3383 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to)); 3384 Trace new_trace = *trace; 3385 new_trace.add_action(&new_capture); 3386 on_success()->Emit(compiler, &new_trace); 3387 break; 3388 } 3389 case BEGIN_POSITIVE_SUBMATCH: 3390 case BEGIN_NEGATIVE_SUBMATCH: 3391 if (!trace->is_trivial()) { 3392 trace->Flush(compiler, this); 3393 } else { 3394 assembler->WriteCurrentPositionToRegister( 3395 data_.u_submatch.current_position_register, 0); 3396 assembler->WriteStackPointerToRegister( 3397 data_.u_submatch.stack_pointer_register); 3398 on_success()->Emit(compiler, trace); 3399 } 3400 break; 3401 case EMPTY_MATCH_CHECK: { 3402 int start_pos_reg = data_.u_empty_match_check.start_register; 3403 int stored_pos = 0; 3404 int rep_reg = data_.u_empty_match_check.repetition_register; 3405 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 3406 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 3407 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 3408 // If we know we haven't advanced and there is no minimum we 3409 // can just backtrack immediately. 3410 assembler->GoTo(trace->backtrack()); 3411 } else if (know_dist && stored_pos < trace->cp_offset()) { 3412 // If we know we've advanced we can generate the continuation 3413 // immediately. 3414 on_success()->Emit(compiler, trace); 3415 } else if (!trace->is_trivial()) { 3416 trace->Flush(compiler, this); 3417 } else { 3418 Label skip_empty_check; 3419 // If we have a minimum number of repetitions we check the current 3420 // number first and skip the empty check if it's not enough. 3421 if (has_minimum) { 3422 int limit = data_.u_empty_match_check.repetition_limit; 3423 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 3424 } 3425 // If the match is empty we bail out, otherwise we fall through 3426 // to the on-success continuation. 3427 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 3428 trace->backtrack()); 3429 assembler->Bind(&skip_empty_check); 3430 on_success()->Emit(compiler, trace); 3431 } 3432 break; 3433 } 3434 case POSITIVE_SUBMATCH_SUCCESS: { 3435 if (!trace->is_trivial()) { 3436 trace->Flush(compiler, this); 3437 return; 3438 } 3439 assembler->ReadCurrentPositionFromRegister( 3440 data_.u_submatch.current_position_register); 3441 assembler->ReadStackPointerFromRegister( 3442 data_.u_submatch.stack_pointer_register); 3443 int clear_register_count = data_.u_submatch.clear_register_count; 3444 if (clear_register_count == 0) { 3445 on_success()->Emit(compiler, trace); 3446 return; 3447 } 3448 int clear_registers_from = data_.u_submatch.clear_register_from; 3449 Label clear_registers_backtrack; 3450 Trace new_trace = *trace; 3451 new_trace.set_backtrack(&clear_registers_backtrack); 3452 on_success()->Emit(compiler, &new_trace); 3453 3454 assembler->Bind(&clear_registers_backtrack); 3455 int clear_registers_to = clear_registers_from + clear_register_count - 1; 3456 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 3457 3458 DCHECK(trace->backtrack() == nullptr); 3459 assembler->Backtrack(); 3460 return; 3461 } 3462 default: 3463 UNREACHABLE(); 3464 } 3465} 3466 3467void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3468 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3469 if (!trace->is_trivial()) { 3470 trace->Flush(compiler, this); 3471 return; 3472 } 3473 3474 LimitResult limit_result = LimitVersions(compiler, trace); 3475 if (limit_result == DONE) return; 3476 DCHECK(limit_result == CONTINUE); 3477 3478 RecursionCheck rc(compiler); 3479 3480 DCHECK_EQ(start_reg_ + 1, end_reg_); 3481 if (IsIgnoreCase(flags_)) { 3482 bool unicode = IsUnicode(flags_); 3483 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), 3484 unicode, trace->backtrack()); 3485 } else { 3486 assembler->CheckNotBackReference(start_reg_, read_backward(), 3487 trace->backtrack()); 3488 } 3489 // We are going to advance backward, so we may end up at the start. 3490 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 3491 3492 // Check that the back reference does not end inside a surrogate pair. 3493 if (IsUnicode(flags_) && !compiler->one_byte()) { 3494 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); 3495 } 3496 on_success()->Emit(compiler, trace); 3497} 3498 3499void TextNode::CalculateOffsets() { 3500 int element_count = elements()->length(); 3501 // Set up the offsets of the elements relative to the start. This is a fixed 3502 // quantity since a TextNode can only contain fixed-width things. 3503 int cp_offset = 0; 3504 for (int i = 0; i < element_count; i++) { 3505 TextElement& elm = elements()->at(i); 3506 elm.set_cp_offset(cp_offset); 3507 cp_offset += elm.length(); 3508 } 3509} 3510 3511namespace { 3512 3513// Assertion propagation moves information about assertions such as 3514// \b to the affected nodes. For instance, in /.\b./ information must 3515// be propagated to the first '.' that whatever follows needs to know 3516// if it matched a word or a non-word, and to the second '.' that it 3517// has to check if it succeeds a word or non-word. In this case the 3518// result will be something like: 3519// 3520// +-------+ +------------+ 3521// | . | | . | 3522// +-------+ ---> +------------+ 3523// | word? | | check word | 3524// +-------+ +------------+ 3525class AssertionPropagator : public AllStatic { 3526 public: 3527 static void VisitText(TextNode* that) {} 3528 3529 static void VisitAction(ActionNode* that) { 3530 // If the next node is interested in what it follows then this node 3531 // has to be interested too so it can pass the information on. 3532 that->info()->AddFromFollowing(that->on_success()->info()); 3533 } 3534 3535 static void VisitChoice(ChoiceNode* that, int i) { 3536 // Anything the following nodes need to know has to be known by 3537 // this node also, so it can pass it on. 3538 that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info()); 3539 } 3540 3541 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { 3542 that->info()->AddFromFollowing(that->continue_node()->info()); 3543 } 3544 3545 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) { 3546 that->info()->AddFromFollowing(that->loop_node()->info()); 3547 } 3548 3549 static void VisitNegativeLookaroundChoiceLookaroundNode( 3550 NegativeLookaroundChoiceNode* that) { 3551 VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex); 3552 } 3553 3554 static void VisitNegativeLookaroundChoiceContinueNode( 3555 NegativeLookaroundChoiceNode* that) { 3556 VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex); 3557 } 3558 3559 static void VisitBackReference(BackReferenceNode* that) {} 3560 3561 static void VisitAssertion(AssertionNode* that) {} 3562}; 3563 3564// Propagates information about the minimum size of successful matches from 3565// successor nodes to their predecessors. Note that all eats_at_least values 3566// are initialized to zero before analysis. 3567class EatsAtLeastPropagator : public AllStatic { 3568 public: 3569 static void VisitText(TextNode* that) { 3570 // The eats_at_least value is not used if reading backward. 3571 if (!that->read_backward()) { 3572 // We are not at the start after this node, and thus we can use the 3573 // successor's eats_at_least_from_not_start value. 3574 uint8_t eats_at_least = base::saturated_cast<uint8_t>( 3575 that->Length() + that->on_success() 3576 ->eats_at_least_info() 3577 ->eats_at_least_from_not_start); 3578 that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least)); 3579 } 3580 } 3581 3582 static void VisitAction(ActionNode* that) { 3583 switch (that->action_type()) { 3584 case ActionNode::BEGIN_POSITIVE_SUBMATCH: 3585 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 3586 // We do not propagate eats_at_least data through positive lookarounds, 3587 // because they rewind input. 3588 // TODO(v8:11859) Potential approaches for fixing this include: 3589 // 1. Add a dedicated choice node for positive lookaround, similar to 3590 // NegativeLookaroundChoiceNode. 3591 // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which 3592 // is <= eats_at_least_from_possibly_start, and use that value in 3593 // EatsAtLeastFromLoopEntry. 3594 DCHECK(that->eats_at_least_info()->IsZero()); 3595 break; 3596 case ActionNode::SET_REGISTER_FOR_LOOP: 3597 // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the 3598 // loop body will run at least the minimum number of times before the 3599 // continuation case can run. 3600 that->set_eats_at_least_info( 3601 that->on_success()->EatsAtLeastFromLoopEntry()); 3602 break; 3603 case ActionNode::BEGIN_NEGATIVE_SUBMATCH: 3604 default: 3605 // Otherwise, the current node eats at least as much as its successor. 3606 // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH 3607 // because NegativeLookaroundChoiceNode ignores its lookaround successor 3608 // when computing eats-at-least and quick check information. 3609 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); 3610 break; 3611 } 3612 } 3613 3614 static void VisitChoice(ChoiceNode* that, int i) { 3615 // The minimum possible match from a choice node is the minimum of its 3616 // successors. 3617 EatsAtLeastInfo eats_at_least = 3618 i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info(); 3619 eats_at_least.SetMin( 3620 *that->alternatives()->at(i).node()->eats_at_least_info()); 3621 that->set_eats_at_least_info(eats_at_least); 3622 } 3623 3624 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) { 3625 if (!that->read_backward()) { 3626 that->set_eats_at_least_info( 3627 *that->continue_node()->eats_at_least_info()); 3628 } 3629 } 3630 3631 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {} 3632 3633 static void VisitNegativeLookaroundChoiceLookaroundNode( 3634 NegativeLookaroundChoiceNode* that) {} 3635 3636 static void VisitNegativeLookaroundChoiceContinueNode( 3637 NegativeLookaroundChoiceNode* that) { 3638 that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info()); 3639 } 3640 3641 static void VisitBackReference(BackReferenceNode* that) { 3642 if (!that->read_backward()) { 3643 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info()); 3644 } 3645 } 3646 3647 static void VisitAssertion(AssertionNode* that) { 3648 EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info(); 3649 if (that->assertion_type() == AssertionNode::AT_START) { 3650 // If we know we are not at the start and we are asked "how many 3651 // characters will you match if you succeed?" then we can answer anything 3652 // since false implies false. So let's just set the max answer 3653 // (UINT8_MAX) since that won't prevent us from preloading a lot of 3654 // characters for the other branches in the node graph. 3655 eats_at_least.eats_at_least_from_not_start = UINT8_MAX; 3656 } 3657 that->set_eats_at_least_info(eats_at_least); 3658 } 3659}; 3660 3661} // namespace 3662 3663// ------------------------------------------------------------------- 3664// Analysis 3665 3666// Iterates the node graph and provides the opportunity for propagators to set 3667// values that depend on successor nodes. 3668template <typename... Propagators> 3669class Analysis : public NodeVisitor { 3670 public: 3671 Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags) 3672 : isolate_(isolate), 3673 is_one_byte_(is_one_byte), 3674 flags_(flags), 3675 error_(RegExpError::kNone) {} 3676 3677 void EnsureAnalyzed(RegExpNode* that) { 3678 StackLimitCheck check(isolate()); 3679 if (check.HasOverflowed()) { 3680 if (FLAG_correctness_fuzzer_suppressions) { 3681 FATAL("Analysis: Aborting on stack overflow"); 3682 } 3683 fail(RegExpError::kAnalysisStackOverflow); 3684 return; 3685 } 3686 if (that->info()->been_analyzed || that->info()->being_analyzed) return; 3687 that->info()->being_analyzed = true; 3688 that->Accept(this); 3689 that->info()->being_analyzed = false; 3690 that->info()->been_analyzed = true; 3691 } 3692 3693 bool has_failed() { return error_ != RegExpError::kNone; } 3694 RegExpError error() { 3695 DCHECK(error_ != RegExpError::kNone); 3696 return error_; 3697 } 3698 void fail(RegExpError error) { error_ = error; } 3699 3700 Isolate* isolate() const { return isolate_; } 3701 3702 void VisitEnd(EndNode* that) override { 3703 // nothing to do 3704 } 3705 3706// Used to call the given static function on each propagator / variadic template 3707// argument. 3708#define STATIC_FOR_EACH(expr) \ 3709 do { \ 3710 int dummy[] = {((expr), 0)...}; \ 3711 USE(dummy); \ 3712 } while (false) 3713 3714 void VisitText(TextNode* that) override { 3715 that->MakeCaseIndependent(isolate(), is_one_byte_, flags_); 3716 EnsureAnalyzed(that->on_success()); 3717 if (has_failed()) return; 3718 that->CalculateOffsets(); 3719 STATIC_FOR_EACH(Propagators::VisitText(that)); 3720 } 3721 3722 void VisitAction(ActionNode* that) override { 3723 EnsureAnalyzed(that->on_success()); 3724 if (has_failed()) return; 3725 STATIC_FOR_EACH(Propagators::VisitAction(that)); 3726 } 3727 3728 void VisitChoice(ChoiceNode* that) override { 3729 for (int i = 0; i < that->alternatives()->length(); i++) { 3730 EnsureAnalyzed(that->alternatives()->at(i).node()); 3731 if (has_failed()) return; 3732 STATIC_FOR_EACH(Propagators::VisitChoice(that, i)); 3733 } 3734 } 3735 3736 void VisitLoopChoice(LoopChoiceNode* that) override { 3737 DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue. 3738 3739 // First propagate all information from the continuation node. 3740 EnsureAnalyzed(that->continue_node()); 3741 if (has_failed()) return; 3742 STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that)); 3743 3744 // Check the loop last since it may need the value of this node 3745 // to get a correct result. 3746 EnsureAnalyzed(that->loop_node()); 3747 if (has_failed()) return; 3748 STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that)); 3749 } 3750 3751 void VisitNegativeLookaroundChoice( 3752 NegativeLookaroundChoiceNode* that) override { 3753 DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue. 3754 3755 EnsureAnalyzed(that->lookaround_node()); 3756 if (has_failed()) return; 3757 STATIC_FOR_EACH( 3758 Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that)); 3759 3760 EnsureAnalyzed(that->continue_node()); 3761 if (has_failed()) return; 3762 STATIC_FOR_EACH( 3763 Propagators::VisitNegativeLookaroundChoiceContinueNode(that)); 3764 } 3765 3766 void VisitBackReference(BackReferenceNode* that) override { 3767 EnsureAnalyzed(that->on_success()); 3768 if (has_failed()) return; 3769 STATIC_FOR_EACH(Propagators::VisitBackReference(that)); 3770 } 3771 3772 void VisitAssertion(AssertionNode* that) override { 3773 EnsureAnalyzed(that->on_success()); 3774 if (has_failed()) return; 3775 STATIC_FOR_EACH(Propagators::VisitAssertion(that)); 3776 } 3777 3778#undef STATIC_FOR_EACH 3779 3780 private: 3781 Isolate* isolate_; 3782 const bool is_one_byte_; 3783 const RegExpFlags flags_; 3784 RegExpError error_; 3785 3786 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); 3787}; 3788 3789RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, 3790 RegExpNode* node) { 3791 Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis( 3792 isolate, is_one_byte, flags); 3793 DCHECK_EQ(node->info()->been_analyzed, false); 3794 analysis.EnsureAnalyzed(node); 3795 DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); 3796 return analysis.has_failed() ? analysis.error() : RegExpError::kNone; 3797} 3798 3799void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 3800 BoyerMooreLookahead* bm, 3801 bool not_at_start) { 3802 // Working out the set of characters that a backreference can match is too 3803 // hard, so we just say that any character can match. 3804 bm->SetRest(offset); 3805 SaveBMInfo(bm, not_at_start, offset); 3806} 3807 3808STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 3809 RegExpMacroAssembler::kTableSize); 3810 3811void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 3812 BoyerMooreLookahead* bm, bool not_at_start) { 3813 ZoneList<GuardedAlternative>* alts = alternatives(); 3814 budget = (budget - 1) / alts->length(); 3815 for (int i = 0; i < alts->length(); i++) { 3816 GuardedAlternative& alt = alts->at(i); 3817 if (alt.guards() != nullptr && alt.guards()->length() != 0) { 3818 bm->SetRest(offset); // Give up trying to fill in info. 3819 SaveBMInfo(bm, not_at_start, offset); 3820 return; 3821 } 3822 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 3823 } 3824 SaveBMInfo(bm, not_at_start, offset); 3825} 3826 3827void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 3828 BoyerMooreLookahead* bm, bool not_at_start) { 3829 if (initial_offset >= bm->length()) return; 3830 int offset = initial_offset; 3831 int max_char = bm->max_char(); 3832 for (int i = 0; i < elements()->length(); i++) { 3833 if (offset >= bm->length()) { 3834 if (initial_offset == 0) set_bm_info(not_at_start, bm); 3835 return; 3836 } 3837 TextElement text = elements()->at(i); 3838 if (text.text_type() == TextElement::ATOM) { 3839 RegExpAtom* atom = text.atom(); 3840 for (int j = 0; j < atom->length(); j++, offset++) { 3841 if (offset >= bm->length()) { 3842 if (initial_offset == 0) set_bm_info(not_at_start, bm); 3843 return; 3844 } 3845 base::uc16 character = atom->data()[j]; 3846 if (IsIgnoreCase(bm->compiler()->flags())) { 3847 unibrow::uchar chars[4]; 3848 int length = GetCaseIndependentLetters( 3849 isolate, character, bm->max_char() == String::kMaxOneByteCharCode, 3850 chars, 4); 3851 for (int k = 0; k < length; k++) { 3852 bm->Set(offset, chars[k]); 3853 } 3854 } else { 3855 if (character <= max_char) bm->Set(offset, character); 3856 } 3857 } 3858 } else { 3859 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 3860 RegExpCharacterClass* char_class = text.char_class(); 3861 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 3862 if (char_class->is_negated()) { 3863 bm->SetAll(offset); 3864 } else { 3865 for (int k = 0; k < ranges->length(); k++) { 3866 CharacterRange& range = ranges->at(k); 3867 if (static_cast<int>(range.from()) > max_char) continue; 3868 int to = std::min(max_char, static_cast<int>(range.to())); 3869 bm->SetInterval(offset, Interval(range.from(), to)); 3870 } 3871 } 3872 offset++; 3873 } 3874 } 3875 if (offset >= bm->length()) { 3876 if (initial_offset == 0) set_bm_info(not_at_start, bm); 3877 return; 3878 } 3879 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 3880 true); // Not at start after a text node. 3881 if (initial_offset == 0) set_bm_info(not_at_start, bm); 3882} 3883 3884RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( 3885 RegExpNode* on_success) { 3886 DCHECK(!read_backward()); 3887 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 3888 zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 3889 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 3890 zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 3891 3892 ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone()); 3893 3894 int stack_register = UnicodeLookaroundStackRegister(); 3895 int position_register = UnicodeLookaroundPositionRegister(); 3896 RegExpNode* step_back = TextNode::CreateForCharacterRanges( 3897 zone(), lead_surrogates, true, on_success); 3898 RegExpLookaround::Builder builder(true, step_back, stack_register, 3899 position_register); 3900 RegExpNode* match_trail = TextNode::CreateForCharacterRanges( 3901 zone(), trail_surrogates, false, builder.on_match_success()); 3902 3903 optional_step_back->AddAlternative( 3904 GuardedAlternative(builder.ForMatch(match_trail))); 3905 optional_step_back->AddAlternative(GuardedAlternative(on_success)); 3906 3907 return optional_step_back; 3908} 3909 3910RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, 3911 RegExpFlags flags, 3912 bool is_one_byte) { 3913 // Wrap the body of the regexp in capture #0. 3914 RegExpNode* captured_body = 3915 RegExpCapture::ToNode(data->tree, 0, this, accept()); 3916 RegExpNode* node = captured_body; 3917 if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) { 3918 // Add a .*? at the beginning, outside the body capture, unless 3919 // this expression is anchored at the beginning or sticky. 3920 RegExpNode* loop_node = RegExpQuantifier::ToNode( 3921 0, RegExpTree::kInfinity, false, 3922 zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything), 3923 this, captured_body, data->contains_anchor); 3924 3925 if (data->contains_anchor) { 3926 // Unroll loop once, to take care of the case that might start 3927 // at the start of input. 3928 ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone()); 3929 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 3930 first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>( 3931 zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything), 3932 false, loop_node))); 3933 node = first_step_node; 3934 } else { 3935 node = loop_node; 3936 } 3937 } 3938 if (is_one_byte) { 3939 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); 3940 // Do it again to propagate the new nodes to places where they were not 3941 // put because they had not been calculated yet. 3942 if (node != nullptr) { 3943 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); 3944 } 3945 } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) { 3946 node = OptionallyStepBackToLeadSurrogate(node); 3947 } 3948 3949 if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone()); 3950 return node; 3951} 3952 3953void RegExpCompiler::ToNodeCheckForStackOverflow() { 3954 if (StackLimitCheck{isolate()}.HasOverflowed()) { 3955 FatalProcessOutOfMemory(isolate(), "RegExpCompiler"); 3956 } 3957} 3958 3959} // namespace internal 3960} // namespace v8 3961