1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-compiler.h"
6
7#include "src/base/safe_conversions.h"
8#include "src/execution/isolate.h"
9#include "src/objects/fixed-array-inl.h"
10#include "src/regexp/regexp-macro-assembler-arch.h"
11#include "src/strings/unicode-inl.h"
12#include "src/zone/zone-list-inl.h"
13
14#ifdef V8_INTL_SUPPORT
15#include "src/regexp/special-case.h"
16#include "unicode/locid.h"
17#include "unicode/uniset.h"
18#include "unicode/utypes.h"
19#endif  // V8_INTL_SUPPORT
20
21namespace v8 {
22namespace internal {
23
24using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
25
26// -------------------------------------------------------------------
27// Implementation of the Irregexp regular expression engine.
28//
29// The Irregexp regular expression engine is intended to be a complete
30// implementation of ECMAScript regular expressions.  It generates either
31// bytecodes or native code.
32
33//   The Irregexp regexp engine is structured in three steps.
34//   1) The parser generates an abstract syntax tree.  See ast.cc.
35//   2) From the AST a node network is created.  The nodes are all
36//      subclasses of RegExpNode.  The nodes represent states when
37//      executing a regular expression.  Several optimizations are
38//      performed on the node network.
39//   3) From the nodes we generate either byte codes or native code
40//      that can actually execute the regular expression (perform
41//      the search).  The code generation step is described in more
42//      detail below.
43
44// Code generation.
45//
46//   The nodes are divided into four main categories.
47//   * Choice nodes
48//        These represent places where the regular expression can
49//        match in more than one way.  For example on entry to an
50//        alternation (foo|bar) or a repetition (*, +, ? or {}).
51//   * Action nodes
52//        These represent places where some action should be
53//        performed.  Examples include recording the current position
54//        in the input string to a register (in order to implement
55//        captures) or other actions on register for example in order
56//        to implement the counters needed for {} repetitions.
57//   * Matching nodes
58//        These attempt to match some element part of the input string.
59//        Examples of elements include character classes, plain strings
60//        or back references.
61//   * End nodes
62//        These are used to implement the actions required on finding
63//        a successful match or failing to find a match.
64//
65//   The code generated (whether as byte codes or native code) maintains
66//   some state as it runs.  This consists of the following elements:
67//
68//   * The capture registers.  Used for string captures.
69//   * Other registers.  Used for counters etc.
70//   * The current position.
71//   * The stack of backtracking information.  Used when a matching node
72//     fails to find a match and needs to try an alternative.
73//
74// Conceptual regular expression execution model:
75//
76//   There is a simple conceptual model of regular expression execution
77//   which will be presented first.  The actual code generated is a more
78//   efficient simulation of the simple conceptual model:
79//
80//   * Choice nodes are implemented as follows:
81//     For each choice except the last {
82//       push current position
83//       push backtrack code location
84//       <generate code to test for choice>
85//       backtrack code location:
86//       pop current position
87//     }
88//     <generate code to test for last choice>
89//
90//   * Actions nodes are generated as follows
91//     <push affected registers on backtrack stack>
92//     <generate code to perform action>
93//     push backtrack code location
94//     <generate code to test for following nodes>
95//     backtrack code location:
96//     <pop affected registers to restore their state>
97//     <pop backtrack location from stack and go to it>
98//
99//   * Matching nodes are generated as follows:
100//     if input string matches at current position
101//       update current position
102//       <generate code to test for following nodes>
103//     else
104//       <pop backtrack location from stack and go to it>
105//
106//   Thus it can be seen that the current position is saved and restored
107//   by the choice nodes, whereas the registers are saved and restored by
108//   by the action nodes that manipulate them.
109//
110//   The other interesting aspect of this model is that nodes are generated
111//   at the point where they are needed by a recursive call to Emit().  If
112//   the node has already been code generated then the Emit() call will
113//   generate a jump to the previously generated code instead.  In order to
114//   limit recursion it is possible for the Emit() function to put the node
115//   on a work list for later generation and instead generate a jump.  The
116//   destination of the jump is resolved later when the code is generated.
117//
118// Actual regular expression code generation.
119//
120//   Code generation is actually more complicated than the above.  In order
121//   to improve the efficiency of the generated code some optimizations are
122//   performed
123//
124//   * Choice nodes have 1-character lookahead.
125//     A choice node looks at the following character and eliminates some of
126//     the choices immediately based on that character.  This is not yet
127//     implemented.
128//   * Simple greedy loops store reduced backtracking information.
129//     A quantifier like /.*foo/m will greedily match the whole input.  It will
130//     then need to backtrack to a point where it can match "foo".  The naive
131//     implementation of this would push each character position onto the
132//     backtracking stack, then pop them off one by one.  This would use space
133//     proportional to the length of the input string.  However since the "."
134//     can only match in one way and always has a constant length (in this case
135//     of 1) it suffices to store the current position on the top of the stack
136//     once.  Matching now becomes merely incrementing the current position and
137//     backtracking becomes decrementing the current position and checking the
138//     result against the stored current position.  This is faster and saves
139//     space.
140//   * The current state is virtualized.
141//     This is used to defer expensive operations until it is clear that they
142//     are needed and to generate code for a node more than once, allowing
143//     specialized an efficient versions of the code to be created. This is
144//     explained in the section below.
145//
146// Execution state virtualization.
147//
148//   Instead of emitting code, nodes that manipulate the state can record their
149//   manipulation in an object called the Trace.  The Trace object can record a
150//   current position offset, an optional backtrack code location on the top of
151//   the virtualized backtrack stack and some register changes.  When a node is
152//   to be emitted it can flush the Trace or update it.  Flushing the Trace
153//   will emit code to bring the actual state into line with the virtual state.
154//   Avoiding flushing the state can postpone some work (e.g. updates of capture
155//   registers).  Postponing work can save time when executing the regular
156//   expression since it may be found that the work never has to be done as a
157//   failure to match can occur.  In addition it is much faster to jump to a
158//   known backtrack code location than it is to pop an unknown backtrack
159//   location from the stack and jump there.
160//
161//   The virtual state found in the Trace affects code generation.  For example
162//   the virtual state contains the difference between the actual current
163//   position and the virtual current position, and matching code needs to use
164//   this offset to attempt a match in the correct location of the input
165//   string.  Therefore code generated for a non-trivial trace is specialized
166//   to that trace.  The code generator therefore has the ability to generate
167//   code for each node several times.  In order to limit the size of the
168//   generated code there is an arbitrary limit on how many specialized sets of
169//   code may be generated for a given node.  If the limit is reached, the
170//   trace is flushed and a generic version of the code for a node is emitted.
171//   This is subsequently used for that node.  The code emitted for non-generic
172//   trace is not recorded in the node and so it cannot currently be reused in
173//   the event that code generation is requested for an identical trace.
174
175namespace {
176
177constexpr base::uc32 MaxCodeUnit(const bool one_byte) {
178  STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
179                std::numeric_limits<uint16_t>::max());
180  STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
181                std::numeric_limits<uint16_t>::max());
182  return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
183}
184
185constexpr uint32_t CharMask(const bool one_byte) {
186  STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
187  STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
188  return MaxCodeUnit(one_byte);
189}
190
191}  // namespace
192
193void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
194
195void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
196  text->AddElement(TextElement::Atom(this), zone);
197}
198
199void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
200  text->AddElement(TextElement::CharClass(this), zone);
201}
202
203void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
204  for (int i = 0; i < elements()->length(); i++)
205    text->AddElement(elements()->at(i), zone);
206}
207
208TextElement TextElement::Atom(RegExpAtom* atom) {
209  return TextElement(ATOM, atom);
210}
211
212TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
213  return TextElement(CHAR_CLASS, char_class);
214}
215
216int TextElement::length() const {
217  switch (text_type()) {
218    case ATOM:
219      return atom()->length();
220
221    case CHAR_CLASS:
222      return 1;
223  }
224  UNREACHABLE();
225}
226
227class RecursionCheck {
228 public:
229  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
230    compiler->IncrementRecursionDepth();
231  }
232  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
233
234 private:
235  RegExpCompiler* compiler_;
236};
237
238// Attempts to compile the regexp using an Irregexp code generator.  Returns
239// a fixed array or a null handle depending on whether it succeeded.
240RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
241                               RegExpFlags flags, bool one_byte)
242    : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
243      unicode_lookaround_stack_register_(kNoRegister),
244      unicode_lookaround_position_register_(kNoRegister),
245      work_list_(nullptr),
246      recursion_depth_(0),
247      flags_(flags),
248      one_byte_(one_byte),
249      reg_exp_too_big_(false),
250      limiting_recursion_(false),
251      optimize_(FLAG_regexp_optimization),
252      read_backward_(false),
253      current_expansion_factor_(1),
254      frequency_collator_(),
255      isolate_(isolate),
256      zone_(zone) {
257  accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
258  DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
259}
260
261RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
262    Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
263    int capture_count, Handle<String> pattern) {
264  macro_assembler_ = macro_assembler;
265
266  ZoneVector<RegExpNode*> work_list(zone());
267  work_list_ = &work_list;
268  Label fail;
269  macro_assembler_->PushBacktrack(&fail);
270  Trace new_trace;
271  start->Emit(this, &new_trace);
272  macro_assembler_->BindJumpTarget(&fail);
273  macro_assembler_->Fail();
274  while (!work_list.empty()) {
275    RegExpNode* node = work_list.back();
276    work_list.pop_back();
277    node->set_on_work_list(false);
278    if (!node->label()->is_bound()) node->Emit(this, &new_trace);
279  }
280  if (reg_exp_too_big_) {
281    if (FLAG_correctness_fuzzer_suppressions) {
282      FATAL("Aborting on excess zone allocation");
283    }
284    macro_assembler_->AbortedCodeGeneration();
285    return CompilationResult::RegExpTooBig();
286  }
287
288  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
289  isolate->IncreaseTotalRegexpCodeGenerated(code);
290  work_list_ = nullptr;
291
292  return {code, next_register_};
293}
294
295bool Trace::DeferredAction::Mentions(int that) {
296  if (action_type() == ActionNode::CLEAR_CAPTURES) {
297    Interval range = static_cast<DeferredClearCaptures*>(this)->range();
298    return range.Contains(that);
299  } else {
300    return reg() == that;
301  }
302}
303
304bool Trace::mentions_reg(int reg) {
305  for (DeferredAction* action = actions_; action != nullptr;
306       action = action->next()) {
307    if (action->Mentions(reg)) return true;
308  }
309  return false;
310}
311
312bool Trace::GetStoredPosition(int reg, int* cp_offset) {
313  DCHECK_EQ(0, *cp_offset);
314  for (DeferredAction* action = actions_; action != nullptr;
315       action = action->next()) {
316    if (action->Mentions(reg)) {
317      if (action->action_type() == ActionNode::STORE_POSITION) {
318        *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
319        return true;
320      } else {
321        return false;
322      }
323    }
324  }
325  return false;
326}
327
328// A (dynamically-sized) set of unsigned integers that behaves especially well
329// on small integers (< kFirstLimit). May do zone-allocation.
330class DynamicBitSet : public ZoneObject {
331 public:
332  V8_EXPORT_PRIVATE bool Get(unsigned value) const {
333    if (value < kFirstLimit) {
334      return (first_ & (1 << value)) != 0;
335    } else if (remaining_ == nullptr) {
336      return false;
337    } else {
338      return remaining_->Contains(value);
339    }
340  }
341
342  // Destructively set a value in this set.
343  void Set(unsigned value, Zone* zone) {
344    if (value < kFirstLimit) {
345      first_ |= (1 << value);
346    } else {
347      if (remaining_ == nullptr)
348        remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
349      if (remaining_->is_empty() || !remaining_->Contains(value))
350        remaining_->Add(value, zone);
351    }
352  }
353
354 private:
355  static constexpr unsigned kFirstLimit = 32;
356
357  uint32_t first_ = 0;
358  ZoneList<unsigned>* remaining_ = nullptr;
359};
360
361int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
362                                 Zone* zone) {
363  int max_register = RegExpCompiler::kNoRegister;
364  for (DeferredAction* action = actions_; action != nullptr;
365       action = action->next()) {
366    if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
367      Interval range = static_cast<DeferredClearCaptures*>(action)->range();
368      for (int i = range.from(); i <= range.to(); i++)
369        affected_registers->Set(i, zone);
370      if (range.to() > max_register) max_register = range.to();
371    } else {
372      affected_registers->Set(action->reg(), zone);
373      if (action->reg() > max_register) max_register = action->reg();
374    }
375  }
376  return max_register;
377}
378
379void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
380                                     int max_register,
381                                     const DynamicBitSet& registers_to_pop,
382                                     const DynamicBitSet& registers_to_clear) {
383  for (int reg = max_register; reg >= 0; reg--) {
384    if (registers_to_pop.Get(reg)) {
385      assembler->PopRegister(reg);
386    } else if (registers_to_clear.Get(reg)) {
387      int clear_to = reg;
388      while (reg > 0 && registers_to_clear.Get(reg - 1)) {
389        reg--;
390      }
391      assembler->ClearRegisters(reg, clear_to);
392    }
393  }
394}
395
396void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
397                                   int max_register,
398                                   const DynamicBitSet& affected_registers,
399                                   DynamicBitSet* registers_to_pop,
400                                   DynamicBitSet* registers_to_clear,
401                                   Zone* zone) {
402  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
403  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
404
405  // Count pushes performed to force a stack limit check occasionally.
406  int pushes = 0;
407
408  for (int reg = 0; reg <= max_register; reg++) {
409    if (!affected_registers.Get(reg)) continue;
410
411    // The chronologically first deferred action in the trace
412    // is used to infer the action needed to restore a register
413    // to its previous state (or not, if it's safe to ignore it).
414    enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
415    DeferredActionUndoType undo_action = IGNORE;
416
417    int value = 0;
418    bool absolute = false;
419    bool clear = false;
420    static const int kNoStore = kMinInt;
421    int store_position = kNoStore;
422    // This is a little tricky because we are scanning the actions in reverse
423    // historical order (newest first).
424    for (DeferredAction* action = actions_; action != nullptr;
425         action = action->next()) {
426      if (action->Mentions(reg)) {
427        switch (action->action_type()) {
428          case ActionNode::SET_REGISTER_FOR_LOOP: {
429            Trace::DeferredSetRegisterForLoop* psr =
430                static_cast<Trace::DeferredSetRegisterForLoop*>(action);
431            if (!absolute) {
432              value += psr->value();
433              absolute = true;
434            }
435            // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
436            // counters. They can have a significant previous value if they
437            // occur in a loop. TODO(lrn): Propagate this information, so
438            // we can set undo_action to IGNORE if we know there is no value to
439            // restore.
440            undo_action = RESTORE;
441            DCHECK_EQ(store_position, kNoStore);
442            DCHECK(!clear);
443            break;
444          }
445          case ActionNode::INCREMENT_REGISTER:
446            if (!absolute) {
447              value++;
448            }
449            DCHECK_EQ(store_position, kNoStore);
450            DCHECK(!clear);
451            undo_action = RESTORE;
452            break;
453          case ActionNode::STORE_POSITION: {
454            Trace::DeferredCapture* pc =
455                static_cast<Trace::DeferredCapture*>(action);
456            if (!clear && store_position == kNoStore) {
457              store_position = pc->cp_offset();
458            }
459
460            // For captures we know that stores and clears alternate.
461            // Other register, are never cleared, and if the occur
462            // inside a loop, they might be assigned more than once.
463            if (reg <= 1) {
464              // Registers zero and one, aka "capture zero", is
465              // always set correctly if we succeed. There is no
466              // need to undo a setting on backtrack, because we
467              // will set it again or fail.
468              undo_action = IGNORE;
469            } else {
470              undo_action = pc->is_capture() ? CLEAR : RESTORE;
471            }
472            DCHECK(!absolute);
473            DCHECK_EQ(value, 0);
474            break;
475          }
476          case ActionNode::CLEAR_CAPTURES: {
477            // Since we're scanning in reverse order, if we've already
478            // set the position we have to ignore historically earlier
479            // clearing operations.
480            if (store_position == kNoStore) {
481              clear = true;
482            }
483            undo_action = RESTORE;
484            DCHECK(!absolute);
485            DCHECK_EQ(value, 0);
486            break;
487          }
488          default:
489            UNREACHABLE();
490        }
491      }
492    }
493    // Prepare for the undo-action (e.g., push if it's going to be popped).
494    if (undo_action == RESTORE) {
495      pushes++;
496      RegExpMacroAssembler::StackCheckFlag stack_check =
497          RegExpMacroAssembler::kNoStackLimitCheck;
498      if (pushes == push_limit) {
499        stack_check = RegExpMacroAssembler::kCheckStackLimit;
500        pushes = 0;
501      }
502
503      assembler->PushRegister(reg, stack_check);
504      registers_to_pop->Set(reg, zone);
505    } else if (undo_action == CLEAR) {
506      registers_to_clear->Set(reg, zone);
507    }
508    // Perform the chronologically last action (or accumulated increment)
509    // for the register.
510    if (store_position != kNoStore) {
511      assembler->WriteCurrentPositionToRegister(reg, store_position);
512    } else if (clear) {
513      assembler->ClearRegisters(reg, reg);
514    } else if (absolute) {
515      assembler->SetRegister(reg, value);
516    } else if (value != 0) {
517      assembler->AdvanceRegister(reg, value);
518    }
519  }
520}
521
522// This is called as we come into a loop choice node and some other tricky
523// nodes.  It normalizes the state of the code generator to ensure we can
524// generate generic code.
525void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
526  RegExpMacroAssembler* assembler = compiler->macro_assembler();
527
528  DCHECK(!is_trivial());
529
530  if (actions_ == nullptr && backtrack() == nullptr) {
531    // Here we just have some deferred cp advances to fix and we are back to
532    // a normal situation.  We may also have to forget some information gained
533    // through a quick check that was already performed.
534    if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
535    // Create a new trivial state and generate the node with that.
536    Trace new_state;
537    successor->Emit(compiler, &new_state);
538    return;
539  }
540
541  // Generate deferred actions here along with code to undo them again.
542  DynamicBitSet affected_registers;
543
544  if (backtrack() != nullptr) {
545    // Here we have a concrete backtrack location.  These are set up by choice
546    // nodes and so they indicate that we have a deferred save of the current
547    // position which we may need to emit here.
548    assembler->PushCurrentPosition();
549  }
550
551  int max_register =
552      FindAffectedRegisters(&affected_registers, compiler->zone());
553  DynamicBitSet registers_to_pop;
554  DynamicBitSet registers_to_clear;
555  PerformDeferredActions(assembler, max_register, affected_registers,
556                         &registers_to_pop, &registers_to_clear,
557                         compiler->zone());
558  if (cp_offset_ != 0) {
559    assembler->AdvanceCurrentPosition(cp_offset_);
560  }
561
562  // Create a new trivial state and generate the node with that.
563  Label undo;
564  assembler->PushBacktrack(&undo);
565  if (successor->KeepRecursing(compiler)) {
566    Trace new_state;
567    successor->Emit(compiler, &new_state);
568  } else {
569    compiler->AddWork(successor);
570    assembler->GoTo(successor->label());
571  }
572
573  // On backtrack we need to restore state.
574  assembler->BindJumpTarget(&undo);
575  RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
576                           registers_to_clear);
577  if (backtrack() == nullptr) {
578    assembler->Backtrack();
579  } else {
580    assembler->PopCurrentPosition();
581    assembler->GoTo(backtrack());
582  }
583}
584
585void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
586  RegExpMacroAssembler* assembler = compiler->macro_assembler();
587
588  // Omit flushing the trace. We discard the entire stack frame anyway.
589
590  if (!label()->is_bound()) {
591    // We are completely independent of the trace, since we ignore it,
592    // so this code can be used as the generic version.
593    assembler->Bind(label());
594  }
595
596  // Throw away everything on the backtrack stack since the start
597  // of the negative submatch and restore the character position.
598  assembler->ReadCurrentPositionFromRegister(current_position_register_);
599  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
600  if (clear_capture_count_ > 0) {
601    // Clear any captures that might have been performed during the success
602    // of the body of the negative look-ahead.
603    int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
604    assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
605  }
606  // Now that we have unwound the stack we find at the top of the stack the
607  // backtrack that the BeginNegativeSubmatch node got.
608  assembler->Backtrack();
609}
610
611void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
612  if (!trace->is_trivial()) {
613    trace->Flush(compiler, this);
614    return;
615  }
616  RegExpMacroAssembler* assembler = compiler->macro_assembler();
617  if (!label()->is_bound()) {
618    assembler->Bind(label());
619  }
620  switch (action_) {
621    case ACCEPT:
622      assembler->Succeed();
623      return;
624    case BACKTRACK:
625      assembler->GoTo(trace->backtrack());
626      return;
627    case NEGATIVE_SUBMATCH_SUCCESS:
628      // This case is handled in a different virtual method.
629      UNREACHABLE();
630  }
631  UNIMPLEMENTED();
632}
633
634void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
635  if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone);
636  guards_->Add(guard, zone);
637}
638
639ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
640                                           RegExpNode* on_success) {
641  ActionNode* result =
642      on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success);
643  result->data_.u_store_register.reg = reg;
644  result->data_.u_store_register.value = val;
645  return result;
646}
647
648ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
649  ActionNode* result =
650      on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success);
651  result->data_.u_increment_register.reg = reg;
652  return result;
653}
654
655ActionNode* ActionNode::StorePosition(int reg, bool is_capture,
656                                      RegExpNode* on_success) {
657  ActionNode* result =
658      on_success->zone()->New<ActionNode>(STORE_POSITION, on_success);
659  result->data_.u_position_register.reg = reg;
660  result->data_.u_position_register.is_capture = is_capture;
661  return result;
662}
663
664ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
665  ActionNode* result =
666      on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success);
667  result->data_.u_clear_captures.range_from = range.from();
668  result->data_.u_clear_captures.range_to = range.to();
669  return result;
670}
671
672ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg,
673                                              RegExpNode* on_success) {
674  ActionNode* result =
675      on_success->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success);
676  result->data_.u_submatch.stack_pointer_register = stack_reg;
677  result->data_.u_submatch.current_position_register = position_reg;
678  return result;
679}
680
681ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg,
682                                              RegExpNode* on_success) {
683  ActionNode* result =
684      on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success);
685  result->data_.u_submatch.stack_pointer_register = stack_reg;
686  result->data_.u_submatch.current_position_register = position_reg;
687  return result;
688}
689
690ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
691                                                int clear_register_count,
692                                                int clear_register_from,
693                                                RegExpNode* on_success) {
694  ActionNode* result = on_success->zone()->New<ActionNode>(
695      POSITIVE_SUBMATCH_SUCCESS, on_success);
696  result->data_.u_submatch.stack_pointer_register = stack_reg;
697  result->data_.u_submatch.current_position_register = position_reg;
698  result->data_.u_submatch.clear_register_count = clear_register_count;
699  result->data_.u_submatch.clear_register_from = clear_register_from;
700  return result;
701}
702
703ActionNode* ActionNode::EmptyMatchCheck(int start_register,
704                                        int repetition_register,
705                                        int repetition_limit,
706                                        RegExpNode* on_success) {
707  ActionNode* result =
708      on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
709  result->data_.u_empty_match_check.start_register = start_register;
710  result->data_.u_empty_match_check.repetition_register = repetition_register;
711  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
712  return result;
713}
714
715#define DEFINE_ACCEPT(Type) \
716  void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
717FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
718#undef DEFINE_ACCEPT
719
720// -------------------------------------------------------------------
721// Emit code.
722
723void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
724                               Guard* guard, Trace* trace) {
725  switch (guard->op()) {
726    case Guard::LT:
727      DCHECK(!trace->mentions_reg(guard->reg()));
728      macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
729                                    trace->backtrack());
730      break;
731    case Guard::GEQ:
732      DCHECK(!trace->mentions_reg(guard->reg()));
733      macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
734                                    trace->backtrack());
735      break;
736  }
737}
738
739namespace {
740
741#ifdef DEBUG
742bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
743  STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
744  for (int i = 0; i < length; i++) {
745    if (chars[i] > String::kMaxUtf16CodeUnit) return false;
746  }
747  return true;
748}
749#endif  // DEBUG
750
751// Returns the number of characters in the equivalence class, omitting those
752// that cannot occur in the source string because it is Latin1.
753int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character,
754                              bool one_byte_subject, unibrow::uchar* letters,
755                              int letter_length) {
756#ifdef V8_INTL_SUPPORT
757  if (RegExpCaseFolding::IgnoreSet().contains(character)) {
758    letters[0] = character;
759    DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
760    return 1;
761  }
762  bool in_special_add_set =
763      RegExpCaseFolding::SpecialAddSet().contains(character);
764
765  icu::UnicodeSet set;
766  set.add(character);
767  set = set.closeOver(USET_CASE_INSENSITIVE);
768
769  UChar32 canon = 0;
770  if (in_special_add_set) {
771    canon = RegExpCaseFolding::Canonicalize(character);
772  }
773
774  int32_t range_count = set.getRangeCount();
775  int items = 0;
776  for (int32_t i = 0; i < range_count; i++) {
777    UChar32 start = set.getRangeStart(i);
778    UChar32 end = set.getRangeEnd(i);
779    CHECK(end - start + items <= letter_length);
780    for (UChar32 cu = start; cu <= end; cu++) {
781      if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
782      if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
783        continue;
784      }
785      letters[items++] = static_cast<unibrow::uchar>(cu);
786    }
787  }
788  DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
789  return items;
790#else
791  int length =
792      isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
793  // Unibrow returns 0 or 1 for characters where case independence is
794  // trivial.
795  if (length == 0) {
796    letters[0] = character;
797    length = 1;
798  }
799
800  if (one_byte_subject) {
801    int new_length = 0;
802    for (int i = 0; i < length; i++) {
803      if (letters[i] <= String::kMaxOneByteCharCode) {
804        letters[new_length++] = letters[i];
805      }
806    }
807    length = new_length;
808  }
809
810  DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
811  return length;
812#endif  // V8_INTL_SUPPORT
813}
814
815inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler,
816                                base::uc16 c, Label* on_failure, int cp_offset,
817                                bool check, bool preloaded) {
818  RegExpMacroAssembler* assembler = compiler->macro_assembler();
819  bool bound_checked = false;
820  if (!preloaded) {
821    assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
822    bound_checked = true;
823  }
824  assembler->CheckNotCharacter(c, on_failure);
825  return bound_checked;
826}
827
828// Only emits non-letters (things that don't have case).  Only used for case
829// independent matches.
830inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
831                              base::uc16 c, Label* on_failure, int cp_offset,
832                              bool check, bool preloaded) {
833  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
834  bool one_byte = compiler->one_byte();
835  unibrow::uchar chars[4];
836  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
837  if (length < 1) {
838    // This can't match.  Must be an one-byte subject and a non-one-byte
839    // character.  We do not need to do anything since the one-byte pass
840    // already handled this.
841    return false;  // Bounds not checked.
842  }
843  bool checked = false;
844  // We handle the length > 1 case in a later pass.
845  if (length == 1) {
846    if (one_byte && c > String::kMaxOneByteCharCodeU) {
847      // Can't match - see above.
848      return false;  // Bounds not checked.
849    }
850    if (!preloaded) {
851      macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
852      checked = check;
853    }
854    macro_assembler->CheckNotCharacter(c, on_failure);
855  }
856  return checked;
857}
858
859bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
860                               bool one_byte, base::uc16 c1, base::uc16 c2,
861                               Label* on_failure) {
862  const uint32_t char_mask = CharMask(one_byte);
863  base::uc16 exor = c1 ^ c2;
864  // Check whether exor has only one bit set.
865  if (((exor - 1) & exor) == 0) {
866    // If c1 and c2 differ only by one bit.
867    // Ecma262UnCanonicalize always gives the highest number last.
868    DCHECK(c2 > c1);
869    base::uc16 mask = char_mask ^ exor;
870    macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
871    return true;
872  }
873  DCHECK(c2 > c1);
874  base::uc16 diff = c2 - c1;
875  if (((diff - 1) & diff) == 0 && c1 >= diff) {
876    // If the characters differ by 2^n but don't differ by one bit then
877    // subtract the difference from the found character, then do the or
878    // trick.  We avoid the theoretical case where negative numbers are
879    // involved in order to simplify code generation.
880    base::uc16 mask = char_mask ^ diff;
881    macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
882                                                    on_failure);
883    return true;
884  }
885  return false;
886}
887
888// Only emits letters (things that have case).  Only used for case independent
889// matches.
890inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
891                           base::uc16 c, Label* on_failure, int cp_offset,
892                           bool check, bool preloaded) {
893  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
894  bool one_byte = compiler->one_byte();
895  unibrow::uchar chars[4];
896  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
897  if (length <= 1) return false;
898  // We may not need to check against the end of the input string
899  // if this character lies before a character that matched.
900  if (!preloaded) {
901    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
902  }
903  Label ok;
904  switch (length) {
905    case 2: {
906      if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
907                                    chars[1], on_failure)) {
908      } else {
909        macro_assembler->CheckCharacter(chars[0], &ok);
910        macro_assembler->CheckNotCharacter(chars[1], on_failure);
911        macro_assembler->Bind(&ok);
912      }
913      break;
914    }
915    case 4:
916      macro_assembler->CheckCharacter(chars[3], &ok);
917      V8_FALLTHROUGH;
918    case 3:
919      macro_assembler->CheckCharacter(chars[0], &ok);
920      macro_assembler->CheckCharacter(chars[1], &ok);
921      macro_assembler->CheckNotCharacter(chars[2], on_failure);
922      macro_assembler->Bind(&ok);
923      break;
924    default:
925      UNREACHABLE();
926  }
927  return true;
928}
929
930void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
931                      Label* fall_through, Label* above_or_equal,
932                      Label* below) {
933  if (below != fall_through) {
934    masm->CheckCharacterLT(border, below);
935    if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
936  } else {
937    masm->CheckCharacterGT(border - 1, above_or_equal);
938  }
939}
940
941void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last,
942                            Label* fall_through, Label* in_range,
943                            Label* out_of_range) {
944  if (in_range == fall_through) {
945    if (first == last) {
946      masm->CheckNotCharacter(first, out_of_range);
947    } else {
948      masm->CheckCharacterNotInRange(first, last, out_of_range);
949    }
950  } else {
951    if (first == last) {
952      masm->CheckCharacter(first, in_range);
953    } else {
954      masm->CheckCharacterInRange(first, last, in_range);
955    }
956    if (out_of_range != fall_through) masm->GoTo(out_of_range);
957  }
958}
959
960// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
961// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
962void EmitUseLookupTable(RegExpMacroAssembler* masm,
963                        ZoneList<base::uc32>* ranges, uint32_t start_index,
964                        uint32_t end_index, base::uc32 min_char,
965                        Label* fall_through, Label* even_label,
966                        Label* odd_label) {
967  static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
968  static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
969
970  base::uc32 base = (min_char & ~kMask);
971  USE(base);
972
973  // Assert that everything is on one kTableSize page.
974  for (uint32_t i = start_index; i <= end_index; i++) {
975    DCHECK_EQ(ranges->at(i) & ~kMask, base);
976  }
977  DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
978
979  char templ[kSize];
980  Label* on_bit_set;
981  Label* on_bit_clear;
982  int bit;
983  if (even_label == fall_through) {
984    on_bit_set = odd_label;
985    on_bit_clear = even_label;
986    bit = 1;
987  } else {
988    on_bit_set = even_label;
989    on_bit_clear = odd_label;
990    bit = 0;
991  }
992  for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize;
993       i++) {
994    templ[i] = bit;
995  }
996  uint32_t j = 0;
997  bit ^= 1;
998  for (uint32_t i = start_index; i < end_index; i++) {
999    for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1000      templ[j] = bit;
1001    }
1002    bit ^= 1;
1003  }
1004  for (uint32_t i = j; i < kSize; i++) {
1005    templ[i] = bit;
1006  }
1007  Factory* factory = masm->isolate()->factory();
1008  // TODO(erikcorry): Cache these.
1009  Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1010  for (uint32_t i = 0; i < kSize; i++) {
1011    ba->set(i, templ[i]);
1012  }
1013  masm->CheckBitInTable(ba, on_bit_set);
1014  if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1015}
1016
1017void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
1018                 uint32_t start_index, uint32_t end_index, uint32_t cut_index,
1019                 Label* even_label, Label* odd_label) {
1020  bool odd = (((cut_index - start_index) & 1) == 1);
1021  Label* in_range_label = odd ? odd_label : even_label;
1022  Label dummy;
1023  EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
1024                         ranges->at(cut_index + 1) - 1, &dummy, in_range_label,
1025                         &dummy);
1026  DCHECK(!dummy.is_linked());
1027  // Cut out the single range by rewriting the array.  This creates a new
1028  // range that is a merger of the two ranges on either side of the one we
1029  // are cutting out.  The oddity of the labels is preserved.
1030  for (uint32_t j = cut_index; j > start_index; j--) {
1031    ranges->at(j) = ranges->at(j - 1);
1032  }
1033  for (uint32_t j = cut_index + 1; j < end_index; j++) {
1034    ranges->at(j) = ranges->at(j + 1);
1035  }
1036}
1037
1038// Unicode case.  Split the search space into kSize spaces that are handled
1039// with recursion.
1040void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index,
1041                      uint32_t end_index, uint32_t* new_start_index,
1042                      uint32_t* new_end_index, base::uc32* border) {
1043  static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
1044  static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
1045
1046  base::uc32 first = ranges->at(start_index);
1047  base::uc32 last = ranges->at(end_index) - 1;
1048
1049  *new_start_index = start_index;
1050  *border = (ranges->at(start_index) & ~kMask) + kSize;
1051  while (*new_start_index < end_index) {
1052    if (ranges->at(*new_start_index) > *border) break;
1053    (*new_start_index)++;
1054  }
1055  // new_start_index is the index of the first edge that is beyond the
1056  // current kSize space.
1057
1058  // For very large search spaces we do a binary chop search of the non-Latin1
1059  // space instead of just going to the end of the current kSize space.  The
1060  // heuristics are complicated a little by the fact that any 128-character
1061  // encoding space can be quickly tested with a table lookup, so we don't
1062  // wish to do binary chop search at a smaller granularity than that.  A
1063  // 128-character space can take up a lot of space in the ranges array if,
1064  // for example, we only want to match every second character (eg. the lower
1065  // case characters on some Unicode pages).
1066  uint32_t binary_chop_index = (end_index + start_index) / 2;
1067  // The first test ensures that we get to the code that handles the Latin1
1068  // range with a single not-taken branch, speeding up this important
1069  // character range (even non-Latin1 charset-based text has spaces and
1070  // punctuation).
1071  if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1072      end_index - start_index > (*new_start_index - start_index) * 2 &&
1073      last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1074      ranges->at(binary_chop_index) >= first + 2 * kSize) {
1075    uint32_t scan_forward_for_section_border = binary_chop_index;
1076    uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1077
1078    while (scan_forward_for_section_border < end_index) {
1079      if (ranges->at(scan_forward_for_section_border) > new_border) {
1080        *new_start_index = scan_forward_for_section_border;
1081        *border = new_border;
1082        break;
1083      }
1084      scan_forward_for_section_border++;
1085    }
1086  }
1087
1088  DCHECK(*new_start_index > start_index);
1089  *new_end_index = *new_start_index - 1;
1090  if (ranges->at(*new_end_index) == *border) {
1091    (*new_end_index)--;
1092  }
1093  if (*border >= ranges->at(end_index)) {
1094    *border = ranges->at(end_index);
1095    *new_start_index = end_index;  // Won't be used.
1096    *new_end_index = end_index - 1;
1097  }
1098}
1099
1100// Gets a series of segment boundaries representing a character class.  If the
1101// character is in the range between an even and an odd boundary (counting from
1102// start_index) then go to even_label, otherwise go to odd_label.  We already
1103// know that the character is in the range of min_char to max_char inclusive.
1104// Either label can be nullptr indicating backtracking.  Either label can also
1105// be equal to the fall_through label.
1106void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
1107                      uint32_t start_index, uint32_t end_index,
1108                      base::uc32 min_char, base::uc32 max_char,
1109                      Label* fall_through, Label* even_label,
1110                      Label* odd_label) {
1111  DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1112  DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1113
1114  base::uc32 first = ranges->at(start_index);
1115  base::uc32 last = ranges->at(end_index) - 1;
1116
1117  DCHECK_LT(min_char, first);
1118
1119  // Just need to test if the character is before or on-or-after
1120  // a particular character.
1121  if (start_index == end_index) {
1122    EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1123    return;
1124  }
1125
1126  // Another almost trivial case:  There is one interval in the middle that is
1127  // different from the end intervals.
1128  if (start_index + 1 == end_index) {
1129    EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1130                           odd_label);
1131    return;
1132  }
1133
1134  // It's not worth using table lookup if there are very few intervals in the
1135  // character class.
1136  if (end_index - start_index <= 6) {
1137    // It is faster to test for individual characters, so we look for those
1138    // first, then try arbitrary ranges in the second round.
1139    static uint32_t kNoCutIndex = -1;
1140    uint32_t cut = kNoCutIndex;
1141    for (uint32_t i = start_index; i < end_index; i++) {
1142      if (ranges->at(i) == ranges->at(i + 1) - 1) {
1143        cut = i;
1144        break;
1145      }
1146    }
1147    if (cut == kNoCutIndex) cut = start_index;
1148    CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1149                odd_label);
1150    DCHECK_GE(end_index - start_index, 2);
1151    GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1152                     max_char, fall_through, even_label, odd_label);
1153    return;
1154  }
1155
1156  // If there are a lot of intervals in the regexp, then we will use tables to
1157  // determine whether the character is inside or outside the character class.
1158  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1159
1160  if ((max_char >> kBits) == (min_char >> kBits)) {
1161    EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1162                       fall_through, even_label, odd_label);
1163    return;
1164  }
1165
1166  if ((min_char >> kBits) != first >> kBits) {
1167    masm->CheckCharacterLT(first, odd_label);
1168    GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1169                     fall_through, odd_label, even_label);
1170    return;
1171  }
1172
1173  uint32_t new_start_index = 0;
1174  uint32_t new_end_index = 0;
1175  base::uc32 border = 0;
1176
1177  SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1178                   &new_end_index, &border);
1179
1180  Label handle_rest;
1181  Label* above = &handle_rest;
1182  if (border == last + 1) {
1183    // We didn't find any section that started after the limit, so everything
1184    // above the border is one of the terminal labels.
1185    above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1186    DCHECK(new_end_index == end_index - 1);
1187  }
1188
1189  DCHECK_LE(start_index, new_end_index);
1190  DCHECK_LE(new_start_index, end_index);
1191  DCHECK_LT(start_index, new_start_index);
1192  DCHECK_LT(new_end_index, end_index);
1193  DCHECK(new_end_index + 1 == new_start_index ||
1194         (new_end_index + 2 == new_start_index &&
1195          border == ranges->at(new_end_index + 1)));
1196  DCHECK_LT(min_char, border - 1);
1197  DCHECK_LT(border, max_char);
1198  DCHECK_LT(ranges->at(new_end_index), border);
1199  DCHECK(border < ranges->at(new_start_index) ||
1200         (border == ranges->at(new_start_index) &&
1201          new_start_index == end_index && new_end_index == end_index - 1 &&
1202          border == last + 1));
1203  DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
1204
1205  masm->CheckCharacterGT(border - 1, above);
1206  Label dummy;
1207  GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1208                   border - 1, &dummy, even_label, odd_label);
1209  if (handle_rest.is_linked()) {
1210    masm->Bind(&handle_rest);
1211    bool flip = (new_start_index & 1) != (start_index & 1);
1212    GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1213                     &dummy, flip ? odd_label : even_label,
1214                     flip ? even_label : odd_label);
1215  }
1216}
1217
1218void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1219                   RegExpCharacterClass* cc, bool one_byte, Label* on_failure,
1220                   int cp_offset, bool check_offset, bool preloaded,
1221                   Zone* zone) {
1222  ZoneList<CharacterRange>* ranges = cc->ranges(zone);
1223  CharacterRange::Canonicalize(ranges);
1224
1225  // Now that all processing (like case-insensitivity) is done, clamp the
1226  // ranges to the set of ranges that may actually occur in the subject string.
1227  if (one_byte) CharacterRange::ClampToOneByte(ranges);
1228
1229  const int ranges_length = ranges->length();
1230  if (ranges_length == 0) {
1231    if (!cc->is_negated()) {
1232      macro_assembler->GoTo(on_failure);
1233    }
1234    if (check_offset) {
1235      macro_assembler->CheckPosition(cp_offset, on_failure);
1236    }
1237    return;
1238  }
1239
1240  const base::uc32 max_char = MaxCodeUnit(one_byte);
1241  if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
1242    if (cc->is_negated()) {
1243      macro_assembler->GoTo(on_failure);
1244    } else {
1245      // This is a common case hit by non-anchored expressions.
1246      if (check_offset) {
1247        macro_assembler->CheckPosition(cp_offset, on_failure);
1248      }
1249    }
1250    return;
1251  }
1252
1253  if (!preloaded) {
1254    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1255  }
1256
1257  if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass(
1258                                   cc->standard_type(), on_failure)) {
1259    return;
1260  }
1261
1262  static constexpr int kMaxRangesForInlineBranchGeneration = 16;
1263  if (ranges_length > kMaxRangesForInlineBranchGeneration) {
1264    // For large range sets, emit a more compact instruction sequence to avoid
1265    // a potentially problematic increase in code size.
1266    // Note the flipped logic below (we check InRange if negated, NotInRange if
1267    // not negated); this is necessary since the method falls through on
1268    // failure whereas we want to fall through on success.
1269    if (cc->is_negated()) {
1270      if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) {
1271        return;
1272      }
1273    } else {
1274      if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) {
1275        return;
1276      }
1277    }
1278  }
1279
1280  // Generate a flat list of range boundaries for consumption by
1281  // GenerateBranches. See the comment on that function for how the list should
1282  // be structured
1283  ZoneList<base::uc32>* range_boundaries =
1284      zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone);
1285
1286  bool zeroth_entry_is_failure = !cc->is_negated();
1287
1288  for (int i = 0; i < ranges_length; i++) {
1289    CharacterRange& range = ranges->at(i);
1290    if (range.from() == 0) {
1291      DCHECK_EQ(i, 0);
1292      zeroth_entry_is_failure = !zeroth_entry_is_failure;
1293    } else {
1294      range_boundaries->Add(range.from(), zone);
1295    }
1296    // `+ 1` to convert from inclusive to exclusive `to`.
1297    // [from, to] == [from, to+1[.
1298    range_boundaries->Add(range.to() + 1, zone);
1299  }
1300  int end_index = range_boundaries->length() - 1;
1301  if (range_boundaries->at(end_index) > max_char) {
1302    end_index--;
1303  }
1304
1305  Label fall_through;
1306  GenerateBranches(macro_assembler, range_boundaries,
1307                   0,  // start_index.
1308                   end_index,
1309                   0,  // min_char.
1310                   max_char, &fall_through,
1311                   zeroth_entry_is_failure ? &fall_through : on_failure,
1312                   zeroth_entry_is_failure ? on_failure : &fall_through);
1313  macro_assembler->Bind(&fall_through);
1314}
1315
1316}  // namespace
1317
1318RegExpNode::~RegExpNode() = default;
1319
1320RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1321                                                  Trace* trace) {
1322  // If we are generating a greedy loop then don't stop and don't reuse code.
1323  if (trace->stop_node() != nullptr) {
1324    return CONTINUE;
1325  }
1326
1327  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1328  if (trace->is_trivial()) {
1329    if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
1330      // If a generic version is already scheduled to be generated or we have
1331      // recursed too deeply then just generate a jump to that code.
1332      macro_assembler->GoTo(&label_);
1333      // This will queue it up for generation of a generic version if it hasn't
1334      // already been queued.
1335      compiler->AddWork(this);
1336      return DONE;
1337    }
1338    // Generate generic version of the node and bind the label for later use.
1339    macro_assembler->Bind(&label_);
1340    return CONTINUE;
1341  }
1342
1343  // We are being asked to make a non-generic version.  Keep track of how many
1344  // non-generic versions we generate so as not to overdo it.
1345  trace_count_++;
1346  if (KeepRecursing(compiler) && compiler->optimize() &&
1347      trace_count_ < kMaxCopiesCodeGenerated) {
1348    return CONTINUE;
1349  }
1350
1351  // If we get here code has been generated for this node too many times or
1352  // recursion is too deep.  Time to switch to a generic version.  The code for
1353  // generic versions above can handle deep recursion properly.
1354  bool was_limiting = compiler->limiting_recursion();
1355  compiler->set_limiting_recursion(true);
1356  trace->Flush(compiler, this);
1357  compiler->set_limiting_recursion(was_limiting);
1358  return DONE;
1359}
1360
1361bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
1362  return !compiler->limiting_recursion() &&
1363         compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
1364}
1365
1366void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1367                              BoyerMooreLookahead* bm, bool not_at_start) {
1368  if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) {
1369    // Anything may follow a positive submatch success, thus we need to accept
1370    // all characters from this position onwards.
1371    bm->SetRest(offset);
1372  } else {
1373    on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1374  }
1375  SaveBMInfo(bm, not_at_start, offset);
1376}
1377
1378void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
1379                                      RegExpCompiler* compiler, int filled_in,
1380                                      bool not_at_start) {
1381  if (action_type_ == SET_REGISTER_FOR_LOOP) {
1382    on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
1383                                                    filled_in, not_at_start);
1384  } else {
1385    on_success()->GetQuickCheckDetails(details, compiler, filled_in,
1386                                       not_at_start);
1387  }
1388}
1389
1390void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1391                                 BoyerMooreLookahead* bm, bool not_at_start) {
1392  // Match the behaviour of EatsAtLeast on this node.
1393  if (assertion_type() == AT_START && not_at_start) return;
1394  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1395  SaveBMInfo(bm, not_at_start, offset);
1396}
1397
1398void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1399    QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
1400    bool not_at_start) {
1401  RegExpNode* node = continue_node();
1402  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1403}
1404
1405namespace {
1406
1407// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1408inline uint32_t SmearBitsRight(uint32_t v) {
1409  v |= v >> 1;
1410  v |= v >> 2;
1411  v |= v >> 4;
1412  v |= v >> 8;
1413  v |= v >> 16;
1414  return v;
1415}
1416
1417}  // namespace
1418
1419bool QuickCheckDetails::Rationalize(bool asc) {
1420  bool found_useful_op = false;
1421  const uint32_t char_mask = CharMask(asc);
1422  mask_ = 0;
1423  value_ = 0;
1424  int char_shift = 0;
1425  for (int i = 0; i < characters_; i++) {
1426    Position* pos = &positions_[i];
1427    if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
1428      found_useful_op = true;
1429    }
1430    mask_ |= (pos->mask & char_mask) << char_shift;
1431    value_ |= (pos->value & char_mask) << char_shift;
1432    char_shift += asc ? 8 : 16;
1433  }
1434  return found_useful_op;
1435}
1436
1437int RegExpNode::EatsAtLeast(bool not_at_start) {
1438  return not_at_start ? eats_at_least_.eats_at_least_from_not_start
1439                      : eats_at_least_.eats_at_least_from_possibly_start;
1440}
1441
1442EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
1443  // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
1444  // implies that the following node must be a LoopChoiceNode. If we need to
1445  // set registers to constant values for other reasons, we could introduce a
1446  // new action type SET_REGISTER that doesn't imply anything about its
1447  // successor.
1448  UNREACHABLE();
1449}
1450
1451void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
1452                                                   RegExpCompiler* compiler,
1453                                                   int characters_filled_in,
1454                                                   bool not_at_start) {
1455  // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
1456  UNREACHABLE();
1457}
1458
1459EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
1460  DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
1461
1462  if (read_backward()) {
1463    // The eats_at_least value is not used if reading backward. The
1464    // EatsAtLeastPropagator should've zeroed it as well.
1465    DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0);
1466    DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0);
1467    return {};
1468  }
1469
1470  // Figure out how much the loop body itself eats, not including anything in
1471  // the continuation case. In general, the nodes in the loop body should report
1472  // that they eat at least the number eaten by the continuation node, since any
1473  // successful match in the loop body must also include the continuation node.
1474  // However, in some cases involving positive lookaround, the loop body under-
1475  // reports its appetite, so use saturated math here to avoid negative numbers.
1476  uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
1477      loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
1478  uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
1479      loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
1480
1481  // Limit the number of loop iterations to avoid overflow in subsequent steps.
1482  int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
1483
1484  EatsAtLeastInfo result;
1485  result.eats_at_least_from_not_start =
1486      base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
1487                                    continue_node_->EatsAtLeast(true));
1488  if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
1489    // First loop iteration eats at least one, so all subsequent iterations
1490    // and the after-loop chunk are guaranteed to not be at the start.
1491    result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
1492        loop_body_from_possibly_start +
1493        (loop_iterations - 1) * loop_body_from_not_start +
1494        continue_node_->EatsAtLeast(true));
1495  } else {
1496    // Loop body might eat nothing, so only continue node contributes.
1497    result.eats_at_least_from_possibly_start =
1498        continue_node_->EatsAtLeast(false);
1499  }
1500  return result;
1501}
1502
1503bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1504                                Trace* bounds_check_trace, Trace* trace,
1505                                bool preload_has_checked_bounds,
1506                                Label* on_possible_success,
1507                                QuickCheckDetails* details,
1508                                bool fall_through_on_failure,
1509                                ChoiceNode* predecessor) {
1510  DCHECK_NOT_NULL(predecessor);
1511  if (details->characters() == 0) return false;
1512  GetQuickCheckDetails(details, compiler, 0,
1513                       trace->at_start() == Trace::FALSE_VALUE);
1514  if (details->cannot_match()) return false;
1515  if (!details->Rationalize(compiler->one_byte())) return false;
1516  DCHECK(details->characters() == 1 ||
1517         compiler->macro_assembler()->CanReadUnaligned());
1518  uint32_t mask = details->mask();
1519  uint32_t value = details->value();
1520
1521  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1522
1523  if (trace->characters_preloaded() != details->characters()) {
1524    DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
1525    // The bounds check is performed using the minimum number of characters
1526    // any choice would eat, so if the bounds check fails, then none of the
1527    // choices can succeed, so we can just immediately backtrack, rather
1528    // than go to the next choice. The number of characters preloaded may be
1529    // less than the number used for the bounds check.
1530    int eats_at_least = predecessor->EatsAtLeast(
1531        bounds_check_trace->at_start() == Trace::FALSE_VALUE);
1532    DCHECK_GE(eats_at_least, details->characters());
1533    assembler->LoadCurrentCharacter(
1534        trace->cp_offset(), bounds_check_trace->backtrack(),
1535        !preload_has_checked_bounds, details->characters(), eats_at_least);
1536  }
1537
1538  bool need_mask = true;
1539
1540  if (details->characters() == 1) {
1541    // If number of characters preloaded is 1 then we used a byte or 16 bit
1542    // load so the value is already masked down.
1543    const uint32_t char_mask = CharMask(compiler->one_byte());
1544    if ((mask & char_mask) == char_mask) need_mask = false;
1545    mask &= char_mask;
1546  } else {
1547    // For 2-character preloads in one-byte mode or 1-character preloads in
1548    // two-byte mode we also use a 16 bit load with zero extend.
1549    static const uint32_t kTwoByteMask = 0xFFFF;
1550    static const uint32_t kFourByteMask = 0xFFFFFFFF;
1551    if (details->characters() == 2 && compiler->one_byte()) {
1552      if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1553    } else if (details->characters() == 1 && !compiler->one_byte()) {
1554      if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1555    } else {
1556      if (mask == kFourByteMask) need_mask = false;
1557    }
1558  }
1559
1560  if (fall_through_on_failure) {
1561    if (need_mask) {
1562      assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1563    } else {
1564      assembler->CheckCharacter(value, on_possible_success);
1565    }
1566  } else {
1567    if (need_mask) {
1568      assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1569    } else {
1570      assembler->CheckNotCharacter(value, trace->backtrack());
1571    }
1572  }
1573  return true;
1574}
1575
1576// Here is the meat of GetQuickCheckDetails (see also the comment on the
1577// super-class in the .h file).
1578//
1579// We iterate along the text object, building up for each character a
1580// mask and value that can be used to test for a quick failure to match.
1581// The masks and values for the positions will be combined into a single
1582// machine word for the current character width in order to be used in
1583// generating a quick check.
1584void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1585                                    RegExpCompiler* compiler,
1586                                    int characters_filled_in,
1587                                    bool not_at_start) {
1588  // Do not collect any quick check details if the text node reads backward,
1589  // since it reads in the opposite direction than we use for quick checks.
1590  if (read_backward()) return;
1591  Isolate* isolate = compiler->macro_assembler()->isolate();
1592  DCHECK(characters_filled_in < details->characters());
1593  int characters = details->characters();
1594  const uint32_t char_mask = CharMask(compiler->one_byte());
1595  for (int k = 0; k < elements()->length(); k++) {
1596    TextElement elm = elements()->at(k);
1597    if (elm.text_type() == TextElement::ATOM) {
1598      base::Vector<const base::uc16> quarks = elm.atom()->data();
1599      for (int i = 0; i < characters && i < quarks.length(); i++) {
1600        QuickCheckDetails::Position* pos =
1601            details->positions(characters_filled_in);
1602        base::uc16 c = quarks[i];
1603        if (IsIgnoreCase(compiler->flags())) {
1604          unibrow::uchar chars[4];
1605          int length = GetCaseIndependentLetters(
1606              isolate, c, compiler->one_byte(), chars, 4);
1607          if (length == 0) {
1608            // This can happen because all case variants are non-Latin1, but we
1609            // know the input is Latin1.
1610            details->set_cannot_match();
1611            pos->determines_perfectly = false;
1612            return;
1613          }
1614          if (length == 1) {
1615            // This letter has no case equivalents, so it's nice and simple
1616            // and the mask-compare will determine definitely whether we have
1617            // a match at this character position.
1618            pos->mask = char_mask;
1619            pos->value = chars[0];
1620            pos->determines_perfectly = true;
1621          } else {
1622            uint32_t common_bits = char_mask;
1623            uint32_t bits = chars[0];
1624            for (int j = 1; j < length; j++) {
1625              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1626              common_bits ^= differing_bits;
1627              bits &= common_bits;
1628            }
1629            // If length is 2 and common bits has only one zero in it then
1630            // our mask and compare instruction will determine definitely
1631            // whether we have a match at this character position.  Otherwise
1632            // it can only be an approximate check.
1633            uint32_t one_zero = (common_bits | ~char_mask);
1634            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1635              pos->determines_perfectly = true;
1636            }
1637            pos->mask = common_bits;
1638            pos->value = bits;
1639          }
1640        } else {
1641          // Don't ignore case.  Nice simple case where the mask-compare will
1642          // determine definitely whether we have a match at this character
1643          // position.
1644          if (c > char_mask) {
1645            details->set_cannot_match();
1646            pos->determines_perfectly = false;
1647            return;
1648          }
1649          pos->mask = char_mask;
1650          pos->value = c;
1651          pos->determines_perfectly = true;
1652        }
1653        characters_filled_in++;
1654        DCHECK(characters_filled_in <= details->characters());
1655        if (characters_filled_in == details->characters()) {
1656          return;
1657        }
1658      }
1659    } else {
1660      QuickCheckDetails::Position* pos =
1661          details->positions(characters_filled_in);
1662      RegExpCharacterClass* tree = elm.char_class();
1663      ZoneList<CharacterRange>* ranges = tree->ranges(zone());
1664      if (tree->is_negated() || ranges->is_empty()) {
1665        // A quick check uses multi-character mask and compare.  There is no
1666        // useful way to incorporate a negative char class into this scheme
1667        // so we just conservatively create a mask and value that will always
1668        // succeed.
1669        // Likewise for empty ranges (empty ranges can occur e.g. when
1670        // compiling for one-byte subjects and impossible (non-one-byte) ranges
1671        // have been removed).
1672        pos->mask = 0;
1673        pos->value = 0;
1674      } else {
1675        int first_range = 0;
1676        while (ranges->at(first_range).from() > char_mask) {
1677          first_range++;
1678          if (first_range == ranges->length()) {
1679            details->set_cannot_match();
1680            pos->determines_perfectly = false;
1681            return;
1682          }
1683        }
1684        CharacterRange range = ranges->at(first_range);
1685        const base::uc32 first_from = range.from();
1686        const base::uc32 first_to =
1687            (range.to() > char_mask) ? char_mask : range.to();
1688        const uint32_t differing_bits = (first_from ^ first_to);
1689        // A mask and compare is only perfect if the differing bits form a
1690        // number like 00011111 with one single block of trailing 1s.
1691        if ((differing_bits & (differing_bits + 1)) == 0 &&
1692            first_from + differing_bits == first_to) {
1693          pos->determines_perfectly = true;
1694        }
1695        uint32_t common_bits = ~SmearBitsRight(differing_bits);
1696        uint32_t bits = (first_from & common_bits);
1697        for (int i = first_range + 1; i < ranges->length(); i++) {
1698          range = ranges->at(i);
1699          const base::uc32 from = range.from();
1700          if (from > char_mask) continue;
1701          const base::uc32 to =
1702              (range.to() > char_mask) ? char_mask : range.to();
1703          // Here we are combining more ranges into the mask and compare
1704          // value.  With each new range the mask becomes more sparse and
1705          // so the chances of a false positive rise.  A character class
1706          // with multiple ranges is assumed never to be equivalent to a
1707          // mask and compare operation.
1708          pos->determines_perfectly = false;
1709          uint32_t new_common_bits = (from ^ to);
1710          new_common_bits = ~SmearBitsRight(new_common_bits);
1711          common_bits &= new_common_bits;
1712          bits &= new_common_bits;
1713          uint32_t new_differing_bits = (from & common_bits) ^ bits;
1714          common_bits ^= new_differing_bits;
1715          bits &= common_bits;
1716        }
1717        pos->mask = common_bits;
1718        pos->value = bits;
1719      }
1720      characters_filled_in++;
1721      DCHECK(characters_filled_in <= details->characters());
1722      if (characters_filled_in == details->characters()) return;
1723    }
1724  }
1725  DCHECK(characters_filled_in != details->characters());
1726  if (!details->cannot_match()) {
1727    on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1728                                       true);
1729  }
1730}
1731
1732void QuickCheckDetails::Clear() {
1733  for (int i = 0; i < characters_; i++) {
1734    positions_[i].mask = 0;
1735    positions_[i].value = 0;
1736    positions_[i].determines_perfectly = false;
1737  }
1738  characters_ = 0;
1739}
1740
1741void QuickCheckDetails::Advance(int by, bool one_byte) {
1742  if (by >= characters_ || by < 0) {
1743    DCHECK_IMPLIES(by < 0, characters_ == 0);
1744    Clear();
1745    return;
1746  }
1747  DCHECK_LE(characters_ - by, 4);
1748  DCHECK_LE(characters_, 4);
1749  for (int i = 0; i < characters_ - by; i++) {
1750    positions_[i] = positions_[by + i];
1751  }
1752  for (int i = characters_ - by; i < characters_; i++) {
1753    positions_[i].mask = 0;
1754    positions_[i].value = 0;
1755    positions_[i].determines_perfectly = false;
1756  }
1757  characters_ -= by;
1758  // We could change mask_ and value_ here but we would never advance unless
1759  // they had already been used in a check and they won't be used again because
1760  // it would gain us nothing.  So there's no point.
1761}
1762
1763void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1764  DCHECK(characters_ == other->characters_);
1765  if (other->cannot_match_) {
1766    return;
1767  }
1768  if (cannot_match_) {
1769    *this = *other;
1770    return;
1771  }
1772  for (int i = from_index; i < characters_; i++) {
1773    QuickCheckDetails::Position* pos = positions(i);
1774    QuickCheckDetails::Position* other_pos = other->positions(i);
1775    if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1776        !other_pos->determines_perfectly) {
1777      // Our mask-compare operation will be approximate unless we have the
1778      // exact same operation on both sides of the alternation.
1779      pos->determines_perfectly = false;
1780    }
1781    pos->mask &= other_pos->mask;
1782    pos->value &= pos->mask;
1783    other_pos->value &= pos->mask;
1784    uint32_t differing_bits = (pos->value ^ other_pos->value);
1785    pos->mask &= ~differing_bits;
1786    pos->value &= pos->mask;
1787  }
1788}
1789
1790class VisitMarker {
1791 public:
1792  explicit VisitMarker(NodeInfo* info) : info_(info) {
1793    DCHECK(!info->visited);
1794    info->visited = true;
1795  }
1796  ~VisitMarker() { info_->visited = false; }
1797
1798 private:
1799  NodeInfo* info_;
1800};
1801
1802// Temporarily sets traversed_loop_initialization_node_.
1803class LoopInitializationMarker {
1804 public:
1805  explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
1806    DCHECK(!node_->traversed_loop_initialization_node_);
1807    node_->traversed_loop_initialization_node_ = true;
1808  }
1809  ~LoopInitializationMarker() {
1810    DCHECK(node_->traversed_loop_initialization_node_);
1811    node_->traversed_loop_initialization_node_ = false;
1812  }
1813  LoopInitializationMarker(const LoopInitializationMarker&) = delete;
1814  LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
1815
1816 private:
1817  LoopChoiceNode* node_;
1818};
1819
1820// Temporarily decrements min_loop_iterations_.
1821class IterationDecrementer {
1822 public:
1823  explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
1824    DCHECK_GT(node_->min_loop_iterations_, 0);
1825    --node_->min_loop_iterations_;
1826  }
1827  ~IterationDecrementer() { ++node_->min_loop_iterations_; }
1828  IterationDecrementer(const IterationDecrementer&) = delete;
1829  IterationDecrementer& operator=(const IterationDecrementer&) = delete;
1830
1831 private:
1832  LoopChoiceNode* node_;
1833};
1834
1835RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) {
1836  if (info()->replacement_calculated) return replacement();
1837  if (depth < 0) return this;
1838  DCHECK(!info()->visited);
1839  VisitMarker marker(info());
1840  return FilterSuccessor(depth - 1, flags);
1841}
1842
1843RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) {
1844  RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags);
1845  if (next == nullptr) return set_replacement(nullptr);
1846  on_success_ = next;
1847  return set_replacement(this);
1848}
1849
1850// We need to check for the following characters: 0x39C 0x3BC 0x178.
1851bool RangeContainsLatin1Equivalents(CharacterRange range) {
1852  // TODO(dcarney): this could be a lot more efficient.
1853  return range.Contains(0x039C) || range.Contains(0x03BC) ||
1854         range.Contains(0x0178);
1855}
1856
1857namespace {
1858
1859bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
1860  for (int i = 0; i < ranges->length(); i++) {
1861    // TODO(dcarney): this could be a lot more efficient.
1862    if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
1863  }
1864  return false;
1865}
1866
1867}  // namespace
1868
1869RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) {
1870  if (info()->replacement_calculated) return replacement();
1871  if (depth < 0) return this;
1872  DCHECK(!info()->visited);
1873  VisitMarker marker(info());
1874  int element_count = elements()->length();
1875  for (int i = 0; i < element_count; i++) {
1876    TextElement elm = elements()->at(i);
1877    if (elm.text_type() == TextElement::ATOM) {
1878      base::Vector<const base::uc16> quarks = elm.atom()->data();
1879      for (int j = 0; j < quarks.length(); j++) {
1880        base::uc16 c = quarks[j];
1881        if (IsIgnoreCase(flags)) {
1882          c = unibrow::Latin1::TryConvertToLatin1(c);
1883        }
1884        if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
1885        // Replace quark in case we converted to Latin-1.
1886        base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin());
1887        writable_quarks[j] = c;
1888      }
1889    } else {
1890      DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
1891      RegExpCharacterClass* cc = elm.char_class();
1892      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1893      CharacterRange::Canonicalize(ranges);
1894      // Now they are in order so we only need to look at the first.
1895      int range_count = ranges->length();
1896      if (cc->is_negated()) {
1897        if (range_count != 0 && ranges->at(0).from() == 0 &&
1898            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
1899          // This will be handled in a later filter.
1900          if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
1901            continue;
1902          }
1903          return set_replacement(nullptr);
1904        }
1905      } else {
1906        if (range_count == 0 ||
1907            ranges->at(0).from() > String::kMaxOneByteCharCode) {
1908          // This will be handled in a later filter.
1909          if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
1910            continue;
1911          }
1912          return set_replacement(nullptr);
1913        }
1914      }
1915    }
1916  }
1917  return FilterSuccessor(depth - 1, flags);
1918}
1919
1920RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
1921  if (info()->replacement_calculated) return replacement();
1922  if (depth < 0) return this;
1923  if (info()->visited) return this;
1924  {
1925    VisitMarker marker(info());
1926
1927    RegExpNode* continue_replacement =
1928        continue_node_->FilterOneByte(depth - 1, flags);
1929    // If we can't continue after the loop then there is no sense in doing the
1930    // loop.
1931    if (continue_replacement == nullptr) return set_replacement(nullptr);
1932  }
1933
1934  return ChoiceNode::FilterOneByte(depth - 1, flags);
1935}
1936
1937RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
1938  if (info()->replacement_calculated) return replacement();
1939  if (depth < 0) return this;
1940  if (info()->visited) return this;
1941  VisitMarker marker(info());
1942  int choice_count = alternatives_->length();
1943
1944  for (int i = 0; i < choice_count; i++) {
1945    GuardedAlternative alternative = alternatives_->at(i);
1946    if (alternative.guards() != nullptr &&
1947        alternative.guards()->length() != 0) {
1948      set_replacement(this);
1949      return this;
1950    }
1951  }
1952
1953  int surviving = 0;
1954  RegExpNode* survivor = nullptr;
1955  for (int i = 0; i < choice_count; i++) {
1956    GuardedAlternative alternative = alternatives_->at(i);
1957    RegExpNode* replacement =
1958        alternative.node()->FilterOneByte(depth - 1, flags);
1959    DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
1960    if (replacement != nullptr) {
1961      alternatives_->at(i).set_node(replacement);
1962      surviving++;
1963      survivor = replacement;
1964    }
1965  }
1966  if (surviving < 2) return set_replacement(survivor);
1967
1968  set_replacement(this);
1969  if (surviving == choice_count) {
1970    return this;
1971  }
1972  // Only some of the nodes survived the filtering.  We need to rebuild the
1973  // alternatives list.
1974  ZoneList<GuardedAlternative>* new_alternatives =
1975      zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
1976  for (int i = 0; i < choice_count; i++) {
1977    RegExpNode* replacement =
1978        alternatives_->at(i).node()->FilterOneByte(depth - 1, flags);
1979    if (replacement != nullptr) {
1980      alternatives_->at(i).set_node(replacement);
1981      new_alternatives->Add(alternatives_->at(i), zone());
1982    }
1983  }
1984  alternatives_ = new_alternatives;
1985  return this;
1986}
1987
1988RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
1989                                                        RegExpFlags flags) {
1990  if (info()->replacement_calculated) return replacement();
1991  if (depth < 0) return this;
1992  if (info()->visited) return this;
1993  VisitMarker marker(info());
1994  // Alternative 0 is the negative lookahead, alternative 1 is what comes
1995  // afterwards.
1996  RegExpNode* node = continue_node();
1997  RegExpNode* replacement = node->FilterOneByte(depth - 1, flags);
1998  if (replacement == nullptr) return set_replacement(nullptr);
1999  alternatives_->at(kContinueIndex).set_node(replacement);
2000
2001  RegExpNode* neg_node = lookaround_node();
2002  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags);
2003  // If the negative lookahead is always going to fail then
2004  // we don't need to check it.
2005  if (neg_replacement == nullptr) return set_replacement(replacement);
2006  alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
2007  return set_replacement(this);
2008}
2009
2010void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2011                                          RegExpCompiler* compiler,
2012                                          int characters_filled_in,
2013                                          bool not_at_start) {
2014  if (body_can_be_zero_length_ || info()->visited) return;
2015  not_at_start = not_at_start || this->not_at_start();
2016  DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
2017  if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
2018      loop_node_->EatsAtLeast(not_at_start) >
2019          continue_node_->EatsAtLeast(true)) {
2020    // Loop body is guaranteed to execute at least once, and consume characters
2021    // when it does, meaning the only possible quick checks from this point
2022    // begin with the loop body. We may recursively visit this LoopChoiceNode,
2023    // but we temporarily decrease its minimum iteration counter so we know when
2024    // to check the continue case.
2025    IterationDecrementer next_iteration(this);
2026    loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
2027                                     not_at_start);
2028  } else {
2029    // Might not consume anything in the loop body, so treat it like a normal
2030    // ChoiceNode (and don't recursively visit this node again).
2031    VisitMarker marker(info());
2032    ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
2033                                     not_at_start);
2034  }
2035}
2036
2037void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
2038    QuickCheckDetails* details, RegExpCompiler* compiler,
2039    int characters_filled_in, bool not_at_start) {
2040  if (traversed_loop_initialization_node_) {
2041    // We already entered this loop once, exited via its continuation node, and
2042    // followed an outer loop's back-edge to before the loop entry point. We
2043    // could try to reset the minimum iteration count to its starting value at
2044    // this point, but that seems like more trouble than it's worth. It's safe
2045    // to keep going with the current (possibly reduced) minimum iteration
2046    // count.
2047    GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2048  } else {
2049    // We are entering a loop via its counter initialization action, meaning we
2050    // are guaranteed to run the loop body at least some minimum number of times
2051    // before running the continuation node. Set a flag so that this node knows
2052    // (now and any times we visit it again recursively) that it was entered
2053    // from the top.
2054    LoopInitializationMarker marker(this);
2055    GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2056  }
2057}
2058
2059void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2060                                  BoyerMooreLookahead* bm, bool not_at_start) {
2061  if (body_can_be_zero_length_ || budget <= 0) {
2062    bm->SetRest(offset);
2063    SaveBMInfo(bm, not_at_start, offset);
2064    return;
2065  }
2066  ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2067  SaveBMInfo(bm, not_at_start, offset);
2068}
2069
2070void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2071                                      RegExpCompiler* compiler,
2072                                      int characters_filled_in,
2073                                      bool not_at_start) {
2074  not_at_start = (not_at_start || not_at_start_);
2075  int choice_count = alternatives_->length();
2076  DCHECK_LT(0, choice_count);
2077  alternatives_->at(0).node()->GetQuickCheckDetails(
2078      details, compiler, characters_filled_in, not_at_start);
2079  for (int i = 1; i < choice_count; i++) {
2080    QuickCheckDetails new_details(details->characters());
2081    RegExpNode* node = alternatives_->at(i).node();
2082    node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2083                               not_at_start);
2084    // Here we merge the quick match details of the two branches.
2085    details->Merge(&new_details, characters_filled_in);
2086  }
2087}
2088
2089namespace {
2090
2091// Check for [0-9A-Z_a-z].
2092void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
2093                   Label* non_word, bool fall_through_on_word) {
2094  if (assembler->CheckSpecialCharacterClass(
2095          fall_through_on_word ? StandardCharacterSet::kWord
2096                               : StandardCharacterSet::kNotWord,
2097          fall_through_on_word ? non_word : word)) {
2098    // Optimized implementation available.
2099    return;
2100  }
2101  assembler->CheckCharacterGT('z', non_word);
2102  assembler->CheckCharacterLT('0', non_word);
2103  assembler->CheckCharacterGT('a' - 1, word);
2104  assembler->CheckCharacterLT('9' + 1, word);
2105  assembler->CheckCharacterLT('A', non_word);
2106  assembler->CheckCharacterLT('Z' + 1, word);
2107  if (fall_through_on_word) {
2108    assembler->CheckNotCharacter('_', non_word);
2109  } else {
2110    assembler->CheckCharacter('_', word);
2111  }
2112}
2113
2114// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2115// that matches newline or the start of input).
2116void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
2117  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2118
2119  // We will load the previous character into the current character register.
2120  Trace new_trace(*trace);
2121  new_trace.InvalidateCurrentCharacter();
2122
2123  // A positive (> 0) cp_offset means we've already successfully matched a
2124  // non-empty-width part of the pattern, and thus cannot be at or before the
2125  // start of the subject string. We can thus skip both at-start and
2126  // bounds-checks when loading the one-character lookbehind.
2127  const bool may_be_at_or_before_subject_string_start =
2128      new_trace.cp_offset() <= 0;
2129
2130  Label ok;
2131  if (may_be_at_or_before_subject_string_start) {
2132    // The start of input counts as a newline in this context, so skip to ok if
2133    // we are at the start.
2134    assembler->CheckAtStart(new_trace.cp_offset(), &ok);
2135  }
2136
2137  // If we've already checked that we are not at the start of input, it's okay
2138  // to load the previous character without bounds checks.
2139  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2140  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2141                                  new_trace.backtrack(), can_skip_bounds_check);
2142  if (!assembler->CheckSpecialCharacterClass(
2143          StandardCharacterSet::kLineTerminator, new_trace.backtrack())) {
2144    // Newline means \n, \r, 0x2028 or 0x2029.
2145    if (!compiler->one_byte()) {
2146      assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2147    }
2148    assembler->CheckCharacter('\n', &ok);
2149    assembler->CheckNotCharacter('\r', new_trace.backtrack());
2150  }
2151  assembler->Bind(&ok);
2152  on_success->Emit(compiler, &new_trace);
2153}
2154
2155}  // namespace
2156
2157// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2158void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2159  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2160  Isolate* isolate = assembler->isolate();
2161  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2162  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2163  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2164  if (lookahead == nullptr) {
2165    int eats_at_least =
2166        std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
2167    if (eats_at_least >= 1) {
2168      BoyerMooreLookahead* bm =
2169          zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
2170      FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2171      if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2172      if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2173    }
2174  } else {
2175    if (lookahead->at(0)->is_non_word())
2176      next_is_word_character = Trace::FALSE_VALUE;
2177    if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2178  }
2179  bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2180  if (next_is_word_character == Trace::UNKNOWN) {
2181    Label before_non_word;
2182    Label before_word;
2183    if (trace->characters_preloaded() != 1) {
2184      assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2185    }
2186    // Fall through on non-word.
2187    EmitWordCheck(assembler, &before_word, &before_non_word, false);
2188    // Next character is not a word character.
2189    assembler->Bind(&before_non_word);
2190    Label ok;
2191    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2192    assembler->GoTo(&ok);
2193
2194    assembler->Bind(&before_word);
2195    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2196    assembler->Bind(&ok);
2197  } else if (next_is_word_character == Trace::TRUE_VALUE) {
2198    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2199  } else {
2200    DCHECK(next_is_word_character == Trace::FALSE_VALUE);
2201    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2202  }
2203}
2204
2205void AssertionNode::BacktrackIfPrevious(
2206    RegExpCompiler* compiler, Trace* trace,
2207    AssertionNode::IfPrevious backtrack_if_previous) {
2208  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2209  Trace new_trace(*trace);
2210  new_trace.InvalidateCurrentCharacter();
2211
2212  Label fall_through;
2213  Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
2214                                                        : &fall_through;
2215  Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
2216                                                    : new_trace.backtrack();
2217
2218  // A positive (> 0) cp_offset means we've already successfully matched a
2219  // non-empty-width part of the pattern, and thus cannot be at or before the
2220  // start of the subject string. We can thus skip both at-start and
2221  // bounds-checks when loading the one-character lookbehind.
2222  const bool may_be_at_or_before_subject_string_start =
2223      new_trace.cp_offset() <= 0;
2224
2225  if (may_be_at_or_before_subject_string_start) {
2226    // The start of input counts as a non-word character, so the question is
2227    // decided if we are at the start.
2228    assembler->CheckAtStart(new_trace.cp_offset(), non_word);
2229  }
2230
2231  // If we've already checked that we are not at the start of input, it's okay
2232  // to load the previous character without bounds checks.
2233  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2234  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
2235                                  can_skip_bounds_check);
2236  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2237
2238  assembler->Bind(&fall_through);
2239  on_success()->Emit(compiler, &new_trace);
2240}
2241
2242void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2243                                         RegExpCompiler* compiler,
2244                                         int filled_in, bool not_at_start) {
2245  if (assertion_type_ == AT_START && not_at_start) {
2246    details->set_cannot_match();
2247    return;
2248  }
2249  return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2250                                            not_at_start);
2251}
2252
2253void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2254  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2255  switch (assertion_type_) {
2256    case AT_END: {
2257      Label ok;
2258      assembler->CheckPosition(trace->cp_offset(), &ok);
2259      assembler->GoTo(trace->backtrack());
2260      assembler->Bind(&ok);
2261      break;
2262    }
2263    case AT_START: {
2264      if (trace->at_start() == Trace::FALSE_VALUE) {
2265        assembler->GoTo(trace->backtrack());
2266        return;
2267      }
2268      if (trace->at_start() == Trace::UNKNOWN) {
2269        assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2270        Trace at_start_trace = *trace;
2271        at_start_trace.set_at_start(Trace::TRUE_VALUE);
2272        on_success()->Emit(compiler, &at_start_trace);
2273        return;
2274      }
2275    } break;
2276    case AFTER_NEWLINE:
2277      EmitHat(compiler, on_success(), trace);
2278      return;
2279    case AT_BOUNDARY:
2280    case AT_NON_BOUNDARY: {
2281      EmitBoundaryCheck(compiler, trace);
2282      return;
2283    }
2284  }
2285  on_success()->Emit(compiler, trace);
2286}
2287
2288namespace {
2289
2290bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2291  if (quick_check == nullptr) return false;
2292  if (offset >= quick_check->characters()) return false;
2293  return quick_check->positions(offset)->determines_perfectly;
2294}
2295
2296void UpdateBoundsCheck(int index, int* checked_up_to) {
2297  if (index > *checked_up_to) {
2298    *checked_up_to = index;
2299  }
2300}
2301
2302}  // namespace
2303
2304// We call this repeatedly to generate code for each pass over the text node.
2305// The passes are in increasing order of difficulty because we hope one
2306// of the first passes will fail in which case we are saved the work of the
2307// later passes.  for example for the case independent regexp /%[asdfghjkl]a/
2308// we will check the '%' in the first pass, the case independent 'a' in the
2309// second pass and the character class in the last pass.
2310//
2311// The passes are done from right to left, so for example to test for /bar/
2312// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2313// and then a 'b' with offset 0.  This means we can avoid the end-of-input
2314// bounds check most of the time.  In the example we only need to check for
2315// end-of-input when loading the putative 'r'.
2316//
2317// A slight complication involves the fact that the first character may already
2318// be fetched into a register by the previous node.  In this case we want to
2319// do the test for that character first.  We do this in separate passes.  The
2320// 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
2321// pass has been performed then subsequent passes will have true in
2322// first_element_checked to indicate that that character does not need to be
2323// checked again.
2324//
2325// In addition to all this we are passed a Trace, which can
2326// contain an AlternativeGeneration object.  In this AlternativeGeneration
2327// object we can see details of any quick check that was already passed in
2328// order to get to the code we are now generating.  The quick check can involve
2329// loading characters, which means we do not need to recheck the bounds
2330// up to the limit the quick check already checked.  In addition the quick
2331// check can have involved a mask and compare operation which may simplify
2332// or obviate the need for further checks at some character positions.
2333void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
2334                            bool preloaded, Trace* trace,
2335                            bool first_element_checked, int* checked_up_to) {
2336  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2337  Isolate* isolate = assembler->isolate();
2338  bool one_byte = compiler->one_byte();
2339  Label* backtrack = trace->backtrack();
2340  QuickCheckDetails* quick_check = trace->quick_check_performed();
2341  int element_count = elements()->length();
2342  int backward_offset = read_backward() ? -Length() : 0;
2343  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2344    TextElement elm = elements()->at(i);
2345    int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2346    if (elm.text_type() == TextElement::ATOM) {
2347      if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue;
2348      base::Vector<const base::uc16> quarks = elm.atom()->data();
2349      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2350        if (first_element_checked && i == 0 && j == 0) continue;
2351        if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2352        base::uc16 quark = quarks[j];
2353        if (IsIgnoreCase(compiler->flags())) {
2354          // Everywhere else we assume that a non-Latin-1 character cannot match
2355          // a Latin-1 character. Avoid the cases where this is assumption is
2356          // invalid by using the Latin1 equivalent instead.
2357          quark = unibrow::Latin1::TryConvertToLatin1(quark);
2358        }
2359        bool needs_bounds_check =
2360            *checked_up_to < cp_offset + j || read_backward();
2361        bool bounds_checked = false;
2362        switch (pass) {
2363          case NON_LATIN1_MATCH:
2364            DCHECK(one_byte);
2365            if (quark > String::kMaxOneByteCharCode) {
2366              assembler->GoTo(backtrack);
2367              return;
2368            }
2369            break;
2370          case NON_LETTER_CHARACTER_MATCH:
2371            bounds_checked =
2372                EmitAtomNonLetter(isolate, compiler, quark, backtrack,
2373                                  cp_offset + j, needs_bounds_check, preloaded);
2374            break;
2375          case SIMPLE_CHARACTER_MATCH:
2376            bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
2377                                                 backtrack, cp_offset + j,
2378                                                 needs_bounds_check, preloaded);
2379            break;
2380          case CASE_CHARACTER_MATCH:
2381            bounds_checked =
2382                EmitAtomLetter(isolate, compiler, quark, backtrack,
2383                               cp_offset + j, needs_bounds_check, preloaded);
2384            break;
2385          default:
2386            break;
2387        }
2388        if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2389      }
2390    } else {
2391      DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
2392      if (pass == CHARACTER_CLASS_MATCH) {
2393        if (first_element_checked && i == 0) continue;
2394        if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2395        RegExpCharacterClass* cc = elm.char_class();
2396        bool bounds_check = *checked_up_to < cp_offset || read_backward();
2397        EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2398                      bounds_check, preloaded, zone());
2399        UpdateBoundsCheck(cp_offset, checked_up_to);
2400      }
2401    }
2402  }
2403}
2404
2405int TextNode::Length() {
2406  TextElement elm = elements()->last();
2407  DCHECK_LE(0, elm.cp_offset());
2408  return elm.cp_offset() + elm.length();
2409}
2410
2411bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
2412  if (ignore_case) {
2413    return pass == SIMPLE_CHARACTER_MATCH;
2414  } else {
2415    return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2416  }
2417}
2418
2419TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
2420                                             ZoneList<CharacterRange>* ranges,
2421                                             bool read_backward,
2422                                             RegExpNode* on_success) {
2423  DCHECK_NOT_NULL(ranges);
2424  // TODO(jgruber): There's no fundamental need to create this
2425  // RegExpCharacterClass; we could refactor to avoid the allocation.
2426  return zone->New<TextNode>(zone->New<RegExpCharacterClass>(zone, ranges),
2427                             read_backward, on_success);
2428}
2429
2430TextNode* TextNode::CreateForSurrogatePair(
2431    Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
2432    bool read_backward, RegExpNode* on_success) {
2433  ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
2434  ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2435  elms->Add(TextElement::CharClass(
2436                zone->New<RegExpCharacterClass>(zone, lead_ranges)),
2437            zone);
2438  elms->Add(TextElement::CharClass(
2439                zone->New<RegExpCharacterClass>(zone, trail_ranges)),
2440            zone);
2441  return zone->New<TextNode>(elms, read_backward, on_success);
2442}
2443
2444TextNode* TextNode::CreateForSurrogatePair(
2445    Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
2446    bool read_backward, RegExpNode* on_success) {
2447  ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
2448  ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2449  elms->Add(TextElement::CharClass(
2450                zone->New<RegExpCharacterClass>(zone, lead_ranges)),
2451            zone);
2452  elms->Add(TextElement::CharClass(
2453                zone->New<RegExpCharacterClass>(zone, trail_ranges)),
2454            zone);
2455  return zone->New<TextNode>(elms, read_backward, on_success);
2456}
2457
2458// This generates the code to match a text node.  A text node can contain
2459// straight character sequences (possibly to be matched in a case-independent
2460// way) and character classes.  For efficiency we do not do this in a single
2461// pass from left to right.  Instead we pass over the text node several times,
2462// emitting code for some character positions every time.  See the comment on
2463// TextEmitPass for details.
2464void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2465  LimitResult limit_result = LimitVersions(compiler, trace);
2466  if (limit_result == DONE) return;
2467  DCHECK(limit_result == CONTINUE);
2468
2469  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2470    compiler->SetRegExpTooBig();
2471    return;
2472  }
2473
2474  if (compiler->one_byte()) {
2475    int dummy = 0;
2476    TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2477  }
2478
2479  bool first_elt_done = false;
2480  int bound_checked_to = trace->cp_offset() - 1;
2481  bound_checked_to += trace->bound_checked_up_to();
2482
2483  // If a character is preloaded into the current character register then
2484  // check that now.
2485  if (trace->characters_preloaded() == 1) {
2486    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2487      TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2488                   false, &bound_checked_to);
2489    }
2490    first_elt_done = true;
2491  }
2492
2493  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2494    TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2495                 first_elt_done, &bound_checked_to);
2496  }
2497
2498  Trace successor_trace(*trace);
2499  // If we advance backward, we may end up at the start.
2500  successor_trace.AdvanceCurrentPositionInTrace(
2501      read_backward() ? -Length() : Length(), compiler);
2502  successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2503                                               : Trace::FALSE_VALUE);
2504  RecursionCheck rc(compiler);
2505  on_success()->Emit(compiler, &successor_trace);
2506}
2507
2508void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
2509
2510void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2511  // We don't have an instruction for shifting the current character register
2512  // down or for using a shifted value for anything so lets just forget that
2513  // we preloaded any characters into it.
2514  characters_preloaded_ = 0;
2515  // Adjust the offsets of the quick check performed information.  This
2516  // information is used to find out what we already determined about the
2517  // characters by means of mask and compare.
2518  quick_check_performed_.Advance(by, compiler->one_byte());
2519  cp_offset_ += by;
2520  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2521    compiler->SetRegExpTooBig();
2522    cp_offset_ = 0;
2523  }
2524  bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
2525}
2526
2527void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
2528                                   RegExpFlags flags) {
2529  if (!IsIgnoreCase(flags)) return;
2530#ifdef V8_INTL_SUPPORT
2531  if (NeedsUnicodeCaseEquivalents(flags)) return;
2532#endif
2533
2534  int element_count = elements()->length();
2535  for (int i = 0; i < element_count; i++) {
2536    TextElement elm = elements()->at(i);
2537    if (elm.text_type() == TextElement::CHAR_CLASS) {
2538      RegExpCharacterClass* cc = elm.char_class();
2539      // None of the standard character classes is different in the case
2540      // independent case and it slows us down if we don't know that.
2541      if (cc->is_standard(zone())) continue;
2542      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2543      CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
2544    }
2545  }
2546}
2547
2548int TextNode::GreedyLoopTextLength() { return Length(); }
2549
2550RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2551    RegExpCompiler* compiler) {
2552  if (read_backward()) return nullptr;
2553  if (elements()->length() != 1) return nullptr;
2554  TextElement elm = elements()->at(0);
2555  if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2556  RegExpCharacterClass* node = elm.char_class();
2557  ZoneList<CharacterRange>* ranges = node->ranges(zone());
2558  CharacterRange::Canonicalize(ranges);
2559  if (node->is_negated()) {
2560    return ranges->length() == 0 ? on_success() : nullptr;
2561  }
2562  if (ranges->length() != 1) return nullptr;
2563  const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
2564  return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
2565}
2566
2567// Finds the fixed match length of a sequence of nodes that goes from
2568// this alternative and back to this choice node.  If there are variable
2569// length nodes or other complications in the way then return a sentinel
2570// value indicating that a greedy loop cannot be constructed.
2571int ChoiceNode::GreedyLoopTextLengthForAlternative(
2572    GuardedAlternative* alternative) {
2573  int length = 0;
2574  RegExpNode* node = alternative->node();
2575  // Later we will generate code for all these text nodes using recursion
2576  // so we have to limit the max number.
2577  int recursion_depth = 0;
2578  while (node != this) {
2579    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2580      return kNodeIsTooComplexForGreedyLoops;
2581    }
2582    int node_length = node->GreedyLoopTextLength();
2583    if (node_length == kNodeIsTooComplexForGreedyLoops) {
2584      return kNodeIsTooComplexForGreedyLoops;
2585    }
2586    length += node_length;
2587    SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2588    node = seq_node->on_success();
2589  }
2590  if (read_backward()) {
2591    length = -length;
2592  }
2593  // Check that we can jump by the whole text length. If not, return sentinel
2594  // to indicate the we can't construct a greedy loop.
2595  if (length < RegExpMacroAssembler::kMinCPOffset ||
2596      length > RegExpMacroAssembler::kMaxCPOffset) {
2597    return kNodeIsTooComplexForGreedyLoops;
2598  }
2599  return length;
2600}
2601
2602void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2603  DCHECK_NULL(loop_node_);
2604  AddAlternative(alt);
2605  loop_node_ = alt.node();
2606}
2607
2608void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2609  DCHECK_NULL(continue_node_);
2610  AddAlternative(alt);
2611  continue_node_ = alt.node();
2612}
2613
2614void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2615  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2616  if (trace->stop_node() == this) {
2617    // Back edge of greedy optimized loop node graph.
2618    int text_length =
2619        GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2620    DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
2621    // Update the counter-based backtracking info on the stack.  This is an
2622    // optimization for greedy loops (see below).
2623    DCHECK(trace->cp_offset() == text_length);
2624    macro_assembler->AdvanceCurrentPosition(text_length);
2625    macro_assembler->GoTo(trace->loop_label());
2626    return;
2627  }
2628  DCHECK_NULL(trace->stop_node());
2629  if (!trace->is_trivial()) {
2630    trace->Flush(compiler, this);
2631    return;
2632  }
2633  ChoiceNode::Emit(compiler, trace);
2634}
2635
2636int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2637                                           int eats_at_least) {
2638  int preload_characters = std::min(4, eats_at_least);
2639  DCHECK_LE(preload_characters, 4);
2640  if (compiler->macro_assembler()->CanReadUnaligned()) {
2641    bool one_byte = compiler->one_byte();
2642    if (one_byte) {
2643      // We can't preload 3 characters because there is no machine instruction
2644      // to do that.  We can't just load 4 because we could be reading
2645      // beyond the end of the string, which could cause a memory fault.
2646      if (preload_characters == 3) preload_characters = 2;
2647    } else {
2648      if (preload_characters > 2) preload_characters = 2;
2649    }
2650  } else {
2651    if (preload_characters > 1) preload_characters = 1;
2652  }
2653  return preload_characters;
2654}
2655
2656// This class is used when generating the alternatives in a choice node.  It
2657// records the way the alternative is being code generated.
2658class AlternativeGeneration : public Malloced {
2659 public:
2660  AlternativeGeneration()
2661      : possible_success(),
2662        expects_preload(false),
2663        after(),
2664        quick_check_details() {}
2665  Label possible_success;
2666  bool expects_preload;
2667  Label after;
2668  QuickCheckDetails quick_check_details;
2669};
2670
2671// Creates a list of AlternativeGenerations.  If the list has a reasonable
2672// size then it is on the stack, otherwise the excess is on the heap.
2673class AlternativeGenerationList {
2674 public:
2675  AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
2676    for (int i = 0; i < count && i < kAFew; i++) {
2677      alt_gens_.Add(a_few_alt_gens_ + i, zone);
2678    }
2679    for (int i = kAFew; i < count; i++) {
2680      alt_gens_.Add(new AlternativeGeneration(), zone);
2681    }
2682  }
2683  ~AlternativeGenerationList() {
2684    for (int i = kAFew; i < alt_gens_.length(); i++) {
2685      delete alt_gens_[i];
2686      alt_gens_[i] = nullptr;
2687    }
2688  }
2689
2690  AlternativeGeneration* at(int i) { return alt_gens_[i]; }
2691
2692 private:
2693  static const int kAFew = 10;
2694  ZoneList<AlternativeGeneration*> alt_gens_;
2695  AlternativeGeneration a_few_alt_gens_[kAFew];
2696};
2697
2698void BoyerMoorePositionInfo::Set(int character) {
2699  SetInterval(Interval(character, character));
2700}
2701
2702namespace {
2703
2704ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
2705                            int ranges_length, Interval new_range) {
2706  DCHECK_EQ(1, ranges_length & 1);
2707  DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
2708  if (containment == kLatticeUnknown) return containment;
2709  bool inside = false;
2710  int last = 0;
2711  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2712    // Consider the range from last to ranges[i].
2713    // We haven't got to the new range yet.
2714    if (ranges[i] <= new_range.from()) continue;
2715    // New range is wholly inside last-ranges[i].  Note that new_range.to() is
2716    // inclusive, but the values in ranges are not.
2717    if (last <= new_range.from() && new_range.to() < ranges[i]) {
2718      return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2719    }
2720    return kLatticeUnknown;
2721  }
2722  return containment;
2723}
2724
2725int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
2726  STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
2727                2 * kInt64Size * kBitsPerByte);
2728
2729  // Slight fiddling is needed here, since the bitset is of length 128 while
2730  // CountTrailingZeros requires an integral type and std::bitset can only
2731  // convert to unsigned long long. So we handle the most- and least-significant
2732  // bits separately.
2733
2734  {
2735    static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
2736    BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
2737    STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
2738    uint64_t lsb = masked_bitset.to_ullong();
2739    if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
2740  }
2741
2742  {
2743    BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64;
2744    uint64_t msb = masked_bitset.to_ullong();
2745    if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb);
2746  }
2747
2748  return -1;
2749}
2750
2751}  // namespace
2752
2753void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2754  w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2755
2756  if (interval.size() >= kMapSize) {
2757    map_count_ = kMapSize;
2758    map_.set();
2759    return;
2760  }
2761
2762  for (int i = interval.from(); i <= interval.to(); i++) {
2763    int mod_character = (i & kMask);
2764    if (!map_[mod_character]) {
2765      map_count_++;
2766      map_.set(mod_character);
2767    }
2768    if (map_count_ == kMapSize) return;
2769  }
2770}
2771
2772void BoyerMoorePositionInfo::SetAll() {
2773  w_ = kLatticeUnknown;
2774  if (map_count_ != kMapSize) {
2775    map_count_ = kMapSize;
2776    map_.set();
2777  }
2778}
2779
2780BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
2781                                         Zone* zone)
2782    : length_(length),
2783      compiler_(compiler),
2784      max_char_(MaxCodeUnit(compiler->one_byte())) {
2785  bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
2786  for (int i = 0; i < length; i++) {
2787    bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone);
2788  }
2789}
2790
2791// Find the longest range of lookahead that has the fewest number of different
2792// characters that can occur at a given position.  Since we are optimizing two
2793// different parameters at once this is a tradeoff.
2794bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2795  int biggest_points = 0;
2796  // If more than 32 characters out of 128 can occur it is unlikely that we can
2797  // be lucky enough to step forwards much of the time.
2798  const int kMaxMax = 32;
2799  for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2800       max_number_of_chars *= 2) {
2801    biggest_points =
2802        FindBestInterval(max_number_of_chars, biggest_points, from, to);
2803  }
2804  if (biggest_points == 0) return false;
2805  return true;
2806}
2807
2808// Find the highest-points range between 0 and length_ where the character
2809// information is not too vague.  'Too vague' means that there are more than
2810// max_number_of_chars that can occur at this position.  Calculates the number
2811// of points as the product of width-of-the-range and
2812// probability-of-finding-one-of-the-characters, where the probability is
2813// calculated using the frequency distribution of the sample subject string.
2814int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
2815                                          int old_biggest_points, int* from,
2816                                          int* to) {
2817  int biggest_points = old_biggest_points;
2818  static const int kSize = RegExpMacroAssembler::kTableSize;
2819  for (int i = 0; i < length_;) {
2820    while (i < length_ && Count(i) > max_number_of_chars) i++;
2821    if (i == length_) break;
2822    int remembered_from = i;
2823
2824    BoyerMoorePositionInfo::Bitset union_bitset;
2825    for (; i < length_ && Count(i) <= max_number_of_chars; i++) {
2826      union_bitset |= bitmaps_->at(i)->raw_bitset();
2827    }
2828
2829    int frequency = 0;
2830
2831    // Iterate only over set bits.
2832    int j;
2833    while ((j = BitsetFirstSetBit(union_bitset)) != -1) {
2834      DCHECK(union_bitset[j]);  // Sanity check.
2835      // Add 1 to the frequency to give a small per-character boost for
2836      // the cases where our sampling is not good enough and many
2837      // characters have a frequency of zero.  This means the frequency
2838      // can theoretically be up to 2*kSize though we treat it mostly as
2839      // a fraction of kSize.
2840      frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2841      union_bitset.reset(j);
2842    }
2843
2844    // We use the probability of skipping times the distance we are skipping to
2845    // judge the effectiveness of this.  Actually we have a cut-off:  By
2846    // dividing by 2 we switch off the skipping if the probability of skipping
2847    // is less than 50%.  This is because the multibyte mask-and-compare
2848    // skipping in quickcheck is more likely to do well on this case.
2849    bool in_quickcheck_range =
2850        ((i - remembered_from < 4) ||
2851         (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2852    // Called 'probability' but it is only a rough estimate and can actually
2853    // be outside the 0-kSize range.
2854    int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2855    int points = (i - remembered_from) * probability;
2856    if (points > biggest_points) {
2857      *from = remembered_from;
2858      *to = i - 1;
2859      biggest_points = points;
2860    }
2861  }
2862  return biggest_points;
2863}
2864
2865// Take all the characters that will not prevent a successful match if they
2866// occur in the subject string in the range between min_lookahead and
2867// max_lookahead (inclusive) measured from the current position.  If the
2868// character at max_lookahead offset is not one of these characters, then we
2869// can safely skip forwards by the number of characters in the range.
2870int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
2871                                      Handle<ByteArray> boolean_skip_table) {
2872  const int kSkipArrayEntry = 0;
2873  const int kDontSkipArrayEntry = 1;
2874
2875  std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
2876              boolean_skip_table->length());
2877
2878  for (int i = max_lookahead; i >= min_lookahead; i--) {
2879    BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset();
2880
2881    // Iterate only over set bits.
2882    int j;
2883    while ((j = BitsetFirstSetBit(bitset)) != -1) {
2884      DCHECK(bitset[j]);  // Sanity check.
2885      boolean_skip_table->set(j, kDontSkipArrayEntry);
2886      bitset.reset(j);
2887    }
2888  }
2889
2890  const int skip = max_lookahead + 1 - min_lookahead;
2891  return skip;
2892}
2893
2894// See comment above on the implementation of GetSkipTable.
2895void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2896  const int kSize = RegExpMacroAssembler::kTableSize;
2897
2898  int min_lookahead = 0;
2899  int max_lookahead = 0;
2900
2901  if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2902
2903  // Check if we only have a single non-empty position info, and that info
2904  // contains precisely one character.
2905  bool found_single_character = false;
2906  int single_character = 0;
2907  for (int i = max_lookahead; i >= min_lookahead; i--) {
2908    BoyerMoorePositionInfo* map = bitmaps_->at(i);
2909    if (map->map_count() == 0) continue;
2910
2911    if (found_single_character || map->map_count() > 1) {
2912      found_single_character = false;
2913      break;
2914    }
2915
2916    DCHECK(!found_single_character);
2917    DCHECK_EQ(map->map_count(), 1);
2918
2919    found_single_character = true;
2920    single_character = BitsetFirstSetBit(map->raw_bitset());
2921
2922    DCHECK_NE(single_character, -1);
2923  }
2924
2925  int lookahead_width = max_lookahead + 1 - min_lookahead;
2926
2927  if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2928    // The mask-compare can probably handle this better.
2929    return;
2930  }
2931
2932  if (found_single_character) {
2933    Label cont, again;
2934    masm->Bind(&again);
2935    masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2936    if (max_char_ > kSize) {
2937      masm->CheckCharacterAfterAnd(single_character,
2938                                   RegExpMacroAssembler::kTableMask, &cont);
2939    } else {
2940      masm->CheckCharacter(single_character, &cont);
2941    }
2942    masm->AdvanceCurrentPosition(lookahead_width);
2943    masm->GoTo(&again);
2944    masm->Bind(&cont);
2945    return;
2946  }
2947
2948  Factory* factory = masm->isolate()->factory();
2949  Handle<ByteArray> boolean_skip_table =
2950      factory->NewByteArray(kSize, AllocationType::kOld);
2951  int skip_distance =
2952      GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2953  DCHECK_NE(0, skip_distance);
2954
2955  Label cont, again;
2956  masm->Bind(&again);
2957  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2958  masm->CheckBitInTable(boolean_skip_table, &cont);
2959  masm->AdvanceCurrentPosition(skip_distance);
2960  masm->GoTo(&again);
2961  masm->Bind(&cont);
2962}
2963
2964/* Code generation for choice nodes.
2965 *
2966 * We generate quick checks that do a mask and compare to eliminate a
2967 * choice.  If the quick check succeeds then it jumps to the continuation to
2968 * do slow checks and check subsequent nodes.  If it fails (the common case)
2969 * it falls through to the next choice.
2970 *
2971 * Here is the desired flow graph.  Nodes directly below each other imply
2972 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
2973 * 3 doesn't have a quick check so we have to call the slow check.
2974 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
2975 * regexp continuation is generated directly after the Sn node, up to the
2976 * next GoTo if we decide to reuse some already generated code.  Some
2977 * nodes expect preload_characters to be preloaded into the current
2978 * character register.  R nodes do this preloading.  Vertices are marked
2979 * F for failures and S for success (possible success in the case of quick
2980 * nodes).  L, V, < and > are used as arrow heads.
2981 *
2982 * ----------> R
2983 *             |
2984 *             V
2985 *            Q1 -----> S1
2986 *             |   S   /
2987 *            F|      /
2988 *             |    F/
2989 *             |    /
2990 *             |   R
2991 *             |  /
2992 *             V L
2993 *            Q2 -----> S2
2994 *             |   S   /
2995 *            F|      /
2996 *             |    F/
2997 *             |    /
2998 *             |   R
2999 *             |  /
3000 *             V L
3001 *            S3
3002 *             |
3003 *            F|
3004 *             |
3005 *             R
3006 *             |
3007 * backtrack   V
3008 * <----------Q4
3009 *   \    F    |
3010 *    \        |S
3011 *     \   F   V
3012 *      \-----S4
3013 *
3014 * For greedy loops we push the current position, then generate the code that
3015 * eats the input specially in EmitGreedyLoop.  The other choice (the
3016 * continuation) is generated by the normal code in EmitChoices, and steps back
3017 * in the input to the starting position when it fails to match.  The loop code
3018 * looks like this (U is the unwind code that steps back in the greedy loop).
3019 *
3020 *              _____
3021 *             /     \
3022 *             V     |
3023 * ----------> S1    |
3024 *            /|     |
3025 *           / |S    |
3026 *         F/  \_____/
3027 *         /
3028 *        |<-----
3029 *        |      \
3030 *        V       |S
3031 *        Q2 ---> U----->backtrack
3032 *        |  F   /
3033 *       S|     /
3034 *        V  F /
3035 *        S2--/
3036 */
3037
3038GreedyLoopState::GreedyLoopState(bool not_at_start) {
3039  counter_backtrack_trace_.set_backtrack(&label_);
3040  if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3041}
3042
3043void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3044#ifdef DEBUG
3045  int choice_count = alternatives_->length();
3046  for (int i = 0; i < choice_count - 1; i++) {
3047    GuardedAlternative alternative = alternatives_->at(i);
3048    ZoneList<Guard*>* guards = alternative.guards();
3049    int guard_count = (guards == nullptr) ? 0 : guards->length();
3050    for (int j = 0; j < guard_count; j++) {
3051      DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3052    }
3053  }
3054#endif
3055}
3056
3057void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
3058                              PreloadState* state) {
3059  if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3060    // Save some time by looking at most one machine word ahead.
3061    state->eats_at_least_ =
3062        EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE);
3063  }
3064  state->preload_characters_ =
3065      CalculatePreloadCharacters(compiler, state->eats_at_least_);
3066
3067  state->preload_is_current_ =
3068      (current_trace->characters_preloaded() == state->preload_characters_);
3069  state->preload_has_checked_bounds_ = state->preload_is_current_;
3070}
3071
3072void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3073  int choice_count = alternatives_->length();
3074
3075  if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3076    alternatives_->at(0).node()->Emit(compiler, trace);
3077    return;
3078  }
3079
3080  AssertGuardsMentionRegisters(trace);
3081
3082  LimitResult limit_result = LimitVersions(compiler, trace);
3083  if (limit_result == DONE) return;
3084  DCHECK(limit_result == CONTINUE);
3085
3086  // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3087  // other choice nodes we only flush if we are out of code size budget.
3088  if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3089    trace->Flush(compiler, this);
3090    return;
3091  }
3092
3093  RecursionCheck rc(compiler);
3094
3095  PreloadState preload;
3096  preload.init();
3097  GreedyLoopState greedy_loop_state(not_at_start());
3098
3099  int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3100  AlternativeGenerationList alt_gens(choice_count, zone());
3101
3102  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3103    trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3104                           &greedy_loop_state, text_length);
3105  } else {
3106    // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
3107    // match the traces produced pre-cleanup.
3108    Label second_choice;
3109    compiler->macro_assembler()->Bind(&second_choice);
3110
3111    preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3112
3113    EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3114  }
3115
3116  // At this point we need to generate slow checks for the alternatives where
3117  // the quick check was inlined.  We can recognize these because the associated
3118  // label was bound.
3119  int new_flush_budget = trace->flush_budget() / choice_count;
3120  for (int i = 0; i < choice_count; i++) {
3121    AlternativeGeneration* alt_gen = alt_gens.at(i);
3122    Trace new_trace(*trace);
3123    // If there are actions to be flushed we have to limit how many times
3124    // they are flushed.  Take the budget of the parent trace and distribute
3125    // it fairly amongst the children.
3126    if (new_trace.actions() != nullptr) {
3127      new_trace.set_flush_budget(new_flush_budget);
3128    }
3129    bool next_expects_preload =
3130        i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3131    EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
3132                              alt_gen, preload.preload_characters_,
3133                              next_expects_preload);
3134  }
3135}
3136
3137Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
3138                                  AlternativeGenerationList* alt_gens,
3139                                  PreloadState* preload,
3140                                  GreedyLoopState* greedy_loop_state,
3141                                  int text_length) {
3142  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3143  // Here we have special handling for greedy loops containing only text nodes
3144  // and other simple nodes.  These are handled by pushing the current
3145  // position on the stack and then incrementing the current position each
3146  // time around the switch.  On backtrack we decrement the current position
3147  // and check it against the pushed value.  This avoids pushing backtrack
3148  // information for each iteration of the loop, which could take up a lot of
3149  // space.
3150  DCHECK(trace->stop_node() == nullptr);
3151  macro_assembler->PushCurrentPosition();
3152  Label greedy_match_failed;
3153  Trace greedy_match_trace;
3154  if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3155  greedy_match_trace.set_backtrack(&greedy_match_failed);
3156  Label loop_label;
3157  macro_assembler->Bind(&loop_label);
3158  greedy_match_trace.set_stop_node(this);
3159  greedy_match_trace.set_loop_label(&loop_label);
3160  alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3161  macro_assembler->Bind(&greedy_match_failed);
3162
3163  Label second_choice;  // For use in greedy matches.
3164  macro_assembler->Bind(&second_choice);
3165
3166  Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3167
3168  EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3169
3170  macro_assembler->Bind(greedy_loop_state->label());
3171  // If we have unwound to the bottom then backtrack.
3172  macro_assembler->CheckGreedyLoop(trace->backtrack());
3173  // Otherwise try the second priority at an earlier position.
3174  macro_assembler->AdvanceCurrentPosition(-text_length);
3175  macro_assembler->GoTo(&second_choice);
3176  return new_trace;
3177}
3178
3179int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3180                                              Trace* trace) {
3181  int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3182  if (alternatives_->length() != 2) return eats_at_least;
3183
3184  GuardedAlternative alt1 = alternatives_->at(1);
3185  if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3186    return eats_at_least;
3187  }
3188  RegExpNode* eats_anything_node = alt1.node();
3189  if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3190    return eats_at_least;
3191  }
3192
3193  // Really we should be creating a new trace when we execute this function,
3194  // but there is no need, because the code it generates cannot backtrack, and
3195  // we always arrive here with a trivial trace (since it's the entry to a
3196  // loop.  That also implies that there are no preloaded characters, which is
3197  // good, because it means we won't be violating any assumptions by
3198  // overwriting those characters with new load instructions.
3199  DCHECK(trace->is_trivial());
3200
3201  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3202  Isolate* isolate = macro_assembler->isolate();
3203  // At this point we know that we are at a non-greedy loop that will eat
3204  // any character one at a time.  Any non-anchored regexp has such a
3205  // loop prepended to it in order to find where it starts.  We look for
3206  // a pattern of the form ...abc... where we can look 6 characters ahead
3207  // and step forwards 3 if the character is not one of abc.  Abc need
3208  // not be atoms, they can be any reasonably limited character class or
3209  // small alternation.
3210  BoyerMooreLookahead* bm = bm_info(false);
3211  if (bm == nullptr) {
3212    eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
3213    if (eats_at_least >= 1) {
3214      bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
3215      GuardedAlternative alt0 = alternatives_->at(0);
3216      alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
3217    }
3218  }
3219  if (bm != nullptr) {
3220    bm->EmitSkipInstructions(macro_assembler);
3221  }
3222  return eats_at_least;
3223}
3224
3225void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3226                             AlternativeGenerationList* alt_gens,
3227                             int first_choice, Trace* trace,
3228                             PreloadState* preload) {
3229  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3230  SetUpPreLoad(compiler, trace, preload);
3231
3232  // For now we just call all choices one after the other.  The idea ultimately
3233  // is to use the Dispatch table to try only the relevant ones.
3234  int choice_count = alternatives_->length();
3235
3236  int new_flush_budget = trace->flush_budget() / choice_count;
3237
3238  for (int i = first_choice; i < choice_count; i++) {
3239    bool is_last = i == choice_count - 1;
3240    bool fall_through_on_failure = !is_last;
3241    GuardedAlternative alternative = alternatives_->at(i);
3242    AlternativeGeneration* alt_gen = alt_gens->at(i);
3243    alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3244    ZoneList<Guard*>* guards = alternative.guards();
3245    int guard_count = (guards == nullptr) ? 0 : guards->length();
3246    Trace new_trace(*trace);
3247    new_trace.set_characters_preloaded(
3248        preload->preload_is_current_ ? preload->preload_characters_ : 0);
3249    if (preload->preload_has_checked_bounds_) {
3250      new_trace.set_bound_checked_up_to(preload->preload_characters_);
3251    }
3252    new_trace.quick_check_performed()->Clear();
3253    if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3254    if (!is_last) {
3255      new_trace.set_backtrack(&alt_gen->after);
3256    }
3257    alt_gen->expects_preload = preload->preload_is_current_;
3258    bool generate_full_check_inline = false;
3259    if (compiler->optimize() &&
3260        try_to_emit_quick_check_for_alternative(i == 0) &&
3261        alternative.node()->EmitQuickCheck(
3262            compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3263            &alt_gen->possible_success, &alt_gen->quick_check_details,
3264            fall_through_on_failure, this)) {
3265      // Quick check was generated for this choice.
3266      preload->preload_is_current_ = true;
3267      preload->preload_has_checked_bounds_ = true;
3268      // If we generated the quick check to fall through on possible success,
3269      // we now need to generate the full check inline.
3270      if (!fall_through_on_failure) {
3271        macro_assembler->Bind(&alt_gen->possible_success);
3272        new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3273        new_trace.set_characters_preloaded(preload->preload_characters_);
3274        new_trace.set_bound_checked_up_to(preload->preload_characters_);
3275        generate_full_check_inline = true;
3276      }
3277    } else if (alt_gen->quick_check_details.cannot_match()) {
3278      if (!fall_through_on_failure) {
3279        macro_assembler->GoTo(trace->backtrack());
3280      }
3281      continue;
3282    } else {
3283      // No quick check was generated.  Put the full code here.
3284      // If this is not the first choice then there could be slow checks from
3285      // previous cases that go here when they fail.  There's no reason to
3286      // insist that they preload characters since the slow check we are about
3287      // to generate probably can't use it.
3288      if (i != first_choice) {
3289        alt_gen->expects_preload = false;
3290        new_trace.InvalidateCurrentCharacter();
3291      }
3292      generate_full_check_inline = true;
3293    }
3294    if (generate_full_check_inline) {
3295      if (new_trace.actions() != nullptr) {
3296        new_trace.set_flush_budget(new_flush_budget);
3297      }
3298      for (int j = 0; j < guard_count; j++) {
3299        GenerateGuard(macro_assembler, guards->at(j), &new_trace);
3300      }
3301      alternative.node()->Emit(compiler, &new_trace);
3302      preload->preload_is_current_ = false;
3303    }
3304    macro_assembler->Bind(&alt_gen->after);
3305  }
3306}
3307
3308void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3309                                           Trace* trace,
3310                                           GuardedAlternative alternative,
3311                                           AlternativeGeneration* alt_gen,
3312                                           int preload_characters,
3313                                           bool next_expects_preload) {
3314  if (!alt_gen->possible_success.is_linked()) return;
3315
3316  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3317  macro_assembler->Bind(&alt_gen->possible_success);
3318  Trace out_of_line_trace(*trace);
3319  out_of_line_trace.set_characters_preloaded(preload_characters);
3320  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3321  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3322  ZoneList<Guard*>* guards = alternative.guards();
3323  int guard_count = (guards == nullptr) ? 0 : guards->length();
3324  if (next_expects_preload) {
3325    Label reload_current_char;
3326    out_of_line_trace.set_backtrack(&reload_current_char);
3327    for (int j = 0; j < guard_count; j++) {
3328      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3329    }
3330    alternative.node()->Emit(compiler, &out_of_line_trace);
3331    macro_assembler->Bind(&reload_current_char);
3332    // Reload the current character, since the next quick check expects that.
3333    // We don't need to check bounds here because we only get into this
3334    // code through a quick check which already did the checked load.
3335    macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3336                                          preload_characters);
3337    macro_assembler->GoTo(&(alt_gen->after));
3338  } else {
3339    out_of_line_trace.set_backtrack(&(alt_gen->after));
3340    for (int j = 0; j < guard_count; j++) {
3341      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3342    }
3343    alternative.node()->Emit(compiler, &out_of_line_trace);
3344  }
3345}
3346
3347void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3348  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3349  LimitResult limit_result = LimitVersions(compiler, trace);
3350  if (limit_result == DONE) return;
3351  DCHECK(limit_result == CONTINUE);
3352
3353  RecursionCheck rc(compiler);
3354
3355  switch (action_type_) {
3356    case STORE_POSITION: {
3357      Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3358                                         data_.u_position_register.is_capture,
3359                                         trace);
3360      Trace new_trace = *trace;
3361      new_trace.add_action(&new_capture);
3362      on_success()->Emit(compiler, &new_trace);
3363      break;
3364    }
3365    case INCREMENT_REGISTER: {
3366      Trace::DeferredIncrementRegister new_increment(
3367          data_.u_increment_register.reg);
3368      Trace new_trace = *trace;
3369      new_trace.add_action(&new_increment);
3370      on_success()->Emit(compiler, &new_trace);
3371      break;
3372    }
3373    case SET_REGISTER_FOR_LOOP: {
3374      Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
3375                                                data_.u_store_register.value);
3376      Trace new_trace = *trace;
3377      new_trace.add_action(&new_set);
3378      on_success()->Emit(compiler, &new_trace);
3379      break;
3380    }
3381    case CLEAR_CAPTURES: {
3382      Trace::DeferredClearCaptures new_capture(Interval(
3383          data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3384      Trace new_trace = *trace;
3385      new_trace.add_action(&new_capture);
3386      on_success()->Emit(compiler, &new_trace);
3387      break;
3388    }
3389    case BEGIN_POSITIVE_SUBMATCH:
3390    case BEGIN_NEGATIVE_SUBMATCH:
3391      if (!trace->is_trivial()) {
3392        trace->Flush(compiler, this);
3393      } else {
3394        assembler->WriteCurrentPositionToRegister(
3395            data_.u_submatch.current_position_register, 0);
3396        assembler->WriteStackPointerToRegister(
3397            data_.u_submatch.stack_pointer_register);
3398        on_success()->Emit(compiler, trace);
3399      }
3400      break;
3401    case EMPTY_MATCH_CHECK: {
3402      int start_pos_reg = data_.u_empty_match_check.start_register;
3403      int stored_pos = 0;
3404      int rep_reg = data_.u_empty_match_check.repetition_register;
3405      bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3406      bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3407      if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3408        // If we know we haven't advanced and there is no minimum we
3409        // can just backtrack immediately.
3410        assembler->GoTo(trace->backtrack());
3411      } else if (know_dist && stored_pos < trace->cp_offset()) {
3412        // If we know we've advanced we can generate the continuation
3413        // immediately.
3414        on_success()->Emit(compiler, trace);
3415      } else if (!trace->is_trivial()) {
3416        trace->Flush(compiler, this);
3417      } else {
3418        Label skip_empty_check;
3419        // If we have a minimum number of repetitions we check the current
3420        // number first and skip the empty check if it's not enough.
3421        if (has_minimum) {
3422          int limit = data_.u_empty_match_check.repetition_limit;
3423          assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3424        }
3425        // If the match is empty we bail out, otherwise we fall through
3426        // to the on-success continuation.
3427        assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3428                                   trace->backtrack());
3429        assembler->Bind(&skip_empty_check);
3430        on_success()->Emit(compiler, trace);
3431      }
3432      break;
3433    }
3434    case POSITIVE_SUBMATCH_SUCCESS: {
3435      if (!trace->is_trivial()) {
3436        trace->Flush(compiler, this);
3437        return;
3438      }
3439      assembler->ReadCurrentPositionFromRegister(
3440          data_.u_submatch.current_position_register);
3441      assembler->ReadStackPointerFromRegister(
3442          data_.u_submatch.stack_pointer_register);
3443      int clear_register_count = data_.u_submatch.clear_register_count;
3444      if (clear_register_count == 0) {
3445        on_success()->Emit(compiler, trace);
3446        return;
3447      }
3448      int clear_registers_from = data_.u_submatch.clear_register_from;
3449      Label clear_registers_backtrack;
3450      Trace new_trace = *trace;
3451      new_trace.set_backtrack(&clear_registers_backtrack);
3452      on_success()->Emit(compiler, &new_trace);
3453
3454      assembler->Bind(&clear_registers_backtrack);
3455      int clear_registers_to = clear_registers_from + clear_register_count - 1;
3456      assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3457
3458      DCHECK(trace->backtrack() == nullptr);
3459      assembler->Backtrack();
3460      return;
3461    }
3462    default:
3463      UNREACHABLE();
3464  }
3465}
3466
3467void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3468  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3469  if (!trace->is_trivial()) {
3470    trace->Flush(compiler, this);
3471    return;
3472  }
3473
3474  LimitResult limit_result = LimitVersions(compiler, trace);
3475  if (limit_result == DONE) return;
3476  DCHECK(limit_result == CONTINUE);
3477
3478  RecursionCheck rc(compiler);
3479
3480  DCHECK_EQ(start_reg_ + 1, end_reg_);
3481  if (IsIgnoreCase(flags_)) {
3482    bool unicode = IsUnicode(flags_);
3483    assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
3484                                               unicode, trace->backtrack());
3485  } else {
3486    assembler->CheckNotBackReference(start_reg_, read_backward(),
3487                                     trace->backtrack());
3488  }
3489  // We are going to advance backward, so we may end up at the start.
3490  if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3491
3492  // Check that the back reference does not end inside a surrogate pair.
3493  if (IsUnicode(flags_) && !compiler->one_byte()) {
3494    assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3495  }
3496  on_success()->Emit(compiler, trace);
3497}
3498
3499void TextNode::CalculateOffsets() {
3500  int element_count = elements()->length();
3501  // Set up the offsets of the elements relative to the start.  This is a fixed
3502  // quantity since a TextNode can only contain fixed-width things.
3503  int cp_offset = 0;
3504  for (int i = 0; i < element_count; i++) {
3505    TextElement& elm = elements()->at(i);
3506    elm.set_cp_offset(cp_offset);
3507    cp_offset += elm.length();
3508  }
3509}
3510
3511namespace {
3512
3513// Assertion propagation moves information about assertions such as
3514// \b to the affected nodes.  For instance, in /.\b./ information must
3515// be propagated to the first '.' that whatever follows needs to know
3516// if it matched a word or a non-word, and to the second '.' that it
3517// has to check if it succeeds a word or non-word.  In this case the
3518// result will be something like:
3519//
3520//   +-------+        +------------+
3521//   |   .   |        |      .     |
3522//   +-------+  --->  +------------+
3523//   | word? |        | check word |
3524//   +-------+        +------------+
3525class AssertionPropagator : public AllStatic {
3526 public:
3527  static void VisitText(TextNode* that) {}
3528
3529  static void VisitAction(ActionNode* that) {
3530    // If the next node is interested in what it follows then this node
3531    // has to be interested too so it can pass the information on.
3532    that->info()->AddFromFollowing(that->on_success()->info());
3533  }
3534
3535  static void VisitChoice(ChoiceNode* that, int i) {
3536    // Anything the following nodes need to know has to be known by
3537    // this node also, so it can pass it on.
3538    that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info());
3539  }
3540
3541  static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3542    that->info()->AddFromFollowing(that->continue_node()->info());
3543  }
3544
3545  static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
3546    that->info()->AddFromFollowing(that->loop_node()->info());
3547  }
3548
3549  static void VisitNegativeLookaroundChoiceLookaroundNode(
3550      NegativeLookaroundChoiceNode* that) {
3551    VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
3552  }
3553
3554  static void VisitNegativeLookaroundChoiceContinueNode(
3555      NegativeLookaroundChoiceNode* that) {
3556    VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
3557  }
3558
3559  static void VisitBackReference(BackReferenceNode* that) {}
3560
3561  static void VisitAssertion(AssertionNode* that) {}
3562};
3563
3564// Propagates information about the minimum size of successful matches from
3565// successor nodes to their predecessors. Note that all eats_at_least values
3566// are initialized to zero before analysis.
3567class EatsAtLeastPropagator : public AllStatic {
3568 public:
3569  static void VisitText(TextNode* that) {
3570    // The eats_at_least value is not used if reading backward.
3571    if (!that->read_backward()) {
3572      // We are not at the start after this node, and thus we can use the
3573      // successor's eats_at_least_from_not_start value.
3574      uint8_t eats_at_least = base::saturated_cast<uint8_t>(
3575          that->Length() + that->on_success()
3576                               ->eats_at_least_info()
3577                               ->eats_at_least_from_not_start);
3578      that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least));
3579    }
3580  }
3581
3582  static void VisitAction(ActionNode* that) {
3583    switch (that->action_type()) {
3584      case ActionNode::BEGIN_POSITIVE_SUBMATCH:
3585      case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3586        // We do not propagate eats_at_least data through positive lookarounds,
3587        // because they rewind input.
3588        // TODO(v8:11859) Potential approaches for fixing this include:
3589        // 1. Add a dedicated choice node for positive lookaround, similar to
3590        //    NegativeLookaroundChoiceNode.
3591        // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which
3592        //    is <= eats_at_least_from_possibly_start, and use that value in
3593        //    EatsAtLeastFromLoopEntry.
3594        DCHECK(that->eats_at_least_info()->IsZero());
3595        break;
3596      case ActionNode::SET_REGISTER_FOR_LOOP:
3597        // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the
3598        // loop body will run at least the minimum number of times before the
3599        // continuation case can run.
3600        that->set_eats_at_least_info(
3601            that->on_success()->EatsAtLeastFromLoopEntry());
3602        break;
3603      case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
3604      default:
3605        // Otherwise, the current node eats at least as much as its successor.
3606        // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH
3607        // because NegativeLookaroundChoiceNode ignores its lookaround successor
3608        // when computing eats-at-least and quick check information.
3609        that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3610        break;
3611    }
3612  }
3613
3614  static void VisitChoice(ChoiceNode* that, int i) {
3615    // The minimum possible match from a choice node is the minimum of its
3616    // successors.
3617    EatsAtLeastInfo eats_at_least =
3618        i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info();
3619    eats_at_least.SetMin(
3620        *that->alternatives()->at(i).node()->eats_at_least_info());
3621    that->set_eats_at_least_info(eats_at_least);
3622  }
3623
3624  static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3625    if (!that->read_backward()) {
3626      that->set_eats_at_least_info(
3627          *that->continue_node()->eats_at_least_info());
3628    }
3629  }
3630
3631  static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
3632
3633  static void VisitNegativeLookaroundChoiceLookaroundNode(
3634      NegativeLookaroundChoiceNode* that) {}
3635
3636  static void VisitNegativeLookaroundChoiceContinueNode(
3637      NegativeLookaroundChoiceNode* that) {
3638    that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
3639  }
3640
3641  static void VisitBackReference(BackReferenceNode* that) {
3642    if (!that->read_backward()) {
3643      that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3644    }
3645  }
3646
3647  static void VisitAssertion(AssertionNode* that) {
3648    EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info();
3649    if (that->assertion_type() == AssertionNode::AT_START) {
3650      // If we know we are not at the start and we are asked "how many
3651      // characters will you match if you succeed?" then we can answer anything
3652      // since false implies false.  So let's just set the max answer
3653      // (UINT8_MAX) since that won't prevent us from preloading a lot of
3654      // characters for the other branches in the node graph.
3655      eats_at_least.eats_at_least_from_not_start = UINT8_MAX;
3656    }
3657    that->set_eats_at_least_info(eats_at_least);
3658  }
3659};
3660
3661}  // namespace
3662
3663// -------------------------------------------------------------------
3664// Analysis
3665
3666// Iterates the node graph and provides the opportunity for propagators to set
3667// values that depend on successor nodes.
3668template <typename... Propagators>
3669class Analysis : public NodeVisitor {
3670 public:
3671  Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags)
3672      : isolate_(isolate),
3673        is_one_byte_(is_one_byte),
3674        flags_(flags),
3675        error_(RegExpError::kNone) {}
3676
3677  void EnsureAnalyzed(RegExpNode* that) {
3678    StackLimitCheck check(isolate());
3679    if (check.HasOverflowed()) {
3680      if (FLAG_correctness_fuzzer_suppressions) {
3681        FATAL("Analysis: Aborting on stack overflow");
3682      }
3683      fail(RegExpError::kAnalysisStackOverflow);
3684      return;
3685    }
3686    if (that->info()->been_analyzed || that->info()->being_analyzed) return;
3687    that->info()->being_analyzed = true;
3688    that->Accept(this);
3689    that->info()->being_analyzed = false;
3690    that->info()->been_analyzed = true;
3691  }
3692
3693  bool has_failed() { return error_ != RegExpError::kNone; }
3694  RegExpError error() {
3695    DCHECK(error_ != RegExpError::kNone);
3696    return error_;
3697  }
3698  void fail(RegExpError error) { error_ = error; }
3699
3700  Isolate* isolate() const { return isolate_; }
3701
3702  void VisitEnd(EndNode* that) override {
3703    // nothing to do
3704  }
3705
3706// Used to call the given static function on each propagator / variadic template
3707// argument.
3708#define STATIC_FOR_EACH(expr)       \
3709  do {                              \
3710    int dummy[] = {((expr), 0)...}; \
3711    USE(dummy);                     \
3712  } while (false)
3713
3714  void VisitText(TextNode* that) override {
3715    that->MakeCaseIndependent(isolate(), is_one_byte_, flags_);
3716    EnsureAnalyzed(that->on_success());
3717    if (has_failed()) return;
3718    that->CalculateOffsets();
3719    STATIC_FOR_EACH(Propagators::VisitText(that));
3720  }
3721
3722  void VisitAction(ActionNode* that) override {
3723    EnsureAnalyzed(that->on_success());
3724    if (has_failed()) return;
3725    STATIC_FOR_EACH(Propagators::VisitAction(that));
3726  }
3727
3728  void VisitChoice(ChoiceNode* that) override {
3729    for (int i = 0; i < that->alternatives()->length(); i++) {
3730      EnsureAnalyzed(that->alternatives()->at(i).node());
3731      if (has_failed()) return;
3732      STATIC_FOR_EACH(Propagators::VisitChoice(that, i));
3733    }
3734  }
3735
3736  void VisitLoopChoice(LoopChoiceNode* that) override {
3737    DCHECK_EQ(that->alternatives()->length(), 2);  // Just loop and continue.
3738
3739    // First propagate all information from the continuation node.
3740    EnsureAnalyzed(that->continue_node());
3741    if (has_failed()) return;
3742    STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
3743
3744    // Check the loop last since it may need the value of this node
3745    // to get a correct result.
3746    EnsureAnalyzed(that->loop_node());
3747    if (has_failed()) return;
3748    STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
3749  }
3750
3751  void VisitNegativeLookaroundChoice(
3752      NegativeLookaroundChoiceNode* that) override {
3753    DCHECK_EQ(that->alternatives()->length(), 2);  // Lookaround and continue.
3754
3755    EnsureAnalyzed(that->lookaround_node());
3756    if (has_failed()) return;
3757    STATIC_FOR_EACH(
3758        Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
3759
3760    EnsureAnalyzed(that->continue_node());
3761    if (has_failed()) return;
3762    STATIC_FOR_EACH(
3763        Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
3764  }
3765
3766  void VisitBackReference(BackReferenceNode* that) override {
3767    EnsureAnalyzed(that->on_success());
3768    if (has_failed()) return;
3769    STATIC_FOR_EACH(Propagators::VisitBackReference(that));
3770  }
3771
3772  void VisitAssertion(AssertionNode* that) override {
3773    EnsureAnalyzed(that->on_success());
3774    if (has_failed()) return;
3775    STATIC_FOR_EACH(Propagators::VisitAssertion(that));
3776  }
3777
3778#undef STATIC_FOR_EACH
3779
3780 private:
3781  Isolate* isolate_;
3782  const bool is_one_byte_;
3783  const RegExpFlags flags_;
3784  RegExpError error_;
3785
3786  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
3787};
3788
3789RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
3790                          RegExpNode* node) {
3791  Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(
3792      isolate, is_one_byte, flags);
3793  DCHECK_EQ(node->info()->been_analyzed, false);
3794  analysis.EnsureAnalyzed(node);
3795  DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
3796  return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
3797}
3798
3799void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3800                                     BoyerMooreLookahead* bm,
3801                                     bool not_at_start) {
3802  // Working out the set of characters that a backreference can match is too
3803  // hard, so we just say that any character can match.
3804  bm->SetRest(offset);
3805  SaveBMInfo(bm, not_at_start, offset);
3806}
3807
3808STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
3809              RegExpMacroAssembler::kTableSize);
3810
3811void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3812                              BoyerMooreLookahead* bm, bool not_at_start) {
3813  ZoneList<GuardedAlternative>* alts = alternatives();
3814  budget = (budget - 1) / alts->length();
3815  for (int i = 0; i < alts->length(); i++) {
3816    GuardedAlternative& alt = alts->at(i);
3817    if (alt.guards() != nullptr && alt.guards()->length() != 0) {
3818      bm->SetRest(offset);  // Give up trying to fill in info.
3819      SaveBMInfo(bm, not_at_start, offset);
3820      return;
3821    }
3822    alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
3823  }
3824  SaveBMInfo(bm, not_at_start, offset);
3825}
3826
3827void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
3828                            BoyerMooreLookahead* bm, bool not_at_start) {
3829  if (initial_offset >= bm->length()) return;
3830  int offset = initial_offset;
3831  int max_char = bm->max_char();
3832  for (int i = 0; i < elements()->length(); i++) {
3833    if (offset >= bm->length()) {
3834      if (initial_offset == 0) set_bm_info(not_at_start, bm);
3835      return;
3836    }
3837    TextElement text = elements()->at(i);
3838    if (text.text_type() == TextElement::ATOM) {
3839      RegExpAtom* atom = text.atom();
3840      for (int j = 0; j < atom->length(); j++, offset++) {
3841        if (offset >= bm->length()) {
3842          if (initial_offset == 0) set_bm_info(not_at_start, bm);
3843          return;
3844        }
3845        base::uc16 character = atom->data()[j];
3846        if (IsIgnoreCase(bm->compiler()->flags())) {
3847          unibrow::uchar chars[4];
3848          int length = GetCaseIndependentLetters(
3849              isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
3850              chars, 4);
3851          for (int k = 0; k < length; k++) {
3852            bm->Set(offset, chars[k]);
3853          }
3854        } else {
3855          if (character <= max_char) bm->Set(offset, character);
3856        }
3857      }
3858    } else {
3859      DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
3860      RegExpCharacterClass* char_class = text.char_class();
3861      ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
3862      if (char_class->is_negated()) {
3863        bm->SetAll(offset);
3864      } else {
3865        for (int k = 0; k < ranges->length(); k++) {
3866          CharacterRange& range = ranges->at(k);
3867          if (static_cast<int>(range.from()) > max_char) continue;
3868          int to = std::min(max_char, static_cast<int>(range.to()));
3869          bm->SetInterval(offset, Interval(range.from(), to));
3870        }
3871      }
3872      offset++;
3873    }
3874  }
3875  if (offset >= bm->length()) {
3876    if (initial_offset == 0) set_bm_info(not_at_start, bm);
3877    return;
3878  }
3879  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
3880                             true);  // Not at start after a text node.
3881  if (initial_offset == 0) set_bm_info(not_at_start, bm);
3882}
3883
3884RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
3885    RegExpNode* on_success) {
3886  DCHECK(!read_backward());
3887  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
3888      zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
3889  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
3890      zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
3891
3892  ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone());
3893
3894  int stack_register = UnicodeLookaroundStackRegister();
3895  int position_register = UnicodeLookaroundPositionRegister();
3896  RegExpNode* step_back = TextNode::CreateForCharacterRanges(
3897      zone(), lead_surrogates, true, on_success);
3898  RegExpLookaround::Builder builder(true, step_back, stack_register,
3899                                    position_register);
3900  RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
3901      zone(), trail_surrogates, false, builder.on_match_success());
3902
3903  optional_step_back->AddAlternative(
3904      GuardedAlternative(builder.ForMatch(match_trail)));
3905  optional_step_back->AddAlternative(GuardedAlternative(on_success));
3906
3907  return optional_step_back;
3908}
3909
3910RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
3911                                             RegExpFlags flags,
3912                                             bool is_one_byte) {
3913  // Wrap the body of the regexp in capture #0.
3914  RegExpNode* captured_body =
3915      RegExpCapture::ToNode(data->tree, 0, this, accept());
3916  RegExpNode* node = captured_body;
3917  if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
3918    // Add a .*? at the beginning, outside the body capture, unless
3919    // this expression is anchored at the beginning or sticky.
3920    RegExpNode* loop_node = RegExpQuantifier::ToNode(
3921        0, RegExpTree::kInfinity, false,
3922        zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
3923        this, captured_body, data->contains_anchor);
3924
3925    if (data->contains_anchor) {
3926      // Unroll loop once, to take care of the case that might start
3927      // at the start of input.
3928      ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
3929      first_step_node->AddAlternative(GuardedAlternative(captured_body));
3930      first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
3931          zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
3932          false, loop_node)));
3933      node = first_step_node;
3934    } else {
3935      node = loop_node;
3936    }
3937  }
3938  if (is_one_byte) {
3939    node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
3940    // Do it again to propagate the new nodes to places where they were not
3941    // put because they had not been calculated yet.
3942    if (node != nullptr) {
3943      node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
3944    }
3945  } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
3946    node = OptionallyStepBackToLeadSurrogate(node);
3947  }
3948
3949  if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
3950  return node;
3951}
3952
3953void RegExpCompiler::ToNodeCheckForStackOverflow() {
3954  if (StackLimitCheck{isolate()}.HasOverflowed()) {
3955    FatalProcessOutOfMemory(isolate(), "RegExpCompiler");
3956  }
3957}
3958
3959}  // namespace internal
3960}  // namespace v8
3961