1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_COMPILER_H_
6 #define V8_REGEXP_REGEXP_COMPILER_H_
7 
8 #include <bitset>
9 
10 #include "src/base/small-vector.h"
11 #include "src/base/strings.h"
12 #include "src/regexp/regexp-flags.h"
13 #include "src/regexp/regexp-nodes.h"
14 
15 namespace v8 {
16 namespace internal {
17 
18 class DynamicBitSet;
19 class Isolate;
20 
21 namespace regexp_compiler_constants {
22 
23 // The '2' variant is has inclusive from and exclusive to.
24 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
25 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
26 constexpr base::uc32 kRangeEndMarker = 0x110000;
27 constexpr int kSpaceRanges[] = {
28     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
29     0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
30     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
31 constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
32 
33 constexpr int kWordRanges[] = {'0',     '9' + 1, 'A',     'Z' + 1,        '_',
34                                '_' + 1, 'a',     'z' + 1, kRangeEndMarker};
35 constexpr int kWordRangeCount = arraysize(kWordRanges);
36 constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
37 constexpr int kDigitRangeCount = arraysize(kDigitRanges);
38 constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
39                                     kLeadSurrogateStart + 1, kRangeEndMarker};
40 constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
41 constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D,         0x000E,
42                                          0x2028, 0x202A, kRangeEndMarker};
43 constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
44 
45 // More makes code generation slower, less makes V8 benchmark score lower.
46 constexpr int kMaxLookaheadForBoyerMoore = 8;
47 // In a 3-character pattern you can maximally step forwards 3 characters
48 // at a time, which is not always enough to pay for the extra logic.
49 constexpr int kPatternTooShortForBoyerMoore = 2;
50 
51 }  // namespace regexp_compiler_constants
52 
NeedsUnicodeCaseEquivalents(RegExpFlags flags)53 inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) {
54   // Both unicode and ignore_case flags are set. We need to use ICU to find
55   // the closure over case equivalents.
56   return IsUnicode(flags) && IsIgnoreCase(flags);
57 }
58 
59 // Details of a quick mask-compare check that can look ahead in the
60 // input stream.
61 class QuickCheckDetails {
62  public:
QuickCheckDetails()63   QuickCheckDetails()
64       : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
QuickCheckDetails(int characters)65   explicit QuickCheckDetails(int characters)
66       : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
67   bool Rationalize(bool one_byte);
68   // Merge in the information from another branch of an alternation.
69   void Merge(QuickCheckDetails* other, int from_index);
70   // Advance the current position by some amount.
71   void Advance(int by, bool one_byte);
72   void Clear();
cannot_match()73   bool cannot_match() { return cannot_match_; }
set_cannot_match()74   void set_cannot_match() { cannot_match_ = true; }
75   struct Position {
Positionv8::internal::QuickCheckDetails::Position76     Position() : mask(0), value(0), determines_perfectly(false) {}
77     base::uc32 mask;
78     base::uc32 value;
79     bool determines_perfectly;
80   };
characters()81   int characters() { return characters_; }
set_characters(int characters)82   void set_characters(int characters) { characters_ = characters; }
positions(int index)83   Position* positions(int index) {
84     DCHECK_LE(0, index);
85     DCHECK_GT(characters_, index);
86     return positions_ + index;
87   }
mask()88   uint32_t mask() { return mask_; }
value()89   uint32_t value() { return value_; }
90 
91  private:
92   // How many characters do we have quick check information from.  This is
93   // the same for all branches of a choice node.
94   int characters_;
95   Position positions_[4];
96   // These values are the condensate of the above array after Rationalize().
97   uint32_t mask_;
98   uint32_t value_;
99   // If set to true, there is no way this quick check can match at all.
100   // E.g., if it requires to be at the start of the input, and isn't.
101   bool cannot_match_;
102 };
103 
104 // Improve the speed that we scan for an initial point where a non-anchored
105 // regexp can match by using a Boyer-Moore-like table. This is done by
106 // identifying non-greedy non-capturing loops in the nodes that eat any
107 // character one at a time.  For example in the middle of the regexp
108 // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
109 // inserted at the start of any non-anchored regexp.
110 //
111 // When we have found such a loop we look ahead in the nodes to find the set of
112 // characters that can come at given distances. For example for the regexp
113 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
114 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
115 // the lookahead info where the set of characters is reasonably constrained. In
116 // our example this is from index 1 to 2 (0 is not constrained). We can now
117 // look 3 characters ahead and if we don't find one of [f, o] (the union of
118 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
119 //
120 // For Unicode input strings we do the same, but modulo 128.
121 //
122 // We also look at the first string fed to the regexp and use that to get a hint
123 // of the character frequencies in the inputs. This affects the assessment of
124 // whether the set of characters is 'reasonably constrained'.
125 //
126 // We also have another lookahead mechanism (called quick check in the code),
127 // which uses a wide load of multiple characters followed by a mask and compare
128 // to determine whether a match is possible at this point.
129 enum ContainedInLattice {
130   kNotYet = 0,
131   kLatticeIn = 1,
132   kLatticeOut = 2,
133   kLatticeUnknown = 3  // Can also mean both in and out.
134 };
135 
Combine(ContainedInLattice a, ContainedInLattice b)136 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
137   return static_cast<ContainedInLattice>(a | b);
138 }
139 
140 class BoyerMoorePositionInfo : public ZoneObject {
141  public:
at(int i) const142   bool at(int i) const { return map_[i]; }
143 
144   static constexpr int kMapSize = 128;
145   static constexpr int kMask = kMapSize - 1;
146 
map_count() const147   int map_count() const { return map_count_; }
148 
149   void Set(int character);
150   void SetInterval(const Interval& interval);
151   void SetAll();
152 
is_non_word()153   bool is_non_word() { return w_ == kLatticeOut; }
is_word()154   bool is_word() { return w_ == kLatticeIn; }
155 
156   using Bitset = std::bitset<kMapSize>;
raw_bitset() const157   Bitset raw_bitset() const { return map_; }
158 
159  private:
160   Bitset map_;
161   int map_count_ = 0;               // Number of set bits in the map.
162   ContainedInLattice w_ = kNotYet;  // The \w character class.
163 };
164 
165 class BoyerMooreLookahead : public ZoneObject {
166  public:
167   BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
168 
length()169   int length() { return length_; }
max_char()170   int max_char() { return max_char_; }
compiler()171   RegExpCompiler* compiler() { return compiler_; }
172 
Count(int map_number)173   int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
174 
at(int i)175   BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
176 
Set(int map_number, int character)177   void Set(int map_number, int character) {
178     if (character > max_char_) return;
179     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
180     info->Set(character);
181   }
182 
SetInterval(int map_number, const Interval& interval)183   void SetInterval(int map_number, const Interval& interval) {
184     if (interval.from() > max_char_) return;
185     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
186     if (interval.to() > max_char_) {
187       info->SetInterval(Interval(interval.from(), max_char_));
188     } else {
189       info->SetInterval(interval);
190     }
191   }
192 
SetAll(int map_number)193   void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
194 
SetRest(int from_map)195   void SetRest(int from_map) {
196     for (int i = from_map; i < length_; i++) SetAll(i);
197   }
198   void EmitSkipInstructions(RegExpMacroAssembler* masm);
199 
200  private:
201   // This is the value obtained by EatsAtLeast.  If we do not have at least this
202   // many characters left in the sample string then the match is bound to fail.
203   // Therefore it is OK to read a character this far ahead of the current match
204   // point.
205   int length_;
206   RegExpCompiler* compiler_;
207   // 0xff for Latin1, 0xffff for UTF-16.
208   int max_char_;
209   ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
210 
211   int GetSkipTable(int min_lookahead, int max_lookahead,
212                    Handle<ByteArray> boolean_skip_table);
213   bool FindWorthwhileInterval(int* from, int* to);
214   int FindBestInterval(int max_number_of_chars, int old_biggest_points,
215                        int* from, int* to);
216 };
217 
218 // There are many ways to generate code for a node.  This class encapsulates
219 // the current way we should be generating.  In other words it encapsulates
220 // the current state of the code generator.  The effect of this is that we
221 // generate code for paths that the matcher can take through the regular
222 // expression.  A given node in the regexp can be code-generated several times
223 // as it can be part of several traces.  For example for the regexp:
224 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
225 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
226 // to match foo is generated only once (the traces have a common prefix).  The
227 // code to store the capture is deferred and generated (twice) after the places
228 // where baz has been matched.
229 class Trace {
230  public:
231   // A value for a property that is either known to be true, know to be false,
232   // or not known.
233   enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
234 
235   class DeferredAction {
236    public:
DeferredAction(ActionNode::ActionType action_type, int reg)237     DeferredAction(ActionNode::ActionType action_type, int reg)
238         : action_type_(action_type), reg_(reg), next_(nullptr) {}
next()239     DeferredAction* next() { return next_; }
240     bool Mentions(int reg);
reg()241     int reg() { return reg_; }
action_type()242     ActionNode::ActionType action_type() { return action_type_; }
243 
244    private:
245     ActionNode::ActionType action_type_;
246     int reg_;
247     DeferredAction* next_;
248     friend class Trace;
249   };
250 
251   class DeferredCapture : public DeferredAction {
252    public:
DeferredCapture(int reg, bool is_capture, Trace* trace)253     DeferredCapture(int reg, bool is_capture, Trace* trace)
254         : DeferredAction(ActionNode::STORE_POSITION, reg),
255           cp_offset_(trace->cp_offset()),
256           is_capture_(is_capture) {}
cp_offset()257     int cp_offset() { return cp_offset_; }
is_capture()258     bool is_capture() { return is_capture_; }
259 
260    private:
261     int cp_offset_;
262     bool is_capture_;
set_cp_offset(int cp_offset)263     void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
264   };
265 
266   class DeferredSetRegisterForLoop : public DeferredAction {
267    public:
DeferredSetRegisterForLoop(int reg, int value)268     DeferredSetRegisterForLoop(int reg, int value)
269         : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
270           value_(value) {}
value()271     int value() { return value_; }
272 
273    private:
274     int value_;
275   };
276 
277   class DeferredClearCaptures : public DeferredAction {
278    public:
DeferredClearCaptures(Interval range)279     explicit DeferredClearCaptures(Interval range)
280         : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
range()281     Interval range() { return range_; }
282 
283    private:
284     Interval range_;
285   };
286 
287   class DeferredIncrementRegister : public DeferredAction {
288    public:
DeferredIncrementRegister(int reg)289     explicit DeferredIncrementRegister(int reg)
290         : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
291   };
292 
Trace()293   Trace()
294       : cp_offset_(0),
295         actions_(nullptr),
296         backtrack_(nullptr),
297         stop_node_(nullptr),
298         loop_label_(nullptr),
299         characters_preloaded_(0),
300         bound_checked_up_to_(0),
301         flush_budget_(100),
302         at_start_(UNKNOWN) {}
303 
304   // End the trace.  This involves flushing the deferred actions in the trace
305   // and pushing a backtrack location onto the backtrack stack.  Once this is
306   // done we can start a new trace or go to one that has already been
307   // generated.
308   void Flush(RegExpCompiler* compiler, RegExpNode* successor);
cp_offset()309   int cp_offset() { return cp_offset_; }
actions()310   DeferredAction* actions() { return actions_; }
311   // A trivial trace is one that has no deferred actions or other state that
312   // affects the assumptions used when generating code.  There is no recorded
313   // backtrack location in a trivial trace, so with a trivial trace we will
314   // generate code that, on a failure to match, gets the backtrack location
315   // from the backtrack stack rather than using a direct jump instruction.  We
316   // always start code generation with a trivial trace and non-trivial traces
317   // are created as we emit code for nodes or add to the list of deferred
318   // actions in the trace.  The location of the code generated for a node using
319   // a trivial trace is recorded in a label in the node so that gotos can be
320   // generated to that code.
is_trivial()321   bool is_trivial() {
322     return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
323            characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
324            quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
325   }
at_start()326   TriBool at_start() { return at_start_; }
set_at_start(TriBool at_start)327   void set_at_start(TriBool at_start) { at_start_ = at_start; }
backtrack()328   Label* backtrack() { return backtrack_; }
loop_label()329   Label* loop_label() { return loop_label_; }
stop_node()330   RegExpNode* stop_node() { return stop_node_; }
characters_preloaded()331   int characters_preloaded() { return characters_preloaded_; }
bound_checked_up_to()332   int bound_checked_up_to() { return bound_checked_up_to_; }
flush_budget()333   int flush_budget() { return flush_budget_; }
quick_check_performed()334   QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
335   bool mentions_reg(int reg);
336   // Returns true if a deferred position store exists to the specified
337   // register and stores the offset in the out-parameter.  Otherwise
338   // returns false.
339   bool GetStoredPosition(int reg, int* cp_offset);
340   // These set methods and AdvanceCurrentPositionInTrace should be used only on
341   // new traces - the intention is that traces are immutable after creation.
add_action(DeferredAction* new_action)342   void add_action(DeferredAction* new_action) {
343     DCHECK(new_action->next_ == nullptr);
344     new_action->next_ = actions_;
345     actions_ = new_action;
346   }
set_backtrack(Label* backtrack)347   void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
set_stop_node(RegExpNode* node)348   void set_stop_node(RegExpNode* node) { stop_node_ = node; }
set_loop_label(Label* label)349   void set_loop_label(Label* label) { loop_label_ = label; }
set_characters_preloaded(int count)350   void set_characters_preloaded(int count) { characters_preloaded_ = count; }
set_bound_checked_up_to(int to)351   void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
set_flush_budget(int to)352   void set_flush_budget(int to) { flush_budget_ = to; }
set_quick_check_performed(QuickCheckDetails* d)353   void set_quick_check_performed(QuickCheckDetails* d) {
354     quick_check_performed_ = *d;
355   }
356   void InvalidateCurrentCharacter();
357   void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
358 
359  private:
360   int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
361   void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
362                               const DynamicBitSet& affected_registers,
363                               DynamicBitSet* registers_to_pop,
364                               DynamicBitSet* registers_to_clear, Zone* zone);
365   void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
366                                 const DynamicBitSet& registers_to_pop,
367                                 const DynamicBitSet& registers_to_clear);
368   int cp_offset_;
369   DeferredAction* actions_;
370   Label* backtrack_;
371   RegExpNode* stop_node_;
372   Label* loop_label_;
373   int characters_preloaded_;
374   int bound_checked_up_to_;
375   QuickCheckDetails quick_check_performed_;
376   int flush_budget_;
377   TriBool at_start_;
378 };
379 
380 class GreedyLoopState {
381  public:
382   explicit GreedyLoopState(bool not_at_start);
383 
label()384   Label* label() { return &label_; }
counter_backtrack_trace()385   Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
386 
387  private:
388   Label label_;
389   Trace counter_backtrack_trace_;
390 };
391 
392 struct PreloadState {
393   static const int kEatsAtLeastNotYetInitialized = -1;
394   bool preload_is_current_;
395   bool preload_has_checked_bounds_;
396   int preload_characters_;
397   int eats_at_least_;
initv8::internal::PreloadState398   void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
399 };
400 
401 // Analysis performs assertion propagation and computes eats_at_least_ values.
402 // See the comments on AssertionPropagator and EatsAtLeastPropagator for more
403 // details.
404 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
405                           RegExpNode* node);
406 
407 class FrequencyCollator {
408  public:
FrequencyCollator()409   FrequencyCollator() : total_samples_(0) {
410     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
411       frequencies_[i] = CharacterFrequency(i);
412     }
413   }
414 
CountCharacter(int character)415   void CountCharacter(int character) {
416     int index = (character & RegExpMacroAssembler::kTableMask);
417     frequencies_[index].Increment();
418     total_samples_++;
419   }
420 
421   // Does not measure in percent, but rather per-128 (the table size from the
422   // regexp macro assembler).
Frequency(int in_character)423   int Frequency(int in_character) {
424     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
425     if (total_samples_ < 1) return 1;  // Division by zero.
426     int freq_in_per128 =
427         (frequencies_[in_character].counter() * 128) / total_samples_;
428     return freq_in_per128;
429   }
430 
431  private:
432   class CharacterFrequency {
433    public:
CharacterFrequency()434     CharacterFrequency() : counter_(0), character_(-1) {}
CharacterFrequency(int character)435     explicit CharacterFrequency(int character)
436         : counter_(0), character_(character) {}
437 
Increment()438     void Increment() { counter_++; }
counter()439     int counter() { return counter_; }
character()440     int character() { return character_; }
441 
442    private:
443     int counter_;
444     int character_;
445   };
446 
447  private:
448   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
449   int total_samples_;
450 };
451 
452 class RegExpCompiler {
453  public:
454   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
455                  RegExpFlags flags, bool is_one_byte);
456 
AllocateRegister()457   int AllocateRegister() {
458     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
459       reg_exp_too_big_ = true;
460       return next_register_;
461     }
462     return next_register_++;
463   }
464 
465   // Lookarounds to match lone surrogates for unicode character class matches
466   // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()467   int UnicodeLookaroundStackRegister() {
468     if (unicode_lookaround_stack_register_ == kNoRegister) {
469       unicode_lookaround_stack_register_ = AllocateRegister();
470     }
471     return unicode_lookaround_stack_register_;
472   }
473 
UnicodeLookaroundPositionRegister()474   int UnicodeLookaroundPositionRegister() {
475     if (unicode_lookaround_position_register_ == kNoRegister) {
476       unicode_lookaround_position_register_ = AllocateRegister();
477     }
478     return unicode_lookaround_position_register_;
479   }
480 
481   struct CompilationResult final {
CompilationResultv8::internal::RegExpCompiler::final482     explicit CompilationResult(RegExpError err) : error(err) {}
CompilationResultv8::internal::RegExpCompiler::final483     CompilationResult(Handle<Object> code, int registers)
484         : code(code), num_registers(registers) {}
485 
RegExpTooBigv8::internal::RegExpCompiler::final486     static CompilationResult RegExpTooBig() {
487       return CompilationResult(RegExpError::kTooLarge);
488     }
489 
Succeededv8::internal::RegExpCompiler::final490     bool Succeeded() const { return error == RegExpError::kNone; }
491 
492     const RegExpError error = RegExpError::kNone;
493     Handle<Object> code;
494     int num_registers = 0;
495   };
496 
497   CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
498                              RegExpNode* start, int capture_count,
499                              Handle<String> pattern);
500 
501   // Preprocessing is the final step of node creation before analysis
502   // and assembly. It includes:
503   // - Wrapping the body of the regexp in capture 0.
504   // - Inserting the implicit .* before/after the regexp if necessary.
505   // - If the input is a one-byte string, filtering out nodes that can't match.
506   // - Fixing up regexp matches that start within a surrogate pair.
507   RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags,
508                                bool is_one_byte);
509 
510   // If the regexp matching starts within a surrogate pair, step back to the
511   // lead surrogate and start matching from there.
512   RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success);
513 
AddWork(RegExpNode* node)514   inline void AddWork(RegExpNode* node) {
515     if (!node->on_work_list() && !node->label()->is_bound()) {
516       node->set_on_work_list(true);
517       work_list_->push_back(node);
518     }
519   }
520 
521   static const int kImplementationOffset = 0;
522   static const int kNumberOfRegistersOffset = 0;
523   static const int kCodeOffset = 1;
524 
macro_assembler()525   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()526   EndNode* accept() { return accept_; }
527 
528   static const int kMaxRecursion = 100;
recursion_depth()529   inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()530   inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()531   inline void DecrementRecursionDepth() { recursion_depth_--; }
532 
flags() const533   RegExpFlags flags() const { return flags_; }
534 
SetRegExpTooBig()535   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
536 
one_byte()537   inline bool one_byte() { return one_byte_; }
optimize()538   inline bool optimize() { return optimize_; }
set_optimize(bool value)539   inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()540   inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)541   inline void set_limiting_recursion(bool value) {
542     limiting_recursion_ = value;
543   }
read_backward()544   bool read_backward() { return read_backward_; }
set_read_backward(bool value)545   void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()546   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
547 
current_expansion_factor()548   int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)549   void set_current_expansion_factor(int value) {
550     current_expansion_factor_ = value;
551   }
552 
553   // The recursive nature of ToNode node generation means we may run into stack
554   // overflow issues. We introduce periodic checks to detect these, and the
555   // tick counter helps limit overhead of these checks.
556   // TODO(jgruber): This is super hacky and should be replaced by an abort
557   // mechanism or iterative node generation.
ToNodeMaybeCheckForStackOverflow()558   void ToNodeMaybeCheckForStackOverflow() {
559     if ((to_node_overflow_check_ticks_++ % 16 == 0)) {
560       ToNodeCheckForStackOverflow();
561     }
562   }
563   void ToNodeCheckForStackOverflow();
564 
isolate() const565   Isolate* isolate() const { return isolate_; }
zone() const566   Zone* zone() const { return zone_; }
567 
568   static const int kNoRegister = -1;
569 
570  private:
571   EndNode* accept_;
572   int next_register_;
573   int unicode_lookaround_stack_register_;
574   int unicode_lookaround_position_register_;
575   ZoneVector<RegExpNode*>* work_list_;
576   int recursion_depth_;
577   const RegExpFlags flags_;
578   RegExpMacroAssembler* macro_assembler_;
579   bool one_byte_;
580   bool reg_exp_too_big_;
581   bool limiting_recursion_;
582   int to_node_overflow_check_ticks_ = 0;
583   bool optimize_;
584   bool read_backward_;
585   int current_expansion_factor_;
586   FrequencyCollator frequency_collator_;
587   Isolate* isolate_;
588   Zone* zone_;
589 };
590 
591 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
592 class UnicodeRangeSplitter {
593  public:
594   V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
595 
596   static constexpr int kInitialSize = 8;
597   using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
598 
bmp() const599   const CharacterRangeVector* bmp() const { return &bmp_; }
lead_surrogates() const600   const CharacterRangeVector* lead_surrogates() const {
601     return &lead_surrogates_;
602   }
trail_surrogates() const603   const CharacterRangeVector* trail_surrogates() const {
604     return &trail_surrogates_;
605   }
non_bmp() const606   const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
607 
608  private:
609   void AddRange(CharacterRange range);
610 
611   CharacterRangeVector bmp_;
612   CharacterRangeVector lead_surrogates_;
613   CharacterRangeVector trail_surrogates_;
614   CharacterRangeVector non_bmp_;
615 };
616 
617 // We need to check for the following characters: 0x39C 0x3BC 0x178.
618 // TODO(jgruber): Move to CharacterRange.
619 bool RangeContainsLatin1Equivalents(CharacterRange range);
620 
621 }  // namespace internal
622 }  // namespace v8
623 
624 #endif  // V8_REGEXP_REGEXP_COMPILER_H_
625