1// Copyright 2020 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/experimental/experimental-interpreter.h"
6
7#include "src/base/optional.h"
8#include "src/base/strings.h"
9#include "src/common/assert-scope.h"
10#include "src/objects/fixed-array-inl.h"
11#include "src/objects/string-inl.h"
12#include "src/regexp/experimental/experimental.h"
13#include "src/strings/char-predicates-inl.h"
14#include "src/zone/zone-allocator.h"
15#include "src/zone/zone-list-inl.h"
16
17namespace v8 {
18namespace internal {
19
20namespace {
21
22constexpr int kUndefinedRegisterValue = -1;
23
24template <class Character>
25bool SatisfiesAssertion(RegExpAssertion::Type type,
26                        base::Vector<const Character> context, int position) {
27  DCHECK_LE(position, context.length());
28  DCHECK_GE(position, 0);
29
30  switch (type) {
31    case RegExpAssertion::Type::START_OF_INPUT:
32      return position == 0;
33    case RegExpAssertion::Type::END_OF_INPUT:
34      return position == context.length();
35    case RegExpAssertion::Type::START_OF_LINE:
36      if (position == 0) return true;
37      return unibrow::IsLineTerminator(context[position - 1]);
38    case RegExpAssertion::Type::END_OF_LINE:
39      if (position == context.length()) return true;
40      return unibrow::IsLineTerminator(context[position]);
41    case RegExpAssertion::Type::BOUNDARY:
42      if (context.length() == 0) {
43        return false;
44      } else if (position == 0) {
45        return IsRegExpWord(context[position]);
46      } else if (position == context.length()) {
47        return IsRegExpWord(context[position - 1]);
48      } else {
49        return IsRegExpWord(context[position - 1]) !=
50               IsRegExpWord(context[position]);
51      }
52    case RegExpAssertion::Type::NON_BOUNDARY:
53      return !SatisfiesAssertion(RegExpAssertion::Type::BOUNDARY, context,
54                                 position);
55  }
56}
57
58base::Vector<RegExpInstruction> ToInstructionVector(
59    ByteArray raw_bytes, const DisallowGarbageCollection& no_gc) {
60  RegExpInstruction* inst_begin =
61      reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress());
62  int inst_num = raw_bytes.length() / sizeof(RegExpInstruction);
63  DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length());
64  return base::Vector<RegExpInstruction>(inst_begin, inst_num);
65}
66
67template <class Character>
68base::Vector<const Character> ToCharacterVector(
69    String str, const DisallowGarbageCollection& no_gc);
70
71template <>
72base::Vector<const uint8_t> ToCharacterVector<uint8_t>(
73    String str, const DisallowGarbageCollection& no_gc) {
74  DCHECK(str.IsFlat());
75  String::FlatContent content = str.GetFlatContent(no_gc);
76  DCHECK(content.IsOneByte());
77  return content.ToOneByteVector();
78}
79
80template <>
81base::Vector<const base::uc16> ToCharacterVector<base::uc16>(
82    String str, const DisallowGarbageCollection& no_gc) {
83  DCHECK(str.IsFlat());
84  String::FlatContent content = str.GetFlatContent(no_gc);
85  DCHECK(content.IsTwoByte());
86  return content.ToUC16Vector();
87}
88
89template <class Character>
90class NfaInterpreter {
91  // Executes a bytecode program in breadth-first mode, without backtracking.
92  // `Character` can be instantiated with `uint8_t` or `base::uc16` for one byte
93  // or two byte input strings.
94  //
95  // In contrast to the backtracking implementation, this has linear time
96  // complexity in the length of the input string. Breadth-first mode means
97  // that threads are executed in lockstep with respect to their input
98  // position, i.e. the threads share a common input index.  This is similar
99  // to breadth-first simulation of a non-deterministic finite automaton (nfa),
100  // hence the name of the class.
101  //
102  // To follow the semantics of a backtracking VM implementation, we have to be
103  // careful about whether we stop execution when a thread executes ACCEPT.
104  // For example, consider execution of the bytecode generated by the regexp
105  //
106  //   r = /abc|..|[a-c]{10,}/
107  //
108  // on input "abcccccccccccccc".  Clearly the three alternatives
109  // - /abc/
110  // - /../
111  // - /[a-c]{10,}/
112  // all match this input.  A backtracking implementation will report "abc" as
113  // match, because it explores the first alternative before the others.
114  //
115  // However, if we execute breadth first, then we execute the 3 threads
116  // - t1, which tries to match /abc/
117  // - t2, which tries to match /../
118  // - t3, which tries to match /[a-c]{10,}/
119  // in lockstep i.e. by iterating over the input and feeding all threads one
120  // character at a time.  t2 will execute an ACCEPT after two characters,
121  // while t1 will only execute ACCEPT after three characters. Thus we find a
122  // match for the second alternative before a match of the first alternative.
123  //
124  // This shows that we cannot always stop searching as soon as some thread t
125  // executes ACCEPT:  If there is a thread u with higher priority than t, then
126  // it must be finished first.  If u produces a match, then we can discard the
127  // match of t because matches produced by threads with higher priority are
128  // preferred over matches of threads with lower priority.  On the other hand,
129  // we are allowed to abort all threads with lower priority than t if t
130  // produces a match: Such threads can only produce worse matches.  In the
131  // example above, we can abort t3 after two characters because of t2's match.
132  //
133  // Thus the interpreter keeps track of a priority-ordered list of threads.
134  // If a thread ACCEPTs, all threads with lower priority are discarded, and
135  // the search continues with the threads with higher priority.  If no threads
136  // with high priority are left, we return the match that was produced by the
137  // ACCEPTing thread with highest priority.
138 public:
139  NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin,
140                 ByteArray bytecode, int register_count_per_match, String input,
141                 int32_t input_index, Zone* zone)
142      : isolate_(isolate),
143        call_origin_(call_origin),
144        bytecode_object_(bytecode),
145        bytecode_(ToInstructionVector(bytecode, no_gc_)),
146        register_count_per_match_(register_count_per_match),
147        input_object_(input),
148        input_(ToCharacterVector<Character>(input, no_gc_)),
149        input_index_(input_index),
150        pc_last_input_index_(zone->NewArray<int>(bytecode.length()),
151                             bytecode.length()),
152        active_threads_(0, zone),
153        blocked_threads_(0, zone),
154        register_array_allocator_(zone),
155        best_match_registers_(base::nullopt),
156        zone_(zone) {
157    DCHECK(!bytecode_.empty());
158    DCHECK_GE(input_index_, 0);
159    DCHECK_LE(input_index_, input_.length());
160
161    std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
162  }
163
164  // Finds matches and writes their concatenated capture registers to
165  // `output_registers`.  `output_registers[i]` has to be valid for all i <
166  // output_register_count.  The search continues until all remaining matches
167  // have been found or there is no space left in `output_registers`.  Returns
168  // the number of matches found.
169  int FindMatches(int32_t* output_registers, int output_register_count) {
170    const int max_match_num = output_register_count / register_count_per_match_;
171
172    int match_num = 0;
173    while (match_num != max_match_num) {
174      int err_code = FindNextMatch();
175      if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
176
177      if (!FoundMatch()) break;
178
179      base::Vector<int> registers = *best_match_registers_;
180      output_registers =
181          std::copy(registers.begin(), registers.end(), output_registers);
182
183      ++match_num;
184
185      const int match_begin = registers[0];
186      const int match_end = registers[1];
187      DCHECK_LE(match_begin, match_end);
188      const int match_length = match_end - match_begin;
189      if (match_length != 0) {
190        SetInputIndex(match_end);
191      } else if (match_end == input_.length()) {
192        // Zero-length match, input exhausted.
193        SetInputIndex(match_end);
194        break;
195      } else {
196        // Zero-length match, more input.  We don't want to report more matches
197        // here endlessly, so we advance by 1.
198        SetInputIndex(match_end + 1);
199
200        // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to
201        // the next codepoint, not to the next code unit. See also
202        // `RegExpUtils::AdvanceStringIndex`.
203        STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode);
204      }
205    }
206
207    return match_num;
208  }
209
210 private:
211  // The state of a "thread" executing experimental regexp bytecode.  (Not to
212  // be confused with an OS thread.)
213  struct InterpreterThread {
214    // This thread's program counter, i.e. the index within `bytecode_` of the
215    // next instruction to be executed.
216    int pc;
217    // Pointer to the array of registers, which is always size
218    // `register_count_per_match_`.  Should be deallocated with
219    // `register_array_allocator_`.
220    int* register_array_begin;
221  };
222
223  // Handles pending interrupts if there are any.  Returns
224  // RegExp::kInternalRegExpSuccess if execution can continue, and an error
225  // code otherwise.
226  int HandleInterrupts() {
227    StackLimitCheck check(isolate_);
228    if (call_origin_ == RegExp::CallOrigin::kFromJs) {
229      // Direct calls from JavaScript can be interrupted in two ways:
230      // 1. A real stack overflow, in which case we let the caller throw the
231      //    exception.
232      // 2. The stack guard was used to interrupt execution for another purpose,
233      //    forcing the call through the runtime system.
234      if (check.JsHasOverflowed()) {
235        return RegExp::kInternalRegExpException;
236      } else if (check.InterruptRequested()) {
237        return RegExp::kInternalRegExpRetry;
238      }
239    } else {
240      DCHECK(call_origin_ == RegExp::CallOrigin::kFromRuntime);
241      HandleScope handles(isolate_);
242      Handle<ByteArray> bytecode_handle(bytecode_object_, isolate_);
243      Handle<String> input_handle(input_object_, isolate_);
244
245      if (check.JsHasOverflowed()) {
246        // We abort the interpreter now anyway, so gc can't invalidate any
247        // pointers.
248        AllowGarbageCollection yes_gc;
249        isolate_->StackOverflow();
250        return RegExp::kInternalRegExpException;
251      } else if (check.InterruptRequested()) {
252        // TODO(mbid): Is this really equivalent to whether the string is
253        // one-byte or two-byte? A comment at the declaration of
254        // IsOneByteRepresentationUnderneath says that this might fail for
255        // external strings.
256        const bool was_one_byte =
257            String::IsOneByteRepresentationUnderneath(input_object_);
258
259        Object result;
260        {
261          AllowGarbageCollection yes_gc;
262          result = isolate_->stack_guard()->HandleInterrupts();
263        }
264        if (result.IsException(isolate_)) {
265          return RegExp::kInternalRegExpException;
266        }
267
268        // If we changed between a LATIN1 and a UC16 string, we need to restart
269        // regexp matching with the appropriate template instantiation of
270        // RawMatch.
271        if (String::IsOneByteRepresentationUnderneath(*input_handle) !=
272            was_one_byte) {
273          return RegExp::kInternalRegExpRetry;
274        }
275
276        // Update objects and pointers in case they have changed during gc.
277        bytecode_object_ = *bytecode_handle;
278        bytecode_ = ToInstructionVector(bytecode_object_, no_gc_);
279        input_object_ = *input_handle;
280        input_ = ToCharacterVector<Character>(input_object_, no_gc_);
281      }
282    }
283    return RegExp::kInternalRegExpSuccess;
284  }
285
286  // Change the current input index for future calls to `FindNextMatch`.
287  void SetInputIndex(int new_input_index) {
288    DCHECK_GE(input_index_, 0);
289    DCHECK_LE(input_index_, input_.length());
290
291    input_index_ = new_input_index;
292  }
293
294  // Find the next match and return the corresponding capture registers and
295  // write its capture registers to `best_match_registers_`.  The search starts
296  // at the current `input_index_`.  Returns RegExp::kInternalRegExpSuccess if
297  // execution could finish regularly (with or without a match) and an error
298  // code due to interrupt otherwise.
299  int FindNextMatch() {
300    DCHECK(active_threads_.is_empty());
301    // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
302    // here? As long as
303    //
304    //   pc_last_input_index_[pc] < input_index_
305    //
306    // for all possible program counters pc that are reachable without input
307    // from pc = 0 and
308    //
309    //   pc_last_input_index_[k] <= input_index_
310    //
311    // for all k > 0 hold I think everything should be fine.  Maybe we can do
312    // something about this in `SetInputIndex`.
313    std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
314
315    // Clean up left-over data from a previous call to FindNextMatch.
316    for (InterpreterThread t : blocked_threads_) {
317      DestroyThread(t);
318    }
319    blocked_threads_.DropAndClear();
320
321    for (InterpreterThread t : active_threads_) {
322      DestroyThread(t);
323    }
324    active_threads_.DropAndClear();
325
326    if (best_match_registers_.has_value()) {
327      FreeRegisterArray(best_match_registers_->begin());
328      best_match_registers_ = base::nullopt;
329    }
330
331    // All threads start at bytecode 0.
332    active_threads_.Add(
333        InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_);
334    // Run the initial thread, potentially forking new threads, until every
335    // thread is blocked without further input.
336    RunActiveThreads();
337
338    // We stop if one of the following conditions hold:
339    // - We have exhausted the entire input.
340    // - We have found a match at some point, and there are no remaining
341    //   threads with higher priority than the thread that produced the match.
342    //   Threads with low priority have been aborted earlier, and the remaining
343    //   threads are blocked here, so the latter simply means that
344    //   `blocked_threads_` is empty.
345    while (input_index_ != input_.length() &&
346           !(FoundMatch() && blocked_threads_.is_empty())) {
347      DCHECK(active_threads_.is_empty());
348      base::uc16 input_char = input_[input_index_];
349      ++input_index_;
350
351      static constexpr int kTicksBetweenInterruptHandling = 64;
352      if (input_index_ % kTicksBetweenInterruptHandling == 0) {
353        int err_code = HandleInterrupts();
354        if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
355      }
356
357      // We unblock all blocked_threads_ by feeding them the input char.
358      FlushBlockedThreads(input_char);
359
360      // Run all threads until they block or accept.
361      RunActiveThreads();
362    }
363
364    return RegExp::kInternalRegExpSuccess;
365  }
366
367  // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
368  // instruction, or its PC value was already processed.
369  // - If processing of `t` can't continue because of CONSUME_RANGE, it is
370  //   pushed on `blocked_threads_`.
371  // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and
372  //   the current input index. All remaining `active_threads_` are discarded.
373  void RunActiveThread(InterpreterThread t) {
374    while (true) {
375      if (IsPcProcessed(t.pc)) return;
376      MarkPcProcessed(t.pc);
377
378      RegExpInstruction inst = bytecode_[t.pc];
379      switch (inst.opcode) {
380        case RegExpInstruction::CONSUME_RANGE: {
381          blocked_threads_.Add(t, zone_);
382          return;
383        }
384        case RegExpInstruction::ASSERTION:
385          if (!SatisfiesAssertion(inst.payload.assertion_type, input_,
386                                  input_index_)) {
387            DestroyThread(t);
388            return;
389          }
390          ++t.pc;
391          break;
392        case RegExpInstruction::FORK: {
393          InterpreterThread fork{inst.payload.pc,
394                                 NewRegisterArrayUninitialized()};
395          base::Vector<int> fork_registers = GetRegisterArray(fork);
396          base::Vector<int> t_registers = GetRegisterArray(t);
397          DCHECK_EQ(fork_registers.length(), t_registers.length());
398          std::copy(t_registers.begin(), t_registers.end(),
399                    fork_registers.begin());
400          active_threads_.Add(fork, zone_);
401
402          ++t.pc;
403          break;
404        }
405        case RegExpInstruction::JMP:
406          t.pc = inst.payload.pc;
407          break;
408        case RegExpInstruction::ACCEPT:
409          if (best_match_registers_.has_value()) {
410            FreeRegisterArray(best_match_registers_->begin());
411          }
412          best_match_registers_ = GetRegisterArray(t);
413
414          for (InterpreterThread s : active_threads_) {
415            FreeRegisterArray(s.register_array_begin);
416          }
417          active_threads_.DropAndClear();
418          return;
419        case RegExpInstruction::SET_REGISTER_TO_CP:
420          GetRegisterArray(t)[inst.payload.register_index] = input_index_;
421          ++t.pc;
422          break;
423        case RegExpInstruction::CLEAR_REGISTER:
424          GetRegisterArray(t)[inst.payload.register_index] =
425              kUndefinedRegisterValue;
426          ++t.pc;
427          break;
428      }
429    }
430  }
431
432  // Run each active thread until it can't continue without further input.
433  // `active_threads_` is empty afterwards.  `blocked_threads_` are sorted from
434  // low to high priority.
435  void RunActiveThreads() {
436    while (!active_threads_.is_empty()) {
437      RunActiveThread(active_threads_.RemoveLast());
438    }
439  }
440
441  // Unblock all blocked_threads_ by feeding them an `input_char`.  Should only
442  // be called with `input_index_` pointing to the character *after*
443  // `input_char` so that `pc_last_input_index_` is updated correctly.
444  void FlushBlockedThreads(base::uc16 input_char) {
445    // The threads in blocked_threads_ are sorted from high to low priority,
446    // but active_threads_ needs to be sorted from low to high priority, so we
447    // need to activate blocked threads in reverse order.
448    for (int i = blocked_threads_.length() - 1; i >= 0; --i) {
449      InterpreterThread t = blocked_threads_[i];
450      RegExpInstruction inst = bytecode_[t.pc];
451      DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
452      RegExpInstruction::Uc16Range range = inst.payload.consume_range;
453      if (input_char >= range.min && input_char <= range.max) {
454        ++t.pc;
455        active_threads_.Add(t, zone_);
456      } else {
457        DestroyThread(t);
458      }
459    }
460    blocked_threads_.DropAndClear();
461  }
462
463  bool FoundMatch() const { return best_match_registers_.has_value(); }
464
465  base::Vector<int> GetRegisterArray(InterpreterThread t) {
466    return base::Vector<int>(t.register_array_begin, register_count_per_match_);
467  }
468
469  int* NewRegisterArrayUninitialized() {
470    return register_array_allocator_.allocate(register_count_per_match_);
471  }
472
473  int* NewRegisterArray(int fill_value) {
474    int* array_begin = NewRegisterArrayUninitialized();
475    int* array_end = array_begin + register_count_per_match_;
476    std::fill(array_begin, array_end, fill_value);
477    return array_begin;
478  }
479
480  void FreeRegisterArray(int* register_array_begin) {
481    register_array_allocator_.deallocate(register_array_begin,
482                                         register_count_per_match_);
483  }
484
485  void DestroyThread(InterpreterThread t) {
486    FreeRegisterArray(t.register_array_begin);
487  }
488
489  // It is redundant to have two threads t, t0 execute at the same PC value,
490  // because one of t, t0 matches iff the other does.  We can thus discard
491  // the one with lower priority.  We check whether a thread executed at some
492  // PC value by recording for every possible value of PC what the value of
493  // input_index_ was the last time a thread executed at PC. If a thread
494  // tries to continue execution at a PC value that we have seen before at
495  // the current input index, we abort it. (We execute threads with higher
496  // priority first, so the second thread is guaranteed to have lower
497  // priority.)
498  //
499  // Check whether we've seen an active thread with a given pc value since the
500  // last increment of `input_index_`.
501  bool IsPcProcessed(int pc) {
502    DCHECK_LE(pc_last_input_index_[pc], input_index_);
503    return pc_last_input_index_[pc] == input_index_;
504  }
505
506  // Mark a pc as having been processed since the last increment of
507  // `input_index_`.
508  void MarkPcProcessed(int pc) {
509    DCHECK_LE(pc_last_input_index_[pc], input_index_);
510    pc_last_input_index_[pc] = input_index_;
511  }
512
513  Isolate* const isolate_;
514
515  const RegExp::CallOrigin call_origin_;
516
517  DisallowGarbageCollection no_gc_;
518
519  ByteArray bytecode_object_;
520  base::Vector<const RegExpInstruction> bytecode_;
521
522  // Number of registers used per thread.
523  const int register_count_per_match_;
524
525  String input_object_;
526  base::Vector<const Character> input_;
527  int input_index_;
528
529  // pc_last_input_index_[k] records the value of input_index_ the last
530  // time a thread t such that t.pc == k was activated, i.e. put on
531  // active_threads_.  Thus pc_last_input_index.size() == bytecode.size().  See
532  // also `RunActiveThread`.
533  base::Vector<int> pc_last_input_index_;
534
535  // Active threads can potentially (but not necessarily) continue without
536  // input.  Sorted from low to high priority.
537  ZoneList<InterpreterThread> active_threads_;
538
539  // The pc of a blocked thread points to an instruction that consumes a
540  // character. Sorted from high to low priority (so the opposite of
541  // `active_threads_`).
542  ZoneList<InterpreterThread> blocked_threads_;
543
544  // RecyclingZoneAllocator maintains a linked list through freed allocations
545  // for reuse if possible.
546  RecyclingZoneAllocator<int> register_array_allocator_;
547
548  // The register array of the best match found so far during the current
549  // search.  If several threads ACCEPTed, then this will be the register array
550  // of the accepting thread with highest priority.  Should be deallocated with
551  // `register_array_allocator_`.
552  base::Optional<base::Vector<int>> best_match_registers_;
553
554  Zone* zone_;
555};
556
557}  // namespace
558
559int ExperimentalRegExpInterpreter::FindMatches(
560    Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray bytecode,
561    int register_count_per_match, String input, int start_index,
562    int32_t* output_registers, int output_register_count, Zone* zone) {
563  DCHECK(input.IsFlat());
564  DisallowGarbageCollection no_gc;
565
566  if (input.GetFlatContent(no_gc).IsOneByte()) {
567    NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode,
568                                        register_count_per_match, input,
569                                        start_index, zone);
570    return interpreter.FindMatches(output_registers, output_register_count);
571  } else {
572    DCHECK(input.GetFlatContent(no_gc).IsTwoByte());
573    NfaInterpreter<base::uc16> interpreter(isolate, call_origin, bytecode,
574                                           register_count_per_match, input,
575                                           start_index, zone);
576    return interpreter.FindMatches(output_registers, output_register_count);
577  }
578}
579
580}  // namespace internal
581}  // namespace v8
582