11cb0ef41Sopenharmony_ci// Copyright 2020 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
51cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental-interpreter.h"
61cb0ef41Sopenharmony_ci
71cb0ef41Sopenharmony_ci#include "src/base/optional.h"
81cb0ef41Sopenharmony_ci#include "src/base/strings.h"
91cb0ef41Sopenharmony_ci#include "src/common/assert-scope.h"
101cb0ef41Sopenharmony_ci#include "src/objects/fixed-array-inl.h"
111cb0ef41Sopenharmony_ci#include "src/objects/string-inl.h"
121cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental.h"
131cb0ef41Sopenharmony_ci#include "src/strings/char-predicates-inl.h"
141cb0ef41Sopenharmony_ci#include "src/zone/zone-allocator.h"
151cb0ef41Sopenharmony_ci#include "src/zone/zone-list-inl.h"
161cb0ef41Sopenharmony_ci
171cb0ef41Sopenharmony_cinamespace v8 {
181cb0ef41Sopenharmony_cinamespace internal {
191cb0ef41Sopenharmony_ci
201cb0ef41Sopenharmony_cinamespace {
211cb0ef41Sopenharmony_ci
221cb0ef41Sopenharmony_ciconstexpr int kUndefinedRegisterValue = -1;
231cb0ef41Sopenharmony_ci
241cb0ef41Sopenharmony_citemplate <class Character>
251cb0ef41Sopenharmony_cibool SatisfiesAssertion(RegExpAssertion::Type type,
261cb0ef41Sopenharmony_ci                        base::Vector<const Character> context, int position) {
271cb0ef41Sopenharmony_ci  DCHECK_LE(position, context.length());
281cb0ef41Sopenharmony_ci  DCHECK_GE(position, 0);
291cb0ef41Sopenharmony_ci
301cb0ef41Sopenharmony_ci  switch (type) {
311cb0ef41Sopenharmony_ci    case RegExpAssertion::Type::START_OF_INPUT:
321cb0ef41Sopenharmony_ci      return position == 0;
331cb0ef41Sopenharmony_ci    case RegExpAssertion::Type::END_OF_INPUT:
341cb0ef41Sopenharmony_ci      return position == context.length();
351cb0ef41Sopenharmony_ci    case RegExpAssertion::Type::START_OF_LINE:
361cb0ef41Sopenharmony_ci      if (position == 0) return true;
371cb0ef41Sopenharmony_ci      return unibrow::IsLineTerminator(context[position - 1]);
381cb0ef41Sopenharmony_ci    case RegExpAssertion::Type::END_OF_LINE:
391cb0ef41Sopenharmony_ci      if (position == context.length()) return true;
401cb0ef41Sopenharmony_ci      return unibrow::IsLineTerminator(context[position]);
411cb0ef41Sopenharmony_ci    case RegExpAssertion::Type::BOUNDARY:
421cb0ef41Sopenharmony_ci      if (context.length() == 0) {
431cb0ef41Sopenharmony_ci        return false;
441cb0ef41Sopenharmony_ci      } else if (position == 0) {
451cb0ef41Sopenharmony_ci        return IsRegExpWord(context[position]);
461cb0ef41Sopenharmony_ci      } else if (position == context.length()) {
471cb0ef41Sopenharmony_ci        return IsRegExpWord(context[position - 1]);
481cb0ef41Sopenharmony_ci      } else {
491cb0ef41Sopenharmony_ci        return IsRegExpWord(context[position - 1]) !=
501cb0ef41Sopenharmony_ci               IsRegExpWord(context[position]);
511cb0ef41Sopenharmony_ci      }
521cb0ef41Sopenharmony_ci    case RegExpAssertion::Type::NON_BOUNDARY:
531cb0ef41Sopenharmony_ci      return !SatisfiesAssertion(RegExpAssertion::Type::BOUNDARY, context,
541cb0ef41Sopenharmony_ci                                 position);
551cb0ef41Sopenharmony_ci  }
561cb0ef41Sopenharmony_ci}
571cb0ef41Sopenharmony_ci
581cb0ef41Sopenharmony_cibase::Vector<RegExpInstruction> ToInstructionVector(
591cb0ef41Sopenharmony_ci    ByteArray raw_bytes, const DisallowGarbageCollection& no_gc) {
601cb0ef41Sopenharmony_ci  RegExpInstruction* inst_begin =
611cb0ef41Sopenharmony_ci      reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress());
621cb0ef41Sopenharmony_ci  int inst_num = raw_bytes.length() / sizeof(RegExpInstruction);
631cb0ef41Sopenharmony_ci  DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length());
641cb0ef41Sopenharmony_ci  return base::Vector<RegExpInstruction>(inst_begin, inst_num);
651cb0ef41Sopenharmony_ci}
661cb0ef41Sopenharmony_ci
671cb0ef41Sopenharmony_citemplate <class Character>
681cb0ef41Sopenharmony_cibase::Vector<const Character> ToCharacterVector(
691cb0ef41Sopenharmony_ci    String str, const DisallowGarbageCollection& no_gc);
701cb0ef41Sopenharmony_ci
711cb0ef41Sopenharmony_citemplate <>
721cb0ef41Sopenharmony_cibase::Vector<const uint8_t> ToCharacterVector<uint8_t>(
731cb0ef41Sopenharmony_ci    String str, const DisallowGarbageCollection& no_gc) {
741cb0ef41Sopenharmony_ci  DCHECK(str.IsFlat());
751cb0ef41Sopenharmony_ci  String::FlatContent content = str.GetFlatContent(no_gc);
761cb0ef41Sopenharmony_ci  DCHECK(content.IsOneByte());
771cb0ef41Sopenharmony_ci  return content.ToOneByteVector();
781cb0ef41Sopenharmony_ci}
791cb0ef41Sopenharmony_ci
801cb0ef41Sopenharmony_citemplate <>
811cb0ef41Sopenharmony_cibase::Vector<const base::uc16> ToCharacterVector<base::uc16>(
821cb0ef41Sopenharmony_ci    String str, const DisallowGarbageCollection& no_gc) {
831cb0ef41Sopenharmony_ci  DCHECK(str.IsFlat());
841cb0ef41Sopenharmony_ci  String::FlatContent content = str.GetFlatContent(no_gc);
851cb0ef41Sopenharmony_ci  DCHECK(content.IsTwoByte());
861cb0ef41Sopenharmony_ci  return content.ToUC16Vector();
871cb0ef41Sopenharmony_ci}
881cb0ef41Sopenharmony_ci
891cb0ef41Sopenharmony_citemplate <class Character>
901cb0ef41Sopenharmony_ciclass NfaInterpreter {
911cb0ef41Sopenharmony_ci  // Executes a bytecode program in breadth-first mode, without backtracking.
921cb0ef41Sopenharmony_ci  // `Character` can be instantiated with `uint8_t` or `base::uc16` for one byte
931cb0ef41Sopenharmony_ci  // or two byte input strings.
941cb0ef41Sopenharmony_ci  //
951cb0ef41Sopenharmony_ci  // In contrast to the backtracking implementation, this has linear time
961cb0ef41Sopenharmony_ci  // complexity in the length of the input string. Breadth-first mode means
971cb0ef41Sopenharmony_ci  // that threads are executed in lockstep with respect to their input
981cb0ef41Sopenharmony_ci  // position, i.e. the threads share a common input index.  This is similar
991cb0ef41Sopenharmony_ci  // to breadth-first simulation of a non-deterministic finite automaton (nfa),
1001cb0ef41Sopenharmony_ci  // hence the name of the class.
1011cb0ef41Sopenharmony_ci  //
1021cb0ef41Sopenharmony_ci  // To follow the semantics of a backtracking VM implementation, we have to be
1031cb0ef41Sopenharmony_ci  // careful about whether we stop execution when a thread executes ACCEPT.
1041cb0ef41Sopenharmony_ci  // For example, consider execution of the bytecode generated by the regexp
1051cb0ef41Sopenharmony_ci  //
1061cb0ef41Sopenharmony_ci  //   r = /abc|..|[a-c]{10,}/
1071cb0ef41Sopenharmony_ci  //
1081cb0ef41Sopenharmony_ci  // on input "abcccccccccccccc".  Clearly the three alternatives
1091cb0ef41Sopenharmony_ci  // - /abc/
1101cb0ef41Sopenharmony_ci  // - /../
1111cb0ef41Sopenharmony_ci  // - /[a-c]{10,}/
1121cb0ef41Sopenharmony_ci  // all match this input.  A backtracking implementation will report "abc" as
1131cb0ef41Sopenharmony_ci  // match, because it explores the first alternative before the others.
1141cb0ef41Sopenharmony_ci  //
1151cb0ef41Sopenharmony_ci  // However, if we execute breadth first, then we execute the 3 threads
1161cb0ef41Sopenharmony_ci  // - t1, which tries to match /abc/
1171cb0ef41Sopenharmony_ci  // - t2, which tries to match /../
1181cb0ef41Sopenharmony_ci  // - t3, which tries to match /[a-c]{10,}/
1191cb0ef41Sopenharmony_ci  // in lockstep i.e. by iterating over the input and feeding all threads one
1201cb0ef41Sopenharmony_ci  // character at a time.  t2 will execute an ACCEPT after two characters,
1211cb0ef41Sopenharmony_ci  // while t1 will only execute ACCEPT after three characters. Thus we find a
1221cb0ef41Sopenharmony_ci  // match for the second alternative before a match of the first alternative.
1231cb0ef41Sopenharmony_ci  //
1241cb0ef41Sopenharmony_ci  // This shows that we cannot always stop searching as soon as some thread t
1251cb0ef41Sopenharmony_ci  // executes ACCEPT:  If there is a thread u with higher priority than t, then
1261cb0ef41Sopenharmony_ci  // it must be finished first.  If u produces a match, then we can discard the
1271cb0ef41Sopenharmony_ci  // match of t because matches produced by threads with higher priority are
1281cb0ef41Sopenharmony_ci  // preferred over matches of threads with lower priority.  On the other hand,
1291cb0ef41Sopenharmony_ci  // we are allowed to abort all threads with lower priority than t if t
1301cb0ef41Sopenharmony_ci  // produces a match: Such threads can only produce worse matches.  In the
1311cb0ef41Sopenharmony_ci  // example above, we can abort t3 after two characters because of t2's match.
1321cb0ef41Sopenharmony_ci  //
1331cb0ef41Sopenharmony_ci  // Thus the interpreter keeps track of a priority-ordered list of threads.
1341cb0ef41Sopenharmony_ci  // If a thread ACCEPTs, all threads with lower priority are discarded, and
1351cb0ef41Sopenharmony_ci  // the search continues with the threads with higher priority.  If no threads
1361cb0ef41Sopenharmony_ci  // with high priority are left, we return the match that was produced by the
1371cb0ef41Sopenharmony_ci  // ACCEPTing thread with highest priority.
1381cb0ef41Sopenharmony_ci public:
1391cb0ef41Sopenharmony_ci  NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin,
1401cb0ef41Sopenharmony_ci                 ByteArray bytecode, int register_count_per_match, String input,
1411cb0ef41Sopenharmony_ci                 int32_t input_index, Zone* zone)
1421cb0ef41Sopenharmony_ci      : isolate_(isolate),
1431cb0ef41Sopenharmony_ci        call_origin_(call_origin),
1441cb0ef41Sopenharmony_ci        bytecode_object_(bytecode),
1451cb0ef41Sopenharmony_ci        bytecode_(ToInstructionVector(bytecode, no_gc_)),
1461cb0ef41Sopenharmony_ci        register_count_per_match_(register_count_per_match),
1471cb0ef41Sopenharmony_ci        input_object_(input),
1481cb0ef41Sopenharmony_ci        input_(ToCharacterVector<Character>(input, no_gc_)),
1491cb0ef41Sopenharmony_ci        input_index_(input_index),
1501cb0ef41Sopenharmony_ci        pc_last_input_index_(zone->NewArray<int>(bytecode.length()),
1511cb0ef41Sopenharmony_ci                             bytecode.length()),
1521cb0ef41Sopenharmony_ci        active_threads_(0, zone),
1531cb0ef41Sopenharmony_ci        blocked_threads_(0, zone),
1541cb0ef41Sopenharmony_ci        register_array_allocator_(zone),
1551cb0ef41Sopenharmony_ci        best_match_registers_(base::nullopt),
1561cb0ef41Sopenharmony_ci        zone_(zone) {
1571cb0ef41Sopenharmony_ci    DCHECK(!bytecode_.empty());
1581cb0ef41Sopenharmony_ci    DCHECK_GE(input_index_, 0);
1591cb0ef41Sopenharmony_ci    DCHECK_LE(input_index_, input_.length());
1601cb0ef41Sopenharmony_ci
1611cb0ef41Sopenharmony_ci    std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
1621cb0ef41Sopenharmony_ci  }
1631cb0ef41Sopenharmony_ci
1641cb0ef41Sopenharmony_ci  // Finds matches and writes their concatenated capture registers to
1651cb0ef41Sopenharmony_ci  // `output_registers`.  `output_registers[i]` has to be valid for all i <
1661cb0ef41Sopenharmony_ci  // output_register_count.  The search continues until all remaining matches
1671cb0ef41Sopenharmony_ci  // have been found or there is no space left in `output_registers`.  Returns
1681cb0ef41Sopenharmony_ci  // the number of matches found.
1691cb0ef41Sopenharmony_ci  int FindMatches(int32_t* output_registers, int output_register_count) {
1701cb0ef41Sopenharmony_ci    const int max_match_num = output_register_count / register_count_per_match_;
1711cb0ef41Sopenharmony_ci
1721cb0ef41Sopenharmony_ci    int match_num = 0;
1731cb0ef41Sopenharmony_ci    while (match_num != max_match_num) {
1741cb0ef41Sopenharmony_ci      int err_code = FindNextMatch();
1751cb0ef41Sopenharmony_ci      if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
1761cb0ef41Sopenharmony_ci
1771cb0ef41Sopenharmony_ci      if (!FoundMatch()) break;
1781cb0ef41Sopenharmony_ci
1791cb0ef41Sopenharmony_ci      base::Vector<int> registers = *best_match_registers_;
1801cb0ef41Sopenharmony_ci      output_registers =
1811cb0ef41Sopenharmony_ci          std::copy(registers.begin(), registers.end(), output_registers);
1821cb0ef41Sopenharmony_ci
1831cb0ef41Sopenharmony_ci      ++match_num;
1841cb0ef41Sopenharmony_ci
1851cb0ef41Sopenharmony_ci      const int match_begin = registers[0];
1861cb0ef41Sopenharmony_ci      const int match_end = registers[1];
1871cb0ef41Sopenharmony_ci      DCHECK_LE(match_begin, match_end);
1881cb0ef41Sopenharmony_ci      const int match_length = match_end - match_begin;
1891cb0ef41Sopenharmony_ci      if (match_length != 0) {
1901cb0ef41Sopenharmony_ci        SetInputIndex(match_end);
1911cb0ef41Sopenharmony_ci      } else if (match_end == input_.length()) {
1921cb0ef41Sopenharmony_ci        // Zero-length match, input exhausted.
1931cb0ef41Sopenharmony_ci        SetInputIndex(match_end);
1941cb0ef41Sopenharmony_ci        break;
1951cb0ef41Sopenharmony_ci      } else {
1961cb0ef41Sopenharmony_ci        // Zero-length match, more input.  We don't want to report more matches
1971cb0ef41Sopenharmony_ci        // here endlessly, so we advance by 1.
1981cb0ef41Sopenharmony_ci        SetInputIndex(match_end + 1);
1991cb0ef41Sopenharmony_ci
2001cb0ef41Sopenharmony_ci        // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to
2011cb0ef41Sopenharmony_ci        // the next codepoint, not to the next code unit. See also
2021cb0ef41Sopenharmony_ci        // `RegExpUtils::AdvanceStringIndex`.
2031cb0ef41Sopenharmony_ci        STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode);
2041cb0ef41Sopenharmony_ci      }
2051cb0ef41Sopenharmony_ci    }
2061cb0ef41Sopenharmony_ci
2071cb0ef41Sopenharmony_ci    return match_num;
2081cb0ef41Sopenharmony_ci  }
2091cb0ef41Sopenharmony_ci
2101cb0ef41Sopenharmony_ci private:
2111cb0ef41Sopenharmony_ci  // The state of a "thread" executing experimental regexp bytecode.  (Not to
2121cb0ef41Sopenharmony_ci  // be confused with an OS thread.)
2131cb0ef41Sopenharmony_ci  struct InterpreterThread {
2141cb0ef41Sopenharmony_ci    // This thread's program counter, i.e. the index within `bytecode_` of the
2151cb0ef41Sopenharmony_ci    // next instruction to be executed.
2161cb0ef41Sopenharmony_ci    int pc;
2171cb0ef41Sopenharmony_ci    // Pointer to the array of registers, which is always size
2181cb0ef41Sopenharmony_ci    // `register_count_per_match_`.  Should be deallocated with
2191cb0ef41Sopenharmony_ci    // `register_array_allocator_`.
2201cb0ef41Sopenharmony_ci    int* register_array_begin;
2211cb0ef41Sopenharmony_ci  };
2221cb0ef41Sopenharmony_ci
2231cb0ef41Sopenharmony_ci  // Handles pending interrupts if there are any.  Returns
2241cb0ef41Sopenharmony_ci  // RegExp::kInternalRegExpSuccess if execution can continue, and an error
2251cb0ef41Sopenharmony_ci  // code otherwise.
2261cb0ef41Sopenharmony_ci  int HandleInterrupts() {
2271cb0ef41Sopenharmony_ci    StackLimitCheck check(isolate_);
2281cb0ef41Sopenharmony_ci    if (call_origin_ == RegExp::CallOrigin::kFromJs) {
2291cb0ef41Sopenharmony_ci      // Direct calls from JavaScript can be interrupted in two ways:
2301cb0ef41Sopenharmony_ci      // 1. A real stack overflow, in which case we let the caller throw the
2311cb0ef41Sopenharmony_ci      //    exception.
2321cb0ef41Sopenharmony_ci      // 2. The stack guard was used to interrupt execution for another purpose,
2331cb0ef41Sopenharmony_ci      //    forcing the call through the runtime system.
2341cb0ef41Sopenharmony_ci      if (check.JsHasOverflowed()) {
2351cb0ef41Sopenharmony_ci        return RegExp::kInternalRegExpException;
2361cb0ef41Sopenharmony_ci      } else if (check.InterruptRequested()) {
2371cb0ef41Sopenharmony_ci        return RegExp::kInternalRegExpRetry;
2381cb0ef41Sopenharmony_ci      }
2391cb0ef41Sopenharmony_ci    } else {
2401cb0ef41Sopenharmony_ci      DCHECK(call_origin_ == RegExp::CallOrigin::kFromRuntime);
2411cb0ef41Sopenharmony_ci      HandleScope handles(isolate_);
2421cb0ef41Sopenharmony_ci      Handle<ByteArray> bytecode_handle(bytecode_object_, isolate_);
2431cb0ef41Sopenharmony_ci      Handle<String> input_handle(input_object_, isolate_);
2441cb0ef41Sopenharmony_ci
2451cb0ef41Sopenharmony_ci      if (check.JsHasOverflowed()) {
2461cb0ef41Sopenharmony_ci        // We abort the interpreter now anyway, so gc can't invalidate any
2471cb0ef41Sopenharmony_ci        // pointers.
2481cb0ef41Sopenharmony_ci        AllowGarbageCollection yes_gc;
2491cb0ef41Sopenharmony_ci        isolate_->StackOverflow();
2501cb0ef41Sopenharmony_ci        return RegExp::kInternalRegExpException;
2511cb0ef41Sopenharmony_ci      } else if (check.InterruptRequested()) {
2521cb0ef41Sopenharmony_ci        // TODO(mbid): Is this really equivalent to whether the string is
2531cb0ef41Sopenharmony_ci        // one-byte or two-byte? A comment at the declaration of
2541cb0ef41Sopenharmony_ci        // IsOneByteRepresentationUnderneath says that this might fail for
2551cb0ef41Sopenharmony_ci        // external strings.
2561cb0ef41Sopenharmony_ci        const bool was_one_byte =
2571cb0ef41Sopenharmony_ci            String::IsOneByteRepresentationUnderneath(input_object_);
2581cb0ef41Sopenharmony_ci
2591cb0ef41Sopenharmony_ci        Object result;
2601cb0ef41Sopenharmony_ci        {
2611cb0ef41Sopenharmony_ci          AllowGarbageCollection yes_gc;
2621cb0ef41Sopenharmony_ci          result = isolate_->stack_guard()->HandleInterrupts();
2631cb0ef41Sopenharmony_ci        }
2641cb0ef41Sopenharmony_ci        if (result.IsException(isolate_)) {
2651cb0ef41Sopenharmony_ci          return RegExp::kInternalRegExpException;
2661cb0ef41Sopenharmony_ci        }
2671cb0ef41Sopenharmony_ci
2681cb0ef41Sopenharmony_ci        // If we changed between a LATIN1 and a UC16 string, we need to restart
2691cb0ef41Sopenharmony_ci        // regexp matching with the appropriate template instantiation of
2701cb0ef41Sopenharmony_ci        // RawMatch.
2711cb0ef41Sopenharmony_ci        if (String::IsOneByteRepresentationUnderneath(*input_handle) !=
2721cb0ef41Sopenharmony_ci            was_one_byte) {
2731cb0ef41Sopenharmony_ci          return RegExp::kInternalRegExpRetry;
2741cb0ef41Sopenharmony_ci        }
2751cb0ef41Sopenharmony_ci
2761cb0ef41Sopenharmony_ci        // Update objects and pointers in case they have changed during gc.
2771cb0ef41Sopenharmony_ci        bytecode_object_ = *bytecode_handle;
2781cb0ef41Sopenharmony_ci        bytecode_ = ToInstructionVector(bytecode_object_, no_gc_);
2791cb0ef41Sopenharmony_ci        input_object_ = *input_handle;
2801cb0ef41Sopenharmony_ci        input_ = ToCharacterVector<Character>(input_object_, no_gc_);
2811cb0ef41Sopenharmony_ci      }
2821cb0ef41Sopenharmony_ci    }
2831cb0ef41Sopenharmony_ci    return RegExp::kInternalRegExpSuccess;
2841cb0ef41Sopenharmony_ci  }
2851cb0ef41Sopenharmony_ci
2861cb0ef41Sopenharmony_ci  // Change the current input index for future calls to `FindNextMatch`.
2871cb0ef41Sopenharmony_ci  void SetInputIndex(int new_input_index) {
2881cb0ef41Sopenharmony_ci    DCHECK_GE(input_index_, 0);
2891cb0ef41Sopenharmony_ci    DCHECK_LE(input_index_, input_.length());
2901cb0ef41Sopenharmony_ci
2911cb0ef41Sopenharmony_ci    input_index_ = new_input_index;
2921cb0ef41Sopenharmony_ci  }
2931cb0ef41Sopenharmony_ci
2941cb0ef41Sopenharmony_ci  // Find the next match and return the corresponding capture registers and
2951cb0ef41Sopenharmony_ci  // write its capture registers to `best_match_registers_`.  The search starts
2961cb0ef41Sopenharmony_ci  // at the current `input_index_`.  Returns RegExp::kInternalRegExpSuccess if
2971cb0ef41Sopenharmony_ci  // execution could finish regularly (with or without a match) and an error
2981cb0ef41Sopenharmony_ci  // code due to interrupt otherwise.
2991cb0ef41Sopenharmony_ci  int FindNextMatch() {
3001cb0ef41Sopenharmony_ci    DCHECK(active_threads_.is_empty());
3011cb0ef41Sopenharmony_ci    // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
3021cb0ef41Sopenharmony_ci    // here? As long as
3031cb0ef41Sopenharmony_ci    //
3041cb0ef41Sopenharmony_ci    //   pc_last_input_index_[pc] < input_index_
3051cb0ef41Sopenharmony_ci    //
3061cb0ef41Sopenharmony_ci    // for all possible program counters pc that are reachable without input
3071cb0ef41Sopenharmony_ci    // from pc = 0 and
3081cb0ef41Sopenharmony_ci    //
3091cb0ef41Sopenharmony_ci    //   pc_last_input_index_[k] <= input_index_
3101cb0ef41Sopenharmony_ci    //
3111cb0ef41Sopenharmony_ci    // for all k > 0 hold I think everything should be fine.  Maybe we can do
3121cb0ef41Sopenharmony_ci    // something about this in `SetInputIndex`.
3131cb0ef41Sopenharmony_ci    std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
3141cb0ef41Sopenharmony_ci
3151cb0ef41Sopenharmony_ci    // Clean up left-over data from a previous call to FindNextMatch.
3161cb0ef41Sopenharmony_ci    for (InterpreterThread t : blocked_threads_) {
3171cb0ef41Sopenharmony_ci      DestroyThread(t);
3181cb0ef41Sopenharmony_ci    }
3191cb0ef41Sopenharmony_ci    blocked_threads_.DropAndClear();
3201cb0ef41Sopenharmony_ci
3211cb0ef41Sopenharmony_ci    for (InterpreterThread t : active_threads_) {
3221cb0ef41Sopenharmony_ci      DestroyThread(t);
3231cb0ef41Sopenharmony_ci    }
3241cb0ef41Sopenharmony_ci    active_threads_.DropAndClear();
3251cb0ef41Sopenharmony_ci
3261cb0ef41Sopenharmony_ci    if (best_match_registers_.has_value()) {
3271cb0ef41Sopenharmony_ci      FreeRegisterArray(best_match_registers_->begin());
3281cb0ef41Sopenharmony_ci      best_match_registers_ = base::nullopt;
3291cb0ef41Sopenharmony_ci    }
3301cb0ef41Sopenharmony_ci
3311cb0ef41Sopenharmony_ci    // All threads start at bytecode 0.
3321cb0ef41Sopenharmony_ci    active_threads_.Add(
3331cb0ef41Sopenharmony_ci        InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_);
3341cb0ef41Sopenharmony_ci    // Run the initial thread, potentially forking new threads, until every
3351cb0ef41Sopenharmony_ci    // thread is blocked without further input.
3361cb0ef41Sopenharmony_ci    RunActiveThreads();
3371cb0ef41Sopenharmony_ci
3381cb0ef41Sopenharmony_ci    // We stop if one of the following conditions hold:
3391cb0ef41Sopenharmony_ci    // - We have exhausted the entire input.
3401cb0ef41Sopenharmony_ci    // - We have found a match at some point, and there are no remaining
3411cb0ef41Sopenharmony_ci    //   threads with higher priority than the thread that produced the match.
3421cb0ef41Sopenharmony_ci    //   Threads with low priority have been aborted earlier, and the remaining
3431cb0ef41Sopenharmony_ci    //   threads are blocked here, so the latter simply means that
3441cb0ef41Sopenharmony_ci    //   `blocked_threads_` is empty.
3451cb0ef41Sopenharmony_ci    while (input_index_ != input_.length() &&
3461cb0ef41Sopenharmony_ci           !(FoundMatch() && blocked_threads_.is_empty())) {
3471cb0ef41Sopenharmony_ci      DCHECK(active_threads_.is_empty());
3481cb0ef41Sopenharmony_ci      base::uc16 input_char = input_[input_index_];
3491cb0ef41Sopenharmony_ci      ++input_index_;
3501cb0ef41Sopenharmony_ci
3511cb0ef41Sopenharmony_ci      static constexpr int kTicksBetweenInterruptHandling = 64;
3521cb0ef41Sopenharmony_ci      if (input_index_ % kTicksBetweenInterruptHandling == 0) {
3531cb0ef41Sopenharmony_ci        int err_code = HandleInterrupts();
3541cb0ef41Sopenharmony_ci        if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
3551cb0ef41Sopenharmony_ci      }
3561cb0ef41Sopenharmony_ci
3571cb0ef41Sopenharmony_ci      // We unblock all blocked_threads_ by feeding them the input char.
3581cb0ef41Sopenharmony_ci      FlushBlockedThreads(input_char);
3591cb0ef41Sopenharmony_ci
3601cb0ef41Sopenharmony_ci      // Run all threads until they block or accept.
3611cb0ef41Sopenharmony_ci      RunActiveThreads();
3621cb0ef41Sopenharmony_ci    }
3631cb0ef41Sopenharmony_ci
3641cb0ef41Sopenharmony_ci    return RegExp::kInternalRegExpSuccess;
3651cb0ef41Sopenharmony_ci  }
3661cb0ef41Sopenharmony_ci
3671cb0ef41Sopenharmony_ci  // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
3681cb0ef41Sopenharmony_ci  // instruction, or its PC value was already processed.
3691cb0ef41Sopenharmony_ci  // - If processing of `t` can't continue because of CONSUME_RANGE, it is
3701cb0ef41Sopenharmony_ci  //   pushed on `blocked_threads_`.
3711cb0ef41Sopenharmony_ci  // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and
3721cb0ef41Sopenharmony_ci  //   the current input index. All remaining `active_threads_` are discarded.
3731cb0ef41Sopenharmony_ci  void RunActiveThread(InterpreterThread t) {
3741cb0ef41Sopenharmony_ci    while (true) {
3751cb0ef41Sopenharmony_ci      if (IsPcProcessed(t.pc)) return;
3761cb0ef41Sopenharmony_ci      MarkPcProcessed(t.pc);
3771cb0ef41Sopenharmony_ci
3781cb0ef41Sopenharmony_ci      RegExpInstruction inst = bytecode_[t.pc];
3791cb0ef41Sopenharmony_ci      switch (inst.opcode) {
3801cb0ef41Sopenharmony_ci        case RegExpInstruction::CONSUME_RANGE: {
3811cb0ef41Sopenharmony_ci          blocked_threads_.Add(t, zone_);
3821cb0ef41Sopenharmony_ci          return;
3831cb0ef41Sopenharmony_ci        }
3841cb0ef41Sopenharmony_ci        case RegExpInstruction::ASSERTION:
3851cb0ef41Sopenharmony_ci          if (!SatisfiesAssertion(inst.payload.assertion_type, input_,
3861cb0ef41Sopenharmony_ci                                  input_index_)) {
3871cb0ef41Sopenharmony_ci            DestroyThread(t);
3881cb0ef41Sopenharmony_ci            return;
3891cb0ef41Sopenharmony_ci          }
3901cb0ef41Sopenharmony_ci          ++t.pc;
3911cb0ef41Sopenharmony_ci          break;
3921cb0ef41Sopenharmony_ci        case RegExpInstruction::FORK: {
3931cb0ef41Sopenharmony_ci          InterpreterThread fork{inst.payload.pc,
3941cb0ef41Sopenharmony_ci                                 NewRegisterArrayUninitialized()};
3951cb0ef41Sopenharmony_ci          base::Vector<int> fork_registers = GetRegisterArray(fork);
3961cb0ef41Sopenharmony_ci          base::Vector<int> t_registers = GetRegisterArray(t);
3971cb0ef41Sopenharmony_ci          DCHECK_EQ(fork_registers.length(), t_registers.length());
3981cb0ef41Sopenharmony_ci          std::copy(t_registers.begin(), t_registers.end(),
3991cb0ef41Sopenharmony_ci                    fork_registers.begin());
4001cb0ef41Sopenharmony_ci          active_threads_.Add(fork, zone_);
4011cb0ef41Sopenharmony_ci
4021cb0ef41Sopenharmony_ci          ++t.pc;
4031cb0ef41Sopenharmony_ci          break;
4041cb0ef41Sopenharmony_ci        }
4051cb0ef41Sopenharmony_ci        case RegExpInstruction::JMP:
4061cb0ef41Sopenharmony_ci          t.pc = inst.payload.pc;
4071cb0ef41Sopenharmony_ci          break;
4081cb0ef41Sopenharmony_ci        case RegExpInstruction::ACCEPT:
4091cb0ef41Sopenharmony_ci          if (best_match_registers_.has_value()) {
4101cb0ef41Sopenharmony_ci            FreeRegisterArray(best_match_registers_->begin());
4111cb0ef41Sopenharmony_ci          }
4121cb0ef41Sopenharmony_ci          best_match_registers_ = GetRegisterArray(t);
4131cb0ef41Sopenharmony_ci
4141cb0ef41Sopenharmony_ci          for (InterpreterThread s : active_threads_) {
4151cb0ef41Sopenharmony_ci            FreeRegisterArray(s.register_array_begin);
4161cb0ef41Sopenharmony_ci          }
4171cb0ef41Sopenharmony_ci          active_threads_.DropAndClear();
4181cb0ef41Sopenharmony_ci          return;
4191cb0ef41Sopenharmony_ci        case RegExpInstruction::SET_REGISTER_TO_CP:
4201cb0ef41Sopenharmony_ci          GetRegisterArray(t)[inst.payload.register_index] = input_index_;
4211cb0ef41Sopenharmony_ci          ++t.pc;
4221cb0ef41Sopenharmony_ci          break;
4231cb0ef41Sopenharmony_ci        case RegExpInstruction::CLEAR_REGISTER:
4241cb0ef41Sopenharmony_ci          GetRegisterArray(t)[inst.payload.register_index] =
4251cb0ef41Sopenharmony_ci              kUndefinedRegisterValue;
4261cb0ef41Sopenharmony_ci          ++t.pc;
4271cb0ef41Sopenharmony_ci          break;
4281cb0ef41Sopenharmony_ci      }
4291cb0ef41Sopenharmony_ci    }
4301cb0ef41Sopenharmony_ci  }
4311cb0ef41Sopenharmony_ci
4321cb0ef41Sopenharmony_ci  // Run each active thread until it can't continue without further input.
4331cb0ef41Sopenharmony_ci  // `active_threads_` is empty afterwards.  `blocked_threads_` are sorted from
4341cb0ef41Sopenharmony_ci  // low to high priority.
4351cb0ef41Sopenharmony_ci  void RunActiveThreads() {
4361cb0ef41Sopenharmony_ci    while (!active_threads_.is_empty()) {
4371cb0ef41Sopenharmony_ci      RunActiveThread(active_threads_.RemoveLast());
4381cb0ef41Sopenharmony_ci    }
4391cb0ef41Sopenharmony_ci  }
4401cb0ef41Sopenharmony_ci
4411cb0ef41Sopenharmony_ci  // Unblock all blocked_threads_ by feeding them an `input_char`.  Should only
4421cb0ef41Sopenharmony_ci  // be called with `input_index_` pointing to the character *after*
4431cb0ef41Sopenharmony_ci  // `input_char` so that `pc_last_input_index_` is updated correctly.
4441cb0ef41Sopenharmony_ci  void FlushBlockedThreads(base::uc16 input_char) {
4451cb0ef41Sopenharmony_ci    // The threads in blocked_threads_ are sorted from high to low priority,
4461cb0ef41Sopenharmony_ci    // but active_threads_ needs to be sorted from low to high priority, so we
4471cb0ef41Sopenharmony_ci    // need to activate blocked threads in reverse order.
4481cb0ef41Sopenharmony_ci    for (int i = blocked_threads_.length() - 1; i >= 0; --i) {
4491cb0ef41Sopenharmony_ci      InterpreterThread t = blocked_threads_[i];
4501cb0ef41Sopenharmony_ci      RegExpInstruction inst = bytecode_[t.pc];
4511cb0ef41Sopenharmony_ci      DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
4521cb0ef41Sopenharmony_ci      RegExpInstruction::Uc16Range range = inst.payload.consume_range;
4531cb0ef41Sopenharmony_ci      if (input_char >= range.min && input_char <= range.max) {
4541cb0ef41Sopenharmony_ci        ++t.pc;
4551cb0ef41Sopenharmony_ci        active_threads_.Add(t, zone_);
4561cb0ef41Sopenharmony_ci      } else {
4571cb0ef41Sopenharmony_ci        DestroyThread(t);
4581cb0ef41Sopenharmony_ci      }
4591cb0ef41Sopenharmony_ci    }
4601cb0ef41Sopenharmony_ci    blocked_threads_.DropAndClear();
4611cb0ef41Sopenharmony_ci  }
4621cb0ef41Sopenharmony_ci
4631cb0ef41Sopenharmony_ci  bool FoundMatch() const { return best_match_registers_.has_value(); }
4641cb0ef41Sopenharmony_ci
4651cb0ef41Sopenharmony_ci  base::Vector<int> GetRegisterArray(InterpreterThread t) {
4661cb0ef41Sopenharmony_ci    return base::Vector<int>(t.register_array_begin, register_count_per_match_);
4671cb0ef41Sopenharmony_ci  }
4681cb0ef41Sopenharmony_ci
4691cb0ef41Sopenharmony_ci  int* NewRegisterArrayUninitialized() {
4701cb0ef41Sopenharmony_ci    return register_array_allocator_.allocate(register_count_per_match_);
4711cb0ef41Sopenharmony_ci  }
4721cb0ef41Sopenharmony_ci
4731cb0ef41Sopenharmony_ci  int* NewRegisterArray(int fill_value) {
4741cb0ef41Sopenharmony_ci    int* array_begin = NewRegisterArrayUninitialized();
4751cb0ef41Sopenharmony_ci    int* array_end = array_begin + register_count_per_match_;
4761cb0ef41Sopenharmony_ci    std::fill(array_begin, array_end, fill_value);
4771cb0ef41Sopenharmony_ci    return array_begin;
4781cb0ef41Sopenharmony_ci  }
4791cb0ef41Sopenharmony_ci
4801cb0ef41Sopenharmony_ci  void FreeRegisterArray(int* register_array_begin) {
4811cb0ef41Sopenharmony_ci    register_array_allocator_.deallocate(register_array_begin,
4821cb0ef41Sopenharmony_ci                                         register_count_per_match_);
4831cb0ef41Sopenharmony_ci  }
4841cb0ef41Sopenharmony_ci
4851cb0ef41Sopenharmony_ci  void DestroyThread(InterpreterThread t) {
4861cb0ef41Sopenharmony_ci    FreeRegisterArray(t.register_array_begin);
4871cb0ef41Sopenharmony_ci  }
4881cb0ef41Sopenharmony_ci
4891cb0ef41Sopenharmony_ci  // It is redundant to have two threads t, t0 execute at the same PC value,
4901cb0ef41Sopenharmony_ci  // because one of t, t0 matches iff the other does.  We can thus discard
4911cb0ef41Sopenharmony_ci  // the one with lower priority.  We check whether a thread executed at some
4921cb0ef41Sopenharmony_ci  // PC value by recording for every possible value of PC what the value of
4931cb0ef41Sopenharmony_ci  // input_index_ was the last time a thread executed at PC. If a thread
4941cb0ef41Sopenharmony_ci  // tries to continue execution at a PC value that we have seen before at
4951cb0ef41Sopenharmony_ci  // the current input index, we abort it. (We execute threads with higher
4961cb0ef41Sopenharmony_ci  // priority first, so the second thread is guaranteed to have lower
4971cb0ef41Sopenharmony_ci  // priority.)
4981cb0ef41Sopenharmony_ci  //
4991cb0ef41Sopenharmony_ci  // Check whether we've seen an active thread with a given pc value since the
5001cb0ef41Sopenharmony_ci  // last increment of `input_index_`.
5011cb0ef41Sopenharmony_ci  bool IsPcProcessed(int pc) {
5021cb0ef41Sopenharmony_ci    DCHECK_LE(pc_last_input_index_[pc], input_index_);
5031cb0ef41Sopenharmony_ci    return pc_last_input_index_[pc] == input_index_;
5041cb0ef41Sopenharmony_ci  }
5051cb0ef41Sopenharmony_ci
5061cb0ef41Sopenharmony_ci  // Mark a pc as having been processed since the last increment of
5071cb0ef41Sopenharmony_ci  // `input_index_`.
5081cb0ef41Sopenharmony_ci  void MarkPcProcessed(int pc) {
5091cb0ef41Sopenharmony_ci    DCHECK_LE(pc_last_input_index_[pc], input_index_);
5101cb0ef41Sopenharmony_ci    pc_last_input_index_[pc] = input_index_;
5111cb0ef41Sopenharmony_ci  }
5121cb0ef41Sopenharmony_ci
5131cb0ef41Sopenharmony_ci  Isolate* const isolate_;
5141cb0ef41Sopenharmony_ci
5151cb0ef41Sopenharmony_ci  const RegExp::CallOrigin call_origin_;
5161cb0ef41Sopenharmony_ci
5171cb0ef41Sopenharmony_ci  DisallowGarbageCollection no_gc_;
5181cb0ef41Sopenharmony_ci
5191cb0ef41Sopenharmony_ci  ByteArray bytecode_object_;
5201cb0ef41Sopenharmony_ci  base::Vector<const RegExpInstruction> bytecode_;
5211cb0ef41Sopenharmony_ci
5221cb0ef41Sopenharmony_ci  // Number of registers used per thread.
5231cb0ef41Sopenharmony_ci  const int register_count_per_match_;
5241cb0ef41Sopenharmony_ci
5251cb0ef41Sopenharmony_ci  String input_object_;
5261cb0ef41Sopenharmony_ci  base::Vector<const Character> input_;
5271cb0ef41Sopenharmony_ci  int input_index_;
5281cb0ef41Sopenharmony_ci
5291cb0ef41Sopenharmony_ci  // pc_last_input_index_[k] records the value of input_index_ the last
5301cb0ef41Sopenharmony_ci  // time a thread t such that t.pc == k was activated, i.e. put on
5311cb0ef41Sopenharmony_ci  // active_threads_.  Thus pc_last_input_index.size() == bytecode.size().  See
5321cb0ef41Sopenharmony_ci  // also `RunActiveThread`.
5331cb0ef41Sopenharmony_ci  base::Vector<int> pc_last_input_index_;
5341cb0ef41Sopenharmony_ci
5351cb0ef41Sopenharmony_ci  // Active threads can potentially (but not necessarily) continue without
5361cb0ef41Sopenharmony_ci  // input.  Sorted from low to high priority.
5371cb0ef41Sopenharmony_ci  ZoneList<InterpreterThread> active_threads_;
5381cb0ef41Sopenharmony_ci
5391cb0ef41Sopenharmony_ci  // The pc of a blocked thread points to an instruction that consumes a
5401cb0ef41Sopenharmony_ci  // character. Sorted from high to low priority (so the opposite of
5411cb0ef41Sopenharmony_ci  // `active_threads_`).
5421cb0ef41Sopenharmony_ci  ZoneList<InterpreterThread> blocked_threads_;
5431cb0ef41Sopenharmony_ci
5441cb0ef41Sopenharmony_ci  // RecyclingZoneAllocator maintains a linked list through freed allocations
5451cb0ef41Sopenharmony_ci  // for reuse if possible.
5461cb0ef41Sopenharmony_ci  RecyclingZoneAllocator<int> register_array_allocator_;
5471cb0ef41Sopenharmony_ci
5481cb0ef41Sopenharmony_ci  // The register array of the best match found so far during the current
5491cb0ef41Sopenharmony_ci  // search.  If several threads ACCEPTed, then this will be the register array
5501cb0ef41Sopenharmony_ci  // of the accepting thread with highest priority.  Should be deallocated with
5511cb0ef41Sopenharmony_ci  // `register_array_allocator_`.
5521cb0ef41Sopenharmony_ci  base::Optional<base::Vector<int>> best_match_registers_;
5531cb0ef41Sopenharmony_ci
5541cb0ef41Sopenharmony_ci  Zone* zone_;
5551cb0ef41Sopenharmony_ci};
5561cb0ef41Sopenharmony_ci
5571cb0ef41Sopenharmony_ci}  // namespace
5581cb0ef41Sopenharmony_ci
5591cb0ef41Sopenharmony_ciint ExperimentalRegExpInterpreter::FindMatches(
5601cb0ef41Sopenharmony_ci    Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray bytecode,
5611cb0ef41Sopenharmony_ci    int register_count_per_match, String input, int start_index,
5621cb0ef41Sopenharmony_ci    int32_t* output_registers, int output_register_count, Zone* zone) {
5631cb0ef41Sopenharmony_ci  DCHECK(input.IsFlat());
5641cb0ef41Sopenharmony_ci  DisallowGarbageCollection no_gc;
5651cb0ef41Sopenharmony_ci
5661cb0ef41Sopenharmony_ci  if (input.GetFlatContent(no_gc).IsOneByte()) {
5671cb0ef41Sopenharmony_ci    NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode,
5681cb0ef41Sopenharmony_ci                                        register_count_per_match, input,
5691cb0ef41Sopenharmony_ci                                        start_index, zone);
5701cb0ef41Sopenharmony_ci    return interpreter.FindMatches(output_registers, output_register_count);
5711cb0ef41Sopenharmony_ci  } else {
5721cb0ef41Sopenharmony_ci    DCHECK(input.GetFlatContent(no_gc).IsTwoByte());
5731cb0ef41Sopenharmony_ci    NfaInterpreter<base::uc16> interpreter(isolate, call_origin, bytecode,
5741cb0ef41Sopenharmony_ci                                           register_count_per_match, input,
5751cb0ef41Sopenharmony_ci                                           start_index, zone);
5761cb0ef41Sopenharmony_ci    return interpreter.FindMatches(output_registers, output_register_count);
5771cb0ef41Sopenharmony_ci  }
5781cb0ef41Sopenharmony_ci}
5791cb0ef41Sopenharmony_ci
5801cb0ef41Sopenharmony_ci}  // namespace internal
5811cb0ef41Sopenharmony_ci}  // namespace v8
582