11cb0ef41Sopenharmony_ci// Copyright 2020 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental-interpreter.h" 61cb0ef41Sopenharmony_ci 71cb0ef41Sopenharmony_ci#include "src/base/optional.h" 81cb0ef41Sopenharmony_ci#include "src/base/strings.h" 91cb0ef41Sopenharmony_ci#include "src/common/assert-scope.h" 101cb0ef41Sopenharmony_ci#include "src/objects/fixed-array-inl.h" 111cb0ef41Sopenharmony_ci#include "src/objects/string-inl.h" 121cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental.h" 131cb0ef41Sopenharmony_ci#include "src/strings/char-predicates-inl.h" 141cb0ef41Sopenharmony_ci#include "src/zone/zone-allocator.h" 151cb0ef41Sopenharmony_ci#include "src/zone/zone-list-inl.h" 161cb0ef41Sopenharmony_ci 171cb0ef41Sopenharmony_cinamespace v8 { 181cb0ef41Sopenharmony_cinamespace internal { 191cb0ef41Sopenharmony_ci 201cb0ef41Sopenharmony_cinamespace { 211cb0ef41Sopenharmony_ci 221cb0ef41Sopenharmony_ciconstexpr int kUndefinedRegisterValue = -1; 231cb0ef41Sopenharmony_ci 241cb0ef41Sopenharmony_citemplate <class Character> 251cb0ef41Sopenharmony_cibool SatisfiesAssertion(RegExpAssertion::Type type, 261cb0ef41Sopenharmony_ci base::Vector<const Character> context, int position) { 271cb0ef41Sopenharmony_ci DCHECK_LE(position, context.length()); 281cb0ef41Sopenharmony_ci DCHECK_GE(position, 0); 291cb0ef41Sopenharmony_ci 301cb0ef41Sopenharmony_ci switch (type) { 311cb0ef41Sopenharmony_ci case RegExpAssertion::Type::START_OF_INPUT: 321cb0ef41Sopenharmony_ci return position == 0; 331cb0ef41Sopenharmony_ci case RegExpAssertion::Type::END_OF_INPUT: 341cb0ef41Sopenharmony_ci return position == context.length(); 351cb0ef41Sopenharmony_ci case RegExpAssertion::Type::START_OF_LINE: 361cb0ef41Sopenharmony_ci if (position == 0) return true; 371cb0ef41Sopenharmony_ci return unibrow::IsLineTerminator(context[position - 1]); 381cb0ef41Sopenharmony_ci case RegExpAssertion::Type::END_OF_LINE: 391cb0ef41Sopenharmony_ci if (position == context.length()) return true; 401cb0ef41Sopenharmony_ci return unibrow::IsLineTerminator(context[position]); 411cb0ef41Sopenharmony_ci case RegExpAssertion::Type::BOUNDARY: 421cb0ef41Sopenharmony_ci if (context.length() == 0) { 431cb0ef41Sopenharmony_ci return false; 441cb0ef41Sopenharmony_ci } else if (position == 0) { 451cb0ef41Sopenharmony_ci return IsRegExpWord(context[position]); 461cb0ef41Sopenharmony_ci } else if (position == context.length()) { 471cb0ef41Sopenharmony_ci return IsRegExpWord(context[position - 1]); 481cb0ef41Sopenharmony_ci } else { 491cb0ef41Sopenharmony_ci return IsRegExpWord(context[position - 1]) != 501cb0ef41Sopenharmony_ci IsRegExpWord(context[position]); 511cb0ef41Sopenharmony_ci } 521cb0ef41Sopenharmony_ci case RegExpAssertion::Type::NON_BOUNDARY: 531cb0ef41Sopenharmony_ci return !SatisfiesAssertion(RegExpAssertion::Type::BOUNDARY, context, 541cb0ef41Sopenharmony_ci position); 551cb0ef41Sopenharmony_ci } 561cb0ef41Sopenharmony_ci} 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_cibase::Vector<RegExpInstruction> ToInstructionVector( 591cb0ef41Sopenharmony_ci ByteArray raw_bytes, const DisallowGarbageCollection& no_gc) { 601cb0ef41Sopenharmony_ci RegExpInstruction* inst_begin = 611cb0ef41Sopenharmony_ci reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress()); 621cb0ef41Sopenharmony_ci int inst_num = raw_bytes.length() / sizeof(RegExpInstruction); 631cb0ef41Sopenharmony_ci DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length()); 641cb0ef41Sopenharmony_ci return base::Vector<RegExpInstruction>(inst_begin, inst_num); 651cb0ef41Sopenharmony_ci} 661cb0ef41Sopenharmony_ci 671cb0ef41Sopenharmony_citemplate <class Character> 681cb0ef41Sopenharmony_cibase::Vector<const Character> ToCharacterVector( 691cb0ef41Sopenharmony_ci String str, const DisallowGarbageCollection& no_gc); 701cb0ef41Sopenharmony_ci 711cb0ef41Sopenharmony_citemplate <> 721cb0ef41Sopenharmony_cibase::Vector<const uint8_t> ToCharacterVector<uint8_t>( 731cb0ef41Sopenharmony_ci String str, const DisallowGarbageCollection& no_gc) { 741cb0ef41Sopenharmony_ci DCHECK(str.IsFlat()); 751cb0ef41Sopenharmony_ci String::FlatContent content = str.GetFlatContent(no_gc); 761cb0ef41Sopenharmony_ci DCHECK(content.IsOneByte()); 771cb0ef41Sopenharmony_ci return content.ToOneByteVector(); 781cb0ef41Sopenharmony_ci} 791cb0ef41Sopenharmony_ci 801cb0ef41Sopenharmony_citemplate <> 811cb0ef41Sopenharmony_cibase::Vector<const base::uc16> ToCharacterVector<base::uc16>( 821cb0ef41Sopenharmony_ci String str, const DisallowGarbageCollection& no_gc) { 831cb0ef41Sopenharmony_ci DCHECK(str.IsFlat()); 841cb0ef41Sopenharmony_ci String::FlatContent content = str.GetFlatContent(no_gc); 851cb0ef41Sopenharmony_ci DCHECK(content.IsTwoByte()); 861cb0ef41Sopenharmony_ci return content.ToUC16Vector(); 871cb0ef41Sopenharmony_ci} 881cb0ef41Sopenharmony_ci 891cb0ef41Sopenharmony_citemplate <class Character> 901cb0ef41Sopenharmony_ciclass NfaInterpreter { 911cb0ef41Sopenharmony_ci // Executes a bytecode program in breadth-first mode, without backtracking. 921cb0ef41Sopenharmony_ci // `Character` can be instantiated with `uint8_t` or `base::uc16` for one byte 931cb0ef41Sopenharmony_ci // or two byte input strings. 941cb0ef41Sopenharmony_ci // 951cb0ef41Sopenharmony_ci // In contrast to the backtracking implementation, this has linear time 961cb0ef41Sopenharmony_ci // complexity in the length of the input string. Breadth-first mode means 971cb0ef41Sopenharmony_ci // that threads are executed in lockstep with respect to their input 981cb0ef41Sopenharmony_ci // position, i.e. the threads share a common input index. This is similar 991cb0ef41Sopenharmony_ci // to breadth-first simulation of a non-deterministic finite automaton (nfa), 1001cb0ef41Sopenharmony_ci // hence the name of the class. 1011cb0ef41Sopenharmony_ci // 1021cb0ef41Sopenharmony_ci // To follow the semantics of a backtracking VM implementation, we have to be 1031cb0ef41Sopenharmony_ci // careful about whether we stop execution when a thread executes ACCEPT. 1041cb0ef41Sopenharmony_ci // For example, consider execution of the bytecode generated by the regexp 1051cb0ef41Sopenharmony_ci // 1061cb0ef41Sopenharmony_ci // r = /abc|..|[a-c]{10,}/ 1071cb0ef41Sopenharmony_ci // 1081cb0ef41Sopenharmony_ci // on input "abcccccccccccccc". Clearly the three alternatives 1091cb0ef41Sopenharmony_ci // - /abc/ 1101cb0ef41Sopenharmony_ci // - /../ 1111cb0ef41Sopenharmony_ci // - /[a-c]{10,}/ 1121cb0ef41Sopenharmony_ci // all match this input. A backtracking implementation will report "abc" as 1131cb0ef41Sopenharmony_ci // match, because it explores the first alternative before the others. 1141cb0ef41Sopenharmony_ci // 1151cb0ef41Sopenharmony_ci // However, if we execute breadth first, then we execute the 3 threads 1161cb0ef41Sopenharmony_ci // - t1, which tries to match /abc/ 1171cb0ef41Sopenharmony_ci // - t2, which tries to match /../ 1181cb0ef41Sopenharmony_ci // - t3, which tries to match /[a-c]{10,}/ 1191cb0ef41Sopenharmony_ci // in lockstep i.e. by iterating over the input and feeding all threads one 1201cb0ef41Sopenharmony_ci // character at a time. t2 will execute an ACCEPT after two characters, 1211cb0ef41Sopenharmony_ci // while t1 will only execute ACCEPT after three characters. Thus we find a 1221cb0ef41Sopenharmony_ci // match for the second alternative before a match of the first alternative. 1231cb0ef41Sopenharmony_ci // 1241cb0ef41Sopenharmony_ci // This shows that we cannot always stop searching as soon as some thread t 1251cb0ef41Sopenharmony_ci // executes ACCEPT: If there is a thread u with higher priority than t, then 1261cb0ef41Sopenharmony_ci // it must be finished first. If u produces a match, then we can discard the 1271cb0ef41Sopenharmony_ci // match of t because matches produced by threads with higher priority are 1281cb0ef41Sopenharmony_ci // preferred over matches of threads with lower priority. On the other hand, 1291cb0ef41Sopenharmony_ci // we are allowed to abort all threads with lower priority than t if t 1301cb0ef41Sopenharmony_ci // produces a match: Such threads can only produce worse matches. In the 1311cb0ef41Sopenharmony_ci // example above, we can abort t3 after two characters because of t2's match. 1321cb0ef41Sopenharmony_ci // 1331cb0ef41Sopenharmony_ci // Thus the interpreter keeps track of a priority-ordered list of threads. 1341cb0ef41Sopenharmony_ci // If a thread ACCEPTs, all threads with lower priority are discarded, and 1351cb0ef41Sopenharmony_ci // the search continues with the threads with higher priority. If no threads 1361cb0ef41Sopenharmony_ci // with high priority are left, we return the match that was produced by the 1371cb0ef41Sopenharmony_ci // ACCEPTing thread with highest priority. 1381cb0ef41Sopenharmony_ci public: 1391cb0ef41Sopenharmony_ci NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin, 1401cb0ef41Sopenharmony_ci ByteArray bytecode, int register_count_per_match, String input, 1411cb0ef41Sopenharmony_ci int32_t input_index, Zone* zone) 1421cb0ef41Sopenharmony_ci : isolate_(isolate), 1431cb0ef41Sopenharmony_ci call_origin_(call_origin), 1441cb0ef41Sopenharmony_ci bytecode_object_(bytecode), 1451cb0ef41Sopenharmony_ci bytecode_(ToInstructionVector(bytecode, no_gc_)), 1461cb0ef41Sopenharmony_ci register_count_per_match_(register_count_per_match), 1471cb0ef41Sopenharmony_ci input_object_(input), 1481cb0ef41Sopenharmony_ci input_(ToCharacterVector<Character>(input, no_gc_)), 1491cb0ef41Sopenharmony_ci input_index_(input_index), 1501cb0ef41Sopenharmony_ci pc_last_input_index_(zone->NewArray<int>(bytecode.length()), 1511cb0ef41Sopenharmony_ci bytecode.length()), 1521cb0ef41Sopenharmony_ci active_threads_(0, zone), 1531cb0ef41Sopenharmony_ci blocked_threads_(0, zone), 1541cb0ef41Sopenharmony_ci register_array_allocator_(zone), 1551cb0ef41Sopenharmony_ci best_match_registers_(base::nullopt), 1561cb0ef41Sopenharmony_ci zone_(zone) { 1571cb0ef41Sopenharmony_ci DCHECK(!bytecode_.empty()); 1581cb0ef41Sopenharmony_ci DCHECK_GE(input_index_, 0); 1591cb0ef41Sopenharmony_ci DCHECK_LE(input_index_, input_.length()); 1601cb0ef41Sopenharmony_ci 1611cb0ef41Sopenharmony_ci std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); 1621cb0ef41Sopenharmony_ci } 1631cb0ef41Sopenharmony_ci 1641cb0ef41Sopenharmony_ci // Finds matches and writes their concatenated capture registers to 1651cb0ef41Sopenharmony_ci // `output_registers`. `output_registers[i]` has to be valid for all i < 1661cb0ef41Sopenharmony_ci // output_register_count. The search continues until all remaining matches 1671cb0ef41Sopenharmony_ci // have been found or there is no space left in `output_registers`. Returns 1681cb0ef41Sopenharmony_ci // the number of matches found. 1691cb0ef41Sopenharmony_ci int FindMatches(int32_t* output_registers, int output_register_count) { 1701cb0ef41Sopenharmony_ci const int max_match_num = output_register_count / register_count_per_match_; 1711cb0ef41Sopenharmony_ci 1721cb0ef41Sopenharmony_ci int match_num = 0; 1731cb0ef41Sopenharmony_ci while (match_num != max_match_num) { 1741cb0ef41Sopenharmony_ci int err_code = FindNextMatch(); 1751cb0ef41Sopenharmony_ci if (err_code != RegExp::kInternalRegExpSuccess) return err_code; 1761cb0ef41Sopenharmony_ci 1771cb0ef41Sopenharmony_ci if (!FoundMatch()) break; 1781cb0ef41Sopenharmony_ci 1791cb0ef41Sopenharmony_ci base::Vector<int> registers = *best_match_registers_; 1801cb0ef41Sopenharmony_ci output_registers = 1811cb0ef41Sopenharmony_ci std::copy(registers.begin(), registers.end(), output_registers); 1821cb0ef41Sopenharmony_ci 1831cb0ef41Sopenharmony_ci ++match_num; 1841cb0ef41Sopenharmony_ci 1851cb0ef41Sopenharmony_ci const int match_begin = registers[0]; 1861cb0ef41Sopenharmony_ci const int match_end = registers[1]; 1871cb0ef41Sopenharmony_ci DCHECK_LE(match_begin, match_end); 1881cb0ef41Sopenharmony_ci const int match_length = match_end - match_begin; 1891cb0ef41Sopenharmony_ci if (match_length != 0) { 1901cb0ef41Sopenharmony_ci SetInputIndex(match_end); 1911cb0ef41Sopenharmony_ci } else if (match_end == input_.length()) { 1921cb0ef41Sopenharmony_ci // Zero-length match, input exhausted. 1931cb0ef41Sopenharmony_ci SetInputIndex(match_end); 1941cb0ef41Sopenharmony_ci break; 1951cb0ef41Sopenharmony_ci } else { 1961cb0ef41Sopenharmony_ci // Zero-length match, more input. We don't want to report more matches 1971cb0ef41Sopenharmony_ci // here endlessly, so we advance by 1. 1981cb0ef41Sopenharmony_ci SetInputIndex(match_end + 1); 1991cb0ef41Sopenharmony_ci 2001cb0ef41Sopenharmony_ci // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to 2011cb0ef41Sopenharmony_ci // the next codepoint, not to the next code unit. See also 2021cb0ef41Sopenharmony_ci // `RegExpUtils::AdvanceStringIndex`. 2031cb0ef41Sopenharmony_ci STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode); 2041cb0ef41Sopenharmony_ci } 2051cb0ef41Sopenharmony_ci } 2061cb0ef41Sopenharmony_ci 2071cb0ef41Sopenharmony_ci return match_num; 2081cb0ef41Sopenharmony_ci } 2091cb0ef41Sopenharmony_ci 2101cb0ef41Sopenharmony_ci private: 2111cb0ef41Sopenharmony_ci // The state of a "thread" executing experimental regexp bytecode. (Not to 2121cb0ef41Sopenharmony_ci // be confused with an OS thread.) 2131cb0ef41Sopenharmony_ci struct InterpreterThread { 2141cb0ef41Sopenharmony_ci // This thread's program counter, i.e. the index within `bytecode_` of the 2151cb0ef41Sopenharmony_ci // next instruction to be executed. 2161cb0ef41Sopenharmony_ci int pc; 2171cb0ef41Sopenharmony_ci // Pointer to the array of registers, which is always size 2181cb0ef41Sopenharmony_ci // `register_count_per_match_`. Should be deallocated with 2191cb0ef41Sopenharmony_ci // `register_array_allocator_`. 2201cb0ef41Sopenharmony_ci int* register_array_begin; 2211cb0ef41Sopenharmony_ci }; 2221cb0ef41Sopenharmony_ci 2231cb0ef41Sopenharmony_ci // Handles pending interrupts if there are any. Returns 2241cb0ef41Sopenharmony_ci // RegExp::kInternalRegExpSuccess if execution can continue, and an error 2251cb0ef41Sopenharmony_ci // code otherwise. 2261cb0ef41Sopenharmony_ci int HandleInterrupts() { 2271cb0ef41Sopenharmony_ci StackLimitCheck check(isolate_); 2281cb0ef41Sopenharmony_ci if (call_origin_ == RegExp::CallOrigin::kFromJs) { 2291cb0ef41Sopenharmony_ci // Direct calls from JavaScript can be interrupted in two ways: 2301cb0ef41Sopenharmony_ci // 1. A real stack overflow, in which case we let the caller throw the 2311cb0ef41Sopenharmony_ci // exception. 2321cb0ef41Sopenharmony_ci // 2. The stack guard was used to interrupt execution for another purpose, 2331cb0ef41Sopenharmony_ci // forcing the call through the runtime system. 2341cb0ef41Sopenharmony_ci if (check.JsHasOverflowed()) { 2351cb0ef41Sopenharmony_ci return RegExp::kInternalRegExpException; 2361cb0ef41Sopenharmony_ci } else if (check.InterruptRequested()) { 2371cb0ef41Sopenharmony_ci return RegExp::kInternalRegExpRetry; 2381cb0ef41Sopenharmony_ci } 2391cb0ef41Sopenharmony_ci } else { 2401cb0ef41Sopenharmony_ci DCHECK(call_origin_ == RegExp::CallOrigin::kFromRuntime); 2411cb0ef41Sopenharmony_ci HandleScope handles(isolate_); 2421cb0ef41Sopenharmony_ci Handle<ByteArray> bytecode_handle(bytecode_object_, isolate_); 2431cb0ef41Sopenharmony_ci Handle<String> input_handle(input_object_, isolate_); 2441cb0ef41Sopenharmony_ci 2451cb0ef41Sopenharmony_ci if (check.JsHasOverflowed()) { 2461cb0ef41Sopenharmony_ci // We abort the interpreter now anyway, so gc can't invalidate any 2471cb0ef41Sopenharmony_ci // pointers. 2481cb0ef41Sopenharmony_ci AllowGarbageCollection yes_gc; 2491cb0ef41Sopenharmony_ci isolate_->StackOverflow(); 2501cb0ef41Sopenharmony_ci return RegExp::kInternalRegExpException; 2511cb0ef41Sopenharmony_ci } else if (check.InterruptRequested()) { 2521cb0ef41Sopenharmony_ci // TODO(mbid): Is this really equivalent to whether the string is 2531cb0ef41Sopenharmony_ci // one-byte or two-byte? A comment at the declaration of 2541cb0ef41Sopenharmony_ci // IsOneByteRepresentationUnderneath says that this might fail for 2551cb0ef41Sopenharmony_ci // external strings. 2561cb0ef41Sopenharmony_ci const bool was_one_byte = 2571cb0ef41Sopenharmony_ci String::IsOneByteRepresentationUnderneath(input_object_); 2581cb0ef41Sopenharmony_ci 2591cb0ef41Sopenharmony_ci Object result; 2601cb0ef41Sopenharmony_ci { 2611cb0ef41Sopenharmony_ci AllowGarbageCollection yes_gc; 2621cb0ef41Sopenharmony_ci result = isolate_->stack_guard()->HandleInterrupts(); 2631cb0ef41Sopenharmony_ci } 2641cb0ef41Sopenharmony_ci if (result.IsException(isolate_)) { 2651cb0ef41Sopenharmony_ci return RegExp::kInternalRegExpException; 2661cb0ef41Sopenharmony_ci } 2671cb0ef41Sopenharmony_ci 2681cb0ef41Sopenharmony_ci // If we changed between a LATIN1 and a UC16 string, we need to restart 2691cb0ef41Sopenharmony_ci // regexp matching with the appropriate template instantiation of 2701cb0ef41Sopenharmony_ci // RawMatch. 2711cb0ef41Sopenharmony_ci if (String::IsOneByteRepresentationUnderneath(*input_handle) != 2721cb0ef41Sopenharmony_ci was_one_byte) { 2731cb0ef41Sopenharmony_ci return RegExp::kInternalRegExpRetry; 2741cb0ef41Sopenharmony_ci } 2751cb0ef41Sopenharmony_ci 2761cb0ef41Sopenharmony_ci // Update objects and pointers in case they have changed during gc. 2771cb0ef41Sopenharmony_ci bytecode_object_ = *bytecode_handle; 2781cb0ef41Sopenharmony_ci bytecode_ = ToInstructionVector(bytecode_object_, no_gc_); 2791cb0ef41Sopenharmony_ci input_object_ = *input_handle; 2801cb0ef41Sopenharmony_ci input_ = ToCharacterVector<Character>(input_object_, no_gc_); 2811cb0ef41Sopenharmony_ci } 2821cb0ef41Sopenharmony_ci } 2831cb0ef41Sopenharmony_ci return RegExp::kInternalRegExpSuccess; 2841cb0ef41Sopenharmony_ci } 2851cb0ef41Sopenharmony_ci 2861cb0ef41Sopenharmony_ci // Change the current input index for future calls to `FindNextMatch`. 2871cb0ef41Sopenharmony_ci void SetInputIndex(int new_input_index) { 2881cb0ef41Sopenharmony_ci DCHECK_GE(input_index_, 0); 2891cb0ef41Sopenharmony_ci DCHECK_LE(input_index_, input_.length()); 2901cb0ef41Sopenharmony_ci 2911cb0ef41Sopenharmony_ci input_index_ = new_input_index; 2921cb0ef41Sopenharmony_ci } 2931cb0ef41Sopenharmony_ci 2941cb0ef41Sopenharmony_ci // Find the next match and return the corresponding capture registers and 2951cb0ef41Sopenharmony_ci // write its capture registers to `best_match_registers_`. The search starts 2961cb0ef41Sopenharmony_ci // at the current `input_index_`. Returns RegExp::kInternalRegExpSuccess if 2971cb0ef41Sopenharmony_ci // execution could finish regularly (with or without a match) and an error 2981cb0ef41Sopenharmony_ci // code due to interrupt otherwise. 2991cb0ef41Sopenharmony_ci int FindNextMatch() { 3001cb0ef41Sopenharmony_ci DCHECK(active_threads_.is_empty()); 3011cb0ef41Sopenharmony_ci // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_` 3021cb0ef41Sopenharmony_ci // here? As long as 3031cb0ef41Sopenharmony_ci // 3041cb0ef41Sopenharmony_ci // pc_last_input_index_[pc] < input_index_ 3051cb0ef41Sopenharmony_ci // 3061cb0ef41Sopenharmony_ci // for all possible program counters pc that are reachable without input 3071cb0ef41Sopenharmony_ci // from pc = 0 and 3081cb0ef41Sopenharmony_ci // 3091cb0ef41Sopenharmony_ci // pc_last_input_index_[k] <= input_index_ 3101cb0ef41Sopenharmony_ci // 3111cb0ef41Sopenharmony_ci // for all k > 0 hold I think everything should be fine. Maybe we can do 3121cb0ef41Sopenharmony_ci // something about this in `SetInputIndex`. 3131cb0ef41Sopenharmony_ci std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); 3141cb0ef41Sopenharmony_ci 3151cb0ef41Sopenharmony_ci // Clean up left-over data from a previous call to FindNextMatch. 3161cb0ef41Sopenharmony_ci for (InterpreterThread t : blocked_threads_) { 3171cb0ef41Sopenharmony_ci DestroyThread(t); 3181cb0ef41Sopenharmony_ci } 3191cb0ef41Sopenharmony_ci blocked_threads_.DropAndClear(); 3201cb0ef41Sopenharmony_ci 3211cb0ef41Sopenharmony_ci for (InterpreterThread t : active_threads_) { 3221cb0ef41Sopenharmony_ci DestroyThread(t); 3231cb0ef41Sopenharmony_ci } 3241cb0ef41Sopenharmony_ci active_threads_.DropAndClear(); 3251cb0ef41Sopenharmony_ci 3261cb0ef41Sopenharmony_ci if (best_match_registers_.has_value()) { 3271cb0ef41Sopenharmony_ci FreeRegisterArray(best_match_registers_->begin()); 3281cb0ef41Sopenharmony_ci best_match_registers_ = base::nullopt; 3291cb0ef41Sopenharmony_ci } 3301cb0ef41Sopenharmony_ci 3311cb0ef41Sopenharmony_ci // All threads start at bytecode 0. 3321cb0ef41Sopenharmony_ci active_threads_.Add( 3331cb0ef41Sopenharmony_ci InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_); 3341cb0ef41Sopenharmony_ci // Run the initial thread, potentially forking new threads, until every 3351cb0ef41Sopenharmony_ci // thread is blocked without further input. 3361cb0ef41Sopenharmony_ci RunActiveThreads(); 3371cb0ef41Sopenharmony_ci 3381cb0ef41Sopenharmony_ci // We stop if one of the following conditions hold: 3391cb0ef41Sopenharmony_ci // - We have exhausted the entire input. 3401cb0ef41Sopenharmony_ci // - We have found a match at some point, and there are no remaining 3411cb0ef41Sopenharmony_ci // threads with higher priority than the thread that produced the match. 3421cb0ef41Sopenharmony_ci // Threads with low priority have been aborted earlier, and the remaining 3431cb0ef41Sopenharmony_ci // threads are blocked here, so the latter simply means that 3441cb0ef41Sopenharmony_ci // `blocked_threads_` is empty. 3451cb0ef41Sopenharmony_ci while (input_index_ != input_.length() && 3461cb0ef41Sopenharmony_ci !(FoundMatch() && blocked_threads_.is_empty())) { 3471cb0ef41Sopenharmony_ci DCHECK(active_threads_.is_empty()); 3481cb0ef41Sopenharmony_ci base::uc16 input_char = input_[input_index_]; 3491cb0ef41Sopenharmony_ci ++input_index_; 3501cb0ef41Sopenharmony_ci 3511cb0ef41Sopenharmony_ci static constexpr int kTicksBetweenInterruptHandling = 64; 3521cb0ef41Sopenharmony_ci if (input_index_ % kTicksBetweenInterruptHandling == 0) { 3531cb0ef41Sopenharmony_ci int err_code = HandleInterrupts(); 3541cb0ef41Sopenharmony_ci if (err_code != RegExp::kInternalRegExpSuccess) return err_code; 3551cb0ef41Sopenharmony_ci } 3561cb0ef41Sopenharmony_ci 3571cb0ef41Sopenharmony_ci // We unblock all blocked_threads_ by feeding them the input char. 3581cb0ef41Sopenharmony_ci FlushBlockedThreads(input_char); 3591cb0ef41Sopenharmony_ci 3601cb0ef41Sopenharmony_ci // Run all threads until they block or accept. 3611cb0ef41Sopenharmony_ci RunActiveThreads(); 3621cb0ef41Sopenharmony_ci } 3631cb0ef41Sopenharmony_ci 3641cb0ef41Sopenharmony_ci return RegExp::kInternalRegExpSuccess; 3651cb0ef41Sopenharmony_ci } 3661cb0ef41Sopenharmony_ci 3671cb0ef41Sopenharmony_ci // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT 3681cb0ef41Sopenharmony_ci // instruction, or its PC value was already processed. 3691cb0ef41Sopenharmony_ci // - If processing of `t` can't continue because of CONSUME_RANGE, it is 3701cb0ef41Sopenharmony_ci // pushed on `blocked_threads_`. 3711cb0ef41Sopenharmony_ci // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and 3721cb0ef41Sopenharmony_ci // the current input index. All remaining `active_threads_` are discarded. 3731cb0ef41Sopenharmony_ci void RunActiveThread(InterpreterThread t) { 3741cb0ef41Sopenharmony_ci while (true) { 3751cb0ef41Sopenharmony_ci if (IsPcProcessed(t.pc)) return; 3761cb0ef41Sopenharmony_ci MarkPcProcessed(t.pc); 3771cb0ef41Sopenharmony_ci 3781cb0ef41Sopenharmony_ci RegExpInstruction inst = bytecode_[t.pc]; 3791cb0ef41Sopenharmony_ci switch (inst.opcode) { 3801cb0ef41Sopenharmony_ci case RegExpInstruction::CONSUME_RANGE: { 3811cb0ef41Sopenharmony_ci blocked_threads_.Add(t, zone_); 3821cb0ef41Sopenharmony_ci return; 3831cb0ef41Sopenharmony_ci } 3841cb0ef41Sopenharmony_ci case RegExpInstruction::ASSERTION: 3851cb0ef41Sopenharmony_ci if (!SatisfiesAssertion(inst.payload.assertion_type, input_, 3861cb0ef41Sopenharmony_ci input_index_)) { 3871cb0ef41Sopenharmony_ci DestroyThread(t); 3881cb0ef41Sopenharmony_ci return; 3891cb0ef41Sopenharmony_ci } 3901cb0ef41Sopenharmony_ci ++t.pc; 3911cb0ef41Sopenharmony_ci break; 3921cb0ef41Sopenharmony_ci case RegExpInstruction::FORK: { 3931cb0ef41Sopenharmony_ci InterpreterThread fork{inst.payload.pc, 3941cb0ef41Sopenharmony_ci NewRegisterArrayUninitialized()}; 3951cb0ef41Sopenharmony_ci base::Vector<int> fork_registers = GetRegisterArray(fork); 3961cb0ef41Sopenharmony_ci base::Vector<int> t_registers = GetRegisterArray(t); 3971cb0ef41Sopenharmony_ci DCHECK_EQ(fork_registers.length(), t_registers.length()); 3981cb0ef41Sopenharmony_ci std::copy(t_registers.begin(), t_registers.end(), 3991cb0ef41Sopenharmony_ci fork_registers.begin()); 4001cb0ef41Sopenharmony_ci active_threads_.Add(fork, zone_); 4011cb0ef41Sopenharmony_ci 4021cb0ef41Sopenharmony_ci ++t.pc; 4031cb0ef41Sopenharmony_ci break; 4041cb0ef41Sopenharmony_ci } 4051cb0ef41Sopenharmony_ci case RegExpInstruction::JMP: 4061cb0ef41Sopenharmony_ci t.pc = inst.payload.pc; 4071cb0ef41Sopenharmony_ci break; 4081cb0ef41Sopenharmony_ci case RegExpInstruction::ACCEPT: 4091cb0ef41Sopenharmony_ci if (best_match_registers_.has_value()) { 4101cb0ef41Sopenharmony_ci FreeRegisterArray(best_match_registers_->begin()); 4111cb0ef41Sopenharmony_ci } 4121cb0ef41Sopenharmony_ci best_match_registers_ = GetRegisterArray(t); 4131cb0ef41Sopenharmony_ci 4141cb0ef41Sopenharmony_ci for (InterpreterThread s : active_threads_) { 4151cb0ef41Sopenharmony_ci FreeRegisterArray(s.register_array_begin); 4161cb0ef41Sopenharmony_ci } 4171cb0ef41Sopenharmony_ci active_threads_.DropAndClear(); 4181cb0ef41Sopenharmony_ci return; 4191cb0ef41Sopenharmony_ci case RegExpInstruction::SET_REGISTER_TO_CP: 4201cb0ef41Sopenharmony_ci GetRegisterArray(t)[inst.payload.register_index] = input_index_; 4211cb0ef41Sopenharmony_ci ++t.pc; 4221cb0ef41Sopenharmony_ci break; 4231cb0ef41Sopenharmony_ci case RegExpInstruction::CLEAR_REGISTER: 4241cb0ef41Sopenharmony_ci GetRegisterArray(t)[inst.payload.register_index] = 4251cb0ef41Sopenharmony_ci kUndefinedRegisterValue; 4261cb0ef41Sopenharmony_ci ++t.pc; 4271cb0ef41Sopenharmony_ci break; 4281cb0ef41Sopenharmony_ci } 4291cb0ef41Sopenharmony_ci } 4301cb0ef41Sopenharmony_ci } 4311cb0ef41Sopenharmony_ci 4321cb0ef41Sopenharmony_ci // Run each active thread until it can't continue without further input. 4331cb0ef41Sopenharmony_ci // `active_threads_` is empty afterwards. `blocked_threads_` are sorted from 4341cb0ef41Sopenharmony_ci // low to high priority. 4351cb0ef41Sopenharmony_ci void RunActiveThreads() { 4361cb0ef41Sopenharmony_ci while (!active_threads_.is_empty()) { 4371cb0ef41Sopenharmony_ci RunActiveThread(active_threads_.RemoveLast()); 4381cb0ef41Sopenharmony_ci } 4391cb0ef41Sopenharmony_ci } 4401cb0ef41Sopenharmony_ci 4411cb0ef41Sopenharmony_ci // Unblock all blocked_threads_ by feeding them an `input_char`. Should only 4421cb0ef41Sopenharmony_ci // be called with `input_index_` pointing to the character *after* 4431cb0ef41Sopenharmony_ci // `input_char` so that `pc_last_input_index_` is updated correctly. 4441cb0ef41Sopenharmony_ci void FlushBlockedThreads(base::uc16 input_char) { 4451cb0ef41Sopenharmony_ci // The threads in blocked_threads_ are sorted from high to low priority, 4461cb0ef41Sopenharmony_ci // but active_threads_ needs to be sorted from low to high priority, so we 4471cb0ef41Sopenharmony_ci // need to activate blocked threads in reverse order. 4481cb0ef41Sopenharmony_ci for (int i = blocked_threads_.length() - 1; i >= 0; --i) { 4491cb0ef41Sopenharmony_ci InterpreterThread t = blocked_threads_[i]; 4501cb0ef41Sopenharmony_ci RegExpInstruction inst = bytecode_[t.pc]; 4511cb0ef41Sopenharmony_ci DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE); 4521cb0ef41Sopenharmony_ci RegExpInstruction::Uc16Range range = inst.payload.consume_range; 4531cb0ef41Sopenharmony_ci if (input_char >= range.min && input_char <= range.max) { 4541cb0ef41Sopenharmony_ci ++t.pc; 4551cb0ef41Sopenharmony_ci active_threads_.Add(t, zone_); 4561cb0ef41Sopenharmony_ci } else { 4571cb0ef41Sopenharmony_ci DestroyThread(t); 4581cb0ef41Sopenharmony_ci } 4591cb0ef41Sopenharmony_ci } 4601cb0ef41Sopenharmony_ci blocked_threads_.DropAndClear(); 4611cb0ef41Sopenharmony_ci } 4621cb0ef41Sopenharmony_ci 4631cb0ef41Sopenharmony_ci bool FoundMatch() const { return best_match_registers_.has_value(); } 4641cb0ef41Sopenharmony_ci 4651cb0ef41Sopenharmony_ci base::Vector<int> GetRegisterArray(InterpreterThread t) { 4661cb0ef41Sopenharmony_ci return base::Vector<int>(t.register_array_begin, register_count_per_match_); 4671cb0ef41Sopenharmony_ci } 4681cb0ef41Sopenharmony_ci 4691cb0ef41Sopenharmony_ci int* NewRegisterArrayUninitialized() { 4701cb0ef41Sopenharmony_ci return register_array_allocator_.allocate(register_count_per_match_); 4711cb0ef41Sopenharmony_ci } 4721cb0ef41Sopenharmony_ci 4731cb0ef41Sopenharmony_ci int* NewRegisterArray(int fill_value) { 4741cb0ef41Sopenharmony_ci int* array_begin = NewRegisterArrayUninitialized(); 4751cb0ef41Sopenharmony_ci int* array_end = array_begin + register_count_per_match_; 4761cb0ef41Sopenharmony_ci std::fill(array_begin, array_end, fill_value); 4771cb0ef41Sopenharmony_ci return array_begin; 4781cb0ef41Sopenharmony_ci } 4791cb0ef41Sopenharmony_ci 4801cb0ef41Sopenharmony_ci void FreeRegisterArray(int* register_array_begin) { 4811cb0ef41Sopenharmony_ci register_array_allocator_.deallocate(register_array_begin, 4821cb0ef41Sopenharmony_ci register_count_per_match_); 4831cb0ef41Sopenharmony_ci } 4841cb0ef41Sopenharmony_ci 4851cb0ef41Sopenharmony_ci void DestroyThread(InterpreterThread t) { 4861cb0ef41Sopenharmony_ci FreeRegisterArray(t.register_array_begin); 4871cb0ef41Sopenharmony_ci } 4881cb0ef41Sopenharmony_ci 4891cb0ef41Sopenharmony_ci // It is redundant to have two threads t, t0 execute at the same PC value, 4901cb0ef41Sopenharmony_ci // because one of t, t0 matches iff the other does. We can thus discard 4911cb0ef41Sopenharmony_ci // the one with lower priority. We check whether a thread executed at some 4921cb0ef41Sopenharmony_ci // PC value by recording for every possible value of PC what the value of 4931cb0ef41Sopenharmony_ci // input_index_ was the last time a thread executed at PC. If a thread 4941cb0ef41Sopenharmony_ci // tries to continue execution at a PC value that we have seen before at 4951cb0ef41Sopenharmony_ci // the current input index, we abort it. (We execute threads with higher 4961cb0ef41Sopenharmony_ci // priority first, so the second thread is guaranteed to have lower 4971cb0ef41Sopenharmony_ci // priority.) 4981cb0ef41Sopenharmony_ci // 4991cb0ef41Sopenharmony_ci // Check whether we've seen an active thread with a given pc value since the 5001cb0ef41Sopenharmony_ci // last increment of `input_index_`. 5011cb0ef41Sopenharmony_ci bool IsPcProcessed(int pc) { 5021cb0ef41Sopenharmony_ci DCHECK_LE(pc_last_input_index_[pc], input_index_); 5031cb0ef41Sopenharmony_ci return pc_last_input_index_[pc] == input_index_; 5041cb0ef41Sopenharmony_ci } 5051cb0ef41Sopenharmony_ci 5061cb0ef41Sopenharmony_ci // Mark a pc as having been processed since the last increment of 5071cb0ef41Sopenharmony_ci // `input_index_`. 5081cb0ef41Sopenharmony_ci void MarkPcProcessed(int pc) { 5091cb0ef41Sopenharmony_ci DCHECK_LE(pc_last_input_index_[pc], input_index_); 5101cb0ef41Sopenharmony_ci pc_last_input_index_[pc] = input_index_; 5111cb0ef41Sopenharmony_ci } 5121cb0ef41Sopenharmony_ci 5131cb0ef41Sopenharmony_ci Isolate* const isolate_; 5141cb0ef41Sopenharmony_ci 5151cb0ef41Sopenharmony_ci const RegExp::CallOrigin call_origin_; 5161cb0ef41Sopenharmony_ci 5171cb0ef41Sopenharmony_ci DisallowGarbageCollection no_gc_; 5181cb0ef41Sopenharmony_ci 5191cb0ef41Sopenharmony_ci ByteArray bytecode_object_; 5201cb0ef41Sopenharmony_ci base::Vector<const RegExpInstruction> bytecode_; 5211cb0ef41Sopenharmony_ci 5221cb0ef41Sopenharmony_ci // Number of registers used per thread. 5231cb0ef41Sopenharmony_ci const int register_count_per_match_; 5241cb0ef41Sopenharmony_ci 5251cb0ef41Sopenharmony_ci String input_object_; 5261cb0ef41Sopenharmony_ci base::Vector<const Character> input_; 5271cb0ef41Sopenharmony_ci int input_index_; 5281cb0ef41Sopenharmony_ci 5291cb0ef41Sopenharmony_ci // pc_last_input_index_[k] records the value of input_index_ the last 5301cb0ef41Sopenharmony_ci // time a thread t such that t.pc == k was activated, i.e. put on 5311cb0ef41Sopenharmony_ci // active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See 5321cb0ef41Sopenharmony_ci // also `RunActiveThread`. 5331cb0ef41Sopenharmony_ci base::Vector<int> pc_last_input_index_; 5341cb0ef41Sopenharmony_ci 5351cb0ef41Sopenharmony_ci // Active threads can potentially (but not necessarily) continue without 5361cb0ef41Sopenharmony_ci // input. Sorted from low to high priority. 5371cb0ef41Sopenharmony_ci ZoneList<InterpreterThread> active_threads_; 5381cb0ef41Sopenharmony_ci 5391cb0ef41Sopenharmony_ci // The pc of a blocked thread points to an instruction that consumes a 5401cb0ef41Sopenharmony_ci // character. Sorted from high to low priority (so the opposite of 5411cb0ef41Sopenharmony_ci // `active_threads_`). 5421cb0ef41Sopenharmony_ci ZoneList<InterpreterThread> blocked_threads_; 5431cb0ef41Sopenharmony_ci 5441cb0ef41Sopenharmony_ci // RecyclingZoneAllocator maintains a linked list through freed allocations 5451cb0ef41Sopenharmony_ci // for reuse if possible. 5461cb0ef41Sopenharmony_ci RecyclingZoneAllocator<int> register_array_allocator_; 5471cb0ef41Sopenharmony_ci 5481cb0ef41Sopenharmony_ci // The register array of the best match found so far during the current 5491cb0ef41Sopenharmony_ci // search. If several threads ACCEPTed, then this will be the register array 5501cb0ef41Sopenharmony_ci // of the accepting thread with highest priority. Should be deallocated with 5511cb0ef41Sopenharmony_ci // `register_array_allocator_`. 5521cb0ef41Sopenharmony_ci base::Optional<base::Vector<int>> best_match_registers_; 5531cb0ef41Sopenharmony_ci 5541cb0ef41Sopenharmony_ci Zone* zone_; 5551cb0ef41Sopenharmony_ci}; 5561cb0ef41Sopenharmony_ci 5571cb0ef41Sopenharmony_ci} // namespace 5581cb0ef41Sopenharmony_ci 5591cb0ef41Sopenharmony_ciint ExperimentalRegExpInterpreter::FindMatches( 5601cb0ef41Sopenharmony_ci Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray bytecode, 5611cb0ef41Sopenharmony_ci int register_count_per_match, String input, int start_index, 5621cb0ef41Sopenharmony_ci int32_t* output_registers, int output_register_count, Zone* zone) { 5631cb0ef41Sopenharmony_ci DCHECK(input.IsFlat()); 5641cb0ef41Sopenharmony_ci DisallowGarbageCollection no_gc; 5651cb0ef41Sopenharmony_ci 5661cb0ef41Sopenharmony_ci if (input.GetFlatContent(no_gc).IsOneByte()) { 5671cb0ef41Sopenharmony_ci NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode, 5681cb0ef41Sopenharmony_ci register_count_per_match, input, 5691cb0ef41Sopenharmony_ci start_index, zone); 5701cb0ef41Sopenharmony_ci return interpreter.FindMatches(output_registers, output_register_count); 5711cb0ef41Sopenharmony_ci } else { 5721cb0ef41Sopenharmony_ci DCHECK(input.GetFlatContent(no_gc).IsTwoByte()); 5731cb0ef41Sopenharmony_ci NfaInterpreter<base::uc16> interpreter(isolate, call_origin, bytecode, 5741cb0ef41Sopenharmony_ci register_count_per_match, input, 5751cb0ef41Sopenharmony_ci start_index, zone); 5761cb0ef41Sopenharmony_ci return interpreter.FindMatches(output_registers, output_register_count); 5771cb0ef41Sopenharmony_ci } 5781cb0ef41Sopenharmony_ci} 5791cb0ef41Sopenharmony_ci 5801cb0ef41Sopenharmony_ci} // namespace internal 5811cb0ef41Sopenharmony_ci} // namespace v8 582