1// Copyright 2020 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/experimental/experimental-interpreter.h" 6 7#include "src/base/optional.h" 8#include "src/base/strings.h" 9#include "src/common/assert-scope.h" 10#include "src/objects/fixed-array-inl.h" 11#include "src/objects/string-inl.h" 12#include "src/regexp/experimental/experimental.h" 13#include "src/strings/char-predicates-inl.h" 14#include "src/zone/zone-allocator.h" 15#include "src/zone/zone-list-inl.h" 16 17namespace v8 { 18namespace internal { 19 20namespace { 21 22constexpr int kUndefinedRegisterValue = -1; 23 24template <class Character> 25bool SatisfiesAssertion(RegExpAssertion::Type type, 26 base::Vector<const Character> context, int position) { 27 DCHECK_LE(position, context.length()); 28 DCHECK_GE(position, 0); 29 30 switch (type) { 31 case RegExpAssertion::Type::START_OF_INPUT: 32 return position == 0; 33 case RegExpAssertion::Type::END_OF_INPUT: 34 return position == context.length(); 35 case RegExpAssertion::Type::START_OF_LINE: 36 if (position == 0) return true; 37 return unibrow::IsLineTerminator(context[position - 1]); 38 case RegExpAssertion::Type::END_OF_LINE: 39 if (position == context.length()) return true; 40 return unibrow::IsLineTerminator(context[position]); 41 case RegExpAssertion::Type::BOUNDARY: 42 if (context.length() == 0) { 43 return false; 44 } else if (position == 0) { 45 return IsRegExpWord(context[position]); 46 } else if (position == context.length()) { 47 return IsRegExpWord(context[position - 1]); 48 } else { 49 return IsRegExpWord(context[position - 1]) != 50 IsRegExpWord(context[position]); 51 } 52 case RegExpAssertion::Type::NON_BOUNDARY: 53 return !SatisfiesAssertion(RegExpAssertion::Type::BOUNDARY, context, 54 position); 55 } 56} 57 58base::Vector<RegExpInstruction> ToInstructionVector( 59 ByteArray raw_bytes, const DisallowGarbageCollection& no_gc) { 60 RegExpInstruction* inst_begin = 61 reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress()); 62 int inst_num = raw_bytes.length() / sizeof(RegExpInstruction); 63 DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length()); 64 return base::Vector<RegExpInstruction>(inst_begin, inst_num); 65} 66 67template <class Character> 68base::Vector<const Character> ToCharacterVector( 69 String str, const DisallowGarbageCollection& no_gc); 70 71template <> 72base::Vector<const uint8_t> ToCharacterVector<uint8_t>( 73 String str, const DisallowGarbageCollection& no_gc) { 74 DCHECK(str.IsFlat()); 75 String::FlatContent content = str.GetFlatContent(no_gc); 76 DCHECK(content.IsOneByte()); 77 return content.ToOneByteVector(); 78} 79 80template <> 81base::Vector<const base::uc16> ToCharacterVector<base::uc16>( 82 String str, const DisallowGarbageCollection& no_gc) { 83 DCHECK(str.IsFlat()); 84 String::FlatContent content = str.GetFlatContent(no_gc); 85 DCHECK(content.IsTwoByte()); 86 return content.ToUC16Vector(); 87} 88 89template <class Character> 90class NfaInterpreter { 91 // Executes a bytecode program in breadth-first mode, without backtracking. 92 // `Character` can be instantiated with `uint8_t` or `base::uc16` for one byte 93 // or two byte input strings. 94 // 95 // In contrast to the backtracking implementation, this has linear time 96 // complexity in the length of the input string. Breadth-first mode means 97 // that threads are executed in lockstep with respect to their input 98 // position, i.e. the threads share a common input index. This is similar 99 // to breadth-first simulation of a non-deterministic finite automaton (nfa), 100 // hence the name of the class. 101 // 102 // To follow the semantics of a backtracking VM implementation, we have to be 103 // careful about whether we stop execution when a thread executes ACCEPT. 104 // For example, consider execution of the bytecode generated by the regexp 105 // 106 // r = /abc|..|[a-c]{10,}/ 107 // 108 // on input "abcccccccccccccc". Clearly the three alternatives 109 // - /abc/ 110 // - /../ 111 // - /[a-c]{10,}/ 112 // all match this input. A backtracking implementation will report "abc" as 113 // match, because it explores the first alternative before the others. 114 // 115 // However, if we execute breadth first, then we execute the 3 threads 116 // - t1, which tries to match /abc/ 117 // - t2, which tries to match /../ 118 // - t3, which tries to match /[a-c]{10,}/ 119 // in lockstep i.e. by iterating over the input and feeding all threads one 120 // character at a time. t2 will execute an ACCEPT after two characters, 121 // while t1 will only execute ACCEPT after three characters. Thus we find a 122 // match for the second alternative before a match of the first alternative. 123 // 124 // This shows that we cannot always stop searching as soon as some thread t 125 // executes ACCEPT: If there is a thread u with higher priority than t, then 126 // it must be finished first. If u produces a match, then we can discard the 127 // match of t because matches produced by threads with higher priority are 128 // preferred over matches of threads with lower priority. On the other hand, 129 // we are allowed to abort all threads with lower priority than t if t 130 // produces a match: Such threads can only produce worse matches. In the 131 // example above, we can abort t3 after two characters because of t2's match. 132 // 133 // Thus the interpreter keeps track of a priority-ordered list of threads. 134 // If a thread ACCEPTs, all threads with lower priority are discarded, and 135 // the search continues with the threads with higher priority. If no threads 136 // with high priority are left, we return the match that was produced by the 137 // ACCEPTing thread with highest priority. 138 public: 139 NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin, 140 ByteArray bytecode, int register_count_per_match, String input, 141 int32_t input_index, Zone* zone) 142 : isolate_(isolate), 143 call_origin_(call_origin), 144 bytecode_object_(bytecode), 145 bytecode_(ToInstructionVector(bytecode, no_gc_)), 146 register_count_per_match_(register_count_per_match), 147 input_object_(input), 148 input_(ToCharacterVector<Character>(input, no_gc_)), 149 input_index_(input_index), 150 pc_last_input_index_(zone->NewArray<int>(bytecode.length()), 151 bytecode.length()), 152 active_threads_(0, zone), 153 blocked_threads_(0, zone), 154 register_array_allocator_(zone), 155 best_match_registers_(base::nullopt), 156 zone_(zone) { 157 DCHECK(!bytecode_.empty()); 158 DCHECK_GE(input_index_, 0); 159 DCHECK_LE(input_index_, input_.length()); 160 161 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); 162 } 163 164 // Finds matches and writes their concatenated capture registers to 165 // `output_registers`. `output_registers[i]` has to be valid for all i < 166 // output_register_count. The search continues until all remaining matches 167 // have been found or there is no space left in `output_registers`. Returns 168 // the number of matches found. 169 int FindMatches(int32_t* output_registers, int output_register_count) { 170 const int max_match_num = output_register_count / register_count_per_match_; 171 172 int match_num = 0; 173 while (match_num != max_match_num) { 174 int err_code = FindNextMatch(); 175 if (err_code != RegExp::kInternalRegExpSuccess) return err_code; 176 177 if (!FoundMatch()) break; 178 179 base::Vector<int> registers = *best_match_registers_; 180 output_registers = 181 std::copy(registers.begin(), registers.end(), output_registers); 182 183 ++match_num; 184 185 const int match_begin = registers[0]; 186 const int match_end = registers[1]; 187 DCHECK_LE(match_begin, match_end); 188 const int match_length = match_end - match_begin; 189 if (match_length != 0) { 190 SetInputIndex(match_end); 191 } else if (match_end == input_.length()) { 192 // Zero-length match, input exhausted. 193 SetInputIndex(match_end); 194 break; 195 } else { 196 // Zero-length match, more input. We don't want to report more matches 197 // here endlessly, so we advance by 1. 198 SetInputIndex(match_end + 1); 199 200 // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to 201 // the next codepoint, not to the next code unit. See also 202 // `RegExpUtils::AdvanceStringIndex`. 203 STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode); 204 } 205 } 206 207 return match_num; 208 } 209 210 private: 211 // The state of a "thread" executing experimental regexp bytecode. (Not to 212 // be confused with an OS thread.) 213 struct InterpreterThread { 214 // This thread's program counter, i.e. the index within `bytecode_` of the 215 // next instruction to be executed. 216 int pc; 217 // Pointer to the array of registers, which is always size 218 // `register_count_per_match_`. Should be deallocated with 219 // `register_array_allocator_`. 220 int* register_array_begin; 221 }; 222 223 // Handles pending interrupts if there are any. Returns 224 // RegExp::kInternalRegExpSuccess if execution can continue, and an error 225 // code otherwise. 226 int HandleInterrupts() { 227 StackLimitCheck check(isolate_); 228 if (call_origin_ == RegExp::CallOrigin::kFromJs) { 229 // Direct calls from JavaScript can be interrupted in two ways: 230 // 1. A real stack overflow, in which case we let the caller throw the 231 // exception. 232 // 2. The stack guard was used to interrupt execution for another purpose, 233 // forcing the call through the runtime system. 234 if (check.JsHasOverflowed()) { 235 return RegExp::kInternalRegExpException; 236 } else if (check.InterruptRequested()) { 237 return RegExp::kInternalRegExpRetry; 238 } 239 } else { 240 DCHECK(call_origin_ == RegExp::CallOrigin::kFromRuntime); 241 HandleScope handles(isolate_); 242 Handle<ByteArray> bytecode_handle(bytecode_object_, isolate_); 243 Handle<String> input_handle(input_object_, isolate_); 244 245 if (check.JsHasOverflowed()) { 246 // We abort the interpreter now anyway, so gc can't invalidate any 247 // pointers. 248 AllowGarbageCollection yes_gc; 249 isolate_->StackOverflow(); 250 return RegExp::kInternalRegExpException; 251 } else if (check.InterruptRequested()) { 252 // TODO(mbid): Is this really equivalent to whether the string is 253 // one-byte or two-byte? A comment at the declaration of 254 // IsOneByteRepresentationUnderneath says that this might fail for 255 // external strings. 256 const bool was_one_byte = 257 String::IsOneByteRepresentationUnderneath(input_object_); 258 259 Object result; 260 { 261 AllowGarbageCollection yes_gc; 262 result = isolate_->stack_guard()->HandleInterrupts(); 263 } 264 if (result.IsException(isolate_)) { 265 return RegExp::kInternalRegExpException; 266 } 267 268 // If we changed between a LATIN1 and a UC16 string, we need to restart 269 // regexp matching with the appropriate template instantiation of 270 // RawMatch. 271 if (String::IsOneByteRepresentationUnderneath(*input_handle) != 272 was_one_byte) { 273 return RegExp::kInternalRegExpRetry; 274 } 275 276 // Update objects and pointers in case they have changed during gc. 277 bytecode_object_ = *bytecode_handle; 278 bytecode_ = ToInstructionVector(bytecode_object_, no_gc_); 279 input_object_ = *input_handle; 280 input_ = ToCharacterVector<Character>(input_object_, no_gc_); 281 } 282 } 283 return RegExp::kInternalRegExpSuccess; 284 } 285 286 // Change the current input index for future calls to `FindNextMatch`. 287 void SetInputIndex(int new_input_index) { 288 DCHECK_GE(input_index_, 0); 289 DCHECK_LE(input_index_, input_.length()); 290 291 input_index_ = new_input_index; 292 } 293 294 // Find the next match and return the corresponding capture registers and 295 // write its capture registers to `best_match_registers_`. The search starts 296 // at the current `input_index_`. Returns RegExp::kInternalRegExpSuccess if 297 // execution could finish regularly (with or without a match) and an error 298 // code due to interrupt otherwise. 299 int FindNextMatch() { 300 DCHECK(active_threads_.is_empty()); 301 // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_` 302 // here? As long as 303 // 304 // pc_last_input_index_[pc] < input_index_ 305 // 306 // for all possible program counters pc that are reachable without input 307 // from pc = 0 and 308 // 309 // pc_last_input_index_[k] <= input_index_ 310 // 311 // for all k > 0 hold I think everything should be fine. Maybe we can do 312 // something about this in `SetInputIndex`. 313 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1); 314 315 // Clean up left-over data from a previous call to FindNextMatch. 316 for (InterpreterThread t : blocked_threads_) { 317 DestroyThread(t); 318 } 319 blocked_threads_.DropAndClear(); 320 321 for (InterpreterThread t : active_threads_) { 322 DestroyThread(t); 323 } 324 active_threads_.DropAndClear(); 325 326 if (best_match_registers_.has_value()) { 327 FreeRegisterArray(best_match_registers_->begin()); 328 best_match_registers_ = base::nullopt; 329 } 330 331 // All threads start at bytecode 0. 332 active_threads_.Add( 333 InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_); 334 // Run the initial thread, potentially forking new threads, until every 335 // thread is blocked without further input. 336 RunActiveThreads(); 337 338 // We stop if one of the following conditions hold: 339 // - We have exhausted the entire input. 340 // - We have found a match at some point, and there are no remaining 341 // threads with higher priority than the thread that produced the match. 342 // Threads with low priority have been aborted earlier, and the remaining 343 // threads are blocked here, so the latter simply means that 344 // `blocked_threads_` is empty. 345 while (input_index_ != input_.length() && 346 !(FoundMatch() && blocked_threads_.is_empty())) { 347 DCHECK(active_threads_.is_empty()); 348 base::uc16 input_char = input_[input_index_]; 349 ++input_index_; 350 351 static constexpr int kTicksBetweenInterruptHandling = 64; 352 if (input_index_ % kTicksBetweenInterruptHandling == 0) { 353 int err_code = HandleInterrupts(); 354 if (err_code != RegExp::kInternalRegExpSuccess) return err_code; 355 } 356 357 // We unblock all blocked_threads_ by feeding them the input char. 358 FlushBlockedThreads(input_char); 359 360 // Run all threads until they block or accept. 361 RunActiveThreads(); 362 } 363 364 return RegExp::kInternalRegExpSuccess; 365 } 366 367 // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT 368 // instruction, or its PC value was already processed. 369 // - If processing of `t` can't continue because of CONSUME_RANGE, it is 370 // pushed on `blocked_threads_`. 371 // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and 372 // the current input index. All remaining `active_threads_` are discarded. 373 void RunActiveThread(InterpreterThread t) { 374 while (true) { 375 if (IsPcProcessed(t.pc)) return; 376 MarkPcProcessed(t.pc); 377 378 RegExpInstruction inst = bytecode_[t.pc]; 379 switch (inst.opcode) { 380 case RegExpInstruction::CONSUME_RANGE: { 381 blocked_threads_.Add(t, zone_); 382 return; 383 } 384 case RegExpInstruction::ASSERTION: 385 if (!SatisfiesAssertion(inst.payload.assertion_type, input_, 386 input_index_)) { 387 DestroyThread(t); 388 return; 389 } 390 ++t.pc; 391 break; 392 case RegExpInstruction::FORK: { 393 InterpreterThread fork{inst.payload.pc, 394 NewRegisterArrayUninitialized()}; 395 base::Vector<int> fork_registers = GetRegisterArray(fork); 396 base::Vector<int> t_registers = GetRegisterArray(t); 397 DCHECK_EQ(fork_registers.length(), t_registers.length()); 398 std::copy(t_registers.begin(), t_registers.end(), 399 fork_registers.begin()); 400 active_threads_.Add(fork, zone_); 401 402 ++t.pc; 403 break; 404 } 405 case RegExpInstruction::JMP: 406 t.pc = inst.payload.pc; 407 break; 408 case RegExpInstruction::ACCEPT: 409 if (best_match_registers_.has_value()) { 410 FreeRegisterArray(best_match_registers_->begin()); 411 } 412 best_match_registers_ = GetRegisterArray(t); 413 414 for (InterpreterThread s : active_threads_) { 415 FreeRegisterArray(s.register_array_begin); 416 } 417 active_threads_.DropAndClear(); 418 return; 419 case RegExpInstruction::SET_REGISTER_TO_CP: 420 GetRegisterArray(t)[inst.payload.register_index] = input_index_; 421 ++t.pc; 422 break; 423 case RegExpInstruction::CLEAR_REGISTER: 424 GetRegisterArray(t)[inst.payload.register_index] = 425 kUndefinedRegisterValue; 426 ++t.pc; 427 break; 428 } 429 } 430 } 431 432 // Run each active thread until it can't continue without further input. 433 // `active_threads_` is empty afterwards. `blocked_threads_` are sorted from 434 // low to high priority. 435 void RunActiveThreads() { 436 while (!active_threads_.is_empty()) { 437 RunActiveThread(active_threads_.RemoveLast()); 438 } 439 } 440 441 // Unblock all blocked_threads_ by feeding them an `input_char`. Should only 442 // be called with `input_index_` pointing to the character *after* 443 // `input_char` so that `pc_last_input_index_` is updated correctly. 444 void FlushBlockedThreads(base::uc16 input_char) { 445 // The threads in blocked_threads_ are sorted from high to low priority, 446 // but active_threads_ needs to be sorted from low to high priority, so we 447 // need to activate blocked threads in reverse order. 448 for (int i = blocked_threads_.length() - 1; i >= 0; --i) { 449 InterpreterThread t = blocked_threads_[i]; 450 RegExpInstruction inst = bytecode_[t.pc]; 451 DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE); 452 RegExpInstruction::Uc16Range range = inst.payload.consume_range; 453 if (input_char >= range.min && input_char <= range.max) { 454 ++t.pc; 455 active_threads_.Add(t, zone_); 456 } else { 457 DestroyThread(t); 458 } 459 } 460 blocked_threads_.DropAndClear(); 461 } 462 463 bool FoundMatch() const { return best_match_registers_.has_value(); } 464 465 base::Vector<int> GetRegisterArray(InterpreterThread t) { 466 return base::Vector<int>(t.register_array_begin, register_count_per_match_); 467 } 468 469 int* NewRegisterArrayUninitialized() { 470 return register_array_allocator_.allocate(register_count_per_match_); 471 } 472 473 int* NewRegisterArray(int fill_value) { 474 int* array_begin = NewRegisterArrayUninitialized(); 475 int* array_end = array_begin + register_count_per_match_; 476 std::fill(array_begin, array_end, fill_value); 477 return array_begin; 478 } 479 480 void FreeRegisterArray(int* register_array_begin) { 481 register_array_allocator_.deallocate(register_array_begin, 482 register_count_per_match_); 483 } 484 485 void DestroyThread(InterpreterThread t) { 486 FreeRegisterArray(t.register_array_begin); 487 } 488 489 // It is redundant to have two threads t, t0 execute at the same PC value, 490 // because one of t, t0 matches iff the other does. We can thus discard 491 // the one with lower priority. We check whether a thread executed at some 492 // PC value by recording for every possible value of PC what the value of 493 // input_index_ was the last time a thread executed at PC. If a thread 494 // tries to continue execution at a PC value that we have seen before at 495 // the current input index, we abort it. (We execute threads with higher 496 // priority first, so the second thread is guaranteed to have lower 497 // priority.) 498 // 499 // Check whether we've seen an active thread with a given pc value since the 500 // last increment of `input_index_`. 501 bool IsPcProcessed(int pc) { 502 DCHECK_LE(pc_last_input_index_[pc], input_index_); 503 return pc_last_input_index_[pc] == input_index_; 504 } 505 506 // Mark a pc as having been processed since the last increment of 507 // `input_index_`. 508 void MarkPcProcessed(int pc) { 509 DCHECK_LE(pc_last_input_index_[pc], input_index_); 510 pc_last_input_index_[pc] = input_index_; 511 } 512 513 Isolate* const isolate_; 514 515 const RegExp::CallOrigin call_origin_; 516 517 DisallowGarbageCollection no_gc_; 518 519 ByteArray bytecode_object_; 520 base::Vector<const RegExpInstruction> bytecode_; 521 522 // Number of registers used per thread. 523 const int register_count_per_match_; 524 525 String input_object_; 526 base::Vector<const Character> input_; 527 int input_index_; 528 529 // pc_last_input_index_[k] records the value of input_index_ the last 530 // time a thread t such that t.pc == k was activated, i.e. put on 531 // active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See 532 // also `RunActiveThread`. 533 base::Vector<int> pc_last_input_index_; 534 535 // Active threads can potentially (but not necessarily) continue without 536 // input. Sorted from low to high priority. 537 ZoneList<InterpreterThread> active_threads_; 538 539 // The pc of a blocked thread points to an instruction that consumes a 540 // character. Sorted from high to low priority (so the opposite of 541 // `active_threads_`). 542 ZoneList<InterpreterThread> blocked_threads_; 543 544 // RecyclingZoneAllocator maintains a linked list through freed allocations 545 // for reuse if possible. 546 RecyclingZoneAllocator<int> register_array_allocator_; 547 548 // The register array of the best match found so far during the current 549 // search. If several threads ACCEPTed, then this will be the register array 550 // of the accepting thread with highest priority. Should be deallocated with 551 // `register_array_allocator_`. 552 base::Optional<base::Vector<int>> best_match_registers_; 553 554 Zone* zone_; 555}; 556 557} // namespace 558 559int ExperimentalRegExpInterpreter::FindMatches( 560 Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray bytecode, 561 int register_count_per_match, String input, int start_index, 562 int32_t* output_registers, int output_register_count, Zone* zone) { 563 DCHECK(input.IsFlat()); 564 DisallowGarbageCollection no_gc; 565 566 if (input.GetFlatContent(no_gc).IsOneByte()) { 567 NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode, 568 register_count_per_match, input, 569 start_index, zone); 570 return interpreter.FindMatches(output_registers, output_register_count); 571 } else { 572 DCHECK(input.GetFlatContent(no_gc).IsTwoByte()); 573 NfaInterpreter<base::uc16> interpreter(isolate, call_origin, bytecode, 574 register_count_per_match, input, 575 start_index, zone); 576 return interpreter.FindMatches(output_registers, output_register_count); 577 } 578} 579 580} // namespace internal 581} // namespace v8 582