1// Copyright 2012 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/regexp.h" 6 7#include "src/base/strings.h" 8#include "src/codegen/compilation-cache.h" 9#include "src/diagnostics/code-tracer.h" 10#include "src/execution/interrupts-scope.h" 11#include "src/heap/heap-inl.h" 12#include "src/objects/js-regexp-inl.h" 13#include "src/regexp/experimental/experimental.h" 14#include "src/regexp/regexp-bytecode-generator.h" 15#include "src/regexp/regexp-bytecodes.h" 16#include "src/regexp/regexp-compiler.h" 17#include "src/regexp/regexp-dotprinter.h" 18#include "src/regexp/regexp-interpreter.h" 19#include "src/regexp/regexp-macro-assembler-arch.h" 20#include "src/regexp/regexp-macro-assembler-tracer.h" 21#include "src/regexp/regexp-parser.h" 22#include "src/regexp/regexp-utils.h" 23#include "src/strings/string-search.h" 24#include "src/utils/ostreams.h" 25 26namespace v8 { 27namespace internal { 28 29using namespace regexp_compiler_constants; // NOLINT(build/namespaces) 30 31class RegExpImpl final : public AllStatic { 32 public: 33 // Returns a string representation of a regular expression. 34 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4. 35 // This function calls the garbage collector if necessary. 36 static Handle<String> ToString(Handle<Object> value); 37 38 // Prepares a JSRegExp object with Irregexp-specific data. 39 static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, 40 Handle<String> pattern, RegExpFlags flags, 41 int capture_count, uint32_t backtrack_limit); 42 43 // Prepare a RegExp for being executed one or more times (using 44 // IrregexpExecOnce) on the subject. 45 // This ensures that the regexp is compiled for the subject, and that 46 // the subject is flat. 47 // Returns the number of integer spaces required by IrregexpExecOnce 48 // as its "registers" argument. If the regexp cannot be compiled, 49 // an exception is set as pending, and this function returns negative. 50 static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp, 51 Handle<String> subject); 52 53 static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re, 54 Handle<String> pattern, RegExpFlags flags, 55 Handle<String> match_pattern); 56 57 static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 58 Handle<String> subject, int index, int32_t* output, 59 int output_size); 60 61 static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp, 62 Handle<String> subject, int index, 63 Handle<RegExpMatchInfo> last_match_info); 64 65 // Execute a regular expression on the subject, starting from index. 66 // If matching succeeds, return the number of matches. This can be larger 67 // than one in the case of global regular expressions. 68 // The captures and subcaptures are stored into the registers vector. 69 // If matching fails, returns RE_FAILURE. 70 // If execution fails, sets a pending exception and returns RE_EXCEPTION. 71 static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 72 Handle<String> subject, int index, int32_t* output, 73 int output_size); 74 75 // Execute an Irregexp bytecode pattern. 76 // On a successful match, the result is a JSArray containing 77 // captured positions. On a failure, the result is the null value. 78 // Returns an empty handle in case of an exception. 79 V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec( 80 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, 81 int index, Handle<RegExpMatchInfo> last_match_info, 82 RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone); 83 84 static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, 85 Handle<String> sample_subject, bool is_one_byte); 86 static inline bool EnsureCompiledIrregexp(Isolate* isolate, 87 Handle<JSRegExp> re, 88 Handle<String> sample_subject, 89 bool is_one_byte); 90 91 // Returns true on success, false on failure. 92 static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input, 93 RegExpFlags flags, Handle<String> pattern, 94 Handle<String> sample_subject, bool is_one_byte, 95 uint32_t& backtrack_limit); 96 97 // For acting on the JSRegExp data FixedArray. 98 static int IrregexpMaxRegisterCount(FixedArray re); 99 static void SetIrregexpMaxRegisterCount(FixedArray re, int value); 100 static int IrregexpNumberOfCaptures(FixedArray re); 101 static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte); 102 static Code IrregexpNativeCode(FixedArray re, bool is_one_byte); 103}; 104 105// static 106bool RegExp::CanGenerateBytecode() { 107 return FLAG_regexp_interpret_all || FLAG_regexp_tier_up; 108} 109 110// static 111template <class CharT> 112bool RegExp::VerifySyntax(Zone* zone, uintptr_t stack_limit, const CharT* input, 113 int input_length, RegExpFlags flags, 114 RegExpError* regexp_error_out, 115 const DisallowGarbageCollection& no_gc) { 116 RegExpCompileData data; 117 bool pattern_is_valid = RegExpParser::VerifyRegExpSyntax( 118 zone, stack_limit, input, input_length, flags, &data, no_gc); 119 *regexp_error_out = data.error; 120 return pattern_is_valid; 121} 122 123template bool RegExp::VerifySyntax<uint8_t>(Zone*, uintptr_t, const uint8_t*, 124 int, RegExpFlags, 125 RegExpError* regexp_error_out, 126 const DisallowGarbageCollection&); 127template bool RegExp::VerifySyntax<base::uc16>( 128 Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, 129 RegExpError* regexp_error_out, const DisallowGarbageCollection&); 130 131MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate, 132 Handle<JSRegExp> re, 133 Handle<String> pattern, 134 RegExpError error) { 135 base::Vector<const char> error_data = 136 base::CStrVector(RegExpErrorString(error)); 137 Handle<String> error_text = 138 isolate->factory() 139 ->NewStringFromOneByte(base::Vector<const uint8_t>::cast(error_data)) 140 .ToHandleChecked(); 141 THROW_NEW_ERROR( 142 isolate, 143 NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text), 144 Object); 145} 146 147void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, 148 RegExpError error_text) { 149 USE(ThrowRegExpException(isolate, re, Handle<String>(re->source(), isolate), 150 error_text)); 151} 152 153bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) { 154 return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp); 155} 156 157namespace { 158 159// Identifies the sort of regexps where the regexp engine is faster 160// than the code used for atom matches. 161bool HasFewDifferentCharacters(Handle<String> pattern) { 162 int length = std::min(kMaxLookaheadForBoyerMoore, pattern->length()); 163 if (length <= kPatternTooShortForBoyerMoore) return false; 164 const int kMod = 128; 165 bool character_found[kMod]; 166 int different = 0; 167 memset(&character_found[0], 0, sizeof(character_found)); 168 for (int i = 0; i < length; i++) { 169 int ch = (pattern->Get(i) & (kMod - 1)); 170 if (!character_found[ch]) { 171 character_found[ch] = true; 172 different++; 173 // We declare a regexp low-alphabet if it has at least 3 times as many 174 // characters as it has different characters. 175 if (different * 3 > length) return false; 176 } 177 } 178 return true; 179} 180 181} // namespace 182 183// Generic RegExp methods. Dispatches to implementation specific methods. 184 185// static 186MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re, 187 Handle<String> pattern, RegExpFlags flags, 188 uint32_t backtrack_limit) { 189 DCHECK(pattern->IsFlat()); 190 191 // Caching is based only on the pattern and flags, but code also differs when 192 // a backtrack limit is set. A present backtrack limit is very much *not* the 193 // common case, so just skip the cache for these. 194 const bool is_compilation_cache_enabled = 195 (backtrack_limit == JSRegExp::kNoBacktrackLimit); 196 197 Zone zone(isolate->allocator(), ZONE_NAME); 198 CompilationCache* compilation_cache = nullptr; 199 if (is_compilation_cache_enabled) { 200 compilation_cache = isolate->compilation_cache(); 201 MaybeHandle<FixedArray> maybe_cached = compilation_cache->LookupRegExp( 202 pattern, JSRegExp::AsJSRegExpFlags(flags)); 203 Handle<FixedArray> cached; 204 if (maybe_cached.ToHandle(&cached)) { 205 re->set_data(*cached); 206 return re; 207 } 208 } 209 210 PostponeInterruptsScope postpone(isolate); 211 RegExpCompileData parse_result; 212 DCHECK(!isolate->has_pending_exception()); 213 if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags, 214 &parse_result)) { 215 // Throw an exception if we fail to parse the pattern. 216 return RegExp::ThrowRegExpException(isolate, re, pattern, 217 parse_result.error); 218 } 219 220 bool has_been_compiled = false; 221 222 if (FLAG_default_to_experimental_regexp_engine && 223 ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, 224 parse_result.capture_count)) { 225 DCHECK(FLAG_enable_experimental_regexp_engine); 226 ExperimentalRegExp::Initialize(isolate, re, pattern, flags, 227 parse_result.capture_count); 228 has_been_compiled = true; 229 } else if (flags & JSRegExp::kLinear) { 230 DCHECK(FLAG_enable_experimental_regexp_engine); 231 if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, 232 parse_result.capture_count)) { 233 // TODO(mbid): The error could provide a reason for why the regexp can't 234 // be executed in linear time (e.g. due to back references). 235 return RegExp::ThrowRegExpException(isolate, re, pattern, 236 RegExpError::kNotLinear); 237 } 238 ExperimentalRegExp::Initialize(isolate, re, pattern, flags, 239 parse_result.capture_count); 240 has_been_compiled = true; 241 } else if (parse_result.simple && !IsIgnoreCase(flags) && !IsSticky(flags) && 242 !HasFewDifferentCharacters(pattern)) { 243 // Parse-tree is a single atom that is equal to the pattern. 244 RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern); 245 has_been_compiled = true; 246 } else if (parse_result.tree->IsAtom() && !IsSticky(flags) && 247 parse_result.capture_count == 0) { 248 RegExpAtom* atom = parse_result.tree->AsAtom(); 249 // The pattern source might (?) contain escape sequences, but they're 250 // resolved in atom_string. 251 base::Vector<const base::uc16> atom_pattern = atom->data(); 252 Handle<String> atom_string; 253 ASSIGN_RETURN_ON_EXCEPTION( 254 isolate, atom_string, 255 isolate->factory()->NewStringFromTwoByte(atom_pattern), Object); 256 if (!IsIgnoreCase(flags) && !HasFewDifferentCharacters(atom_string)) { 257 RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string); 258 has_been_compiled = true; 259 } 260 } 261 if (!has_been_compiled) { 262 RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags, 263 parse_result.capture_count, backtrack_limit); 264 } 265 DCHECK(re->data().IsFixedArray()); 266 // Compilation succeeded so the data is set on the regexp 267 // and we can store it in the cache. 268 Handle<FixedArray> data(FixedArray::cast(re->data()), isolate); 269 if (is_compilation_cache_enabled) { 270 compilation_cache->PutRegExp(pattern, JSRegExp::AsJSRegExpFlags(flags), 271 data); 272 } 273 274 return re; 275} 276 277// static 278bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re, 279 Handle<String> subject) { 280 switch (re->type_tag()) { 281 case JSRegExp::NOT_COMPILED: 282 UNREACHABLE(); 283 case JSRegExp::ATOM: 284 return true; 285 case JSRegExp::IRREGEXP: 286 if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) { 287 DCHECK(isolate->has_pending_exception()); 288 return false; 289 } 290 return true; 291 case JSRegExp::EXPERIMENTAL: 292 if (!ExperimentalRegExp::IsCompiled(re, isolate) && 293 !ExperimentalRegExp::Compile(isolate, re)) { 294 DCHECK(isolate->has_pending_exception()); 295 return false; 296 } 297 return true; 298 } 299} 300 301// static 302MaybeHandle<Object> RegExp::ExperimentalOneshotExec( 303 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, 304 int index, Handle<RegExpMatchInfo> last_match_info, 305 RegExp::ExecQuirks exec_quirks) { 306 return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index, 307 last_match_info, exec_quirks); 308} 309 310// static 311MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp, 312 Handle<String> subject, int index, 313 Handle<RegExpMatchInfo> last_match_info, 314 ExecQuirks exec_quirks) { 315 switch (regexp->type_tag()) { 316 case JSRegExp::NOT_COMPILED: 317 UNREACHABLE(); 318 case JSRegExp::ATOM: 319 return RegExpImpl::AtomExec(isolate, regexp, subject, index, 320 last_match_info); 321 case JSRegExp::IRREGEXP: 322 return RegExpImpl::IrregexpExec(isolate, regexp, subject, index, 323 last_match_info, exec_quirks); 324 case JSRegExp::EXPERIMENTAL: 325 return ExperimentalRegExp::Exec(isolate, regexp, subject, index, 326 last_match_info, exec_quirks); 327 } 328} 329 330// RegExp Atom implementation: Simple string search using indexOf. 331 332void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re, 333 Handle<String> pattern, RegExpFlags flags, 334 Handle<String> match_pattern) { 335 isolate->factory()->SetRegExpAtomData( 336 re, pattern, JSRegExp::AsJSRegExpFlags(flags), match_pattern); 337} 338 339namespace { 340 341void SetAtomLastCapture(Isolate* isolate, 342 Handle<RegExpMatchInfo> last_match_info, String subject, 343 int from, int to) { 344 SealHandleScope shs(isolate); 345 last_match_info->SetNumberOfCaptureRegisters(2); 346 last_match_info->SetLastSubject(subject); 347 last_match_info->SetLastInput(subject); 348 last_match_info->SetCapture(0, from); 349 last_match_info->SetCapture(1, to); 350} 351 352} // namespace 353 354int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 355 Handle<String> subject, int index, int32_t* output, 356 int output_size) { 357 DCHECK_LE(0, index); 358 DCHECK_LE(index, subject->length()); 359 360 subject = String::Flatten(isolate, subject); 361 DisallowGarbageCollection no_gc; // ensure vectors stay valid 362 363 String needle = regexp->atom_pattern(); 364 int needle_len = needle.length(); 365 DCHECK(needle.IsFlat()); 366 DCHECK_LT(0, needle_len); 367 368 if (index + needle_len > subject->length()) { 369 return RegExp::RE_FAILURE; 370 } 371 372 for (int i = 0; i < output_size; i += 2) { 373 String::FlatContent needle_content = needle.GetFlatContent(no_gc); 374 String::FlatContent subject_content = subject->GetFlatContent(no_gc); 375 DCHECK(needle_content.IsFlat()); 376 DCHECK(subject_content.IsFlat()); 377 // dispatch on type of strings 378 index = 379 (needle_content.IsOneByte() 380 ? (subject_content.IsOneByte() 381 ? SearchString(isolate, subject_content.ToOneByteVector(), 382 needle_content.ToOneByteVector(), index) 383 : SearchString(isolate, subject_content.ToUC16Vector(), 384 needle_content.ToOneByteVector(), index)) 385 : (subject_content.IsOneByte() 386 ? SearchString(isolate, subject_content.ToOneByteVector(), 387 needle_content.ToUC16Vector(), index) 388 : SearchString(isolate, subject_content.ToUC16Vector(), 389 needle_content.ToUC16Vector(), index))); 390 if (index == -1) { 391 return i / 2; // Return number of matches. 392 } else { 393 output[i] = index; 394 output[i + 1] = index + needle_len; 395 index += needle_len; 396 } 397 } 398 return output_size / 2; 399} 400 401Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re, 402 Handle<String> subject, int index, 403 Handle<RegExpMatchInfo> last_match_info) { 404 static const int kNumRegisters = 2; 405 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 406 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 407 408 int res = 409 AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters); 410 411 if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value(); 412 413 DCHECK_EQ(res, RegExp::RE_SUCCESS); 414 SealHandleScope shs(isolate); 415 SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0], 416 output_registers[1]); 417 return last_match_info; 418} 419 420// Irregexp implementation. 421 422// Ensures that the regexp object contains a compiled version of the 423// source for either one-byte or two-byte subject strings. 424// If the compiled version doesn't already exist, it is compiled 425// from the source pattern. 426// If compilation fails, an exception is thrown and this function 427// returns false. 428bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re, 429 Handle<String> sample_subject, 430 bool is_one_byte) { 431 Object compiled_code = re->code(is_one_byte); 432 Object bytecode = re->bytecode(is_one_byte); 433 bool needs_initial_compilation = 434 compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue); 435 // Recompile is needed when we're dealing with the first execution of the 436 // regexp after the decision to tier up has been made. If the tiering up 437 // strategy is not in use, this value is always false. 438 bool needs_tier_up_compilation = 439 re->MarkedForTierUp() && bytecode.IsByteArray(); 440 441 if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) { 442 PrintF("JSRegExp object %p needs tier-up compilation\n", 443 reinterpret_cast<void*>(re->ptr())); 444 } 445 446 if (!needs_initial_compilation && !needs_tier_up_compilation) { 447 DCHECK(compiled_code.IsCodeT()); 448 DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray()); 449 return true; 450 } 451 452 DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray()); 453 454 return CompileIrregexp(isolate, re, sample_subject, is_one_byte); 455} 456 457namespace { 458 459#ifdef DEBUG 460bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) { 461 Object entry = re->code(is_one_byte); 462 Object bytecode = re->bytecode(is_one_byte); 463 // If we're not using the tier-up strategy, entry can only be a smi 464 // representing an uncompiled regexp here. If we're using the tier-up 465 // strategy, entry can still be a smi representing an uncompiled regexp, when 466 // compiling the regexp before the tier-up, or it can contain a trampoline to 467 // the regexp interpreter, in which case the bytecode field contains compiled 468 // bytecode, when recompiling the regexp after the tier-up. If the 469 // tier-up was forced, which happens for global replaces, entry is a smi 470 // representing an uncompiled regexp, even though we're "recompiling" after 471 // the tier-up. 472 if (re->ShouldProduceBytecode()) { 473 DCHECK(entry.IsSmi()); 474 DCHECK(bytecode.IsSmi()); 475 int entry_value = Smi::ToInt(entry); 476 int bytecode_value = Smi::ToInt(bytecode); 477 DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value); 478 DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value); 479 } else { 480 DCHECK(entry.IsSmi() || (entry.IsCodeT() && bytecode.IsByteArray())); 481 } 482 483 return true; 484} 485#endif 486 487struct RegExpCaptureIndexLess { 488 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { 489 DCHECK_NOT_NULL(lhs); 490 DCHECK_NOT_NULL(rhs); 491 return lhs->index() < rhs->index(); 492 } 493}; 494 495} // namespace 496 497// static 498Handle<FixedArray> RegExp::CreateCaptureNameMap( 499 Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures) { 500 if (named_captures == nullptr) return Handle<FixedArray>(); 501 502 DCHECK(!named_captures->empty()); 503 504 // Named captures are sorted by name (because the set is used to ensure 505 // name uniqueness). But the capture name map must to be sorted by index. 506 507 std::sort(named_captures->begin(), named_captures->end(), 508 RegExpCaptureIndexLess{}); 509 510 int len = static_cast<int>(named_captures->size()) * 2; 511 Handle<FixedArray> array = isolate->factory()->NewFixedArray(len); 512 513 int i = 0; 514 for (const RegExpCapture* capture : *named_captures) { 515 base::Vector<const base::uc16> capture_name(capture->name()->data(), 516 capture->name()->size()); 517 // CSA code in ConstructNewResultFromMatchInfo requires these strings to be 518 // internalized so they can be used as property names in the 'exec' results. 519 Handle<String> name = isolate->factory()->InternalizeString(capture_name); 520 array->set(i * 2, *name); 521 array->set(i * 2 + 1, Smi::FromInt(capture->index())); 522 523 i++; 524 } 525 DCHECK_EQ(i * 2, len); 526 527 return array; 528} 529 530bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, 531 Handle<String> sample_subject, 532 bool is_one_byte) { 533 // Compile the RegExp. 534 Zone zone(isolate->allocator(), ZONE_NAME); 535 PostponeInterruptsScope postpone(isolate); 536 537 DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte)); 538 539 RegExpFlags flags = JSRegExp::AsRegExpFlags(re->flags()); 540 541 Handle<String> pattern(re->source(), isolate); 542 pattern = String::Flatten(isolate, pattern); 543 RegExpCompileData compile_data; 544 if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags, 545 &compile_data)) { 546 // Throw an exception if we fail to parse the pattern. 547 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 548 USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error)); 549 return false; 550 } 551 // The compilation target is a kBytecode if we're interpreting all regexp 552 // objects, or if we're using the tier-up strategy but the tier-up hasn't 553 // happened yet. The compilation target is a kNative if we're using the 554 // tier-up strategy and we need to recompile to tier-up, or if we're producing 555 // native code for all regexp objects. 556 compile_data.compilation_target = re->ShouldProduceBytecode() 557 ? RegExpCompilationTarget::kBytecode 558 : RegExpCompilationTarget::kNative; 559 uint32_t backtrack_limit = re->backtrack_limit(); 560 const bool compilation_succeeded = 561 Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject, 562 is_one_byte, backtrack_limit); 563 if (!compilation_succeeded) { 564 DCHECK(compile_data.error != RegExpError::kNone); 565 RegExp::ThrowRegExpException(isolate, re, compile_data.error); 566 return false; 567 } 568 569 Handle<FixedArray> data = 570 Handle<FixedArray>(FixedArray::cast(re->data()), isolate); 571 if (compile_data.compilation_target == RegExpCompilationTarget::kNative) { 572 Code code = Code::cast(*compile_data.code); 573 data->set(JSRegExp::code_index(is_one_byte), ToCodeT(code)); 574 575 // Reset bytecode to uninitialized. In case we use tier-up we know that 576 // tier-up has happened this way. 577 data->set(JSRegExp::bytecode_index(is_one_byte), 578 Smi::FromInt(JSRegExp::kUninitializedValue)); 579 } else { 580 DCHECK_EQ(compile_data.compilation_target, 581 RegExpCompilationTarget::kBytecode); 582 // Store code generated by compiler in bytecode and trampoline to 583 // interpreter in code. 584 data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code); 585 Handle<CodeT> trampoline = 586 BUILTIN_CODE(isolate, RegExpInterpreterTrampoline); 587 data->set(JSRegExp::code_index(is_one_byte), *trampoline); 588 } 589 Handle<FixedArray> capture_name_map = 590 RegExp::CreateCaptureNameMap(isolate, compile_data.named_captures); 591 re->set_capture_name_map(capture_name_map); 592 int register_max = IrregexpMaxRegisterCount(*data); 593 if (compile_data.register_count > register_max) { 594 SetIrregexpMaxRegisterCount(*data, compile_data.register_count); 595 } 596 data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit)); 597 598 if (FLAG_trace_regexp_tier_up) { 599 PrintF("JSRegExp object %p %s size: %d\n", 600 reinterpret_cast<void*>(re->ptr()), 601 re->ShouldProduceBytecode() ? "bytecode" : "native code", 602 re->ShouldProduceBytecode() 603 ? IrregexpByteCode(*data, is_one_byte).Size() 604 : IrregexpNativeCode(*data, is_one_byte).Size()); 605 } 606 607 return true; 608} 609 610int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) { 611 return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex)); 612} 613 614void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) { 615 re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 616} 617 618int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) { 619 return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex)); 620} 621 622ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) { 623 return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte))); 624} 625 626Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) { 627 return Code::cast(re.get(JSRegExp::code_index(is_one_byte))); 628} 629 630void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, 631 Handle<String> pattern, RegExpFlags flags, 632 int capture_count, 633 uint32_t backtrack_limit) { 634 // Initialize compiled code entries to null. 635 isolate->factory()->SetRegExpIrregexpData(re, pattern, 636 JSRegExp::AsJSRegExpFlags(flags), 637 capture_count, backtrack_limit); 638} 639 640// static 641int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp, 642 Handle<String> subject) { 643 DCHECK(subject->IsFlat()); 644 645 // Check representation of the underlying storage. 646 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); 647 if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject, 648 is_one_byte)) { 649 return -1; 650 } 651 652 // Only reserve room for output captures. Internal registers are allocated by 653 // the engine. 654 return JSRegExp::RegistersForCaptureCount(regexp->capture_count()); 655} 656 657int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 658 Handle<String> subject, int index, 659 int32_t* output, int output_size) { 660 DCHECK_LE(0, index); 661 DCHECK_LE(index, subject->length()); 662 DCHECK(subject->IsFlat()); 663 DCHECK_GE(output_size, 664 JSRegExp::RegistersForCaptureCount(regexp->capture_count())); 665 666 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); 667 668 if (!regexp->ShouldProduceBytecode()) { 669 do { 670 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte); 671 // The stack is used to allocate registers for the compiled regexp code. 672 // This means that in case of failure, the output registers array is left 673 // untouched and contains the capture results from the previous successful 674 // match. We can use that to set the last match info lazily. 675 int res = NativeRegExpMacroAssembler::Match(regexp, subject, output, 676 output_size, index, isolate); 677 if (res != NativeRegExpMacroAssembler::RETRY) { 678 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || 679 isolate->has_pending_exception()); 680 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == 681 RegExp::RE_SUCCESS); 682 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == 683 RegExp::RE_FAILURE); 684 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) == 685 RegExp::RE_EXCEPTION); 686 return res; 687 } 688 // If result is RETRY, the string has changed representation, and we 689 // must restart from scratch. 690 // In this case, it means we must make sure we are prepared to handle 691 // the, potentially, different subject (the string can switch between 692 // being internal and external, and even between being Latin1 and 693 // UC16, but the characters are always the same). 694 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); 695 } while (true); 696 UNREACHABLE(); 697 } else { 698 DCHECK(regexp->ShouldProduceBytecode()); 699 700 do { 701 IrregexpInterpreter::Result result = 702 IrregexpInterpreter::MatchForCallFromRuntime( 703 isolate, regexp, subject, output, output_size, index); 704 DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION, 705 isolate->has_pending_exception()); 706 707 switch (result) { 708 case IrregexpInterpreter::SUCCESS: 709 case IrregexpInterpreter::EXCEPTION: 710 case IrregexpInterpreter::FAILURE: 711 case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL: 712 return result; 713 case IrregexpInterpreter::RETRY: 714 // The string has changed representation, and we must restart the 715 // match. 716 // We need to reset the tier up to start over with compilation. 717 if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick(); 718 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); 719 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte); 720 break; 721 } 722 } while (true); 723 UNREACHABLE(); 724 } 725} 726 727MaybeHandle<Object> RegExpImpl::IrregexpExec( 728 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, 729 int previous_index, Handle<RegExpMatchInfo> last_match_info, 730 RegExp::ExecQuirks exec_quirks) { 731 DCHECK_EQ(regexp->type_tag(), JSRegExp::IRREGEXP); 732 733 subject = String::Flatten(isolate, subject); 734 735#ifdef DEBUG 736 if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) { 737 PrintF("\n\nRegexp match: /%s/\n\n", regexp->source().ToCString().get()); 738 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 739 } 740#endif 741 742 // For very long subject strings, the regexp interpreter is currently much 743 // slower than the jitted code execution. If the tier-up strategy is turned 744 // on, we want to avoid this performance penalty so we eagerly tier-up if the 745 // subject string length is equal or greater than the given heuristic value. 746 if (FLAG_regexp_tier_up && 747 subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) { 748 regexp->MarkTierUpForNextExec(); 749 if (FLAG_trace_regexp_tier_up) { 750 PrintF( 751 "Forcing tier-up for very long strings in " 752 "RegExpImpl::IrregexpExec\n"); 753 } 754 } 755 756 // Prepare space for the return values. 757 int required_registers = 758 RegExpImpl::IrregexpPrepare(isolate, regexp, subject); 759 if (required_registers < 0) { 760 // Compiling failed with an exception. 761 DCHECK(isolate->has_pending_exception()); 762 return MaybeHandle<Object>(); 763 } 764 765 int32_t* output_registers = nullptr; 766 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 767 output_registers = NewArray<int32_t>(required_registers); 768 } 769 std::unique_ptr<int32_t[]> auto_release(output_registers); 770 if (output_registers == nullptr) { 771 output_registers = isolate->jsregexp_static_offsets_vector(); 772 } 773 774 int res = 775 RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index, 776 output_registers, required_registers); 777 778 if (res == RegExp::RE_SUCCESS) { 779 if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) { 780 if (output_registers[0] >= subject->length()) { 781 return isolate->factory()->null_value(); 782 } 783 } 784 int capture_count = regexp->capture_count(); 785 return RegExp::SetLastMatchInfo(isolate, last_match_info, subject, 786 capture_count, output_registers); 787 } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) { 788 return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, 789 previous_index, last_match_info); 790 } else if (res == RegExp::RE_EXCEPTION) { 791 DCHECK(isolate->has_pending_exception()); 792 return MaybeHandle<Object>(); 793 } else { 794 DCHECK(res == RegExp::RE_FAILURE); 795 return isolate->factory()->null_value(); 796 } 797} 798 799// static 800Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo( 801 Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, 802 Handle<String> subject, int capture_count, int32_t* match) { 803 // This is the only place where match infos can grow. If, after executing the 804 // regexp, RegExpExecStub finds that the match info is too small, it restarts 805 // execution in RegExpImpl::Exec, which finally grows the match info right 806 // here. 807 Handle<RegExpMatchInfo> result = 808 RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count); 809 if (*result != *last_match_info) { 810 if (*last_match_info == *isolate->regexp_last_match_info()) { 811 // This inner condition is only needed for special situations like the 812 // regexp fuzzer, where we pass our own custom RegExpMatchInfo to 813 // RegExpImpl::Exec; there actually want to bypass the Isolate's match 814 // info and execute the regexp without side effects. 815 isolate->native_context()->set_regexp_last_match_info(*result); 816 } 817 } 818 819 int capture_register_count = 820 JSRegExp::RegistersForCaptureCount(capture_count); 821 DisallowGarbageCollection no_gc; 822 if (match != nullptr) { 823 for (int i = 0; i < capture_register_count; i += 2) { 824 result->SetCapture(i, match[i]); 825 result->SetCapture(i + 1, match[i + 1]); 826 } 827 } 828 result->SetLastSubject(*subject); 829 result->SetLastInput(*subject); 830 return result; 831} 832 833// static 834void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) { 835 DotPrinter::DotPrint(label, node); 836} 837 838namespace { 839 840// Returns true if we've either generated too much irregex code within this 841// isolate, or the pattern string is too long. 842bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) { 843 // Limit the space regexps take up on the heap. In order to limit this we 844 // would like to keep track of the amount of regexp code on the heap. This 845 // is not tracked, however. As a conservative approximation we track the 846 // total regexp code compiled including code that has subsequently been freed 847 // and the total executable memory at any point. 848 static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB; 849 static constexpr size_t kRegExpCompiledLimit = 1 * MB; 850 851 Heap* heap = isolate->heap(); 852 if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true; 853 return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit && 854 heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit); 855} 856 857} // namespace 858 859// static 860bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone, 861 RegExpCompileData* data, RegExpFlags flags, 862 Handle<String> pattern, 863 Handle<String> sample_subject, 864 bool is_one_byte) { 865 uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit; 866 return RegExpImpl::Compile(isolate, zone, data, flags, pattern, 867 sample_subject, is_one_byte, backtrack_limit); 868} 869 870bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, 871 RegExpFlags flags, Handle<String> pattern, 872 Handle<String> sample_subject, bool is_one_byte, 873 uint32_t& backtrack_limit) { 874 if (JSRegExp::RegistersForCaptureCount(data->capture_count) > 875 RegExpMacroAssembler::kMaxRegisterCount) { 876 data->error = RegExpError::kTooLarge; 877 return false; 878 } 879 880 RegExpCompiler compiler(isolate, zone, data->capture_count, flags, 881 is_one_byte); 882 883 if (compiler.optimize()) { 884 compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern)); 885 } 886 887 // Sample some characters from the middle of the string. 888 static const int kSampleSize = 128; 889 890 sample_subject = String::Flatten(isolate, sample_subject); 891 int chars_sampled = 0; 892 int half_way = (sample_subject->length() - kSampleSize) / 2; 893 for (int i = std::max(0, half_way); 894 i < sample_subject->length() && chars_sampled < kSampleSize; 895 i++, chars_sampled++) { 896 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 897 } 898 899 data->node = compiler.PreprocessRegExp(data, flags, is_one_byte); 900 data->error = AnalyzeRegExp(isolate, is_one_byte, flags, data->node); 901 if (data->error != RegExpError::kNone) { 902 return false; 903 } 904 905 if (FLAG_trace_regexp_graph) DotPrinter::DotPrint("Start", data->node); 906 907 // Create the correct assembler for the architecture. 908 std::unique_ptr<RegExpMacroAssembler> macro_assembler; 909 if (data->compilation_target == RegExpCompilationTarget::kNative) { 910 // Native regexp implementation. 911 DCHECK(!FLAG_jitless); 912 913 NativeRegExpMacroAssembler::Mode mode = 914 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 915 : NativeRegExpMacroAssembler::UC16; 916 917 const int output_register_count = 918 JSRegExp::RegistersForCaptureCount(data->capture_count); 919#if V8_TARGET_ARCH_IA32 920 macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode, 921 output_register_count)); 922#elif V8_TARGET_ARCH_X64 923 macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode, 924 output_register_count)); 925#elif V8_TARGET_ARCH_ARM 926 macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode, 927 output_register_count)); 928#elif V8_TARGET_ARCH_ARM64 929 macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode, 930 output_register_count)); 931#elif V8_TARGET_ARCH_S390 932 macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode, 933 output_register_count)); 934#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64 935 macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode, 936 output_register_count)); 937#elif V8_TARGET_ARCH_MIPS 938 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode, 939 output_register_count)); 940#elif V8_TARGET_ARCH_MIPS64 941 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode, 942 output_register_count)); 943#elif V8_TARGET_ARCH_RISCV64 944 macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode, 945 output_register_count)); 946#elif V8_TARGET_ARCH_LOONG64 947 macro_assembler.reset(new RegExpMacroAssemblerLOONG64( 948 isolate, zone, mode, output_register_count)); 949#else 950#error "Unsupported architecture" 951#endif 952 } else { 953 DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode); 954 // Interpreted regexp implementation. 955 macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone)); 956 } 957 958 macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern)); 959 if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks && 960 ExperimentalRegExp::CanBeHandled(data->tree, flags, 961 data->capture_count)) { 962 if (backtrack_limit == JSRegExp::kNoBacktrackLimit) { 963 backtrack_limit = FLAG_regexp_backtracks_before_fallback; 964 } else { 965 backtrack_limit = 966 std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback); 967 } 968 macro_assembler->set_backtrack_limit(backtrack_limit); 969 macro_assembler->set_can_fallback(true); 970 } else { 971 macro_assembler->set_backtrack_limit(backtrack_limit); 972 macro_assembler->set_can_fallback(false); 973 } 974 975 // Inserted here, instead of in Assembler, because it depends on information 976 // in the AST that isn't replicated in the Node structure. 977 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 978 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 979 int max_length = data->tree->max_match(); 980 static const int kMaxBacksearchLimit = 1024; 981 if (is_end_anchored && !is_start_anchored && !IsSticky(flags) && 982 max_length < kMaxBacksearchLimit) { 983 macro_assembler->SetCurrentPositionFromEnd(max_length); 984 } 985 986 if (IsGlobal(flags)) { 987 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; 988 if (data->tree->min_match() > 0) { 989 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; 990 } else if (IsUnicode(flags)) { 991 mode = RegExpMacroAssembler::GLOBAL_UNICODE; 992 } 993 macro_assembler->set_global_mode(mode); 994 } 995 996 RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get(); 997#ifdef DEBUG 998 std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler; 999 if (FLAG_trace_regexp_assembler) { 1000 tracer_macro_assembler.reset( 1001 new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr)); 1002 macro_assembler_ptr = tracer_macro_assembler.get(); 1003 } 1004#endif 1005 1006 RegExpCompiler::CompilationResult result = compiler.Assemble( 1007 isolate, macro_assembler_ptr, data->node, data->capture_count, pattern); 1008 1009 // Code / bytecode printing. 1010 { 1011#ifdef ENABLE_DISASSEMBLER 1012 if (FLAG_print_regexp_code && 1013 data->compilation_target == RegExpCompilationTarget::kNative) { 1014 CodeTracer::Scope trace_scope(isolate->GetCodeTracer()); 1015 OFStream os(trace_scope.file()); 1016 Handle<Code> c = Handle<Code>::cast(result.code); 1017 auto pattern_cstring = pattern->ToCString(); 1018 c->Disassemble(pattern_cstring.get(), os, isolate); 1019 } 1020#endif 1021 if (FLAG_print_regexp_bytecode && 1022 data->compilation_target == RegExpCompilationTarget::kBytecode) { 1023 Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code); 1024 auto pattern_cstring = pattern->ToCString(); 1025 RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(), 1026 bytecode->length(), pattern_cstring.get()); 1027 } 1028 } 1029 1030 if (result.error != RegExpError::kNone) { 1031 if (FLAG_correctness_fuzzer_suppressions && 1032 result.error == RegExpError::kStackOverflow) { 1033 FATAL("Aborting on stack overflow"); 1034 } 1035 data->error = result.error; 1036 } 1037 1038 data->code = result.code; 1039 data->register_count = result.num_registers; 1040 1041 return result.Succeeded(); 1042} 1043 1044RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp, 1045 Handle<String> subject, Isolate* isolate) 1046 : register_array_(nullptr), 1047 register_array_size_(0), 1048 regexp_(regexp), 1049 subject_(subject), 1050 isolate_(isolate) { 1051 DCHECK(IsGlobal(JSRegExp::AsRegExpFlags(regexp->flags()))); 1052 1053 switch (regexp_->type_tag()) { 1054 case JSRegExp::NOT_COMPILED: 1055 UNREACHABLE(); 1056 case JSRegExp::ATOM: { 1057 // ATOM regexps do not have a global loop, so we search for one match at 1058 // a time. 1059 static const int kAtomRegistersPerMatch = 2; 1060 registers_per_match_ = kAtomRegistersPerMatch; 1061 register_array_size_ = registers_per_match_; 1062 break; 1063 } 1064 case JSRegExp::IRREGEXP: { 1065 registers_per_match_ = 1066 RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_); 1067 if (registers_per_match_ < 0) { 1068 num_matches_ = -1; // Signal exception. 1069 return; 1070 } 1071 if (regexp->ShouldProduceBytecode()) { 1072 // Global loop in interpreted regexp is not implemented. We choose the 1073 // size of the offsets vector so that it can only store one match. 1074 register_array_size_ = registers_per_match_; 1075 max_matches_ = 1; 1076 } else { 1077 register_array_size_ = std::max( 1078 {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize}); 1079 } 1080 break; 1081 } 1082 case JSRegExp::EXPERIMENTAL: { 1083 if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) && 1084 !ExperimentalRegExp::Compile(isolate_, regexp)) { 1085 DCHECK(isolate->has_pending_exception()); 1086 num_matches_ = -1; // Signal exception. 1087 return; 1088 } 1089 registers_per_match_ = 1090 JSRegExp::RegistersForCaptureCount(regexp->capture_count()); 1091 register_array_size_ = std::max( 1092 {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize}); 1093 break; 1094 } 1095 } 1096 1097 max_matches_ = register_array_size_ / registers_per_match_; 1098 1099 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 1100 register_array_ = NewArray<int32_t>(register_array_size_); 1101 } else { 1102 register_array_ = isolate->jsregexp_static_offsets_vector(); 1103 } 1104 1105 // Set state so that fetching the results the first time triggers a call 1106 // to the compiled regexp. 1107 current_match_index_ = max_matches_ - 1; 1108 num_matches_ = max_matches_; 1109 DCHECK_LE(2, registers_per_match_); // Each match has at least one capture. 1110 DCHECK_GE(register_array_size_, registers_per_match_); 1111 int32_t* last_match = 1112 ®ister_array_[current_match_index_ * registers_per_match_]; 1113 last_match[0] = -1; 1114 last_match[1] = 0; 1115} 1116 1117RegExpGlobalCache::~RegExpGlobalCache() { 1118 // Deallocate the register array if we allocated it in the constructor 1119 // (as opposed to using the existing jsregexp_static_offsets_vector). 1120 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 1121 DeleteArray(register_array_); 1122 } 1123} 1124 1125int RegExpGlobalCache::AdvanceZeroLength(int last_index) { 1126 if (IsUnicode(JSRegExp::AsRegExpFlags(regexp_->flags())) && 1127 last_index + 1 < subject_->length() && 1128 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && 1129 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { 1130 // Advance over the surrogate pair. 1131 return last_index + 2; 1132 } 1133 return last_index + 1; 1134} 1135 1136int32_t* RegExpGlobalCache::FetchNext() { 1137 current_match_index_++; 1138 1139 if (current_match_index_ >= num_matches_) { 1140 // Current batch of results exhausted. 1141 // Fail if last batch was not even fully filled. 1142 if (num_matches_ < max_matches_) { 1143 num_matches_ = 0; // Signal failed match. 1144 return nullptr; 1145 } 1146 1147 int32_t* last_match = 1148 ®ister_array_[(current_match_index_ - 1) * registers_per_match_]; 1149 int last_end_index = last_match[1]; 1150 1151 switch (regexp_->type_tag()) { 1152 case JSRegExp::NOT_COMPILED: 1153 UNREACHABLE(); 1154 case JSRegExp::ATOM: 1155 num_matches_ = 1156 RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index, 1157 register_array_, register_array_size_); 1158 break; 1159 case JSRegExp::EXPERIMENTAL: { 1160 DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_)); 1161 DisallowGarbageCollection no_gc; 1162 num_matches_ = ExperimentalRegExp::ExecRaw( 1163 isolate_, RegExp::kFromRuntime, *regexp_, *subject_, 1164 register_array_, register_array_size_, last_end_index); 1165 break; 1166 } 1167 case JSRegExp::IRREGEXP: { 1168 int last_start_index = last_match[0]; 1169 if (last_start_index == last_end_index) { 1170 // Zero-length match. Advance by one code point. 1171 last_end_index = AdvanceZeroLength(last_end_index); 1172 } 1173 if (last_end_index > subject_->length()) { 1174 num_matches_ = 0; // Signal failed match. 1175 return nullptr; 1176 } 1177 num_matches_ = RegExpImpl::IrregexpExecRaw( 1178 isolate_, regexp_, subject_, last_end_index, register_array_, 1179 register_array_size_); 1180 break; 1181 } 1182 } 1183 1184 // Fall back to experimental engine if needed and possible. 1185 if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) { 1186 num_matches_ = ExperimentalRegExp::OneshotExecRaw( 1187 isolate_, regexp_, subject_, register_array_, register_array_size_, 1188 last_end_index); 1189 } 1190 1191 if (num_matches_ <= 0) { 1192 return nullptr; 1193 } 1194 current_match_index_ = 0; 1195 return register_array_; 1196 } else { 1197 return ®ister_array_[current_match_index_ * registers_per_match_]; 1198 } 1199} 1200 1201int32_t* RegExpGlobalCache::LastSuccessfulMatch() { 1202 int index = current_match_index_ * registers_per_match_; 1203 if (num_matches_ == 0) { 1204 // After a failed match we shift back by one result. 1205 index -= registers_per_match_; 1206 } 1207 return ®ister_array_[index]; 1208} 1209 1210Object RegExpResultsCache::Lookup(Heap* heap, String key_string, 1211 Object key_pattern, 1212 FixedArray* last_match_cache, 1213 ResultsCacheType type) { 1214 FixedArray cache; 1215 if (!key_string.IsInternalizedString()) return Smi::zero(); 1216 if (type == STRING_SPLIT_SUBSTRINGS) { 1217 DCHECK(key_pattern.IsString()); 1218 if (!key_pattern.IsInternalizedString()) return Smi::zero(); 1219 cache = heap->string_split_cache(); 1220 } else { 1221 DCHECK(type == REGEXP_MULTIPLE_INDICES); 1222 DCHECK(key_pattern.IsFixedArray()); 1223 cache = heap->regexp_multiple_cache(); 1224 } 1225 1226 uint32_t hash = key_string.hash(); 1227 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 1228 ~(kArrayEntriesPerCacheEntry - 1)); 1229 if (cache.get(index + kStringOffset) != key_string || 1230 cache.get(index + kPatternOffset) != key_pattern) { 1231 index = 1232 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 1233 if (cache.get(index + kStringOffset) != key_string || 1234 cache.get(index + kPatternOffset) != key_pattern) { 1235 return Smi::zero(); 1236 } 1237 } 1238 1239 *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset)); 1240 return cache.get(index + kArrayOffset); 1241} 1242 1243void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, 1244 Handle<Object> key_pattern, 1245 Handle<FixedArray> value_array, 1246 Handle<FixedArray> last_match_cache, 1247 ResultsCacheType type) { 1248 Factory* factory = isolate->factory(); 1249 Handle<FixedArray> cache; 1250 if (!key_string->IsInternalizedString()) return; 1251 if (type == STRING_SPLIT_SUBSTRINGS) { 1252 DCHECK(key_pattern->IsString()); 1253 if (!key_pattern->IsInternalizedString()) return; 1254 cache = factory->string_split_cache(); 1255 } else { 1256 DCHECK(type == REGEXP_MULTIPLE_INDICES); 1257 DCHECK(key_pattern->IsFixedArray()); 1258 cache = factory->regexp_multiple_cache(); 1259 } 1260 1261 uint32_t hash = key_string->hash(); 1262 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 1263 ~(kArrayEntriesPerCacheEntry - 1)); 1264 if (cache->get(index + kStringOffset) == Smi::zero()) { 1265 cache->set(index + kStringOffset, *key_string); 1266 cache->set(index + kPatternOffset, *key_pattern); 1267 cache->set(index + kArrayOffset, *value_array); 1268 cache->set(index + kLastMatchOffset, *last_match_cache); 1269 } else { 1270 uint32_t index2 = 1271 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 1272 if (cache->get(index2 + kStringOffset) == Smi::zero()) { 1273 cache->set(index2 + kStringOffset, *key_string); 1274 cache->set(index2 + kPatternOffset, *key_pattern); 1275 cache->set(index2 + kArrayOffset, *value_array); 1276 cache->set(index2 + kLastMatchOffset, *last_match_cache); 1277 } else { 1278 cache->set(index2 + kStringOffset, Smi::zero()); 1279 cache->set(index2 + kPatternOffset, Smi::zero()); 1280 cache->set(index2 + kArrayOffset, Smi::zero()); 1281 cache->set(index2 + kLastMatchOffset, Smi::zero()); 1282 cache->set(index + kStringOffset, *key_string); 1283 cache->set(index + kPatternOffset, *key_pattern); 1284 cache->set(index + kArrayOffset, *value_array); 1285 cache->set(index + kLastMatchOffset, *last_match_cache); 1286 } 1287 } 1288 // If the array is a reasonably short list of substrings, convert it into a 1289 // list of internalized strings. 1290 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { 1291 for (int i = 0; i < value_array->length(); i++) { 1292 Handle<String> str(String::cast(value_array->get(i)), isolate); 1293 Handle<String> internalized_str = factory->InternalizeString(str); 1294 value_array->set(i, *internalized_str); 1295 } 1296 } 1297 // Convert backing store to a copy-on-write array. 1298 value_array->set_map_no_write_barrier( 1299 ReadOnlyRoots(isolate).fixed_cow_array_map()); 1300} 1301 1302void RegExpResultsCache::Clear(FixedArray cache) { 1303 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 1304 cache.set(i, Smi::zero()); 1305 } 1306} 1307 1308} // namespace internal 1309} // namespace v8 1310