xref: /third_party/node/deps/v8/src/regexp/regexp.cc (revision 1cb0ef41)
1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp.h"
6
7#include "src/base/strings.h"
8#include "src/codegen/compilation-cache.h"
9#include "src/diagnostics/code-tracer.h"
10#include "src/execution/interrupts-scope.h"
11#include "src/heap/heap-inl.h"
12#include "src/objects/js-regexp-inl.h"
13#include "src/regexp/experimental/experimental.h"
14#include "src/regexp/regexp-bytecode-generator.h"
15#include "src/regexp/regexp-bytecodes.h"
16#include "src/regexp/regexp-compiler.h"
17#include "src/regexp/regexp-dotprinter.h"
18#include "src/regexp/regexp-interpreter.h"
19#include "src/regexp/regexp-macro-assembler-arch.h"
20#include "src/regexp/regexp-macro-assembler-tracer.h"
21#include "src/regexp/regexp-parser.h"
22#include "src/regexp/regexp-utils.h"
23#include "src/strings/string-search.h"
24#include "src/utils/ostreams.h"
25
26namespace v8 {
27namespace internal {
28
29using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
30
31class RegExpImpl final : public AllStatic {
32 public:
33  // Returns a string representation of a regular expression.
34  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
35  // This function calls the garbage collector if necessary.
36  static Handle<String> ToString(Handle<Object> value);
37
38  // Prepares a JSRegExp object with Irregexp-specific data.
39  static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
40                                 Handle<String> pattern, RegExpFlags flags,
41                                 int capture_count, uint32_t backtrack_limit);
42
43  // Prepare a RegExp for being executed one or more times (using
44  // IrregexpExecOnce) on the subject.
45  // This ensures that the regexp is compiled for the subject, and that
46  // the subject is flat.
47  // Returns the number of integer spaces required by IrregexpExecOnce
48  // as its "registers" argument.  If the regexp cannot be compiled,
49  // an exception is set as pending, and this function returns negative.
50  static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
51                             Handle<String> subject);
52
53  static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
54                          Handle<String> pattern, RegExpFlags flags,
55                          Handle<String> match_pattern);
56
57  static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
58                         Handle<String> subject, int index, int32_t* output,
59                         int output_size);
60
61  static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
62                                 Handle<String> subject, int index,
63                                 Handle<RegExpMatchInfo> last_match_info);
64
65  // Execute a regular expression on the subject, starting from index.
66  // If matching succeeds, return the number of matches.  This can be larger
67  // than one in the case of global regular expressions.
68  // The captures and subcaptures are stored into the registers vector.
69  // If matching fails, returns RE_FAILURE.
70  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
71  static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
72                             Handle<String> subject, int index, int32_t* output,
73                             int output_size);
74
75  // Execute an Irregexp bytecode pattern.
76  // On a successful match, the result is a JSArray containing
77  // captured positions.  On a failure, the result is the null value.
78  // Returns an empty handle in case of an exception.
79  V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
80      Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
81      int index, Handle<RegExpMatchInfo> last_match_info,
82      RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
83
84  static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
85                              Handle<String> sample_subject, bool is_one_byte);
86  static inline bool EnsureCompiledIrregexp(Isolate* isolate,
87                                            Handle<JSRegExp> re,
88                                            Handle<String> sample_subject,
89                                            bool is_one_byte);
90
91  // Returns true on success, false on failure.
92  static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
93                      RegExpFlags flags, Handle<String> pattern,
94                      Handle<String> sample_subject, bool is_one_byte,
95                      uint32_t& backtrack_limit);
96
97  // For acting on the JSRegExp data FixedArray.
98  static int IrregexpMaxRegisterCount(FixedArray re);
99  static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
100  static int IrregexpNumberOfCaptures(FixedArray re);
101  static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
102  static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
103};
104
105// static
106bool RegExp::CanGenerateBytecode() {
107  return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
108}
109
110// static
111template <class CharT>
112bool RegExp::VerifySyntax(Zone* zone, uintptr_t stack_limit, const CharT* input,
113                          int input_length, RegExpFlags flags,
114                          RegExpError* regexp_error_out,
115                          const DisallowGarbageCollection& no_gc) {
116  RegExpCompileData data;
117  bool pattern_is_valid = RegExpParser::VerifyRegExpSyntax(
118      zone, stack_limit, input, input_length, flags, &data, no_gc);
119  *regexp_error_out = data.error;
120  return pattern_is_valid;
121}
122
123template bool RegExp::VerifySyntax<uint8_t>(Zone*, uintptr_t, const uint8_t*,
124                                            int, RegExpFlags,
125                                            RegExpError* regexp_error_out,
126                                            const DisallowGarbageCollection&);
127template bool RegExp::VerifySyntax<base::uc16>(
128    Zone*, uintptr_t, const base::uc16*, int, RegExpFlags,
129    RegExpError* regexp_error_out, const DisallowGarbageCollection&);
130
131MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
132                                                 Handle<JSRegExp> re,
133                                                 Handle<String> pattern,
134                                                 RegExpError error) {
135  base::Vector<const char> error_data =
136      base::CStrVector(RegExpErrorString(error));
137  Handle<String> error_text =
138      isolate->factory()
139          ->NewStringFromOneByte(base::Vector<const uint8_t>::cast(error_data))
140          .ToHandleChecked();
141  THROW_NEW_ERROR(
142      isolate,
143      NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
144      Object);
145}
146
147void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
148                                  RegExpError error_text) {
149  USE(ThrowRegExpException(isolate, re, Handle<String>(re->source(), isolate),
150                           error_text));
151}
152
153bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) {
154  return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
155}
156
157namespace {
158
159// Identifies the sort of regexps where the regexp engine is faster
160// than the code used for atom matches.
161bool HasFewDifferentCharacters(Handle<String> pattern) {
162  int length = std::min(kMaxLookaheadForBoyerMoore, pattern->length());
163  if (length <= kPatternTooShortForBoyerMoore) return false;
164  const int kMod = 128;
165  bool character_found[kMod];
166  int different = 0;
167  memset(&character_found[0], 0, sizeof(character_found));
168  for (int i = 0; i < length; i++) {
169    int ch = (pattern->Get(i) & (kMod - 1));
170    if (!character_found[ch]) {
171      character_found[ch] = true;
172      different++;
173      // We declare a regexp low-alphabet if it has at least 3 times as many
174      // characters as it has different characters.
175      if (different * 3 > length) return false;
176    }
177  }
178  return true;
179}
180
181}  // namespace
182
183// Generic RegExp methods. Dispatches to implementation specific methods.
184
185// static
186MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
187                                    Handle<String> pattern, RegExpFlags flags,
188                                    uint32_t backtrack_limit) {
189  DCHECK(pattern->IsFlat());
190
191  // Caching is based only on the pattern and flags, but code also differs when
192  // a backtrack limit is set. A present backtrack limit is very much *not* the
193  // common case, so just skip the cache for these.
194  const bool is_compilation_cache_enabled =
195      (backtrack_limit == JSRegExp::kNoBacktrackLimit);
196
197  Zone zone(isolate->allocator(), ZONE_NAME);
198  CompilationCache* compilation_cache = nullptr;
199  if (is_compilation_cache_enabled) {
200    compilation_cache = isolate->compilation_cache();
201    MaybeHandle<FixedArray> maybe_cached = compilation_cache->LookupRegExp(
202        pattern, JSRegExp::AsJSRegExpFlags(flags));
203    Handle<FixedArray> cached;
204    if (maybe_cached.ToHandle(&cached)) {
205      re->set_data(*cached);
206      return re;
207    }
208  }
209
210  PostponeInterruptsScope postpone(isolate);
211  RegExpCompileData parse_result;
212  DCHECK(!isolate->has_pending_exception());
213  if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
214                                               &parse_result)) {
215    // Throw an exception if we fail to parse the pattern.
216    return RegExp::ThrowRegExpException(isolate, re, pattern,
217                                        parse_result.error);
218  }
219
220  bool has_been_compiled = false;
221
222  if (FLAG_default_to_experimental_regexp_engine &&
223      ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
224                                       parse_result.capture_count)) {
225    DCHECK(FLAG_enable_experimental_regexp_engine);
226    ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
227                                   parse_result.capture_count);
228    has_been_compiled = true;
229  } else if (flags & JSRegExp::kLinear) {
230    DCHECK(FLAG_enable_experimental_regexp_engine);
231    if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
232                                          parse_result.capture_count)) {
233      // TODO(mbid): The error could provide a reason for why the regexp can't
234      // be executed in linear time (e.g. due to back references).
235      return RegExp::ThrowRegExpException(isolate, re, pattern,
236                                          RegExpError::kNotLinear);
237    }
238    ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
239                                   parse_result.capture_count);
240    has_been_compiled = true;
241  } else if (parse_result.simple && !IsIgnoreCase(flags) && !IsSticky(flags) &&
242             !HasFewDifferentCharacters(pattern)) {
243    // Parse-tree is a single atom that is equal to the pattern.
244    RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
245    has_been_compiled = true;
246  } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
247             parse_result.capture_count == 0) {
248    RegExpAtom* atom = parse_result.tree->AsAtom();
249    // The pattern source might (?) contain escape sequences, but they're
250    // resolved in atom_string.
251    base::Vector<const base::uc16> atom_pattern = atom->data();
252    Handle<String> atom_string;
253    ASSIGN_RETURN_ON_EXCEPTION(
254        isolate, atom_string,
255        isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
256    if (!IsIgnoreCase(flags) && !HasFewDifferentCharacters(atom_string)) {
257      RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
258      has_been_compiled = true;
259    }
260  }
261  if (!has_been_compiled) {
262    RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
263                                   parse_result.capture_count, backtrack_limit);
264  }
265  DCHECK(re->data().IsFixedArray());
266  // Compilation succeeded so the data is set on the regexp
267  // and we can store it in the cache.
268  Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
269  if (is_compilation_cache_enabled) {
270    compilation_cache->PutRegExp(pattern, JSRegExp::AsJSRegExpFlags(flags),
271                                 data);
272  }
273
274  return re;
275}
276
277// static
278bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
279                                 Handle<String> subject) {
280  switch (re->type_tag()) {
281    case JSRegExp::NOT_COMPILED:
282      UNREACHABLE();
283    case JSRegExp::ATOM:
284      return true;
285    case JSRegExp::IRREGEXP:
286      if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
287        DCHECK(isolate->has_pending_exception());
288        return false;
289      }
290      return true;
291    case JSRegExp::EXPERIMENTAL:
292      if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
293          !ExperimentalRegExp::Compile(isolate, re)) {
294        DCHECK(isolate->has_pending_exception());
295        return false;
296      }
297      return true;
298  }
299}
300
301// static
302MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
303    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
304    int index, Handle<RegExpMatchInfo> last_match_info,
305    RegExp::ExecQuirks exec_quirks) {
306  return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
307                                         last_match_info, exec_quirks);
308}
309
310// static
311MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
312                                 Handle<String> subject, int index,
313                                 Handle<RegExpMatchInfo> last_match_info,
314                                 ExecQuirks exec_quirks) {
315  switch (regexp->type_tag()) {
316    case JSRegExp::NOT_COMPILED:
317      UNREACHABLE();
318    case JSRegExp::ATOM:
319      return RegExpImpl::AtomExec(isolate, regexp, subject, index,
320                                  last_match_info);
321    case JSRegExp::IRREGEXP:
322      return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
323                                      last_match_info, exec_quirks);
324    case JSRegExp::EXPERIMENTAL:
325      return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
326                                      last_match_info, exec_quirks);
327  }
328}
329
330// RegExp Atom implementation: Simple string search using indexOf.
331
332void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
333                             Handle<String> pattern, RegExpFlags flags,
334                             Handle<String> match_pattern) {
335  isolate->factory()->SetRegExpAtomData(
336      re, pattern, JSRegExp::AsJSRegExpFlags(flags), match_pattern);
337}
338
339namespace {
340
341void SetAtomLastCapture(Isolate* isolate,
342                        Handle<RegExpMatchInfo> last_match_info, String subject,
343                        int from, int to) {
344  SealHandleScope shs(isolate);
345  last_match_info->SetNumberOfCaptureRegisters(2);
346  last_match_info->SetLastSubject(subject);
347  last_match_info->SetLastInput(subject);
348  last_match_info->SetCapture(0, from);
349  last_match_info->SetCapture(1, to);
350}
351
352}  // namespace
353
354int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
355                            Handle<String> subject, int index, int32_t* output,
356                            int output_size) {
357  DCHECK_LE(0, index);
358  DCHECK_LE(index, subject->length());
359
360  subject = String::Flatten(isolate, subject);
361  DisallowGarbageCollection no_gc;  // ensure vectors stay valid
362
363  String needle = regexp->atom_pattern();
364  int needle_len = needle.length();
365  DCHECK(needle.IsFlat());
366  DCHECK_LT(0, needle_len);
367
368  if (index + needle_len > subject->length()) {
369    return RegExp::RE_FAILURE;
370  }
371
372  for (int i = 0; i < output_size; i += 2) {
373    String::FlatContent needle_content = needle.GetFlatContent(no_gc);
374    String::FlatContent subject_content = subject->GetFlatContent(no_gc);
375    DCHECK(needle_content.IsFlat());
376    DCHECK(subject_content.IsFlat());
377    // dispatch on type of strings
378    index =
379        (needle_content.IsOneByte()
380             ? (subject_content.IsOneByte()
381                    ? SearchString(isolate, subject_content.ToOneByteVector(),
382                                   needle_content.ToOneByteVector(), index)
383                    : SearchString(isolate, subject_content.ToUC16Vector(),
384                                   needle_content.ToOneByteVector(), index))
385             : (subject_content.IsOneByte()
386                    ? SearchString(isolate, subject_content.ToOneByteVector(),
387                                   needle_content.ToUC16Vector(), index)
388                    : SearchString(isolate, subject_content.ToUC16Vector(),
389                                   needle_content.ToUC16Vector(), index)));
390    if (index == -1) {
391      return i / 2;  // Return number of matches.
392    } else {
393      output[i] = index;
394      output[i + 1] = index + needle_len;
395      index += needle_len;
396    }
397  }
398  return output_size / 2;
399}
400
401Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
402                                    Handle<String> subject, int index,
403                                    Handle<RegExpMatchInfo> last_match_info) {
404  static const int kNumRegisters = 2;
405  STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
406  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
407
408  int res =
409      AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
410
411  if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value();
412
413  DCHECK_EQ(res, RegExp::RE_SUCCESS);
414  SealHandleScope shs(isolate);
415  SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
416                     output_registers[1]);
417  return last_match_info;
418}
419
420// Irregexp implementation.
421
422// Ensures that the regexp object contains a compiled version of the
423// source for either one-byte or two-byte subject strings.
424// If the compiled version doesn't already exist, it is compiled
425// from the source pattern.
426// If compilation fails, an exception is thrown and this function
427// returns false.
428bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
429                                        Handle<String> sample_subject,
430                                        bool is_one_byte) {
431  Object compiled_code = re->code(is_one_byte);
432  Object bytecode = re->bytecode(is_one_byte);
433  bool needs_initial_compilation =
434      compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue);
435  // Recompile is needed when we're dealing with the first execution of the
436  // regexp after the decision to tier up has been made. If the tiering up
437  // strategy is not in use, this value is always false.
438  bool needs_tier_up_compilation =
439      re->MarkedForTierUp() && bytecode.IsByteArray();
440
441  if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) {
442    PrintF("JSRegExp object %p needs tier-up compilation\n",
443           reinterpret_cast<void*>(re->ptr()));
444  }
445
446  if (!needs_initial_compilation && !needs_tier_up_compilation) {
447    DCHECK(compiled_code.IsCodeT());
448    DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray());
449    return true;
450  }
451
452  DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray());
453
454  return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
455}
456
457namespace {
458
459#ifdef DEBUG
460bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
461  Object entry = re->code(is_one_byte);
462  Object bytecode = re->bytecode(is_one_byte);
463  // If we're not using the tier-up strategy, entry can only be a smi
464  // representing an uncompiled regexp here. If we're using the tier-up
465  // strategy, entry can still be a smi representing an uncompiled regexp, when
466  // compiling the regexp before the tier-up, or it can contain a trampoline to
467  // the regexp interpreter, in which case the bytecode field contains compiled
468  // bytecode, when recompiling the regexp after the tier-up. If the
469  // tier-up was forced, which happens for global replaces, entry is a smi
470  // representing an uncompiled regexp, even though we're "recompiling" after
471  // the tier-up.
472  if (re->ShouldProduceBytecode()) {
473    DCHECK(entry.IsSmi());
474    DCHECK(bytecode.IsSmi());
475    int entry_value = Smi::ToInt(entry);
476    int bytecode_value = Smi::ToInt(bytecode);
477    DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
478    DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value);
479  } else {
480    DCHECK(entry.IsSmi() || (entry.IsCodeT() && bytecode.IsByteArray()));
481  }
482
483  return true;
484}
485#endif
486
487struct RegExpCaptureIndexLess {
488  bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
489    DCHECK_NOT_NULL(lhs);
490    DCHECK_NOT_NULL(rhs);
491    return lhs->index() < rhs->index();
492  }
493};
494
495}  // namespace
496
497// static
498Handle<FixedArray> RegExp::CreateCaptureNameMap(
499    Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures) {
500  if (named_captures == nullptr) return Handle<FixedArray>();
501
502  DCHECK(!named_captures->empty());
503
504  // Named captures are sorted by name (because the set is used to ensure
505  // name uniqueness). But the capture name map must to be sorted by index.
506
507  std::sort(named_captures->begin(), named_captures->end(),
508            RegExpCaptureIndexLess{});
509
510  int len = static_cast<int>(named_captures->size()) * 2;
511  Handle<FixedArray> array = isolate->factory()->NewFixedArray(len);
512
513  int i = 0;
514  for (const RegExpCapture* capture : *named_captures) {
515    base::Vector<const base::uc16> capture_name(capture->name()->data(),
516                                                capture->name()->size());
517    // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
518    // internalized so they can be used as property names in the 'exec' results.
519    Handle<String> name = isolate->factory()->InternalizeString(capture_name);
520    array->set(i * 2, *name);
521    array->set(i * 2 + 1, Smi::FromInt(capture->index()));
522
523    i++;
524  }
525  DCHECK_EQ(i * 2, len);
526
527  return array;
528}
529
530bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
531                                 Handle<String> sample_subject,
532                                 bool is_one_byte) {
533  // Compile the RegExp.
534  Zone zone(isolate->allocator(), ZONE_NAME);
535  PostponeInterruptsScope postpone(isolate);
536
537  DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte));
538
539  RegExpFlags flags = JSRegExp::AsRegExpFlags(re->flags());
540
541  Handle<String> pattern(re->source(), isolate);
542  pattern = String::Flatten(isolate, pattern);
543  RegExpCompileData compile_data;
544  if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
545                                               &compile_data)) {
546    // Throw an exception if we fail to parse the pattern.
547    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
548    USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
549    return false;
550  }
551  // The compilation target is a kBytecode if we're interpreting all regexp
552  // objects, or if we're using the tier-up strategy but the tier-up hasn't
553  // happened yet. The compilation target is a kNative if we're using the
554  // tier-up strategy and we need to recompile to tier-up, or if we're producing
555  // native code for all regexp objects.
556  compile_data.compilation_target = re->ShouldProduceBytecode()
557                                        ? RegExpCompilationTarget::kBytecode
558                                        : RegExpCompilationTarget::kNative;
559  uint32_t backtrack_limit = re->backtrack_limit();
560  const bool compilation_succeeded =
561      Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
562              is_one_byte, backtrack_limit);
563  if (!compilation_succeeded) {
564    DCHECK(compile_data.error != RegExpError::kNone);
565    RegExp::ThrowRegExpException(isolate, re, compile_data.error);
566    return false;
567  }
568
569  Handle<FixedArray> data =
570      Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
571  if (compile_data.compilation_target == RegExpCompilationTarget::kNative) {
572    Code code = Code::cast(*compile_data.code);
573    data->set(JSRegExp::code_index(is_one_byte), ToCodeT(code));
574
575    // Reset bytecode to uninitialized. In case we use tier-up we know that
576    // tier-up has happened this way.
577    data->set(JSRegExp::bytecode_index(is_one_byte),
578              Smi::FromInt(JSRegExp::kUninitializedValue));
579  } else {
580    DCHECK_EQ(compile_data.compilation_target,
581              RegExpCompilationTarget::kBytecode);
582    // Store code generated by compiler in bytecode and trampoline to
583    // interpreter in code.
584    data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code);
585    Handle<CodeT> trampoline =
586        BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
587    data->set(JSRegExp::code_index(is_one_byte), *trampoline);
588  }
589  Handle<FixedArray> capture_name_map =
590      RegExp::CreateCaptureNameMap(isolate, compile_data.named_captures);
591  re->set_capture_name_map(capture_name_map);
592  int register_max = IrregexpMaxRegisterCount(*data);
593  if (compile_data.register_count > register_max) {
594    SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
595  }
596  data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
597
598  if (FLAG_trace_regexp_tier_up) {
599    PrintF("JSRegExp object %p %s size: %d\n",
600           reinterpret_cast<void*>(re->ptr()),
601           re->ShouldProduceBytecode() ? "bytecode" : "native code",
602           re->ShouldProduceBytecode()
603               ? IrregexpByteCode(*data, is_one_byte).Size()
604               : IrregexpNativeCode(*data, is_one_byte).Size());
605  }
606
607  return true;
608}
609
610int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
611  return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex));
612}
613
614void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
615  re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
616}
617
618int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
619  return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
620}
621
622ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
623  return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte)));
624}
625
626Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
627  return Code::cast(re.get(JSRegExp::code_index(is_one_byte)));
628}
629
630void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
631                                    Handle<String> pattern, RegExpFlags flags,
632                                    int capture_count,
633                                    uint32_t backtrack_limit) {
634  // Initialize compiled code entries to null.
635  isolate->factory()->SetRegExpIrregexpData(re, pattern,
636                                            JSRegExp::AsJSRegExpFlags(flags),
637                                            capture_count, backtrack_limit);
638}
639
640// static
641int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
642                                Handle<String> subject) {
643  DCHECK(subject->IsFlat());
644
645  // Check representation of the underlying storage.
646  bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
647  if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject,
648                                          is_one_byte)) {
649    return -1;
650  }
651
652  // Only reserve room for output captures. Internal registers are allocated by
653  // the engine.
654  return JSRegExp::RegistersForCaptureCount(regexp->capture_count());
655}
656
657int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
658                                Handle<String> subject, int index,
659                                int32_t* output, int output_size) {
660  DCHECK_LE(0, index);
661  DCHECK_LE(index, subject->length());
662  DCHECK(subject->IsFlat());
663  DCHECK_GE(output_size,
664            JSRegExp::RegistersForCaptureCount(regexp->capture_count()));
665
666  bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
667
668  if (!regexp->ShouldProduceBytecode()) {
669    do {
670      EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
671      // The stack is used to allocate registers for the compiled regexp code.
672      // This means that in case of failure, the output registers array is left
673      // untouched and contains the capture results from the previous successful
674      // match.  We can use that to set the last match info lazily.
675      int res = NativeRegExpMacroAssembler::Match(regexp, subject, output,
676                                                  output_size, index, isolate);
677      if (res != NativeRegExpMacroAssembler::RETRY) {
678        DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
679               isolate->has_pending_exception());
680        STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
681                      RegExp::RE_SUCCESS);
682        STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
683                      RegExp::RE_FAILURE);
684        STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
685                      RegExp::RE_EXCEPTION);
686        return res;
687      }
688      // If result is RETRY, the string has changed representation, and we
689      // must restart from scratch.
690      // In this case, it means we must make sure we are prepared to handle
691      // the, potentially, different subject (the string can switch between
692      // being internal and external, and even between being Latin1 and
693      // UC16, but the characters are always the same).
694      is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
695    } while (true);
696    UNREACHABLE();
697  } else {
698    DCHECK(regexp->ShouldProduceBytecode());
699
700    do {
701      IrregexpInterpreter::Result result =
702          IrregexpInterpreter::MatchForCallFromRuntime(
703              isolate, regexp, subject, output, output_size, index);
704      DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
705                     isolate->has_pending_exception());
706
707      switch (result) {
708        case IrregexpInterpreter::SUCCESS:
709        case IrregexpInterpreter::EXCEPTION:
710        case IrregexpInterpreter::FAILURE:
711        case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL:
712          return result;
713        case IrregexpInterpreter::RETRY:
714          // The string has changed representation, and we must restart the
715          // match.
716          // We need to reset the tier up to start over with compilation.
717          if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick();
718          is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
719          EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
720          break;
721      }
722    } while (true);
723    UNREACHABLE();
724  }
725}
726
727MaybeHandle<Object> RegExpImpl::IrregexpExec(
728    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
729    int previous_index, Handle<RegExpMatchInfo> last_match_info,
730    RegExp::ExecQuirks exec_quirks) {
731  DCHECK_EQ(regexp->type_tag(), JSRegExp::IRREGEXP);
732
733  subject = String::Flatten(isolate, subject);
734
735#ifdef DEBUG
736  if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) {
737    PrintF("\n\nRegexp match:   /%s/\n\n", regexp->source().ToCString().get());
738    PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
739  }
740#endif
741
742  // For very long subject strings, the regexp interpreter is currently much
743  // slower than the jitted code execution. If the tier-up strategy is turned
744  // on, we want to avoid this performance penalty so we eagerly tier-up if the
745  // subject string length is equal or greater than the given heuristic value.
746  if (FLAG_regexp_tier_up &&
747      subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
748    regexp->MarkTierUpForNextExec();
749    if (FLAG_trace_regexp_tier_up) {
750      PrintF(
751          "Forcing tier-up for very long strings in "
752          "RegExpImpl::IrregexpExec\n");
753    }
754  }
755
756  // Prepare space for the return values.
757  int required_registers =
758      RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
759  if (required_registers < 0) {
760    // Compiling failed with an exception.
761    DCHECK(isolate->has_pending_exception());
762    return MaybeHandle<Object>();
763  }
764
765  int32_t* output_registers = nullptr;
766  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
767    output_registers = NewArray<int32_t>(required_registers);
768  }
769  std::unique_ptr<int32_t[]> auto_release(output_registers);
770  if (output_registers == nullptr) {
771    output_registers = isolate->jsregexp_static_offsets_vector();
772  }
773
774  int res =
775      RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
776                                  output_registers, required_registers);
777
778  if (res == RegExp::RE_SUCCESS) {
779    if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
780      if (output_registers[0] >= subject->length()) {
781        return isolate->factory()->null_value();
782      }
783    }
784    int capture_count = regexp->capture_count();
785    return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
786                                    capture_count, output_registers);
787  } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
788    return ExperimentalRegExp::OneshotExec(isolate, regexp, subject,
789                                           previous_index, last_match_info);
790  } else if (res == RegExp::RE_EXCEPTION) {
791    DCHECK(isolate->has_pending_exception());
792    return MaybeHandle<Object>();
793  } else {
794    DCHECK(res == RegExp::RE_FAILURE);
795    return isolate->factory()->null_value();
796  }
797}
798
799// static
800Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo(
801    Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
802    Handle<String> subject, int capture_count, int32_t* match) {
803  // This is the only place where match infos can grow. If, after executing the
804  // regexp, RegExpExecStub finds that the match info is too small, it restarts
805  // execution in RegExpImpl::Exec, which finally grows the match info right
806  // here.
807  Handle<RegExpMatchInfo> result =
808      RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
809  if (*result != *last_match_info) {
810    if (*last_match_info == *isolate->regexp_last_match_info()) {
811      // This inner condition is only needed for special situations like the
812      // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
813      // RegExpImpl::Exec; there actually want to bypass the Isolate's match
814      // info and execute the regexp without side effects.
815      isolate->native_context()->set_regexp_last_match_info(*result);
816    }
817  }
818
819  int capture_register_count =
820      JSRegExp::RegistersForCaptureCount(capture_count);
821  DisallowGarbageCollection no_gc;
822  if (match != nullptr) {
823    for (int i = 0; i < capture_register_count; i += 2) {
824      result->SetCapture(i, match[i]);
825      result->SetCapture(i + 1, match[i + 1]);
826    }
827  }
828  result->SetLastSubject(*subject);
829  result->SetLastInput(*subject);
830  return result;
831}
832
833// static
834void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) {
835  DotPrinter::DotPrint(label, node);
836}
837
838namespace {
839
840// Returns true if we've either generated too much irregex code within this
841// isolate, or the pattern string is too long.
842bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
843  // Limit the space regexps take up on the heap.  In order to limit this we
844  // would like to keep track of the amount of regexp code on the heap.  This
845  // is not tracked, however.  As a conservative approximation we track the
846  // total regexp code compiled including code that has subsequently been freed
847  // and the total executable memory at any point.
848  static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
849  static constexpr size_t kRegExpCompiledLimit = 1 * MB;
850
851  Heap* heap = isolate->heap();
852  if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
853  return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
854          heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
855}
856
857}  // namespace
858
859// static
860bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone,
861                               RegExpCompileData* data, RegExpFlags flags,
862                               Handle<String> pattern,
863                               Handle<String> sample_subject,
864                               bool is_one_byte) {
865  uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
866  return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
867                             sample_subject, is_one_byte, backtrack_limit);
868}
869
870bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
871                         RegExpFlags flags, Handle<String> pattern,
872                         Handle<String> sample_subject, bool is_one_byte,
873                         uint32_t& backtrack_limit) {
874  if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
875      RegExpMacroAssembler::kMaxRegisterCount) {
876    data->error = RegExpError::kTooLarge;
877    return false;
878  }
879
880  RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
881                          is_one_byte);
882
883  if (compiler.optimize()) {
884    compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
885  }
886
887  // Sample some characters from the middle of the string.
888  static const int kSampleSize = 128;
889
890  sample_subject = String::Flatten(isolate, sample_subject);
891  int chars_sampled = 0;
892  int half_way = (sample_subject->length() - kSampleSize) / 2;
893  for (int i = std::max(0, half_way);
894       i < sample_subject->length() && chars_sampled < kSampleSize;
895       i++, chars_sampled++) {
896    compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
897  }
898
899  data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
900  data->error = AnalyzeRegExp(isolate, is_one_byte, flags, data->node);
901  if (data->error != RegExpError::kNone) {
902    return false;
903  }
904
905  if (FLAG_trace_regexp_graph) DotPrinter::DotPrint("Start", data->node);
906
907  // Create the correct assembler for the architecture.
908  std::unique_ptr<RegExpMacroAssembler> macro_assembler;
909  if (data->compilation_target == RegExpCompilationTarget::kNative) {
910    // Native regexp implementation.
911    DCHECK(!FLAG_jitless);
912
913    NativeRegExpMacroAssembler::Mode mode =
914        is_one_byte ? NativeRegExpMacroAssembler::LATIN1
915                    : NativeRegExpMacroAssembler::UC16;
916
917    const int output_register_count =
918        JSRegExp::RegistersForCaptureCount(data->capture_count);
919#if V8_TARGET_ARCH_IA32
920    macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
921                                                       output_register_count));
922#elif V8_TARGET_ARCH_X64
923    macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
924                                                      output_register_count));
925#elif V8_TARGET_ARCH_ARM
926    macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
927                                                      output_register_count));
928#elif V8_TARGET_ARCH_ARM64
929    macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
930                                                        output_register_count));
931#elif V8_TARGET_ARCH_S390
932    macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
933                                                       output_register_count));
934#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
935    macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
936                                                      output_register_count));
937#elif V8_TARGET_ARCH_MIPS
938    macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
939                                                       output_register_count));
940#elif V8_TARGET_ARCH_MIPS64
941    macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
942                                                       output_register_count));
943#elif V8_TARGET_ARCH_RISCV64
944    macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode,
945                                                        output_register_count));
946#elif V8_TARGET_ARCH_LOONG64
947    macro_assembler.reset(new RegExpMacroAssemblerLOONG64(
948        isolate, zone, mode, output_register_count));
949#else
950#error "Unsupported architecture"
951#endif
952  } else {
953    DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
954    // Interpreted regexp implementation.
955    macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
956  }
957
958  macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
959  if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks &&
960      ExperimentalRegExp::CanBeHandled(data->tree, flags,
961                                       data->capture_count)) {
962    if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
963      backtrack_limit = FLAG_regexp_backtracks_before_fallback;
964    } else {
965      backtrack_limit =
966          std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback);
967    }
968    macro_assembler->set_backtrack_limit(backtrack_limit);
969    macro_assembler->set_can_fallback(true);
970  } else {
971    macro_assembler->set_backtrack_limit(backtrack_limit);
972    macro_assembler->set_can_fallback(false);
973  }
974
975  // Inserted here, instead of in Assembler, because it depends on information
976  // in the AST that isn't replicated in the Node structure.
977  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
978  bool is_start_anchored = data->tree->IsAnchoredAtStart();
979  int max_length = data->tree->max_match();
980  static const int kMaxBacksearchLimit = 1024;
981  if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
982      max_length < kMaxBacksearchLimit) {
983    macro_assembler->SetCurrentPositionFromEnd(max_length);
984  }
985
986  if (IsGlobal(flags)) {
987    RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
988    if (data->tree->min_match() > 0) {
989      mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
990    } else if (IsUnicode(flags)) {
991      mode = RegExpMacroAssembler::GLOBAL_UNICODE;
992    }
993    macro_assembler->set_global_mode(mode);
994  }
995
996  RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
997#ifdef DEBUG
998  std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
999  if (FLAG_trace_regexp_assembler) {
1000    tracer_macro_assembler.reset(
1001        new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
1002    macro_assembler_ptr = tracer_macro_assembler.get();
1003  }
1004#endif
1005
1006  RegExpCompiler::CompilationResult result = compiler.Assemble(
1007      isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
1008
1009  // Code / bytecode printing.
1010  {
1011#ifdef ENABLE_DISASSEMBLER
1012    if (FLAG_print_regexp_code &&
1013        data->compilation_target == RegExpCompilationTarget::kNative) {
1014      CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1015      OFStream os(trace_scope.file());
1016      Handle<Code> c = Handle<Code>::cast(result.code);
1017      auto pattern_cstring = pattern->ToCString();
1018      c->Disassemble(pattern_cstring.get(), os, isolate);
1019    }
1020#endif
1021    if (FLAG_print_regexp_bytecode &&
1022        data->compilation_target == RegExpCompilationTarget::kBytecode) {
1023      Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code);
1024      auto pattern_cstring = pattern->ToCString();
1025      RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(),
1026                                bytecode->length(), pattern_cstring.get());
1027    }
1028  }
1029
1030  if (result.error != RegExpError::kNone) {
1031    if (FLAG_correctness_fuzzer_suppressions &&
1032        result.error == RegExpError::kStackOverflow) {
1033      FATAL("Aborting on stack overflow");
1034    }
1035    data->error = result.error;
1036  }
1037
1038  data->code = result.code;
1039  data->register_count = result.num_registers;
1040
1041  return result.Succeeded();
1042}
1043
1044RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
1045                                     Handle<String> subject, Isolate* isolate)
1046    : register_array_(nullptr),
1047      register_array_size_(0),
1048      regexp_(regexp),
1049      subject_(subject),
1050      isolate_(isolate) {
1051  DCHECK(IsGlobal(JSRegExp::AsRegExpFlags(regexp->flags())));
1052
1053  switch (regexp_->type_tag()) {
1054    case JSRegExp::NOT_COMPILED:
1055      UNREACHABLE();
1056    case JSRegExp::ATOM: {
1057      // ATOM regexps do not have a global loop, so we search for one match at
1058      // a time.
1059      static const int kAtomRegistersPerMatch = 2;
1060      registers_per_match_ = kAtomRegistersPerMatch;
1061      register_array_size_ = registers_per_match_;
1062      break;
1063    }
1064    case JSRegExp::IRREGEXP: {
1065      registers_per_match_ =
1066          RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
1067      if (registers_per_match_ < 0) {
1068        num_matches_ = -1;  // Signal exception.
1069        return;
1070      }
1071      if (regexp->ShouldProduceBytecode()) {
1072        // Global loop in interpreted regexp is not implemented.  We choose the
1073        // size of the offsets vector so that it can only store one match.
1074        register_array_size_ = registers_per_match_;
1075        max_matches_ = 1;
1076      } else {
1077        register_array_size_ = std::max(
1078            {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
1079      }
1080      break;
1081    }
1082    case JSRegExp::EXPERIMENTAL: {
1083      if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
1084          !ExperimentalRegExp::Compile(isolate_, regexp)) {
1085        DCHECK(isolate->has_pending_exception());
1086        num_matches_ = -1;  // Signal exception.
1087        return;
1088      }
1089      registers_per_match_ =
1090          JSRegExp::RegistersForCaptureCount(regexp->capture_count());
1091      register_array_size_ = std::max(
1092          {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
1093      break;
1094    }
1095  }
1096
1097  max_matches_ = register_array_size_ / registers_per_match_;
1098
1099  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1100    register_array_ = NewArray<int32_t>(register_array_size_);
1101  } else {
1102    register_array_ = isolate->jsregexp_static_offsets_vector();
1103  }
1104
1105  // Set state so that fetching the results the first time triggers a call
1106  // to the compiled regexp.
1107  current_match_index_ = max_matches_ - 1;
1108  num_matches_ = max_matches_;
1109  DCHECK_LE(2, registers_per_match_);  // Each match has at least one capture.
1110  DCHECK_GE(register_array_size_, registers_per_match_);
1111  int32_t* last_match =
1112      &register_array_[current_match_index_ * registers_per_match_];
1113  last_match[0] = -1;
1114  last_match[1] = 0;
1115}
1116
1117RegExpGlobalCache::~RegExpGlobalCache() {
1118  // Deallocate the register array if we allocated it in the constructor
1119  // (as opposed to using the existing jsregexp_static_offsets_vector).
1120  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1121    DeleteArray(register_array_);
1122  }
1123}
1124
1125int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
1126  if (IsUnicode(JSRegExp::AsRegExpFlags(regexp_->flags())) &&
1127      last_index + 1 < subject_->length() &&
1128      unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
1129      unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
1130    // Advance over the surrogate pair.
1131    return last_index + 2;
1132  }
1133  return last_index + 1;
1134}
1135
1136int32_t* RegExpGlobalCache::FetchNext() {
1137  current_match_index_++;
1138
1139  if (current_match_index_ >= num_matches_) {
1140    // Current batch of results exhausted.
1141    // Fail if last batch was not even fully filled.
1142    if (num_matches_ < max_matches_) {
1143      num_matches_ = 0;  // Signal failed match.
1144      return nullptr;
1145    }
1146
1147    int32_t* last_match =
1148        &register_array_[(current_match_index_ - 1) * registers_per_match_];
1149    int last_end_index = last_match[1];
1150
1151    switch (regexp_->type_tag()) {
1152      case JSRegExp::NOT_COMPILED:
1153        UNREACHABLE();
1154      case JSRegExp::ATOM:
1155        num_matches_ =
1156            RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index,
1157                                    register_array_, register_array_size_);
1158        break;
1159      case JSRegExp::EXPERIMENTAL: {
1160        DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
1161        DisallowGarbageCollection no_gc;
1162        num_matches_ = ExperimentalRegExp::ExecRaw(
1163            isolate_, RegExp::kFromRuntime, *regexp_, *subject_,
1164            register_array_, register_array_size_, last_end_index);
1165        break;
1166      }
1167      case JSRegExp::IRREGEXP: {
1168        int last_start_index = last_match[0];
1169        if (last_start_index == last_end_index) {
1170          // Zero-length match. Advance by one code point.
1171          last_end_index = AdvanceZeroLength(last_end_index);
1172        }
1173        if (last_end_index > subject_->length()) {
1174          num_matches_ = 0;  // Signal failed match.
1175          return nullptr;
1176        }
1177        num_matches_ = RegExpImpl::IrregexpExecRaw(
1178            isolate_, regexp_, subject_, last_end_index, register_array_,
1179            register_array_size_);
1180        break;
1181      }
1182    }
1183
1184    // Fall back to experimental engine if needed and possible.
1185    if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) {
1186      num_matches_ = ExperimentalRegExp::OneshotExecRaw(
1187          isolate_, regexp_, subject_, register_array_, register_array_size_,
1188          last_end_index);
1189    }
1190
1191    if (num_matches_ <= 0) {
1192      return nullptr;
1193    }
1194    current_match_index_ = 0;
1195    return register_array_;
1196  } else {
1197    return &register_array_[current_match_index_ * registers_per_match_];
1198  }
1199}
1200
1201int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
1202  int index = current_match_index_ * registers_per_match_;
1203  if (num_matches_ == 0) {
1204    // After a failed match we shift back by one result.
1205    index -= registers_per_match_;
1206  }
1207  return &register_array_[index];
1208}
1209
1210Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
1211                                  Object key_pattern,
1212                                  FixedArray* last_match_cache,
1213                                  ResultsCacheType type) {
1214  FixedArray cache;
1215  if (!key_string.IsInternalizedString()) return Smi::zero();
1216  if (type == STRING_SPLIT_SUBSTRINGS) {
1217    DCHECK(key_pattern.IsString());
1218    if (!key_pattern.IsInternalizedString()) return Smi::zero();
1219    cache = heap->string_split_cache();
1220  } else {
1221    DCHECK(type == REGEXP_MULTIPLE_INDICES);
1222    DCHECK(key_pattern.IsFixedArray());
1223    cache = heap->regexp_multiple_cache();
1224  }
1225
1226  uint32_t hash = key_string.hash();
1227  uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1228                    ~(kArrayEntriesPerCacheEntry - 1));
1229  if (cache.get(index + kStringOffset) != key_string ||
1230      cache.get(index + kPatternOffset) != key_pattern) {
1231    index =
1232        ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1233    if (cache.get(index + kStringOffset) != key_string ||
1234        cache.get(index + kPatternOffset) != key_pattern) {
1235      return Smi::zero();
1236    }
1237  }
1238
1239  *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset));
1240  return cache.get(index + kArrayOffset);
1241}
1242
1243void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
1244                               Handle<Object> key_pattern,
1245                               Handle<FixedArray> value_array,
1246                               Handle<FixedArray> last_match_cache,
1247                               ResultsCacheType type) {
1248  Factory* factory = isolate->factory();
1249  Handle<FixedArray> cache;
1250  if (!key_string->IsInternalizedString()) return;
1251  if (type == STRING_SPLIT_SUBSTRINGS) {
1252    DCHECK(key_pattern->IsString());
1253    if (!key_pattern->IsInternalizedString()) return;
1254    cache = factory->string_split_cache();
1255  } else {
1256    DCHECK(type == REGEXP_MULTIPLE_INDICES);
1257    DCHECK(key_pattern->IsFixedArray());
1258    cache = factory->regexp_multiple_cache();
1259  }
1260
1261  uint32_t hash = key_string->hash();
1262  uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1263                    ~(kArrayEntriesPerCacheEntry - 1));
1264  if (cache->get(index + kStringOffset) == Smi::zero()) {
1265    cache->set(index + kStringOffset, *key_string);
1266    cache->set(index + kPatternOffset, *key_pattern);
1267    cache->set(index + kArrayOffset, *value_array);
1268    cache->set(index + kLastMatchOffset, *last_match_cache);
1269  } else {
1270    uint32_t index2 =
1271        ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1272    if (cache->get(index2 + kStringOffset) == Smi::zero()) {
1273      cache->set(index2 + kStringOffset, *key_string);
1274      cache->set(index2 + kPatternOffset, *key_pattern);
1275      cache->set(index2 + kArrayOffset, *value_array);
1276      cache->set(index2 + kLastMatchOffset, *last_match_cache);
1277    } else {
1278      cache->set(index2 + kStringOffset, Smi::zero());
1279      cache->set(index2 + kPatternOffset, Smi::zero());
1280      cache->set(index2 + kArrayOffset, Smi::zero());
1281      cache->set(index2 + kLastMatchOffset, Smi::zero());
1282      cache->set(index + kStringOffset, *key_string);
1283      cache->set(index + kPatternOffset, *key_pattern);
1284      cache->set(index + kArrayOffset, *value_array);
1285      cache->set(index + kLastMatchOffset, *last_match_cache);
1286    }
1287  }
1288  // If the array is a reasonably short list of substrings, convert it into a
1289  // list of internalized strings.
1290  if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
1291    for (int i = 0; i < value_array->length(); i++) {
1292      Handle<String> str(String::cast(value_array->get(i)), isolate);
1293      Handle<String> internalized_str = factory->InternalizeString(str);
1294      value_array->set(i, *internalized_str);
1295    }
1296  }
1297  // Convert backing store to a copy-on-write array.
1298  value_array->set_map_no_write_barrier(
1299      ReadOnlyRoots(isolate).fixed_cow_array_map());
1300}
1301
1302void RegExpResultsCache::Clear(FixedArray cache) {
1303  for (int i = 0; i < kRegExpResultsCacheSize; i++) {
1304    cache.set(i, Smi::zero());
1305  }
1306}
1307
1308}  // namespace internal
1309}  // namespace v8
1310