1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-compiler.h"
6
7#include "src/execution/isolate.h"
8#include "src/regexp/regexp.h"
9#include "src/strings/unicode-inl.h"
10#include "src/zone/zone-list-inl.h"
11
12#ifdef V8_INTL_SUPPORT
13#include "src/base/strings.h"
14#include "src/regexp/special-case.h"
15#include "unicode/locid.h"
16#include "unicode/uniset.h"
17#include "unicode/utypes.h"
18#endif  // V8_INTL_SUPPORT
19
20namespace v8 {
21namespace internal {
22
23using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
24
25constexpr base::uc32 kMaxCodePoint = 0x10ffff;
26constexpr int kMaxUtf16CodeUnit = 0xffff;
27constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
28constexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
29
30// -------------------------------------------------------------------
31// Tree to graph conversion
32
33RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
34                               RegExpNode* on_success) {
35  ZoneList<TextElement>* elms =
36      compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
37  elms->Add(TextElement::Atom(this), compiler->zone());
38  return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
39                                         on_success);
40}
41
42RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
43                               RegExpNode* on_success) {
44  return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
45                                         on_success);
46}
47
48namespace {
49
50bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
51                          const int* special_class, int length) {
52  length--;  // Remove final marker.
53
54  DCHECK_EQ(kRangeEndMarker, special_class[length]);
55  DCHECK_NE(0, ranges->length());
56  DCHECK_NE(0, length);
57  DCHECK_NE(0, special_class[0]);
58
59  if (ranges->length() != (length >> 1) + 1) return false;
60
61  CharacterRange range = ranges->at(0);
62  if (range.from() != 0) return false;
63
64  for (int i = 0; i < length; i += 2) {
65    if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
66      return false;
67    }
68    range = ranges->at((i >> 1) + 1);
69    if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) {
70      return false;
71    }
72  }
73
74  return range.to() == kMaxCodePoint;
75}
76
77bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
78                   int length) {
79  length--;  // Remove final marker.
80
81  DCHECK_EQ(kRangeEndMarker, special_class[length]);
82  if (ranges->length() * 2 != length) return false;
83
84  for (int i = 0; i < length; i += 2) {
85    CharacterRange range = ranges->at(i >> 1);
86    if (range.from() != static_cast<base::uc32>(special_class[i]) ||
87        range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
88      return false;
89    }
90  }
91  return true;
92}
93
94}  // namespace
95
96bool RegExpCharacterClass::is_standard(Zone* zone) {
97  // TODO(lrn): Remove need for this function, by not throwing away information
98  // along the way.
99  if (is_negated()) {
100    return false;
101  }
102  if (set_.is_standard()) {
103    return true;
104  }
105  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
106    set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
107    return true;
108  }
109  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
110    set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
111    return true;
112  }
113  if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
114                           kLineTerminatorRangeCount)) {
115    set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
116    return true;
117  }
118  if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
119                    kLineTerminatorRangeCount)) {
120    set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
121    return true;
122  }
123  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
124    set_.set_standard_set_type(StandardCharacterSet::kWord);
125    return true;
126  }
127  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
128    set_.set_standard_set_type(StandardCharacterSet::kNotWord);
129    return true;
130  }
131  return false;
132}
133
134UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
135  // The unicode range splitter categorizes given character ranges into:
136  // - Code points from the BMP representable by one code unit.
137  // - Code points outside the BMP that need to be split into surrogate pairs.
138  // - Lone lead surrogates.
139  // - Lone trail surrogates.
140  // Lone surrogates are valid code points, even though no actual characters.
141  // They require special matching to make sure we do not split surrogate pairs.
142
143  for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
144}
145
146void UnicodeRangeSplitter::AddRange(CharacterRange range) {
147  static constexpr base::uc32 kBmp1Start = 0;
148  static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
149  static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
150  static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;
151
152  // Ends are all inclusive.
153  STATIC_ASSERT(kBmp1Start == 0);
154  STATIC_ASSERT(kBmp1Start < kBmp1End);
155  STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
156  STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
157  STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
158  STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
159  STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
160  STATIC_ASSERT(kBmp2Start < kBmp2End);
161  STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
162  STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
163
164  static constexpr base::uc32 kStarts[] = {
165      kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
166      kBmp2Start, kNonBmpStart,
167  };
168
169  static constexpr base::uc32 kEnds[] = {
170      kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
171  };
172
173  CharacterRangeVector* const kTargets[] = {
174      &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
175  };
176
177  static constexpr int kCount = arraysize(kStarts);
178  STATIC_ASSERT(kCount == arraysize(kEnds));
179  STATIC_ASSERT(kCount == arraysize(kTargets));
180
181  for (int i = 0; i < kCount; i++) {
182    if (kStarts[i] > range.to()) break;
183    const base::uc32 from = std::max(kStarts[i], range.from());
184    const base::uc32 to = std::min(kEnds[i], range.to());
185    if (from > to) continue;
186    kTargets[i]->emplace_back(CharacterRange::Range(from, to));
187  }
188}
189
190namespace {
191
192// Translates between new and old V8-isms (SmallVector, ZoneList).
193ZoneList<CharacterRange>* ToCanonicalZoneList(
194    const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
195  if (v->empty()) return nullptr;
196
197  ZoneList<CharacterRange>* result =
198      zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
199  for (size_t i = 0; i < v->size(); i++) {
200    result->Add(v->at(i), zone);
201  }
202
203  CharacterRange::Canonicalize(result);
204  return result;
205}
206
207void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
208                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
209  ZoneList<CharacterRange>* bmp =
210      ToCanonicalZoneList(splitter->bmp(), compiler->zone());
211  if (bmp == nullptr) return;
212  result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
213      compiler->zone(), bmp, compiler->read_backward(), on_success)));
214}
215
216using UC16Range = uint32_t;  // {from, to} packed into one uint32_t.
217constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
218  return (static_cast<uint32_t>(from) << 16) | to;
219}
220constexpr base::uc16 ExtractFrom(UC16Range r) {
221  return static_cast<base::uc16>(r >> 16);
222}
223constexpr base::uc16 ExtractTo(UC16Range r) {
224  return static_cast<base::uc16>(r);
225}
226
227void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
228                             RegExpNode* on_success,
229                             UnicodeRangeSplitter* splitter) {
230  DCHECK(!compiler->one_byte());
231  Zone* const zone = compiler->zone();
232  ZoneList<CharacterRange>* non_bmp =
233      ToCanonicalZoneList(splitter->non_bmp(), zone);
234  if (non_bmp == nullptr) return;
235
236  // Translate each 32-bit code point range into the corresponding 16-bit code
237  // unit representation consisting of the lead- and trail surrogate.
238  //
239  // The generated alternatives are grouped by the leading surrogate to avoid
240  // emitting excessive code. For example, for
241  //
242  //  { \ud800[\udc00-\udc01]
243  //  , \ud800[\udc05-\udc06]
244  //  }
245  //
246  // there's no need to emit matching code for the leading surrogate \ud800
247  // twice. We also create a dedicated grouping for full trailing ranges, i.e.
248  // [dc00-dfff].
249  ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
250      zone);
251  ZoneList<CharacterRange>* leading_with_full_trailing_range =
252      zone->New<ZoneList<CharacterRange>>(1, zone);
253  const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
254                            base::uc16 from_t, base::uc16 to_t) {
255    const UC16Range leading_range = ToUC16Range(from_l, to_l);
256    if (grouped_by_leading.count(leading_range) == 0) {
257      if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
258        leading_with_full_trailing_range->Add(
259            CharacterRange::Range(from_l, to_l), zone);
260        return;
261      }
262      grouped_by_leading[leading_range] =
263          zone->New<ZoneList<CharacterRange>>(2, zone);
264    }
265    grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
266                                           zone);
267  };
268
269  // First, create the grouped ranges.
270  CharacterRange::Canonicalize(non_bmp);
271  for (int i = 0; i < non_bmp->length(); i++) {
272    // Match surrogate pair.
273    // E.g. [\u10005-\u11005] becomes
274    //      \ud800[\udc05-\udfff]|
275    //      [\ud801-\ud803][\udc00-\udfff]|
276    //      \ud804[\udc00-\udc05]
277    base::uc32 from = non_bmp->at(i).from();
278    base::uc32 to = non_bmp->at(i).to();
279    base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
280    base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
281    base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
282    base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
283
284    if (from_l == to_l) {
285      // The lead surrogate is the same.
286      AddRange(from_l, to_l, from_t, to_t);
287      continue;
288    }
289
290    if (from_t != kTrailSurrogateStart) {
291      // Add [from_l][from_t-\udfff].
292      AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
293      from_l++;
294    }
295    if (to_t != kTrailSurrogateEnd) {
296      // Add [to_l][\udc00-to_t].
297      AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
298      to_l--;
299    }
300    if (from_l <= to_l) {
301      // Add [from_l-to_l][\udc00-\udfff].
302      AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
303    }
304  }
305
306  // Create the actual TextNode now that ranges are fully grouped.
307  if (!leading_with_full_trailing_range->is_empty()) {
308    CharacterRange::Canonicalize(leading_with_full_trailing_range);
309    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
310        zone, leading_with_full_trailing_range,
311        CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
312        compiler->read_backward(), on_success)));
313  }
314  for (const auto& it : grouped_by_leading) {
315    CharacterRange leading_range =
316        CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
317    ZoneList<CharacterRange>* trailing_ranges = it.second;
318    CharacterRange::Canonicalize(trailing_ranges);
319    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
320        zone, leading_range, trailing_ranges, compiler->read_backward(),
321        on_success)));
322  }
323}
324
325RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
326    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
327    ZoneList<CharacterRange>* match, RegExpNode* on_success,
328    bool read_backward) {
329  Zone* zone = compiler->zone();
330  RegExpNode* match_node = TextNode::CreateForCharacterRanges(
331      zone, match, read_backward, on_success);
332  int stack_register = compiler->UnicodeLookaroundStackRegister();
333  int position_register = compiler->UnicodeLookaroundPositionRegister();
334  RegExpLookaround::Builder lookaround(false, match_node, stack_register,
335                                       position_register);
336  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
337      zone, lookbehind, !read_backward, lookaround.on_match_success());
338  return lookaround.ForMatch(negative_match);
339}
340
341RegExpNode* MatchAndNegativeLookaroundInReadDirection(
342    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
343    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
344    bool read_backward) {
345  Zone* zone = compiler->zone();
346  int stack_register = compiler->UnicodeLookaroundStackRegister();
347  int position_register = compiler->UnicodeLookaroundPositionRegister();
348  RegExpLookaround::Builder lookaround(false, on_success, stack_register,
349                                       position_register);
350  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
351      zone, lookahead, read_backward, lookaround.on_match_success());
352  return TextNode::CreateForCharacterRanges(
353      zone, match, read_backward, lookaround.ForMatch(negative_match));
354}
355
356void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
357                           RegExpNode* on_success,
358                           UnicodeRangeSplitter* splitter) {
359  ZoneList<CharacterRange>* lead_surrogates =
360      ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
361  if (lead_surrogates == nullptr) return;
362  Zone* zone = compiler->zone();
363  // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
364  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
365      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
366
367  RegExpNode* match;
368  if (compiler->read_backward()) {
369    // Reading backward. Assert that reading forward, there is no trail
370    // surrogate, and then backward match the lead surrogate.
371    match = NegativeLookaroundAgainstReadDirectionAndMatch(
372        compiler, trail_surrogates, lead_surrogates, on_success, true);
373  } else {
374    // Reading forward. Forward match the lead surrogate and assert that
375    // no trail surrogate follows.
376    match = MatchAndNegativeLookaroundInReadDirection(
377        compiler, lead_surrogates, trail_surrogates, on_success, false);
378  }
379  result->AddAlternative(GuardedAlternative(match));
380}
381
382void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
383                            RegExpNode* on_success,
384                            UnicodeRangeSplitter* splitter) {
385  ZoneList<CharacterRange>* trail_surrogates =
386      ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
387  if (trail_surrogates == nullptr) return;
388  Zone* zone = compiler->zone();
389  // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
390  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
391      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
392
393  RegExpNode* match;
394  if (compiler->read_backward()) {
395    // Reading backward. Backward match the trail surrogate and assert that no
396    // lead surrogate precedes it.
397    match = MatchAndNegativeLookaroundInReadDirection(
398        compiler, trail_surrogates, lead_surrogates, on_success, true);
399  } else {
400    // Reading forward. Assert that reading backward, there is no lead
401    // surrogate, and then forward match the trail surrogate.
402    match = NegativeLookaroundAgainstReadDirectionAndMatch(
403        compiler, lead_surrogates, trail_surrogates, on_success, false);
404  }
405  result->AddAlternative(GuardedAlternative(match));
406}
407
408RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
409                              RegExpNode* on_success) {
410  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
411  DCHECK(!compiler->read_backward());
412  Zone* zone = compiler->zone();
413  // Advance any character. If the character happens to be a lead surrogate and
414  // we advanced into the middle of a surrogate pair, it will work out, as
415  // nothing will match from there. We will have to advance again, consuming
416  // the associated trail surrogate.
417  ZoneList<CharacterRange>* range =
418      CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit));
419  return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
420}
421
422void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
423#ifdef V8_INTL_SUPPORT
424  DCHECK(CharacterRange::IsCanonical(ranges));
425
426  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
427  // See also https://crbug.com/v8/6727.
428  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
429  // which we use frequently internally. But large ranges can also easily be
430  // created by the user. We might want to have a more general caching mechanism
431  // for such ranges.
432  if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
433
434  // Use ICU to compute the case fold closure over the ranges.
435  icu::UnicodeSet set;
436  for (int i = 0; i < ranges->length(); i++) {
437    set.add(ranges->at(i).from(), ranges->at(i).to());
438  }
439  // Clear the ranges list without freeing the backing store.
440  ranges->Rewind(0);
441  set.closeOver(USET_CASE_INSENSITIVE);
442  // Full case mapping map single characters to multiple characters.
443  // Those are represented as strings in the set. Remove them so that
444  // we end up with only simple and common case mappings.
445  set.removeAllStrings();
446  for (int i = 0; i < set.getRangeCount(); i++) {
447    ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
448                zone);
449  }
450  // No errors and everything we collected have been ranges.
451  CharacterRange::Canonicalize(ranges);
452#endif  // V8_INTL_SUPPORT
453}
454
455}  // namespace
456
457RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
458                                         RegExpNode* on_success) {
459  set_.Canonicalize();
460  Zone* const zone = compiler->zone();
461  ZoneList<CharacterRange>* ranges = this->ranges(zone);
462
463  if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
464    AddUnicodeCaseEquivalents(ranges, zone);
465  }
466
467  if (!IsUnicode(compiler->flags()) || compiler->one_byte() ||
468      contains_split_surrogate()) {
469    return zone->New<TextNode>(this, compiler->read_backward(), on_success);
470  }
471
472  if (is_negated()) {
473    ZoneList<CharacterRange>* negated =
474        zone->New<ZoneList<CharacterRange>>(2, zone);
475    CharacterRange::Negate(ranges, negated, zone);
476    ranges = negated;
477  }
478
479  if (ranges->length() == 0) {
480    // The empty character class is used as a 'fail' node.
481    RegExpCharacterClass* fail = zone->New<RegExpCharacterClass>(zone, ranges);
482    return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
483  }
484
485  if (set_.is_standard() &&
486      standard_type() == StandardCharacterSet::kEverything) {
487    return UnanchoredAdvance(compiler, on_success);
488  }
489
490  // Split ranges in order to handle surrogates correctly:
491  // - Surrogate pairs: translate the 32-bit code point into two uc16 code
492  //   units (irregexp operates only on code units).
493  // - Lone surrogates: these require lookarounds to ensure we don't match in
494  //   the middle of a surrogate pair.
495  ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
496  UnicodeRangeSplitter splitter(ranges);
497  AddBmpCharacters(compiler, result, on_success, &splitter);
498  AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
499  AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
500  AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
501
502  static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
503  if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
504
505  return result;
506}
507
508namespace {
509
510int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
511  RegExpAtom* atom1 = (*a)->AsAtom();
512  RegExpAtom* atom2 = (*b)->AsAtom();
513  base::uc16 character1 = atom1->data().at(0);
514  base::uc16 character2 = atom2->data().at(0);
515  if (character1 < character2) return -1;
516  if (character1 > character2) return 1;
517  return 0;
518}
519
520#ifdef V8_INTL_SUPPORT
521
522int CompareCaseInsensitive(const icu::UnicodeString& a,
523                           const icu::UnicodeString& b) {
524  return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
525}
526
527int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
528                                    RegExpTree* const* b) {
529  RegExpAtom* atom1 = (*a)->AsAtom();
530  RegExpAtom* atom2 = (*b)->AsAtom();
531  return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)},
532                                icu::UnicodeString{atom2->data().at(0)});
533}
534
535bool Equals(bool ignore_case, const icu::UnicodeString& a,
536            const icu::UnicodeString& b) {
537  if (a == b) return true;
538  if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
539  return false;  // Case-sensitive equality already checked above.
540}
541
542bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
543                  const RegExpAtom* b) {
544  return Equals(ignore_case, a->data().at(index), b->data().at(index));
545}
546
547#else
548
549unibrow::uchar Canonical(
550    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
551    unibrow::uchar c) {
552  unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
553  int length = canonicalize->get(c, '\0', chars);
554  DCHECK_LE(length, 1);
555  unibrow::uchar canonical = c;
556  if (length == 1) canonical = chars[0];
557  return canonical;
558}
559
560int CompareCaseInsensitive(
561    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
562    unibrow::uchar a, unibrow::uchar b) {
563  if (a == b) return 0;
564  if (a >= 'a' || b >= 'a') {
565    a = Canonical(canonicalize, a);
566    b = Canonical(canonicalize, b);
567  }
568  return static_cast<int>(a) - static_cast<int>(b);
569}
570
571int CompareFirstCharCaseInsensitive(
572    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
573    RegExpTree* const* a, RegExpTree* const* b) {
574  RegExpAtom* atom1 = (*a)->AsAtom();
575  RegExpAtom* atom2 = (*b)->AsAtom();
576  return CompareCaseInsensitive(canonicalize, atom1->data().at(0),
577                                atom2->data().at(0));
578}
579
580bool Equals(bool ignore_case,
581            unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
582            unibrow::uchar a, unibrow::uchar b) {
583  if (a == b) return true;
584  if (ignore_case) {
585    return CompareCaseInsensitive(canonicalize, a, b) == 0;
586  }
587  return false;  // Case-sensitive equality already checked above.
588}
589
590bool CharAtEquals(bool ignore_case,
591                  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
592                  int index, const RegExpAtom* a, const RegExpAtom* b) {
593  return Equals(ignore_case, canonicalize, a->data().at(index),
594                b->data().at(index));
595}
596
597#endif  // V8_INTL_SUPPORT
598
599}  // namespace
600
601// We can stable sort runs of atoms, since the order does not matter if they
602// start with different characters.
603// Returns true if any consecutive atoms were found.
604bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
605  ZoneList<RegExpTree*>* alternatives = this->alternatives();
606  int length = alternatives->length();
607  bool found_consecutive_atoms = false;
608  for (int i = 0; i < length; i++) {
609    while (i < length) {
610      RegExpTree* alternative = alternatives->at(i);
611      if (alternative->IsAtom()) break;
612      i++;
613    }
614    // i is length or it is the index of an atom.
615    if (i == length) break;
616    int first_atom = i;
617    i++;
618    while (i < length) {
619      RegExpTree* alternative = alternatives->at(i);
620      if (!alternative->IsAtom()) break;
621      i++;
622    }
623    // Sort atoms to get ones with common prefixes together.
624    // This step is more tricky if we are in a case-independent regexp,
625    // because it would change /is|I/ to /I|is/, and order matters when
626    // the regexp parts don't match only disjoint starting points. To fix
627    // this we have a version of CompareFirstChar that uses case-
628    // independent character classes for comparison.
629    DCHECK_LT(first_atom, alternatives->length());
630    DCHECK_LE(i, alternatives->length());
631    DCHECK_LE(first_atom, i);
632    if (IsIgnoreCase(compiler->flags())) {
633#ifdef V8_INTL_SUPPORT
634      alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom,
635                               i - first_atom);
636#else
637      unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
638          compiler->isolate()->regexp_macro_assembler_canonicalize();
639      auto compare_closure = [canonicalize](RegExpTree* const* a,
640                                            RegExpTree* const* b) {
641        return CompareFirstCharCaseInsensitive(canonicalize, a, b);
642      };
643      alternatives->StableSort(compare_closure, first_atom, i - first_atom);
644#endif  // V8_INTL_SUPPORT
645    } else {
646      alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
647    }
648    if (i - first_atom > 1) found_consecutive_atoms = true;
649  }
650  return found_consecutive_atoms;
651}
652
653// Optimizes ab|ac|az to a(?:b|c|d).
654void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
655  Zone* zone = compiler->zone();
656  ZoneList<RegExpTree*>* alternatives = this->alternatives();
657  int length = alternatives->length();
658  const bool ignore_case = IsIgnoreCase(compiler->flags());
659
660  int write_posn = 0;
661  int i = 0;
662  while (i < length) {
663    RegExpTree* alternative = alternatives->at(i);
664    if (!alternative->IsAtom()) {
665      alternatives->at(write_posn++) = alternatives->at(i);
666      i++;
667      continue;
668    }
669    RegExpAtom* const atom = alternative->AsAtom();
670#ifdef V8_INTL_SUPPORT
671    icu::UnicodeString common_prefix(atom->data().at(0));
672#else
673    unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
674        compiler->isolate()->regexp_macro_assembler_canonicalize();
675    unibrow::uchar common_prefix = atom->data().at(0);
676    if (ignore_case) {
677      common_prefix = Canonical(canonicalize, common_prefix);
678    }
679#endif  // V8_INTL_SUPPORT
680    int first_with_prefix = i;
681    int prefix_length = atom->length();
682    i++;
683    while (i < length) {
684      alternative = alternatives->at(i);
685      if (!alternative->IsAtom()) break;
686      RegExpAtom* const alt_atom = alternative->AsAtom();
687#ifdef V8_INTL_SUPPORT
688      icu::UnicodeString new_prefix(alt_atom->data().at(0));
689      if (!Equals(ignore_case, new_prefix, common_prefix)) break;
690#else
691      unibrow::uchar new_prefix = alt_atom->data().at(0);
692      if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break;
693#endif  // V8_INTL_SUPPORT
694      prefix_length = std::min(prefix_length, alt_atom->length());
695      i++;
696    }
697    if (i > first_with_prefix + 2) {
698      // Found worthwhile run of alternatives with common prefix of at least one
699      // character.  The sorting function above did not sort on more than one
700      // character for reasons of correctness, but there may still be a longer
701      // common prefix if the terms were similar or presorted in the input.
702      // Find out how long the common prefix is.
703      int run_length = i - first_with_prefix;
704      RegExpAtom* const alt_atom =
705          alternatives->at(first_with_prefix)->AsAtom();
706      for (int j = 1; j < run_length && prefix_length > 1; j++) {
707        RegExpAtom* old_atom =
708            alternatives->at(j + first_with_prefix)->AsAtom();
709        for (int k = 1; k < prefix_length; k++) {
710#ifdef V8_INTL_SUPPORT
711          if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) {
712#else
713          if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) {
714#endif  // V8_INTL_SUPPORT
715            prefix_length = k;
716            break;
717          }
718        }
719      }
720      RegExpAtom* prefix =
721          zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length));
722      ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
723      pair->Add(prefix, zone);
724      ZoneList<RegExpTree*>* suffixes =
725          zone->New<ZoneList<RegExpTree*>>(run_length, zone);
726      for (int j = 0; j < run_length; j++) {
727        RegExpAtom* old_atom =
728            alternatives->at(j + first_with_prefix)->AsAtom();
729        int len = old_atom->length();
730        if (len == prefix_length) {
731          suffixes->Add(zone->New<RegExpEmpty>(), zone);
732        } else {
733          RegExpTree* suffix = zone->New<RegExpAtom>(
734              old_atom->data().SubVector(prefix_length, old_atom->length()));
735          suffixes->Add(suffix, zone);
736        }
737      }
738      pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
739      alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
740    } else {
741      // Just copy any non-worthwhile alternatives.
742      for (int j = first_with_prefix; j < i; j++) {
743        alternatives->at(write_posn++) = alternatives->at(j);
744      }
745    }
746  }
747  alternatives->Rewind(write_posn);  // Trim end of array.
748}
749
750// Optimizes b|c|z to [bcz].
751void RegExpDisjunction::FixSingleCharacterDisjunctions(
752    RegExpCompiler* compiler) {
753  Zone* zone = compiler->zone();
754  ZoneList<RegExpTree*>* alternatives = this->alternatives();
755  int length = alternatives->length();
756
757  int write_posn = 0;
758  int i = 0;
759  while (i < length) {
760    RegExpTree* alternative = alternatives->at(i);
761    if (!alternative->IsAtom()) {
762      alternatives->at(write_posn++) = alternatives->at(i);
763      i++;
764      continue;
765    }
766    RegExpAtom* const atom = alternative->AsAtom();
767    if (atom->length() != 1) {
768      alternatives->at(write_posn++) = alternatives->at(i);
769      i++;
770      continue;
771    }
772    const RegExpFlags flags = compiler->flags();
773    DCHECK_IMPLIES(IsUnicode(flags),
774                   !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
775    bool contains_trail_surrogate =
776        unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
777    int first_in_run = i;
778    i++;
779    // Find a run of single-character atom alternatives that have identical
780    // flags (case independence and unicode-ness).
781    while (i < length) {
782      alternative = alternatives->at(i);
783      if (!alternative->IsAtom()) break;
784      RegExpAtom* const alt_atom = alternative->AsAtom();
785      if (alt_atom->length() != 1) break;
786      DCHECK_IMPLIES(IsUnicode(flags),
787                     !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0)));
788      contains_trail_surrogate |=
789          unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0));
790      i++;
791    }
792    if (i > first_in_run + 1) {
793      // Found non-trivial run of single-character alternatives.
794      int run_length = i - first_in_run;
795      ZoneList<CharacterRange>* ranges =
796          zone->New<ZoneList<CharacterRange>>(2, zone);
797      for (int j = 0; j < run_length; j++) {
798        RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
799        DCHECK_EQ(old_atom->length(), 1);
800        ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
801      }
802      RegExpCharacterClass::CharacterClassFlags character_class_flags;
803      if (IsUnicode(flags) && contains_trail_surrogate) {
804        character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
805      }
806      alternatives->at(write_posn++) =
807          zone->New<RegExpCharacterClass>(zone, ranges, character_class_flags);
808    } else {
809      // Just copy any trivial alternatives.
810      for (int j = first_in_run; j < i; j++) {
811        alternatives->at(write_posn++) = alternatives->at(j);
812      }
813    }
814  }
815  alternatives->Rewind(write_posn);  // Trim end of array.
816}
817
818RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
819                                      RegExpNode* on_success) {
820  compiler->ToNodeMaybeCheckForStackOverflow();
821
822  ZoneList<RegExpTree*>* alternatives = this->alternatives();
823
824  if (alternatives->length() > 2) {
825    bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
826    if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
827    FixSingleCharacterDisjunctions(compiler);
828    if (alternatives->length() == 1) {
829      return alternatives->at(0)->ToNode(compiler, on_success);
830    }
831  }
832
833  int length = alternatives->length();
834
835  ChoiceNode* result =
836      compiler->zone()->New<ChoiceNode>(length, compiler->zone());
837  for (int i = 0; i < length; i++) {
838    GuardedAlternative alternative(
839        alternatives->at(i)->ToNode(compiler, on_success));
840    result->AddAlternative(alternative);
841  }
842  return result;
843}
844
845RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
846                                     RegExpNode* on_success) {
847  return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
848}
849
850namespace {
851// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
852//         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
853RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
854                                          RegExpNode* on_success,
855                                          RegExpAssertion::Type type,
856                                          RegExpFlags flags) {
857  CHECK(NeedsUnicodeCaseEquivalents(flags));
858  Zone* zone = compiler->zone();
859  ZoneList<CharacterRange>* word_range =
860      zone->New<ZoneList<CharacterRange>>(2, zone);
861  CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true,
862                                 zone);
863  int stack_register = compiler->UnicodeLookaroundStackRegister();
864  int position_register = compiler->UnicodeLookaroundPositionRegister();
865  ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
866  // Add two choices. The (non-)boundary could start with a word or
867  // a non-word-character.
868  for (int i = 0; i < 2; i++) {
869    bool lookbehind_for_word = i == 0;
870    bool lookahead_for_word =
871        (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word;
872    // Look to the left.
873    RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
874                                         stack_register, position_register);
875    RegExpNode* backward = TextNode::CreateForCharacterRanges(
876        zone, word_range, true, lookbehind.on_match_success());
877    // Look to the right.
878    RegExpLookaround::Builder lookahead(lookahead_for_word,
879                                        lookbehind.ForMatch(backward),
880                                        stack_register, position_register);
881    RegExpNode* forward = TextNode::CreateForCharacterRanges(
882        zone, word_range, false, lookahead.on_match_success());
883    result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
884  }
885  return result;
886}
887}  // anonymous namespace
888
889RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
890                                    RegExpNode* on_success) {
891  NodeInfo info;
892  Zone* zone = compiler->zone();
893
894  switch (assertion_type()) {
895    case Type::START_OF_LINE:
896      return AssertionNode::AfterNewline(on_success);
897    case Type::START_OF_INPUT:
898      return AssertionNode::AtStart(on_success);
899    case Type::BOUNDARY:
900      return NeedsUnicodeCaseEquivalents(compiler->flags())
901                 ? BoundaryAssertionAsLookaround(
902                       compiler, on_success, Type::BOUNDARY, compiler->flags())
903                 : AssertionNode::AtBoundary(on_success);
904    case Type::NON_BOUNDARY:
905      return NeedsUnicodeCaseEquivalents(compiler->flags())
906                 ? BoundaryAssertionAsLookaround(compiler, on_success,
907                                                 Type::NON_BOUNDARY,
908                                                 compiler->flags())
909                 : AssertionNode::AtNonBoundary(on_success);
910    case Type::END_OF_INPUT:
911      return AssertionNode::AtEnd(on_success);
912    case Type::END_OF_LINE: {
913      // Compile $ in multiline regexps as an alternation with a positive
914      // lookahead in one side and an end-of-input on the other side.
915      // We need two registers for the lookahead.
916      int stack_pointer_register = compiler->AllocateRegister();
917      int position_register = compiler->AllocateRegister();
918      // The ChoiceNode to distinguish between a newline and end-of-input.
919      ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
920      // Create a newline atom.
921      ZoneList<CharacterRange>* newline_ranges =
922          zone->New<ZoneList<CharacterRange>>(3, zone);
923      CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator,
924                                     newline_ranges, false, zone);
925      RegExpCharacterClass* newline_atom = zone->New<RegExpCharacterClass>(
926          StandardCharacterSet::kLineTerminator);
927      TextNode* newline_matcher =
928          zone->New<TextNode>(newline_atom, false,
929                              ActionNode::PositiveSubmatchSuccess(
930                                  stack_pointer_register, position_register,
931                                  0,   // No captures inside.
932                                  -1,  // Ignored if no captures.
933                                  on_success));
934      // Create an end-of-input matcher.
935      RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
936          stack_pointer_register, position_register, newline_matcher);
937      // Add the two alternatives to the ChoiceNode.
938      GuardedAlternative eol_alternative(end_of_line);
939      result->AddAlternative(eol_alternative);
940      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
941      result->AddAlternative(end_alternative);
942      return result;
943    }
944    default:
945      UNREACHABLE();
946  }
947}
948
949RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
950                                        RegExpNode* on_success) {
951  return compiler->zone()->New<BackReferenceNode>(
952      RegExpCapture::StartRegister(index()),
953      RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
954      on_success);
955}
956
957RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
958                                RegExpNode* on_success) {
959  return on_success;
960}
961
962RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
963                                RegExpNode* on_success) {
964  return body_->ToNode(compiler, on_success);
965}
966
967RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
968                                   int stack_pointer_register,
969                                   int position_register,
970                                   int capture_register_count,
971                                   int capture_register_start)
972    : is_positive_(is_positive),
973      on_success_(on_success),
974      stack_pointer_register_(stack_pointer_register),
975      position_register_(position_register) {
976  if (is_positive_) {
977    on_match_success_ = ActionNode::PositiveSubmatchSuccess(
978        stack_pointer_register, position_register, capture_register_count,
979        capture_register_start, on_success_);
980  } else {
981    Zone* zone = on_success_->zone();
982    on_match_success_ = zone->New<NegativeSubmatchSuccess>(
983        stack_pointer_register, position_register, capture_register_count,
984        capture_register_start, zone);
985  }
986}
987
988RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
989  if (is_positive_) {
990    return ActionNode::BeginPositiveSubmatch(stack_pointer_register_,
991                                             position_register_, match);
992  } else {
993    Zone* zone = on_success_->zone();
994    // We use a ChoiceNode to represent the negative lookaround. The first
995    // alternative is the negative match. On success, the end node backtracks.
996    // On failure, the second alternative is tried and leads to success.
997    // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
998    // first exit when calculating quick checks.
999    ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
1000        GuardedAlternative(match), GuardedAlternative(on_success_), zone);
1001    return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
1002                                             position_register_, choice_node);
1003  }
1004}
1005
1006RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
1007                                     RegExpNode* on_success) {
1008  int stack_pointer_register = compiler->AllocateRegister();
1009  int position_register = compiler->AllocateRegister();
1010
1011  const int registers_per_capture = 2;
1012  const int register_of_first_capture = 2;
1013  int register_count = capture_count_ * registers_per_capture;
1014  int register_start =
1015      register_of_first_capture + capture_from_ * registers_per_capture;
1016
1017  RegExpNode* result;
1018  bool was_reading_backward = compiler->read_backward();
1019  compiler->set_read_backward(type() == LOOKBEHIND);
1020  Builder builder(is_positive(), on_success, stack_pointer_register,
1021                  position_register, register_count, register_start);
1022  RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
1023  result = builder.ForMatch(match);
1024  compiler->set_read_backward(was_reading_backward);
1025  return result;
1026}
1027
1028RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
1029                                  RegExpNode* on_success) {
1030  return ToNode(body(), index(), compiler, on_success);
1031}
1032
1033RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
1034                                  RegExpCompiler* compiler,
1035                                  RegExpNode* on_success) {
1036  DCHECK_NOT_NULL(body);
1037  int start_reg = RegExpCapture::StartRegister(index);
1038  int end_reg = RegExpCapture::EndRegister(index);
1039  if (compiler->read_backward()) std::swap(start_reg, end_reg);
1040  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
1041  RegExpNode* body_node = body->ToNode(compiler, store_end);
1042  return ActionNode::StorePosition(start_reg, true, body_node);
1043}
1044
1045namespace {
1046
1047class AssertionSequenceRewriter final {
1048 public:
1049  // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
1050  // instead of sprinkling rewrites into the AST->Node conversion process.
1051  static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
1052    AssertionSequenceRewriter rewriter(terms, zone);
1053
1054    static constexpr int kNoIndex = -1;
1055    int from = kNoIndex;
1056
1057    for (int i = 0; i < terms->length(); i++) {
1058      RegExpTree* t = terms->at(i);
1059      if (from == kNoIndex && t->IsAssertion()) {
1060        from = i;  // Start a sequence.
1061      } else if (from != kNoIndex && !t->IsAssertion()) {
1062        // Terminate and process the sequence.
1063        if (i - from > 1) rewriter.Rewrite(from, i);
1064        from = kNoIndex;
1065      }
1066    }
1067
1068    if (from != kNoIndex && terms->length() - from > 1) {
1069      rewriter.Rewrite(from, terms->length());
1070    }
1071  }
1072
1073  // All assertions are zero width. A consecutive sequence of assertions is
1074  // order-independent. There's two ways we can optimize here:
1075  // 1. fold all identical assertions.
1076  // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
1077  //    sequence fails.
1078  void Rewrite(int from, int to) {
1079    DCHECK_GT(to, from + 1);
1080
1081    // Bitfield of all seen assertions.
1082    uint32_t seen_assertions = 0;
1083    STATIC_ASSERT(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
1084                  kUInt32Size * kBitsPerByte);
1085
1086    for (int i = from; i < to; i++) {
1087      RegExpAssertion* t = terms_->at(i)->AsAssertion();
1088      const uint32_t bit = 1 << static_cast<int>(t->assertion_type());
1089
1090      if (seen_assertions & bit) {
1091        // Fold duplicates.
1092        terms_->Set(i, zone_->New<RegExpEmpty>());
1093      }
1094
1095      seen_assertions |= bit;
1096    }
1097
1098    // Collapse failures.
1099    const uint32_t always_fails_mask =
1100        1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) |
1101        1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);
1102    if ((seen_assertions & always_fails_mask) == always_fails_mask) {
1103      ReplaceSequenceWithFailure(from, to);
1104    }
1105  }
1106
1107  void ReplaceSequenceWithFailure(int from, int to) {
1108    // Replace the entire sequence with a single node that always fails.
1109    // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
1110    // negated '*' (everything) range serves the purpose.
1111    ZoneList<CharacterRange>* ranges =
1112        zone_->New<ZoneList<CharacterRange>>(0, zone_);
1113    RegExpCharacterClass* cc = zone_->New<RegExpCharacterClass>(zone_, ranges);
1114    terms_->Set(from, cc);
1115
1116    // Zero out the rest.
1117    RegExpEmpty* empty = zone_->New<RegExpEmpty>();
1118    for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
1119  }
1120
1121 private:
1122  AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
1123      : zone_(zone), terms_(terms) {}
1124
1125  Zone* zone_;
1126  ZoneList<RegExpTree*>* terms_;
1127};
1128
1129}  // namespace
1130
1131RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1132                                      RegExpNode* on_success) {
1133  compiler->ToNodeMaybeCheckForStackOverflow();
1134
1135  ZoneList<RegExpTree*>* children = nodes();
1136
1137  AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
1138
1139  RegExpNode* current = on_success;
1140  if (compiler->read_backward()) {
1141    for (int i = 0; i < children->length(); i++) {
1142      current = children->at(i)->ToNode(compiler, current);
1143    }
1144  } else {
1145    for (int i = children->length() - 1; i >= 0; i--) {
1146      current = children->at(i)->ToNode(compiler, current);
1147    }
1148  }
1149  return current;
1150}
1151
1152namespace {
1153
1154void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
1155              Zone* zone) {
1156  elmc--;
1157  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1158  for (int i = 0; i < elmc; i += 2) {
1159    DCHECK(elmv[i] < elmv[i + 1]);
1160    ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
1161  }
1162}
1163
1164void AddClassNegated(const int* elmv, int elmc,
1165                     ZoneList<CharacterRange>* ranges, Zone* zone) {
1166  elmc--;
1167  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1168  DCHECK_NE(0x0000, elmv[0]);
1169  DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]);
1170  base::uc16 last = 0x0000;
1171  for (int i = 0; i < elmc; i += 2) {
1172    DCHECK(last <= elmv[i] - 1);
1173    DCHECK(elmv[i] < elmv[i + 1]);
1174    ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
1175    last = elmv[i + 1];
1176  }
1177  ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);
1178}
1179
1180}  // namespace
1181
1182void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
1183                                    ZoneList<CharacterRange>* ranges,
1184                                    bool add_unicode_case_equivalents,
1185                                    Zone* zone) {
1186  if (add_unicode_case_equivalents &&
1187      (standard_character_set == StandardCharacterSet::kWord ||
1188       standard_character_set == StandardCharacterSet::kNotWord)) {
1189    // See #sec-runtime-semantics-wordcharacters-abstract-operation
1190    // In case of unicode and ignore_case, we need to create the closure over
1191    // case equivalent characters before negating.
1192    ZoneList<CharacterRange>* new_ranges =
1193        zone->New<ZoneList<CharacterRange>>(2, zone);
1194    AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
1195    AddUnicodeCaseEquivalents(new_ranges, zone);
1196    if (standard_character_set == StandardCharacterSet::kNotWord) {
1197      ZoneList<CharacterRange>* negated =
1198          zone->New<ZoneList<CharacterRange>>(2, zone);
1199      CharacterRange::Negate(new_ranges, negated, zone);
1200      new_ranges = negated;
1201    }
1202    ranges->AddAll(*new_ranges, zone);
1203    return;
1204  }
1205
1206  switch (standard_character_set) {
1207    case StandardCharacterSet::kWhitespace:
1208      AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1209      break;
1210    case StandardCharacterSet::kNotWhitespace:
1211      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1212      break;
1213    case StandardCharacterSet::kWord:
1214      AddClass(kWordRanges, kWordRangeCount, ranges, zone);
1215      break;
1216    case StandardCharacterSet::kNotWord:
1217      AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
1218      break;
1219    case StandardCharacterSet::kDigit:
1220      AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
1221      break;
1222    case StandardCharacterSet::kNotDigit:
1223      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
1224      break;
1225    // This is the set of characters matched by the $ and ^ symbols
1226    // in multiline mode.
1227    case StandardCharacterSet::kLineTerminator:
1228      AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
1229      break;
1230    case StandardCharacterSet::kNotLineTerminator:
1231      AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
1232                      zone);
1233      break;
1234    // This is not a character range as defined by the spec but a
1235    // convenient shorthand for a character class that matches any
1236    // character.
1237    case StandardCharacterSet::kEverything:
1238      ranges->Add(CharacterRange::Everything(), zone);
1239      break;
1240  }
1241}
1242
1243// static
1244void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
1245                                        ZoneList<CharacterRange>* ranges,
1246                                        bool is_one_byte) {
1247  CharacterRange::Canonicalize(ranges);
1248  int range_count = ranges->length();
1249#ifdef V8_INTL_SUPPORT
1250  icu::UnicodeSet others;
1251  for (int i = 0; i < range_count; i++) {
1252    CharacterRange range = ranges->at(i);
1253    base::uc32 from = range.from();
1254    if (from > kMaxUtf16CodeUnit) continue;
1255    base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});
1256    // Nothing to be done for surrogates.
1257    if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
1258    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1259      if (from > kMaxOneByteCharCode) continue;
1260      if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode;
1261    }
1262    others.add(from, to);
1263  }
1264
1265  // Compute the set of additional characters that should be added,
1266  // using UnicodeSet::closeOver. ECMA 262 defines slightly different
1267  // case-folding rules than Unicode, so some characters that are
1268  // added by closeOver do not match anything other than themselves in
1269  // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
1270  // same case-insensitive character as 's' or 'S' according to
1271  // Unicode, but does not match any other character in JS. To handle
1272  // this case, we add such characters to the IgnoreSet and filter
1273  // them out. We filter twice: once before calling closeOver (to
1274  // prevent 'ſ' from adding 's'), and once after calling closeOver
1275  // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
1276  // more information.
1277  icu::UnicodeSet already_added(others);
1278  others.removeAll(RegExpCaseFolding::IgnoreSet());
1279  others.closeOver(USET_CASE_INSENSITIVE);
1280  others.removeAll(RegExpCaseFolding::IgnoreSet());
1281  others.removeAll(already_added);
1282
1283  // Add others to the ranges
1284  for (int32_t i = 0; i < others.getRangeCount(); i++) {
1285    UChar32 from = others.getRangeStart(i);
1286    UChar32 to = others.getRangeEnd(i);
1287    if (from == to) {
1288      ranges->Add(CharacterRange::Singleton(from), zone);
1289    } else {
1290      ranges->Add(CharacterRange::Range(from, to), zone);
1291    }
1292  }
1293#else
1294  for (int i = 0; i < range_count; i++) {
1295    CharacterRange range = ranges->at(i);
1296    base::uc32 bottom = range.from();
1297    if (bottom > kMaxUtf16CodeUnit) continue;
1298    base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});
1299    // Nothing to be done for surrogates.
1300    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
1301    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1302      if (bottom > kMaxOneByteCharCode) continue;
1303      if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode;
1304    }
1305    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1306    if (top == bottom) {
1307      // If this is a singleton we just expand the one character.
1308      int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
1309      for (int i = 0; i < length; i++) {
1310        base::uc32 chr = chars[i];
1311        if (chr != bottom) {
1312          ranges->Add(CharacterRange::Singleton(chars[i]), zone);
1313        }
1314      }
1315    } else {
1316      // If this is a range we expand the characters block by block, expanding
1317      // contiguous subranges (blocks) one at a time.  The approach is as
1318      // follows.  For a given start character we look up the remainder of the
1319      // block that contains it (represented by the end point), for instance we
1320      // find 'z' if the character is 'c'.  A block is characterized by the
1321      // property that all characters uncanonicalize in the same way, except
1322      // that each entry in the result is incremented by the distance from the
1323      // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
1324      // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
1325      // we've found the end point we look up its uncanonicalization and
1326      // produce a range for each element.  For instance for [c-f] we look up
1327      // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
1328      // it is not already contained in the input, so [c-f] will be skipped but
1329      // [C-F] will be added.  If this range is not completely contained in a
1330      // block we do this for all the blocks covered by the range (handling
1331      // characters that is not in a block as a "singleton block").
1332      unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1333      base::uc32 pos = bottom;
1334      while (pos <= top) {
1335        int length =
1336            isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
1337        base::uc32 block_end;
1338        if (length == 0) {
1339          block_end = pos;
1340        } else {
1341          DCHECK_EQ(1, length);
1342          block_end = equivalents[0];
1343        }
1344        int end = (block_end > top) ? top : block_end;
1345        length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
1346                                                         equivalents);
1347        for (int i = 0; i < length; i++) {
1348          base::uc32 c = equivalents[i];
1349          base::uc32 range_from = c - (block_end - pos);
1350          base::uc32 range_to = c - (block_end - end);
1351          if (!(bottom <= range_from && range_to <= top)) {
1352            ranges->Add(CharacterRange::Range(range_from, range_to), zone);
1353          }
1354        }
1355        pos = end + 1;
1356      }
1357    }
1358  }
1359#endif  // V8_INTL_SUPPORT
1360}
1361
1362bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
1363  DCHECK_NOT_NULL(ranges);
1364  int n = ranges->length();
1365  if (n <= 1) return true;
1366  base::uc32 max = ranges->at(0).to();
1367  for (int i = 1; i < n; i++) {
1368    CharacterRange next_range = ranges->at(i);
1369    if (next_range.from() <= max + 1) return false;
1370    max = next_range.to();
1371  }
1372  return true;
1373}
1374
1375ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
1376  if (ranges_ == nullptr) {
1377    ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
1378    CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
1379                                   zone);
1380  }
1381  return ranges_;
1382}
1383
1384namespace {
1385
1386// Move a number of elements in a zonelist to another position
1387// in the same list. Handles overlapping source and target areas.
1388void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
1389  // Ranges are potentially overlapping.
1390  if (from < to) {
1391    for (int i = count - 1; i >= 0; i--) {
1392      list->at(to + i) = list->at(from + i);
1393    }
1394  } else {
1395    for (int i = 0; i < count; i++) {
1396      list->at(to + i) = list->at(from + i);
1397    }
1398  }
1399}
1400
1401int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
1402                               CharacterRange insert) {
1403  // Inserts a range into list[0..count[, which must be sorted
1404  // by from value and non-overlapping and non-adjacent, using at most
1405  // list[0..count] for the result. Returns the number of resulting
1406  // canonicalized ranges. Inserting a range may collapse existing ranges into
1407  // fewer ranges, so the return value can be anything in the range 1..count+1.
1408  base::uc32 from = insert.from();
1409  base::uc32 to = insert.to();
1410  int start_pos = 0;
1411  int end_pos = count;
1412  for (int i = count - 1; i >= 0; i--) {
1413    CharacterRange current = list->at(i);
1414    if (current.from() > to + 1) {
1415      end_pos = i;
1416    } else if (current.to() + 1 < from) {
1417      start_pos = i + 1;
1418      break;
1419    }
1420  }
1421
1422  // Inserted range overlaps, or is adjacent to, ranges at positions
1423  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
1424  // not affected by the insertion.
1425  // If start_pos == end_pos, the range must be inserted before start_pos.
1426  // if start_pos < end_pos, the entire range from start_pos to end_pos
1427  // must be merged with the insert range.
1428
1429  if (start_pos == end_pos) {
1430    // Insert between existing ranges at position start_pos.
1431    if (start_pos < count) {
1432      MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1433    }
1434    list->at(start_pos) = insert;
1435    return count + 1;
1436  }
1437  if (start_pos + 1 == end_pos) {
1438    // Replace single existing range at position start_pos.
1439    CharacterRange to_replace = list->at(start_pos);
1440    int new_from = std::min(to_replace.from(), from);
1441    int new_to = std::max(to_replace.to(), to);
1442    list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1443    return count;
1444  }
1445  // Replace a number of existing ranges from start_pos to end_pos - 1.
1446  // Move the remaining ranges down.
1447
1448  int new_from = std::min(list->at(start_pos).from(), from);
1449  int new_to = std::max(list->at(end_pos - 1).to(), to);
1450  if (end_pos < count) {
1451    MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1452  }
1453  list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1454  return count - (end_pos - start_pos) + 1;
1455}
1456
1457}  // namespace
1458
1459void CharacterSet::Canonicalize() {
1460  // Special/default classes are always considered canonical. The result
1461  // of calling ranges() will be sorted.
1462  if (ranges_ == nullptr) return;
1463  CharacterRange::Canonicalize(ranges_);
1464}
1465
1466// static
1467void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
1468  if (character_ranges->length() <= 1) return;
1469  // Check whether ranges are already canonical (increasing, non-overlapping,
1470  // non-adjacent).
1471  int n = character_ranges->length();
1472  base::uc32 max = character_ranges->at(0).to();
1473  int i = 1;
1474  while (i < n) {
1475    CharacterRange current = character_ranges->at(i);
1476    if (current.from() <= max + 1) {
1477      break;
1478    }
1479    max = current.to();
1480    i++;
1481  }
1482  // Canonical until the i'th range. If that's all of them, we are done.
1483  if (i == n) return;
1484
1485  // The ranges at index i and forward are not canonicalized. Make them so by
1486  // doing the equivalent of insertion sort (inserting each into the previous
1487  // list, in order).
1488  // Notice that inserting a range can reduce the number of ranges in the
1489  // result due to combining of adjacent and overlapping ranges.
1490  int read = i;           // Range to insert.
1491  int num_canonical = i;  // Length of canonicalized part of list.
1492  do {
1493    num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1494                                               character_ranges->at(read));
1495    read++;
1496  } while (read < n);
1497  character_ranges->Rewind(num_canonical);
1498
1499  DCHECK(CharacterRange::IsCanonical(character_ranges));
1500}
1501
1502// static
1503void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
1504                            ZoneList<CharacterRange>* negated_ranges,
1505                            Zone* zone) {
1506  DCHECK(CharacterRange::IsCanonical(ranges));
1507  DCHECK_EQ(0, negated_ranges->length());
1508  int range_count = ranges->length();
1509  base::uc32 from = 0;
1510  int i = 0;
1511  if (range_count > 0 && ranges->at(0).from() == 0) {
1512    from = ranges->at(0).to() + 1;
1513    i = 1;
1514  }
1515  while (i < range_count) {
1516    CharacterRange range = ranges->at(i);
1517    negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
1518    from = range.to() + 1;
1519    i++;
1520  }
1521  if (from < kMaxCodePoint) {
1522    negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);
1523  }
1524}
1525
1526// static
1527void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
1528  DCHECK(IsCanonical(ranges));
1529
1530  // Drop all ranges that don't contain one-byte code units, and clamp the last
1531  // range s.t. it likewise only contains one-byte code units. Note this relies
1532  // on `ranges` being canonicalized, i.e. sorted and non-overlapping.
1533
1534  static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;
1535  int n = ranges->length();
1536  for (; n > 0; n--) {
1537    CharacterRange& r = ranges->at(n - 1);
1538    if (r.from() <= max_char) {
1539      r.to_ = std::min(r.to_, max_char);
1540      break;
1541    }
1542  }
1543
1544  ranges->Rewind(n);
1545}
1546
1547namespace {
1548
1549// Scoped object to keep track of how much we unroll quantifier loops in the
1550// regexp graph generator.
1551class RegExpExpansionLimiter {
1552 public:
1553  static const int kMaxExpansionFactor = 6;
1554  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1555      : compiler_(compiler),
1556        saved_expansion_factor_(compiler->current_expansion_factor()),
1557        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
1558    DCHECK_LT(0, factor);
1559    if (ok_to_expand_) {
1560      if (factor > kMaxExpansionFactor) {
1561        // Avoid integer overflow of the current expansion factor.
1562        ok_to_expand_ = false;
1563        compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1564      } else {
1565        int new_factor = saved_expansion_factor_ * factor;
1566        ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1567        compiler->set_current_expansion_factor(new_factor);
1568      }
1569    }
1570  }
1571
1572  ~RegExpExpansionLimiter() {
1573    compiler_->set_current_expansion_factor(saved_expansion_factor_);
1574  }
1575
1576  bool ok_to_expand() { return ok_to_expand_; }
1577
1578 private:
1579  RegExpCompiler* compiler_;
1580  int saved_expansion_factor_;
1581  bool ok_to_expand_;
1582
1583  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
1584};
1585
1586}  // namespace
1587
1588RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
1589                                     RegExpTree* body, RegExpCompiler* compiler,
1590                                     RegExpNode* on_success,
1591                                     bool not_at_start) {
1592  // x{f, t} becomes this:
1593  //
1594  //             (r++)<-.
1595  //               |     `
1596  //               |     (x)
1597  //               v     ^
1598  //      (r=0)-->(?)---/ [if r < t]
1599  //               |
1600  //   [if r >= f] \----> ...
1601  //
1602
1603  // 15.10.2.5 RepeatMatcher algorithm.
1604  // The parser has already eliminated the case where max is 0.  In the case
1605  // where max_match is zero the parser has removed the quantifier if min was
1606  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
1607
1608  // If we know that we cannot match zero length then things are a little
1609  // simpler since we don't need to make the special zero length match check
1610  // from step 2.1.  If the min and max are small we can unroll a little in
1611  // this case.
1612  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
1613  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
1614  if (max == 0) return on_success;  // This can happen due to recursion.
1615  bool body_can_be_empty = (body->min_match() == 0);
1616  int body_start_reg = RegExpCompiler::kNoRegister;
1617  Interval capture_registers = body->CaptureRegisters();
1618  bool needs_capture_clearing = !capture_registers.is_empty();
1619  Zone* zone = compiler->zone();
1620
1621  if (body_can_be_empty) {
1622    body_start_reg = compiler->AllocateRegister();
1623  } else if (compiler->optimize() && !needs_capture_clearing) {
1624    // Only unroll if there are no captures and the body can't be
1625    // empty.
1626    {
1627      RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
1628      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
1629        int new_max = (max == kInfinity) ? max : max - min;
1630        // Recurse once to get the loop or optional matches after the fixed
1631        // ones.
1632        RegExpNode* answer =
1633            ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
1634        // Unroll the forced matches from 0 to min.  This can cause chains of
1635        // TextNodes (which the parser does not generate).  These should be
1636        // combined if it turns out they hinder good code generation.
1637        for (int i = 0; i < min; i++) {
1638          answer = body->ToNode(compiler, answer);
1639        }
1640        return answer;
1641      }
1642    }
1643    if (max <= kMaxUnrolledMaxMatches && min == 0) {
1644      DCHECK_LT(0, max);  // Due to the 'if' above.
1645      RegExpExpansionLimiter limiter(compiler, max);
1646      if (limiter.ok_to_expand()) {
1647        // Unroll the optional matches up to max.
1648        RegExpNode* answer = on_success;
1649        for (int i = 0; i < max; i++) {
1650          ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
1651          if (is_greedy) {
1652            alternation->AddAlternative(
1653                GuardedAlternative(body->ToNode(compiler, answer)));
1654            alternation->AddAlternative(GuardedAlternative(on_success));
1655          } else {
1656            alternation->AddAlternative(GuardedAlternative(on_success));
1657            alternation->AddAlternative(
1658                GuardedAlternative(body->ToNode(compiler, answer)));
1659          }
1660          answer = alternation;
1661          if (not_at_start && !compiler->read_backward()) {
1662            alternation->set_not_at_start();
1663          }
1664        }
1665        return answer;
1666      }
1667    }
1668  }
1669  bool has_min = min > 0;
1670  bool has_max = max < RegExpTree::kInfinity;
1671  bool needs_counter = has_min || has_max;
1672  int reg_ctr = needs_counter ? compiler->AllocateRegister()
1673                              : RegExpCompiler::kNoRegister;
1674  LoopChoiceNode* center = zone->New<LoopChoiceNode>(
1675      body->min_match() == 0, compiler->read_backward(), min, zone);
1676  if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
1677  RegExpNode* loop_return =
1678      needs_counter ? static_cast<RegExpNode*>(
1679                          ActionNode::IncrementRegister(reg_ctr, center))
1680                    : static_cast<RegExpNode*>(center);
1681  if (body_can_be_empty) {
1682    // If the body can be empty we need to check if it was and then
1683    // backtrack.
1684    loop_return =
1685        ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
1686  }
1687  RegExpNode* body_node = body->ToNode(compiler, loop_return);
1688  if (body_can_be_empty) {
1689    // If the body can be empty we need to store the start position
1690    // so we can bail out if it was empty.
1691    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
1692  }
1693  if (needs_capture_clearing) {
1694    // Before entering the body of this loop we need to clear captures.
1695    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
1696  }
1697  GuardedAlternative body_alt(body_node);
1698  if (has_max) {
1699    Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
1700    body_alt.AddGuard(body_guard, zone);
1701  }
1702  GuardedAlternative rest_alt(on_success);
1703  if (has_min) {
1704    Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
1705    rest_alt.AddGuard(rest_guard, zone);
1706  }
1707  if (is_greedy) {
1708    center->AddLoopAlternative(body_alt);
1709    center->AddContinueAlternative(rest_alt);
1710  } else {
1711    center->AddContinueAlternative(rest_alt);
1712    center->AddLoopAlternative(body_alt);
1713  }
1714  if (needs_counter) {
1715    return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
1716  } else {
1717    return center;
1718  }
1719}
1720
1721}  // namespace internal
1722}  // namespace v8
1723