11cb0ef41Sopenharmony_ci// Copyright 2019 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
51cb0ef41Sopenharmony_ci#include "src/regexp/regexp-compiler.h"
61cb0ef41Sopenharmony_ci
71cb0ef41Sopenharmony_ci#include "src/execution/isolate.h"
81cb0ef41Sopenharmony_ci#include "src/regexp/regexp.h"
91cb0ef41Sopenharmony_ci#include "src/strings/unicode-inl.h"
101cb0ef41Sopenharmony_ci#include "src/zone/zone-list-inl.h"
111cb0ef41Sopenharmony_ci
121cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
131cb0ef41Sopenharmony_ci#include "src/base/strings.h"
141cb0ef41Sopenharmony_ci#include "src/regexp/special-case.h"
151cb0ef41Sopenharmony_ci#include "unicode/locid.h"
161cb0ef41Sopenharmony_ci#include "unicode/uniset.h"
171cb0ef41Sopenharmony_ci#include "unicode/utypes.h"
181cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
191cb0ef41Sopenharmony_ci
201cb0ef41Sopenharmony_cinamespace v8 {
211cb0ef41Sopenharmony_cinamespace internal {
221cb0ef41Sopenharmony_ci
231cb0ef41Sopenharmony_ciusing namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
241cb0ef41Sopenharmony_ci
251cb0ef41Sopenharmony_ciconstexpr base::uc32 kMaxCodePoint = 0x10ffff;
261cb0ef41Sopenharmony_ciconstexpr int kMaxUtf16CodeUnit = 0xffff;
271cb0ef41Sopenharmony_ciconstexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
281cb0ef41Sopenharmony_ciconstexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
291cb0ef41Sopenharmony_ci
301cb0ef41Sopenharmony_ci// -------------------------------------------------------------------
311cb0ef41Sopenharmony_ci// Tree to graph conversion
321cb0ef41Sopenharmony_ci
331cb0ef41Sopenharmony_ciRegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
341cb0ef41Sopenharmony_ci                               RegExpNode* on_success) {
351cb0ef41Sopenharmony_ci  ZoneList<TextElement>* elms =
361cb0ef41Sopenharmony_ci      compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
371cb0ef41Sopenharmony_ci  elms->Add(TextElement::Atom(this), compiler->zone());
381cb0ef41Sopenharmony_ci  return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
391cb0ef41Sopenharmony_ci                                         on_success);
401cb0ef41Sopenharmony_ci}
411cb0ef41Sopenharmony_ci
421cb0ef41Sopenharmony_ciRegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
431cb0ef41Sopenharmony_ci                               RegExpNode* on_success) {
441cb0ef41Sopenharmony_ci  return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
451cb0ef41Sopenharmony_ci                                         on_success);
461cb0ef41Sopenharmony_ci}
471cb0ef41Sopenharmony_ci
481cb0ef41Sopenharmony_cinamespace {
491cb0ef41Sopenharmony_ci
501cb0ef41Sopenharmony_cibool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
511cb0ef41Sopenharmony_ci                          const int* special_class, int length) {
521cb0ef41Sopenharmony_ci  length--;  // Remove final marker.
531cb0ef41Sopenharmony_ci
541cb0ef41Sopenharmony_ci  DCHECK_EQ(kRangeEndMarker, special_class[length]);
551cb0ef41Sopenharmony_ci  DCHECK_NE(0, ranges->length());
561cb0ef41Sopenharmony_ci  DCHECK_NE(0, length);
571cb0ef41Sopenharmony_ci  DCHECK_NE(0, special_class[0]);
581cb0ef41Sopenharmony_ci
591cb0ef41Sopenharmony_ci  if (ranges->length() != (length >> 1) + 1) return false;
601cb0ef41Sopenharmony_ci
611cb0ef41Sopenharmony_ci  CharacterRange range = ranges->at(0);
621cb0ef41Sopenharmony_ci  if (range.from() != 0) return false;
631cb0ef41Sopenharmony_ci
641cb0ef41Sopenharmony_ci  for (int i = 0; i < length; i += 2) {
651cb0ef41Sopenharmony_ci    if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
661cb0ef41Sopenharmony_ci      return false;
671cb0ef41Sopenharmony_ci    }
681cb0ef41Sopenharmony_ci    range = ranges->at((i >> 1) + 1);
691cb0ef41Sopenharmony_ci    if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) {
701cb0ef41Sopenharmony_ci      return false;
711cb0ef41Sopenharmony_ci    }
721cb0ef41Sopenharmony_ci  }
731cb0ef41Sopenharmony_ci
741cb0ef41Sopenharmony_ci  return range.to() == kMaxCodePoint;
751cb0ef41Sopenharmony_ci}
761cb0ef41Sopenharmony_ci
771cb0ef41Sopenharmony_cibool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
781cb0ef41Sopenharmony_ci                   int length) {
791cb0ef41Sopenharmony_ci  length--;  // Remove final marker.
801cb0ef41Sopenharmony_ci
811cb0ef41Sopenharmony_ci  DCHECK_EQ(kRangeEndMarker, special_class[length]);
821cb0ef41Sopenharmony_ci  if (ranges->length() * 2 != length) return false;
831cb0ef41Sopenharmony_ci
841cb0ef41Sopenharmony_ci  for (int i = 0; i < length; i += 2) {
851cb0ef41Sopenharmony_ci    CharacterRange range = ranges->at(i >> 1);
861cb0ef41Sopenharmony_ci    if (range.from() != static_cast<base::uc32>(special_class[i]) ||
871cb0ef41Sopenharmony_ci        range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
881cb0ef41Sopenharmony_ci      return false;
891cb0ef41Sopenharmony_ci    }
901cb0ef41Sopenharmony_ci  }
911cb0ef41Sopenharmony_ci  return true;
921cb0ef41Sopenharmony_ci}
931cb0ef41Sopenharmony_ci
941cb0ef41Sopenharmony_ci}  // namespace
951cb0ef41Sopenharmony_ci
961cb0ef41Sopenharmony_cibool RegExpCharacterClass::is_standard(Zone* zone) {
971cb0ef41Sopenharmony_ci  // TODO(lrn): Remove need for this function, by not throwing away information
981cb0ef41Sopenharmony_ci  // along the way.
991cb0ef41Sopenharmony_ci  if (is_negated()) {
1001cb0ef41Sopenharmony_ci    return false;
1011cb0ef41Sopenharmony_ci  }
1021cb0ef41Sopenharmony_ci  if (set_.is_standard()) {
1031cb0ef41Sopenharmony_ci    return true;
1041cb0ef41Sopenharmony_ci  }
1051cb0ef41Sopenharmony_ci  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
1061cb0ef41Sopenharmony_ci    set_.set_standard_set_type(StandardCharacterSet::kWhitespace);
1071cb0ef41Sopenharmony_ci    return true;
1081cb0ef41Sopenharmony_ci  }
1091cb0ef41Sopenharmony_ci  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
1101cb0ef41Sopenharmony_ci    set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace);
1111cb0ef41Sopenharmony_ci    return true;
1121cb0ef41Sopenharmony_ci  }
1131cb0ef41Sopenharmony_ci  if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
1141cb0ef41Sopenharmony_ci                           kLineTerminatorRangeCount)) {
1151cb0ef41Sopenharmony_ci    set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator);
1161cb0ef41Sopenharmony_ci    return true;
1171cb0ef41Sopenharmony_ci  }
1181cb0ef41Sopenharmony_ci  if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
1191cb0ef41Sopenharmony_ci                    kLineTerminatorRangeCount)) {
1201cb0ef41Sopenharmony_ci    set_.set_standard_set_type(StandardCharacterSet::kLineTerminator);
1211cb0ef41Sopenharmony_ci    return true;
1221cb0ef41Sopenharmony_ci  }
1231cb0ef41Sopenharmony_ci  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
1241cb0ef41Sopenharmony_ci    set_.set_standard_set_type(StandardCharacterSet::kWord);
1251cb0ef41Sopenharmony_ci    return true;
1261cb0ef41Sopenharmony_ci  }
1271cb0ef41Sopenharmony_ci  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
1281cb0ef41Sopenharmony_ci    set_.set_standard_set_type(StandardCharacterSet::kNotWord);
1291cb0ef41Sopenharmony_ci    return true;
1301cb0ef41Sopenharmony_ci  }
1311cb0ef41Sopenharmony_ci  return false;
1321cb0ef41Sopenharmony_ci}
1331cb0ef41Sopenharmony_ci
1341cb0ef41Sopenharmony_ciUnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
1351cb0ef41Sopenharmony_ci  // The unicode range splitter categorizes given character ranges into:
1361cb0ef41Sopenharmony_ci  // - Code points from the BMP representable by one code unit.
1371cb0ef41Sopenharmony_ci  // - Code points outside the BMP that need to be split into surrogate pairs.
1381cb0ef41Sopenharmony_ci  // - Lone lead surrogates.
1391cb0ef41Sopenharmony_ci  // - Lone trail surrogates.
1401cb0ef41Sopenharmony_ci  // Lone surrogates are valid code points, even though no actual characters.
1411cb0ef41Sopenharmony_ci  // They require special matching to make sure we do not split surrogate pairs.
1421cb0ef41Sopenharmony_ci
1431cb0ef41Sopenharmony_ci  for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
1441cb0ef41Sopenharmony_ci}
1451cb0ef41Sopenharmony_ci
1461cb0ef41Sopenharmony_civoid UnicodeRangeSplitter::AddRange(CharacterRange range) {
1471cb0ef41Sopenharmony_ci  static constexpr base::uc32 kBmp1Start = 0;
1481cb0ef41Sopenharmony_ci  static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
1491cb0ef41Sopenharmony_ci  static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
1501cb0ef41Sopenharmony_ci  static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;
1511cb0ef41Sopenharmony_ci
1521cb0ef41Sopenharmony_ci  // Ends are all inclusive.
1531cb0ef41Sopenharmony_ci  STATIC_ASSERT(kBmp1Start == 0);
1541cb0ef41Sopenharmony_ci  STATIC_ASSERT(kBmp1Start < kBmp1End);
1551cb0ef41Sopenharmony_ci  STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
1561cb0ef41Sopenharmony_ci  STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
1571cb0ef41Sopenharmony_ci  STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
1581cb0ef41Sopenharmony_ci  STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
1591cb0ef41Sopenharmony_ci  STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
1601cb0ef41Sopenharmony_ci  STATIC_ASSERT(kBmp2Start < kBmp2End);
1611cb0ef41Sopenharmony_ci  STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
1621cb0ef41Sopenharmony_ci  STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
1631cb0ef41Sopenharmony_ci
1641cb0ef41Sopenharmony_ci  static constexpr base::uc32 kStarts[] = {
1651cb0ef41Sopenharmony_ci      kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
1661cb0ef41Sopenharmony_ci      kBmp2Start, kNonBmpStart,
1671cb0ef41Sopenharmony_ci  };
1681cb0ef41Sopenharmony_ci
1691cb0ef41Sopenharmony_ci  static constexpr base::uc32 kEnds[] = {
1701cb0ef41Sopenharmony_ci      kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
1711cb0ef41Sopenharmony_ci  };
1721cb0ef41Sopenharmony_ci
1731cb0ef41Sopenharmony_ci  CharacterRangeVector* const kTargets[] = {
1741cb0ef41Sopenharmony_ci      &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
1751cb0ef41Sopenharmony_ci  };
1761cb0ef41Sopenharmony_ci
1771cb0ef41Sopenharmony_ci  static constexpr int kCount = arraysize(kStarts);
1781cb0ef41Sopenharmony_ci  STATIC_ASSERT(kCount == arraysize(kEnds));
1791cb0ef41Sopenharmony_ci  STATIC_ASSERT(kCount == arraysize(kTargets));
1801cb0ef41Sopenharmony_ci
1811cb0ef41Sopenharmony_ci  for (int i = 0; i < kCount; i++) {
1821cb0ef41Sopenharmony_ci    if (kStarts[i] > range.to()) break;
1831cb0ef41Sopenharmony_ci    const base::uc32 from = std::max(kStarts[i], range.from());
1841cb0ef41Sopenharmony_ci    const base::uc32 to = std::min(kEnds[i], range.to());
1851cb0ef41Sopenharmony_ci    if (from > to) continue;
1861cb0ef41Sopenharmony_ci    kTargets[i]->emplace_back(CharacterRange::Range(from, to));
1871cb0ef41Sopenharmony_ci  }
1881cb0ef41Sopenharmony_ci}
1891cb0ef41Sopenharmony_ci
1901cb0ef41Sopenharmony_cinamespace {
1911cb0ef41Sopenharmony_ci
1921cb0ef41Sopenharmony_ci// Translates between new and old V8-isms (SmallVector, ZoneList).
1931cb0ef41Sopenharmony_ciZoneList<CharacterRange>* ToCanonicalZoneList(
1941cb0ef41Sopenharmony_ci    const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
1951cb0ef41Sopenharmony_ci  if (v->empty()) return nullptr;
1961cb0ef41Sopenharmony_ci
1971cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* result =
1981cb0ef41Sopenharmony_ci      zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
1991cb0ef41Sopenharmony_ci  for (size_t i = 0; i < v->size(); i++) {
2001cb0ef41Sopenharmony_ci    result->Add(v->at(i), zone);
2011cb0ef41Sopenharmony_ci  }
2021cb0ef41Sopenharmony_ci
2031cb0ef41Sopenharmony_ci  CharacterRange::Canonicalize(result);
2041cb0ef41Sopenharmony_ci  return result;
2051cb0ef41Sopenharmony_ci}
2061cb0ef41Sopenharmony_ci
2071cb0ef41Sopenharmony_civoid AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
2081cb0ef41Sopenharmony_ci                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
2091cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* bmp =
2101cb0ef41Sopenharmony_ci      ToCanonicalZoneList(splitter->bmp(), compiler->zone());
2111cb0ef41Sopenharmony_ci  if (bmp == nullptr) return;
2121cb0ef41Sopenharmony_ci  result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
2131cb0ef41Sopenharmony_ci      compiler->zone(), bmp, compiler->read_backward(), on_success)));
2141cb0ef41Sopenharmony_ci}
2151cb0ef41Sopenharmony_ci
2161cb0ef41Sopenharmony_ciusing UC16Range = uint32_t;  // {from, to} packed into one uint32_t.
2171cb0ef41Sopenharmony_ciconstexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
2181cb0ef41Sopenharmony_ci  return (static_cast<uint32_t>(from) << 16) | to;
2191cb0ef41Sopenharmony_ci}
2201cb0ef41Sopenharmony_ciconstexpr base::uc16 ExtractFrom(UC16Range r) {
2211cb0ef41Sopenharmony_ci  return static_cast<base::uc16>(r >> 16);
2221cb0ef41Sopenharmony_ci}
2231cb0ef41Sopenharmony_ciconstexpr base::uc16 ExtractTo(UC16Range r) {
2241cb0ef41Sopenharmony_ci  return static_cast<base::uc16>(r);
2251cb0ef41Sopenharmony_ci}
2261cb0ef41Sopenharmony_ci
2271cb0ef41Sopenharmony_civoid AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
2281cb0ef41Sopenharmony_ci                             RegExpNode* on_success,
2291cb0ef41Sopenharmony_ci                             UnicodeRangeSplitter* splitter) {
2301cb0ef41Sopenharmony_ci  DCHECK(!compiler->one_byte());
2311cb0ef41Sopenharmony_ci  Zone* const zone = compiler->zone();
2321cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* non_bmp =
2331cb0ef41Sopenharmony_ci      ToCanonicalZoneList(splitter->non_bmp(), zone);
2341cb0ef41Sopenharmony_ci  if (non_bmp == nullptr) return;
2351cb0ef41Sopenharmony_ci
2361cb0ef41Sopenharmony_ci  // Translate each 32-bit code point range into the corresponding 16-bit code
2371cb0ef41Sopenharmony_ci  // unit representation consisting of the lead- and trail surrogate.
2381cb0ef41Sopenharmony_ci  //
2391cb0ef41Sopenharmony_ci  // The generated alternatives are grouped by the leading surrogate to avoid
2401cb0ef41Sopenharmony_ci  // emitting excessive code. For example, for
2411cb0ef41Sopenharmony_ci  //
2421cb0ef41Sopenharmony_ci  //  { \ud800[\udc00-\udc01]
2431cb0ef41Sopenharmony_ci  //  , \ud800[\udc05-\udc06]
2441cb0ef41Sopenharmony_ci  //  }
2451cb0ef41Sopenharmony_ci  //
2461cb0ef41Sopenharmony_ci  // there's no need to emit matching code for the leading surrogate \ud800
2471cb0ef41Sopenharmony_ci  // twice. We also create a dedicated grouping for full trailing ranges, i.e.
2481cb0ef41Sopenharmony_ci  // [dc00-dfff].
2491cb0ef41Sopenharmony_ci  ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
2501cb0ef41Sopenharmony_ci      zone);
2511cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* leading_with_full_trailing_range =
2521cb0ef41Sopenharmony_ci      zone->New<ZoneList<CharacterRange>>(1, zone);
2531cb0ef41Sopenharmony_ci  const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
2541cb0ef41Sopenharmony_ci                            base::uc16 from_t, base::uc16 to_t) {
2551cb0ef41Sopenharmony_ci    const UC16Range leading_range = ToUC16Range(from_l, to_l);
2561cb0ef41Sopenharmony_ci    if (grouped_by_leading.count(leading_range) == 0) {
2571cb0ef41Sopenharmony_ci      if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
2581cb0ef41Sopenharmony_ci        leading_with_full_trailing_range->Add(
2591cb0ef41Sopenharmony_ci            CharacterRange::Range(from_l, to_l), zone);
2601cb0ef41Sopenharmony_ci        return;
2611cb0ef41Sopenharmony_ci      }
2621cb0ef41Sopenharmony_ci      grouped_by_leading[leading_range] =
2631cb0ef41Sopenharmony_ci          zone->New<ZoneList<CharacterRange>>(2, zone);
2641cb0ef41Sopenharmony_ci    }
2651cb0ef41Sopenharmony_ci    grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
2661cb0ef41Sopenharmony_ci                                           zone);
2671cb0ef41Sopenharmony_ci  };
2681cb0ef41Sopenharmony_ci
2691cb0ef41Sopenharmony_ci  // First, create the grouped ranges.
2701cb0ef41Sopenharmony_ci  CharacterRange::Canonicalize(non_bmp);
2711cb0ef41Sopenharmony_ci  for (int i = 0; i < non_bmp->length(); i++) {
2721cb0ef41Sopenharmony_ci    // Match surrogate pair.
2731cb0ef41Sopenharmony_ci    // E.g. [\u10005-\u11005] becomes
2741cb0ef41Sopenharmony_ci    //      \ud800[\udc05-\udfff]|
2751cb0ef41Sopenharmony_ci    //      [\ud801-\ud803][\udc00-\udfff]|
2761cb0ef41Sopenharmony_ci    //      \ud804[\udc00-\udc05]
2771cb0ef41Sopenharmony_ci    base::uc32 from = non_bmp->at(i).from();
2781cb0ef41Sopenharmony_ci    base::uc32 to = non_bmp->at(i).to();
2791cb0ef41Sopenharmony_ci    base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
2801cb0ef41Sopenharmony_ci    base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
2811cb0ef41Sopenharmony_ci    base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
2821cb0ef41Sopenharmony_ci    base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
2831cb0ef41Sopenharmony_ci
2841cb0ef41Sopenharmony_ci    if (from_l == to_l) {
2851cb0ef41Sopenharmony_ci      // The lead surrogate is the same.
2861cb0ef41Sopenharmony_ci      AddRange(from_l, to_l, from_t, to_t);
2871cb0ef41Sopenharmony_ci      continue;
2881cb0ef41Sopenharmony_ci    }
2891cb0ef41Sopenharmony_ci
2901cb0ef41Sopenharmony_ci    if (from_t != kTrailSurrogateStart) {
2911cb0ef41Sopenharmony_ci      // Add [from_l][from_t-\udfff].
2921cb0ef41Sopenharmony_ci      AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
2931cb0ef41Sopenharmony_ci      from_l++;
2941cb0ef41Sopenharmony_ci    }
2951cb0ef41Sopenharmony_ci    if (to_t != kTrailSurrogateEnd) {
2961cb0ef41Sopenharmony_ci      // Add [to_l][\udc00-to_t].
2971cb0ef41Sopenharmony_ci      AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
2981cb0ef41Sopenharmony_ci      to_l--;
2991cb0ef41Sopenharmony_ci    }
3001cb0ef41Sopenharmony_ci    if (from_l <= to_l) {
3011cb0ef41Sopenharmony_ci      // Add [from_l-to_l][\udc00-\udfff].
3021cb0ef41Sopenharmony_ci      AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
3031cb0ef41Sopenharmony_ci    }
3041cb0ef41Sopenharmony_ci  }
3051cb0ef41Sopenharmony_ci
3061cb0ef41Sopenharmony_ci  // Create the actual TextNode now that ranges are fully grouped.
3071cb0ef41Sopenharmony_ci  if (!leading_with_full_trailing_range->is_empty()) {
3081cb0ef41Sopenharmony_ci    CharacterRange::Canonicalize(leading_with_full_trailing_range);
3091cb0ef41Sopenharmony_ci    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
3101cb0ef41Sopenharmony_ci        zone, leading_with_full_trailing_range,
3111cb0ef41Sopenharmony_ci        CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
3121cb0ef41Sopenharmony_ci        compiler->read_backward(), on_success)));
3131cb0ef41Sopenharmony_ci  }
3141cb0ef41Sopenharmony_ci  for (const auto& it : grouped_by_leading) {
3151cb0ef41Sopenharmony_ci    CharacterRange leading_range =
3161cb0ef41Sopenharmony_ci        CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
3171cb0ef41Sopenharmony_ci    ZoneList<CharacterRange>* trailing_ranges = it.second;
3181cb0ef41Sopenharmony_ci    CharacterRange::Canonicalize(trailing_ranges);
3191cb0ef41Sopenharmony_ci    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
3201cb0ef41Sopenharmony_ci        zone, leading_range, trailing_ranges, compiler->read_backward(),
3211cb0ef41Sopenharmony_ci        on_success)));
3221cb0ef41Sopenharmony_ci  }
3231cb0ef41Sopenharmony_ci}
3241cb0ef41Sopenharmony_ci
3251cb0ef41Sopenharmony_ciRegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
3261cb0ef41Sopenharmony_ci    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
3271cb0ef41Sopenharmony_ci    ZoneList<CharacterRange>* match, RegExpNode* on_success,
3281cb0ef41Sopenharmony_ci    bool read_backward) {
3291cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
3301cb0ef41Sopenharmony_ci  RegExpNode* match_node = TextNode::CreateForCharacterRanges(
3311cb0ef41Sopenharmony_ci      zone, match, read_backward, on_success);
3321cb0ef41Sopenharmony_ci  int stack_register = compiler->UnicodeLookaroundStackRegister();
3331cb0ef41Sopenharmony_ci  int position_register = compiler->UnicodeLookaroundPositionRegister();
3341cb0ef41Sopenharmony_ci  RegExpLookaround::Builder lookaround(false, match_node, stack_register,
3351cb0ef41Sopenharmony_ci                                       position_register);
3361cb0ef41Sopenharmony_ci  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
3371cb0ef41Sopenharmony_ci      zone, lookbehind, !read_backward, lookaround.on_match_success());
3381cb0ef41Sopenharmony_ci  return lookaround.ForMatch(negative_match);
3391cb0ef41Sopenharmony_ci}
3401cb0ef41Sopenharmony_ci
3411cb0ef41Sopenharmony_ciRegExpNode* MatchAndNegativeLookaroundInReadDirection(
3421cb0ef41Sopenharmony_ci    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
3431cb0ef41Sopenharmony_ci    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
3441cb0ef41Sopenharmony_ci    bool read_backward) {
3451cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
3461cb0ef41Sopenharmony_ci  int stack_register = compiler->UnicodeLookaroundStackRegister();
3471cb0ef41Sopenharmony_ci  int position_register = compiler->UnicodeLookaroundPositionRegister();
3481cb0ef41Sopenharmony_ci  RegExpLookaround::Builder lookaround(false, on_success, stack_register,
3491cb0ef41Sopenharmony_ci                                       position_register);
3501cb0ef41Sopenharmony_ci  RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
3511cb0ef41Sopenharmony_ci      zone, lookahead, read_backward, lookaround.on_match_success());
3521cb0ef41Sopenharmony_ci  return TextNode::CreateForCharacterRanges(
3531cb0ef41Sopenharmony_ci      zone, match, read_backward, lookaround.ForMatch(negative_match));
3541cb0ef41Sopenharmony_ci}
3551cb0ef41Sopenharmony_ci
3561cb0ef41Sopenharmony_civoid AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
3571cb0ef41Sopenharmony_ci                           RegExpNode* on_success,
3581cb0ef41Sopenharmony_ci                           UnicodeRangeSplitter* splitter) {
3591cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* lead_surrogates =
3601cb0ef41Sopenharmony_ci      ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
3611cb0ef41Sopenharmony_ci  if (lead_surrogates == nullptr) return;
3621cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
3631cb0ef41Sopenharmony_ci  // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
3641cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
3651cb0ef41Sopenharmony_ci      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
3661cb0ef41Sopenharmony_ci
3671cb0ef41Sopenharmony_ci  RegExpNode* match;
3681cb0ef41Sopenharmony_ci  if (compiler->read_backward()) {
3691cb0ef41Sopenharmony_ci    // Reading backward. Assert that reading forward, there is no trail
3701cb0ef41Sopenharmony_ci    // surrogate, and then backward match the lead surrogate.
3711cb0ef41Sopenharmony_ci    match = NegativeLookaroundAgainstReadDirectionAndMatch(
3721cb0ef41Sopenharmony_ci        compiler, trail_surrogates, lead_surrogates, on_success, true);
3731cb0ef41Sopenharmony_ci  } else {
3741cb0ef41Sopenharmony_ci    // Reading forward. Forward match the lead surrogate and assert that
3751cb0ef41Sopenharmony_ci    // no trail surrogate follows.
3761cb0ef41Sopenharmony_ci    match = MatchAndNegativeLookaroundInReadDirection(
3771cb0ef41Sopenharmony_ci        compiler, lead_surrogates, trail_surrogates, on_success, false);
3781cb0ef41Sopenharmony_ci  }
3791cb0ef41Sopenharmony_ci  result->AddAlternative(GuardedAlternative(match));
3801cb0ef41Sopenharmony_ci}
3811cb0ef41Sopenharmony_ci
3821cb0ef41Sopenharmony_civoid AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
3831cb0ef41Sopenharmony_ci                            RegExpNode* on_success,
3841cb0ef41Sopenharmony_ci                            UnicodeRangeSplitter* splitter) {
3851cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* trail_surrogates =
3861cb0ef41Sopenharmony_ci      ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
3871cb0ef41Sopenharmony_ci  if (trail_surrogates == nullptr) return;
3881cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
3891cb0ef41Sopenharmony_ci  // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
3901cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
3911cb0ef41Sopenharmony_ci      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
3921cb0ef41Sopenharmony_ci
3931cb0ef41Sopenharmony_ci  RegExpNode* match;
3941cb0ef41Sopenharmony_ci  if (compiler->read_backward()) {
3951cb0ef41Sopenharmony_ci    // Reading backward. Backward match the trail surrogate and assert that no
3961cb0ef41Sopenharmony_ci    // lead surrogate precedes it.
3971cb0ef41Sopenharmony_ci    match = MatchAndNegativeLookaroundInReadDirection(
3981cb0ef41Sopenharmony_ci        compiler, trail_surrogates, lead_surrogates, on_success, true);
3991cb0ef41Sopenharmony_ci  } else {
4001cb0ef41Sopenharmony_ci    // Reading forward. Assert that reading backward, there is no lead
4011cb0ef41Sopenharmony_ci    // surrogate, and then forward match the trail surrogate.
4021cb0ef41Sopenharmony_ci    match = NegativeLookaroundAgainstReadDirectionAndMatch(
4031cb0ef41Sopenharmony_ci        compiler, lead_surrogates, trail_surrogates, on_success, false);
4041cb0ef41Sopenharmony_ci  }
4051cb0ef41Sopenharmony_ci  result->AddAlternative(GuardedAlternative(match));
4061cb0ef41Sopenharmony_ci}
4071cb0ef41Sopenharmony_ci
4081cb0ef41Sopenharmony_ciRegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
4091cb0ef41Sopenharmony_ci                              RegExpNode* on_success) {
4101cb0ef41Sopenharmony_ci  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
4111cb0ef41Sopenharmony_ci  DCHECK(!compiler->read_backward());
4121cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
4131cb0ef41Sopenharmony_ci  // Advance any character. If the character happens to be a lead surrogate and
4141cb0ef41Sopenharmony_ci  // we advanced into the middle of a surrogate pair, it will work out, as
4151cb0ef41Sopenharmony_ci  // nothing will match from there. We will have to advance again, consuming
4161cb0ef41Sopenharmony_ci  // the associated trail surrogate.
4171cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* range =
4181cb0ef41Sopenharmony_ci      CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit));
4191cb0ef41Sopenharmony_ci  return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
4201cb0ef41Sopenharmony_ci}
4211cb0ef41Sopenharmony_ci
4221cb0ef41Sopenharmony_civoid AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
4231cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
4241cb0ef41Sopenharmony_ci  DCHECK(CharacterRange::IsCanonical(ranges));
4251cb0ef41Sopenharmony_ci
4261cb0ef41Sopenharmony_ci  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
4271cb0ef41Sopenharmony_ci  // See also https://crbug.com/v8/6727.
4281cb0ef41Sopenharmony_ci  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
4291cb0ef41Sopenharmony_ci  // which we use frequently internally. But large ranges can also easily be
4301cb0ef41Sopenharmony_ci  // created by the user. We might want to have a more general caching mechanism
4311cb0ef41Sopenharmony_ci  // for such ranges.
4321cb0ef41Sopenharmony_ci  if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
4331cb0ef41Sopenharmony_ci
4341cb0ef41Sopenharmony_ci  // Use ICU to compute the case fold closure over the ranges.
4351cb0ef41Sopenharmony_ci  icu::UnicodeSet set;
4361cb0ef41Sopenharmony_ci  for (int i = 0; i < ranges->length(); i++) {
4371cb0ef41Sopenharmony_ci    set.add(ranges->at(i).from(), ranges->at(i).to());
4381cb0ef41Sopenharmony_ci  }
4391cb0ef41Sopenharmony_ci  // Clear the ranges list without freeing the backing store.
4401cb0ef41Sopenharmony_ci  ranges->Rewind(0);
4411cb0ef41Sopenharmony_ci  set.closeOver(USET_CASE_INSENSITIVE);
4421cb0ef41Sopenharmony_ci  // Full case mapping map single characters to multiple characters.
4431cb0ef41Sopenharmony_ci  // Those are represented as strings in the set. Remove them so that
4441cb0ef41Sopenharmony_ci  // we end up with only simple and common case mappings.
4451cb0ef41Sopenharmony_ci  set.removeAllStrings();
4461cb0ef41Sopenharmony_ci  for (int i = 0; i < set.getRangeCount(); i++) {
4471cb0ef41Sopenharmony_ci    ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
4481cb0ef41Sopenharmony_ci                zone);
4491cb0ef41Sopenharmony_ci  }
4501cb0ef41Sopenharmony_ci  // No errors and everything we collected have been ranges.
4511cb0ef41Sopenharmony_ci  CharacterRange::Canonicalize(ranges);
4521cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
4531cb0ef41Sopenharmony_ci}
4541cb0ef41Sopenharmony_ci
4551cb0ef41Sopenharmony_ci}  // namespace
4561cb0ef41Sopenharmony_ci
4571cb0ef41Sopenharmony_ciRegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4581cb0ef41Sopenharmony_ci                                         RegExpNode* on_success) {
4591cb0ef41Sopenharmony_ci  set_.Canonicalize();
4601cb0ef41Sopenharmony_ci  Zone* const zone = compiler->zone();
4611cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* ranges = this->ranges(zone);
4621cb0ef41Sopenharmony_ci
4631cb0ef41Sopenharmony_ci  if (NeedsUnicodeCaseEquivalents(compiler->flags())) {
4641cb0ef41Sopenharmony_ci    AddUnicodeCaseEquivalents(ranges, zone);
4651cb0ef41Sopenharmony_ci  }
4661cb0ef41Sopenharmony_ci
4671cb0ef41Sopenharmony_ci  if (!IsUnicode(compiler->flags()) || compiler->one_byte() ||
4681cb0ef41Sopenharmony_ci      contains_split_surrogate()) {
4691cb0ef41Sopenharmony_ci    return zone->New<TextNode>(this, compiler->read_backward(), on_success);
4701cb0ef41Sopenharmony_ci  }
4711cb0ef41Sopenharmony_ci
4721cb0ef41Sopenharmony_ci  if (is_negated()) {
4731cb0ef41Sopenharmony_ci    ZoneList<CharacterRange>* negated =
4741cb0ef41Sopenharmony_ci        zone->New<ZoneList<CharacterRange>>(2, zone);
4751cb0ef41Sopenharmony_ci    CharacterRange::Negate(ranges, negated, zone);
4761cb0ef41Sopenharmony_ci    ranges = negated;
4771cb0ef41Sopenharmony_ci  }
4781cb0ef41Sopenharmony_ci
4791cb0ef41Sopenharmony_ci  if (ranges->length() == 0) {
4801cb0ef41Sopenharmony_ci    // The empty character class is used as a 'fail' node.
4811cb0ef41Sopenharmony_ci    RegExpCharacterClass* fail = zone->New<RegExpCharacterClass>(zone, ranges);
4821cb0ef41Sopenharmony_ci    return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
4831cb0ef41Sopenharmony_ci  }
4841cb0ef41Sopenharmony_ci
4851cb0ef41Sopenharmony_ci  if (set_.is_standard() &&
4861cb0ef41Sopenharmony_ci      standard_type() == StandardCharacterSet::kEverything) {
4871cb0ef41Sopenharmony_ci    return UnanchoredAdvance(compiler, on_success);
4881cb0ef41Sopenharmony_ci  }
4891cb0ef41Sopenharmony_ci
4901cb0ef41Sopenharmony_ci  // Split ranges in order to handle surrogates correctly:
4911cb0ef41Sopenharmony_ci  // - Surrogate pairs: translate the 32-bit code point into two uc16 code
4921cb0ef41Sopenharmony_ci  //   units (irregexp operates only on code units).
4931cb0ef41Sopenharmony_ci  // - Lone surrogates: these require lookarounds to ensure we don't match in
4941cb0ef41Sopenharmony_ci  //   the middle of a surrogate pair.
4951cb0ef41Sopenharmony_ci  ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
4961cb0ef41Sopenharmony_ci  UnicodeRangeSplitter splitter(ranges);
4971cb0ef41Sopenharmony_ci  AddBmpCharacters(compiler, result, on_success, &splitter);
4981cb0ef41Sopenharmony_ci  AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
4991cb0ef41Sopenharmony_ci  AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5001cb0ef41Sopenharmony_ci  AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5011cb0ef41Sopenharmony_ci
5021cb0ef41Sopenharmony_ci  static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
5031cb0ef41Sopenharmony_ci  if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
5041cb0ef41Sopenharmony_ci
5051cb0ef41Sopenharmony_ci  return result;
5061cb0ef41Sopenharmony_ci}
5071cb0ef41Sopenharmony_ci
5081cb0ef41Sopenharmony_cinamespace {
5091cb0ef41Sopenharmony_ci
5101cb0ef41Sopenharmony_ciint CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5111cb0ef41Sopenharmony_ci  RegExpAtom* atom1 = (*a)->AsAtom();
5121cb0ef41Sopenharmony_ci  RegExpAtom* atom2 = (*b)->AsAtom();
5131cb0ef41Sopenharmony_ci  base::uc16 character1 = atom1->data().at(0);
5141cb0ef41Sopenharmony_ci  base::uc16 character2 = atom2->data().at(0);
5151cb0ef41Sopenharmony_ci  if (character1 < character2) return -1;
5161cb0ef41Sopenharmony_ci  if (character1 > character2) return 1;
5171cb0ef41Sopenharmony_ci  return 0;
5181cb0ef41Sopenharmony_ci}
5191cb0ef41Sopenharmony_ci
5201cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
5211cb0ef41Sopenharmony_ci
5221cb0ef41Sopenharmony_ciint CompareCaseInsensitive(const icu::UnicodeString& a,
5231cb0ef41Sopenharmony_ci                           const icu::UnicodeString& b) {
5241cb0ef41Sopenharmony_ci  return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
5251cb0ef41Sopenharmony_ci}
5261cb0ef41Sopenharmony_ci
5271cb0ef41Sopenharmony_ciint CompareFirstCharCaseInsensitive(RegExpTree* const* a,
5281cb0ef41Sopenharmony_ci                                    RegExpTree* const* b) {
5291cb0ef41Sopenharmony_ci  RegExpAtom* atom1 = (*a)->AsAtom();
5301cb0ef41Sopenharmony_ci  RegExpAtom* atom2 = (*b)->AsAtom();
5311cb0ef41Sopenharmony_ci  return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)},
5321cb0ef41Sopenharmony_ci                                icu::UnicodeString{atom2->data().at(0)});
5331cb0ef41Sopenharmony_ci}
5341cb0ef41Sopenharmony_ci
5351cb0ef41Sopenharmony_cibool Equals(bool ignore_case, const icu::UnicodeString& a,
5361cb0ef41Sopenharmony_ci            const icu::UnicodeString& b) {
5371cb0ef41Sopenharmony_ci  if (a == b) return true;
5381cb0ef41Sopenharmony_ci  if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
5391cb0ef41Sopenharmony_ci  return false;  // Case-sensitive equality already checked above.
5401cb0ef41Sopenharmony_ci}
5411cb0ef41Sopenharmony_ci
5421cb0ef41Sopenharmony_cibool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
5431cb0ef41Sopenharmony_ci                  const RegExpAtom* b) {
5441cb0ef41Sopenharmony_ci  return Equals(ignore_case, a->data().at(index), b->data().at(index));
5451cb0ef41Sopenharmony_ci}
5461cb0ef41Sopenharmony_ci
5471cb0ef41Sopenharmony_ci#else
5481cb0ef41Sopenharmony_ci
5491cb0ef41Sopenharmony_ciunibrow::uchar Canonical(
5501cb0ef41Sopenharmony_ci    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5511cb0ef41Sopenharmony_ci    unibrow::uchar c) {
5521cb0ef41Sopenharmony_ci  unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5531cb0ef41Sopenharmony_ci  int length = canonicalize->get(c, '\0', chars);
5541cb0ef41Sopenharmony_ci  DCHECK_LE(length, 1);
5551cb0ef41Sopenharmony_ci  unibrow::uchar canonical = c;
5561cb0ef41Sopenharmony_ci  if (length == 1) canonical = chars[0];
5571cb0ef41Sopenharmony_ci  return canonical;
5581cb0ef41Sopenharmony_ci}
5591cb0ef41Sopenharmony_ci
5601cb0ef41Sopenharmony_ciint CompareCaseInsensitive(
5611cb0ef41Sopenharmony_ci    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5621cb0ef41Sopenharmony_ci    unibrow::uchar a, unibrow::uchar b) {
5631cb0ef41Sopenharmony_ci  if (a == b) return 0;
5641cb0ef41Sopenharmony_ci  if (a >= 'a' || b >= 'a') {
5651cb0ef41Sopenharmony_ci    a = Canonical(canonicalize, a);
5661cb0ef41Sopenharmony_ci    b = Canonical(canonicalize, b);
5671cb0ef41Sopenharmony_ci  }
5681cb0ef41Sopenharmony_ci  return static_cast<int>(a) - static_cast<int>(b);
5691cb0ef41Sopenharmony_ci}
5701cb0ef41Sopenharmony_ci
5711cb0ef41Sopenharmony_ciint CompareFirstCharCaseInsensitive(
5721cb0ef41Sopenharmony_ci    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5731cb0ef41Sopenharmony_ci    RegExpTree* const* a, RegExpTree* const* b) {
5741cb0ef41Sopenharmony_ci  RegExpAtom* atom1 = (*a)->AsAtom();
5751cb0ef41Sopenharmony_ci  RegExpAtom* atom2 = (*b)->AsAtom();
5761cb0ef41Sopenharmony_ci  return CompareCaseInsensitive(canonicalize, atom1->data().at(0),
5771cb0ef41Sopenharmony_ci                                atom2->data().at(0));
5781cb0ef41Sopenharmony_ci}
5791cb0ef41Sopenharmony_ci
5801cb0ef41Sopenharmony_cibool Equals(bool ignore_case,
5811cb0ef41Sopenharmony_ci            unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5821cb0ef41Sopenharmony_ci            unibrow::uchar a, unibrow::uchar b) {
5831cb0ef41Sopenharmony_ci  if (a == b) return true;
5841cb0ef41Sopenharmony_ci  if (ignore_case) {
5851cb0ef41Sopenharmony_ci    return CompareCaseInsensitive(canonicalize, a, b) == 0;
5861cb0ef41Sopenharmony_ci  }
5871cb0ef41Sopenharmony_ci  return false;  // Case-sensitive equality already checked above.
5881cb0ef41Sopenharmony_ci}
5891cb0ef41Sopenharmony_ci
5901cb0ef41Sopenharmony_cibool CharAtEquals(bool ignore_case,
5911cb0ef41Sopenharmony_ci                  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5921cb0ef41Sopenharmony_ci                  int index, const RegExpAtom* a, const RegExpAtom* b) {
5931cb0ef41Sopenharmony_ci  return Equals(ignore_case, canonicalize, a->data().at(index),
5941cb0ef41Sopenharmony_ci                b->data().at(index));
5951cb0ef41Sopenharmony_ci}
5961cb0ef41Sopenharmony_ci
5971cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
5981cb0ef41Sopenharmony_ci
5991cb0ef41Sopenharmony_ci}  // namespace
6001cb0ef41Sopenharmony_ci
6011cb0ef41Sopenharmony_ci// We can stable sort runs of atoms, since the order does not matter if they
6021cb0ef41Sopenharmony_ci// start with different characters.
6031cb0ef41Sopenharmony_ci// Returns true if any consecutive atoms were found.
6041cb0ef41Sopenharmony_cibool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
6051cb0ef41Sopenharmony_ci  ZoneList<RegExpTree*>* alternatives = this->alternatives();
6061cb0ef41Sopenharmony_ci  int length = alternatives->length();
6071cb0ef41Sopenharmony_ci  bool found_consecutive_atoms = false;
6081cb0ef41Sopenharmony_ci  for (int i = 0; i < length; i++) {
6091cb0ef41Sopenharmony_ci    while (i < length) {
6101cb0ef41Sopenharmony_ci      RegExpTree* alternative = alternatives->at(i);
6111cb0ef41Sopenharmony_ci      if (alternative->IsAtom()) break;
6121cb0ef41Sopenharmony_ci      i++;
6131cb0ef41Sopenharmony_ci    }
6141cb0ef41Sopenharmony_ci    // i is length or it is the index of an atom.
6151cb0ef41Sopenharmony_ci    if (i == length) break;
6161cb0ef41Sopenharmony_ci    int first_atom = i;
6171cb0ef41Sopenharmony_ci    i++;
6181cb0ef41Sopenharmony_ci    while (i < length) {
6191cb0ef41Sopenharmony_ci      RegExpTree* alternative = alternatives->at(i);
6201cb0ef41Sopenharmony_ci      if (!alternative->IsAtom()) break;
6211cb0ef41Sopenharmony_ci      i++;
6221cb0ef41Sopenharmony_ci    }
6231cb0ef41Sopenharmony_ci    // Sort atoms to get ones with common prefixes together.
6241cb0ef41Sopenharmony_ci    // This step is more tricky if we are in a case-independent regexp,
6251cb0ef41Sopenharmony_ci    // because it would change /is|I/ to /I|is/, and order matters when
6261cb0ef41Sopenharmony_ci    // the regexp parts don't match only disjoint starting points. To fix
6271cb0ef41Sopenharmony_ci    // this we have a version of CompareFirstChar that uses case-
6281cb0ef41Sopenharmony_ci    // independent character classes for comparison.
6291cb0ef41Sopenharmony_ci    DCHECK_LT(first_atom, alternatives->length());
6301cb0ef41Sopenharmony_ci    DCHECK_LE(i, alternatives->length());
6311cb0ef41Sopenharmony_ci    DCHECK_LE(first_atom, i);
6321cb0ef41Sopenharmony_ci    if (IsIgnoreCase(compiler->flags())) {
6331cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
6341cb0ef41Sopenharmony_ci      alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom,
6351cb0ef41Sopenharmony_ci                               i - first_atom);
6361cb0ef41Sopenharmony_ci#else
6371cb0ef41Sopenharmony_ci      unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
6381cb0ef41Sopenharmony_ci          compiler->isolate()->regexp_macro_assembler_canonicalize();
6391cb0ef41Sopenharmony_ci      auto compare_closure = [canonicalize](RegExpTree* const* a,
6401cb0ef41Sopenharmony_ci                                            RegExpTree* const* b) {
6411cb0ef41Sopenharmony_ci        return CompareFirstCharCaseInsensitive(canonicalize, a, b);
6421cb0ef41Sopenharmony_ci      };
6431cb0ef41Sopenharmony_ci      alternatives->StableSort(compare_closure, first_atom, i - first_atom);
6441cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
6451cb0ef41Sopenharmony_ci    } else {
6461cb0ef41Sopenharmony_ci      alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
6471cb0ef41Sopenharmony_ci    }
6481cb0ef41Sopenharmony_ci    if (i - first_atom > 1) found_consecutive_atoms = true;
6491cb0ef41Sopenharmony_ci  }
6501cb0ef41Sopenharmony_ci  return found_consecutive_atoms;
6511cb0ef41Sopenharmony_ci}
6521cb0ef41Sopenharmony_ci
6531cb0ef41Sopenharmony_ci// Optimizes ab|ac|az to a(?:b|c|d).
6541cb0ef41Sopenharmony_civoid RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
6551cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
6561cb0ef41Sopenharmony_ci  ZoneList<RegExpTree*>* alternatives = this->alternatives();
6571cb0ef41Sopenharmony_ci  int length = alternatives->length();
6581cb0ef41Sopenharmony_ci  const bool ignore_case = IsIgnoreCase(compiler->flags());
6591cb0ef41Sopenharmony_ci
6601cb0ef41Sopenharmony_ci  int write_posn = 0;
6611cb0ef41Sopenharmony_ci  int i = 0;
6621cb0ef41Sopenharmony_ci  while (i < length) {
6631cb0ef41Sopenharmony_ci    RegExpTree* alternative = alternatives->at(i);
6641cb0ef41Sopenharmony_ci    if (!alternative->IsAtom()) {
6651cb0ef41Sopenharmony_ci      alternatives->at(write_posn++) = alternatives->at(i);
6661cb0ef41Sopenharmony_ci      i++;
6671cb0ef41Sopenharmony_ci      continue;
6681cb0ef41Sopenharmony_ci    }
6691cb0ef41Sopenharmony_ci    RegExpAtom* const atom = alternative->AsAtom();
6701cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
6711cb0ef41Sopenharmony_ci    icu::UnicodeString common_prefix(atom->data().at(0));
6721cb0ef41Sopenharmony_ci#else
6731cb0ef41Sopenharmony_ci    unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
6741cb0ef41Sopenharmony_ci        compiler->isolate()->regexp_macro_assembler_canonicalize();
6751cb0ef41Sopenharmony_ci    unibrow::uchar common_prefix = atom->data().at(0);
6761cb0ef41Sopenharmony_ci    if (ignore_case) {
6771cb0ef41Sopenharmony_ci      common_prefix = Canonical(canonicalize, common_prefix);
6781cb0ef41Sopenharmony_ci    }
6791cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
6801cb0ef41Sopenharmony_ci    int first_with_prefix = i;
6811cb0ef41Sopenharmony_ci    int prefix_length = atom->length();
6821cb0ef41Sopenharmony_ci    i++;
6831cb0ef41Sopenharmony_ci    while (i < length) {
6841cb0ef41Sopenharmony_ci      alternative = alternatives->at(i);
6851cb0ef41Sopenharmony_ci      if (!alternative->IsAtom()) break;
6861cb0ef41Sopenharmony_ci      RegExpAtom* const alt_atom = alternative->AsAtom();
6871cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
6881cb0ef41Sopenharmony_ci      icu::UnicodeString new_prefix(alt_atom->data().at(0));
6891cb0ef41Sopenharmony_ci      if (!Equals(ignore_case, new_prefix, common_prefix)) break;
6901cb0ef41Sopenharmony_ci#else
6911cb0ef41Sopenharmony_ci      unibrow::uchar new_prefix = alt_atom->data().at(0);
6921cb0ef41Sopenharmony_ci      if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break;
6931cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
6941cb0ef41Sopenharmony_ci      prefix_length = std::min(prefix_length, alt_atom->length());
6951cb0ef41Sopenharmony_ci      i++;
6961cb0ef41Sopenharmony_ci    }
6971cb0ef41Sopenharmony_ci    if (i > first_with_prefix + 2) {
6981cb0ef41Sopenharmony_ci      // Found worthwhile run of alternatives with common prefix of at least one
6991cb0ef41Sopenharmony_ci      // character.  The sorting function above did not sort on more than one
7001cb0ef41Sopenharmony_ci      // character for reasons of correctness, but there may still be a longer
7011cb0ef41Sopenharmony_ci      // common prefix if the terms were similar or presorted in the input.
7021cb0ef41Sopenharmony_ci      // Find out how long the common prefix is.
7031cb0ef41Sopenharmony_ci      int run_length = i - first_with_prefix;
7041cb0ef41Sopenharmony_ci      RegExpAtom* const alt_atom =
7051cb0ef41Sopenharmony_ci          alternatives->at(first_with_prefix)->AsAtom();
7061cb0ef41Sopenharmony_ci      for (int j = 1; j < run_length && prefix_length > 1; j++) {
7071cb0ef41Sopenharmony_ci        RegExpAtom* old_atom =
7081cb0ef41Sopenharmony_ci            alternatives->at(j + first_with_prefix)->AsAtom();
7091cb0ef41Sopenharmony_ci        for (int k = 1; k < prefix_length; k++) {
7101cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
7111cb0ef41Sopenharmony_ci          if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) {
7121cb0ef41Sopenharmony_ci#else
7131cb0ef41Sopenharmony_ci          if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) {
7141cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
7151cb0ef41Sopenharmony_ci            prefix_length = k;
7161cb0ef41Sopenharmony_ci            break;
7171cb0ef41Sopenharmony_ci          }
7181cb0ef41Sopenharmony_ci        }
7191cb0ef41Sopenharmony_ci      }
7201cb0ef41Sopenharmony_ci      RegExpAtom* prefix =
7211cb0ef41Sopenharmony_ci          zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length));
7221cb0ef41Sopenharmony_ci      ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
7231cb0ef41Sopenharmony_ci      pair->Add(prefix, zone);
7241cb0ef41Sopenharmony_ci      ZoneList<RegExpTree*>* suffixes =
7251cb0ef41Sopenharmony_ci          zone->New<ZoneList<RegExpTree*>>(run_length, zone);
7261cb0ef41Sopenharmony_ci      for (int j = 0; j < run_length; j++) {
7271cb0ef41Sopenharmony_ci        RegExpAtom* old_atom =
7281cb0ef41Sopenharmony_ci            alternatives->at(j + first_with_prefix)->AsAtom();
7291cb0ef41Sopenharmony_ci        int len = old_atom->length();
7301cb0ef41Sopenharmony_ci        if (len == prefix_length) {
7311cb0ef41Sopenharmony_ci          suffixes->Add(zone->New<RegExpEmpty>(), zone);
7321cb0ef41Sopenharmony_ci        } else {
7331cb0ef41Sopenharmony_ci          RegExpTree* suffix = zone->New<RegExpAtom>(
7341cb0ef41Sopenharmony_ci              old_atom->data().SubVector(prefix_length, old_atom->length()));
7351cb0ef41Sopenharmony_ci          suffixes->Add(suffix, zone);
7361cb0ef41Sopenharmony_ci        }
7371cb0ef41Sopenharmony_ci      }
7381cb0ef41Sopenharmony_ci      pair->Add(zone->New<RegExpDisjunction>(suffixes), zone);
7391cb0ef41Sopenharmony_ci      alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
7401cb0ef41Sopenharmony_ci    } else {
7411cb0ef41Sopenharmony_ci      // Just copy any non-worthwhile alternatives.
7421cb0ef41Sopenharmony_ci      for (int j = first_with_prefix; j < i; j++) {
7431cb0ef41Sopenharmony_ci        alternatives->at(write_posn++) = alternatives->at(j);
7441cb0ef41Sopenharmony_ci      }
7451cb0ef41Sopenharmony_ci    }
7461cb0ef41Sopenharmony_ci  }
7471cb0ef41Sopenharmony_ci  alternatives->Rewind(write_posn);  // Trim end of array.
7481cb0ef41Sopenharmony_ci}
7491cb0ef41Sopenharmony_ci
7501cb0ef41Sopenharmony_ci// Optimizes b|c|z to [bcz].
7511cb0ef41Sopenharmony_civoid RegExpDisjunction::FixSingleCharacterDisjunctions(
7521cb0ef41Sopenharmony_ci    RegExpCompiler* compiler) {
7531cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
7541cb0ef41Sopenharmony_ci  ZoneList<RegExpTree*>* alternatives = this->alternatives();
7551cb0ef41Sopenharmony_ci  int length = alternatives->length();
7561cb0ef41Sopenharmony_ci
7571cb0ef41Sopenharmony_ci  int write_posn = 0;
7581cb0ef41Sopenharmony_ci  int i = 0;
7591cb0ef41Sopenharmony_ci  while (i < length) {
7601cb0ef41Sopenharmony_ci    RegExpTree* alternative = alternatives->at(i);
7611cb0ef41Sopenharmony_ci    if (!alternative->IsAtom()) {
7621cb0ef41Sopenharmony_ci      alternatives->at(write_posn++) = alternatives->at(i);
7631cb0ef41Sopenharmony_ci      i++;
7641cb0ef41Sopenharmony_ci      continue;
7651cb0ef41Sopenharmony_ci    }
7661cb0ef41Sopenharmony_ci    RegExpAtom* const atom = alternative->AsAtom();
7671cb0ef41Sopenharmony_ci    if (atom->length() != 1) {
7681cb0ef41Sopenharmony_ci      alternatives->at(write_posn++) = alternatives->at(i);
7691cb0ef41Sopenharmony_ci      i++;
7701cb0ef41Sopenharmony_ci      continue;
7711cb0ef41Sopenharmony_ci    }
7721cb0ef41Sopenharmony_ci    const RegExpFlags flags = compiler->flags();
7731cb0ef41Sopenharmony_ci    DCHECK_IMPLIES(IsUnicode(flags),
7741cb0ef41Sopenharmony_ci                   !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
7751cb0ef41Sopenharmony_ci    bool contains_trail_surrogate =
7761cb0ef41Sopenharmony_ci        unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
7771cb0ef41Sopenharmony_ci    int first_in_run = i;
7781cb0ef41Sopenharmony_ci    i++;
7791cb0ef41Sopenharmony_ci    // Find a run of single-character atom alternatives that have identical
7801cb0ef41Sopenharmony_ci    // flags (case independence and unicode-ness).
7811cb0ef41Sopenharmony_ci    while (i < length) {
7821cb0ef41Sopenharmony_ci      alternative = alternatives->at(i);
7831cb0ef41Sopenharmony_ci      if (!alternative->IsAtom()) break;
7841cb0ef41Sopenharmony_ci      RegExpAtom* const alt_atom = alternative->AsAtom();
7851cb0ef41Sopenharmony_ci      if (alt_atom->length() != 1) break;
7861cb0ef41Sopenharmony_ci      DCHECK_IMPLIES(IsUnicode(flags),
7871cb0ef41Sopenharmony_ci                     !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0)));
7881cb0ef41Sopenharmony_ci      contains_trail_surrogate |=
7891cb0ef41Sopenharmony_ci          unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0));
7901cb0ef41Sopenharmony_ci      i++;
7911cb0ef41Sopenharmony_ci    }
7921cb0ef41Sopenharmony_ci    if (i > first_in_run + 1) {
7931cb0ef41Sopenharmony_ci      // Found non-trivial run of single-character alternatives.
7941cb0ef41Sopenharmony_ci      int run_length = i - first_in_run;
7951cb0ef41Sopenharmony_ci      ZoneList<CharacterRange>* ranges =
7961cb0ef41Sopenharmony_ci          zone->New<ZoneList<CharacterRange>>(2, zone);
7971cb0ef41Sopenharmony_ci      for (int j = 0; j < run_length; j++) {
7981cb0ef41Sopenharmony_ci        RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
7991cb0ef41Sopenharmony_ci        DCHECK_EQ(old_atom->length(), 1);
8001cb0ef41Sopenharmony_ci        ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
8011cb0ef41Sopenharmony_ci      }
8021cb0ef41Sopenharmony_ci      RegExpCharacterClass::CharacterClassFlags character_class_flags;
8031cb0ef41Sopenharmony_ci      if (IsUnicode(flags) && contains_trail_surrogate) {
8041cb0ef41Sopenharmony_ci        character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
8051cb0ef41Sopenharmony_ci      }
8061cb0ef41Sopenharmony_ci      alternatives->at(write_posn++) =
8071cb0ef41Sopenharmony_ci          zone->New<RegExpCharacterClass>(zone, ranges, character_class_flags);
8081cb0ef41Sopenharmony_ci    } else {
8091cb0ef41Sopenharmony_ci      // Just copy any trivial alternatives.
8101cb0ef41Sopenharmony_ci      for (int j = first_in_run; j < i; j++) {
8111cb0ef41Sopenharmony_ci        alternatives->at(write_posn++) = alternatives->at(j);
8121cb0ef41Sopenharmony_ci      }
8131cb0ef41Sopenharmony_ci    }
8141cb0ef41Sopenharmony_ci  }
8151cb0ef41Sopenharmony_ci  alternatives->Rewind(write_posn);  // Trim end of array.
8161cb0ef41Sopenharmony_ci}
8171cb0ef41Sopenharmony_ci
8181cb0ef41Sopenharmony_ciRegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
8191cb0ef41Sopenharmony_ci                                      RegExpNode* on_success) {
8201cb0ef41Sopenharmony_ci  compiler->ToNodeMaybeCheckForStackOverflow();
8211cb0ef41Sopenharmony_ci
8221cb0ef41Sopenharmony_ci  ZoneList<RegExpTree*>* alternatives = this->alternatives();
8231cb0ef41Sopenharmony_ci
8241cb0ef41Sopenharmony_ci  if (alternatives->length() > 2) {
8251cb0ef41Sopenharmony_ci    bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
8261cb0ef41Sopenharmony_ci    if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
8271cb0ef41Sopenharmony_ci    FixSingleCharacterDisjunctions(compiler);
8281cb0ef41Sopenharmony_ci    if (alternatives->length() == 1) {
8291cb0ef41Sopenharmony_ci      return alternatives->at(0)->ToNode(compiler, on_success);
8301cb0ef41Sopenharmony_ci    }
8311cb0ef41Sopenharmony_ci  }
8321cb0ef41Sopenharmony_ci
8331cb0ef41Sopenharmony_ci  int length = alternatives->length();
8341cb0ef41Sopenharmony_ci
8351cb0ef41Sopenharmony_ci  ChoiceNode* result =
8361cb0ef41Sopenharmony_ci      compiler->zone()->New<ChoiceNode>(length, compiler->zone());
8371cb0ef41Sopenharmony_ci  for (int i = 0; i < length; i++) {
8381cb0ef41Sopenharmony_ci    GuardedAlternative alternative(
8391cb0ef41Sopenharmony_ci        alternatives->at(i)->ToNode(compiler, on_success));
8401cb0ef41Sopenharmony_ci    result->AddAlternative(alternative);
8411cb0ef41Sopenharmony_ci  }
8421cb0ef41Sopenharmony_ci  return result;
8431cb0ef41Sopenharmony_ci}
8441cb0ef41Sopenharmony_ci
8451cb0ef41Sopenharmony_ciRegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
8461cb0ef41Sopenharmony_ci                                     RegExpNode* on_success) {
8471cb0ef41Sopenharmony_ci  return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
8481cb0ef41Sopenharmony_ci}
8491cb0ef41Sopenharmony_ci
8501cb0ef41Sopenharmony_cinamespace {
8511cb0ef41Sopenharmony_ci// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
8521cb0ef41Sopenharmony_ci//         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
8531cb0ef41Sopenharmony_ciRegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
8541cb0ef41Sopenharmony_ci                                          RegExpNode* on_success,
8551cb0ef41Sopenharmony_ci                                          RegExpAssertion::Type type,
8561cb0ef41Sopenharmony_ci                                          RegExpFlags flags) {
8571cb0ef41Sopenharmony_ci  CHECK(NeedsUnicodeCaseEquivalents(flags));
8581cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
8591cb0ef41Sopenharmony_ci  ZoneList<CharacterRange>* word_range =
8601cb0ef41Sopenharmony_ci      zone->New<ZoneList<CharacterRange>>(2, zone);
8611cb0ef41Sopenharmony_ci  CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true,
8621cb0ef41Sopenharmony_ci                                 zone);
8631cb0ef41Sopenharmony_ci  int stack_register = compiler->UnicodeLookaroundStackRegister();
8641cb0ef41Sopenharmony_ci  int position_register = compiler->UnicodeLookaroundPositionRegister();
8651cb0ef41Sopenharmony_ci  ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
8661cb0ef41Sopenharmony_ci  // Add two choices. The (non-)boundary could start with a word or
8671cb0ef41Sopenharmony_ci  // a non-word-character.
8681cb0ef41Sopenharmony_ci  for (int i = 0; i < 2; i++) {
8691cb0ef41Sopenharmony_ci    bool lookbehind_for_word = i == 0;
8701cb0ef41Sopenharmony_ci    bool lookahead_for_word =
8711cb0ef41Sopenharmony_ci        (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word;
8721cb0ef41Sopenharmony_ci    // Look to the left.
8731cb0ef41Sopenharmony_ci    RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
8741cb0ef41Sopenharmony_ci                                         stack_register, position_register);
8751cb0ef41Sopenharmony_ci    RegExpNode* backward = TextNode::CreateForCharacterRanges(
8761cb0ef41Sopenharmony_ci        zone, word_range, true, lookbehind.on_match_success());
8771cb0ef41Sopenharmony_ci    // Look to the right.
8781cb0ef41Sopenharmony_ci    RegExpLookaround::Builder lookahead(lookahead_for_word,
8791cb0ef41Sopenharmony_ci                                        lookbehind.ForMatch(backward),
8801cb0ef41Sopenharmony_ci                                        stack_register, position_register);
8811cb0ef41Sopenharmony_ci    RegExpNode* forward = TextNode::CreateForCharacterRanges(
8821cb0ef41Sopenharmony_ci        zone, word_range, false, lookahead.on_match_success());
8831cb0ef41Sopenharmony_ci    result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
8841cb0ef41Sopenharmony_ci  }
8851cb0ef41Sopenharmony_ci  return result;
8861cb0ef41Sopenharmony_ci}
8871cb0ef41Sopenharmony_ci}  // anonymous namespace
8881cb0ef41Sopenharmony_ci
8891cb0ef41Sopenharmony_ciRegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
8901cb0ef41Sopenharmony_ci                                    RegExpNode* on_success) {
8911cb0ef41Sopenharmony_ci  NodeInfo info;
8921cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
8931cb0ef41Sopenharmony_ci
8941cb0ef41Sopenharmony_ci  switch (assertion_type()) {
8951cb0ef41Sopenharmony_ci    case Type::START_OF_LINE:
8961cb0ef41Sopenharmony_ci      return AssertionNode::AfterNewline(on_success);
8971cb0ef41Sopenharmony_ci    case Type::START_OF_INPUT:
8981cb0ef41Sopenharmony_ci      return AssertionNode::AtStart(on_success);
8991cb0ef41Sopenharmony_ci    case Type::BOUNDARY:
9001cb0ef41Sopenharmony_ci      return NeedsUnicodeCaseEquivalents(compiler->flags())
9011cb0ef41Sopenharmony_ci                 ? BoundaryAssertionAsLookaround(
9021cb0ef41Sopenharmony_ci                       compiler, on_success, Type::BOUNDARY, compiler->flags())
9031cb0ef41Sopenharmony_ci                 : AssertionNode::AtBoundary(on_success);
9041cb0ef41Sopenharmony_ci    case Type::NON_BOUNDARY:
9051cb0ef41Sopenharmony_ci      return NeedsUnicodeCaseEquivalents(compiler->flags())
9061cb0ef41Sopenharmony_ci                 ? BoundaryAssertionAsLookaround(compiler, on_success,
9071cb0ef41Sopenharmony_ci                                                 Type::NON_BOUNDARY,
9081cb0ef41Sopenharmony_ci                                                 compiler->flags())
9091cb0ef41Sopenharmony_ci                 : AssertionNode::AtNonBoundary(on_success);
9101cb0ef41Sopenharmony_ci    case Type::END_OF_INPUT:
9111cb0ef41Sopenharmony_ci      return AssertionNode::AtEnd(on_success);
9121cb0ef41Sopenharmony_ci    case Type::END_OF_LINE: {
9131cb0ef41Sopenharmony_ci      // Compile $ in multiline regexps as an alternation with a positive
9141cb0ef41Sopenharmony_ci      // lookahead in one side and an end-of-input on the other side.
9151cb0ef41Sopenharmony_ci      // We need two registers for the lookahead.
9161cb0ef41Sopenharmony_ci      int stack_pointer_register = compiler->AllocateRegister();
9171cb0ef41Sopenharmony_ci      int position_register = compiler->AllocateRegister();
9181cb0ef41Sopenharmony_ci      // The ChoiceNode to distinguish between a newline and end-of-input.
9191cb0ef41Sopenharmony_ci      ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
9201cb0ef41Sopenharmony_ci      // Create a newline atom.
9211cb0ef41Sopenharmony_ci      ZoneList<CharacterRange>* newline_ranges =
9221cb0ef41Sopenharmony_ci          zone->New<ZoneList<CharacterRange>>(3, zone);
9231cb0ef41Sopenharmony_ci      CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator,
9241cb0ef41Sopenharmony_ci                                     newline_ranges, false, zone);
9251cb0ef41Sopenharmony_ci      RegExpCharacterClass* newline_atom = zone->New<RegExpCharacterClass>(
9261cb0ef41Sopenharmony_ci          StandardCharacterSet::kLineTerminator);
9271cb0ef41Sopenharmony_ci      TextNode* newline_matcher =
9281cb0ef41Sopenharmony_ci          zone->New<TextNode>(newline_atom, false,
9291cb0ef41Sopenharmony_ci                              ActionNode::PositiveSubmatchSuccess(
9301cb0ef41Sopenharmony_ci                                  stack_pointer_register, position_register,
9311cb0ef41Sopenharmony_ci                                  0,   // No captures inside.
9321cb0ef41Sopenharmony_ci                                  -1,  // Ignored if no captures.
9331cb0ef41Sopenharmony_ci                                  on_success));
9341cb0ef41Sopenharmony_ci      // Create an end-of-input matcher.
9351cb0ef41Sopenharmony_ci      RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
9361cb0ef41Sopenharmony_ci          stack_pointer_register, position_register, newline_matcher);
9371cb0ef41Sopenharmony_ci      // Add the two alternatives to the ChoiceNode.
9381cb0ef41Sopenharmony_ci      GuardedAlternative eol_alternative(end_of_line);
9391cb0ef41Sopenharmony_ci      result->AddAlternative(eol_alternative);
9401cb0ef41Sopenharmony_ci      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
9411cb0ef41Sopenharmony_ci      result->AddAlternative(end_alternative);
9421cb0ef41Sopenharmony_ci      return result;
9431cb0ef41Sopenharmony_ci    }
9441cb0ef41Sopenharmony_ci    default:
9451cb0ef41Sopenharmony_ci      UNREACHABLE();
9461cb0ef41Sopenharmony_ci  }
9471cb0ef41Sopenharmony_ci}
9481cb0ef41Sopenharmony_ci
9491cb0ef41Sopenharmony_ciRegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
9501cb0ef41Sopenharmony_ci                                        RegExpNode* on_success) {
9511cb0ef41Sopenharmony_ci  return compiler->zone()->New<BackReferenceNode>(
9521cb0ef41Sopenharmony_ci      RegExpCapture::StartRegister(index()),
9531cb0ef41Sopenharmony_ci      RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(),
9541cb0ef41Sopenharmony_ci      on_success);
9551cb0ef41Sopenharmony_ci}
9561cb0ef41Sopenharmony_ci
9571cb0ef41Sopenharmony_ciRegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
9581cb0ef41Sopenharmony_ci                                RegExpNode* on_success) {
9591cb0ef41Sopenharmony_ci  return on_success;
9601cb0ef41Sopenharmony_ci}
9611cb0ef41Sopenharmony_ci
9621cb0ef41Sopenharmony_ciRegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
9631cb0ef41Sopenharmony_ci                                RegExpNode* on_success) {
9641cb0ef41Sopenharmony_ci  return body_->ToNode(compiler, on_success);
9651cb0ef41Sopenharmony_ci}
9661cb0ef41Sopenharmony_ci
9671cb0ef41Sopenharmony_ciRegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
9681cb0ef41Sopenharmony_ci                                   int stack_pointer_register,
9691cb0ef41Sopenharmony_ci                                   int position_register,
9701cb0ef41Sopenharmony_ci                                   int capture_register_count,
9711cb0ef41Sopenharmony_ci                                   int capture_register_start)
9721cb0ef41Sopenharmony_ci    : is_positive_(is_positive),
9731cb0ef41Sopenharmony_ci      on_success_(on_success),
9741cb0ef41Sopenharmony_ci      stack_pointer_register_(stack_pointer_register),
9751cb0ef41Sopenharmony_ci      position_register_(position_register) {
9761cb0ef41Sopenharmony_ci  if (is_positive_) {
9771cb0ef41Sopenharmony_ci    on_match_success_ = ActionNode::PositiveSubmatchSuccess(
9781cb0ef41Sopenharmony_ci        stack_pointer_register, position_register, capture_register_count,
9791cb0ef41Sopenharmony_ci        capture_register_start, on_success_);
9801cb0ef41Sopenharmony_ci  } else {
9811cb0ef41Sopenharmony_ci    Zone* zone = on_success_->zone();
9821cb0ef41Sopenharmony_ci    on_match_success_ = zone->New<NegativeSubmatchSuccess>(
9831cb0ef41Sopenharmony_ci        stack_pointer_register, position_register, capture_register_count,
9841cb0ef41Sopenharmony_ci        capture_register_start, zone);
9851cb0ef41Sopenharmony_ci  }
9861cb0ef41Sopenharmony_ci}
9871cb0ef41Sopenharmony_ci
9881cb0ef41Sopenharmony_ciRegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
9891cb0ef41Sopenharmony_ci  if (is_positive_) {
9901cb0ef41Sopenharmony_ci    return ActionNode::BeginPositiveSubmatch(stack_pointer_register_,
9911cb0ef41Sopenharmony_ci                                             position_register_, match);
9921cb0ef41Sopenharmony_ci  } else {
9931cb0ef41Sopenharmony_ci    Zone* zone = on_success_->zone();
9941cb0ef41Sopenharmony_ci    // We use a ChoiceNode to represent the negative lookaround. The first
9951cb0ef41Sopenharmony_ci    // alternative is the negative match. On success, the end node backtracks.
9961cb0ef41Sopenharmony_ci    // On failure, the second alternative is tried and leads to success.
9971cb0ef41Sopenharmony_ci    // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
9981cb0ef41Sopenharmony_ci    // first exit when calculating quick checks.
9991cb0ef41Sopenharmony_ci    ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
10001cb0ef41Sopenharmony_ci        GuardedAlternative(match), GuardedAlternative(on_success_), zone);
10011cb0ef41Sopenharmony_ci    return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
10021cb0ef41Sopenharmony_ci                                             position_register_, choice_node);
10031cb0ef41Sopenharmony_ci  }
10041cb0ef41Sopenharmony_ci}
10051cb0ef41Sopenharmony_ci
10061cb0ef41Sopenharmony_ciRegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
10071cb0ef41Sopenharmony_ci                                     RegExpNode* on_success) {
10081cb0ef41Sopenharmony_ci  int stack_pointer_register = compiler->AllocateRegister();
10091cb0ef41Sopenharmony_ci  int position_register = compiler->AllocateRegister();
10101cb0ef41Sopenharmony_ci
10111cb0ef41Sopenharmony_ci  const int registers_per_capture = 2;
10121cb0ef41Sopenharmony_ci  const int register_of_first_capture = 2;
10131cb0ef41Sopenharmony_ci  int register_count = capture_count_ * registers_per_capture;
10141cb0ef41Sopenharmony_ci  int register_start =
10151cb0ef41Sopenharmony_ci      register_of_first_capture + capture_from_ * registers_per_capture;
10161cb0ef41Sopenharmony_ci
10171cb0ef41Sopenharmony_ci  RegExpNode* result;
10181cb0ef41Sopenharmony_ci  bool was_reading_backward = compiler->read_backward();
10191cb0ef41Sopenharmony_ci  compiler->set_read_backward(type() == LOOKBEHIND);
10201cb0ef41Sopenharmony_ci  Builder builder(is_positive(), on_success, stack_pointer_register,
10211cb0ef41Sopenharmony_ci                  position_register, register_count, register_start);
10221cb0ef41Sopenharmony_ci  RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
10231cb0ef41Sopenharmony_ci  result = builder.ForMatch(match);
10241cb0ef41Sopenharmony_ci  compiler->set_read_backward(was_reading_backward);
10251cb0ef41Sopenharmony_ci  return result;
10261cb0ef41Sopenharmony_ci}
10271cb0ef41Sopenharmony_ci
10281cb0ef41Sopenharmony_ciRegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
10291cb0ef41Sopenharmony_ci                                  RegExpNode* on_success) {
10301cb0ef41Sopenharmony_ci  return ToNode(body(), index(), compiler, on_success);
10311cb0ef41Sopenharmony_ci}
10321cb0ef41Sopenharmony_ci
10331cb0ef41Sopenharmony_ciRegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
10341cb0ef41Sopenharmony_ci                                  RegExpCompiler* compiler,
10351cb0ef41Sopenharmony_ci                                  RegExpNode* on_success) {
10361cb0ef41Sopenharmony_ci  DCHECK_NOT_NULL(body);
10371cb0ef41Sopenharmony_ci  int start_reg = RegExpCapture::StartRegister(index);
10381cb0ef41Sopenharmony_ci  int end_reg = RegExpCapture::EndRegister(index);
10391cb0ef41Sopenharmony_ci  if (compiler->read_backward()) std::swap(start_reg, end_reg);
10401cb0ef41Sopenharmony_ci  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
10411cb0ef41Sopenharmony_ci  RegExpNode* body_node = body->ToNode(compiler, store_end);
10421cb0ef41Sopenharmony_ci  return ActionNode::StorePosition(start_reg, true, body_node);
10431cb0ef41Sopenharmony_ci}
10441cb0ef41Sopenharmony_ci
10451cb0ef41Sopenharmony_cinamespace {
10461cb0ef41Sopenharmony_ci
10471cb0ef41Sopenharmony_ciclass AssertionSequenceRewriter final {
10481cb0ef41Sopenharmony_ci public:
10491cb0ef41Sopenharmony_ci  // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
10501cb0ef41Sopenharmony_ci  // instead of sprinkling rewrites into the AST->Node conversion process.
10511cb0ef41Sopenharmony_ci  static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
10521cb0ef41Sopenharmony_ci    AssertionSequenceRewriter rewriter(terms, zone);
10531cb0ef41Sopenharmony_ci
10541cb0ef41Sopenharmony_ci    static constexpr int kNoIndex = -1;
10551cb0ef41Sopenharmony_ci    int from = kNoIndex;
10561cb0ef41Sopenharmony_ci
10571cb0ef41Sopenharmony_ci    for (int i = 0; i < terms->length(); i++) {
10581cb0ef41Sopenharmony_ci      RegExpTree* t = terms->at(i);
10591cb0ef41Sopenharmony_ci      if (from == kNoIndex && t->IsAssertion()) {
10601cb0ef41Sopenharmony_ci        from = i;  // Start a sequence.
10611cb0ef41Sopenharmony_ci      } else if (from != kNoIndex && !t->IsAssertion()) {
10621cb0ef41Sopenharmony_ci        // Terminate and process the sequence.
10631cb0ef41Sopenharmony_ci        if (i - from > 1) rewriter.Rewrite(from, i);
10641cb0ef41Sopenharmony_ci        from = kNoIndex;
10651cb0ef41Sopenharmony_ci      }
10661cb0ef41Sopenharmony_ci    }
10671cb0ef41Sopenharmony_ci
10681cb0ef41Sopenharmony_ci    if (from != kNoIndex && terms->length() - from > 1) {
10691cb0ef41Sopenharmony_ci      rewriter.Rewrite(from, terms->length());
10701cb0ef41Sopenharmony_ci    }
10711cb0ef41Sopenharmony_ci  }
10721cb0ef41Sopenharmony_ci
10731cb0ef41Sopenharmony_ci  // All assertions are zero width. A consecutive sequence of assertions is
10741cb0ef41Sopenharmony_ci  // order-independent. There's two ways we can optimize here:
10751cb0ef41Sopenharmony_ci  // 1. fold all identical assertions.
10761cb0ef41Sopenharmony_ci  // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
10771cb0ef41Sopenharmony_ci  //    sequence fails.
10781cb0ef41Sopenharmony_ci  void Rewrite(int from, int to) {
10791cb0ef41Sopenharmony_ci    DCHECK_GT(to, from + 1);
10801cb0ef41Sopenharmony_ci
10811cb0ef41Sopenharmony_ci    // Bitfield of all seen assertions.
10821cb0ef41Sopenharmony_ci    uint32_t seen_assertions = 0;
10831cb0ef41Sopenharmony_ci    STATIC_ASSERT(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
10841cb0ef41Sopenharmony_ci                  kUInt32Size * kBitsPerByte);
10851cb0ef41Sopenharmony_ci
10861cb0ef41Sopenharmony_ci    for (int i = from; i < to; i++) {
10871cb0ef41Sopenharmony_ci      RegExpAssertion* t = terms_->at(i)->AsAssertion();
10881cb0ef41Sopenharmony_ci      const uint32_t bit = 1 << static_cast<int>(t->assertion_type());
10891cb0ef41Sopenharmony_ci
10901cb0ef41Sopenharmony_ci      if (seen_assertions & bit) {
10911cb0ef41Sopenharmony_ci        // Fold duplicates.
10921cb0ef41Sopenharmony_ci        terms_->Set(i, zone_->New<RegExpEmpty>());
10931cb0ef41Sopenharmony_ci      }
10941cb0ef41Sopenharmony_ci
10951cb0ef41Sopenharmony_ci      seen_assertions |= bit;
10961cb0ef41Sopenharmony_ci    }
10971cb0ef41Sopenharmony_ci
10981cb0ef41Sopenharmony_ci    // Collapse failures.
10991cb0ef41Sopenharmony_ci    const uint32_t always_fails_mask =
11001cb0ef41Sopenharmony_ci        1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) |
11011cb0ef41Sopenharmony_ci        1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);
11021cb0ef41Sopenharmony_ci    if ((seen_assertions & always_fails_mask) == always_fails_mask) {
11031cb0ef41Sopenharmony_ci      ReplaceSequenceWithFailure(from, to);
11041cb0ef41Sopenharmony_ci    }
11051cb0ef41Sopenharmony_ci  }
11061cb0ef41Sopenharmony_ci
11071cb0ef41Sopenharmony_ci  void ReplaceSequenceWithFailure(int from, int to) {
11081cb0ef41Sopenharmony_ci    // Replace the entire sequence with a single node that always fails.
11091cb0ef41Sopenharmony_ci    // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
11101cb0ef41Sopenharmony_ci    // negated '*' (everything) range serves the purpose.
11111cb0ef41Sopenharmony_ci    ZoneList<CharacterRange>* ranges =
11121cb0ef41Sopenharmony_ci        zone_->New<ZoneList<CharacterRange>>(0, zone_);
11131cb0ef41Sopenharmony_ci    RegExpCharacterClass* cc = zone_->New<RegExpCharacterClass>(zone_, ranges);
11141cb0ef41Sopenharmony_ci    terms_->Set(from, cc);
11151cb0ef41Sopenharmony_ci
11161cb0ef41Sopenharmony_ci    // Zero out the rest.
11171cb0ef41Sopenharmony_ci    RegExpEmpty* empty = zone_->New<RegExpEmpty>();
11181cb0ef41Sopenharmony_ci    for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
11191cb0ef41Sopenharmony_ci  }
11201cb0ef41Sopenharmony_ci
11211cb0ef41Sopenharmony_ci private:
11221cb0ef41Sopenharmony_ci  AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
11231cb0ef41Sopenharmony_ci      : zone_(zone), terms_(terms) {}
11241cb0ef41Sopenharmony_ci
11251cb0ef41Sopenharmony_ci  Zone* zone_;
11261cb0ef41Sopenharmony_ci  ZoneList<RegExpTree*>* terms_;
11271cb0ef41Sopenharmony_ci};
11281cb0ef41Sopenharmony_ci
11291cb0ef41Sopenharmony_ci}  // namespace
11301cb0ef41Sopenharmony_ci
11311cb0ef41Sopenharmony_ciRegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
11321cb0ef41Sopenharmony_ci                                      RegExpNode* on_success) {
11331cb0ef41Sopenharmony_ci  compiler->ToNodeMaybeCheckForStackOverflow();
11341cb0ef41Sopenharmony_ci
11351cb0ef41Sopenharmony_ci  ZoneList<RegExpTree*>* children = nodes();
11361cb0ef41Sopenharmony_ci
11371cb0ef41Sopenharmony_ci  AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
11381cb0ef41Sopenharmony_ci
11391cb0ef41Sopenharmony_ci  RegExpNode* current = on_success;
11401cb0ef41Sopenharmony_ci  if (compiler->read_backward()) {
11411cb0ef41Sopenharmony_ci    for (int i = 0; i < children->length(); i++) {
11421cb0ef41Sopenharmony_ci      current = children->at(i)->ToNode(compiler, current);
11431cb0ef41Sopenharmony_ci    }
11441cb0ef41Sopenharmony_ci  } else {
11451cb0ef41Sopenharmony_ci    for (int i = children->length() - 1; i >= 0; i--) {
11461cb0ef41Sopenharmony_ci      current = children->at(i)->ToNode(compiler, current);
11471cb0ef41Sopenharmony_ci    }
11481cb0ef41Sopenharmony_ci  }
11491cb0ef41Sopenharmony_ci  return current;
11501cb0ef41Sopenharmony_ci}
11511cb0ef41Sopenharmony_ci
11521cb0ef41Sopenharmony_cinamespace {
11531cb0ef41Sopenharmony_ci
11541cb0ef41Sopenharmony_civoid AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
11551cb0ef41Sopenharmony_ci              Zone* zone) {
11561cb0ef41Sopenharmony_ci  elmc--;
11571cb0ef41Sopenharmony_ci  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
11581cb0ef41Sopenharmony_ci  for (int i = 0; i < elmc; i += 2) {
11591cb0ef41Sopenharmony_ci    DCHECK(elmv[i] < elmv[i + 1]);
11601cb0ef41Sopenharmony_ci    ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
11611cb0ef41Sopenharmony_ci  }
11621cb0ef41Sopenharmony_ci}
11631cb0ef41Sopenharmony_ci
11641cb0ef41Sopenharmony_civoid AddClassNegated(const int* elmv, int elmc,
11651cb0ef41Sopenharmony_ci                     ZoneList<CharacterRange>* ranges, Zone* zone) {
11661cb0ef41Sopenharmony_ci  elmc--;
11671cb0ef41Sopenharmony_ci  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
11681cb0ef41Sopenharmony_ci  DCHECK_NE(0x0000, elmv[0]);
11691cb0ef41Sopenharmony_ci  DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]);
11701cb0ef41Sopenharmony_ci  base::uc16 last = 0x0000;
11711cb0ef41Sopenharmony_ci  for (int i = 0; i < elmc; i += 2) {
11721cb0ef41Sopenharmony_ci    DCHECK(last <= elmv[i] - 1);
11731cb0ef41Sopenharmony_ci    DCHECK(elmv[i] < elmv[i + 1]);
11741cb0ef41Sopenharmony_ci    ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
11751cb0ef41Sopenharmony_ci    last = elmv[i + 1];
11761cb0ef41Sopenharmony_ci  }
11771cb0ef41Sopenharmony_ci  ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);
11781cb0ef41Sopenharmony_ci}
11791cb0ef41Sopenharmony_ci
11801cb0ef41Sopenharmony_ci}  // namespace
11811cb0ef41Sopenharmony_ci
11821cb0ef41Sopenharmony_civoid CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
11831cb0ef41Sopenharmony_ci                                    ZoneList<CharacterRange>* ranges,
11841cb0ef41Sopenharmony_ci                                    bool add_unicode_case_equivalents,
11851cb0ef41Sopenharmony_ci                                    Zone* zone) {
11861cb0ef41Sopenharmony_ci  if (add_unicode_case_equivalents &&
11871cb0ef41Sopenharmony_ci      (standard_character_set == StandardCharacterSet::kWord ||
11881cb0ef41Sopenharmony_ci       standard_character_set == StandardCharacterSet::kNotWord)) {
11891cb0ef41Sopenharmony_ci    // See #sec-runtime-semantics-wordcharacters-abstract-operation
11901cb0ef41Sopenharmony_ci    // In case of unicode and ignore_case, we need to create the closure over
11911cb0ef41Sopenharmony_ci    // case equivalent characters before negating.
11921cb0ef41Sopenharmony_ci    ZoneList<CharacterRange>* new_ranges =
11931cb0ef41Sopenharmony_ci        zone->New<ZoneList<CharacterRange>>(2, zone);
11941cb0ef41Sopenharmony_ci    AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
11951cb0ef41Sopenharmony_ci    AddUnicodeCaseEquivalents(new_ranges, zone);
11961cb0ef41Sopenharmony_ci    if (standard_character_set == StandardCharacterSet::kNotWord) {
11971cb0ef41Sopenharmony_ci      ZoneList<CharacterRange>* negated =
11981cb0ef41Sopenharmony_ci          zone->New<ZoneList<CharacterRange>>(2, zone);
11991cb0ef41Sopenharmony_ci      CharacterRange::Negate(new_ranges, negated, zone);
12001cb0ef41Sopenharmony_ci      new_ranges = negated;
12011cb0ef41Sopenharmony_ci    }
12021cb0ef41Sopenharmony_ci    ranges->AddAll(*new_ranges, zone);
12031cb0ef41Sopenharmony_ci    return;
12041cb0ef41Sopenharmony_ci  }
12051cb0ef41Sopenharmony_ci
12061cb0ef41Sopenharmony_ci  switch (standard_character_set) {
12071cb0ef41Sopenharmony_ci    case StandardCharacterSet::kWhitespace:
12081cb0ef41Sopenharmony_ci      AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
12091cb0ef41Sopenharmony_ci      break;
12101cb0ef41Sopenharmony_ci    case StandardCharacterSet::kNotWhitespace:
12111cb0ef41Sopenharmony_ci      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
12121cb0ef41Sopenharmony_ci      break;
12131cb0ef41Sopenharmony_ci    case StandardCharacterSet::kWord:
12141cb0ef41Sopenharmony_ci      AddClass(kWordRanges, kWordRangeCount, ranges, zone);
12151cb0ef41Sopenharmony_ci      break;
12161cb0ef41Sopenharmony_ci    case StandardCharacterSet::kNotWord:
12171cb0ef41Sopenharmony_ci      AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
12181cb0ef41Sopenharmony_ci      break;
12191cb0ef41Sopenharmony_ci    case StandardCharacterSet::kDigit:
12201cb0ef41Sopenharmony_ci      AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
12211cb0ef41Sopenharmony_ci      break;
12221cb0ef41Sopenharmony_ci    case StandardCharacterSet::kNotDigit:
12231cb0ef41Sopenharmony_ci      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
12241cb0ef41Sopenharmony_ci      break;
12251cb0ef41Sopenharmony_ci    // This is the set of characters matched by the $ and ^ symbols
12261cb0ef41Sopenharmony_ci    // in multiline mode.
12271cb0ef41Sopenharmony_ci    case StandardCharacterSet::kLineTerminator:
12281cb0ef41Sopenharmony_ci      AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
12291cb0ef41Sopenharmony_ci      break;
12301cb0ef41Sopenharmony_ci    case StandardCharacterSet::kNotLineTerminator:
12311cb0ef41Sopenharmony_ci      AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
12321cb0ef41Sopenharmony_ci                      zone);
12331cb0ef41Sopenharmony_ci      break;
12341cb0ef41Sopenharmony_ci    // This is not a character range as defined by the spec but a
12351cb0ef41Sopenharmony_ci    // convenient shorthand for a character class that matches any
12361cb0ef41Sopenharmony_ci    // character.
12371cb0ef41Sopenharmony_ci    case StandardCharacterSet::kEverything:
12381cb0ef41Sopenharmony_ci      ranges->Add(CharacterRange::Everything(), zone);
12391cb0ef41Sopenharmony_ci      break;
12401cb0ef41Sopenharmony_ci  }
12411cb0ef41Sopenharmony_ci}
12421cb0ef41Sopenharmony_ci
12431cb0ef41Sopenharmony_ci// static
12441cb0ef41Sopenharmony_civoid CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
12451cb0ef41Sopenharmony_ci                                        ZoneList<CharacterRange>* ranges,
12461cb0ef41Sopenharmony_ci                                        bool is_one_byte) {
12471cb0ef41Sopenharmony_ci  CharacterRange::Canonicalize(ranges);
12481cb0ef41Sopenharmony_ci  int range_count = ranges->length();
12491cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT
12501cb0ef41Sopenharmony_ci  icu::UnicodeSet others;
12511cb0ef41Sopenharmony_ci  for (int i = 0; i < range_count; i++) {
12521cb0ef41Sopenharmony_ci    CharacterRange range = ranges->at(i);
12531cb0ef41Sopenharmony_ci    base::uc32 from = range.from();
12541cb0ef41Sopenharmony_ci    if (from > kMaxUtf16CodeUnit) continue;
12551cb0ef41Sopenharmony_ci    base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});
12561cb0ef41Sopenharmony_ci    // Nothing to be done for surrogates.
12571cb0ef41Sopenharmony_ci    if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
12581cb0ef41Sopenharmony_ci    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
12591cb0ef41Sopenharmony_ci      if (from > kMaxOneByteCharCode) continue;
12601cb0ef41Sopenharmony_ci      if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode;
12611cb0ef41Sopenharmony_ci    }
12621cb0ef41Sopenharmony_ci    others.add(from, to);
12631cb0ef41Sopenharmony_ci  }
12641cb0ef41Sopenharmony_ci
12651cb0ef41Sopenharmony_ci  // Compute the set of additional characters that should be added,
12661cb0ef41Sopenharmony_ci  // using UnicodeSet::closeOver. ECMA 262 defines slightly different
12671cb0ef41Sopenharmony_ci  // case-folding rules than Unicode, so some characters that are
12681cb0ef41Sopenharmony_ci  // added by closeOver do not match anything other than themselves in
12691cb0ef41Sopenharmony_ci  // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
12701cb0ef41Sopenharmony_ci  // same case-insensitive character as 's' or 'S' according to
12711cb0ef41Sopenharmony_ci  // Unicode, but does not match any other character in JS. To handle
12721cb0ef41Sopenharmony_ci  // this case, we add such characters to the IgnoreSet and filter
12731cb0ef41Sopenharmony_ci  // them out. We filter twice: once before calling closeOver (to
12741cb0ef41Sopenharmony_ci  // prevent 'ſ' from adding 's'), and once after calling closeOver
12751cb0ef41Sopenharmony_ci  // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
12761cb0ef41Sopenharmony_ci  // more information.
12771cb0ef41Sopenharmony_ci  icu::UnicodeSet already_added(others);
12781cb0ef41Sopenharmony_ci  others.removeAll(RegExpCaseFolding::IgnoreSet());
12791cb0ef41Sopenharmony_ci  others.closeOver(USET_CASE_INSENSITIVE);
12801cb0ef41Sopenharmony_ci  others.removeAll(RegExpCaseFolding::IgnoreSet());
12811cb0ef41Sopenharmony_ci  others.removeAll(already_added);
12821cb0ef41Sopenharmony_ci
12831cb0ef41Sopenharmony_ci  // Add others to the ranges
12841cb0ef41Sopenharmony_ci  for (int32_t i = 0; i < others.getRangeCount(); i++) {
12851cb0ef41Sopenharmony_ci    UChar32 from = others.getRangeStart(i);
12861cb0ef41Sopenharmony_ci    UChar32 to = others.getRangeEnd(i);
12871cb0ef41Sopenharmony_ci    if (from == to) {
12881cb0ef41Sopenharmony_ci      ranges->Add(CharacterRange::Singleton(from), zone);
12891cb0ef41Sopenharmony_ci    } else {
12901cb0ef41Sopenharmony_ci      ranges->Add(CharacterRange::Range(from, to), zone);
12911cb0ef41Sopenharmony_ci    }
12921cb0ef41Sopenharmony_ci  }
12931cb0ef41Sopenharmony_ci#else
12941cb0ef41Sopenharmony_ci  for (int i = 0; i < range_count; i++) {
12951cb0ef41Sopenharmony_ci    CharacterRange range = ranges->at(i);
12961cb0ef41Sopenharmony_ci    base::uc32 bottom = range.from();
12971cb0ef41Sopenharmony_ci    if (bottom > kMaxUtf16CodeUnit) continue;
12981cb0ef41Sopenharmony_ci    base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});
12991cb0ef41Sopenharmony_ci    // Nothing to be done for surrogates.
13001cb0ef41Sopenharmony_ci    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
13011cb0ef41Sopenharmony_ci    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
13021cb0ef41Sopenharmony_ci      if (bottom > kMaxOneByteCharCode) continue;
13031cb0ef41Sopenharmony_ci      if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode;
13041cb0ef41Sopenharmony_ci    }
13051cb0ef41Sopenharmony_ci    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
13061cb0ef41Sopenharmony_ci    if (top == bottom) {
13071cb0ef41Sopenharmony_ci      // If this is a singleton we just expand the one character.
13081cb0ef41Sopenharmony_ci      int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
13091cb0ef41Sopenharmony_ci      for (int i = 0; i < length; i++) {
13101cb0ef41Sopenharmony_ci        base::uc32 chr = chars[i];
13111cb0ef41Sopenharmony_ci        if (chr != bottom) {
13121cb0ef41Sopenharmony_ci          ranges->Add(CharacterRange::Singleton(chars[i]), zone);
13131cb0ef41Sopenharmony_ci        }
13141cb0ef41Sopenharmony_ci      }
13151cb0ef41Sopenharmony_ci    } else {
13161cb0ef41Sopenharmony_ci      // If this is a range we expand the characters block by block, expanding
13171cb0ef41Sopenharmony_ci      // contiguous subranges (blocks) one at a time.  The approach is as
13181cb0ef41Sopenharmony_ci      // follows.  For a given start character we look up the remainder of the
13191cb0ef41Sopenharmony_ci      // block that contains it (represented by the end point), for instance we
13201cb0ef41Sopenharmony_ci      // find 'z' if the character is 'c'.  A block is characterized by the
13211cb0ef41Sopenharmony_ci      // property that all characters uncanonicalize in the same way, except
13221cb0ef41Sopenharmony_ci      // that each entry in the result is incremented by the distance from the
13231cb0ef41Sopenharmony_ci      // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
13241cb0ef41Sopenharmony_ci      // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
13251cb0ef41Sopenharmony_ci      // we've found the end point we look up its uncanonicalization and
13261cb0ef41Sopenharmony_ci      // produce a range for each element.  For instance for [c-f] we look up
13271cb0ef41Sopenharmony_ci      // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
13281cb0ef41Sopenharmony_ci      // it is not already contained in the input, so [c-f] will be skipped but
13291cb0ef41Sopenharmony_ci      // [C-F] will be added.  If this range is not completely contained in a
13301cb0ef41Sopenharmony_ci      // block we do this for all the blocks covered by the range (handling
13311cb0ef41Sopenharmony_ci      // characters that is not in a block as a "singleton block").
13321cb0ef41Sopenharmony_ci      unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
13331cb0ef41Sopenharmony_ci      base::uc32 pos = bottom;
13341cb0ef41Sopenharmony_ci      while (pos <= top) {
13351cb0ef41Sopenharmony_ci        int length =
13361cb0ef41Sopenharmony_ci            isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
13371cb0ef41Sopenharmony_ci        base::uc32 block_end;
13381cb0ef41Sopenharmony_ci        if (length == 0) {
13391cb0ef41Sopenharmony_ci          block_end = pos;
13401cb0ef41Sopenharmony_ci        } else {
13411cb0ef41Sopenharmony_ci          DCHECK_EQ(1, length);
13421cb0ef41Sopenharmony_ci          block_end = equivalents[0];
13431cb0ef41Sopenharmony_ci        }
13441cb0ef41Sopenharmony_ci        int end = (block_end > top) ? top : block_end;
13451cb0ef41Sopenharmony_ci        length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
13461cb0ef41Sopenharmony_ci                                                         equivalents);
13471cb0ef41Sopenharmony_ci        for (int i = 0; i < length; i++) {
13481cb0ef41Sopenharmony_ci          base::uc32 c = equivalents[i];
13491cb0ef41Sopenharmony_ci          base::uc32 range_from = c - (block_end - pos);
13501cb0ef41Sopenharmony_ci          base::uc32 range_to = c - (block_end - end);
13511cb0ef41Sopenharmony_ci          if (!(bottom <= range_from && range_to <= top)) {
13521cb0ef41Sopenharmony_ci            ranges->Add(CharacterRange::Range(range_from, range_to), zone);
13531cb0ef41Sopenharmony_ci          }
13541cb0ef41Sopenharmony_ci        }
13551cb0ef41Sopenharmony_ci        pos = end + 1;
13561cb0ef41Sopenharmony_ci      }
13571cb0ef41Sopenharmony_ci    }
13581cb0ef41Sopenharmony_ci  }
13591cb0ef41Sopenharmony_ci#endif  // V8_INTL_SUPPORT
13601cb0ef41Sopenharmony_ci}
13611cb0ef41Sopenharmony_ci
13621cb0ef41Sopenharmony_cibool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
13631cb0ef41Sopenharmony_ci  DCHECK_NOT_NULL(ranges);
13641cb0ef41Sopenharmony_ci  int n = ranges->length();
13651cb0ef41Sopenharmony_ci  if (n <= 1) return true;
13661cb0ef41Sopenharmony_ci  base::uc32 max = ranges->at(0).to();
13671cb0ef41Sopenharmony_ci  for (int i = 1; i < n; i++) {
13681cb0ef41Sopenharmony_ci    CharacterRange next_range = ranges->at(i);
13691cb0ef41Sopenharmony_ci    if (next_range.from() <= max + 1) return false;
13701cb0ef41Sopenharmony_ci    max = next_range.to();
13711cb0ef41Sopenharmony_ci  }
13721cb0ef41Sopenharmony_ci  return true;
13731cb0ef41Sopenharmony_ci}
13741cb0ef41Sopenharmony_ci
13751cb0ef41Sopenharmony_ciZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
13761cb0ef41Sopenharmony_ci  if (ranges_ == nullptr) {
13771cb0ef41Sopenharmony_ci    ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
13781cb0ef41Sopenharmony_ci    CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
13791cb0ef41Sopenharmony_ci                                   zone);
13801cb0ef41Sopenharmony_ci  }
13811cb0ef41Sopenharmony_ci  return ranges_;
13821cb0ef41Sopenharmony_ci}
13831cb0ef41Sopenharmony_ci
13841cb0ef41Sopenharmony_cinamespace {
13851cb0ef41Sopenharmony_ci
13861cb0ef41Sopenharmony_ci// Move a number of elements in a zonelist to another position
13871cb0ef41Sopenharmony_ci// in the same list. Handles overlapping source and target areas.
13881cb0ef41Sopenharmony_civoid MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
13891cb0ef41Sopenharmony_ci  // Ranges are potentially overlapping.
13901cb0ef41Sopenharmony_ci  if (from < to) {
13911cb0ef41Sopenharmony_ci    for (int i = count - 1; i >= 0; i--) {
13921cb0ef41Sopenharmony_ci      list->at(to + i) = list->at(from + i);
13931cb0ef41Sopenharmony_ci    }
13941cb0ef41Sopenharmony_ci  } else {
13951cb0ef41Sopenharmony_ci    for (int i = 0; i < count; i++) {
13961cb0ef41Sopenharmony_ci      list->at(to + i) = list->at(from + i);
13971cb0ef41Sopenharmony_ci    }
13981cb0ef41Sopenharmony_ci  }
13991cb0ef41Sopenharmony_ci}
14001cb0ef41Sopenharmony_ci
14011cb0ef41Sopenharmony_ciint InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
14021cb0ef41Sopenharmony_ci                               CharacterRange insert) {
14031cb0ef41Sopenharmony_ci  // Inserts a range into list[0..count[, which must be sorted
14041cb0ef41Sopenharmony_ci  // by from value and non-overlapping and non-adjacent, using at most
14051cb0ef41Sopenharmony_ci  // list[0..count] for the result. Returns the number of resulting
14061cb0ef41Sopenharmony_ci  // canonicalized ranges. Inserting a range may collapse existing ranges into
14071cb0ef41Sopenharmony_ci  // fewer ranges, so the return value can be anything in the range 1..count+1.
14081cb0ef41Sopenharmony_ci  base::uc32 from = insert.from();
14091cb0ef41Sopenharmony_ci  base::uc32 to = insert.to();
14101cb0ef41Sopenharmony_ci  int start_pos = 0;
14111cb0ef41Sopenharmony_ci  int end_pos = count;
14121cb0ef41Sopenharmony_ci  for (int i = count - 1; i >= 0; i--) {
14131cb0ef41Sopenharmony_ci    CharacterRange current = list->at(i);
14141cb0ef41Sopenharmony_ci    if (current.from() > to + 1) {
14151cb0ef41Sopenharmony_ci      end_pos = i;
14161cb0ef41Sopenharmony_ci    } else if (current.to() + 1 < from) {
14171cb0ef41Sopenharmony_ci      start_pos = i + 1;
14181cb0ef41Sopenharmony_ci      break;
14191cb0ef41Sopenharmony_ci    }
14201cb0ef41Sopenharmony_ci  }
14211cb0ef41Sopenharmony_ci
14221cb0ef41Sopenharmony_ci  // Inserted range overlaps, or is adjacent to, ranges at positions
14231cb0ef41Sopenharmony_ci  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
14241cb0ef41Sopenharmony_ci  // not affected by the insertion.
14251cb0ef41Sopenharmony_ci  // If start_pos == end_pos, the range must be inserted before start_pos.
14261cb0ef41Sopenharmony_ci  // if start_pos < end_pos, the entire range from start_pos to end_pos
14271cb0ef41Sopenharmony_ci  // must be merged with the insert range.
14281cb0ef41Sopenharmony_ci
14291cb0ef41Sopenharmony_ci  if (start_pos == end_pos) {
14301cb0ef41Sopenharmony_ci    // Insert between existing ranges at position start_pos.
14311cb0ef41Sopenharmony_ci    if (start_pos < count) {
14321cb0ef41Sopenharmony_ci      MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
14331cb0ef41Sopenharmony_ci    }
14341cb0ef41Sopenharmony_ci    list->at(start_pos) = insert;
14351cb0ef41Sopenharmony_ci    return count + 1;
14361cb0ef41Sopenharmony_ci  }
14371cb0ef41Sopenharmony_ci  if (start_pos + 1 == end_pos) {
14381cb0ef41Sopenharmony_ci    // Replace single existing range at position start_pos.
14391cb0ef41Sopenharmony_ci    CharacterRange to_replace = list->at(start_pos);
14401cb0ef41Sopenharmony_ci    int new_from = std::min(to_replace.from(), from);
14411cb0ef41Sopenharmony_ci    int new_to = std::max(to_replace.to(), to);
14421cb0ef41Sopenharmony_ci    list->at(start_pos) = CharacterRange::Range(new_from, new_to);
14431cb0ef41Sopenharmony_ci    return count;
14441cb0ef41Sopenharmony_ci  }
14451cb0ef41Sopenharmony_ci  // Replace a number of existing ranges from start_pos to end_pos - 1.
14461cb0ef41Sopenharmony_ci  // Move the remaining ranges down.
14471cb0ef41Sopenharmony_ci
14481cb0ef41Sopenharmony_ci  int new_from = std::min(list->at(start_pos).from(), from);
14491cb0ef41Sopenharmony_ci  int new_to = std::max(list->at(end_pos - 1).to(), to);
14501cb0ef41Sopenharmony_ci  if (end_pos < count) {
14511cb0ef41Sopenharmony_ci    MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
14521cb0ef41Sopenharmony_ci  }
14531cb0ef41Sopenharmony_ci  list->at(start_pos) = CharacterRange::Range(new_from, new_to);
14541cb0ef41Sopenharmony_ci  return count - (end_pos - start_pos) + 1;
14551cb0ef41Sopenharmony_ci}
14561cb0ef41Sopenharmony_ci
14571cb0ef41Sopenharmony_ci}  // namespace
14581cb0ef41Sopenharmony_ci
14591cb0ef41Sopenharmony_civoid CharacterSet::Canonicalize() {
14601cb0ef41Sopenharmony_ci  // Special/default classes are always considered canonical. The result
14611cb0ef41Sopenharmony_ci  // of calling ranges() will be sorted.
14621cb0ef41Sopenharmony_ci  if (ranges_ == nullptr) return;
14631cb0ef41Sopenharmony_ci  CharacterRange::Canonicalize(ranges_);
14641cb0ef41Sopenharmony_ci}
14651cb0ef41Sopenharmony_ci
14661cb0ef41Sopenharmony_ci// static
14671cb0ef41Sopenharmony_civoid CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
14681cb0ef41Sopenharmony_ci  if (character_ranges->length() <= 1) return;
14691cb0ef41Sopenharmony_ci  // Check whether ranges are already canonical (increasing, non-overlapping,
14701cb0ef41Sopenharmony_ci  // non-adjacent).
14711cb0ef41Sopenharmony_ci  int n = character_ranges->length();
14721cb0ef41Sopenharmony_ci  base::uc32 max = character_ranges->at(0).to();
14731cb0ef41Sopenharmony_ci  int i = 1;
14741cb0ef41Sopenharmony_ci  while (i < n) {
14751cb0ef41Sopenharmony_ci    CharacterRange current = character_ranges->at(i);
14761cb0ef41Sopenharmony_ci    if (current.from() <= max + 1) {
14771cb0ef41Sopenharmony_ci      break;
14781cb0ef41Sopenharmony_ci    }
14791cb0ef41Sopenharmony_ci    max = current.to();
14801cb0ef41Sopenharmony_ci    i++;
14811cb0ef41Sopenharmony_ci  }
14821cb0ef41Sopenharmony_ci  // Canonical until the i'th range. If that's all of them, we are done.
14831cb0ef41Sopenharmony_ci  if (i == n) return;
14841cb0ef41Sopenharmony_ci
14851cb0ef41Sopenharmony_ci  // The ranges at index i and forward are not canonicalized. Make them so by
14861cb0ef41Sopenharmony_ci  // doing the equivalent of insertion sort (inserting each into the previous
14871cb0ef41Sopenharmony_ci  // list, in order).
14881cb0ef41Sopenharmony_ci  // Notice that inserting a range can reduce the number of ranges in the
14891cb0ef41Sopenharmony_ci  // result due to combining of adjacent and overlapping ranges.
14901cb0ef41Sopenharmony_ci  int read = i;           // Range to insert.
14911cb0ef41Sopenharmony_ci  int num_canonical = i;  // Length of canonicalized part of list.
14921cb0ef41Sopenharmony_ci  do {
14931cb0ef41Sopenharmony_ci    num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
14941cb0ef41Sopenharmony_ci                                               character_ranges->at(read));
14951cb0ef41Sopenharmony_ci    read++;
14961cb0ef41Sopenharmony_ci  } while (read < n);
14971cb0ef41Sopenharmony_ci  character_ranges->Rewind(num_canonical);
14981cb0ef41Sopenharmony_ci
14991cb0ef41Sopenharmony_ci  DCHECK(CharacterRange::IsCanonical(character_ranges));
15001cb0ef41Sopenharmony_ci}
15011cb0ef41Sopenharmony_ci
15021cb0ef41Sopenharmony_ci// static
15031cb0ef41Sopenharmony_civoid CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
15041cb0ef41Sopenharmony_ci                            ZoneList<CharacterRange>* negated_ranges,
15051cb0ef41Sopenharmony_ci                            Zone* zone) {
15061cb0ef41Sopenharmony_ci  DCHECK(CharacterRange::IsCanonical(ranges));
15071cb0ef41Sopenharmony_ci  DCHECK_EQ(0, negated_ranges->length());
15081cb0ef41Sopenharmony_ci  int range_count = ranges->length();
15091cb0ef41Sopenharmony_ci  base::uc32 from = 0;
15101cb0ef41Sopenharmony_ci  int i = 0;
15111cb0ef41Sopenharmony_ci  if (range_count > 0 && ranges->at(0).from() == 0) {
15121cb0ef41Sopenharmony_ci    from = ranges->at(0).to() + 1;
15131cb0ef41Sopenharmony_ci    i = 1;
15141cb0ef41Sopenharmony_ci  }
15151cb0ef41Sopenharmony_ci  while (i < range_count) {
15161cb0ef41Sopenharmony_ci    CharacterRange range = ranges->at(i);
15171cb0ef41Sopenharmony_ci    negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
15181cb0ef41Sopenharmony_ci    from = range.to() + 1;
15191cb0ef41Sopenharmony_ci    i++;
15201cb0ef41Sopenharmony_ci  }
15211cb0ef41Sopenharmony_ci  if (from < kMaxCodePoint) {
15221cb0ef41Sopenharmony_ci    negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);
15231cb0ef41Sopenharmony_ci  }
15241cb0ef41Sopenharmony_ci}
15251cb0ef41Sopenharmony_ci
15261cb0ef41Sopenharmony_ci// static
15271cb0ef41Sopenharmony_civoid CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
15281cb0ef41Sopenharmony_ci  DCHECK(IsCanonical(ranges));
15291cb0ef41Sopenharmony_ci
15301cb0ef41Sopenharmony_ci  // Drop all ranges that don't contain one-byte code units, and clamp the last
15311cb0ef41Sopenharmony_ci  // range s.t. it likewise only contains one-byte code units. Note this relies
15321cb0ef41Sopenharmony_ci  // on `ranges` being canonicalized, i.e. sorted and non-overlapping.
15331cb0ef41Sopenharmony_ci
15341cb0ef41Sopenharmony_ci  static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;
15351cb0ef41Sopenharmony_ci  int n = ranges->length();
15361cb0ef41Sopenharmony_ci  for (; n > 0; n--) {
15371cb0ef41Sopenharmony_ci    CharacterRange& r = ranges->at(n - 1);
15381cb0ef41Sopenharmony_ci    if (r.from() <= max_char) {
15391cb0ef41Sopenharmony_ci      r.to_ = std::min(r.to_, max_char);
15401cb0ef41Sopenharmony_ci      break;
15411cb0ef41Sopenharmony_ci    }
15421cb0ef41Sopenharmony_ci  }
15431cb0ef41Sopenharmony_ci
15441cb0ef41Sopenharmony_ci  ranges->Rewind(n);
15451cb0ef41Sopenharmony_ci}
15461cb0ef41Sopenharmony_ci
15471cb0ef41Sopenharmony_cinamespace {
15481cb0ef41Sopenharmony_ci
15491cb0ef41Sopenharmony_ci// Scoped object to keep track of how much we unroll quantifier loops in the
15501cb0ef41Sopenharmony_ci// regexp graph generator.
15511cb0ef41Sopenharmony_ciclass RegExpExpansionLimiter {
15521cb0ef41Sopenharmony_ci public:
15531cb0ef41Sopenharmony_ci  static const int kMaxExpansionFactor = 6;
15541cb0ef41Sopenharmony_ci  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
15551cb0ef41Sopenharmony_ci      : compiler_(compiler),
15561cb0ef41Sopenharmony_ci        saved_expansion_factor_(compiler->current_expansion_factor()),
15571cb0ef41Sopenharmony_ci        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
15581cb0ef41Sopenharmony_ci    DCHECK_LT(0, factor);
15591cb0ef41Sopenharmony_ci    if (ok_to_expand_) {
15601cb0ef41Sopenharmony_ci      if (factor > kMaxExpansionFactor) {
15611cb0ef41Sopenharmony_ci        // Avoid integer overflow of the current expansion factor.
15621cb0ef41Sopenharmony_ci        ok_to_expand_ = false;
15631cb0ef41Sopenharmony_ci        compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
15641cb0ef41Sopenharmony_ci      } else {
15651cb0ef41Sopenharmony_ci        int new_factor = saved_expansion_factor_ * factor;
15661cb0ef41Sopenharmony_ci        ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
15671cb0ef41Sopenharmony_ci        compiler->set_current_expansion_factor(new_factor);
15681cb0ef41Sopenharmony_ci      }
15691cb0ef41Sopenharmony_ci    }
15701cb0ef41Sopenharmony_ci  }
15711cb0ef41Sopenharmony_ci
15721cb0ef41Sopenharmony_ci  ~RegExpExpansionLimiter() {
15731cb0ef41Sopenharmony_ci    compiler_->set_current_expansion_factor(saved_expansion_factor_);
15741cb0ef41Sopenharmony_ci  }
15751cb0ef41Sopenharmony_ci
15761cb0ef41Sopenharmony_ci  bool ok_to_expand() { return ok_to_expand_; }
15771cb0ef41Sopenharmony_ci
15781cb0ef41Sopenharmony_ci private:
15791cb0ef41Sopenharmony_ci  RegExpCompiler* compiler_;
15801cb0ef41Sopenharmony_ci  int saved_expansion_factor_;
15811cb0ef41Sopenharmony_ci  bool ok_to_expand_;
15821cb0ef41Sopenharmony_ci
15831cb0ef41Sopenharmony_ci  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
15841cb0ef41Sopenharmony_ci};
15851cb0ef41Sopenharmony_ci
15861cb0ef41Sopenharmony_ci}  // namespace
15871cb0ef41Sopenharmony_ci
15881cb0ef41Sopenharmony_ciRegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
15891cb0ef41Sopenharmony_ci                                     RegExpTree* body, RegExpCompiler* compiler,
15901cb0ef41Sopenharmony_ci                                     RegExpNode* on_success,
15911cb0ef41Sopenharmony_ci                                     bool not_at_start) {
15921cb0ef41Sopenharmony_ci  // x{f, t} becomes this:
15931cb0ef41Sopenharmony_ci  //
15941cb0ef41Sopenharmony_ci  //             (r++)<-.
15951cb0ef41Sopenharmony_ci  //               |     `
15961cb0ef41Sopenharmony_ci  //               |     (x)
15971cb0ef41Sopenharmony_ci  //               v     ^
15981cb0ef41Sopenharmony_ci  //      (r=0)-->(?)---/ [if r < t]
15991cb0ef41Sopenharmony_ci  //               |
16001cb0ef41Sopenharmony_ci  //   [if r >= f] \----> ...
16011cb0ef41Sopenharmony_ci  //
16021cb0ef41Sopenharmony_ci
16031cb0ef41Sopenharmony_ci  // 15.10.2.5 RepeatMatcher algorithm.
16041cb0ef41Sopenharmony_ci  // The parser has already eliminated the case where max is 0.  In the case
16051cb0ef41Sopenharmony_ci  // where max_match is zero the parser has removed the quantifier if min was
16061cb0ef41Sopenharmony_ci  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
16071cb0ef41Sopenharmony_ci
16081cb0ef41Sopenharmony_ci  // If we know that we cannot match zero length then things are a little
16091cb0ef41Sopenharmony_ci  // simpler since we don't need to make the special zero length match check
16101cb0ef41Sopenharmony_ci  // from step 2.1.  If the min and max are small we can unroll a little in
16111cb0ef41Sopenharmony_ci  // this case.
16121cb0ef41Sopenharmony_ci  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
16131cb0ef41Sopenharmony_ci  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
16141cb0ef41Sopenharmony_ci  if (max == 0) return on_success;  // This can happen due to recursion.
16151cb0ef41Sopenharmony_ci  bool body_can_be_empty = (body->min_match() == 0);
16161cb0ef41Sopenharmony_ci  int body_start_reg = RegExpCompiler::kNoRegister;
16171cb0ef41Sopenharmony_ci  Interval capture_registers = body->CaptureRegisters();
16181cb0ef41Sopenharmony_ci  bool needs_capture_clearing = !capture_registers.is_empty();
16191cb0ef41Sopenharmony_ci  Zone* zone = compiler->zone();
16201cb0ef41Sopenharmony_ci
16211cb0ef41Sopenharmony_ci  if (body_can_be_empty) {
16221cb0ef41Sopenharmony_ci    body_start_reg = compiler->AllocateRegister();
16231cb0ef41Sopenharmony_ci  } else if (compiler->optimize() && !needs_capture_clearing) {
16241cb0ef41Sopenharmony_ci    // Only unroll if there are no captures and the body can't be
16251cb0ef41Sopenharmony_ci    // empty.
16261cb0ef41Sopenharmony_ci    {
16271cb0ef41Sopenharmony_ci      RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
16281cb0ef41Sopenharmony_ci      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
16291cb0ef41Sopenharmony_ci        int new_max = (max == kInfinity) ? max : max - min;
16301cb0ef41Sopenharmony_ci        // Recurse once to get the loop or optional matches after the fixed
16311cb0ef41Sopenharmony_ci        // ones.
16321cb0ef41Sopenharmony_ci        RegExpNode* answer =
16331cb0ef41Sopenharmony_ci            ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
16341cb0ef41Sopenharmony_ci        // Unroll the forced matches from 0 to min.  This can cause chains of
16351cb0ef41Sopenharmony_ci        // TextNodes (which the parser does not generate).  These should be
16361cb0ef41Sopenharmony_ci        // combined if it turns out they hinder good code generation.
16371cb0ef41Sopenharmony_ci        for (int i = 0; i < min; i++) {
16381cb0ef41Sopenharmony_ci          answer = body->ToNode(compiler, answer);
16391cb0ef41Sopenharmony_ci        }
16401cb0ef41Sopenharmony_ci        return answer;
16411cb0ef41Sopenharmony_ci      }
16421cb0ef41Sopenharmony_ci    }
16431cb0ef41Sopenharmony_ci    if (max <= kMaxUnrolledMaxMatches && min == 0) {
16441cb0ef41Sopenharmony_ci      DCHECK_LT(0, max);  // Due to the 'if' above.
16451cb0ef41Sopenharmony_ci      RegExpExpansionLimiter limiter(compiler, max);
16461cb0ef41Sopenharmony_ci      if (limiter.ok_to_expand()) {
16471cb0ef41Sopenharmony_ci        // Unroll the optional matches up to max.
16481cb0ef41Sopenharmony_ci        RegExpNode* answer = on_success;
16491cb0ef41Sopenharmony_ci        for (int i = 0; i < max; i++) {
16501cb0ef41Sopenharmony_ci          ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
16511cb0ef41Sopenharmony_ci          if (is_greedy) {
16521cb0ef41Sopenharmony_ci            alternation->AddAlternative(
16531cb0ef41Sopenharmony_ci                GuardedAlternative(body->ToNode(compiler, answer)));
16541cb0ef41Sopenharmony_ci            alternation->AddAlternative(GuardedAlternative(on_success));
16551cb0ef41Sopenharmony_ci          } else {
16561cb0ef41Sopenharmony_ci            alternation->AddAlternative(GuardedAlternative(on_success));
16571cb0ef41Sopenharmony_ci            alternation->AddAlternative(
16581cb0ef41Sopenharmony_ci                GuardedAlternative(body->ToNode(compiler, answer)));
16591cb0ef41Sopenharmony_ci          }
16601cb0ef41Sopenharmony_ci          answer = alternation;
16611cb0ef41Sopenharmony_ci          if (not_at_start && !compiler->read_backward()) {
16621cb0ef41Sopenharmony_ci            alternation->set_not_at_start();
16631cb0ef41Sopenharmony_ci          }
16641cb0ef41Sopenharmony_ci        }
16651cb0ef41Sopenharmony_ci        return answer;
16661cb0ef41Sopenharmony_ci      }
16671cb0ef41Sopenharmony_ci    }
16681cb0ef41Sopenharmony_ci  }
16691cb0ef41Sopenharmony_ci  bool has_min = min > 0;
16701cb0ef41Sopenharmony_ci  bool has_max = max < RegExpTree::kInfinity;
16711cb0ef41Sopenharmony_ci  bool needs_counter = has_min || has_max;
16721cb0ef41Sopenharmony_ci  int reg_ctr = needs_counter ? compiler->AllocateRegister()
16731cb0ef41Sopenharmony_ci                              : RegExpCompiler::kNoRegister;
16741cb0ef41Sopenharmony_ci  LoopChoiceNode* center = zone->New<LoopChoiceNode>(
16751cb0ef41Sopenharmony_ci      body->min_match() == 0, compiler->read_backward(), min, zone);
16761cb0ef41Sopenharmony_ci  if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
16771cb0ef41Sopenharmony_ci  RegExpNode* loop_return =
16781cb0ef41Sopenharmony_ci      needs_counter ? static_cast<RegExpNode*>(
16791cb0ef41Sopenharmony_ci                          ActionNode::IncrementRegister(reg_ctr, center))
16801cb0ef41Sopenharmony_ci                    : static_cast<RegExpNode*>(center);
16811cb0ef41Sopenharmony_ci  if (body_can_be_empty) {
16821cb0ef41Sopenharmony_ci    // If the body can be empty we need to check if it was and then
16831cb0ef41Sopenharmony_ci    // backtrack.
16841cb0ef41Sopenharmony_ci    loop_return =
16851cb0ef41Sopenharmony_ci        ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
16861cb0ef41Sopenharmony_ci  }
16871cb0ef41Sopenharmony_ci  RegExpNode* body_node = body->ToNode(compiler, loop_return);
16881cb0ef41Sopenharmony_ci  if (body_can_be_empty) {
16891cb0ef41Sopenharmony_ci    // If the body can be empty we need to store the start position
16901cb0ef41Sopenharmony_ci    // so we can bail out if it was empty.
16911cb0ef41Sopenharmony_ci    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
16921cb0ef41Sopenharmony_ci  }
16931cb0ef41Sopenharmony_ci  if (needs_capture_clearing) {
16941cb0ef41Sopenharmony_ci    // Before entering the body of this loop we need to clear captures.
16951cb0ef41Sopenharmony_ci    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
16961cb0ef41Sopenharmony_ci  }
16971cb0ef41Sopenharmony_ci  GuardedAlternative body_alt(body_node);
16981cb0ef41Sopenharmony_ci  if (has_max) {
16991cb0ef41Sopenharmony_ci    Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
17001cb0ef41Sopenharmony_ci    body_alt.AddGuard(body_guard, zone);
17011cb0ef41Sopenharmony_ci  }
17021cb0ef41Sopenharmony_ci  GuardedAlternative rest_alt(on_success);
17031cb0ef41Sopenharmony_ci  if (has_min) {
17041cb0ef41Sopenharmony_ci    Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
17051cb0ef41Sopenharmony_ci    rest_alt.AddGuard(rest_guard, zone);
17061cb0ef41Sopenharmony_ci  }
17071cb0ef41Sopenharmony_ci  if (is_greedy) {
17081cb0ef41Sopenharmony_ci    center->AddLoopAlternative(body_alt);
17091cb0ef41Sopenharmony_ci    center->AddContinueAlternative(rest_alt);
17101cb0ef41Sopenharmony_ci  } else {
17111cb0ef41Sopenharmony_ci    center->AddContinueAlternative(rest_alt);
17121cb0ef41Sopenharmony_ci    center->AddLoopAlternative(body_alt);
17131cb0ef41Sopenharmony_ci  }
17141cb0ef41Sopenharmony_ci  if (needs_counter) {
17151cb0ef41Sopenharmony_ci    return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
17161cb0ef41Sopenharmony_ci  } else {
17171cb0ef41Sopenharmony_ci    return center;
17181cb0ef41Sopenharmony_ci  }
17191cb0ef41Sopenharmony_ci}
17201cb0ef41Sopenharmony_ci
17211cb0ef41Sopenharmony_ci}  // namespace internal
17221cb0ef41Sopenharmony_ci}  // namespace v8
1723