11cb0ef41Sopenharmony_ci// Copyright 2019 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#include "src/regexp/regexp-compiler.h" 61cb0ef41Sopenharmony_ci 71cb0ef41Sopenharmony_ci#include "src/execution/isolate.h" 81cb0ef41Sopenharmony_ci#include "src/regexp/regexp.h" 91cb0ef41Sopenharmony_ci#include "src/strings/unicode-inl.h" 101cb0ef41Sopenharmony_ci#include "src/zone/zone-list-inl.h" 111cb0ef41Sopenharmony_ci 121cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 131cb0ef41Sopenharmony_ci#include "src/base/strings.h" 141cb0ef41Sopenharmony_ci#include "src/regexp/special-case.h" 151cb0ef41Sopenharmony_ci#include "unicode/locid.h" 161cb0ef41Sopenharmony_ci#include "unicode/uniset.h" 171cb0ef41Sopenharmony_ci#include "unicode/utypes.h" 181cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 191cb0ef41Sopenharmony_ci 201cb0ef41Sopenharmony_cinamespace v8 { 211cb0ef41Sopenharmony_cinamespace internal { 221cb0ef41Sopenharmony_ci 231cb0ef41Sopenharmony_ciusing namespace regexp_compiler_constants; // NOLINT(build/namespaces) 241cb0ef41Sopenharmony_ci 251cb0ef41Sopenharmony_ciconstexpr base::uc32 kMaxCodePoint = 0x10ffff; 261cb0ef41Sopenharmony_ciconstexpr int kMaxUtf16CodeUnit = 0xffff; 271cb0ef41Sopenharmony_ciconstexpr uint32_t kMaxUtf16CodeUnitU = 0xffff; 281cb0ef41Sopenharmony_ciconstexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar; 291cb0ef41Sopenharmony_ci 301cb0ef41Sopenharmony_ci// ------------------------------------------------------------------- 311cb0ef41Sopenharmony_ci// Tree to graph conversion 321cb0ef41Sopenharmony_ci 331cb0ef41Sopenharmony_ciRegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 341cb0ef41Sopenharmony_ci RegExpNode* on_success) { 351cb0ef41Sopenharmony_ci ZoneList<TextElement>* elms = 361cb0ef41Sopenharmony_ci compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone()); 371cb0ef41Sopenharmony_ci elms->Add(TextElement::Atom(this), compiler->zone()); 381cb0ef41Sopenharmony_ci return compiler->zone()->New<TextNode>(elms, compiler->read_backward(), 391cb0ef41Sopenharmony_ci on_success); 401cb0ef41Sopenharmony_ci} 411cb0ef41Sopenharmony_ci 421cb0ef41Sopenharmony_ciRegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 431cb0ef41Sopenharmony_ci RegExpNode* on_success) { 441cb0ef41Sopenharmony_ci return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(), 451cb0ef41Sopenharmony_ci on_success); 461cb0ef41Sopenharmony_ci} 471cb0ef41Sopenharmony_ci 481cb0ef41Sopenharmony_cinamespace { 491cb0ef41Sopenharmony_ci 501cb0ef41Sopenharmony_cibool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 511cb0ef41Sopenharmony_ci const int* special_class, int length) { 521cb0ef41Sopenharmony_ci length--; // Remove final marker. 531cb0ef41Sopenharmony_ci 541cb0ef41Sopenharmony_ci DCHECK_EQ(kRangeEndMarker, special_class[length]); 551cb0ef41Sopenharmony_ci DCHECK_NE(0, ranges->length()); 561cb0ef41Sopenharmony_ci DCHECK_NE(0, length); 571cb0ef41Sopenharmony_ci DCHECK_NE(0, special_class[0]); 581cb0ef41Sopenharmony_ci 591cb0ef41Sopenharmony_ci if (ranges->length() != (length >> 1) + 1) return false; 601cb0ef41Sopenharmony_ci 611cb0ef41Sopenharmony_ci CharacterRange range = ranges->at(0); 621cb0ef41Sopenharmony_ci if (range.from() != 0) return false; 631cb0ef41Sopenharmony_ci 641cb0ef41Sopenharmony_ci for (int i = 0; i < length; i += 2) { 651cb0ef41Sopenharmony_ci if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) { 661cb0ef41Sopenharmony_ci return false; 671cb0ef41Sopenharmony_ci } 681cb0ef41Sopenharmony_ci range = ranges->at((i >> 1) + 1); 691cb0ef41Sopenharmony_ci if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) { 701cb0ef41Sopenharmony_ci return false; 711cb0ef41Sopenharmony_ci } 721cb0ef41Sopenharmony_ci } 731cb0ef41Sopenharmony_ci 741cb0ef41Sopenharmony_ci return range.to() == kMaxCodePoint; 751cb0ef41Sopenharmony_ci} 761cb0ef41Sopenharmony_ci 771cb0ef41Sopenharmony_cibool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class, 781cb0ef41Sopenharmony_ci int length) { 791cb0ef41Sopenharmony_ci length--; // Remove final marker. 801cb0ef41Sopenharmony_ci 811cb0ef41Sopenharmony_ci DCHECK_EQ(kRangeEndMarker, special_class[length]); 821cb0ef41Sopenharmony_ci if (ranges->length() * 2 != length) return false; 831cb0ef41Sopenharmony_ci 841cb0ef41Sopenharmony_ci for (int i = 0; i < length; i += 2) { 851cb0ef41Sopenharmony_ci CharacterRange range = ranges->at(i >> 1); 861cb0ef41Sopenharmony_ci if (range.from() != static_cast<base::uc32>(special_class[i]) || 871cb0ef41Sopenharmony_ci range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) { 881cb0ef41Sopenharmony_ci return false; 891cb0ef41Sopenharmony_ci } 901cb0ef41Sopenharmony_ci } 911cb0ef41Sopenharmony_ci return true; 921cb0ef41Sopenharmony_ci} 931cb0ef41Sopenharmony_ci 941cb0ef41Sopenharmony_ci} // namespace 951cb0ef41Sopenharmony_ci 961cb0ef41Sopenharmony_cibool RegExpCharacterClass::is_standard(Zone* zone) { 971cb0ef41Sopenharmony_ci // TODO(lrn): Remove need for this function, by not throwing away information 981cb0ef41Sopenharmony_ci // along the way. 991cb0ef41Sopenharmony_ci if (is_negated()) { 1001cb0ef41Sopenharmony_ci return false; 1011cb0ef41Sopenharmony_ci } 1021cb0ef41Sopenharmony_ci if (set_.is_standard()) { 1031cb0ef41Sopenharmony_ci return true; 1041cb0ef41Sopenharmony_ci } 1051cb0ef41Sopenharmony_ci if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 1061cb0ef41Sopenharmony_ci set_.set_standard_set_type(StandardCharacterSet::kWhitespace); 1071cb0ef41Sopenharmony_ci return true; 1081cb0ef41Sopenharmony_ci } 1091cb0ef41Sopenharmony_ci if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 1101cb0ef41Sopenharmony_ci set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace); 1111cb0ef41Sopenharmony_ci return true; 1121cb0ef41Sopenharmony_ci } 1131cb0ef41Sopenharmony_ci if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges, 1141cb0ef41Sopenharmony_ci kLineTerminatorRangeCount)) { 1151cb0ef41Sopenharmony_ci set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator); 1161cb0ef41Sopenharmony_ci return true; 1171cb0ef41Sopenharmony_ci } 1181cb0ef41Sopenharmony_ci if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges, 1191cb0ef41Sopenharmony_ci kLineTerminatorRangeCount)) { 1201cb0ef41Sopenharmony_ci set_.set_standard_set_type(StandardCharacterSet::kLineTerminator); 1211cb0ef41Sopenharmony_ci return true; 1221cb0ef41Sopenharmony_ci } 1231cb0ef41Sopenharmony_ci if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 1241cb0ef41Sopenharmony_ci set_.set_standard_set_type(StandardCharacterSet::kWord); 1251cb0ef41Sopenharmony_ci return true; 1261cb0ef41Sopenharmony_ci } 1271cb0ef41Sopenharmony_ci if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 1281cb0ef41Sopenharmony_ci set_.set_standard_set_type(StandardCharacterSet::kNotWord); 1291cb0ef41Sopenharmony_ci return true; 1301cb0ef41Sopenharmony_ci } 1311cb0ef41Sopenharmony_ci return false; 1321cb0ef41Sopenharmony_ci} 1331cb0ef41Sopenharmony_ci 1341cb0ef41Sopenharmony_ciUnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) { 1351cb0ef41Sopenharmony_ci // The unicode range splitter categorizes given character ranges into: 1361cb0ef41Sopenharmony_ci // - Code points from the BMP representable by one code unit. 1371cb0ef41Sopenharmony_ci // - Code points outside the BMP that need to be split into surrogate pairs. 1381cb0ef41Sopenharmony_ci // - Lone lead surrogates. 1391cb0ef41Sopenharmony_ci // - Lone trail surrogates. 1401cb0ef41Sopenharmony_ci // Lone surrogates are valid code points, even though no actual characters. 1411cb0ef41Sopenharmony_ci // They require special matching to make sure we do not split surrogate pairs. 1421cb0ef41Sopenharmony_ci 1431cb0ef41Sopenharmony_ci for (int i = 0; i < base->length(); i++) AddRange(base->at(i)); 1441cb0ef41Sopenharmony_ci} 1451cb0ef41Sopenharmony_ci 1461cb0ef41Sopenharmony_civoid UnicodeRangeSplitter::AddRange(CharacterRange range) { 1471cb0ef41Sopenharmony_ci static constexpr base::uc32 kBmp1Start = 0; 1481cb0ef41Sopenharmony_ci static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1; 1491cb0ef41Sopenharmony_ci static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1; 1501cb0ef41Sopenharmony_ci static constexpr base::uc32 kBmp2End = kNonBmpStart - 1; 1511cb0ef41Sopenharmony_ci 1521cb0ef41Sopenharmony_ci // Ends are all inclusive. 1531cb0ef41Sopenharmony_ci STATIC_ASSERT(kBmp1Start == 0); 1541cb0ef41Sopenharmony_ci STATIC_ASSERT(kBmp1Start < kBmp1End); 1551cb0ef41Sopenharmony_ci STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart); 1561cb0ef41Sopenharmony_ci STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd); 1571cb0ef41Sopenharmony_ci STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); 1581cb0ef41Sopenharmony_ci STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd); 1591cb0ef41Sopenharmony_ci STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start); 1601cb0ef41Sopenharmony_ci STATIC_ASSERT(kBmp2Start < kBmp2End); 1611cb0ef41Sopenharmony_ci STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart); 1621cb0ef41Sopenharmony_ci STATIC_ASSERT(kNonBmpStart < kNonBmpEnd); 1631cb0ef41Sopenharmony_ci 1641cb0ef41Sopenharmony_ci static constexpr base::uc32 kStarts[] = { 1651cb0ef41Sopenharmony_ci kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart, 1661cb0ef41Sopenharmony_ci kBmp2Start, kNonBmpStart, 1671cb0ef41Sopenharmony_ci }; 1681cb0ef41Sopenharmony_ci 1691cb0ef41Sopenharmony_ci static constexpr base::uc32 kEnds[] = { 1701cb0ef41Sopenharmony_ci kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd, 1711cb0ef41Sopenharmony_ci }; 1721cb0ef41Sopenharmony_ci 1731cb0ef41Sopenharmony_ci CharacterRangeVector* const kTargets[] = { 1741cb0ef41Sopenharmony_ci &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_, 1751cb0ef41Sopenharmony_ci }; 1761cb0ef41Sopenharmony_ci 1771cb0ef41Sopenharmony_ci static constexpr int kCount = arraysize(kStarts); 1781cb0ef41Sopenharmony_ci STATIC_ASSERT(kCount == arraysize(kEnds)); 1791cb0ef41Sopenharmony_ci STATIC_ASSERT(kCount == arraysize(kTargets)); 1801cb0ef41Sopenharmony_ci 1811cb0ef41Sopenharmony_ci for (int i = 0; i < kCount; i++) { 1821cb0ef41Sopenharmony_ci if (kStarts[i] > range.to()) break; 1831cb0ef41Sopenharmony_ci const base::uc32 from = std::max(kStarts[i], range.from()); 1841cb0ef41Sopenharmony_ci const base::uc32 to = std::min(kEnds[i], range.to()); 1851cb0ef41Sopenharmony_ci if (from > to) continue; 1861cb0ef41Sopenharmony_ci kTargets[i]->emplace_back(CharacterRange::Range(from, to)); 1871cb0ef41Sopenharmony_ci } 1881cb0ef41Sopenharmony_ci} 1891cb0ef41Sopenharmony_ci 1901cb0ef41Sopenharmony_cinamespace { 1911cb0ef41Sopenharmony_ci 1921cb0ef41Sopenharmony_ci// Translates between new and old V8-isms (SmallVector, ZoneList). 1931cb0ef41Sopenharmony_ciZoneList<CharacterRange>* ToCanonicalZoneList( 1941cb0ef41Sopenharmony_ci const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) { 1951cb0ef41Sopenharmony_ci if (v->empty()) return nullptr; 1961cb0ef41Sopenharmony_ci 1971cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* result = 1981cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone); 1991cb0ef41Sopenharmony_ci for (size_t i = 0; i < v->size(); i++) { 2001cb0ef41Sopenharmony_ci result->Add(v->at(i), zone); 2011cb0ef41Sopenharmony_ci } 2021cb0ef41Sopenharmony_ci 2031cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(result); 2041cb0ef41Sopenharmony_ci return result; 2051cb0ef41Sopenharmony_ci} 2061cb0ef41Sopenharmony_ci 2071cb0ef41Sopenharmony_civoid AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, 2081cb0ef41Sopenharmony_ci RegExpNode* on_success, UnicodeRangeSplitter* splitter) { 2091cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* bmp = 2101cb0ef41Sopenharmony_ci ToCanonicalZoneList(splitter->bmp(), compiler->zone()); 2111cb0ef41Sopenharmony_ci if (bmp == nullptr) return; 2121cb0ef41Sopenharmony_ci result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( 2131cb0ef41Sopenharmony_ci compiler->zone(), bmp, compiler->read_backward(), on_success))); 2141cb0ef41Sopenharmony_ci} 2151cb0ef41Sopenharmony_ci 2161cb0ef41Sopenharmony_ciusing UC16Range = uint32_t; // {from, to} packed into one uint32_t. 2171cb0ef41Sopenharmony_ciconstexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) { 2181cb0ef41Sopenharmony_ci return (static_cast<uint32_t>(from) << 16) | to; 2191cb0ef41Sopenharmony_ci} 2201cb0ef41Sopenharmony_ciconstexpr base::uc16 ExtractFrom(UC16Range r) { 2211cb0ef41Sopenharmony_ci return static_cast<base::uc16>(r >> 16); 2221cb0ef41Sopenharmony_ci} 2231cb0ef41Sopenharmony_ciconstexpr base::uc16 ExtractTo(UC16Range r) { 2241cb0ef41Sopenharmony_ci return static_cast<base::uc16>(r); 2251cb0ef41Sopenharmony_ci} 2261cb0ef41Sopenharmony_ci 2271cb0ef41Sopenharmony_civoid AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, 2281cb0ef41Sopenharmony_ci RegExpNode* on_success, 2291cb0ef41Sopenharmony_ci UnicodeRangeSplitter* splitter) { 2301cb0ef41Sopenharmony_ci DCHECK(!compiler->one_byte()); 2311cb0ef41Sopenharmony_ci Zone* const zone = compiler->zone(); 2321cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* non_bmp = 2331cb0ef41Sopenharmony_ci ToCanonicalZoneList(splitter->non_bmp(), zone); 2341cb0ef41Sopenharmony_ci if (non_bmp == nullptr) return; 2351cb0ef41Sopenharmony_ci 2361cb0ef41Sopenharmony_ci // Translate each 32-bit code point range into the corresponding 16-bit code 2371cb0ef41Sopenharmony_ci // unit representation consisting of the lead- and trail surrogate. 2381cb0ef41Sopenharmony_ci // 2391cb0ef41Sopenharmony_ci // The generated alternatives are grouped by the leading surrogate to avoid 2401cb0ef41Sopenharmony_ci // emitting excessive code. For example, for 2411cb0ef41Sopenharmony_ci // 2421cb0ef41Sopenharmony_ci // { \ud800[\udc00-\udc01] 2431cb0ef41Sopenharmony_ci // , \ud800[\udc05-\udc06] 2441cb0ef41Sopenharmony_ci // } 2451cb0ef41Sopenharmony_ci // 2461cb0ef41Sopenharmony_ci // there's no need to emit matching code for the leading surrogate \ud800 2471cb0ef41Sopenharmony_ci // twice. We also create a dedicated grouping for full trailing ranges, i.e. 2481cb0ef41Sopenharmony_ci // [dc00-dfff]. 2491cb0ef41Sopenharmony_ci ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading( 2501cb0ef41Sopenharmony_ci zone); 2511cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* leading_with_full_trailing_range = 2521cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(1, zone); 2531cb0ef41Sopenharmony_ci const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l, 2541cb0ef41Sopenharmony_ci base::uc16 from_t, base::uc16 to_t) { 2551cb0ef41Sopenharmony_ci const UC16Range leading_range = ToUC16Range(from_l, to_l); 2561cb0ef41Sopenharmony_ci if (grouped_by_leading.count(leading_range) == 0) { 2571cb0ef41Sopenharmony_ci if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) { 2581cb0ef41Sopenharmony_ci leading_with_full_trailing_range->Add( 2591cb0ef41Sopenharmony_ci CharacterRange::Range(from_l, to_l), zone); 2601cb0ef41Sopenharmony_ci return; 2611cb0ef41Sopenharmony_ci } 2621cb0ef41Sopenharmony_ci grouped_by_leading[leading_range] = 2631cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(2, zone); 2641cb0ef41Sopenharmony_ci } 2651cb0ef41Sopenharmony_ci grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t), 2661cb0ef41Sopenharmony_ci zone); 2671cb0ef41Sopenharmony_ci }; 2681cb0ef41Sopenharmony_ci 2691cb0ef41Sopenharmony_ci // First, create the grouped ranges. 2701cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(non_bmp); 2711cb0ef41Sopenharmony_ci for (int i = 0; i < non_bmp->length(); i++) { 2721cb0ef41Sopenharmony_ci // Match surrogate pair. 2731cb0ef41Sopenharmony_ci // E.g. [\u10005-\u11005] becomes 2741cb0ef41Sopenharmony_ci // \ud800[\udc05-\udfff]| 2751cb0ef41Sopenharmony_ci // [\ud801-\ud803][\udc00-\udfff]| 2761cb0ef41Sopenharmony_ci // \ud804[\udc00-\udc05] 2771cb0ef41Sopenharmony_ci base::uc32 from = non_bmp->at(i).from(); 2781cb0ef41Sopenharmony_ci base::uc32 to = non_bmp->at(i).to(); 2791cb0ef41Sopenharmony_ci base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from); 2801cb0ef41Sopenharmony_ci base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from); 2811cb0ef41Sopenharmony_ci base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to); 2821cb0ef41Sopenharmony_ci base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to); 2831cb0ef41Sopenharmony_ci 2841cb0ef41Sopenharmony_ci if (from_l == to_l) { 2851cb0ef41Sopenharmony_ci // The lead surrogate is the same. 2861cb0ef41Sopenharmony_ci AddRange(from_l, to_l, from_t, to_t); 2871cb0ef41Sopenharmony_ci continue; 2881cb0ef41Sopenharmony_ci } 2891cb0ef41Sopenharmony_ci 2901cb0ef41Sopenharmony_ci if (from_t != kTrailSurrogateStart) { 2911cb0ef41Sopenharmony_ci // Add [from_l][from_t-\udfff]. 2921cb0ef41Sopenharmony_ci AddRange(from_l, from_l, from_t, kTrailSurrogateEnd); 2931cb0ef41Sopenharmony_ci from_l++; 2941cb0ef41Sopenharmony_ci } 2951cb0ef41Sopenharmony_ci if (to_t != kTrailSurrogateEnd) { 2961cb0ef41Sopenharmony_ci // Add [to_l][\udc00-to_t]. 2971cb0ef41Sopenharmony_ci AddRange(to_l, to_l, kTrailSurrogateStart, to_t); 2981cb0ef41Sopenharmony_ci to_l--; 2991cb0ef41Sopenharmony_ci } 3001cb0ef41Sopenharmony_ci if (from_l <= to_l) { 3011cb0ef41Sopenharmony_ci // Add [from_l-to_l][\udc00-\udfff]. 3021cb0ef41Sopenharmony_ci AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd); 3031cb0ef41Sopenharmony_ci } 3041cb0ef41Sopenharmony_ci } 3051cb0ef41Sopenharmony_ci 3061cb0ef41Sopenharmony_ci // Create the actual TextNode now that ranges are fully grouped. 3071cb0ef41Sopenharmony_ci if (!leading_with_full_trailing_range->is_empty()) { 3081cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(leading_with_full_trailing_range); 3091cb0ef41Sopenharmony_ci result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( 3101cb0ef41Sopenharmony_ci zone, leading_with_full_trailing_range, 3111cb0ef41Sopenharmony_ci CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 3121cb0ef41Sopenharmony_ci compiler->read_backward(), on_success))); 3131cb0ef41Sopenharmony_ci } 3141cb0ef41Sopenharmony_ci for (const auto& it : grouped_by_leading) { 3151cb0ef41Sopenharmony_ci CharacterRange leading_range = 3161cb0ef41Sopenharmony_ci CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first)); 3171cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* trailing_ranges = it.second; 3181cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(trailing_ranges); 3191cb0ef41Sopenharmony_ci result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( 3201cb0ef41Sopenharmony_ci zone, leading_range, trailing_ranges, compiler->read_backward(), 3211cb0ef41Sopenharmony_ci on_success))); 3221cb0ef41Sopenharmony_ci } 3231cb0ef41Sopenharmony_ci} 3241cb0ef41Sopenharmony_ci 3251cb0ef41Sopenharmony_ciRegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( 3261cb0ef41Sopenharmony_ci RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, 3271cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* match, RegExpNode* on_success, 3281cb0ef41Sopenharmony_ci bool read_backward) { 3291cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 3301cb0ef41Sopenharmony_ci RegExpNode* match_node = TextNode::CreateForCharacterRanges( 3311cb0ef41Sopenharmony_ci zone, match, read_backward, on_success); 3321cb0ef41Sopenharmony_ci int stack_register = compiler->UnicodeLookaroundStackRegister(); 3331cb0ef41Sopenharmony_ci int position_register = compiler->UnicodeLookaroundPositionRegister(); 3341cb0ef41Sopenharmony_ci RegExpLookaround::Builder lookaround(false, match_node, stack_register, 3351cb0ef41Sopenharmony_ci position_register); 3361cb0ef41Sopenharmony_ci RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 3371cb0ef41Sopenharmony_ci zone, lookbehind, !read_backward, lookaround.on_match_success()); 3381cb0ef41Sopenharmony_ci return lookaround.ForMatch(negative_match); 3391cb0ef41Sopenharmony_ci} 3401cb0ef41Sopenharmony_ci 3411cb0ef41Sopenharmony_ciRegExpNode* MatchAndNegativeLookaroundInReadDirection( 3421cb0ef41Sopenharmony_ci RegExpCompiler* compiler, ZoneList<CharacterRange>* match, 3431cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, 3441cb0ef41Sopenharmony_ci bool read_backward) { 3451cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 3461cb0ef41Sopenharmony_ci int stack_register = compiler->UnicodeLookaroundStackRegister(); 3471cb0ef41Sopenharmony_ci int position_register = compiler->UnicodeLookaroundPositionRegister(); 3481cb0ef41Sopenharmony_ci RegExpLookaround::Builder lookaround(false, on_success, stack_register, 3491cb0ef41Sopenharmony_ci position_register); 3501cb0ef41Sopenharmony_ci RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 3511cb0ef41Sopenharmony_ci zone, lookahead, read_backward, lookaround.on_match_success()); 3521cb0ef41Sopenharmony_ci return TextNode::CreateForCharacterRanges( 3531cb0ef41Sopenharmony_ci zone, match, read_backward, lookaround.ForMatch(negative_match)); 3541cb0ef41Sopenharmony_ci} 3551cb0ef41Sopenharmony_ci 3561cb0ef41Sopenharmony_civoid AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 3571cb0ef41Sopenharmony_ci RegExpNode* on_success, 3581cb0ef41Sopenharmony_ci UnicodeRangeSplitter* splitter) { 3591cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* lead_surrogates = 3601cb0ef41Sopenharmony_ci ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone()); 3611cb0ef41Sopenharmony_ci if (lead_surrogates == nullptr) return; 3621cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 3631cb0ef41Sopenharmony_ci // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 3641cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 3651cb0ef41Sopenharmony_ci zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 3661cb0ef41Sopenharmony_ci 3671cb0ef41Sopenharmony_ci RegExpNode* match; 3681cb0ef41Sopenharmony_ci if (compiler->read_backward()) { 3691cb0ef41Sopenharmony_ci // Reading backward. Assert that reading forward, there is no trail 3701cb0ef41Sopenharmony_ci // surrogate, and then backward match the lead surrogate. 3711cb0ef41Sopenharmony_ci match = NegativeLookaroundAgainstReadDirectionAndMatch( 3721cb0ef41Sopenharmony_ci compiler, trail_surrogates, lead_surrogates, on_success, true); 3731cb0ef41Sopenharmony_ci } else { 3741cb0ef41Sopenharmony_ci // Reading forward. Forward match the lead surrogate and assert that 3751cb0ef41Sopenharmony_ci // no trail surrogate follows. 3761cb0ef41Sopenharmony_ci match = MatchAndNegativeLookaroundInReadDirection( 3771cb0ef41Sopenharmony_ci compiler, lead_surrogates, trail_surrogates, on_success, false); 3781cb0ef41Sopenharmony_ci } 3791cb0ef41Sopenharmony_ci result->AddAlternative(GuardedAlternative(match)); 3801cb0ef41Sopenharmony_ci} 3811cb0ef41Sopenharmony_ci 3821cb0ef41Sopenharmony_civoid AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 3831cb0ef41Sopenharmony_ci RegExpNode* on_success, 3841cb0ef41Sopenharmony_ci UnicodeRangeSplitter* splitter) { 3851cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* trail_surrogates = 3861cb0ef41Sopenharmony_ci ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone()); 3871cb0ef41Sopenharmony_ci if (trail_surrogates == nullptr) return; 3881cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 3891cb0ef41Sopenharmony_ci // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 3901cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 3911cb0ef41Sopenharmony_ci zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 3921cb0ef41Sopenharmony_ci 3931cb0ef41Sopenharmony_ci RegExpNode* match; 3941cb0ef41Sopenharmony_ci if (compiler->read_backward()) { 3951cb0ef41Sopenharmony_ci // Reading backward. Backward match the trail surrogate and assert that no 3961cb0ef41Sopenharmony_ci // lead surrogate precedes it. 3971cb0ef41Sopenharmony_ci match = MatchAndNegativeLookaroundInReadDirection( 3981cb0ef41Sopenharmony_ci compiler, trail_surrogates, lead_surrogates, on_success, true); 3991cb0ef41Sopenharmony_ci } else { 4001cb0ef41Sopenharmony_ci // Reading forward. Assert that reading backward, there is no lead 4011cb0ef41Sopenharmony_ci // surrogate, and then forward match the trail surrogate. 4021cb0ef41Sopenharmony_ci match = NegativeLookaroundAgainstReadDirectionAndMatch( 4031cb0ef41Sopenharmony_ci compiler, lead_surrogates, trail_surrogates, on_success, false); 4041cb0ef41Sopenharmony_ci } 4051cb0ef41Sopenharmony_ci result->AddAlternative(GuardedAlternative(match)); 4061cb0ef41Sopenharmony_ci} 4071cb0ef41Sopenharmony_ci 4081cb0ef41Sopenharmony_ciRegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, 4091cb0ef41Sopenharmony_ci RegExpNode* on_success) { 4101cb0ef41Sopenharmony_ci // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. 4111cb0ef41Sopenharmony_ci DCHECK(!compiler->read_backward()); 4121cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 4131cb0ef41Sopenharmony_ci // Advance any character. If the character happens to be a lead surrogate and 4141cb0ef41Sopenharmony_ci // we advanced into the middle of a surrogate pair, it will work out, as 4151cb0ef41Sopenharmony_ci // nothing will match from there. We will have to advance again, consuming 4161cb0ef41Sopenharmony_ci // the associated trail surrogate. 4171cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* range = 4181cb0ef41Sopenharmony_ci CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit)); 4191cb0ef41Sopenharmony_ci return TextNode::CreateForCharacterRanges(zone, range, false, on_success); 4201cb0ef41Sopenharmony_ci} 4211cb0ef41Sopenharmony_ci 4221cb0ef41Sopenharmony_civoid AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) { 4231cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 4241cb0ef41Sopenharmony_ci DCHECK(CharacterRange::IsCanonical(ranges)); 4251cb0ef41Sopenharmony_ci 4261cb0ef41Sopenharmony_ci // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. 4271cb0ef41Sopenharmony_ci // See also https://crbug.com/v8/6727. 4281cb0ef41Sopenharmony_ci // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range, 4291cb0ef41Sopenharmony_ci // which we use frequently internally. But large ranges can also easily be 4301cb0ef41Sopenharmony_ci // created by the user. We might want to have a more general caching mechanism 4311cb0ef41Sopenharmony_ci // for such ranges. 4321cb0ef41Sopenharmony_ci if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return; 4331cb0ef41Sopenharmony_ci 4341cb0ef41Sopenharmony_ci // Use ICU to compute the case fold closure over the ranges. 4351cb0ef41Sopenharmony_ci icu::UnicodeSet set; 4361cb0ef41Sopenharmony_ci for (int i = 0; i < ranges->length(); i++) { 4371cb0ef41Sopenharmony_ci set.add(ranges->at(i).from(), ranges->at(i).to()); 4381cb0ef41Sopenharmony_ci } 4391cb0ef41Sopenharmony_ci // Clear the ranges list without freeing the backing store. 4401cb0ef41Sopenharmony_ci ranges->Rewind(0); 4411cb0ef41Sopenharmony_ci set.closeOver(USET_CASE_INSENSITIVE); 4421cb0ef41Sopenharmony_ci // Full case mapping map single characters to multiple characters. 4431cb0ef41Sopenharmony_ci // Those are represented as strings in the set. Remove them so that 4441cb0ef41Sopenharmony_ci // we end up with only simple and common case mappings. 4451cb0ef41Sopenharmony_ci set.removeAllStrings(); 4461cb0ef41Sopenharmony_ci for (int i = 0; i < set.getRangeCount(); i++) { 4471cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 4481cb0ef41Sopenharmony_ci zone); 4491cb0ef41Sopenharmony_ci } 4501cb0ef41Sopenharmony_ci // No errors and everything we collected have been ranges. 4511cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(ranges); 4521cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 4531cb0ef41Sopenharmony_ci} 4541cb0ef41Sopenharmony_ci 4551cb0ef41Sopenharmony_ci} // namespace 4561cb0ef41Sopenharmony_ci 4571cb0ef41Sopenharmony_ciRegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4581cb0ef41Sopenharmony_ci RegExpNode* on_success) { 4591cb0ef41Sopenharmony_ci set_.Canonicalize(); 4601cb0ef41Sopenharmony_ci Zone* const zone = compiler->zone(); 4611cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* ranges = this->ranges(zone); 4621cb0ef41Sopenharmony_ci 4631cb0ef41Sopenharmony_ci if (NeedsUnicodeCaseEquivalents(compiler->flags())) { 4641cb0ef41Sopenharmony_ci AddUnicodeCaseEquivalents(ranges, zone); 4651cb0ef41Sopenharmony_ci } 4661cb0ef41Sopenharmony_ci 4671cb0ef41Sopenharmony_ci if (!IsUnicode(compiler->flags()) || compiler->one_byte() || 4681cb0ef41Sopenharmony_ci contains_split_surrogate()) { 4691cb0ef41Sopenharmony_ci return zone->New<TextNode>(this, compiler->read_backward(), on_success); 4701cb0ef41Sopenharmony_ci } 4711cb0ef41Sopenharmony_ci 4721cb0ef41Sopenharmony_ci if (is_negated()) { 4731cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* negated = 4741cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(2, zone); 4751cb0ef41Sopenharmony_ci CharacterRange::Negate(ranges, negated, zone); 4761cb0ef41Sopenharmony_ci ranges = negated; 4771cb0ef41Sopenharmony_ci } 4781cb0ef41Sopenharmony_ci 4791cb0ef41Sopenharmony_ci if (ranges->length() == 0) { 4801cb0ef41Sopenharmony_ci // The empty character class is used as a 'fail' node. 4811cb0ef41Sopenharmony_ci RegExpCharacterClass* fail = zone->New<RegExpCharacterClass>(zone, ranges); 4821cb0ef41Sopenharmony_ci return zone->New<TextNode>(fail, compiler->read_backward(), on_success); 4831cb0ef41Sopenharmony_ci } 4841cb0ef41Sopenharmony_ci 4851cb0ef41Sopenharmony_ci if (set_.is_standard() && 4861cb0ef41Sopenharmony_ci standard_type() == StandardCharacterSet::kEverything) { 4871cb0ef41Sopenharmony_ci return UnanchoredAdvance(compiler, on_success); 4881cb0ef41Sopenharmony_ci } 4891cb0ef41Sopenharmony_ci 4901cb0ef41Sopenharmony_ci // Split ranges in order to handle surrogates correctly: 4911cb0ef41Sopenharmony_ci // - Surrogate pairs: translate the 32-bit code point into two uc16 code 4921cb0ef41Sopenharmony_ci // units (irregexp operates only on code units). 4931cb0ef41Sopenharmony_ci // - Lone surrogates: these require lookarounds to ensure we don't match in 4941cb0ef41Sopenharmony_ci // the middle of a surrogate pair. 4951cb0ef41Sopenharmony_ci ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 4961cb0ef41Sopenharmony_ci UnicodeRangeSplitter splitter(ranges); 4971cb0ef41Sopenharmony_ci AddBmpCharacters(compiler, result, on_success, &splitter); 4981cb0ef41Sopenharmony_ci AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 4991cb0ef41Sopenharmony_ci AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5001cb0ef41Sopenharmony_ci AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 5011cb0ef41Sopenharmony_ci 5021cb0ef41Sopenharmony_ci static constexpr int kMaxRangesToInline = 32; // Arbitrary. 5031cb0ef41Sopenharmony_ci if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline(); 5041cb0ef41Sopenharmony_ci 5051cb0ef41Sopenharmony_ci return result; 5061cb0ef41Sopenharmony_ci} 5071cb0ef41Sopenharmony_ci 5081cb0ef41Sopenharmony_cinamespace { 5091cb0ef41Sopenharmony_ci 5101cb0ef41Sopenharmony_ciint CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5111cb0ef41Sopenharmony_ci RegExpAtom* atom1 = (*a)->AsAtom(); 5121cb0ef41Sopenharmony_ci RegExpAtom* atom2 = (*b)->AsAtom(); 5131cb0ef41Sopenharmony_ci base::uc16 character1 = atom1->data().at(0); 5141cb0ef41Sopenharmony_ci base::uc16 character2 = atom2->data().at(0); 5151cb0ef41Sopenharmony_ci if (character1 < character2) return -1; 5161cb0ef41Sopenharmony_ci if (character1 > character2) return 1; 5171cb0ef41Sopenharmony_ci return 0; 5181cb0ef41Sopenharmony_ci} 5191cb0ef41Sopenharmony_ci 5201cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 5211cb0ef41Sopenharmony_ci 5221cb0ef41Sopenharmony_ciint CompareCaseInsensitive(const icu::UnicodeString& a, 5231cb0ef41Sopenharmony_ci const icu::UnicodeString& b) { 5241cb0ef41Sopenharmony_ci return a.caseCompare(b, U_FOLD_CASE_DEFAULT); 5251cb0ef41Sopenharmony_ci} 5261cb0ef41Sopenharmony_ci 5271cb0ef41Sopenharmony_ciint CompareFirstCharCaseInsensitive(RegExpTree* const* a, 5281cb0ef41Sopenharmony_ci RegExpTree* const* b) { 5291cb0ef41Sopenharmony_ci RegExpAtom* atom1 = (*a)->AsAtom(); 5301cb0ef41Sopenharmony_ci RegExpAtom* atom2 = (*b)->AsAtom(); 5311cb0ef41Sopenharmony_ci return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)}, 5321cb0ef41Sopenharmony_ci icu::UnicodeString{atom2->data().at(0)}); 5331cb0ef41Sopenharmony_ci} 5341cb0ef41Sopenharmony_ci 5351cb0ef41Sopenharmony_cibool Equals(bool ignore_case, const icu::UnicodeString& a, 5361cb0ef41Sopenharmony_ci const icu::UnicodeString& b) { 5371cb0ef41Sopenharmony_ci if (a == b) return true; 5381cb0ef41Sopenharmony_ci if (ignore_case) return CompareCaseInsensitive(a, b) == 0; 5391cb0ef41Sopenharmony_ci return false; // Case-sensitive equality already checked above. 5401cb0ef41Sopenharmony_ci} 5411cb0ef41Sopenharmony_ci 5421cb0ef41Sopenharmony_cibool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a, 5431cb0ef41Sopenharmony_ci const RegExpAtom* b) { 5441cb0ef41Sopenharmony_ci return Equals(ignore_case, a->data().at(index), b->data().at(index)); 5451cb0ef41Sopenharmony_ci} 5461cb0ef41Sopenharmony_ci 5471cb0ef41Sopenharmony_ci#else 5481cb0ef41Sopenharmony_ci 5491cb0ef41Sopenharmony_ciunibrow::uchar Canonical( 5501cb0ef41Sopenharmony_ci unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5511cb0ef41Sopenharmony_ci unibrow::uchar c) { 5521cb0ef41Sopenharmony_ci unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 5531cb0ef41Sopenharmony_ci int length = canonicalize->get(c, '\0', chars); 5541cb0ef41Sopenharmony_ci DCHECK_LE(length, 1); 5551cb0ef41Sopenharmony_ci unibrow::uchar canonical = c; 5561cb0ef41Sopenharmony_ci if (length == 1) canonical = chars[0]; 5571cb0ef41Sopenharmony_ci return canonical; 5581cb0ef41Sopenharmony_ci} 5591cb0ef41Sopenharmony_ci 5601cb0ef41Sopenharmony_ciint CompareCaseInsensitive( 5611cb0ef41Sopenharmony_ci unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5621cb0ef41Sopenharmony_ci unibrow::uchar a, unibrow::uchar b) { 5631cb0ef41Sopenharmony_ci if (a == b) return 0; 5641cb0ef41Sopenharmony_ci if (a >= 'a' || b >= 'a') { 5651cb0ef41Sopenharmony_ci a = Canonical(canonicalize, a); 5661cb0ef41Sopenharmony_ci b = Canonical(canonicalize, b); 5671cb0ef41Sopenharmony_ci } 5681cb0ef41Sopenharmony_ci return static_cast<int>(a) - static_cast<int>(b); 5691cb0ef41Sopenharmony_ci} 5701cb0ef41Sopenharmony_ci 5711cb0ef41Sopenharmony_ciint CompareFirstCharCaseInsensitive( 5721cb0ef41Sopenharmony_ci unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5731cb0ef41Sopenharmony_ci RegExpTree* const* a, RegExpTree* const* b) { 5741cb0ef41Sopenharmony_ci RegExpAtom* atom1 = (*a)->AsAtom(); 5751cb0ef41Sopenharmony_ci RegExpAtom* atom2 = (*b)->AsAtom(); 5761cb0ef41Sopenharmony_ci return CompareCaseInsensitive(canonicalize, atom1->data().at(0), 5771cb0ef41Sopenharmony_ci atom2->data().at(0)); 5781cb0ef41Sopenharmony_ci} 5791cb0ef41Sopenharmony_ci 5801cb0ef41Sopenharmony_cibool Equals(bool ignore_case, 5811cb0ef41Sopenharmony_ci unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5821cb0ef41Sopenharmony_ci unibrow::uchar a, unibrow::uchar b) { 5831cb0ef41Sopenharmony_ci if (a == b) return true; 5841cb0ef41Sopenharmony_ci if (ignore_case) { 5851cb0ef41Sopenharmony_ci return CompareCaseInsensitive(canonicalize, a, b) == 0; 5861cb0ef41Sopenharmony_ci } 5871cb0ef41Sopenharmony_ci return false; // Case-sensitive equality already checked above. 5881cb0ef41Sopenharmony_ci} 5891cb0ef41Sopenharmony_ci 5901cb0ef41Sopenharmony_cibool CharAtEquals(bool ignore_case, 5911cb0ef41Sopenharmony_ci unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5921cb0ef41Sopenharmony_ci int index, const RegExpAtom* a, const RegExpAtom* b) { 5931cb0ef41Sopenharmony_ci return Equals(ignore_case, canonicalize, a->data().at(index), 5941cb0ef41Sopenharmony_ci b->data().at(index)); 5951cb0ef41Sopenharmony_ci} 5961cb0ef41Sopenharmony_ci 5971cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 5981cb0ef41Sopenharmony_ci 5991cb0ef41Sopenharmony_ci} // namespace 6001cb0ef41Sopenharmony_ci 6011cb0ef41Sopenharmony_ci// We can stable sort runs of atoms, since the order does not matter if they 6021cb0ef41Sopenharmony_ci// start with different characters. 6031cb0ef41Sopenharmony_ci// Returns true if any consecutive atoms were found. 6041cb0ef41Sopenharmony_cibool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 6051cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* alternatives = this->alternatives(); 6061cb0ef41Sopenharmony_ci int length = alternatives->length(); 6071cb0ef41Sopenharmony_ci bool found_consecutive_atoms = false; 6081cb0ef41Sopenharmony_ci for (int i = 0; i < length; i++) { 6091cb0ef41Sopenharmony_ci while (i < length) { 6101cb0ef41Sopenharmony_ci RegExpTree* alternative = alternatives->at(i); 6111cb0ef41Sopenharmony_ci if (alternative->IsAtom()) break; 6121cb0ef41Sopenharmony_ci i++; 6131cb0ef41Sopenharmony_ci } 6141cb0ef41Sopenharmony_ci // i is length or it is the index of an atom. 6151cb0ef41Sopenharmony_ci if (i == length) break; 6161cb0ef41Sopenharmony_ci int first_atom = i; 6171cb0ef41Sopenharmony_ci i++; 6181cb0ef41Sopenharmony_ci while (i < length) { 6191cb0ef41Sopenharmony_ci RegExpTree* alternative = alternatives->at(i); 6201cb0ef41Sopenharmony_ci if (!alternative->IsAtom()) break; 6211cb0ef41Sopenharmony_ci i++; 6221cb0ef41Sopenharmony_ci } 6231cb0ef41Sopenharmony_ci // Sort atoms to get ones with common prefixes together. 6241cb0ef41Sopenharmony_ci // This step is more tricky if we are in a case-independent regexp, 6251cb0ef41Sopenharmony_ci // because it would change /is|I/ to /I|is/, and order matters when 6261cb0ef41Sopenharmony_ci // the regexp parts don't match only disjoint starting points. To fix 6271cb0ef41Sopenharmony_ci // this we have a version of CompareFirstChar that uses case- 6281cb0ef41Sopenharmony_ci // independent character classes for comparison. 6291cb0ef41Sopenharmony_ci DCHECK_LT(first_atom, alternatives->length()); 6301cb0ef41Sopenharmony_ci DCHECK_LE(i, alternatives->length()); 6311cb0ef41Sopenharmony_ci DCHECK_LE(first_atom, i); 6321cb0ef41Sopenharmony_ci if (IsIgnoreCase(compiler->flags())) { 6331cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 6341cb0ef41Sopenharmony_ci alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom, 6351cb0ef41Sopenharmony_ci i - first_atom); 6361cb0ef41Sopenharmony_ci#else 6371cb0ef41Sopenharmony_ci unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 6381cb0ef41Sopenharmony_ci compiler->isolate()->regexp_macro_assembler_canonicalize(); 6391cb0ef41Sopenharmony_ci auto compare_closure = [canonicalize](RegExpTree* const* a, 6401cb0ef41Sopenharmony_ci RegExpTree* const* b) { 6411cb0ef41Sopenharmony_ci return CompareFirstCharCaseInsensitive(canonicalize, a, b); 6421cb0ef41Sopenharmony_ci }; 6431cb0ef41Sopenharmony_ci alternatives->StableSort(compare_closure, first_atom, i - first_atom); 6441cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 6451cb0ef41Sopenharmony_ci } else { 6461cb0ef41Sopenharmony_ci alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 6471cb0ef41Sopenharmony_ci } 6481cb0ef41Sopenharmony_ci if (i - first_atom > 1) found_consecutive_atoms = true; 6491cb0ef41Sopenharmony_ci } 6501cb0ef41Sopenharmony_ci return found_consecutive_atoms; 6511cb0ef41Sopenharmony_ci} 6521cb0ef41Sopenharmony_ci 6531cb0ef41Sopenharmony_ci// Optimizes ab|ac|az to a(?:b|c|d). 6541cb0ef41Sopenharmony_civoid RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 6551cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 6561cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* alternatives = this->alternatives(); 6571cb0ef41Sopenharmony_ci int length = alternatives->length(); 6581cb0ef41Sopenharmony_ci const bool ignore_case = IsIgnoreCase(compiler->flags()); 6591cb0ef41Sopenharmony_ci 6601cb0ef41Sopenharmony_ci int write_posn = 0; 6611cb0ef41Sopenharmony_ci int i = 0; 6621cb0ef41Sopenharmony_ci while (i < length) { 6631cb0ef41Sopenharmony_ci RegExpTree* alternative = alternatives->at(i); 6641cb0ef41Sopenharmony_ci if (!alternative->IsAtom()) { 6651cb0ef41Sopenharmony_ci alternatives->at(write_posn++) = alternatives->at(i); 6661cb0ef41Sopenharmony_ci i++; 6671cb0ef41Sopenharmony_ci continue; 6681cb0ef41Sopenharmony_ci } 6691cb0ef41Sopenharmony_ci RegExpAtom* const atom = alternative->AsAtom(); 6701cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 6711cb0ef41Sopenharmony_ci icu::UnicodeString common_prefix(atom->data().at(0)); 6721cb0ef41Sopenharmony_ci#else 6731cb0ef41Sopenharmony_ci unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize = 6741cb0ef41Sopenharmony_ci compiler->isolate()->regexp_macro_assembler_canonicalize(); 6751cb0ef41Sopenharmony_ci unibrow::uchar common_prefix = atom->data().at(0); 6761cb0ef41Sopenharmony_ci if (ignore_case) { 6771cb0ef41Sopenharmony_ci common_prefix = Canonical(canonicalize, common_prefix); 6781cb0ef41Sopenharmony_ci } 6791cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 6801cb0ef41Sopenharmony_ci int first_with_prefix = i; 6811cb0ef41Sopenharmony_ci int prefix_length = atom->length(); 6821cb0ef41Sopenharmony_ci i++; 6831cb0ef41Sopenharmony_ci while (i < length) { 6841cb0ef41Sopenharmony_ci alternative = alternatives->at(i); 6851cb0ef41Sopenharmony_ci if (!alternative->IsAtom()) break; 6861cb0ef41Sopenharmony_ci RegExpAtom* const alt_atom = alternative->AsAtom(); 6871cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 6881cb0ef41Sopenharmony_ci icu::UnicodeString new_prefix(alt_atom->data().at(0)); 6891cb0ef41Sopenharmony_ci if (!Equals(ignore_case, new_prefix, common_prefix)) break; 6901cb0ef41Sopenharmony_ci#else 6911cb0ef41Sopenharmony_ci unibrow::uchar new_prefix = alt_atom->data().at(0); 6921cb0ef41Sopenharmony_ci if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break; 6931cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 6941cb0ef41Sopenharmony_ci prefix_length = std::min(prefix_length, alt_atom->length()); 6951cb0ef41Sopenharmony_ci i++; 6961cb0ef41Sopenharmony_ci } 6971cb0ef41Sopenharmony_ci if (i > first_with_prefix + 2) { 6981cb0ef41Sopenharmony_ci // Found worthwhile run of alternatives with common prefix of at least one 6991cb0ef41Sopenharmony_ci // character. The sorting function above did not sort on more than one 7001cb0ef41Sopenharmony_ci // character for reasons of correctness, but there may still be a longer 7011cb0ef41Sopenharmony_ci // common prefix if the terms were similar or presorted in the input. 7021cb0ef41Sopenharmony_ci // Find out how long the common prefix is. 7031cb0ef41Sopenharmony_ci int run_length = i - first_with_prefix; 7041cb0ef41Sopenharmony_ci RegExpAtom* const alt_atom = 7051cb0ef41Sopenharmony_ci alternatives->at(first_with_prefix)->AsAtom(); 7061cb0ef41Sopenharmony_ci for (int j = 1; j < run_length && prefix_length > 1; j++) { 7071cb0ef41Sopenharmony_ci RegExpAtom* old_atom = 7081cb0ef41Sopenharmony_ci alternatives->at(j + first_with_prefix)->AsAtom(); 7091cb0ef41Sopenharmony_ci for (int k = 1; k < prefix_length; k++) { 7101cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 7111cb0ef41Sopenharmony_ci if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) { 7121cb0ef41Sopenharmony_ci#else 7131cb0ef41Sopenharmony_ci if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) { 7141cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 7151cb0ef41Sopenharmony_ci prefix_length = k; 7161cb0ef41Sopenharmony_ci break; 7171cb0ef41Sopenharmony_ci } 7181cb0ef41Sopenharmony_ci } 7191cb0ef41Sopenharmony_ci } 7201cb0ef41Sopenharmony_ci RegExpAtom* prefix = 7211cb0ef41Sopenharmony_ci zone->New<RegExpAtom>(alt_atom->data().SubVector(0, prefix_length)); 7221cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone); 7231cb0ef41Sopenharmony_ci pair->Add(prefix, zone); 7241cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* suffixes = 7251cb0ef41Sopenharmony_ci zone->New<ZoneList<RegExpTree*>>(run_length, zone); 7261cb0ef41Sopenharmony_ci for (int j = 0; j < run_length; j++) { 7271cb0ef41Sopenharmony_ci RegExpAtom* old_atom = 7281cb0ef41Sopenharmony_ci alternatives->at(j + first_with_prefix)->AsAtom(); 7291cb0ef41Sopenharmony_ci int len = old_atom->length(); 7301cb0ef41Sopenharmony_ci if (len == prefix_length) { 7311cb0ef41Sopenharmony_ci suffixes->Add(zone->New<RegExpEmpty>(), zone); 7321cb0ef41Sopenharmony_ci } else { 7331cb0ef41Sopenharmony_ci RegExpTree* suffix = zone->New<RegExpAtom>( 7341cb0ef41Sopenharmony_ci old_atom->data().SubVector(prefix_length, old_atom->length())); 7351cb0ef41Sopenharmony_ci suffixes->Add(suffix, zone); 7361cb0ef41Sopenharmony_ci } 7371cb0ef41Sopenharmony_ci } 7381cb0ef41Sopenharmony_ci pair->Add(zone->New<RegExpDisjunction>(suffixes), zone); 7391cb0ef41Sopenharmony_ci alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair); 7401cb0ef41Sopenharmony_ci } else { 7411cb0ef41Sopenharmony_ci // Just copy any non-worthwhile alternatives. 7421cb0ef41Sopenharmony_ci for (int j = first_with_prefix; j < i; j++) { 7431cb0ef41Sopenharmony_ci alternatives->at(write_posn++) = alternatives->at(j); 7441cb0ef41Sopenharmony_ci } 7451cb0ef41Sopenharmony_ci } 7461cb0ef41Sopenharmony_ci } 7471cb0ef41Sopenharmony_ci alternatives->Rewind(write_posn); // Trim end of array. 7481cb0ef41Sopenharmony_ci} 7491cb0ef41Sopenharmony_ci 7501cb0ef41Sopenharmony_ci// Optimizes b|c|z to [bcz]. 7511cb0ef41Sopenharmony_civoid RegExpDisjunction::FixSingleCharacterDisjunctions( 7521cb0ef41Sopenharmony_ci RegExpCompiler* compiler) { 7531cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 7541cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* alternatives = this->alternatives(); 7551cb0ef41Sopenharmony_ci int length = alternatives->length(); 7561cb0ef41Sopenharmony_ci 7571cb0ef41Sopenharmony_ci int write_posn = 0; 7581cb0ef41Sopenharmony_ci int i = 0; 7591cb0ef41Sopenharmony_ci while (i < length) { 7601cb0ef41Sopenharmony_ci RegExpTree* alternative = alternatives->at(i); 7611cb0ef41Sopenharmony_ci if (!alternative->IsAtom()) { 7621cb0ef41Sopenharmony_ci alternatives->at(write_posn++) = alternatives->at(i); 7631cb0ef41Sopenharmony_ci i++; 7641cb0ef41Sopenharmony_ci continue; 7651cb0ef41Sopenharmony_ci } 7661cb0ef41Sopenharmony_ci RegExpAtom* const atom = alternative->AsAtom(); 7671cb0ef41Sopenharmony_ci if (atom->length() != 1) { 7681cb0ef41Sopenharmony_ci alternatives->at(write_posn++) = alternatives->at(i); 7691cb0ef41Sopenharmony_ci i++; 7701cb0ef41Sopenharmony_ci continue; 7711cb0ef41Sopenharmony_ci } 7721cb0ef41Sopenharmony_ci const RegExpFlags flags = compiler->flags(); 7731cb0ef41Sopenharmony_ci DCHECK_IMPLIES(IsUnicode(flags), 7741cb0ef41Sopenharmony_ci !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); 7751cb0ef41Sopenharmony_ci bool contains_trail_surrogate = 7761cb0ef41Sopenharmony_ci unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); 7771cb0ef41Sopenharmony_ci int first_in_run = i; 7781cb0ef41Sopenharmony_ci i++; 7791cb0ef41Sopenharmony_ci // Find a run of single-character atom alternatives that have identical 7801cb0ef41Sopenharmony_ci // flags (case independence and unicode-ness). 7811cb0ef41Sopenharmony_ci while (i < length) { 7821cb0ef41Sopenharmony_ci alternative = alternatives->at(i); 7831cb0ef41Sopenharmony_ci if (!alternative->IsAtom()) break; 7841cb0ef41Sopenharmony_ci RegExpAtom* const alt_atom = alternative->AsAtom(); 7851cb0ef41Sopenharmony_ci if (alt_atom->length() != 1) break; 7861cb0ef41Sopenharmony_ci DCHECK_IMPLIES(IsUnicode(flags), 7871cb0ef41Sopenharmony_ci !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0))); 7881cb0ef41Sopenharmony_ci contains_trail_surrogate |= 7891cb0ef41Sopenharmony_ci unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0)); 7901cb0ef41Sopenharmony_ci i++; 7911cb0ef41Sopenharmony_ci } 7921cb0ef41Sopenharmony_ci if (i > first_in_run + 1) { 7931cb0ef41Sopenharmony_ci // Found non-trivial run of single-character alternatives. 7941cb0ef41Sopenharmony_ci int run_length = i - first_in_run; 7951cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* ranges = 7961cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(2, zone); 7971cb0ef41Sopenharmony_ci for (int j = 0; j < run_length; j++) { 7981cb0ef41Sopenharmony_ci RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 7991cb0ef41Sopenharmony_ci DCHECK_EQ(old_atom->length(), 1); 8001cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 8011cb0ef41Sopenharmony_ci } 8021cb0ef41Sopenharmony_ci RegExpCharacterClass::CharacterClassFlags character_class_flags; 8031cb0ef41Sopenharmony_ci if (IsUnicode(flags) && contains_trail_surrogate) { 8041cb0ef41Sopenharmony_ci character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; 8051cb0ef41Sopenharmony_ci } 8061cb0ef41Sopenharmony_ci alternatives->at(write_posn++) = 8071cb0ef41Sopenharmony_ci zone->New<RegExpCharacterClass>(zone, ranges, character_class_flags); 8081cb0ef41Sopenharmony_ci } else { 8091cb0ef41Sopenharmony_ci // Just copy any trivial alternatives. 8101cb0ef41Sopenharmony_ci for (int j = first_in_run; j < i; j++) { 8111cb0ef41Sopenharmony_ci alternatives->at(write_posn++) = alternatives->at(j); 8121cb0ef41Sopenharmony_ci } 8131cb0ef41Sopenharmony_ci } 8141cb0ef41Sopenharmony_ci } 8151cb0ef41Sopenharmony_ci alternatives->Rewind(write_posn); // Trim end of array. 8161cb0ef41Sopenharmony_ci} 8171cb0ef41Sopenharmony_ci 8181cb0ef41Sopenharmony_ciRegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 8191cb0ef41Sopenharmony_ci RegExpNode* on_success) { 8201cb0ef41Sopenharmony_ci compiler->ToNodeMaybeCheckForStackOverflow(); 8211cb0ef41Sopenharmony_ci 8221cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* alternatives = this->alternatives(); 8231cb0ef41Sopenharmony_ci 8241cb0ef41Sopenharmony_ci if (alternatives->length() > 2) { 8251cb0ef41Sopenharmony_ci bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 8261cb0ef41Sopenharmony_ci if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 8271cb0ef41Sopenharmony_ci FixSingleCharacterDisjunctions(compiler); 8281cb0ef41Sopenharmony_ci if (alternatives->length() == 1) { 8291cb0ef41Sopenharmony_ci return alternatives->at(0)->ToNode(compiler, on_success); 8301cb0ef41Sopenharmony_ci } 8311cb0ef41Sopenharmony_ci } 8321cb0ef41Sopenharmony_ci 8331cb0ef41Sopenharmony_ci int length = alternatives->length(); 8341cb0ef41Sopenharmony_ci 8351cb0ef41Sopenharmony_ci ChoiceNode* result = 8361cb0ef41Sopenharmony_ci compiler->zone()->New<ChoiceNode>(length, compiler->zone()); 8371cb0ef41Sopenharmony_ci for (int i = 0; i < length; i++) { 8381cb0ef41Sopenharmony_ci GuardedAlternative alternative( 8391cb0ef41Sopenharmony_ci alternatives->at(i)->ToNode(compiler, on_success)); 8401cb0ef41Sopenharmony_ci result->AddAlternative(alternative); 8411cb0ef41Sopenharmony_ci } 8421cb0ef41Sopenharmony_ci return result; 8431cb0ef41Sopenharmony_ci} 8441cb0ef41Sopenharmony_ci 8451cb0ef41Sopenharmony_ciRegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 8461cb0ef41Sopenharmony_ci RegExpNode* on_success) { 8471cb0ef41Sopenharmony_ci return ToNode(min(), max(), is_greedy(), body(), compiler, on_success); 8481cb0ef41Sopenharmony_ci} 8491cb0ef41Sopenharmony_ci 8501cb0ef41Sopenharmony_cinamespace { 8511cb0ef41Sopenharmony_ci// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and 8521cb0ef41Sopenharmony_ci// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) 8531cb0ef41Sopenharmony_ciRegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, 8541cb0ef41Sopenharmony_ci RegExpNode* on_success, 8551cb0ef41Sopenharmony_ci RegExpAssertion::Type type, 8561cb0ef41Sopenharmony_ci RegExpFlags flags) { 8571cb0ef41Sopenharmony_ci CHECK(NeedsUnicodeCaseEquivalents(flags)); 8581cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 8591cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* word_range = 8601cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(2, zone); 8611cb0ef41Sopenharmony_ci CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true, 8621cb0ef41Sopenharmony_ci zone); 8631cb0ef41Sopenharmony_ci int stack_register = compiler->UnicodeLookaroundStackRegister(); 8641cb0ef41Sopenharmony_ci int position_register = compiler->UnicodeLookaroundPositionRegister(); 8651cb0ef41Sopenharmony_ci ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 8661cb0ef41Sopenharmony_ci // Add two choices. The (non-)boundary could start with a word or 8671cb0ef41Sopenharmony_ci // a non-word-character. 8681cb0ef41Sopenharmony_ci for (int i = 0; i < 2; i++) { 8691cb0ef41Sopenharmony_ci bool lookbehind_for_word = i == 0; 8701cb0ef41Sopenharmony_ci bool lookahead_for_word = 8711cb0ef41Sopenharmony_ci (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word; 8721cb0ef41Sopenharmony_ci // Look to the left. 8731cb0ef41Sopenharmony_ci RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, 8741cb0ef41Sopenharmony_ci stack_register, position_register); 8751cb0ef41Sopenharmony_ci RegExpNode* backward = TextNode::CreateForCharacterRanges( 8761cb0ef41Sopenharmony_ci zone, word_range, true, lookbehind.on_match_success()); 8771cb0ef41Sopenharmony_ci // Look to the right. 8781cb0ef41Sopenharmony_ci RegExpLookaround::Builder lookahead(lookahead_for_word, 8791cb0ef41Sopenharmony_ci lookbehind.ForMatch(backward), 8801cb0ef41Sopenharmony_ci stack_register, position_register); 8811cb0ef41Sopenharmony_ci RegExpNode* forward = TextNode::CreateForCharacterRanges( 8821cb0ef41Sopenharmony_ci zone, word_range, false, lookahead.on_match_success()); 8831cb0ef41Sopenharmony_ci result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); 8841cb0ef41Sopenharmony_ci } 8851cb0ef41Sopenharmony_ci return result; 8861cb0ef41Sopenharmony_ci} 8871cb0ef41Sopenharmony_ci} // anonymous namespace 8881cb0ef41Sopenharmony_ci 8891cb0ef41Sopenharmony_ciRegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 8901cb0ef41Sopenharmony_ci RegExpNode* on_success) { 8911cb0ef41Sopenharmony_ci NodeInfo info; 8921cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 8931cb0ef41Sopenharmony_ci 8941cb0ef41Sopenharmony_ci switch (assertion_type()) { 8951cb0ef41Sopenharmony_ci case Type::START_OF_LINE: 8961cb0ef41Sopenharmony_ci return AssertionNode::AfterNewline(on_success); 8971cb0ef41Sopenharmony_ci case Type::START_OF_INPUT: 8981cb0ef41Sopenharmony_ci return AssertionNode::AtStart(on_success); 8991cb0ef41Sopenharmony_ci case Type::BOUNDARY: 9001cb0ef41Sopenharmony_ci return NeedsUnicodeCaseEquivalents(compiler->flags()) 9011cb0ef41Sopenharmony_ci ? BoundaryAssertionAsLookaround( 9021cb0ef41Sopenharmony_ci compiler, on_success, Type::BOUNDARY, compiler->flags()) 9031cb0ef41Sopenharmony_ci : AssertionNode::AtBoundary(on_success); 9041cb0ef41Sopenharmony_ci case Type::NON_BOUNDARY: 9051cb0ef41Sopenharmony_ci return NeedsUnicodeCaseEquivalents(compiler->flags()) 9061cb0ef41Sopenharmony_ci ? BoundaryAssertionAsLookaround(compiler, on_success, 9071cb0ef41Sopenharmony_ci Type::NON_BOUNDARY, 9081cb0ef41Sopenharmony_ci compiler->flags()) 9091cb0ef41Sopenharmony_ci : AssertionNode::AtNonBoundary(on_success); 9101cb0ef41Sopenharmony_ci case Type::END_OF_INPUT: 9111cb0ef41Sopenharmony_ci return AssertionNode::AtEnd(on_success); 9121cb0ef41Sopenharmony_ci case Type::END_OF_LINE: { 9131cb0ef41Sopenharmony_ci // Compile $ in multiline regexps as an alternation with a positive 9141cb0ef41Sopenharmony_ci // lookahead in one side and an end-of-input on the other side. 9151cb0ef41Sopenharmony_ci // We need two registers for the lookahead. 9161cb0ef41Sopenharmony_ci int stack_pointer_register = compiler->AllocateRegister(); 9171cb0ef41Sopenharmony_ci int position_register = compiler->AllocateRegister(); 9181cb0ef41Sopenharmony_ci // The ChoiceNode to distinguish between a newline and end-of-input. 9191cb0ef41Sopenharmony_ci ChoiceNode* result = zone->New<ChoiceNode>(2, zone); 9201cb0ef41Sopenharmony_ci // Create a newline atom. 9211cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* newline_ranges = 9221cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(3, zone); 9231cb0ef41Sopenharmony_ci CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator, 9241cb0ef41Sopenharmony_ci newline_ranges, false, zone); 9251cb0ef41Sopenharmony_ci RegExpCharacterClass* newline_atom = zone->New<RegExpCharacterClass>( 9261cb0ef41Sopenharmony_ci StandardCharacterSet::kLineTerminator); 9271cb0ef41Sopenharmony_ci TextNode* newline_matcher = 9281cb0ef41Sopenharmony_ci zone->New<TextNode>(newline_atom, false, 9291cb0ef41Sopenharmony_ci ActionNode::PositiveSubmatchSuccess( 9301cb0ef41Sopenharmony_ci stack_pointer_register, position_register, 9311cb0ef41Sopenharmony_ci 0, // No captures inside. 9321cb0ef41Sopenharmony_ci -1, // Ignored if no captures. 9331cb0ef41Sopenharmony_ci on_success)); 9341cb0ef41Sopenharmony_ci // Create an end-of-input matcher. 9351cb0ef41Sopenharmony_ci RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch( 9361cb0ef41Sopenharmony_ci stack_pointer_register, position_register, newline_matcher); 9371cb0ef41Sopenharmony_ci // Add the two alternatives to the ChoiceNode. 9381cb0ef41Sopenharmony_ci GuardedAlternative eol_alternative(end_of_line); 9391cb0ef41Sopenharmony_ci result->AddAlternative(eol_alternative); 9401cb0ef41Sopenharmony_ci GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 9411cb0ef41Sopenharmony_ci result->AddAlternative(end_alternative); 9421cb0ef41Sopenharmony_ci return result; 9431cb0ef41Sopenharmony_ci } 9441cb0ef41Sopenharmony_ci default: 9451cb0ef41Sopenharmony_ci UNREACHABLE(); 9461cb0ef41Sopenharmony_ci } 9471cb0ef41Sopenharmony_ci} 9481cb0ef41Sopenharmony_ci 9491cb0ef41Sopenharmony_ciRegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 9501cb0ef41Sopenharmony_ci RegExpNode* on_success) { 9511cb0ef41Sopenharmony_ci return compiler->zone()->New<BackReferenceNode>( 9521cb0ef41Sopenharmony_ci RegExpCapture::StartRegister(index()), 9531cb0ef41Sopenharmony_ci RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(), 9541cb0ef41Sopenharmony_ci on_success); 9551cb0ef41Sopenharmony_ci} 9561cb0ef41Sopenharmony_ci 9571cb0ef41Sopenharmony_ciRegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 9581cb0ef41Sopenharmony_ci RegExpNode* on_success) { 9591cb0ef41Sopenharmony_ci return on_success; 9601cb0ef41Sopenharmony_ci} 9611cb0ef41Sopenharmony_ci 9621cb0ef41Sopenharmony_ciRegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler, 9631cb0ef41Sopenharmony_ci RegExpNode* on_success) { 9641cb0ef41Sopenharmony_ci return body_->ToNode(compiler, on_success); 9651cb0ef41Sopenharmony_ci} 9661cb0ef41Sopenharmony_ci 9671cb0ef41Sopenharmony_ciRegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, 9681cb0ef41Sopenharmony_ci int stack_pointer_register, 9691cb0ef41Sopenharmony_ci int position_register, 9701cb0ef41Sopenharmony_ci int capture_register_count, 9711cb0ef41Sopenharmony_ci int capture_register_start) 9721cb0ef41Sopenharmony_ci : is_positive_(is_positive), 9731cb0ef41Sopenharmony_ci on_success_(on_success), 9741cb0ef41Sopenharmony_ci stack_pointer_register_(stack_pointer_register), 9751cb0ef41Sopenharmony_ci position_register_(position_register) { 9761cb0ef41Sopenharmony_ci if (is_positive_) { 9771cb0ef41Sopenharmony_ci on_match_success_ = ActionNode::PositiveSubmatchSuccess( 9781cb0ef41Sopenharmony_ci stack_pointer_register, position_register, capture_register_count, 9791cb0ef41Sopenharmony_ci capture_register_start, on_success_); 9801cb0ef41Sopenharmony_ci } else { 9811cb0ef41Sopenharmony_ci Zone* zone = on_success_->zone(); 9821cb0ef41Sopenharmony_ci on_match_success_ = zone->New<NegativeSubmatchSuccess>( 9831cb0ef41Sopenharmony_ci stack_pointer_register, position_register, capture_register_count, 9841cb0ef41Sopenharmony_ci capture_register_start, zone); 9851cb0ef41Sopenharmony_ci } 9861cb0ef41Sopenharmony_ci} 9871cb0ef41Sopenharmony_ci 9881cb0ef41Sopenharmony_ciRegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { 9891cb0ef41Sopenharmony_ci if (is_positive_) { 9901cb0ef41Sopenharmony_ci return ActionNode::BeginPositiveSubmatch(stack_pointer_register_, 9911cb0ef41Sopenharmony_ci position_register_, match); 9921cb0ef41Sopenharmony_ci } else { 9931cb0ef41Sopenharmony_ci Zone* zone = on_success_->zone(); 9941cb0ef41Sopenharmony_ci // We use a ChoiceNode to represent the negative lookaround. The first 9951cb0ef41Sopenharmony_ci // alternative is the negative match. On success, the end node backtracks. 9961cb0ef41Sopenharmony_ci // On failure, the second alternative is tried and leads to success. 9971cb0ef41Sopenharmony_ci // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the 9981cb0ef41Sopenharmony_ci // first exit when calculating quick checks. 9991cb0ef41Sopenharmony_ci ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>( 10001cb0ef41Sopenharmony_ci GuardedAlternative(match), GuardedAlternative(on_success_), zone); 10011cb0ef41Sopenharmony_ci return ActionNode::BeginNegativeSubmatch(stack_pointer_register_, 10021cb0ef41Sopenharmony_ci position_register_, choice_node); 10031cb0ef41Sopenharmony_ci } 10041cb0ef41Sopenharmony_ci} 10051cb0ef41Sopenharmony_ci 10061cb0ef41Sopenharmony_ciRegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 10071cb0ef41Sopenharmony_ci RegExpNode* on_success) { 10081cb0ef41Sopenharmony_ci int stack_pointer_register = compiler->AllocateRegister(); 10091cb0ef41Sopenharmony_ci int position_register = compiler->AllocateRegister(); 10101cb0ef41Sopenharmony_ci 10111cb0ef41Sopenharmony_ci const int registers_per_capture = 2; 10121cb0ef41Sopenharmony_ci const int register_of_first_capture = 2; 10131cb0ef41Sopenharmony_ci int register_count = capture_count_ * registers_per_capture; 10141cb0ef41Sopenharmony_ci int register_start = 10151cb0ef41Sopenharmony_ci register_of_first_capture + capture_from_ * registers_per_capture; 10161cb0ef41Sopenharmony_ci 10171cb0ef41Sopenharmony_ci RegExpNode* result; 10181cb0ef41Sopenharmony_ci bool was_reading_backward = compiler->read_backward(); 10191cb0ef41Sopenharmony_ci compiler->set_read_backward(type() == LOOKBEHIND); 10201cb0ef41Sopenharmony_ci Builder builder(is_positive(), on_success, stack_pointer_register, 10211cb0ef41Sopenharmony_ci position_register, register_count, register_start); 10221cb0ef41Sopenharmony_ci RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); 10231cb0ef41Sopenharmony_ci result = builder.ForMatch(match); 10241cb0ef41Sopenharmony_ci compiler->set_read_backward(was_reading_backward); 10251cb0ef41Sopenharmony_ci return result; 10261cb0ef41Sopenharmony_ci} 10271cb0ef41Sopenharmony_ci 10281cb0ef41Sopenharmony_ciRegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 10291cb0ef41Sopenharmony_ci RegExpNode* on_success) { 10301cb0ef41Sopenharmony_ci return ToNode(body(), index(), compiler, on_success); 10311cb0ef41Sopenharmony_ci} 10321cb0ef41Sopenharmony_ci 10331cb0ef41Sopenharmony_ciRegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index, 10341cb0ef41Sopenharmony_ci RegExpCompiler* compiler, 10351cb0ef41Sopenharmony_ci RegExpNode* on_success) { 10361cb0ef41Sopenharmony_ci DCHECK_NOT_NULL(body); 10371cb0ef41Sopenharmony_ci int start_reg = RegExpCapture::StartRegister(index); 10381cb0ef41Sopenharmony_ci int end_reg = RegExpCapture::EndRegister(index); 10391cb0ef41Sopenharmony_ci if (compiler->read_backward()) std::swap(start_reg, end_reg); 10401cb0ef41Sopenharmony_ci RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 10411cb0ef41Sopenharmony_ci RegExpNode* body_node = body->ToNode(compiler, store_end); 10421cb0ef41Sopenharmony_ci return ActionNode::StorePosition(start_reg, true, body_node); 10431cb0ef41Sopenharmony_ci} 10441cb0ef41Sopenharmony_ci 10451cb0ef41Sopenharmony_cinamespace { 10461cb0ef41Sopenharmony_ci 10471cb0ef41Sopenharmony_ciclass AssertionSequenceRewriter final { 10481cb0ef41Sopenharmony_ci public: 10491cb0ef41Sopenharmony_ci // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass 10501cb0ef41Sopenharmony_ci // instead of sprinkling rewrites into the AST->Node conversion process. 10511cb0ef41Sopenharmony_ci static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) { 10521cb0ef41Sopenharmony_ci AssertionSequenceRewriter rewriter(terms, zone); 10531cb0ef41Sopenharmony_ci 10541cb0ef41Sopenharmony_ci static constexpr int kNoIndex = -1; 10551cb0ef41Sopenharmony_ci int from = kNoIndex; 10561cb0ef41Sopenharmony_ci 10571cb0ef41Sopenharmony_ci for (int i = 0; i < terms->length(); i++) { 10581cb0ef41Sopenharmony_ci RegExpTree* t = terms->at(i); 10591cb0ef41Sopenharmony_ci if (from == kNoIndex && t->IsAssertion()) { 10601cb0ef41Sopenharmony_ci from = i; // Start a sequence. 10611cb0ef41Sopenharmony_ci } else if (from != kNoIndex && !t->IsAssertion()) { 10621cb0ef41Sopenharmony_ci // Terminate and process the sequence. 10631cb0ef41Sopenharmony_ci if (i - from > 1) rewriter.Rewrite(from, i); 10641cb0ef41Sopenharmony_ci from = kNoIndex; 10651cb0ef41Sopenharmony_ci } 10661cb0ef41Sopenharmony_ci } 10671cb0ef41Sopenharmony_ci 10681cb0ef41Sopenharmony_ci if (from != kNoIndex && terms->length() - from > 1) { 10691cb0ef41Sopenharmony_ci rewriter.Rewrite(from, terms->length()); 10701cb0ef41Sopenharmony_ci } 10711cb0ef41Sopenharmony_ci } 10721cb0ef41Sopenharmony_ci 10731cb0ef41Sopenharmony_ci // All assertions are zero width. A consecutive sequence of assertions is 10741cb0ef41Sopenharmony_ci // order-independent. There's two ways we can optimize here: 10751cb0ef41Sopenharmony_ci // 1. fold all identical assertions. 10761cb0ef41Sopenharmony_ci // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire 10771cb0ef41Sopenharmony_ci // sequence fails. 10781cb0ef41Sopenharmony_ci void Rewrite(int from, int to) { 10791cb0ef41Sopenharmony_ci DCHECK_GT(to, from + 1); 10801cb0ef41Sopenharmony_ci 10811cb0ef41Sopenharmony_ci // Bitfield of all seen assertions. 10821cb0ef41Sopenharmony_ci uint32_t seen_assertions = 0; 10831cb0ef41Sopenharmony_ci STATIC_ASSERT(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) < 10841cb0ef41Sopenharmony_ci kUInt32Size * kBitsPerByte); 10851cb0ef41Sopenharmony_ci 10861cb0ef41Sopenharmony_ci for (int i = from; i < to; i++) { 10871cb0ef41Sopenharmony_ci RegExpAssertion* t = terms_->at(i)->AsAssertion(); 10881cb0ef41Sopenharmony_ci const uint32_t bit = 1 << static_cast<int>(t->assertion_type()); 10891cb0ef41Sopenharmony_ci 10901cb0ef41Sopenharmony_ci if (seen_assertions & bit) { 10911cb0ef41Sopenharmony_ci // Fold duplicates. 10921cb0ef41Sopenharmony_ci terms_->Set(i, zone_->New<RegExpEmpty>()); 10931cb0ef41Sopenharmony_ci } 10941cb0ef41Sopenharmony_ci 10951cb0ef41Sopenharmony_ci seen_assertions |= bit; 10961cb0ef41Sopenharmony_ci } 10971cb0ef41Sopenharmony_ci 10981cb0ef41Sopenharmony_ci // Collapse failures. 10991cb0ef41Sopenharmony_ci const uint32_t always_fails_mask = 11001cb0ef41Sopenharmony_ci 1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) | 11011cb0ef41Sopenharmony_ci 1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY); 11021cb0ef41Sopenharmony_ci if ((seen_assertions & always_fails_mask) == always_fails_mask) { 11031cb0ef41Sopenharmony_ci ReplaceSequenceWithFailure(from, to); 11041cb0ef41Sopenharmony_ci } 11051cb0ef41Sopenharmony_ci } 11061cb0ef41Sopenharmony_ci 11071cb0ef41Sopenharmony_ci void ReplaceSequenceWithFailure(int from, int to) { 11081cb0ef41Sopenharmony_ci // Replace the entire sequence with a single node that always fails. 11091cb0ef41Sopenharmony_ci // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the 11101cb0ef41Sopenharmony_ci // negated '*' (everything) range serves the purpose. 11111cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* ranges = 11121cb0ef41Sopenharmony_ci zone_->New<ZoneList<CharacterRange>>(0, zone_); 11131cb0ef41Sopenharmony_ci RegExpCharacterClass* cc = zone_->New<RegExpCharacterClass>(zone_, ranges); 11141cb0ef41Sopenharmony_ci terms_->Set(from, cc); 11151cb0ef41Sopenharmony_ci 11161cb0ef41Sopenharmony_ci // Zero out the rest. 11171cb0ef41Sopenharmony_ci RegExpEmpty* empty = zone_->New<RegExpEmpty>(); 11181cb0ef41Sopenharmony_ci for (int i = from + 1; i < to; i++) terms_->Set(i, empty); 11191cb0ef41Sopenharmony_ci } 11201cb0ef41Sopenharmony_ci 11211cb0ef41Sopenharmony_ci private: 11221cb0ef41Sopenharmony_ci AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone) 11231cb0ef41Sopenharmony_ci : zone_(zone), terms_(terms) {} 11241cb0ef41Sopenharmony_ci 11251cb0ef41Sopenharmony_ci Zone* zone_; 11261cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* terms_; 11271cb0ef41Sopenharmony_ci}; 11281cb0ef41Sopenharmony_ci 11291cb0ef41Sopenharmony_ci} // namespace 11301cb0ef41Sopenharmony_ci 11311cb0ef41Sopenharmony_ciRegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 11321cb0ef41Sopenharmony_ci RegExpNode* on_success) { 11331cb0ef41Sopenharmony_ci compiler->ToNodeMaybeCheckForStackOverflow(); 11341cb0ef41Sopenharmony_ci 11351cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>* children = nodes(); 11361cb0ef41Sopenharmony_ci 11371cb0ef41Sopenharmony_ci AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone()); 11381cb0ef41Sopenharmony_ci 11391cb0ef41Sopenharmony_ci RegExpNode* current = on_success; 11401cb0ef41Sopenharmony_ci if (compiler->read_backward()) { 11411cb0ef41Sopenharmony_ci for (int i = 0; i < children->length(); i++) { 11421cb0ef41Sopenharmony_ci current = children->at(i)->ToNode(compiler, current); 11431cb0ef41Sopenharmony_ci } 11441cb0ef41Sopenharmony_ci } else { 11451cb0ef41Sopenharmony_ci for (int i = children->length() - 1; i >= 0; i--) { 11461cb0ef41Sopenharmony_ci current = children->at(i)->ToNode(compiler, current); 11471cb0ef41Sopenharmony_ci } 11481cb0ef41Sopenharmony_ci } 11491cb0ef41Sopenharmony_ci return current; 11501cb0ef41Sopenharmony_ci} 11511cb0ef41Sopenharmony_ci 11521cb0ef41Sopenharmony_cinamespace { 11531cb0ef41Sopenharmony_ci 11541cb0ef41Sopenharmony_civoid AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges, 11551cb0ef41Sopenharmony_ci Zone* zone) { 11561cb0ef41Sopenharmony_ci elmc--; 11571cb0ef41Sopenharmony_ci DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 11581cb0ef41Sopenharmony_ci for (int i = 0; i < elmc; i += 2) { 11591cb0ef41Sopenharmony_ci DCHECK(elmv[i] < elmv[i + 1]); 11601cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); 11611cb0ef41Sopenharmony_ci } 11621cb0ef41Sopenharmony_ci} 11631cb0ef41Sopenharmony_ci 11641cb0ef41Sopenharmony_civoid AddClassNegated(const int* elmv, int elmc, 11651cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* ranges, Zone* zone) { 11661cb0ef41Sopenharmony_ci elmc--; 11671cb0ef41Sopenharmony_ci DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 11681cb0ef41Sopenharmony_ci DCHECK_NE(0x0000, elmv[0]); 11691cb0ef41Sopenharmony_ci DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]); 11701cb0ef41Sopenharmony_ci base::uc16 last = 0x0000; 11711cb0ef41Sopenharmony_ci for (int i = 0; i < elmc; i += 2) { 11721cb0ef41Sopenharmony_ci DCHECK(last <= elmv[i] - 1); 11731cb0ef41Sopenharmony_ci DCHECK(elmv[i] < elmv[i + 1]); 11741cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 11751cb0ef41Sopenharmony_ci last = elmv[i + 1]; 11761cb0ef41Sopenharmony_ci } 11771cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone); 11781cb0ef41Sopenharmony_ci} 11791cb0ef41Sopenharmony_ci 11801cb0ef41Sopenharmony_ci} // namespace 11811cb0ef41Sopenharmony_ci 11821cb0ef41Sopenharmony_civoid CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set, 11831cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* ranges, 11841cb0ef41Sopenharmony_ci bool add_unicode_case_equivalents, 11851cb0ef41Sopenharmony_ci Zone* zone) { 11861cb0ef41Sopenharmony_ci if (add_unicode_case_equivalents && 11871cb0ef41Sopenharmony_ci (standard_character_set == StandardCharacterSet::kWord || 11881cb0ef41Sopenharmony_ci standard_character_set == StandardCharacterSet::kNotWord)) { 11891cb0ef41Sopenharmony_ci // See #sec-runtime-semantics-wordcharacters-abstract-operation 11901cb0ef41Sopenharmony_ci // In case of unicode and ignore_case, we need to create the closure over 11911cb0ef41Sopenharmony_ci // case equivalent characters before negating. 11921cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* new_ranges = 11931cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(2, zone); 11941cb0ef41Sopenharmony_ci AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); 11951cb0ef41Sopenharmony_ci AddUnicodeCaseEquivalents(new_ranges, zone); 11961cb0ef41Sopenharmony_ci if (standard_character_set == StandardCharacterSet::kNotWord) { 11971cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* negated = 11981cb0ef41Sopenharmony_ci zone->New<ZoneList<CharacterRange>>(2, zone); 11991cb0ef41Sopenharmony_ci CharacterRange::Negate(new_ranges, negated, zone); 12001cb0ef41Sopenharmony_ci new_ranges = negated; 12011cb0ef41Sopenharmony_ci } 12021cb0ef41Sopenharmony_ci ranges->AddAll(*new_ranges, zone); 12031cb0ef41Sopenharmony_ci return; 12041cb0ef41Sopenharmony_ci } 12051cb0ef41Sopenharmony_ci 12061cb0ef41Sopenharmony_ci switch (standard_character_set) { 12071cb0ef41Sopenharmony_ci case StandardCharacterSet::kWhitespace: 12081cb0ef41Sopenharmony_ci AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 12091cb0ef41Sopenharmony_ci break; 12101cb0ef41Sopenharmony_ci case StandardCharacterSet::kNotWhitespace: 12111cb0ef41Sopenharmony_ci AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 12121cb0ef41Sopenharmony_ci break; 12131cb0ef41Sopenharmony_ci case StandardCharacterSet::kWord: 12141cb0ef41Sopenharmony_ci AddClass(kWordRanges, kWordRangeCount, ranges, zone); 12151cb0ef41Sopenharmony_ci break; 12161cb0ef41Sopenharmony_ci case StandardCharacterSet::kNotWord: 12171cb0ef41Sopenharmony_ci AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 12181cb0ef41Sopenharmony_ci break; 12191cb0ef41Sopenharmony_ci case StandardCharacterSet::kDigit: 12201cb0ef41Sopenharmony_ci AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 12211cb0ef41Sopenharmony_ci break; 12221cb0ef41Sopenharmony_ci case StandardCharacterSet::kNotDigit: 12231cb0ef41Sopenharmony_ci AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 12241cb0ef41Sopenharmony_ci break; 12251cb0ef41Sopenharmony_ci // This is the set of characters matched by the $ and ^ symbols 12261cb0ef41Sopenharmony_ci // in multiline mode. 12271cb0ef41Sopenharmony_ci case StandardCharacterSet::kLineTerminator: 12281cb0ef41Sopenharmony_ci AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); 12291cb0ef41Sopenharmony_ci break; 12301cb0ef41Sopenharmony_ci case StandardCharacterSet::kNotLineTerminator: 12311cb0ef41Sopenharmony_ci AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, 12321cb0ef41Sopenharmony_ci zone); 12331cb0ef41Sopenharmony_ci break; 12341cb0ef41Sopenharmony_ci // This is not a character range as defined by the spec but a 12351cb0ef41Sopenharmony_ci // convenient shorthand for a character class that matches any 12361cb0ef41Sopenharmony_ci // character. 12371cb0ef41Sopenharmony_ci case StandardCharacterSet::kEverything: 12381cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Everything(), zone); 12391cb0ef41Sopenharmony_ci break; 12401cb0ef41Sopenharmony_ci } 12411cb0ef41Sopenharmony_ci} 12421cb0ef41Sopenharmony_ci 12431cb0ef41Sopenharmony_ci// static 12441cb0ef41Sopenharmony_civoid CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 12451cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* ranges, 12461cb0ef41Sopenharmony_ci bool is_one_byte) { 12471cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(ranges); 12481cb0ef41Sopenharmony_ci int range_count = ranges->length(); 12491cb0ef41Sopenharmony_ci#ifdef V8_INTL_SUPPORT 12501cb0ef41Sopenharmony_ci icu::UnicodeSet others; 12511cb0ef41Sopenharmony_ci for (int i = 0; i < range_count; i++) { 12521cb0ef41Sopenharmony_ci CharacterRange range = ranges->at(i); 12531cb0ef41Sopenharmony_ci base::uc32 from = range.from(); 12541cb0ef41Sopenharmony_ci if (from > kMaxUtf16CodeUnit) continue; 12551cb0ef41Sopenharmony_ci base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU}); 12561cb0ef41Sopenharmony_ci // Nothing to be done for surrogates. 12571cb0ef41Sopenharmony_ci if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; 12581cb0ef41Sopenharmony_ci if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 12591cb0ef41Sopenharmony_ci if (from > kMaxOneByteCharCode) continue; 12601cb0ef41Sopenharmony_ci if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode; 12611cb0ef41Sopenharmony_ci } 12621cb0ef41Sopenharmony_ci others.add(from, to); 12631cb0ef41Sopenharmony_ci } 12641cb0ef41Sopenharmony_ci 12651cb0ef41Sopenharmony_ci // Compute the set of additional characters that should be added, 12661cb0ef41Sopenharmony_ci // using UnicodeSet::closeOver. ECMA 262 defines slightly different 12671cb0ef41Sopenharmony_ci // case-folding rules than Unicode, so some characters that are 12681cb0ef41Sopenharmony_ci // added by closeOver do not match anything other than themselves in 12691cb0ef41Sopenharmony_ci // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the 12701cb0ef41Sopenharmony_ci // same case-insensitive character as 's' or 'S' according to 12711cb0ef41Sopenharmony_ci // Unicode, but does not match any other character in JS. To handle 12721cb0ef41Sopenharmony_ci // this case, we add such characters to the IgnoreSet and filter 12731cb0ef41Sopenharmony_ci // them out. We filter twice: once before calling closeOver (to 12741cb0ef41Sopenharmony_ci // prevent 'ſ' from adding 's'), and once after calling closeOver 12751cb0ef41Sopenharmony_ci // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for 12761cb0ef41Sopenharmony_ci // more information. 12771cb0ef41Sopenharmony_ci icu::UnicodeSet already_added(others); 12781cb0ef41Sopenharmony_ci others.removeAll(RegExpCaseFolding::IgnoreSet()); 12791cb0ef41Sopenharmony_ci others.closeOver(USET_CASE_INSENSITIVE); 12801cb0ef41Sopenharmony_ci others.removeAll(RegExpCaseFolding::IgnoreSet()); 12811cb0ef41Sopenharmony_ci others.removeAll(already_added); 12821cb0ef41Sopenharmony_ci 12831cb0ef41Sopenharmony_ci // Add others to the ranges 12841cb0ef41Sopenharmony_ci for (int32_t i = 0; i < others.getRangeCount(); i++) { 12851cb0ef41Sopenharmony_ci UChar32 from = others.getRangeStart(i); 12861cb0ef41Sopenharmony_ci UChar32 to = others.getRangeEnd(i); 12871cb0ef41Sopenharmony_ci if (from == to) { 12881cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Singleton(from), zone); 12891cb0ef41Sopenharmony_ci } else { 12901cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Range(from, to), zone); 12911cb0ef41Sopenharmony_ci } 12921cb0ef41Sopenharmony_ci } 12931cb0ef41Sopenharmony_ci#else 12941cb0ef41Sopenharmony_ci for (int i = 0; i < range_count; i++) { 12951cb0ef41Sopenharmony_ci CharacterRange range = ranges->at(i); 12961cb0ef41Sopenharmony_ci base::uc32 bottom = range.from(); 12971cb0ef41Sopenharmony_ci if (bottom > kMaxUtf16CodeUnit) continue; 12981cb0ef41Sopenharmony_ci base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU}); 12991cb0ef41Sopenharmony_ci // Nothing to be done for surrogates. 13001cb0ef41Sopenharmony_ci if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; 13011cb0ef41Sopenharmony_ci if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 13021cb0ef41Sopenharmony_ci if (bottom > kMaxOneByteCharCode) continue; 13031cb0ef41Sopenharmony_ci if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode; 13041cb0ef41Sopenharmony_ci } 13051cb0ef41Sopenharmony_ci unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 13061cb0ef41Sopenharmony_ci if (top == bottom) { 13071cb0ef41Sopenharmony_ci // If this is a singleton we just expand the one character. 13081cb0ef41Sopenharmony_ci int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 13091cb0ef41Sopenharmony_ci for (int i = 0; i < length; i++) { 13101cb0ef41Sopenharmony_ci base::uc32 chr = chars[i]; 13111cb0ef41Sopenharmony_ci if (chr != bottom) { 13121cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Singleton(chars[i]), zone); 13131cb0ef41Sopenharmony_ci } 13141cb0ef41Sopenharmony_ci } 13151cb0ef41Sopenharmony_ci } else { 13161cb0ef41Sopenharmony_ci // If this is a range we expand the characters block by block, expanding 13171cb0ef41Sopenharmony_ci // contiguous subranges (blocks) one at a time. The approach is as 13181cb0ef41Sopenharmony_ci // follows. For a given start character we look up the remainder of the 13191cb0ef41Sopenharmony_ci // block that contains it (represented by the end point), for instance we 13201cb0ef41Sopenharmony_ci // find 'z' if the character is 'c'. A block is characterized by the 13211cb0ef41Sopenharmony_ci // property that all characters uncanonicalize in the same way, except 13221cb0ef41Sopenharmony_ci // that each entry in the result is incremented by the distance from the 13231cb0ef41Sopenharmony_ci // first element. So a-z is a block because 'a' uncanonicalizes to ['a', 13241cb0ef41Sopenharmony_ci // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once 13251cb0ef41Sopenharmony_ci // we've found the end point we look up its uncanonicalization and 13261cb0ef41Sopenharmony_ci // produce a range for each element. For instance for [c-f] we look up 13271cb0ef41Sopenharmony_ci // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if 13281cb0ef41Sopenharmony_ci // it is not already contained in the input, so [c-f] will be skipped but 13291cb0ef41Sopenharmony_ci // [C-F] will be added. If this range is not completely contained in a 13301cb0ef41Sopenharmony_ci // block we do this for all the blocks covered by the range (handling 13311cb0ef41Sopenharmony_ci // characters that is not in a block as a "singleton block"). 13321cb0ef41Sopenharmony_ci unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 13331cb0ef41Sopenharmony_ci base::uc32 pos = bottom; 13341cb0ef41Sopenharmony_ci while (pos <= top) { 13351cb0ef41Sopenharmony_ci int length = 13361cb0ef41Sopenharmony_ci isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); 13371cb0ef41Sopenharmony_ci base::uc32 block_end; 13381cb0ef41Sopenharmony_ci if (length == 0) { 13391cb0ef41Sopenharmony_ci block_end = pos; 13401cb0ef41Sopenharmony_ci } else { 13411cb0ef41Sopenharmony_ci DCHECK_EQ(1, length); 13421cb0ef41Sopenharmony_ci block_end = equivalents[0]; 13431cb0ef41Sopenharmony_ci } 13441cb0ef41Sopenharmony_ci int end = (block_end > top) ? top : block_end; 13451cb0ef41Sopenharmony_ci length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', 13461cb0ef41Sopenharmony_ci equivalents); 13471cb0ef41Sopenharmony_ci for (int i = 0; i < length; i++) { 13481cb0ef41Sopenharmony_ci base::uc32 c = equivalents[i]; 13491cb0ef41Sopenharmony_ci base::uc32 range_from = c - (block_end - pos); 13501cb0ef41Sopenharmony_ci base::uc32 range_to = c - (block_end - end); 13511cb0ef41Sopenharmony_ci if (!(bottom <= range_from && range_to <= top)) { 13521cb0ef41Sopenharmony_ci ranges->Add(CharacterRange::Range(range_from, range_to), zone); 13531cb0ef41Sopenharmony_ci } 13541cb0ef41Sopenharmony_ci } 13551cb0ef41Sopenharmony_ci pos = end + 1; 13561cb0ef41Sopenharmony_ci } 13571cb0ef41Sopenharmony_ci } 13581cb0ef41Sopenharmony_ci } 13591cb0ef41Sopenharmony_ci#endif // V8_INTL_SUPPORT 13601cb0ef41Sopenharmony_ci} 13611cb0ef41Sopenharmony_ci 13621cb0ef41Sopenharmony_cibool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 13631cb0ef41Sopenharmony_ci DCHECK_NOT_NULL(ranges); 13641cb0ef41Sopenharmony_ci int n = ranges->length(); 13651cb0ef41Sopenharmony_ci if (n <= 1) return true; 13661cb0ef41Sopenharmony_ci base::uc32 max = ranges->at(0).to(); 13671cb0ef41Sopenharmony_ci for (int i = 1; i < n; i++) { 13681cb0ef41Sopenharmony_ci CharacterRange next_range = ranges->at(i); 13691cb0ef41Sopenharmony_ci if (next_range.from() <= max + 1) return false; 13701cb0ef41Sopenharmony_ci max = next_range.to(); 13711cb0ef41Sopenharmony_ci } 13721cb0ef41Sopenharmony_ci return true; 13731cb0ef41Sopenharmony_ci} 13741cb0ef41Sopenharmony_ci 13751cb0ef41Sopenharmony_ciZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 13761cb0ef41Sopenharmony_ci if (ranges_ == nullptr) { 13771cb0ef41Sopenharmony_ci ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone); 13781cb0ef41Sopenharmony_ci CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false, 13791cb0ef41Sopenharmony_ci zone); 13801cb0ef41Sopenharmony_ci } 13811cb0ef41Sopenharmony_ci return ranges_; 13821cb0ef41Sopenharmony_ci} 13831cb0ef41Sopenharmony_ci 13841cb0ef41Sopenharmony_cinamespace { 13851cb0ef41Sopenharmony_ci 13861cb0ef41Sopenharmony_ci// Move a number of elements in a zonelist to another position 13871cb0ef41Sopenharmony_ci// in the same list. Handles overlapping source and target areas. 13881cb0ef41Sopenharmony_civoid MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) { 13891cb0ef41Sopenharmony_ci // Ranges are potentially overlapping. 13901cb0ef41Sopenharmony_ci if (from < to) { 13911cb0ef41Sopenharmony_ci for (int i = count - 1; i >= 0; i--) { 13921cb0ef41Sopenharmony_ci list->at(to + i) = list->at(from + i); 13931cb0ef41Sopenharmony_ci } 13941cb0ef41Sopenharmony_ci } else { 13951cb0ef41Sopenharmony_ci for (int i = 0; i < count; i++) { 13961cb0ef41Sopenharmony_ci list->at(to + i) = list->at(from + i); 13971cb0ef41Sopenharmony_ci } 13981cb0ef41Sopenharmony_ci } 13991cb0ef41Sopenharmony_ci} 14001cb0ef41Sopenharmony_ci 14011cb0ef41Sopenharmony_ciint InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count, 14021cb0ef41Sopenharmony_ci CharacterRange insert) { 14031cb0ef41Sopenharmony_ci // Inserts a range into list[0..count[, which must be sorted 14041cb0ef41Sopenharmony_ci // by from value and non-overlapping and non-adjacent, using at most 14051cb0ef41Sopenharmony_ci // list[0..count] for the result. Returns the number of resulting 14061cb0ef41Sopenharmony_ci // canonicalized ranges. Inserting a range may collapse existing ranges into 14071cb0ef41Sopenharmony_ci // fewer ranges, so the return value can be anything in the range 1..count+1. 14081cb0ef41Sopenharmony_ci base::uc32 from = insert.from(); 14091cb0ef41Sopenharmony_ci base::uc32 to = insert.to(); 14101cb0ef41Sopenharmony_ci int start_pos = 0; 14111cb0ef41Sopenharmony_ci int end_pos = count; 14121cb0ef41Sopenharmony_ci for (int i = count - 1; i >= 0; i--) { 14131cb0ef41Sopenharmony_ci CharacterRange current = list->at(i); 14141cb0ef41Sopenharmony_ci if (current.from() > to + 1) { 14151cb0ef41Sopenharmony_ci end_pos = i; 14161cb0ef41Sopenharmony_ci } else if (current.to() + 1 < from) { 14171cb0ef41Sopenharmony_ci start_pos = i + 1; 14181cb0ef41Sopenharmony_ci break; 14191cb0ef41Sopenharmony_ci } 14201cb0ef41Sopenharmony_ci } 14211cb0ef41Sopenharmony_ci 14221cb0ef41Sopenharmony_ci // Inserted range overlaps, or is adjacent to, ranges at positions 14231cb0ef41Sopenharmony_ci // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 14241cb0ef41Sopenharmony_ci // not affected by the insertion. 14251cb0ef41Sopenharmony_ci // If start_pos == end_pos, the range must be inserted before start_pos. 14261cb0ef41Sopenharmony_ci // if start_pos < end_pos, the entire range from start_pos to end_pos 14271cb0ef41Sopenharmony_ci // must be merged with the insert range. 14281cb0ef41Sopenharmony_ci 14291cb0ef41Sopenharmony_ci if (start_pos == end_pos) { 14301cb0ef41Sopenharmony_ci // Insert between existing ranges at position start_pos. 14311cb0ef41Sopenharmony_ci if (start_pos < count) { 14321cb0ef41Sopenharmony_ci MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 14331cb0ef41Sopenharmony_ci } 14341cb0ef41Sopenharmony_ci list->at(start_pos) = insert; 14351cb0ef41Sopenharmony_ci return count + 1; 14361cb0ef41Sopenharmony_ci } 14371cb0ef41Sopenharmony_ci if (start_pos + 1 == end_pos) { 14381cb0ef41Sopenharmony_ci // Replace single existing range at position start_pos. 14391cb0ef41Sopenharmony_ci CharacterRange to_replace = list->at(start_pos); 14401cb0ef41Sopenharmony_ci int new_from = std::min(to_replace.from(), from); 14411cb0ef41Sopenharmony_ci int new_to = std::max(to_replace.to(), to); 14421cb0ef41Sopenharmony_ci list->at(start_pos) = CharacterRange::Range(new_from, new_to); 14431cb0ef41Sopenharmony_ci return count; 14441cb0ef41Sopenharmony_ci } 14451cb0ef41Sopenharmony_ci // Replace a number of existing ranges from start_pos to end_pos - 1. 14461cb0ef41Sopenharmony_ci // Move the remaining ranges down. 14471cb0ef41Sopenharmony_ci 14481cb0ef41Sopenharmony_ci int new_from = std::min(list->at(start_pos).from(), from); 14491cb0ef41Sopenharmony_ci int new_to = std::max(list->at(end_pos - 1).to(), to); 14501cb0ef41Sopenharmony_ci if (end_pos < count) { 14511cb0ef41Sopenharmony_ci MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 14521cb0ef41Sopenharmony_ci } 14531cb0ef41Sopenharmony_ci list->at(start_pos) = CharacterRange::Range(new_from, new_to); 14541cb0ef41Sopenharmony_ci return count - (end_pos - start_pos) + 1; 14551cb0ef41Sopenharmony_ci} 14561cb0ef41Sopenharmony_ci 14571cb0ef41Sopenharmony_ci} // namespace 14581cb0ef41Sopenharmony_ci 14591cb0ef41Sopenharmony_civoid CharacterSet::Canonicalize() { 14601cb0ef41Sopenharmony_ci // Special/default classes are always considered canonical. The result 14611cb0ef41Sopenharmony_ci // of calling ranges() will be sorted. 14621cb0ef41Sopenharmony_ci if (ranges_ == nullptr) return; 14631cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(ranges_); 14641cb0ef41Sopenharmony_ci} 14651cb0ef41Sopenharmony_ci 14661cb0ef41Sopenharmony_ci// static 14671cb0ef41Sopenharmony_civoid CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 14681cb0ef41Sopenharmony_ci if (character_ranges->length() <= 1) return; 14691cb0ef41Sopenharmony_ci // Check whether ranges are already canonical (increasing, non-overlapping, 14701cb0ef41Sopenharmony_ci // non-adjacent). 14711cb0ef41Sopenharmony_ci int n = character_ranges->length(); 14721cb0ef41Sopenharmony_ci base::uc32 max = character_ranges->at(0).to(); 14731cb0ef41Sopenharmony_ci int i = 1; 14741cb0ef41Sopenharmony_ci while (i < n) { 14751cb0ef41Sopenharmony_ci CharacterRange current = character_ranges->at(i); 14761cb0ef41Sopenharmony_ci if (current.from() <= max + 1) { 14771cb0ef41Sopenharmony_ci break; 14781cb0ef41Sopenharmony_ci } 14791cb0ef41Sopenharmony_ci max = current.to(); 14801cb0ef41Sopenharmony_ci i++; 14811cb0ef41Sopenharmony_ci } 14821cb0ef41Sopenharmony_ci // Canonical until the i'th range. If that's all of them, we are done. 14831cb0ef41Sopenharmony_ci if (i == n) return; 14841cb0ef41Sopenharmony_ci 14851cb0ef41Sopenharmony_ci // The ranges at index i and forward are not canonicalized. Make them so by 14861cb0ef41Sopenharmony_ci // doing the equivalent of insertion sort (inserting each into the previous 14871cb0ef41Sopenharmony_ci // list, in order). 14881cb0ef41Sopenharmony_ci // Notice that inserting a range can reduce the number of ranges in the 14891cb0ef41Sopenharmony_ci // result due to combining of adjacent and overlapping ranges. 14901cb0ef41Sopenharmony_ci int read = i; // Range to insert. 14911cb0ef41Sopenharmony_ci int num_canonical = i; // Length of canonicalized part of list. 14921cb0ef41Sopenharmony_ci do { 14931cb0ef41Sopenharmony_ci num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical, 14941cb0ef41Sopenharmony_ci character_ranges->at(read)); 14951cb0ef41Sopenharmony_ci read++; 14961cb0ef41Sopenharmony_ci } while (read < n); 14971cb0ef41Sopenharmony_ci character_ranges->Rewind(num_canonical); 14981cb0ef41Sopenharmony_ci 14991cb0ef41Sopenharmony_ci DCHECK(CharacterRange::IsCanonical(character_ranges)); 15001cb0ef41Sopenharmony_ci} 15011cb0ef41Sopenharmony_ci 15021cb0ef41Sopenharmony_ci// static 15031cb0ef41Sopenharmony_civoid CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 15041cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* negated_ranges, 15051cb0ef41Sopenharmony_ci Zone* zone) { 15061cb0ef41Sopenharmony_ci DCHECK(CharacterRange::IsCanonical(ranges)); 15071cb0ef41Sopenharmony_ci DCHECK_EQ(0, negated_ranges->length()); 15081cb0ef41Sopenharmony_ci int range_count = ranges->length(); 15091cb0ef41Sopenharmony_ci base::uc32 from = 0; 15101cb0ef41Sopenharmony_ci int i = 0; 15111cb0ef41Sopenharmony_ci if (range_count > 0 && ranges->at(0).from() == 0) { 15121cb0ef41Sopenharmony_ci from = ranges->at(0).to() + 1; 15131cb0ef41Sopenharmony_ci i = 1; 15141cb0ef41Sopenharmony_ci } 15151cb0ef41Sopenharmony_ci while (i < range_count) { 15161cb0ef41Sopenharmony_ci CharacterRange range = ranges->at(i); 15171cb0ef41Sopenharmony_ci negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); 15181cb0ef41Sopenharmony_ci from = range.to() + 1; 15191cb0ef41Sopenharmony_ci i++; 15201cb0ef41Sopenharmony_ci } 15211cb0ef41Sopenharmony_ci if (from < kMaxCodePoint) { 15221cb0ef41Sopenharmony_ci negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone); 15231cb0ef41Sopenharmony_ci } 15241cb0ef41Sopenharmony_ci} 15251cb0ef41Sopenharmony_ci 15261cb0ef41Sopenharmony_ci// static 15271cb0ef41Sopenharmony_civoid CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) { 15281cb0ef41Sopenharmony_ci DCHECK(IsCanonical(ranges)); 15291cb0ef41Sopenharmony_ci 15301cb0ef41Sopenharmony_ci // Drop all ranges that don't contain one-byte code units, and clamp the last 15311cb0ef41Sopenharmony_ci // range s.t. it likewise only contains one-byte code units. Note this relies 15321cb0ef41Sopenharmony_ci // on `ranges` being canonicalized, i.e. sorted and non-overlapping. 15331cb0ef41Sopenharmony_ci 15341cb0ef41Sopenharmony_ci static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU; 15351cb0ef41Sopenharmony_ci int n = ranges->length(); 15361cb0ef41Sopenharmony_ci for (; n > 0; n--) { 15371cb0ef41Sopenharmony_ci CharacterRange& r = ranges->at(n - 1); 15381cb0ef41Sopenharmony_ci if (r.from() <= max_char) { 15391cb0ef41Sopenharmony_ci r.to_ = std::min(r.to_, max_char); 15401cb0ef41Sopenharmony_ci break; 15411cb0ef41Sopenharmony_ci } 15421cb0ef41Sopenharmony_ci } 15431cb0ef41Sopenharmony_ci 15441cb0ef41Sopenharmony_ci ranges->Rewind(n); 15451cb0ef41Sopenharmony_ci} 15461cb0ef41Sopenharmony_ci 15471cb0ef41Sopenharmony_cinamespace { 15481cb0ef41Sopenharmony_ci 15491cb0ef41Sopenharmony_ci// Scoped object to keep track of how much we unroll quantifier loops in the 15501cb0ef41Sopenharmony_ci// regexp graph generator. 15511cb0ef41Sopenharmony_ciclass RegExpExpansionLimiter { 15521cb0ef41Sopenharmony_ci public: 15531cb0ef41Sopenharmony_ci static const int kMaxExpansionFactor = 6; 15541cb0ef41Sopenharmony_ci RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 15551cb0ef41Sopenharmony_ci : compiler_(compiler), 15561cb0ef41Sopenharmony_ci saved_expansion_factor_(compiler->current_expansion_factor()), 15571cb0ef41Sopenharmony_ci ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 15581cb0ef41Sopenharmony_ci DCHECK_LT(0, factor); 15591cb0ef41Sopenharmony_ci if (ok_to_expand_) { 15601cb0ef41Sopenharmony_ci if (factor > kMaxExpansionFactor) { 15611cb0ef41Sopenharmony_ci // Avoid integer overflow of the current expansion factor. 15621cb0ef41Sopenharmony_ci ok_to_expand_ = false; 15631cb0ef41Sopenharmony_ci compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 15641cb0ef41Sopenharmony_ci } else { 15651cb0ef41Sopenharmony_ci int new_factor = saved_expansion_factor_ * factor; 15661cb0ef41Sopenharmony_ci ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 15671cb0ef41Sopenharmony_ci compiler->set_current_expansion_factor(new_factor); 15681cb0ef41Sopenharmony_ci } 15691cb0ef41Sopenharmony_ci } 15701cb0ef41Sopenharmony_ci } 15711cb0ef41Sopenharmony_ci 15721cb0ef41Sopenharmony_ci ~RegExpExpansionLimiter() { 15731cb0ef41Sopenharmony_ci compiler_->set_current_expansion_factor(saved_expansion_factor_); 15741cb0ef41Sopenharmony_ci } 15751cb0ef41Sopenharmony_ci 15761cb0ef41Sopenharmony_ci bool ok_to_expand() { return ok_to_expand_; } 15771cb0ef41Sopenharmony_ci 15781cb0ef41Sopenharmony_ci private: 15791cb0ef41Sopenharmony_ci RegExpCompiler* compiler_; 15801cb0ef41Sopenharmony_ci int saved_expansion_factor_; 15811cb0ef41Sopenharmony_ci bool ok_to_expand_; 15821cb0ef41Sopenharmony_ci 15831cb0ef41Sopenharmony_ci DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 15841cb0ef41Sopenharmony_ci}; 15851cb0ef41Sopenharmony_ci 15861cb0ef41Sopenharmony_ci} // namespace 15871cb0ef41Sopenharmony_ci 15881cb0ef41Sopenharmony_ciRegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy, 15891cb0ef41Sopenharmony_ci RegExpTree* body, RegExpCompiler* compiler, 15901cb0ef41Sopenharmony_ci RegExpNode* on_success, 15911cb0ef41Sopenharmony_ci bool not_at_start) { 15921cb0ef41Sopenharmony_ci // x{f, t} becomes this: 15931cb0ef41Sopenharmony_ci // 15941cb0ef41Sopenharmony_ci // (r++)<-. 15951cb0ef41Sopenharmony_ci // | ` 15961cb0ef41Sopenharmony_ci // | (x) 15971cb0ef41Sopenharmony_ci // v ^ 15981cb0ef41Sopenharmony_ci // (r=0)-->(?)---/ [if r < t] 15991cb0ef41Sopenharmony_ci // | 16001cb0ef41Sopenharmony_ci // [if r >= f] \----> ... 16011cb0ef41Sopenharmony_ci // 16021cb0ef41Sopenharmony_ci 16031cb0ef41Sopenharmony_ci // 15.10.2.5 RepeatMatcher algorithm. 16041cb0ef41Sopenharmony_ci // The parser has already eliminated the case where max is 0. In the case 16051cb0ef41Sopenharmony_ci // where max_match is zero the parser has removed the quantifier if min was 16061cb0ef41Sopenharmony_ci // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 16071cb0ef41Sopenharmony_ci 16081cb0ef41Sopenharmony_ci // If we know that we cannot match zero length then things are a little 16091cb0ef41Sopenharmony_ci // simpler since we don't need to make the special zero length match check 16101cb0ef41Sopenharmony_ci // from step 2.1. If the min and max are small we can unroll a little in 16111cb0ef41Sopenharmony_ci // this case. 16121cb0ef41Sopenharmony_ci static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 16131cb0ef41Sopenharmony_ci static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 16141cb0ef41Sopenharmony_ci if (max == 0) return on_success; // This can happen due to recursion. 16151cb0ef41Sopenharmony_ci bool body_can_be_empty = (body->min_match() == 0); 16161cb0ef41Sopenharmony_ci int body_start_reg = RegExpCompiler::kNoRegister; 16171cb0ef41Sopenharmony_ci Interval capture_registers = body->CaptureRegisters(); 16181cb0ef41Sopenharmony_ci bool needs_capture_clearing = !capture_registers.is_empty(); 16191cb0ef41Sopenharmony_ci Zone* zone = compiler->zone(); 16201cb0ef41Sopenharmony_ci 16211cb0ef41Sopenharmony_ci if (body_can_be_empty) { 16221cb0ef41Sopenharmony_ci body_start_reg = compiler->AllocateRegister(); 16231cb0ef41Sopenharmony_ci } else if (compiler->optimize() && !needs_capture_clearing) { 16241cb0ef41Sopenharmony_ci // Only unroll if there are no captures and the body can't be 16251cb0ef41Sopenharmony_ci // empty. 16261cb0ef41Sopenharmony_ci { 16271cb0ef41Sopenharmony_ci RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0)); 16281cb0ef41Sopenharmony_ci if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 16291cb0ef41Sopenharmony_ci int new_max = (max == kInfinity) ? max : max - min; 16301cb0ef41Sopenharmony_ci // Recurse once to get the loop or optional matches after the fixed 16311cb0ef41Sopenharmony_ci // ones. 16321cb0ef41Sopenharmony_ci RegExpNode* answer = 16331cb0ef41Sopenharmony_ci ToNode(0, new_max, is_greedy, body, compiler, on_success, true); 16341cb0ef41Sopenharmony_ci // Unroll the forced matches from 0 to min. This can cause chains of 16351cb0ef41Sopenharmony_ci // TextNodes (which the parser does not generate). These should be 16361cb0ef41Sopenharmony_ci // combined if it turns out they hinder good code generation. 16371cb0ef41Sopenharmony_ci for (int i = 0; i < min; i++) { 16381cb0ef41Sopenharmony_ci answer = body->ToNode(compiler, answer); 16391cb0ef41Sopenharmony_ci } 16401cb0ef41Sopenharmony_ci return answer; 16411cb0ef41Sopenharmony_ci } 16421cb0ef41Sopenharmony_ci } 16431cb0ef41Sopenharmony_ci if (max <= kMaxUnrolledMaxMatches && min == 0) { 16441cb0ef41Sopenharmony_ci DCHECK_LT(0, max); // Due to the 'if' above. 16451cb0ef41Sopenharmony_ci RegExpExpansionLimiter limiter(compiler, max); 16461cb0ef41Sopenharmony_ci if (limiter.ok_to_expand()) { 16471cb0ef41Sopenharmony_ci // Unroll the optional matches up to max. 16481cb0ef41Sopenharmony_ci RegExpNode* answer = on_success; 16491cb0ef41Sopenharmony_ci for (int i = 0; i < max; i++) { 16501cb0ef41Sopenharmony_ci ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone); 16511cb0ef41Sopenharmony_ci if (is_greedy) { 16521cb0ef41Sopenharmony_ci alternation->AddAlternative( 16531cb0ef41Sopenharmony_ci GuardedAlternative(body->ToNode(compiler, answer))); 16541cb0ef41Sopenharmony_ci alternation->AddAlternative(GuardedAlternative(on_success)); 16551cb0ef41Sopenharmony_ci } else { 16561cb0ef41Sopenharmony_ci alternation->AddAlternative(GuardedAlternative(on_success)); 16571cb0ef41Sopenharmony_ci alternation->AddAlternative( 16581cb0ef41Sopenharmony_ci GuardedAlternative(body->ToNode(compiler, answer))); 16591cb0ef41Sopenharmony_ci } 16601cb0ef41Sopenharmony_ci answer = alternation; 16611cb0ef41Sopenharmony_ci if (not_at_start && !compiler->read_backward()) { 16621cb0ef41Sopenharmony_ci alternation->set_not_at_start(); 16631cb0ef41Sopenharmony_ci } 16641cb0ef41Sopenharmony_ci } 16651cb0ef41Sopenharmony_ci return answer; 16661cb0ef41Sopenharmony_ci } 16671cb0ef41Sopenharmony_ci } 16681cb0ef41Sopenharmony_ci } 16691cb0ef41Sopenharmony_ci bool has_min = min > 0; 16701cb0ef41Sopenharmony_ci bool has_max = max < RegExpTree::kInfinity; 16711cb0ef41Sopenharmony_ci bool needs_counter = has_min || has_max; 16721cb0ef41Sopenharmony_ci int reg_ctr = needs_counter ? compiler->AllocateRegister() 16731cb0ef41Sopenharmony_ci : RegExpCompiler::kNoRegister; 16741cb0ef41Sopenharmony_ci LoopChoiceNode* center = zone->New<LoopChoiceNode>( 16751cb0ef41Sopenharmony_ci body->min_match() == 0, compiler->read_backward(), min, zone); 16761cb0ef41Sopenharmony_ci if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 16771cb0ef41Sopenharmony_ci RegExpNode* loop_return = 16781cb0ef41Sopenharmony_ci needs_counter ? static_cast<RegExpNode*>( 16791cb0ef41Sopenharmony_ci ActionNode::IncrementRegister(reg_ctr, center)) 16801cb0ef41Sopenharmony_ci : static_cast<RegExpNode*>(center); 16811cb0ef41Sopenharmony_ci if (body_can_be_empty) { 16821cb0ef41Sopenharmony_ci // If the body can be empty we need to check if it was and then 16831cb0ef41Sopenharmony_ci // backtrack. 16841cb0ef41Sopenharmony_ci loop_return = 16851cb0ef41Sopenharmony_ci ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return); 16861cb0ef41Sopenharmony_ci } 16871cb0ef41Sopenharmony_ci RegExpNode* body_node = body->ToNode(compiler, loop_return); 16881cb0ef41Sopenharmony_ci if (body_can_be_empty) { 16891cb0ef41Sopenharmony_ci // If the body can be empty we need to store the start position 16901cb0ef41Sopenharmony_ci // so we can bail out if it was empty. 16911cb0ef41Sopenharmony_ci body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 16921cb0ef41Sopenharmony_ci } 16931cb0ef41Sopenharmony_ci if (needs_capture_clearing) { 16941cb0ef41Sopenharmony_ci // Before entering the body of this loop we need to clear captures. 16951cb0ef41Sopenharmony_ci body_node = ActionNode::ClearCaptures(capture_registers, body_node); 16961cb0ef41Sopenharmony_ci } 16971cb0ef41Sopenharmony_ci GuardedAlternative body_alt(body_node); 16981cb0ef41Sopenharmony_ci if (has_max) { 16991cb0ef41Sopenharmony_ci Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max); 17001cb0ef41Sopenharmony_ci body_alt.AddGuard(body_guard, zone); 17011cb0ef41Sopenharmony_ci } 17021cb0ef41Sopenharmony_ci GuardedAlternative rest_alt(on_success); 17031cb0ef41Sopenharmony_ci if (has_min) { 17041cb0ef41Sopenharmony_ci Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min); 17051cb0ef41Sopenharmony_ci rest_alt.AddGuard(rest_guard, zone); 17061cb0ef41Sopenharmony_ci } 17071cb0ef41Sopenharmony_ci if (is_greedy) { 17081cb0ef41Sopenharmony_ci center->AddLoopAlternative(body_alt); 17091cb0ef41Sopenharmony_ci center->AddContinueAlternative(rest_alt); 17101cb0ef41Sopenharmony_ci } else { 17111cb0ef41Sopenharmony_ci center->AddContinueAlternative(rest_alt); 17121cb0ef41Sopenharmony_ci center->AddLoopAlternative(body_alt); 17131cb0ef41Sopenharmony_ci } 17141cb0ef41Sopenharmony_ci if (needs_counter) { 17151cb0ef41Sopenharmony_ci return ActionNode::SetRegisterForLoop(reg_ctr, 0, center); 17161cb0ef41Sopenharmony_ci } else { 17171cb0ef41Sopenharmony_ci return center; 17181cb0ef41Sopenharmony_ci } 17191cb0ef41Sopenharmony_ci} 17201cb0ef41Sopenharmony_ci 17211cb0ef41Sopenharmony_ci} // namespace internal 17221cb0ef41Sopenharmony_ci} // namespace v8 1723