11cb0ef41Sopenharmony_ci// Copyright 2019 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#ifndef V8_REGEXP_REGEXP_COMPILER_H_ 61cb0ef41Sopenharmony_ci#define V8_REGEXP_REGEXP_COMPILER_H_ 71cb0ef41Sopenharmony_ci 81cb0ef41Sopenharmony_ci#include <bitset> 91cb0ef41Sopenharmony_ci 101cb0ef41Sopenharmony_ci#include "src/base/small-vector.h" 111cb0ef41Sopenharmony_ci#include "src/base/strings.h" 121cb0ef41Sopenharmony_ci#include "src/regexp/regexp-flags.h" 131cb0ef41Sopenharmony_ci#include "src/regexp/regexp-nodes.h" 141cb0ef41Sopenharmony_ci 151cb0ef41Sopenharmony_cinamespace v8 { 161cb0ef41Sopenharmony_cinamespace internal { 171cb0ef41Sopenharmony_ci 181cb0ef41Sopenharmony_ciclass DynamicBitSet; 191cb0ef41Sopenharmony_ciclass Isolate; 201cb0ef41Sopenharmony_ci 211cb0ef41Sopenharmony_cinamespace regexp_compiler_constants { 221cb0ef41Sopenharmony_ci 231cb0ef41Sopenharmony_ci// The '2' variant is has inclusive from and exclusive to. 241cb0ef41Sopenharmony_ci// This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 251cb0ef41Sopenharmony_ci// which include WhiteSpace (7.2) or LineTerminator (7.3) values. 261cb0ef41Sopenharmony_ciconstexpr base::uc32 kRangeEndMarker = 0x110000; 271cb0ef41Sopenharmony_ciconstexpr int kSpaceRanges[] = { 281cb0ef41Sopenharmony_ci '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 291cb0ef41Sopenharmony_ci 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 301cb0ef41Sopenharmony_ci 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; 311cb0ef41Sopenharmony_ciconstexpr int kSpaceRangeCount = arraysize(kSpaceRanges); 321cb0ef41Sopenharmony_ci 331cb0ef41Sopenharmony_ciconstexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', 341cb0ef41Sopenharmony_ci '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; 351cb0ef41Sopenharmony_ciconstexpr int kWordRangeCount = arraysize(kWordRanges); 361cb0ef41Sopenharmony_ciconstexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; 371cb0ef41Sopenharmony_ciconstexpr int kDigitRangeCount = arraysize(kDigitRanges); 381cb0ef41Sopenharmony_ciconstexpr int kSurrogateRanges[] = {kLeadSurrogateStart, 391cb0ef41Sopenharmony_ci kLeadSurrogateStart + 1, kRangeEndMarker}; 401cb0ef41Sopenharmony_ciconstexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges); 411cb0ef41Sopenharmony_ciconstexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, 421cb0ef41Sopenharmony_ci 0x2028, 0x202A, kRangeEndMarker}; 431cb0ef41Sopenharmony_ciconstexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 441cb0ef41Sopenharmony_ci 451cb0ef41Sopenharmony_ci// More makes code generation slower, less makes V8 benchmark score lower. 461cb0ef41Sopenharmony_ciconstexpr int kMaxLookaheadForBoyerMoore = 8; 471cb0ef41Sopenharmony_ci// In a 3-character pattern you can maximally step forwards 3 characters 481cb0ef41Sopenharmony_ci// at a time, which is not always enough to pay for the extra logic. 491cb0ef41Sopenharmony_ciconstexpr int kPatternTooShortForBoyerMoore = 2; 501cb0ef41Sopenharmony_ci 511cb0ef41Sopenharmony_ci} // namespace regexp_compiler_constants 521cb0ef41Sopenharmony_ci 531cb0ef41Sopenharmony_ciinline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) { 541cb0ef41Sopenharmony_ci // Both unicode and ignore_case flags are set. We need to use ICU to find 551cb0ef41Sopenharmony_ci // the closure over case equivalents. 561cb0ef41Sopenharmony_ci return IsUnicode(flags) && IsIgnoreCase(flags); 571cb0ef41Sopenharmony_ci} 581cb0ef41Sopenharmony_ci 591cb0ef41Sopenharmony_ci// Details of a quick mask-compare check that can look ahead in the 601cb0ef41Sopenharmony_ci// input stream. 611cb0ef41Sopenharmony_ciclass QuickCheckDetails { 621cb0ef41Sopenharmony_ci public: 631cb0ef41Sopenharmony_ci QuickCheckDetails() 641cb0ef41Sopenharmony_ci : characters_(0), mask_(0), value_(0), cannot_match_(false) {} 651cb0ef41Sopenharmony_ci explicit QuickCheckDetails(int characters) 661cb0ef41Sopenharmony_ci : characters_(characters), mask_(0), value_(0), cannot_match_(false) {} 671cb0ef41Sopenharmony_ci bool Rationalize(bool one_byte); 681cb0ef41Sopenharmony_ci // Merge in the information from another branch of an alternation. 691cb0ef41Sopenharmony_ci void Merge(QuickCheckDetails* other, int from_index); 701cb0ef41Sopenharmony_ci // Advance the current position by some amount. 711cb0ef41Sopenharmony_ci void Advance(int by, bool one_byte); 721cb0ef41Sopenharmony_ci void Clear(); 731cb0ef41Sopenharmony_ci bool cannot_match() { return cannot_match_; } 741cb0ef41Sopenharmony_ci void set_cannot_match() { cannot_match_ = true; } 751cb0ef41Sopenharmony_ci struct Position { 761cb0ef41Sopenharmony_ci Position() : mask(0), value(0), determines_perfectly(false) {} 771cb0ef41Sopenharmony_ci base::uc32 mask; 781cb0ef41Sopenharmony_ci base::uc32 value; 791cb0ef41Sopenharmony_ci bool determines_perfectly; 801cb0ef41Sopenharmony_ci }; 811cb0ef41Sopenharmony_ci int characters() { return characters_; } 821cb0ef41Sopenharmony_ci void set_characters(int characters) { characters_ = characters; } 831cb0ef41Sopenharmony_ci Position* positions(int index) { 841cb0ef41Sopenharmony_ci DCHECK_LE(0, index); 851cb0ef41Sopenharmony_ci DCHECK_GT(characters_, index); 861cb0ef41Sopenharmony_ci return positions_ + index; 871cb0ef41Sopenharmony_ci } 881cb0ef41Sopenharmony_ci uint32_t mask() { return mask_; } 891cb0ef41Sopenharmony_ci uint32_t value() { return value_; } 901cb0ef41Sopenharmony_ci 911cb0ef41Sopenharmony_ci private: 921cb0ef41Sopenharmony_ci // How many characters do we have quick check information from. This is 931cb0ef41Sopenharmony_ci // the same for all branches of a choice node. 941cb0ef41Sopenharmony_ci int characters_; 951cb0ef41Sopenharmony_ci Position positions_[4]; 961cb0ef41Sopenharmony_ci // These values are the condensate of the above array after Rationalize(). 971cb0ef41Sopenharmony_ci uint32_t mask_; 981cb0ef41Sopenharmony_ci uint32_t value_; 991cb0ef41Sopenharmony_ci // If set to true, there is no way this quick check can match at all. 1001cb0ef41Sopenharmony_ci // E.g., if it requires to be at the start of the input, and isn't. 1011cb0ef41Sopenharmony_ci bool cannot_match_; 1021cb0ef41Sopenharmony_ci}; 1031cb0ef41Sopenharmony_ci 1041cb0ef41Sopenharmony_ci// Improve the speed that we scan for an initial point where a non-anchored 1051cb0ef41Sopenharmony_ci// regexp can match by using a Boyer-Moore-like table. This is done by 1061cb0ef41Sopenharmony_ci// identifying non-greedy non-capturing loops in the nodes that eat any 1071cb0ef41Sopenharmony_ci// character one at a time. For example in the middle of the regexp 1081cb0ef41Sopenharmony_ci// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly 1091cb0ef41Sopenharmony_ci// inserted at the start of any non-anchored regexp. 1101cb0ef41Sopenharmony_ci// 1111cb0ef41Sopenharmony_ci// When we have found such a loop we look ahead in the nodes to find the set of 1121cb0ef41Sopenharmony_ci// characters that can come at given distances. For example for the regexp 1131cb0ef41Sopenharmony_ci// /.?foo/ we know that there are at least 3 characters ahead of us, and the 1141cb0ef41Sopenharmony_ci// sets of characters that can occur are [any, [f, o], [o]]. We find a range in 1151cb0ef41Sopenharmony_ci// the lookahead info where the set of characters is reasonably constrained. In 1161cb0ef41Sopenharmony_ci// our example this is from index 1 to 2 (0 is not constrained). We can now 1171cb0ef41Sopenharmony_ci// look 3 characters ahead and if we don't find one of [f, o] (the union of 1181cb0ef41Sopenharmony_ci// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). 1191cb0ef41Sopenharmony_ci// 1201cb0ef41Sopenharmony_ci// For Unicode input strings we do the same, but modulo 128. 1211cb0ef41Sopenharmony_ci// 1221cb0ef41Sopenharmony_ci// We also look at the first string fed to the regexp and use that to get a hint 1231cb0ef41Sopenharmony_ci// of the character frequencies in the inputs. This affects the assessment of 1241cb0ef41Sopenharmony_ci// whether the set of characters is 'reasonably constrained'. 1251cb0ef41Sopenharmony_ci// 1261cb0ef41Sopenharmony_ci// We also have another lookahead mechanism (called quick check in the code), 1271cb0ef41Sopenharmony_ci// which uses a wide load of multiple characters followed by a mask and compare 1281cb0ef41Sopenharmony_ci// to determine whether a match is possible at this point. 1291cb0ef41Sopenharmony_cienum ContainedInLattice { 1301cb0ef41Sopenharmony_ci kNotYet = 0, 1311cb0ef41Sopenharmony_ci kLatticeIn = 1, 1321cb0ef41Sopenharmony_ci kLatticeOut = 2, 1331cb0ef41Sopenharmony_ci kLatticeUnknown = 3 // Can also mean both in and out. 1341cb0ef41Sopenharmony_ci}; 1351cb0ef41Sopenharmony_ci 1361cb0ef41Sopenharmony_ciinline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { 1371cb0ef41Sopenharmony_ci return static_cast<ContainedInLattice>(a | b); 1381cb0ef41Sopenharmony_ci} 1391cb0ef41Sopenharmony_ci 1401cb0ef41Sopenharmony_ciclass BoyerMoorePositionInfo : public ZoneObject { 1411cb0ef41Sopenharmony_ci public: 1421cb0ef41Sopenharmony_ci bool at(int i) const { return map_[i]; } 1431cb0ef41Sopenharmony_ci 1441cb0ef41Sopenharmony_ci static constexpr int kMapSize = 128; 1451cb0ef41Sopenharmony_ci static constexpr int kMask = kMapSize - 1; 1461cb0ef41Sopenharmony_ci 1471cb0ef41Sopenharmony_ci int map_count() const { return map_count_; } 1481cb0ef41Sopenharmony_ci 1491cb0ef41Sopenharmony_ci void Set(int character); 1501cb0ef41Sopenharmony_ci void SetInterval(const Interval& interval); 1511cb0ef41Sopenharmony_ci void SetAll(); 1521cb0ef41Sopenharmony_ci 1531cb0ef41Sopenharmony_ci bool is_non_word() { return w_ == kLatticeOut; } 1541cb0ef41Sopenharmony_ci bool is_word() { return w_ == kLatticeIn; } 1551cb0ef41Sopenharmony_ci 1561cb0ef41Sopenharmony_ci using Bitset = std::bitset<kMapSize>; 1571cb0ef41Sopenharmony_ci Bitset raw_bitset() const { return map_; } 1581cb0ef41Sopenharmony_ci 1591cb0ef41Sopenharmony_ci private: 1601cb0ef41Sopenharmony_ci Bitset map_; 1611cb0ef41Sopenharmony_ci int map_count_ = 0; // Number of set bits in the map. 1621cb0ef41Sopenharmony_ci ContainedInLattice w_ = kNotYet; // The \w character class. 1631cb0ef41Sopenharmony_ci}; 1641cb0ef41Sopenharmony_ci 1651cb0ef41Sopenharmony_ciclass BoyerMooreLookahead : public ZoneObject { 1661cb0ef41Sopenharmony_ci public: 1671cb0ef41Sopenharmony_ci BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); 1681cb0ef41Sopenharmony_ci 1691cb0ef41Sopenharmony_ci int length() { return length_; } 1701cb0ef41Sopenharmony_ci int max_char() { return max_char_; } 1711cb0ef41Sopenharmony_ci RegExpCompiler* compiler() { return compiler_; } 1721cb0ef41Sopenharmony_ci 1731cb0ef41Sopenharmony_ci int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); } 1741cb0ef41Sopenharmony_ci 1751cb0ef41Sopenharmony_ci BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } 1761cb0ef41Sopenharmony_ci 1771cb0ef41Sopenharmony_ci void Set(int map_number, int character) { 1781cb0ef41Sopenharmony_ci if (character > max_char_) return; 1791cb0ef41Sopenharmony_ci BoyerMoorePositionInfo* info = bitmaps_->at(map_number); 1801cb0ef41Sopenharmony_ci info->Set(character); 1811cb0ef41Sopenharmony_ci } 1821cb0ef41Sopenharmony_ci 1831cb0ef41Sopenharmony_ci void SetInterval(int map_number, const Interval& interval) { 1841cb0ef41Sopenharmony_ci if (interval.from() > max_char_) return; 1851cb0ef41Sopenharmony_ci BoyerMoorePositionInfo* info = bitmaps_->at(map_number); 1861cb0ef41Sopenharmony_ci if (interval.to() > max_char_) { 1871cb0ef41Sopenharmony_ci info->SetInterval(Interval(interval.from(), max_char_)); 1881cb0ef41Sopenharmony_ci } else { 1891cb0ef41Sopenharmony_ci info->SetInterval(interval); 1901cb0ef41Sopenharmony_ci } 1911cb0ef41Sopenharmony_ci } 1921cb0ef41Sopenharmony_ci 1931cb0ef41Sopenharmony_ci void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); } 1941cb0ef41Sopenharmony_ci 1951cb0ef41Sopenharmony_ci void SetRest(int from_map) { 1961cb0ef41Sopenharmony_ci for (int i = from_map; i < length_; i++) SetAll(i); 1971cb0ef41Sopenharmony_ci } 1981cb0ef41Sopenharmony_ci void EmitSkipInstructions(RegExpMacroAssembler* masm); 1991cb0ef41Sopenharmony_ci 2001cb0ef41Sopenharmony_ci private: 2011cb0ef41Sopenharmony_ci // This is the value obtained by EatsAtLeast. If we do not have at least this 2021cb0ef41Sopenharmony_ci // many characters left in the sample string then the match is bound to fail. 2031cb0ef41Sopenharmony_ci // Therefore it is OK to read a character this far ahead of the current match 2041cb0ef41Sopenharmony_ci // point. 2051cb0ef41Sopenharmony_ci int length_; 2061cb0ef41Sopenharmony_ci RegExpCompiler* compiler_; 2071cb0ef41Sopenharmony_ci // 0xff for Latin1, 0xffff for UTF-16. 2081cb0ef41Sopenharmony_ci int max_char_; 2091cb0ef41Sopenharmony_ci ZoneList<BoyerMoorePositionInfo*>* bitmaps_; 2101cb0ef41Sopenharmony_ci 2111cb0ef41Sopenharmony_ci int GetSkipTable(int min_lookahead, int max_lookahead, 2121cb0ef41Sopenharmony_ci Handle<ByteArray> boolean_skip_table); 2131cb0ef41Sopenharmony_ci bool FindWorthwhileInterval(int* from, int* to); 2141cb0ef41Sopenharmony_ci int FindBestInterval(int max_number_of_chars, int old_biggest_points, 2151cb0ef41Sopenharmony_ci int* from, int* to); 2161cb0ef41Sopenharmony_ci}; 2171cb0ef41Sopenharmony_ci 2181cb0ef41Sopenharmony_ci// There are many ways to generate code for a node. This class encapsulates 2191cb0ef41Sopenharmony_ci// the current way we should be generating. In other words it encapsulates 2201cb0ef41Sopenharmony_ci// the current state of the code generator. The effect of this is that we 2211cb0ef41Sopenharmony_ci// generate code for paths that the matcher can take through the regular 2221cb0ef41Sopenharmony_ci// expression. A given node in the regexp can be code-generated several times 2231cb0ef41Sopenharmony_ci// as it can be part of several traces. For example for the regexp: 2241cb0ef41Sopenharmony_ci// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part 2251cb0ef41Sopenharmony_ci// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code 2261cb0ef41Sopenharmony_ci// to match foo is generated only once (the traces have a common prefix). The 2271cb0ef41Sopenharmony_ci// code to store the capture is deferred and generated (twice) after the places 2281cb0ef41Sopenharmony_ci// where baz has been matched. 2291cb0ef41Sopenharmony_ciclass Trace { 2301cb0ef41Sopenharmony_ci public: 2311cb0ef41Sopenharmony_ci // A value for a property that is either known to be true, know to be false, 2321cb0ef41Sopenharmony_ci // or not known. 2331cb0ef41Sopenharmony_ci enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 }; 2341cb0ef41Sopenharmony_ci 2351cb0ef41Sopenharmony_ci class DeferredAction { 2361cb0ef41Sopenharmony_ci public: 2371cb0ef41Sopenharmony_ci DeferredAction(ActionNode::ActionType action_type, int reg) 2381cb0ef41Sopenharmony_ci : action_type_(action_type), reg_(reg), next_(nullptr) {} 2391cb0ef41Sopenharmony_ci DeferredAction* next() { return next_; } 2401cb0ef41Sopenharmony_ci bool Mentions(int reg); 2411cb0ef41Sopenharmony_ci int reg() { return reg_; } 2421cb0ef41Sopenharmony_ci ActionNode::ActionType action_type() { return action_type_; } 2431cb0ef41Sopenharmony_ci 2441cb0ef41Sopenharmony_ci private: 2451cb0ef41Sopenharmony_ci ActionNode::ActionType action_type_; 2461cb0ef41Sopenharmony_ci int reg_; 2471cb0ef41Sopenharmony_ci DeferredAction* next_; 2481cb0ef41Sopenharmony_ci friend class Trace; 2491cb0ef41Sopenharmony_ci }; 2501cb0ef41Sopenharmony_ci 2511cb0ef41Sopenharmony_ci class DeferredCapture : public DeferredAction { 2521cb0ef41Sopenharmony_ci public: 2531cb0ef41Sopenharmony_ci DeferredCapture(int reg, bool is_capture, Trace* trace) 2541cb0ef41Sopenharmony_ci : DeferredAction(ActionNode::STORE_POSITION, reg), 2551cb0ef41Sopenharmony_ci cp_offset_(trace->cp_offset()), 2561cb0ef41Sopenharmony_ci is_capture_(is_capture) {} 2571cb0ef41Sopenharmony_ci int cp_offset() { return cp_offset_; } 2581cb0ef41Sopenharmony_ci bool is_capture() { return is_capture_; } 2591cb0ef41Sopenharmony_ci 2601cb0ef41Sopenharmony_ci private: 2611cb0ef41Sopenharmony_ci int cp_offset_; 2621cb0ef41Sopenharmony_ci bool is_capture_; 2631cb0ef41Sopenharmony_ci void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 2641cb0ef41Sopenharmony_ci }; 2651cb0ef41Sopenharmony_ci 2661cb0ef41Sopenharmony_ci class DeferredSetRegisterForLoop : public DeferredAction { 2671cb0ef41Sopenharmony_ci public: 2681cb0ef41Sopenharmony_ci DeferredSetRegisterForLoop(int reg, int value) 2691cb0ef41Sopenharmony_ci : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg), 2701cb0ef41Sopenharmony_ci value_(value) {} 2711cb0ef41Sopenharmony_ci int value() { return value_; } 2721cb0ef41Sopenharmony_ci 2731cb0ef41Sopenharmony_ci private: 2741cb0ef41Sopenharmony_ci int value_; 2751cb0ef41Sopenharmony_ci }; 2761cb0ef41Sopenharmony_ci 2771cb0ef41Sopenharmony_ci class DeferredClearCaptures : public DeferredAction { 2781cb0ef41Sopenharmony_ci public: 2791cb0ef41Sopenharmony_ci explicit DeferredClearCaptures(Interval range) 2801cb0ef41Sopenharmony_ci : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {} 2811cb0ef41Sopenharmony_ci Interval range() { return range_; } 2821cb0ef41Sopenharmony_ci 2831cb0ef41Sopenharmony_ci private: 2841cb0ef41Sopenharmony_ci Interval range_; 2851cb0ef41Sopenharmony_ci }; 2861cb0ef41Sopenharmony_ci 2871cb0ef41Sopenharmony_ci class DeferredIncrementRegister : public DeferredAction { 2881cb0ef41Sopenharmony_ci public: 2891cb0ef41Sopenharmony_ci explicit DeferredIncrementRegister(int reg) 2901cb0ef41Sopenharmony_ci : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {} 2911cb0ef41Sopenharmony_ci }; 2921cb0ef41Sopenharmony_ci 2931cb0ef41Sopenharmony_ci Trace() 2941cb0ef41Sopenharmony_ci : cp_offset_(0), 2951cb0ef41Sopenharmony_ci actions_(nullptr), 2961cb0ef41Sopenharmony_ci backtrack_(nullptr), 2971cb0ef41Sopenharmony_ci stop_node_(nullptr), 2981cb0ef41Sopenharmony_ci loop_label_(nullptr), 2991cb0ef41Sopenharmony_ci characters_preloaded_(0), 3001cb0ef41Sopenharmony_ci bound_checked_up_to_(0), 3011cb0ef41Sopenharmony_ci flush_budget_(100), 3021cb0ef41Sopenharmony_ci at_start_(UNKNOWN) {} 3031cb0ef41Sopenharmony_ci 3041cb0ef41Sopenharmony_ci // End the trace. This involves flushing the deferred actions in the trace 3051cb0ef41Sopenharmony_ci // and pushing a backtrack location onto the backtrack stack. Once this is 3061cb0ef41Sopenharmony_ci // done we can start a new trace or go to one that has already been 3071cb0ef41Sopenharmony_ci // generated. 3081cb0ef41Sopenharmony_ci void Flush(RegExpCompiler* compiler, RegExpNode* successor); 3091cb0ef41Sopenharmony_ci int cp_offset() { return cp_offset_; } 3101cb0ef41Sopenharmony_ci DeferredAction* actions() { return actions_; } 3111cb0ef41Sopenharmony_ci // A trivial trace is one that has no deferred actions or other state that 3121cb0ef41Sopenharmony_ci // affects the assumptions used when generating code. There is no recorded 3131cb0ef41Sopenharmony_ci // backtrack location in a trivial trace, so with a trivial trace we will 3141cb0ef41Sopenharmony_ci // generate code that, on a failure to match, gets the backtrack location 3151cb0ef41Sopenharmony_ci // from the backtrack stack rather than using a direct jump instruction. We 3161cb0ef41Sopenharmony_ci // always start code generation with a trivial trace and non-trivial traces 3171cb0ef41Sopenharmony_ci // are created as we emit code for nodes or add to the list of deferred 3181cb0ef41Sopenharmony_ci // actions in the trace. The location of the code generated for a node using 3191cb0ef41Sopenharmony_ci // a trivial trace is recorded in a label in the node so that gotos can be 3201cb0ef41Sopenharmony_ci // generated to that code. 3211cb0ef41Sopenharmony_ci bool is_trivial() { 3221cb0ef41Sopenharmony_ci return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 && 3231cb0ef41Sopenharmony_ci characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && 3241cb0ef41Sopenharmony_ci quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; 3251cb0ef41Sopenharmony_ci } 3261cb0ef41Sopenharmony_ci TriBool at_start() { return at_start_; } 3271cb0ef41Sopenharmony_ci void set_at_start(TriBool at_start) { at_start_ = at_start; } 3281cb0ef41Sopenharmony_ci Label* backtrack() { return backtrack_; } 3291cb0ef41Sopenharmony_ci Label* loop_label() { return loop_label_; } 3301cb0ef41Sopenharmony_ci RegExpNode* stop_node() { return stop_node_; } 3311cb0ef41Sopenharmony_ci int characters_preloaded() { return characters_preloaded_; } 3321cb0ef41Sopenharmony_ci int bound_checked_up_to() { return bound_checked_up_to_; } 3331cb0ef41Sopenharmony_ci int flush_budget() { return flush_budget_; } 3341cb0ef41Sopenharmony_ci QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } 3351cb0ef41Sopenharmony_ci bool mentions_reg(int reg); 3361cb0ef41Sopenharmony_ci // Returns true if a deferred position store exists to the specified 3371cb0ef41Sopenharmony_ci // register and stores the offset in the out-parameter. Otherwise 3381cb0ef41Sopenharmony_ci // returns false. 3391cb0ef41Sopenharmony_ci bool GetStoredPosition(int reg, int* cp_offset); 3401cb0ef41Sopenharmony_ci // These set methods and AdvanceCurrentPositionInTrace should be used only on 3411cb0ef41Sopenharmony_ci // new traces - the intention is that traces are immutable after creation. 3421cb0ef41Sopenharmony_ci void add_action(DeferredAction* new_action) { 3431cb0ef41Sopenharmony_ci DCHECK(new_action->next_ == nullptr); 3441cb0ef41Sopenharmony_ci new_action->next_ = actions_; 3451cb0ef41Sopenharmony_ci actions_ = new_action; 3461cb0ef41Sopenharmony_ci } 3471cb0ef41Sopenharmony_ci void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } 3481cb0ef41Sopenharmony_ci void set_stop_node(RegExpNode* node) { stop_node_ = node; } 3491cb0ef41Sopenharmony_ci void set_loop_label(Label* label) { loop_label_ = label; } 3501cb0ef41Sopenharmony_ci void set_characters_preloaded(int count) { characters_preloaded_ = count; } 3511cb0ef41Sopenharmony_ci void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } 3521cb0ef41Sopenharmony_ci void set_flush_budget(int to) { flush_budget_ = to; } 3531cb0ef41Sopenharmony_ci void set_quick_check_performed(QuickCheckDetails* d) { 3541cb0ef41Sopenharmony_ci quick_check_performed_ = *d; 3551cb0ef41Sopenharmony_ci } 3561cb0ef41Sopenharmony_ci void InvalidateCurrentCharacter(); 3571cb0ef41Sopenharmony_ci void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); 3581cb0ef41Sopenharmony_ci 3591cb0ef41Sopenharmony_ci private: 3601cb0ef41Sopenharmony_ci int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone); 3611cb0ef41Sopenharmony_ci void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, 3621cb0ef41Sopenharmony_ci const DynamicBitSet& affected_registers, 3631cb0ef41Sopenharmony_ci DynamicBitSet* registers_to_pop, 3641cb0ef41Sopenharmony_ci DynamicBitSet* registers_to_clear, Zone* zone); 3651cb0ef41Sopenharmony_ci void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, 3661cb0ef41Sopenharmony_ci const DynamicBitSet& registers_to_pop, 3671cb0ef41Sopenharmony_ci const DynamicBitSet& registers_to_clear); 3681cb0ef41Sopenharmony_ci int cp_offset_; 3691cb0ef41Sopenharmony_ci DeferredAction* actions_; 3701cb0ef41Sopenharmony_ci Label* backtrack_; 3711cb0ef41Sopenharmony_ci RegExpNode* stop_node_; 3721cb0ef41Sopenharmony_ci Label* loop_label_; 3731cb0ef41Sopenharmony_ci int characters_preloaded_; 3741cb0ef41Sopenharmony_ci int bound_checked_up_to_; 3751cb0ef41Sopenharmony_ci QuickCheckDetails quick_check_performed_; 3761cb0ef41Sopenharmony_ci int flush_budget_; 3771cb0ef41Sopenharmony_ci TriBool at_start_; 3781cb0ef41Sopenharmony_ci}; 3791cb0ef41Sopenharmony_ci 3801cb0ef41Sopenharmony_ciclass GreedyLoopState { 3811cb0ef41Sopenharmony_ci public: 3821cb0ef41Sopenharmony_ci explicit GreedyLoopState(bool not_at_start); 3831cb0ef41Sopenharmony_ci 3841cb0ef41Sopenharmony_ci Label* label() { return &label_; } 3851cb0ef41Sopenharmony_ci Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } 3861cb0ef41Sopenharmony_ci 3871cb0ef41Sopenharmony_ci private: 3881cb0ef41Sopenharmony_ci Label label_; 3891cb0ef41Sopenharmony_ci Trace counter_backtrack_trace_; 3901cb0ef41Sopenharmony_ci}; 3911cb0ef41Sopenharmony_ci 3921cb0ef41Sopenharmony_cistruct PreloadState { 3931cb0ef41Sopenharmony_ci static const int kEatsAtLeastNotYetInitialized = -1; 3941cb0ef41Sopenharmony_ci bool preload_is_current_; 3951cb0ef41Sopenharmony_ci bool preload_has_checked_bounds_; 3961cb0ef41Sopenharmony_ci int preload_characters_; 3971cb0ef41Sopenharmony_ci int eats_at_least_; 3981cb0ef41Sopenharmony_ci void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; } 3991cb0ef41Sopenharmony_ci}; 4001cb0ef41Sopenharmony_ci 4011cb0ef41Sopenharmony_ci// Analysis performs assertion propagation and computes eats_at_least_ values. 4021cb0ef41Sopenharmony_ci// See the comments on AssertionPropagator and EatsAtLeastPropagator for more 4031cb0ef41Sopenharmony_ci// details. 4041cb0ef41Sopenharmony_ciRegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, 4051cb0ef41Sopenharmony_ci RegExpNode* node); 4061cb0ef41Sopenharmony_ci 4071cb0ef41Sopenharmony_ciclass FrequencyCollator { 4081cb0ef41Sopenharmony_ci public: 4091cb0ef41Sopenharmony_ci FrequencyCollator() : total_samples_(0) { 4101cb0ef41Sopenharmony_ci for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 4111cb0ef41Sopenharmony_ci frequencies_[i] = CharacterFrequency(i); 4121cb0ef41Sopenharmony_ci } 4131cb0ef41Sopenharmony_ci } 4141cb0ef41Sopenharmony_ci 4151cb0ef41Sopenharmony_ci void CountCharacter(int character) { 4161cb0ef41Sopenharmony_ci int index = (character & RegExpMacroAssembler::kTableMask); 4171cb0ef41Sopenharmony_ci frequencies_[index].Increment(); 4181cb0ef41Sopenharmony_ci total_samples_++; 4191cb0ef41Sopenharmony_ci } 4201cb0ef41Sopenharmony_ci 4211cb0ef41Sopenharmony_ci // Does not measure in percent, but rather per-128 (the table size from the 4221cb0ef41Sopenharmony_ci // regexp macro assembler). 4231cb0ef41Sopenharmony_ci int Frequency(int in_character) { 4241cb0ef41Sopenharmony_ci DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 4251cb0ef41Sopenharmony_ci if (total_samples_ < 1) return 1; // Division by zero. 4261cb0ef41Sopenharmony_ci int freq_in_per128 = 4271cb0ef41Sopenharmony_ci (frequencies_[in_character].counter() * 128) / total_samples_; 4281cb0ef41Sopenharmony_ci return freq_in_per128; 4291cb0ef41Sopenharmony_ci } 4301cb0ef41Sopenharmony_ci 4311cb0ef41Sopenharmony_ci private: 4321cb0ef41Sopenharmony_ci class CharacterFrequency { 4331cb0ef41Sopenharmony_ci public: 4341cb0ef41Sopenharmony_ci CharacterFrequency() : counter_(0), character_(-1) {} 4351cb0ef41Sopenharmony_ci explicit CharacterFrequency(int character) 4361cb0ef41Sopenharmony_ci : counter_(0), character_(character) {} 4371cb0ef41Sopenharmony_ci 4381cb0ef41Sopenharmony_ci void Increment() { counter_++; } 4391cb0ef41Sopenharmony_ci int counter() { return counter_; } 4401cb0ef41Sopenharmony_ci int character() { return character_; } 4411cb0ef41Sopenharmony_ci 4421cb0ef41Sopenharmony_ci private: 4431cb0ef41Sopenharmony_ci int counter_; 4441cb0ef41Sopenharmony_ci int character_; 4451cb0ef41Sopenharmony_ci }; 4461cb0ef41Sopenharmony_ci 4471cb0ef41Sopenharmony_ci private: 4481cb0ef41Sopenharmony_ci CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 4491cb0ef41Sopenharmony_ci int total_samples_; 4501cb0ef41Sopenharmony_ci}; 4511cb0ef41Sopenharmony_ci 4521cb0ef41Sopenharmony_ciclass RegExpCompiler { 4531cb0ef41Sopenharmony_ci public: 4541cb0ef41Sopenharmony_ci RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 4551cb0ef41Sopenharmony_ci RegExpFlags flags, bool is_one_byte); 4561cb0ef41Sopenharmony_ci 4571cb0ef41Sopenharmony_ci int AllocateRegister() { 4581cb0ef41Sopenharmony_ci if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 4591cb0ef41Sopenharmony_ci reg_exp_too_big_ = true; 4601cb0ef41Sopenharmony_ci return next_register_; 4611cb0ef41Sopenharmony_ci } 4621cb0ef41Sopenharmony_ci return next_register_++; 4631cb0ef41Sopenharmony_ci } 4641cb0ef41Sopenharmony_ci 4651cb0ef41Sopenharmony_ci // Lookarounds to match lone surrogates for unicode character class matches 4661cb0ef41Sopenharmony_ci // are never nested. We can therefore reuse registers. 4671cb0ef41Sopenharmony_ci int UnicodeLookaroundStackRegister() { 4681cb0ef41Sopenharmony_ci if (unicode_lookaround_stack_register_ == kNoRegister) { 4691cb0ef41Sopenharmony_ci unicode_lookaround_stack_register_ = AllocateRegister(); 4701cb0ef41Sopenharmony_ci } 4711cb0ef41Sopenharmony_ci return unicode_lookaround_stack_register_; 4721cb0ef41Sopenharmony_ci } 4731cb0ef41Sopenharmony_ci 4741cb0ef41Sopenharmony_ci int UnicodeLookaroundPositionRegister() { 4751cb0ef41Sopenharmony_ci if (unicode_lookaround_position_register_ == kNoRegister) { 4761cb0ef41Sopenharmony_ci unicode_lookaround_position_register_ = AllocateRegister(); 4771cb0ef41Sopenharmony_ci } 4781cb0ef41Sopenharmony_ci return unicode_lookaround_position_register_; 4791cb0ef41Sopenharmony_ci } 4801cb0ef41Sopenharmony_ci 4811cb0ef41Sopenharmony_ci struct CompilationResult final { 4821cb0ef41Sopenharmony_ci explicit CompilationResult(RegExpError err) : error(err) {} 4831cb0ef41Sopenharmony_ci CompilationResult(Handle<Object> code, int registers) 4841cb0ef41Sopenharmony_ci : code(code), num_registers(registers) {} 4851cb0ef41Sopenharmony_ci 4861cb0ef41Sopenharmony_ci static CompilationResult RegExpTooBig() { 4871cb0ef41Sopenharmony_ci return CompilationResult(RegExpError::kTooLarge); 4881cb0ef41Sopenharmony_ci } 4891cb0ef41Sopenharmony_ci 4901cb0ef41Sopenharmony_ci bool Succeeded() const { return error == RegExpError::kNone; } 4911cb0ef41Sopenharmony_ci 4921cb0ef41Sopenharmony_ci const RegExpError error = RegExpError::kNone; 4931cb0ef41Sopenharmony_ci Handle<Object> code; 4941cb0ef41Sopenharmony_ci int num_registers = 0; 4951cb0ef41Sopenharmony_ci }; 4961cb0ef41Sopenharmony_ci 4971cb0ef41Sopenharmony_ci CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler, 4981cb0ef41Sopenharmony_ci RegExpNode* start, int capture_count, 4991cb0ef41Sopenharmony_ci Handle<String> pattern); 5001cb0ef41Sopenharmony_ci 5011cb0ef41Sopenharmony_ci // Preprocessing is the final step of node creation before analysis 5021cb0ef41Sopenharmony_ci // and assembly. It includes: 5031cb0ef41Sopenharmony_ci // - Wrapping the body of the regexp in capture 0. 5041cb0ef41Sopenharmony_ci // - Inserting the implicit .* before/after the regexp if necessary. 5051cb0ef41Sopenharmony_ci // - If the input is a one-byte string, filtering out nodes that can't match. 5061cb0ef41Sopenharmony_ci // - Fixing up regexp matches that start within a surrogate pair. 5071cb0ef41Sopenharmony_ci RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags, 5081cb0ef41Sopenharmony_ci bool is_one_byte); 5091cb0ef41Sopenharmony_ci 5101cb0ef41Sopenharmony_ci // If the regexp matching starts within a surrogate pair, step back to the 5111cb0ef41Sopenharmony_ci // lead surrogate and start matching from there. 5121cb0ef41Sopenharmony_ci RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success); 5131cb0ef41Sopenharmony_ci 5141cb0ef41Sopenharmony_ci inline void AddWork(RegExpNode* node) { 5151cb0ef41Sopenharmony_ci if (!node->on_work_list() && !node->label()->is_bound()) { 5161cb0ef41Sopenharmony_ci node->set_on_work_list(true); 5171cb0ef41Sopenharmony_ci work_list_->push_back(node); 5181cb0ef41Sopenharmony_ci } 5191cb0ef41Sopenharmony_ci } 5201cb0ef41Sopenharmony_ci 5211cb0ef41Sopenharmony_ci static const int kImplementationOffset = 0; 5221cb0ef41Sopenharmony_ci static const int kNumberOfRegistersOffset = 0; 5231cb0ef41Sopenharmony_ci static const int kCodeOffset = 1; 5241cb0ef41Sopenharmony_ci 5251cb0ef41Sopenharmony_ci RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 5261cb0ef41Sopenharmony_ci EndNode* accept() { return accept_; } 5271cb0ef41Sopenharmony_ci 5281cb0ef41Sopenharmony_ci static const int kMaxRecursion = 100; 5291cb0ef41Sopenharmony_ci inline int recursion_depth() { return recursion_depth_; } 5301cb0ef41Sopenharmony_ci inline void IncrementRecursionDepth() { recursion_depth_++; } 5311cb0ef41Sopenharmony_ci inline void DecrementRecursionDepth() { recursion_depth_--; } 5321cb0ef41Sopenharmony_ci 5331cb0ef41Sopenharmony_ci RegExpFlags flags() const { return flags_; } 5341cb0ef41Sopenharmony_ci 5351cb0ef41Sopenharmony_ci void SetRegExpTooBig() { reg_exp_too_big_ = true; } 5361cb0ef41Sopenharmony_ci 5371cb0ef41Sopenharmony_ci inline bool one_byte() { return one_byte_; } 5381cb0ef41Sopenharmony_ci inline bool optimize() { return optimize_; } 5391cb0ef41Sopenharmony_ci inline void set_optimize(bool value) { optimize_ = value; } 5401cb0ef41Sopenharmony_ci inline bool limiting_recursion() { return limiting_recursion_; } 5411cb0ef41Sopenharmony_ci inline void set_limiting_recursion(bool value) { 5421cb0ef41Sopenharmony_ci limiting_recursion_ = value; 5431cb0ef41Sopenharmony_ci } 5441cb0ef41Sopenharmony_ci bool read_backward() { return read_backward_; } 5451cb0ef41Sopenharmony_ci void set_read_backward(bool value) { read_backward_ = value; } 5461cb0ef41Sopenharmony_ci FrequencyCollator* frequency_collator() { return &frequency_collator_; } 5471cb0ef41Sopenharmony_ci 5481cb0ef41Sopenharmony_ci int current_expansion_factor() { return current_expansion_factor_; } 5491cb0ef41Sopenharmony_ci void set_current_expansion_factor(int value) { 5501cb0ef41Sopenharmony_ci current_expansion_factor_ = value; 5511cb0ef41Sopenharmony_ci } 5521cb0ef41Sopenharmony_ci 5531cb0ef41Sopenharmony_ci // The recursive nature of ToNode node generation means we may run into stack 5541cb0ef41Sopenharmony_ci // overflow issues. We introduce periodic checks to detect these, and the 5551cb0ef41Sopenharmony_ci // tick counter helps limit overhead of these checks. 5561cb0ef41Sopenharmony_ci // TODO(jgruber): This is super hacky and should be replaced by an abort 5571cb0ef41Sopenharmony_ci // mechanism or iterative node generation. 5581cb0ef41Sopenharmony_ci void ToNodeMaybeCheckForStackOverflow() { 5591cb0ef41Sopenharmony_ci if ((to_node_overflow_check_ticks_++ % 16 == 0)) { 5601cb0ef41Sopenharmony_ci ToNodeCheckForStackOverflow(); 5611cb0ef41Sopenharmony_ci } 5621cb0ef41Sopenharmony_ci } 5631cb0ef41Sopenharmony_ci void ToNodeCheckForStackOverflow(); 5641cb0ef41Sopenharmony_ci 5651cb0ef41Sopenharmony_ci Isolate* isolate() const { return isolate_; } 5661cb0ef41Sopenharmony_ci Zone* zone() const { return zone_; } 5671cb0ef41Sopenharmony_ci 5681cb0ef41Sopenharmony_ci static const int kNoRegister = -1; 5691cb0ef41Sopenharmony_ci 5701cb0ef41Sopenharmony_ci private: 5711cb0ef41Sopenharmony_ci EndNode* accept_; 5721cb0ef41Sopenharmony_ci int next_register_; 5731cb0ef41Sopenharmony_ci int unicode_lookaround_stack_register_; 5741cb0ef41Sopenharmony_ci int unicode_lookaround_position_register_; 5751cb0ef41Sopenharmony_ci ZoneVector<RegExpNode*>* work_list_; 5761cb0ef41Sopenharmony_ci int recursion_depth_; 5771cb0ef41Sopenharmony_ci const RegExpFlags flags_; 5781cb0ef41Sopenharmony_ci RegExpMacroAssembler* macro_assembler_; 5791cb0ef41Sopenharmony_ci bool one_byte_; 5801cb0ef41Sopenharmony_ci bool reg_exp_too_big_; 5811cb0ef41Sopenharmony_ci bool limiting_recursion_; 5821cb0ef41Sopenharmony_ci int to_node_overflow_check_ticks_ = 0; 5831cb0ef41Sopenharmony_ci bool optimize_; 5841cb0ef41Sopenharmony_ci bool read_backward_; 5851cb0ef41Sopenharmony_ci int current_expansion_factor_; 5861cb0ef41Sopenharmony_ci FrequencyCollator frequency_collator_; 5871cb0ef41Sopenharmony_ci Isolate* isolate_; 5881cb0ef41Sopenharmony_ci Zone* zone_; 5891cb0ef41Sopenharmony_ci}; 5901cb0ef41Sopenharmony_ci 5911cb0ef41Sopenharmony_ci// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. 5921cb0ef41Sopenharmony_ciclass UnicodeRangeSplitter { 5931cb0ef41Sopenharmony_ci public: 5941cb0ef41Sopenharmony_ci V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base); 5951cb0ef41Sopenharmony_ci 5961cb0ef41Sopenharmony_ci static constexpr int kInitialSize = 8; 5971cb0ef41Sopenharmony_ci using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>; 5981cb0ef41Sopenharmony_ci 5991cb0ef41Sopenharmony_ci const CharacterRangeVector* bmp() const { return &bmp_; } 6001cb0ef41Sopenharmony_ci const CharacterRangeVector* lead_surrogates() const { 6011cb0ef41Sopenharmony_ci return &lead_surrogates_; 6021cb0ef41Sopenharmony_ci } 6031cb0ef41Sopenharmony_ci const CharacterRangeVector* trail_surrogates() const { 6041cb0ef41Sopenharmony_ci return &trail_surrogates_; 6051cb0ef41Sopenharmony_ci } 6061cb0ef41Sopenharmony_ci const CharacterRangeVector* non_bmp() const { return &non_bmp_; } 6071cb0ef41Sopenharmony_ci 6081cb0ef41Sopenharmony_ci private: 6091cb0ef41Sopenharmony_ci void AddRange(CharacterRange range); 6101cb0ef41Sopenharmony_ci 6111cb0ef41Sopenharmony_ci CharacterRangeVector bmp_; 6121cb0ef41Sopenharmony_ci CharacterRangeVector lead_surrogates_; 6131cb0ef41Sopenharmony_ci CharacterRangeVector trail_surrogates_; 6141cb0ef41Sopenharmony_ci CharacterRangeVector non_bmp_; 6151cb0ef41Sopenharmony_ci}; 6161cb0ef41Sopenharmony_ci 6171cb0ef41Sopenharmony_ci// We need to check for the following characters: 0x39C 0x3BC 0x178. 6181cb0ef41Sopenharmony_ci// TODO(jgruber): Move to CharacterRange. 6191cb0ef41Sopenharmony_cibool RangeContainsLatin1Equivalents(CharacterRange range); 6201cb0ef41Sopenharmony_ci 6211cb0ef41Sopenharmony_ci} // namespace internal 6221cb0ef41Sopenharmony_ci} // namespace v8 6231cb0ef41Sopenharmony_ci 6241cb0ef41Sopenharmony_ci#endif // V8_REGEXP_REGEXP_COMPILER_H_ 625