11cb0ef41Sopenharmony_ci// Copyright 2020 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental-compiler.h" 61cb0ef41Sopenharmony_ci 71cb0ef41Sopenharmony_ci#include "src/base/strings.h" 81cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental.h" 91cb0ef41Sopenharmony_ci#include "src/zone/zone-list-inl.h" 101cb0ef41Sopenharmony_ci 111cb0ef41Sopenharmony_cinamespace v8 { 121cb0ef41Sopenharmony_cinamespace internal { 131cb0ef41Sopenharmony_ci 141cb0ef41Sopenharmony_cinamespace { 151cb0ef41Sopenharmony_ci 161cb0ef41Sopenharmony_ci// TODO(mbid, v8:10765): Currently the experimental engine doesn't support 171cb0ef41Sopenharmony_ci// UTF-16, but this shouldn't be too hard to implement. 181cb0ef41Sopenharmony_ciconstexpr base::uc32 kMaxSupportedCodepoint = 0xFFFFu; 191cb0ef41Sopenharmony_ci#ifdef DEBUG 201cb0ef41Sopenharmony_ciconstexpr base::uc32 kMaxCodePoint = 0x10ffff; 211cb0ef41Sopenharmony_ci#endif // DEBUG 221cb0ef41Sopenharmony_ci 231cb0ef41Sopenharmony_ciclass CanBeHandledVisitor final : private RegExpVisitor { 241cb0ef41Sopenharmony_ci // Visitor to implement `ExperimentalRegExp::CanBeHandled`. 251cb0ef41Sopenharmony_ci public: 261cb0ef41Sopenharmony_ci static bool Check(RegExpTree* tree, RegExpFlags flags, int capture_count) { 271cb0ef41Sopenharmony_ci if (!AreSuitableFlags(flags)) return false; 281cb0ef41Sopenharmony_ci CanBeHandledVisitor visitor; 291cb0ef41Sopenharmony_ci tree->Accept(&visitor, nullptr); 301cb0ef41Sopenharmony_ci return visitor.result_; 311cb0ef41Sopenharmony_ci } 321cb0ef41Sopenharmony_ci 331cb0ef41Sopenharmony_ci private: 341cb0ef41Sopenharmony_ci CanBeHandledVisitor() = default; 351cb0ef41Sopenharmony_ci 361cb0ef41Sopenharmony_ci static bool AreSuitableFlags(RegExpFlags flags) { 371cb0ef41Sopenharmony_ci // TODO(mbid, v8:10765): We should be able to support all flags in the 381cb0ef41Sopenharmony_ci // future. 391cb0ef41Sopenharmony_ci static constexpr RegExpFlags kAllowedFlags = 401cb0ef41Sopenharmony_ci RegExpFlag::kGlobal | RegExpFlag::kSticky | RegExpFlag::kMultiline | 411cb0ef41Sopenharmony_ci RegExpFlag::kDotAll | RegExpFlag::kLinear; 421cb0ef41Sopenharmony_ci // We support Unicode iff kUnicode is among the supported flags. 431cb0ef41Sopenharmony_ci STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode == 441cb0ef41Sopenharmony_ci IsUnicode(kAllowedFlags)); 451cb0ef41Sopenharmony_ci return (flags & ~kAllowedFlags) == 0; 461cb0ef41Sopenharmony_ci } 471cb0ef41Sopenharmony_ci 481cb0ef41Sopenharmony_ci void* VisitDisjunction(RegExpDisjunction* node, void*) override { 491cb0ef41Sopenharmony_ci for (RegExpTree* alt : *node->alternatives()) { 501cb0ef41Sopenharmony_ci alt->Accept(this, nullptr); 511cb0ef41Sopenharmony_ci if (!result_) { 521cb0ef41Sopenharmony_ci return nullptr; 531cb0ef41Sopenharmony_ci } 541cb0ef41Sopenharmony_ci } 551cb0ef41Sopenharmony_ci return nullptr; 561cb0ef41Sopenharmony_ci } 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_ci void* VisitAlternative(RegExpAlternative* node, void*) override { 591cb0ef41Sopenharmony_ci for (RegExpTree* child : *node->nodes()) { 601cb0ef41Sopenharmony_ci child->Accept(this, nullptr); 611cb0ef41Sopenharmony_ci if (!result_) { 621cb0ef41Sopenharmony_ci return nullptr; 631cb0ef41Sopenharmony_ci } 641cb0ef41Sopenharmony_ci } 651cb0ef41Sopenharmony_ci return nullptr; 661cb0ef41Sopenharmony_ci } 671cb0ef41Sopenharmony_ci 681cb0ef41Sopenharmony_ci void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { 691cb0ef41Sopenharmony_ci return nullptr; 701cb0ef41Sopenharmony_ci } 711cb0ef41Sopenharmony_ci 721cb0ef41Sopenharmony_ci void* VisitAssertion(RegExpAssertion* node, void*) override { 731cb0ef41Sopenharmony_ci return nullptr; 741cb0ef41Sopenharmony_ci } 751cb0ef41Sopenharmony_ci 761cb0ef41Sopenharmony_ci void* VisitAtom(RegExpAtom* node, void*) override { 771cb0ef41Sopenharmony_ci return nullptr; 781cb0ef41Sopenharmony_ci } 791cb0ef41Sopenharmony_ci 801cb0ef41Sopenharmony_ci void* VisitText(RegExpText* node, void*) override { 811cb0ef41Sopenharmony_ci for (TextElement& el : *node->elements()) { 821cb0ef41Sopenharmony_ci el.tree()->Accept(this, nullptr); 831cb0ef41Sopenharmony_ci if (!result_) { 841cb0ef41Sopenharmony_ci return nullptr; 851cb0ef41Sopenharmony_ci } 861cb0ef41Sopenharmony_ci } 871cb0ef41Sopenharmony_ci return nullptr; 881cb0ef41Sopenharmony_ci } 891cb0ef41Sopenharmony_ci 901cb0ef41Sopenharmony_ci void* VisitQuantifier(RegExpQuantifier* node, void*) override { 911cb0ef41Sopenharmony_ci // Finite but large values of `min()` and `max()` are bad for the 921cb0ef41Sopenharmony_ci // breadth-first engine because finite (optional) repetition is dealt with 931cb0ef41Sopenharmony_ci // by replicating the bytecode of the body of the quantifier. The number 941cb0ef41Sopenharmony_ci // of replicatons grows exponentially in how deeply quantifiers are nested. 951cb0ef41Sopenharmony_ci // `replication_factor_` keeps track of how often the current node will 961cb0ef41Sopenharmony_ci // have to be replicated in the generated bytecode, and we don't allow this 971cb0ef41Sopenharmony_ci // to exceed some small value. 981cb0ef41Sopenharmony_ci static constexpr int kMaxReplicationFactor = 16; 991cb0ef41Sopenharmony_ci 1001cb0ef41Sopenharmony_ci // First we rule out values for min and max that are too big even before 1011cb0ef41Sopenharmony_ci // taking into account the ambient replication_factor_. This also guards 1021cb0ef41Sopenharmony_ci // against overflows in `local_replication` or `replication_factor_`. 1031cb0ef41Sopenharmony_ci if (node->min() > kMaxReplicationFactor || 1041cb0ef41Sopenharmony_ci (node->max() != RegExpTree::kInfinity && 1051cb0ef41Sopenharmony_ci node->max() > kMaxReplicationFactor)) { 1061cb0ef41Sopenharmony_ci result_ = false; 1071cb0ef41Sopenharmony_ci return nullptr; 1081cb0ef41Sopenharmony_ci } 1091cb0ef41Sopenharmony_ci 1101cb0ef41Sopenharmony_ci // Save the current replication factor so that it can be restored if we 1111cb0ef41Sopenharmony_ci // return with `result_ == true`. 1121cb0ef41Sopenharmony_ci int before_replication_factor = replication_factor_; 1131cb0ef41Sopenharmony_ci 1141cb0ef41Sopenharmony_ci int local_replication; 1151cb0ef41Sopenharmony_ci if (node->max() == RegExpTree::kInfinity) { 1161cb0ef41Sopenharmony_ci local_replication = node->min() + 1; 1171cb0ef41Sopenharmony_ci } else { 1181cb0ef41Sopenharmony_ci local_replication = node->max(); 1191cb0ef41Sopenharmony_ci } 1201cb0ef41Sopenharmony_ci 1211cb0ef41Sopenharmony_ci replication_factor_ *= local_replication; 1221cb0ef41Sopenharmony_ci if (replication_factor_ > kMaxReplicationFactor) { 1231cb0ef41Sopenharmony_ci result_ = false; 1241cb0ef41Sopenharmony_ci return nullptr; 1251cb0ef41Sopenharmony_ci } 1261cb0ef41Sopenharmony_ci 1271cb0ef41Sopenharmony_ci switch (node->quantifier_type()) { 1281cb0ef41Sopenharmony_ci case RegExpQuantifier::GREEDY: 1291cb0ef41Sopenharmony_ci case RegExpQuantifier::NON_GREEDY: 1301cb0ef41Sopenharmony_ci break; 1311cb0ef41Sopenharmony_ci case RegExpQuantifier::POSSESSIVE: 1321cb0ef41Sopenharmony_ci // TODO(mbid, v8:10765): It's not clear to me whether this can be 1331cb0ef41Sopenharmony_ci // supported in breadth-first mode. Re2 doesn't support it. 1341cb0ef41Sopenharmony_ci result_ = false; 1351cb0ef41Sopenharmony_ci return nullptr; 1361cb0ef41Sopenharmony_ci } 1371cb0ef41Sopenharmony_ci 1381cb0ef41Sopenharmony_ci node->body()->Accept(this, nullptr); 1391cb0ef41Sopenharmony_ci replication_factor_ = before_replication_factor; 1401cb0ef41Sopenharmony_ci return nullptr; 1411cb0ef41Sopenharmony_ci } 1421cb0ef41Sopenharmony_ci 1431cb0ef41Sopenharmony_ci void* VisitCapture(RegExpCapture* node, void*) override { 1441cb0ef41Sopenharmony_ci node->body()->Accept(this, nullptr); 1451cb0ef41Sopenharmony_ci return nullptr; 1461cb0ef41Sopenharmony_ci } 1471cb0ef41Sopenharmony_ci 1481cb0ef41Sopenharmony_ci void* VisitGroup(RegExpGroup* node, void*) override { 1491cb0ef41Sopenharmony_ci node->body()->Accept(this, nullptr); 1501cb0ef41Sopenharmony_ci return nullptr; 1511cb0ef41Sopenharmony_ci } 1521cb0ef41Sopenharmony_ci 1531cb0ef41Sopenharmony_ci void* VisitLookaround(RegExpLookaround* node, void*) override { 1541cb0ef41Sopenharmony_ci // TODO(mbid, v8:10765): This will be hard to support, but not impossible I 1551cb0ef41Sopenharmony_ci // think. See product automata. 1561cb0ef41Sopenharmony_ci result_ = false; 1571cb0ef41Sopenharmony_ci return nullptr; 1581cb0ef41Sopenharmony_ci } 1591cb0ef41Sopenharmony_ci 1601cb0ef41Sopenharmony_ci void* VisitBackReference(RegExpBackReference* node, void*) override { 1611cb0ef41Sopenharmony_ci // This can't be implemented without backtracking. 1621cb0ef41Sopenharmony_ci result_ = false; 1631cb0ef41Sopenharmony_ci return nullptr; 1641cb0ef41Sopenharmony_ci } 1651cb0ef41Sopenharmony_ci 1661cb0ef41Sopenharmony_ci void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; } 1671cb0ef41Sopenharmony_ci 1681cb0ef41Sopenharmony_ci private: 1691cb0ef41Sopenharmony_ci // See comment in `VisitQuantifier`: 1701cb0ef41Sopenharmony_ci int replication_factor_ = 1; 1711cb0ef41Sopenharmony_ci 1721cb0ef41Sopenharmony_ci bool result_ = true; 1731cb0ef41Sopenharmony_ci}; 1741cb0ef41Sopenharmony_ci 1751cb0ef41Sopenharmony_ci} // namespace 1761cb0ef41Sopenharmony_ci 1771cb0ef41Sopenharmony_cibool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree, 1781cb0ef41Sopenharmony_ci RegExpFlags flags, 1791cb0ef41Sopenharmony_ci int capture_count) { 1801cb0ef41Sopenharmony_ci return CanBeHandledVisitor::Check(tree, flags, capture_count); 1811cb0ef41Sopenharmony_ci} 1821cb0ef41Sopenharmony_ci 1831cb0ef41Sopenharmony_cinamespace { 1841cb0ef41Sopenharmony_ci 1851cb0ef41Sopenharmony_ci// A label in bytecode which starts with no known address. The address *must* 1861cb0ef41Sopenharmony_ci// be bound with `Bind` before the label goes out of scope. 1871cb0ef41Sopenharmony_ci// Implemented as a linked list through the `payload.pc` of FORK and JMP 1881cb0ef41Sopenharmony_ci// instructions. 1891cb0ef41Sopenharmony_cistruct Label { 1901cb0ef41Sopenharmony_ci public: 1911cb0ef41Sopenharmony_ci Label() = default; 1921cb0ef41Sopenharmony_ci ~Label() { 1931cb0ef41Sopenharmony_ci DCHECK_EQ(state_, BOUND); 1941cb0ef41Sopenharmony_ci DCHECK_GE(bound_index_, 0); 1951cb0ef41Sopenharmony_ci } 1961cb0ef41Sopenharmony_ci 1971cb0ef41Sopenharmony_ci // Don't copy, don't move. Moving could be implemented, but it's not 1981cb0ef41Sopenharmony_ci // needed anywhere. 1991cb0ef41Sopenharmony_ci Label(const Label&) = delete; 2001cb0ef41Sopenharmony_ci Label& operator=(const Label&) = delete; 2011cb0ef41Sopenharmony_ci 2021cb0ef41Sopenharmony_ci private: 2031cb0ef41Sopenharmony_ci friend class BytecodeAssembler; 2041cb0ef41Sopenharmony_ci 2051cb0ef41Sopenharmony_ci // UNBOUND implies unbound_patch_list_begin_. 2061cb0ef41Sopenharmony_ci // BOUND implies bound_index_. 2071cb0ef41Sopenharmony_ci enum { UNBOUND, BOUND } state_ = UNBOUND; 2081cb0ef41Sopenharmony_ci union { 2091cb0ef41Sopenharmony_ci int unbound_patch_list_begin_ = -1; 2101cb0ef41Sopenharmony_ci int bound_index_; 2111cb0ef41Sopenharmony_ci }; 2121cb0ef41Sopenharmony_ci}; 2131cb0ef41Sopenharmony_ci 2141cb0ef41Sopenharmony_ciclass BytecodeAssembler { 2151cb0ef41Sopenharmony_ci public: 2161cb0ef41Sopenharmony_ci // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from 2171cb0ef41Sopenharmony_ci // the `tree` size we're going to compile? 2181cb0ef41Sopenharmony_ci explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {} 2191cb0ef41Sopenharmony_ci 2201cb0ef41Sopenharmony_ci ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); } 2211cb0ef41Sopenharmony_ci 2221cb0ef41Sopenharmony_ci void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); } 2231cb0ef41Sopenharmony_ci 2241cb0ef41Sopenharmony_ci void Assertion(RegExpAssertion::Type t) { 2251cb0ef41Sopenharmony_ci code_.Add(RegExpInstruction::Assertion(t), zone_); 2261cb0ef41Sopenharmony_ci } 2271cb0ef41Sopenharmony_ci 2281cb0ef41Sopenharmony_ci void ClearRegister(int32_t register_index) { 2291cb0ef41Sopenharmony_ci code_.Add(RegExpInstruction::ClearRegister(register_index), zone_); 2301cb0ef41Sopenharmony_ci } 2311cb0ef41Sopenharmony_ci 2321cb0ef41Sopenharmony_ci void ConsumeRange(base::uc16 from, base::uc16 to) { 2331cb0ef41Sopenharmony_ci code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_); 2341cb0ef41Sopenharmony_ci } 2351cb0ef41Sopenharmony_ci 2361cb0ef41Sopenharmony_ci void ConsumeAnyChar() { 2371cb0ef41Sopenharmony_ci code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_); 2381cb0ef41Sopenharmony_ci } 2391cb0ef41Sopenharmony_ci 2401cb0ef41Sopenharmony_ci void Fork(Label& target) { 2411cb0ef41Sopenharmony_ci LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target); 2421cb0ef41Sopenharmony_ci } 2431cb0ef41Sopenharmony_ci 2441cb0ef41Sopenharmony_ci void Jmp(Label& target) { 2451cb0ef41Sopenharmony_ci LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target); 2461cb0ef41Sopenharmony_ci } 2471cb0ef41Sopenharmony_ci 2481cb0ef41Sopenharmony_ci void SetRegisterToCp(int32_t register_index) { 2491cb0ef41Sopenharmony_ci code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_); 2501cb0ef41Sopenharmony_ci } 2511cb0ef41Sopenharmony_ci 2521cb0ef41Sopenharmony_ci void Bind(Label& target) { 2531cb0ef41Sopenharmony_ci DCHECK_EQ(target.state_, Label::UNBOUND); 2541cb0ef41Sopenharmony_ci 2551cb0ef41Sopenharmony_ci int index = code_.length(); 2561cb0ef41Sopenharmony_ci 2571cb0ef41Sopenharmony_ci while (target.unbound_patch_list_begin_ != -1) { 2581cb0ef41Sopenharmony_ci RegExpInstruction& inst = code_[target.unbound_patch_list_begin_]; 2591cb0ef41Sopenharmony_ci DCHECK(inst.opcode == RegExpInstruction::FORK || 2601cb0ef41Sopenharmony_ci inst.opcode == RegExpInstruction::JMP); 2611cb0ef41Sopenharmony_ci 2621cb0ef41Sopenharmony_ci target.unbound_patch_list_begin_ = inst.payload.pc; 2631cb0ef41Sopenharmony_ci inst.payload.pc = index; 2641cb0ef41Sopenharmony_ci } 2651cb0ef41Sopenharmony_ci 2661cb0ef41Sopenharmony_ci target.state_ = Label::BOUND; 2671cb0ef41Sopenharmony_ci target.bound_index_ = index; 2681cb0ef41Sopenharmony_ci } 2691cb0ef41Sopenharmony_ci 2701cb0ef41Sopenharmony_ci void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); } 2711cb0ef41Sopenharmony_ci 2721cb0ef41Sopenharmony_ci private: 2731cb0ef41Sopenharmony_ci void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) { 2741cb0ef41Sopenharmony_ci RegExpInstruction result; 2751cb0ef41Sopenharmony_ci result.opcode = op; 2761cb0ef41Sopenharmony_ci 2771cb0ef41Sopenharmony_ci if (target.state_ == Label::BOUND) { 2781cb0ef41Sopenharmony_ci result.payload.pc = target.bound_index_; 2791cb0ef41Sopenharmony_ci } else { 2801cb0ef41Sopenharmony_ci DCHECK_EQ(target.state_, Label::UNBOUND); 2811cb0ef41Sopenharmony_ci int new_list_begin = code_.length(); 2821cb0ef41Sopenharmony_ci DCHECK_GE(new_list_begin, 0); 2831cb0ef41Sopenharmony_ci 2841cb0ef41Sopenharmony_ci result.payload.pc = target.unbound_patch_list_begin_; 2851cb0ef41Sopenharmony_ci 2861cb0ef41Sopenharmony_ci target.unbound_patch_list_begin_ = new_list_begin; 2871cb0ef41Sopenharmony_ci } 2881cb0ef41Sopenharmony_ci 2891cb0ef41Sopenharmony_ci code_.Add(result, zone_); 2901cb0ef41Sopenharmony_ci } 2911cb0ef41Sopenharmony_ci 2921cb0ef41Sopenharmony_ci Zone* zone_; 2931cb0ef41Sopenharmony_ci ZoneList<RegExpInstruction> code_; 2941cb0ef41Sopenharmony_ci}; 2951cb0ef41Sopenharmony_ci 2961cb0ef41Sopenharmony_ciclass CompileVisitor : private RegExpVisitor { 2971cb0ef41Sopenharmony_ci public: 2981cb0ef41Sopenharmony_ci static ZoneList<RegExpInstruction> Compile(RegExpTree* tree, 2991cb0ef41Sopenharmony_ci RegExpFlags flags, Zone* zone) { 3001cb0ef41Sopenharmony_ci CompileVisitor compiler(zone); 3011cb0ef41Sopenharmony_ci 3021cb0ef41Sopenharmony_ci if (!IsSticky(flags) && !tree->IsAnchoredAtStart()) { 3031cb0ef41Sopenharmony_ci // The match is not anchored, i.e. may start at any input position, so we 3041cb0ef41Sopenharmony_ci // emit a preamble corresponding to /.*?/. This skips an arbitrary 3051cb0ef41Sopenharmony_ci // prefix in the input non-greedily. 3061cb0ef41Sopenharmony_ci compiler.CompileNonGreedyStar( 3071cb0ef41Sopenharmony_ci [&]() { compiler.assembler_.ConsumeAnyChar(); }); 3081cb0ef41Sopenharmony_ci } 3091cb0ef41Sopenharmony_ci 3101cb0ef41Sopenharmony_ci compiler.assembler_.SetRegisterToCp(0); 3111cb0ef41Sopenharmony_ci tree->Accept(&compiler, nullptr); 3121cb0ef41Sopenharmony_ci compiler.assembler_.SetRegisterToCp(1); 3131cb0ef41Sopenharmony_ci compiler.assembler_.Accept(); 3141cb0ef41Sopenharmony_ci 3151cb0ef41Sopenharmony_ci return std::move(compiler.assembler_).IntoCode(); 3161cb0ef41Sopenharmony_ci } 3171cb0ef41Sopenharmony_ci 3181cb0ef41Sopenharmony_ci private: 3191cb0ef41Sopenharmony_ci explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {} 3201cb0ef41Sopenharmony_ci 3211cb0ef41Sopenharmony_ci // Generate a disjunction of code fragments compiled by a function `alt_gen`. 3221cb0ef41Sopenharmony_ci // `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num - 3231cb0ef41Sopenharmony_ci // 1` and should build code corresponding to the ith alternative. 3241cb0ef41Sopenharmony_ci template <class F> 3251cb0ef41Sopenharmony_ci void CompileDisjunction(int alt_num, F&& gen_alt) { 3261cb0ef41Sopenharmony_ci // An alternative a1 | ... | an is compiled into 3271cb0ef41Sopenharmony_ci // 3281cb0ef41Sopenharmony_ci // FORK tail1 3291cb0ef41Sopenharmony_ci // <a1> 3301cb0ef41Sopenharmony_ci // JMP end 3311cb0ef41Sopenharmony_ci // tail1: 3321cb0ef41Sopenharmony_ci // FORK tail2 3331cb0ef41Sopenharmony_ci // <a2> 3341cb0ef41Sopenharmony_ci // JMP end 3351cb0ef41Sopenharmony_ci // tail2: 3361cb0ef41Sopenharmony_ci // ... 3371cb0ef41Sopenharmony_ci // ... 3381cb0ef41Sopenharmony_ci // tail{n -1}: 3391cb0ef41Sopenharmony_ci // <an> 3401cb0ef41Sopenharmony_ci // end: 3411cb0ef41Sopenharmony_ci // 3421cb0ef41Sopenharmony_ci // By the semantics of the FORK instruction (see above at definition and 3431cb0ef41Sopenharmony_ci // semantics), a forked thread has lower priority than the thread that 3441cb0ef41Sopenharmony_ci // spawned it. This means that with the code we're generating here, the 3451cb0ef41Sopenharmony_ci // thread matching the alternative a1 has indeed highest priority, followed 3461cb0ef41Sopenharmony_ci // by the thread for a2 and so on. 3471cb0ef41Sopenharmony_ci 3481cb0ef41Sopenharmony_ci if (alt_num == 0) { 3491cb0ef41Sopenharmony_ci // The empty disjunction. This can never match. 3501cb0ef41Sopenharmony_ci assembler_.Fail(); 3511cb0ef41Sopenharmony_ci return; 3521cb0ef41Sopenharmony_ci } 3531cb0ef41Sopenharmony_ci 3541cb0ef41Sopenharmony_ci Label end; 3551cb0ef41Sopenharmony_ci 3561cb0ef41Sopenharmony_ci for (int i = 0; i != alt_num - 1; ++i) { 3571cb0ef41Sopenharmony_ci Label tail; 3581cb0ef41Sopenharmony_ci assembler_.Fork(tail); 3591cb0ef41Sopenharmony_ci gen_alt(i); 3601cb0ef41Sopenharmony_ci assembler_.Jmp(end); 3611cb0ef41Sopenharmony_ci assembler_.Bind(tail); 3621cb0ef41Sopenharmony_ci } 3631cb0ef41Sopenharmony_ci 3641cb0ef41Sopenharmony_ci gen_alt(alt_num - 1); 3651cb0ef41Sopenharmony_ci 3661cb0ef41Sopenharmony_ci assembler_.Bind(end); 3671cb0ef41Sopenharmony_ci } 3681cb0ef41Sopenharmony_ci 3691cb0ef41Sopenharmony_ci void* VisitDisjunction(RegExpDisjunction* node, void*) override { 3701cb0ef41Sopenharmony_ci ZoneList<RegExpTree*>& alts = *node->alternatives(); 3711cb0ef41Sopenharmony_ci CompileDisjunction(alts.length(), 3721cb0ef41Sopenharmony_ci [&](int i) { alts[i]->Accept(this, nullptr); }); 3731cb0ef41Sopenharmony_ci return nullptr; 3741cb0ef41Sopenharmony_ci } 3751cb0ef41Sopenharmony_ci 3761cb0ef41Sopenharmony_ci void* VisitAlternative(RegExpAlternative* node, void*) override { 3771cb0ef41Sopenharmony_ci for (RegExpTree* child : *node->nodes()) { 3781cb0ef41Sopenharmony_ci child->Accept(this, nullptr); 3791cb0ef41Sopenharmony_ci } 3801cb0ef41Sopenharmony_ci return nullptr; 3811cb0ef41Sopenharmony_ci } 3821cb0ef41Sopenharmony_ci 3831cb0ef41Sopenharmony_ci void* VisitAssertion(RegExpAssertion* node, void*) override { 3841cb0ef41Sopenharmony_ci assembler_.Assertion(node->assertion_type()); 3851cb0ef41Sopenharmony_ci return nullptr; 3861cb0ef41Sopenharmony_ci } 3871cb0ef41Sopenharmony_ci 3881cb0ef41Sopenharmony_ci void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { 3891cb0ef41Sopenharmony_ci // A character class is compiled as Disjunction over its `CharacterRange`s. 3901cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* ranges = node->ranges(zone_); 3911cb0ef41Sopenharmony_ci CharacterRange::Canonicalize(ranges); 3921cb0ef41Sopenharmony_ci if (node->is_negated()) { 3931cb0ef41Sopenharmony_ci // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d) 3941cb0ef41Sopenharmony_ci // union of k intervals is a union of at most k + 1 intervals. 3951cb0ef41Sopenharmony_ci ZoneList<CharacterRange>* negated = 3961cb0ef41Sopenharmony_ci zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_); 3971cb0ef41Sopenharmony_ci CharacterRange::Negate(ranges, negated, zone_); 3981cb0ef41Sopenharmony_ci DCHECK_LE(negated->length(), ranges->length() + 1); 3991cb0ef41Sopenharmony_ci ranges = negated; 4001cb0ef41Sopenharmony_ci } 4011cb0ef41Sopenharmony_ci 4021cb0ef41Sopenharmony_ci CompileDisjunction(ranges->length(), [&](int i) { 4031cb0ef41Sopenharmony_ci // We don't support utf16 for now, so only ranges that can be specified 4041cb0ef41Sopenharmony_ci // by (complements of) ranges with base::uc16 bounds. 4051cb0ef41Sopenharmony_ci STATIC_ASSERT(kMaxSupportedCodepoint <= 4061cb0ef41Sopenharmony_ci std::numeric_limits<base::uc16>::max()); 4071cb0ef41Sopenharmony_ci 4081cb0ef41Sopenharmony_ci base::uc32 from = (*ranges)[i].from(); 4091cb0ef41Sopenharmony_ci DCHECK_LE(from, kMaxSupportedCodepoint); 4101cb0ef41Sopenharmony_ci base::uc16 from_uc16 = static_cast<base::uc16>(from); 4111cb0ef41Sopenharmony_ci 4121cb0ef41Sopenharmony_ci base::uc32 to = (*ranges)[i].to(); 4131cb0ef41Sopenharmony_ci DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == kMaxCodePoint); 4141cb0ef41Sopenharmony_ci base::uc16 to_uc16 = 4151cb0ef41Sopenharmony_ci static_cast<base::uc16>(std::min(to, kMaxSupportedCodepoint)); 4161cb0ef41Sopenharmony_ci 4171cb0ef41Sopenharmony_ci assembler_.ConsumeRange(from_uc16, to_uc16); 4181cb0ef41Sopenharmony_ci }); 4191cb0ef41Sopenharmony_ci return nullptr; 4201cb0ef41Sopenharmony_ci } 4211cb0ef41Sopenharmony_ci 4221cb0ef41Sopenharmony_ci void* VisitAtom(RegExpAtom* node, void*) override { 4231cb0ef41Sopenharmony_ci for (base::uc16 c : node->data()) { 4241cb0ef41Sopenharmony_ci assembler_.ConsumeRange(c, c); 4251cb0ef41Sopenharmony_ci } 4261cb0ef41Sopenharmony_ci return nullptr; 4271cb0ef41Sopenharmony_ci } 4281cb0ef41Sopenharmony_ci 4291cb0ef41Sopenharmony_ci void ClearRegisters(Interval indices) { 4301cb0ef41Sopenharmony_ci if (indices.is_empty()) return; 4311cb0ef41Sopenharmony_ci DCHECK_EQ(indices.from() % 2, 0); 4321cb0ef41Sopenharmony_ci DCHECK_EQ(indices.to() % 2, 1); 4331cb0ef41Sopenharmony_ci for (int i = indices.from(); i <= indices.to(); i += 2) { 4341cb0ef41Sopenharmony_ci // It suffices to clear the register containing the `begin` of a capture 4351cb0ef41Sopenharmony_ci // because this indicates that the capture is undefined, regardless of 4361cb0ef41Sopenharmony_ci // the value in the `end` register. 4371cb0ef41Sopenharmony_ci assembler_.ClearRegister(i); 4381cb0ef41Sopenharmony_ci } 4391cb0ef41Sopenharmony_ci } 4401cb0ef41Sopenharmony_ci 4411cb0ef41Sopenharmony_ci // Emit bytecode corresponding to /<emit_body>*/. 4421cb0ef41Sopenharmony_ci template <class F> 4431cb0ef41Sopenharmony_ci void CompileGreedyStar(F&& emit_body) { 4441cb0ef41Sopenharmony_ci // This is compiled into 4451cb0ef41Sopenharmony_ci // 4461cb0ef41Sopenharmony_ci // begin: 4471cb0ef41Sopenharmony_ci // FORK end 4481cb0ef41Sopenharmony_ci // <body> 4491cb0ef41Sopenharmony_ci // JMP begin 4501cb0ef41Sopenharmony_ci // end: 4511cb0ef41Sopenharmony_ci // ... 4521cb0ef41Sopenharmony_ci // 4531cb0ef41Sopenharmony_ci // This is greedy because a forked thread has lower priority than the 4541cb0ef41Sopenharmony_ci // thread that spawned it. 4551cb0ef41Sopenharmony_ci Label begin; 4561cb0ef41Sopenharmony_ci Label end; 4571cb0ef41Sopenharmony_ci 4581cb0ef41Sopenharmony_ci assembler_.Bind(begin); 4591cb0ef41Sopenharmony_ci assembler_.Fork(end); 4601cb0ef41Sopenharmony_ci emit_body(); 4611cb0ef41Sopenharmony_ci assembler_.Jmp(begin); 4621cb0ef41Sopenharmony_ci 4631cb0ef41Sopenharmony_ci assembler_.Bind(end); 4641cb0ef41Sopenharmony_ci } 4651cb0ef41Sopenharmony_ci 4661cb0ef41Sopenharmony_ci // Emit bytecode corresponding to /<emit_body>*?/. 4671cb0ef41Sopenharmony_ci template <class F> 4681cb0ef41Sopenharmony_ci void CompileNonGreedyStar(F&& emit_body) { 4691cb0ef41Sopenharmony_ci // This is compiled into 4701cb0ef41Sopenharmony_ci // 4711cb0ef41Sopenharmony_ci // FORK body 4721cb0ef41Sopenharmony_ci // JMP end 4731cb0ef41Sopenharmony_ci // body: 4741cb0ef41Sopenharmony_ci // <body> 4751cb0ef41Sopenharmony_ci // FORK body 4761cb0ef41Sopenharmony_ci // end: 4771cb0ef41Sopenharmony_ci // ... 4781cb0ef41Sopenharmony_ci 4791cb0ef41Sopenharmony_ci Label body; 4801cb0ef41Sopenharmony_ci Label end; 4811cb0ef41Sopenharmony_ci 4821cb0ef41Sopenharmony_ci assembler_.Fork(body); 4831cb0ef41Sopenharmony_ci assembler_.Jmp(end); 4841cb0ef41Sopenharmony_ci 4851cb0ef41Sopenharmony_ci assembler_.Bind(body); 4861cb0ef41Sopenharmony_ci emit_body(); 4871cb0ef41Sopenharmony_ci assembler_.Fork(body); 4881cb0ef41Sopenharmony_ci 4891cb0ef41Sopenharmony_ci assembler_.Bind(end); 4901cb0ef41Sopenharmony_ci } 4911cb0ef41Sopenharmony_ci 4921cb0ef41Sopenharmony_ci // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/. 4931cb0ef41Sopenharmony_ci template <class F> 4941cb0ef41Sopenharmony_ci void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) { 4951cb0ef41Sopenharmony_ci // This is compiled into 4961cb0ef41Sopenharmony_ci // 4971cb0ef41Sopenharmony_ci // FORK end 4981cb0ef41Sopenharmony_ci // <body> 4991cb0ef41Sopenharmony_ci // FORK end 5001cb0ef41Sopenharmony_ci // <body> 5011cb0ef41Sopenharmony_ci // ... 5021cb0ef41Sopenharmony_ci // ... 5031cb0ef41Sopenharmony_ci // FORK end 5041cb0ef41Sopenharmony_ci // <body> 5051cb0ef41Sopenharmony_ci // end: 5061cb0ef41Sopenharmony_ci // ... 5071cb0ef41Sopenharmony_ci 5081cb0ef41Sopenharmony_ci Label end; 5091cb0ef41Sopenharmony_ci for (int i = 0; i != max_repetition_num; ++i) { 5101cb0ef41Sopenharmony_ci assembler_.Fork(end); 5111cb0ef41Sopenharmony_ci emit_body(); 5121cb0ef41Sopenharmony_ci } 5131cb0ef41Sopenharmony_ci assembler_.Bind(end); 5141cb0ef41Sopenharmony_ci } 5151cb0ef41Sopenharmony_ci 5161cb0ef41Sopenharmony_ci // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/. 5171cb0ef41Sopenharmony_ci template <class F> 5181cb0ef41Sopenharmony_ci void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) { 5191cb0ef41Sopenharmony_ci // This is compiled into 5201cb0ef41Sopenharmony_ci // 5211cb0ef41Sopenharmony_ci // FORK body0 5221cb0ef41Sopenharmony_ci // JMP end 5231cb0ef41Sopenharmony_ci // body0: 5241cb0ef41Sopenharmony_ci // <body> 5251cb0ef41Sopenharmony_ci // FORK body1 5261cb0ef41Sopenharmony_ci // JMP end 5271cb0ef41Sopenharmony_ci // body1: 5281cb0ef41Sopenharmony_ci // <body> 5291cb0ef41Sopenharmony_ci // ... 5301cb0ef41Sopenharmony_ci // ... 5311cb0ef41Sopenharmony_ci // body{max_repetition_num - 1}: 5321cb0ef41Sopenharmony_ci // <body> 5331cb0ef41Sopenharmony_ci // end: 5341cb0ef41Sopenharmony_ci // ... 5351cb0ef41Sopenharmony_ci 5361cb0ef41Sopenharmony_ci Label end; 5371cb0ef41Sopenharmony_ci for (int i = 0; i != max_repetition_num; ++i) { 5381cb0ef41Sopenharmony_ci Label body; 5391cb0ef41Sopenharmony_ci assembler_.Fork(body); 5401cb0ef41Sopenharmony_ci assembler_.Jmp(end); 5411cb0ef41Sopenharmony_ci 5421cb0ef41Sopenharmony_ci assembler_.Bind(body); 5431cb0ef41Sopenharmony_ci emit_body(); 5441cb0ef41Sopenharmony_ci } 5451cb0ef41Sopenharmony_ci assembler_.Bind(end); 5461cb0ef41Sopenharmony_ci } 5471cb0ef41Sopenharmony_ci 5481cb0ef41Sopenharmony_ci void* VisitQuantifier(RegExpQuantifier* node, void*) override { 5491cb0ef41Sopenharmony_ci // Emit the body, but clear registers occuring in body first. 5501cb0ef41Sopenharmony_ci // 5511cb0ef41Sopenharmony_ci // TODO(mbid,v8:10765): It's not always necessary to a) capture registers 5521cb0ef41Sopenharmony_ci // and b) clear them. For example, we don't have to capture anything for 5531cb0ef41Sopenharmony_ci // the first 4 repetitions if node->min() >= 5, and then we don't have to 5541cb0ef41Sopenharmony_ci // clear registers in the first node->min() repetitions. 5551cb0ef41Sopenharmony_ci // Later, and if node->min() == 0, we don't have to clear registers before 5561cb0ef41Sopenharmony_ci // the first optional repetition. 5571cb0ef41Sopenharmony_ci Interval body_registers = node->body()->CaptureRegisters(); 5581cb0ef41Sopenharmony_ci auto emit_body = [&]() { 5591cb0ef41Sopenharmony_ci ClearRegisters(body_registers); 5601cb0ef41Sopenharmony_ci node->body()->Accept(this, nullptr); 5611cb0ef41Sopenharmony_ci }; 5621cb0ef41Sopenharmony_ci 5631cb0ef41Sopenharmony_ci // First repeat the body `min()` times. 5641cb0ef41Sopenharmony_ci for (int i = 0; i != node->min(); ++i) emit_body(); 5651cb0ef41Sopenharmony_ci 5661cb0ef41Sopenharmony_ci switch (node->quantifier_type()) { 5671cb0ef41Sopenharmony_ci case RegExpQuantifier::POSSESSIVE: 5681cb0ef41Sopenharmony_ci UNREACHABLE(); 5691cb0ef41Sopenharmony_ci case RegExpQuantifier::GREEDY: { 5701cb0ef41Sopenharmony_ci if (node->max() == RegExpTree::kInfinity) { 5711cb0ef41Sopenharmony_ci CompileGreedyStar(emit_body); 5721cb0ef41Sopenharmony_ci } else { 5731cb0ef41Sopenharmony_ci DCHECK_NE(node->max(), RegExpTree::kInfinity); 5741cb0ef41Sopenharmony_ci CompileGreedyRepetition(emit_body, node->max() - node->min()); 5751cb0ef41Sopenharmony_ci } 5761cb0ef41Sopenharmony_ci break; 5771cb0ef41Sopenharmony_ci } 5781cb0ef41Sopenharmony_ci case RegExpQuantifier::NON_GREEDY: { 5791cb0ef41Sopenharmony_ci if (node->max() == RegExpTree::kInfinity) { 5801cb0ef41Sopenharmony_ci CompileNonGreedyStar(emit_body); 5811cb0ef41Sopenharmony_ci } else { 5821cb0ef41Sopenharmony_ci DCHECK_NE(node->max(), RegExpTree::kInfinity); 5831cb0ef41Sopenharmony_ci CompileNonGreedyRepetition(emit_body, node->max() - node->min()); 5841cb0ef41Sopenharmony_ci } 5851cb0ef41Sopenharmony_ci } 5861cb0ef41Sopenharmony_ci } 5871cb0ef41Sopenharmony_ci return nullptr; 5881cb0ef41Sopenharmony_ci } 5891cb0ef41Sopenharmony_ci 5901cb0ef41Sopenharmony_ci void* VisitCapture(RegExpCapture* node, void*) override { 5911cb0ef41Sopenharmony_ci int index = node->index(); 5921cb0ef41Sopenharmony_ci int start_register = RegExpCapture::StartRegister(index); 5931cb0ef41Sopenharmony_ci int end_register = RegExpCapture::EndRegister(index); 5941cb0ef41Sopenharmony_ci assembler_.SetRegisterToCp(start_register); 5951cb0ef41Sopenharmony_ci node->body()->Accept(this, nullptr); 5961cb0ef41Sopenharmony_ci assembler_.SetRegisterToCp(end_register); 5971cb0ef41Sopenharmony_ci return nullptr; 5981cb0ef41Sopenharmony_ci } 5991cb0ef41Sopenharmony_ci 6001cb0ef41Sopenharmony_ci void* VisitGroup(RegExpGroup* node, void*) override { 6011cb0ef41Sopenharmony_ci node->body()->Accept(this, nullptr); 6021cb0ef41Sopenharmony_ci return nullptr; 6031cb0ef41Sopenharmony_ci } 6041cb0ef41Sopenharmony_ci 6051cb0ef41Sopenharmony_ci void* VisitLookaround(RegExpLookaround* node, void*) override { 6061cb0ef41Sopenharmony_ci // TODO(mbid,v8:10765): Support this case. 6071cb0ef41Sopenharmony_ci UNREACHABLE(); 6081cb0ef41Sopenharmony_ci } 6091cb0ef41Sopenharmony_ci 6101cb0ef41Sopenharmony_ci void* VisitBackReference(RegExpBackReference* node, void*) override { 6111cb0ef41Sopenharmony_ci UNREACHABLE(); 6121cb0ef41Sopenharmony_ci } 6131cb0ef41Sopenharmony_ci 6141cb0ef41Sopenharmony_ci void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; } 6151cb0ef41Sopenharmony_ci 6161cb0ef41Sopenharmony_ci void* VisitText(RegExpText* node, void*) override { 6171cb0ef41Sopenharmony_ci for (TextElement& text_el : *node->elements()) { 6181cb0ef41Sopenharmony_ci text_el.tree()->Accept(this, nullptr); 6191cb0ef41Sopenharmony_ci } 6201cb0ef41Sopenharmony_ci return nullptr; 6211cb0ef41Sopenharmony_ci } 6221cb0ef41Sopenharmony_ci 6231cb0ef41Sopenharmony_ci private: 6241cb0ef41Sopenharmony_ci Zone* zone_; 6251cb0ef41Sopenharmony_ci BytecodeAssembler assembler_; 6261cb0ef41Sopenharmony_ci}; 6271cb0ef41Sopenharmony_ci 6281cb0ef41Sopenharmony_ci} // namespace 6291cb0ef41Sopenharmony_ci 6301cb0ef41Sopenharmony_ciZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile( 6311cb0ef41Sopenharmony_ci RegExpTree* tree, RegExpFlags flags, Zone* zone) { 6321cb0ef41Sopenharmony_ci return CompileVisitor::Compile(tree, flags, zone); 6331cb0ef41Sopenharmony_ci} 6341cb0ef41Sopenharmony_ci 6351cb0ef41Sopenharmony_ci} // namespace internal 6361cb0ef41Sopenharmony_ci} // namespace v8 637