11cb0ef41Sopenharmony_ci// Copyright 2020 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
51cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental-compiler.h"
61cb0ef41Sopenharmony_ci
71cb0ef41Sopenharmony_ci#include "src/base/strings.h"
81cb0ef41Sopenharmony_ci#include "src/regexp/experimental/experimental.h"
91cb0ef41Sopenharmony_ci#include "src/zone/zone-list-inl.h"
101cb0ef41Sopenharmony_ci
111cb0ef41Sopenharmony_cinamespace v8 {
121cb0ef41Sopenharmony_cinamespace internal {
131cb0ef41Sopenharmony_ci
141cb0ef41Sopenharmony_cinamespace {
151cb0ef41Sopenharmony_ci
161cb0ef41Sopenharmony_ci// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
171cb0ef41Sopenharmony_ci// UTF-16, but this shouldn't be too hard to implement.
181cb0ef41Sopenharmony_ciconstexpr base::uc32 kMaxSupportedCodepoint = 0xFFFFu;
191cb0ef41Sopenharmony_ci#ifdef DEBUG
201cb0ef41Sopenharmony_ciconstexpr base::uc32 kMaxCodePoint = 0x10ffff;
211cb0ef41Sopenharmony_ci#endif  // DEBUG
221cb0ef41Sopenharmony_ci
231cb0ef41Sopenharmony_ciclass CanBeHandledVisitor final : private RegExpVisitor {
241cb0ef41Sopenharmony_ci  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
251cb0ef41Sopenharmony_ci public:
261cb0ef41Sopenharmony_ci  static bool Check(RegExpTree* tree, RegExpFlags flags, int capture_count) {
271cb0ef41Sopenharmony_ci    if (!AreSuitableFlags(flags)) return false;
281cb0ef41Sopenharmony_ci    CanBeHandledVisitor visitor;
291cb0ef41Sopenharmony_ci    tree->Accept(&visitor, nullptr);
301cb0ef41Sopenharmony_ci    return visitor.result_;
311cb0ef41Sopenharmony_ci  }
321cb0ef41Sopenharmony_ci
331cb0ef41Sopenharmony_ci private:
341cb0ef41Sopenharmony_ci  CanBeHandledVisitor() = default;
351cb0ef41Sopenharmony_ci
361cb0ef41Sopenharmony_ci  static bool AreSuitableFlags(RegExpFlags flags) {
371cb0ef41Sopenharmony_ci    // TODO(mbid, v8:10765): We should be able to support all flags in the
381cb0ef41Sopenharmony_ci    // future.
391cb0ef41Sopenharmony_ci    static constexpr RegExpFlags kAllowedFlags =
401cb0ef41Sopenharmony_ci        RegExpFlag::kGlobal | RegExpFlag::kSticky | RegExpFlag::kMultiline |
411cb0ef41Sopenharmony_ci        RegExpFlag::kDotAll | RegExpFlag::kLinear;
421cb0ef41Sopenharmony_ci    // We support Unicode iff kUnicode is among the supported flags.
431cb0ef41Sopenharmony_ci    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
441cb0ef41Sopenharmony_ci                  IsUnicode(kAllowedFlags));
451cb0ef41Sopenharmony_ci    return (flags & ~kAllowedFlags) == 0;
461cb0ef41Sopenharmony_ci  }
471cb0ef41Sopenharmony_ci
481cb0ef41Sopenharmony_ci  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
491cb0ef41Sopenharmony_ci    for (RegExpTree* alt : *node->alternatives()) {
501cb0ef41Sopenharmony_ci      alt->Accept(this, nullptr);
511cb0ef41Sopenharmony_ci      if (!result_) {
521cb0ef41Sopenharmony_ci        return nullptr;
531cb0ef41Sopenharmony_ci      }
541cb0ef41Sopenharmony_ci    }
551cb0ef41Sopenharmony_ci    return nullptr;
561cb0ef41Sopenharmony_ci  }
571cb0ef41Sopenharmony_ci
581cb0ef41Sopenharmony_ci  void* VisitAlternative(RegExpAlternative* node, void*) override {
591cb0ef41Sopenharmony_ci    for (RegExpTree* child : *node->nodes()) {
601cb0ef41Sopenharmony_ci      child->Accept(this, nullptr);
611cb0ef41Sopenharmony_ci      if (!result_) {
621cb0ef41Sopenharmony_ci        return nullptr;
631cb0ef41Sopenharmony_ci      }
641cb0ef41Sopenharmony_ci    }
651cb0ef41Sopenharmony_ci    return nullptr;
661cb0ef41Sopenharmony_ci  }
671cb0ef41Sopenharmony_ci
681cb0ef41Sopenharmony_ci  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
691cb0ef41Sopenharmony_ci    return nullptr;
701cb0ef41Sopenharmony_ci  }
711cb0ef41Sopenharmony_ci
721cb0ef41Sopenharmony_ci  void* VisitAssertion(RegExpAssertion* node, void*) override {
731cb0ef41Sopenharmony_ci    return nullptr;
741cb0ef41Sopenharmony_ci  }
751cb0ef41Sopenharmony_ci
761cb0ef41Sopenharmony_ci  void* VisitAtom(RegExpAtom* node, void*) override {
771cb0ef41Sopenharmony_ci    return nullptr;
781cb0ef41Sopenharmony_ci  }
791cb0ef41Sopenharmony_ci
801cb0ef41Sopenharmony_ci  void* VisitText(RegExpText* node, void*) override {
811cb0ef41Sopenharmony_ci    for (TextElement& el : *node->elements()) {
821cb0ef41Sopenharmony_ci      el.tree()->Accept(this, nullptr);
831cb0ef41Sopenharmony_ci      if (!result_) {
841cb0ef41Sopenharmony_ci        return nullptr;
851cb0ef41Sopenharmony_ci      }
861cb0ef41Sopenharmony_ci    }
871cb0ef41Sopenharmony_ci    return nullptr;
881cb0ef41Sopenharmony_ci  }
891cb0ef41Sopenharmony_ci
901cb0ef41Sopenharmony_ci  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
911cb0ef41Sopenharmony_ci    // Finite but large values of `min()` and `max()` are bad for the
921cb0ef41Sopenharmony_ci    // breadth-first engine because finite (optional) repetition is dealt with
931cb0ef41Sopenharmony_ci    // by replicating the bytecode of the body of the quantifier.  The number
941cb0ef41Sopenharmony_ci    // of replicatons grows exponentially in how deeply quantifiers are nested.
951cb0ef41Sopenharmony_ci    // `replication_factor_` keeps track of how often the current node will
961cb0ef41Sopenharmony_ci    // have to be replicated in the generated bytecode, and we don't allow this
971cb0ef41Sopenharmony_ci    // to exceed some small value.
981cb0ef41Sopenharmony_ci    static constexpr int kMaxReplicationFactor = 16;
991cb0ef41Sopenharmony_ci
1001cb0ef41Sopenharmony_ci    // First we rule out values for min and max that are too big even before
1011cb0ef41Sopenharmony_ci    // taking into account the ambient replication_factor_.  This also guards
1021cb0ef41Sopenharmony_ci    // against overflows in `local_replication` or `replication_factor_`.
1031cb0ef41Sopenharmony_ci    if (node->min() > kMaxReplicationFactor ||
1041cb0ef41Sopenharmony_ci        (node->max() != RegExpTree::kInfinity &&
1051cb0ef41Sopenharmony_ci         node->max() > kMaxReplicationFactor)) {
1061cb0ef41Sopenharmony_ci      result_ = false;
1071cb0ef41Sopenharmony_ci      return nullptr;
1081cb0ef41Sopenharmony_ci    }
1091cb0ef41Sopenharmony_ci
1101cb0ef41Sopenharmony_ci    // Save the current replication factor so that it can be restored if we
1111cb0ef41Sopenharmony_ci    // return with `result_ == true`.
1121cb0ef41Sopenharmony_ci    int before_replication_factor = replication_factor_;
1131cb0ef41Sopenharmony_ci
1141cb0ef41Sopenharmony_ci    int local_replication;
1151cb0ef41Sopenharmony_ci    if (node->max() == RegExpTree::kInfinity) {
1161cb0ef41Sopenharmony_ci      local_replication = node->min() + 1;
1171cb0ef41Sopenharmony_ci    } else {
1181cb0ef41Sopenharmony_ci      local_replication = node->max();
1191cb0ef41Sopenharmony_ci    }
1201cb0ef41Sopenharmony_ci
1211cb0ef41Sopenharmony_ci    replication_factor_ *= local_replication;
1221cb0ef41Sopenharmony_ci    if (replication_factor_ > kMaxReplicationFactor) {
1231cb0ef41Sopenharmony_ci      result_ = false;
1241cb0ef41Sopenharmony_ci      return nullptr;
1251cb0ef41Sopenharmony_ci    }
1261cb0ef41Sopenharmony_ci
1271cb0ef41Sopenharmony_ci    switch (node->quantifier_type()) {
1281cb0ef41Sopenharmony_ci      case RegExpQuantifier::GREEDY:
1291cb0ef41Sopenharmony_ci      case RegExpQuantifier::NON_GREEDY:
1301cb0ef41Sopenharmony_ci        break;
1311cb0ef41Sopenharmony_ci      case RegExpQuantifier::POSSESSIVE:
1321cb0ef41Sopenharmony_ci        // TODO(mbid, v8:10765): It's not clear to me whether this can be
1331cb0ef41Sopenharmony_ci        // supported in breadth-first mode. Re2 doesn't support it.
1341cb0ef41Sopenharmony_ci        result_ = false;
1351cb0ef41Sopenharmony_ci        return nullptr;
1361cb0ef41Sopenharmony_ci    }
1371cb0ef41Sopenharmony_ci
1381cb0ef41Sopenharmony_ci    node->body()->Accept(this, nullptr);
1391cb0ef41Sopenharmony_ci    replication_factor_ = before_replication_factor;
1401cb0ef41Sopenharmony_ci    return nullptr;
1411cb0ef41Sopenharmony_ci  }
1421cb0ef41Sopenharmony_ci
1431cb0ef41Sopenharmony_ci  void* VisitCapture(RegExpCapture* node, void*) override {
1441cb0ef41Sopenharmony_ci    node->body()->Accept(this, nullptr);
1451cb0ef41Sopenharmony_ci    return nullptr;
1461cb0ef41Sopenharmony_ci  }
1471cb0ef41Sopenharmony_ci
1481cb0ef41Sopenharmony_ci  void* VisitGroup(RegExpGroup* node, void*) override {
1491cb0ef41Sopenharmony_ci    node->body()->Accept(this, nullptr);
1501cb0ef41Sopenharmony_ci    return nullptr;
1511cb0ef41Sopenharmony_ci  }
1521cb0ef41Sopenharmony_ci
1531cb0ef41Sopenharmony_ci  void* VisitLookaround(RegExpLookaround* node, void*) override {
1541cb0ef41Sopenharmony_ci    // TODO(mbid, v8:10765): This will be hard to support, but not impossible I
1551cb0ef41Sopenharmony_ci    // think.  See product automata.
1561cb0ef41Sopenharmony_ci    result_ = false;
1571cb0ef41Sopenharmony_ci    return nullptr;
1581cb0ef41Sopenharmony_ci  }
1591cb0ef41Sopenharmony_ci
1601cb0ef41Sopenharmony_ci  void* VisitBackReference(RegExpBackReference* node, void*) override {
1611cb0ef41Sopenharmony_ci    // This can't be implemented without backtracking.
1621cb0ef41Sopenharmony_ci    result_ = false;
1631cb0ef41Sopenharmony_ci    return nullptr;
1641cb0ef41Sopenharmony_ci  }
1651cb0ef41Sopenharmony_ci
1661cb0ef41Sopenharmony_ci  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
1671cb0ef41Sopenharmony_ci
1681cb0ef41Sopenharmony_ci private:
1691cb0ef41Sopenharmony_ci  // See comment in `VisitQuantifier`:
1701cb0ef41Sopenharmony_ci  int replication_factor_ = 1;
1711cb0ef41Sopenharmony_ci
1721cb0ef41Sopenharmony_ci  bool result_ = true;
1731cb0ef41Sopenharmony_ci};
1741cb0ef41Sopenharmony_ci
1751cb0ef41Sopenharmony_ci}  // namespace
1761cb0ef41Sopenharmony_ci
1771cb0ef41Sopenharmony_cibool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
1781cb0ef41Sopenharmony_ci                                              RegExpFlags flags,
1791cb0ef41Sopenharmony_ci                                              int capture_count) {
1801cb0ef41Sopenharmony_ci  return CanBeHandledVisitor::Check(tree, flags, capture_count);
1811cb0ef41Sopenharmony_ci}
1821cb0ef41Sopenharmony_ci
1831cb0ef41Sopenharmony_cinamespace {
1841cb0ef41Sopenharmony_ci
1851cb0ef41Sopenharmony_ci// A label in bytecode which starts with no known address. The address *must*
1861cb0ef41Sopenharmony_ci// be bound with `Bind` before the label goes out of scope.
1871cb0ef41Sopenharmony_ci// Implemented as a linked list through the `payload.pc` of FORK and JMP
1881cb0ef41Sopenharmony_ci// instructions.
1891cb0ef41Sopenharmony_cistruct Label {
1901cb0ef41Sopenharmony_ci public:
1911cb0ef41Sopenharmony_ci  Label() = default;
1921cb0ef41Sopenharmony_ci  ~Label() {
1931cb0ef41Sopenharmony_ci    DCHECK_EQ(state_, BOUND);
1941cb0ef41Sopenharmony_ci    DCHECK_GE(bound_index_, 0);
1951cb0ef41Sopenharmony_ci  }
1961cb0ef41Sopenharmony_ci
1971cb0ef41Sopenharmony_ci  // Don't copy, don't move.  Moving could be implemented, but it's not
1981cb0ef41Sopenharmony_ci  // needed anywhere.
1991cb0ef41Sopenharmony_ci  Label(const Label&) = delete;
2001cb0ef41Sopenharmony_ci  Label& operator=(const Label&) = delete;
2011cb0ef41Sopenharmony_ci
2021cb0ef41Sopenharmony_ci private:
2031cb0ef41Sopenharmony_ci  friend class BytecodeAssembler;
2041cb0ef41Sopenharmony_ci
2051cb0ef41Sopenharmony_ci  // UNBOUND implies unbound_patch_list_begin_.
2061cb0ef41Sopenharmony_ci  // BOUND implies bound_index_.
2071cb0ef41Sopenharmony_ci  enum { UNBOUND, BOUND } state_ = UNBOUND;
2081cb0ef41Sopenharmony_ci  union {
2091cb0ef41Sopenharmony_ci    int unbound_patch_list_begin_ = -1;
2101cb0ef41Sopenharmony_ci    int bound_index_;
2111cb0ef41Sopenharmony_ci  };
2121cb0ef41Sopenharmony_ci};
2131cb0ef41Sopenharmony_ci
2141cb0ef41Sopenharmony_ciclass BytecodeAssembler {
2151cb0ef41Sopenharmony_ci public:
2161cb0ef41Sopenharmony_ci  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
2171cb0ef41Sopenharmony_ci  // the `tree` size we're going to compile?
2181cb0ef41Sopenharmony_ci  explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}
2191cb0ef41Sopenharmony_ci
2201cb0ef41Sopenharmony_ci  ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }
2211cb0ef41Sopenharmony_ci
2221cb0ef41Sopenharmony_ci  void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }
2231cb0ef41Sopenharmony_ci
2241cb0ef41Sopenharmony_ci  void Assertion(RegExpAssertion::Type t) {
2251cb0ef41Sopenharmony_ci    code_.Add(RegExpInstruction::Assertion(t), zone_);
2261cb0ef41Sopenharmony_ci  }
2271cb0ef41Sopenharmony_ci
2281cb0ef41Sopenharmony_ci  void ClearRegister(int32_t register_index) {
2291cb0ef41Sopenharmony_ci    code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
2301cb0ef41Sopenharmony_ci  }
2311cb0ef41Sopenharmony_ci
2321cb0ef41Sopenharmony_ci  void ConsumeRange(base::uc16 from, base::uc16 to) {
2331cb0ef41Sopenharmony_ci    code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
2341cb0ef41Sopenharmony_ci  }
2351cb0ef41Sopenharmony_ci
2361cb0ef41Sopenharmony_ci  void ConsumeAnyChar() {
2371cb0ef41Sopenharmony_ci    code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
2381cb0ef41Sopenharmony_ci  }
2391cb0ef41Sopenharmony_ci
2401cb0ef41Sopenharmony_ci  void Fork(Label& target) {
2411cb0ef41Sopenharmony_ci    LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
2421cb0ef41Sopenharmony_ci  }
2431cb0ef41Sopenharmony_ci
2441cb0ef41Sopenharmony_ci  void Jmp(Label& target) {
2451cb0ef41Sopenharmony_ci    LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
2461cb0ef41Sopenharmony_ci  }
2471cb0ef41Sopenharmony_ci
2481cb0ef41Sopenharmony_ci  void SetRegisterToCp(int32_t register_index) {
2491cb0ef41Sopenharmony_ci    code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
2501cb0ef41Sopenharmony_ci  }
2511cb0ef41Sopenharmony_ci
2521cb0ef41Sopenharmony_ci  void Bind(Label& target) {
2531cb0ef41Sopenharmony_ci    DCHECK_EQ(target.state_, Label::UNBOUND);
2541cb0ef41Sopenharmony_ci
2551cb0ef41Sopenharmony_ci    int index = code_.length();
2561cb0ef41Sopenharmony_ci
2571cb0ef41Sopenharmony_ci    while (target.unbound_patch_list_begin_ != -1) {
2581cb0ef41Sopenharmony_ci      RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
2591cb0ef41Sopenharmony_ci      DCHECK(inst.opcode == RegExpInstruction::FORK ||
2601cb0ef41Sopenharmony_ci             inst.opcode == RegExpInstruction::JMP);
2611cb0ef41Sopenharmony_ci
2621cb0ef41Sopenharmony_ci      target.unbound_patch_list_begin_ = inst.payload.pc;
2631cb0ef41Sopenharmony_ci      inst.payload.pc = index;
2641cb0ef41Sopenharmony_ci    }
2651cb0ef41Sopenharmony_ci
2661cb0ef41Sopenharmony_ci    target.state_ = Label::BOUND;
2671cb0ef41Sopenharmony_ci    target.bound_index_ = index;
2681cb0ef41Sopenharmony_ci  }
2691cb0ef41Sopenharmony_ci
2701cb0ef41Sopenharmony_ci  void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }
2711cb0ef41Sopenharmony_ci
2721cb0ef41Sopenharmony_ci private:
2731cb0ef41Sopenharmony_ci  void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
2741cb0ef41Sopenharmony_ci    RegExpInstruction result;
2751cb0ef41Sopenharmony_ci    result.opcode = op;
2761cb0ef41Sopenharmony_ci
2771cb0ef41Sopenharmony_ci    if (target.state_ == Label::BOUND) {
2781cb0ef41Sopenharmony_ci      result.payload.pc = target.bound_index_;
2791cb0ef41Sopenharmony_ci    } else {
2801cb0ef41Sopenharmony_ci      DCHECK_EQ(target.state_, Label::UNBOUND);
2811cb0ef41Sopenharmony_ci      int new_list_begin = code_.length();
2821cb0ef41Sopenharmony_ci      DCHECK_GE(new_list_begin, 0);
2831cb0ef41Sopenharmony_ci
2841cb0ef41Sopenharmony_ci      result.payload.pc = target.unbound_patch_list_begin_;
2851cb0ef41Sopenharmony_ci
2861cb0ef41Sopenharmony_ci      target.unbound_patch_list_begin_ = new_list_begin;
2871cb0ef41Sopenharmony_ci    }
2881cb0ef41Sopenharmony_ci
2891cb0ef41Sopenharmony_ci    code_.Add(result, zone_);
2901cb0ef41Sopenharmony_ci  }
2911cb0ef41Sopenharmony_ci
2921cb0ef41Sopenharmony_ci  Zone* zone_;
2931cb0ef41Sopenharmony_ci  ZoneList<RegExpInstruction> code_;
2941cb0ef41Sopenharmony_ci};
2951cb0ef41Sopenharmony_ci
2961cb0ef41Sopenharmony_ciclass CompileVisitor : private RegExpVisitor {
2971cb0ef41Sopenharmony_ci public:
2981cb0ef41Sopenharmony_ci  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
2991cb0ef41Sopenharmony_ci                                             RegExpFlags flags, Zone* zone) {
3001cb0ef41Sopenharmony_ci    CompileVisitor compiler(zone);
3011cb0ef41Sopenharmony_ci
3021cb0ef41Sopenharmony_ci    if (!IsSticky(flags) && !tree->IsAnchoredAtStart()) {
3031cb0ef41Sopenharmony_ci      // The match is not anchored, i.e. may start at any input position, so we
3041cb0ef41Sopenharmony_ci      // emit a preamble corresponding to /.*?/.  This skips an arbitrary
3051cb0ef41Sopenharmony_ci      // prefix in the input non-greedily.
3061cb0ef41Sopenharmony_ci      compiler.CompileNonGreedyStar(
3071cb0ef41Sopenharmony_ci          [&]() { compiler.assembler_.ConsumeAnyChar(); });
3081cb0ef41Sopenharmony_ci    }
3091cb0ef41Sopenharmony_ci
3101cb0ef41Sopenharmony_ci    compiler.assembler_.SetRegisterToCp(0);
3111cb0ef41Sopenharmony_ci    tree->Accept(&compiler, nullptr);
3121cb0ef41Sopenharmony_ci    compiler.assembler_.SetRegisterToCp(1);
3131cb0ef41Sopenharmony_ci    compiler.assembler_.Accept();
3141cb0ef41Sopenharmony_ci
3151cb0ef41Sopenharmony_ci    return std::move(compiler.assembler_).IntoCode();
3161cb0ef41Sopenharmony_ci  }
3171cb0ef41Sopenharmony_ci
3181cb0ef41Sopenharmony_ci private:
3191cb0ef41Sopenharmony_ci  explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}
3201cb0ef41Sopenharmony_ci
3211cb0ef41Sopenharmony_ci  // Generate a disjunction of code fragments compiled by a function `alt_gen`.
3221cb0ef41Sopenharmony_ci  // `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
3231cb0ef41Sopenharmony_ci  // 1` and should build code corresponding to the ith alternative.
3241cb0ef41Sopenharmony_ci  template <class F>
3251cb0ef41Sopenharmony_ci  void CompileDisjunction(int alt_num, F&& gen_alt) {
3261cb0ef41Sopenharmony_ci    // An alternative a1 | ... | an is compiled into
3271cb0ef41Sopenharmony_ci    //
3281cb0ef41Sopenharmony_ci    //     FORK tail1
3291cb0ef41Sopenharmony_ci    //     <a1>
3301cb0ef41Sopenharmony_ci    //     JMP end
3311cb0ef41Sopenharmony_ci    //   tail1:
3321cb0ef41Sopenharmony_ci    //     FORK tail2
3331cb0ef41Sopenharmony_ci    //     <a2>
3341cb0ef41Sopenharmony_ci    //     JMP end
3351cb0ef41Sopenharmony_ci    //   tail2:
3361cb0ef41Sopenharmony_ci    //     ...
3371cb0ef41Sopenharmony_ci    //     ...
3381cb0ef41Sopenharmony_ci    //   tail{n -1}:
3391cb0ef41Sopenharmony_ci    //     <an>
3401cb0ef41Sopenharmony_ci    //   end:
3411cb0ef41Sopenharmony_ci    //
3421cb0ef41Sopenharmony_ci    // By the semantics of the FORK instruction (see above at definition and
3431cb0ef41Sopenharmony_ci    // semantics), a forked thread has lower priority than the thread that
3441cb0ef41Sopenharmony_ci    // spawned it.  This means that with the code we're generating here, the
3451cb0ef41Sopenharmony_ci    // thread matching the alternative a1 has indeed highest priority, followed
3461cb0ef41Sopenharmony_ci    // by the thread for a2 and so on.
3471cb0ef41Sopenharmony_ci
3481cb0ef41Sopenharmony_ci    if (alt_num == 0) {
3491cb0ef41Sopenharmony_ci      // The empty disjunction.  This can never match.
3501cb0ef41Sopenharmony_ci      assembler_.Fail();
3511cb0ef41Sopenharmony_ci      return;
3521cb0ef41Sopenharmony_ci    }
3531cb0ef41Sopenharmony_ci
3541cb0ef41Sopenharmony_ci    Label end;
3551cb0ef41Sopenharmony_ci
3561cb0ef41Sopenharmony_ci    for (int i = 0; i != alt_num - 1; ++i) {
3571cb0ef41Sopenharmony_ci      Label tail;
3581cb0ef41Sopenharmony_ci      assembler_.Fork(tail);
3591cb0ef41Sopenharmony_ci      gen_alt(i);
3601cb0ef41Sopenharmony_ci      assembler_.Jmp(end);
3611cb0ef41Sopenharmony_ci      assembler_.Bind(tail);
3621cb0ef41Sopenharmony_ci    }
3631cb0ef41Sopenharmony_ci
3641cb0ef41Sopenharmony_ci    gen_alt(alt_num - 1);
3651cb0ef41Sopenharmony_ci
3661cb0ef41Sopenharmony_ci    assembler_.Bind(end);
3671cb0ef41Sopenharmony_ci  }
3681cb0ef41Sopenharmony_ci
3691cb0ef41Sopenharmony_ci  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
3701cb0ef41Sopenharmony_ci    ZoneList<RegExpTree*>& alts = *node->alternatives();
3711cb0ef41Sopenharmony_ci    CompileDisjunction(alts.length(),
3721cb0ef41Sopenharmony_ci                       [&](int i) { alts[i]->Accept(this, nullptr); });
3731cb0ef41Sopenharmony_ci    return nullptr;
3741cb0ef41Sopenharmony_ci  }
3751cb0ef41Sopenharmony_ci
3761cb0ef41Sopenharmony_ci  void* VisitAlternative(RegExpAlternative* node, void*) override {
3771cb0ef41Sopenharmony_ci    for (RegExpTree* child : *node->nodes()) {
3781cb0ef41Sopenharmony_ci      child->Accept(this, nullptr);
3791cb0ef41Sopenharmony_ci    }
3801cb0ef41Sopenharmony_ci    return nullptr;
3811cb0ef41Sopenharmony_ci  }
3821cb0ef41Sopenharmony_ci
3831cb0ef41Sopenharmony_ci  void* VisitAssertion(RegExpAssertion* node, void*) override {
3841cb0ef41Sopenharmony_ci    assembler_.Assertion(node->assertion_type());
3851cb0ef41Sopenharmony_ci    return nullptr;
3861cb0ef41Sopenharmony_ci  }
3871cb0ef41Sopenharmony_ci
3881cb0ef41Sopenharmony_ci  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
3891cb0ef41Sopenharmony_ci    // A character class is compiled as Disjunction over its `CharacterRange`s.
3901cb0ef41Sopenharmony_ci    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
3911cb0ef41Sopenharmony_ci    CharacterRange::Canonicalize(ranges);
3921cb0ef41Sopenharmony_ci    if (node->is_negated()) {
3931cb0ef41Sopenharmony_ci      // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
3941cb0ef41Sopenharmony_ci      // union of k intervals is a union of at most k + 1 intervals.
3951cb0ef41Sopenharmony_ci      ZoneList<CharacterRange>* negated =
3961cb0ef41Sopenharmony_ci          zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
3971cb0ef41Sopenharmony_ci      CharacterRange::Negate(ranges, negated, zone_);
3981cb0ef41Sopenharmony_ci      DCHECK_LE(negated->length(), ranges->length() + 1);
3991cb0ef41Sopenharmony_ci      ranges = negated;
4001cb0ef41Sopenharmony_ci    }
4011cb0ef41Sopenharmony_ci
4021cb0ef41Sopenharmony_ci    CompileDisjunction(ranges->length(), [&](int i) {
4031cb0ef41Sopenharmony_ci      // We don't support utf16 for now, so only ranges that can be specified
4041cb0ef41Sopenharmony_ci      // by (complements of) ranges with base::uc16 bounds.
4051cb0ef41Sopenharmony_ci      STATIC_ASSERT(kMaxSupportedCodepoint <=
4061cb0ef41Sopenharmony_ci                    std::numeric_limits<base::uc16>::max());
4071cb0ef41Sopenharmony_ci
4081cb0ef41Sopenharmony_ci      base::uc32 from = (*ranges)[i].from();
4091cb0ef41Sopenharmony_ci      DCHECK_LE(from, kMaxSupportedCodepoint);
4101cb0ef41Sopenharmony_ci      base::uc16 from_uc16 = static_cast<base::uc16>(from);
4111cb0ef41Sopenharmony_ci
4121cb0ef41Sopenharmony_ci      base::uc32 to = (*ranges)[i].to();
4131cb0ef41Sopenharmony_ci      DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == kMaxCodePoint);
4141cb0ef41Sopenharmony_ci      base::uc16 to_uc16 =
4151cb0ef41Sopenharmony_ci          static_cast<base::uc16>(std::min(to, kMaxSupportedCodepoint));
4161cb0ef41Sopenharmony_ci
4171cb0ef41Sopenharmony_ci      assembler_.ConsumeRange(from_uc16, to_uc16);
4181cb0ef41Sopenharmony_ci    });
4191cb0ef41Sopenharmony_ci    return nullptr;
4201cb0ef41Sopenharmony_ci  }
4211cb0ef41Sopenharmony_ci
4221cb0ef41Sopenharmony_ci  void* VisitAtom(RegExpAtom* node, void*) override {
4231cb0ef41Sopenharmony_ci    for (base::uc16 c : node->data()) {
4241cb0ef41Sopenharmony_ci      assembler_.ConsumeRange(c, c);
4251cb0ef41Sopenharmony_ci    }
4261cb0ef41Sopenharmony_ci    return nullptr;
4271cb0ef41Sopenharmony_ci  }
4281cb0ef41Sopenharmony_ci
4291cb0ef41Sopenharmony_ci  void ClearRegisters(Interval indices) {
4301cb0ef41Sopenharmony_ci    if (indices.is_empty()) return;
4311cb0ef41Sopenharmony_ci    DCHECK_EQ(indices.from() % 2, 0);
4321cb0ef41Sopenharmony_ci    DCHECK_EQ(indices.to() % 2, 1);
4331cb0ef41Sopenharmony_ci    for (int i = indices.from(); i <= indices.to(); i += 2) {
4341cb0ef41Sopenharmony_ci      // It suffices to clear the register containing the `begin` of a capture
4351cb0ef41Sopenharmony_ci      // because this indicates that the capture is undefined, regardless of
4361cb0ef41Sopenharmony_ci      // the value in the `end` register.
4371cb0ef41Sopenharmony_ci      assembler_.ClearRegister(i);
4381cb0ef41Sopenharmony_ci    }
4391cb0ef41Sopenharmony_ci  }
4401cb0ef41Sopenharmony_ci
4411cb0ef41Sopenharmony_ci  // Emit bytecode corresponding to /<emit_body>*/.
4421cb0ef41Sopenharmony_ci  template <class F>
4431cb0ef41Sopenharmony_ci  void CompileGreedyStar(F&& emit_body) {
4441cb0ef41Sopenharmony_ci    // This is compiled into
4451cb0ef41Sopenharmony_ci    //
4461cb0ef41Sopenharmony_ci    //   begin:
4471cb0ef41Sopenharmony_ci    //     FORK end
4481cb0ef41Sopenharmony_ci    //     <body>
4491cb0ef41Sopenharmony_ci    //     JMP begin
4501cb0ef41Sopenharmony_ci    //   end:
4511cb0ef41Sopenharmony_ci    //     ...
4521cb0ef41Sopenharmony_ci    //
4531cb0ef41Sopenharmony_ci    // This is greedy because a forked thread has lower priority than the
4541cb0ef41Sopenharmony_ci    // thread that spawned it.
4551cb0ef41Sopenharmony_ci    Label begin;
4561cb0ef41Sopenharmony_ci    Label end;
4571cb0ef41Sopenharmony_ci
4581cb0ef41Sopenharmony_ci    assembler_.Bind(begin);
4591cb0ef41Sopenharmony_ci    assembler_.Fork(end);
4601cb0ef41Sopenharmony_ci    emit_body();
4611cb0ef41Sopenharmony_ci    assembler_.Jmp(begin);
4621cb0ef41Sopenharmony_ci
4631cb0ef41Sopenharmony_ci    assembler_.Bind(end);
4641cb0ef41Sopenharmony_ci  }
4651cb0ef41Sopenharmony_ci
4661cb0ef41Sopenharmony_ci  // Emit bytecode corresponding to /<emit_body>*?/.
4671cb0ef41Sopenharmony_ci  template <class F>
4681cb0ef41Sopenharmony_ci  void CompileNonGreedyStar(F&& emit_body) {
4691cb0ef41Sopenharmony_ci    // This is compiled into
4701cb0ef41Sopenharmony_ci    //
4711cb0ef41Sopenharmony_ci    //     FORK body
4721cb0ef41Sopenharmony_ci    //     JMP end
4731cb0ef41Sopenharmony_ci    //   body:
4741cb0ef41Sopenharmony_ci    //     <body>
4751cb0ef41Sopenharmony_ci    //     FORK body
4761cb0ef41Sopenharmony_ci    //   end:
4771cb0ef41Sopenharmony_ci    //     ...
4781cb0ef41Sopenharmony_ci
4791cb0ef41Sopenharmony_ci    Label body;
4801cb0ef41Sopenharmony_ci    Label end;
4811cb0ef41Sopenharmony_ci
4821cb0ef41Sopenharmony_ci    assembler_.Fork(body);
4831cb0ef41Sopenharmony_ci    assembler_.Jmp(end);
4841cb0ef41Sopenharmony_ci
4851cb0ef41Sopenharmony_ci    assembler_.Bind(body);
4861cb0ef41Sopenharmony_ci    emit_body();
4871cb0ef41Sopenharmony_ci    assembler_.Fork(body);
4881cb0ef41Sopenharmony_ci
4891cb0ef41Sopenharmony_ci    assembler_.Bind(end);
4901cb0ef41Sopenharmony_ci  }
4911cb0ef41Sopenharmony_ci
4921cb0ef41Sopenharmony_ci  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
4931cb0ef41Sopenharmony_ci  template <class F>
4941cb0ef41Sopenharmony_ci  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
4951cb0ef41Sopenharmony_ci    // This is compiled into
4961cb0ef41Sopenharmony_ci    //
4971cb0ef41Sopenharmony_ci    //     FORK end
4981cb0ef41Sopenharmony_ci    //     <body>
4991cb0ef41Sopenharmony_ci    //     FORK end
5001cb0ef41Sopenharmony_ci    //     <body>
5011cb0ef41Sopenharmony_ci    //     ...
5021cb0ef41Sopenharmony_ci    //     ...
5031cb0ef41Sopenharmony_ci    //     FORK end
5041cb0ef41Sopenharmony_ci    //     <body>
5051cb0ef41Sopenharmony_ci    //   end:
5061cb0ef41Sopenharmony_ci    //     ...
5071cb0ef41Sopenharmony_ci
5081cb0ef41Sopenharmony_ci    Label end;
5091cb0ef41Sopenharmony_ci    for (int i = 0; i != max_repetition_num; ++i) {
5101cb0ef41Sopenharmony_ci      assembler_.Fork(end);
5111cb0ef41Sopenharmony_ci      emit_body();
5121cb0ef41Sopenharmony_ci    }
5131cb0ef41Sopenharmony_ci    assembler_.Bind(end);
5141cb0ef41Sopenharmony_ci  }
5151cb0ef41Sopenharmony_ci
5161cb0ef41Sopenharmony_ci  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
5171cb0ef41Sopenharmony_ci  template <class F>
5181cb0ef41Sopenharmony_ci  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
5191cb0ef41Sopenharmony_ci    // This is compiled into
5201cb0ef41Sopenharmony_ci    //
5211cb0ef41Sopenharmony_ci    //     FORK body0
5221cb0ef41Sopenharmony_ci    //     JMP end
5231cb0ef41Sopenharmony_ci    //   body0:
5241cb0ef41Sopenharmony_ci    //     <body>
5251cb0ef41Sopenharmony_ci    //     FORK body1
5261cb0ef41Sopenharmony_ci    //     JMP end
5271cb0ef41Sopenharmony_ci    //   body1:
5281cb0ef41Sopenharmony_ci    //     <body>
5291cb0ef41Sopenharmony_ci    //     ...
5301cb0ef41Sopenharmony_ci    //     ...
5311cb0ef41Sopenharmony_ci    //   body{max_repetition_num - 1}:
5321cb0ef41Sopenharmony_ci    //     <body>
5331cb0ef41Sopenharmony_ci    //   end:
5341cb0ef41Sopenharmony_ci    //     ...
5351cb0ef41Sopenharmony_ci
5361cb0ef41Sopenharmony_ci    Label end;
5371cb0ef41Sopenharmony_ci    for (int i = 0; i != max_repetition_num; ++i) {
5381cb0ef41Sopenharmony_ci      Label body;
5391cb0ef41Sopenharmony_ci      assembler_.Fork(body);
5401cb0ef41Sopenharmony_ci      assembler_.Jmp(end);
5411cb0ef41Sopenharmony_ci
5421cb0ef41Sopenharmony_ci      assembler_.Bind(body);
5431cb0ef41Sopenharmony_ci      emit_body();
5441cb0ef41Sopenharmony_ci    }
5451cb0ef41Sopenharmony_ci    assembler_.Bind(end);
5461cb0ef41Sopenharmony_ci  }
5471cb0ef41Sopenharmony_ci
5481cb0ef41Sopenharmony_ci  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
5491cb0ef41Sopenharmony_ci    // Emit the body, but clear registers occuring in body first.
5501cb0ef41Sopenharmony_ci    //
5511cb0ef41Sopenharmony_ci    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
5521cb0ef41Sopenharmony_ci    // and b) clear them. For example, we don't have to capture anything for
5531cb0ef41Sopenharmony_ci    // the first 4 repetitions if node->min() >= 5, and then we don't have to
5541cb0ef41Sopenharmony_ci    // clear registers in the first node->min() repetitions.
5551cb0ef41Sopenharmony_ci    // Later, and if node->min() == 0, we don't have to clear registers before
5561cb0ef41Sopenharmony_ci    // the first optional repetition.
5571cb0ef41Sopenharmony_ci    Interval body_registers = node->body()->CaptureRegisters();
5581cb0ef41Sopenharmony_ci    auto emit_body = [&]() {
5591cb0ef41Sopenharmony_ci      ClearRegisters(body_registers);
5601cb0ef41Sopenharmony_ci      node->body()->Accept(this, nullptr);
5611cb0ef41Sopenharmony_ci    };
5621cb0ef41Sopenharmony_ci
5631cb0ef41Sopenharmony_ci    // First repeat the body `min()` times.
5641cb0ef41Sopenharmony_ci    for (int i = 0; i != node->min(); ++i) emit_body();
5651cb0ef41Sopenharmony_ci
5661cb0ef41Sopenharmony_ci    switch (node->quantifier_type()) {
5671cb0ef41Sopenharmony_ci      case RegExpQuantifier::POSSESSIVE:
5681cb0ef41Sopenharmony_ci        UNREACHABLE();
5691cb0ef41Sopenharmony_ci      case RegExpQuantifier::GREEDY: {
5701cb0ef41Sopenharmony_ci        if (node->max() == RegExpTree::kInfinity) {
5711cb0ef41Sopenharmony_ci          CompileGreedyStar(emit_body);
5721cb0ef41Sopenharmony_ci        } else {
5731cb0ef41Sopenharmony_ci          DCHECK_NE(node->max(), RegExpTree::kInfinity);
5741cb0ef41Sopenharmony_ci          CompileGreedyRepetition(emit_body, node->max() - node->min());
5751cb0ef41Sopenharmony_ci        }
5761cb0ef41Sopenharmony_ci        break;
5771cb0ef41Sopenharmony_ci      }
5781cb0ef41Sopenharmony_ci      case RegExpQuantifier::NON_GREEDY: {
5791cb0ef41Sopenharmony_ci        if (node->max() == RegExpTree::kInfinity) {
5801cb0ef41Sopenharmony_ci          CompileNonGreedyStar(emit_body);
5811cb0ef41Sopenharmony_ci        } else {
5821cb0ef41Sopenharmony_ci          DCHECK_NE(node->max(), RegExpTree::kInfinity);
5831cb0ef41Sopenharmony_ci          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
5841cb0ef41Sopenharmony_ci        }
5851cb0ef41Sopenharmony_ci      }
5861cb0ef41Sopenharmony_ci    }
5871cb0ef41Sopenharmony_ci    return nullptr;
5881cb0ef41Sopenharmony_ci  }
5891cb0ef41Sopenharmony_ci
5901cb0ef41Sopenharmony_ci  void* VisitCapture(RegExpCapture* node, void*) override {
5911cb0ef41Sopenharmony_ci    int index = node->index();
5921cb0ef41Sopenharmony_ci    int start_register = RegExpCapture::StartRegister(index);
5931cb0ef41Sopenharmony_ci    int end_register = RegExpCapture::EndRegister(index);
5941cb0ef41Sopenharmony_ci    assembler_.SetRegisterToCp(start_register);
5951cb0ef41Sopenharmony_ci    node->body()->Accept(this, nullptr);
5961cb0ef41Sopenharmony_ci    assembler_.SetRegisterToCp(end_register);
5971cb0ef41Sopenharmony_ci    return nullptr;
5981cb0ef41Sopenharmony_ci  }
5991cb0ef41Sopenharmony_ci
6001cb0ef41Sopenharmony_ci  void* VisitGroup(RegExpGroup* node, void*) override {
6011cb0ef41Sopenharmony_ci    node->body()->Accept(this, nullptr);
6021cb0ef41Sopenharmony_ci    return nullptr;
6031cb0ef41Sopenharmony_ci  }
6041cb0ef41Sopenharmony_ci
6051cb0ef41Sopenharmony_ci  void* VisitLookaround(RegExpLookaround* node, void*) override {
6061cb0ef41Sopenharmony_ci    // TODO(mbid,v8:10765): Support this case.
6071cb0ef41Sopenharmony_ci    UNREACHABLE();
6081cb0ef41Sopenharmony_ci  }
6091cb0ef41Sopenharmony_ci
6101cb0ef41Sopenharmony_ci  void* VisitBackReference(RegExpBackReference* node, void*) override {
6111cb0ef41Sopenharmony_ci    UNREACHABLE();
6121cb0ef41Sopenharmony_ci  }
6131cb0ef41Sopenharmony_ci
6141cb0ef41Sopenharmony_ci  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
6151cb0ef41Sopenharmony_ci
6161cb0ef41Sopenharmony_ci  void* VisitText(RegExpText* node, void*) override {
6171cb0ef41Sopenharmony_ci    for (TextElement& text_el : *node->elements()) {
6181cb0ef41Sopenharmony_ci      text_el.tree()->Accept(this, nullptr);
6191cb0ef41Sopenharmony_ci    }
6201cb0ef41Sopenharmony_ci    return nullptr;
6211cb0ef41Sopenharmony_ci  }
6221cb0ef41Sopenharmony_ci
6231cb0ef41Sopenharmony_ci private:
6241cb0ef41Sopenharmony_ci  Zone* zone_;
6251cb0ef41Sopenharmony_ci  BytecodeAssembler assembler_;
6261cb0ef41Sopenharmony_ci};
6271cb0ef41Sopenharmony_ci
6281cb0ef41Sopenharmony_ci}  // namespace
6291cb0ef41Sopenharmony_ci
6301cb0ef41Sopenharmony_ciZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
6311cb0ef41Sopenharmony_ci    RegExpTree* tree, RegExpFlags flags, Zone* zone) {
6321cb0ef41Sopenharmony_ci  return CompileVisitor::Compile(tree, flags, zone);
6331cb0ef41Sopenharmony_ci}
6341cb0ef41Sopenharmony_ci
6351cb0ef41Sopenharmony_ci}  // namespace internal
6361cb0ef41Sopenharmony_ci}  // namespace v8
637