// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/experimental/experimental-compiler.h"

#include "src/base/strings.h"
#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"

namespace v8 {
namespace internal {

namespace {

// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr base::uc32 kMaxSupportedCodepoint = 0xFFFFu;
#ifdef DEBUG
constexpr base::uc32 kMaxCodePoint = 0x10ffff;
#endif  // DEBUG

class CanBeHandledVisitor final : private RegExpVisitor {
  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
 public:
  static bool Check(RegExpTree* tree, RegExpFlags flags, int capture_count) {
    if (!AreSuitableFlags(flags)) return false;
    CanBeHandledVisitor visitor;
    tree->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  CanBeHandledVisitor() = default;

  static bool AreSuitableFlags(RegExpFlags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr RegExpFlags kAllowedFlags =
        RegExpFlag::kGlobal | RegExpFlag::kSticky | RegExpFlag::kMultiline |
        RegExpFlag::kDotAll | RegExpFlag::kLinear;
    // We support Unicode iff kUnicode is among the allowed flags.
    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
                  IsUnicode(kAllowedFlags));
    return (flags & ~kAllowedFlags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Finite but large values of `min()` and `max()` are bad for the
    // breadth-first engine because finite (optional) repetition is dealt with
    // by replicating the bytecode of the body of the quantifier.  The number
    // of replications grows exponentially in how deeply quantifiers are
    // nested.  `replication_factor_` keeps track of how often the current
    // node will have to be replicated in the generated bytecode, and we don't
    // allow this to exceed some small value.
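    //
    // For example (illustrative): /a{16}/ is accepted (factor 16), while
    // /a{17}/ is rejected outright by the check below, and /(a{5}){5}/ is
    // rejected because the nested factors multiply to 25.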
    static constexpr int kMaxReplicationFactor = 16;

    // First we rule out values for min and max that are too big even before
    // taking into account the ambient replication_factor_.  This also guards
    // against overflows in `local_replication` or `replication_factor_`.
    if (node->min() > kMaxReplicationFactor ||
        (node->max() != RegExpTree::kInfinity &&
         node->max() > kMaxReplicationFactor)) {
      result_ = false;
      return nullptr;
    }

    // Save the current replication factor so that it can be restored if we
    // return with `result_ == true`.
    int before_replication_factor = replication_factor_;

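    // If `max()` is infinite, the body is replicated `min() + 1` times: once
    // per mandatory repetition, plus once more inside the unbounded star loop
    // (see `CompileVisitor::VisitQuantifier` below).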
    int local_replication;
    if (node->max() == RegExpTree::kInfinity) {
      local_replication = node->min() + 1;
    } else {
      local_replication = node->max();
    }

    replication_factor_ *= local_replication;
    if (replication_factor_ > kMaxReplicationFactor) {
      result_ = false;
      return nullptr;
    }

    switch (node->quantifier_type()) {
      case RegExpQuantifier::GREEDY:
      case RegExpQuantifier::NON_GREEDY:
        break;
      case RegExpQuantifier::POSSESSIVE:
        // TODO(mbid, v8:10765): It's not clear to me whether this can be
        // supported in breadth-first mode. Re2 doesn't support it.
        result_ = false;
        return nullptr;
    }

    node->body()->Accept(this, nullptr);
    replication_factor_ = before_replication_factor;
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible I
    // think.  See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

 private:
  // See comment in `VisitQuantifier`:
  int replication_factor_ = 1;

  bool result_ = true;
};

}  // namespace

bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              RegExpFlags flags,
                                              int capture_count) {
  return CanBeHandledVisitor::Check(tree, flags, capture_count);
}

namespace {

// A label in bytecode which starts with no known address. The address *must*
// be bound with `Bind` before the label goes out of scope.
// Implemented as a linked list through the `payload.pc` of FORK and JMP
// instructions.
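// For example, after two FORKs to the same unbound label, the label holds the
// index of the second FORK, whose `payload.pc` holds the index of the first,
// whose `payload.pc` is -1, terminating the list.  `Bind` walks this chain
// and patches each `payload.pc` to the final bound index.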
struct Label {
 public:
  Label() = default;
  ~Label() {
    DCHECK_EQ(state_, BOUND);
    DCHECK_GE(bound_index_, 0);
  }

  // Don't copy, don't move.  Moving could be implemented, but it's not
  // needed anywhere.
  Label(const Label&) = delete;
  Label& operator=(const Label&) = delete;

 private:
  friend class BytecodeAssembler;

  // UNBOUND implies that `unbound_patch_list_begin_` is the active union
  // member; BOUND implies that `bound_index_` is.
  enum { UNBOUND, BOUND } state_ = UNBOUND;
  union {
    int unbound_patch_list_begin_ = -1;
    int bound_index_;
  };
};

class BytecodeAssembler {
 public:
  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
  // the `tree` size we're going to compile?
  explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}

  ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }

  void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }

  void Assertion(RegExpAssertion::Type t) {
    code_.Add(RegExpInstruction::Assertion(t), zone_);
  }

  void ClearRegister(int32_t register_index) {
    code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
  }

  void ConsumeRange(base::uc16 from, base::uc16 to) {
    code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
  }

  void ConsumeAnyChar() {
    code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
  }

  void Fork(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
  }

  void Jmp(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
  }

  void SetRegisterToCp(int32_t register_index) {
    code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
  }

  void Bind(Label& target) {
    DCHECK_EQ(target.state_, Label::UNBOUND);

    int index = code_.length();

    while (target.unbound_patch_list_begin_ != -1) {
      RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
      DCHECK(inst.opcode == RegExpInstruction::FORK ||
             inst.opcode == RegExpInstruction::JMP);

      target.unbound_patch_list_begin_ = inst.payload.pc;
      inst.payload.pc = index;
    }

    target.state_ = Label::BOUND;
    target.bound_index_ = index;
  }

  void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }

 private:
  void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
    RegExpInstruction result;
    result.opcode = op;

    if (target.state_ == Label::BOUND) {
      result.payload.pc = target.bound_index_;
    } else {
      DCHECK_EQ(target.state_, Label::UNBOUND);
      int new_list_begin = code_.length();
      DCHECK_GE(new_list_begin, 0);

      result.payload.pc = target.unbound_patch_list_begin_;

      target.unbound_patch_list_begin_ = new_list_begin;
    }

    code_.Add(result, zone_);
  }

  Zone* zone_;
  ZoneList<RegExpInstruction> code_;
};

class CompileVisitor : private RegExpVisitor {
 public:
  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
                                             RegExpFlags flags, Zone* zone) {
    CompileVisitor compiler(zone);

    if (!IsSticky(flags) && !tree->IsAnchoredAtStart()) {
      // The match is not anchored, i.e. may start at any input position, so we
      // emit a preamble corresponding to /.*?/.  This skips an arbitrary
      // prefix in the input non-greedily.
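      // For example, /abc/ (unanchored, non-sticky) behaves as if compiled
      // from /.*?abc/: the preamble below followed by the code for /abc/.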
      compiler.CompileNonGreedyStar(
          [&]() { compiler.assembler_.ConsumeAnyChar(); });
    }

    compiler.assembler_.SetRegisterToCp(0);
    tree->Accept(&compiler, nullptr);
    compiler.assembler_.SetRegisterToCp(1);
    compiler.assembler_.Accept();

    return std::move(compiler.assembler_).IntoCode();
  }

 private:
  explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}

  // Generate a disjunction of code fragments compiled by a function
  // `gen_alt`.  `gen_alt` is called repeatedly with argument
  // `int i = 0, 1, ..., alt_num - 1` and should build code corresponding to
  // the ith alternative.
  template <class F>
  void CompileDisjunction(int alt_num, F&& gen_alt) {
    // A disjunction a1 | ... | an is compiled into
    //
    //     FORK tail1
    //     <a1>
    //     JMP end
    //   tail1:
    //     FORK tail2
    //     <a2>
    //     JMP end
    //   tail2:
    //     ...
    //     ...
    //   tail{n - 1}:
    //     <an>
    //   end:
    //
    // By the semantics of the FORK instruction, a forked thread has lower
    // priority than the thread that spawned it.  This means that with the
    // code we're generating here, the thread matching the alternative a1
    // indeed has the highest priority, followed by the thread for a2 and so
    // on.
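    //
    // As a concrete sketch, /a|b|c/ compiles roughly to:
    //
    //     FORK tail1
    //     CONSUME_RANGE 'a' 'a'
    //     JMP end
    //   tail1:
    //     FORK tail2
    //     CONSUME_RANGE 'b' 'b'
    //     JMP end
    //   tail2:
    //     CONSUME_RANGE 'c' 'c'
    //   end: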

    if (alt_num == 0) {
      // The empty disjunction.  This can never match.
      assembler_.Fail();
      return;
    }

    Label end;

    for (int i = 0; i != alt_num - 1; ++i) {
      Label tail;
      assembler_.Fork(tail);
      gen_alt(i);
      assembler_.Jmp(end);
      assembler_.Bind(tail);
    }

    gen_alt(alt_num - 1);

    assembler_.Bind(end);
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    ZoneList<RegExpTree*>& alts = *node->alternatives();
    CompileDisjunction(alts.length(),
                       [&](int i) { alts[i]->Accept(this, nullptr); });
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
    }
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    assembler_.Assertion(node->assertion_type());
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    // A character class is compiled as a disjunction over its
    // `CharacterRange`s.
    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
    CharacterRange::Canonicalize(ranges);
    if (node->is_negated()) {
      // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
      // union of k intervals is a union of at most k + 1 intervals.
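      // For example, the complement of the single interval ['a', 'c'] is the
      // union of [0x0000, 'a' - 1] and ['c' + 1, kMaxCodePoint].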
      ZoneList<CharacterRange>* negated =
          zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
      CharacterRange::Negate(ranges, negated, zone_);
      DCHECK_LE(negated->length(), ranges->length() + 1);
      ranges = negated;
    }

    CompileDisjunction(ranges->length(), [&](int i) {
      // We don't support utf16 for now, so we only support ranges that can be
      // specified by (complements of) ranges with base::uc16 bounds.
      STATIC_ASSERT(kMaxSupportedCodepoint <=
                    std::numeric_limits<base::uc16>::max());

      base::uc32 from = (*ranges)[i].from();
      DCHECK_LE(from, kMaxSupportedCodepoint);
      base::uc16 from_uc16 = static_cast<base::uc16>(from);

      base::uc32 to = (*ranges)[i].to();
      DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == kMaxCodePoint);
      base::uc16 to_uc16 =
          static_cast<base::uc16>(std::min(to, kMaxSupportedCodepoint));

      assembler_.ConsumeRange(from_uc16, to_uc16);
    });
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    for (base::uc16 c : node->data()) {
      assembler_.ConsumeRange(c, c);
    }
    return nullptr;
  }

  void ClearRegisters(Interval indices) {
    if (indices.is_empty()) return;
    DCHECK_EQ(indices.from() % 2, 0);
    DCHECK_EQ(indices.to() % 2, 1);
    for (int i = indices.from(); i <= indices.to(); i += 2) {
      // It suffices to clear the register containing the `begin` of a capture
      // because this indicates that the capture is undefined, regardless of
      // the value in the `end` register.
      assembler_.ClearRegister(i);
    }
  }

  // Emit bytecode corresponding to /<emit_body>*/.
  template <class F>
  void CompileGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //   begin:
    //     FORK end
    //     <body>
    //     JMP begin
    //   end:
    //     ...
    //
    // This is greedy because a forked thread has lower priority than the
    // thread that spawned it.
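    // For example, in /a*/ the high-priority thread keeps looping through the
    // body consuming 'a's, while each iteration forks a lower-priority thread
    // that exits at `end`.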
    Label begin;
    Label end;

    assembler_.Bind(begin);
    assembler_.Fork(end);
    emit_body();
    assembler_.Jmp(begin);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>*?/.
  template <class F>
  void CompileNonGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //     FORK body
    //     JMP end
    //   body:
    //     <body>
    //     FORK body
    //   end:
    //     ...
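    //
    // This is non-greedy because the thread that jumps straight to `end` has
    // higher priority than the forked thread executing the body; e.g. /a*?/
    // prefers to consume as few 'a's as possible.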

    Label body;
    Label end;

    assembler_.Fork(body);
    assembler_.Jmp(end);

    assembler_.Bind(body);
    emit_body();
    assembler_.Fork(body);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
  template <class F>
  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK end
    //     <body>
    //     FORK end
    //     <body>
    //     ...
    //     ...
    //     FORK end
    //     <body>
    //   end:
    //     ...
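    //
    // For example, /a{0,2}/ unrolls into two copies of `FORK end` followed by
    // the body; at each optional repetition a lower-priority thread may bail
    // out to `end`.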

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      assembler_.Fork(end);
      emit_body();
    }
    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
  template <class F>
  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK body0
    //     JMP end
    //   body0:
    //     <body>
    //     FORK body1
    //     JMP end
    //   body1:
    //     <body>
    //     ...
    //     ...
    //   body{max_repetition_num - 1}:
    //     <body>
    //   end:
    //     ...
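    //
    // For example, /a{0,2}?/ unrolls into two copies of the fragment
    // `FORK body; JMP end; body: <body>`; at each optional repetition the
    // higher-priority thread jumps to `end` and tries the continuation first.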

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      Label body;
      assembler_.Fork(body);
      assembler_.Jmp(end);

      assembler_.Bind(body);
      emit_body();
    }
    assembler_.Bind(end);
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Emit the body, but clear registers occurring in the body first.
    //
    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
    // and b) clear them. For example, we don't have to capture anything for
    // the first 4 repetitions if node->min() >= 5, and then we don't have to
    // clear registers in the first node->min() repetitions.
    // Later, and if node->min() == 0, we don't have to clear registers before
    // the first optional repetition.
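    //
    // For example, in /(?:(a)|(b))+/ matching "ab", the second iteration
    // takes the (b) branch; clearing the registers of (a) beforehand ensures
    // that capture 1 ends up undefined instead of retaining "a".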
    Interval body_registers = node->body()->CaptureRegisters();
    auto emit_body = [&]() {
      ClearRegisters(body_registers);
      node->body()->Accept(this, nullptr);
    };

    // First repeat the body `min()` times.
    for (int i = 0; i != node->min(); ++i) emit_body();

    switch (node->quantifier_type()) {
      case RegExpQuantifier::POSSESSIVE:
        UNREACHABLE();
      case RegExpQuantifier::GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileGreedyRepetition(emit_body, node->max() - node->min());
        }
        break;
      }
      case RegExpQuantifier::NON_GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileNonGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
        }
      }
    }
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    int index = node->index();
    int start_register = RegExpCapture::StartRegister(index);
    int end_register = RegExpCapture::EndRegister(index);
    assembler_.SetRegisterToCp(start_register);
    node->body()->Accept(this, nullptr);
    assembler_.SetRegisterToCp(end_register);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid,v8:10765): Support this case.
    UNREACHABLE();
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    UNREACHABLE();
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& text_el : *node->elements()) {
      text_el.tree()->Accept(this, nullptr);
    }
    return nullptr;
  }

 private:
  Zone* zone_;
  BytecodeAssembler assembler_;
};

}  // namespace

ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, RegExpFlags flags, Zone* zone) {
  return CompileVisitor::Compile(tree, flags, zone);
}

}  // namespace internal
}  // namespace v8