xref: /third_party/node/deps/v8/src/regexp/regexp-ast.h (revision 1cb0ef41)
1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_AST_H_
6#define V8_REGEXP_REGEXP_AST_H_
7
8#include "src/base/strings.h"
9#include "src/regexp/regexp-flags.h"
10#include "src/zone/zone-containers.h"
11#include "src/zone/zone-list.h"
12#include "src/zone/zone.h"
13
14namespace v8 {
15namespace internal {
16
17#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
18  VISIT(Disjunction)                      \
19  VISIT(Alternative)                      \
20  VISIT(Assertion)                        \
21  VISIT(CharacterClass)                   \
22  VISIT(Atom)                             \
23  VISIT(Quantifier)                       \
24  VISIT(Capture)                          \
25  VISIT(Group)                            \
26  VISIT(Lookaround)                       \
27  VISIT(BackReference)                    \
28  VISIT(Empty)                            \
29  VISIT(Text)
30
31#define FORWARD_DECLARE(Name) class RegExp##Name;
32FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
33#undef FORWARD_DECLARE
34
35class RegExpCompiler;
36class RegExpNode;
37class RegExpTree;
38
39class RegExpVisitor {
40 public:
41  virtual ~RegExpVisitor() = default;
42#define MAKE_CASE(Name) \
43  virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
44  FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
45#undef MAKE_CASE
46};
47
48// A simple closed interval.
49class Interval {
50 public:
51  Interval() : from_(kNone), to_(kNone - 1) {}  // '- 1' for branchless size().
52  Interval(int from, int to) : from_(from), to_(to) {}
53  Interval Union(Interval that) {
54    if (that.from_ == kNone) return *this;
55    if (from_ == kNone) return that;
56    return Interval(std::min(from_, that.from_), std::max(to_, that.to_));
57  }
58
59  static Interval Empty() { return Interval(); }
60
61  bool Contains(int value) const { return (from_ <= value) && (value <= to_); }
62  bool is_empty() const { return from_ == kNone; }
63  int from() const { return from_; }
64  int to() const { return to_; }
65  int size() const { return to_ - from_ + 1; }
66
67  static constexpr int kNone = -1;
68
69 private:
70  int from_;
71  int to_;
72};
73
74// Named standard character sets.
75enum class StandardCharacterSet : char {
76  kWhitespace = 's',         // Like /\s/.
77  kNotWhitespace = 'S',      // Like /\S/.
78  kWord = 'w',               // Like /\w/.
79  kNotWord = 'W',            // Like /\W/.
80  kDigit = 'd',              // Like /\d/.
81  kNotDigit = 'D',           // Like /\D/.
82  kLineTerminator = 'n',     // The inverse of /./.
83  kNotLineTerminator = '.',  // Like /./.
84  kEverything = '*',         // Matches every character, like /./s.
85};
86
87// Represents code points (with values up to 0x10FFFF) in the range from from_
88// to to_, both ends are inclusive.
89class CharacterRange {
90 public:
91  CharacterRange() = default;
92  // For compatibility with the CHECK_OK macro.
93  CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
94
95  static inline CharacterRange Singleton(base::uc32 value) {
96    return CharacterRange(value, value);
97  }
98  static inline CharacterRange Range(base::uc32 from, base::uc32 to) {
99    DCHECK(0 <= from && to <= kMaxCodePoint);
100    DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
101    return CharacterRange(from, to);
102  }
103  static inline CharacterRange Everything() {
104    return CharacterRange(0, kMaxCodePoint);
105  }
106
107  static inline ZoneList<CharacterRange>* List(Zone* zone,
108                                               CharacterRange range) {
109    ZoneList<CharacterRange>* list =
110        zone->New<ZoneList<CharacterRange>>(1, zone);
111    list->Add(range, zone);
112    return list;
113  }
114
115  // Add class escapes. Add case equivalent closure for \w and \W if necessary.
116  V8_EXPORT_PRIVATE static void AddClassEscape(
117      StandardCharacterSet standard_character_set,
118      ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents,
119      Zone* zone);
120  V8_EXPORT_PRIVATE static void AddCaseEquivalents(
121      Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
122      bool is_one_byte);
123
124  bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
125  base::uc32 from() const { return from_; }
126  base::uc32 to() const { return to_; }
127  bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; }
128  bool IsSingleton() const { return from_ == to_; }
129  // Whether a range list is in canonical form: Ranges ordered by from value,
130  // and ranges non-overlapping and non-adjacent.
131  V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
132  // Convert range list to canonical form. The characters covered by the ranges
133  // will still be the same, but no character is in more than one range, and
134  // adjacent ranges are merged. The resulting list may be shorter than the
135  // original, but cannot be longer.
136  static void Canonicalize(ZoneList<CharacterRange>* ranges);
137  // Negate the contents of a character range in canonical form.
138  static void Negate(ZoneList<CharacterRange>* src,
139                     ZoneList<CharacterRange>* dst, Zone* zone);
140
141  // Remove all ranges outside the one-byte range.
142  static void ClampToOneByte(ZoneList<CharacterRange>* ranges);
143
144 private:
145  CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {}
146
147  static constexpr int kMaxCodePoint = 0x10ffff;
148
149  base::uc32 from_ = 0;
150  base::uc32 to_ = 0;
151};
152
153#define DECL_BOILERPLATE(Name)                                         \
154  void* Accept(RegExpVisitor* visitor, void* data) override;           \
155  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \
156      override;                                                        \
157  RegExp##Name* As##Name() override;                                   \
158  bool Is##Name() override
159
160class RegExpTree : public ZoneObject {
161 public:
162  static const int kInfinity = kMaxInt;
163  virtual ~RegExpTree() = default;
164  virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
165  virtual RegExpNode* ToNode(RegExpCompiler* compiler,
166                             RegExpNode* on_success) = 0;
167  virtual bool IsTextElement() { return false; }
168  virtual bool IsAnchoredAtStart() { return false; }
169  virtual bool IsAnchoredAtEnd() { return false; }
170  virtual int min_match() = 0;
171  virtual int max_match() = 0;
172  // Returns the interval of registers used for captures within this
173  // expression.
174  virtual Interval CaptureRegisters() { return Interval::Empty(); }
175  virtual void AppendToText(RegExpText* text, Zone* zone);
176  V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone);
177#define MAKE_ASTYPE(Name)           \
178  virtual RegExp##Name* As##Name(); \
179  virtual bool Is##Name();
180  FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
181#undef MAKE_ASTYPE
182};
183
184
185class RegExpDisjunction final : public RegExpTree {
186 public:
187  explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
188
189  DECL_BOILERPLATE(Disjunction);
190
191  Interval CaptureRegisters() override;
192  bool IsAnchoredAtStart() override;
193  bool IsAnchoredAtEnd() override;
194  int min_match() override { return min_match_; }
195  int max_match() override { return max_match_; }
196  ZoneList<RegExpTree*>* alternatives() const { return alternatives_; }
197
198 private:
199  bool SortConsecutiveAtoms(RegExpCompiler* compiler);
200  void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
201  void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
202  ZoneList<RegExpTree*>* alternatives_;
203  int min_match_;
204  int max_match_;
205};
206
207
208class RegExpAlternative final : public RegExpTree {
209 public:
210  explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
211
212  DECL_BOILERPLATE(Alternative);
213
214  Interval CaptureRegisters() override;
215  bool IsAnchoredAtStart() override;
216  bool IsAnchoredAtEnd() override;
217  int min_match() override { return min_match_; }
218  int max_match() override { return max_match_; }
219  ZoneList<RegExpTree*>* nodes() const { return nodes_; }
220
221 private:
222  ZoneList<RegExpTree*>* nodes_;
223  int min_match_;
224  int max_match_;
225};
226
227
228class RegExpAssertion final : public RegExpTree {
229 public:
230  enum class Type {
231    START_OF_LINE = 0,
232    START_OF_INPUT = 1,
233    END_OF_LINE = 2,
234    END_OF_INPUT = 3,
235    BOUNDARY = 4,
236    NON_BOUNDARY = 5,
237    LAST_ASSERTION_TYPE = NON_BOUNDARY,
238  };
239  explicit RegExpAssertion(Type type) : assertion_type_(type) {}
240
241  DECL_BOILERPLATE(Assertion);
242
243  bool IsAnchoredAtStart() override;
244  bool IsAnchoredAtEnd() override;
245  int min_match() override { return 0; }
246  int max_match() override { return 0; }
247  Type assertion_type() const { return assertion_type_; }
248
249 private:
250  const Type assertion_type_;
251};
252
253class CharacterSet final {
254 public:
255  explicit CharacterSet(StandardCharacterSet standard_set_type)
256      : standard_set_type_(standard_set_type) {}
257  explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {}
258
259  ZoneList<CharacterRange>* ranges(Zone* zone);
260  StandardCharacterSet standard_set_type() const {
261    return standard_set_type_.value();
262  }
263  void set_standard_set_type(StandardCharacterSet standard_set_type) {
264    standard_set_type_ = standard_set_type;
265  }
266  bool is_standard() const { return standard_set_type_.has_value(); }
267  V8_EXPORT_PRIVATE void Canonicalize();
268
269 private:
270  ZoneList<CharacterRange>* ranges_ = nullptr;
271  base::Optional<StandardCharacterSet> standard_set_type_;
272};
273
274class RegExpCharacterClass final : public RegExpTree {
275 public:
276  // NEGATED: The character class is negated and should match everything but
277  //     the specified ranges.
278  // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
279  //     surrogate and should not be unicode-desugared (crbug.com/641091).
280  enum Flag {
281    NEGATED = 1 << 0,
282    CONTAINS_SPLIT_SURROGATE = 1 << 1,
283  };
284  using CharacterClassFlags = base::Flags<Flag>;
285
286  RegExpCharacterClass(
287      Zone* zone, ZoneList<CharacterRange>* ranges,
288      CharacterClassFlags character_class_flags = CharacterClassFlags())
289      : set_(ranges), character_class_flags_(character_class_flags) {
290    // Convert the empty set of ranges to the negated Everything() range.
291    if (ranges->is_empty()) {
292      ranges->Add(CharacterRange::Everything(), zone);
293      character_class_flags_ ^= NEGATED;
294    }
295  }
296  explicit RegExpCharacterClass(StandardCharacterSet standard_set_type)
297      : set_(standard_set_type), character_class_flags_() {}
298
299  DECL_BOILERPLATE(CharacterClass);
300
301  bool IsTextElement() override { return true; }
302  int min_match() override { return 1; }
303  // The character class may match two code units for unicode regexps.
304  // TODO(yangguo): we should split this class for usage in TextElement, and
305  //                make max_match() dependent on the character class content.
306  int max_match() override { return 2; }
307
308  void AppendToText(RegExpText* text, Zone* zone) override;
309
310  // TODO(lrn): Remove need for complex version if is_standard that
311  // recognizes a mangled standard set and just do { return set_.is_special(); }
312  bool is_standard(Zone* zone);
313  // Returns a value representing the standard character set if is_standard()
314  // returns true.
315  StandardCharacterSet standard_type() const {
316    return set_.standard_set_type();
317  }
318
319  CharacterSet character_set() const { return set_; }
320  ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
321
322  bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
323  bool contains_split_surrogate() const {
324    return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
325  }
326
327 private:
328  CharacterSet set_;
329  CharacterClassFlags character_class_flags_;
330};
331
332class RegExpAtom final : public RegExpTree {
333 public:
334  explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {}
335
336  DECL_BOILERPLATE(Atom);
337
338  bool IsTextElement() override { return true; }
339  int min_match() override { return data_.length(); }
340  int max_match() override { return data_.length(); }
341  void AppendToText(RegExpText* text, Zone* zone) override;
342
343  base::Vector<const base::uc16> data() const { return data_; }
344  int length() const { return data_.length(); }
345
346 private:
347  base::Vector<const base::uc16> data_;
348};
349
350class TextElement final {
351 public:
352  enum TextType { ATOM, CHAR_CLASS };
353
354  static TextElement Atom(RegExpAtom* atom);
355  static TextElement CharClass(RegExpCharacterClass* char_class);
356
357  int cp_offset() const { return cp_offset_; }
358  void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
359  int length() const;
360
361  TextType text_type() const { return text_type_; }
362
363  RegExpTree* tree() const { return tree_; }
364
365  RegExpAtom* atom() const {
366    DCHECK(text_type() == ATOM);
367    return reinterpret_cast<RegExpAtom*>(tree());
368  }
369
370  RegExpCharacterClass* char_class() const {
371    DCHECK(text_type() == CHAR_CLASS);
372    return reinterpret_cast<RegExpCharacterClass*>(tree());
373  }
374
375 private:
376  TextElement(TextType text_type, RegExpTree* tree)
377      : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
378
379  int cp_offset_;
380  TextType text_type_;
381  RegExpTree* tree_;
382};
383
384class RegExpText final : public RegExpTree {
385 public:
386  explicit RegExpText(Zone* zone) : elements_(2, zone) {}
387
388  DECL_BOILERPLATE(Text);
389
390  bool IsTextElement() override { return true; }
391  int min_match() override { return length_; }
392  int max_match() override { return length_; }
393  void AppendToText(RegExpText* text, Zone* zone) override;
394  void AddElement(TextElement elm, Zone* zone) {
395    elements_.Add(elm, zone);
396    length_ += elm.length();
397  }
398  ZoneList<TextElement>* elements() { return &elements_; }
399
400 private:
401  ZoneList<TextElement> elements_;
402  int length_ = 0;
403};
404
405
406class RegExpQuantifier final : public RegExpTree {
407 public:
408  enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
409  RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
410      : body_(body),
411        min_(min),
412        max_(max),
413        quantifier_type_(type) {
414    if (min > 0 && body->min_match() > kInfinity / min) {
415      min_match_ = kInfinity;
416    } else {
417      min_match_ = min * body->min_match();
418    }
419    if (max > 0 && body->max_match() > kInfinity / max) {
420      max_match_ = kInfinity;
421    } else {
422      max_match_ = max * body->max_match();
423    }
424  }
425
426  DECL_BOILERPLATE(Quantifier);
427
428  static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
429                            RegExpCompiler* compiler, RegExpNode* on_success,
430                            bool not_at_start = false);
431  Interval CaptureRegisters() override;
432  int min_match() override { return min_match_; }
433  int max_match() override { return max_match_; }
434  int min() const { return min_; }
435  int max() const { return max_; }
436  QuantifierType quantifier_type() const { return quantifier_type_; }
437  bool is_possessive() const { return quantifier_type_ == POSSESSIVE; }
438  bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; }
439  bool is_greedy() const { return quantifier_type_ == GREEDY; }
440  RegExpTree* body() const { return body_; }
441
442 private:
443  RegExpTree* body_;
444  int min_;
445  int max_;
446  int min_match_;
447  int max_match_;
448  QuantifierType quantifier_type_;
449};
450
451
452class RegExpCapture final : public RegExpTree {
453 public:
454  explicit RegExpCapture(int index)
455      : body_(nullptr),
456        index_(index),
457        min_match_(0),
458        max_match_(0),
459        name_(nullptr) {}
460
461  DECL_BOILERPLATE(Capture);
462
463  static RegExpNode* ToNode(RegExpTree* body, int index,
464                            RegExpCompiler* compiler, RegExpNode* on_success);
465  bool IsAnchoredAtStart() override;
466  bool IsAnchoredAtEnd() override;
467  Interval CaptureRegisters() override;
468  int min_match() override { return min_match_; }
469  int max_match() override { return max_match_; }
470  RegExpTree* body() { return body_; }
471  void set_body(RegExpTree* body) {
472    body_ = body;
473    min_match_ = body->min_match();
474    max_match_ = body->max_match();
475  }
476  int index() const { return index_; }
477  const ZoneVector<base::uc16>* name() const { return name_; }
478  void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
479  static int StartRegister(int index) { return index * 2; }
480  static int EndRegister(int index) { return index * 2 + 1; }
481
482 private:
483  RegExpTree* body_ = nullptr;
484  int index_;
485  int min_match_ = 0;
486  int max_match_ = 0;
487  const ZoneVector<base::uc16>* name_ = nullptr;
488};
489
490class RegExpGroup final : public RegExpTree {
491 public:
492  explicit RegExpGroup(RegExpTree* body)
493      : body_(body),
494        min_match_(body->min_match()),
495        max_match_(body->max_match()) {}
496
497  DECL_BOILERPLATE(Group);
498
499  bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
500  bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
501  int min_match() override { return min_match_; }
502  int max_match() override { return max_match_; }
503  Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
504  RegExpTree* body() const { return body_; }
505
506 private:
507  RegExpTree* body_;
508  int min_match_;
509  int max_match_;
510};
511
512class RegExpLookaround final : public RegExpTree {
513 public:
514  enum Type { LOOKAHEAD, LOOKBEHIND };
515
516  RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
517                   int capture_from, Type type)
518      : body_(body),
519        is_positive_(is_positive),
520        capture_count_(capture_count),
521        capture_from_(capture_from),
522        type_(type) {}
523
524  DECL_BOILERPLATE(Lookaround);
525
526  Interval CaptureRegisters() override;
527  bool IsAnchoredAtStart() override;
528  int min_match() override { return 0; }
529  int max_match() override { return 0; }
530  RegExpTree* body() const { return body_; }
531  bool is_positive() const { return is_positive_; }
532  int capture_count() const { return capture_count_; }
533  int capture_from() const { return capture_from_; }
534  Type type() const { return type_; }
535
536  class Builder {
537   public:
538    Builder(bool is_positive, RegExpNode* on_success,
539            int stack_pointer_register, int position_register,
540            int capture_register_count = 0, int capture_register_start = 0);
541    RegExpNode* on_match_success() const { return on_match_success_; }
542    RegExpNode* ForMatch(RegExpNode* match);
543
544   private:
545    bool is_positive_;
546    RegExpNode* on_match_success_;
547    RegExpNode* on_success_;
548    int stack_pointer_register_;
549    int position_register_;
550  };
551
552 private:
553  RegExpTree* body_;
554  bool is_positive_;
555  int capture_count_;
556  int capture_from_;
557  Type type_;
558};
559
560
561class RegExpBackReference final : public RegExpTree {
562 public:
563  explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {}
564  RegExpBackReference(RegExpCapture* capture, RegExpFlags flags)
565      : capture_(capture), flags_(flags) {}
566
567  DECL_BOILERPLATE(BackReference);
568
569  int min_match() override { return 0; }
570  // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
571  // recursion, we give up. Ignorance is bliss.
572  int max_match() override { return kInfinity; }
573  int index() const { return capture_->index(); }
574  RegExpCapture* capture() const { return capture_; }
575  void set_capture(RegExpCapture* capture) { capture_ = capture; }
576  const ZoneVector<base::uc16>* name() const { return name_; }
577  void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
578
579 private:
580  RegExpCapture* capture_ = nullptr;
581  const ZoneVector<base::uc16>* name_ = nullptr;
582  const RegExpFlags flags_;
583};
584
585
586class RegExpEmpty final : public RegExpTree {
587 public:
588  DECL_BOILERPLATE(Empty);
589  int min_match() override { return 0; }
590  int max_match() override { return 0; }
591};
592
593}  // namespace internal
594}  // namespace v8
595
596#undef DECL_BOILERPLATE
597
598#endif  // V8_REGEXP_REGEXP_AST_H_
599