1// Copyright 2016 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef V8_REGEXP_REGEXP_AST_H_ 6#define V8_REGEXP_REGEXP_AST_H_ 7 8#include "src/base/strings.h" 9#include "src/regexp/regexp-flags.h" 10#include "src/zone/zone-containers.h" 11#include "src/zone/zone-list.h" 12#include "src/zone/zone.h" 13 14namespace v8 { 15namespace internal { 16 17#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 18 VISIT(Disjunction) \ 19 VISIT(Alternative) \ 20 VISIT(Assertion) \ 21 VISIT(CharacterClass) \ 22 VISIT(Atom) \ 23 VISIT(Quantifier) \ 24 VISIT(Capture) \ 25 VISIT(Group) \ 26 VISIT(Lookaround) \ 27 VISIT(BackReference) \ 28 VISIT(Empty) \ 29 VISIT(Text) 30 31#define FORWARD_DECLARE(Name) class RegExp##Name; 32FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) 33#undef FORWARD_DECLARE 34 35class RegExpCompiler; 36class RegExpNode; 37class RegExpTree; 38 39class RegExpVisitor { 40 public: 41 virtual ~RegExpVisitor() = default; 42#define MAKE_CASE(Name) \ 43 virtual void* Visit##Name(RegExp##Name*, void* data) = 0; 44 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) 45#undef MAKE_CASE 46}; 47 48// A simple closed interval. 49class Interval { 50 public: 51 Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). 52 Interval(int from, int to) : from_(from), to_(to) {} 53 Interval Union(Interval that) { 54 if (that.from_ == kNone) return *this; 55 if (from_ == kNone) return that; 56 return Interval(std::min(from_, that.from_), std::max(to_, that.to_)); 57 } 58 59 static Interval Empty() { return Interval(); } 60 61 bool Contains(int value) const { return (from_ <= value) && (value <= to_); } 62 bool is_empty() const { return from_ == kNone; } 63 int from() const { return from_; } 64 int to() const { return to_; } 65 int size() const { return to_ - from_ + 1; } 66 67 static constexpr int kNone = -1; 68 69 private: 70 int from_; 71 int to_; 72}; 73 74// Named standard character sets. 75enum class StandardCharacterSet : char { 76 kWhitespace = 's', // Like /\s/. 77 kNotWhitespace = 'S', // Like /\S/. 78 kWord = 'w', // Like /\w/. 79 kNotWord = 'W', // Like /\W/. 80 kDigit = 'd', // Like /\d/. 81 kNotDigit = 'D', // Like /\D/. 82 kLineTerminator = 'n', // The inverse of /./. 83 kNotLineTerminator = '.', // Like /./. 84 kEverything = '*', // Matches every character, like /./s. 85}; 86 87// Represents code points (with values up to 0x10FFFF) in the range from from_ 88// to to_, both ends are inclusive. 89class CharacterRange { 90 public: 91 CharacterRange() = default; 92 // For compatibility with the CHECK_OK macro. 93 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT 94 95 static inline CharacterRange Singleton(base::uc32 value) { 96 return CharacterRange(value, value); 97 } 98 static inline CharacterRange Range(base::uc32 from, base::uc32 to) { 99 DCHECK(0 <= from && to <= kMaxCodePoint); 100 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); 101 return CharacterRange(from, to); 102 } 103 static inline CharacterRange Everything() { 104 return CharacterRange(0, kMaxCodePoint); 105 } 106 107 static inline ZoneList<CharacterRange>* List(Zone* zone, 108 CharacterRange range) { 109 ZoneList<CharacterRange>* list = 110 zone->New<ZoneList<CharacterRange>>(1, zone); 111 list->Add(range, zone); 112 return list; 113 } 114 115 // Add class escapes. Add case equivalent closure for \w and \W if necessary. 116 V8_EXPORT_PRIVATE static void AddClassEscape( 117 StandardCharacterSet standard_character_set, 118 ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents, 119 Zone* zone); 120 V8_EXPORT_PRIVATE static void AddCaseEquivalents( 121 Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, 122 bool is_one_byte); 123 124 bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; } 125 base::uc32 from() const { return from_; } 126 base::uc32 to() const { return to_; } 127 bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; } 128 bool IsSingleton() const { return from_ == to_; } 129 // Whether a range list is in canonical form: Ranges ordered by from value, 130 // and ranges non-overlapping and non-adjacent. 131 V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges); 132 // Convert range list to canonical form. The characters covered by the ranges 133 // will still be the same, but no character is in more than one range, and 134 // adjacent ranges are merged. The resulting list may be shorter than the 135 // original, but cannot be longer. 136 static void Canonicalize(ZoneList<CharacterRange>* ranges); 137 // Negate the contents of a character range in canonical form. 138 static void Negate(ZoneList<CharacterRange>* src, 139 ZoneList<CharacterRange>* dst, Zone* zone); 140 141 // Remove all ranges outside the one-byte range. 142 static void ClampToOneByte(ZoneList<CharacterRange>* ranges); 143 144 private: 145 CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {} 146 147 static constexpr int kMaxCodePoint = 0x10ffff; 148 149 base::uc32 from_ = 0; 150 base::uc32 to_ = 0; 151}; 152 153#define DECL_BOILERPLATE(Name) \ 154 void* Accept(RegExpVisitor* visitor, void* data) override; \ 155 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \ 156 override; \ 157 RegExp##Name* As##Name() override; \ 158 bool Is##Name() override 159 160class RegExpTree : public ZoneObject { 161 public: 162 static const int kInfinity = kMaxInt; 163 virtual ~RegExpTree() = default; 164 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; 165 virtual RegExpNode* ToNode(RegExpCompiler* compiler, 166 RegExpNode* on_success) = 0; 167 virtual bool IsTextElement() { return false; } 168 virtual bool IsAnchoredAtStart() { return false; } 169 virtual bool IsAnchoredAtEnd() { return false; } 170 virtual int min_match() = 0; 171 virtual int max_match() = 0; 172 // Returns the interval of registers used for captures within this 173 // expression. 174 virtual Interval CaptureRegisters() { return Interval::Empty(); } 175 virtual void AppendToText(RegExpText* text, Zone* zone); 176 V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone); 177#define MAKE_ASTYPE(Name) \ 178 virtual RegExp##Name* As##Name(); \ 179 virtual bool Is##Name(); 180 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) 181#undef MAKE_ASTYPE 182}; 183 184 185class RegExpDisjunction final : public RegExpTree { 186 public: 187 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); 188 189 DECL_BOILERPLATE(Disjunction); 190 191 Interval CaptureRegisters() override; 192 bool IsAnchoredAtStart() override; 193 bool IsAnchoredAtEnd() override; 194 int min_match() override { return min_match_; } 195 int max_match() override { return max_match_; } 196 ZoneList<RegExpTree*>* alternatives() const { return alternatives_; } 197 198 private: 199 bool SortConsecutiveAtoms(RegExpCompiler* compiler); 200 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); 201 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); 202 ZoneList<RegExpTree*>* alternatives_; 203 int min_match_; 204 int max_match_; 205}; 206 207 208class RegExpAlternative final : public RegExpTree { 209 public: 210 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); 211 212 DECL_BOILERPLATE(Alternative); 213 214 Interval CaptureRegisters() override; 215 bool IsAnchoredAtStart() override; 216 bool IsAnchoredAtEnd() override; 217 int min_match() override { return min_match_; } 218 int max_match() override { return max_match_; } 219 ZoneList<RegExpTree*>* nodes() const { return nodes_; } 220 221 private: 222 ZoneList<RegExpTree*>* nodes_; 223 int min_match_; 224 int max_match_; 225}; 226 227 228class RegExpAssertion final : public RegExpTree { 229 public: 230 enum class Type { 231 START_OF_LINE = 0, 232 START_OF_INPUT = 1, 233 END_OF_LINE = 2, 234 END_OF_INPUT = 3, 235 BOUNDARY = 4, 236 NON_BOUNDARY = 5, 237 LAST_ASSERTION_TYPE = NON_BOUNDARY, 238 }; 239 explicit RegExpAssertion(Type type) : assertion_type_(type) {} 240 241 DECL_BOILERPLATE(Assertion); 242 243 bool IsAnchoredAtStart() override; 244 bool IsAnchoredAtEnd() override; 245 int min_match() override { return 0; } 246 int max_match() override { return 0; } 247 Type assertion_type() const { return assertion_type_; } 248 249 private: 250 const Type assertion_type_; 251}; 252 253class CharacterSet final { 254 public: 255 explicit CharacterSet(StandardCharacterSet standard_set_type) 256 : standard_set_type_(standard_set_type) {} 257 explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {} 258 259 ZoneList<CharacterRange>* ranges(Zone* zone); 260 StandardCharacterSet standard_set_type() const { 261 return standard_set_type_.value(); 262 } 263 void set_standard_set_type(StandardCharacterSet standard_set_type) { 264 standard_set_type_ = standard_set_type; 265 } 266 bool is_standard() const { return standard_set_type_.has_value(); } 267 V8_EXPORT_PRIVATE void Canonicalize(); 268 269 private: 270 ZoneList<CharacterRange>* ranges_ = nullptr; 271 base::Optional<StandardCharacterSet> standard_set_type_; 272}; 273 274class RegExpCharacterClass final : public RegExpTree { 275 public: 276 // NEGATED: The character class is negated and should match everything but 277 // the specified ranges. 278 // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split 279 // surrogate and should not be unicode-desugared (crbug.com/641091). 280 enum Flag { 281 NEGATED = 1 << 0, 282 CONTAINS_SPLIT_SURROGATE = 1 << 1, 283 }; 284 using CharacterClassFlags = base::Flags<Flag>; 285 286 RegExpCharacterClass( 287 Zone* zone, ZoneList<CharacterRange>* ranges, 288 CharacterClassFlags character_class_flags = CharacterClassFlags()) 289 : set_(ranges), character_class_flags_(character_class_flags) { 290 // Convert the empty set of ranges to the negated Everything() range. 291 if (ranges->is_empty()) { 292 ranges->Add(CharacterRange::Everything(), zone); 293 character_class_flags_ ^= NEGATED; 294 } 295 } 296 explicit RegExpCharacterClass(StandardCharacterSet standard_set_type) 297 : set_(standard_set_type), character_class_flags_() {} 298 299 DECL_BOILERPLATE(CharacterClass); 300 301 bool IsTextElement() override { return true; } 302 int min_match() override { return 1; } 303 // The character class may match two code units for unicode regexps. 304 // TODO(yangguo): we should split this class for usage in TextElement, and 305 // make max_match() dependent on the character class content. 306 int max_match() override { return 2; } 307 308 void AppendToText(RegExpText* text, Zone* zone) override; 309 310 // TODO(lrn): Remove need for complex version if is_standard that 311 // recognizes a mangled standard set and just do { return set_.is_special(); } 312 bool is_standard(Zone* zone); 313 // Returns a value representing the standard character set if is_standard() 314 // returns true. 315 StandardCharacterSet standard_type() const { 316 return set_.standard_set_type(); 317 } 318 319 CharacterSet character_set() const { return set_; } 320 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } 321 322 bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } 323 bool contains_split_surrogate() const { 324 return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; 325 } 326 327 private: 328 CharacterSet set_; 329 CharacterClassFlags character_class_flags_; 330}; 331 332class RegExpAtom final : public RegExpTree { 333 public: 334 explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {} 335 336 DECL_BOILERPLATE(Atom); 337 338 bool IsTextElement() override { return true; } 339 int min_match() override { return data_.length(); } 340 int max_match() override { return data_.length(); } 341 void AppendToText(RegExpText* text, Zone* zone) override; 342 343 base::Vector<const base::uc16> data() const { return data_; } 344 int length() const { return data_.length(); } 345 346 private: 347 base::Vector<const base::uc16> data_; 348}; 349 350class TextElement final { 351 public: 352 enum TextType { ATOM, CHAR_CLASS }; 353 354 static TextElement Atom(RegExpAtom* atom); 355 static TextElement CharClass(RegExpCharacterClass* char_class); 356 357 int cp_offset() const { return cp_offset_; } 358 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 359 int length() const; 360 361 TextType text_type() const { return text_type_; } 362 363 RegExpTree* tree() const { return tree_; } 364 365 RegExpAtom* atom() const { 366 DCHECK(text_type() == ATOM); 367 return reinterpret_cast<RegExpAtom*>(tree()); 368 } 369 370 RegExpCharacterClass* char_class() const { 371 DCHECK(text_type() == CHAR_CLASS); 372 return reinterpret_cast<RegExpCharacterClass*>(tree()); 373 } 374 375 private: 376 TextElement(TextType text_type, RegExpTree* tree) 377 : cp_offset_(-1), text_type_(text_type), tree_(tree) {} 378 379 int cp_offset_; 380 TextType text_type_; 381 RegExpTree* tree_; 382}; 383 384class RegExpText final : public RegExpTree { 385 public: 386 explicit RegExpText(Zone* zone) : elements_(2, zone) {} 387 388 DECL_BOILERPLATE(Text); 389 390 bool IsTextElement() override { return true; } 391 int min_match() override { return length_; } 392 int max_match() override { return length_; } 393 void AppendToText(RegExpText* text, Zone* zone) override; 394 void AddElement(TextElement elm, Zone* zone) { 395 elements_.Add(elm, zone); 396 length_ += elm.length(); 397 } 398 ZoneList<TextElement>* elements() { return &elements_; } 399 400 private: 401 ZoneList<TextElement> elements_; 402 int length_ = 0; 403}; 404 405 406class RegExpQuantifier final : public RegExpTree { 407 public: 408 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; 409 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) 410 : body_(body), 411 min_(min), 412 max_(max), 413 quantifier_type_(type) { 414 if (min > 0 && body->min_match() > kInfinity / min) { 415 min_match_ = kInfinity; 416 } else { 417 min_match_ = min * body->min_match(); 418 } 419 if (max > 0 && body->max_match() > kInfinity / max) { 420 max_match_ = kInfinity; 421 } else { 422 max_match_ = max * body->max_match(); 423 } 424 } 425 426 DECL_BOILERPLATE(Quantifier); 427 428 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, 429 RegExpCompiler* compiler, RegExpNode* on_success, 430 bool not_at_start = false); 431 Interval CaptureRegisters() override; 432 int min_match() override { return min_match_; } 433 int max_match() override { return max_match_; } 434 int min() const { return min_; } 435 int max() const { return max_; } 436 QuantifierType quantifier_type() const { return quantifier_type_; } 437 bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } 438 bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } 439 bool is_greedy() const { return quantifier_type_ == GREEDY; } 440 RegExpTree* body() const { return body_; } 441 442 private: 443 RegExpTree* body_; 444 int min_; 445 int max_; 446 int min_match_; 447 int max_match_; 448 QuantifierType quantifier_type_; 449}; 450 451 452class RegExpCapture final : public RegExpTree { 453 public: 454 explicit RegExpCapture(int index) 455 : body_(nullptr), 456 index_(index), 457 min_match_(0), 458 max_match_(0), 459 name_(nullptr) {} 460 461 DECL_BOILERPLATE(Capture); 462 463 static RegExpNode* ToNode(RegExpTree* body, int index, 464 RegExpCompiler* compiler, RegExpNode* on_success); 465 bool IsAnchoredAtStart() override; 466 bool IsAnchoredAtEnd() override; 467 Interval CaptureRegisters() override; 468 int min_match() override { return min_match_; } 469 int max_match() override { return max_match_; } 470 RegExpTree* body() { return body_; } 471 void set_body(RegExpTree* body) { 472 body_ = body; 473 min_match_ = body->min_match(); 474 max_match_ = body->max_match(); 475 } 476 int index() const { return index_; } 477 const ZoneVector<base::uc16>* name() const { return name_; } 478 void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } 479 static int StartRegister(int index) { return index * 2; } 480 static int EndRegister(int index) { return index * 2 + 1; } 481 482 private: 483 RegExpTree* body_ = nullptr; 484 int index_; 485 int min_match_ = 0; 486 int max_match_ = 0; 487 const ZoneVector<base::uc16>* name_ = nullptr; 488}; 489 490class RegExpGroup final : public RegExpTree { 491 public: 492 explicit RegExpGroup(RegExpTree* body) 493 : body_(body), 494 min_match_(body->min_match()), 495 max_match_(body->max_match()) {} 496 497 DECL_BOILERPLATE(Group); 498 499 bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } 500 bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } 501 int min_match() override { return min_match_; } 502 int max_match() override { return max_match_; } 503 Interval CaptureRegisters() override { return body_->CaptureRegisters(); } 504 RegExpTree* body() const { return body_; } 505 506 private: 507 RegExpTree* body_; 508 int min_match_; 509 int max_match_; 510}; 511 512class RegExpLookaround final : public RegExpTree { 513 public: 514 enum Type { LOOKAHEAD, LOOKBEHIND }; 515 516 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, 517 int capture_from, Type type) 518 : body_(body), 519 is_positive_(is_positive), 520 capture_count_(capture_count), 521 capture_from_(capture_from), 522 type_(type) {} 523 524 DECL_BOILERPLATE(Lookaround); 525 526 Interval CaptureRegisters() override; 527 bool IsAnchoredAtStart() override; 528 int min_match() override { return 0; } 529 int max_match() override { return 0; } 530 RegExpTree* body() const { return body_; } 531 bool is_positive() const { return is_positive_; } 532 int capture_count() const { return capture_count_; } 533 int capture_from() const { return capture_from_; } 534 Type type() const { return type_; } 535 536 class Builder { 537 public: 538 Builder(bool is_positive, RegExpNode* on_success, 539 int stack_pointer_register, int position_register, 540 int capture_register_count = 0, int capture_register_start = 0); 541 RegExpNode* on_match_success() const { return on_match_success_; } 542 RegExpNode* ForMatch(RegExpNode* match); 543 544 private: 545 bool is_positive_; 546 RegExpNode* on_match_success_; 547 RegExpNode* on_success_; 548 int stack_pointer_register_; 549 int position_register_; 550 }; 551 552 private: 553 RegExpTree* body_; 554 bool is_positive_; 555 int capture_count_; 556 int capture_from_; 557 Type type_; 558}; 559 560 561class RegExpBackReference final : public RegExpTree { 562 public: 563 explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {} 564 RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) 565 : capture_(capture), flags_(flags) {} 566 567 DECL_BOILERPLATE(BackReference); 568 569 int min_match() override { return 0; } 570 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite 571 // recursion, we give up. Ignorance is bliss. 572 int max_match() override { return kInfinity; } 573 int index() const { return capture_->index(); } 574 RegExpCapture* capture() const { return capture_; } 575 void set_capture(RegExpCapture* capture) { capture_ = capture; } 576 const ZoneVector<base::uc16>* name() const { return name_; } 577 void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } 578 579 private: 580 RegExpCapture* capture_ = nullptr; 581 const ZoneVector<base::uc16>* name_ = nullptr; 582 const RegExpFlags flags_; 583}; 584 585 586class RegExpEmpty final : public RegExpTree { 587 public: 588 DECL_BOILERPLATE(Empty); 589 int min_match() override { return 0; } 590 int max_match() override { return 0; } 591}; 592 593} // namespace internal 594} // namespace v8 595 596#undef DECL_BOILERPLATE 597 598#endif // V8_REGEXP_REGEXP_AST_H_ 599