1// Copyright 2016 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/regexp-parser.h" 6 7#include "src/base/small-vector.h" 8#include "src/execution/isolate.h" 9#include "src/objects/string-inl.h" 10#include "src/regexp/property-sequences.h" 11#include "src/regexp/regexp-ast.h" 12#include "src/regexp/regexp-macro-assembler.h" 13#include "src/regexp/regexp.h" 14#include "src/strings/char-predicates-inl.h" 15#include "src/utils/ostreams.h" 16#include "src/utils/utils.h" 17#include "src/zone/zone-allocator.h" 18#include "src/zone/zone-list-inl.h" 19 20#ifdef V8_INTL_SUPPORT 21#include "unicode/uniset.h" 22#endif // V8_INTL_SUPPORT 23 24namespace v8 { 25namespace internal { 26 27namespace { 28 29// Whether we're currently inside the ClassEscape production 30// (tc39.es/ecma262/#prod-annexB-CharacterEscape). 31enum class InClassEscapeState { 32 kInClass, 33 kNotInClass, 34}; 35 36// Accumulates RegExp atoms and assertions into lists of terms and alternatives. 37class RegExpBuilder { 38 public: 39 RegExpBuilder(Zone* zone, RegExpFlags flags) 40 : zone_(zone), 41 flags_(flags), 42 terms_(ZoneAllocator<RegExpTree*>{zone}), 43 text_(ZoneAllocator<RegExpTree*>{zone}), 44 alternatives_(ZoneAllocator<RegExpTree*>{zone}) {} 45 void AddCharacter(base::uc16 character); 46 void AddUnicodeCharacter(base::uc32 character); 47 void AddEscapedUnicodeCharacter(base::uc32 character); 48 // "Adds" an empty expression. Does nothing except consume a 49 // following quantifier 50 void AddEmpty(); 51 void AddCharacterClass(RegExpCharacterClass* cc); 52 void AddCharacterClassForDesugaring(base::uc32 c); 53 void AddAtom(RegExpTree* tree); 54 void AddTerm(RegExpTree* tree); 55 void AddAssertion(RegExpTree* tree); 56 void NewAlternative(); // '|' 57 bool AddQuantifierToAtom(int min, int max, 58 RegExpQuantifier::QuantifierType type); 59 void FlushText(); 60 RegExpTree* ToRegExp(); 61 RegExpFlags flags() const { return flags_; } 62 63 bool ignore_case() const { return IsIgnoreCase(flags_); } 64 bool multiline() const { return IsMultiline(flags_); } 65 bool dotall() const { return IsDotAll(flags_); } 66 67 private: 68 static const base::uc16 kNoPendingSurrogate = 0; 69 void AddLeadSurrogate(base::uc16 lead_surrogate); 70 void AddTrailSurrogate(base::uc16 trail_surrogate); 71 void FlushPendingSurrogate(); 72 void FlushCharacters(); 73 void FlushTerms(); 74 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); 75 bool NeedsDesugaringForIgnoreCase(base::uc32 c); 76 Zone* zone() const { return zone_; } 77 bool unicode() const { return IsUnicode(flags_); } 78 79 Zone* const zone_; 80 bool pending_empty_ = false; 81 const RegExpFlags flags_; 82 ZoneList<base::uc16>* characters_ = nullptr; 83 base::uc16 pending_surrogate_ = kNoPendingSurrogate; 84 85 using SmallRegExpTreeVector = 86 base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>; 87 SmallRegExpTreeVector terms_; 88 SmallRegExpTreeVector text_; 89 SmallRegExpTreeVector alternatives_; 90#ifdef DEBUG 91 enum { 92 ADD_NONE, 93 ADD_CHAR, 94 ADD_TERM, 95 ADD_ASSERT, 96 ADD_ATOM 97 } last_added_ = ADD_NONE; 98#define LAST(x) last_added_ = x; 99#else 100#define LAST(x) 101#endif 102}; 103 104enum SubexpressionType { 105 INITIAL, 106 CAPTURE, // All positive values represent captures. 107 POSITIVE_LOOKAROUND, 108 NEGATIVE_LOOKAROUND, 109 GROUPING 110}; 111 112class RegExpParserState : public ZoneObject { 113 public: 114 // Push a state on the stack. 115 RegExpParserState(RegExpParserState* previous_state, 116 SubexpressionType group_type, 117 RegExpLookaround::Type lookaround_type, 118 int disjunction_capture_index, 119 const ZoneVector<base::uc16>* capture_name, 120 RegExpFlags flags, Zone* zone) 121 : previous_state_(previous_state), 122 builder_(zone, flags), 123 group_type_(group_type), 124 lookaround_type_(lookaround_type), 125 disjunction_capture_index_(disjunction_capture_index), 126 capture_name_(capture_name) {} 127 // Parser state of containing expression, if any. 128 RegExpParserState* previous_state() const { return previous_state_; } 129 bool IsSubexpression() { return previous_state_ != nullptr; } 130 // RegExpBuilder building this regexp's AST. 131 RegExpBuilder* builder() { return &builder_; } 132 // Type of regexp being parsed (parenthesized group or entire regexp). 133 SubexpressionType group_type() const { return group_type_; } 134 // Lookahead or Lookbehind. 135 RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } 136 // Index in captures array of first capture in this sub-expression, if any. 137 // Also the capture index of this sub-expression itself, if group_type 138 // is CAPTURE. 139 int capture_index() const { return disjunction_capture_index_; } 140 // The name of the current sub-expression, if group_type is CAPTURE. Only 141 // used for named captures. 142 const ZoneVector<base::uc16>* capture_name() const { return capture_name_; } 143 144 bool IsNamedCapture() const { return capture_name_ != nullptr; } 145 146 // Check whether the parser is inside a capture group with the given index. 147 bool IsInsideCaptureGroup(int index) const { 148 for (const RegExpParserState* s = this; s != nullptr; 149 s = s->previous_state()) { 150 if (s->group_type() != CAPTURE) continue; 151 // Return true if we found the matching capture index. 152 if (index == s->capture_index()) return true; 153 // Abort if index is larger than what has been parsed up till this state. 154 if (index > s->capture_index()) return false; 155 } 156 return false; 157 } 158 159 // Check whether the parser is inside a capture group with the given name. 160 bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const { 161 DCHECK_NOT_NULL(name); 162 for (const RegExpParserState* s = this; s != nullptr; 163 s = s->previous_state()) { 164 if (s->capture_name() == nullptr) continue; 165 if (*s->capture_name() == *name) return true; 166 } 167 return false; 168 } 169 170 private: 171 // Linked list implementation of stack of states. 172 RegExpParserState* const previous_state_; 173 // Builder for the stored disjunction. 174 RegExpBuilder builder_; 175 // Stored disjunction type (capture, look-ahead or grouping), if any. 176 const SubexpressionType group_type_; 177 // Stored read direction. 178 const RegExpLookaround::Type lookaround_type_; 179 // Stored disjunction's capture index (if any). 180 const int disjunction_capture_index_; 181 // Stored capture name (if any). 182 const ZoneVector<base::uc16>* const capture_name_; 183}; 184 185template <class CharT> 186class RegExpParserImpl final { 187 private: 188 RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags, 189 uintptr_t stack_limit, Zone* zone, 190 const DisallowGarbageCollection& no_gc); 191 192 bool Parse(RegExpCompileData* result); 193 194 RegExpTree* ParsePattern(); 195 RegExpTree* ParseDisjunction(); 196 RegExpTree* ParseGroup(); 197 198 // Parses a {...,...} quantifier and stores the range in the given 199 // out parameters. 200 bool ParseIntervalQuantifier(int* min_out, int* max_out); 201 202 // Checks whether the following is a length-digit hexadecimal number, 203 // and sets the value if it is. 204 bool ParseHexEscape(int length, base::uc32* value); 205 bool ParseUnicodeEscape(base::uc32* value); 206 bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value); 207 208 bool ParsePropertyClassName(ZoneVector<char>* name_1, 209 ZoneVector<char>* name_2); 210 bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, 211 const ZoneVector<char>& name_1, 212 const ZoneVector<char>& name_2); 213 214 RegExpTree* ParseCharacterClass(const RegExpBuilder* state); 215 216 base::uc32 ParseOctalLiteral(); 217 218 // Tries to parse the input as a back reference. If successful it 219 // stores the result in the output parameter and returns true. If 220 // it fails it will push back the characters read so the same characters 221 // can be reparsed. 222 bool ParseBackReferenceIndex(int* index_out); 223 224 // Parse inside a class. Either add escaped class to the range, or return 225 // false and pass parsed single character through |char_out|. 226 void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, 227 bool add_unicode_case_equivalents, base::uc32* char_out, 228 bool* is_class_escape); 229 // Returns true iff parsing was successful. 230 bool TryParseCharacterClassEscape(base::uc32 next, 231 InClassEscapeState in_class_escape_state, 232 ZoneList<CharacterRange>* ranges, 233 Zone* zone, 234 bool add_unicode_case_equivalents); 235 // Parses and returns a single escaped character. 236 base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state, 237 bool* is_escaped_unicode_character); 238 239 RegExpTree* ReportError(RegExpError error); 240 void Advance(); 241 void Advance(int dist); 242 void RewindByOneCodepoint(); // Rewinds to before the previous Advance(). 243 void Reset(int pos); 244 245 // Reports whether the pattern might be used as a literal search string. 246 // Only use if the result of the parse is a single atom node. 247 bool simple() const { return simple_; } 248 bool contains_anchor() const { return contains_anchor_; } 249 void set_contains_anchor() { contains_anchor_ = true; } 250 int captures_started() const { return captures_started_; } 251 int position() const { return next_pos_ - 1; } 252 bool failed() const { return failed_; } 253 bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; } 254 255 static bool IsSyntaxCharacterOrSlash(base::uc32 c); 256 257 static const base::uc32 kEndMarker = (1 << 21); 258 259 private: 260 // Return the 1-indexed RegExpCapture object, allocate if necessary. 261 RegExpCapture* GetCapture(int index); 262 263 // Creates a new named capture at the specified index. Must be called exactly 264 // once for each named capture. Fails if a capture with the same name is 265 // encountered. 266 bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index); 267 268 // Parses the name of a capture group (?<name>pattern). The name must adhere 269 // to IdentifierName in the ECMAScript standard. 270 const ZoneVector<base::uc16>* ParseCaptureGroupName(); 271 272 bool ParseNamedBackReference(RegExpBuilder* builder, 273 RegExpParserState* state); 274 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); 275 276 // After the initial parsing pass, patch corresponding RegExpCapture objects 277 // into all RegExpBackReferences. This is done after initial parsing in order 278 // to avoid complicating cases in which references comes before the capture. 279 void PatchNamedBackReferences(); 280 281 ZoneVector<RegExpCapture*>* GetNamedCaptures() const; 282 283 // Returns true iff the pattern contains named captures. May call 284 // ScanForCaptures to look ahead at the remaining pattern. 285 bool HasNamedCaptures(InClassEscapeState in_class_escape_state); 286 287 Zone* zone() const { return zone_; } 288 289 base::uc32 current() const { return current_; } 290 bool has_more() const { return has_more_; } 291 bool has_next() const { return next_pos_ < input_length(); } 292 base::uc32 Next(); 293 template <bool update_position> 294 base::uc32 ReadNext(); 295 CharT InputAt(int index) const { 296 DCHECK(0 <= index && index < input_length()); 297 return input_[index]; 298 } 299 int input_length() const { return input_length_; } 300 void ScanForCaptures(InClassEscapeState in_class_escape_state); 301 302 struct RegExpCaptureNameLess { 303 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { 304 DCHECK_NOT_NULL(lhs); 305 DCHECK_NOT_NULL(rhs); 306 return *lhs->name() < *rhs->name(); 307 } 308 }; 309 310 class ForceUnicodeScope final { 311 public: 312 explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser) 313 : parser_(parser) { 314 DCHECK(!parser_->force_unicode_); 315 parser_->force_unicode_ = true; 316 } 317 ~ForceUnicodeScope() { 318 DCHECK(parser_->force_unicode_); 319 parser_->force_unicode_ = false; 320 } 321 322 private: 323 RegExpParserImpl<CharT>* const parser_; 324 }; 325 326 const DisallowGarbageCollection no_gc_; 327 Zone* const zone_; 328 RegExpError error_ = RegExpError::kNone; 329 int error_pos_ = 0; 330 ZoneList<RegExpCapture*>* captures_; 331 ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_; 332 ZoneList<RegExpBackReference*>* named_back_references_; 333 const CharT* const input_; 334 const int input_length_; 335 base::uc32 current_; 336 const RegExpFlags top_level_flags_; 337 bool force_unicode_ = false; // Force parser to act as if unicode were set. 338 int next_pos_; 339 int captures_started_; 340 int capture_count_; // Only valid after we have scanned for captures. 341 bool has_more_; 342 bool simple_; 343 bool contains_anchor_; 344 bool is_scanned_for_captures_; 345 bool has_named_captures_; // Only valid after we have scanned for captures. 346 bool failed_; 347 const uintptr_t stack_limit_; 348 349 friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*, 350 Handle<String>, 351 RegExpFlags, 352 RegExpCompileData*); 353 friend bool RegExpParser::VerifyRegExpSyntax<CharT>( 354 Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*, 355 const DisallowGarbageCollection&); 356}; 357 358template <class CharT> 359RegExpParserImpl<CharT>::RegExpParserImpl( 360 const CharT* input, int input_length, RegExpFlags flags, 361 uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc) 362 : zone_(zone), 363 captures_(nullptr), 364 named_captures_(nullptr), 365 named_back_references_(nullptr), 366 input_(input), 367 input_length_(input_length), 368 current_(kEndMarker), 369 top_level_flags_(flags), 370 next_pos_(0), 371 captures_started_(0), 372 capture_count_(0), 373 has_more_(true), 374 simple_(false), 375 contains_anchor_(false), 376 is_scanned_for_captures_(false), 377 has_named_captures_(false), 378 failed_(false), 379 stack_limit_(stack_limit) { 380 Advance(); 381} 382 383template <> 384template <bool update_position> 385inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() { 386 int position = next_pos_; 387 base::uc16 c0 = InputAt(position); 388 position++; 389 DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0)); 390 if (update_position) next_pos_ = position; 391 return c0; 392} 393 394template <> 395template <bool update_position> 396inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() { 397 int position = next_pos_; 398 base::uc16 c0 = InputAt(position); 399 base::uc32 result = c0; 400 position++; 401 // Read the whole surrogate pair in case of unicode flag, if possible. 402 if (unicode() && position < input_length() && 403 unibrow::Utf16::IsLeadSurrogate(c0)) { 404 base::uc16 c1 = InputAt(position); 405 if (unibrow::Utf16::IsTrailSurrogate(c1)) { 406 result = unibrow::Utf16::CombineSurrogatePair(c0, c1); 407 position++; 408 } 409 } 410 if (update_position) next_pos_ = position; 411 return result; 412} 413 414template <class CharT> 415base::uc32 RegExpParserImpl<CharT>::Next() { 416 if (has_next()) { 417 return ReadNext<false>(); 418 } else { 419 return kEndMarker; 420 } 421} 422 423template <class CharT> 424void RegExpParserImpl<CharT>::Advance() { 425 if (has_next()) { 426 if (GetCurrentStackPosition() < stack_limit_) { 427 if (FLAG_correctness_fuzzer_suppressions) { 428 FATAL("Aborting on stack overflow"); 429 } 430 ReportError(RegExpError::kStackOverflow); 431 } else { 432 current_ = ReadNext<true>(); 433 } 434 } else { 435 current_ = kEndMarker; 436 // Advance so that position() points to 1-after-the-last-character. This is 437 // important so that Reset() to this position works correctly. 438 next_pos_ = input_length() + 1; 439 has_more_ = false; 440 } 441} 442 443template <class CharT> 444void RegExpParserImpl<CharT>::RewindByOneCodepoint() { 445 if (current() == kEndMarker) return; 446 // Rewinds by one code point, i.e.: two code units if `current` is outside 447 // the basic multilingual plane (= composed of a lead and trail surrogate), 448 // or one code unit otherwise. 449 const int rewind_by = 450 current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1; 451 Advance(rewind_by); // Undo the last Advance. 452} 453 454template <class CharT> 455void RegExpParserImpl<CharT>::Reset(int pos) { 456 next_pos_ = pos; 457 has_more_ = (pos < input_length()); 458 Advance(); 459} 460 461template <class CharT> 462void RegExpParserImpl<CharT>::Advance(int dist) { 463 next_pos_ += dist - 1; 464 Advance(); 465} 466 467template <class CharT> 468bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) { 469 switch (c) { 470 case '^': 471 case '$': 472 case '\\': 473 case '.': 474 case '*': 475 case '+': 476 case '?': 477 case '(': 478 case ')': 479 case '[': 480 case ']': 481 case '{': 482 case '}': 483 case '|': 484 case '/': 485 return true; 486 default: 487 break; 488 } 489 return false; 490} 491 492template <class CharT> 493RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) { 494 if (failed_) return nullptr; // Do not overwrite any existing error. 495 failed_ = true; 496 error_ = error; 497 error_pos_ = position(); 498 // Zip to the end to make sure no more input is read. 499 current_ = kEndMarker; 500 next_pos_ = input_length(); 501 return nullptr; 502} 503 504#define CHECK_FAILED /**/); \ 505 if (failed_) return nullptr; \ 506 ((void)0 507 508// Pattern :: 509// Disjunction 510template <class CharT> 511RegExpTree* RegExpParserImpl<CharT>::ParsePattern() { 512 RegExpTree* result = ParseDisjunction(CHECK_FAILED); 513 PatchNamedBackReferences(CHECK_FAILED); 514 DCHECK(!has_more()); 515 // If the result of parsing is a literal string atom, and it has the 516 // same length as the input, then the atom is identical to the input. 517 if (result->IsAtom() && result->AsAtom()->length() == input_length()) { 518 simple_ = true; 519 } 520 return result; 521} 522 523// Disjunction :: 524// Alternative 525// Alternative | Disjunction 526// Alternative :: 527// [empty] 528// Term Alternative 529// Term :: 530// Assertion 531// Atom 532// Atom Quantifier 533template <class CharT> 534RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { 535 // Used to store current state while parsing subexpressions. 536 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 537 0, nullptr, top_level_flags_, zone()); 538 RegExpParserState* state = &initial_state; 539 // Cache the builder in a local variable for quick access. 540 RegExpBuilder* builder = initial_state.builder(); 541 while (true) { 542 switch (current()) { 543 case kEndMarker: 544 if (failed()) return nullptr; // E.g. the initial Advance failed. 545 if (state->IsSubexpression()) { 546 // Inside a parenthesized group when hitting end of input. 547 return ReportError(RegExpError::kUnterminatedGroup); 548 } 549 DCHECK_EQ(INITIAL, state->group_type()); 550 // Parsing completed successfully. 551 return builder->ToRegExp(); 552 case ')': { 553 if (!state->IsSubexpression()) { 554 return ReportError(RegExpError::kUnmatchedParen); 555 } 556 DCHECK_NE(INITIAL, state->group_type()); 557 558 Advance(); 559 // End disjunction parsing and convert builder content to new single 560 // regexp atom. 561 RegExpTree* body = builder->ToRegExp(); 562 563 int end_capture_index = captures_started(); 564 565 int capture_index = state->capture_index(); 566 SubexpressionType group_type = state->group_type(); 567 568 // Build result of subexpression. 569 if (group_type == CAPTURE) { 570 if (state->IsNamedCapture()) { 571 CreateNamedCaptureAtIndex(state->capture_name(), 572 capture_index CHECK_FAILED); 573 } 574 RegExpCapture* capture = GetCapture(capture_index); 575 capture->set_body(body); 576 body = capture; 577 } else if (group_type == GROUPING) { 578 body = zone()->template New<RegExpGroup>(body); 579 } else { 580 DCHECK(group_type == POSITIVE_LOOKAROUND || 581 group_type == NEGATIVE_LOOKAROUND); 582 bool is_positive = (group_type == POSITIVE_LOOKAROUND); 583 body = zone()->template New<RegExpLookaround>( 584 body, is_positive, end_capture_index - capture_index, 585 capture_index, state->lookaround_type()); 586 } 587 588 // Restore previous state. 589 state = state->previous_state(); 590 builder = state->builder(); 591 592 builder->AddAtom(body); 593 // For compatibility with JSC and ES3, we allow quantifiers after 594 // lookaheads, and break in all cases. 595 break; 596 } 597 case '|': { 598 Advance(); 599 builder->NewAlternative(); 600 continue; 601 } 602 case '*': 603 case '+': 604 case '?': 605 return ReportError(RegExpError::kNothingToRepeat); 606 case '^': { 607 Advance(); 608 builder->AddAssertion(zone()->template New<RegExpAssertion>( 609 builder->multiline() ? RegExpAssertion::Type::START_OF_LINE 610 : RegExpAssertion::Type::START_OF_INPUT)); 611 set_contains_anchor(); 612 continue; 613 } 614 case '$': { 615 Advance(); 616 RegExpAssertion::Type assertion_type = 617 builder->multiline() ? RegExpAssertion::Type::END_OF_LINE 618 : RegExpAssertion::Type::END_OF_INPUT; 619 builder->AddAssertion( 620 zone()->template New<RegExpAssertion>(assertion_type)); 621 continue; 622 } 623 case '.': { 624 Advance(); 625 ZoneList<CharacterRange>* ranges = 626 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 627 628 if (builder->dotall()) { 629 // Everything. 630 CharacterRange::AddClassEscape(StandardCharacterSet::kEverything, 631 ranges, false, zone()); 632 } else { 633 // Everything except \x0A, \x0D, \u2028 and \u2029. 634 CharacterRange::AddClassEscape( 635 StandardCharacterSet::kNotLineTerminator, ranges, false, zone()); 636 } 637 638 RegExpCharacterClass* cc = 639 zone()->template New<RegExpCharacterClass>(zone(), ranges); 640 builder->AddCharacterClass(cc); 641 break; 642 } 643 case '(': { 644 state = ParseOpenParenthesis(state CHECK_FAILED); 645 builder = state->builder(); 646 continue; 647 } 648 case '[': { 649 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); 650 builder->AddCharacterClass(cc->AsCharacterClass()); 651 break; 652 } 653 // Atom :: 654 // \ AtomEscape 655 case '\\': 656 switch (Next()) { 657 case kEndMarker: 658 return ReportError(RegExpError::kEscapeAtEndOfPattern); 659 // AtomEscape :: 660 // [+UnicodeMode] DecimalEscape 661 // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber 662 // of DecimalEscape is ≤ NcapturingParens 663 // CharacterEscape (some cases of this mixed in too) 664 // 665 // TODO(jgruber): It may make sense to disentangle all the different 666 // cases and make the structure mirror the spec, e.g. for AtomEscape: 667 // 668 // if (TryParseDecimalEscape(...)) return; 669 // if (TryParseCharacterClassEscape(...)) return; 670 // if (TryParseCharacterEscape(...)) return; 671 // if (TryParseGroupName(...)) return; 672 case '1': 673 case '2': 674 case '3': 675 case '4': 676 case '5': 677 case '6': 678 case '7': 679 case '8': 680 case '9': { 681 int index = 0; 682 const bool is_backref = 683 ParseBackReferenceIndex(&index CHECK_FAILED); 684 if (is_backref) { 685 if (state->IsInsideCaptureGroup(index)) { 686 // The back reference is inside the capture group it refers to. 687 // Nothing can possibly have been captured yet, so we use empty 688 // instead. This ensures that, when checking a back reference, 689 // the capture registers of the referenced capture are either 690 // both set or both cleared. 691 builder->AddEmpty(); 692 } else { 693 RegExpCapture* capture = GetCapture(index); 694 RegExpTree* atom = zone()->template New<RegExpBackReference>( 695 capture, builder->flags()); 696 builder->AddAtom(atom); 697 } 698 break; 699 } 700 // With /u, no identity escapes except for syntax characters 701 // are allowed. Otherwise, all identity escapes are allowed. 702 if (unicode()) { 703 return ReportError(RegExpError::kInvalidEscape); 704 } 705 base::uc32 first_digit = Next(); 706 if (first_digit == '8' || first_digit == '9') { 707 builder->AddCharacter(first_digit); 708 Advance(2); 709 break; 710 } 711 V8_FALLTHROUGH; 712 } 713 case '0': { 714 Advance(); 715 if (unicode() && Next() >= '0' && Next() <= '9') { 716 // With /u, decimal escape with leading 0 are not parsed as octal. 717 return ReportError(RegExpError::kInvalidDecimalEscape); 718 } 719 base::uc32 octal = ParseOctalLiteral(); 720 builder->AddCharacter(octal); 721 break; 722 } 723 case 'b': 724 Advance(2); 725 builder->AddAssertion(zone()->template New<RegExpAssertion>( 726 RegExpAssertion::Type::BOUNDARY)); 727 continue; 728 case 'B': 729 Advance(2); 730 builder->AddAssertion(zone()->template New<RegExpAssertion>( 731 RegExpAssertion::Type::NON_BOUNDARY)); 732 continue; 733 // AtomEscape :: 734 // CharacterClassEscape 735 case 'd': 736 case 'D': 737 case 's': 738 case 'S': 739 case 'w': 740 case 'W': 741 case 'p': 742 case 'P': { 743 base::uc32 next = Next(); 744 ZoneList<CharacterRange>* ranges = 745 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 746 bool add_unicode_case_equivalents = 747 unicode() && builder->ignore_case(); 748 bool parsed_character_class_escape = TryParseCharacterClassEscape( 749 next, InClassEscapeState::kNotInClass, ranges, zone(), 750 add_unicode_case_equivalents CHECK_FAILED); 751 752 if (parsed_character_class_escape) { 753 RegExpCharacterClass* cc = 754 zone()->template New<RegExpCharacterClass>(zone(), ranges); 755 builder->AddCharacterClass(cc); 756 } else { 757 CHECK(!unicode()); 758 Advance(2); 759 builder->AddCharacter(next); // IdentityEscape. 760 } 761 break; 762 } 763 // AtomEscape :: 764 // k GroupName 765 case 'k': { 766 // Either an identity escape or a named back-reference. The two 767 // interpretations are mutually exclusive: '\k' is interpreted as 768 // an identity escape for non-Unicode patterns without named 769 // capture groups, and as the beginning of a named back-reference 770 // in all other cases. 771 const bool has_named_captures = 772 HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED); 773 if (unicode() || has_named_captures) { 774 Advance(2); 775 ParseNamedBackReference(builder, state CHECK_FAILED); 776 break; 777 } 778 } 779 V8_FALLTHROUGH; 780 // AtomEscape :: 781 // CharacterEscape 782 default: { 783 bool is_escaped_unicode_character = false; 784 base::uc32 c = ParseCharacterEscape( 785 InClassEscapeState::kNotInClass, 786 &is_escaped_unicode_character CHECK_FAILED); 787 if (is_escaped_unicode_character) { 788 builder->AddEscapedUnicodeCharacter(c); 789 } else { 790 builder->AddCharacter(c); 791 } 792 break; 793 } 794 } 795 break; 796 case '{': { 797 int dummy; 798 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); 799 if (parsed) return ReportError(RegExpError::kNothingToRepeat); 800 V8_FALLTHROUGH; 801 } 802 case '}': 803 case ']': 804 if (unicode()) { 805 return ReportError(RegExpError::kLoneQuantifierBrackets); 806 } 807 V8_FALLTHROUGH; 808 default: 809 builder->AddUnicodeCharacter(current()); 810 Advance(); 811 break; 812 } // end switch(current()) 813 814 int min; 815 int max; 816 switch (current()) { 817 // QuantifierPrefix :: 818 // * 819 // + 820 // ? 821 // { 822 case '*': 823 min = 0; 824 max = RegExpTree::kInfinity; 825 Advance(); 826 break; 827 case '+': 828 min = 1; 829 max = RegExpTree::kInfinity; 830 Advance(); 831 break; 832 case '?': 833 min = 0; 834 max = 1; 835 Advance(); 836 break; 837 case '{': 838 if (ParseIntervalQuantifier(&min, &max)) { 839 if (max < min) { 840 return ReportError(RegExpError::kRangeOutOfOrder); 841 } 842 break; 843 } else if (unicode()) { 844 // With /u, incomplete quantifiers are not allowed. 845 return ReportError(RegExpError::kIncompleteQuantifier); 846 } 847 continue; 848 default: 849 continue; 850 } 851 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 852 if (current() == '?') { 853 quantifier_type = RegExpQuantifier::NON_GREEDY; 854 Advance(); 855 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 856 // FLAG_regexp_possessive_quantifier is a debug-only flag. 857 quantifier_type = RegExpQuantifier::POSSESSIVE; 858 Advance(); 859 } 860 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { 861 return ReportError(RegExpError::kInvalidQuantifier); 862 } 863 } 864} 865 866template <class CharT> 867RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis( 868 RegExpParserState* state) { 869 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 870 bool is_named_capture = false; 871 const ZoneVector<base::uc16>* capture_name = nullptr; 872 SubexpressionType subexpr_type = CAPTURE; 873 Advance(); 874 if (current() == '?') { 875 switch (Next()) { 876 case ':': 877 Advance(2); 878 subexpr_type = GROUPING; 879 break; 880 case '=': 881 Advance(2); 882 lookaround_type = RegExpLookaround::LOOKAHEAD; 883 subexpr_type = POSITIVE_LOOKAROUND; 884 break; 885 case '!': 886 Advance(2); 887 lookaround_type = RegExpLookaround::LOOKAHEAD; 888 subexpr_type = NEGATIVE_LOOKAROUND; 889 break; 890 case '<': 891 Advance(); 892 if (Next() == '=') { 893 Advance(2); 894 lookaround_type = RegExpLookaround::LOOKBEHIND; 895 subexpr_type = POSITIVE_LOOKAROUND; 896 break; 897 } else if (Next() == '!') { 898 Advance(2); 899 lookaround_type = RegExpLookaround::LOOKBEHIND; 900 subexpr_type = NEGATIVE_LOOKAROUND; 901 break; 902 } 903 is_named_capture = true; 904 has_named_captures_ = true; 905 Advance(); 906 break; 907 default: 908 ReportError(RegExpError::kInvalidGroup); 909 return nullptr; 910 } 911 } 912 if (subexpr_type == CAPTURE) { 913 if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) { 914 ReportError(RegExpError::kTooManyCaptures); 915 return nullptr; 916 } 917 captures_started_++; 918 919 if (is_named_capture) { 920 capture_name = ParseCaptureGroupName(CHECK_FAILED); 921 } 922 } 923 // Store current state and begin new disjunction parsing. 924 return zone()->template New<RegExpParserState>( 925 state, subexpr_type, lookaround_type, captures_started_, capture_name, 926 state->builder()->flags(), zone()); 927} 928 929#ifdef DEBUG 930namespace { 931 932bool IsSpecialClassEscape(base::uc32 c) { 933 switch (c) { 934 case 'd': 935 case 'D': 936 case 's': 937 case 'S': 938 case 'w': 939 case 'W': 940 return true; 941 default: 942 return false; 943 } 944} 945 946} // namespace 947#endif 948 949// In order to know whether an escape is a backreference or not we have to scan 950// the entire regexp and find the number of capturing parentheses. However we 951// don't want to scan the regexp twice unless it is necessary. This mini-parser 952// is called when needed. It can see the difference between capturing and 953// noncapturing parentheses and can skip character classes and backslash-escaped 954// characters. 955// 956// Important: The scanner has to be in a consistent state when calling 957// ScanForCaptures, e.g. not in the middle of an escape sequence '\['. 958template <class CharT> 959void RegExpParserImpl<CharT>::ScanForCaptures( 960 InClassEscapeState in_class_escape_state) { 961 DCHECK(!is_scanned_for_captures_); 962 const int saved_position = position(); 963 // Start with captures started previous to current position 964 int capture_count = captures_started(); 965 // When we start inside a character class, skip everything inside the class. 966 if (in_class_escape_state == InClassEscapeState::kInClass) { 967 int c; 968 while ((c = current()) != kEndMarker) { 969 Advance(); 970 if (c == '\\') { 971 Advance(); 972 } else { 973 if (c == ']') break; 974 } 975 } 976 } 977 // Add count of captures after this position. 978 int n; 979 while ((n = current()) != kEndMarker) { 980 Advance(); 981 switch (n) { 982 case '\\': 983 Advance(); 984 break; 985 case '[': { 986 int c; 987 while ((c = current()) != kEndMarker) { 988 Advance(); 989 if (c == '\\') { 990 Advance(); 991 } else { 992 if (c == ']') break; 993 } 994 } 995 break; 996 } 997 case '(': 998 if (current() == '?') { 999 // At this point we could be in 1000 // * a non-capturing group '(:', 1001 // * a lookbehind assertion '(?<=' '(?<!' 1002 // * or a named capture '(?<'. 1003 // 1004 // Of these, only named captures are capturing groups. 1005 1006 Advance(); 1007 if (current() != '<') break; 1008 1009 Advance(); 1010 if (current() == '=' || current() == '!') break; 1011 1012 // Found a possible named capture. It could turn out to be a syntax 1013 // error (e.g. an unterminated or invalid name), but that distinction 1014 // does not matter for our purposes. 1015 has_named_captures_ = true; 1016 } 1017 capture_count++; 1018 break; 1019 } 1020 } 1021 capture_count_ = capture_count; 1022 is_scanned_for_captures_ = true; 1023 Reset(saved_position); 1024} 1025 1026template <class CharT> 1027bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) { 1028 DCHECK_EQ('\\', current()); 1029 DCHECK('1' <= Next() && Next() <= '9'); 1030 // Try to parse a decimal literal that is no greater than the total number 1031 // of left capturing parentheses in the input. 1032 int start = position(); 1033 int value = Next() - '0'; 1034 Advance(2); 1035 while (true) { 1036 base::uc32 c = current(); 1037 if (IsDecimalDigit(c)) { 1038 value = 10 * value + (c - '0'); 1039 if (value > RegExpMacroAssembler::kMaxRegisterCount) { 1040 Reset(start); 1041 return false; 1042 } 1043 Advance(); 1044 } else { 1045 break; 1046 } 1047 } 1048 if (value > captures_started()) { 1049 if (!is_scanned_for_captures_) 1050 ScanForCaptures(InClassEscapeState::kNotInClass); 1051 if (value > capture_count_) { 1052 Reset(start); 1053 return false; 1054 } 1055 } 1056 *index_out = value; 1057 return true; 1058} 1059 1060namespace { 1061 1062void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) { 1063 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 1064 v->push_back(code_unit); 1065 } else { 1066 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); 1067 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); 1068 } 1069} 1070 1071} // namespace 1072 1073template <class CharT> 1074const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() { 1075 // Due to special Advance requirements (see the next comment), rewind by one 1076 // such that names starting with a surrogate pair are parsed correctly for 1077 // patterns where the unicode flag is unset. 1078 // 1079 // Note that we use this odd pattern of rewinding the last advance in order 1080 // to adhere to the common parser behavior of expecting `current` to point at 1081 // the first candidate character for a function (e.g. when entering ParseFoo, 1082 // `current` should point at the first character of Foo). 1083 RewindByOneCodepoint(); 1084 1085 ZoneVector<base::uc16>* name = 1086 zone()->template New<ZoneVector<base::uc16>>(zone()); 1087 1088 { 1089 // Advance behavior inside this function is tricky since 1090 // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U) 1091 // and thus allows surrogate pairs and \u{}-style escapes even in 1092 // non-unicode patterns. Therefore Advance within the capture group name 1093 // has to force-enable unicode, and outside the name revert to default 1094 // behavior. 1095 ForceUnicodeScope force_unicode(this); 1096 1097 bool at_start = true; 1098 while (true) { 1099 Advance(); 1100 base::uc32 c = current(); 1101 1102 // Convert unicode escapes. 1103 if (c == '\\' && Next() == 'u') { 1104 Advance(2); 1105 if (!ParseUnicodeEscape(&c)) { 1106 ReportError(RegExpError::kInvalidUnicodeEscape); 1107 return nullptr; 1108 } 1109 RewindByOneCodepoint(); 1110 } 1111 1112 // The backslash char is misclassified as both ID_Start and ID_Continue. 1113 if (c == '\\') { 1114 ReportError(RegExpError::kInvalidCaptureGroupName); 1115 return nullptr; 1116 } 1117 1118 if (at_start) { 1119 if (!IsIdentifierStart(c)) { 1120 ReportError(RegExpError::kInvalidCaptureGroupName); 1121 return nullptr; 1122 } 1123 push_code_unit(name, c); 1124 at_start = false; 1125 } else { 1126 if (c == '>') { 1127 break; 1128 } else if (IsIdentifierPart(c)) { 1129 push_code_unit(name, c); 1130 } else { 1131 ReportError(RegExpError::kInvalidCaptureGroupName); 1132 return nullptr; 1133 } 1134 } 1135 } 1136 } 1137 1138 // This final advance goes back into the state of pointing at the next 1139 // relevant char, which the rest of the parser expects. See also the previous 1140 // comments in this function. 1141 Advance(); 1142 return name; 1143} 1144 1145template <class CharT> 1146bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex( 1147 const ZoneVector<base::uc16>* name, int index) { 1148 DCHECK(0 < index && index <= captures_started_); 1149 DCHECK_NOT_NULL(name); 1150 1151 RegExpCapture* capture = GetCapture(index); 1152 DCHECK_NULL(capture->name()); 1153 1154 capture->set_name(name); 1155 1156 if (named_captures_ == nullptr) { 1157 named_captures_ = 1158 zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>( 1159 zone()); 1160 } else { 1161 // Check for duplicates and bail if we find any. 1162 1163 const auto& named_capture_it = named_captures_->find(capture); 1164 if (named_capture_it != named_captures_->end()) { 1165 ReportError(RegExpError::kDuplicateCaptureGroupName); 1166 return false; 1167 } 1168 } 1169 1170 named_captures_->emplace(capture); 1171 1172 return true; 1173} 1174 1175template <class CharT> 1176bool RegExpParserImpl<CharT>::ParseNamedBackReference( 1177 RegExpBuilder* builder, RegExpParserState* state) { 1178 // The parser is assumed to be on the '<' in \k<name>. 1179 if (current() != '<') { 1180 ReportError(RegExpError::kInvalidNamedReference); 1181 return false; 1182 } 1183 1184 Advance(); 1185 const ZoneVector<base::uc16>* name = ParseCaptureGroupName(); 1186 if (name == nullptr) { 1187 return false; 1188 } 1189 1190 if (state->IsInsideCaptureGroup(name)) { 1191 builder->AddEmpty(); 1192 } else { 1193 RegExpBackReference* atom = 1194 zone()->template New<RegExpBackReference>(builder->flags()); 1195 atom->set_name(name); 1196 1197 builder->AddAtom(atom); 1198 1199 if (named_back_references_ == nullptr) { 1200 named_back_references_ = 1201 zone()->template New<ZoneList<RegExpBackReference*>>(1, zone()); 1202 } 1203 named_back_references_->Add(atom, zone()); 1204 } 1205 1206 return true; 1207} 1208 1209template <class CharT> 1210void RegExpParserImpl<CharT>::PatchNamedBackReferences() { 1211 if (named_back_references_ == nullptr) return; 1212 1213 if (named_captures_ == nullptr) { 1214 ReportError(RegExpError::kInvalidNamedCaptureReference); 1215 return; 1216 } 1217 1218 // Look up and patch the actual capture for each named back reference. 1219 1220 for (int i = 0; i < named_back_references_->length(); i++) { 1221 RegExpBackReference* ref = named_back_references_->at(i); 1222 1223 // Capture used to search the named_captures_ by name, index of the 1224 // capture is never used. 1225 static const int kInvalidIndex = 0; 1226 RegExpCapture* search_capture = 1227 zone()->template New<RegExpCapture>(kInvalidIndex); 1228 DCHECK_NULL(search_capture->name()); 1229 search_capture->set_name(ref->name()); 1230 1231 int index = -1; 1232 const auto& capture_it = named_captures_->find(search_capture); 1233 if (capture_it != named_captures_->end()) { 1234 index = (*capture_it)->index(); 1235 } else { 1236 ReportError(RegExpError::kInvalidNamedCaptureReference); 1237 return; 1238 } 1239 1240 ref->set_capture(GetCapture(index)); 1241 } 1242} 1243 1244template <class CharT> 1245RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) { 1246 // The index for the capture groups are one-based. Its index in the list is 1247 // zero-based. 1248 const int known_captures = 1249 is_scanned_for_captures_ ? capture_count_ : captures_started_; 1250 DCHECK(index <= known_captures); 1251 if (captures_ == nullptr) { 1252 captures_ = 1253 zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone()); 1254 } 1255 while (captures_->length() < known_captures) { 1256 captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1), 1257 zone()); 1258 } 1259 return captures_->at(index - 1); 1260} 1261 1262template <class CharT> 1263ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const { 1264 if (named_captures_ == nullptr || named_captures_->empty()) { 1265 return nullptr; 1266 } 1267 1268 return zone()->template New<ZoneVector<RegExpCapture*>>( 1269 named_captures_->begin(), named_captures_->end(), zone()); 1270} 1271 1272template <class CharT> 1273bool RegExpParserImpl<CharT>::HasNamedCaptures( 1274 InClassEscapeState in_class_escape_state) { 1275 if (has_named_captures_ || is_scanned_for_captures_) { 1276 return has_named_captures_; 1277 } 1278 1279 ScanForCaptures(in_class_escape_state); 1280 DCHECK(is_scanned_for_captures_); 1281 return has_named_captures_; 1282} 1283 1284// QuantifierPrefix :: 1285// { DecimalDigits } 1286// { DecimalDigits , } 1287// { DecimalDigits , DecimalDigits } 1288// 1289// Returns true if parsing succeeds, and set the min_out and max_out 1290// values. Values are truncated to RegExpTree::kInfinity if they overflow. 1291template <class CharT> 1292bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out, 1293 int* max_out) { 1294 DCHECK_EQ(current(), '{'); 1295 int start = position(); 1296 Advance(); 1297 int min = 0; 1298 if (!IsDecimalDigit(current())) { 1299 Reset(start); 1300 return false; 1301 } 1302 while (IsDecimalDigit(current())) { 1303 int next = current() - '0'; 1304 if (min > (RegExpTree::kInfinity - next) / 10) { 1305 // Overflow. Skip past remaining decimal digits and return -1. 1306 do { 1307 Advance(); 1308 } while (IsDecimalDigit(current())); 1309 min = RegExpTree::kInfinity; 1310 break; 1311 } 1312 min = 10 * min + next; 1313 Advance(); 1314 } 1315 int max = 0; 1316 if (current() == '}') { 1317 max = min; 1318 Advance(); 1319 } else if (current() == ',') { 1320 Advance(); 1321 if (current() == '}') { 1322 max = RegExpTree::kInfinity; 1323 Advance(); 1324 } else { 1325 while (IsDecimalDigit(current())) { 1326 int next = current() - '0'; 1327 if (max > (RegExpTree::kInfinity - next) / 10) { 1328 do { 1329 Advance(); 1330 } while (IsDecimalDigit(current())); 1331 max = RegExpTree::kInfinity; 1332 break; 1333 } 1334 max = 10 * max + next; 1335 Advance(); 1336 } 1337 if (current() != '}') { 1338 Reset(start); 1339 return false; 1340 } 1341 Advance(); 1342 } 1343 } else { 1344 Reset(start); 1345 return false; 1346 } 1347 *min_out = min; 1348 *max_out = max; 1349 return true; 1350} 1351 1352template <class CharT> 1353base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() { 1354 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); 1355 // For compatibility with some other browsers (not all), we parse 1356 // up to three octal digits with a value below 256. 1357 // ES#prod-annexB-LegacyOctalEscapeSequence 1358 base::uc32 value = current() - '0'; 1359 Advance(); 1360 if ('0' <= current() && current() <= '7') { 1361 value = value * 8 + current() - '0'; 1362 Advance(); 1363 if (value < 32 && '0' <= current() && current() <= '7') { 1364 value = value * 8 + current() - '0'; 1365 Advance(); 1366 } 1367 } 1368 return value; 1369} 1370 1371template <class CharT> 1372bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) { 1373 int start = position(); 1374 base::uc32 val = 0; 1375 for (int i = 0; i < length; ++i) { 1376 base::uc32 c = current(); 1377 int d = base::HexValue(c); 1378 if (d < 0) { 1379 Reset(start); 1380 return false; 1381 } 1382 val = val * 16 + d; 1383 Advance(); 1384 } 1385 *value = val; 1386 return true; 1387} 1388 1389// This parses RegExpUnicodeEscapeSequence as described in ECMA262. 1390template <class CharT> 1391bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) { 1392 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 1393 // allowed). In the latter case, the number of hex digits between { } is 1394 // arbitrary. \ and u have already been read. 1395 if (current() == '{' && unicode()) { 1396 int start = position(); 1397 Advance(); 1398 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { 1399 if (current() == '}') { 1400 Advance(); 1401 return true; 1402 } 1403 } 1404 Reset(start); 1405 return false; 1406 } 1407 // \u but no {, or \u{...} escapes not allowed. 1408 bool result = ParseHexEscape(4, value); 1409 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && 1410 current() == '\\') { 1411 // Attempt to read trail surrogate. 1412 int start = position(); 1413 if (Next() == 'u') { 1414 Advance(2); 1415 base::uc32 trail; 1416 if (ParseHexEscape(4, &trail) && 1417 unibrow::Utf16::IsTrailSurrogate(trail)) { 1418 *value = unibrow::Utf16::CombineSurrogatePair( 1419 static_cast<base::uc16>(*value), static_cast<base::uc16>(trail)); 1420 return true; 1421 } 1422 } 1423 Reset(start); 1424 } 1425 return result; 1426} 1427 1428#ifdef V8_INTL_SUPPORT 1429 1430namespace { 1431 1432bool IsExactPropertyAlias(const char* property_name, UProperty property) { 1433 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); 1434 if (short_name != nullptr && strcmp(property_name, short_name) == 0) 1435 return true; 1436 for (int i = 0;; i++) { 1437 const char* long_name = u_getPropertyName( 1438 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1439 if (long_name == nullptr) break; 1440 if (strcmp(property_name, long_name) == 0) return true; 1441 } 1442 return false; 1443} 1444 1445bool IsExactPropertyValueAlias(const char* property_value_name, 1446 UProperty property, int32_t property_value) { 1447 const char* short_name = 1448 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); 1449 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { 1450 return true; 1451 } 1452 for (int i = 0;; i++) { 1453 const char* long_name = u_getPropertyValueName( 1454 property, property_value, 1455 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); 1456 if (long_name == nullptr) break; 1457 if (strcmp(property_value_name, long_name) == 0) return true; 1458 } 1459 return false; 1460} 1461 1462bool LookupPropertyValueName(UProperty property, 1463 const char* property_value_name, bool negate, 1464 ZoneList<CharacterRange>* result, Zone* zone) { 1465 UProperty property_for_lookup = property; 1466 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { 1467 // For the property Script_Extensions, we have to do the property value 1468 // name lookup as if the property is Script. 1469 property_for_lookup = UCHAR_SCRIPT; 1470 } 1471 int32_t property_value = 1472 u_getPropertyValueEnum(property_for_lookup, property_value_name); 1473 if (property_value == UCHAR_INVALID_CODE) return false; 1474 1475 // We require the property name to match exactly to one of the property value 1476 // aliases. However, u_getPropertyValueEnum uses loose matching. 1477 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, 1478 property_value)) { 1479 return false; 1480 } 1481 1482 UErrorCode ec = U_ZERO_ERROR; 1483 icu::UnicodeSet set; 1484 set.applyIntPropertyValue(property, property_value, ec); 1485 bool success = ec == U_ZERO_ERROR && !set.isEmpty(); 1486 1487 if (success) { 1488 set.removeAllStrings(); 1489 if (negate) set.complement(); 1490 for (int i = 0; i < set.getRangeCount(); i++) { 1491 result->Add( 1492 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 1493 zone); 1494 } 1495 } 1496 return success; 1497} 1498 1499template <size_t N> 1500inline bool NameEquals(const char* name, const char (&literal)[N]) { 1501 return strncmp(name, literal, N + 1) == 0; 1502} 1503 1504bool LookupSpecialPropertyValueName(const char* name, 1505 ZoneList<CharacterRange>* result, 1506 bool negate, Zone* zone) { 1507 if (NameEquals(name, "Any")) { 1508 if (negate) { 1509 // Leave the list of character ranges empty, since the negation of 'Any' 1510 // is the empty set. 1511 } else { 1512 result->Add(CharacterRange::Everything(), zone); 1513 } 1514 } else if (NameEquals(name, "ASCII")) { 1515 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) 1516 : CharacterRange::Range(0x0, 0x7F), 1517 zone); 1518 } else if (NameEquals(name, "Assigned")) { 1519 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", 1520 !negate, result, zone); 1521 } else { 1522 return false; 1523 } 1524 return true; 1525} 1526 1527// Explicitly allowlist supported binary properties. The spec forbids supporting 1528// properties outside of this set to ensure interoperability. 1529bool IsSupportedBinaryProperty(UProperty property) { 1530 switch (property) { 1531 case UCHAR_ALPHABETIC: 1532 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. 1533 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. 1534 case UCHAR_ASCII_HEX_DIGIT: 1535 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. 1536 case UCHAR_BIDI_CONTROL: 1537 case UCHAR_BIDI_MIRRORED: 1538 case UCHAR_CASE_IGNORABLE: 1539 case UCHAR_CASED: 1540 case UCHAR_CHANGES_WHEN_CASEFOLDED: 1541 case UCHAR_CHANGES_WHEN_CASEMAPPED: 1542 case UCHAR_CHANGES_WHEN_LOWERCASED: 1543 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: 1544 case UCHAR_CHANGES_WHEN_TITLECASED: 1545 case UCHAR_CHANGES_WHEN_UPPERCASED: 1546 case UCHAR_DASH: 1547 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: 1548 case UCHAR_DEPRECATED: 1549 case UCHAR_DIACRITIC: 1550 case UCHAR_EMOJI: 1551 case UCHAR_EMOJI_COMPONENT: 1552 case UCHAR_EMOJI_MODIFIER_BASE: 1553 case UCHAR_EMOJI_MODIFIER: 1554 case UCHAR_EMOJI_PRESENTATION: 1555 case UCHAR_EXTENDED_PICTOGRAPHIC: 1556 case UCHAR_EXTENDER: 1557 case UCHAR_GRAPHEME_BASE: 1558 case UCHAR_GRAPHEME_EXTEND: 1559 case UCHAR_HEX_DIGIT: 1560 case UCHAR_ID_CONTINUE: 1561 case UCHAR_ID_START: 1562 case UCHAR_IDEOGRAPHIC: 1563 case UCHAR_IDS_BINARY_OPERATOR: 1564 case UCHAR_IDS_TRINARY_OPERATOR: 1565 case UCHAR_JOIN_CONTROL: 1566 case UCHAR_LOGICAL_ORDER_EXCEPTION: 1567 case UCHAR_LOWERCASE: 1568 case UCHAR_MATH: 1569 case UCHAR_NONCHARACTER_CODE_POINT: 1570 case UCHAR_PATTERN_SYNTAX: 1571 case UCHAR_PATTERN_WHITE_SPACE: 1572 case UCHAR_QUOTATION_MARK: 1573 case UCHAR_RADICAL: 1574 case UCHAR_REGIONAL_INDICATOR: 1575 case UCHAR_S_TERM: 1576 case UCHAR_SOFT_DOTTED: 1577 case UCHAR_TERMINAL_PUNCTUATION: 1578 case UCHAR_UNIFIED_IDEOGRAPH: 1579 case UCHAR_UPPERCASE: 1580 case UCHAR_VARIATION_SELECTOR: 1581 case UCHAR_WHITE_SPACE: 1582 case UCHAR_XID_CONTINUE: 1583 case UCHAR_XID_START: 1584 return true; 1585 default: 1586 break; 1587 } 1588 return false; 1589} 1590 1591bool IsUnicodePropertyValueCharacter(char c) { 1592 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ 1593 // 1594 // Note that using this to validate each parsed char is quite conservative. 1595 // A possible alternative solution would be to only ensure the parsed 1596 // property name/value candidate string does not contain '\0' characters and 1597 // let ICU lookups trigger the final failure. 1598 if ('a' <= c && c <= 'z') return true; 1599 if ('A' <= c && c <= 'Z') return true; 1600 if ('0' <= c && c <= '9') return true; 1601 return (c == '_'); 1602} 1603 1604} // namespace 1605 1606template <class CharT> 1607bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, 1608 ZoneVector<char>* name_2) { 1609 DCHECK(name_1->empty()); 1610 DCHECK(name_2->empty()); 1611 // Parse the property class as follows: 1612 // - In \p{name}, 'name' is interpreted 1613 // - either as a general category property value name. 1614 // - or as a binary property name. 1615 // - In \p{name=value}, 'name' is interpreted as an enumerated property name, 1616 // and 'value' is interpreted as one of the available property value names. 1617 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. 1618 // - Loose matching is not applied. 1619 if (current() == '{') { 1620 // Parse \p{[PropertyName=]PropertyNameValue} 1621 for (Advance(); current() != '}' && current() != '='; Advance()) { 1622 if (!IsUnicodePropertyValueCharacter(current())) return false; 1623 if (!has_next()) return false; 1624 name_1->push_back(static_cast<char>(current())); 1625 } 1626 if (current() == '=') { 1627 for (Advance(); current() != '}'; Advance()) { 1628 if (!IsUnicodePropertyValueCharacter(current())) return false; 1629 if (!has_next()) return false; 1630 name_2->push_back(static_cast<char>(current())); 1631 } 1632 name_2->push_back(0); // null-terminate string. 1633 } 1634 } else { 1635 return false; 1636 } 1637 Advance(); 1638 name_1->push_back(0); // null-terminate string. 1639 1640 DCHECK(name_1->size() - 1 == std::strlen(name_1->data())); 1641 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data())); 1642 return true; 1643} 1644 1645template <class CharT> 1646bool RegExpParserImpl<CharT>::AddPropertyClassRange( 1647 ZoneList<CharacterRange>* add_to, bool negate, 1648 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { 1649 if (name_2.empty()) { 1650 // First attempt to interpret as general category property value name. 1651 const char* name = name_1.data(); 1652 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, 1653 add_to, zone())) { 1654 return true; 1655 } 1656 // Interpret "Any", "ASCII", and "Assigned". 1657 if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) { 1658 return true; 1659 } 1660 // Then attempt to interpret as binary property name with value name 'Y'. 1661 UProperty property = u_getPropertyEnum(name); 1662 if (!IsSupportedBinaryProperty(property)) return false; 1663 if (!IsExactPropertyAlias(name, property)) return false; 1664 return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to, 1665 zone()); 1666 } else { 1667 // Both property name and value name are specified. Attempt to interpret 1668 // the property name as enumerated property. 1669 const char* property_name = name_1.data(); 1670 const char* value_name = name_2.data(); 1671 UProperty property = u_getPropertyEnum(property_name); 1672 if (!IsExactPropertyAlias(property_name, property)) return false; 1673 if (property == UCHAR_GENERAL_CATEGORY) { 1674 // We want to allow aggregate value names such as "Letter". 1675 property = UCHAR_GENERAL_CATEGORY_MASK; 1676 } else if (property != UCHAR_SCRIPT && 1677 property != UCHAR_SCRIPT_EXTENSIONS) { 1678 return false; 1679 } 1680 return LookupPropertyValueName(property, value_name, negate, add_to, 1681 zone()); 1682 } 1683} 1684 1685#else // V8_INTL_SUPPORT 1686 1687template <class CharT> 1688bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, 1689 ZoneVector<char>* name_2) { 1690 return false; 1691} 1692 1693template <class CharT> 1694bool RegExpParserImpl<CharT>::AddPropertyClassRange( 1695 ZoneList<CharacterRange>* add_to, bool negate, 1696 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { 1697 return false; 1698} 1699 1700#endif // V8_INTL_SUPPORT 1701 1702template <class CharT> 1703bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value, 1704 base::uc32* value) { 1705 base::uc32 x = 0; 1706 int d = base::HexValue(current()); 1707 if (d < 0) { 1708 return false; 1709 } 1710 while (d >= 0) { 1711 x = x * 16 + d; 1712 if (x > static_cast<base::uc32>(max_value)) { 1713 return false; 1714 } 1715 Advance(); 1716 d = base::HexValue(current()); 1717 } 1718 *value = x; 1719 return true; 1720} 1721 1722// https://tc39.es/ecma262/#prod-CharacterEscape 1723template <class CharT> 1724base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape( 1725 InClassEscapeState in_class_escape_state, 1726 bool* is_escaped_unicode_character) { 1727 DCHECK_EQ('\\', current()); 1728 DCHECK(has_next() && !IsSpecialClassEscape(Next())); 1729 1730 Advance(); 1731 1732 const base::uc32 c = current(); 1733 switch (c) { 1734 // CharacterEscape :: 1735 // ControlEscape :: one of 1736 // f n r t v 1737 case 'f': 1738 Advance(); 1739 return '\f'; 1740 case 'n': 1741 Advance(); 1742 return '\n'; 1743 case 'r': 1744 Advance(); 1745 return '\r'; 1746 case 't': 1747 Advance(); 1748 return '\t'; 1749 case 'v': 1750 Advance(); 1751 return '\v'; 1752 // CharacterEscape :: 1753 // c ControlLetter 1754 case 'c': { 1755 base::uc32 controlLetter = Next(); 1756 base::uc32 letter = controlLetter & ~('A' ^ 'a'); 1757 if (letter >= 'A' && letter <= 'Z') { 1758 Advance(2); 1759 // Control letters mapped to ASCII control characters in the range 1760 // 0x00-0x1F. 1761 return controlLetter & 0x1F; 1762 } 1763 if (unicode()) { 1764 // With /u, invalid escapes are not treated as identity escapes. 1765 ReportError(RegExpError::kInvalidUnicodeEscape); 1766 return 0; 1767 } 1768 if (in_class_escape_state == InClassEscapeState::kInClass) { 1769 // Inside a character class, we also accept digits and underscore as 1770 // control characters, unless with /u. See Annex B: 1771 // ES#prod-annexB-ClassControlLetter 1772 if ((controlLetter >= '0' && controlLetter <= '9') || 1773 controlLetter == '_') { 1774 Advance(2); 1775 return controlLetter & 0x1F; 1776 } 1777 } 1778 // We match JSC in reading the backslash as a literal 1779 // character instead of as starting an escape. 1780 return '\\'; 1781 } 1782 // CharacterEscape :: 1783 // 0 [lookahead ∉ DecimalDigit] 1784 // [~UnicodeMode] LegacyOctalEscapeSequence 1785 case '0': 1786 // \0 is interpreted as NUL if not followed by another digit. 1787 if (Next() < '0' || Next() > '9') { 1788 Advance(); 1789 return 0; 1790 } 1791 V8_FALLTHROUGH; 1792 case '1': 1793 case '2': 1794 case '3': 1795 case '4': 1796 case '5': 1797 case '6': 1798 case '7': 1799 // For compatibility, we interpret a decimal escape that isn't 1800 // a back reference (and therefore either \0 or not valid according 1801 // to the specification) as a 1..3 digit octal character code. 1802 // ES#prod-annexB-LegacyOctalEscapeSequence 1803 if (unicode()) { 1804 // With /u, decimal escape is not interpreted as octal character code. 1805 ReportError(RegExpError::kInvalidClassEscape); 1806 return 0; 1807 } 1808 return ParseOctalLiteral(); 1809 // CharacterEscape :: 1810 // HexEscapeSequence 1811 case 'x': { 1812 Advance(); 1813 base::uc32 value; 1814 if (ParseHexEscape(2, &value)) return value; 1815 if (unicode()) { 1816 // With /u, invalid escapes are not treated as identity escapes. 1817 ReportError(RegExpError::kInvalidEscape); 1818 return 0; 1819 } 1820 // If \x is not followed by a two-digit hexadecimal, treat it 1821 // as an identity escape. 1822 return 'x'; 1823 } 1824 // CharacterEscape :: 1825 // RegExpUnicodeEscapeSequence [?UnicodeMode] 1826 case 'u': { 1827 Advance(); 1828 base::uc32 value; 1829 if (ParseUnicodeEscape(&value)) { 1830 *is_escaped_unicode_character = true; 1831 return value; 1832 } 1833 if (unicode()) { 1834 // With /u, invalid escapes are not treated as identity escapes. 1835 ReportError(RegExpError::kInvalidUnicodeEscape); 1836 return 0; 1837 } 1838 // If \u is not followed by a two-digit hexadecimal, treat it 1839 // as an identity escape. 1840 return 'u'; 1841 } 1842 default: 1843 break; 1844 } 1845 1846 // CharacterEscape :: 1847 // IdentityEscape[?UnicodeMode, ?N] 1848 // 1849 // * With /u, no identity escapes except for syntax characters are 1850 // allowed. 1851 // * Without /u: 1852 // * '\c' is not an IdentityEscape. 1853 // * '\k' is not an IdentityEscape when named captures exist. 1854 // * Otherwise, all identity escapes are allowed. 1855 if (unicode()) { 1856 if (!IsSyntaxCharacterOrSlash(c)) { 1857 ReportError(RegExpError::kInvalidEscape); 1858 return 0; 1859 } 1860 Advance(); 1861 return c; 1862 } 1863 DCHECK(!unicode()); 1864 if (c == 'c') { 1865 ReportError(RegExpError::kInvalidEscape); 1866 return 0; 1867 } 1868 Advance(); 1869 // Note: It's important to Advance before the HasNamedCaptures call s.t. we 1870 // don't start scanning in the middle of an escape. 1871 if (c == 'k' && HasNamedCaptures(in_class_escape_state)) { 1872 ReportError(RegExpError::kInvalidEscape); 1873 return 0; 1874 } 1875 return c; 1876} 1877 1878// https://tc39.es/ecma262/#prod-ClassEscape 1879template <class CharT> 1880void RegExpParserImpl<CharT>::ParseClassEscape( 1881 ZoneList<CharacterRange>* ranges, Zone* zone, 1882 bool add_unicode_case_equivalents, base::uc32* char_out, 1883 bool* is_class_escape) { 1884 *is_class_escape = false; 1885 1886 if (current() != '\\') { 1887 // Not a ClassEscape. 1888 *char_out = current(); 1889 Advance(); 1890 return; 1891 } 1892 1893 const base::uc32 next = Next(); 1894 switch (next) { 1895 case 'b': 1896 *char_out = '\b'; 1897 Advance(2); 1898 return; 1899 case '-': 1900 if (unicode()) { 1901 *char_out = next; 1902 Advance(2); 1903 return; 1904 } 1905 break; 1906 case kEndMarker: 1907 ReportError(RegExpError::kEscapeAtEndOfPattern); 1908 return; 1909 default: 1910 break; 1911 } 1912 1913 static constexpr InClassEscapeState kInClassEscape = 1914 InClassEscapeState::kInClass; 1915 *is_class_escape = TryParseCharacterClassEscape( 1916 next, kInClassEscape, ranges, zone, add_unicode_case_equivalents); 1917 if (*is_class_escape) return; 1918 1919 bool dummy = false; // Unused. 1920 *char_out = ParseCharacterEscape(kInClassEscape, &dummy); 1921} 1922 1923// https://tc39.es/ecma262/#prod-CharacterClassEscape 1924template <class CharT> 1925bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape( 1926 base::uc32 next, InClassEscapeState in_class_escape_state, 1927 ZoneList<CharacterRange>* ranges, Zone* zone, 1928 bool add_unicode_case_equivalents) { 1929 DCHECK_EQ(current(), '\\'); 1930 DCHECK_EQ(Next(), next); 1931 1932 switch (next) { 1933 case 'd': 1934 case 'D': 1935 case 's': 1936 case 'S': 1937 case 'w': 1938 case 'W': 1939 CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next), 1940 ranges, add_unicode_case_equivalents, 1941 zone); 1942 Advance(2); 1943 return true; 1944 case 'p': 1945 case 'P': { 1946 if (!unicode()) return false; 1947 bool negate = next == 'P'; 1948 Advance(2); 1949 ZoneVector<char> name_1(zone); 1950 ZoneVector<char> name_2(zone); 1951 if (!ParsePropertyClassName(&name_1, &name_2) || 1952 !AddPropertyClassRange(ranges, negate, name_1, name_2)) { 1953 ReportError(in_class_escape_state == InClassEscapeState::kInClass 1954 ? RegExpError::kInvalidClassPropertyName 1955 : RegExpError::kInvalidPropertyName); 1956 } 1957 return true; 1958 } 1959 default: 1960 return false; 1961 } 1962} 1963 1964template <class CharT> 1965RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass( 1966 const RegExpBuilder* builder) { 1967 DCHECK_EQ(current(), '['); 1968 Advance(); 1969 bool is_negated = false; 1970 if (current() == '^') { 1971 is_negated = true; 1972 Advance(); 1973 } 1974 ZoneList<CharacterRange>* ranges = 1975 zone()->template New<ZoneList<CharacterRange>>(2, zone()); 1976 bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); 1977 while (has_more() && current() != ']') { 1978 base::uc32 char_1, char_2; 1979 bool is_class_1, is_class_2; 1980 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, 1981 &is_class_1 CHECK_FAILED); 1982 if (current() == '-') { 1983 Advance(); 1984 if (current() == kEndMarker) { 1985 // If we reach the end we break out of the loop and let the 1986 // following code report an error. 1987 break; 1988 } else if (current() == ']') { 1989 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); 1990 ranges->Add(CharacterRange::Singleton('-'), zone()); 1991 break; 1992 } 1993 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, 1994 &is_class_2 CHECK_FAILED); 1995 if (is_class_1 || is_class_2) { 1996 // Either end is an escaped character class. Treat the '-' verbatim. 1997 if (unicode()) { 1998 // ES2015 21.2.2.15.1 step 1. 1999 return ReportError(RegExpError::kInvalidCharacterClass); 2000 } 2001 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); 2002 ranges->Add(CharacterRange::Singleton('-'), zone()); 2003 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); 2004 continue; 2005 } 2006 // ES2015 21.2.2.15.1 step 6. 2007 if (char_1 > char_2) { 2008 return ReportError(RegExpError::kOutOfOrderCharacterClass); 2009 } 2010 ranges->Add(CharacterRange::Range(char_1, char_2), zone()); 2011 } else { 2012 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); 2013 } 2014 } 2015 if (!has_more()) { 2016 return ReportError(RegExpError::kUnterminatedCharacterClass); 2017 } 2018 Advance(); 2019 RegExpCharacterClass::CharacterClassFlags character_class_flags; 2020 if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; 2021 return zone()->template New<RegExpCharacterClass>(zone(), ranges, 2022 character_class_flags); 2023} 2024 2025#undef CHECK_FAILED 2026 2027template <class CharT> 2028bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) { 2029 DCHECK_NOT_NULL(result); 2030 RegExpTree* tree = ParsePattern(); 2031 2032 if (failed()) { 2033 DCHECK_NULL(tree); 2034 DCHECK_NE(error_, RegExpError::kNone); 2035 result->error = error_; 2036 result->error_pos = error_pos_; 2037 return false; 2038 } 2039 2040 DCHECK_NOT_NULL(tree); 2041 DCHECK_EQ(error_, RegExpError::kNone); 2042 if (FLAG_trace_regexp_parser) { 2043 StdoutStream os; 2044 tree->Print(os, zone()); 2045 os << "\n"; 2046 } 2047 2048 result->tree = tree; 2049 const int capture_count = captures_started(); 2050 result->simple = tree->IsAtom() && simple() && capture_count == 0; 2051 result->contains_anchor = contains_anchor(); 2052 result->capture_count = capture_count; 2053 result->named_captures = GetNamedCaptures(); 2054 return true; 2055} 2056 2057void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) { 2058 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 2059 FlushPendingSurrogate(); 2060 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. 2061 pending_surrogate_ = lead_surrogate; 2062} 2063 2064void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) { 2065 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); 2066 if (pending_surrogate_ != kNoPendingSurrogate) { 2067 base::uc16 lead_surrogate = pending_surrogate_; 2068 pending_surrogate_ = kNoPendingSurrogate; 2069 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 2070 base::uc32 combined = 2071 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); 2072 if (NeedsDesugaringForIgnoreCase(combined)) { 2073 AddCharacterClassForDesugaring(combined); 2074 } else { 2075 ZoneList<base::uc16> surrogate_pair(2, zone()); 2076 surrogate_pair.Add(lead_surrogate, zone()); 2077 surrogate_pair.Add(trail_surrogate, zone()); 2078 RegExpAtom* atom = 2079 zone()->New<RegExpAtom>(surrogate_pair.ToConstVector()); 2080 AddAtom(atom); 2081 } 2082 } else { 2083 pending_surrogate_ = trail_surrogate; 2084 FlushPendingSurrogate(); 2085 } 2086} 2087 2088void RegExpBuilder::FlushPendingSurrogate() { 2089 if (pending_surrogate_ != kNoPendingSurrogate) { 2090 DCHECK(unicode()); 2091 base::uc32 c = pending_surrogate_; 2092 pending_surrogate_ = kNoPendingSurrogate; 2093 AddCharacterClassForDesugaring(c); 2094 } 2095} 2096 2097void RegExpBuilder::FlushCharacters() { 2098 FlushPendingSurrogate(); 2099 pending_empty_ = false; 2100 if (characters_ != nullptr) { 2101 RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector()); 2102 characters_ = nullptr; 2103 text_.emplace_back(atom); 2104 LAST(ADD_ATOM); 2105 } 2106} 2107 2108void RegExpBuilder::FlushText() { 2109 FlushCharacters(); 2110 size_t num_text = text_.size(); 2111 if (num_text == 0) { 2112 return; 2113 } else if (num_text == 1) { 2114 terms_.emplace_back(text_.back()); 2115 } else { 2116 RegExpText* text = zone()->New<RegExpText>(zone()); 2117 for (size_t i = 0; i < num_text; i++) { 2118 text_[i]->AppendToText(text, zone()); 2119 } 2120 terms_.emplace_back(text); 2121 } 2122 text_.clear(); 2123} 2124 2125void RegExpBuilder::AddCharacter(base::uc16 c) { 2126 FlushPendingSurrogate(); 2127 pending_empty_ = false; 2128 if (NeedsDesugaringForIgnoreCase(c)) { 2129 AddCharacterClassForDesugaring(c); 2130 } else { 2131 if (characters_ == nullptr) { 2132 characters_ = zone()->New<ZoneList<base::uc16>>(4, zone()); 2133 } 2134 characters_->Add(c, zone()); 2135 LAST(ADD_CHAR); 2136 } 2137} 2138 2139void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) { 2140 if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 2141 DCHECK(unicode()); 2142 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); 2143 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 2144 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 2145 AddLeadSurrogate(c); 2146 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { 2147 AddTrailSurrogate(c); 2148 } else { 2149 AddCharacter(static_cast<base::uc16>(c)); 2150 } 2151} 2152 2153void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) { 2154 // A lead or trail surrogate parsed via escape sequence will not 2155 // pair up with any preceding lead or following trail surrogate. 2156 FlushPendingSurrogate(); 2157 AddUnicodeCharacter(character); 2158 FlushPendingSurrogate(); 2159} 2160 2161void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 2162 2163void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 2164 if (NeedsDesugaringForUnicode(cc)) { 2165 // With /u, character class needs to be desugared, so it 2166 // must be a standalone term instead of being part of a RegExpText. 2167 AddTerm(cc); 2168 } else { 2169 AddAtom(cc); 2170 } 2171} 2172 2173void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) { 2174 AddTerm(zone()->New<RegExpCharacterClass>( 2175 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)))); 2176} 2177 2178void RegExpBuilder::AddAtom(RegExpTree* term) { 2179 if (term->IsEmpty()) { 2180 AddEmpty(); 2181 return; 2182 } 2183 if (term->IsTextElement()) { 2184 FlushCharacters(); 2185 text_.emplace_back(term); 2186 } else { 2187 FlushText(); 2188 terms_.emplace_back(term); 2189 } 2190 LAST(ADD_ATOM); 2191} 2192 2193void RegExpBuilder::AddTerm(RegExpTree* term) { 2194 FlushText(); 2195 terms_.emplace_back(term); 2196 LAST(ADD_ATOM); 2197} 2198 2199void RegExpBuilder::AddAssertion(RegExpTree* assert) { 2200 FlushText(); 2201 terms_.emplace_back(assert); 2202 LAST(ADD_ASSERT); 2203} 2204 2205void RegExpBuilder::NewAlternative() { FlushTerms(); } 2206 2207void RegExpBuilder::FlushTerms() { 2208 FlushText(); 2209 size_t num_terms = terms_.size(); 2210 RegExpTree* alternative; 2211 if (num_terms == 0) { 2212 alternative = zone()->New<RegExpEmpty>(); 2213 } else if (num_terms == 1) { 2214 alternative = terms_.back(); 2215 } else { 2216 alternative = 2217 zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( 2218 base::VectorOf(terms_.begin(), terms_.size()), zone())); 2219 } 2220 alternatives_.emplace_back(alternative); 2221 terms_.clear(); 2222 LAST(ADD_NONE); 2223} 2224 2225bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { 2226 if (!unicode()) return false; 2227 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not 2228 // necessarily mean that we need to desugar. It's probably nicer to have a 2229 // separate pass to figure out unicode desugarings. 2230 if (ignore_case()) return true; 2231 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2232 CharacterRange::Canonicalize(ranges); 2233 for (int i = ranges->length() - 1; i >= 0; i--) { 2234 base::uc32 from = ranges->at(i).from(); 2235 base::uc32 to = ranges->at(i).to(); 2236 // Check for non-BMP characters. 2237 if (to >= kNonBmpStart) return true; 2238 // Check for lone surrogates. 2239 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; 2240 } 2241 return false; 2242} 2243 2244bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) { 2245#ifdef V8_INTL_SUPPORT 2246 if (unicode() && ignore_case()) { 2247 icu::UnicodeSet set(c, c); 2248 set.closeOver(USET_CASE_INSENSITIVE); 2249 set.removeAllStrings(); 2250 return set.size() > 1; 2251 } 2252 // In the case where ICU is not included, we act as if the unicode flag is 2253 // not set, and do not desugar. 2254#endif // V8_INTL_SUPPORT 2255 return false; 2256} 2257 2258RegExpTree* RegExpBuilder::ToRegExp() { 2259 FlushTerms(); 2260 size_t num_alternatives = alternatives_.size(); 2261 if (num_alternatives == 0) return zone()->New<RegExpEmpty>(); 2262 if (num_alternatives == 1) return alternatives_.back(); 2263 return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>( 2264 base::VectorOf(alternatives_.begin(), alternatives_.size()), zone())); 2265} 2266 2267bool RegExpBuilder::AddQuantifierToAtom( 2268 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 2269 FlushPendingSurrogate(); 2270 if (pending_empty_) { 2271 pending_empty_ = false; 2272 return true; 2273 } 2274 RegExpTree* atom; 2275 if (characters_ != nullptr) { 2276 DCHECK(last_added_ == ADD_CHAR); 2277 // Last atom was character. 2278 base::Vector<const base::uc16> char_vector = characters_->ToConstVector(); 2279 int num_chars = char_vector.length(); 2280 if (num_chars > 1) { 2281 base::Vector<const base::uc16> prefix = 2282 char_vector.SubVector(0, num_chars - 1); 2283 text_.emplace_back(zone()->New<RegExpAtom>(prefix)); 2284 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 2285 } 2286 characters_ = nullptr; 2287 atom = zone()->New<RegExpAtom>(char_vector); 2288 FlushText(); 2289 } else if (text_.size() > 0) { 2290 DCHECK(last_added_ == ADD_ATOM); 2291 atom = text_.back(); 2292 text_.pop_back(); 2293 FlushText(); 2294 } else if (terms_.size() > 0) { 2295 DCHECK(last_added_ == ADD_ATOM); 2296 atom = terms_.back(); 2297 terms_.pop_back(); 2298 if (atom->IsLookaround()) { 2299 // With /u, lookarounds are not quantifiable. 2300 if (unicode()) return false; 2301 // Lookbehinds are not quantifiable. 2302 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) { 2303 return false; 2304 } 2305 } 2306 if (atom->max_match() == 0) { 2307 // Guaranteed to only match an empty string. 2308 LAST(ADD_TERM); 2309 if (min == 0) { 2310 return true; 2311 } 2312 terms_.emplace_back(atom); 2313 return true; 2314 } 2315 } else { 2316 // Only call immediately after adding an atom or character! 2317 UNREACHABLE(); 2318 } 2319 terms_.emplace_back( 2320 zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom)); 2321 LAST(ADD_TERM); 2322 return true; 2323} 2324 2325template class RegExpParserImpl<uint8_t>; 2326template class RegExpParserImpl<base::uc16>; 2327 2328} // namespace 2329 2330// static 2331bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone, 2332 Handle<String> input, 2333 RegExpFlags flags, 2334 RegExpCompileData* result) { 2335 DisallowGarbageCollection no_gc; 2336 uintptr_t stack_limit = isolate->stack_guard()->real_climit(); 2337 String::FlatContent content = input->GetFlatContent(no_gc); 2338 if (content.IsOneByte()) { 2339 base::Vector<const uint8_t> v = content.ToOneByteVector(); 2340 return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags, 2341 stack_limit, zone, no_gc} 2342 .Parse(result); 2343 } else { 2344 base::Vector<const base::uc16> v = content.ToUC16Vector(); 2345 return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags, 2346 stack_limit, zone, no_gc} 2347 .Parse(result); 2348 } 2349} 2350 2351// static 2352template <class CharT> 2353bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit, 2354 const CharT* input, int input_length, 2355 RegExpFlags flags, 2356 RegExpCompileData* result, 2357 const DisallowGarbageCollection& no_gc) { 2358 return RegExpParserImpl<CharT>{input, input_length, flags, 2359 stack_limit, zone, no_gc} 2360 .Parse(result); 2361} 2362 2363template bool RegExpParser::VerifyRegExpSyntax<uint8_t>( 2364 Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*, 2365 const DisallowGarbageCollection&); 2366template bool RegExpParser::VerifyRegExpSyntax<base::uc16>( 2367 Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*, 2368 const DisallowGarbageCollection&); 2369 2370// static 2371bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone, 2372 Handle<String> input, RegExpFlags flags, 2373 RegExpCompileData* result, 2374 const DisallowGarbageCollection&) { 2375 return ParseRegExpFromHeapString(isolate, zone, input, flags, result); 2376} 2377 2378#undef LAST 2379 2380} // namespace internal 2381} // namespace v8 2382