1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/base/small-vector.h"
8#include "src/execution/isolate.h"
9#include "src/objects/string-inl.h"
10#include "src/regexp/property-sequences.h"
11#include "src/regexp/regexp-ast.h"
12#include "src/regexp/regexp-macro-assembler.h"
13#include "src/regexp/regexp.h"
14#include "src/strings/char-predicates-inl.h"
15#include "src/utils/ostreams.h"
16#include "src/utils/utils.h"
17#include "src/zone/zone-allocator.h"
18#include "src/zone/zone-list-inl.h"
19
20#ifdef V8_INTL_SUPPORT
21#include "unicode/uniset.h"
22#endif  // V8_INTL_SUPPORT
23
24namespace v8 {
25namespace internal {
26
27namespace {
28
29// Whether we're currently inside the ClassEscape production
30// (tc39.es/ecma262/#prod-annexB-CharacterEscape).
31enum class InClassEscapeState {
32  kInClass,
33  kNotInClass,
34};
35
36// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
37class RegExpBuilder {
38 public:
39  RegExpBuilder(Zone* zone, RegExpFlags flags)
40      : zone_(zone),
41        flags_(flags),
42        terms_(ZoneAllocator<RegExpTree*>{zone}),
43        text_(ZoneAllocator<RegExpTree*>{zone}),
44        alternatives_(ZoneAllocator<RegExpTree*>{zone}) {}
45  void AddCharacter(base::uc16 character);
46  void AddUnicodeCharacter(base::uc32 character);
47  void AddEscapedUnicodeCharacter(base::uc32 character);
48  // "Adds" an empty expression. Does nothing except consume a
49  // following quantifier
50  void AddEmpty();
51  void AddCharacterClass(RegExpCharacterClass* cc);
52  void AddCharacterClassForDesugaring(base::uc32 c);
53  void AddAtom(RegExpTree* tree);
54  void AddTerm(RegExpTree* tree);
55  void AddAssertion(RegExpTree* tree);
56  void NewAlternative();  // '|'
57  bool AddQuantifierToAtom(int min, int max,
58                           RegExpQuantifier::QuantifierType type);
59  void FlushText();
60  RegExpTree* ToRegExp();
61  RegExpFlags flags() const { return flags_; }
62
63  bool ignore_case() const { return IsIgnoreCase(flags_); }
64  bool multiline() const { return IsMultiline(flags_); }
65  bool dotall() const { return IsDotAll(flags_); }
66
67 private:
68  static const base::uc16 kNoPendingSurrogate = 0;
69  void AddLeadSurrogate(base::uc16 lead_surrogate);
70  void AddTrailSurrogate(base::uc16 trail_surrogate);
71  void FlushPendingSurrogate();
72  void FlushCharacters();
73  void FlushTerms();
74  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
75  bool NeedsDesugaringForIgnoreCase(base::uc32 c);
76  Zone* zone() const { return zone_; }
77  bool unicode() const { return IsUnicode(flags_); }
78
79  Zone* const zone_;
80  bool pending_empty_ = false;
81  const RegExpFlags flags_;
82  ZoneList<base::uc16>* characters_ = nullptr;
83  base::uc16 pending_surrogate_ = kNoPendingSurrogate;
84
85  using SmallRegExpTreeVector =
86      base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
87  SmallRegExpTreeVector terms_;
88  SmallRegExpTreeVector text_;
89  SmallRegExpTreeVector alternatives_;
90#ifdef DEBUG
91  enum {
92    ADD_NONE,
93    ADD_CHAR,
94    ADD_TERM,
95    ADD_ASSERT,
96    ADD_ATOM
97  } last_added_ = ADD_NONE;
98#define LAST(x) last_added_ = x;
99#else
100#define LAST(x)
101#endif
102};
103
104enum SubexpressionType {
105  INITIAL,
106  CAPTURE,  // All positive values represent captures.
107  POSITIVE_LOOKAROUND,
108  NEGATIVE_LOOKAROUND,
109  GROUPING
110};
111
112class RegExpParserState : public ZoneObject {
113 public:
114  // Push a state on the stack.
115  RegExpParserState(RegExpParserState* previous_state,
116                    SubexpressionType group_type,
117                    RegExpLookaround::Type lookaround_type,
118                    int disjunction_capture_index,
119                    const ZoneVector<base::uc16>* capture_name,
120                    RegExpFlags flags, Zone* zone)
121      : previous_state_(previous_state),
122        builder_(zone, flags),
123        group_type_(group_type),
124        lookaround_type_(lookaround_type),
125        disjunction_capture_index_(disjunction_capture_index),
126        capture_name_(capture_name) {}
127  // Parser state of containing expression, if any.
128  RegExpParserState* previous_state() const { return previous_state_; }
129  bool IsSubexpression() { return previous_state_ != nullptr; }
130  // RegExpBuilder building this regexp's AST.
131  RegExpBuilder* builder() { return &builder_; }
132  // Type of regexp being parsed (parenthesized group or entire regexp).
133  SubexpressionType group_type() const { return group_type_; }
134  // Lookahead or Lookbehind.
135  RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
136  // Index in captures array of first capture in this sub-expression, if any.
137  // Also the capture index of this sub-expression itself, if group_type
138  // is CAPTURE.
139  int capture_index() const { return disjunction_capture_index_; }
140  // The name of the current sub-expression, if group_type is CAPTURE. Only
141  // used for named captures.
142  const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
143
144  bool IsNamedCapture() const { return capture_name_ != nullptr; }
145
146  // Check whether the parser is inside a capture group with the given index.
147  bool IsInsideCaptureGroup(int index) const {
148    for (const RegExpParserState* s = this; s != nullptr;
149         s = s->previous_state()) {
150      if (s->group_type() != CAPTURE) continue;
151      // Return true if we found the matching capture index.
152      if (index == s->capture_index()) return true;
153      // Abort if index is larger than what has been parsed up till this state.
154      if (index > s->capture_index()) return false;
155    }
156    return false;
157  }
158
159  // Check whether the parser is inside a capture group with the given name.
160  bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
161    DCHECK_NOT_NULL(name);
162    for (const RegExpParserState* s = this; s != nullptr;
163         s = s->previous_state()) {
164      if (s->capture_name() == nullptr) continue;
165      if (*s->capture_name() == *name) return true;
166    }
167    return false;
168  }
169
170 private:
171  // Linked list implementation of stack of states.
172  RegExpParserState* const previous_state_;
173  // Builder for the stored disjunction.
174  RegExpBuilder builder_;
175  // Stored disjunction type (capture, look-ahead or grouping), if any.
176  const SubexpressionType group_type_;
177  // Stored read direction.
178  const RegExpLookaround::Type lookaround_type_;
179  // Stored disjunction's capture index (if any).
180  const int disjunction_capture_index_;
181  // Stored capture name (if any).
182  const ZoneVector<base::uc16>* const capture_name_;
183};
184
185template <class CharT>
186class RegExpParserImpl final {
187 private:
188  RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
189                   uintptr_t stack_limit, Zone* zone,
190                   const DisallowGarbageCollection& no_gc);
191
192  bool Parse(RegExpCompileData* result);
193
194  RegExpTree* ParsePattern();
195  RegExpTree* ParseDisjunction();
196  RegExpTree* ParseGroup();
197
198  // Parses a {...,...} quantifier and stores the range in the given
199  // out parameters.
200  bool ParseIntervalQuantifier(int* min_out, int* max_out);
201
202  // Checks whether the following is a length-digit hexadecimal number,
203  // and sets the value if it is.
204  bool ParseHexEscape(int length, base::uc32* value);
205  bool ParseUnicodeEscape(base::uc32* value);
206  bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
207
208  bool ParsePropertyClassName(ZoneVector<char>* name_1,
209                              ZoneVector<char>* name_2);
210  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
211                             const ZoneVector<char>& name_1,
212                             const ZoneVector<char>& name_2);
213
214  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
215
216  base::uc32 ParseOctalLiteral();
217
218  // Tries to parse the input as a back reference.  If successful it
219  // stores the result in the output parameter and returns true.  If
220  // it fails it will push back the characters read so the same characters
221  // can be reparsed.
222  bool ParseBackReferenceIndex(int* index_out);
223
224  // Parse inside a class. Either add escaped class to the range, or return
225  // false and pass parsed single character through |char_out|.
226  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
227                        bool add_unicode_case_equivalents, base::uc32* char_out,
228                        bool* is_class_escape);
229  // Returns true iff parsing was successful.
230  bool TryParseCharacterClassEscape(base::uc32 next,
231                                    InClassEscapeState in_class_escape_state,
232                                    ZoneList<CharacterRange>* ranges,
233                                    Zone* zone,
234                                    bool add_unicode_case_equivalents);
235  // Parses and returns a single escaped character.
236  base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
237                                  bool* is_escaped_unicode_character);
238
239  RegExpTree* ReportError(RegExpError error);
240  void Advance();
241  void Advance(int dist);
242  void RewindByOneCodepoint();  // Rewinds to before the previous Advance().
243  void Reset(int pos);
244
245  // Reports whether the pattern might be used as a literal search string.
246  // Only use if the result of the parse is a single atom node.
247  bool simple() const { return simple_; }
248  bool contains_anchor() const { return contains_anchor_; }
249  void set_contains_anchor() { contains_anchor_ = true; }
250  int captures_started() const { return captures_started_; }
251  int position() const { return next_pos_ - 1; }
252  bool failed() const { return failed_; }
253  bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; }
254
255  static bool IsSyntaxCharacterOrSlash(base::uc32 c);
256
257  static const base::uc32 kEndMarker = (1 << 21);
258
259 private:
260  // Return the 1-indexed RegExpCapture object, allocate if necessary.
261  RegExpCapture* GetCapture(int index);
262
263  // Creates a new named capture at the specified index. Must be called exactly
264  // once for each named capture. Fails if a capture with the same name is
265  // encountered.
266  bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);
267
268  // Parses the name of a capture group (?<name>pattern). The name must adhere
269  // to IdentifierName in the ECMAScript standard.
270  const ZoneVector<base::uc16>* ParseCaptureGroupName();
271
272  bool ParseNamedBackReference(RegExpBuilder* builder,
273                               RegExpParserState* state);
274  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
275
276  // After the initial parsing pass, patch corresponding RegExpCapture objects
277  // into all RegExpBackReferences. This is done after initial parsing in order
278  // to avoid complicating cases in which references comes before the capture.
279  void PatchNamedBackReferences();
280
281  ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
282
283  // Returns true iff the pattern contains named captures. May call
284  // ScanForCaptures to look ahead at the remaining pattern.
285  bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
286
287  Zone* zone() const { return zone_; }
288
289  base::uc32 current() const { return current_; }
290  bool has_more() const { return has_more_; }
291  bool has_next() const { return next_pos_ < input_length(); }
292  base::uc32 Next();
293  template <bool update_position>
294  base::uc32 ReadNext();
295  CharT InputAt(int index) const {
296    DCHECK(0 <= index && index < input_length());
297    return input_[index];
298  }
299  int input_length() const { return input_length_; }
300  void ScanForCaptures(InClassEscapeState in_class_escape_state);
301
302  struct RegExpCaptureNameLess {
303    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
304      DCHECK_NOT_NULL(lhs);
305      DCHECK_NOT_NULL(rhs);
306      return *lhs->name() < *rhs->name();
307    }
308  };
309
310  class ForceUnicodeScope final {
311   public:
312    explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
313        : parser_(parser) {
314      DCHECK(!parser_->force_unicode_);
315      parser_->force_unicode_ = true;
316    }
317    ~ForceUnicodeScope() {
318      DCHECK(parser_->force_unicode_);
319      parser_->force_unicode_ = false;
320    }
321
322   private:
323    RegExpParserImpl<CharT>* const parser_;
324  };
325
326  const DisallowGarbageCollection no_gc_;
327  Zone* const zone_;
328  RegExpError error_ = RegExpError::kNone;
329  int error_pos_ = 0;
330  ZoneList<RegExpCapture*>* captures_;
331  ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
332  ZoneList<RegExpBackReference*>* named_back_references_;
333  const CharT* const input_;
334  const int input_length_;
335  base::uc32 current_;
336  const RegExpFlags top_level_flags_;
337  bool force_unicode_ = false;  // Force parser to act as if unicode were set.
338  int next_pos_;
339  int captures_started_;
340  int capture_count_;  // Only valid after we have scanned for captures.
341  bool has_more_;
342  bool simple_;
343  bool contains_anchor_;
344  bool is_scanned_for_captures_;
345  bool has_named_captures_;  // Only valid after we have scanned for captures.
346  bool failed_;
347  const uintptr_t stack_limit_;
348
349  friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*,
350                                                      Handle<String>,
351                                                      RegExpFlags,
352                                                      RegExpCompileData*);
353  friend bool RegExpParser::VerifyRegExpSyntax<CharT>(
354      Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*,
355      const DisallowGarbageCollection&);
356};
357
358template <class CharT>
359RegExpParserImpl<CharT>::RegExpParserImpl(
360    const CharT* input, int input_length, RegExpFlags flags,
361    uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
362    : zone_(zone),
363      captures_(nullptr),
364      named_captures_(nullptr),
365      named_back_references_(nullptr),
366      input_(input),
367      input_length_(input_length),
368      current_(kEndMarker),
369      top_level_flags_(flags),
370      next_pos_(0),
371      captures_started_(0),
372      capture_count_(0),
373      has_more_(true),
374      simple_(false),
375      contains_anchor_(false),
376      is_scanned_for_captures_(false),
377      has_named_captures_(false),
378      failed_(false),
379      stack_limit_(stack_limit) {
380  Advance();
381}
382
383template <>
384template <bool update_position>
385inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
386  int position = next_pos_;
387  base::uc16 c0 = InputAt(position);
388  position++;
389  DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));
390  if (update_position) next_pos_ = position;
391  return c0;
392}
393
394template <>
395template <bool update_position>
396inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
397  int position = next_pos_;
398  base::uc16 c0 = InputAt(position);
399  base::uc32 result = c0;
400  position++;
401  // Read the whole surrogate pair in case of unicode flag, if possible.
402  if (unicode() && position < input_length() &&
403      unibrow::Utf16::IsLeadSurrogate(c0)) {
404    base::uc16 c1 = InputAt(position);
405    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
406      result = unibrow::Utf16::CombineSurrogatePair(c0, c1);
407      position++;
408    }
409  }
410  if (update_position) next_pos_ = position;
411  return result;
412}
413
414template <class CharT>
415base::uc32 RegExpParserImpl<CharT>::Next() {
416  if (has_next()) {
417    return ReadNext<false>();
418  } else {
419    return kEndMarker;
420  }
421}
422
423template <class CharT>
424void RegExpParserImpl<CharT>::Advance() {
425  if (has_next()) {
426    if (GetCurrentStackPosition() < stack_limit_) {
427      if (FLAG_correctness_fuzzer_suppressions) {
428        FATAL("Aborting on stack overflow");
429      }
430      ReportError(RegExpError::kStackOverflow);
431    } else {
432      current_ = ReadNext<true>();
433    }
434  } else {
435    current_ = kEndMarker;
436    // Advance so that position() points to 1-after-the-last-character. This is
437    // important so that Reset() to this position works correctly.
438    next_pos_ = input_length() + 1;
439    has_more_ = false;
440  }
441}
442
443template <class CharT>
444void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
445  if (current() == kEndMarker) return;
446  // Rewinds by one code point, i.e.: two code units if `current` is outside
447  // the basic multilingual plane (= composed of a lead and trail surrogate),
448  // or one code unit otherwise.
449  const int rewind_by =
450      current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1;
451  Advance(rewind_by);  // Undo the last Advance.
452}
453
454template <class CharT>
455void RegExpParserImpl<CharT>::Reset(int pos) {
456  next_pos_ = pos;
457  has_more_ = (pos < input_length());
458  Advance();
459}
460
461template <class CharT>
462void RegExpParserImpl<CharT>::Advance(int dist) {
463  next_pos_ += dist - 1;
464  Advance();
465}
466
467template <class CharT>
468bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
469  switch (c) {
470    case '^':
471    case '$':
472    case '\\':
473    case '.':
474    case '*':
475    case '+':
476    case '?':
477    case '(':
478    case ')':
479    case '[':
480    case ']':
481    case '{':
482    case '}':
483    case '|':
484    case '/':
485      return true;
486    default:
487      break;
488  }
489  return false;
490}
491
492template <class CharT>
493RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
494  if (failed_) return nullptr;  // Do not overwrite any existing error.
495  failed_ = true;
496  error_ = error;
497  error_pos_ = position();
498  // Zip to the end to make sure no more input is read.
499  current_ = kEndMarker;
500  next_pos_ = input_length();
501  return nullptr;
502}
503
504#define CHECK_FAILED /**/);    \
505  if (failed_) return nullptr; \
506  ((void)0
507
508// Pattern ::
509//   Disjunction
510template <class CharT>
511RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
512  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
513  PatchNamedBackReferences(CHECK_FAILED);
514  DCHECK(!has_more());
515  // If the result of parsing is a literal string atom, and it has the
516  // same length as the input, then the atom is identical to the input.
517  if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
518    simple_ = true;
519  }
520  return result;
521}
522
523// Disjunction ::
524//   Alternative
525//   Alternative | Disjunction
526// Alternative ::
527//   [empty]
528//   Term Alternative
529// Term ::
530//   Assertion
531//   Atom
532//   Atom Quantifier
533template <class CharT>
534RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
535  // Used to store current state while parsing subexpressions.
536  RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
537                                  0, nullptr, top_level_flags_, zone());
538  RegExpParserState* state = &initial_state;
539  // Cache the builder in a local variable for quick access.
540  RegExpBuilder* builder = initial_state.builder();
541  while (true) {
542    switch (current()) {
543      case kEndMarker:
544        if (failed()) return nullptr;  // E.g. the initial Advance failed.
545        if (state->IsSubexpression()) {
546          // Inside a parenthesized group when hitting end of input.
547          return ReportError(RegExpError::kUnterminatedGroup);
548        }
549        DCHECK_EQ(INITIAL, state->group_type());
550        // Parsing completed successfully.
551        return builder->ToRegExp();
552      case ')': {
553        if (!state->IsSubexpression()) {
554          return ReportError(RegExpError::kUnmatchedParen);
555        }
556        DCHECK_NE(INITIAL, state->group_type());
557
558        Advance();
559        // End disjunction parsing and convert builder content to new single
560        // regexp atom.
561        RegExpTree* body = builder->ToRegExp();
562
563        int end_capture_index = captures_started();
564
565        int capture_index = state->capture_index();
566        SubexpressionType group_type = state->group_type();
567
568        // Build result of subexpression.
569        if (group_type == CAPTURE) {
570          if (state->IsNamedCapture()) {
571            CreateNamedCaptureAtIndex(state->capture_name(),
572                                      capture_index CHECK_FAILED);
573          }
574          RegExpCapture* capture = GetCapture(capture_index);
575          capture->set_body(body);
576          body = capture;
577        } else if (group_type == GROUPING) {
578          body = zone()->template New<RegExpGroup>(body);
579        } else {
580          DCHECK(group_type == POSITIVE_LOOKAROUND ||
581                 group_type == NEGATIVE_LOOKAROUND);
582          bool is_positive = (group_type == POSITIVE_LOOKAROUND);
583          body = zone()->template New<RegExpLookaround>(
584              body, is_positive, end_capture_index - capture_index,
585              capture_index, state->lookaround_type());
586        }
587
588        // Restore previous state.
589        state = state->previous_state();
590        builder = state->builder();
591
592        builder->AddAtom(body);
593        // For compatibility with JSC and ES3, we allow quantifiers after
594        // lookaheads, and break in all cases.
595        break;
596      }
597      case '|': {
598        Advance();
599        builder->NewAlternative();
600        continue;
601      }
602      case '*':
603      case '+':
604      case '?':
605        return ReportError(RegExpError::kNothingToRepeat);
606      case '^': {
607        Advance();
608        builder->AddAssertion(zone()->template New<RegExpAssertion>(
609            builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
610                                 : RegExpAssertion::Type::START_OF_INPUT));
611        set_contains_anchor();
612        continue;
613      }
614      case '$': {
615        Advance();
616        RegExpAssertion::Type assertion_type =
617            builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
618                                 : RegExpAssertion::Type::END_OF_INPUT;
619        builder->AddAssertion(
620            zone()->template New<RegExpAssertion>(assertion_type));
621        continue;
622      }
623      case '.': {
624        Advance();
625        ZoneList<CharacterRange>* ranges =
626            zone()->template New<ZoneList<CharacterRange>>(2, zone());
627
628        if (builder->dotall()) {
629          // Everything.
630          CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
631                                         ranges, false, zone());
632        } else {
633          // Everything except \x0A, \x0D, \u2028 and \u2029.
634          CharacterRange::AddClassEscape(
635              StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
636        }
637
638        RegExpCharacterClass* cc =
639            zone()->template New<RegExpCharacterClass>(zone(), ranges);
640        builder->AddCharacterClass(cc);
641        break;
642      }
643      case '(': {
644        state = ParseOpenParenthesis(state CHECK_FAILED);
645        builder = state->builder();
646        continue;
647      }
648      case '[': {
649        RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
650        builder->AddCharacterClass(cc->AsCharacterClass());
651        break;
652      }
653      // Atom ::
654      //   \ AtomEscape
655      case '\\':
656        switch (Next()) {
657          case kEndMarker:
658            return ReportError(RegExpError::kEscapeAtEndOfPattern);
659          // AtomEscape ::
660          //   [+UnicodeMode] DecimalEscape
661          //   [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
662          //                  of DecimalEscape is ≤ NcapturingParens
663          //   CharacterEscape (some cases of this mixed in too)
664          //
665          // TODO(jgruber): It may make sense to disentangle all the different
666          // cases and make the structure mirror the spec, e.g. for AtomEscape:
667          //
668          //  if (TryParseDecimalEscape(...)) return;
669          //  if (TryParseCharacterClassEscape(...)) return;
670          //  if (TryParseCharacterEscape(...)) return;
671          //  if (TryParseGroupName(...)) return;
672          case '1':
673          case '2':
674          case '3':
675          case '4':
676          case '5':
677          case '6':
678          case '7':
679          case '8':
680          case '9': {
681            int index = 0;
682            const bool is_backref =
683                ParseBackReferenceIndex(&index CHECK_FAILED);
684            if (is_backref) {
685              if (state->IsInsideCaptureGroup(index)) {
686                // The back reference is inside the capture group it refers to.
687                // Nothing can possibly have been captured yet, so we use empty
688                // instead. This ensures that, when checking a back reference,
689                // the capture registers of the referenced capture are either
690                // both set or both cleared.
691                builder->AddEmpty();
692              } else {
693                RegExpCapture* capture = GetCapture(index);
694                RegExpTree* atom = zone()->template New<RegExpBackReference>(
695                    capture, builder->flags());
696                builder->AddAtom(atom);
697              }
698              break;
699            }
700            // With /u, no identity escapes except for syntax characters
701            // are allowed. Otherwise, all identity escapes are allowed.
702            if (unicode()) {
703              return ReportError(RegExpError::kInvalidEscape);
704            }
705            base::uc32 first_digit = Next();
706            if (first_digit == '8' || first_digit == '9') {
707              builder->AddCharacter(first_digit);
708              Advance(2);
709              break;
710            }
711            V8_FALLTHROUGH;
712          }
713          case '0': {
714            Advance();
715            if (unicode() && Next() >= '0' && Next() <= '9') {
716              // With /u, decimal escape with leading 0 are not parsed as octal.
717              return ReportError(RegExpError::kInvalidDecimalEscape);
718            }
719            base::uc32 octal = ParseOctalLiteral();
720            builder->AddCharacter(octal);
721            break;
722          }
723          case 'b':
724            Advance(2);
725            builder->AddAssertion(zone()->template New<RegExpAssertion>(
726                RegExpAssertion::Type::BOUNDARY));
727            continue;
728          case 'B':
729            Advance(2);
730            builder->AddAssertion(zone()->template New<RegExpAssertion>(
731                RegExpAssertion::Type::NON_BOUNDARY));
732            continue;
733          // AtomEscape ::
734          //   CharacterClassEscape
735          case 'd':
736          case 'D':
737          case 's':
738          case 'S':
739          case 'w':
740          case 'W':
741          case 'p':
742          case 'P': {
743            base::uc32 next = Next();
744            ZoneList<CharacterRange>* ranges =
745                zone()->template New<ZoneList<CharacterRange>>(2, zone());
746            bool add_unicode_case_equivalents =
747                unicode() && builder->ignore_case();
748            bool parsed_character_class_escape = TryParseCharacterClassEscape(
749                next, InClassEscapeState::kNotInClass, ranges, zone(),
750                add_unicode_case_equivalents CHECK_FAILED);
751
752            if (parsed_character_class_escape) {
753              RegExpCharacterClass* cc =
754                  zone()->template New<RegExpCharacterClass>(zone(), ranges);
755              builder->AddCharacterClass(cc);
756            } else {
757              CHECK(!unicode());
758              Advance(2);
759              builder->AddCharacter(next);  // IdentityEscape.
760            }
761            break;
762          }
763          // AtomEscape ::
764          //   k GroupName
765          case 'k': {
766            // Either an identity escape or a named back-reference.  The two
767            // interpretations are mutually exclusive: '\k' is interpreted as
768            // an identity escape for non-Unicode patterns without named
769            // capture groups, and as the beginning of a named back-reference
770            // in all other cases.
771            const bool has_named_captures =
772                HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
773            if (unicode() || has_named_captures) {
774              Advance(2);
775              ParseNamedBackReference(builder, state CHECK_FAILED);
776              break;
777            }
778          }
779            V8_FALLTHROUGH;
780          // AtomEscape ::
781          //   CharacterEscape
782          default: {
783            bool is_escaped_unicode_character = false;
784            base::uc32 c = ParseCharacterEscape(
785                InClassEscapeState::kNotInClass,
786                &is_escaped_unicode_character CHECK_FAILED);
787            if (is_escaped_unicode_character) {
788              builder->AddEscapedUnicodeCharacter(c);
789            } else {
790              builder->AddCharacter(c);
791            }
792            break;
793          }
794        }
795        break;
796      case '{': {
797        int dummy;
798        bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
799        if (parsed) return ReportError(RegExpError::kNothingToRepeat);
800        V8_FALLTHROUGH;
801      }
802      case '}':
803      case ']':
804        if (unicode()) {
805          return ReportError(RegExpError::kLoneQuantifierBrackets);
806        }
807        V8_FALLTHROUGH;
808      default:
809        builder->AddUnicodeCharacter(current());
810        Advance();
811        break;
812    }  // end switch(current())
813
814    int min;
815    int max;
816    switch (current()) {
817      // QuantifierPrefix ::
818      //   *
819      //   +
820      //   ?
821      //   {
822      case '*':
823        min = 0;
824        max = RegExpTree::kInfinity;
825        Advance();
826        break;
827      case '+':
828        min = 1;
829        max = RegExpTree::kInfinity;
830        Advance();
831        break;
832      case '?':
833        min = 0;
834        max = 1;
835        Advance();
836        break;
837      case '{':
838        if (ParseIntervalQuantifier(&min, &max)) {
839          if (max < min) {
840            return ReportError(RegExpError::kRangeOutOfOrder);
841          }
842          break;
843        } else if (unicode()) {
844          // With /u, incomplete quantifiers are not allowed.
845          return ReportError(RegExpError::kIncompleteQuantifier);
846        }
847        continue;
848      default:
849        continue;
850    }
851    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
852    if (current() == '?') {
853      quantifier_type = RegExpQuantifier::NON_GREEDY;
854      Advance();
855    } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
856      // FLAG_regexp_possessive_quantifier is a debug-only flag.
857      quantifier_type = RegExpQuantifier::POSSESSIVE;
858      Advance();
859    }
860    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
861      return ReportError(RegExpError::kInvalidQuantifier);
862    }
863  }
864}
865
866template <class CharT>
867RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
868    RegExpParserState* state) {
869  RegExpLookaround::Type lookaround_type = state->lookaround_type();
870  bool is_named_capture = false;
871  const ZoneVector<base::uc16>* capture_name = nullptr;
872  SubexpressionType subexpr_type = CAPTURE;
873  Advance();
874  if (current() == '?') {
875    switch (Next()) {
876      case ':':
877        Advance(2);
878        subexpr_type = GROUPING;
879        break;
880      case '=':
881        Advance(2);
882        lookaround_type = RegExpLookaround::LOOKAHEAD;
883        subexpr_type = POSITIVE_LOOKAROUND;
884        break;
885      case '!':
886        Advance(2);
887        lookaround_type = RegExpLookaround::LOOKAHEAD;
888        subexpr_type = NEGATIVE_LOOKAROUND;
889        break;
890      case '<':
891        Advance();
892        if (Next() == '=') {
893          Advance(2);
894          lookaround_type = RegExpLookaround::LOOKBEHIND;
895          subexpr_type = POSITIVE_LOOKAROUND;
896          break;
897        } else if (Next() == '!') {
898          Advance(2);
899          lookaround_type = RegExpLookaround::LOOKBEHIND;
900          subexpr_type = NEGATIVE_LOOKAROUND;
901          break;
902        }
903        is_named_capture = true;
904        has_named_captures_ = true;
905        Advance();
906        break;
907      default:
908        ReportError(RegExpError::kInvalidGroup);
909        return nullptr;
910    }
911  }
912  if (subexpr_type == CAPTURE) {
913    if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) {
914      ReportError(RegExpError::kTooManyCaptures);
915      return nullptr;
916    }
917    captures_started_++;
918
919    if (is_named_capture) {
920      capture_name = ParseCaptureGroupName(CHECK_FAILED);
921    }
922  }
923  // Store current state and begin new disjunction parsing.
924  return zone()->template New<RegExpParserState>(
925      state, subexpr_type, lookaround_type, captures_started_, capture_name,
926      state->builder()->flags(), zone());
927}
928
929#ifdef DEBUG
930namespace {
931
932bool IsSpecialClassEscape(base::uc32 c) {
933  switch (c) {
934    case 'd':
935    case 'D':
936    case 's':
937    case 'S':
938    case 'w':
939    case 'W':
940      return true;
941    default:
942      return false;
943  }
944}
945
946}  // namespace
947#endif
948
949// In order to know whether an escape is a backreference or not we have to scan
950// the entire regexp and find the number of capturing parentheses.  However we
951// don't want to scan the regexp twice unless it is necessary.  This mini-parser
952// is called when needed.  It can see the difference between capturing and
953// noncapturing parentheses and can skip character classes and backslash-escaped
954// characters.
955//
956// Important: The scanner has to be in a consistent state when calling
957// ScanForCaptures, e.g. not in the middle of an escape sequence '\['.
958template <class CharT>
959void RegExpParserImpl<CharT>::ScanForCaptures(
960    InClassEscapeState in_class_escape_state) {
961  DCHECK(!is_scanned_for_captures_);
962  const int saved_position = position();
963  // Start with captures started previous to current position
964  int capture_count = captures_started();
965  // When we start inside a character class, skip everything inside the class.
966  if (in_class_escape_state == InClassEscapeState::kInClass) {
967    int c;
968    while ((c = current()) != kEndMarker) {
969      Advance();
970      if (c == '\\') {
971        Advance();
972      } else {
973        if (c == ']') break;
974      }
975    }
976  }
977  // Add count of captures after this position.
978  int n;
979  while ((n = current()) != kEndMarker) {
980    Advance();
981    switch (n) {
982      case '\\':
983        Advance();
984        break;
985      case '[': {
986        int c;
987        while ((c = current()) != kEndMarker) {
988          Advance();
989          if (c == '\\') {
990            Advance();
991          } else {
992            if (c == ']') break;
993          }
994        }
995        break;
996      }
997      case '(':
998        if (current() == '?') {
999          // At this point we could be in
1000          // * a non-capturing group '(:',
1001          // * a lookbehind assertion '(?<=' '(?<!'
1002          // * or a named capture '(?<'.
1003          //
1004          // Of these, only named captures are capturing groups.
1005
1006          Advance();
1007          if (current() != '<') break;
1008
1009          Advance();
1010          if (current() == '=' || current() == '!') break;
1011
1012          // Found a possible named capture. It could turn out to be a syntax
1013          // error (e.g. an unterminated or invalid name), but that distinction
1014          // does not matter for our purposes.
1015          has_named_captures_ = true;
1016        }
1017        capture_count++;
1018        break;
1019    }
1020  }
1021  capture_count_ = capture_count;
1022  is_scanned_for_captures_ = true;
1023  Reset(saved_position);
1024}
1025
1026template <class CharT>
1027bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
1028  DCHECK_EQ('\\', current());
1029  DCHECK('1' <= Next() && Next() <= '9');
1030  // Try to parse a decimal literal that is no greater than the total number
1031  // of left capturing parentheses in the input.
1032  int start = position();
1033  int value = Next() - '0';
1034  Advance(2);
1035  while (true) {
1036    base::uc32 c = current();
1037    if (IsDecimalDigit(c)) {
1038      value = 10 * value + (c - '0');
1039      if (value > RegExpMacroAssembler::kMaxRegisterCount) {
1040        Reset(start);
1041        return false;
1042      }
1043      Advance();
1044    } else {
1045      break;
1046    }
1047  }
1048  if (value > captures_started()) {
1049    if (!is_scanned_for_captures_)
1050      ScanForCaptures(InClassEscapeState::kNotInClass);
1051    if (value > capture_count_) {
1052      Reset(start);
1053      return false;
1054    }
1055  }
1056  *index_out = value;
1057  return true;
1058}
1059
1060namespace {
1061
1062void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
1063  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
1064    v->push_back(code_unit);
1065  } else {
1066    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
1067    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
1068  }
1069}
1070
1071}  // namespace
1072
1073template <class CharT>
1074const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
1075  // Due to special Advance requirements (see the next comment), rewind by one
1076  // such that names starting with a surrogate pair are parsed correctly for
1077  // patterns where the unicode flag is unset.
1078  //
1079  // Note that we use this odd pattern of rewinding the last advance in order
1080  // to adhere to the common parser behavior of expecting `current` to point at
1081  // the first candidate character for a function (e.g. when entering ParseFoo,
1082  // `current` should point at the first character of Foo).
1083  RewindByOneCodepoint();
1084
1085  ZoneVector<base::uc16>* name =
1086      zone()->template New<ZoneVector<base::uc16>>(zone());
1087
1088  {
1089    // Advance behavior inside this function is tricky since
1090    // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
1091    // and thus allows surrogate pairs and \u{}-style escapes even in
1092    // non-unicode patterns. Therefore Advance within the capture group name
1093    // has to force-enable unicode, and outside the name revert to default
1094    // behavior.
1095    ForceUnicodeScope force_unicode(this);
1096
1097    bool at_start = true;
1098    while (true) {
1099      Advance();
1100      base::uc32 c = current();
1101
1102      // Convert unicode escapes.
1103      if (c == '\\' && Next() == 'u') {
1104        Advance(2);
1105        if (!ParseUnicodeEscape(&c)) {
1106          ReportError(RegExpError::kInvalidUnicodeEscape);
1107          return nullptr;
1108        }
1109        RewindByOneCodepoint();
1110      }
1111
1112      // The backslash char is misclassified as both ID_Start and ID_Continue.
1113      if (c == '\\') {
1114        ReportError(RegExpError::kInvalidCaptureGroupName);
1115        return nullptr;
1116      }
1117
1118      if (at_start) {
1119        if (!IsIdentifierStart(c)) {
1120          ReportError(RegExpError::kInvalidCaptureGroupName);
1121          return nullptr;
1122        }
1123        push_code_unit(name, c);
1124        at_start = false;
1125      } else {
1126        if (c == '>') {
1127          break;
1128        } else if (IsIdentifierPart(c)) {
1129          push_code_unit(name, c);
1130        } else {
1131          ReportError(RegExpError::kInvalidCaptureGroupName);
1132          return nullptr;
1133        }
1134      }
1135    }
1136  }
1137
1138  // This final advance goes back into the state of pointing at the next
1139  // relevant char, which the rest of the parser expects. See also the previous
1140  // comments in this function.
1141  Advance();
1142  return name;
1143}
1144
1145template <class CharT>
1146bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
1147    const ZoneVector<base::uc16>* name, int index) {
1148  DCHECK(0 < index && index <= captures_started_);
1149  DCHECK_NOT_NULL(name);
1150
1151  RegExpCapture* capture = GetCapture(index);
1152  DCHECK_NULL(capture->name());
1153
1154  capture->set_name(name);
1155
1156  if (named_captures_ == nullptr) {
1157    named_captures_ =
1158        zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
1159            zone());
1160  } else {
1161    // Check for duplicates and bail if we find any.
1162
1163    const auto& named_capture_it = named_captures_->find(capture);
1164    if (named_capture_it != named_captures_->end()) {
1165      ReportError(RegExpError::kDuplicateCaptureGroupName);
1166      return false;
1167    }
1168  }
1169
1170  named_captures_->emplace(capture);
1171
1172  return true;
1173}
1174
1175template <class CharT>
1176bool RegExpParserImpl<CharT>::ParseNamedBackReference(
1177    RegExpBuilder* builder, RegExpParserState* state) {
1178  // The parser is assumed to be on the '<' in \k<name>.
1179  if (current() != '<') {
1180    ReportError(RegExpError::kInvalidNamedReference);
1181    return false;
1182  }
1183
1184  Advance();
1185  const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
1186  if (name == nullptr) {
1187    return false;
1188  }
1189
1190  if (state->IsInsideCaptureGroup(name)) {
1191    builder->AddEmpty();
1192  } else {
1193    RegExpBackReference* atom =
1194        zone()->template New<RegExpBackReference>(builder->flags());
1195    atom->set_name(name);
1196
1197    builder->AddAtom(atom);
1198
1199    if (named_back_references_ == nullptr) {
1200      named_back_references_ =
1201          zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
1202    }
1203    named_back_references_->Add(atom, zone());
1204  }
1205
1206  return true;
1207}
1208
1209template <class CharT>
1210void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
1211  if (named_back_references_ == nullptr) return;
1212
1213  if (named_captures_ == nullptr) {
1214    ReportError(RegExpError::kInvalidNamedCaptureReference);
1215    return;
1216  }
1217
1218  // Look up and patch the actual capture for each named back reference.
1219
1220  for (int i = 0; i < named_back_references_->length(); i++) {
1221    RegExpBackReference* ref = named_back_references_->at(i);
1222
1223    // Capture used to search the named_captures_ by name, index of the
1224    // capture is never used.
1225    static const int kInvalidIndex = 0;
1226    RegExpCapture* search_capture =
1227        zone()->template New<RegExpCapture>(kInvalidIndex);
1228    DCHECK_NULL(search_capture->name());
1229    search_capture->set_name(ref->name());
1230
1231    int index = -1;
1232    const auto& capture_it = named_captures_->find(search_capture);
1233    if (capture_it != named_captures_->end()) {
1234      index = (*capture_it)->index();
1235    } else {
1236      ReportError(RegExpError::kInvalidNamedCaptureReference);
1237      return;
1238    }
1239
1240    ref->set_capture(GetCapture(index));
1241  }
1242}
1243
1244template <class CharT>
1245RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
1246  // The index for the capture groups are one-based. Its index in the list is
1247  // zero-based.
1248  const int known_captures =
1249      is_scanned_for_captures_ ? capture_count_ : captures_started_;
1250  DCHECK(index <= known_captures);
1251  if (captures_ == nullptr) {
1252    captures_ =
1253        zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
1254  }
1255  while (captures_->length() < known_captures) {
1256    captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
1257                   zone());
1258  }
1259  return captures_->at(index - 1);
1260}
1261
1262template <class CharT>
1263ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
1264  if (named_captures_ == nullptr || named_captures_->empty()) {
1265    return nullptr;
1266  }
1267
1268  return zone()->template New<ZoneVector<RegExpCapture*>>(
1269      named_captures_->begin(), named_captures_->end(), zone());
1270}
1271
1272template <class CharT>
1273bool RegExpParserImpl<CharT>::HasNamedCaptures(
1274    InClassEscapeState in_class_escape_state) {
1275  if (has_named_captures_ || is_scanned_for_captures_) {
1276    return has_named_captures_;
1277  }
1278
1279  ScanForCaptures(in_class_escape_state);
1280  DCHECK(is_scanned_for_captures_);
1281  return has_named_captures_;
1282}
1283
1284// QuantifierPrefix ::
1285//   { DecimalDigits }
1286//   { DecimalDigits , }
1287//   { DecimalDigits , DecimalDigits }
1288//
1289// Returns true if parsing succeeds, and set the min_out and max_out
1290// values. Values are truncated to RegExpTree::kInfinity if they overflow.
1291template <class CharT>
1292bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
1293                                                      int* max_out) {
1294  DCHECK_EQ(current(), '{');
1295  int start = position();
1296  Advance();
1297  int min = 0;
1298  if (!IsDecimalDigit(current())) {
1299    Reset(start);
1300    return false;
1301  }
1302  while (IsDecimalDigit(current())) {
1303    int next = current() - '0';
1304    if (min > (RegExpTree::kInfinity - next) / 10) {
1305      // Overflow. Skip past remaining decimal digits and return -1.
1306      do {
1307        Advance();
1308      } while (IsDecimalDigit(current()));
1309      min = RegExpTree::kInfinity;
1310      break;
1311    }
1312    min = 10 * min + next;
1313    Advance();
1314  }
1315  int max = 0;
1316  if (current() == '}') {
1317    max = min;
1318    Advance();
1319  } else if (current() == ',') {
1320    Advance();
1321    if (current() == '}') {
1322      max = RegExpTree::kInfinity;
1323      Advance();
1324    } else {
1325      while (IsDecimalDigit(current())) {
1326        int next = current() - '0';
1327        if (max > (RegExpTree::kInfinity - next) / 10) {
1328          do {
1329            Advance();
1330          } while (IsDecimalDigit(current()));
1331          max = RegExpTree::kInfinity;
1332          break;
1333        }
1334        max = 10 * max + next;
1335        Advance();
1336      }
1337      if (current() != '}') {
1338        Reset(start);
1339        return false;
1340      }
1341      Advance();
1342    }
1343  } else {
1344    Reset(start);
1345    return false;
1346  }
1347  *min_out = min;
1348  *max_out = max;
1349  return true;
1350}
1351
1352template <class CharT>
1353base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
1354  DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1355  // For compatibility with some other browsers (not all), we parse
1356  // up to three octal digits with a value below 256.
1357  // ES#prod-annexB-LegacyOctalEscapeSequence
1358  base::uc32 value = current() - '0';
1359  Advance();
1360  if ('0' <= current() && current() <= '7') {
1361    value = value * 8 + current() - '0';
1362    Advance();
1363    if (value < 32 && '0' <= current() && current() <= '7') {
1364      value = value * 8 + current() - '0';
1365      Advance();
1366    }
1367  }
1368  return value;
1369}
1370
1371template <class CharT>
1372bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
1373  int start = position();
1374  base::uc32 val = 0;
1375  for (int i = 0; i < length; ++i) {
1376    base::uc32 c = current();
1377    int d = base::HexValue(c);
1378    if (d < 0) {
1379      Reset(start);
1380      return false;
1381    }
1382    val = val * 16 + d;
1383    Advance();
1384  }
1385  *value = val;
1386  return true;
1387}
1388
1389// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1390template <class CharT>
1391bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
1392  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1393  // allowed). In the latter case, the number of hex digits between { } is
1394  // arbitrary. \ and u have already been read.
1395  if (current() == '{' && unicode()) {
1396    int start = position();
1397    Advance();
1398    if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1399      if (current() == '}') {
1400        Advance();
1401        return true;
1402      }
1403    }
1404    Reset(start);
1405    return false;
1406  }
1407  // \u but no {, or \u{...} escapes not allowed.
1408  bool result = ParseHexEscape(4, value);
1409  if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1410      current() == '\\') {
1411    // Attempt to read trail surrogate.
1412    int start = position();
1413    if (Next() == 'u') {
1414      Advance(2);
1415      base::uc32 trail;
1416      if (ParseHexEscape(4, &trail) &&
1417          unibrow::Utf16::IsTrailSurrogate(trail)) {
1418        *value = unibrow::Utf16::CombineSurrogatePair(
1419            static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
1420        return true;
1421      }
1422    }
1423    Reset(start);
1424  }
1425  return result;
1426}
1427
1428#ifdef V8_INTL_SUPPORT
1429
1430namespace {
1431
1432bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1433  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1434  if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1435    return true;
1436  for (int i = 0;; i++) {
1437    const char* long_name = u_getPropertyName(
1438        property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1439    if (long_name == nullptr) break;
1440    if (strcmp(property_name, long_name) == 0) return true;
1441  }
1442  return false;
1443}
1444
1445bool IsExactPropertyValueAlias(const char* property_value_name,
1446                               UProperty property, int32_t property_value) {
1447  const char* short_name =
1448      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1449  if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1450    return true;
1451  }
1452  for (int i = 0;; i++) {
1453    const char* long_name = u_getPropertyValueName(
1454        property, property_value,
1455        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1456    if (long_name == nullptr) break;
1457    if (strcmp(property_value_name, long_name) == 0) return true;
1458  }
1459  return false;
1460}
1461
1462bool LookupPropertyValueName(UProperty property,
1463                             const char* property_value_name, bool negate,
1464                             ZoneList<CharacterRange>* result, Zone* zone) {
1465  UProperty property_for_lookup = property;
1466  if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1467    // For the property Script_Extensions, we have to do the property value
1468    // name lookup as if the property is Script.
1469    property_for_lookup = UCHAR_SCRIPT;
1470  }
1471  int32_t property_value =
1472      u_getPropertyValueEnum(property_for_lookup, property_value_name);
1473  if (property_value == UCHAR_INVALID_CODE) return false;
1474
1475  // We require the property name to match exactly to one of the property value
1476  // aliases. However, u_getPropertyValueEnum uses loose matching.
1477  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1478                                 property_value)) {
1479    return false;
1480  }
1481
1482  UErrorCode ec = U_ZERO_ERROR;
1483  icu::UnicodeSet set;
1484  set.applyIntPropertyValue(property, property_value, ec);
1485  bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1486
1487  if (success) {
1488    set.removeAllStrings();
1489    if (negate) set.complement();
1490    for (int i = 0; i < set.getRangeCount(); i++) {
1491      result->Add(
1492          CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1493          zone);
1494    }
1495  }
1496  return success;
1497}
1498
1499template <size_t N>
1500inline bool NameEquals(const char* name, const char (&literal)[N]) {
1501  return strncmp(name, literal, N + 1) == 0;
1502}
1503
1504bool LookupSpecialPropertyValueName(const char* name,
1505                                    ZoneList<CharacterRange>* result,
1506                                    bool negate, Zone* zone) {
1507  if (NameEquals(name, "Any")) {
1508    if (negate) {
1509      // Leave the list of character ranges empty, since the negation of 'Any'
1510      // is the empty set.
1511    } else {
1512      result->Add(CharacterRange::Everything(), zone);
1513    }
1514  } else if (NameEquals(name, "ASCII")) {
1515    result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1516                       : CharacterRange::Range(0x0, 0x7F),
1517                zone);
1518  } else if (NameEquals(name, "Assigned")) {
1519    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1520                                   !negate, result, zone);
1521  } else {
1522    return false;
1523  }
1524  return true;
1525}
1526
1527// Explicitly allowlist supported binary properties. The spec forbids supporting
1528// properties outside of this set to ensure interoperability.
1529bool IsSupportedBinaryProperty(UProperty property) {
1530  switch (property) {
1531    case UCHAR_ALPHABETIC:
1532    // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1533    // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1534    case UCHAR_ASCII_HEX_DIGIT:
1535    // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1536    case UCHAR_BIDI_CONTROL:
1537    case UCHAR_BIDI_MIRRORED:
1538    case UCHAR_CASE_IGNORABLE:
1539    case UCHAR_CASED:
1540    case UCHAR_CHANGES_WHEN_CASEFOLDED:
1541    case UCHAR_CHANGES_WHEN_CASEMAPPED:
1542    case UCHAR_CHANGES_WHEN_LOWERCASED:
1543    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1544    case UCHAR_CHANGES_WHEN_TITLECASED:
1545    case UCHAR_CHANGES_WHEN_UPPERCASED:
1546    case UCHAR_DASH:
1547    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1548    case UCHAR_DEPRECATED:
1549    case UCHAR_DIACRITIC:
1550    case UCHAR_EMOJI:
1551    case UCHAR_EMOJI_COMPONENT:
1552    case UCHAR_EMOJI_MODIFIER_BASE:
1553    case UCHAR_EMOJI_MODIFIER:
1554    case UCHAR_EMOJI_PRESENTATION:
1555    case UCHAR_EXTENDED_PICTOGRAPHIC:
1556    case UCHAR_EXTENDER:
1557    case UCHAR_GRAPHEME_BASE:
1558    case UCHAR_GRAPHEME_EXTEND:
1559    case UCHAR_HEX_DIGIT:
1560    case UCHAR_ID_CONTINUE:
1561    case UCHAR_ID_START:
1562    case UCHAR_IDEOGRAPHIC:
1563    case UCHAR_IDS_BINARY_OPERATOR:
1564    case UCHAR_IDS_TRINARY_OPERATOR:
1565    case UCHAR_JOIN_CONTROL:
1566    case UCHAR_LOGICAL_ORDER_EXCEPTION:
1567    case UCHAR_LOWERCASE:
1568    case UCHAR_MATH:
1569    case UCHAR_NONCHARACTER_CODE_POINT:
1570    case UCHAR_PATTERN_SYNTAX:
1571    case UCHAR_PATTERN_WHITE_SPACE:
1572    case UCHAR_QUOTATION_MARK:
1573    case UCHAR_RADICAL:
1574    case UCHAR_REGIONAL_INDICATOR:
1575    case UCHAR_S_TERM:
1576    case UCHAR_SOFT_DOTTED:
1577    case UCHAR_TERMINAL_PUNCTUATION:
1578    case UCHAR_UNIFIED_IDEOGRAPH:
1579    case UCHAR_UPPERCASE:
1580    case UCHAR_VARIATION_SELECTOR:
1581    case UCHAR_WHITE_SPACE:
1582    case UCHAR_XID_CONTINUE:
1583    case UCHAR_XID_START:
1584      return true;
1585    default:
1586      break;
1587  }
1588  return false;
1589}
1590
1591bool IsUnicodePropertyValueCharacter(char c) {
1592  // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1593  //
1594  // Note that using this to validate each parsed char is quite conservative.
1595  // A possible alternative solution would be to only ensure the parsed
1596  // property name/value candidate string does not contain '\0' characters and
1597  // let ICU lookups trigger the final failure.
1598  if ('a' <= c && c <= 'z') return true;
1599  if ('A' <= c && c <= 'Z') return true;
1600  if ('0' <= c && c <= '9') return true;
1601  return (c == '_');
1602}
1603
1604}  // namespace
1605
1606template <class CharT>
1607bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
1608                                                     ZoneVector<char>* name_2) {
1609  DCHECK(name_1->empty());
1610  DCHECK(name_2->empty());
1611  // Parse the property class as follows:
1612  // - In \p{name}, 'name' is interpreted
1613  //   - either as a general category property value name.
1614  //   - or as a binary property name.
1615  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1616  //   and 'value' is interpreted as one of the available property value names.
1617  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1618  // - Loose matching is not applied.
1619  if (current() == '{') {
1620    // Parse \p{[PropertyName=]PropertyNameValue}
1621    for (Advance(); current() != '}' && current() != '='; Advance()) {
1622      if (!IsUnicodePropertyValueCharacter(current())) return false;
1623      if (!has_next()) return false;
1624      name_1->push_back(static_cast<char>(current()));
1625    }
1626    if (current() == '=') {
1627      for (Advance(); current() != '}'; Advance()) {
1628        if (!IsUnicodePropertyValueCharacter(current())) return false;
1629        if (!has_next()) return false;
1630        name_2->push_back(static_cast<char>(current()));
1631      }
1632      name_2->push_back(0);  // null-terminate string.
1633    }
1634  } else {
1635    return false;
1636  }
1637  Advance();
1638  name_1->push_back(0);  // null-terminate string.
1639
1640  DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1641  DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1642  return true;
1643}
1644
1645template <class CharT>
1646bool RegExpParserImpl<CharT>::AddPropertyClassRange(
1647    ZoneList<CharacterRange>* add_to, bool negate,
1648    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1649  if (name_2.empty()) {
1650    // First attempt to interpret as general category property value name.
1651    const char* name = name_1.data();
1652    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1653                                add_to, zone())) {
1654      return true;
1655    }
1656    // Interpret "Any", "ASCII", and "Assigned".
1657    if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1658      return true;
1659    }
1660    // Then attempt to interpret as binary property name with value name 'Y'.
1661    UProperty property = u_getPropertyEnum(name);
1662    if (!IsSupportedBinaryProperty(property)) return false;
1663    if (!IsExactPropertyAlias(name, property)) return false;
1664    return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1665                                   zone());
1666  } else {
1667    // Both property name and value name are specified. Attempt to interpret
1668    // the property name as enumerated property.
1669    const char* property_name = name_1.data();
1670    const char* value_name = name_2.data();
1671    UProperty property = u_getPropertyEnum(property_name);
1672    if (!IsExactPropertyAlias(property_name, property)) return false;
1673    if (property == UCHAR_GENERAL_CATEGORY) {
1674      // We want to allow aggregate value names such as "Letter".
1675      property = UCHAR_GENERAL_CATEGORY_MASK;
1676    } else if (property != UCHAR_SCRIPT &&
1677               property != UCHAR_SCRIPT_EXTENSIONS) {
1678      return false;
1679    }
1680    return LookupPropertyValueName(property, value_name, negate, add_to,
1681                                   zone());
1682  }
1683}
1684
1685#else  // V8_INTL_SUPPORT
1686
1687template <class CharT>
1688bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
1689                                                     ZoneVector<char>* name_2) {
1690  return false;
1691}
1692
1693template <class CharT>
1694bool RegExpParserImpl<CharT>::AddPropertyClassRange(
1695    ZoneList<CharacterRange>* add_to, bool negate,
1696    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1697  return false;
1698}
1699
1700#endif  // V8_INTL_SUPPORT
1701
1702template <class CharT>
1703bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
1704                                                            base::uc32* value) {
1705  base::uc32 x = 0;
1706  int d = base::HexValue(current());
1707  if (d < 0) {
1708    return false;
1709  }
1710  while (d >= 0) {
1711    x = x * 16 + d;
1712    if (x > static_cast<base::uc32>(max_value)) {
1713      return false;
1714    }
1715    Advance();
1716    d = base::HexValue(current());
1717  }
1718  *value = x;
1719  return true;
1720}
1721
1722// https://tc39.es/ecma262/#prod-CharacterEscape
1723template <class CharT>
1724base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
1725    InClassEscapeState in_class_escape_state,
1726    bool* is_escaped_unicode_character) {
1727  DCHECK_EQ('\\', current());
1728  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1729
1730  Advance();
1731
1732  const base::uc32 c = current();
1733  switch (c) {
1734    // CharacterEscape ::
1735    //   ControlEscape :: one of
1736    //     f n r t v
1737    case 'f':
1738      Advance();
1739      return '\f';
1740    case 'n':
1741      Advance();
1742      return '\n';
1743    case 'r':
1744      Advance();
1745      return '\r';
1746    case 't':
1747      Advance();
1748      return '\t';
1749    case 'v':
1750      Advance();
1751      return '\v';
1752    // CharacterEscape ::
1753    //   c ControlLetter
1754    case 'c': {
1755      base::uc32 controlLetter = Next();
1756      base::uc32 letter = controlLetter & ~('A' ^ 'a');
1757      if (letter >= 'A' && letter <= 'Z') {
1758        Advance(2);
1759        // Control letters mapped to ASCII control characters in the range
1760        // 0x00-0x1F.
1761        return controlLetter & 0x1F;
1762      }
1763      if (unicode()) {
1764        // With /u, invalid escapes are not treated as identity escapes.
1765        ReportError(RegExpError::kInvalidUnicodeEscape);
1766        return 0;
1767      }
1768      if (in_class_escape_state == InClassEscapeState::kInClass) {
1769        // Inside a character class, we also accept digits and underscore as
1770        // control characters, unless with /u. See Annex B:
1771        // ES#prod-annexB-ClassControlLetter
1772        if ((controlLetter >= '0' && controlLetter <= '9') ||
1773            controlLetter == '_') {
1774          Advance(2);
1775          return controlLetter & 0x1F;
1776        }
1777      }
1778      // We match JSC in reading the backslash as a literal
1779      // character instead of as starting an escape.
1780      return '\\';
1781    }
1782    // CharacterEscape ::
1783    //   0 [lookahead ∉ DecimalDigit]
1784    //   [~UnicodeMode] LegacyOctalEscapeSequence
1785    case '0':
1786      // \0 is interpreted as NUL if not followed by another digit.
1787      if (Next() < '0' || Next() > '9') {
1788        Advance();
1789        return 0;
1790      }
1791      V8_FALLTHROUGH;
1792    case '1':
1793    case '2':
1794    case '3':
1795    case '4':
1796    case '5':
1797    case '6':
1798    case '7':
1799      // For compatibility, we interpret a decimal escape that isn't
1800      // a back reference (and therefore either \0 or not valid according
1801      // to the specification) as a 1..3 digit octal character code.
1802      // ES#prod-annexB-LegacyOctalEscapeSequence
1803      if (unicode()) {
1804        // With /u, decimal escape is not interpreted as octal character code.
1805        ReportError(RegExpError::kInvalidClassEscape);
1806        return 0;
1807      }
1808      return ParseOctalLiteral();
1809    // CharacterEscape ::
1810    //   HexEscapeSequence
1811    case 'x': {
1812      Advance();
1813      base::uc32 value;
1814      if (ParseHexEscape(2, &value)) return value;
1815      if (unicode()) {
1816        // With /u, invalid escapes are not treated as identity escapes.
1817        ReportError(RegExpError::kInvalidEscape);
1818        return 0;
1819      }
1820      // If \x is not followed by a two-digit hexadecimal, treat it
1821      // as an identity escape.
1822      return 'x';
1823    }
1824    // CharacterEscape ::
1825    //   RegExpUnicodeEscapeSequence [?UnicodeMode]
1826    case 'u': {
1827      Advance();
1828      base::uc32 value;
1829      if (ParseUnicodeEscape(&value)) {
1830        *is_escaped_unicode_character = true;
1831        return value;
1832      }
1833      if (unicode()) {
1834        // With /u, invalid escapes are not treated as identity escapes.
1835        ReportError(RegExpError::kInvalidUnicodeEscape);
1836        return 0;
1837      }
1838      // If \u is not followed by a two-digit hexadecimal, treat it
1839      // as an identity escape.
1840      return 'u';
1841    }
1842    default:
1843      break;
1844  }
1845
1846  // CharacterEscape ::
1847  //   IdentityEscape[?UnicodeMode, ?N]
1848  //
1849  // * With /u, no identity escapes except for syntax characters are
1850  //   allowed.
1851  // * Without /u:
1852  //   * '\c' is not an IdentityEscape.
1853  //   * '\k' is not an IdentityEscape when named captures exist.
1854  //   * Otherwise, all identity escapes are allowed.
1855  if (unicode()) {
1856    if (!IsSyntaxCharacterOrSlash(c)) {
1857      ReportError(RegExpError::kInvalidEscape);
1858      return 0;
1859    }
1860    Advance();
1861    return c;
1862  }
1863  DCHECK(!unicode());
1864  if (c == 'c') {
1865    ReportError(RegExpError::kInvalidEscape);
1866    return 0;
1867  }
1868  Advance();
1869  // Note: It's important to Advance before the HasNamedCaptures call s.t. we
1870  // don't start scanning in the middle of an escape.
1871  if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
1872    ReportError(RegExpError::kInvalidEscape);
1873    return 0;
1874  }
1875  return c;
1876}
1877
1878// https://tc39.es/ecma262/#prod-ClassEscape
1879template <class CharT>
1880void RegExpParserImpl<CharT>::ParseClassEscape(
1881    ZoneList<CharacterRange>* ranges, Zone* zone,
1882    bool add_unicode_case_equivalents, base::uc32* char_out,
1883    bool* is_class_escape) {
1884  *is_class_escape = false;
1885
1886  if (current() != '\\') {
1887    // Not a ClassEscape.
1888    *char_out = current();
1889    Advance();
1890    return;
1891  }
1892
1893  const base::uc32 next = Next();
1894  switch (next) {
1895    case 'b':
1896      *char_out = '\b';
1897      Advance(2);
1898      return;
1899    case '-':
1900      if (unicode()) {
1901        *char_out = next;
1902        Advance(2);
1903        return;
1904      }
1905      break;
1906    case kEndMarker:
1907      ReportError(RegExpError::kEscapeAtEndOfPattern);
1908      return;
1909    default:
1910      break;
1911  }
1912
1913  static constexpr InClassEscapeState kInClassEscape =
1914      InClassEscapeState::kInClass;
1915  *is_class_escape = TryParseCharacterClassEscape(
1916      next, kInClassEscape, ranges, zone, add_unicode_case_equivalents);
1917  if (*is_class_escape) return;
1918
1919  bool dummy = false;  // Unused.
1920  *char_out = ParseCharacterEscape(kInClassEscape, &dummy);
1921}
1922
1923// https://tc39.es/ecma262/#prod-CharacterClassEscape
1924template <class CharT>
1925bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
1926    base::uc32 next, InClassEscapeState in_class_escape_state,
1927    ZoneList<CharacterRange>* ranges, Zone* zone,
1928    bool add_unicode_case_equivalents) {
1929  DCHECK_EQ(current(), '\\');
1930  DCHECK_EQ(Next(), next);
1931
1932  switch (next) {
1933    case 'd':
1934    case 'D':
1935    case 's':
1936    case 'S':
1937    case 'w':
1938    case 'W':
1939      CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
1940                                     ranges, add_unicode_case_equivalents,
1941                                     zone);
1942      Advance(2);
1943      return true;
1944    case 'p':
1945    case 'P': {
1946      if (!unicode()) return false;
1947      bool negate = next == 'P';
1948      Advance(2);
1949      ZoneVector<char> name_1(zone);
1950      ZoneVector<char> name_2(zone);
1951      if (!ParsePropertyClassName(&name_1, &name_2) ||
1952          !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1953        ReportError(in_class_escape_state == InClassEscapeState::kInClass
1954                        ? RegExpError::kInvalidClassPropertyName
1955                        : RegExpError::kInvalidPropertyName);
1956      }
1957      return true;
1958    }
1959    default:
1960      return false;
1961  }
1962}
1963
1964template <class CharT>
1965RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
1966    const RegExpBuilder* builder) {
1967  DCHECK_EQ(current(), '[');
1968  Advance();
1969  bool is_negated = false;
1970  if (current() == '^') {
1971    is_negated = true;
1972    Advance();
1973  }
1974  ZoneList<CharacterRange>* ranges =
1975      zone()->template New<ZoneList<CharacterRange>>(2, zone());
1976  bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1977  while (has_more() && current() != ']') {
1978    base::uc32 char_1, char_2;
1979    bool is_class_1, is_class_2;
1980    ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1981                     &is_class_1 CHECK_FAILED);
1982    if (current() == '-') {
1983      Advance();
1984      if (current() == kEndMarker) {
1985        // If we reach the end we break out of the loop and let the
1986        // following code report an error.
1987        break;
1988      } else if (current() == ']') {
1989        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1990        ranges->Add(CharacterRange::Singleton('-'), zone());
1991        break;
1992      }
1993      ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1994                       &is_class_2 CHECK_FAILED);
1995      if (is_class_1 || is_class_2) {
1996        // Either end is an escaped character class. Treat the '-' verbatim.
1997        if (unicode()) {
1998          // ES2015 21.2.2.15.1 step 1.
1999          return ReportError(RegExpError::kInvalidCharacterClass);
2000        }
2001        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2002        ranges->Add(CharacterRange::Singleton('-'), zone());
2003        if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
2004        continue;
2005      }
2006      // ES2015 21.2.2.15.1 step 6.
2007      if (char_1 > char_2) {
2008        return ReportError(RegExpError::kOutOfOrderCharacterClass);
2009      }
2010      ranges->Add(CharacterRange::Range(char_1, char_2), zone());
2011    } else {
2012      if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2013    }
2014  }
2015  if (!has_more()) {
2016    return ReportError(RegExpError::kUnterminatedCharacterClass);
2017  }
2018  Advance();
2019  RegExpCharacterClass::CharacterClassFlags character_class_flags;
2020  if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
2021  return zone()->template New<RegExpCharacterClass>(zone(), ranges,
2022                                                    character_class_flags);
2023}
2024
2025#undef CHECK_FAILED
2026
2027template <class CharT>
2028bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
2029  DCHECK_NOT_NULL(result);
2030  RegExpTree* tree = ParsePattern();
2031
2032  if (failed()) {
2033    DCHECK_NULL(tree);
2034    DCHECK_NE(error_, RegExpError::kNone);
2035    result->error = error_;
2036    result->error_pos = error_pos_;
2037    return false;
2038  }
2039
2040  DCHECK_NOT_NULL(tree);
2041  DCHECK_EQ(error_, RegExpError::kNone);
2042  if (FLAG_trace_regexp_parser) {
2043    StdoutStream os;
2044    tree->Print(os, zone());
2045    os << "\n";
2046  }
2047
2048  result->tree = tree;
2049  const int capture_count = captures_started();
2050  result->simple = tree->IsAtom() && simple() && capture_count == 0;
2051  result->contains_anchor = contains_anchor();
2052  result->capture_count = capture_count;
2053  result->named_captures = GetNamedCaptures();
2054  return true;
2055}
2056
2057void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
2058  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
2059  FlushPendingSurrogate();
2060  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
2061  pending_surrogate_ = lead_surrogate;
2062}
2063
2064void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
2065  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
2066  if (pending_surrogate_ != kNoPendingSurrogate) {
2067    base::uc16 lead_surrogate = pending_surrogate_;
2068    pending_surrogate_ = kNoPendingSurrogate;
2069    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
2070    base::uc32 combined =
2071        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
2072    if (NeedsDesugaringForIgnoreCase(combined)) {
2073      AddCharacterClassForDesugaring(combined);
2074    } else {
2075      ZoneList<base::uc16> surrogate_pair(2, zone());
2076      surrogate_pair.Add(lead_surrogate, zone());
2077      surrogate_pair.Add(trail_surrogate, zone());
2078      RegExpAtom* atom =
2079          zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
2080      AddAtom(atom);
2081    }
2082  } else {
2083    pending_surrogate_ = trail_surrogate;
2084    FlushPendingSurrogate();
2085  }
2086}
2087
2088void RegExpBuilder::FlushPendingSurrogate() {
2089  if (pending_surrogate_ != kNoPendingSurrogate) {
2090    DCHECK(unicode());
2091    base::uc32 c = pending_surrogate_;
2092    pending_surrogate_ = kNoPendingSurrogate;
2093    AddCharacterClassForDesugaring(c);
2094  }
2095}
2096
2097void RegExpBuilder::FlushCharacters() {
2098  FlushPendingSurrogate();
2099  pending_empty_ = false;
2100  if (characters_ != nullptr) {
2101    RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
2102    characters_ = nullptr;
2103    text_.emplace_back(atom);
2104    LAST(ADD_ATOM);
2105  }
2106}
2107
2108void RegExpBuilder::FlushText() {
2109  FlushCharacters();
2110  size_t num_text = text_.size();
2111  if (num_text == 0) {
2112    return;
2113  } else if (num_text == 1) {
2114    terms_.emplace_back(text_.back());
2115  } else {
2116    RegExpText* text = zone()->New<RegExpText>(zone());
2117    for (size_t i = 0; i < num_text; i++) {
2118      text_[i]->AppendToText(text, zone());
2119    }
2120    terms_.emplace_back(text);
2121  }
2122  text_.clear();
2123}
2124
2125void RegExpBuilder::AddCharacter(base::uc16 c) {
2126  FlushPendingSurrogate();
2127  pending_empty_ = false;
2128  if (NeedsDesugaringForIgnoreCase(c)) {
2129    AddCharacterClassForDesugaring(c);
2130  } else {
2131    if (characters_ == nullptr) {
2132      characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
2133    }
2134    characters_->Add(c, zone());
2135    LAST(ADD_CHAR);
2136  }
2137}
2138
2139void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
2140  if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
2141    DCHECK(unicode());
2142    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
2143    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
2144  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
2145    AddLeadSurrogate(c);
2146  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
2147    AddTrailSurrogate(c);
2148  } else {
2149    AddCharacter(static_cast<base::uc16>(c));
2150  }
2151}
2152
2153void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
2154  // A lead or trail surrogate parsed via escape sequence will not
2155  // pair up with any preceding lead or following trail surrogate.
2156  FlushPendingSurrogate();
2157  AddUnicodeCharacter(character);
2158  FlushPendingSurrogate();
2159}
2160
2161void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
2162
2163void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
2164  if (NeedsDesugaringForUnicode(cc)) {
2165    // With /u, character class needs to be desugared, so it
2166    // must be a standalone term instead of being part of a RegExpText.
2167    AddTerm(cc);
2168  } else {
2169    AddAtom(cc);
2170  }
2171}
2172
2173void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) {
2174  AddTerm(zone()->New<RegExpCharacterClass>(
2175      zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
2176}
2177
2178void RegExpBuilder::AddAtom(RegExpTree* term) {
2179  if (term->IsEmpty()) {
2180    AddEmpty();
2181    return;
2182  }
2183  if (term->IsTextElement()) {
2184    FlushCharacters();
2185    text_.emplace_back(term);
2186  } else {
2187    FlushText();
2188    terms_.emplace_back(term);
2189  }
2190  LAST(ADD_ATOM);
2191}
2192
2193void RegExpBuilder::AddTerm(RegExpTree* term) {
2194  FlushText();
2195  terms_.emplace_back(term);
2196  LAST(ADD_ATOM);
2197}
2198
2199void RegExpBuilder::AddAssertion(RegExpTree* assert) {
2200  FlushText();
2201  terms_.emplace_back(assert);
2202  LAST(ADD_ASSERT);
2203}
2204
2205void RegExpBuilder::NewAlternative() { FlushTerms(); }
2206
2207void RegExpBuilder::FlushTerms() {
2208  FlushText();
2209  size_t num_terms = terms_.size();
2210  RegExpTree* alternative;
2211  if (num_terms == 0) {
2212    alternative = zone()->New<RegExpEmpty>();
2213  } else if (num_terms == 1) {
2214    alternative = terms_.back();
2215  } else {
2216    alternative =
2217        zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
2218            base::VectorOf(terms_.begin(), terms_.size()), zone()));
2219  }
2220  alternatives_.emplace_back(alternative);
2221  terms_.clear();
2222  LAST(ADD_NONE);
2223}
2224
2225bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
2226  if (!unicode()) return false;
2227  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2228  // necessarily mean that we need to desugar. It's probably nicer to have a
2229  // separate pass to figure out unicode desugarings.
2230  if (ignore_case()) return true;
2231  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2232  CharacterRange::Canonicalize(ranges);
2233  for (int i = ranges->length() - 1; i >= 0; i--) {
2234    base::uc32 from = ranges->at(i).from();
2235    base::uc32 to = ranges->at(i).to();
2236    // Check for non-BMP characters.
2237    if (to >= kNonBmpStart) return true;
2238    // Check for lone surrogates.
2239    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2240  }
2241  return false;
2242}
2243
2244bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
2245#ifdef V8_INTL_SUPPORT
2246  if (unicode() && ignore_case()) {
2247    icu::UnicodeSet set(c, c);
2248    set.closeOver(USET_CASE_INSENSITIVE);
2249    set.removeAllStrings();
2250    return set.size() > 1;
2251  }
2252  // In the case where ICU is not included, we act as if the unicode flag is
2253  // not set, and do not desugar.
2254#endif  // V8_INTL_SUPPORT
2255  return false;
2256}
2257
2258RegExpTree* RegExpBuilder::ToRegExp() {
2259  FlushTerms();
2260  size_t num_alternatives = alternatives_.size();
2261  if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
2262  if (num_alternatives == 1) return alternatives_.back();
2263  return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
2264      base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));
2265}
2266
2267bool RegExpBuilder::AddQuantifierToAtom(
2268    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2269  FlushPendingSurrogate();
2270  if (pending_empty_) {
2271    pending_empty_ = false;
2272    return true;
2273  }
2274  RegExpTree* atom;
2275  if (characters_ != nullptr) {
2276    DCHECK(last_added_ == ADD_CHAR);
2277    // Last atom was character.
2278    base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
2279    int num_chars = char_vector.length();
2280    if (num_chars > 1) {
2281      base::Vector<const base::uc16> prefix =
2282          char_vector.SubVector(0, num_chars - 1);
2283      text_.emplace_back(zone()->New<RegExpAtom>(prefix));
2284      char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2285    }
2286    characters_ = nullptr;
2287    atom = zone()->New<RegExpAtom>(char_vector);
2288    FlushText();
2289  } else if (text_.size() > 0) {
2290    DCHECK(last_added_ == ADD_ATOM);
2291    atom = text_.back();
2292    text_.pop_back();
2293    FlushText();
2294  } else if (terms_.size() > 0) {
2295    DCHECK(last_added_ == ADD_ATOM);
2296    atom = terms_.back();
2297    terms_.pop_back();
2298    if (atom->IsLookaround()) {
2299      // With /u, lookarounds are not quantifiable.
2300      if (unicode()) return false;
2301      // Lookbehinds are not quantifiable.
2302      if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2303        return false;
2304      }
2305    }
2306    if (atom->max_match() == 0) {
2307      // Guaranteed to only match an empty string.
2308      LAST(ADD_TERM);
2309      if (min == 0) {
2310        return true;
2311      }
2312      terms_.emplace_back(atom);
2313      return true;
2314    }
2315  } else {
2316    // Only call immediately after adding an atom or character!
2317    UNREACHABLE();
2318  }
2319  terms_.emplace_back(
2320      zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom));
2321  LAST(ADD_TERM);
2322  return true;
2323}
2324
2325template class RegExpParserImpl<uint8_t>;
2326template class RegExpParserImpl<base::uc16>;
2327
2328}  // namespace
2329
2330// static
2331bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
2332                                             Handle<String> input,
2333                                             RegExpFlags flags,
2334                                             RegExpCompileData* result) {
2335  DisallowGarbageCollection no_gc;
2336  uintptr_t stack_limit = isolate->stack_guard()->real_climit();
2337  String::FlatContent content = input->GetFlatContent(no_gc);
2338  if (content.IsOneByte()) {
2339    base::Vector<const uint8_t> v = content.ToOneByteVector();
2340    return RegExpParserImpl<uint8_t>{v.begin(),   v.length(), flags,
2341                                     stack_limit, zone,       no_gc}
2342        .Parse(result);
2343  } else {
2344    base::Vector<const base::uc16> v = content.ToUC16Vector();
2345    return RegExpParserImpl<base::uc16>{v.begin(),   v.length(), flags,
2346                                        stack_limit, zone,       no_gc}
2347        .Parse(result);
2348  }
2349}
2350
2351// static
2352template <class CharT>
2353bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
2354                                      const CharT* input, int input_length,
2355                                      RegExpFlags flags,
2356                                      RegExpCompileData* result,
2357                                      const DisallowGarbageCollection& no_gc) {
2358  return RegExpParserImpl<CharT>{input,       input_length, flags,
2359                                 stack_limit, zone,         no_gc}
2360      .Parse(result);
2361}
2362
2363template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
2364    Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
2365    const DisallowGarbageCollection&);
2366template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
2367    Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
2368    const DisallowGarbageCollection&);
2369
2370// static
2371bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
2372                                      Handle<String> input, RegExpFlags flags,
2373                                      RegExpCompileData* result,
2374                                      const DisallowGarbageCollection&) {
2375  return ParseRegExpFromHeapString(isolate, zone, input, flags, result);
2376}
2377
2378#undef LAST
2379
2380}  // namespace internal
2381}  // namespace v8
2382