xref: /third_party/node/deps/v8/src/regexp/regexp.h (revision 1cb0ef41)
1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_H_
6#define V8_REGEXP_REGEXP_H_
7
8#include "src/common/assert-scope.h"
9#include "src/handles/handles.h"
10#include "src/regexp/regexp-error.h"
11#include "src/regexp/regexp-flags.h"
12#include "src/zone/zone-containers.h"
13
14namespace v8 {
15namespace internal {
16
17class JSRegExp;
18class RegExpCapture;
19class RegExpMatchInfo;
20class RegExpNode;
21class RegExpTree;
22
// Identifies what kind of output a regexp compilation produces: bytecode or
// native code.
enum class RegExpCompilationTarget : int {
  kBytecode,
  kNative,
};
24
25// TODO(jgruber): Do not expose in regexp.h.
26// TODO(jgruber): Consider splitting between ParseData and CompileData.
27struct RegExpCompileData {
28  // The parsed AST as produced by the RegExpParser.
29  RegExpTree* tree = nullptr;
30
31  // The compiled Node graph as produced by RegExpTree::ToNode methods.
32  RegExpNode* node = nullptr;
33
34  // Either the generated code as produced by the compiler or a trampoline
35  // to the interpreter.
36  Handle<Object> code;
37
38  // True, iff the pattern is a 'simple' atom with zero captures. In other
39  // words, the pattern consists of a string with no metacharacters and special
40  // regexp features, and can be implemented as a standard string search.
41  bool simple = true;
42
43  // True, iff the pattern is anchored at the start of the string with '^'.
44  bool contains_anchor = false;
45
46  // Only set if the pattern contains named captures.
47  // Note: the lifetime equals that of the parse/compile zone.
48  ZoneVector<RegExpCapture*>* named_captures = nullptr;
49
50  // The error message. Only used if an error occurred during parsing or
51  // compilation.
52  RegExpError error = RegExpError::kNone;
53
54  // The position at which the error was detected. Only used if an
55  // error occurred.
56  int error_pos = 0;
57
58  // The number of capture groups, without the global capture \0.
59  int capture_count = 0;
60
61  // The number of registers used by the generated code.
62  int register_count = 0;
63
64  // The compilation target (bytecode or native code).
65  RegExpCompilationTarget compilation_target;
66};
67
68class RegExp final : public AllStatic {
69 public:
70  // Whether the irregexp engine generates interpreter bytecode.
71  static bool CanGenerateBytecode();
72
73  // Verify the given pattern, i.e. check that parsing succeeds. If
74  // verification fails, `regexp_error_out` is set.
75  template <class CharT>
76  static bool VerifySyntax(Zone* zone, uintptr_t stack_limit,
77                           const CharT* input, int input_length,
78                           RegExpFlags flags, RegExpError* regexp_error_out,
79                           const DisallowGarbageCollection& no_gc);
80
81  // Parses the RegExp pattern and prepares the JSRegExp object with
82  // generic data and choice of implementation - as well as what
83  // the implementation wants to store in the data field.
84  // Returns false if compilation fails.
85  V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
86      Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
87      RegExpFlags flags, uint32_t backtrack_limit);
88
89  // Ensures that a regexp is fully compiled and ready to be executed on a
90  // subject string.  Returns true on success. Return false on failure, and
91  // then an exception will be pending.
92  V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate,
93                                                        Handle<JSRegExp> re,
94                                                        Handle<String> subject);
95
96  enum CallOrigin : int {
97    kFromRuntime = 0,
98    kFromJs = 1,
99  };
100
101  enum class ExecQuirks {
102    kNone,
103    // Used to work around an issue in the RegExpPrototypeSplit fast path,
104    // which diverges from the spec by not creating a sticky copy of the RegExp
105    // instance and calling `exec` in a loop. If called in this context, we
106    // must not update the last_match_info on a successful match at the subject
107    // string end. See crbug.com/1075514 for more information.
108    kTreatMatchAtEndAsFailure,
109  };
110
111  // See ECMA-262 section 15.10.6.2.
112  // This function calls the garbage collector if necessary.
113  V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
114      Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
115      int index, Handle<RegExpMatchInfo> last_match_info,
116      ExecQuirks exec_quirks = ExecQuirks::kNone);
117
118  V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object>
119  ExperimentalOneshotExec(Isolate* isolate, Handle<JSRegExp> regexp,
120                          Handle<String> subject, int index,
121                          Handle<RegExpMatchInfo> last_match_info,
122                          ExecQuirks exec_quirks = ExecQuirks::kNone);
123
124  // Integral return values used throughout regexp code layers.
125  static constexpr int kInternalRegExpFailure = 0;
126  static constexpr int kInternalRegExpSuccess = 1;
127  static constexpr int kInternalRegExpException = -1;
128  static constexpr int kInternalRegExpRetry = -2;
129  static constexpr int kInternalRegExpFallbackToExperimental = -3;
130  static constexpr int kInternalRegExpSmallestResult = -3;
131
132  enum IrregexpResult : int32_t {
133    RE_FAILURE = kInternalRegExpFailure,
134    RE_SUCCESS = kInternalRegExpSuccess,
135    RE_EXCEPTION = kInternalRegExpException,
136    RE_RETRY = kInternalRegExpRetry,
137    RE_FALLBACK_TO_EXPERIMENTAL = kInternalRegExpFallbackToExperimental,
138  };
139
140  // Set last match info.  If match is nullptr, then setting captures is
141  // omitted.
142  static Handle<RegExpMatchInfo> SetLastMatchInfo(
143      Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
144      Handle<String> subject, int capture_count, int32_t* match);
145
146  V8_EXPORT_PRIVATE static bool CompileForTesting(
147      Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags,
148      Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte);
149
150  V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
151                                                   RegExpNode* node);
152
153  static const int kRegExpTooLargeToOptimize = 20 * KB;
154
155  V8_WARN_UNUSED_RESULT
156  static MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
157                                                  Handle<JSRegExp> re,
158                                                  Handle<String> pattern,
159                                                  RegExpError error);
160  static void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
161                                   RegExpError error_text);
162
163  static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp);
164
165  static Handle<FixedArray> CreateCaptureNameMap(
166      Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures);
167};
168
169// Uses a special global mode of irregexp-generated code to perform a global
170// search and return multiple results at once. As such, this is essentially an
171// iterator over multiple results (retrieved batch-wise in advance).
172class RegExpGlobalCache final {
173 public:
174  RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject,
175                    Isolate* isolate);
176
177  ~RegExpGlobalCache();
178
179  // Fetch the next entry in the cache for global regexp match results.
180  // This does not set the last match info.  Upon failure, nullptr is
181  // returned. The cause can be checked with Result().  The previous result is
182  // still in available in memory when a failure happens.
183  int32_t* FetchNext();
184
185  int32_t* LastSuccessfulMatch();
186
187  bool HasException() { return num_matches_ < 0; }
188
189 private:
190  int AdvanceZeroLength(int last_index);
191
192  int num_matches_;
193  int max_matches_;
194  int current_match_index_;
195  int registers_per_match_;
196  // Pointer to the last set of captures.
197  int32_t* register_array_;
198  int register_array_size_;
199  Handle<JSRegExp> regexp_;
200  Handle<String> subject_;
201  Isolate* isolate_;
202};
203
204// Caches results for specific regexp queries on the isolate. At the time of
205// writing, this is used during global calls to RegExp.prototype.exec and
206// @@split.
207class RegExpResultsCache final : public AllStatic {
208 public:
209  enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
210
211  // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
212  // On success, the returned result is guaranteed to be a COW-array.
213  static Object Lookup(Heap* heap, String key_string, Object key_pattern,
214                       FixedArray* last_match_out, ResultsCacheType type);
215  // Attempt to add value_array to the cache specified by type.  On success,
216  // value_array is turned into a COW-array.
217  static void Enter(Isolate* isolate, Handle<String> key_string,
218                    Handle<Object> key_pattern, Handle<FixedArray> value_array,
219                    Handle<FixedArray> last_match_cache, ResultsCacheType type);
220  static void Clear(FixedArray cache);
221
222  static constexpr int kRegExpResultsCacheSize = 0x100;
223
224 private:
225  static constexpr int kStringOffset = 0;
226  static constexpr int kPatternOffset = 1;
227  static constexpr int kArrayOffset = 2;
228  static constexpr int kLastMatchOffset = 3;
229  static constexpr int kArrayEntriesPerCacheEntry = 4;
230};
231
232}  // namespace internal
233}  // namespace v8
234
235#endif  // V8_REGEXP_REGEXP_H_
236