1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/objects/js-regexp.h"
6
7#include "src/base/strings.h"
8#include "src/common/globals.h"
9#include "src/objects/code.h"
10#include "src/objects/js-array-inl.h"
11#include "src/objects/js-regexp-inl.h"
12#include "src/regexp/regexp.h"
13
14namespace v8 {
15namespace internal {
16
17Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices(
18    Isolate* isolate, Handle<RegExpMatchInfo> match_info,
19    Handle<Object> maybe_names) {
20  Handle<JSRegExpResultIndices> indices(Handle<JSRegExpResultIndices>::cast(
21      isolate->factory()->NewJSObjectFromMap(
22          isolate->regexp_result_indices_map())));
23
24  // Initialize indices length to avoid having a partially initialized object
25  // should GC be triggered by creating a NewFixedArray.
26  indices->set_length(Smi::zero());
27
28  // Build indices array from RegExpMatchInfo.
29  int num_indices = match_info->NumberOfCaptureRegisters();
30  int num_results = num_indices >> 1;
31  Handle<FixedArray> indices_array =
32      isolate->factory()->NewFixedArray(num_results);
33  JSArray::SetContent(indices, indices_array);
34
35  for (int i = 0; i < num_results; i++) {
36    int base_offset = i * 2;
37    int start_offset = match_info->Capture(base_offset);
38    int end_offset = match_info->Capture(base_offset + 1);
39
40    // Any unmatched captures are set to undefined, otherwise we set them to a
41    // subarray of the indices.
42    if (start_offset == -1) {
43      indices_array->set(i, ReadOnlyRoots(isolate).undefined_value());
44    } else {
45      Handle<FixedArray> indices_sub_array(
46          isolate->factory()->NewFixedArray(2));
47      indices_sub_array->set(0, Smi::FromInt(start_offset));
48      indices_sub_array->set(1, Smi::FromInt(end_offset));
49      Handle<JSArray> indices_sub_jsarray =
50          isolate->factory()->NewJSArrayWithElements(indices_sub_array,
51                                                     PACKED_SMI_ELEMENTS, 2);
52      indices_array->set(i, *indices_sub_jsarray);
53    }
54  }
55
56  // If there are no capture groups, set the groups property to undefined.
57  FieldIndex groups_index = FieldIndex::ForDescriptor(
58      indices->map(), InternalIndex(kGroupsDescriptorIndex));
59  if (maybe_names->IsUndefined(isolate)) {
60    indices->FastPropertyAtPut(groups_index,
61                               ReadOnlyRoots(isolate).undefined_value());
62    return indices;
63  }
64
65  // Create a groups property which returns a dictionary of named captures to
66  // their corresponding capture indices.
67  Handle<FixedArray> names(Handle<FixedArray>::cast(maybe_names));
68  int num_names = names->length() >> 1;
69  Handle<HeapObject> group_names;
70  if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) {
71    group_names = isolate->factory()->NewSwissNameDictionary(num_names);
72  } else {
73    group_names = isolate->factory()->NewNameDictionary(num_names);
74  }
75  for (int i = 0; i < num_names; i++) {
76    int base_offset = i * 2;
77    int name_offset = base_offset;
78    int index_offset = base_offset + 1;
79    Handle<String> name(String::cast(names->get(name_offset)), isolate);
80    Handle<Smi> smi_index(Smi::cast(names->get(index_offset)), isolate);
81    Handle<Object> capture_indices(indices_array->get(smi_index->value()),
82                                   isolate);
83    if (!capture_indices->IsUndefined(isolate)) {
84      capture_indices = Handle<JSArray>::cast(capture_indices);
85    }
86    if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) {
87      group_names = SwissNameDictionary::Add(
88          isolate, Handle<SwissNameDictionary>::cast(group_names), name,
89          capture_indices, PropertyDetails::Empty());
90    } else {
91      group_names = NameDictionary::Add(
92          isolate, Handle<NameDictionary>::cast(group_names), name,
93          capture_indices, PropertyDetails::Empty());
94    }
95  }
96
97  // Convert group_names to a JSObject and store at the groups property of the
98  // result indices.
99  Handle<FixedArrayBase> elements = isolate->factory()->empty_fixed_array();
100  Handle<HeapObject> null =
101      Handle<HeapObject>::cast(isolate->factory()->null_value());
102  Handle<JSObject> js_group_names =
103      isolate->factory()->NewSlowJSObjectWithPropertiesAndElements(
104          null, group_names, elements);
105  indices->FastPropertyAtPut(groups_index, *js_group_names);
106  return indices;
107}
108
109uint32_t JSRegExp::backtrack_limit() const {
110  CHECK_EQ(type_tag(), IRREGEXP);
111  return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit)));
112}
113
114// static
115base::Optional<JSRegExp::Flags> JSRegExp::FlagsFromString(
116    Isolate* isolate, Handle<String> flags) {
117  const int length = flags->length();
118
119  // A longer flags string cannot be valid.
120  if (length > JSRegExp::kFlagCount) return {};
121
122  RegExpFlags value;
123  FlatStringReader reader(isolate, String::Flatten(isolate, flags));
124
125  for (int i = 0; i < length; i++) {
126    base::Optional<RegExpFlag> flag = JSRegExp::FlagFromChar(reader.Get(i));
127    if (!flag.has_value()) return {};
128    if (value & flag.value()) return {};  // Duplicate.
129    value |= flag.value();
130  }
131
132  return JSRegExp::AsJSRegExpFlags(value);
133}
134
135// static
136Handle<String> JSRegExp::StringFromFlags(Isolate* isolate,
137                                         JSRegExp::Flags flags) {
138  static constexpr int kStringTerminator = 1;
139  int cursor = 0;
140  char buffer[kFlagCount + kStringTerminator];
141#define V(Lower, Camel, LowerCamel, Char, Bit) \
142  if (flags & JSRegExp::k##Camel) buffer[cursor++] = Char;
143  REGEXP_FLAG_LIST(V)
144#undef V
145  buffer[cursor++] = '\0';
146  DCHECK_LE(cursor, kFlagCount + kStringTerminator);
147  return isolate->factory()->NewStringFromAsciiChecked(buffer);
148}
149
150// static
151MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern,
152                                    Flags flags, uint32_t backtrack_limit) {
153  Handle<JSFunction> constructor = isolate->regexp_function();
154  Handle<JSRegExp> regexp =
155      Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor));
156
157  return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit);
158}
159
160Object JSRegExp::code(bool is_latin1) const {
161  DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
162  Object value = DataAt(code_index(is_latin1));
163  DCHECK_IMPLIES(V8_EXTERNAL_CODE_SPACE_BOOL, value.IsSmi() || value.IsCodeT());
164  return value;
165}
166
167void JSRegExp::set_code(bool is_latin1, Handle<Code> code) {
168  SetDataAt(code_index(is_latin1), ToCodeT(*code));
169}
170
171Object JSRegExp::bytecode(bool is_latin1) const {
172  DCHECK(type_tag() == JSRegExp::IRREGEXP ||
173         type_tag() == JSRegExp::EXPERIMENTAL);
174  return DataAt(bytecode_index(is_latin1));
175}
176
177void JSRegExp::set_bytecode_and_trampoline(Isolate* isolate,
178                                           Handle<ByteArray> bytecode) {
179  SetDataAt(kIrregexpLatin1BytecodeIndex, *bytecode);
180  SetDataAt(kIrregexpUC16BytecodeIndex, *bytecode);
181
182  Handle<CodeT> trampoline =
183      BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
184  SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
185  SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
186}
187
188bool JSRegExp::ShouldProduceBytecode() {
189  return FLAG_regexp_interpret_all ||
190         (FLAG_regexp_tier_up && !MarkedForTierUp());
191}
192
193// Only irregexps are subject to tier-up.
194bool JSRegExp::CanTierUp() {
195  return FLAG_regexp_tier_up && type_tag() == JSRegExp::IRREGEXP;
196}
197
198// An irregexp is considered to be marked for tier up if the tier-up ticks
199// value reaches zero.
200bool JSRegExp::MarkedForTierUp() {
201  DCHECK(data().IsFixedArray());
202
203  if (!CanTierUp()) {
204    return false;
205  }
206
207  return Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) == 0;
208}
209
210void JSRegExp::ResetLastTierUpTick() {
211  DCHECK(FLAG_regexp_tier_up);
212  DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
213  int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) + 1;
214  FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
215                               Smi::FromInt(tier_up_ticks));
216}
217
218void JSRegExp::TierUpTick() {
219  DCHECK(FLAG_regexp_tier_up);
220  DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
221  int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex));
222  if (tier_up_ticks == 0) {
223    return;
224  }
225  FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
226                               Smi::FromInt(tier_up_ticks - 1));
227}
228
229void JSRegExp::MarkTierUpForNextExec() {
230  DCHECK(FLAG_regexp_tier_up);
231  DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
232  FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
233                               Smi::zero());
234}
235
236// static
237MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
238                                           Handle<String> source,
239                                           Handle<String> flags_string) {
240  Isolate* isolate = regexp->GetIsolate();
241  base::Optional<Flags> flags =
242      JSRegExp::FlagsFromString(isolate, flags_string);
243  if (!flags.has_value()) {
244    THROW_NEW_ERROR(
245        isolate,
246        NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string),
247        JSRegExp);
248  }
249  return Initialize(regexp, source, flags.value());
250}
251
252namespace {
253
254bool IsLineTerminator(int c) {
255  // Expected to return true for '\n', '\r', 0x2028, and 0x2029.
256  return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c));
257}
258
259// TODO(jgruber): Consider merging CountAdditionalEscapeChars and
260// WriteEscapedRegExpSource into a single function to deduplicate dispatch logic
261// and move related code closer to each other.
262template <typename Char>
263int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) {
264  DisallowGarbageCollection no_gc;
265  int escapes = 0;
266  bool needs_escapes = false;
267  bool in_char_class = false;
268  base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
269  for (int i = 0; i < src.length(); i++) {
270    const Char c = src[i];
271    if (c == '\\') {
272      if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) {
273        // This '\' is ignored since the next character itself will be escaped.
274        escapes--;
275      } else {
276        // Escape. Skip next character, which will be copied verbatim;
277        i++;
278      }
279    } else if (c == '/' && !in_char_class) {
280      // Not escaped forward-slash needs escape.
281      needs_escapes = true;
282      escapes++;
283    } else if (c == '[') {
284      in_char_class = true;
285    } else if (c == ']') {
286      in_char_class = false;
287    } else if (c == '\n') {
288      needs_escapes = true;
289      escapes++;
290    } else if (c == '\r') {
291      needs_escapes = true;
292      escapes++;
293    } else if (static_cast<int>(c) == 0x2028) {
294      needs_escapes = true;
295      escapes += std::strlen("\\u2028") - 1;
296    } else if (static_cast<int>(c) == 0x2029) {
297      needs_escapes = true;
298      escapes += std::strlen("\\u2029") - 1;
299    } else {
300      DCHECK(!IsLineTerminator(c));
301    }
302  }
303  DCHECK(!in_char_class);
304  DCHECK_GE(escapes, 0);
305  DCHECK_IMPLIES(escapes != 0, needs_escapes);
306  *needs_escapes_out = needs_escapes;
307  return escapes;
308}
309
310template <typename Char>
311void WriteStringToCharVector(base::Vector<Char> v, int* d, const char* string) {
312  int s = 0;
313  while (string[s] != '\0') v[(*d)++] = string[s++];
314}
315
316template <typename Char, typename StringType>
317Handle<StringType> WriteEscapedRegExpSource(Handle<String> source,
318                                            Handle<StringType> result) {
319  DisallowGarbageCollection no_gc;
320  base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
321  base::Vector<Char> dst(result->GetChars(no_gc), result->length());
322  int s = 0;
323  int d = 0;
324  bool in_char_class = false;
325  while (s < src.length()) {
326    const Char c = src[s];
327    if (c == '\\') {
328      if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) {
329        // This '\' is ignored since the next character itself will be escaped.
330        s++;
331        continue;
332      } else {
333        // Escape. Copy this and next character.
334        dst[d++] = src[s++];
335      }
336      if (s == src.length()) break;
337    } else if (c == '/' && !in_char_class) {
338      // Not escaped forward-slash needs escape.
339      dst[d++] = '\\';
340    } else if (c == '[') {
341      in_char_class = true;
342    } else if (c == ']') {
343      in_char_class = false;
344    } else if (c == '\n') {
345      WriteStringToCharVector(dst, &d, "\\n");
346      s++;
347      continue;
348    } else if (c == '\r') {
349      WriteStringToCharVector(dst, &d, "\\r");
350      s++;
351      continue;
352    } else if (static_cast<int>(c) == 0x2028) {
353      WriteStringToCharVector(dst, &d, "\\u2028");
354      s++;
355      continue;
356    } else if (static_cast<int>(c) == 0x2029) {
357      WriteStringToCharVector(dst, &d, "\\u2029");
358      s++;
359      continue;
360    } else {
361      DCHECK(!IsLineTerminator(c));
362    }
363    dst[d++] = src[s++];
364  }
365  DCHECK_EQ(result->length(), d);
366  DCHECK(!in_char_class);
367  return result;
368}
369
370MaybeHandle<String> EscapeRegExpSource(Isolate* isolate,
371                                       Handle<String> source) {
372  DCHECK(source->IsFlat());
373  if (source->length() == 0) return isolate->factory()->query_colon_string();
374  bool one_byte = String::IsOneByteRepresentationUnderneath(*source);
375  bool needs_escapes = false;
376  int additional_escape_chars =
377      one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes)
378               : CountAdditionalEscapeChars<base::uc16>(source, &needs_escapes);
379  if (!needs_escapes) return source;
380  int length = source->length() + additional_escape_chars;
381  if (one_byte) {
382    Handle<SeqOneByteString> result;
383    ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
384                               isolate->factory()->NewRawOneByteString(length),
385                               String);
386    return WriteEscapedRegExpSource<uint8_t>(source, result);
387  } else {
388    Handle<SeqTwoByteString> result;
389    ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
390                               isolate->factory()->NewRawTwoByteString(length),
391                               String);
392    return WriteEscapedRegExpSource<base::uc16>(source, result);
393  }
394}
395
396}  // namespace
397
398// static
399MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
400                                           Handle<String> source, Flags flags,
401                                           uint32_t backtrack_limit) {
402  Isolate* isolate = regexp->GetIsolate();
403  Factory* factory = isolate->factory();
404  // If source is the empty string we set it to "(?:)" instead as
405  // suggested by ECMA-262, 5th, section 15.10.4.1.
406  if (source->length() == 0) source = factory->query_colon_string();
407
408  source = String::Flatten(isolate, source);
409
410  RETURN_ON_EXCEPTION(
411      isolate,
412      RegExp::Compile(isolate, regexp, source, JSRegExp::AsRegExpFlags(flags),
413                      backtrack_limit),
414      JSRegExp);
415
416  Handle<String> escaped_source;
417  ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source,
418                             EscapeRegExpSource(isolate, source), JSRegExp);
419
420  regexp->set_source(*escaped_source);
421  regexp->set_flags(Smi::FromInt(flags));
422
423  Map map = regexp->map();
424  Object constructor = map.GetConstructor();
425  if (constructor.IsJSFunction() &&
426      JSFunction::cast(constructor).initial_map() == map) {
427    // If we still have the original map, set in-object properties directly.
428    regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex,
429                                  Smi::FromInt(kInitialLastIndexValue),
430                                  SKIP_WRITE_BARRIER);
431  } else {
432    // Map has changed, so use generic, but slower, method.
433    RETURN_ON_EXCEPTION(
434        isolate,
435        Object::SetProperty(
436            isolate, regexp, factory->lastIndex_string(),
437            Handle<Smi>(Smi::FromInt(kInitialLastIndexValue), isolate)),
438        JSRegExp);
439  }
440
441  return regexp;
442}
443
444}  // namespace internal
445}  // namespace v8
446