1// Copyright 2019 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/objects/js-regexp.h" 6 7#include "src/base/strings.h" 8#include "src/common/globals.h" 9#include "src/objects/code.h" 10#include "src/objects/js-array-inl.h" 11#include "src/objects/js-regexp-inl.h" 12#include "src/regexp/regexp.h" 13 14namespace v8 { 15namespace internal { 16 17Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices( 18 Isolate* isolate, Handle<RegExpMatchInfo> match_info, 19 Handle<Object> maybe_names) { 20 Handle<JSRegExpResultIndices> indices(Handle<JSRegExpResultIndices>::cast( 21 isolate->factory()->NewJSObjectFromMap( 22 isolate->regexp_result_indices_map()))); 23 24 // Initialize indices length to avoid having a partially initialized object 25 // should GC be triggered by creating a NewFixedArray. 26 indices->set_length(Smi::zero()); 27 28 // Build indices array from RegExpMatchInfo. 29 int num_indices = match_info->NumberOfCaptureRegisters(); 30 int num_results = num_indices >> 1; 31 Handle<FixedArray> indices_array = 32 isolate->factory()->NewFixedArray(num_results); 33 JSArray::SetContent(indices, indices_array); 34 35 for (int i = 0; i < num_results; i++) { 36 int base_offset = i * 2; 37 int start_offset = match_info->Capture(base_offset); 38 int end_offset = match_info->Capture(base_offset + 1); 39 40 // Any unmatched captures are set to undefined, otherwise we set them to a 41 // subarray of the indices. 42 if (start_offset == -1) { 43 indices_array->set(i, ReadOnlyRoots(isolate).undefined_value()); 44 } else { 45 Handle<FixedArray> indices_sub_array( 46 isolate->factory()->NewFixedArray(2)); 47 indices_sub_array->set(0, Smi::FromInt(start_offset)); 48 indices_sub_array->set(1, Smi::FromInt(end_offset)); 49 Handle<JSArray> indices_sub_jsarray = 50 isolate->factory()->NewJSArrayWithElements(indices_sub_array, 51 PACKED_SMI_ELEMENTS, 2); 52 indices_array->set(i, *indices_sub_jsarray); 53 } 54 } 55 56 // If there are no capture groups, set the groups property to undefined. 57 FieldIndex groups_index = FieldIndex::ForDescriptor( 58 indices->map(), InternalIndex(kGroupsDescriptorIndex)); 59 if (maybe_names->IsUndefined(isolate)) { 60 indices->FastPropertyAtPut(groups_index, 61 ReadOnlyRoots(isolate).undefined_value()); 62 return indices; 63 } 64 65 // Create a groups property which returns a dictionary of named captures to 66 // their corresponding capture indices. 67 Handle<FixedArray> names(Handle<FixedArray>::cast(maybe_names)); 68 int num_names = names->length() >> 1; 69 Handle<HeapObject> group_names; 70 if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) { 71 group_names = isolate->factory()->NewSwissNameDictionary(num_names); 72 } else { 73 group_names = isolate->factory()->NewNameDictionary(num_names); 74 } 75 for (int i = 0; i < num_names; i++) { 76 int base_offset = i * 2; 77 int name_offset = base_offset; 78 int index_offset = base_offset + 1; 79 Handle<String> name(String::cast(names->get(name_offset)), isolate); 80 Handle<Smi> smi_index(Smi::cast(names->get(index_offset)), isolate); 81 Handle<Object> capture_indices(indices_array->get(smi_index->value()), 82 isolate); 83 if (!capture_indices->IsUndefined(isolate)) { 84 capture_indices = Handle<JSArray>::cast(capture_indices); 85 } 86 if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) { 87 group_names = SwissNameDictionary::Add( 88 isolate, Handle<SwissNameDictionary>::cast(group_names), name, 89 capture_indices, PropertyDetails::Empty()); 90 } else { 91 group_names = NameDictionary::Add( 92 isolate, Handle<NameDictionary>::cast(group_names), name, 93 capture_indices, PropertyDetails::Empty()); 94 } 95 } 96 97 // Convert group_names to a JSObject and store at the groups property of the 98 // result indices. 99 Handle<FixedArrayBase> elements = isolate->factory()->empty_fixed_array(); 100 Handle<HeapObject> null = 101 Handle<HeapObject>::cast(isolate->factory()->null_value()); 102 Handle<JSObject> js_group_names = 103 isolate->factory()->NewSlowJSObjectWithPropertiesAndElements( 104 null, group_names, elements); 105 indices->FastPropertyAtPut(groups_index, *js_group_names); 106 return indices; 107} 108 109uint32_t JSRegExp::backtrack_limit() const { 110 CHECK_EQ(type_tag(), IRREGEXP); 111 return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit))); 112} 113 114// static 115base::Optional<JSRegExp::Flags> JSRegExp::FlagsFromString( 116 Isolate* isolate, Handle<String> flags) { 117 const int length = flags->length(); 118 119 // A longer flags string cannot be valid. 120 if (length > JSRegExp::kFlagCount) return {}; 121 122 RegExpFlags value; 123 FlatStringReader reader(isolate, String::Flatten(isolate, flags)); 124 125 for (int i = 0; i < length; i++) { 126 base::Optional<RegExpFlag> flag = JSRegExp::FlagFromChar(reader.Get(i)); 127 if (!flag.has_value()) return {}; 128 if (value & flag.value()) return {}; // Duplicate. 129 value |= flag.value(); 130 } 131 132 return JSRegExp::AsJSRegExpFlags(value); 133} 134 135// static 136Handle<String> JSRegExp::StringFromFlags(Isolate* isolate, 137 JSRegExp::Flags flags) { 138 static constexpr int kStringTerminator = 1; 139 int cursor = 0; 140 char buffer[kFlagCount + kStringTerminator]; 141#define V(Lower, Camel, LowerCamel, Char, Bit) \ 142 if (flags & JSRegExp::k##Camel) buffer[cursor++] = Char; 143 REGEXP_FLAG_LIST(V) 144#undef V 145 buffer[cursor++] = '\0'; 146 DCHECK_LE(cursor, kFlagCount + kStringTerminator); 147 return isolate->factory()->NewStringFromAsciiChecked(buffer); 148} 149 150// static 151MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern, 152 Flags flags, uint32_t backtrack_limit) { 153 Handle<JSFunction> constructor = isolate->regexp_function(); 154 Handle<JSRegExp> regexp = 155 Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor)); 156 157 return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit); 158} 159 160Object JSRegExp::code(bool is_latin1) const { 161 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); 162 Object value = DataAt(code_index(is_latin1)); 163 DCHECK_IMPLIES(V8_EXTERNAL_CODE_SPACE_BOOL, value.IsSmi() || value.IsCodeT()); 164 return value; 165} 166 167void JSRegExp::set_code(bool is_latin1, Handle<Code> code) { 168 SetDataAt(code_index(is_latin1), ToCodeT(*code)); 169} 170 171Object JSRegExp::bytecode(bool is_latin1) const { 172 DCHECK(type_tag() == JSRegExp::IRREGEXP || 173 type_tag() == JSRegExp::EXPERIMENTAL); 174 return DataAt(bytecode_index(is_latin1)); 175} 176 177void JSRegExp::set_bytecode_and_trampoline(Isolate* isolate, 178 Handle<ByteArray> bytecode) { 179 SetDataAt(kIrregexpLatin1BytecodeIndex, *bytecode); 180 SetDataAt(kIrregexpUC16BytecodeIndex, *bytecode); 181 182 Handle<CodeT> trampoline = 183 BUILTIN_CODE(isolate, RegExpExperimentalTrampoline); 184 SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline); 185 SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline); 186} 187 188bool JSRegExp::ShouldProduceBytecode() { 189 return FLAG_regexp_interpret_all || 190 (FLAG_regexp_tier_up && !MarkedForTierUp()); 191} 192 193// Only irregexps are subject to tier-up. 194bool JSRegExp::CanTierUp() { 195 return FLAG_regexp_tier_up && type_tag() == JSRegExp::IRREGEXP; 196} 197 198// An irregexp is considered to be marked for tier up if the tier-up ticks 199// value reaches zero. 200bool JSRegExp::MarkedForTierUp() { 201 DCHECK(data().IsFixedArray()); 202 203 if (!CanTierUp()) { 204 return false; 205 } 206 207 return Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) == 0; 208} 209 210void JSRegExp::ResetLastTierUpTick() { 211 DCHECK(FLAG_regexp_tier_up); 212 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); 213 int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) + 1; 214 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex, 215 Smi::FromInt(tier_up_ticks)); 216} 217 218void JSRegExp::TierUpTick() { 219 DCHECK(FLAG_regexp_tier_up); 220 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); 221 int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)); 222 if (tier_up_ticks == 0) { 223 return; 224 } 225 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex, 226 Smi::FromInt(tier_up_ticks - 1)); 227} 228 229void JSRegExp::MarkTierUpForNextExec() { 230 DCHECK(FLAG_regexp_tier_up); 231 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); 232 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex, 233 Smi::zero()); 234} 235 236// static 237MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp, 238 Handle<String> source, 239 Handle<String> flags_string) { 240 Isolate* isolate = regexp->GetIsolate(); 241 base::Optional<Flags> flags = 242 JSRegExp::FlagsFromString(isolate, flags_string); 243 if (!flags.has_value()) { 244 THROW_NEW_ERROR( 245 isolate, 246 NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string), 247 JSRegExp); 248 } 249 return Initialize(regexp, source, flags.value()); 250} 251 252namespace { 253 254bool IsLineTerminator(int c) { 255 // Expected to return true for '\n', '\r', 0x2028, and 0x2029. 256 return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c)); 257} 258 259// TODO(jgruber): Consider merging CountAdditionalEscapeChars and 260// WriteEscapedRegExpSource into a single function to deduplicate dispatch logic 261// and move related code closer to each other. 262template <typename Char> 263int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) { 264 DisallowGarbageCollection no_gc; 265 int escapes = 0; 266 bool needs_escapes = false; 267 bool in_char_class = false; 268 base::Vector<const Char> src = source->GetCharVector<Char>(no_gc); 269 for (int i = 0; i < src.length(); i++) { 270 const Char c = src[i]; 271 if (c == '\\') { 272 if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) { 273 // This '\' is ignored since the next character itself will be escaped. 274 escapes--; 275 } else { 276 // Escape. Skip next character, which will be copied verbatim; 277 i++; 278 } 279 } else if (c == '/' && !in_char_class) { 280 // Not escaped forward-slash needs escape. 281 needs_escapes = true; 282 escapes++; 283 } else if (c == '[') { 284 in_char_class = true; 285 } else if (c == ']') { 286 in_char_class = false; 287 } else if (c == '\n') { 288 needs_escapes = true; 289 escapes++; 290 } else if (c == '\r') { 291 needs_escapes = true; 292 escapes++; 293 } else if (static_cast<int>(c) == 0x2028) { 294 needs_escapes = true; 295 escapes += std::strlen("\\u2028") - 1; 296 } else if (static_cast<int>(c) == 0x2029) { 297 needs_escapes = true; 298 escapes += std::strlen("\\u2029") - 1; 299 } else { 300 DCHECK(!IsLineTerminator(c)); 301 } 302 } 303 DCHECK(!in_char_class); 304 DCHECK_GE(escapes, 0); 305 DCHECK_IMPLIES(escapes != 0, needs_escapes); 306 *needs_escapes_out = needs_escapes; 307 return escapes; 308} 309 310template <typename Char> 311void WriteStringToCharVector(base::Vector<Char> v, int* d, const char* string) { 312 int s = 0; 313 while (string[s] != '\0') v[(*d)++] = string[s++]; 314} 315 316template <typename Char, typename StringType> 317Handle<StringType> WriteEscapedRegExpSource(Handle<String> source, 318 Handle<StringType> result) { 319 DisallowGarbageCollection no_gc; 320 base::Vector<const Char> src = source->GetCharVector<Char>(no_gc); 321 base::Vector<Char> dst(result->GetChars(no_gc), result->length()); 322 int s = 0; 323 int d = 0; 324 bool in_char_class = false; 325 while (s < src.length()) { 326 const Char c = src[s]; 327 if (c == '\\') { 328 if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) { 329 // This '\' is ignored since the next character itself will be escaped. 330 s++; 331 continue; 332 } else { 333 // Escape. Copy this and next character. 334 dst[d++] = src[s++]; 335 } 336 if (s == src.length()) break; 337 } else if (c == '/' && !in_char_class) { 338 // Not escaped forward-slash needs escape. 339 dst[d++] = '\\'; 340 } else if (c == '[') { 341 in_char_class = true; 342 } else if (c == ']') { 343 in_char_class = false; 344 } else if (c == '\n') { 345 WriteStringToCharVector(dst, &d, "\\n"); 346 s++; 347 continue; 348 } else if (c == '\r') { 349 WriteStringToCharVector(dst, &d, "\\r"); 350 s++; 351 continue; 352 } else if (static_cast<int>(c) == 0x2028) { 353 WriteStringToCharVector(dst, &d, "\\u2028"); 354 s++; 355 continue; 356 } else if (static_cast<int>(c) == 0x2029) { 357 WriteStringToCharVector(dst, &d, "\\u2029"); 358 s++; 359 continue; 360 } else { 361 DCHECK(!IsLineTerminator(c)); 362 } 363 dst[d++] = src[s++]; 364 } 365 DCHECK_EQ(result->length(), d); 366 DCHECK(!in_char_class); 367 return result; 368} 369 370MaybeHandle<String> EscapeRegExpSource(Isolate* isolate, 371 Handle<String> source) { 372 DCHECK(source->IsFlat()); 373 if (source->length() == 0) return isolate->factory()->query_colon_string(); 374 bool one_byte = String::IsOneByteRepresentationUnderneath(*source); 375 bool needs_escapes = false; 376 int additional_escape_chars = 377 one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes) 378 : CountAdditionalEscapeChars<base::uc16>(source, &needs_escapes); 379 if (!needs_escapes) return source; 380 int length = source->length() + additional_escape_chars; 381 if (one_byte) { 382 Handle<SeqOneByteString> result; 383 ASSIGN_RETURN_ON_EXCEPTION(isolate, result, 384 isolate->factory()->NewRawOneByteString(length), 385 String); 386 return WriteEscapedRegExpSource<uint8_t>(source, result); 387 } else { 388 Handle<SeqTwoByteString> result; 389 ASSIGN_RETURN_ON_EXCEPTION(isolate, result, 390 isolate->factory()->NewRawTwoByteString(length), 391 String); 392 return WriteEscapedRegExpSource<base::uc16>(source, result); 393 } 394} 395 396} // namespace 397 398// static 399MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp, 400 Handle<String> source, Flags flags, 401 uint32_t backtrack_limit) { 402 Isolate* isolate = regexp->GetIsolate(); 403 Factory* factory = isolate->factory(); 404 // If source is the empty string we set it to "(?:)" instead as 405 // suggested by ECMA-262, 5th, section 15.10.4.1. 406 if (source->length() == 0) source = factory->query_colon_string(); 407 408 source = String::Flatten(isolate, source); 409 410 RETURN_ON_EXCEPTION( 411 isolate, 412 RegExp::Compile(isolate, regexp, source, JSRegExp::AsRegExpFlags(flags), 413 backtrack_limit), 414 JSRegExp); 415 416 Handle<String> escaped_source; 417 ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source, 418 EscapeRegExpSource(isolate, source), JSRegExp); 419 420 regexp->set_source(*escaped_source); 421 regexp->set_flags(Smi::FromInt(flags)); 422 423 Map map = regexp->map(); 424 Object constructor = map.GetConstructor(); 425 if (constructor.IsJSFunction() && 426 JSFunction::cast(constructor).initial_map() == map) { 427 // If we still have the original map, set in-object properties directly. 428 regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex, 429 Smi::FromInt(kInitialLastIndexValue), 430 SKIP_WRITE_BARRIER); 431 } else { 432 // Map has changed, so use generic, but slower, method. 433 RETURN_ON_EXCEPTION( 434 isolate, 435 Object::SetProperty( 436 isolate, regexp, factory->lastIndex_string(), 437 Handle<Smi>(Smi::FromInt(kInitialLastIndexValue), isolate)), 438 JSRegExp); 439 } 440 441 return regexp; 442} 443 444} // namespace internal 445} // namespace v8 446