1// Copyright 2020 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <fstream> 6#include <iomanip> 7#include <iostream> 8#include <sstream> 9 10#include "src/base/strings.h" 11#include "src/regexp/special-case.h" 12 13namespace v8 { 14namespace internal { 15 16static const base::uc32 kSurrogateStart = 0xd800; 17static const base::uc32 kSurrogateEnd = 0xdfff; 18static const base::uc32 kNonBmpStart = 0x10000; 19 20// The following code generates "src/regexp/special-case.cc". 21void PrintSet(std::ofstream& out, const char* name, 22 const icu::UnicodeSet& set) { 23 out << "icu::UnicodeSet Build" << name << "() {\n" 24 << " icu::UnicodeSet set;\n"; 25 for (int32_t i = 0; i < set.getRangeCount(); i++) { 26 if (set.getRangeStart(i) == set.getRangeEnd(i)) { 27 out << " set.add(0x" << set.getRangeStart(i) << ");\n"; 28 } else { 29 out << " set.add(0x" << set.getRangeStart(i) << ", 0x" 30 << set.getRangeEnd(i) << ");\n"; 31 } 32 } 33 out << " set.freeze();\n" 34 << " return set;\n" 35 << "}\n\n"; 36 37 out << "struct " << name << "Data {\n" 38 << " " << name << "Data() : set(Build" << name << "()) {}\n" 39 << " const icu::UnicodeSet set;\n" 40 << "};\n\n"; 41 42 out << "//static\n" 43 << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" 44 << " static base::LazyInstance<" << name << "Data>::type set =\n" 45 << " LAZY_INSTANCE_INITIALIZER;\n" 46 << " return set.Pointer()->set;\n" 47 << "}\n\n"; 48} 49 50void PrintSpecial(std::ofstream& out) { 51 icu::UnicodeSet current; 52 icu::UnicodeSet special_add; 53 icu::UnicodeSet ignore; 54 UErrorCode status = U_ZERO_ERROR; 55 icu::UnicodeSet upper("[\\p{Lu}]", status); 56 CHECK(U_SUCCESS(status)); 57 58 // Iterate through all chars in BMP except surrogates. 59 for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { 60 if (i >= static_cast<UChar32>(kSurrogateStart) && 61 i <= static_cast<UChar32>(kSurrogateEnd)) { 62 continue; // Ignore surrogate range 63 } 64 current.set(i, i); 65 current.closeOver(USET_CASE_INSENSITIVE); 66 67 // Check to see if all characters in the case-folding equivalence 68 // class as defined by UnicodeSet::closeOver all map to the same 69 // canonical value. 70 UChar32 canonical = RegExpCaseFolding::Canonicalize(i); 71 bool class_has_matching_canonical_char = false; 72 bool class_has_non_matching_canonical_char = false; 73 for (int32_t j = 0; j < current.getRangeCount(); j++) { 74 for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); 75 c++) { 76 if (c == i) { 77 continue; 78 } 79 UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); 80 if (canonical == other_canonical) { 81 class_has_matching_canonical_char = true; 82 } else { 83 class_has_non_matching_canonical_char = true; 84 } 85 } 86 } 87 // If any other character in i's equivalence class has a 88 // different canonical value, then i needs special handling. If 89 // no other character shares a canonical value with i, we can 90 // ignore i when adding alternatives for case-independent 91 // comparison. If at least one other character shares a 92 // canonical value, then i needs special handling. 93 if (class_has_non_matching_canonical_char) { 94 if (class_has_matching_canonical_char) { 95 special_add.add(i); 96 } else { 97 ignore.add(i); 98 } 99 } 100 } 101 102 // Verify that no Unicode equivalence class contains two non-trivial 103 // JS equivalence classes. Every character in SpecialAddSet has the 104 // same canonical value as every other non-IgnoreSet character in 105 // its Unicode equivalence class. Therefore, if we call closeOver on 106 // a set containing no IgnoreSet characters, the only characters 107 // that must be removed from the result are in IgnoreSet. This fact 108 // is used in CharacterRange::AddCaseEquivalents. 109 for (int32_t i = 0; i < special_add.getRangeCount(); i++) { 110 for (UChar32 c = special_add.getRangeStart(i); 111 c <= special_add.getRangeEnd(i); c++) { 112 UChar32 canonical = RegExpCaseFolding::Canonicalize(c); 113 current.set(c, c); 114 current.closeOver(USET_CASE_INSENSITIVE); 115 current.removeAll(ignore); 116 for (int32_t j = 0; j < current.getRangeCount(); j++) { 117 for (UChar32 c2 = current.getRangeStart(j); 118 c2 <= current.getRangeEnd(j); c2++) { 119 CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); 120 } 121 } 122 } 123 } 124 125 PrintSet(out, "IgnoreSet", ignore); 126 PrintSet(out, "SpecialAddSet", special_add); 127} 128 129void WriteHeader(const char* header_filename) { 130 std::ofstream out(header_filename); 131 out << std::hex << std::setfill('0') << std::setw(4); 132 out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" 133 << "// Use of this source code is governed by a BSD-style license that\n" 134 << "// can be found in the LICENSE file.\n\n" 135 << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" 136 << "// The following functions are used to build UnicodeSets\n" 137 << "// for special cases where the case-folding algorithm used by\n" 138 << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" 139 << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" 140 << "// Semantics: Canonicalize) step 3.\n\n" 141 << "#ifdef V8_INTL_SUPPORT\n" 142 << "#include \"src/base/lazy-instance.h\"\n\n" 143 << "#include \"src/regexp/special-case.h\"\n\n" 144 << "#include \"unicode/uniset.h\"\n" 145 << "namespace v8 {\n" 146 << "namespace internal {\n\n"; 147 148 PrintSpecial(out); 149 150 out << "\n" 151 << "} // namespace internal\n" 152 << "} // namespace v8\n" 153 << "#endif // V8_INTL_SUPPORT\n"; 154} 155 156} // namespace internal 157} // namespace v8 158 159int main(int argc, const char** argv) { 160 if (argc != 2) { 161 std::cerr << "Usage: " << argv[0] << " <output filename>\n"; 162 std::exit(1); 163 } 164 v8::internal::WriteHeader(argv[1]); 165 166 return 0; 167} 168