1// Copyright 2020 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
9
10#include "src/base/strings.h"
11#include "src/regexp/special-case.h"
12
13namespace v8 {
14namespace internal {
15
16static const base::uc32 kSurrogateStart = 0xd800;
17static const base::uc32 kSurrogateEnd = 0xdfff;
18static const base::uc32 kNonBmpStart = 0x10000;
19
20// The following code generates "src/regexp/special-case.cc".
21void PrintSet(std::ofstream& out, const char* name,
22              const icu::UnicodeSet& set) {
23  out << "icu::UnicodeSet Build" << name << "() {\n"
24      << "  icu::UnicodeSet set;\n";
25  for (int32_t i = 0; i < set.getRangeCount(); i++) {
26    if (set.getRangeStart(i) == set.getRangeEnd(i)) {
27      out << "  set.add(0x" << set.getRangeStart(i) << ");\n";
28    } else {
29      out << "  set.add(0x" << set.getRangeStart(i) << ", 0x"
30          << set.getRangeEnd(i) << ");\n";
31    }
32  }
33  out << "  set.freeze();\n"
34      << "  return set;\n"
35      << "}\n\n";
36
37  out << "struct " << name << "Data {\n"
38      << "  " << name << "Data() : set(Build" << name << "()) {}\n"
39      << "  const icu::UnicodeSet set;\n"
40      << "};\n\n";
41
42  out << "//static\n"
43      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
44      << "  static base::LazyInstance<" << name << "Data>::type set =\n"
45      << "      LAZY_INSTANCE_INITIALIZER;\n"
46      << "  return set.Pointer()->set;\n"
47      << "}\n\n";
48}
49
50void PrintSpecial(std::ofstream& out) {
51  icu::UnicodeSet current;
52  icu::UnicodeSet special_add;
53  icu::UnicodeSet ignore;
54  UErrorCode status = U_ZERO_ERROR;
55  icu::UnicodeSet upper("[\\p{Lu}]", status);
56  CHECK(U_SUCCESS(status));
57
58  // Iterate through all chars in BMP except surrogates.
59  for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
60    if (i >= static_cast<UChar32>(kSurrogateStart) &&
61        i <= static_cast<UChar32>(kSurrogateEnd)) {
62      continue;  // Ignore surrogate range
63    }
64    current.set(i, i);
65    current.closeOver(USET_CASE_INSENSITIVE);
66
67    // Check to see if all characters in the case-folding equivalence
68    // class as defined by UnicodeSet::closeOver all map to the same
69    // canonical value.
70    UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
71    bool class_has_matching_canonical_char = false;
72    bool class_has_non_matching_canonical_char = false;
73    for (int32_t j = 0; j < current.getRangeCount(); j++) {
74      for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
75           c++) {
76        if (c == i) {
77          continue;
78        }
79        UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
80        if (canonical == other_canonical) {
81          class_has_matching_canonical_char = true;
82        } else {
83          class_has_non_matching_canonical_char = true;
84        }
85      }
86    }
87    // If any other character in i's equivalence class has a
88    // different canonical value, then i needs special handling.  If
89    // no other character shares a canonical value with i, we can
90    // ignore i when adding alternatives for case-independent
91    // comparison.  If at least one other character shares a
92    // canonical value, then i needs special handling.
93    if (class_has_non_matching_canonical_char) {
94      if (class_has_matching_canonical_char) {
95        special_add.add(i);
96      } else {
97        ignore.add(i);
98      }
99    }
100  }
101
102  // Verify that no Unicode equivalence class contains two non-trivial
103  // JS equivalence classes. Every character in SpecialAddSet has the
104  // same canonical value as every other non-IgnoreSet character in
105  // its Unicode equivalence class. Therefore, if we call closeOver on
106  // a set containing no IgnoreSet characters, the only characters
107  // that must be removed from the result are in IgnoreSet. This fact
108  // is used in CharacterRange::AddCaseEquivalents.
109  for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
110    for (UChar32 c = special_add.getRangeStart(i);
111         c <= special_add.getRangeEnd(i); c++) {
112      UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
113      current.set(c, c);
114      current.closeOver(USET_CASE_INSENSITIVE);
115      current.removeAll(ignore);
116      for (int32_t j = 0; j < current.getRangeCount(); j++) {
117        for (UChar32 c2 = current.getRangeStart(j);
118             c2 <= current.getRangeEnd(j); c2++) {
119          CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
120        }
121      }
122    }
123  }
124
125  PrintSet(out, "IgnoreSet", ignore);
126  PrintSet(out, "SpecialAddSet", special_add);
127}
128
129void WriteHeader(const char* header_filename) {
130  std::ofstream out(header_filename);
131  out << std::hex << std::setfill('0') << std::setw(4);
132  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
133      << "// Use of this source code is governed by a BSD-style license that\n"
134      << "// can be found in the LICENSE file.\n\n"
135      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
136      << "// The following functions are used to build UnicodeSets\n"
137      << "// for special cases where the case-folding algorithm used by\n"
138      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
139      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
140      << "// Semantics: Canonicalize) step 3.\n\n"
141      << "#ifdef V8_INTL_SUPPORT\n"
142      << "#include \"src/base/lazy-instance.h\"\n\n"
143      << "#include \"src/regexp/special-case.h\"\n\n"
144      << "#include \"unicode/uniset.h\"\n"
145      << "namespace v8 {\n"
146      << "namespace internal {\n\n";
147
148  PrintSpecial(out);
149
150  out << "\n"
151      << "}  // namespace internal\n"
152      << "}  // namespace v8\n"
153      << "#endif  // V8_INTL_SUPPORT\n";
154}
155
156}  // namespace internal
157}  // namespace v8
158
159int main(int argc, const char** argv) {
160  if (argc != 2) {
161    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
162    std::exit(1);
163  }
164  v8::internal::WriteHeader(argv[1]);
165
166  return 0;
167}
168