11cb0ef41Sopenharmony_ci// Copyright 2020 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
91cb0ef41Sopenharmony_ci
101cb0ef41Sopenharmony_ci#include "src/base/strings.h"
111cb0ef41Sopenharmony_ci#include "src/regexp/special-case.h"
121cb0ef41Sopenharmony_ci
131cb0ef41Sopenharmony_cinamespace v8 {
141cb0ef41Sopenharmony_cinamespace internal {
151cb0ef41Sopenharmony_ci
161cb0ef41Sopenharmony_cistatic const base::uc32 kSurrogateStart = 0xd800;
171cb0ef41Sopenharmony_cistatic const base::uc32 kSurrogateEnd = 0xdfff;
181cb0ef41Sopenharmony_cistatic const base::uc32 kNonBmpStart = 0x10000;
191cb0ef41Sopenharmony_ci
201cb0ef41Sopenharmony_ci// The following code generates "src/regexp/special-case.cc".
211cb0ef41Sopenharmony_civoid PrintSet(std::ofstream& out, const char* name,
221cb0ef41Sopenharmony_ci              const icu::UnicodeSet& set) {
231cb0ef41Sopenharmony_ci  out << "icu::UnicodeSet Build" << name << "() {\n"
241cb0ef41Sopenharmony_ci      << "  icu::UnicodeSet set;\n";
251cb0ef41Sopenharmony_ci  for (int32_t i = 0; i < set.getRangeCount(); i++) {
261cb0ef41Sopenharmony_ci    if (set.getRangeStart(i) == set.getRangeEnd(i)) {
271cb0ef41Sopenharmony_ci      out << "  set.add(0x" << set.getRangeStart(i) << ");\n";
281cb0ef41Sopenharmony_ci    } else {
291cb0ef41Sopenharmony_ci      out << "  set.add(0x" << set.getRangeStart(i) << ", 0x"
301cb0ef41Sopenharmony_ci          << set.getRangeEnd(i) << ");\n";
311cb0ef41Sopenharmony_ci    }
321cb0ef41Sopenharmony_ci  }
331cb0ef41Sopenharmony_ci  out << "  set.freeze();\n"
341cb0ef41Sopenharmony_ci      << "  return set;\n"
351cb0ef41Sopenharmony_ci      << "}\n\n";
361cb0ef41Sopenharmony_ci
371cb0ef41Sopenharmony_ci  out << "struct " << name << "Data {\n"
381cb0ef41Sopenharmony_ci      << "  " << name << "Data() : set(Build" << name << "()) {}\n"
391cb0ef41Sopenharmony_ci      << "  const icu::UnicodeSet set;\n"
401cb0ef41Sopenharmony_ci      << "};\n\n";
411cb0ef41Sopenharmony_ci
421cb0ef41Sopenharmony_ci  out << "//static\n"
431cb0ef41Sopenharmony_ci      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
441cb0ef41Sopenharmony_ci      << "  static base::LazyInstance<" << name << "Data>::type set =\n"
451cb0ef41Sopenharmony_ci      << "      LAZY_INSTANCE_INITIALIZER;\n"
461cb0ef41Sopenharmony_ci      << "  return set.Pointer()->set;\n"
471cb0ef41Sopenharmony_ci      << "}\n\n";
481cb0ef41Sopenharmony_ci}
491cb0ef41Sopenharmony_ci
501cb0ef41Sopenharmony_civoid PrintSpecial(std::ofstream& out) {
511cb0ef41Sopenharmony_ci  icu::UnicodeSet current;
521cb0ef41Sopenharmony_ci  icu::UnicodeSet special_add;
531cb0ef41Sopenharmony_ci  icu::UnicodeSet ignore;
541cb0ef41Sopenharmony_ci  UErrorCode status = U_ZERO_ERROR;
551cb0ef41Sopenharmony_ci  icu::UnicodeSet upper("[\\p{Lu}]", status);
561cb0ef41Sopenharmony_ci  CHECK(U_SUCCESS(status));
571cb0ef41Sopenharmony_ci
581cb0ef41Sopenharmony_ci  // Iterate through all chars in BMP except surrogates.
591cb0ef41Sopenharmony_ci  for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
601cb0ef41Sopenharmony_ci    if (i >= static_cast<UChar32>(kSurrogateStart) &&
611cb0ef41Sopenharmony_ci        i <= static_cast<UChar32>(kSurrogateEnd)) {
621cb0ef41Sopenharmony_ci      continue;  // Ignore surrogate range
631cb0ef41Sopenharmony_ci    }
641cb0ef41Sopenharmony_ci    current.set(i, i);
651cb0ef41Sopenharmony_ci    current.closeOver(USET_CASE_INSENSITIVE);
661cb0ef41Sopenharmony_ci
671cb0ef41Sopenharmony_ci    // Check to see if all characters in the case-folding equivalence
681cb0ef41Sopenharmony_ci    // class as defined by UnicodeSet::closeOver all map to the same
691cb0ef41Sopenharmony_ci    // canonical value.
701cb0ef41Sopenharmony_ci    UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
711cb0ef41Sopenharmony_ci    bool class_has_matching_canonical_char = false;
721cb0ef41Sopenharmony_ci    bool class_has_non_matching_canonical_char = false;
731cb0ef41Sopenharmony_ci    for (int32_t j = 0; j < current.getRangeCount(); j++) {
741cb0ef41Sopenharmony_ci      for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
751cb0ef41Sopenharmony_ci           c++) {
761cb0ef41Sopenharmony_ci        if (c == i) {
771cb0ef41Sopenharmony_ci          continue;
781cb0ef41Sopenharmony_ci        }
791cb0ef41Sopenharmony_ci        UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
801cb0ef41Sopenharmony_ci        if (canonical == other_canonical) {
811cb0ef41Sopenharmony_ci          class_has_matching_canonical_char = true;
821cb0ef41Sopenharmony_ci        } else {
831cb0ef41Sopenharmony_ci          class_has_non_matching_canonical_char = true;
841cb0ef41Sopenharmony_ci        }
851cb0ef41Sopenharmony_ci      }
861cb0ef41Sopenharmony_ci    }
871cb0ef41Sopenharmony_ci    // If any other character in i's equivalence class has a
881cb0ef41Sopenharmony_ci    // different canonical value, then i needs special handling.  If
891cb0ef41Sopenharmony_ci    // no other character shares a canonical value with i, we can
901cb0ef41Sopenharmony_ci    // ignore i when adding alternatives for case-independent
911cb0ef41Sopenharmony_ci    // comparison.  If at least one other character shares a
921cb0ef41Sopenharmony_ci    // canonical value, then i needs special handling.
931cb0ef41Sopenharmony_ci    if (class_has_non_matching_canonical_char) {
941cb0ef41Sopenharmony_ci      if (class_has_matching_canonical_char) {
951cb0ef41Sopenharmony_ci        special_add.add(i);
961cb0ef41Sopenharmony_ci      } else {
971cb0ef41Sopenharmony_ci        ignore.add(i);
981cb0ef41Sopenharmony_ci      }
991cb0ef41Sopenharmony_ci    }
1001cb0ef41Sopenharmony_ci  }
1011cb0ef41Sopenharmony_ci
1021cb0ef41Sopenharmony_ci  // Verify that no Unicode equivalence class contains two non-trivial
1031cb0ef41Sopenharmony_ci  // JS equivalence classes. Every character in SpecialAddSet has the
1041cb0ef41Sopenharmony_ci  // same canonical value as every other non-IgnoreSet character in
1051cb0ef41Sopenharmony_ci  // its Unicode equivalence class. Therefore, if we call closeOver on
1061cb0ef41Sopenharmony_ci  // a set containing no IgnoreSet characters, the only characters
1071cb0ef41Sopenharmony_ci  // that must be removed from the result are in IgnoreSet. This fact
1081cb0ef41Sopenharmony_ci  // is used in CharacterRange::AddCaseEquivalents.
1091cb0ef41Sopenharmony_ci  for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
1101cb0ef41Sopenharmony_ci    for (UChar32 c = special_add.getRangeStart(i);
1111cb0ef41Sopenharmony_ci         c <= special_add.getRangeEnd(i); c++) {
1121cb0ef41Sopenharmony_ci      UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
1131cb0ef41Sopenharmony_ci      current.set(c, c);
1141cb0ef41Sopenharmony_ci      current.closeOver(USET_CASE_INSENSITIVE);
1151cb0ef41Sopenharmony_ci      current.removeAll(ignore);
1161cb0ef41Sopenharmony_ci      for (int32_t j = 0; j < current.getRangeCount(); j++) {
1171cb0ef41Sopenharmony_ci        for (UChar32 c2 = current.getRangeStart(j);
1181cb0ef41Sopenharmony_ci             c2 <= current.getRangeEnd(j); c2++) {
1191cb0ef41Sopenharmony_ci          CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
1201cb0ef41Sopenharmony_ci        }
1211cb0ef41Sopenharmony_ci      }
1221cb0ef41Sopenharmony_ci    }
1231cb0ef41Sopenharmony_ci  }
1241cb0ef41Sopenharmony_ci
1251cb0ef41Sopenharmony_ci  PrintSet(out, "IgnoreSet", ignore);
1261cb0ef41Sopenharmony_ci  PrintSet(out, "SpecialAddSet", special_add);
1271cb0ef41Sopenharmony_ci}
1281cb0ef41Sopenharmony_ci
1291cb0ef41Sopenharmony_civoid WriteHeader(const char* header_filename) {
1301cb0ef41Sopenharmony_ci  std::ofstream out(header_filename);
1311cb0ef41Sopenharmony_ci  out << std::hex << std::setfill('0') << std::setw(4);
1321cb0ef41Sopenharmony_ci  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
1331cb0ef41Sopenharmony_ci      << "// Use of this source code is governed by a BSD-style license that\n"
1341cb0ef41Sopenharmony_ci      << "// can be found in the LICENSE file.\n\n"
1351cb0ef41Sopenharmony_ci      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
1361cb0ef41Sopenharmony_ci      << "// The following functions are used to build UnicodeSets\n"
1371cb0ef41Sopenharmony_ci      << "// for special cases where the case-folding algorithm used by\n"
1381cb0ef41Sopenharmony_ci      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
1391cb0ef41Sopenharmony_ci      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
1401cb0ef41Sopenharmony_ci      << "// Semantics: Canonicalize) step 3.\n\n"
1411cb0ef41Sopenharmony_ci      << "#ifdef V8_INTL_SUPPORT\n"
1421cb0ef41Sopenharmony_ci      << "#include \"src/base/lazy-instance.h\"\n\n"
1431cb0ef41Sopenharmony_ci      << "#include \"src/regexp/special-case.h\"\n\n"
1441cb0ef41Sopenharmony_ci      << "#include \"unicode/uniset.h\"\n"
1451cb0ef41Sopenharmony_ci      << "namespace v8 {\n"
1461cb0ef41Sopenharmony_ci      << "namespace internal {\n\n";
1471cb0ef41Sopenharmony_ci
1481cb0ef41Sopenharmony_ci  PrintSpecial(out);
1491cb0ef41Sopenharmony_ci
1501cb0ef41Sopenharmony_ci  out << "\n"
1511cb0ef41Sopenharmony_ci      << "}  // namespace internal\n"
1521cb0ef41Sopenharmony_ci      << "}  // namespace v8\n"
1531cb0ef41Sopenharmony_ci      << "#endif  // V8_INTL_SUPPORT\n";
1541cb0ef41Sopenharmony_ci}
1551cb0ef41Sopenharmony_ci
1561cb0ef41Sopenharmony_ci}  // namespace internal
1571cb0ef41Sopenharmony_ci}  // namespace v8
1581cb0ef41Sopenharmony_ci
1591cb0ef41Sopenharmony_ciint main(int argc, const char** argv) {
1601cb0ef41Sopenharmony_ci  if (argc != 2) {
1611cb0ef41Sopenharmony_ci    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
1621cb0ef41Sopenharmony_ci    std::exit(1);
1631cb0ef41Sopenharmony_ci  }
1641cb0ef41Sopenharmony_ci  v8::internal::WriteHeader(argv[1]);
1651cb0ef41Sopenharmony_ci
1661cb0ef41Sopenharmony_ci  return 0;
1671cb0ef41Sopenharmony_ci}
168