// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>

#include "src/base/strings.h"
#include "src/regexp/special-case.h"

namespace v8 {
namespace internal {

static const base::uc32 kSurrogateStart = 0xd800;
static const base::uc32 kSurrogateEnd = 0xdfff;
static const base::uc32 kNonBmpStart = 0x10000;

// The following code generates "src/regexp/special-case.cc".
211cb0ef41Sopenharmony_civoid PrintSet(std::ofstream& out, const char* name, 221cb0ef41Sopenharmony_ci const icu::UnicodeSet& set) { 231cb0ef41Sopenharmony_ci out << "icu::UnicodeSet Build" << name << "() {\n" 241cb0ef41Sopenharmony_ci << " icu::UnicodeSet set;\n"; 251cb0ef41Sopenharmony_ci for (int32_t i = 0; i < set.getRangeCount(); i++) { 261cb0ef41Sopenharmony_ci if (set.getRangeStart(i) == set.getRangeEnd(i)) { 271cb0ef41Sopenharmony_ci out << " set.add(0x" << set.getRangeStart(i) << ");\n"; 281cb0ef41Sopenharmony_ci } else { 291cb0ef41Sopenharmony_ci out << " set.add(0x" << set.getRangeStart(i) << ", 0x" 301cb0ef41Sopenharmony_ci << set.getRangeEnd(i) << ");\n"; 311cb0ef41Sopenharmony_ci } 321cb0ef41Sopenharmony_ci } 331cb0ef41Sopenharmony_ci out << " set.freeze();\n" 341cb0ef41Sopenharmony_ci << " return set;\n" 351cb0ef41Sopenharmony_ci << "}\n\n"; 361cb0ef41Sopenharmony_ci 371cb0ef41Sopenharmony_ci out << "struct " << name << "Data {\n" 381cb0ef41Sopenharmony_ci << " " << name << "Data() : set(Build" << name << "()) {}\n" 391cb0ef41Sopenharmony_ci << " const icu::UnicodeSet set;\n" 401cb0ef41Sopenharmony_ci << "};\n\n"; 411cb0ef41Sopenharmony_ci 421cb0ef41Sopenharmony_ci out << "//static\n" 431cb0ef41Sopenharmony_ci << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" 441cb0ef41Sopenharmony_ci << " static base::LazyInstance<" << name << "Data>::type set =\n" 451cb0ef41Sopenharmony_ci << " LAZY_INSTANCE_INITIALIZER;\n" 461cb0ef41Sopenharmony_ci << " return set.Pointer()->set;\n" 471cb0ef41Sopenharmony_ci << "}\n\n"; 481cb0ef41Sopenharmony_ci} 491cb0ef41Sopenharmony_ci 501cb0ef41Sopenharmony_civoid PrintSpecial(std::ofstream& out) { 511cb0ef41Sopenharmony_ci icu::UnicodeSet current; 521cb0ef41Sopenharmony_ci icu::UnicodeSet special_add; 531cb0ef41Sopenharmony_ci icu::UnicodeSet ignore; 541cb0ef41Sopenharmony_ci UErrorCode status = U_ZERO_ERROR; 551cb0ef41Sopenharmony_ci icu::UnicodeSet upper("[\\p{Lu}]", status); 561cb0ef41Sopenharmony_ci 
CHECK(U_SUCCESS(status)); 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_ci // Iterate through all chars in BMP except surrogates. 591cb0ef41Sopenharmony_ci for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { 601cb0ef41Sopenharmony_ci if (i >= static_cast<UChar32>(kSurrogateStart) && 611cb0ef41Sopenharmony_ci i <= static_cast<UChar32>(kSurrogateEnd)) { 621cb0ef41Sopenharmony_ci continue; // Ignore surrogate range 631cb0ef41Sopenharmony_ci } 641cb0ef41Sopenharmony_ci current.set(i, i); 651cb0ef41Sopenharmony_ci current.closeOver(USET_CASE_INSENSITIVE); 661cb0ef41Sopenharmony_ci 671cb0ef41Sopenharmony_ci // Check to see if all characters in the case-folding equivalence 681cb0ef41Sopenharmony_ci // class as defined by UnicodeSet::closeOver all map to the same 691cb0ef41Sopenharmony_ci // canonical value. 701cb0ef41Sopenharmony_ci UChar32 canonical = RegExpCaseFolding::Canonicalize(i); 711cb0ef41Sopenharmony_ci bool class_has_matching_canonical_char = false; 721cb0ef41Sopenharmony_ci bool class_has_non_matching_canonical_char = false; 731cb0ef41Sopenharmony_ci for (int32_t j = 0; j < current.getRangeCount(); j++) { 741cb0ef41Sopenharmony_ci for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); 751cb0ef41Sopenharmony_ci c++) { 761cb0ef41Sopenharmony_ci if (c == i) { 771cb0ef41Sopenharmony_ci continue; 781cb0ef41Sopenharmony_ci } 791cb0ef41Sopenharmony_ci UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); 801cb0ef41Sopenharmony_ci if (canonical == other_canonical) { 811cb0ef41Sopenharmony_ci class_has_matching_canonical_char = true; 821cb0ef41Sopenharmony_ci } else { 831cb0ef41Sopenharmony_ci class_has_non_matching_canonical_char = true; 841cb0ef41Sopenharmony_ci } 851cb0ef41Sopenharmony_ci } 861cb0ef41Sopenharmony_ci } 871cb0ef41Sopenharmony_ci // If any other character in i's equivalence class has a 881cb0ef41Sopenharmony_ci // different canonical value, then i needs special handling. 
If 891cb0ef41Sopenharmony_ci // no other character shares a canonical value with i, we can 901cb0ef41Sopenharmony_ci // ignore i when adding alternatives for case-independent 911cb0ef41Sopenharmony_ci // comparison. If at least one other character shares a 921cb0ef41Sopenharmony_ci // canonical value, then i needs special handling. 931cb0ef41Sopenharmony_ci if (class_has_non_matching_canonical_char) { 941cb0ef41Sopenharmony_ci if (class_has_matching_canonical_char) { 951cb0ef41Sopenharmony_ci special_add.add(i); 961cb0ef41Sopenharmony_ci } else { 971cb0ef41Sopenharmony_ci ignore.add(i); 981cb0ef41Sopenharmony_ci } 991cb0ef41Sopenharmony_ci } 1001cb0ef41Sopenharmony_ci } 1011cb0ef41Sopenharmony_ci 1021cb0ef41Sopenharmony_ci // Verify that no Unicode equivalence class contains two non-trivial 1031cb0ef41Sopenharmony_ci // JS equivalence classes. Every character in SpecialAddSet has the 1041cb0ef41Sopenharmony_ci // same canonical value as every other non-IgnoreSet character in 1051cb0ef41Sopenharmony_ci // its Unicode equivalence class. Therefore, if we call closeOver on 1061cb0ef41Sopenharmony_ci // a set containing no IgnoreSet characters, the only characters 1071cb0ef41Sopenharmony_ci // that must be removed from the result are in IgnoreSet. This fact 1081cb0ef41Sopenharmony_ci // is used in CharacterRange::AddCaseEquivalents. 
1091cb0ef41Sopenharmony_ci for (int32_t i = 0; i < special_add.getRangeCount(); i++) { 1101cb0ef41Sopenharmony_ci for (UChar32 c = special_add.getRangeStart(i); 1111cb0ef41Sopenharmony_ci c <= special_add.getRangeEnd(i); c++) { 1121cb0ef41Sopenharmony_ci UChar32 canonical = RegExpCaseFolding::Canonicalize(c); 1131cb0ef41Sopenharmony_ci current.set(c, c); 1141cb0ef41Sopenharmony_ci current.closeOver(USET_CASE_INSENSITIVE); 1151cb0ef41Sopenharmony_ci current.removeAll(ignore); 1161cb0ef41Sopenharmony_ci for (int32_t j = 0; j < current.getRangeCount(); j++) { 1171cb0ef41Sopenharmony_ci for (UChar32 c2 = current.getRangeStart(j); 1181cb0ef41Sopenharmony_ci c2 <= current.getRangeEnd(j); c2++) { 1191cb0ef41Sopenharmony_ci CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); 1201cb0ef41Sopenharmony_ci } 1211cb0ef41Sopenharmony_ci } 1221cb0ef41Sopenharmony_ci } 1231cb0ef41Sopenharmony_ci } 1241cb0ef41Sopenharmony_ci 1251cb0ef41Sopenharmony_ci PrintSet(out, "IgnoreSet", ignore); 1261cb0ef41Sopenharmony_ci PrintSet(out, "SpecialAddSet", special_add); 1271cb0ef41Sopenharmony_ci} 1281cb0ef41Sopenharmony_ci 1291cb0ef41Sopenharmony_civoid WriteHeader(const char* header_filename) { 1301cb0ef41Sopenharmony_ci std::ofstream out(header_filename); 1311cb0ef41Sopenharmony_ci out << std::hex << std::setfill('0') << std::setw(4); 1321cb0ef41Sopenharmony_ci out << "// Copyright 2020 the V8 project authors. 
All rights reserved.\n" 1331cb0ef41Sopenharmony_ci << "// Use of this source code is governed by a BSD-style license that\n" 1341cb0ef41Sopenharmony_ci << "// can be found in the LICENSE file.\n\n" 1351cb0ef41Sopenharmony_ci << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" 1361cb0ef41Sopenharmony_ci << "// The following functions are used to build UnicodeSets\n" 1371cb0ef41Sopenharmony_ci << "// for special cases where the case-folding algorithm used by\n" 1381cb0ef41Sopenharmony_ci << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" 1391cb0ef41Sopenharmony_ci << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" 1401cb0ef41Sopenharmony_ci << "// Semantics: Canonicalize) step 3.\n\n" 1411cb0ef41Sopenharmony_ci << "#ifdef V8_INTL_SUPPORT\n" 1421cb0ef41Sopenharmony_ci << "#include \"src/base/lazy-instance.h\"\n\n" 1431cb0ef41Sopenharmony_ci << "#include \"src/regexp/special-case.h\"\n\n" 1441cb0ef41Sopenharmony_ci << "#include \"unicode/uniset.h\"\n" 1451cb0ef41Sopenharmony_ci << "namespace v8 {\n" 1461cb0ef41Sopenharmony_ci << "namespace internal {\n\n"; 1471cb0ef41Sopenharmony_ci 1481cb0ef41Sopenharmony_ci PrintSpecial(out); 1491cb0ef41Sopenharmony_ci 1501cb0ef41Sopenharmony_ci out << "\n" 1511cb0ef41Sopenharmony_ci << "} // namespace internal\n" 1521cb0ef41Sopenharmony_ci << "} // namespace v8\n" 1531cb0ef41Sopenharmony_ci << "#endif // V8_INTL_SUPPORT\n"; 1541cb0ef41Sopenharmony_ci} 1551cb0ef41Sopenharmony_ci 1561cb0ef41Sopenharmony_ci} // namespace internal 1571cb0ef41Sopenharmony_ci} // namespace v8 1581cb0ef41Sopenharmony_ci 1591cb0ef41Sopenharmony_ciint main(int argc, const char** argv) { 1601cb0ef41Sopenharmony_ci if (argc != 2) { 1611cb0ef41Sopenharmony_ci std::cerr << "Usage: " << argv[0] << " <output filename>\n"; 1621cb0ef41Sopenharmony_ci std::exit(1); 1631cb0ef41Sopenharmony_ci } 1641cb0ef41Sopenharmony_ci v8::internal::WriteHeader(argv[1]); 1651cb0ef41Sopenharmony_ci 1661cb0ef41Sopenharmony_ci 
return 0; 1671cb0ef41Sopenharmony_ci} 168