1// Copyright (C) 2012 The Libphonenumber Authors 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14// 15// Author: Patrick Mezard 16 17#include "cpp-build/generate_geocoding_data.h" 18 19#include <dirent.h> 20#include <errno.h> 21#include <locale> 22#include <sys/stat.h> 23#include <algorithm> 24#include <cctype> 25#include <cmath> 26#include <cstdio> 27#include <cstring> 28#include <iomanip> 29#include <iterator> 30#include <map> 31#include <set> 32#include <sstream> 33#include <string> 34#include <utility> 35#include <vector> 36 37#include "base/basictypes.h" 38 39#include "absl/container/btree_map.h" 40#include "absl/container/btree_set.h" 41 42namespace i18n { 43namespace phonenumbers { 44 45using std::map; 46using std::string; 47using std::vector; 48using std::set; 49using std::pair; 50 51template <typename ResourceType> class AutoCloser { 52 public: 53 typedef int (*ReleaseFunction) (ResourceType* resource); 54 55 AutoCloser(ResourceType** resource, ReleaseFunction release_function) 56 : resource_(resource), 57 release_function_(release_function) 58 {} 59 60 ~AutoCloser() { 61 Close(); 62 } 63 64 ResourceType* get_resource() const { 65 return *resource_; 66 } 67 68 void Close() { 69 if (*resource_) { 70 release_function_(*resource_); 71 *resource_ = NULL; 72 } 73 } 74 75 private: 76 ResourceType** resource_; 77 ReleaseFunction release_function_; 78}; 79 80enum DirEntryKinds { 81 kFile = 0, 82 kDirectory = 1, 83}; 84 85class DirEntry { 86 public: 
87 DirEntry(const char* n, DirEntryKinds k) 88 : name_(n), 89 kind_(k) 90 {} 91 92 const std::string& name() const { return name_; } 93 DirEntryKinds kind() const { return kind_; } 94 95 private: 96 std::string name_; 97 DirEntryKinds kind_; 98}; 99 100// Lists directory entries in path. "." and ".." are excluded. Returns true on 101// success. 102bool ListDirectory(const string& path, vector<DirEntry>* entries) { 103 entries->clear(); 104 DIR* dir = opendir(path.c_str()); 105 if (!dir) { 106 return false; 107 } 108 AutoCloser<DIR> dir_closer(&dir, closedir); 109 struct dirent *entry; 110 struct stat entry_stat; 111 while (true) { 112 // Set errno to 0 to be able to check if an error occurs during the 113 // readdir() call. NULL is the return value when the end of the directory 114 // stream is reached or when an error occurs, and the errno check is the 115 // only thing that helps us distinguish between the two cases. See 116 // documentation at 117 // http://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html 118 errno = 0; 119 entry = readdir(dir); 120 if (entry == NULL) { 121 return errno == 0; 122 } 123 if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { 124 continue; 125 } 126 const string entry_path = path + "/" + entry->d_name; 127 if (stat(entry_path.c_str(), &entry_stat)) { 128 return false; 129 } 130 DirEntryKinds kind = kFile; 131 if (S_ISDIR(entry_stat.st_mode)) { 132 kind = kDirectory; 133 } else if (!S_ISREG(entry_stat.st_mode)) { 134 continue; 135 } 136 entries->push_back(DirEntry(entry->d_name, kind)); 137 } 138} 139 140// Returns true if s ends with suffix. 141bool EndsWith(const string& s, const string& suffix) { 142 if (suffix.length() > s.length()) { 143 return false; 144 } 145 return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin()); 146} 147 148// Converts string to integer, returns true on success. 
149bool StrToInt(const string& s, int32* n) { 150 std::stringstream stream; 151 stream << s; 152 stream >> *n; 153 return !stream.fail(); 154} 155 156// Converts integer to string, returns true on success. 157bool IntToStr(int32 n, string* s) { 158 std::stringstream stream; 159 stream << n; 160 stream >> *s; 161 return !stream.fail(); 162} 163 164// Parses the prefix descriptions file at path, clears and fills the output 165// prefixes phone number prefix to description mapping. 166// Returns true on success. 167bool ParsePrefixes(const string& path, 168 absl::btree_map<int32, string>* prefixes) { 169 prefixes->clear(); 170 FILE* input = fopen(path.c_str(), "r"); 171 if (!input) { 172 return false; 173 } 174 AutoCloser<FILE> input_closer(&input, fclose); 175 const int kMaxLineLength = 2*1024; 176 vector<char> buffer(kMaxLineLength); 177 vector<char>::iterator begin, end, sep; 178 string prefix, description; 179 int32 prefix_code; 180 while (fgets(&buffer[0], buffer.size(), input)) { 181 begin = buffer.begin(); 182 end = std::find(begin, buffer.end(), '\0'); 183 if (end == begin) { 184 continue; 185 } 186 --end; 187 if (*end != '\n' && !feof(input)) { 188 // A line without LF can only happen at the end of file. 189 return false; 190 } 191 192 // Trim and check for comments. 193 for (; begin != end && std::isspace(*begin); ++begin) {} 194 for (; end != begin && std::isspace(*(end - 1)); --end) {} 195 if (begin == end || *begin == '#') { 196 continue; 197 } 198 199 sep = std::find(begin, end, '|'); 200 if (sep == end) { 201 continue; 202 } 203 prefix = string(begin, sep); 204 if (!StrToInt(prefix, &prefix_code)) { 205 return false; 206 } 207 (*prefixes)[prefix_code] = string(sep + 1, end); 208 } 209 return ferror(input) == 0; 210} 211 212// Builds a C string literal from s. The output is enclosed in double-quotes and 213// care is taken to escape input quotes and non-ASCII or control characters. 
//
// An input string:
//   Op\xc3\xa9ra
// becomes:
//   "Op""\xc3""\xa9""ra"
//
// Printable ASCII characters are copied as is, except that single quotes,
// double quotes and backslashes are preceded by a backslash so that the result
// is always a well-formed literal. Every other byte is written as a two-digit
// "\x" escape. The literal is split ("" is inserted) between a hex escape and
// whatever follows or precedes it, so that a following hex-digit character can
// never be absorbed into the escape sequence.
std::string MakeStringLiteral(const std::string& s) {
  // Kind of the previously emitted character.
  enum { kNone = 0, kPlain = 1, kHex = 2 };
  int previous = kNone;
  std::stringstream buffer;
  buffer << std::hex << std::setfill('0');
  buffer << "\"";
  for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
    // Work on the unsigned value so that the result does not depend on the
    // signedness of plain char.
    const unsigned char c = static_cast<unsigned char>(*it);
    if (c >= 32 && c < 127) {
      if (previous == kHex) {
        buffer << "\"\"";
      }
      if (c == '\'' || c == '"' || c == '\\') {
        buffer << "\\";
      }
      buffer << static_cast<char>(c);
      previous = kPlain;
    } else {
      if (previous != kNone) {
        buffer << "\"\"";
      }
      buffer << "\\x" << std::setw(2) << static_cast<int>(c);
      previous = kHex;
    }
  }
  buffer << "\"";
  return buffer.str();
}

// Writes s to output as a C string literal (see MakeStringLiteral()).
void WriteStringLiteral(const std::string& s, FILE* output) {
  std::string literal = MakeStringLiteral(s);
  fprintf(output, "%s", literal.c_str());
}

// License header written at the top of the generated file.
const char kLicense[] =
  "// Copyright (C) 2012 The Libphonenumber Authors\n"
  "//\n"
  "// Licensed under the Apache License, Version 2.0 (the \"License\");\n"
  "// you may not use this file except in compliance with the License.\n"
  "// You may obtain a copy of the License at\n"
  "//\n"
  "// http://www.apache.org/licenses/LICENSE-2.0\n"
  "//\n"
  "// Unless required by applicable law or agreed to in writing, software\n"
  "// distributed under the License is distributed on an \"AS IS\" BASIS,\n"
  "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or "
  "implied.\n"
  "// See the License for the specific language governing permissions and\n"
  "// limitations under the License.\n"
  "//\n"
  "// This file is generated automatically, do not edit it manually.\n"
  "\n";

// Writes the license header to output.
void WriteLicense(FILE* output) {
  fprintf(output, "%s", kLicense);
}

// Names of the namespaces enclosing the generated code.
const char kI18NNS[] = "i18n";
const char kPhoneNumbersNS[] = "phonenumbers";

// Writes the lines opening the two enclosing namespaces.
void WriteNSHeader(FILE* output) {
  fprintf(output, "namespace %s {\n", kI18NNS);
  fprintf(output, "namespace %s {\n", kPhoneNumbersNS);
}

// Writes the lines closing the two enclosing namespaces, innermost first.
void WriteNSFooter(FILE* output) {
  fprintf(output, "} // namespace %s\n", kPhoneNumbersNS);
  fprintf(output, "} // namespace %s\n", kI18NNS);
}

// Writes the #include directives of the generated file: the geocoding header
// named after "base_name", then <cstdint>.
void WriteCppHeader(const std::string& base_name, FILE* output) {
  fprintf(output, "#include \"phonenumbers/geocoding/%s.h\"\n",
          base_name.c_str());
  fprintf(output, "\n");
  fprintf(output, "#include <cstdint>\n");
  fprintf(output, "\n");
}

// Writes two initializer lines: the array "name" itself, then an expression
// evaluating to its number of elements.
void WriteArrayAndSize(const std::string& name, FILE* output) {
  fprintf(output, " %s,\n", name.c_str());
  fprintf(output, " sizeof(%s)/sizeof(*%s),\n", name.c_str(), name.c_str());
}

// Writes a PrefixDescriptions variable named "name", with its prefixes field
// set to "prefixes_name" variable, its descriptions to "desc_name" and its
// possible_lengths to "possible_lengths_name":
//
// const PrefixDescriptions ${name} = {
//   ${prefix_name},
//   sizeof(${prefix_name})/sizeof(*${prefix_name}),
//   ${desc_name},
//   ${possible_lengths_name},
//   sizeof(${possible_lengths_name})/sizeof(*${possible_lengths_name}),
// };
//
void WritePrefixDescriptionsDefinition(
    const std::string& name, const std::string& prefixes_name,
    const std::string& desc_name, const std::string& possible_lengths_name,
    FILE* output) {
  fprintf(output, "const PrefixDescriptions %s = {\n", name.c_str());
  WriteArrayAndSize(prefixes_name, output);
  fprintf(output, " %s,\n", desc_name.c_str());
  WriteArrayAndSize(possible_lengths_name, output);
  fprintf(output, "};\n");
}

// Writes prefixes, descriptions and possible_lengths arrays built from the
// phone number prefix to description mapping "prefixes". Binds these arrays
// in a single PrefixDescriptions variable named "var_name".
326// 327// const int32_t ${var_name}_prefixes[] = { 328// 1201, 329// 1650, 330// }; 331// 332// const char* ${var_name}_descriptions[] = { 333// "New Jerse", 334// "Kalifornie", 335// }; 336// 337// const int32_t ${var_name}_possible_lengths[] = { 338// 4, 339// }; 340// 341// const PrefixDescriptions ${var_name} = { 342// ... 343// }; 344// 345void WritePrefixDescriptions(const string& var_name, 346 const absl::btree_map<int, string>& prefixes, 347 FILE* output) { 348 absl::btree_set<int> possible_lengths; 349 const string prefixes_name = var_name + "_prefixes"; 350 fprintf(output, "const int32_t %s[] = {\n", prefixes_name.c_str()); 351 for (absl::btree_map<int, string>::const_iterator it = prefixes.begin(); 352 it != prefixes.end(); ++it) { 353 fprintf(output, " %d,\n", it->first); 354 possible_lengths.insert(static_cast<int>(log10(it->first) + 1)); 355 } 356 fprintf(output, 357 "};\n" 358 "\n"); 359 360 const string desc_name = var_name + "_descriptions"; 361 fprintf(output, "const char* %s[] = {\n", desc_name.c_str()); 362 for (absl::btree_map<int, string>::const_iterator it = prefixes.begin(); 363 it != prefixes.end(); ++it) { 364 fprintf(output, " "); 365 WriteStringLiteral(it->second, output); 366 fprintf(output, ",\n"); 367 } 368 fprintf(output, 369 "};\n" 370 "\n"); 371 372 const string possible_lengths_name = var_name + "_possible_lengths"; 373 fprintf(output, "const int32_t %s[] = {\n ", possible_lengths_name.c_str()); 374 for (absl::btree_set<int>::const_iterator it = possible_lengths.begin(); 375 it != possible_lengths.end(); ++it) { 376 fprintf(output, " %d,", *it); 377 } 378 fprintf(output, 379 "\n" 380 "};\n" 381 "\n"); 382 383 WritePrefixDescriptionsDefinition(var_name, prefixes_name, desc_name, 384 possible_lengths_name, output); 385 fprintf(output, "\n"); 386} 387 388// Writes a pair of arrays mapping prefix language code pairs to 389// PrefixDescriptions instances. "prefix_var_names" maps language code pairs 390// to prefix variable names. 
391// 392// const char* prefix_language_code_pairs[] = { 393// "1_de", 394// "1_en", 395// }; 396// 397// const PrefixDescriptions* prefix_descriptions[] = { 398// &prefix_1_de, 399// &prefix_1_en, 400// }; 401// 402void WritePrefixesDescriptions( 403 const absl::btree_map<string, string>& prefix_var_names, FILE* output) { 404 fprintf(output, "const char* prefix_language_code_pairs[] = {\n"); 405 for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin(); 406 it != prefix_var_names.end(); ++it) { 407 fprintf(output, " \"%s\",\n", it->first.c_str()); 408 } 409 fprintf(output, 410 "};\n" 411 "\n" 412 "const PrefixDescriptions* prefixes_descriptions[] = {\n"); 413 for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin(); 414 it != prefix_var_names.end(); ++it) { 415 fprintf(output, " &%s,\n", it->second.c_str()); 416 } 417 fprintf(output, 418 "};\n" 419 "\n"); 420} 421 422// For each entry in "languages" mapping a country calling code to a set 423// of available languages, writes a sorted array of languages, then wraps it 424// into a CountryLanguages instance. Finally, writes a pair of arrays mapping 425// country calling codes to CountryLanguages instances. 426// 427// const char* country_1[] = { 428// "de", 429// "en", 430// }; 431// 432// const CountryLanguages country_1_languages = { 433// country_1, 434// sizeof(country_1)/sizeof(*country_1), 435// }; 436// 437// [...] 438// 439// const CountryLanguages* country_languages[] = { 440// &country_1_languages, 441// [...] 442// } 443// 444// const int country_calling_codes[] = { 445// 1, 446// [...] 
447// }; 448// 449bool WriteCountryLanguages(const map<int32, set<string> >& languages, 450 FILE* output) { 451 vector<string> country_languages_vars; 452 vector<string> countries; 453 for (map<int32, set<string> >::const_iterator it = languages.begin(); 454 it != languages.end(); ++it) { 455 string country_code; 456 if (!IntToStr(it->first, &country_code)) { 457 return false; 458 } 459 const string country_var = "country_" + country_code; 460 fprintf(output, "const char* %s[] = {\n", country_var.c_str()); 461 for (set<string>::const_iterator it_lang = it->second.begin(); 462 it_lang != it->second.end(); ++it_lang) { 463 fprintf(output, " \"%s\",\n", it_lang->c_str()); 464 } 465 fprintf(output, 466 "};\n" 467 "\n"); 468 469 const string country_languages_var = country_var + "_languages"; 470 fprintf(output, "const CountryLanguages %s = {\n", 471 country_languages_var.c_str()); 472 WriteArrayAndSize(country_var, output); 473 fprintf(output, 474 "};\n" 475 "\n"); 476 country_languages_vars.push_back(country_languages_var); 477 countries.push_back(country_code); 478 } 479 480 fprintf(output, 481 "\n" 482 "const CountryLanguages* countries_languages[] = {\n"); 483 for (vector<string>::const_iterator 484 it_languages_var = country_languages_vars.begin(); 485 it_languages_var != country_languages_vars.end(); ++it_languages_var) { 486 fprintf(output, " &%s,\n", it_languages_var->c_str()); 487 } 488 fprintf(output, 489 "};\n" 490 "\n" 491 "const int country_calling_codes[] = {\n"); 492 for (vector<string>::const_iterator it_country = countries.begin(); 493 it_country != countries.end(); ++it_country) { 494 fprintf(output, " %s,\n", it_country->c_str()); 495 } 496 fprintf(output, 497 "};\n" 498 "\n"); 499 return true; 500} 501 502// Returns a copy of input where all occurences of pattern are replaced with 503// value. If pattern is empty, input is returned unchanged. 
504string ReplaceAll(const string& input, const string& pattern, 505 const string& value) { 506 if (pattern.size() == 0) { 507 return input; 508 } 509 string replaced; 510 std::back_insert_iterator<string> output = std::back_inserter(replaced); 511 string::const_iterator begin = input.begin(), end = begin; 512 while (true) { 513 const size_t pos = input.find(pattern, begin - input.begin()); 514 if (pos == string::npos) { 515 std::copy(begin, input.end(), output); 516 break; 517 } 518 end = input.begin() + pos; 519 std::copy(begin, end, output); 520 std::copy(value.begin(), value.end(), output); 521 begin = end + pattern.length(); 522 } 523 return replaced; 524} 525 526// Writes data accessor definitions, prefixed with "accessor_prefix". 527void WriteAccessorsDefinitions(const string& accessor_prefix, FILE* output) { 528 string templ = 529 "const int* get$prefix$_country_calling_codes() {\n" 530 " return country_calling_codes;\n" 531 "}\n" 532 "\n" 533 "int get$prefix$_country_calling_codes_size() {\n" 534 " return sizeof(country_calling_codes)\n" 535 " /sizeof(*country_calling_codes);\n" 536 "}\n" 537 "\n" 538 "const CountryLanguages* get$prefix$_country_languages(int index) {\n" 539 " return countries_languages[index];\n" 540 "}\n" 541 "\n" 542 "const char** get$prefix$_prefix_language_code_pairs() {\n" 543 " return prefix_language_code_pairs;\n" 544 "}\n" 545 "\n" 546 "int get$prefix$_prefix_language_code_pairs_size() {\n" 547 " return sizeof(prefix_language_code_pairs)\n" 548 " /sizeof(*prefix_language_code_pairs);\n" 549 "}\n" 550 "\n" 551 "const PrefixDescriptions* get$prefix$_prefix_descriptions(int index) {\n" 552 " return prefixes_descriptions[index];\n" 553 "}\n"; 554 string defs = ReplaceAll(templ, "$prefix$", accessor_prefix); 555 fprintf(output, "%s", defs.c_str()); 556} 557 558// Writes geocoding data .cc file. "data_path" is the path of geocoding textual 559// data directory. 
"base_name" is the base name of the .h/.cc pair, like 560// "geocoding_data". 561bool WriteSource(const string& data_path, const string& base_name, 562 const string& accessor_prefix, FILE* output) { 563 WriteLicense(output); 564 WriteCppHeader(base_name, output); 565 WriteNSHeader(output); 566 fprintf(output, 567 "namespace {\n" 568 "\n"); 569 570 // Enumerate language/script directories. 571 absl::btree_map<string, string> prefix_vars; 572 map<int32, set<string> > country_languages; 573 vector<DirEntry> entries; 574 if (!ListDirectory(data_path, &entries)) { 575 fprintf(stderr, "failed to read directory entries"); 576 return false; 577 } 578 for (vector<DirEntry>::const_iterator it = entries.begin(); 579 it != entries.end(); ++it) { 580 if (it->kind() != kDirectory) { 581 continue; 582 } 583 // Enumerate country calling code files. 584 const string dir_path = data_path + "/" + it->name(); 585 vector<DirEntry> files; 586 if (!ListDirectory(dir_path, &files)) { 587 fprintf(stderr, "failed to read file entries\n"); 588 return false; 589 } 590 for (vector<DirEntry>::const_iterator it_files = files.begin(); 591 it_files != files.end(); ++it_files) { 592 const string fname = it_files->name(); 593 if (!EndsWith(fname, ".txt")) { 594 continue; 595 } 596 int32 country_code; 597 const string country_code_str = fname.substr(0, fname.length() - 4); 598 if (!StrToInt(country_code_str, &country_code)) { 599 return false; 600 } 601 const string path = dir_path + "/" + fname; 602 603 absl::btree_map<int32, string> prefixes; 604 if (!ParsePrefixes(path, &prefixes)) { 605 return false; 606 } 607 608 const string prefix_var = "prefix_" + country_code_str + "_" + it->name(); 609 WritePrefixDescriptions(prefix_var, prefixes, output); 610 prefix_vars[country_code_str + "_" + it->name()] = prefix_var; 611 country_languages[country_code].insert(it->name()); 612 } 613 } 614 WritePrefixesDescriptions(prefix_vars, output); 615 if (!WriteCountryLanguages(country_languages, output)) { 616 
return false; 617 } 618 fprintf(output, "} // namespace\n"); 619 fprintf(output, "\n"); 620 WriteAccessorsDefinitions(accessor_prefix, output); 621 WriteNSFooter(output); 622 return ferror(output) == 0; 623} 624 625int PrintHelp(const string& message) { 626 fprintf(stderr, "error: %s\n", message.c_str()); 627 fprintf(stderr, "generate_geocoding_data DATADIR CCPATH"); 628 return 1; 629} 630 631int Main(int argc, const char* argv[]) { 632 if (argc < 2) { 633 return PrintHelp("geocoding data root directory expected"); 634 } 635 if (argc < 3) { 636 return PrintHelp("output source path expected"); 637 } 638 string accessor_prefix = ""; 639 if (argc > 3) { 640 accessor_prefix = argv[3]; 641 } 642 const string root_path(argv[1]); 643 string source_path(argv[2]); 644 std::replace(source_path.begin(), source_path.end(), '\\', '/'); 645 string base_name = source_path; 646 if (base_name.rfind('/') != string::npos) { 647 base_name = base_name.substr(base_name.rfind('/') + 1); 648 } 649 base_name = base_name.substr(0, base_name.rfind('.')); 650 651 FILE* source_fp = fopen(source_path.c_str(), "w"); 652 if (!source_fp) { 653 fprintf(stderr, "failed to open %s\n", source_path.c_str()); 654 return 1; 655 } 656 AutoCloser<FILE> source_closer(&source_fp, fclose); 657 if (!WriteSource(root_path, base_name, accessor_prefix, 658 source_fp)) { 659 return 1; 660 } 661 return 0; 662} 663 664} // namespace phonenumbers 665} // namespace i18n 666