1// Copyright (C) 2012 The Libphonenumber Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Author: Patrick Mezard
16
17#include "cpp-build/generate_geocoding_data.h"
18
19#include <dirent.h>
20#include <errno.h>
21#include <locale>
22#include <sys/stat.h>
23#include <algorithm>
24#include <cctype>
25#include <cmath>
26#include <cstdio>
27#include <cstring>
28#include <iomanip>
29#include <iterator>
30#include <map>
31#include <set>
32#include <sstream>
33#include <string>
34#include <utility>
35#include <vector>
36
37#include "base/basictypes.h"
38
39#include "absl/container/btree_map.h"
40#include "absl/container/btree_set.h"
41
42namespace i18n {
43namespace phonenumbers {
44
45using std::map;
46using std::string;
47using std::vector;
48using std::set;
49using std::pair;
50
// Scope guard for a C-style resource handle such as FILE* or DIR*.
//
// The guard does not store the handle itself: it keeps the address of the
// caller's pointer variable. Close() (also run by the destructor) passes the
// current pointer to the release function and then resets the caller's
// variable to NULL, so closing twice is harmless.
template <typename ResourceType>
class AutoCloser {
 public:
  // Signature of the release routine, e.g. fclose() or closedir().
  typedef int (*ReleaseFunction) (ResourceType* resource);

  // "resource" is the address of the pointer to guard. It is dereferenced by
  // Close() and by the destructor, so it has to stay valid until then.
  AutoCloser(ResourceType** resource, ReleaseFunction release_function)
      : resource_(resource), release_function_(release_function) {}

  ~AutoCloser() { Close(); }

  // Returns the guarded pointer; NULL once the resource has been released.
  ResourceType* get_resource() const { return *resource_; }

  // Releases the resource if the guarded pointer is not NULL, then clears the
  // pointer. The value returned by the release function is ignored.
  void Close() {
    if (*resource_ == NULL) {
      return;
    }
    release_function_(*resource_);
    *resource_ = NULL;
  }

 private:
  ResourceType** resource_;
  ReleaseFunction release_function_;
};
79
// The two kinds of directory entries that DirEntry distinguishes.
enum DirEntryKinds {
  kFile = 0,
  kDirectory = 1,
};

// Immutable (name, kind) pair describing one entry of a directory. The name
// is copied from the C string passed to the constructor.
class DirEntry {
 public:
  DirEntry(const char* n, DirEntryKinds k) : name_(n), kind_(k) {}

  // Entry name as passed to the constructor.
  const std::string& name() const {
    return name_;
  }

  // Whether the entry is a file or a directory.
  DirEntryKinds kind() const {
    return kind_;
  }

 private:
  std::string name_;
  DirEntryKinds kind_;
};
99
100// Lists directory entries in path. "." and ".." are excluded. Returns true on
101// success.
102bool ListDirectory(const string& path, vector<DirEntry>* entries) {
103  entries->clear();
104  DIR* dir = opendir(path.c_str());
105  if (!dir) {
106    return false;
107  }
108  AutoCloser<DIR> dir_closer(&dir, closedir);
109  struct dirent *entry;
110  struct stat entry_stat;
111  while (true) {
112    // Set errno to 0 to be able to check if an error occurs during the
113    // readdir() call. NULL is the return value when the end of the directory
114    // stream is reached or when an error occurs, and the errno check is the
115    // only thing that helps us distinguish between the two cases. See
116    // documentation at
117    // http://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
118    errno = 0;
119    entry = readdir(dir);
120    if (entry == NULL) {
121      return errno == 0;
122    }
123    if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
124       continue;
125    }
126    const string entry_path = path + "/" + entry->d_name;
127    if (stat(entry_path.c_str(), &entry_stat)) {
128      return false;
129    }
130    DirEntryKinds kind = kFile;
131    if (S_ISDIR(entry_stat.st_mode)) {
132      kind = kDirectory;
133    } else if (!S_ISREG(entry_stat.st_mode)) {
134      continue;
135    }
136    entries->push_back(DirEntry(entry->d_name, kind));
137  }
138}
139
// Returns true if the last suffix.length() characters of s are equal to
// suffix. An empty suffix matches every string.
bool EndsWith(const string& s, const string& suffix) {
  const string::size_type suffix_length = suffix.length();
  return s.length() >= suffix_length &&
         s.compare(s.length() - suffix_length, suffix_length, suffix) == 0;
}
147
// Parses s as a decimal integer and stores the result in *n. Returns true on
// success.
//
// Whitespace around the number is accepted, but the number must otherwise
// span the whole string: input such as "12abc" or "" is rejected instead of
// being silently truncated to its numeric prefix. Values that do not fit in an
// int32 are rejected as well. The value left in *n after a failure should not
// be relied upon.
bool StrToInt(const string& s, int32* n) {
  std::istringstream stream(s);
  stream >> *n;
  if (stream.fail()) {
    return false;
  }
  // Skip trailing whitespace; anything else left in the stream means that the
  // string was only partially numeric. eofbit is checked rather than the
  // whole stream state because skipping whitespace on an already exhausted
  // stream may set failbit.
  stream >> std::ws;
  return stream.eof();
}
155
// Formats n as text and stores it in *s. Returns true on success; *s is left
// untouched if formatting fails.
bool IntToStr(int32 n, string* s) {
  std::ostringstream formatter;
  formatter << n;
  if (formatter.fail()) {
    return false;
  }
  *s = formatter.str();
  return true;
}
163
164// Parses the prefix descriptions file at path, clears and fills the output
165// prefixes phone number prefix to description mapping.
166// Returns true on success.
167bool ParsePrefixes(const string& path,
168                   absl::btree_map<int32, string>* prefixes) {
169  prefixes->clear();
170  FILE* input = fopen(path.c_str(), "r");
171  if (!input) {
172    return false;
173  }
174  AutoCloser<FILE> input_closer(&input, fclose);
175  const int kMaxLineLength = 2*1024;
176  vector<char> buffer(kMaxLineLength);
177  vector<char>::iterator begin, end, sep;
178  string prefix, description;
179  int32 prefix_code;
180  while (fgets(&buffer[0], buffer.size(), input)) {
181    begin = buffer.begin();
182    end = std::find(begin, buffer.end(), '\0');
183    if (end == begin) {
184      continue;
185    }
186    --end;
187    if (*end != '\n' && !feof(input)) {
188      // A line without LF can only happen at the end of file.
189      return false;
190    }
191
192    // Trim and check for comments.
193    for (; begin != end && std::isspace(*begin); ++begin) {}
194    for (; end != begin && std::isspace(*(end - 1)); --end) {}
195    if (begin == end || *begin == '#') {
196      continue;
197    }
198
199    sep = std::find(begin, end, '|');
200    if (sep == end) {
201      continue;
202    }
203    prefix = string(begin, sep);
204    if (!StrToInt(prefix, &prefix_code)) {
205      return false;
206    }
207    (*prefixes)[prefix_code] = string(sep + 1, end);
208  }
209  return ferror(input) == 0;
210}
211
// Builds a C string literal from s. The output is enclosed in double-quotes.
// Double quotes, backslashes and single quotes of the input are escaped with
// a backslash, and non-ASCII or control characters are written as two-digit
// hexadecimal escapes.
//
// A hexadecimal escape sequence extends over every hexadecimal digit that
// follows it, so the literal is closed and reopened around each escaped byte
// and the compiler concatenates the pieces back together.
//
// An input string:
//   Op\xc3\xa9ra
// becomes:
//   "Op""\xc3""\xa9""ra"
string MakeStringLiteral(const string& s) {
  // Kind of the previously written character; decides whether the literal has
  // to be split before the next one.
  enum { kNothing, kPlain, kHexEscape } previous = kNothing;
  std::stringstream buffer;
  buffer << std::hex << std::setfill('0');
  buffer << "\"";
  for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
    const char c = *it;
    if (c >= 32 && c < 127) {
      if (previous == kHexEscape) {
        buffer << "\"\"";
      }
      // '"' and '\\' would otherwise terminate or corrupt the literal. The
      // single quote does not need escaping but has always been escaped, which
      // keeps the output stable for existing data.
      if (c == '"' || c == '\\' || c == '\'') {
        buffer << "\\";
      }
      buffer << c;
      previous = kPlain;
    } else {
      if (previous != kNothing) {
        buffer << "\"\"";
      }
      // Go through unsigned char so that bytes above 0x7f print as 80..ff
      // whether or not char is signed.
      buffer << "\\x" << std::setw(2)
             << static_cast<int>(static_cast<unsigned char>(c));
      previous = kHexEscape;
    }
  }
  buffer << "\"";
  return buffer.str();
}
246
247void WriteStringLiteral(const string& s, FILE* output) {
248  string literal = MakeStringLiteral(s);
249  fprintf(output, "%s", literal.c_str());
250}
251
// License banner placed at the top of the generated file, followed by a
// notice that the file is generated and by one blank line.
const char kLicense[] =
  "// Copyright (C) 2012 The Libphonenumber Authors\n"
  "//\n"
  "// Licensed under the Apache License, Version 2.0 (the \"License\");\n"
  "// you may not use this file except in compliance with the License.\n"
  "// You may obtain a copy of the License at\n"
  "//\n"
  "// http://www.apache.org/licenses/LICENSE-2.0\n"
  "//\n"
  "// Unless required by applicable law or agreed to in writing, software\n"
  "// distributed under the License is distributed on an \"AS IS\" BASIS,\n"
  "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, "
  "either express or implied.\n"
  "// See the License for the specific language governing permissions and\n"
  "// limitations under the License.\n"
  "//\n"
  "// This file is generated automatically, do not edit it manually.\n"
  "\n";

// Writes kLicense to "output" verbatim.
void WriteLicense(FILE* output) {
  fputs(kLicense, output);
}
274
// Names of the nested namespaces wrapping the generated code, outermost
// first.
const char kI18NNS[] = "i18n";
const char kPhoneNumbersNS[] = "phonenumbers";

// Writes the lines opening both namespaces, outermost first.
void WriteNSHeader(FILE* output) {
  fprintf(output,
          "namespace %s {\n"
          "namespace %s {\n",
          kI18NNS, kPhoneNumbersNS);
}

// Writes the lines closing both namespaces, innermost first, each with a
// trailing comment naming the namespace being closed.
void WriteNSFooter(FILE* output) {
  fprintf(output,
          "}  // namespace %s\n"
          "}  // namespace %s\n",
          kPhoneNumbersNS, kI18NNS);
}
287
// Writes the #include section of the generated .cc file: the matching header
// "phonenumbers/geocoding/${base_name}.h" first, then <cstdint>, each followed
// by a blank line.
void WriteCppHeader(const string& base_name, FILE* output) {
  fprintf(output,
          "#include \"phonenumbers/geocoding/%s.h\"\n"
          "\n"
          "#include <cstdint>\n"
          "\n",
          base_name.c_str());
}
295
// Writes two aggregate initializer fields for the array variable "name": the
// array itself and an expression computing its element count.
//
//   ${name},
//   sizeof(${name})/sizeof(*${name}),
//
void WriteArrayAndSize(const string& name, FILE* output) {
  const char* const array_name = name.c_str();
  fprintf(output,
          "  %s,\n"
          "  sizeof(%s)/sizeof(*%s),\n",
          array_name, array_name, array_name);
}
300
301// Writes a PrefixDescriptions variable named "name", with its prefixes field
302// set to "prefixes_name" variable, its descriptions to "desc_name" and its
303// possible_lengths to "possible_lengths_name":
304//
305// const PrefixDescriptions ${name} = {
306//   ${prefix_name},
307//   sizeof(${prefix_name})/sizeof(*${prefix_name}),
308//   ${desc_name},
309//   ${possible_lengths_name},
310//   sizeof(${possible_lengths_name})/sizeof(*${possible_lengths_name}),
311// };
312//
313void WritePrefixDescriptionsDefinition(
314    const string& name, const string& prefixes_name, const string& desc_name,
315    const string& possible_lengths_name, FILE* output) {
316  fprintf(output, "const PrefixDescriptions %s = {\n", name.c_str());
317  WriteArrayAndSize(prefixes_name, output);
318  fprintf(output, "  %s,\n", desc_name.c_str());
319  WriteArrayAndSize(possible_lengths_name, output);
320  fprintf(output, "};\n");
321}
322
323// Writes prefixes, descriptions and possible_lengths arrays built from the
324// phone number prefix to description mapping "prefixes". Binds these arrays
325// in a single PrefixDescriptions variable named "var_name".
326//
327// const int32_t ${var_name}_prefixes[] = {
328//   1201,
329//   1650,
330// };
331//
332// const char* ${var_name}_descriptions[] = {
333//   "New Jerse",
334//   "Kalifornie",
335// };
336//
337// const int32_t ${var_name}_possible_lengths[] = {
338//   4,
339// };
340//
341// const PrefixDescriptions ${var_name} = {
342//   ...
343// };
344//
345void WritePrefixDescriptions(const string& var_name,
346                             const absl::btree_map<int, string>& prefixes,
347                             FILE* output) {
348  absl::btree_set<int> possible_lengths;
349  const string prefixes_name = var_name + "_prefixes";
350  fprintf(output, "const int32_t %s[] = {\n", prefixes_name.c_str());
351  for (absl::btree_map<int, string>::const_iterator it = prefixes.begin();
352       it != prefixes.end(); ++it) {
353    fprintf(output, "  %d,\n", it->first);
354    possible_lengths.insert(static_cast<int>(log10(it->first) + 1));
355  }
356  fprintf(output,
357          "};\n"
358          "\n");
359
360  const string desc_name = var_name + "_descriptions";
361  fprintf(output, "const char* %s[] = {\n", desc_name.c_str());
362  for (absl::btree_map<int, string>::const_iterator it = prefixes.begin();
363       it != prefixes.end(); ++it) {
364    fprintf(output, "  ");
365    WriteStringLiteral(it->second, output);
366    fprintf(output, ",\n");
367  }
368  fprintf(output,
369          "};\n"
370          "\n");
371
372  const string possible_lengths_name = var_name + "_possible_lengths";
373  fprintf(output, "const int32_t %s[] = {\n ", possible_lengths_name.c_str());
374  for (absl::btree_set<int>::const_iterator it = possible_lengths.begin();
375       it != possible_lengths.end(); ++it) {
376    fprintf(output, " %d,", *it);
377  }
378  fprintf(output,
379          "\n"
380          "};\n"
381          "\n");
382
383  WritePrefixDescriptionsDefinition(var_name, prefixes_name, desc_name,
384                                    possible_lengths_name, output);
385  fprintf(output, "\n");
386}
387
388// Writes a pair of arrays mapping prefix language code pairs to
389// PrefixDescriptions instances. "prefix_var_names" maps language code pairs
390// to prefix variable names.
391//
392// const char* prefix_language_code_pairs[] = {
393//   "1_de",
394//   "1_en",
395// };
396//
397// const PrefixDescriptions* prefix_descriptions[] = {
398//   &prefix_1_de,
399//   &prefix_1_en,
400// };
401//
402void WritePrefixesDescriptions(
403    const absl::btree_map<string, string>& prefix_var_names, FILE* output) {
404  fprintf(output, "const char* prefix_language_code_pairs[] = {\n");
405  for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin();
406       it != prefix_var_names.end(); ++it) {
407    fprintf(output, "  \"%s\",\n", it->first.c_str());
408  }
409  fprintf(output,
410          "};\n"
411          "\n"
412          "const PrefixDescriptions* prefixes_descriptions[] = {\n");
413  for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin();
414       it != prefix_var_names.end(); ++it) {
415    fprintf(output, "  &%s,\n", it->second.c_str());
416  }
417  fprintf(output,
418          "};\n"
419          "\n");
420}
421
422// For each entry in "languages" mapping a country calling code to a set
423// of available languages, writes a sorted array of languages, then wraps it
424// into a CountryLanguages instance. Finally, writes a pair of arrays mapping
425// country calling codes to CountryLanguages instances.
426//
427// const char* country_1[] = {
428//   "de",
429//   "en",
430// };
431//
432// const CountryLanguages country_1_languages = {
433//   country_1,
434//   sizeof(country_1)/sizeof(*country_1),
435// };
436//
437// [...]
438//
439// const CountryLanguages* country_languages[] = {
440//   &country_1_languages,
441//   [...]
442// }
443//
444// const int country_calling_codes[] = {
445//   1,
446//   [...]
447// };
448//
449bool WriteCountryLanguages(const map<int32, set<string> >& languages,
450                           FILE* output) {
451  vector<string> country_languages_vars;
452  vector<string> countries;
453  for (map<int32, set<string> >::const_iterator it = languages.begin();
454       it != languages.end(); ++it) {
455    string country_code;
456    if (!IntToStr(it->first, &country_code)) {
457      return false;
458    }
459    const string country_var = "country_" + country_code;
460    fprintf(output, "const char* %s[] = {\n", country_var.c_str());
461    for (set<string>::const_iterator it_lang = it->second.begin();
462         it_lang != it->second.end(); ++it_lang) {
463      fprintf(output, "  \"%s\",\n", it_lang->c_str());
464    }
465    fprintf(output,
466            "};\n"
467            "\n");
468
469    const string country_languages_var = country_var + "_languages";
470    fprintf(output, "const CountryLanguages %s = {\n",
471            country_languages_var.c_str());
472    WriteArrayAndSize(country_var, output);
473    fprintf(output,
474            "};\n"
475            "\n");
476    country_languages_vars.push_back(country_languages_var);
477    countries.push_back(country_code);
478  }
479
480  fprintf(output,
481          "\n"
482          "const CountryLanguages* countries_languages[] = {\n");
483  for (vector<string>::const_iterator
484       it_languages_var = country_languages_vars.begin();
485       it_languages_var != country_languages_vars.end(); ++it_languages_var) {
486    fprintf(output, "  &%s,\n", it_languages_var->c_str());
487  }
488  fprintf(output,
489          "};\n"
490          "\n"
491          "const int country_calling_codes[] = {\n");
492  for (vector<string>::const_iterator it_country = countries.begin();
493       it_country != countries.end(); ++it_country) {
494    fprintf(output, "  %s,\n", it_country->c_str());
495  }
496  fprintf(output,
497          "};\n"
498          "\n");
499  return true;
500}
501
// Returns a copy of input in which every occurrence of pattern is replaced
// with value. Occurrences are searched from left to right and do not overlap;
// the inserted text is not searched again. If pattern is empty, input is
// returned unchanged.
string ReplaceAll(const string& input, const string& pattern,
                  const string& value) {
  if (pattern.empty()) {
    return input;
  }
  string result;
  // Index of the first character of input not yet copied to result.
  string::size_type cursor = 0;
  for (string::size_type match = input.find(pattern, cursor);
       match != string::npos; match = input.find(pattern, cursor)) {
    result.append(input, cursor, match - cursor);
    result.append(value);
    cursor = match + pattern.length();
  }
  result.append(input, cursor, string::npos);
  return result;
}
525
526// Writes data accessor definitions, prefixed with "accessor_prefix".
527void WriteAccessorsDefinitions(const string& accessor_prefix, FILE* output) {
528  string templ =
529      "const int* get$prefix$_country_calling_codes() {\n"
530      "  return country_calling_codes;\n"
531      "}\n"
532      "\n"
533      "int get$prefix$_country_calling_codes_size() {\n"
534      "  return sizeof(country_calling_codes)\n"
535      "      /sizeof(*country_calling_codes);\n"
536      "}\n"
537      "\n"
538      "const CountryLanguages* get$prefix$_country_languages(int index) {\n"
539      "  return countries_languages[index];\n"
540      "}\n"
541      "\n"
542      "const char** get$prefix$_prefix_language_code_pairs() {\n"
543      "  return prefix_language_code_pairs;\n"
544      "}\n"
545      "\n"
546      "int get$prefix$_prefix_language_code_pairs_size() {\n"
547      "  return sizeof(prefix_language_code_pairs)\n"
548      "      /sizeof(*prefix_language_code_pairs);\n"
549      "}\n"
550      "\n"
551      "const PrefixDescriptions* get$prefix$_prefix_descriptions(int index) {\n"
552      "  return prefixes_descriptions[index];\n"
553      "}\n";
554  string defs = ReplaceAll(templ, "$prefix$", accessor_prefix);
555  fprintf(output, "%s", defs.c_str());
556}
557
558// Writes geocoding data .cc file. "data_path" is the path of geocoding textual
559// data directory. "base_name" is the base name of the .h/.cc pair, like
560// "geocoding_data".
561bool WriteSource(const string& data_path, const string& base_name,
562                 const string& accessor_prefix, FILE* output) {
563  WriteLicense(output);
564  WriteCppHeader(base_name, output);
565  WriteNSHeader(output);
566  fprintf(output,
567          "namespace {\n"
568          "\n");
569
570  // Enumerate language/script directories.
571  absl::btree_map<string, string> prefix_vars;
572  map<int32, set<string> > country_languages;
573  vector<DirEntry> entries;
574  if (!ListDirectory(data_path, &entries)) {
575    fprintf(stderr, "failed to read directory entries");
576    return false;
577  }
578  for (vector<DirEntry>::const_iterator it = entries.begin();
579       it != entries.end(); ++it) {
580    if (it->kind() != kDirectory) {
581      continue;
582    }
583    // Enumerate country calling code files.
584    const string dir_path = data_path + "/" + it->name();
585    vector<DirEntry> files;
586    if (!ListDirectory(dir_path, &files)) {
587      fprintf(stderr, "failed to read file entries\n");
588      return false;
589    }
590    for (vector<DirEntry>::const_iterator it_files = files.begin();
591         it_files != files.end(); ++it_files) {
592      const string fname = it_files->name();
593      if (!EndsWith(fname, ".txt")) {
594       continue;
595      }
596      int32 country_code;
597      const string country_code_str = fname.substr(0, fname.length() - 4);
598      if (!StrToInt(country_code_str, &country_code)) {
599        return false;
600      }
601      const string path = dir_path + "/" + fname;
602
603      absl::btree_map<int32, string> prefixes;
604      if (!ParsePrefixes(path, &prefixes)) {
605        return false;
606      }
607
608      const string prefix_var = "prefix_" + country_code_str + "_" + it->name();
609      WritePrefixDescriptions(prefix_var, prefixes, output);
610      prefix_vars[country_code_str + "_" + it->name()] = prefix_var;
611      country_languages[country_code].insert(it->name());
612    }
613  }
614  WritePrefixesDescriptions(prefix_vars, output);
615  if (!WriteCountryLanguages(country_languages, output)) {
616    return false;
617  }
618  fprintf(output, "}  // namespace\n");
619  fprintf(output, "\n");
620  WriteAccessorsDefinitions(accessor_prefix, output);
621  WriteNSFooter(output);
622  return ferror(output) == 0;
623}
624
// Prints "message" as an error on stderr, followed by the command line
// synopsis. Always returns 1 so that the result can be used directly as the
// exit code of the program.
int PrintHelp(const string& message) {
  fprintf(stderr, "error: %s\n", message.c_str());
  // The third argument is optional, see Main().
  fprintf(stderr, "generate_geocoding_data DATADIR CCPATH [ACCESSOR_PREFIX]\n");
  return 1;
}
630
631int Main(int argc, const char* argv[]) {
632  if (argc < 2) {
633    return PrintHelp("geocoding data root directory expected");
634  }
635  if (argc < 3) {
636    return PrintHelp("output source path expected");
637  }
638  string accessor_prefix = "";
639  if (argc > 3) {
640    accessor_prefix = argv[3];
641  }
642  const string root_path(argv[1]);
643  string source_path(argv[2]);
644  std::replace(source_path.begin(), source_path.end(), '\\', '/');
645  string base_name = source_path;
646  if (base_name.rfind('/') != string::npos) {
647    base_name = base_name.substr(base_name.rfind('/') + 1);
648  }
649  base_name = base_name.substr(0, base_name.rfind('.'));
650
651  FILE* source_fp = fopen(source_path.c_str(), "w");
652  if (!source_fp) {
653    fprintf(stderr, "failed to open %s\n", source_path.c_str());
654    return 1;
655  }
656  AutoCloser<FILE> source_closer(&source_fp, fclose);
657  if (!WriteSource(root_path, base_name, accessor_prefix,
658                   source_fp)) {
659    return 1;
660  }
661  return 0;
662}
663
664}  // namespace phonenumbers
665}  // namespace i18n
666