xref: /third_party/gn/src/base/json/json_parser.cc (revision 6d528ed9)
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/json/json_parser.h"
6
7#include <cmath>
8#include <string_view>
9#include <utility>
10#include <vector>
11
12#include "base/logging.h"
13#include "base/numerics/safe_conversions.h"
14#include "base/strings/string_number_conversions.h"
15#include "base/strings/string_util.h"
16#include "base/strings/stringprintf.h"
17#include "base/strings/utf_string_conversion_utils.h"
18#include "base/strings/utf_string_conversions.h"
19#include "base/third_party/icu/icu_utf.h"
20#include "base/values.h"
21
22namespace base {
23namespace internal {
24
25namespace {
26
// First code point outside 7-bit ASCII. StringBuilder::Append() compares
// against this to decide whether it can keep viewing the input in place.
constexpr int32_t kExtendedASCIIStart = 0x80;
28
29// Simple class that checks for maximum recursion/"stack overflow."
30class StackMarker {
31 public:
32  StackMarker(int max_depth, int* depth)
33      : max_depth_(max_depth), depth_(depth) {
34    ++(*depth_);
35    DCHECK_LE(*depth_, max_depth_);
36  }
37  ~StackMarker() { --(*depth_); }
38
39  bool IsTooDeep() const { return *depth_ >= max_depth_; }
40
41 private:
42  const int max_depth_;
43  int* const depth_;
44
45  StackMarker(const StackMarker&) = delete;
46  StackMarker& operator=(const StackMarker&) = delete;
47};
48
// Code point substituted for invalid input when replacement is enabled.
constexpr uint32_t kUnicodeReplacementPoint{0xFFFD};
50
51}  // namespace
52
// UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER).
54const char kUnicodeReplacementString[] = "\xEF\xBF\xBD";
55
56JSONParser::JSONParser(int options, int max_depth)
57    : options_(options),
58      max_depth_(max_depth),
59      index_(0),
60      stack_depth_(0),
61      line_number_(0),
62      index_last_line_(0),
63      error_code_(JSONReader::JSON_NO_ERROR),
64      error_line_(0),
65      error_column_(0) {
66  CHECK_LE(max_depth, JSONReader::kStackMaxDepth);
67}
68
69JSONParser::~JSONParser() = default;
70
71std::optional<Value> JSONParser::Parse(std::string_view input) {
72  input_ = input;
73  index_ = 0;
74  line_number_ = 1;
75  index_last_line_ = 0;
76
77  error_code_ = JSONReader::JSON_NO_ERROR;
78  error_line_ = 0;
79  error_column_ = 0;
80
81  // ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure
82  // that the index_ will not overflow when parsing.
83  if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) {
84    ReportError(JSONReader::JSON_TOO_LARGE, 0);
85    return std::nullopt;
86  }
87
88  // When the input JSON string starts with a UTF-8 Byte-Order-Mark,
89  // advance the start position to avoid the ParseNextToken function mis-
90  // treating a Unicode BOM as an invalid character and returning NULL.
91  ConsumeIfMatch("\xEF\xBB\xBF");
92
93  // Parse the first and any nested tokens.
94  std::optional<Value> root(ParseNextToken());
95  if (!root)
96    return std::nullopt;
97
98  // Make sure the input stream is at an end.
99  if (GetNextToken() != T_END_OF_INPUT) {
100    ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1);
101    return std::nullopt;
102  }
103
104  return root;
105}
106
107JSONReader::JsonParseError JSONParser::error_code() const {
108  return error_code_;
109}
110
111std::string JSONParser::GetErrorMessage() const {
112  return FormatErrorMessage(error_line_, error_column_,
113                            JSONReader::ErrorCodeToString(error_code_));
114}
115
116int JSONParser::error_line() const {
117  return error_line_;
118}
119
120int JSONParser::error_column() const {
121  return error_column_;
122}
123
124// StringBuilder ///////////////////////////////////////////////////////////////
125
126JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {}
127
128JSONParser::StringBuilder::StringBuilder(const char* pos)
129    : pos_(pos), length_(0) {}
130
131JSONParser::StringBuilder::~StringBuilder() = default;
132
133JSONParser::StringBuilder& JSONParser::StringBuilder::operator=(
134    StringBuilder&& other) = default;
135
136void JSONParser::StringBuilder::Append(uint32_t point) {
137  DCHECK(IsValidCharacter(point));
138
139  if (point < kExtendedASCIIStart && !string_) {
140    DCHECK_EQ(static_cast<char>(point), pos_[length_]);
141    ++length_;
142  } else {
143    Convert();
144    if (UNLIKELY(point == kUnicodeReplacementPoint)) {
145      string_->append(kUnicodeReplacementString);
146    } else {
147      WriteUnicodeCharacter(point, &*string_);
148    }
149  }
150}
151
152void JSONParser::StringBuilder::Convert() {
153  if (string_)
154    return;
155  string_.emplace(pos_, length_);
156}
157
158std::string JSONParser::StringBuilder::DestructiveAsString() {
159  if (string_)
160    return std::move(*string_);
161  return std::string(pos_, length_);
162}
163
164// JSONParser private //////////////////////////////////////////////////////////
165
166std::optional<std::string_view> JSONParser::PeekChars(int count) {
167  if (static_cast<size_t>(index_) + count > input_.length())
168    return std::nullopt;
169  // Using std::string_view::substr() is significantly slower (according to
170  // base_perftests) than constructing a substring manually.
171  return std::string_view(input_.data() + index_, count);
172}
173
174std::optional<char> JSONParser::PeekChar() {
175  std::optional<std::string_view> chars = PeekChars(1);
176  if (chars)
177    return (*chars)[0];
178  return std::nullopt;
179}
180
181std::optional<std::string_view> JSONParser::ConsumeChars(int count) {
182  std::optional<std::string_view> chars = PeekChars(count);
183  if (chars)
184    index_ += count;
185  return chars;
186}
187
188std::optional<char> JSONParser::ConsumeChar() {
189  std::optional<std::string_view> chars = ConsumeChars(1);
190  if (chars)
191    return (*chars)[0];
192  return std::nullopt;
193}
194
195const char* JSONParser::pos() {
196  CHECK_LE(static_cast<size_t>(index_), input_.length());
197  return input_.data() + index_;
198}
199
200JSONParser::Token JSONParser::GetNextToken() {
201  EatWhitespaceAndComments();
202
203  std::optional<char> c = PeekChar();
204  if (!c)
205    return T_END_OF_INPUT;
206
207  switch (*c) {
208    case '{':
209      return T_OBJECT_BEGIN;
210    case '}':
211      return T_OBJECT_END;
212    case '[':
213      return T_ARRAY_BEGIN;
214    case ']':
215      return T_ARRAY_END;
216    case '"':
217      return T_STRING;
218    case '0':
219    case '1':
220    case '2':
221    case '3':
222    case '4':
223    case '5':
224    case '6':
225    case '7':
226    case '8':
227    case '9':
228    case '-':
229      return T_NUMBER;
230    case 't':
231      return T_BOOL_TRUE;
232    case 'f':
233      return T_BOOL_FALSE;
234    case 'n':
235      return T_NULL;
236    case ',':
237      return T_LIST_SEPARATOR;
238    case ':':
239      return T_OBJECT_PAIR_SEPARATOR;
240    default:
241      return T_INVALID_TOKEN;
242  }
243}
244
245void JSONParser::EatWhitespaceAndComments() {
246  while (std::optional<char> c = PeekChar()) {
247    switch (*c) {
248      case '\r':
249      case '\n':
250        index_last_line_ = index_;
251        // Don't increment line_number_ twice for "\r\n".
252        if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) {
253          ++line_number_;
254        }
255        FALLTHROUGH;
256      case ' ':
257      case '\t':
258        ConsumeChar();
259        break;
260      case '/':
261        if (!EatComment())
262          return;
263        break;
264      default:
265        return;
266    }
267  }
268}
269
270bool JSONParser::EatComment() {
271  std::optional<std::string_view> comment_start = ConsumeChars(2);
272  if (!comment_start)
273    return false;
274
275  if (comment_start == "//") {
276    // Single line comment, read to newline.
277    while (std::optional<char> c = PeekChar()) {
278      if (c == '\n' || c == '\r')
279        return true;
280      ConsumeChar();
281    }
282  } else if (comment_start == "/*") {
283    char previous_char = '\0';
284    // Block comment, read until end marker.
285    while (std::optional<char> c = PeekChar()) {
286      if (previous_char == '*' && c == '/') {
287        // EatWhitespaceAndComments will inspect pos(), which will still be on
288        // the last / of the comment, so advance once more (which may also be
289        // end of input).
290        ConsumeChar();
291        return true;
292      }
293      previous_char = *ConsumeChar();
294    }
295
296    // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
297  }
298
299  return false;
300}
301
302std::optional<Value> JSONParser::ParseNextToken() {
303  return ParseToken(GetNextToken());
304}
305
306std::optional<Value> JSONParser::ParseToken(Token token) {
307  switch (token) {
308    case T_OBJECT_BEGIN:
309      return ConsumeDictionary();
310    case T_ARRAY_BEGIN:
311      return ConsumeList();
312    case T_STRING:
313      return ConsumeString();
314    case T_NUMBER:
315      return ConsumeNumber();
316    case T_BOOL_TRUE:
317    case T_BOOL_FALSE:
318    case T_NULL:
319      return ConsumeLiteral();
320    default:
321      ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
322      return std::nullopt;
323  }
324}
325
326std::optional<Value> JSONParser::ConsumeDictionary() {
327  if (ConsumeChar() != '{') {
328    ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
329    return std::nullopt;
330  }
331
332  StackMarker depth_check(max_depth_, &stack_depth_);
333  if (depth_check.IsTooDeep()) {
334    ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
335    return std::nullopt;
336  }
337
338  std::vector<Value::DictStorage::value_type> dict_storage;
339
340  Token token = GetNextToken();
341  while (token != T_OBJECT_END) {
342    if (token != T_STRING) {
343      ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1);
344      return std::nullopt;
345    }
346
347    // First consume the key.
348    StringBuilder key;
349    if (!ConsumeStringRaw(&key)) {
350      return std::nullopt;
351    }
352
353    // Read the separator.
354    token = GetNextToken();
355    if (token != T_OBJECT_PAIR_SEPARATOR) {
356      ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
357      return std::nullopt;
358    }
359
360    // The next token is the value. Ownership transfers to |dict|.
361    ConsumeChar();
362    std::optional<Value> value = ParseNextToken();
363    if (!value) {
364      // ReportError from deeper level.
365      return std::nullopt;
366    }
367
368    dict_storage.emplace_back(key.DestructiveAsString(),
369                              std::make_unique<Value>(std::move(*value)));
370
371    token = GetNextToken();
372    if (token == T_LIST_SEPARATOR) {
373      ConsumeChar();
374      token = GetNextToken();
375      if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
376        ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
377        return std::nullopt;
378      }
379    } else if (token != T_OBJECT_END) {
380      ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
381      return std::nullopt;
382    }
383  }
384
385  ConsumeChar();  // Closing '}'.
386
387  return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES));
388}
389
390std::optional<Value> JSONParser::ConsumeList() {
391  if (ConsumeChar() != '[') {
392    ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
393    return std::nullopt;
394  }
395
396  StackMarker depth_check(max_depth_, &stack_depth_);
397  if (depth_check.IsTooDeep()) {
398    ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
399    return std::nullopt;
400  }
401
402  Value::ListStorage list_storage;
403
404  Token token = GetNextToken();
405  while (token != T_ARRAY_END) {
406    std::optional<Value> item = ParseToken(token);
407    if (!item) {
408      // ReportError from deeper level.
409      return std::nullopt;
410    }
411
412    list_storage.push_back(std::move(*item));
413
414    token = GetNextToken();
415    if (token == T_LIST_SEPARATOR) {
416      ConsumeChar();
417      token = GetNextToken();
418      if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
419        ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
420        return std::nullopt;
421      }
422    } else if (token != T_ARRAY_END) {
423      ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
424      return std::nullopt;
425    }
426  }
427
428  ConsumeChar();  // Closing ']'.
429
430  return Value(std::move(list_storage));
431}
432
433std::optional<Value> JSONParser::ConsumeString() {
434  StringBuilder string;
435  if (!ConsumeStringRaw(&string))
436    return std::nullopt;
437
438  return Value(string.DestructiveAsString());
439}
440
441bool JSONParser::ConsumeStringRaw(StringBuilder* out) {
442  if (ConsumeChar() != '"') {
443    ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
444    return false;
445  }
446
447  // StringBuilder will internally build a std::string_view unless a UTF-16
448  // conversion occurs, at which point it will perform a copy into a
449  // std::string.
450  StringBuilder string(pos());
451
452  while (PeekChar()) {
453    uint32_t next_char = 0;
454    if (!ReadUnicodeCharacter(input_.data(),
455                              static_cast<int32_t>(input_.length()), &index_,
456                              &next_char) ||
457        !IsValidCharacter(next_char)) {
458      if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
459        ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1);
460        return false;
461      }
462      ConsumeChar();
463      string.Append(kUnicodeReplacementPoint);
464      continue;
465    }
466
467    if (next_char == '"') {
468      ConsumeChar();
469      *out = std::move(string);
470      return true;
471    } else if (next_char != '\\') {
472      // If this character is not an escape sequence...
473      ConsumeChar();
474      string.Append(next_char);
475    } else {
476      // And if it is an escape sequence, the input string will be adjusted
477      // (either by combining the two characters of an encoded escape sequence,
478      // or with a UTF conversion), so using std::string_view isn't possible --
479      // force a conversion.
480      string.Convert();
481
482      // Read past the escape '\' and ensure there's a character following.
483      std::optional<std::string_view> escape_sequence = ConsumeChars(2);
484      if (!escape_sequence) {
485        ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
486        return false;
487      }
488
489      switch ((*escape_sequence)[1]) {
490        // Allowed esape sequences:
491        case 'x': {  // UTF-8 sequence.
492          // UTF-8 \x escape sequences are not allowed in the spec, but they
493          // are supported here for backwards-compatiblity with the old parser.
494          escape_sequence = ConsumeChars(2);
495          if (!escape_sequence) {
496            ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
497            return false;
498          }
499
500          int hex_digit = 0;
501          if (!HexStringToInt(*escape_sequence, &hex_digit) ||
502              !IsValidCharacter(hex_digit)) {
503            ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
504            return false;
505          }
506
507          string.Append(hex_digit);
508          break;
509        }
510        case 'u': {  // UTF-16 sequence.
511          // UTF units are of the form \uXXXX.
512          uint32_t code_point;
513          if (!DecodeUTF16(&code_point)) {
514            ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
515            return false;
516          }
517          string.Append(code_point);
518          break;
519        }
520        case '"':
521          string.Append('"');
522          break;
523        case '\\':
524          string.Append('\\');
525          break;
526        case '/':
527          string.Append('/');
528          break;
529        case 'b':
530          string.Append('\b');
531          break;
532        case 'f':
533          string.Append('\f');
534          break;
535        case 'n':
536          string.Append('\n');
537          break;
538        case 'r':
539          string.Append('\r');
540          break;
541        case 't':
542          string.Append('\t');
543          break;
544        case 'v':  // Not listed as valid escape sequence in the RFC.
545          string.Append('\v');
546          break;
547        // All other escape squences are illegal.
548        default:
549          ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
550          return false;
551      }
552    }
553  }
554
555  ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
556  return false;
557}
558
559// Entry is at the first X in \uXXXX.
560bool JSONParser::DecodeUTF16(uint32_t* out_code_point) {
561  std::optional<std::string_view> escape_sequence = ConsumeChars(4);
562  if (!escape_sequence)
563    return false;
564
565  // Consume the UTF-16 code unit, which may be a high surrogate.
566  int code_unit16_high = 0;
567  if (!HexStringToInt(*escape_sequence, &code_unit16_high))
568    return false;
569
570  // If this is a high surrogate, consume the next code unit to get the
571  // low surrogate.
572  if (CBU16_IS_SURROGATE(code_unit16_high)) {
573    // Make sure this is the high surrogate. If not, it's an encoding
574    // error.
575    if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
576      return false;
577
578    // Make sure that the token has more characters to consume the
579    // lower surrogate.
580    if (!ConsumeIfMatch("\\u"))
581      return false;
582
583    escape_sequence = ConsumeChars(4);
584    if (!escape_sequence)
585      return false;
586
587    int code_unit16_low = 0;
588    if (!HexStringToInt(*escape_sequence, &code_unit16_low))
589      return false;
590
591    if (!CBU16_IS_TRAIL(code_unit16_low))
592      return false;
593
594    uint32_t code_point =
595        CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low);
596    if (!IsValidCharacter(code_point))
597      return false;
598
599    *out_code_point = code_point;
600  } else {
601    // Not a surrogate.
602    DCHECK(CBU16_IS_SINGLE(code_unit16_high));
603    if (!IsValidCharacter(code_unit16_high)) {
604      if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
605        return false;
606      }
607      *out_code_point = kUnicodeReplacementPoint;
608      return true;
609    }
610
611    *out_code_point = code_unit16_high;
612  }
613
614  return true;
615}
616
617std::optional<Value> JSONParser::ConsumeNumber() {
618  const char* num_start = pos();
619  const int start_index = index_;
620  int end_index = start_index;
621
622  if (PeekChar() == '-')
623    ConsumeChar();
624
625  if (!ReadInt(false)) {
626    ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
627    return std::nullopt;
628  }
629  end_index = index_;
630
631  // The optional fraction part.
632  if (PeekChar() == '.') {
633    ConsumeChar();
634    if (!ReadInt(true)) {
635      ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
636      return std::nullopt;
637    }
638    end_index = index_;
639  }
640
641  // Optional exponent part.
642  std::optional<char> c = PeekChar();
643  if (c == 'e' || c == 'E') {
644    ConsumeChar();
645    if (PeekChar() == '-' || PeekChar() == '+') {
646      ConsumeChar();
647    }
648    if (!ReadInt(true)) {
649      ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
650      return std::nullopt;
651    }
652    end_index = index_;
653  }
654
655  // ReadInt is greedy because numbers have no easily detectable sentinel,
656  // so save off where the parser should be on exit (see Consume invariant at
657  // the top of the header), then make sure the next token is one which is
658  // valid.
659  int exit_index = index_;
660
661  switch (GetNextToken()) {
662    case T_OBJECT_END:
663    case T_ARRAY_END:
664    case T_LIST_SEPARATOR:
665    case T_END_OF_INPUT:
666      break;
667    default:
668      ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
669      return std::nullopt;
670  }
671
672  index_ = exit_index;
673
674  std::string_view num_string(num_start, end_index - start_index);
675
676  int num_int;
677  if (StringToInt(num_string, &num_int))
678    return Value(num_int);
679
680  return std::nullopt;
681}
682
683bool JSONParser::ReadInt(bool allow_leading_zeros) {
684  size_t len = 0;
685  char first = 0;
686
687  while (std::optional<char> c = PeekChar()) {
688    if (!IsAsciiDigit(c))
689      break;
690
691    if (len == 0)
692      first = *c;
693
694    ++len;
695    ConsumeChar();
696  }
697
698  if (len == 0)
699    return false;
700
701  if (!allow_leading_zeros && len > 1 && first == '0')
702    return false;
703
704  return true;
705}
706
707std::optional<Value> JSONParser::ConsumeLiteral() {
708  if (ConsumeIfMatch("true")) {
709    return Value(true);
710  } else if (ConsumeIfMatch("false")) {
711    return Value(false);
712  } else if (ConsumeIfMatch("null")) {
713    return Value(Value::Type::NONE);
714  } else {
715    ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
716    return std::nullopt;
717  }
718}
719
720bool JSONParser::ConsumeIfMatch(std::string_view match) {
721  if (match == PeekChars(match.size())) {
722    ConsumeChars(match.size());
723    return true;
724  }
725  return false;
726}
727
728void JSONParser::ReportError(JSONReader::JsonParseError code,
729                             int column_adjust) {
730  error_code_ = code;
731  error_line_ = line_number_;
732  error_column_ = index_ - index_last_line_ + column_adjust;
733}
734
735// static
736std::string JSONParser::FormatErrorMessage(int line,
737                                           int column,
738                                           const std::string& description) {
739  if (line || column) {
740    return StringPrintf("Line: %i, column: %i, %s", line, column,
741                        description.c_str());
742  }
743  return description;
744}
745
746}  // namespace internal
747}  // namespace base
748