1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/json/json_parser.h" 6 7#include <cmath> 8#include <string_view> 9#include <utility> 10#include <vector> 11 12#include "base/logging.h" 13#include "base/numerics/safe_conversions.h" 14#include "base/strings/string_number_conversions.h" 15#include "base/strings/string_util.h" 16#include "base/strings/stringprintf.h" 17#include "base/strings/utf_string_conversion_utils.h" 18#include "base/strings/utf_string_conversions.h" 19#include "base/third_party/icu/icu_utf.h" 20#include "base/values.h" 21 22namespace base { 23namespace internal { 24 25namespace { 26 27const int32_t kExtendedASCIIStart = 0x80; 28 29// Simple class that checks for maximum recursion/"stack overflow." 30class StackMarker { 31 public: 32 StackMarker(int max_depth, int* depth) 33 : max_depth_(max_depth), depth_(depth) { 34 ++(*depth_); 35 DCHECK_LE(*depth_, max_depth_); 36 } 37 ~StackMarker() { --(*depth_); } 38 39 bool IsTooDeep() const { return *depth_ >= max_depth_; } 40 41 private: 42 const int max_depth_; 43 int* const depth_; 44 45 StackMarker(const StackMarker&) = delete; 46 StackMarker& operator=(const StackMarker&) = delete; 47}; 48 49constexpr uint32_t kUnicodeReplacementPoint = 0xFFFD; 50 51} // namespace 52 53// This is U+FFFD. 54const char kUnicodeReplacementString[] = "\xEF\xBF\xBD"; 55 56JSONParser::JSONParser(int options, int max_depth) 57 : options_(options), 58 max_depth_(max_depth), 59 index_(0), 60 stack_depth_(0), 61 line_number_(0), 62 index_last_line_(0), 63 error_code_(JSONReader::JSON_NO_ERROR), 64 error_line_(0), 65 error_column_(0) { 66 CHECK_LE(max_depth, JSONReader::kStackMaxDepth); 67} 68 69JSONParser::~JSONParser() = default; 70 71std::optional<Value> JSONParser::Parse(std::string_view input) { 72 input_ = input; 73 index_ = 0; 74 line_number_ = 1; 75 index_last_line_ = 0; 76 77 error_code_ = JSONReader::JSON_NO_ERROR; 78 error_line_ = 0; 79 error_column_ = 0; 80 81 // ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure 82 // that the index_ will not overflow when parsing. 83 if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) { 84 ReportError(JSONReader::JSON_TOO_LARGE, 0); 85 return std::nullopt; 86 } 87 88 // When the input JSON string starts with a UTF-8 Byte-Order-Mark, 89 // advance the start position to avoid the ParseNextToken function mis- 90 // treating a Unicode BOM as an invalid character and returning NULL. 91 ConsumeIfMatch("\xEF\xBB\xBF"); 92 93 // Parse the first and any nested tokens. 94 std::optional<Value> root(ParseNextToken()); 95 if (!root) 96 return std::nullopt; 97 98 // Make sure the input stream is at an end. 99 if (GetNextToken() != T_END_OF_INPUT) { 100 ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1); 101 return std::nullopt; 102 } 103 104 return root; 105} 106 107JSONReader::JsonParseError JSONParser::error_code() const { 108 return error_code_; 109} 110 111std::string JSONParser::GetErrorMessage() const { 112 return FormatErrorMessage(error_line_, error_column_, 113 JSONReader::ErrorCodeToString(error_code_)); 114} 115 116int JSONParser::error_line() const { 117 return error_line_; 118} 119 120int JSONParser::error_column() const { 121 return error_column_; 122} 123 124// StringBuilder /////////////////////////////////////////////////////////////// 125 126JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {} 127 128JSONParser::StringBuilder::StringBuilder(const char* pos) 129 : pos_(pos), length_(0) {} 130 131JSONParser::StringBuilder::~StringBuilder() = default; 132 133JSONParser::StringBuilder& JSONParser::StringBuilder::operator=( 134 StringBuilder&& other) = default; 135 136void JSONParser::StringBuilder::Append(uint32_t point) { 137 DCHECK(IsValidCharacter(point)); 138 139 if (point < kExtendedASCIIStart && !string_) { 140 DCHECK_EQ(static_cast<char>(point), pos_[length_]); 141 ++length_; 142 } else { 143 Convert(); 144 if (UNLIKELY(point == kUnicodeReplacementPoint)) { 145 string_->append(kUnicodeReplacementString); 146 } else { 147 WriteUnicodeCharacter(point, &*string_); 148 } 149 } 150} 151 152void JSONParser::StringBuilder::Convert() { 153 if (string_) 154 return; 155 string_.emplace(pos_, length_); 156} 157 158std::string JSONParser::StringBuilder::DestructiveAsString() { 159 if (string_) 160 return std::move(*string_); 161 return std::string(pos_, length_); 162} 163 164// JSONParser private ////////////////////////////////////////////////////////// 165 166std::optional<std::string_view> JSONParser::PeekChars(int count) { 167 if (static_cast<size_t>(index_) + count > input_.length()) 168 return std::nullopt; 169 // Using std::string_view::substr() is significantly slower (according to 170 // base_perftests) than constructing a substring manually. 171 return std::string_view(input_.data() + index_, count); 172} 173 174std::optional<char> JSONParser::PeekChar() { 175 std::optional<std::string_view> chars = PeekChars(1); 176 if (chars) 177 return (*chars)[0]; 178 return std::nullopt; 179} 180 181std::optional<std::string_view> JSONParser::ConsumeChars(int count) { 182 std::optional<std::string_view> chars = PeekChars(count); 183 if (chars) 184 index_ += count; 185 return chars; 186} 187 188std::optional<char> JSONParser::ConsumeChar() { 189 std::optional<std::string_view> chars = ConsumeChars(1); 190 if (chars) 191 return (*chars)[0]; 192 return std::nullopt; 193} 194 195const char* JSONParser::pos() { 196 CHECK_LE(static_cast<size_t>(index_), input_.length()); 197 return input_.data() + index_; 198} 199 200JSONParser::Token JSONParser::GetNextToken() { 201 EatWhitespaceAndComments(); 202 203 std::optional<char> c = PeekChar(); 204 if (!c) 205 return T_END_OF_INPUT; 206 207 switch (*c) { 208 case '{': 209 return T_OBJECT_BEGIN; 210 case '}': 211 return T_OBJECT_END; 212 case '[': 213 return T_ARRAY_BEGIN; 214 case ']': 215 return T_ARRAY_END; 216 case '"': 217 return T_STRING; 218 case '0': 219 case '1': 220 case '2': 221 case '3': 222 case '4': 223 case '5': 224 case '6': 225 case '7': 226 case '8': 227 case '9': 228 case '-': 229 return T_NUMBER; 230 case 't': 231 return T_BOOL_TRUE; 232 case 'f': 233 return T_BOOL_FALSE; 234 case 'n': 235 return T_NULL; 236 case ',': 237 return T_LIST_SEPARATOR; 238 case ':': 239 return T_OBJECT_PAIR_SEPARATOR; 240 default: 241 return T_INVALID_TOKEN; 242 } 243} 244 245void JSONParser::EatWhitespaceAndComments() { 246 while (std::optional<char> c = PeekChar()) { 247 switch (*c) { 248 case '\r': 249 case '\n': 250 index_last_line_ = index_; 251 // Don't increment line_number_ twice for "\r\n". 252 if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) { 253 ++line_number_; 254 } 255 FALLTHROUGH; 256 case ' ': 257 case '\t': 258 ConsumeChar(); 259 break; 260 case '/': 261 if (!EatComment()) 262 return; 263 break; 264 default: 265 return; 266 } 267 } 268} 269 270bool JSONParser::EatComment() { 271 std::optional<std::string_view> comment_start = ConsumeChars(2); 272 if (!comment_start) 273 return false; 274 275 if (comment_start == "//") { 276 // Single line comment, read to newline. 277 while (std::optional<char> c = PeekChar()) { 278 if (c == '\n' || c == '\r') 279 return true; 280 ConsumeChar(); 281 } 282 } else if (comment_start == "/*") { 283 char previous_char = '\0'; 284 // Block comment, read until end marker. 285 while (std::optional<char> c = PeekChar()) { 286 if (previous_char == '*' && c == '/') { 287 // EatWhitespaceAndComments will inspect pos(), which will still be on 288 // the last / of the comment, so advance once more (which may also be 289 // end of input). 290 ConsumeChar(); 291 return true; 292 } 293 previous_char = *ConsumeChar(); 294 } 295 296 // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT. 297 } 298 299 return false; 300} 301 302std::optional<Value> JSONParser::ParseNextToken() { 303 return ParseToken(GetNextToken()); 304} 305 306std::optional<Value> JSONParser::ParseToken(Token token) { 307 switch (token) { 308 case T_OBJECT_BEGIN: 309 return ConsumeDictionary(); 310 case T_ARRAY_BEGIN: 311 return ConsumeList(); 312 case T_STRING: 313 return ConsumeString(); 314 case T_NUMBER: 315 return ConsumeNumber(); 316 case T_BOOL_TRUE: 317 case T_BOOL_FALSE: 318 case T_NULL: 319 return ConsumeLiteral(); 320 default: 321 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); 322 return std::nullopt; 323 } 324} 325 326std::optional<Value> JSONParser::ConsumeDictionary() { 327 if (ConsumeChar() != '{') { 328 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); 329 return std::nullopt; 330 } 331 332 StackMarker depth_check(max_depth_, &stack_depth_); 333 if (depth_check.IsTooDeep()) { 334 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0); 335 return std::nullopt; 336 } 337 338 std::vector<Value::DictStorage::value_type> dict_storage; 339 340 Token token = GetNextToken(); 341 while (token != T_OBJECT_END) { 342 if (token != T_STRING) { 343 ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1); 344 return std::nullopt; 345 } 346 347 // First consume the key. 348 StringBuilder key; 349 if (!ConsumeStringRaw(&key)) { 350 return std::nullopt; 351 } 352 353 // Read the separator. 354 token = GetNextToken(); 355 if (token != T_OBJECT_PAIR_SEPARATOR) { 356 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); 357 return std::nullopt; 358 } 359 360 // The next token is the value. Ownership transfers to |dict|. 361 ConsumeChar(); 362 std::optional<Value> value = ParseNextToken(); 363 if (!value) { 364 // ReportError from deeper level. 365 return std::nullopt; 366 } 367 368 dict_storage.emplace_back(key.DestructiveAsString(), 369 std::make_unique<Value>(std::move(*value))); 370 371 token = GetNextToken(); 372 if (token == T_LIST_SEPARATOR) { 373 ConsumeChar(); 374 token = GetNextToken(); 375 if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) { 376 ReportError(JSONReader::JSON_TRAILING_COMMA, 1); 377 return std::nullopt; 378 } 379 } else if (token != T_OBJECT_END) { 380 ReportError(JSONReader::JSON_SYNTAX_ERROR, 0); 381 return std::nullopt; 382 } 383 } 384 385 ConsumeChar(); // Closing '}'. 386 387 return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES)); 388} 389 390std::optional<Value> JSONParser::ConsumeList() { 391 if (ConsumeChar() != '[') { 392 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); 393 return std::nullopt; 394 } 395 396 StackMarker depth_check(max_depth_, &stack_depth_); 397 if (depth_check.IsTooDeep()) { 398 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0); 399 return std::nullopt; 400 } 401 402 Value::ListStorage list_storage; 403 404 Token token = GetNextToken(); 405 while (token != T_ARRAY_END) { 406 std::optional<Value> item = ParseToken(token); 407 if (!item) { 408 // ReportError from deeper level. 409 return std::nullopt; 410 } 411 412 list_storage.push_back(std::move(*item)); 413 414 token = GetNextToken(); 415 if (token == T_LIST_SEPARATOR) { 416 ConsumeChar(); 417 token = GetNextToken(); 418 if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) { 419 ReportError(JSONReader::JSON_TRAILING_COMMA, 1); 420 return std::nullopt; 421 } 422 } else if (token != T_ARRAY_END) { 423 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); 424 return std::nullopt; 425 } 426 } 427 428 ConsumeChar(); // Closing ']'. 429 430 return Value(std::move(list_storage)); 431} 432 433std::optional<Value> JSONParser::ConsumeString() { 434 StringBuilder string; 435 if (!ConsumeStringRaw(&string)) 436 return std::nullopt; 437 438 return Value(string.DestructiveAsString()); 439} 440 441bool JSONParser::ConsumeStringRaw(StringBuilder* out) { 442 if (ConsumeChar() != '"') { 443 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); 444 return false; 445 } 446 447 // StringBuilder will internally build a std::string_view unless a UTF-16 448 // conversion occurs, at which point it will perform a copy into a 449 // std::string. 450 StringBuilder string(pos()); 451 452 while (PeekChar()) { 453 uint32_t next_char = 0; 454 if (!ReadUnicodeCharacter(input_.data(), 455 static_cast<int32_t>(input_.length()), &index_, 456 &next_char) || 457 !IsValidCharacter(next_char)) { 458 if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) { 459 ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1); 460 return false; 461 } 462 ConsumeChar(); 463 string.Append(kUnicodeReplacementPoint); 464 continue; 465 } 466 467 if (next_char == '"') { 468 ConsumeChar(); 469 *out = std::move(string); 470 return true; 471 } else if (next_char != '\\') { 472 // If this character is not an escape sequence... 473 ConsumeChar(); 474 string.Append(next_char); 475 } else { 476 // And if it is an escape sequence, the input string will be adjusted 477 // (either by combining the two characters of an encoded escape sequence, 478 // or with a UTF conversion), so using std::string_view isn't possible -- 479 // force a conversion. 480 string.Convert(); 481 482 // Read past the escape '\' and ensure there's a character following. 483 std::optional<std::string_view> escape_sequence = ConsumeChars(2); 484 if (!escape_sequence) { 485 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); 486 return false; 487 } 488 489 switch ((*escape_sequence)[1]) { 490 // Allowed esape sequences: 491 case 'x': { // UTF-8 sequence. 492 // UTF-8 \x escape sequences are not allowed in the spec, but they 493 // are supported here for backwards-compatiblity with the old parser. 494 escape_sequence = ConsumeChars(2); 495 if (!escape_sequence) { 496 ReportError(JSONReader::JSON_INVALID_ESCAPE, -2); 497 return false; 498 } 499 500 int hex_digit = 0; 501 if (!HexStringToInt(*escape_sequence, &hex_digit) || 502 !IsValidCharacter(hex_digit)) { 503 ReportError(JSONReader::JSON_INVALID_ESCAPE, -2); 504 return false; 505 } 506 507 string.Append(hex_digit); 508 break; 509 } 510 case 'u': { // UTF-16 sequence. 511 // UTF units are of the form \uXXXX. 512 uint32_t code_point; 513 if (!DecodeUTF16(&code_point)) { 514 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); 515 return false; 516 } 517 string.Append(code_point); 518 break; 519 } 520 case '"': 521 string.Append('"'); 522 break; 523 case '\\': 524 string.Append('\\'); 525 break; 526 case '/': 527 string.Append('/'); 528 break; 529 case 'b': 530 string.Append('\b'); 531 break; 532 case 'f': 533 string.Append('\f'); 534 break; 535 case 'n': 536 string.Append('\n'); 537 break; 538 case 'r': 539 string.Append('\r'); 540 break; 541 case 't': 542 string.Append('\t'); 543 break; 544 case 'v': // Not listed as valid escape sequence in the RFC. 545 string.Append('\v'); 546 break; 547 // All other escape squences are illegal. 548 default: 549 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); 550 return false; 551 } 552 } 553 } 554 555 ReportError(JSONReader::JSON_SYNTAX_ERROR, 0); 556 return false; 557} 558 559// Entry is at the first X in \uXXXX. 560bool JSONParser::DecodeUTF16(uint32_t* out_code_point) { 561 std::optional<std::string_view> escape_sequence = ConsumeChars(4); 562 if (!escape_sequence) 563 return false; 564 565 // Consume the UTF-16 code unit, which may be a high surrogate. 566 int code_unit16_high = 0; 567 if (!HexStringToInt(*escape_sequence, &code_unit16_high)) 568 return false; 569 570 // If this is a high surrogate, consume the next code unit to get the 571 // low surrogate. 572 if (CBU16_IS_SURROGATE(code_unit16_high)) { 573 // Make sure this is the high surrogate. If not, it's an encoding 574 // error. 575 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high)) 576 return false; 577 578 // Make sure that the token has more characters to consume the 579 // lower surrogate. 580 if (!ConsumeIfMatch("\\u")) 581 return false; 582 583 escape_sequence = ConsumeChars(4); 584 if (!escape_sequence) 585 return false; 586 587 int code_unit16_low = 0; 588 if (!HexStringToInt(*escape_sequence, &code_unit16_low)) 589 return false; 590 591 if (!CBU16_IS_TRAIL(code_unit16_low)) 592 return false; 593 594 uint32_t code_point = 595 CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low); 596 if (!IsValidCharacter(code_point)) 597 return false; 598 599 *out_code_point = code_point; 600 } else { 601 // Not a surrogate. 602 DCHECK(CBU16_IS_SINGLE(code_unit16_high)); 603 if (!IsValidCharacter(code_unit16_high)) { 604 if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) { 605 return false; 606 } 607 *out_code_point = kUnicodeReplacementPoint; 608 return true; 609 } 610 611 *out_code_point = code_unit16_high; 612 } 613 614 return true; 615} 616 617std::optional<Value> JSONParser::ConsumeNumber() { 618 const char* num_start = pos(); 619 const int start_index = index_; 620 int end_index = start_index; 621 622 if (PeekChar() == '-') 623 ConsumeChar(); 624 625 if (!ReadInt(false)) { 626 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); 627 return std::nullopt; 628 } 629 end_index = index_; 630 631 // The optional fraction part. 632 if (PeekChar() == '.') { 633 ConsumeChar(); 634 if (!ReadInt(true)) { 635 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); 636 return std::nullopt; 637 } 638 end_index = index_; 639 } 640 641 // Optional exponent part. 642 std::optional<char> c = PeekChar(); 643 if (c == 'e' || c == 'E') { 644 ConsumeChar(); 645 if (PeekChar() == '-' || PeekChar() == '+') { 646 ConsumeChar(); 647 } 648 if (!ReadInt(true)) { 649 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); 650 return std::nullopt; 651 } 652 end_index = index_; 653 } 654 655 // ReadInt is greedy because numbers have no easily detectable sentinel, 656 // so save off where the parser should be on exit (see Consume invariant at 657 // the top of the header), then make sure the next token is one which is 658 // valid. 659 int exit_index = index_; 660 661 switch (GetNextToken()) { 662 case T_OBJECT_END: 663 case T_ARRAY_END: 664 case T_LIST_SEPARATOR: 665 case T_END_OF_INPUT: 666 break; 667 default: 668 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); 669 return std::nullopt; 670 } 671 672 index_ = exit_index; 673 674 std::string_view num_string(num_start, end_index - start_index); 675 676 int num_int; 677 if (StringToInt(num_string, &num_int)) 678 return Value(num_int); 679 680 return std::nullopt; 681} 682 683bool JSONParser::ReadInt(bool allow_leading_zeros) { 684 size_t len = 0; 685 char first = 0; 686 687 while (std::optional<char> c = PeekChar()) { 688 if (!IsAsciiDigit(c)) 689 break; 690 691 if (len == 0) 692 first = *c; 693 694 ++len; 695 ConsumeChar(); 696 } 697 698 if (len == 0) 699 return false; 700 701 if (!allow_leading_zeros && len > 1 && first == '0') 702 return false; 703 704 return true; 705} 706 707std::optional<Value> JSONParser::ConsumeLiteral() { 708 if (ConsumeIfMatch("true")) { 709 return Value(true); 710 } else if (ConsumeIfMatch("false")) { 711 return Value(false); 712 } else if (ConsumeIfMatch("null")) { 713 return Value(Value::Type::NONE); 714 } else { 715 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); 716 return std::nullopt; 717 } 718} 719 720bool JSONParser::ConsumeIfMatch(std::string_view match) { 721 if (match == PeekChars(match.size())) { 722 ConsumeChars(match.size()); 723 return true; 724 } 725 return false; 726} 727 728void JSONParser::ReportError(JSONReader::JsonParseError code, 729 int column_adjust) { 730 error_code_ = code; 731 error_line_ = line_number_; 732 error_column_ = index_ - index_last_line_ + column_adjust; 733} 734 735// static 736std::string JSONParser::FormatErrorMessage(int line, 737 int column, 738 const std::string& description) { 739 if (line || column) { 740 return StringPrintf("Line: %i, column: %i, %s", line, column, 741 description.c_str()); 742 } 743 return description; 744} 745 746} // namespace internal 747} // namespace base 748