16d528ed9Sopenharmony_ci// Copyright (c) 2012 The Chromium Authors. All rights reserved.
26d528ed9Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
36d528ed9Sopenharmony_ci// found in the LICENSE file.
46d528ed9Sopenharmony_ci
56d528ed9Sopenharmony_ci#ifndef BASE_JSON_JSON_PARSER_H_
66d528ed9Sopenharmony_ci#define BASE_JSON_JSON_PARSER_H_
76d528ed9Sopenharmony_ci
86d528ed9Sopenharmony_ci#include <stddef.h>
96d528ed9Sopenharmony_ci#include <stdint.h>
106d528ed9Sopenharmony_ci
116d528ed9Sopenharmony_ci#include <memory>
126d528ed9Sopenharmony_ci#include <optional>
136d528ed9Sopenharmony_ci#include <string>
146d528ed9Sopenharmony_ci#include <string_view>
156d528ed9Sopenharmony_ci
166d528ed9Sopenharmony_ci#include "base/compiler_specific.h"
176d528ed9Sopenharmony_ci#include "base/gtest_prod_util.h"
186d528ed9Sopenharmony_ci#include "base/json/json_reader.h"
196d528ed9Sopenharmony_ci
206d528ed9Sopenharmony_cinamespace base {
216d528ed9Sopenharmony_ci
// Forward declaration; this header only names Value in the std::optional<Value>
// return types of the parser's member function declarations below.
226d528ed9Sopenharmony_ciclass Value;
236d528ed9Sopenharmony_ci
246d528ed9Sopenharmony_cinamespace internal {
256d528ed9Sopenharmony_ci
// Forward declaration so that JSONParser can name this class in its friend
// declaration.
266d528ed9Sopenharmony_ciclass JSONParserTest;
276d528ed9Sopenharmony_ci
286d528ed9Sopenharmony_ci// The implementation behind the JSONReader interface. This class is not meant
296d528ed9Sopenharmony_ci// to be used directly; it encapsulates logic that need not be exposed publicly.
306d528ed9Sopenharmony_ci//
316d528ed9Sopenharmony_ci// This parser guarantees O(n) time through the input string. Iteration happens
326d528ed9Sopenharmony_ci// on the byte level, with the functions ConsumeChars() and ConsumeChar(). The
336d528ed9Sopenharmony_ci// conversion from byte to JSON token happens without advancing the parser in
346d528ed9Sopenharmony_ci// GetNextToken/ParseToken; that is, tokenization operates on the current
356d528ed9Sopenharmony_ci// parser position without advancing.
366d528ed9Sopenharmony_ci//
376d528ed9Sopenharmony_ci// Built on top of these are a family of Consume functions that iterate
386d528ed9Sopenharmony_ci// internally. Invariant: on entry of a Consume function, the parser is wound
396d528ed9Sopenharmony_ci// to the first byte of a valid JSON token. On exit, it is on the first byte
406d528ed9Sopenharmony_ci// after the token that was just consumed, which would likely be the first byte
416d528ed9Sopenharmony_ci// of the next token.
426d528ed9Sopenharmony_ciclass JSONParser {
436d528ed9Sopenharmony_ci public:
  // |options| are the base::JSONParserOptions that control parsing and
  // |max_depth| is the maximum depth to parse (see |options_|/|max_depth_|).
446d528ed9Sopenharmony_ci  JSONParser(int options, int max_depth = JSONReader::kStackMaxDepth);
456d528ed9Sopenharmony_ci  ~JSONParser();
466d528ed9Sopenharmony_ci
476d528ed9Sopenharmony_ci  // Parses the input string according to the set options and returns the
486d528ed9Sopenharmony_ci  // result as a Value.
496d528ed9Sopenharmony_ci  // Wrap this in base::FooValue::From() to check the Value is of type Foo and
506d528ed9Sopenharmony_ci  // convert to a FooValue at the same time.
  // NOTE(review): presumably returns std::nullopt on a parse error, with details
  // available from the error accessors below -- confirm against the .cc file.
516d528ed9Sopenharmony_ci  std::optional<Value> Parse(std::string_view input);
526d528ed9Sopenharmony_ci
536d528ed9Sopenharmony_ci  // Returns the error code.
546d528ed9Sopenharmony_ci  JSONReader::JsonParseError error_code() const;
556d528ed9Sopenharmony_ci
566d528ed9Sopenharmony_ci  // Returns the human-friendly error message.
576d528ed9Sopenharmony_ci  std::string GetErrorMessage() const;
586d528ed9Sopenharmony_ci
596d528ed9Sopenharmony_ci  // Returns the error line number if a parse error happened. Otherwise always
606d528ed9Sopenharmony_ci  // returns 0.
616d528ed9Sopenharmony_ci  int error_line() const;
626d528ed9Sopenharmony_ci
636d528ed9Sopenharmony_ci  // Returns the error column number if a parse error happened. Otherwise always
646d528ed9Sopenharmony_ci  // returns 0.
656d528ed9Sopenharmony_ci  int error_column() const;
666d528ed9Sopenharmony_ci
676d528ed9Sopenharmony_ci private:
  // The kinds of token reported by GetNextToken() and accepted by ParseToken().
686d528ed9Sopenharmony_ci  enum Token {
696d528ed9Sopenharmony_ci    T_OBJECT_BEGIN,  // {
706d528ed9Sopenharmony_ci    T_OBJECT_END,    // }
716d528ed9Sopenharmony_ci    T_ARRAY_BEGIN,   // [
726d528ed9Sopenharmony_ci    T_ARRAY_END,     // ]
736d528ed9Sopenharmony_ci    T_STRING,
746d528ed9Sopenharmony_ci    T_NUMBER,
756d528ed9Sopenharmony_ci    T_BOOL_TRUE,              // true
766d528ed9Sopenharmony_ci    T_BOOL_FALSE,             // false
776d528ed9Sopenharmony_ci    T_NULL,                   // null
786d528ed9Sopenharmony_ci    T_LIST_SEPARATOR,         // ,
796d528ed9Sopenharmony_ci    T_OBJECT_PAIR_SEPARATOR,  // :
806d528ed9Sopenharmony_ci    T_END_OF_INPUT,
816d528ed9Sopenharmony_ci    T_INVALID_TOKEN,
826d528ed9Sopenharmony_ci  };
836d528ed9Sopenharmony_ci
846d528ed9Sopenharmony_ci  // A helper class used for parsing strings. One optimization performed is to
856d528ed9Sopenharmony_ci  // create base::Value with a std::string_view to avoid unnecessary std::string
866d528ed9Sopenharmony_ci  // copies. This is not possible if the input string needs to be decoded from
876d528ed9Sopenharmony_ci  // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
886d528ed9Sopenharmony_ci  // This class centralizes that logic.
896d528ed9Sopenharmony_ci  class StringBuilder {
906d528ed9Sopenharmony_ci   public:
916d528ed9Sopenharmony_ci    // Empty constructor. Used for creating a builder that is assigned to later.
926d528ed9Sopenharmony_ci    StringBuilder();
936d528ed9Sopenharmony_ci
946d528ed9Sopenharmony_ci    // |pos| is the beginning of an input string, excluding the |"|.
956d528ed9Sopenharmony_ci    explicit StringBuilder(const char* pos);
966d528ed9Sopenharmony_ci
976d528ed9Sopenharmony_ci    ~StringBuilder();
986d528ed9Sopenharmony_ci
    // Move assignment; this is the only assignment operator declared here.
996d528ed9Sopenharmony_ci    StringBuilder& operator=(StringBuilder&& other);
1006d528ed9Sopenharmony_ci
1016d528ed9Sopenharmony_ci    // Appends the Unicode code point |point| to the string, either by
1026d528ed9Sopenharmony_ci    // increasing the |length_| of the string if the string has not been
1036d528ed9Sopenharmony_ci    // converted, or by appending the UTF8 bytes for the code point.
1046d528ed9Sopenharmony_ci    void Append(uint32_t point);
1056d528ed9Sopenharmony_ci
1066d528ed9Sopenharmony_ci    // Converts the builder from its default std::string_view to a full
1076d528ed9Sopenharmony_ci    // std::string, performing a copy. Once a builder is converted, it cannot be
1086d528ed9Sopenharmony_ci    // made a std::string_view again.
1096d528ed9Sopenharmony_ci    void Convert();
1106d528ed9Sopenharmony_ci
1116d528ed9Sopenharmony_ci    // Returns the builder as a string, invalidating all state. This allows
1126d528ed9Sopenharmony_ci    // the internal string buffer representation to be destructively moved
1136d528ed9Sopenharmony_ci    // in cases where the builder will not be needed any more.
1146d528ed9Sopenharmony_ci    std::string DestructiveAsString();
1156d528ed9Sopenharmony_ci
1166d528ed9Sopenharmony_ci   private:
1176d528ed9Sopenharmony_ci    // The beginning of the input string.
1186d528ed9Sopenharmony_ci    const char* pos_;
1196d528ed9Sopenharmony_ci
1206d528ed9Sopenharmony_ci    // Number of bytes in |pos_| that make up the string being built.
1216d528ed9Sopenharmony_ci    size_t length_;
1226d528ed9Sopenharmony_ci
1236d528ed9Sopenharmony_ci    // The copied string representation. Will be unset until Convert() is
1246d528ed9Sopenharmony_ci    // called.
1256d528ed9Sopenharmony_ci    std::optional<std::string> string_;
1266d528ed9Sopenharmony_ci  };
1276d528ed9Sopenharmony_ci
1286d528ed9Sopenharmony_ci  // Returns the next |count| bytes of the input stream, or nullopt if fewer
1296d528ed9Sopenharmony_ci  // than |count| bytes remain. Does not advance the parser position.
1306d528ed9Sopenharmony_ci  std::optional<std::string_view> PeekChars(int count);
1316d528ed9Sopenharmony_ci
1326d528ed9Sopenharmony_ci  // Calls PeekChars() with a |count| of 1.
1336d528ed9Sopenharmony_ci  std::optional<char> PeekChar();
1346d528ed9Sopenharmony_ci
1356d528ed9Sopenharmony_ci  // Returns the next |count| bytes of the input stream, or nullopt if fewer
1366d528ed9Sopenharmony_ci  // than |count| bytes remain, and advances the parser position by |count|.
1376d528ed9Sopenharmony_ci  std::optional<std::string_view> ConsumeChars(int count);
1386d528ed9Sopenharmony_ci
1396d528ed9Sopenharmony_ci  // Calls ConsumeChars() with a |count| of 1.
1406d528ed9Sopenharmony_ci  std::optional<char> ConsumeChar();
1416d528ed9Sopenharmony_ci
1426d528ed9Sopenharmony_ci  // Returns a pointer to the current character position.
1436d528ed9Sopenharmony_ci  const char* pos();
1446d528ed9Sopenharmony_ci
1456d528ed9Sopenharmony_ci  // Skips over whitespace and comments to find the next token in the stream.
1466d528ed9Sopenharmony_ci  // This does not advance the parser for non-whitespace or comment chars.
1476d528ed9Sopenharmony_ci  Token GetNextToken();
1486d528ed9Sopenharmony_ci
1496d528ed9Sopenharmony_ci  // Consumes whitespace characters and comments until a character that is
1506d528ed9Sopenharmony_ci  // neither is encountered.
1516d528ed9Sopenharmony_ci  void EatWhitespaceAndComments();
1526d528ed9Sopenharmony_ci  // Helper function that consumes a comment, assuming that the parser is
1536d528ed9Sopenharmony_ci  // currently wound to a '/'.
  // NOTE(review): the meaning of the returned bool is not stated here --
  // presumably whether a comment was consumed; confirm against the .cc file.
1546d528ed9Sopenharmony_ci  bool EatComment();
1556d528ed9Sopenharmony_ci
1566d528ed9Sopenharmony_ci  // Calls GetNextToken() and then ParseToken().
1576d528ed9Sopenharmony_ci  std::optional<Value> ParseNextToken();
1586d528ed9Sopenharmony_ci
1596d528ed9Sopenharmony_ci  // Takes a token that represents the start of a Value ("a structural token"
1606d528ed9Sopenharmony_ci  // in RFC terms) and consumes it, returning the result as a Value.
1616d528ed9Sopenharmony_ci  std::optional<Value> ParseToken(Token token);
1626d528ed9Sopenharmony_ci
1636d528ed9Sopenharmony_ci  // Assuming that the parser is currently wound to '{', this parses a JSON
1646d528ed9Sopenharmony_ci  // object into a Value.
1656d528ed9Sopenharmony_ci  std::optional<Value> ConsumeDictionary();
1666d528ed9Sopenharmony_ci
1676d528ed9Sopenharmony_ci  // Assuming that the parser is wound to '[', this parses a JSON list into a
1686d528ed9Sopenharmony_ci  // Value.
1696d528ed9Sopenharmony_ci  std::optional<Value> ConsumeList();
1706d528ed9Sopenharmony_ci
1716d528ed9Sopenharmony_ci  // Calls through ConsumeStringRaw and wraps it in a value.
1726d528ed9Sopenharmony_ci  std::optional<Value> ConsumeString();
1736d528ed9Sopenharmony_ci
1746d528ed9Sopenharmony_ci  // Assuming that the parser is wound to a double quote, this parses a string,
1756d528ed9Sopenharmony_ci  // decoding any escape sequences and converting UTF-16 to UTF-8. Returns true
1766d528ed9Sopenharmony_ci  // on success and places the result into |out|. Returns false on failure with
1776d528ed9Sopenharmony_ci  // error information set.
1786d528ed9Sopenharmony_ci  bool ConsumeStringRaw(StringBuilder* out);
1796d528ed9Sopenharmony_ci  // Helper function for ConsumeStringRaw() that consumes the next four or 10
1806d528ed9Sopenharmony_ci  // bytes (parser is wound to the first character of a HEX sequence, with the
1816d528ed9Sopenharmony_ci  // potential for consuming another \uXXXX for a surrogate). Returns true on
1826d528ed9Sopenharmony_ci  // success, placing the code point in |out_code_point|, and false on failure.
1836d528ed9Sopenharmony_ci  bool DecodeUTF16(uint32_t* out_code_point);
1846d528ed9Sopenharmony_ci
1856d528ed9Sopenharmony_ci  // Assuming that the parser is wound to the start of a valid JSON number,
1866d528ed9Sopenharmony_ci  // this parses and converts it to either an int or double value.
1876d528ed9Sopenharmony_ci  std::optional<Value> ConsumeNumber();
1886d528ed9Sopenharmony_ci  // Helper that reads characters that are ints. Returns true if a number was
1896d528ed9Sopenharmony_ci  // read and false on error.
1906d528ed9Sopenharmony_ci  bool ReadInt(bool allow_leading_zeros);
1916d528ed9Sopenharmony_ci
1926d528ed9Sopenharmony_ci  // Consumes the literal values of |true|, |false|, and |null|, assuming the
1936d528ed9Sopenharmony_ci  // parser is wound to the first character of any of those.
1946d528ed9Sopenharmony_ci  std::optional<Value> ConsumeLiteral();
1956d528ed9Sopenharmony_ci
1966d528ed9Sopenharmony_ci  // Helper function that returns true if the byte sequence |match| can be
1976d528ed9Sopenharmony_ci  // consumed at the current parser position. Returns false if there are fewer
1986d528ed9Sopenharmony_ci  // than |match|-length bytes or if the sequence does not match, and the
1996d528ed9Sopenharmony_ci  // parser state is unchanged.
2006d528ed9Sopenharmony_ci  bool ConsumeIfMatch(std::string_view match);
2016d528ed9Sopenharmony_ci
2026d528ed9Sopenharmony_ci  // Sets the error information to |code| at the current column, based on
2036d528ed9Sopenharmony_ci  // |index_| and |index_last_line_|, with an optional positive/negative
2046d528ed9Sopenharmony_ci  // adjustment by |column_adjust|.
2056d528ed9Sopenharmony_ci  void ReportError(JSONReader::JsonParseError code, int column_adjust);
2066d528ed9Sopenharmony_ci
2076d528ed9Sopenharmony_ci  // Given the line and column number of an error, formats one of the error
2086d528ed9Sopenharmony_ci  // message constants from json_reader.h for human display.
2096d528ed9Sopenharmony_ci  static std::string FormatErrorMessage(int line,
2106d528ed9Sopenharmony_ci                                        int column,
2116d528ed9Sopenharmony_ci                                        const std::string& description);
2126d528ed9Sopenharmony_ci
2136d528ed9Sopenharmony_ci  // base::JSONParserOptions that control parsing.
2146d528ed9Sopenharmony_ci  const int options_;
2156d528ed9Sopenharmony_ci
2166d528ed9Sopenharmony_ci  // Maximum depth to parse.
2176d528ed9Sopenharmony_ci  const int max_depth_;
2186d528ed9Sopenharmony_ci
2196d528ed9Sopenharmony_ci  // The input stream being parsed. Note: Not guaranteed to be NUL-terminated.
2206d528ed9Sopenharmony_ci  std::string_view input_;
2216d528ed9Sopenharmony_ci
2226d528ed9Sopenharmony_ci  // The index in the input stream to which the parser is wound.
2236d528ed9Sopenharmony_ci  int index_;
2246d528ed9Sopenharmony_ci
2256d528ed9Sopenharmony_ci  // The number of times the parser has recursed (current stack depth).
2266d528ed9Sopenharmony_ci  int stack_depth_;
2276d528ed9Sopenharmony_ci
2286d528ed9Sopenharmony_ci  // The line number that the parser is at currently.
2296d528ed9Sopenharmony_ci  int line_number_;
2306d528ed9Sopenharmony_ci
2316d528ed9Sopenharmony_ci  // The last value of |index_| on the previous line.
2326d528ed9Sopenharmony_ci  int index_last_line_;
2336d528ed9Sopenharmony_ci
2346d528ed9Sopenharmony_ci  // Error information (see error_code(), error_line() and error_column()).
2356d528ed9Sopenharmony_ci  JSONReader::JsonParseError error_code_;
2366d528ed9Sopenharmony_ci  int error_line_;
2376d528ed9Sopenharmony_ci  int error_column_;
2386d528ed9Sopenharmony_ci
  // The unit tests are friends so that they can reach the private members
  // declared above.
2396d528ed9Sopenharmony_ci  friend class JSONParserTest;
2406d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
2416d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
2426d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
2436d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
2446d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
2456d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
2466d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
2476d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters);
2486d528ed9Sopenharmony_ci  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidUTF16EscapeSequence);
2496d528ed9Sopenharmony_ci
  // Not copyable: copy construction and copy assignment are deleted.
2506d528ed9Sopenharmony_ci  JSONParser(const JSONParser&) = delete;
2516d528ed9Sopenharmony_ci  JSONParser& operator=(const JSONParser&) = delete;
2526d528ed9Sopenharmony_ci};
2536d528ed9Sopenharmony_ci
2546d528ed9Sopenharmony_ci// Used when an invalid UTF-8 sequence is encountered while decoding.
// Only declared here (extern); the definition lives in another file.
2556d528ed9Sopenharmony_ciextern const char kUnicodeReplacementString[];
2566d528ed9Sopenharmony_ci
2576d528ed9Sopenharmony_ci}  // namespace internal
2586d528ed9Sopenharmony_ci}  // namespace base
2596d528ed9Sopenharmony_ci
2606d528ed9Sopenharmony_ci#endif  // BASE_JSON_JSON_PARSER_H_
261