xref: /third_party/gn/src/base/json/json_parser.h (revision 6d528ed9)
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef BASE_JSON_JSON_PARSER_H_
6#define BASE_JSON_JSON_PARSER_H_
7
8#include <stddef.h>
9#include <stdint.h>
10
11#include <memory>
12#include <optional>
13#include <string>
14#include <string_view>
15
16#include "base/compiler_specific.h"
17#include "base/gtest_prod_util.h"
18#include "base/json/json_reader.h"
19
20namespace base {
21
// Forward declaration; within this header Value is only named as
// std::optional<Value> in function return types.
22class Value;
23
24namespace internal {
25
// Forward declaration of the test fixture that JSONParser declares as a friend.
26class JSONParserTest;
27
28// The implementation behind the JSONReader interface. This class is not meant
29// to be used directly; it encapsulates logic that need not be exposed publicly.
30//
31// This parser guarantees O(n) time through the input string. Iteration happens
32// on the byte level, with the functions ConsumeChars() and ConsumeChar(). The
33// conversion from byte to JSON token happens without advancing the parser in
34// GetNextToken/ParseToken, that is tokenization operates on the current parser
35// position without advancing.
36//
37// Built on top of these are a family of Consume functions that iterate
38// internally. Invariant: on entry of a Consume function, the parser is wound
39// to the first byte of a valid JSON token. On exit, it is on the first byte
40// after the token that was just consumed, which would likely be the first byte
41// of the next token.
42class JSONParser {
43 public:
    // |options| holds the base::JSONParserOptions that control parsing and
    // |max_depth| is the maximum depth to parse, which defaults to
    // JSONReader::kStackMaxDepth. NOTE(review): presumably these are stored in
    // |options_| and |max_depth_| -- confirm against the .cc file.
44  JSONParser(int options, int max_depth = JSONReader::kStackMaxDepth);
45  ~JSONParser();
46
47  // Parses the input string according to the set options and returns the
48  // result as a Value.
49  // Wrap this in base::FooValue::From() to check the Value is of type Foo and
50  // convert to a FooValue at the same time.
    // NOTE(review): presumably std::nullopt is returned on failure, with details
    // available from the error accessors below -- confirm against the .cc file.
51  std::optional<Value> Parse(std::string_view input);
52
53  // Returns the error code.
54  JSONReader::JsonParseError error_code() const;
55
56  // Returns the human-friendly error message.
57  std::string GetErrorMessage() const;
58
59  // Returns the line number of the error if a parse error happened. Otherwise
60  // always returns 0.
61  int error_line() const;
62
63  // Returns the column number of the error if a parse error happened. Otherwise
64  // always returns 0.
65  int error_column() const;
66
67 private:
    // The kinds of token returned by GetNextToken() and accepted by
    // ParseToken(). Trailing comments show the literal text where there is one.
68  enum Token {
69    T_OBJECT_BEGIN,  // {
70    T_OBJECT_END,    // }
71    T_ARRAY_BEGIN,   // [
72    T_ARRAY_END,     // ]
73    T_STRING,
74    T_NUMBER,
75    T_BOOL_TRUE,              // true
76    T_BOOL_FALSE,             // false
77    T_NULL,                   // null
78    T_LIST_SEPARATOR,         // ,
79    T_OBJECT_PAIR_SEPARATOR,  // :
80    T_END_OF_INPUT,
81    T_INVALID_TOKEN,
82  };
83
84  // A helper class used for parsing strings. One optimization performed is to
85  // create base::Value with a std::string_view to avoid unnecessary std::string
86  // copies. This is not possible if the input string needs to be decoded from
87  // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
88  // This class centralizes that logic.
89  class StringBuilder {
90   public:
91    // Empty constructor. Used for creating a builder that is assigned to later.
92    StringBuilder();
93
94    // |pos| is the beginning of an input string, excluding the |"|.
95    explicit StringBuilder(const char* pos);
96
97    ~StringBuilder();
98
      // Move-assigns |other| into this builder.
99    StringBuilder& operator=(StringBuilder&& other);
100
101    // Appends the Unicode code point |point| to the string, either by
102    // increasing the |length_| of the string if the string has not been
103    // converted, or by appending the UTF8 bytes for the code point.
104    void Append(uint32_t point);
105
106    // Converts the builder from its default std::string_view to a full
107    // std::string, performing a copy. Once a builder is converted, it cannot be
108    // made a std::string_view again.
109    void Convert();
110
111    // Returns the builder as a string, invalidating all state. This allows
112    // the internal string buffer representation to be destructively moved
113    // in cases where the builder will not be needed any more.
114    std::string DestructiveAsString();
115
116   private:
117    // The beginning of the input string.
118    const char* pos_;
119
120    // Number of bytes in |pos_| that make up the string being built.
121    size_t length_;
122
123    // The copied string representation. Will be unset until Convert() is
124    // called.
125    std::optional<std::string> string_;
126  };
127
128  // Returns the next |count| bytes of the input stream, or nullopt if fewer
129  // than |count| bytes remain.
130  std::optional<std::string_view> PeekChars(int count);
131
132  // Calls PeekChars() with a |count| of 1.
133  std::optional<char> PeekChar();
134
135  // Returns the next |count| bytes of the input stream, or nullopt if fewer
136  // than |count| bytes remain, and advances the parser position by |count|.
137  std::optional<std::string_view> ConsumeChars(int count);
138
139  // Calls ConsumeChars() with a |count| of 1.
140  std::optional<char> ConsumeChar();
141
142  // Returns a pointer to the current character position.
143  const char* pos();
144
145  // Skips over whitespace and comments to find the next token in the stream.
146  // This does not advance the parser for non-whitespace or comment chars.
147  Token GetNextToken();
148
149  // Consumes whitespace characters and comments until a character that is
150  // neither is encountered.
151  void EatWhitespaceAndComments();
152  // Helper function that consumes a comment, assuming that the parser is
153  // currently wound to a '/'.
154  bool EatComment();
155
156  // Calls GetNextToken() and then ParseToken().
157  std::optional<Value> ParseNextToken();
158
159  // Takes a token that represents the start of a Value ("a structural token"
160  // in RFC terms) and consumes it, returning the result as a Value.
161  std::optional<Value> ParseToken(Token token);
162
163  // Assuming that the parser is currently wound to '{', this parses a JSON
164  // object into a Value.
165  std::optional<Value> ConsumeDictionary();
166
167  // Assuming that the parser is wound to '[', this parses a JSON list into a
168  // Value.
169  std::optional<Value> ConsumeList();
170
171  // Calls through ConsumeStringRaw and wraps it in a value.
172  std::optional<Value> ConsumeString();
173
174  // Assuming that the parser is wound to a double quote, this parses a string,
175  // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
176  // success and places result into |out|. Returns false on failure with
177  // error information set.
178  bool ConsumeStringRaw(StringBuilder* out);
179  // Helper function for ConsumeStringRaw() that consumes the next four or 10
180  // bytes (parser is wound to the first character of a HEX sequence, with the
181  // potential for consuming another \uXXXX for a surrogate). Returns true and
182  // places the code point in |out_code_point| on success, false on failure.
183  bool DecodeUTF16(uint32_t* out_code_point);
184
185  // Assuming that the parser is wound to the start of a valid JSON number,
186  // this parses and converts it to either an int or double value.
187  std::optional<Value> ConsumeNumber();
188  // Helper that reads characters that are ints. Returns true if a number was
189  // read and false on error.
190  bool ReadInt(bool allow_leading_zeros);
191
192  // Consumes the literal values of |true|, |false|, and |null|, assuming the
193  // parser is wound to the first character of any of those.
194  std::optional<Value> ConsumeLiteral();
195
196  // Helper function that returns true if the byte sequence |match| can be
197  // consumed at the current parser position. Returns false if there are fewer
198  // than |match|-length bytes or if the sequence does not match, and the
199  // parser state is unchanged.
200  bool ConsumeIfMatch(std::string_view match);
201
202  // Sets the error information to |code| at the current column, based on
203  // |index_| and |index_last_line_|, with an optional positive/negative
204  // adjustment by |column_adjust|.
205  void ReportError(JSONReader::JsonParseError code, int column_adjust);
206
207  // Given the line and column number of an error, formats one of the error
208  // message constants from json_reader.h for human display.
209  static std::string FormatErrorMessage(int line,
210                                        int column,
211                                        const std::string& description);
212
213  // base::JSONParserOptions that control parsing.
214  const int options_;
215
216  // Maximum depth to parse.
217  const int max_depth_;
218
219  // The input stream being parsed. Note: Not guaranteed to be NUL-terminated.
220  std::string_view input_;
221
222  // The index in the input stream to which the parser is wound.
223  int index_;
224
225  // The number of times the parser has recursed (current stack depth).
226  int stack_depth_;
227
228  // The line number that the parser is at currently.
229  int line_number_;
230
231  // The last value of |index_| on the previous line.
232  int index_last_line_;
233
234  // Error information. NOTE(review): presumably what error_code(),
     // error_line() and error_column() report -- confirm against the .cc file.
235  JSONReader::JsonParseError error_code_;
236  int error_line_;
237  int error_column_;
238
     // Test-only friends: the JSONParserTest fixture and the individual test
     // cases listed below are given access to the private members above.
239  friend class JSONParserTest;
240  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
241  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
242  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
243  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
244  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
245  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
246  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
247  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters);
248  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidUTF16EscapeSequence);
249
     // Copy construction and copy assignment are deleted.
250  JSONParser(const JSONParser&) = delete;
251  JSONParser& operator=(const JSONParser&) = delete;
252};
253
254// Used when decoding encounters an invalid UTF-8 sequence. This is only an
// extern declaration; the array is defined outside this header.
255extern const char kUnicodeReplacementString[];
256
257}  // namespace internal
258}  // namespace base
259
260#endif  // BASE_JSON_JSON_PARSER_H_
261