1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/json/string_escape.h"
6
7 #include <stddef.h>
8 #include <stdint.h>
9
10 #include <limits>
11 #include <string>
12
13 #include "base/strings/string_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "base/strings/utf_string_conversion_utils.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/third_party/icu/icu_utf.h"
18
19 namespace base {
20
21 namespace {
22
// printf-style template used to render a single code unit as a JSON \uXXXX
// escape (four upper-case hexadecimal digits).
constexpr char kU16EscapeFormat[] = "\\u%04X";

// Code point substituted for any input code unit that fails to decode.
constexpr uint32_t kReplacementCodePoint = 0xFFFD;

// EscapeSpecialCodePoint() hard-codes the escape "\u003C" for '<'; make sure
// that literal really matches the character's value.
static_assert('<' == 0x3C, "less than sign must be 0x3c");
31
// Returns true iff every code unit of |str| has a value in [0, 126], i.e. the
// string is eligible for the ASCII-only fast path in EscapeJSONStringImpl().
// An empty string is trivially ASCII.
//
// Each code unit is widened to uint32_t before the comparison. Plain |char|
// is signed on many platforms, which makes the bytes 0x80-0xFF negative; a
// direct `ch > 126` test would then accept UTF-8 lead/continuation bytes as
// ASCII and let non-ASCII input bypass decoding, validation and the
// U+2028/U+2029 escapes. Converting to an unsigned type first maps those
// bytes to large positive values so they are rejected. char16_t units are
// already unsigned and are unaffected.
//
// Note that DEL (0x7F) is rejected as well; that only routes such strings
// through the general (non-fast) path.
template <typename S>
bool IsAscii(const S& str) {
  for (auto ch : str) {
    if (static_cast<uint32_t>(ch) > 126)
      return false;
  }
  return true;
}
40
// Returns the number of output bytes taken by the escape sequence that
// EscapeSpecialCodePoint() emits for |ch|, or 0 when |ch| is copied to the
// output unchanged:
//   - two-character escapes (\b \f \n \r \t \\ \") -> 2
//   - '<' and control characters below 0x20 (\uXXXX) -> 6
//   - everything else                               -> 0
size_t ComputeAsciiEscapedSize(char ch) {
  switch (ch) {
    case '\b':
    case '\f':
    case '\n':
    case '\r':
    case '\t':
    case '\\':
    case '"':
      return 2;
    case '<':  // Special case, consistent with EscapeSpecialCodePoint().
      return 6;
    default:
      // Compare as unsigned: where plain |char| is signed, bytes 0x80-0xFF
      // are negative and would otherwise be counted as control characters,
      // even though EscapeSpecialCodePoint() never escapes them.
      return static_cast<unsigned char>(ch) < 32 ? 6 : 0;
  }
}
59
60 // Try to escape the |code_point| if it is a known special character. If
61 // successful, returns true and appends the escape sequence to |dest|. This
62 // isn't required by the spec, but it's more readable by humans.
EscapeSpecialCodePoint(uint32_t code_point, std::string* dest)63 bool EscapeSpecialCodePoint(uint32_t code_point, std::string* dest) {
64 // WARNING: if you add a new case here, you need to update the reader as well.
65 // Note: \v is in the reader, but not here since the JSON spec doesn't
66 // allow it.
67 switch (code_point) {
68 case '\b':
69 dest->append("\\b");
70 break;
71 case '\f':
72 dest->append("\\f");
73 break;
74 case '\n':
75 dest->append("\\n");
76 break;
77 case '\r':
78 dest->append("\\r");
79 break;
80 case '\t':
81 dest->append("\\t");
82 break;
83 case '\\':
84 dest->append("\\\\");
85 break;
86 case '"':
87 dest->append("\\\"");
88 break;
89 // Escape < to prevent script execution; escaping > is not necessary and
90 // not doing so save a few bytes.
91 case '<':
92 dest->append("\\u003C");
93 break;
94 // Escape the "Line Separator" and "Paragraph Separator" characters, since
95 // they should be treated like a new line \r or \n.
96 case 0x2028:
97 dest->append("\\u2028");
98 break;
99 case 0x2029:
100 dest->append("\\u2029");
101 break;
102 default:
103 if (code_point >= 32)
104 return false;
105 // Escape non-printing characters.
106 base::StringAppendF(dest, kU16EscapeFormat, code_point);
107 }
108 return true;
109 }
110
111 template <typename S>
EscapeJSONStringImpl(const S& str, bool put_in_quotes, std::string* dest)112 bool EscapeJSONStringImpl(const S& str, bool put_in_quotes, std::string* dest) {
113 bool did_replacement = false;
114
115 if (put_in_quotes)
116 dest->push_back('"');
117
118 // Most input strings are ASCII only and do not need UTF-8 parsing or
119 // even escaping at all.
120 if (IsAscii(str)) {
121 size_t escapes_size = 0;
122 for (auto ch : str)
123 escapes_size += ComputeAsciiEscapedSize(ch);
124
125 if (escapes_size == 0) {
126 dest->append(str.begin(), str.end());
127 } else {
128 dest->reserve(dest->size() + str.size() + escapes_size);
129 for (auto ch : str) {
130 if (!EscapeSpecialCodePoint(ch, dest))
131 dest->push_back(ch);
132 }
133 }
134 } else {
135 // Casting is necessary because ICU uses int32_t. Try and do so safely.
136 CHECK_LE(str.length(),
137 static_cast<size_t>(std::numeric_limits<int32_t>::max()));
138 const int32_t length = static_cast<int32_t>(str.length());
139
140 for (int32_t i = 0; i < length; ++i) {
141 uint32_t code_point;
142 if (!ReadUnicodeCharacter(str.data(), length, &i, &code_point) ||
143 code_point == static_cast<decltype(code_point)>(CBU_SENTINEL) ||
144 !IsValidCharacter(code_point)) {
145 code_point = kReplacementCodePoint;
146 did_replacement = true;
147 }
148
149 if (!EscapeSpecialCodePoint(code_point, dest))
150 WriteUnicodeCharacter(code_point, dest);
151 }
152 }
153
154 if (put_in_quotes)
155 dest->push_back('"');
156
157 return !did_replacement;
158 }
159
160 } // namespace
161
EscapeJSONString(std::string_view str, bool put_in_quotes, std::string* dest)162 void EscapeJSONString(std::string_view str,
163 bool put_in_quotes,
164 std::string* dest) {
165 EscapeJSONStringImpl(str, put_in_quotes, dest);
166 }
167
EscapeJSONString(std::u16string_view str, bool put_in_quotes, std::string* dest)168 void EscapeJSONString(std::u16string_view str,
169 bool put_in_quotes,
170 std::string* dest) {
171 EscapeJSONStringImpl(str, put_in_quotes, dest);
172 }
173
EscapeBytesAsInvalidJSONString(std::string_view str, bool put_in_quotes)174 std::string EscapeBytesAsInvalidJSONString(std::string_view str,
175 bool put_in_quotes) {
176 std::string dest;
177
178 if (put_in_quotes)
179 dest.push_back('"');
180
181 for (std::string_view::const_iterator it = str.begin(); it != str.end();
182 ++it) {
183 unsigned char c = *it;
184 if (EscapeSpecialCodePoint(c, &dest))
185 continue;
186
187 if (c < 32 || c > 126)
188 base::StringAppendF(&dest, kU16EscapeFormat, c);
189 else
190 dest.push_back(*it);
191 }
192
193 if (put_in_quotes)
194 dest.push_back('"');
195
196 return dest;
197 }
198
199 } // namespace base
200