1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/json/string_escape.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 
10 #include <limits>
11 #include <string>
12 
13 #include "base/strings/string_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "base/strings/utf_string_conversion_utils.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/third_party/icu/icu_utf.h"
18 
19 namespace base {
20 
21 namespace {
22 
// printf-style format used to emit a code unit as a \uXXXX escape sequence
// (four upper-case hexadecimal digits, zero padded).
constexpr char kU16EscapeFormat[] = "\\u%04X";

// Code point substituted for input that cannot be decoded (U+FFFD).
constexpr uint32_t kReplacementCodePoint = 0xFFFD;

// EscapeSpecialCodePoint() hard-codes the escape for '<' as \u003C, so the
// execution character set must agree with that value.
static_assert('<' == 0x3C, "less than sign must be 0x3c");
31 
// Returns true if every code unit of |str| has a value in [0, 126], i.e. the
// string qualifies for the ASCII fast path in EscapeJSONStringImpl(). An empty
// string is considered ASCII. Note that 0x7F (DEL) fails this test and is
// therefore handled by the general path.
//
// The comparison is performed on the value widened to uint32_t. Plain |char|
// is signed on many platforms, so a byte >= 0x80 would otherwise be a negative
// number, pass a naive `ch > 126` test and bypass validation of non-ASCII
// input. Converting a negative value to uint32_t yields a large value, which
// is correctly rejected; char16_t values are unaffected by the conversion.
template <typename S>
bool IsAscii(const S& str) {
  for (auto ch : str) {
    if (static_cast<uint32_t>(ch) > 126)
      return false;
  }
  return true;
}
40 
// Returns the length of the escape sequence that EscapeSpecialCodePoint()
// emits for |ch|, or 0 if |ch| is copied to the output unescaped:
//   - 2 for the two-character escapes (\b \f \n \r \t \\ \"),
//   - 6 for '<' and for control characters below 0x20 (\uXXXX form).
// The result is used to size the output buffer and to detect whether any
// escaping is needed at all.
size_t ComputeAsciiEscapedSize(char ch) {
  switch (ch) {
    case '\b':
    case '\f':
    case '\n':
    case '\r':
    case '\t':
    case '\\':
    case '"':
      return 2;
    case '<':  // Special case, consistent with EscapeSpecialCodePoint below.
      return 6;
    default:
      // Compare as unsigned: where plain |char| is signed, bytes >= 0x80 are
      // negative and would be misreported as control characters even though
      // EscapeSpecialCodePoint() never escapes them.
      return static_cast<unsigned char>(ch) < 32 ? 6 : 0;
  }
}
59 
60 // Try to escape the |code_point| if it is a known special character. If
61 // successful, returns true and appends the escape sequence to |dest|. This
62 // isn't required by the spec, but it's more readable by humans.
EscapeSpecialCodePoint(uint32_t code_point, std::string* dest)63 bool EscapeSpecialCodePoint(uint32_t code_point, std::string* dest) {
64   // WARNING: if you add a new case here, you need to update the reader as well.
65   // Note: \v is in the reader, but not here since the JSON spec doesn't
66   // allow it.
67   switch (code_point) {
68     case '\b':
69       dest->append("\\b");
70       break;
71     case '\f':
72       dest->append("\\f");
73       break;
74     case '\n':
75       dest->append("\\n");
76       break;
77     case '\r':
78       dest->append("\\r");
79       break;
80     case '\t':
81       dest->append("\\t");
82       break;
83     case '\\':
84       dest->append("\\\\");
85       break;
86     case '"':
87       dest->append("\\\"");
88       break;
89     // Escape < to prevent script execution; escaping > is not necessary and
90     // not doing so save a few bytes.
91     case '<':
92       dest->append("\\u003C");
93       break;
94     // Escape the "Line Separator" and "Paragraph Separator" characters, since
95     // they should be treated like a new line \r or \n.
96     case 0x2028:
97       dest->append("\\u2028");
98       break;
99     case 0x2029:
100       dest->append("\\u2029");
101       break;
102     default:
103       if (code_point >= 32)
104         return false;
105       // Escape non-printing characters.
106       base::StringAppendF(dest, kU16EscapeFormat, code_point);
107   }
108   return true;
109 }
110 
111 template <typename S>
EscapeJSONStringImpl(const S& str, bool put_in_quotes, std::string* dest)112 bool EscapeJSONStringImpl(const S& str, bool put_in_quotes, std::string* dest) {
113   bool did_replacement = false;
114 
115   if (put_in_quotes)
116     dest->push_back('"');
117 
118   // Most input strings are ASCII only and do not need UTF-8 parsing or
119   // even escaping at all.
120   if (IsAscii(str)) {
121     size_t escapes_size = 0;
122     for (auto ch : str)
123       escapes_size += ComputeAsciiEscapedSize(ch);
124 
125     if (escapes_size == 0) {
126       dest->append(str.begin(), str.end());
127     } else {
128       dest->reserve(dest->size() + str.size() + escapes_size);
129       for (auto ch : str) {
130         if (!EscapeSpecialCodePoint(ch, dest))
131           dest->push_back(ch);
132       }
133     }
134   } else {
135     // Casting is necessary because ICU uses int32_t. Try and do so safely.
136     CHECK_LE(str.length(),
137              static_cast<size_t>(std::numeric_limits<int32_t>::max()));
138     const int32_t length = static_cast<int32_t>(str.length());
139 
140     for (int32_t i = 0; i < length; ++i) {
141       uint32_t code_point;
142       if (!ReadUnicodeCharacter(str.data(), length, &i, &code_point) ||
143           code_point == static_cast<decltype(code_point)>(CBU_SENTINEL) ||
144           !IsValidCharacter(code_point)) {
145         code_point = kReplacementCodePoint;
146         did_replacement = true;
147       }
148 
149       if (!EscapeSpecialCodePoint(code_point, dest))
150         WriteUnicodeCharacter(code_point, dest);
151     }
152   }
153 
154   if (put_in_quotes)
155     dest->push_back('"');
156 
157   return !did_replacement;
158 }
159 
160 }  // namespace
161 
EscapeJSONString(std::string_view str, bool put_in_quotes, std::string* dest)162 void EscapeJSONString(std::string_view str,
163                       bool put_in_quotes,
164                       std::string* dest) {
165   EscapeJSONStringImpl(str, put_in_quotes, dest);
166 }
167 
EscapeJSONString(std::u16string_view str, bool put_in_quotes, std::string* dest)168 void EscapeJSONString(std::u16string_view str,
169                       bool put_in_quotes,
170                       std::string* dest) {
171   EscapeJSONStringImpl(str, put_in_quotes, dest);
172 }
173 
EscapeBytesAsInvalidJSONString(std::string_view str, bool put_in_quotes)174 std::string EscapeBytesAsInvalidJSONString(std::string_view str,
175                                            bool put_in_quotes) {
176   std::string dest;
177 
178   if (put_in_quotes)
179     dest.push_back('"');
180 
181   for (std::string_view::const_iterator it = str.begin(); it != str.end();
182        ++it) {
183     unsigned char c = *it;
184     if (EscapeSpecialCodePoint(c, &dest))
185       continue;
186 
187     if (c < 32 || c > 126)
188       base::StringAppendF(&dest, kU16EscapeFormat, c);
189     else
190       dest.push_back(*it);
191   }
192 
193   if (put_in_quotes)
194     dest.push_back('"');
195 
196   return dest;
197 }
198 
199 }  // namespace base
200