1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/string_util.h"
6
7 #include <ctype.h>
8 #include <errno.h>
9 #include <math.h>
10 #include <stdarg.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <time.h>
16 #include <wchar.h>
17 #include <wctype.h>
18
19 #include <algorithm>
20 #include <iterator>
21 #include <limits>
22 #include <string>
23 #include <vector>
24
25 #include "base/logging.h"
26 #include "base/strings/utf_string_conversion_utils.h"
27 #include "base/strings/utf_string_conversions.h"
28 #include "base/third_party/icu/icu_utf.h"
29 #include "util/build_config.h"
30
31 namespace base {
32
33 namespace {
34
35 // Used by ReplaceStringPlaceholders to track the position in the string of
36 // replaced parameters.
37 struct ReplacementOffset {
ReplacementOffsetbase::__anon2855::ReplacementOffset38 ReplacementOffset(uintptr_t parameter, size_t offset)
39 : parameter(parameter), offset(offset) {}
40
41 // Index of the parameter.
42 uintptr_t parameter;
43
44 // Starting position in the string.
45 size_t offset;
46 };
47
CompareParameter(const ReplacementOffset& elem1, const ReplacementOffset& elem2)48 static bool CompareParameter(const ReplacementOffset& elem1,
49 const ReplacementOffset& elem2) {
50 return elem1.parameter < elem2.parameter;
51 }
52
53 // Assuming that a pointer is the size of a "machine word", then
54 // uintptr_t is an integer type that is also a machine word.
55 typedef uintptr_t MachineWord;
56 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
57
IsAlignedToMachineWord(const void* pointer)58 inline bool IsAlignedToMachineWord(const void* pointer) {
59 return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
60 }
61
62 template <typename T>
AlignToMachineWord(T* pointer)63 inline T* AlignToMachineWord(T* pointer) {
64 return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
65 ~kMachineWordAlignmentMask);
66 }
67
68 template <size_t size, typename CharacterType>
69 struct NonASCIIMask;
70 template <>
71 struct NonASCIIMask<4, char16_t> {
valuebase::__anon2855::NonASCIIMask72 static inline uint32_t value() { return 0xFF80FF80U; }
73 };
74 template <>
75 struct NonASCIIMask<4, char> {
valuebase::__anon2855::NonASCIIMask76 static inline uint32_t value() { return 0x80808080U; }
77 };
78 template <>
79 struct NonASCIIMask<8, char16_t> {
valuebase::__anon2855::NonASCIIMask80 static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
81 };
82 template <>
83 struct NonASCIIMask<8, char> {
valuebase::__anon2855::NonASCIIMask84 static inline uint64_t value() { return 0x8080808080808080ULL; }
85 };
86
87 } // namespace
88
89 namespace {
90
91 template <typename StringType>
ToLowerASCIIImpl( std::basic_string_view<typename StringType::value_type> str)92 StringType ToLowerASCIIImpl(
93 std::basic_string_view<typename StringType::value_type> str) {
94 StringType ret;
95 ret.reserve(str.size());
96 for (size_t i = 0; i < str.size(); i++)
97 ret.push_back(ToLowerASCII(str[i]));
98 return ret;
99 }
100
101 template <typename StringType>
ToUpperASCIIImpl( std::basic_string_view<typename StringType::value_type> str)102 StringType ToUpperASCIIImpl(
103 std::basic_string_view<typename StringType::value_type> str) {
104 StringType ret;
105 ret.reserve(str.size());
106 for (size_t i = 0; i < str.size(); i++)
107 ret.push_back(ToUpperASCII(str[i]));
108 return ret;
109 }
110
111 } // namespace
112
ToLowerASCII(std::string_view str)113 std::string ToLowerASCII(std::string_view str) {
114 return ToLowerASCIIImpl<std::string>(str);
115 }
116
ToLowerASCII(std::u16string_view str)117 std::u16string ToLowerASCII(std::u16string_view str) {
118 return ToLowerASCIIImpl<std::u16string>(str);
119 }
120
ToUpperASCII(std::string_view str)121 std::string ToUpperASCII(std::string_view str) {
122 return ToUpperASCIIImpl<std::string>(str);
123 }
124
ToUpperASCII(std::u16string_view str)125 std::u16string ToUpperASCII(std::u16string_view str) {
126 return ToUpperASCIIImpl<std::u16string>(str);
127 }
128
starts_with(const std::string_view str1, const std::string_view str2)129 bool starts_with(const std::string_view str1, const std::string_view str2) {
130 if (str2.length() > str1.length()) {
131 return false;
132 }
133 return str1.compare(0, str2.length(), str2) == 0;
134 }
135
ends_with(const std::string_view str1, const std::string_view str2)136 bool ends_with(const std::string_view str1, const std::string_view str2) {
137 if (str2.empty()) {
138 return true;
139 }
140 if (str1.length() < str2.length()) {
141 return false;
142 }
143 return str1.substr(str1.length() - str2.length()) == str2;
144 }
145
146 template <class StringType>
CompareCaseInsensitiveASCIIT( std::basic_string_view<typename StringType::value_type> a, std::basic_string_view<typename StringType::value_type> b)147 int CompareCaseInsensitiveASCIIT(
148 std::basic_string_view<typename StringType::value_type> a,
149 std::basic_string_view<typename StringType::value_type> b) {
150 // Find the first characters that aren't equal and compare them. If the end
151 // of one of the strings is found before a nonequal character, the lengths
152 // of the strings are compared.
153 size_t i = 0;
154 while (i < a.length() && i < b.length()) {
155 typename StringType::value_type lower_a = ToLowerASCII(a[i]);
156 typename StringType::value_type lower_b = ToLowerASCII(b[i]);
157 if (lower_a < lower_b)
158 return -1;
159 if (lower_a > lower_b)
160 return 1;
161 i++;
162 }
163
164 // End of one string hit before finding a different character. Expect the
165 // common case to be "strings equal" at this point so check that first.
166 if (a.length() == b.length())
167 return 0;
168
169 if (a.length() < b.length())
170 return -1;
171 return 1;
172 }
173
CompareCaseInsensitiveASCII(std::string_view a, std::string_view b)174 int CompareCaseInsensitiveASCII(std::string_view a, std::string_view b) {
175 return CompareCaseInsensitiveASCIIT<std::string>(a, b);
176 }
177
CompareCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b)178 int CompareCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
179 return CompareCaseInsensitiveASCIIT<std::u16string>(a, b);
180 }
181
EqualsCaseInsensitiveASCII(std::string_view a, std::string_view b)182 bool EqualsCaseInsensitiveASCII(std::string_view a, std::string_view b) {
183 if (a.length() != b.length())
184 return false;
185 return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0;
186 }
187
EqualsCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b)188 bool EqualsCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
189 if (a.length() != b.length())
190 return false;
191 return CompareCaseInsensitiveASCIIT<std::u16string>(a, b) == 0;
192 }
193
194 template <class StringType>
195 bool ReplaceCharsT(
196 const StringType& input,
197 std::basic_string_view<typename StringType::value_type> find_any_of_these,
198 std::basic_string_view<typename StringType::value_type> replace_with,
199 StringType* output);
200
ReplaceChars(const std::u16string& input, std::u16string_view replace_chars, const std::u16string& replace_with, std::u16string* output)201 bool ReplaceChars(const std::u16string& input,
202 std::u16string_view replace_chars,
203 const std::u16string& replace_with,
204 std::u16string* output) {
205 return ReplaceCharsT(input, replace_chars, std::u16string_view(replace_with),
206 output);
207 }
208
ReplaceChars(const std::string& input, std::string_view replace_chars, const std::string& replace_with, std::string* output)209 bool ReplaceChars(const std::string& input,
210 std::string_view replace_chars,
211 const std::string& replace_with,
212 std::string* output) {
213 return ReplaceCharsT(input, replace_chars, std::string_view(replace_with),
214 output);
215 }
216
RemoveChars(const std::u16string& input, std::u16string_view remove_chars, std::u16string* output)217 bool RemoveChars(const std::u16string& input,
218 std::u16string_view remove_chars,
219 std::u16string* output) {
220 return ReplaceCharsT(input, remove_chars, std::u16string_view(), output);
221 }
222
RemoveChars(const std::string& input, std::string_view remove_chars, std::string* output)223 bool RemoveChars(const std::string& input,
224 std::string_view remove_chars,
225 std::string* output) {
226 return ReplaceCharsT(input, remove_chars, std::string_view(), output);
227 }
228
229 template <typename Str>
TrimStringT( const Str& input, std::basic_string_view<typename Str::value_type> trim_chars, TrimPositions positions, Str* output)230 TrimPositions TrimStringT(
231 const Str& input,
232 std::basic_string_view<typename Str::value_type> trim_chars,
233 TrimPositions positions,
234 Str* output) {
235 // Find the edges of leading/trailing whitespace as desired. Need to use
236 // a std::string_view version of input to be able to call find* on it with the
237 // std::string_view version of trim_chars (normally the trim_chars will be a
238 // constant so avoid making a copy).
239 std::basic_string_view<typename Str::value_type> input_piece(input);
240 const size_t last_char = input.length() - 1;
241 const size_t first_good_char = (positions & TRIM_LEADING)
242 ? input_piece.find_first_not_of(trim_chars)
243 : 0;
244 const size_t last_good_char = (positions & TRIM_TRAILING)
245 ? input_piece.find_last_not_of(trim_chars)
246 : last_char;
247
248 // When the string was all trimmed, report that we stripped off characters
249 // from whichever position the caller was interested in. For empty input, we
250 // stripped no characters, but we still need to clear |output|.
251 if (input.empty() || (first_good_char == Str::npos) ||
252 (last_good_char == Str::npos)) {
253 bool input_was_empty = input.empty(); // in case output == &input
254 output->clear();
255 return input_was_empty ? TRIM_NONE : positions;
256 }
257
258 // Trim.
259 *output = input.substr(first_good_char, last_good_char - first_good_char + 1);
260
261 // Return where we trimmed from.
262 return static_cast<TrimPositions>(
263 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
264 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
265 }
266
TrimString(const std::u16string& input, std::u16string_view trim_chars, std::u16string* output)267 bool TrimString(const std::u16string& input,
268 std::u16string_view trim_chars,
269 std::u16string* output) {
270 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
271 }
272
TrimString(const std::string& input, std::string_view trim_chars, std::string* output)273 bool TrimString(const std::string& input,
274 std::string_view trim_chars,
275 std::string* output) {
276 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
277 }
278
279 template <typename char_type>
TrimStringPieceT( std::basic_string_view<char_type> input, std::basic_string_view<char_type> trim_chars, TrimPositions positions)280 std::basic_string_view<char_type> TrimStringPieceT(
281 std::basic_string_view<char_type> input,
282 std::basic_string_view<char_type> trim_chars,
283 TrimPositions positions) {
284 size_t begin =
285 (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0;
286 if (begin == std::basic_string_view<char_type>::npos)
287 return std::basic_string_view<char_type>(); // All trimmed.
288
289 size_t end = (positions & TRIM_TRAILING)
290 ? input.find_last_not_of(trim_chars) + 1
291 : input.size();
292 return input.substr(begin, end - begin);
293 }
294
TrimString(std::u16string_view input, std::u16string_view trim_chars, TrimPositions positions)295 std::u16string_view TrimString(std::u16string_view input,
296 std::u16string_view trim_chars,
297 TrimPositions positions) {
298 return TrimStringPieceT(input, trim_chars, positions);
299 }
300
TrimString(std::string_view input, std::string_view trim_chars, TrimPositions positions)301 std::string_view TrimString(std::string_view input,
302 std::string_view trim_chars,
303 TrimPositions positions) {
304 return TrimStringPieceT(input, trim_chars, positions);
305 }
306
TruncateUTF8ToByteSize(const std::string& input, const size_t byte_size, std::string* output)307 void TruncateUTF8ToByteSize(const std::string& input,
308 const size_t byte_size,
309 std::string* output) {
310 DCHECK(output);
311 if (byte_size > input.length()) {
312 *output = input;
313 return;
314 }
315 DCHECK_LE(byte_size,
316 static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
317 // Note: This cast is necessary because CBU8_NEXT uses int32_ts.
318 int32_t truncation_length = static_cast<int32_t>(byte_size);
319 int32_t char_index = truncation_length - 1;
320 const char* data = input.data();
321
322 // Using CBU8, we will move backwards from the truncation point
323 // to the beginning of the string looking for a valid UTF8
324 // character. Once a full UTF8 character is found, we will
325 // truncate the string to the end of that character.
326 while (char_index >= 0) {
327 int32_t prev = char_index;
328 base_icu::UChar32 code_point = 0;
329 CBU8_NEXT(data, char_index, truncation_length, code_point);
330 if (!IsValidCharacter(code_point) || !IsValidCodepoint(code_point)) {
331 char_index = prev - 1;
332 } else {
333 break;
334 }
335 }
336
337 if (char_index >= 0)
338 *output = input.substr(0, char_index);
339 else
340 output->clear();
341 }
342
TrimWhitespace(const std::u16string& input, TrimPositions positions, std::u16string* output)343 TrimPositions TrimWhitespace(const std::u16string& input,
344 TrimPositions positions,
345 std::u16string* output) {
346 return TrimStringT(input, std::u16string_view(kWhitespaceUTF16), positions,
347 output);
348 }
349
TrimWhitespace(std::u16string_view input, TrimPositions positions)350 std::u16string_view TrimWhitespace(std::u16string_view input,
351 TrimPositions positions) {
352 return TrimStringPieceT(input, std::u16string_view(kWhitespaceUTF16),
353 positions);
354 }
355
TrimWhitespaceASCII(const std::string& input, TrimPositions positions, std::string* output)356 TrimPositions TrimWhitespaceASCII(const std::string& input,
357 TrimPositions positions,
358 std::string* output) {
359 return TrimStringT(input, std::string_view(kWhitespaceASCII), positions,
360 output);
361 }
362
TrimWhitespaceASCII(std::string_view input, TrimPositions positions)363 std::string_view TrimWhitespaceASCII(std::string_view input,
364 TrimPositions positions) {
365 return TrimStringPieceT(input, std::string_view(kWhitespaceASCII), positions);
366 }
367
368 template <typename STR>
CollapseWhitespaceT(const STR& text, bool trim_sequences_with_line_breaks)369 STR CollapseWhitespaceT(const STR& text, bool trim_sequences_with_line_breaks) {
370 STR result;
371 result.resize(text.size());
372
373 // Set flags to pretend we're already in a trimmed whitespace sequence, so we
374 // will trim any leading whitespace.
375 bool in_whitespace = true;
376 bool already_trimmed = true;
377
378 int chars_written = 0;
379 for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
380 if (IsUnicodeWhitespace(*i)) {
381 if (!in_whitespace) {
382 // Reduce all whitespace sequences to a single space.
383 in_whitespace = true;
384 result[chars_written++] = L' ';
385 }
386 if (trim_sequences_with_line_breaks && !already_trimmed &&
387 ((*i == '\n') || (*i == '\r'))) {
388 // Whitespace sequences containing CR or LF are eliminated entirely.
389 already_trimmed = true;
390 --chars_written;
391 }
392 } else {
393 // Non-whitespace characters are copied straight across.
394 in_whitespace = false;
395 already_trimmed = false;
396 result[chars_written++] = *i;
397 }
398 }
399
400 if (in_whitespace && !already_trimmed) {
401 // Any trailing whitespace is eliminated.
402 --chars_written;
403 }
404
405 result.resize(chars_written);
406 return result;
407 }
408
CollapseWhitespace(const std::u16string& text, bool trim_sequences_with_line_breaks)409 std::u16string CollapseWhitespace(const std::u16string& text,
410 bool trim_sequences_with_line_breaks) {
411 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
412 }
413
CollapseWhitespaceASCII(const std::string& text, bool trim_sequences_with_line_breaks)414 std::string CollapseWhitespaceASCII(const std::string& text,
415 bool trim_sequences_with_line_breaks) {
416 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
417 }
418
ContainsOnlyChars(std::string_view input, std::string_view characters)419 bool ContainsOnlyChars(std::string_view input, std::string_view characters) {
420 return input.find_first_not_of(characters) == std::string_view::npos;
421 }
422
ContainsOnlyChars(std::u16string_view input, std::u16string_view characters)423 bool ContainsOnlyChars(std::u16string_view input,
424 std::u16string_view characters) {
425 return input.find_first_not_of(characters) == std::u16string_view::npos;
426 }
427
428 template <class Char>
DoIsStringASCII(const Char* characters, size_t length)429 inline bool DoIsStringASCII(const Char* characters, size_t length) {
430 MachineWord all_char_bits = 0;
431 const Char* end = characters + length;
432
433 // Prologue: align the input.
434 while (!IsAlignedToMachineWord(characters) && characters != end) {
435 all_char_bits |= *characters;
436 ++characters;
437 }
438
439 // Compare the values of CPU word size.
440 const Char* word_end = AlignToMachineWord(end);
441 const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
442 while (characters < word_end) {
443 all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
444 characters += loop_increment;
445 }
446
447 // Process the remaining bytes.
448 while (characters != end) {
449 all_char_bits |= *characters;
450 ++characters;
451 }
452
453 MachineWord non_ascii_bit_mask =
454 NonASCIIMask<sizeof(MachineWord), Char>::value();
455 return !(all_char_bits & non_ascii_bit_mask);
456 }
457
IsStringASCII(std::string_view str)458 bool IsStringASCII(std::string_view str) {
459 return DoIsStringASCII(str.data(), str.length());
460 }
461
IsStringASCII(std::u16string_view str)462 bool IsStringASCII(std::u16string_view str) {
463 return DoIsStringASCII(str.data(), str.length());
464 }
465
IsStringUTF8(std::string_view str)466 bool IsStringUTF8(std::string_view str) {
467 const char* src = str.data();
468 int32_t src_len = static_cast<int32_t>(str.length());
469 int32_t char_index = 0;
470
471 while (char_index < src_len) {
472 int32_t code_point;
473 CBU8_NEXT(src, char_index, src_len, code_point);
474 if (!IsValidCharacter(code_point))
475 return false;
476 }
477 return true;
478 }
479
480 // Implementation note: Normally this function will be called with a hardcoded
481 // constant for the lowercase_ascii parameter. Constructing a std::string_view
482 // from a C constant requires running strlen, so the result will be two passes
483 // through the buffers, one to file the length of lowercase_ascii, and one to
484 // compare each letter.
485 //
486 // This function could have taken a const char* to avoid this and only do one
487 // pass through the string. But the strlen is faster than the case-insensitive
488 // compares and lets us early-exit in the case that the strings are different
489 // lengths (will often be the case for non-matches). So whether one approach or
490 // the other will be faster depends on the case.
491 //
492 // The hardcoded strings are typically very short so it doesn't matter, and the
493 // string piece gives additional flexibility for the caller (doesn't have to be
494 // null terminated) so we choose the std::string_view route.
495 template <typename Str>
DoLowerCaseEqualsASCII( std::basic_string_view<typename Str::value_type> str, std::string_view lowercase_ascii)496 static inline bool DoLowerCaseEqualsASCII(
497 std::basic_string_view<typename Str::value_type> str,
498 std::string_view lowercase_ascii) {
499 if (str.size() != lowercase_ascii.size())
500 return false;
501 for (size_t i = 0; i < str.size(); i++) {
502 if (ToLowerASCII(str[i]) != lowercase_ascii[i])
503 return false;
504 }
505 return true;
506 }
507
LowerCaseEqualsASCII(std::string_view str, std::string_view lowercase_ascii)508 bool LowerCaseEqualsASCII(std::string_view str,
509 std::string_view lowercase_ascii) {
510 return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
511 }
512
LowerCaseEqualsASCII(std::u16string_view str, std::string_view lowercase_ascii)513 bool LowerCaseEqualsASCII(std::u16string_view str,
514 std::string_view lowercase_ascii) {
515 return DoLowerCaseEqualsASCII<std::u16string>(str, lowercase_ascii);
516 }
517
EqualsASCII(std::u16string_view str, std::string_view ascii)518 bool EqualsASCII(std::u16string_view str, std::string_view ascii) {
519 if (str.length() != ascii.length())
520 return false;
521 return std::equal(ascii.begin(), ascii.end(), str.begin());
522 }
523
524 template <typename char_type>
StartsWithCaseInsensitiveASCIIT( std::basic_string_view<char_type> str, std::basic_string_view<char_type> search_for)525 bool StartsWithCaseInsensitiveASCIIT(
526 std::basic_string_view<char_type> str,
527 std::basic_string_view<char_type> search_for) {
528 if (search_for.size() > str.size())
529 return false;
530
531 std::basic_string_view<char_type> source = str.substr(0, search_for.size());
532
533 return std::equal(search_for.begin(), search_for.end(), source.begin(),
534 CaseInsensitiveCompareASCII<char_type>());
535 }
536
StartsWithCaseInsensitiveASCII(std::string_view str, std::string_view search_for)537 bool StartsWithCaseInsensitiveASCII(std::string_view str,
538 std::string_view search_for) {
539 return StartsWithCaseInsensitiveASCIIT(str, search_for);
540 }
541
StartsWithCaseInsensitiveASCII(std::u16string_view str, std::u16string_view search_for)542 bool StartsWithCaseInsensitiveASCII(std::u16string_view str,
543 std::u16string_view search_for) {
544 return StartsWithCaseInsensitiveASCIIT(str, search_for);
545 }
546
547 template <typename char_type>
EndsWithCaseInsensitiveASCIIT( std::basic_string_view<char_type> str, std::basic_string_view<char_type> search_for)548 bool EndsWithCaseInsensitiveASCIIT(
549 std::basic_string_view<char_type> str,
550 std::basic_string_view<char_type> search_for) {
551 if (search_for.size() > str.size())
552 return false;
553
554 std::basic_string_view<char_type> source =
555 str.substr(str.size() - search_for.size(), search_for.size());
556
557 return std::equal(source.begin(), source.end(), search_for.begin(),
558 CaseInsensitiveCompareASCII<char_type>());
559 }
560
EndsWithCaseInsensitiveASCII(std::string_view str, std::string_view search_for)561 bool EndsWithCaseInsensitiveASCII(std::string_view str,
562 std::string_view search_for) {
563 return EndsWithCaseInsensitiveASCIIT(str, search_for);
564 }
565
EndsWithCaseInsensitiveASCII(std::u16string_view str, std::u16string_view search_for)566 bool EndsWithCaseInsensitiveASCII(std::u16string_view str,
567 std::u16string_view search_for) {
568 return EndsWithCaseInsensitiveASCIIT(str, search_for);
569 }
570
HexDigitToInt(char16_t c)571 char HexDigitToInt(char16_t c) {
572 DCHECK(IsHexDigit(c));
573 if (c >= '0' && c <= '9')
574 return static_cast<char>(c - '0');
575 if (c >= 'A' && c <= 'F')
576 return static_cast<char>(c - 'A' + 10);
577 if (c >= 'a' && c <= 'f')
578 return static_cast<char>(c - 'a' + 10);
579 return 0;
580 }
581
IsUnicodeWhitespace(char16_t c)582 bool IsUnicodeWhitespace(char16_t c) {
583 // kWhitespaceWide is a NULL-terminated string
584 for (const char16_t* cur = kWhitespaceUTF16; *cur; ++cur) {
585 if (*cur == c)
586 return true;
587 }
588 return false;
589 }
590
591 static const char* const kByteStringsUnlocalized[] = {" B", " kB", " MB",
592 " GB", " TB", " PB"};
593
FormatBytesUnlocalized(int64_t bytes)594 std::u16string FormatBytesUnlocalized(int64_t bytes) {
595 double unit_amount = static_cast<double>(bytes);
596 size_t dimension = 0;
597 const int kKilo = 1024;
598 while (unit_amount >= kKilo &&
599 dimension < std::size(kByteStringsUnlocalized) - 1) {
600 unit_amount /= kKilo;
601 dimension++;
602 }
603
604 char buf[64];
605 if (bytes != 0 && dimension > 0 && unit_amount < 100) {
606 base::snprintf(buf, std::size(buf), "%.1lf%s", unit_amount,
607 kByteStringsUnlocalized[dimension]);
608 } else {
609 base::snprintf(buf, std::size(buf), "%.0lf%s", unit_amount,
610 kByteStringsUnlocalized[dimension]);
611 }
612
613 return ASCIIToUTF16(buf);
614 }
615
616 // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings.
617 template <class StringType>
618 struct SubstringMatcher {
619 std::basic_string_view<typename StringType::value_type> find_this;
620
Findbase::SubstringMatcher621 size_t Find(const StringType& input, size_t pos) {
622 return input.find(find_this.data(), pos, find_this.length());
623 }
MatchSizebase::SubstringMatcher624 size_t MatchSize() { return find_this.length(); }
625 };
626
627 // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters.
628 template <class StringType>
629 struct CharacterMatcher {
630 std::basic_string_view<typename StringType::value_type> find_any_of_these;
631
Findbase::CharacterMatcher632 size_t Find(const StringType& input, size_t pos) {
633 return input.find_first_of(find_any_of_these.data(), pos,
634 find_any_of_these.length());
635 }
MatchSizebase::CharacterMatcher636 constexpr size_t MatchSize() { return 1; }
637 };
638
639 enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST };
640
641 // Runs in O(n) time in the length of |str|, and transforms the string without
642 // reallocating when possible. Returns |true| if any matches were found.
643 //
644 // This is parameterized on a |Matcher| traits type, so that it can be the
645 // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset().
646 template <class StringType, class Matcher>
DoReplaceMatchesAfterOffset( StringType* str, size_t initial_offset, Matcher matcher, std::basic_string_view<typename StringType::value_type> replace_with, ReplaceType replace_type)647 bool DoReplaceMatchesAfterOffset(
648 StringType* str,
649 size_t initial_offset,
650 Matcher matcher,
651 std::basic_string_view<typename StringType::value_type> replace_with,
652 ReplaceType replace_type) {
653 using CharTraits = typename StringType::traits_type;
654
655 const size_t find_length = matcher.MatchSize();
656 if (!find_length)
657 return false;
658
659 // If the find string doesn't appear, there's nothing to do.
660 size_t first_match = matcher.Find(*str, initial_offset);
661 if (first_match == StringType::npos)
662 return false;
663
664 // If we're only replacing one instance, there's no need to do anything
665 // complicated.
666 const size_t replace_length = replace_with.length();
667 if (replace_type == ReplaceType::REPLACE_FIRST) {
668 str->replace(first_match, find_length, replace_with.data(), replace_length);
669 return true;
670 }
671
672 // If the find and replace strings are the same length, we can simply use
673 // replace() on each instance, and finish the entire operation in O(n) time.
674 if (find_length == replace_length) {
675 auto* buffer = &((*str)[0]);
676 for (size_t offset = first_match; offset != StringType::npos;
677 offset = matcher.Find(*str, offset + replace_length)) {
678 CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
679 }
680 return true;
681 }
682
683 // Since the find and replace strings aren't the same length, a loop like the
684 // one above would be O(n^2) in the worst case, as replace() will shift the
685 // entire remaining string each time. We need to be more clever to keep things
686 // O(n).
687 //
688 // When the string is being shortened, it's possible to just shift the matches
689 // down in one pass while finding, and truncate the length at the end of the
690 // search.
691 //
692 // If the string is being lengthened, more work is required. The strategy used
693 // here is to make two find() passes through the string. The first pass counts
694 // the number of matches to determine the new size. The second pass will
695 // either construct the new string into a new buffer (if the existing buffer
696 // lacked capacity), or else -- if there is room -- create a region of scratch
697 // space after |first_match| by shifting the tail of the string to a higher
698 // index, and doing in-place moves from the tail to lower indices thereafter.
699 size_t str_length = str->length();
700 size_t expansion = 0;
701 if (replace_length > find_length) {
702 // This operation lengthens the string; determine the new length by counting
703 // matches.
704 const size_t expansion_per_match = (replace_length - find_length);
705 size_t num_matches = 0;
706 for (size_t match = first_match; match != StringType::npos;
707 match = matcher.Find(*str, match + find_length)) {
708 expansion += expansion_per_match;
709 ++num_matches;
710 }
711 const size_t final_length = str_length + expansion;
712
713 if (str->capacity() < final_length) {
714 // If we'd have to allocate a new buffer to grow the string, build the
715 // result directly into the new allocation via append().
716 StringType src(str->get_allocator());
717 str->swap(src);
718 str->reserve(final_length);
719
720 size_t pos = 0;
721 for (size_t match = first_match;; match = matcher.Find(src, pos)) {
722 str->append(src, pos, match - pos);
723 str->append(replace_with.data(), replace_length);
724 pos = match + find_length;
725
726 // A mid-loop test/break enables skipping the final Find() call; the
727 // number of matches is known, so don't search past the last one.
728 if (!--num_matches)
729 break;
730 }
731
732 // Handle substring after the final match.
733 str->append(src, pos, str_length - pos);
734 return true;
735 }
736
737 // Prepare for the copy/move loop below -- expand the string to its final
738 // size by shifting the data after the first match to the end of the resized
739 // string.
740 size_t shift_src = first_match + find_length;
741 size_t shift_dst = shift_src + expansion;
742
743 // Big |expansion| factors (relative to |str_length|) require padding up to
744 // |shift_dst|.
745 if (shift_dst > str_length)
746 str->resize(shift_dst);
747
748 str->replace(shift_dst, str_length - shift_src, *str, shift_src,
749 str_length - shift_src);
750 str_length = final_length;
751 }
752
753 // We can alternate replacement and move operations. This won't overwrite the
754 // unsearched region of the string so long as |write_offset| <= |read_offset|;
755 // that condition is always satisfied because:
756 //
757 // (a) If the string is being shortened, |expansion| is zero and
758 // |write_offset| grows slower than |read_offset|.
759 //
760 // (b) If the string is being lengthened, |write_offset| grows faster than
761 // |read_offset|, but |expansion| is big enough so that |write_offset|
762 // will only catch up to |read_offset| at the point of the last match.
763 auto* buffer = &((*str)[0]);
764 size_t write_offset = first_match;
765 size_t read_offset = first_match + expansion;
766 do {
767 if (replace_length) {
768 CharTraits::copy(buffer + write_offset, replace_with.data(),
769 replace_length);
770 write_offset += replace_length;
771 }
772 read_offset += find_length;
773
774 // min() clamps StringType::npos (the largest unsigned value) to str_length.
775 size_t match = std::min(matcher.Find(*str, read_offset), str_length);
776
777 size_t length = match - read_offset;
778 if (length) {
779 CharTraits::move(buffer + write_offset, buffer + read_offset, length);
780 write_offset += length;
781 read_offset += length;
782 }
783 } while (read_offset < str_length);
784
785 // If we're shortening the string, truncate it now.
786 str->resize(write_offset);
787 return true;
788 }
789
790 template <class StringType>
ReplaceCharsT( const StringType& input, std::basic_string_view<typename StringType::value_type> find_any_of_these, std::basic_string_view<typename StringType::value_type> replace_with, StringType* output)791 bool ReplaceCharsT(
792 const StringType& input,
793 std::basic_string_view<typename StringType::value_type> find_any_of_these,
794 std::basic_string_view<typename StringType::value_type> replace_with,
795 StringType* output) {
796 // Commonly, this is called with output and input being the same string; in
797 // that case, this assignment is inexpensive.
798 *output = input;
799
800 return DoReplaceMatchesAfterOffset(
801 output, 0, CharacterMatcher<StringType>{find_any_of_these}, replace_with,
802 ReplaceType::REPLACE_ALL);
803 }
804
ReplaceFirstSubstringAfterOffset(std::u16string* str, size_t start_offset, std::u16string_view find_this, std::u16string_view replace_with)805 void ReplaceFirstSubstringAfterOffset(std::u16string* str,
806 size_t start_offset,
807 std::u16string_view find_this,
808 std::u16string_view replace_with) {
809 DoReplaceMatchesAfterOffset(str, start_offset,
810 SubstringMatcher<std::u16string>{find_this},
811 replace_with, ReplaceType::REPLACE_FIRST);
812 }
813
ReplaceFirstSubstringAfterOffset(std::string* str, size_t start_offset, std::string_view find_this, std::string_view replace_with)814 void ReplaceFirstSubstringAfterOffset(std::string* str,
815 size_t start_offset,
816 std::string_view find_this,
817 std::string_view replace_with) {
818 DoReplaceMatchesAfterOffset(str, start_offset,
819 SubstringMatcher<std::string>{find_this},
820 replace_with, ReplaceType::REPLACE_FIRST);
821 }
822
ReplaceSubstringsAfterOffset(std::u16string* str, size_t start_offset, std::u16string_view find_this, std::u16string_view replace_with)823 void ReplaceSubstringsAfterOffset(std::u16string* str,
824 size_t start_offset,
825 std::u16string_view find_this,
826 std::u16string_view replace_with) {
827 DoReplaceMatchesAfterOffset(str, start_offset,
828 SubstringMatcher<std::u16string>{find_this},
829 replace_with, ReplaceType::REPLACE_ALL);
830 }
831
ReplaceSubstringsAfterOffset(std::string* str, size_t start_offset, std::string_view find_this, std::string_view replace_with)832 void ReplaceSubstringsAfterOffset(std::string* str,
833 size_t start_offset,
834 std::string_view find_this,
835 std::string_view replace_with) {
836 DoReplaceMatchesAfterOffset(str, start_offset,
837 SubstringMatcher<std::string>{find_this},
838 replace_with, ReplaceType::REPLACE_ALL);
839 }
840
841 template <class string_type>
WriteIntoT(string_type* str, size_t length_with_null)842 inline typename string_type::value_type* WriteIntoT(string_type* str,
843 size_t length_with_null) {
844 DCHECK_GT(length_with_null, 1u);
845 str->reserve(length_with_null);
846 str->resize(length_with_null - 1);
847 return &((*str)[0]);
848 }
849
WriteInto(std::string* str, size_t length_with_null)850 char* WriteInto(std::string* str, size_t length_with_null) {
851 return WriteIntoT(str, length_with_null);
852 }
853
WriteInto(std::u16string* str, size_t length_with_null)854 char16_t* WriteInto(std::u16string* str, size_t length_with_null) {
855 return WriteIntoT(str, length_with_null);
856 }
857
858 #if defined(_MSC_VER) && !defined(__clang__)
859 // Work around VC++ code-gen bug. https://crbug.com/804884
860 #pragma optimize("", off)
861 #endif
862
863 // Generic version for all JoinString overloads. |list_type| must be a sequence
864 // (std::vector or std::initializer_list) of strings/string_views of any type.
865 template <typename char_type, typename list_type>
JoinStringT( const list_type& parts, std::basic_string_view<char_type> sep)866 static std::basic_string<char_type> JoinStringT(
867 const list_type& parts,
868 std::basic_string_view<char_type> sep) {
869 if (parts.size() == 0)
870 return std::basic_string<char_type>();
871
872 // Pre-allocate the eventual size of the string. Start with the size of all of
873 // the separators (note that this *assumes* parts.size() > 0).
874 size_t total_size = (parts.size() - 1) * sep.size();
875 for (const auto& part : parts)
876 total_size += part.size();
877 std::basic_string<char_type> result;
878 result.reserve(total_size);
879
880 auto iter = parts.begin();
881 DCHECK(iter != parts.end());
882 result.append(*iter);
883 ++iter;
884
885 for (; iter != parts.end(); ++iter) {
886 result.append(sep);
887 result.append(*iter);
888 }
889
890 // Sanity-check that we pre-allocated correctly.
891 DCHECK_EQ(total_size, result.size());
892
893 return result;
894 }
895
JoinString(const std::vector<std::string>& parts, std::string_view separator)896 std::string JoinString(const std::vector<std::string>& parts,
897 std::string_view separator) {
898 return JoinStringT(parts, separator);
899 }
900
JoinString(const std::vector<std::u16string>& parts, std::u16string_view separator)901 std::u16string JoinString(const std::vector<std::u16string>& parts,
902 std::u16string_view separator) {
903 return JoinStringT(parts, separator);
904 }
905
906 #if defined(_MSC_VER) && !defined(__clang__)
907 // Work around VC++ code-gen bug. https://crbug.com/804884
908 #pragma optimize("", on)
909 #endif
910
JoinString(const std::vector<std::string_view>& parts, std::string_view separator)911 std::string JoinString(const std::vector<std::string_view>& parts,
912 std::string_view separator) {
913 return JoinStringT(parts, separator);
914 }
915
JoinString(const std::vector<std::u16string_view>& parts, std::u16string_view separator)916 std::u16string JoinString(const std::vector<std::u16string_view>& parts,
917 std::u16string_view separator) {
918 return JoinStringT(parts, separator);
919 }
920
JoinString(std::initializer_list<std::string_view> parts, std::string_view separator)921 std::string JoinString(std::initializer_list<std::string_view> parts,
922 std::string_view separator) {
923 return JoinStringT(parts, separator);
924 }
925
JoinString(std::initializer_list<std::u16string_view> parts, std::u16string_view separator)926 std::u16string JoinString(std::initializer_list<std::u16string_view> parts,
927 std::u16string_view separator) {
928 return JoinStringT(parts, separator);
929 }
930
931 template <class FormatStringType, class OutStringType>
DoReplaceStringPlaceholders( const FormatStringType& format_string, const std::vector<OutStringType>& subst, std::vector<size_t>* offsets)932 OutStringType DoReplaceStringPlaceholders(
933 const FormatStringType& format_string,
934 const std::vector<OutStringType>& subst,
935 std::vector<size_t>* offsets) {
936 size_t substitutions = subst.size();
937 DCHECK_LT(substitutions, 10U);
938
939 size_t sub_length = 0;
940 for (const auto& cur : subst)
941 sub_length += cur.length();
942
943 OutStringType formatted;
944 formatted.reserve(format_string.length() + sub_length);
945
946 std::vector<ReplacementOffset> r_offsets;
947 for (auto i = format_string.begin(); i != format_string.end(); ++i) {
948 if ('$' == *i) {
949 if (i + 1 != format_string.end()) {
950 ++i;
951 if ('$' == *i) {
952 while (i != format_string.end() && '$' == *i) {
953 formatted.push_back('$');
954 ++i;
955 }
956 --i;
957 } else {
958 if (*i < '1' || *i > '9') {
959 DLOG(ERROR) << "Invalid placeholder: $" << std::to_string(*i);
960 continue;
961 }
962 uintptr_t index = *i - '1';
963 if (offsets) {
964 ReplacementOffset r_offset(index,
965 static_cast<int>(formatted.size()));
966 r_offsets.insert(
967 std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset,
968 &CompareParameter),
969 r_offset);
970 }
971 if (index < substitutions)
972 formatted.append(subst.at(index));
973 }
974 }
975 } else {
976 formatted.push_back(*i);
977 }
978 }
979 if (offsets) {
980 for (const auto& cur : r_offsets)
981 offsets->push_back(cur.offset);
982 }
983 return formatted;
984 }
985
ReplaceStringPlaceholders( const std::u16string& format_string, const std::vector<std::u16string>& subst, std::vector<size_t>* offsets)986 std::u16string ReplaceStringPlaceholders(
987 const std::u16string& format_string,
988 const std::vector<std::u16string>& subst,
989 std::vector<size_t>* offsets) {
990 return DoReplaceStringPlaceholders(format_string, subst, offsets);
991 }
992
ReplaceStringPlaceholders(std::string_view format_string, const std::vector<std::string>& subst, std::vector<size_t>* offsets)993 std::string ReplaceStringPlaceholders(std::string_view format_string,
994 const std::vector<std::string>& subst,
995 std::vector<size_t>* offsets) {
996 return DoReplaceStringPlaceholders(format_string, subst, offsets);
997 }
998
ReplaceStringPlaceholders(const std::u16string& format_string, const std::u16string& a, size_t* offset)999 std::u16string ReplaceStringPlaceholders(const std::u16string& format_string,
1000 const std::u16string& a,
1001 size_t* offset) {
1002 std::vector<size_t> offsets;
1003 std::vector<std::u16string> subst;
1004 subst.push_back(a);
1005 std::u16string result =
1006 ReplaceStringPlaceholders(format_string, subst, &offsets);
1007
1008 DCHECK_EQ(1U, offsets.size());
1009 if (offset)
1010 *offset = offsets[0];
1011 return result;
1012 }
1013
1014 } // namespace base
1015