1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/string_util.h"
6 
7 #include <ctype.h>
8 #include <errno.h>
9 #include <math.h>
10 #include <stdarg.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <time.h>
16 #include <wchar.h>
17 #include <wctype.h>
18 
19 #include <algorithm>
20 #include <iterator>
21 #include <limits>
22 #include <string>
23 #include <vector>
24 
25 #include "base/logging.h"
26 #include "base/strings/utf_string_conversion_utils.h"
27 #include "base/strings/utf_string_conversions.h"
28 #include "base/third_party/icu/icu_utf.h"
29 #include "util/build_config.h"
30 
31 namespace base {
32 
33 namespace {
34 
35 // Used by ReplaceStringPlaceholders to track the position in the string of
36 // replaced parameters.
37 struct ReplacementOffset {
ReplacementOffsetbase::__anon2855::ReplacementOffset38   ReplacementOffset(uintptr_t parameter, size_t offset)
39       : parameter(parameter), offset(offset) {}
40 
41   // Index of the parameter.
42   uintptr_t parameter;
43 
44   // Starting position in the string.
45   size_t offset;
46 };
47 
CompareParameter(const ReplacementOffset& elem1, const ReplacementOffset& elem2)48 static bool CompareParameter(const ReplacementOffset& elem1,
49                              const ReplacementOffset& elem2) {
50   return elem1.parameter < elem2.parameter;
51 }
52 
53 // Assuming that a pointer is the size of a "machine word", then
54 // uintptr_t is an integer type that is also a machine word.
55 typedef uintptr_t MachineWord;
56 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
57 
IsAlignedToMachineWord(const void* pointer)58 inline bool IsAlignedToMachineWord(const void* pointer) {
59   return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
60 }
61 
62 template <typename T>
AlignToMachineWord(T* pointer)63 inline T* AlignToMachineWord(T* pointer) {
64   return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
65                               ~kMachineWordAlignmentMask);
66 }
67 
68 template <size_t size, typename CharacterType>
69 struct NonASCIIMask;
70 template <>
71 struct NonASCIIMask<4, char16_t> {
valuebase::__anon2855::NonASCIIMask72   static inline uint32_t value() { return 0xFF80FF80U; }
73 };
74 template <>
75 struct NonASCIIMask<4, char> {
valuebase::__anon2855::NonASCIIMask76   static inline uint32_t value() { return 0x80808080U; }
77 };
78 template <>
79 struct NonASCIIMask<8, char16_t> {
valuebase::__anon2855::NonASCIIMask80   static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
81 };
82 template <>
83 struct NonASCIIMask<8, char> {
valuebase::__anon2855::NonASCIIMask84   static inline uint64_t value() { return 0x8080808080808080ULL; }
85 };
86 
87 }  // namespace
88 
89 namespace {
90 
91 template <typename StringType>
ToLowerASCIIImpl( std::basic_string_view<typename StringType::value_type> str)92 StringType ToLowerASCIIImpl(
93     std::basic_string_view<typename StringType::value_type> str) {
94   StringType ret;
95   ret.reserve(str.size());
96   for (size_t i = 0; i < str.size(); i++)
97     ret.push_back(ToLowerASCII(str[i]));
98   return ret;
99 }
100 
101 template <typename StringType>
ToUpperASCIIImpl( std::basic_string_view<typename StringType::value_type> str)102 StringType ToUpperASCIIImpl(
103     std::basic_string_view<typename StringType::value_type> str) {
104   StringType ret;
105   ret.reserve(str.size());
106   for (size_t i = 0; i < str.size(); i++)
107     ret.push_back(ToUpperASCII(str[i]));
108   return ret;
109 }
110 
111 }  // namespace
112 
ToLowerASCII(std::string_view str)113 std::string ToLowerASCII(std::string_view str) {
114   return ToLowerASCIIImpl<std::string>(str);
115 }
116 
ToLowerASCII(std::u16string_view str)117 std::u16string ToLowerASCII(std::u16string_view str) {
118   return ToLowerASCIIImpl<std::u16string>(str);
119 }
120 
ToUpperASCII(std::string_view str)121 std::string ToUpperASCII(std::string_view str) {
122   return ToUpperASCIIImpl<std::string>(str);
123 }
124 
ToUpperASCII(std::u16string_view str)125 std::u16string ToUpperASCII(std::u16string_view str) {
126   return ToUpperASCIIImpl<std::u16string>(str);
127 }
128 
starts_with(const std::string_view str1, const std::string_view str2)129 bool starts_with(const std::string_view str1, const std::string_view str2) {
130     if (str2.length() > str1.length()) {
131         return false;
132     }
133     return str1.compare(0, str2.length(), str2) == 0;
134 }
135 
ends_with(const std::string_view str1, const std::string_view str2)136 bool ends_with(const std::string_view str1, const std::string_view str2) {
137     if (str2.empty()) {
138         return true;
139     }
140     if (str1.length() < str2.length()) {
141         return false;
142     }
143     return str1.substr(str1.length() - str2.length()) == str2;
144 }
145 
146 template <class StringType>
CompareCaseInsensitiveASCIIT( std::basic_string_view<typename StringType::value_type> a, std::basic_string_view<typename StringType::value_type> b)147 int CompareCaseInsensitiveASCIIT(
148     std::basic_string_view<typename StringType::value_type> a,
149     std::basic_string_view<typename StringType::value_type> b) {
150   // Find the first characters that aren't equal and compare them.  If the end
151   // of one of the strings is found before a nonequal character, the lengths
152   // of the strings are compared.
153   size_t i = 0;
154   while (i < a.length() && i < b.length()) {
155     typename StringType::value_type lower_a = ToLowerASCII(a[i]);
156     typename StringType::value_type lower_b = ToLowerASCII(b[i]);
157     if (lower_a < lower_b)
158       return -1;
159     if (lower_a > lower_b)
160       return 1;
161     i++;
162   }
163 
164   // End of one string hit before finding a different character. Expect the
165   // common case to be "strings equal" at this point so check that first.
166   if (a.length() == b.length())
167     return 0;
168 
169   if (a.length() < b.length())
170     return -1;
171   return 1;
172 }
173 
CompareCaseInsensitiveASCII(std::string_view a, std::string_view b)174 int CompareCaseInsensitiveASCII(std::string_view a, std::string_view b) {
175   return CompareCaseInsensitiveASCIIT<std::string>(a, b);
176 }
177 
CompareCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b)178 int CompareCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
179   return CompareCaseInsensitiveASCIIT<std::u16string>(a, b);
180 }
181 
EqualsCaseInsensitiveASCII(std::string_view a, std::string_view b)182 bool EqualsCaseInsensitiveASCII(std::string_view a, std::string_view b) {
183   if (a.length() != b.length())
184     return false;
185   return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0;
186 }
187 
EqualsCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b)188 bool EqualsCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
189   if (a.length() != b.length())
190     return false;
191   return CompareCaseInsensitiveASCIIT<std::u16string>(a, b) == 0;
192 }
193 
194 template <class StringType>
195 bool ReplaceCharsT(
196     const StringType& input,
197     std::basic_string_view<typename StringType::value_type> find_any_of_these,
198     std::basic_string_view<typename StringType::value_type> replace_with,
199     StringType* output);
200 
ReplaceChars(const std::u16string& input, std::u16string_view replace_chars, const std::u16string& replace_with, std::u16string* output)201 bool ReplaceChars(const std::u16string& input,
202                   std::u16string_view replace_chars,
203                   const std::u16string& replace_with,
204                   std::u16string* output) {
205   return ReplaceCharsT(input, replace_chars, std::u16string_view(replace_with),
206                        output);
207 }
208 
ReplaceChars(const std::string& input, std::string_view replace_chars, const std::string& replace_with, std::string* output)209 bool ReplaceChars(const std::string& input,
210                   std::string_view replace_chars,
211                   const std::string& replace_with,
212                   std::string* output) {
213   return ReplaceCharsT(input, replace_chars, std::string_view(replace_with),
214                        output);
215 }
216 
RemoveChars(const std::u16string& input, std::u16string_view remove_chars, std::u16string* output)217 bool RemoveChars(const std::u16string& input,
218                  std::u16string_view remove_chars,
219                  std::u16string* output) {
220   return ReplaceCharsT(input, remove_chars, std::u16string_view(), output);
221 }
222 
RemoveChars(const std::string& input, std::string_view remove_chars, std::string* output)223 bool RemoveChars(const std::string& input,
224                  std::string_view remove_chars,
225                  std::string* output) {
226   return ReplaceCharsT(input, remove_chars, std::string_view(), output);
227 }
228 
229 template <typename Str>
TrimStringT( const Str& input, std::basic_string_view<typename Str::value_type> trim_chars, TrimPositions positions, Str* output)230 TrimPositions TrimStringT(
231     const Str& input,
232     std::basic_string_view<typename Str::value_type> trim_chars,
233     TrimPositions positions,
234     Str* output) {
235   // Find the edges of leading/trailing whitespace as desired. Need to use
236   // a std::string_view version of input to be able to call find* on it with the
237   // std::string_view version of trim_chars (normally the trim_chars will be a
238   // constant so avoid making a copy).
239   std::basic_string_view<typename Str::value_type> input_piece(input);
240   const size_t last_char = input.length() - 1;
241   const size_t first_good_char = (positions & TRIM_LEADING)
242                                      ? input_piece.find_first_not_of(trim_chars)
243                                      : 0;
244   const size_t last_good_char = (positions & TRIM_TRAILING)
245                                     ? input_piece.find_last_not_of(trim_chars)
246                                     : last_char;
247 
248   // When the string was all trimmed, report that we stripped off characters
249   // from whichever position the caller was interested in. For empty input, we
250   // stripped no characters, but we still need to clear |output|.
251   if (input.empty() || (first_good_char == Str::npos) ||
252       (last_good_char == Str::npos)) {
253     bool input_was_empty = input.empty();  // in case output == &input
254     output->clear();
255     return input_was_empty ? TRIM_NONE : positions;
256   }
257 
258   // Trim.
259   *output = input.substr(first_good_char, last_good_char - first_good_char + 1);
260 
261   // Return where we trimmed from.
262   return static_cast<TrimPositions>(
263       ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
264       ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
265 }
266 
TrimString(const std::u16string& input, std::u16string_view trim_chars, std::u16string* output)267 bool TrimString(const std::u16string& input,
268                 std::u16string_view trim_chars,
269                 std::u16string* output) {
270   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
271 }
272 
TrimString(const std::string& input, std::string_view trim_chars, std::string* output)273 bool TrimString(const std::string& input,
274                 std::string_view trim_chars,
275                 std::string* output) {
276   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
277 }
278 
279 template <typename char_type>
TrimStringPieceT( std::basic_string_view<char_type> input, std::basic_string_view<char_type> trim_chars, TrimPositions positions)280 std::basic_string_view<char_type> TrimStringPieceT(
281     std::basic_string_view<char_type> input,
282     std::basic_string_view<char_type> trim_chars,
283     TrimPositions positions) {
284   size_t begin =
285       (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0;
286   if (begin == std::basic_string_view<char_type>::npos)
287     return std::basic_string_view<char_type>();  // All trimmed.
288 
289   size_t end = (positions & TRIM_TRAILING)
290                    ? input.find_last_not_of(trim_chars) + 1
291                    : input.size();
292   return input.substr(begin, end - begin);
293 }
294 
TrimString(std::u16string_view input, std::u16string_view trim_chars, TrimPositions positions)295 std::u16string_view TrimString(std::u16string_view input,
296                                std::u16string_view trim_chars,
297                                TrimPositions positions) {
298   return TrimStringPieceT(input, trim_chars, positions);
299 }
300 
TrimString(std::string_view input, std::string_view trim_chars, TrimPositions positions)301 std::string_view TrimString(std::string_view input,
302                             std::string_view trim_chars,
303                             TrimPositions positions) {
304   return TrimStringPieceT(input, trim_chars, positions);
305 }
306 
TruncateUTF8ToByteSize(const std::string& input, const size_t byte_size, std::string* output)307 void TruncateUTF8ToByteSize(const std::string& input,
308                             const size_t byte_size,
309                             std::string* output) {
310   DCHECK(output);
311   if (byte_size > input.length()) {
312     *output = input;
313     return;
314   }
315   DCHECK_LE(byte_size,
316             static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
317   // Note: This cast is necessary because CBU8_NEXT uses int32_ts.
318   int32_t truncation_length = static_cast<int32_t>(byte_size);
319   int32_t char_index = truncation_length - 1;
320   const char* data = input.data();
321 
322   // Using CBU8, we will move backwards from the truncation point
323   // to the beginning of the string looking for a valid UTF8
324   // character.  Once a full UTF8 character is found, we will
325   // truncate the string to the end of that character.
326   while (char_index >= 0) {
327     int32_t prev = char_index;
328     base_icu::UChar32 code_point = 0;
329     CBU8_NEXT(data, char_index, truncation_length, code_point);
330     if (!IsValidCharacter(code_point) || !IsValidCodepoint(code_point)) {
331       char_index = prev - 1;
332     } else {
333       break;
334     }
335   }
336 
337   if (char_index >= 0)
338     *output = input.substr(0, char_index);
339   else
340     output->clear();
341 }
342 
TrimWhitespace(const std::u16string& input, TrimPositions positions, std::u16string* output)343 TrimPositions TrimWhitespace(const std::u16string& input,
344                              TrimPositions positions,
345                              std::u16string* output) {
346   return TrimStringT(input, std::u16string_view(kWhitespaceUTF16), positions,
347                      output);
348 }
349 
TrimWhitespace(std::u16string_view input, TrimPositions positions)350 std::u16string_view TrimWhitespace(std::u16string_view input,
351                                    TrimPositions positions) {
352   return TrimStringPieceT(input, std::u16string_view(kWhitespaceUTF16),
353                           positions);
354 }
355 
TrimWhitespaceASCII(const std::string& input, TrimPositions positions, std::string* output)356 TrimPositions TrimWhitespaceASCII(const std::string& input,
357                                   TrimPositions positions,
358                                   std::string* output) {
359   return TrimStringT(input, std::string_view(kWhitespaceASCII), positions,
360                      output);
361 }
362 
TrimWhitespaceASCII(std::string_view input, TrimPositions positions)363 std::string_view TrimWhitespaceASCII(std::string_view input,
364                                      TrimPositions positions) {
365   return TrimStringPieceT(input, std::string_view(kWhitespaceASCII), positions);
366 }
367 
368 template <typename STR>
CollapseWhitespaceT(const STR& text, bool trim_sequences_with_line_breaks)369 STR CollapseWhitespaceT(const STR& text, bool trim_sequences_with_line_breaks) {
370   STR result;
371   result.resize(text.size());
372 
373   // Set flags to pretend we're already in a trimmed whitespace sequence, so we
374   // will trim any leading whitespace.
375   bool in_whitespace = true;
376   bool already_trimmed = true;
377 
378   int chars_written = 0;
379   for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
380     if (IsUnicodeWhitespace(*i)) {
381       if (!in_whitespace) {
382         // Reduce all whitespace sequences to a single space.
383         in_whitespace = true;
384         result[chars_written++] = L' ';
385       }
386       if (trim_sequences_with_line_breaks && !already_trimmed &&
387           ((*i == '\n') || (*i == '\r'))) {
388         // Whitespace sequences containing CR or LF are eliminated entirely.
389         already_trimmed = true;
390         --chars_written;
391       }
392     } else {
393       // Non-whitespace characters are copied straight across.
394       in_whitespace = false;
395       already_trimmed = false;
396       result[chars_written++] = *i;
397     }
398   }
399 
400   if (in_whitespace && !already_trimmed) {
401     // Any trailing whitespace is eliminated.
402     --chars_written;
403   }
404 
405   result.resize(chars_written);
406   return result;
407 }
408 
CollapseWhitespace(const std::u16string& text, bool trim_sequences_with_line_breaks)409 std::u16string CollapseWhitespace(const std::u16string& text,
410                                   bool trim_sequences_with_line_breaks) {
411   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
412 }
413 
CollapseWhitespaceASCII(const std::string& text, bool trim_sequences_with_line_breaks)414 std::string CollapseWhitespaceASCII(const std::string& text,
415                                     bool trim_sequences_with_line_breaks) {
416   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
417 }
418 
ContainsOnlyChars(std::string_view input, std::string_view characters)419 bool ContainsOnlyChars(std::string_view input, std::string_view characters) {
420   return input.find_first_not_of(characters) == std::string_view::npos;
421 }
422 
ContainsOnlyChars(std::u16string_view input, std::u16string_view characters)423 bool ContainsOnlyChars(std::u16string_view input,
424                        std::u16string_view characters) {
425   return input.find_first_not_of(characters) == std::u16string_view::npos;
426 }
427 
428 template <class Char>
DoIsStringASCII(const Char* characters, size_t length)429 inline bool DoIsStringASCII(const Char* characters, size_t length) {
430   MachineWord all_char_bits = 0;
431   const Char* end = characters + length;
432 
433   // Prologue: align the input.
434   while (!IsAlignedToMachineWord(characters) && characters != end) {
435     all_char_bits |= *characters;
436     ++characters;
437   }
438 
439   // Compare the values of CPU word size.
440   const Char* word_end = AlignToMachineWord(end);
441   const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
442   while (characters < word_end) {
443     all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
444     characters += loop_increment;
445   }
446 
447   // Process the remaining bytes.
448   while (characters != end) {
449     all_char_bits |= *characters;
450     ++characters;
451   }
452 
453   MachineWord non_ascii_bit_mask =
454       NonASCIIMask<sizeof(MachineWord), Char>::value();
455   return !(all_char_bits & non_ascii_bit_mask);
456 }
457 
IsStringASCII(std::string_view str)458 bool IsStringASCII(std::string_view str) {
459   return DoIsStringASCII(str.data(), str.length());
460 }
461 
IsStringASCII(std::u16string_view str)462 bool IsStringASCII(std::u16string_view str) {
463   return DoIsStringASCII(str.data(), str.length());
464 }
465 
IsStringUTF8(std::string_view str)466 bool IsStringUTF8(std::string_view str) {
467   const char* src = str.data();
468   int32_t src_len = static_cast<int32_t>(str.length());
469   int32_t char_index = 0;
470 
471   while (char_index < src_len) {
472     int32_t code_point;
473     CBU8_NEXT(src, char_index, src_len, code_point);
474     if (!IsValidCharacter(code_point))
475       return false;
476   }
477   return true;
478 }
479 
480 // Implementation note: Normally this function will be called with a hardcoded
481 // constant for the lowercase_ascii parameter. Constructing a std::string_view
482 // from a C constant requires running strlen, so the result will be two passes
483 // through the buffers, one to file the length of lowercase_ascii, and one to
484 // compare each letter.
485 //
486 // This function could have taken a const char* to avoid this and only do one
487 // pass through the string. But the strlen is faster than the case-insensitive
488 // compares and lets us early-exit in the case that the strings are different
489 // lengths (will often be the case for non-matches). So whether one approach or
490 // the other will be faster depends on the case.
491 //
492 // The hardcoded strings are typically very short so it doesn't matter, and the
493 // string piece gives additional flexibility for the caller (doesn't have to be
494 // null terminated) so we choose the std::string_view route.
495 template <typename Str>
DoLowerCaseEqualsASCII( std::basic_string_view<typename Str::value_type> str, std::string_view lowercase_ascii)496 static inline bool DoLowerCaseEqualsASCII(
497     std::basic_string_view<typename Str::value_type> str,
498     std::string_view lowercase_ascii) {
499   if (str.size() != lowercase_ascii.size())
500     return false;
501   for (size_t i = 0; i < str.size(); i++) {
502     if (ToLowerASCII(str[i]) != lowercase_ascii[i])
503       return false;
504   }
505   return true;
506 }
507 
LowerCaseEqualsASCII(std::string_view str, std::string_view lowercase_ascii)508 bool LowerCaseEqualsASCII(std::string_view str,
509                           std::string_view lowercase_ascii) {
510   return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
511 }
512 
LowerCaseEqualsASCII(std::u16string_view str, std::string_view lowercase_ascii)513 bool LowerCaseEqualsASCII(std::u16string_view str,
514                           std::string_view lowercase_ascii) {
515   return DoLowerCaseEqualsASCII<std::u16string>(str, lowercase_ascii);
516 }
517 
EqualsASCII(std::u16string_view str, std::string_view ascii)518 bool EqualsASCII(std::u16string_view str, std::string_view ascii) {
519   if (str.length() != ascii.length())
520     return false;
521   return std::equal(ascii.begin(), ascii.end(), str.begin());
522 }
523 
524 template <typename char_type>
StartsWithCaseInsensitiveASCIIT( std::basic_string_view<char_type> str, std::basic_string_view<char_type> search_for)525 bool StartsWithCaseInsensitiveASCIIT(
526     std::basic_string_view<char_type> str,
527     std::basic_string_view<char_type> search_for) {
528   if (search_for.size() > str.size())
529     return false;
530 
531   std::basic_string_view<char_type> source = str.substr(0, search_for.size());
532 
533   return std::equal(search_for.begin(), search_for.end(), source.begin(),
534                     CaseInsensitiveCompareASCII<char_type>());
535 }
536 
StartsWithCaseInsensitiveASCII(std::string_view str, std::string_view search_for)537 bool StartsWithCaseInsensitiveASCII(std::string_view str,
538                                     std::string_view search_for) {
539   return StartsWithCaseInsensitiveASCIIT(str, search_for);
540 }
541 
StartsWithCaseInsensitiveASCII(std::u16string_view str, std::u16string_view search_for)542 bool StartsWithCaseInsensitiveASCII(std::u16string_view str,
543                                     std::u16string_view search_for) {
544   return StartsWithCaseInsensitiveASCIIT(str, search_for);
545 }
546 
547 template <typename char_type>
EndsWithCaseInsensitiveASCIIT( std::basic_string_view<char_type> str, std::basic_string_view<char_type> search_for)548 bool EndsWithCaseInsensitiveASCIIT(
549     std::basic_string_view<char_type> str,
550     std::basic_string_view<char_type> search_for) {
551   if (search_for.size() > str.size())
552     return false;
553 
554   std::basic_string_view<char_type> source =
555       str.substr(str.size() - search_for.size(), search_for.size());
556 
557   return std::equal(source.begin(), source.end(), search_for.begin(),
558                     CaseInsensitiveCompareASCII<char_type>());
559 }
560 
EndsWithCaseInsensitiveASCII(std::string_view str, std::string_view search_for)561 bool EndsWithCaseInsensitiveASCII(std::string_view str,
562                                   std::string_view search_for) {
563   return EndsWithCaseInsensitiveASCIIT(str, search_for);
564 }
565 
EndsWithCaseInsensitiveASCII(std::u16string_view str, std::u16string_view search_for)566 bool EndsWithCaseInsensitiveASCII(std::u16string_view str,
567                                   std::u16string_view search_for) {
568   return EndsWithCaseInsensitiveASCIIT(str, search_for);
569 }
570 
HexDigitToInt(char16_t c)571 char HexDigitToInt(char16_t c) {
572   DCHECK(IsHexDigit(c));
573   if (c >= '0' && c <= '9')
574     return static_cast<char>(c - '0');
575   if (c >= 'A' && c <= 'F')
576     return static_cast<char>(c - 'A' + 10);
577   if (c >= 'a' && c <= 'f')
578     return static_cast<char>(c - 'a' + 10);
579   return 0;
580 }
581 
IsUnicodeWhitespace(char16_t c)582 bool IsUnicodeWhitespace(char16_t c) {
583   // kWhitespaceWide is a NULL-terminated string
584   for (const char16_t* cur = kWhitespaceUTF16; *cur; ++cur) {
585     if (*cur == c)
586       return true;
587   }
588   return false;
589 }
590 
591 static const char* const kByteStringsUnlocalized[] = {" B",  " kB", " MB",
592                                                       " GB", " TB", " PB"};
593 
FormatBytesUnlocalized(int64_t bytes)594 std::u16string FormatBytesUnlocalized(int64_t bytes) {
595   double unit_amount = static_cast<double>(bytes);
596   size_t dimension = 0;
597   const int kKilo = 1024;
598   while (unit_amount >= kKilo &&
599          dimension < std::size(kByteStringsUnlocalized) - 1) {
600     unit_amount /= kKilo;
601     dimension++;
602   }
603 
604   char buf[64];
605   if (bytes != 0 && dimension > 0 && unit_amount < 100) {
606     base::snprintf(buf, std::size(buf), "%.1lf%s", unit_amount,
607                    kByteStringsUnlocalized[dimension]);
608   } else {
609     base::snprintf(buf, std::size(buf), "%.0lf%s", unit_amount,
610                    kByteStringsUnlocalized[dimension]);
611   }
612 
613   return ASCIIToUTF16(buf);
614 }
615 
616 // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings.
617 template <class StringType>
618 struct SubstringMatcher {
619   std::basic_string_view<typename StringType::value_type> find_this;
620 
Findbase::SubstringMatcher621   size_t Find(const StringType& input, size_t pos) {
622     return input.find(find_this.data(), pos, find_this.length());
623   }
MatchSizebase::SubstringMatcher624   size_t MatchSize() { return find_this.length(); }
625 };
626 
627 // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters.
628 template <class StringType>
629 struct CharacterMatcher {
630   std::basic_string_view<typename StringType::value_type> find_any_of_these;
631 
Findbase::CharacterMatcher632   size_t Find(const StringType& input, size_t pos) {
633     return input.find_first_of(find_any_of_these.data(), pos,
634                                find_any_of_these.length());
635   }
MatchSizebase::CharacterMatcher636   constexpr size_t MatchSize() { return 1; }
637 };
638 
639 enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST };
640 
641 // Runs in O(n) time in the length of |str|, and transforms the string without
642 // reallocating when possible. Returns |true| if any matches were found.
643 //
644 // This is parameterized on a |Matcher| traits type, so that it can be the
645 // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset().
646 template <class StringType, class Matcher>
DoReplaceMatchesAfterOffset( StringType* str, size_t initial_offset, Matcher matcher, std::basic_string_view<typename StringType::value_type> replace_with, ReplaceType replace_type)647 bool DoReplaceMatchesAfterOffset(
648     StringType* str,
649     size_t initial_offset,
650     Matcher matcher,
651     std::basic_string_view<typename StringType::value_type> replace_with,
652     ReplaceType replace_type) {
653   using CharTraits = typename StringType::traits_type;
654 
655   const size_t find_length = matcher.MatchSize();
656   if (!find_length)
657     return false;
658 
659   // If the find string doesn't appear, there's nothing to do.
660   size_t first_match = matcher.Find(*str, initial_offset);
661   if (first_match == StringType::npos)
662     return false;
663 
664   // If we're only replacing one instance, there's no need to do anything
665   // complicated.
666   const size_t replace_length = replace_with.length();
667   if (replace_type == ReplaceType::REPLACE_FIRST) {
668     str->replace(first_match, find_length, replace_with.data(), replace_length);
669     return true;
670   }
671 
672   // If the find and replace strings are the same length, we can simply use
673   // replace() on each instance, and finish the entire operation in O(n) time.
674   if (find_length == replace_length) {
675     auto* buffer = &((*str)[0]);
676     for (size_t offset = first_match; offset != StringType::npos;
677          offset = matcher.Find(*str, offset + replace_length)) {
678       CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
679     }
680     return true;
681   }
682 
683   // Since the find and replace strings aren't the same length, a loop like the
684   // one above would be O(n^2) in the worst case, as replace() will shift the
685   // entire remaining string each time. We need to be more clever to keep things
686   // O(n).
687   //
688   // When the string is being shortened, it's possible to just shift the matches
689   // down in one pass while finding, and truncate the length at the end of the
690   // search.
691   //
692   // If the string is being lengthened, more work is required. The strategy used
693   // here is to make two find() passes through the string. The first pass counts
694   // the number of matches to determine the new size. The second pass will
695   // either construct the new string into a new buffer (if the existing buffer
696   // lacked capacity), or else -- if there is room -- create a region of scratch
697   // space after |first_match| by shifting the tail of the string to a higher
698   // index, and doing in-place moves from the tail to lower indices thereafter.
699   size_t str_length = str->length();
700   size_t expansion = 0;
701   if (replace_length > find_length) {
702     // This operation lengthens the string; determine the new length by counting
703     // matches.
704     const size_t expansion_per_match = (replace_length - find_length);
705     size_t num_matches = 0;
706     for (size_t match = first_match; match != StringType::npos;
707          match = matcher.Find(*str, match + find_length)) {
708       expansion += expansion_per_match;
709       ++num_matches;
710     }
711     const size_t final_length = str_length + expansion;
712 
713     if (str->capacity() < final_length) {
714       // If we'd have to allocate a new buffer to grow the string, build the
715       // result directly into the new allocation via append().
716       StringType src(str->get_allocator());
717       str->swap(src);
718       str->reserve(final_length);
719 
720       size_t pos = 0;
721       for (size_t match = first_match;; match = matcher.Find(src, pos)) {
722         str->append(src, pos, match - pos);
723         str->append(replace_with.data(), replace_length);
724         pos = match + find_length;
725 
726         // A mid-loop test/break enables skipping the final Find() call; the
727         // number of matches is known, so don't search past the last one.
728         if (!--num_matches)
729           break;
730       }
731 
732       // Handle substring after the final match.
733       str->append(src, pos, str_length - pos);
734       return true;
735     }
736 
737     // Prepare for the copy/move loop below -- expand the string to its final
738     // size by shifting the data after the first match to the end of the resized
739     // string.
740     size_t shift_src = first_match + find_length;
741     size_t shift_dst = shift_src + expansion;
742 
743     // Big |expansion| factors (relative to |str_length|) require padding up to
744     // |shift_dst|.
745     if (shift_dst > str_length)
746       str->resize(shift_dst);
747 
748     str->replace(shift_dst, str_length - shift_src, *str, shift_src,
749                  str_length - shift_src);
750     str_length = final_length;
751   }
752 
753   // We can alternate replacement and move operations. This won't overwrite the
754   // unsearched region of the string so long as |write_offset| <= |read_offset|;
755   // that condition is always satisfied because:
756   //
757   //   (a) If the string is being shortened, |expansion| is zero and
758   //       |write_offset| grows slower than |read_offset|.
759   //
760   //   (b) If the string is being lengthened, |write_offset| grows faster than
761   //       |read_offset|, but |expansion| is big enough so that |write_offset|
762   //       will only catch up to |read_offset| at the point of the last match.
763   auto* buffer = &((*str)[0]);
764   size_t write_offset = first_match;
765   size_t read_offset = first_match + expansion;
766   do {
767     if (replace_length) {
768       CharTraits::copy(buffer + write_offset, replace_with.data(),
769                        replace_length);
770       write_offset += replace_length;
771     }
772     read_offset += find_length;
773 
774     // min() clamps StringType::npos (the largest unsigned value) to str_length.
775     size_t match = std::min(matcher.Find(*str, read_offset), str_length);
776 
777     size_t length = match - read_offset;
778     if (length) {
779       CharTraits::move(buffer + write_offset, buffer + read_offset, length);
780       write_offset += length;
781       read_offset += length;
782     }
783   } while (read_offset < str_length);
784 
785   // If we're shortening the string, truncate it now.
786   str->resize(write_offset);
787   return true;
788 }
789 
790 template <class StringType>
ReplaceCharsT( const StringType& input, std::basic_string_view<typename StringType::value_type> find_any_of_these, std::basic_string_view<typename StringType::value_type> replace_with, StringType* output)791 bool ReplaceCharsT(
792     const StringType& input,
793     std::basic_string_view<typename StringType::value_type> find_any_of_these,
794     std::basic_string_view<typename StringType::value_type> replace_with,
795     StringType* output) {
796   // Commonly, this is called with output and input being the same string; in
797   // that case, this assignment is inexpensive.
798   *output = input;
799 
800   return DoReplaceMatchesAfterOffset(
801       output, 0, CharacterMatcher<StringType>{find_any_of_these}, replace_with,
802       ReplaceType::REPLACE_ALL);
803 }
804 
ReplaceFirstSubstringAfterOffset(std::u16string* str, size_t start_offset, std::u16string_view find_this, std::u16string_view replace_with)805 void ReplaceFirstSubstringAfterOffset(std::u16string* str,
806                                       size_t start_offset,
807                                       std::u16string_view find_this,
808                                       std::u16string_view replace_with) {
809   DoReplaceMatchesAfterOffset(str, start_offset,
810                               SubstringMatcher<std::u16string>{find_this},
811                               replace_with, ReplaceType::REPLACE_FIRST);
812 }
813 
ReplaceFirstSubstringAfterOffset(std::string* str, size_t start_offset, std::string_view find_this, std::string_view replace_with)814 void ReplaceFirstSubstringAfterOffset(std::string* str,
815                                       size_t start_offset,
816                                       std::string_view find_this,
817                                       std::string_view replace_with) {
818   DoReplaceMatchesAfterOffset(str, start_offset,
819                               SubstringMatcher<std::string>{find_this},
820                               replace_with, ReplaceType::REPLACE_FIRST);
821 }
822 
ReplaceSubstringsAfterOffset(std::u16string* str, size_t start_offset, std::u16string_view find_this, std::u16string_view replace_with)823 void ReplaceSubstringsAfterOffset(std::u16string* str,
824                                   size_t start_offset,
825                                   std::u16string_view find_this,
826                                   std::u16string_view replace_with) {
827   DoReplaceMatchesAfterOffset(str, start_offset,
828                               SubstringMatcher<std::u16string>{find_this},
829                               replace_with, ReplaceType::REPLACE_ALL);
830 }
831 
ReplaceSubstringsAfterOffset(std::string* str, size_t start_offset, std::string_view find_this, std::string_view replace_with)832 void ReplaceSubstringsAfterOffset(std::string* str,
833                                   size_t start_offset,
834                                   std::string_view find_this,
835                                   std::string_view replace_with) {
836   DoReplaceMatchesAfterOffset(str, start_offset,
837                               SubstringMatcher<std::string>{find_this},
838                               replace_with, ReplaceType::REPLACE_ALL);
839 }
840 
841 template <class string_type>
WriteIntoT(string_type* str, size_t length_with_null)842 inline typename string_type::value_type* WriteIntoT(string_type* str,
843                                                     size_t length_with_null) {
844   DCHECK_GT(length_with_null, 1u);
845   str->reserve(length_with_null);
846   str->resize(length_with_null - 1);
847   return &((*str)[0]);
848 }
849 
WriteInto(std::string* str, size_t length_with_null)850 char* WriteInto(std::string* str, size_t length_with_null) {
851   return WriteIntoT(str, length_with_null);
852 }
853 
WriteInto(std::u16string* str, size_t length_with_null)854 char16_t* WriteInto(std::u16string* str, size_t length_with_null) {
855   return WriteIntoT(str, length_with_null);
856 }
857 
858 #if defined(_MSC_VER) && !defined(__clang__)
859 // Work around VC++ code-gen bug. https://crbug.com/804884
860 #pragma optimize("", off)
861 #endif
862 
863 // Generic version for all JoinString overloads. |list_type| must be a sequence
864 // (std::vector or std::initializer_list) of strings/string_views of any type.
865 template <typename char_type, typename list_type>
JoinStringT( const list_type& parts, std::basic_string_view<char_type> sep)866 static std::basic_string<char_type> JoinStringT(
867     const list_type& parts,
868     std::basic_string_view<char_type> sep) {
869   if (parts.size() == 0)
870     return std::basic_string<char_type>();
871 
872   // Pre-allocate the eventual size of the string. Start with the size of all of
873   // the separators (note that this *assumes* parts.size() > 0).
874   size_t total_size = (parts.size() - 1) * sep.size();
875   for (const auto& part : parts)
876     total_size += part.size();
877   std::basic_string<char_type> result;
878   result.reserve(total_size);
879 
880   auto iter = parts.begin();
881   DCHECK(iter != parts.end());
882   result.append(*iter);
883   ++iter;
884 
885   for (; iter != parts.end(); ++iter) {
886     result.append(sep);
887     result.append(*iter);
888   }
889 
890   // Sanity-check that we pre-allocated correctly.
891   DCHECK_EQ(total_size, result.size());
892 
893   return result;
894 }
895 
JoinString(const std::vector<std::string>& parts, std::string_view separator)896 std::string JoinString(const std::vector<std::string>& parts,
897                        std::string_view separator) {
898   return JoinStringT(parts, separator);
899 }
900 
JoinString(const std::vector<std::u16string>& parts, std::u16string_view separator)901 std::u16string JoinString(const std::vector<std::u16string>& parts,
902                           std::u16string_view separator) {
903   return JoinStringT(parts, separator);
904 }
905 
906 #if defined(_MSC_VER) && !defined(__clang__)
907 // Work around VC++ code-gen bug. https://crbug.com/804884
908 #pragma optimize("", on)
909 #endif
910 
JoinString(const std::vector<std::string_view>& parts, std::string_view separator)911 std::string JoinString(const std::vector<std::string_view>& parts,
912                        std::string_view separator) {
913   return JoinStringT(parts, separator);
914 }
915 
JoinString(const std::vector<std::u16string_view>& parts, std::u16string_view separator)916 std::u16string JoinString(const std::vector<std::u16string_view>& parts,
917                           std::u16string_view separator) {
918   return JoinStringT(parts, separator);
919 }
920 
JoinString(std::initializer_list<std::string_view> parts, std::string_view separator)921 std::string JoinString(std::initializer_list<std::string_view> parts,
922                        std::string_view separator) {
923   return JoinStringT(parts, separator);
924 }
925 
JoinString(std::initializer_list<std::u16string_view> parts, std::u16string_view separator)926 std::u16string JoinString(std::initializer_list<std::u16string_view> parts,
927                           std::u16string_view separator) {
928   return JoinStringT(parts, separator);
929 }
930 
931 template <class FormatStringType, class OutStringType>
DoReplaceStringPlaceholders( const FormatStringType& format_string, const std::vector<OutStringType>& subst, std::vector<size_t>* offsets)932 OutStringType DoReplaceStringPlaceholders(
933     const FormatStringType& format_string,
934     const std::vector<OutStringType>& subst,
935     std::vector<size_t>* offsets) {
936   size_t substitutions = subst.size();
937   DCHECK_LT(substitutions, 10U);
938 
939   size_t sub_length = 0;
940   for (const auto& cur : subst)
941     sub_length += cur.length();
942 
943   OutStringType formatted;
944   formatted.reserve(format_string.length() + sub_length);
945 
946   std::vector<ReplacementOffset> r_offsets;
947   for (auto i = format_string.begin(); i != format_string.end(); ++i) {
948     if ('$' == *i) {
949       if (i + 1 != format_string.end()) {
950         ++i;
951         if ('$' == *i) {
952           while (i != format_string.end() && '$' == *i) {
953             formatted.push_back('$');
954             ++i;
955           }
956           --i;
957         } else {
958           if (*i < '1' || *i > '9') {
959             DLOG(ERROR) << "Invalid placeholder: $" << std::to_string(*i);
960             continue;
961           }
962           uintptr_t index = *i - '1';
963           if (offsets) {
964             ReplacementOffset r_offset(index,
965                                        static_cast<int>(formatted.size()));
966             r_offsets.insert(
967                 std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset,
968                                  &CompareParameter),
969                 r_offset);
970           }
971           if (index < substitutions)
972             formatted.append(subst.at(index));
973         }
974       }
975     } else {
976       formatted.push_back(*i);
977     }
978   }
979   if (offsets) {
980     for (const auto& cur : r_offsets)
981       offsets->push_back(cur.offset);
982   }
983   return formatted;
984 }
985 
ReplaceStringPlaceholders( const std::u16string& format_string, const std::vector<std::u16string>& subst, std::vector<size_t>* offsets)986 std::u16string ReplaceStringPlaceholders(
987     const std::u16string& format_string,
988     const std::vector<std::u16string>& subst,
989     std::vector<size_t>* offsets) {
990   return DoReplaceStringPlaceholders(format_string, subst, offsets);
991 }
992 
ReplaceStringPlaceholders(std::string_view format_string, const std::vector<std::string>& subst, std::vector<size_t>* offsets)993 std::string ReplaceStringPlaceholders(std::string_view format_string,
994                                       const std::vector<std::string>& subst,
995                                       std::vector<size_t>* offsets) {
996   return DoReplaceStringPlaceholders(format_string, subst, offsets);
997 }
998 
ReplaceStringPlaceholders(const std::u16string& format_string, const std::u16string& a, size_t* offset)999 std::u16string ReplaceStringPlaceholders(const std::u16string& format_string,
1000                                          const std::u16string& a,
1001                                          size_t* offset) {
1002   std::vector<size_t> offsets;
1003   std::vector<std::u16string> subst;
1004   subst.push_back(a);
1005   std::u16string result =
1006       ReplaceStringPlaceholders(format_string, subst, &offsets);
1007 
1008   DCHECK_EQ(1U, offsets.size());
1009   if (offset)
1010     *offset = offsets[0];
1011   return result;
1012 }
1013 
1014 }  // namespace base
1015