16d528ed9Sopenharmony_ci// Copyright (c) 2011 The Chromium Authors. All rights reserved.
26d528ed9Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
36d528ed9Sopenharmony_ci// found in the LICENSE file.
46d528ed9Sopenharmony_ci
56d528ed9Sopenharmony_ci#ifndef BASE_STRINGS_STRING_TOKENIZER_H_
66d528ed9Sopenharmony_ci#define BASE_STRINGS_STRING_TOKENIZER_H_
76d528ed9Sopenharmony_ci
86d528ed9Sopenharmony_ci#include <algorithm>
96d528ed9Sopenharmony_ci#include <string>
106d528ed9Sopenharmony_ci#include <string_view>
116d528ed9Sopenharmony_ci
126d528ed9Sopenharmony_cinamespace base {
136d528ed9Sopenharmony_ci
// StringTokenizerT is a simple string tokenizer class.  It works like an
// iterator that with each step (see the GetNext method) updates members that
// refer to the next token in the input string.  The user may optionally
// configure the tokenizer to return delimiters.
186d528ed9Sopenharmony_ci//
196d528ed9Sopenharmony_ci// EXAMPLE 1:
206d528ed9Sopenharmony_ci//
216d528ed9Sopenharmony_ci//   char input[] = "this is a test";
226d528ed9Sopenharmony_ci//   CStringTokenizer t(input, input + strlen(input), " ");
236d528ed9Sopenharmony_ci//   while (t.GetNext()) {
246d528ed9Sopenharmony_ci//     printf("%s\n", t.token().c_str());
256d528ed9Sopenharmony_ci//   }
266d528ed9Sopenharmony_ci//
276d528ed9Sopenharmony_ci// Output:
286d528ed9Sopenharmony_ci//
296d528ed9Sopenharmony_ci//   this
306d528ed9Sopenharmony_ci//   is
316d528ed9Sopenharmony_ci//   a
326d528ed9Sopenharmony_ci//   test
336d528ed9Sopenharmony_ci//
346d528ed9Sopenharmony_ci//
356d528ed9Sopenharmony_ci// EXAMPLE 2:
366d528ed9Sopenharmony_ci//
376d528ed9Sopenharmony_ci//   std::string input = "no-cache=\"foo, bar\", private";
386d528ed9Sopenharmony_ci//   StringTokenizer t(input, ", ");
396d528ed9Sopenharmony_ci//   t.set_quote_chars("\"");
406d528ed9Sopenharmony_ci//   while (t.GetNext()) {
416d528ed9Sopenharmony_ci//     printf("%s\n", t.token().c_str());
426d528ed9Sopenharmony_ci//   }
436d528ed9Sopenharmony_ci//
446d528ed9Sopenharmony_ci// Output:
456d528ed9Sopenharmony_ci//
466d528ed9Sopenharmony_ci//   no-cache="foo, bar"
476d528ed9Sopenharmony_ci//   private
486d528ed9Sopenharmony_ci//
496d528ed9Sopenharmony_ci//
506d528ed9Sopenharmony_ci// EXAMPLE 3:
516d528ed9Sopenharmony_ci//
526d528ed9Sopenharmony_ci//   bool next_is_option = false, next_is_value = false;
536d528ed9Sopenharmony_ci//   std::string input = "text/html; charset=UTF-8; foo=bar";
546d528ed9Sopenharmony_ci//   StringTokenizer t(input, "; =");
556d528ed9Sopenharmony_ci//   t.set_options(StringTokenizer::RETURN_DELIMS);
566d528ed9Sopenharmony_ci//   while (t.GetNext()) {
576d528ed9Sopenharmony_ci//     if (t.token_is_delim()) {
586d528ed9Sopenharmony_ci//       switch (*t.token_begin()) {
596d528ed9Sopenharmony_ci//         case ';':
606d528ed9Sopenharmony_ci//           next_is_option = true;
616d528ed9Sopenharmony_ci//           break;
626d528ed9Sopenharmony_ci//         case '=':
636d528ed9Sopenharmony_ci//           next_is_value = true;
646d528ed9Sopenharmony_ci//           break;
656d528ed9Sopenharmony_ci//       }
666d528ed9Sopenharmony_ci//     } else {
676d528ed9Sopenharmony_ci//       const char* label;
686d528ed9Sopenharmony_ci//       if (next_is_option) {
696d528ed9Sopenharmony_ci//         label = "option-name";
706d528ed9Sopenharmony_ci//         next_is_option = false;
716d528ed9Sopenharmony_ci//       } else if (next_is_value) {
726d528ed9Sopenharmony_ci//         label = "option-value";
736d528ed9Sopenharmony_ci//         next_is_value = false;
746d528ed9Sopenharmony_ci//       } else {
756d528ed9Sopenharmony_ci//         label = "mime-type";
766d528ed9Sopenharmony_ci//       }
776d528ed9Sopenharmony_ci//       printf("%s: %s\n", label, t.token().c_str());
786d528ed9Sopenharmony_ci//     }
796d528ed9Sopenharmony_ci//   }
806d528ed9Sopenharmony_ci//
816d528ed9Sopenharmony_ci//
template <class str, class const_iterator>
class StringTokenizerT {
 public:
  using char_type = typename str::value_type;

  // Options that may be passed to set_options().
  enum {
    // Specifies that delimiters should be returned as tokens.
    RETURN_DELIMS = 1 << 0,
  };

  // Tokenizes |string|, splitting on any character contained in |delims|.
  //
  // Only iterators into |string| are stored, so the string object must live
  // longer than the tokenizer. In particular, this should not be constructed
  // with a temporary. The deleted rvalue constructor below blocks the most
  // obvious instances of this (e.g. passing a string literal to the
  // constructor), but caution must still be exercised. |delims| is copied.
  StringTokenizerT(const str& string, const str& delims) {
    Init(string.begin(), string.end(), delims);
  }

  // Don't allow temporary strings to be used with string tokenizer, since
  // Init() would otherwise save iterators to a temporary string.
  StringTokenizerT(str&&, const str& delims) = delete;

  // Tokenizes the range [string_begin, string_end), splitting on any character
  // contained in |delims|. The iterators are stored, so the underlying range
  // must remain valid while the tokenizer is in use. |delims| is copied.
  StringTokenizerT(const_iterator string_begin,
                   const_iterator string_end,
                   const str& delims) {
    Init(string_begin, string_end, delims);
  }

  // Set the options for this tokenizer.  By default, this is 0.
  void set_options(int options) { options_ = options; }

  // Set the characters to regard as quotes.  By default, this is empty.  When
  // a quote char is encountered, the tokenizer will switch into a mode where
  // it ignores delimiters that it finds.  It switches out of this mode once it
  // finds another instance of the quote char.  If a backslash is encountered
  // within a quoted string, then the next character is skipped.
  void set_quote_chars(const str& quotes) { quotes_ = quotes; }

  // Call this method to advance the tokenizer to the next token.  This
  // returns false if the tokenizer is complete.  This method must be called
  // before calling any of the token* methods.
  bool GetNext() {
    // The quick path is only valid when there is neither quote handling nor
    // any option (such as RETURN_DELIMS) to honor.
    if (quotes_.empty() && options_ == 0)
      return QuickGetNext();
    return FullGetNext();
  }

  // Start iterating through tokens from the beginning of the string.
  //
  // The current token becomes the empty range at the start of the input and
  // the delimiter flag is cleared, so that [token_begin(), token_end()) is
  // never an inverted range and token_is_delim() does not report a stale
  // value before the next GetNext() call. Options, delimiters and quote
  // characters are left untouched.
  void Reset() {
    token_begin_ = start_pos_;
    token_end_ = start_pos_;
    token_is_delim_ = false;
  }

  // Returns true if token is a delimiter.  When the tokenizer is configured
  // with the RETURN_DELIMS option, this method can be used to check if the
  // returned token is actually a delimiter.
  bool token_is_delim() const { return token_is_delim_; }

  // If GetNext() returned true, then these methods may be used to read the
  // value of the token.
  const_iterator token_begin() const { return token_begin_; }
  const_iterator token_end() const { return token_end_; }

  // Returns a copy of the current token.
  str token() const { return str(token_begin_, token_end_); }

  // Returns a non-owning view of the current token. The view is built from
  // the address of the first character, so the iterators are expected to
  // refer to contiguous storage. An empty token yields an empty view instead
  // of dereferencing an iterator that may be the end of the input.
  std::basic_string_view<typename str::value_type> token_piece() const {
    using view_type = std::basic_string_view<typename str::value_type>;
    if (token_begin_ == token_end_)
      return view_type();
    return view_type(&*token_begin_,
                     static_cast<typename view_type::size_type>(
                         std::distance(token_begin_, token_end_)));
  }

 private:
  // Shared constructor logic: records the input range, copies the delimiter
  // set and puts the tokenizer into its initial state (no options, empty
  // current token positioned at the start of the input).
  void Init(const_iterator string_begin,
            const_iterator string_end,
            const str& delims) {
    start_pos_ = string_begin;
    token_begin_ = string_begin;
    token_end_ = string_begin;
    end_ = string_end;
    delims_ = delims;
    options_ = 0;
    token_is_delim_ = false;
  }

  // Implementation of GetNext() for when we have no quote characters and no
  // options. We have two separate implementations because AdvanceOne() is a
  // hot spot in large text files with large tokens.
  bool QuickGetNext() {
    token_is_delim_ = false;
    // Skip any leading delimiters; stop on the first token character.
    for (;;) {
      token_begin_ = token_end_;
      if (token_end_ == end_)
        return false;
      ++token_end_;
      if (!IsDelim(*token_begin_))
        break;
      // else skip over delimiter.
    }
    // Extend the token up to (but not including) the next delimiter.
    while (token_end_ != end_ && !IsDelim(*token_end_))
      ++token_end_;
    return true;
  }

  // Implementation of GetNext() for when quote characters and/or options
  // (RETURN_DELIMS) have to be taken into account.
  bool FullGetNext() {
    // Quote/escape tracking starts fresh for every token.
    AdvanceState state;
    token_is_delim_ = false;
    for (;;) {
      token_begin_ = token_end_;
      if (token_end_ == end_)
        return false;
      ++token_end_;
      if (AdvanceOne(&state, *token_begin_))
        break;
      // A delimiter was hit: either hand it out as a one-character token or
      // keep skipping.
      if (options_ & RETURN_DELIMS) {
        token_is_delim_ = true;
        return true;
      }
      // else skip over delimiter.
    }
    // Extend the token until an unquoted delimiter or the end of the input.
    while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
      ++token_end_;
    return true;
  }

  // Returns true if |c| is one of the configured delimiter characters.
  bool IsDelim(char_type c) const { return delims_.find(c) != str::npos; }

  // Returns true if |c| is one of the configured quote characters.
  bool IsQuote(char_type c) const { return quotes_.find(c) != str::npos; }

  // Per-token scanning state used by FullGetNext()/AdvanceOne().
  struct AdvanceState {
    // True while scanning inside a quoted section.
    bool in_quote;
    // True when the previous character inside a quote was a backslash.
    bool in_escape;
    // The most recent non-delimiter character seen outside of a quote; while
    // |in_quote| is set this is the character that opened the quote.
    char_type quote_char;
    AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
  };

  // Feeds one character into the scanner. Returns true if a delimiter was not
  // hit, i.e. |c| belongs to the current token.
  bool AdvanceOne(AdvanceState* state, char_type c) const {
    if (state->in_quote) {
      // Inside a quote delimiters are ignored; only escapes and the matching
      // closing quote change state.
      if (state->in_escape) {
        state->in_escape = false;
      } else if (c == '\\') {
        state->in_escape = true;
      } else if (c == state->quote_char) {
        state->in_quote = false;
      }
      return true;
    }
    if (IsDelim(c))
      return false;
    // Remember the character so that, if it opens a quote, the same character
    // is required to close it.
    state->quote_char = c;
    state->in_quote = IsQuote(c);
    return true;
  }

  const_iterator start_pos_;    // Start of the input; target of Reset().
  const_iterator token_begin_;  // Start of the current token.
  const_iterator token_end_;    // One past the end of the current token.
  const_iterator end_;          // End of the input.
  str delims_;
  str quotes_;
  int options_;
  bool token_is_delim_;
};
2416d528ed9Sopenharmony_ci
2426d528ed9Sopenharmony_citypedef StringTokenizerT<std::string, std::string::const_iterator>
2436d528ed9Sopenharmony_ci    StringTokenizer;
2446d528ed9Sopenharmony_citypedef StringTokenizerT<std::u16string, std::u16string::const_iterator>
2456d528ed9Sopenharmony_ci    WStringTokenizer;
2466d528ed9Sopenharmony_citypedef StringTokenizerT<std::string, const char*> CStringTokenizer;
2476d528ed9Sopenharmony_ci
2486d528ed9Sopenharmony_ci}  // namespace base
2496d528ed9Sopenharmony_ci
2506d528ed9Sopenharmony_ci#endif  // BASE_STRINGS_STRING_TOKENIZER_H_
251