// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_STRING_TOKENIZER_H_
#define BASE_STRINGS_STRING_TOKENIZER_H_

#include <algorithm>
#include <iterator>
#include <string>
#include <string_view>

namespace base {

// StringTokenizerT is a simple string tokenizer class.  It works like an
// iterator that with each step (see the GetNext method) updates members that
// refer to the next token in the input string.  The user may optionally
// configure the tokenizer to return delimiters.
//
// The tokenizer does not copy the input: it only stores iterators into it, so
// the input must outlive the tokenizer.  The delimiter and quote sets are
// copied.
//
// EXAMPLE 1:
//
//   char input[] = "this is a test";
//   CStringTokenizer t(input, input + strlen(input), " ");
//   while (t.GetNext()) {
//     printf("%s\n", t.token().c_str());
//   }
//
// Output:
//
//   this
//   is
//   a
//   test
//
//
// EXAMPLE 2:
//
//   std::string input = "no-cache=\"foo, bar\", private";
//   StringTokenizer t(input, ", ");
//   t.set_quote_chars("\"");
//   while (t.GetNext()) {
//     printf("%s\n", t.token().c_str());
//   }
//
// Output:
//
//   no-cache="foo, bar"
//   private
//
//
// EXAMPLE 3:
//
//   bool next_is_option = false, next_is_value = false;
//   std::string input = "text/html; charset=UTF-8; foo=bar";
//   StringTokenizer t(input, "; =");
//   t.set_options(StringTokenizer::RETURN_DELIMS);
//   while (t.GetNext()) {
//     if (t.token_is_delim()) {
//       switch (*t.token_begin()) {
//         case ';':
//           next_is_option = true;
//           break;
//         case '=':
//           next_is_value = true;
//           break;
//       }
//     } else {
//       const char* label;
//       if (next_is_option) {
//         label = "option-name";
//         next_is_option = false;
//       } else if (next_is_value) {
//         label = "option-value";
//         next_is_value = false;
//       } else {
//         label = "mime-type";
//       }
//       printf("%s: %s\n", label, t.token().c_str());
//     }
//   }
//
//
template <class str, class const_iterator>
class StringTokenizerT {
 public:
  typedef typename str::value_type char_type;

  // Options that may be passed to set_options()
  enum {
    // Specifies the delimiters should be returned as tokens
    RETURN_DELIMS = 1 << 0,
  };

  // The string object must live longer than the tokenizer. In particular, this
  // should not be constructed with a temporary. The deleted rvalue constructor
  // blocks the most obvious instances of this (e.g. passing a string literal to
  // the constructor), but caution must still be exercised.
  StringTokenizerT(const str& string, const str& delims) {
    Init(string.begin(), string.end(), delims);
  }

  // Don't allow temporary strings to be used with string tokenizer, since
  // Init() would otherwise save iterators to a temporary string.
  StringTokenizerT(str&&, const str& delims) = delete;

  // Tokenizes the range [string_begin, string_end).  The range must stay valid
  // for as long as the tokenizer is used.
  StringTokenizerT(const_iterator string_begin,
                   const_iterator string_end,
                   const str& delims) {
    Init(string_begin, string_end, delims);
  }

  // Set the options for this tokenizer.  By default, this is 0.
  void set_options(int options) { options_ = options; }

  // Set the characters to regard as quotes.  By default, this is empty.  When
  // a quote char is encountered, the tokenizer will switch into a mode where
  // it ignores delimiters that it finds.  It switches out of this mode once it
  // finds another instance of the quote char.  If a backslash is encountered
  // within a quoted string, then the next character is skipped.
  void set_quote_chars(const str& quotes) { quotes_ = quotes; }

  // Call this method to advance the tokenizer to the next delimiter.  This
  // returns false if the tokenizer is complete.  This method must be called
  // before calling any of the token* methods.
  bool GetNext() {
    // The fast path is only valid when neither quoting nor any option is in
    // effect; everything else goes through the state-machine version.
    if (quotes_.empty() && options_ == 0)
      return QuickGetNext();
    return FullGetNext();
  }

  // Start iterating through tokens from the beginning of the string.
  void Reset() {
    // Rewind both ends of the current token so that the (empty) token range
    // stays well formed until the next GetNext() call; rewinding only
    // |token_end_| would leave |token_begin_| past it after exhaustion.
    token_begin_ = start_pos_;
    token_end_ = start_pos_;
    token_is_delim_ = false;
  }

  // Returns true if token is a delimiter.  When the tokenizer is constructed
  // with the RETURN_DELIMS option, this method can be used to check if the
  // returned token is actually a delimiter.
  bool token_is_delim() const { return token_is_delim_; }

  // If GetNext() returned true, then these methods may be used to read the
  // value of the token.
  const_iterator token_begin() const { return token_begin_; }
  const_iterator token_end() const { return token_end_; }

  // Returns a copy of the current token.
  str token() const { return str(token_begin_, token_end_); }

  // Returns a non-owning view of the current token.  The view refers to the
  // input string and is only valid while that string is.
  std::basic_string_view<char_type> token_piece() const {
    using View = std::basic_string_view<char_type>;
    // An empty token may sit at the end of the input, where dereferencing the
    // iterator to obtain a pointer would be undefined behaviour.
    if (token_begin_ == token_end_)
      return View();
    return View(&*token_begin_,
                static_cast<typename View::size_type>(
                    std::distance(token_begin_, token_end_)));
  }

 private:
  // Shared constructor body: records the input range and delimiter set and
  // puts the tokenizer in its initial (no current token) state.
  void Init(const_iterator string_begin,
            const_iterator string_end,
            const str& delims) {
    start_pos_ = string_begin;
    token_begin_ = string_begin;
    token_end_ = string_begin;
    end_ = string_end;
    delims_ = delims;
    options_ = 0;
    token_is_delim_ = false;
  }

  // Implementation of GetNext() for when we have no quote characters.  We have
  // two separate implementations because AdvanceOne() is a hot spot in large
  // text files with large tokens.
  bool QuickGetNext() {
    token_is_delim_ = false;
    // Skip leading delimiters; stop on the first non-delimiter character.
    for (;;) {
      token_begin_ = token_end_;
      if (token_end_ == end_)
        return false;
      ++token_end_;
      if (!IsDelim(*token_begin_))
        break;
      // else skip over delimiter.
    }
    // Extend the token up to (not including) the next delimiter or the end.
    while (token_end_ != end_ && !IsDelim(*token_end_))
      ++token_end_;
    return true;
  }

  // Implementation of GetNext() for when we have to take quotes into account.
  bool FullGetNext() {
    // Quote/escape state is per token: it starts fresh on every call.
    AdvanceState state;
    token_is_delim_ = false;
    for (;;) {
      token_begin_ = token_end_;
      if (token_end_ == end_)
        return false;
      ++token_end_;
      if (AdvanceOne(&state, *token_begin_))
        break;
      if (options_ & RETURN_DELIMS) {
        // Hand the single delimiter character back as its own token.
        token_is_delim_ = true;
        return true;
      }
      // else skip over delimiter.
    }
    while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
      ++token_end_;
    return true;
  }

  bool IsDelim(char_type c) const { return delims_.find(c) != str::npos; }

  bool IsQuote(char_type c) const { return quotes_.find(c) != str::npos; }

  // Scanner state carried across the characters of a single token.
  struct AdvanceState {
    bool in_quote;         // Currently inside a quoted run.
    bool in_escape;        // Previous character was a backslash inside quotes.
    char_type quote_char;  // The character that opened the current quoted run.
    AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
  };

  // Feeds one character to the scanner.  Returns true if a delimiter was not
  // hit, i.e. |c| belongs to the current token.
  bool AdvanceOne(AdvanceState* state, char_type c) const {
    if (state->in_quote) {
      if (state->in_escape) {
        // The escaped character is consumed verbatim.
        state->in_escape = false;
      } else if (c == '\\') {
        state->in_escape = true;
      } else if (c == state->quote_char) {
        // Only the same character that opened the quote closes it.
        state->in_quote = false;
      }
    } else {
      if (IsDelim(c))
        return false;
      // Remember the character unconditionally; it only matters when it turns
      // out to be a quote character.
      state->quote_char = c;
      state->in_quote = IsQuote(c);
    }
    return true;
  }

  const_iterator start_pos_;
  const_iterator token_begin_;
  const_iterator token_end_;
  const_iterator end_;
  str delims_;
  str quotes_;
  int options_;
  bool token_is_delim_;
};

typedef StringTokenizerT<std::string, std::string::const_iterator>
    StringTokenizer;
typedef StringTokenizerT<std::u16string, std::u16string::const_iterator>
    WStringTokenizer;
typedef StringTokenizerT<std::string, const char*> CStringTokenizer;

}  // namespace base

#endif  // BASE_STRINGS_STRING_TOKENIZER_H_