1// Copyright 2018 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <algorithm> 6#include <set> 7#include <unordered_map> 8#include <unordered_set> 9 10#include "src/torque/ast.h" 11#include "src/torque/earley-parser.h" 12#include "src/torque/utils.h" 13 14namespace v8 { 15namespace internal { 16namespace torque { 17 18namespace { 19 20struct LineAndColumnTracker { 21 LineAndColumn previous{0, 0, 0}; 22 LineAndColumn current{0, 0, 0}; 23 24 void Advance(InputPosition from, InputPosition to) { 25 previous = current; 26 current.offset += std::distance(from, to); 27 while (from != to) { 28 if (*from == '\n') { 29 current.line += 1; 30 current.column = 0; 31 } else { 32 current.column += 1; 33 } 34 ++from; 35 } 36 } 37 38 SourcePosition ToSourcePosition() { 39 return {CurrentSourceFile::Get(), previous, current}; 40 } 41}; 42 43} // namespace 44 45base::Optional<ParseResult> Rule::RunAction(const Item* completed_item, 46 const LexerResult& tokens) const { 47 std::vector<ParseResult> results; 48 for (const Item* child : completed_item->Children()) { 49 if (!child) continue; 50 base::Optional<ParseResult> child_result = 51 child->left()->RunAction(child, tokens); 52 if (child_result) results.push_back(std::move(*child_result)); 53 } 54 MatchedInput matched_input = completed_item->GetMatchedInput(tokens); 55 CurrentSourcePosition::Scope pos_scope(matched_input.pos); 56 ParseResultIterator iterator(std::move(results), matched_input); 57 auto result = action_(&iterator); 58 // Make sure the parse action consumed all the child results. 59 CHECK(!iterator.HasNext()); 60 return result; 61} 62 63Symbol& Symbol::operator=(std::initializer_list<Rule> rules) { 64 rules_.clear(); 65 for (const Rule& rule : rules) { 66 AddRule(rule); 67 } 68 return *this; 69} 70 71std::vector<const Item*> Item::Children() const { 72 std::vector<const Item*> children; 73 for (const Item* current = this; current->prev_; current = current->prev_) { 74 children.push_back(current->child_); 75 } 76 // The above loop collects the child nodes in reversed order. 77 std::reverse(children.begin(), children.end()); 78 DCHECK_EQ(children.size(), right().size()); 79 return children; 80} 81 82std::string Item::SplitByChildren(const LexerResult& tokens) const { 83 if (right().size() == 1) { 84 if (const Item* child = Children()[0]) 85 return child->SplitByChildren(tokens); 86 } 87 std::stringstream s; 88 bool first = true; 89 for (const Item* item : Children()) { 90 if (!item) continue; 91 if (!first) s << " "; 92 s << item->GetMatchedInput(tokens).ToString(); 93 first = false; 94 } 95 return s.str(); 96} 97 98void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const { 99 DCHECK(*this == other); 100 if (child_ != other.child_) { 101 std::stringstream s; 102 s << "Ambiguous grammer rules for \"" 103 << child_->GetMatchedInput(tokens).ToString() << "\":\n " 104 << child_->SplitByChildren(tokens) << "\nvs\n " 105 << other.child_->SplitByChildren(tokens); 106 ReportError(s.str()); 107 } 108 if (prev_ != other.prev_) { 109 std::stringstream s; 110 s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString() 111 << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n " 112 << other.SplitByChildren(tokens) << " ..."; 113 ReportError(s.str()); 114 } 115} 116 117LexerResult Lexer::RunLexer(const std::string& input) { 118 LexerResult result; 119 InputPosition const begin = input.c_str(); 120 InputPosition const end = begin + input.size(); 121 InputPosition pos = begin; 122 InputPosition token_start = pos; 123 LineAndColumnTracker line_column_tracker; 124 125 match_whitespace_(&pos); 126 line_column_tracker.Advance(token_start, pos); 127 while (pos != end) { 128 token_start = pos; 129 Symbol* symbol = MatchToken(&pos, end); 130 DCHECK_IMPLIES(symbol != nullptr, pos != token_start); 131 InputPosition token_end = pos; 132 line_column_tracker.Advance(token_start, token_end); 133 if (!symbol) { 134 CurrentSourcePosition::Scope pos_scope( 135 line_column_tracker.ToSourcePosition()); 136 ReportError("Lexer Error: unknown token " + 137 StringLiteralQuote(std::string( 138 token_start, token_start + std::min<ptrdiff_t>( 139 end - token_start, 10)))); 140 } 141 result.token_symbols.push_back(symbol); 142 result.token_contents.push_back( 143 {token_start, pos, line_column_tracker.ToSourcePosition()}); 144 match_whitespace_(&pos); 145 line_column_tracker.Advance(token_end, pos); 146 } 147 148 // Add an additional token position to simplify corner cases. 149 line_column_tracker.Advance(token_start, pos); 150 result.token_contents.push_back( 151 {pos, pos, line_column_tracker.ToSourcePosition()}); 152 return result; 153} 154 155Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) { 156 InputPosition token_start = *pos; 157 Symbol* symbol = nullptr; 158 // Find longest matching pattern. 159 for (std::pair<const PatternFunction, Symbol>& pair : patterns_) { 160 InputPosition token_end = token_start; 161 PatternFunction matchPattern = pair.first; 162 if (matchPattern(&token_end) && token_end > *pos) { 163 *pos = token_end; 164 symbol = &pair.second; 165 } 166 } 167 size_t pattern_size = *pos - token_start; 168 169 // Now check for keywords. Prefer keywords over patterns unless the pattern is 170 // longer. Iterate from the end to ensure that if one keyword is a prefix of 171 // another, we first try to match the longer one. 172 for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) { 173 const std::string& keyword = it->first; 174 if (static_cast<size_t>(end - token_start) < keyword.size()) continue; 175 if (keyword.size() >= pattern_size && 176 keyword == std::string(token_start, token_start + keyword.size())) { 177 *pos = token_start + keyword.size(); 178 return &it->second; 179 } 180 } 181 if (pattern_size > 0) return symbol; 182 return nullptr; 183} 184 185// This is an implementation of Earley's parsing algorithm 186// (https://en.wikipedia.org/wiki/Earley_parser). 187const Item* RunEarleyAlgorithm( 188 Symbol* start, const LexerResult& tokens, 189 std::unordered_set<Item, base::hash<Item>>* processed) { 190 // Worklist for items at the current position. 191 std::vector<Item> worklist; 192 // Worklist for items at the next position. 193 std::vector<Item> future_items; 194 CurrentSourcePosition::Scope source_position( 195 SourcePosition{CurrentSourceFile::Get(), LineAndColumn::Invalid(), 196 LineAndColumn::Invalid()}); 197 std::vector<const Item*> completed_items; 198 std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>, 199 base::hash<std::pair<size_t, Symbol*>>> 200 waiting; 201 202 std::vector<const Item*> debug_trace; 203 204 // Start with one top_level symbol mapping to the start symbol of the grammar. 205 // This simplifies things because the start symbol might have several 206 // rules. 207 Symbol top_level; 208 top_level.AddRule(Rule({start})); 209 worklist.push_back(Item{top_level.rule(0), 0, 0, 0}); 210 211 size_t input_length = tokens.token_symbols.size(); 212 213 for (size_t pos = 0; pos <= input_length; ++pos) { 214 while (!worklist.empty()) { 215 auto insert_result = processed->insert(worklist.back()); 216 const Item& item = *insert_result.first; 217 DCHECK_EQ(pos, item.pos()); 218 MatchedInput last_token = tokens.token_contents[pos]; 219 CurrentSourcePosition::Get() = last_token.pos; 220 bool is_new = insert_result.second; 221 if (!is_new) item.CheckAmbiguity(worklist.back(), tokens); 222 worklist.pop_back(); 223 if (!is_new) continue; 224 225 debug_trace.push_back(&item); 226 if (item.IsComplete()) { 227 // 'Complete' phase: Advance all items that were waiting to match this 228 // symbol next. 229 for (const Item* parent : waiting[{item.start(), item.left()}]) { 230 worklist.push_back(parent->Advance(pos, &item)); 231 } 232 } else { 233 Symbol* next = item.NextSymbol(); 234 // 'Scan' phase: Check if {next} is the next symbol in the input (this 235 // is never the case if {next} is a non-terminal). 236 if (pos < tokens.token_symbols.size() && 237 tokens.token_symbols[pos] == next) { 238 future_items.push_back(item.Advance(pos + 1, nullptr)); 239 } 240 // 'Predict' phase: Add items for every rule of the non-terminal. 241 if (!next->IsTerminal()) { 242 // Remember that this item is waiting for completion with {next}. 243 waiting[{pos, next}].insert(&item); 244 } 245 for (size_t i = 0; i < next->rule_number(); ++i) { 246 Rule* rule = next->rule(i); 247 auto already_completed = 248 processed->find(Item{rule, rule->right().size(), pos, pos}); 249 // As discussed in section 3 of 250 // Aycock, John, and R. Nigel Horspool. "Practical earley 251 // parsing." The Computer Journal 45.6 (2002): 620-630. 252 // Earley parsing has the following problem with epsilon rules: 253 // When we complete an item that started at the current position 254 // (that is, it matched zero tokens), we might not yet have 255 // predicted all items it can complete with. Thus we check for the 256 // existence of such items here and complete them immediately. 257 if (already_completed != processed->end()) { 258 worklist.push_back(item.Advance(pos, &*already_completed)); 259 } else { 260 worklist.push_back(Item{rule, 0, pos, pos}); 261 } 262 } 263 } 264 } 265 std::swap(worklist, future_items); 266 } 267 268 auto final_item = 269 processed->find(Item{top_level.rule(0), 1, 0, input_length}); 270 if (final_item != processed->end()) { 271 // Success: The {top_level} rule matches the complete input. 272 return final_item->Children()[0]; 273 } 274 std::string reason; 275 const Item& last_item = *debug_trace.back(); 276 if (last_item.pos() < tokens.token_symbols.size()) { 277 std::string next_token = tokens.token_contents[last_item.pos()].ToString(); 278 reason = "unexpected token \"" + next_token + "\""; 279 } else { 280 reason = "unexpected end of input"; 281 } 282 ReportError("Parser Error: " + reason); 283} 284 285// static 286DISABLE_CFI_ICALL 287bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) { 288 if (**pos && char_class(static_cast<unsigned char>(**pos))) { 289 ++*pos; 290 return true; 291 } 292 return false; 293} 294 295// static 296bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) { 297 if (**pos && char_class(**pos)) { 298 ++*pos; 299 return true; 300 } 301 return false; 302} 303 304// static 305bool Grammar::MatchString(const char* s, InputPosition* pos) { 306 InputPosition current = *pos; 307 for (; *s != 0; ++s, ++current) { 308 if (*s != *current) return false; 309 } 310 *pos = current; 311 return true; 312} 313 314// static 315bool Grammar::MatchAnyChar(InputPosition* pos) { 316 return MatchChar([](char c) { return true; }, pos); 317} 318 319} // namespace torque 320} // namespace internal 321} // namespace v8 322