1// Copyright 2018 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <algorithm>
6#include <set>
7#include <unordered_map>
8#include <unordered_set>
9
10#include "src/torque/ast.h"
11#include "src/torque/earley-parser.h"
12#include "src/torque/utils.h"
13
14namespace v8 {
15namespace internal {
16namespace torque {
17
18namespace {
19
20struct LineAndColumnTracker {
21  LineAndColumn previous{0, 0, 0};
22  LineAndColumn current{0, 0, 0};
23
24  void Advance(InputPosition from, InputPosition to) {
25    previous = current;
26    current.offset += std::distance(from, to);
27    while (from != to) {
28      if (*from == '\n') {
29        current.line += 1;
30        current.column = 0;
31      } else {
32        current.column += 1;
33      }
34      ++from;
35    }
36  }
37
38  SourcePosition ToSourcePosition() {
39    return {CurrentSourceFile::Get(), previous, current};
40  }
41};
42
43}  // namespace
44
45base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
46                                            const LexerResult& tokens) const {
47  std::vector<ParseResult> results;
48  for (const Item* child : completed_item->Children()) {
49    if (!child) continue;
50    base::Optional<ParseResult> child_result =
51        child->left()->RunAction(child, tokens);
52    if (child_result) results.push_back(std::move(*child_result));
53  }
54  MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
55  CurrentSourcePosition::Scope pos_scope(matched_input.pos);
56  ParseResultIterator iterator(std::move(results), matched_input);
57  auto result = action_(&iterator);
58  // Make sure the parse action consumed all the child results.
59  CHECK(!iterator.HasNext());
60  return result;
61}
62
63Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
64  rules_.clear();
65  for (const Rule& rule : rules) {
66    AddRule(rule);
67  }
68  return *this;
69}
70
71std::vector<const Item*> Item::Children() const {
72  std::vector<const Item*> children;
73  for (const Item* current = this; current->prev_; current = current->prev_) {
74    children.push_back(current->child_);
75  }
76  // The above loop collects the child nodes in reversed order.
77  std::reverse(children.begin(), children.end());
78  DCHECK_EQ(children.size(), right().size());
79  return children;
80}
81
82std::string Item::SplitByChildren(const LexerResult& tokens) const {
83  if (right().size() == 1) {
84    if (const Item* child = Children()[0])
85      return child->SplitByChildren(tokens);
86  }
87  std::stringstream s;
88  bool first = true;
89  for (const Item* item : Children()) {
90    if (!item) continue;
91    if (!first) s << "  ";
92    s << item->GetMatchedInput(tokens).ToString();
93    first = false;
94  }
95  return s.str();
96}
97
98void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
99  DCHECK(*this == other);
100  if (child_ != other.child_) {
101    std::stringstream s;
102    s << "Ambiguous grammer rules for \""
103      << child_->GetMatchedInput(tokens).ToString() << "\":\n   "
104      << child_->SplitByChildren(tokens) << "\nvs\n   "
105      << other.child_->SplitByChildren(tokens);
106    ReportError(s.str());
107  }
108  if (prev_ != other.prev_) {
109    std::stringstream s;
110    s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
111      << "\":\n   " << SplitByChildren(tokens) << "  ...\nvs\n   "
112      << other.SplitByChildren(tokens) << "  ...";
113    ReportError(s.str());
114  }
115}
116
// Tokenizes the whole {input} string, producing one symbol plus matched-text
// entry per token. Reports a fatal lexer error on the first unmatchable
// character. Note: the tracker Advance() calls are carefully interleaved with
// whitespace skipping so that each token's SourcePosition covers exactly the
// token's own characters.
LexerResult Lexer::RunLexer(const std::string& input) {
  LexerResult result;
  InputPosition const begin = input.c_str();
  InputPosition const end = begin + input.size();
  InputPosition pos = begin;
  InputPosition token_start = pos;
  LineAndColumnTracker line_column_tracker;

  // Skip (and position-track) leading whitespace before the first token.
  match_whitespace_(&pos);
  line_column_tracker.Advance(token_start, pos);
  while (pos != end) {
    token_start = pos;
    Symbol* symbol = MatchToken(&pos, end);
    // A successful match must consume at least one character.
    DCHECK_IMPLIES(symbol != nullptr, pos != token_start);
    InputPosition token_end = pos;
    line_column_tracker.Advance(token_start, token_end);
    if (!symbol) {
      // Nothing matched: report up to 10 characters of context starting at
      // the failure position.
      CurrentSourcePosition::Scope pos_scope(
          line_column_tracker.ToSourcePosition());
      ReportError("Lexer Error: unknown token " +
                  StringLiteralQuote(std::string(
                      token_start, token_start + std::min<ptrdiff_t>(
                                                     end - token_start, 10))));
    }
    result.token_symbols.push_back(symbol);
    result.token_contents.push_back(
        {token_start, pos, line_column_tracker.ToSourcePosition()});
    // Skip trailing whitespace, tracking it so the next token's position
    // starts in the right place.
    match_whitespace_(&pos);
    line_column_tracker.Advance(token_end, pos);
  }

  // Add an additional token position to simplify corner cases.
  line_column_tracker.Advance(token_start, pos);
  result.token_contents.push_back(
      {pos, pos, line_column_tracker.ToSourcePosition()});
  return result;
}
154
// Matches the longest token starting at {*pos}. On success, advances {*pos}
// past the matched token and returns its symbol; returns nullptr if neither a
// pattern nor a keyword matches.
Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
  InputPosition token_start = *pos;
  Symbol* symbol = nullptr;
  // Find longest matching pattern.
  for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
    InputPosition token_end = token_start;
    PatternFunction matchPattern = pair.first;
    // Only accept a pattern match if it is strictly longer than the best
    // pattern match found so far.
    if (matchPattern(&token_end) && token_end > *pos) {
      *pos = token_end;
      symbol = &pair.second;
    }
  }
  size_t pattern_size = *pos - token_start;

  // Now check for keywords. Prefer keywords over patterns unless the pattern is
  // longer. Iterate from the end to ensure that if one keyword is a prefix of
  // another, we first try to match the longer one.
  for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
    const std::string& keyword = it->first;
    // Skip keywords that cannot fit in the remaining input.
    if (static_cast<size_t>(end - token_start) < keyword.size()) continue;
    // On a length tie (>=), the keyword wins over the pattern.
    if (keyword.size() >= pattern_size &&
        keyword == std::string(token_start, token_start + keyword.size())) {
      *pos = token_start + keyword.size();
      return &it->second;
    }
  }
  if (pattern_size > 0) return symbol;
  return nullptr;
}
184
// This is an implementation of Earley's parsing algorithm
// (https://en.wikipedia.org/wiki/Earley_parser).
// Returns the completed item for the start symbol covering the whole token
// stream, or reports a fatal parse error. {processed} owns the items and must
// outlive uses of the returned pointer.
const Item* RunEarleyAlgorithm(
    Symbol* start, const LexerResult& tokens,
    std::unordered_set<Item, base::hash<Item>>* processed) {
  // Worklist for items at the current position.
  std::vector<Item> worklist;
  // Worklist for items at the next position.
  std::vector<Item> future_items;
  CurrentSourcePosition::Scope source_position(
      SourcePosition{CurrentSourceFile::Get(), LineAndColumn::Invalid(),
                     LineAndColumn::Invalid()});
  std::vector<const Item*> completed_items;
  // Maps (input position, symbol) to the items waiting at that position for
  // that symbol to be completed ('Predict'/'Complete' bookkeeping).
  std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
                     base::hash<std::pair<size_t, Symbol*>>>
      waiting;

  std::vector<const Item*> debug_trace;

  // Start with one top_level symbol mapping to the start symbol of the grammar.
  // This simplifies things because the start symbol might have several
  // rules.
  Symbol top_level;
  top_level.AddRule(Rule({start}));
  worklist.push_back(Item{top_level.rule(0), 0, 0, 0});

  size_t input_length = tokens.token_symbols.size();

  for (size_t pos = 0; pos <= input_length; ++pos) {
    while (!worklist.empty()) {
      auto insert_result = processed->insert(worklist.back());
      const Item& item = *insert_result.first;
      DCHECK_EQ(pos, item.pos());
      MatchedInput last_token = tokens.token_contents[pos];
      CurrentSourcePosition::Get() = last_token.pos;
      bool is_new = insert_result.second;
      // If an equal item was already processed, check whether it was derived
      // differently (ambiguous grammar) before discarding the duplicate.
      if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
      worklist.pop_back();
      if (!is_new) continue;

      debug_trace.push_back(&item);
      if (item.IsComplete()) {
        // 'Complete' phase: Advance all items that were waiting to match this
        // symbol next.
        for (const Item* parent : waiting[{item.start(), item.left()}]) {
          worklist.push_back(parent->Advance(pos, &item));
        }
      } else {
        Symbol* next = item.NextSymbol();
        // 'Scan' phase: Check if {next} is the next symbol in the input (this
        // is never the case if {next} is a non-terminal).
        if (pos < tokens.token_symbols.size() &&
            tokens.token_symbols[pos] == next) {
          future_items.push_back(item.Advance(pos + 1, nullptr));
        }
        // 'Predict' phase: Add items for every rule of the non-terminal.
        if (!next->IsTerminal()) {
          // Remember that this item is waiting for completion with {next}.
          waiting[{pos, next}].insert(&item);
        }
        for (size_t i = 0; i < next->rule_number(); ++i) {
          Rule* rule = next->rule(i);
          auto already_completed =
              processed->find(Item{rule, rule->right().size(), pos, pos});
          // As discussed in section 3 of
          //    Aycock, John, and R. Nigel Horspool. "Practical earley
          //    parsing." The Computer Journal 45.6 (2002): 620-630.
          // Earley parsing has the following problem with epsilon rules:
          // When we complete an item that started at the current position
          // (that is, it matched zero tokens), we might not yet have
          // predicted all items it can complete with. Thus we check for the
          // existence of such items here and complete them immediately.
          if (already_completed != processed->end()) {
            worklist.push_back(item.Advance(pos, &*already_completed));
          } else {
            worklist.push_back(Item{rule, 0, pos, pos});
          }
        }
      }
    }
    // Move to the next input position: items produced by 'Scan' become the
    // new worklist ({worklist} is empty here, so this also empties
    // {future_items}).
    std::swap(worklist, future_items);
  }

  auto final_item =
      processed->find(Item{top_level.rule(0), 1, 0, input_length});
  if (final_item != processed->end()) {
    // Success: The {top_level} rule matches the complete input.
    return final_item->Children()[0];
  }
  // Failure: use the last processed item to point at where parsing got stuck.
  std::string reason;
  const Item& last_item = *debug_trace.back();
  if (last_item.pos() < tokens.token_symbols.size()) {
    std::string next_token = tokens.token_contents[last_item.pos()].ToString();
    reason = "unexpected token \"" + next_token + "\"";
  } else {
    reason = "unexpected end of input";
  }
  ReportError("Parser Error: " + reason);
}
284
285// static
286DISABLE_CFI_ICALL
287bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
288  if (**pos && char_class(static_cast<unsigned char>(**pos))) {
289    ++*pos;
290    return true;
291  }
292  return false;
293}
294
295// static
296bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
297  if (**pos && char_class(**pos)) {
298    ++*pos;
299    return true;
300  }
301  return false;
302}
303
304// static
305bool Grammar::MatchString(const char* s, InputPosition* pos) {
306  InputPosition current = *pos;
307  for (; *s != 0; ++s, ++current) {
308    if (*s != *current) return false;
309  }
310  *pos = current;
311  return true;
312}
313
314// static
315bool Grammar::MatchAnyChar(InputPosition* pos) {
316  return MatchChar([](char c) { return true; }, pos);
317}
318
319}  // namespace torque
320}  // namespace internal
321}  // namespace v8
322