1cb93a386Sopenharmony_ci/* 2cb93a386Sopenharmony_ci * Copyright 2017 Google Inc. 3cb93a386Sopenharmony_ci * 4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be 5cb93a386Sopenharmony_ci * found in the LICENSE file. 6cb93a386Sopenharmony_ci */ 7cb93a386Sopenharmony_ci#ifndef NFAtoDFA_DEFINED 8cb93a386Sopenharmony_ci#define NFAtoDFA_DEFINED 9cb93a386Sopenharmony_ci 10cb93a386Sopenharmony_ci#include "src/sksl/lex/DFA.h" 11cb93a386Sopenharmony_ci#include "src/sksl/lex/DFAState.h" 12cb93a386Sopenharmony_ci#include "src/sksl/lex/NFA.h" 13cb93a386Sopenharmony_ci#include "src/sksl/lex/NFAState.h" 14cb93a386Sopenharmony_ci 15cb93a386Sopenharmony_ci#include <algorithm> 16cb93a386Sopenharmony_ci#include <climits> 17cb93a386Sopenharmony_ci#include <memory> 18cb93a386Sopenharmony_ci#include <unordered_map> 19cb93a386Sopenharmony_ci#include <set> 20cb93a386Sopenharmony_ci#include <vector> 21cb93a386Sopenharmony_ci 22cb93a386Sopenharmony_ci/** 23cb93a386Sopenharmony_ci * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and 24cb93a386Sopenharmony_ci * DFAs differ only in that an NFA allows multiple states at the same time, we can find each 25cb93a386Sopenharmony_ci * possible combination of simultaneous NFA states and give this combination a label. These labelled 26cb93a386Sopenharmony_ci * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time. 27cb93a386Sopenharmony_ci * 28cb93a386Sopenharmony_ci * As an NFA can end up in multiple accept states at the same time (for instance, the token "while" 29cb93a386Sopenharmony_ci * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex 30cb93a386Sopenharmony_ci * (in terms of the order in which they were added to the NFA). 31cb93a386Sopenharmony_ci */ 32cb93a386Sopenharmony_ciclass NFAtoDFA { 33cb93a386Sopenharmony_cipublic: 34cb93a386Sopenharmony_ci inline static constexpr char START_CHAR = 9; 35cb93a386Sopenharmony_ci inline static constexpr char END_CHAR = 126; 36cb93a386Sopenharmony_ci 37cb93a386Sopenharmony_ci NFAtoDFA(NFA* nfa) 38cb93a386Sopenharmony_ci : fNFA(*nfa) {} 39cb93a386Sopenharmony_ci 40cb93a386Sopenharmony_ci /** 41cb93a386Sopenharmony_ci * Returns a DFA created from the NFA. 42cb93a386Sopenharmony_ci */ 43cb93a386Sopenharmony_ci DFA convert() { 44cb93a386Sopenharmony_ci // create state 0, the "reject" state 45cb93a386Sopenharmony_ci getState(DFAState::Label({})); 46cb93a386Sopenharmony_ci // create a state representing being in all of the NFA's start states at once 47cb93a386Sopenharmony_ci std::vector<int> startStates = fNFA.fStartStates; 48cb93a386Sopenharmony_ci std::sort(startStates.begin(), startStates.end()); 49cb93a386Sopenharmony_ci // this becomes state 1, our start state 50cb93a386Sopenharmony_ci DFAState* start = getState(DFAState::Label(startStates)); 51cb93a386Sopenharmony_ci this->scanState(start); 52cb93a386Sopenharmony_ci 53cb93a386Sopenharmony_ci this->computeMappings(); 54cb93a386Sopenharmony_ci 55cb93a386Sopenharmony_ci int stateCount = 0; 56cb93a386Sopenharmony_ci for (const auto& row : fTransitions) { 57cb93a386Sopenharmony_ci stateCount = std::max(stateCount, (int) row.size()); 58cb93a386Sopenharmony_ci } 59cb93a386Sopenharmony_ci return DFA(fCharMappings, fTransitions, fAccepts); 60cb93a386Sopenharmony_ci } 61cb93a386Sopenharmony_ci 62cb93a386Sopenharmony_ciprivate: 63cb93a386Sopenharmony_ci /** 64cb93a386Sopenharmony_ci * Returns an existing state with the given label, or creates a new one and returns it. 65cb93a386Sopenharmony_ci */ 66cb93a386Sopenharmony_ci DFAState* getState(DFAState::Label label) { 67cb93a386Sopenharmony_ci auto found = fStates.find(label); 68cb93a386Sopenharmony_ci if (found == fStates.end()) { 69cb93a386Sopenharmony_ci int id = fStates.size(); 70cb93a386Sopenharmony_ci fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label)); 71cb93a386Sopenharmony_ci return fStates[label].get(); 72cb93a386Sopenharmony_ci } 73cb93a386Sopenharmony_ci return found->second.get(); 74cb93a386Sopenharmony_ci } 75cb93a386Sopenharmony_ci 76cb93a386Sopenharmony_ci void add(int nfaState, std::vector<int>* states) { 77cb93a386Sopenharmony_ci NFAState state = fNFA.fStates[nfaState]; 78cb93a386Sopenharmony_ci if (state.fKind == NFAState::kRemapped_Kind) { 79cb93a386Sopenharmony_ci for (int next : state.fData) { 80cb93a386Sopenharmony_ci this->add(next, states); 81cb93a386Sopenharmony_ci } 82cb93a386Sopenharmony_ci } else { 83cb93a386Sopenharmony_ci for (int entry : *states) { 84cb93a386Sopenharmony_ci if (nfaState == entry) { 85cb93a386Sopenharmony_ci return; 86cb93a386Sopenharmony_ci } 87cb93a386Sopenharmony_ci } 88cb93a386Sopenharmony_ci states->push_back(nfaState); 89cb93a386Sopenharmony_ci } 90cb93a386Sopenharmony_ci } 91cb93a386Sopenharmony_ci 92cb93a386Sopenharmony_ci void addTransition(char c, int start, int next) { 93cb93a386Sopenharmony_ci while (fTransitions.size() <= (size_t) c) { 94cb93a386Sopenharmony_ci fTransitions.push_back(std::vector<int>()); 95cb93a386Sopenharmony_ci } 96cb93a386Sopenharmony_ci std::vector<int>& row = fTransitions[c]; 97cb93a386Sopenharmony_ci while (row.size() <= (size_t) start) { 98cb93a386Sopenharmony_ci row.push_back(INVALID); 99cb93a386Sopenharmony_ci } 100cb93a386Sopenharmony_ci row[start] = next; 101cb93a386Sopenharmony_ci } 102cb93a386Sopenharmony_ci 103cb93a386Sopenharmony_ci void scanState(DFAState* state) { 104cb93a386Sopenharmony_ci state->fIsScanned = true; 105cb93a386Sopenharmony_ci for (char c = START_CHAR; c <= END_CHAR; ++c) { 106cb93a386Sopenharmony_ci std::vector<int> next; 107cb93a386Sopenharmony_ci int bestAccept = INT_MAX; 108cb93a386Sopenharmony_ci for (int idx : state->fLabel.fStates) { 109cb93a386Sopenharmony_ci const NFAState& nfaState = fNFA.fStates[idx]; 110cb93a386Sopenharmony_ci if (nfaState.accept(c)) { 111cb93a386Sopenharmony_ci for (int nextState : nfaState.fNext) { 112cb93a386Sopenharmony_ci if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) { 113cb93a386Sopenharmony_ci bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]); 114cb93a386Sopenharmony_ci } 115cb93a386Sopenharmony_ci this->add(nextState, &next); 116cb93a386Sopenharmony_ci } 117cb93a386Sopenharmony_ci } 118cb93a386Sopenharmony_ci } 119cb93a386Sopenharmony_ci std::sort(next.begin(), next.end()); 120cb93a386Sopenharmony_ci DFAState* nextState = this->getState(DFAState::Label(next)); 121cb93a386Sopenharmony_ci this->addTransition(c, state->fId, nextState->fId); 122cb93a386Sopenharmony_ci if (bestAccept != INT_MAX) { 123cb93a386Sopenharmony_ci while (fAccepts.size() <= (size_t) nextState->fId) { 124cb93a386Sopenharmony_ci fAccepts.push_back(INVALID); 125cb93a386Sopenharmony_ci } 126cb93a386Sopenharmony_ci fAccepts[nextState->fId] = bestAccept; 127cb93a386Sopenharmony_ci } 128cb93a386Sopenharmony_ci if (!nextState->fIsScanned) { 129cb93a386Sopenharmony_ci this->scanState(nextState); 130cb93a386Sopenharmony_ci } 131cb93a386Sopenharmony_ci } 132cb93a386Sopenharmony_ci } 133cb93a386Sopenharmony_ci 134cb93a386Sopenharmony_ci // collapse rows with the same transitions to a single row. This is common, as each row 135cb93a386Sopenharmony_ci // represents a character and often there are many characters for which all transitions are 136cb93a386Sopenharmony_ci // identical (e.g. [0-9] are treated the same way by all lexer rules) 137cb93a386Sopenharmony_ci void computeMappings() { 138cb93a386Sopenharmony_ci // mappings[<input row>] = <output row> 139cb93a386Sopenharmony_ci std::vector<std::vector<int>*> uniques; 140cb93a386Sopenharmony_ci // this could be done more efficiently, but O(n^2) is plenty fast for our purposes 141cb93a386Sopenharmony_ci for (size_t i = 0; i < fTransitions.size(); ++i) { 142cb93a386Sopenharmony_ci int found = -1; 143cb93a386Sopenharmony_ci for (size_t j = 0; j < uniques.size(); ++j) { 144cb93a386Sopenharmony_ci if (*uniques[j] == fTransitions[i]) { 145cb93a386Sopenharmony_ci found = j; 146cb93a386Sopenharmony_ci break; 147cb93a386Sopenharmony_ci } 148cb93a386Sopenharmony_ci } 149cb93a386Sopenharmony_ci if (found == -1) { 150cb93a386Sopenharmony_ci found = (int) uniques.size(); 151cb93a386Sopenharmony_ci uniques.push_back(&fTransitions[i]); 152cb93a386Sopenharmony_ci } 153cb93a386Sopenharmony_ci fCharMappings.push_back(found); 154cb93a386Sopenharmony_ci } 155cb93a386Sopenharmony_ci std::vector<std::vector<int>> newTransitions; 156cb93a386Sopenharmony_ci for (std::vector<int>* row : uniques) { 157cb93a386Sopenharmony_ci newTransitions.push_back(*row); 158cb93a386Sopenharmony_ci } 159cb93a386Sopenharmony_ci fTransitions = newTransitions; 160cb93a386Sopenharmony_ci } 161cb93a386Sopenharmony_ci 162cb93a386Sopenharmony_ci const NFA& fNFA; 163cb93a386Sopenharmony_ci std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates; 164cb93a386Sopenharmony_ci std::vector<std::vector<int>> fTransitions; 165cb93a386Sopenharmony_ci std::vector<int> fCharMappings; 166cb93a386Sopenharmony_ci std::vector<int> fAccepts; 167cb93a386Sopenharmony_ci}; 168cb93a386Sopenharmony_ci#endif // NFAtoDFA_DEFINED 169