1cb93a386Sopenharmony_ci/*
2cb93a386Sopenharmony_ci * Copyright 2017 Google Inc.
3cb93a386Sopenharmony_ci *
4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be
5cb93a386Sopenharmony_ci * found in the LICENSE file.
6cb93a386Sopenharmony_ci */
7cb93a386Sopenharmony_ci#ifndef NFAtoDFA_DEFINED
8cb93a386Sopenharmony_ci#define NFAtoDFA_DEFINED
9cb93a386Sopenharmony_ci
10cb93a386Sopenharmony_ci#include "src/sksl/lex/DFA.h"
11cb93a386Sopenharmony_ci#include "src/sksl/lex/DFAState.h"
12cb93a386Sopenharmony_ci#include "src/sksl/lex/NFA.h"
13cb93a386Sopenharmony_ci#include "src/sksl/lex/NFAState.h"
14cb93a386Sopenharmony_ci
15cb93a386Sopenharmony_ci#include <algorithm>
16cb93a386Sopenharmony_ci#include <climits>
17cb93a386Sopenharmony_ci#include <memory>
18cb93a386Sopenharmony_ci#include <unordered_map>
19cb93a386Sopenharmony_ci#include <set>
20cb93a386Sopenharmony_ci#include <vector>
21cb93a386Sopenharmony_ci
22cb93a386Sopenharmony_ci/**
23cb93a386Sopenharmony_ci * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
24cb93a386Sopenharmony_ci * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
25cb93a386Sopenharmony_ci * possible combination of simultaneous NFA states and give this combination a label. These labelled
26cb93a386Sopenharmony_ci * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
27cb93a386Sopenharmony_ci *
28cb93a386Sopenharmony_ci * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
29cb93a386Sopenharmony_ci * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
30cb93a386Sopenharmony_ci * (in terms of the order in which they were added to the NFA).
31cb93a386Sopenharmony_ci */
32cb93a386Sopenharmony_ciclass NFAtoDFA {
33cb93a386Sopenharmony_cipublic:
34cb93a386Sopenharmony_ci    inline static constexpr char START_CHAR = 9;
35cb93a386Sopenharmony_ci    inline static constexpr char END_CHAR = 126;
36cb93a386Sopenharmony_ci
37cb93a386Sopenharmony_ci    NFAtoDFA(NFA* nfa)
38cb93a386Sopenharmony_ci    : fNFA(*nfa) {}
39cb93a386Sopenharmony_ci
40cb93a386Sopenharmony_ci    /**
41cb93a386Sopenharmony_ci     * Returns a DFA created from the NFA.
42cb93a386Sopenharmony_ci     */
43cb93a386Sopenharmony_ci    DFA convert() {
44cb93a386Sopenharmony_ci        // create state 0, the "reject" state
45cb93a386Sopenharmony_ci        getState(DFAState::Label({}));
46cb93a386Sopenharmony_ci        // create a state representing being in all of the NFA's start states at once
47cb93a386Sopenharmony_ci        std::vector<int> startStates = fNFA.fStartStates;
48cb93a386Sopenharmony_ci        std::sort(startStates.begin(), startStates.end());
49cb93a386Sopenharmony_ci        // this becomes state 1, our start state
50cb93a386Sopenharmony_ci        DFAState* start = getState(DFAState::Label(startStates));
51cb93a386Sopenharmony_ci        this->scanState(start);
52cb93a386Sopenharmony_ci
53cb93a386Sopenharmony_ci        this->computeMappings();
54cb93a386Sopenharmony_ci
55cb93a386Sopenharmony_ci        int stateCount = 0;
56cb93a386Sopenharmony_ci        for (const auto& row : fTransitions) {
57cb93a386Sopenharmony_ci            stateCount = std::max(stateCount, (int) row.size());
58cb93a386Sopenharmony_ci        }
59cb93a386Sopenharmony_ci        return DFA(fCharMappings, fTransitions, fAccepts);
60cb93a386Sopenharmony_ci    }
61cb93a386Sopenharmony_ci
62cb93a386Sopenharmony_ciprivate:
63cb93a386Sopenharmony_ci    /**
64cb93a386Sopenharmony_ci     * Returns an existing state with the given label, or creates a new one and returns it.
65cb93a386Sopenharmony_ci     */
66cb93a386Sopenharmony_ci    DFAState* getState(DFAState::Label label) {
67cb93a386Sopenharmony_ci        auto found = fStates.find(label);
68cb93a386Sopenharmony_ci        if (found == fStates.end()) {
69cb93a386Sopenharmony_ci            int id = fStates.size();
70cb93a386Sopenharmony_ci            fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
71cb93a386Sopenharmony_ci            return fStates[label].get();
72cb93a386Sopenharmony_ci        }
73cb93a386Sopenharmony_ci        return found->second.get();
74cb93a386Sopenharmony_ci    }
75cb93a386Sopenharmony_ci
76cb93a386Sopenharmony_ci    void add(int nfaState, std::vector<int>* states) {
77cb93a386Sopenharmony_ci        NFAState state = fNFA.fStates[nfaState];
78cb93a386Sopenharmony_ci        if (state.fKind == NFAState::kRemapped_Kind) {
79cb93a386Sopenharmony_ci            for (int next : state.fData) {
80cb93a386Sopenharmony_ci                this->add(next, states);
81cb93a386Sopenharmony_ci            }
82cb93a386Sopenharmony_ci        } else {
83cb93a386Sopenharmony_ci            for (int entry : *states) {
84cb93a386Sopenharmony_ci                if (nfaState == entry) {
85cb93a386Sopenharmony_ci                    return;
86cb93a386Sopenharmony_ci                }
87cb93a386Sopenharmony_ci            }
88cb93a386Sopenharmony_ci            states->push_back(nfaState);
89cb93a386Sopenharmony_ci        }
90cb93a386Sopenharmony_ci    }
91cb93a386Sopenharmony_ci
92cb93a386Sopenharmony_ci    void addTransition(char c, int start, int next) {
93cb93a386Sopenharmony_ci        while (fTransitions.size() <= (size_t) c) {
94cb93a386Sopenharmony_ci            fTransitions.push_back(std::vector<int>());
95cb93a386Sopenharmony_ci        }
96cb93a386Sopenharmony_ci        std::vector<int>& row = fTransitions[c];
97cb93a386Sopenharmony_ci        while (row.size() <= (size_t) start) {
98cb93a386Sopenharmony_ci            row.push_back(INVALID);
99cb93a386Sopenharmony_ci        }
100cb93a386Sopenharmony_ci        row[start] = next;
101cb93a386Sopenharmony_ci    }
102cb93a386Sopenharmony_ci
103cb93a386Sopenharmony_ci    void scanState(DFAState* state) {
104cb93a386Sopenharmony_ci        state->fIsScanned = true;
105cb93a386Sopenharmony_ci        for (char c = START_CHAR; c <= END_CHAR; ++c) {
106cb93a386Sopenharmony_ci            std::vector<int> next;
107cb93a386Sopenharmony_ci            int bestAccept = INT_MAX;
108cb93a386Sopenharmony_ci            for (int idx : state->fLabel.fStates) {
109cb93a386Sopenharmony_ci                const NFAState& nfaState = fNFA.fStates[idx];
110cb93a386Sopenharmony_ci                if (nfaState.accept(c)) {
111cb93a386Sopenharmony_ci                    for (int nextState : nfaState.fNext) {
112cb93a386Sopenharmony_ci                        if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
113cb93a386Sopenharmony_ci                            bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
114cb93a386Sopenharmony_ci                        }
115cb93a386Sopenharmony_ci                        this->add(nextState, &next);
116cb93a386Sopenharmony_ci                    }
117cb93a386Sopenharmony_ci                }
118cb93a386Sopenharmony_ci            }
119cb93a386Sopenharmony_ci            std::sort(next.begin(), next.end());
120cb93a386Sopenharmony_ci            DFAState* nextState = this->getState(DFAState::Label(next));
121cb93a386Sopenharmony_ci            this->addTransition(c, state->fId, nextState->fId);
122cb93a386Sopenharmony_ci            if (bestAccept != INT_MAX) {
123cb93a386Sopenharmony_ci                while (fAccepts.size() <= (size_t) nextState->fId) {
124cb93a386Sopenharmony_ci                    fAccepts.push_back(INVALID);
125cb93a386Sopenharmony_ci                }
126cb93a386Sopenharmony_ci                fAccepts[nextState->fId] = bestAccept;
127cb93a386Sopenharmony_ci            }
128cb93a386Sopenharmony_ci            if (!nextState->fIsScanned) {
129cb93a386Sopenharmony_ci                this->scanState(nextState);
130cb93a386Sopenharmony_ci            }
131cb93a386Sopenharmony_ci        }
132cb93a386Sopenharmony_ci    }
133cb93a386Sopenharmony_ci
134cb93a386Sopenharmony_ci    // collapse rows with the same transitions to a single row. This is common, as each row
135cb93a386Sopenharmony_ci    // represents a character and often there are many characters for which all transitions are
136cb93a386Sopenharmony_ci    // identical (e.g. [0-9] are treated the same way by all lexer rules)
137cb93a386Sopenharmony_ci    void computeMappings() {
138cb93a386Sopenharmony_ci        // mappings[<input row>] = <output row>
139cb93a386Sopenharmony_ci        std::vector<std::vector<int>*> uniques;
140cb93a386Sopenharmony_ci        // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
141cb93a386Sopenharmony_ci        for (size_t i = 0; i < fTransitions.size(); ++i) {
142cb93a386Sopenharmony_ci            int found = -1;
143cb93a386Sopenharmony_ci            for (size_t j = 0; j < uniques.size(); ++j) {
144cb93a386Sopenharmony_ci                if (*uniques[j] == fTransitions[i]) {
145cb93a386Sopenharmony_ci                    found = j;
146cb93a386Sopenharmony_ci                    break;
147cb93a386Sopenharmony_ci                }
148cb93a386Sopenharmony_ci            }
149cb93a386Sopenharmony_ci            if (found == -1) {
150cb93a386Sopenharmony_ci                found = (int) uniques.size();
151cb93a386Sopenharmony_ci                uniques.push_back(&fTransitions[i]);
152cb93a386Sopenharmony_ci            }
153cb93a386Sopenharmony_ci            fCharMappings.push_back(found);
154cb93a386Sopenharmony_ci        }
155cb93a386Sopenharmony_ci        std::vector<std::vector<int>> newTransitions;
156cb93a386Sopenharmony_ci        for (std::vector<int>* row : uniques) {
157cb93a386Sopenharmony_ci            newTransitions.push_back(*row);
158cb93a386Sopenharmony_ci        }
159cb93a386Sopenharmony_ci        fTransitions = newTransitions;
160cb93a386Sopenharmony_ci    }
161cb93a386Sopenharmony_ci
162cb93a386Sopenharmony_ci    const NFA& fNFA;
163cb93a386Sopenharmony_ci    std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
164cb93a386Sopenharmony_ci    std::vector<std::vector<int>> fTransitions;
165cb93a386Sopenharmony_ci    std::vector<int> fCharMappings;
166cb93a386Sopenharmony_ci    std::vector<int> fAccepts;
167cb93a386Sopenharmony_ci};
168cb93a386Sopenharmony_ci#endif  // NFAtoDFA_DEFINED
169