1cb93a386Sopenharmony_ci/*
2cb93a386Sopenharmony_ci * Copyright 2017 Google Inc.
3cb93a386Sopenharmony_ci *
4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be
5cb93a386Sopenharmony_ci * found in the LICENSE file.
6cb93a386Sopenharmony_ci */
7cb93a386Sopenharmony_ci
8cb93a386Sopenharmony_ci#include "src/sksl/lex/NFAtoDFA.h"
9cb93a386Sopenharmony_ci#include "src/sksl/lex/RegexParser.h"
10cb93a386Sopenharmony_ci#include "src/sksl/lex/TransitionTable.h"
11cb93a386Sopenharmony_ci
12cb93a386Sopenharmony_ci#include <fstream>
13cb93a386Sopenharmony_ci#include <sstream>
14cb93a386Sopenharmony_ci#include <string>
15cb93a386Sopenharmony_ci
16cb93a386Sopenharmony_ci/**
17cb93a386Sopenharmony_ci * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The .lex
18cb93a386Sopenharmony_ci * file is a text file with one token definition per line. Each line is of the form:
19cb93a386Sopenharmony_ci * <TOKEN_NAME> = <pattern>
 * where <pattern> is either a regular expression (e.g. [0-9]) or a double-quoted literal string.
21cb93a386Sopenharmony_ci */
22cb93a386Sopenharmony_ci
23cb93a386Sopenharmony_cistatic constexpr const char* HEADER =
24cb93a386Sopenharmony_ci    "/*\n"
25cb93a386Sopenharmony_ci    " * Copyright 2017 Google Inc.\n"
26cb93a386Sopenharmony_ci    " *\n"
27cb93a386Sopenharmony_ci    " * Use of this source code is governed by a BSD-style license that can be\n"
28cb93a386Sopenharmony_ci    " * found in the LICENSE file.\n"
29cb93a386Sopenharmony_ci    " */\n"
30cb93a386Sopenharmony_ci    "/*****************************************************************************************\n"
31cb93a386Sopenharmony_ci    " ******************** This file was generated by sksllex. Do not edit. *******************\n"
32cb93a386Sopenharmony_ci    " *****************************************************************************************/\n";
33cb93a386Sopenharmony_ci
34cb93a386Sopenharmony_cistatic void writeH(const DFA& dfa, const char* lexer, const char* token,
35cb93a386Sopenharmony_ci                   const std::vector<std::string>& tokens, const char* hPath) {
36cb93a386Sopenharmony_ci    std::ofstream out(hPath);
37cb93a386Sopenharmony_ci    SkASSERT(out.good());
38cb93a386Sopenharmony_ci    out << HEADER;
39cb93a386Sopenharmony_ci    out << "#ifndef SKSL_" << lexer << "\n";
40cb93a386Sopenharmony_ci    out << "#define SKSL_" << lexer << "\n";
41cb93a386Sopenharmony_ci    out << "#include \"include/core/SkStringView.h\"\n";
42cb93a386Sopenharmony_ci    out << "#include <cstddef>\n";
43cb93a386Sopenharmony_ci    out << "#include <cstdint>\n";
44cb93a386Sopenharmony_ci    out << "namespace SkSL {\n";
45cb93a386Sopenharmony_ci    out << "\n";
46cb93a386Sopenharmony_ci    out << "struct " << token << " {\n";
47cb93a386Sopenharmony_ci    out << "    enum class Kind {\n";
48cb93a386Sopenharmony_ci    for (const std::string& t : tokens) {
49cb93a386Sopenharmony_ci        out << "        TK_" << t << ",\n";
50cb93a386Sopenharmony_ci    }
51cb93a386Sopenharmony_ci    out << "        TK_NONE,";
52cb93a386Sopenharmony_ci    out << R"(
53cb93a386Sopenharmony_ci    };
54cb93a386Sopenharmony_ci
55cb93a386Sopenharmony_ci    )" << token << "() {}";
56cb93a386Sopenharmony_ci
57cb93a386Sopenharmony_ci    out << token << R"((Kind kind, int32_t offset, int32_t length, int32_t line)
58cb93a386Sopenharmony_ci    : fKind(kind)
59cb93a386Sopenharmony_ci    , fOffset(offset)
60cb93a386Sopenharmony_ci    , fLength(length)
61cb93a386Sopenharmony_ci    , fLine(line) {}
62cb93a386Sopenharmony_ci
63cb93a386Sopenharmony_ci    Kind fKind      = Kind::TK_NONE;
64cb93a386Sopenharmony_ci    int32_t fOffset = -1;
65cb93a386Sopenharmony_ci    int32_t fLength = -1;
66cb93a386Sopenharmony_ci    int32_t fLine   = -1;
67cb93a386Sopenharmony_ci};
68cb93a386Sopenharmony_ci
69cb93a386Sopenharmony_ciclass )" << lexer << R"( {
70cb93a386Sopenharmony_cipublic:
71cb93a386Sopenharmony_ci    void start(skstd::string_view text) {
72cb93a386Sopenharmony_ci        fText = text;
73cb93a386Sopenharmony_ci        fOffset = 0;
74cb93a386Sopenharmony_ci        fLine = 1;
75cb93a386Sopenharmony_ci    }
76cb93a386Sopenharmony_ci
77cb93a386Sopenharmony_ci    )" << token << R"( next();
78cb93a386Sopenharmony_ci
79cb93a386Sopenharmony_ci    struct Checkpoint {
80cb93a386Sopenharmony_ci        int32_t fOffset;
81cb93a386Sopenharmony_ci        int32_t fLine;
82cb93a386Sopenharmony_ci    };
83cb93a386Sopenharmony_ci
84cb93a386Sopenharmony_ci    Checkpoint getCheckpoint() const {
85cb93a386Sopenharmony_ci        return {fOffset, fLine};
86cb93a386Sopenharmony_ci    }
87cb93a386Sopenharmony_ci
88cb93a386Sopenharmony_ci    void rewindToCheckpoint(Checkpoint checkpoint) {
89cb93a386Sopenharmony_ci        fOffset = checkpoint.fOffset;
90cb93a386Sopenharmony_ci        fLine = checkpoint.fLine;
91cb93a386Sopenharmony_ci    }
92cb93a386Sopenharmony_ci
93cb93a386Sopenharmony_ciprivate:
94cb93a386Sopenharmony_ci    skstd::string_view fText;
95cb93a386Sopenharmony_ci    int32_t fOffset;
96cb93a386Sopenharmony_ci    int32_t fLine;
97cb93a386Sopenharmony_ci};
98cb93a386Sopenharmony_ci
99cb93a386Sopenharmony_ci} // namespace
100cb93a386Sopenharmony_ci#endif
101cb93a386Sopenharmony_ci)";
102cb93a386Sopenharmony_ci}
103cb93a386Sopenharmony_ci
104cb93a386Sopenharmony_cistatic void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include,
105cb93a386Sopenharmony_ci                     const char* cppPath) {
106cb93a386Sopenharmony_ci    std::ofstream out(cppPath);
107cb93a386Sopenharmony_ci    SkASSERT(out.good());
108cb93a386Sopenharmony_ci    out << HEADER;
109cb93a386Sopenharmony_ci    out << "#include \"" << include << "\"\n";
110cb93a386Sopenharmony_ci    out << "\n";
111cb93a386Sopenharmony_ci    out << "namespace SkSL {\n";
112cb93a386Sopenharmony_ci    out << "\n";
113cb93a386Sopenharmony_ci
114cb93a386Sopenharmony_ci    size_t states = 0;
115cb93a386Sopenharmony_ci    for (const auto& row : dfa.fTransitions) {
116cb93a386Sopenharmony_ci        states = std::max(states, row.size());
117cb93a386Sopenharmony_ci    }
118cb93a386Sopenharmony_ci    out << "using State = " << (states <= 256 ? "uint8_t" : "uint16_t") << ";\n";
119cb93a386Sopenharmony_ci    // arbitrarily-chosen character which is greater than START_CHAR and should not appear in actual
120cb93a386Sopenharmony_ci    // input
121cb93a386Sopenharmony_ci    out << "static const uint8_t INVALID_CHAR = 18;";
122cb93a386Sopenharmony_ci    out << "static const int8_t kMappings[" << dfa.fCharMappings.size() << "] = {\n    ";
123cb93a386Sopenharmony_ci    const char* separator = "";
124cb93a386Sopenharmony_ci    for (int m : dfa.fCharMappings) {
125cb93a386Sopenharmony_ci        out << separator << std::to_string(m);
126cb93a386Sopenharmony_ci        separator = ", ";
127cb93a386Sopenharmony_ci    }
128cb93a386Sopenharmony_ci    out << "\n};\n";
129cb93a386Sopenharmony_ci
130cb93a386Sopenharmony_ci    WriteTransitionTable(out, dfa, states);
131cb93a386Sopenharmony_ci
132cb93a386Sopenharmony_ci    out << "static const int8_t kAccepts[" << states << "] = {";
133cb93a386Sopenharmony_ci    for (size_t i = 0; i < states; ++i) {
134cb93a386Sopenharmony_ci        if (i < dfa.fAccepts.size()) {
135cb93a386Sopenharmony_ci            out << " " << dfa.fAccepts[i] << ",";
136cb93a386Sopenharmony_ci        } else {
137cb93a386Sopenharmony_ci            out << " " << INVALID << ",";
138cb93a386Sopenharmony_ci        }
139cb93a386Sopenharmony_ci    }
140cb93a386Sopenharmony_ci    out << " };\n";
141cb93a386Sopenharmony_ci    out << "\n";
142cb93a386Sopenharmony_ci
143cb93a386Sopenharmony_ci    out << token << " " << lexer << "::next() {";
144cb93a386Sopenharmony_ci    out << R"(
145cb93a386Sopenharmony_ci    // note that we cheat here: normally a lexer needs to worry about the case
146cb93a386Sopenharmony_ci    // where a token has a prefix which is not itself a valid token - for instance,
147cb93a386Sopenharmony_ci    // maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid
148cb93a386Sopenharmony_ci    // tokens. Our grammar doesn't have this property, so we can simplify the logic
149cb93a386Sopenharmony_ci    // a bit.
150cb93a386Sopenharmony_ci    int32_t startOffset = fOffset;
151cb93a386Sopenharmony_ci    if (startOffset == (int32_t)fText.length()) {
152cb93a386Sopenharmony_ci        return )" << token << "(" << token << R"(::Kind::TK_END_OF_FILE, startOffset, 0, fLine);
153cb93a386Sopenharmony_ci    }
154cb93a386Sopenharmony_ci    State state = 1;
155cb93a386Sopenharmony_ci    for (;;) {
156cb93a386Sopenharmony_ci        if (fOffset >= (int32_t)fText.length()) {
157cb93a386Sopenharmony_ci            if (kAccepts[state] == -1) {
158cb93a386Sopenharmony_ci                return Token(Token::Kind::TK_END_OF_FILE, startOffset, 0, fLine);
159cb93a386Sopenharmony_ci            }
160cb93a386Sopenharmony_ci            break;
161cb93a386Sopenharmony_ci        }
162cb93a386Sopenharmony_ci        uint8_t c = (uint8_t) fText[fOffset];
163cb93a386Sopenharmony_ci        if (c <= 8 || c >= )" << dfa.fCharMappings.size() << R"() {
164cb93a386Sopenharmony_ci            c = INVALID_CHAR;
165cb93a386Sopenharmony_ci        }
166cb93a386Sopenharmony_ci        State newState = get_transition(kMappings[c], state);
167cb93a386Sopenharmony_ci        if (!newState) {
168cb93a386Sopenharmony_ci            break;
169cb93a386Sopenharmony_ci        }
170cb93a386Sopenharmony_ci        state = newState;
171cb93a386Sopenharmony_ci        ++fOffset;
172cb93a386Sopenharmony_ci        if (c == '\n') {
173cb93a386Sopenharmony_ci            ++fLine;
174cb93a386Sopenharmony_ci        }
175cb93a386Sopenharmony_ci    }
176cb93a386Sopenharmony_ci    Token::Kind kind = ()" << token << R"(::Kind) kAccepts[state];
177cb93a386Sopenharmony_ci    return )" << token << R"((kind, startOffset, fOffset - startOffset, fLine);
178cb93a386Sopenharmony_ci}
179cb93a386Sopenharmony_ci
180cb93a386Sopenharmony_ci} // namespace
181cb93a386Sopenharmony_ci)";
182cb93a386Sopenharmony_ci}
183cb93a386Sopenharmony_ci
184cb93a386Sopenharmony_cistatic void process(const char* inPath, const char* lexer, const char* token, const char* hPath,
185cb93a386Sopenharmony_ci                    const char* cppPath) {
186cb93a386Sopenharmony_ci    NFA nfa;
187cb93a386Sopenharmony_ci    std::vector<std::string> tokens;
188cb93a386Sopenharmony_ci    tokens.push_back("END_OF_FILE");
189cb93a386Sopenharmony_ci    std::string line;
190cb93a386Sopenharmony_ci    std::ifstream in(inPath);
191cb93a386Sopenharmony_ci    while (std::getline(in, line)) {
192cb93a386Sopenharmony_ci        if (line.length() == 0) {
193cb93a386Sopenharmony_ci            continue;
194cb93a386Sopenharmony_ci        }
195cb93a386Sopenharmony_ci        if (line.length() >= 2 && line[0] == '/' && line[1] == '/') {
196cb93a386Sopenharmony_ci            continue;
197cb93a386Sopenharmony_ci        }
198cb93a386Sopenharmony_ci        std::istringstream split(line);
199cb93a386Sopenharmony_ci        std::string name, delimiter, pattern;
200cb93a386Sopenharmony_ci        if (split >> name >> delimiter >> pattern) {
201cb93a386Sopenharmony_ci            SkASSERT(split.eof());
202cb93a386Sopenharmony_ci            SkASSERT(name != "");
203cb93a386Sopenharmony_ci            SkASSERT(delimiter == "=");
204cb93a386Sopenharmony_ci            SkASSERT(pattern != "");
205cb93a386Sopenharmony_ci            tokens.push_back(name);
206cb93a386Sopenharmony_ci            if (pattern[0] == '"') {
207cb93a386Sopenharmony_ci                SkASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"');
208cb93a386Sopenharmony_ci                RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]);
209cb93a386Sopenharmony_ci                for (size_t i = 2; i < pattern.size() - 1; ++i) {
210cb93a386Sopenharmony_ci                    node = RegexNode(RegexNode::kConcat_Kind, node,
211cb93a386Sopenharmony_ci                                     RegexNode(RegexNode::kChar_Kind, pattern[i]));
212cb93a386Sopenharmony_ci                }
213cb93a386Sopenharmony_ci                nfa.addRegex(node);
214cb93a386Sopenharmony_ci            }
215cb93a386Sopenharmony_ci            else {
216cb93a386Sopenharmony_ci                nfa.addRegex(RegexParser().parse(pattern));
217cb93a386Sopenharmony_ci            }
218cb93a386Sopenharmony_ci        }
219cb93a386Sopenharmony_ci    }
220cb93a386Sopenharmony_ci    NFAtoDFA converter(&nfa);
221cb93a386Sopenharmony_ci    DFA dfa = converter.convert();
222cb93a386Sopenharmony_ci    writeH(dfa, lexer, token, tokens, hPath);
223cb93a386Sopenharmony_ci    writeCPP(dfa, lexer, token, (std::string("src/sksl/SkSL") + lexer + ".h").c_str(), cppPath);
224cb93a386Sopenharmony_ci}
225cb93a386Sopenharmony_ci
226cb93a386Sopenharmony_ciint main(int argc, const char** argv) {
227cb93a386Sopenharmony_ci    if (argc != 6) {
228cb93a386Sopenharmony_ci        printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n");
229cb93a386Sopenharmony_ci        exit(1);
230cb93a386Sopenharmony_ci    }
231cb93a386Sopenharmony_ci    process(argv[1], argv[2], argv[3], argv[4], argv[5]);
232cb93a386Sopenharmony_ci    return 0;
233cb93a386Sopenharmony_ci}
234