import * as assert from 'node:assert';
import * as fs from 'node:fs';
import * as path from 'node:path';
import type { ParserError, Token } from 'parse5';
import { type Tokenizer, TokenizerMode, type TokenHandler } from 'parse5';
import { makeChunks } from './common.js';

/**
 * A token in the tuple format used by the expected output of the test data.
 *
 * - element 0 is the token kind; this file produces `'Comment'`, `'DOCTYPE'`,
 *   `'StartTag'`, `'EndTag'` and `'Character'`;
 * - element 1 is the main string payload of the token (or `null`);
 * - any further elements are kind-specific extras.
 */
export type HtmlLibToken = [string, string | null, ...unknown[]];

/** A parse error, identified by its code and the position at which it was reported. */
interface TokenError {
    /** Error code. */
    code: string;
    /** Line at which the error starts. */
    line: number;
    /** Column at which the error starts. */
    col: number;
}

/** Everything collected from one tokenizer run: the tokens and the parse errors. */
interface TokenSourceData {
    /** Tokens in the test-data tuple format. */
    tokens: HtmlLibToken[];
    /** Parse errors reported during the run. */
    errors: TokenError[];
}

/**
 * Factory for the tokenizer under test. It receives the handler that collects
 * and checks the output, and returns the tokenizer that `tokenize` will feed
 * with input (presumably wired to report its events to that handler).
 */
type TokenSourceCreator = (data: TokenizeHandler) => Tokenizer;

/**
 * Receives tokenizer events and immediately compares them against the expected
 * values of the test. The caller checks the entire output again at the end.
 */
class TokenizeHandler implements TokenSourceData, TokenHandler {
    /** Becomes `true` once `onEof` has been called. */
    public sawEof = false;
    /** Tokens received so far, converted to the test-data tuple format. */
    public tokens: HtmlLibToken[] = [];
    /** Parse errors received so far, in the order they were reported. */
    public errors: TokenError[] = [];

    /**
     * @param testData Test whose `expected` tokens and `expectedErrors` the
     *   incoming events are compared against.
     */
    constructor(private testData: LoadedTest) {}

    /** Asserts that `token` equals the expected token at the same position, then records it. */
    private addToken(token: HtmlLibToken): void {
        assert.deepStrictEqual(token, this.testData.expected[this.tokens.length]);

        this.tokens.push(token);
    }

    onComment(token: Token.CommentToken): void {
        this.addToken(['Comment', token.data]);
    }

    onDoctype(token: Token.DoctypeToken): void {
        // The last element is the negation of `forceQuirks`.
        this.addToken(['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]);
    }

    onStartTag(token: Token.TagToken): void {
        // The expected format stores attributes as a `{ name: value }` object.
        const attrsByName = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value]));
        const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, attrsByName];

        // The self-closing flag is only appended when it is set.
        if (token.selfClosing) {
            startTagEntry.push(true);
        }

        this.addToken(startTagEntry);
    }

    onEndTag(token: Token.TagToken): void {
        // NOTE: parser feedback simulator can produce adjusted SVG
        // tag names for end tag tokens so we need to lower case it
        this.addToken(['EndTag', token.tagName.toLowerCase()]);
    }

    onEof(): void {
        this.sawEof = true;
    }

    /**
     * Merges consecutive character tokens into a single `'Character'` entry and
     * asserts that the text accumulated so far is a prefix of the expected text.
     */
    onCharacter(token: Token.CharacterToken): void {
        const lastEntry = this.tokens[this.tokens.length - 1];

        if (lastEntry && lastEntry[0] === 'Character' && lastEntry[1] != null) {
            lastEntry[1] += token.chars;
        } else {
            this.tokens.push(['Character', token.chars]);
        }

        const index = this.tokens.length - 1;
        const actual = this.tokens[index];
        const expected = this.testData.expected[index];

        // Fail with a proper assertion (instead of a TypeError on `expected[0]`)
        // when more tokens are emitted than the test expects.
        assert.ok(expected !== undefined, `Unexpected Character token at index ${index}: no further tokens are expected`);
        assert.strictEqual('Character', expected[0]);
        assert.ok(typeof actual[1] === 'string');
        assert.ok(
            expected[1]?.startsWith(actual[1]),
            `Character data ${JSON.stringify(actual[1])} is not a prefix of expected ${JSON.stringify(expected[1])}`
        );
    }

    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /** Asserts that the error is one of the expected errors (same code and position), then records it. */
    onParseError(err: ParserError): void {
        assert.ok(
            this.testData.expectedErrors.some(
                ({ code, line, col }) => code === err.code && line === err.startLine && col === err.startCol
            ),
            `Unexpected parse error "${err.code}" at ${err.startLine}:${err.startCol}`
        );

        this.errors.push({
            code: err.code,
            line: err.startLine,
            col: err.startCol,
        });
    }
}

/**
 * Runs the tokenizer under test over `chunks` and returns what it produced.
 *
 * Tokens and errors are checked against `testData` while they arrive (by the
 * handler); this function additionally asserts that EOF is reported only by the
 * final write and that the tokenizer is no longer active afterwards.
 *
 * @param createTokenSource Factory that creates the tokenizer for the given handler.
 * @param chunks Input split into pieces; the last piece is written with the "last chunk" flag set.
 * @param testData Test providing the initial state, optional last start tag and expected output.
 * @returns The handler holding the collected tokens and the errors sorted by line, then column.
 */
function tokenize(createTokenSource: TokenSourceCreator, chunks: string[], testData: LoadedTest): TokenSourceData {
    const result = new TokenizeHandler(testData);
    const tokenizer = createTokenSource(result);

    // NOTE: set small waterline for testing purposes
    tokenizer.preprocessor.bufferWaterline = 8;
    tokenizer.state = testData.initialState;

    if (testData.lastStartTag) {
        tokenizer.lastStartTagName = testData.lastStartTag;
    }

    const lastChunkIdx = chunks.length - 1;

    for (const [idx, chunk] of chunks.entries()) {
        assert.ok(!result.sawEof, 'EOF was reported before the last chunk was written');
        tokenizer.write(chunk, idx === lastChunkIdx);
    }

    assert.ok(result.sawEof, 'EOF was not reported after the last chunk was written');
    assert.ok(!tokenizer.active, 'Tokenizer is still active after the last chunk was written');

    // Sort errors by line and column
    result.errors.sort((err1, err2) => err1.line - err2.line || err1.col - err2.col);

    return result;
}

/**
 * Replaces every `\uXXXX` / `\UXXXX` escape (backslash, `u` or `U`, exactly four
 * hexadecimal digits) in `str` with the UTF-16 code unit it denotes.
 *
 * Sequences whose four characters are not all hexadecimal digits are left
 * untouched rather than being turned into a NUL or a partially parsed code unit.
 *
 * @param str Text possibly containing escaped code units.
 * @returns The text with all well-formed escapes decoded.
 */
function unicodeUnescape(str: string): string {
    return str.replace(/\\[Uu]([\dA-Fa-f]{4})/g, (_match: string, hex: string) =>
        String.fromCharCode(Number.parseInt(hex, 16))
    );
}

/**
 * Decodes the unicode escapes of a test description in place: its `input` and
 * the second element of every expected token.
 *
 * @param testDescr Description to modify.
 */
function unescapeDescrIO(testDescr: TestDescription): void {
    testDescr.input = unicodeUnescape(testDescr.input);

    for (const tokenEntry of testDescr.output) {
        // NOTE: unescape token tagName (for StartTag and EndTag tokens), comment data (for Comment token),
        // character token data (for Character token).
        // Only element 1 is handled; `null` and empty payloads are skipped, and
        // any strings stored in later elements of the entry are left as they are.
        const payload = tokenEntry[1];

        if (payload) {
            tokenEntry[1] = unicodeUnescape(payload);
        }
    }
}

/**
 * Maps a state name as written in the test data (e.g. `'Script data state'`)
 * to the corresponding `TokenizerMode` value (e.g. `TokenizerMode.SCRIPT_DATA`).
 *
 * The trailing ` state` is dropped, every space becomes an underscore and the
 * result is upper-cased to form the `TokenizerMode` key.
 *
 * @param testDataStateName State name from the test data.
 * @returns The tokenizer state to start in.
 * @throws Error if the name does not correspond to any `TokenizerMode` key.
 */
function getTokenizerSuitableStateName(testDataStateName: string): Tokenizer['state'] {
    const name = testDataStateName
        .replace(/ state$/, '')
        .replace(/ /g, '_')
        .toUpperCase();
    const state: Tokenizer['state'] | undefined = TokenizerMode[name as keyof typeof TokenizerMode];

    // Compare against `undefined` explicitly: a valid state may be a falsy number.
    if (state === undefined) {
        throw new Error(`Unknown tokenizer initial state: "${testDataStateName}"`);
    }

    return state;
}

/** One test entry as read from the `tests` array of a test-set JSON file. */
interface TestDescription {
    /**
     * Names of the tokenizer states to start the test in; one test is generated per name.
     * NOTE(review): typed as required, but the loader substitutes `['Data state']` when it is missing.
     */
    initialStates: string[];
    /** When truthy, unicode escapes in `input` and `output` are decoded while loading. */
    doubleEscaped?: boolean;
    /** Expected tokens. */
    output: HtmlLibToken[];
    /** Human-readable description; used as the test name. */
    description: string;
    /** Text to tokenize. */
    input: string;
    /**
     * Value for the tokenizer's `lastStartTagName`; only applied when truthy.
     * NOTE(review): typed as required, but consumers treat it as possibly absent.
     */
    lastStartTag: string;
    /** Expected parse errors; treated as an empty list when absent. */
    errors?: TokenError[];
}

/** A single runnable test: one test description combined with one initial state. */
interface LoadedTest {
    /** Running number of the test across all loaded test sets, starting at 1. */
    idx: number;
    /** Name of the test-set file the test came from, without the `.test` part. */
    setName: string;
    /** Description of the test. */
    name: string;
    /** Text to tokenize. */
    input: string;
    /** Expected tokens. */
    expected: HtmlLibToken[];
    /** Tokenizer state to start in. */
    initialState: Tokenizer['state'];
    /** The initial state as named in the test data; used in the test title. */
    initialStateName: string;
    /** Value for the tokenizer's `lastStartTagName`; only applied when truthy. */
    lastStartTag: string;
    /** Expected parse errors (empty when the test data lists none). */
    expectedErrors: TokenError[];
}

/**
 * Loads every `*.test` file of a directory and expands its test descriptions
 * into runnable tests, one per (description, initial state) pair.
 *
 * Files with another extension, and files whose JSON has no `tests` entry, are
 * skipped. Descriptions are normalised in place: a missing `initialStates`
 * defaults to `['Data state']`, and `doubleEscaped` descriptions are unescaped.
 *
 * @param dataDirPath Directory containing the test-set files.
 * @returns The loaded tests, numbered from 1 in loading order.
 */
function loadTests(dataDirPath: string): LoadedTest[] {
    const tests: LoadedTest[] = [];
    let testIdx = 0;

    for (const fileName of fs.readdirSync(dataDirPath)) {
        if (path.extname(fileName) !== '.test') {
            continue;
        }

        const testSetJson = fs.readFileSync(path.join(dataDirPath, fileName), 'utf8');
        const testSet = JSON.parse(testSetJson);
        const testDescrs: TestDescription[] | undefined = testSet.tests;

        if (!testDescrs) {
            continue;
        }

        // Strip only the extension; a plain `replace('.test', '')` would remove the
        // first `.test` occurrence anywhere in the name (e.g. inside `my.tests.test`).
        const setName = path.basename(fileName, '.test');

        for (const descr of testDescrs) {
            if (!descr.initialStates) {
                descr.initialStates = ['Data state'];
            }

            if (descr.doubleEscaped) {
                unescapeDescrIO(descr);
            }

            // The same expected-token array is shared by all initial states of a description.
            const expected = descr.output;

            for (const initialStateName of descr.initialStates) {
                tests.push({
                    idx: ++testIdx,
                    setName,
                    name: descr.description,
                    input: descr.input,
                    expected,
                    initialState: getTokenizerSuitableStateName(initialStateName),
                    initialStateName,
                    lastStartTag: descr.lastStartTag,
                    expectedErrors: descr.errors ?? [],
                });
            }
        }
    }

    return tests;
}

/**
 * Registers one test case (via the test runner's global `it`) for every test
 * loaded from `testSuite`.
 *
 * Each case splits the input into chunks, runs the tokenizer over them and
 * compares the complete token and error lists with the expected ones.
 *
 * @param prefix Text put in front of every generated test name.
 * @param testSuite Path of the directory containing the `*.test` files.
 * @param createTokenSource Factory that creates the tokenizer under test.
 */
export function generateTokenizationTests(
    prefix: string,
    testSuite: string,
    createTokenSource: TokenSourceCreator
): void {
    for (const testData of loadTests(testSuite)) {
        const testName = `${prefix} - ${testData.idx}.${testData.setName} - ${testData.name} - Initial state: ${testData.initialStateName}`;

        it(testName, () => {
            const chunks = makeChunks(testData.input);
            const result = tokenize(createTokenSource, chunks, testData);

            // Strict comparison: the legacy `deepEqual` compares leaves with `==`
            // and would accept e.g. `1` for `true` or `undefined` for `null`.
            assert.deepStrictEqual(result.tokens, testData.expected, `Chunks: ${JSON.stringify(chunks)}`);
            assert.deepStrictEqual(result.errors, testData.expectedErrors);
        });
    }
}