import * as assert from 'node:assert';
import * as fs from 'node:fs';
import * as path from 'node:path';
import type { ParserError, Token } from 'parse5';
import { type Tokenizer, TokenizerMode, type TokenHandler } from 'parse5';
import { makeChunks } from './common.js';

/**
 * A token in the test-data output format: a tuple whose first element is the
 * token kind (`'Comment'`, `'DOCTYPE'`, `'StartTag'`, `'EndTag'`, `'Character'`),
 * whose second element is the token's name or data, followed by any
 * kind-specific extras (doctype ids, attribute map, self-closing flag).
 */
export type HtmlLibToken = [string, string | null, ...unknown[]];

/** A parse error reduced to the fields the test data compares. */
interface TokenError {
    code: string;
    line: number;
    col: number;
}

/** Everything collected from one tokenizer run. */
interface TokenSourceData {
    tokens: HtmlLibToken[];
    errors: TokenError[];
}

/** Factory that wires a handler into a freshly created tokenizer. */
type TokenSourceCreator = (data: TokenizeHandler) => Tokenizer;

/**
 * Receives tokenizer events and immediately compares them against the expected
 * values, so a failure points at the first diverging token. The collected
 * `tokens` and `errors` are kept so the caller can check the entire output
 * again at the end.
 */
class TokenizeHandler implements TokenSourceData, TokenHandler {
    /** Set once the tokenizer reports end of input. */
    public sawEof = false;
    /** Tokens received so far, with adjacent character tokens merged. */
    public tokens: HtmlLibToken[] = [];
    /** Parse errors received so far, in emission order. */
    public errors: TokenError[] = [];

    constructor(private testData: LoadedTest) {}

    /** Checks `token` against the expected token at the same position, then records it. */
    private addToken(token: HtmlLibToken): void {
        assert.deepStrictEqual(token, this.testData.expected[this.tokens.length]);

        this.tokens.push(token);
    }

    onComment(token: Token.CommentToken): void {
        this.addToken(['Comment', token.data]);
    }

    onDoctype(token: Token.DoctypeToken): void {
        // The last element is the inverse of the tokenizer's `forceQuirks` flag.
        this.addToken(['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]);
    }

    onStartTag(token: Token.TagToken): void {
        // The expected output stores attributes as a name -> value map.
        const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value]));
        const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs];

        // The self-closing flag is only appended when it is set.
        if (token.selfClosing) {
            startTagEntry.push(true);
        }

        this.addToken(startTagEntry);
    }

    onEndTag(token: Token.TagToken): void {
        // NOTE: parser feedback simulator can produce adjusted SVG
        // tag names for end tag tokens so we need to lower case it
        this.addToken(['EndTag', token.tagName.toLowerCase()]);
    }

    onEof(): void {
        this.sawEof = true;
    }

    /**
     * Merges consecutive character events into a single `'Character'` entry and
     * checks that the text accumulated so far is a prefix of the expected text.
     */
    onCharacter(token: Token.CharacterToken): void {
        const lastEntry = this.tokens[this.tokens.length - 1];

        if (lastEntry && lastEntry[0] === 'Character' && lastEntry[1] != null) {
            lastEntry[1] += token.chars;
        } else {
            this.tokens.push(['Character', token.chars]);
        }

        const index = this.tokens.length - 1;
        const actual = this.tokens[index];
        const expected = this.testData.expected[index];

        // Fail with an assertion (not a TypeError) when the tokenizer emits
        // more tokens than the test data lists.
        assert.ok(expected, `Unexpected Character token at index ${index}: no expected token left`);
        assert.strictEqual('Character', expected[0]);
        assert.ok(typeof actual[1] === 'string');
        // Only a prefix check is possible here: later chunks may still append text.
        assert.ok(expected[1]?.startsWith(actual[1]));
    }

    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /** Verifies the error is one of the expected ones (matched by code and start position), then records it. */
    onParseError(err: ParserError): void {
        assert.ok(
            this.testData.expectedErrors.some(
                ({ code, line, col }) => code === err.code && line === err.startLine && col === err.startCol
            ),
            `Unexpected parse error "${err.code}" at ${err.startLine}:${err.startCol}`
        );

        this.errors.push({
            code: err.code,
            line: err.startLine,
            col: err.startCol,
        });
    }
}

/**
 * Feeds `chunks` through a tokenizer built by `createTokenSource` and returns
 * the handler holding the collected tokens and errors.
 *
 * The handler asserts on every event as it arrives, so this throws as soon as
 * the output diverges from `testData`.
 */
function tokenize(createTokenSource: TokenSourceCreator, chunks: string[], testData: LoadedTest): TokenSourceData {
    const result = new TokenizeHandler(testData);
    const tokenizer = createTokenSource(result);

    // NOTE: set small waterline for testing purposes
    tokenizer.preprocessor.bufferWaterline = 8;
    tokenizer.state = testData.initialState;

    if (testData.lastStartTag) {
        tokenizer.lastStartTagName = testData.lastStartTag;
    }

    const lastChunkIdx = chunks.length - 1;

    for (const [i, chunk] of chunks.entries()) {
        // EOF must not be reported before the final chunk has been written.
        assert.ok(!result.sawEof);
        tokenizer.write(chunk, i === lastChunkIdx);
    }

    assert.ok(result.sawEof);
    assert.ok(!tokenizer.active);

    // Sort errors by line and column
    result.errors.sort((err1, err2) => err1.line - err2.line || err1.col - err2.col);

    return result;
}

/**
 * Replaces every literal `\uXXXX` / `\UXXXX` sequence (backslash, `u`, four hex
 * digits) in `str` with the UTF-16 code unit it denotes. Sequences whose four
 * characters are not all hex digits are left untouched.
 */
function unicodeUnescape(str: string): string {
    return str.replace(/\\[Uu][\dA-Fa-f]{4}/g, (match: string) =>
        String.fromCharCode(Number.parseInt(match.slice(2), 16))
    );
}

/** Unescapes, in place, the input and the second element of every output token of `testDescr`. */
function unescapeDescrIO(testDescr: TestDescription): void {
    testDescr.input = unicodeUnescape(testDescr.input);

    for (const tokenEntry of testDescr.output) {
        // NOTE: unescape token tagName (for StartTag and EndTag tokens), comment data (for Comment token),
        // character token data (for Character token).
        if (tokenEntry[1]) {
            tokenEntry[1] = unicodeUnescape(tokenEntry[1]);
        }
    }
}

/**
 * Maps a state name as written in the test data (e.g. `'Script data state'`)
 * to the matching `TokenizerMode` value (`SCRIPT_DATA`).
 *
 * @throws Error if the derived name is not a `TokenizerMode` key.
 */
function getTokenizerSuitableStateName(testDataStateName: string): Tokenizer['state'] {
    const name = testDataStateName
        .replace(/ state$/i, '')
        .replace(/ /g, '_')
        .toUpperCase();

    // Use `in` rather than a truthiness test: a valid mode value may be 0.
    if (!(name in TokenizerMode)) {
        throw new Error(`Unknown tokenizer initial state: "${testDataStateName}"`);
    }

    return TokenizerMode[name as keyof typeof TokenizerMode];
}

/** One test entry as stored in a `.test` JSON file. */
interface TestDescription {
    /** Initial states to run the test in; defaults to `['Data state']` when absent. */
    initialStates?: string[];
    /** When set, `input` and `output` contain `\uXXXX` escapes that must be unescaped first. */
    doubleEscaped?: boolean;
    output: HtmlLibToken[];
    description: string;
    input: string;
    lastStartTag?: string;
    errors?: TokenError[];
}

/** A test description expanded for a single initial state. */
interface LoadedTest {
    idx: number;
    setName: string;
    name: string;
    input: string;
    expected: HtmlLibToken[];
    initialState: Tokenizer['state'];
    initialStateName: string;
    lastStartTag?: string;
    expectedErrors: TokenError[];
}

/**
 * Reads every `*.test` JSON file in `dataDirPath` and returns one `LoadedTest`
 * per (test description, initial state) pair. Files without a `tests` array
 * are skipped.
 */
function loadTests(dataDirPath: string): LoadedTest[] {
    const tests: LoadedTest[] = [];
    let testIdx = 0;

    for (const fileName of fs.readdirSync(dataDirPath)) {
        if (path.extname(fileName) !== '.test') {
            continue;
        }

        const filePath = path.join(dataDirPath, fileName);
        const testSet = JSON.parse(fs.readFileSync(filePath, 'utf8')) as { tests?: TestDescription[] };
        const testDescrs = testSet.tests;

        if (!testDescrs) {
            continue;
        }

        // Strip only the trailing extension, not the first '.test' found anywhere in the name.
        const setName = path.basename(fileName, '.test');

        for (const descr of testDescrs) {
            const initialStates = descr.initialStates ?? ['Data state'];

            if (descr.doubleEscaped) {
                unescapeDescrIO(descr);
            }

            for (const initialStateName of initialStates) {
                tests.push({
                    idx: ++testIdx,
                    setName,
                    name: descr.description,
                    input: descr.input,
                    expected: descr.output,
                    initialState: getTokenizerSuitableStateName(initialStateName),
                    initialStateName,
                    lastStartTag: descr.lastStartTag,
                    expectedErrors: descr.errors ?? [],
                });
            }
        }
    }

    return tests;
}

/**
 * Registers one `it` test case per loaded tokenizer test.
 *
 * @param prefix Label prepended to every generated test name.
 * @param testSuite Directory containing the `.test` data files.
 * @param createTokenSource Factory producing the tokenizer under test.
 */
export function generateTokenizationTests(
    prefix: string,
    testSuite: string,
    createTokenSource: TokenSourceCreator
): void {
    for (const testData of loadTests(testSuite)) {
        const testName = `${prefix} - ${testData.idx}.${testData.setName} - ${testData.name} - Initial state: ${testData.initialStateName}`;

        it(testName, () => {
            const chunks = makeChunks(testData.input);
            const result = tokenize(createTokenSource, chunks, testData);

            // Strict comparison: the legacy loose `deepEqual` would hide type mismatches.
            assert.deepStrictEqual(result.tokens, testData.expected, `Chunks: ${JSON.stringify(chunks)}`);
            assert.deepStrictEqual(result.errors, testData.expectedErrors);
        });
    }
}