import { Writable } from 'node:stream';
import { Parser, type ParserOptions, type TreeAdapterTypeMap, type DefaultTreeAdapterMap } from 'parse5';

/* eslint-disable unicorn/consistent-function-scoping -- The rule seems to be broken here. */

/**
 * Streaming HTML parser with scripting support.
 * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
 *
 * @example
 *
 * ```js
 * const { ParserStream } = require('parse5-parser-stream');
 * const http = require('http');
 * const { finished } = require('node:stream');
 *
 * // Fetch the page content and obtain its <head> node
 * http.get('http://inikulin.github.io/parse5/', res => {
 *     const parser = new ParserStream();
 *
 *     finished(parser, () => {
 *         console.log(parser.document.childNodes[1].childNodes[0].tagName); //> 'head'
 *     });
 *
 *     res.pipe(parser);
 * });
 * ```
 *
 */
export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> extends Writable {
    /**
     * Creates a `ParserStream` backed by a fragment parser
     * (`Parser.getFragmentParser`) instead of a document parser.
     *
     * @param fragmentContext Context node handed to `Parser.getFragmentParser`.
     * @param options Parsing options, shared by the fragment parser and the stream.
     * @returns A stream whose result is available through `getFragment()`.
     */
    static getFragmentStream<T extends TreeAdapterTypeMap>(
        fragmentContext?: T['parentNode'] | null,
        options?: ParserOptions<T>
    ): ParserStream<T> {
        const parser = Parser.getFragmentParser(fragmentContext, options);
        const stream = new ParserStream(options, parser);
        return stream;
    }

    /** Set by `end()`; forwarded to the tokenizer as the "last chunk" flag on every write. */
    private lastChunkWritten = false;
    /** Completion callback of the `_write` call currently in flight. */
    private writeCallback: undefined | (() => void) = undefined;

    /** HTML queued through `documentWrite` while a `script` listener runs. */
    private pendingHtmlInsertions: string[] = [];

    /** The resulting document node. */
    public get document(): T['document'] {
        return this.parser.document;
    }

    /** The resulting document fragment, as returned by the underlying parser. */
    public getFragment(): T['documentFragment'] {
        return this.parser.getFragment();
    }

    /**
     * @param options Parsing options.
     * @param parser Parser instance to drive; defaults to a new `Parser` built from `options`.
     */
    constructor(options?: ParserOptions<T>, public parser: Parser<T> = new Parser(options)) {
        // `decodeStrings: false` keeps written strings as strings, which `_write` requires.
        super({ decodeStrings: false });

        const resume = (): void => {
            // Walk the queue from last to first; presumably so that, with every
            // insertion landing at the same tokenizer position, the chunks end
            // up in the order they were written (TODO confirm against tokenizer).
            for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) {
                this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]);
            }

            this.pendingHtmlInsertions.length = 0;

            //NOTE: keep parsing if we don't wait for the next input chunk
            this.parser.tokenizer.resume(this.writeCallback);
        };

        const documentWrite = (html: string): void => {
            // Insertions requested after the parser has stopped are dropped.
            if (!this.parser.stopped) {
                this.pendingHtmlInsertions.push(html);
            }
        };

        const scriptHandler = (scriptElement: T['element']): void => {
            // Only pause when somebody is listening; otherwise nobody would call `resume`.
            if (this.listenerCount('script') > 0) {
                this.parser.tokenizer.pause();
                this.emit('script', scriptElement, documentWrite, resume);
            }
        };

        this.parser.scriptHandler = scriptHandler;
    }

    //WritableStream implementation
    /**
     * Feeds one chunk to the tokenizer.
     *
     * @param chunk Next piece of HTML; must be a string.
     * @param _encoding Unused.
     * @param callback Passed to the tokenizer and stored so that `resume` can hand it over again.
     * @throws {TypeError} If `chunk` is not a string.
     */
    override _write(chunk: string, _encoding: string, callback: () => void): void {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        this.writeCallback = callback;
        this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
    }

    // TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.

    /**
     * Marks the next write as the last one and ends the stream.
     *
     * A final chunk is always written (an empty string when none is given) so
     * that `_write` runs once more with the "last chunk" flag set.
     *
     * @param chunk Optional final chunk, or the callback when called as `end(callback)`.
     * @param encoding Optional encoding, or the callback when called as `end(chunk, callback)`.
     * @param callback Optional callback forwarded to `Writable.end`.
     * @returns This stream, matching `Writable.end`.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    override end(chunk?: any, encoding?: any, callback?: any): any {
        this.lastChunkWritten = true;

        if (typeof chunk === 'function') {
            // `end(callback)`: `Writable.end` would treat a function in the chunk
            // slot as "no chunk" and skip the write, so the tokenizer would never
            // see the last-chunk flag. Write an explicit empty chunk instead.
            super.end('', chunk);
        } else {
            super.end(chunk || '', encoding, callback);
        }

        return this;
    }
}

export interface ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
    /**
     * Raised when parser encounters a `<script>` element. If this event has listeners, parsing will be suspended once
     * it is emitted. So, if `<script>` has the `src` attribute, you can fetch it, execute and then resume parsing just
     * like browsers do.
     *
     * @example
     *
     * ```js
     * const { ParserStream } = require('parse5-parser-stream');
     * const http = require('http');
     *
     * const parser = new ParserStream();
     *
     * parser.on('script', (scriptElement, documentWrite, resume) => {
     *     const src = scriptElement.attrs.find(({ name }) => name === 'src').value;
     *
     *     http.get(src, res => {
     *         // Fetch the script content, execute it with DOM built around `parser.document` and
     *         // `document.write` implemented using `documentWrite`.
     *         ...
     *         // Then resume parsing.
     *         resume();
     *     });
     * });
     *
     * parser.end('<script src="example.com/script.js"></script>');
     * ```
     *
     * @param event Name of the event
     * @param handler Event handler; receives the script element, a `documentWrite` function and a `resume` function.
     */
    on(
        event: 'script',
        handler: (scriptElement: T['element'], documentWrite: (html: string) => void, resume: () => void) => void
    ): void;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}