import { html, type Token } from 'parse5';
import {
    SAXParser,
    type EndTag,
    type StartTag,
    type Doctype,
    type Text,
    type Comment,
    type SaxToken,
} from 'parse5-sax-parser';
import { escapeText, escapeAttribute } from 'entities/lib/escape.js';

/**
 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
 * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
 *
 * The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting
 * HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip.
 *
 * @example
 *
 * ```js
 * const { RewritingStream } = require('parse5-html-rewriting-stream');
 * const http = require('http');
 * const fs = require('fs');
 *
 * const file = fs.createWriteStream('/home/google.com.html');
 * const rewriter = new RewritingStream();
 *
 * // Replace spans with divs
 * rewriter.on('startTag', startTag => {
 *     if (startTag.tagName === 'span') {
 *         startTag.tagName = 'div';
 *     }
 *
 *     rewriter.emitStartTag(startTag);
 * });
 *
 * rewriter.on('endTag', endTag => {
 *     if (endTag.tagName === 'span') {
 *         endTag.tagName = 'div';
 *     }
 *
 *     rewriter.emitEndTag(endTag);
 * });
 *
 * // Wrap all text nodes with an <i> tag
 * rewriter.on('text', (_, raw) => {
 *     // Use the raw representation of text without HTML entities decoding
 *     rewriter.emitRaw(`<i>${raw}</i>`);
 * });
 *
 * http.get('http://google.com', res => {
 *     // Assumes response is UTF-8.
 *     res.setEncoding('utf8');
 *     // `RewritingStream` is a `Transform` stream, which means you can pipe
 *     // through it.
 *     res.pipe(rewriter).pipe(file);
 * });
 * ```
 */
export class RewritingStream extends SAXParser {
    /** Note: `sourceCodeLocationInfo` is always enabled. */
    constructor() {
        super({ sourceCodeLocationInfo: true });
    }

    /**
     * Feeds a chunk to the underlying SAX parser and discards its return value.
     *
     * @param chunk Chunk of HTML source to parse.
     * @returns Always an empty string; output is written through the `emit*` methods instead.
     */
    override _transformChunk(chunk: string): string {
        // NOTE: ignore upstream return values as we want to push to
        // the `Writable` part of the `Transform` stream ourselves.
        super._transformChunk(chunk);
        return '';
    }

    /**
     * Returns the slice of the preprocessor's `html` buffer covered by `location`.
     *
     * @param location Source location of a token.
     * @returns The source text between the location's start and end offsets.
     */
    private _getRawHtml(location: Token.Location): string {
        const { droppedBufferSize, html } = this.tokenizer.preprocessor;
        // Offsets are shifted by `droppedBufferSize` before indexing into `html`
        // (presumably because already-consumed input has been dropped from the
        // buffer while location offsets stay absolute -- confirm against the preprocessor).
        const start = location.startOffset - droppedBufferSize;
        const end = location.endOffset - droppedBufferSize;

        return html.slice(start, end);
    }

    // Events

    /**
     * Delegates to the base implementation; when that returns `false`, the token's
     * raw source text is written to the output unchanged.
     *
     * @param eventName Name of the event to emit.
     * @param token Token the event is raised for.
     * @returns Always `true`.
     */
    protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        if (!super.emitIfListenerExists(eventName, token)) {
            // Location info is always enabled by the constructor, hence the non-null assertion.
            this.emitRaw(this._getRawHtml(token.sourceCodeLocation!));
        }

        // NOTE: don't skip new lines after `<pre>` and other tags,
        // otherwise we'll have incorrect raw data.
        this.parserFeedbackSimulator.skipNextNewLine = false;
        return true;
    }

    // Emitter API

    /**
     * Emits `eventName` with the token and, as a second argument, the token's raw source text.
     *
     * @param eventName Name of the event to emit.
     * @param token Token passed to listeners.
     */
    protected override _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation!));
    }

    /** Emits a serialized document type token into the output stream. */
    public emitDoctype(token: Doctype): void {
        let res = `<!DOCTYPE ${token.name}`;

        if (token.publicId !== null) {
            res += ` PUBLIC "${token.publicId}"`;
        } else if (token.systemId !== null) {
            // A system identifier without a public one needs the `SYSTEM` keyword.
            res += ' SYSTEM';
        }

        if (token.systemId !== null) {
            res += ` "${token.systemId}"`;
        }

        res += '>';

        this.push(res);
    }

    /** Emits a serialized start tag token into the output stream. */
    public emitStartTag(token: StartTag): void {
        let res = `<${token.tagName}`;

        for (const attr of token.attrs) {
            res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
        }

        res += token.selfClosing ? '/>' : '>';

        this.push(res);
    }

    /** Emits a serialized end tag token into the output stream. */
    public emitEndTag(token: EndTag): void {
        this.push(`</${token.tagName}>`);
    }

    /** Emits a serialized text token into the output stream. */
    public emitText({ text }: Text): void {
        // Text is written verbatim only outside foreign content and when
        // `html.hasUnescapedText` reports true for the last start tag;
        // otherwise it is escaped.
        this.push(
            !this.parserFeedbackSimulator.inForeignContent &&
                html.hasUnescapedText(this.tokenizer.lastStartTagName, true)
                ? text
                : escapeText(text)
        );
    }

    /** Emits a serialized comment token into the output stream. */
    public emitComment(token: Comment): void {
        this.push(`<!--${token.text}-->`);
    }

    /** Emits a raw HTML string into the output stream. */
    public emitRaw(html: string): void {
        this.push(html);
    }
}

export interface RewritingStream {
    /** Raised when the rewriter encounters a start tag. */
    on(event: 'startTag', listener: (startTag: StartTag, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters an end tag. */
    on(event: 'endTag', listener: (endTag: EndTag, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters a comment. */
    on(event: 'comment', listener: (comment: Comment, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters text content. */
    on(event: 'text', listener: (text: Text, rawHtml: string) => void): this;
    /** Raised when the rewriter encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration). */
    on(event: 'doctype', listener: (doctype: Doctype, rawHtml: string) => void): this;

    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}