193fb6ee3Sopenharmony_ciimport { html, type Token } from 'parse5';
293fb6ee3Sopenharmony_ciimport {
393fb6ee3Sopenharmony_ci    SAXParser,
493fb6ee3Sopenharmony_ci    type EndTag,
593fb6ee3Sopenharmony_ci    type StartTag,
693fb6ee3Sopenharmony_ci    type Doctype,
793fb6ee3Sopenharmony_ci    type Text,
893fb6ee3Sopenharmony_ci    type Comment,
993fb6ee3Sopenharmony_ci    type SaxToken,
1093fb6ee3Sopenharmony_ci} from 'parse5-sax-parser';
1193fb6ee3Sopenharmony_ciimport { escapeText, escapeAttribute } from 'entities/lib/escape.js';
1293fb6ee3Sopenharmony_ci
/**
 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
 * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
 *
 * The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting
 * HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip.
 *
 * @example
 *
 * ```js
 * const RewritingStream = require('parse5-html-rewriting-stream');
 * const http = require('http');
 * const fs = require('fs');
 *
 * const file = fs.createWriteStream('/home/google.com.html');
 * const rewriter = new RewritingStream();
 *
 * // Replace spans with divs
 * rewriter.on('startTag', startTag => {
 *     if (startTag.tagName === 'span') {
 *         startTag.tagName = 'div';
 *     }
 *
 *     rewriter.emitStartTag(startTag);
 * });
 *
 * rewriter.on('endTag', endTag => {
 *     if (endTag.tagName === 'span') {
 *         endTag.tagName = 'div';
 *     }
 *
 *     rewriter.emitEndTag(endTag);
 * });
 *
 * // Wrap all text nodes with an <i> tag
 * rewriter.on('text', (_, raw) => {
 *     // Use the raw representation of text without HTML entities decoding
 *     rewriter.emitRaw(`<i>${raw}</i>`);
 * });
 *
 * http.get('http://google.com', res => {
 *    // Assumes response is UTF-8.
 *    res.setEncoding('utf8');
 *    // `RewritingStream` is a `Transform` stream, which means you can pipe
 *    // through it.
 *    res.pipe(rewriter).pipe(file);
 * });
 * ```
 */
export class RewritingStream extends SAXParser {
    /** Note: `sourceCodeLocationInfo` is always enabled. */
    constructor() {
        super({ sourceCodeLocationInfo: true });
    }

    /**
     * Feeds a chunk to the underlying parser and discards whatever it returns.
     *
     * @param chunk Chunk of HTML source.
     * @returns Always an empty string; output is pushed via the `emit*` methods instead.
     */
    override _transformChunk(chunk: string): string {
        // NOTE: ignore upstream return values as we want to push to
        // the `Writable` part of the `Transform` stream ourselves.
        super._transformChunk(chunk);
        return '';
    }

    /**
     * Extracts the source text covered by `location` from the preprocessor buffer.
     *
     * @param location Location whose `startOffset`/`endOffset` delimit the token.
     * @returns The slice of buffered HTML between the two offsets.
     */
    private _getRawHtml(location: Token.Location): string {
        const { droppedBufferSize, html } = this.tokenizer.preprocessor;
        // Token offsets are shifted by `droppedBufferSize` to index into the
        // currently retained `html` buffer.
        const start = location.startOffset - droppedBufferSize;
        const end = location.endOffset - droppedBufferSize;

        return html.slice(start, end);
    }

    // Events
    /**
     * Dispatches `token` through the base implementation and, when that reports
     * `false`, writes the token's raw source to the output unchanged.
     *
     * @param eventName Name of the event to raise.
     * @param token Token being processed.
     * @returns Always `true`.
     */
    protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        if (!super.emitIfListenerExists(eventName, token)) {
            // Location info is enabled unconditionally in the constructor,
            // hence the non-null assertion.
            this.emitRaw(this._getRawHtml(token.sourceCodeLocation!));
        }

        // NOTE: don't skip new lines after `<pre>` and other tags,
        // otherwise we'll have incorrect raw data.
        this.parserFeedbackSimulator.skipNextNewLine = false;
        return true;
    }

    // Emitter API
    /**
     * Raises `eventName` with the token and, as a second argument, its raw source HTML.
     *
     * @param eventName Name of the event to raise.
     * @param token Token passed to the listeners.
     */
    protected override _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation!));
    }

    /**
     * Emits a serialized document type token into the output stream.
     *
     * A public identifier is written as `PUBLIC "<id>"`; a system identifier
     * without a public one is introduced by `SYSTEM`. Identifiers are written verbatim.
     *
     * @param token Document type token to serialize.
     */
    public emitDoctype(token: Doctype): void {
        // Guard against a missing name so it is not interpolated as the
        // literal text `null` (which would re-parse as a doctype named "null").
        let res = `<!DOCTYPE ${token.name ?? ''}`;

        if (token.publicId !== null) {
            res += ` PUBLIC "${token.publicId}"`;
        } else if (token.systemId !== null) {
            res += ' SYSTEM';
        }

        if (token.systemId !== null) {
            res += ` "${token.systemId}"`;
        }

        res += '>';

        this.push(res);
    }

    /**
     * Emits a serialized start tag token into the output stream.
     *
     * Attribute values are escaped and always double-quoted; tag and attribute
     * names are written verbatim.
     *
     * @param token Start tag token to serialize.
     */
    public emitStartTag(token: StartTag): void {
        let res = `<${token.tagName}`;

        for (const attr of token.attrs) {
            res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
        }

        res += token.selfClosing ? '/>' : '>';

        this.push(res);
    }

    /**
     * Emits a serialized end tag token into the output stream.
     *
     * @param token End tag token to serialize.
     */
    public emitEndTag(token: EndTag): void {
        this.push(`</${token.tagName}>`);
    }

    /**
     * Emits a serialized text token into the output stream.
     *
     * The text is written unescaped only when the parser is not in foreign content
     * and `html.hasUnescapedText` accepts the last start tag name; otherwise it is
     * passed through `escapeText`.
     *
     * @param token Text token to serialize.
     */
    public emitText({ text }: Text): void {
        this.push(
            !this.parserFeedbackSimulator.inForeignContent &&
                html.hasUnescapedText(this.tokenizer.lastStartTagName, true)
                ? text
                : escapeText(text)
        );
    }

    /**
     * Emits a serialized comment token into the output stream.
     *
     * The comment text is written verbatim between `<!--` and `-->`.
     *
     * @param token Comment token to serialize.
     */
    public emitComment(token: Comment): void {
        this.push(`<!--${token.text}-->`);
    }

    /**
     * Emits a raw HTML string into the output stream.
     *
     * @param html String pushed to the output as-is, without any escaping.
     */
    public emitRaw(html: string): void {
        this.push(html);
    }
}
15793fb6ee3Sopenharmony_ci
/** Listener shape shared by all token events: the parsed token followed by its raw source HTML. */
type TokenListener<T> = (token: T, rawHtml: string) => void;

export interface RewritingStream {
    /** Fired for every start tag the rewriter encounters. */
    on(event: 'startTag', listener: TokenListener<StartTag>): this;
    /** Fired for every end tag the rewriter encounters. */
    on(event: 'endTag', listener: TokenListener<EndTag>): this;
    /** Fired for every comment the rewriter encounters. */
    on(event: 'comment', listener: TokenListener<Comment>): this;
    /** Fired for every run of text content the rewriter encounters. */
    on(event: 'text', listener: TokenListener<Text>): this;
    /** Fired for every [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration) the rewriter encounters. */
    on(event: 'doctype', listener: TokenListener<Doctype>): this;

    /**
     * Fallback overload for any other event name.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}
178