193fb6ee3Sopenharmony_ciimport {
293fb6ee3Sopenharmony_ci    CODE_POINTS as $,
393fb6ee3Sopenharmony_ci    getSurrogatePairCodePoint,
493fb6ee3Sopenharmony_ci    isControlCodePoint,
593fb6ee3Sopenharmony_ci    isSurrogate,
693fb6ee3Sopenharmony_ci    isSurrogatePair,
793fb6ee3Sopenharmony_ci    isUndefinedCodePoint,
893fb6ee3Sopenharmony_ci} from '../common/unicode.js';
993fb6ee3Sopenharmony_ciimport { ERR, type ParserError, type ParserErrorHandler } from '../common/error-codes.js';
1093fb6ee3Sopenharmony_ci
//Const
/**
 * Default value of `Preprocessor#bufferWaterline` (64 Ki UTF-16 code units).
 * Once the read position moves past the waterline, `dropParsedChunk()` is
 * allowed to discard the part of the buffer that precedes it.
 */
const DEFAULT_BUFFER_WATERLINE = 1 << 16;
1393fb6ee3Sopenharmony_ci
//Preprocessor
//NOTE: HTML input preprocessing
//(see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream)
/**
 * Hands out the buffered HTML source one code point at a time.
 *
 * While doing so it normalizes newlines (CR and CR LF become a single LF),
 * combines UTF-16 surrogate pairs into one code point, tracks the current
 * line / column / offset, and reports input-stream parse errors through
 * `handler.onParseError` when such a callback is provided.
 *
 * Input may arrive in several chunks (see `write`); reads that run past the
 * buffered data before the last chunk has been written set `endOfChunkHit`.
 */
export class Preprocessor {
    /** Buffered source text that has not been dropped yet. */
    public html = '';
    /** Index into `html` of the most recently consumed code unit; -1 before the first `advance()`. */
    private pos = -1;
    // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
    /** Position of the most recent gap (a code unit that `advance()` skipped over). */
    private lastGapPos = -2;
    /** Earlier values of `lastGapPos`, restored one by one while retreating over gaps. */
    private gapStack: number[] = [];
    /** Set after a CR so that an immediately following LF is swallowed. */
    private skipNextNewLine = false;
    /** Whether the most recent `write()` call was flagged as the last chunk. */
    private lastChunkWritten = false;
    /** Set when a read needed data beyond the buffer and more chunks may still arrive. */
    public endOfChunkHit = false;
    /** Read position above which `dropParsedChunk()` discards the consumed part of the buffer. */
    public bufferWaterline = DEFAULT_BUFFER_WATERLINE;

    /** The last consumed character ended a line; the next `advance()` starts a new one. */
    private isEol = false;
    /** Index in `html` at which the current line starts (shifted by `dropParsedChunk()`, so it may go negative). */
    private lineStartPos = 0;
    /** Number of code units discarded from the front of the buffer so far. */
    public droppedBufferSize = 0;
    /** Current line number, starting at 1. */
    public line = 1;

    /**
     * @param handler Object whose optional `onParseError` callback receives
     * input-stream parse errors. When the callback is `null` or missing,
     * nothing is reported.
     */
    constructor(private handler: { onParseError?: ParserErrorHandler | null }) {}

    /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
    public get col(): number {
        return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
    }

    /** Position of the current code unit counted from the start of all written input, including dropped data. */
    public get offset(): number {
        return this.droppedBufferSize + this.pos;
    }

    /**
     * Builds a zero-width error record located at the current position.
     *
     * @param code Error code to attach to the record.
     * @returns A `ParserError` whose start and end line/column/offset are all the current ones.
     */
    public getError(code: ERR): ParserError {
        const { line, col, offset } = this;

        return {
            code,
            startLine: line,
            endLine: line,
            startCol: col,
            endCol: col,
            startOffset: offset,
            endOffset: offset,
        };
    }

    //NOTE: avoid reporting errors twice on advance/retreat
    /** Offset of the most recently reported error; -1 while none has been reported. */
    private lastErrOffset = -1;

    /** Reports `code` at the current position unless an error was already reported at this offset. */
    private _err(code: ERR): void {
        if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
            this.lastErrOffset = this.offset;
            this.handler.onParseError(this.getError(code));
        }
    }

    /** Records the current position as a gap, remembering the previous gap so `retreat()` can restore it. */
    private _addGap(): void {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Resolves the surrogate code unit found at the current position.
     *
     * @param cp The surrogate code unit at `pos`.
     * @returns The combined code point when the next code unit completes a
     * pair (the read position then moves onto that second unit), `$.EOF` when
     * the buffer ends here and more chunks may follow, or `cp` itself after
     * reporting `surrogateInInputStream` for an isolated surrogate.
     */
    private _processSurrogate(cp: number): number {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.html.length - 1) {
            const nextCp = this.html.charCodeAt(this.pos + 1);

            if (isSurrogatePair(nextCp)) {
                //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                this.pos++;

                //NOTE: add a gap that should be avoided during retreat
                this._addGap();

                return getSurrogatePairCodePoint(cp, nextCp);
            }
        }

        //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
        else if (!this.lastChunkWritten) {
            this.endOfChunkHit = true;
            return $.EOF;
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    /** @returns `true` when the read position is past `bufferWaterline`, i.e. `dropParsedChunk()` would drop data. */
    public willDropParsedChunk(): boolean {
        return this.pos > this.bufferWaterline;
    }

    /**
     * Discards everything in front of the current position once the
     * waterline has been passed. `offset` and `col` keep their values because
     * `droppedBufferSize` and `lineStartPos` are shifted accordingly; the gap
     * history is reset.
     */
    public dropParsedChunk(): void {
        if (this.willDropParsedChunk()) {
            this.html = this.html.substring(this.pos);
            this.lineStartPos -= this.pos;
            this.droppedBufferSize += this.pos;
            this.pos = 0;
            this.lastGapPos = -2;
            this.gapStack.length = 0;
        }
    }

    /**
     * Appends a chunk of source text to the buffer and clears `endOfChunkHit`.
     *
     * @param chunk Text to append.
     * @param isLastChunk Whether no further chunks will follow.
     */
    public write(chunk: string, isLastChunk: boolean): void {
        if (this.html.length > 0) {
            this.html += chunk;
        } else {
            // Nothing buffered: take the chunk as-is instead of concatenating onto an empty string.
            this.html = chunk;
        }

        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Inserts text directly after the current position, so it is read next,
     * and clears `endOfChunkHit`.
     *
     * @param chunk Text to insert.
     */
    public insertHtmlAtCurrentPos(chunk: string): void {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);

        this.endOfChunkHit = false;
    }

    /**
     * Tests whether the buffer, starting at the current position, begins with `pattern`.
     *
     * @param pattern Text to look for. For the case-insensitive comparison it
     * has to be lower case already, since only the buffer side is folded.
     * @param caseSensitive Compare exactly (`true`) or with the buffer's code units OR-ed with 0x20 (`false`).
     * @returns `true` on a match. `false` otherwise, including when the buffer
     * is too short; in that case `endOfChunkHit` is set unless the last chunk
     * has been written.
     */
    public startsWith(pattern: string, caseSensitive: boolean): boolean {
        // Check if our buffer has enough characters
        if (this.pos + pattern.length > this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return false;
        }

        if (caseSensitive) {
            return this.html.startsWith(pattern, this.pos);
        }

        for (let i = 0; i < pattern.length; i++) {
            // Setting bit 0x20 maps ASCII upper-case letters onto their lower-case counterparts.
            const cp = this.html.charCodeAt(this.pos + i) | 0x20;

            if (cp !== pattern.charCodeAt(i)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Looks at the code unit `offset` positions away from the current one without consuming anything.
     *
     * @param offset Distance from the current position.
     * @returns The code unit found there, with CR reported as LF, or `$.EOF`
     * when that position lies beyond the buffer (`endOfChunkHit` is then set
     * unless the last chunk has been written). Surrogate pairs are not combined.
     */
    public peek(offset: number): number {
        const pos = this.pos + offset;

        if (pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        const code = this.html.charCodeAt(pos);

        return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
    }

    /**
     * Consumes and returns the next code point.
     *
     * CR is returned as LF and an LF directly following a CR is skipped;
     * surrogate pairs are combined; control characters, noncharacters and
     * isolated surrogates are reported when an error callback is installed.
     *
     * @returns The code point, or `$.EOF` when the buffer is exhausted
     * (`endOfChunkHit` is then set unless the last chunk has been written).
     */
    public advance(): number {
        this.pos++;

        //NOTE: LF should be in the last column of the line
        if (this.isEol) {
            this.isEol = false;
            this.line++;
            this.lineStartPos = this.pos;
        }

        if (this.pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let cp = this.html.charCodeAt(this.pos);

        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.isEol = true;
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }

        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (cp === $.LINE_FEED) {
            this.isEol = true;

            if (this.skipNextNewLine) {
                // `line` will be bumped again in the recursive call.
                this.line--;
                this.skipNextNewLine = false;
                this._addGap();
                return this.advance();
            }
        }

        this.skipNextNewLine = false;

        if (isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }

        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        //NOTE: without an error callback (`null` or omitted) nothing could be
        //reported anyway, so validation is skipped altogether.
        const isCommonValidRange =
            !this.handler.onParseError ||
            (cp > 0x1f && cp < 0x7f) ||
            cp === $.LINE_FEED ||
            cp === $.CARRIAGE_RETURN ||
            (cp > 0x9f && cp < 0xfd_d0);

        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }

        return cp;
    }

    /** Reports `cp` as a control character or as a noncharacter if it is one; does nothing otherwise. */
    private _checkForProblematicCharacters(cp: number): void {
        if (isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Moves the read position back by `count` code points.
     *
     * Each gap recorded by `advance()` that is crossed on the way costs one
     * additional step, so a CR LF sequence or a surrogate pair is undone as a
     * single unit. The pending end-of-line flag is cleared; `line` and
     * `lineStartPos` are left as they are.
     *
     * @param count Number of code points to step back.
     */
    public retreat(count: number): void {
        this.pos -= count;

        while (this.pos < this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop()!;
            this.pos--;
        }

        this.isEol = false;
    }
}
247