import {
    CODE_POINTS as $,
    getSurrogatePairCodePoint,
    isControlCodePoint,
    isSurrogate,
    isSurrogatePair,
    isUndefinedCodePoint,
} from '../common/unicode.js';
import { ERR, type ParserError, type ParserErrorHandler } from '../common/error-codes.js';

//Const
/** Default value of `Preprocessor.bufferWaterline` (65536). */
const DEFAULT_BUFFER_WATERLINE = 1 << 16;

//Preprocessor
//NOTE: HTML input preprocessing
//(see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream)
/**
 * Buffers HTML written in chunks and hands it out one code point at a time.
 *
 * While advancing it normalises newlines (CR and CRLF are returned as a single LF),
 * combines surrogate pairs into one code point, keeps track of line / column / offset
 * and reports problematic input characters to `handler.onParseError`.
 */
export class Preprocessor {
    /** The buffered input that has not been dropped yet. */
    public html = '';
    /** Index into `html` of the most recently consumed code unit; -1 before the first `advance()`. */
    private pos = -1;
    // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
    /** Position of the most recent gap (see `_addGap`). */
    private lastGapPos = -2;
    /** Earlier values of `lastGapPos`, restored one by one when `retreat()` crosses a gap. */
    private gapStack: number[] = [];
    /** Set after a CR was consumed, so that a directly following LF is swallowed. */
    private skipNextNewLine = false;
    /** Whether the most recent `write()` call was flagged as the last chunk. */
    private lastChunkWritten = false;
    /** Set when a read ran past the end of the buffer while more chunks may still arrive. */
    public endOfChunkHit = false;
    /** `dropParsedChunk()` only discards consumed input once `pos` exceeds this value. */
    public bufferWaterline = DEFAULT_BUFFER_WATERLINE;

    /** The last consumed character was a newline; the next `advance()` starts a new line. */
    private isEol = false;
    /** Value of `pos` at which the current line started (shifted whenever the buffer is dropped). */
    private lineStartPos = 0;
    /** Total number of code units removed from the front of `html` by `dropParsedChunk()`. */
    public droppedBufferSize = 0;
    /** Current line number, starting at 1. */
    public line = 1;

    /**
     * @param handler Object whose optional `onParseError` callback receives input-stream errors.
     */
    constructor(private handler: { onParseError?: ParserErrorHandler | null }) {}

    /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
    public get col(): number {
        return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
    }

    /** Position of the current character counted from the start of all input, including dropped chunks. */
    public get offset(): number {
        return this.droppedBufferSize + this.pos;
    }

    /**
     * Builds an error record for `code` located at the current position.
     * Start and end of the reported location are identical.
     *
     * @param code The error code to report.
     * @returns The error with line, column and offset filled in from the current state.
     */
    public getError(code: ERR): ParserError {
        const { line, col, offset } = this;

        return {
            code,
            startLine: line,
            endLine: line,
            startCol: col,
            endCol: col,
            startOffset: offset,
            endOffset: offset,
        };
    }

    //NOTE: avoid reporting errors twice on advance/retreat
    /** Offset at which the last error was reported. */
    private lastErrOffset = -1;

    /**
     * Reports `code` to the handler, unless no handler is installed or an error
     * was already reported at the current offset.
     */
    private _err(code: ERR): void {
        if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
            this.lastErrOffset = this.offset;
            this.handler.onParseError(this.getError(code));
        }
    }

    /**
     * Records the current position as a gap: a code unit that `advance()` consumed
     * together with its neighbour (second half of a surrogate pair, LF of a CRLF).
     * The previous gap position is pushed onto `gapStack` so `retreat()` can restore it.
     */
    private _addGap(): void {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Handles a surrogate code unit found at the current position.
     *
     * @param cp The surrogate code unit at `pos`.
     * @returns The combined code point if a pair was completed (consuming the second unit),
     * `$.EOF` if the pair may continue in a chunk that has not been written yet,
     * otherwise `cp` itself after reporting an isolated surrogate.
     */
    private _processSurrogate(cp: number): number {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.html.length - 1) {
            const nextCp = this.html.charCodeAt(this.pos + 1);

            if (isSurrogatePair(nextCp)) {
                //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                this.pos++;

                //NOTE: add a gap that should be avoided during retreat
                this._addGap();

                return getSurrogatePairCodePoint(cp, nextCp);
            }
        }

        //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
        else if (!this.lastChunkWritten) {
            this.endOfChunkHit = true;
            return $.EOF;
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    /** @returns `true` if `dropParsedChunk()` would discard input, i.e. `pos` is above `bufferWaterline`. */
    public willDropParsedChunk(): boolean {
        return this.pos > this.bufferWaterline;
    }

    /**
     * Discards everything in front of the current position once the waterline is exceeded.
     * The character at `pos` is kept and becomes index 0 of the buffer; `lineStartPos` and
     * `droppedBufferSize` are adjusted so `col` and `offset` stay the same. Gap bookkeeping is reset.
     */
    public dropParsedChunk(): void {
        if (this.willDropParsedChunk()) {
            this.html = this.html.substring(this.pos);
            this.lineStartPos -= this.pos;
            this.droppedBufferSize += this.pos;
            this.pos = 0;
            this.lastGapPos = -2;
            this.gapStack.length = 0;
        }
    }

    /**
     * Appends `chunk` to the buffer and clears `endOfChunkHit`.
     *
     * @param chunk The text to append.
     * @param isLastChunk Whether no further chunks will follow.
     */
    public write(chunk: string, isLastChunk: boolean): void {
        if (this.html.length > 0) {
            this.html += chunk;
        } else {
            this.html = chunk;
        }

        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Inserts `chunk` directly after the current position, so it is the next
     * input to be consumed, and clears `endOfChunkHit`.
     *
     * @param chunk The text to insert.
     */
    public insertHtmlAtCurrentPos(chunk: string): void {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);

        this.endOfChunkHit = false;
    }

    /**
     * Tests whether the buffer, starting at the current position, begins with `pattern`.
     *
     * @param pattern The text to look for.
     * @param caseSensitive If `false`, every input code unit is OR-ed with 0x20 before being
     * compared, so `pattern` can only match where its own code units already have that bit set
     * (e.g. lower-case ASCII letters).
     * @returns `true` on a match. Returns `false` (and sets `endOfChunkHit` unless the last chunk
     * was written) when the buffer is too short to hold the pattern.
     */
    public startsWith(pattern: string, caseSensitive: boolean): boolean {
        // Check if our buffer has enough characters
        if (this.pos + pattern.length > this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return false;
        }

        if (caseSensitive) {
            return this.html.startsWith(pattern, this.pos);
        }

        for (let i = 0; i < pattern.length; i++) {
            const cp = this.html.charCodeAt(this.pos + i) | 0x20;

            if (cp !== pattern.charCodeAt(i)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Looks at the code unit `offset` positions away from the current one without consuming it.
     *
     * @param offset Distance from the current position.
     * @returns The code unit (with CR reported as LF), or `$.EOF` if it lies beyond the buffer,
     * in which case `endOfChunkHit` is set unless the last chunk was written.
     */
    public peek(offset: number): number {
        const pos = this.pos + offset;

        if (pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        const code = this.html.charCodeAt(pos);

        return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
    }

    /**
     * Consumes the next input character and updates line tracking.
     *
     * @returns The consumed code point: LF for CR and for CRLF, the combined code point for a
     * surrogate pair, or `$.EOF` when the buffer is exhausted (setting `endOfChunkHit`
     * unless the last chunk was written).
     */
    public advance(): number {
        this.pos++;

        //NOTE: LF should be in the last column of the line
        if (this.isEol) {
            this.isEol = false;
            this.line++;
            this.lineStartPos = this.pos;
        }

        if (this.pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let cp = this.html.charCodeAt(this.pos);

        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.isEol = true;
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }

        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (cp === $.LINE_FEED) {
            this.isEol = true;

            if (this.skipNextNewLine) {
                // `line` will be bumped again in the recursive call.
                this.line--;
                this.skipNextNewLine = false;
                this._addGap();
                return this.advance();
            }
        }

        this.skipNextNewLine = false;

        if (isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }

        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into the more expensive detailed validation.
        //NOTE: `== null` also covers an omitted (undefined) handler, for which
        //`_err` would discard the result of the validation anyway.
        const isCommonValidRange =
            this.handler.onParseError == null ||
            (cp > 0x1f && cp < 0x7f) ||
            cp === $.LINE_FEED ||
            cp === $.CARRIAGE_RETURN ||
            (cp > 0x9f && cp < 0xfd_d0);

        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }

        return cp;
    }

    /** Reports `cp` to the error handler if it is a control code point or a noncharacter. */
    private _checkForProblematicCharacters(cp: number): void {
        if (isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Moves the current position back by `count`, plus one extra step for every
     * recorded gap that is crossed, and clears the end-of-line flag.
     *
     * @param count Number of positions to step back.
     */
    public retreat(count: number): void {
        this.pos -= count;

        while (this.pos < this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop()!;
            this.pos--;
        }

        this.isEol = false;
    }
}