1/*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "ecmascript/regexp/regexp_parser.h"
17
18#include "ecmascript/base/string_helper.h"
19#include "libpandabase/utils/utils.h"
20#define _NO_DEBUG_
21
22namespace panda::ecmascript {
static constexpr uint32_t CACHE_SIZE = 128;
static constexpr uint32_t CHAR_MAXS = 128;
// Bitmap of the ASCII characters "$ A-Z _ a-z": read as 128 bits, bit (c % 32) of word (c / 32)
// is set exactly for those characters.
static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
    0x00000000,  // U+0000..U+001F: none
    0x00000010,  // U+0020..U+003F: '$'
    0x87FFFFFE,  // U+0040..U+005F: 'A'-'Z' and '_'
    0x07FFFFFE,  // U+0060..U+007F: 'a'-'z'
};
29static RangeSet g_rangeD(0x30, 0x39);  // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
30// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
31static RangeSet g_rangeS({
32    std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINTNEXTLINE(readability-magic-numbers)
33    std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINTNEXTLINE(readability-magic-numbers)
34    std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINTNEXTLINE(readability-magic-numbers)
35    std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINTNEXTLINE(readability-magic-numbers)
36    std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINTNEXTLINE(readability-magic-numbers)
37    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
38    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
39    std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINTNEXTLINE(readability-magic-numbers)
40    std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINTNEXTLINE(readability-magic-numbers)
41    std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINTNEXTLINE(readability-magic-numbers)
42    std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINTNEXTLINE(readability-magic-numbers)
43    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
44    std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINTNEXTLINE(readability-magic-numbers)
45});
46
47// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
48static RangeSet g_rangeW({
49    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
50    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
51    std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINTNEXTLINE(readability-magic-numbers)
52    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
53});
54
55// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
56static RangeSet g_regexpIdentifyStart({
57    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
58    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
59    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
60});
61
62// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
63static RangeSet g_regexpIdentifyContinue({
64    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
65    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
66    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
67    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
68});
69
70void RegExpParser::Parse()
71{
72    // dynbuffer head init [size,capture_count,statck_count,flags,prefilter]
73    buffer_.EmitU32(0);
74    buffer_.EmitU32(0);
75    buffer_.EmitU32(0);
76    buffer_.EmitU32(0);
77    buffer_.EmitU32(0);
78    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
79    PrintF("Parse Pattern------\n");
80    // Pattern[U, N]::
81    //      Disjunction[?U, ?N]
82    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83    Advance();
84    SaveStartOpCode saveStartOp;
85    int captureIndex = captureCount_++;
86    saveStartOp.EmitOpCode(&buffer_, captureIndex);
87    ParseDisjunction(false);
88    if (isError_) {
89        return;
90    }
91    if (c0_ != KEY_EOF) {
92        ParseError("extraneous characters at the end");
93        return;
94    }
95    SaveEndOpCode saveEndOp;
96    saveEndOp.EmitOpCode(&buffer_, captureIndex);
97    MatchEndOpCode matchEndOp;
98    matchEndOp.EmitOpCode(&buffer_, 0);
99
100    uint32_t ptr = RegExpParser::OP_START_OFFSET;
101    ptr += static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_SAVE_START)->GetSize());
102    uint8_t opCode = buffer_.GetU8(ptr);
103    uint16_t expectedChar = 0;
104    if (opCode == RegExpOpCode::OP_CHAR && !IsIgnoreCase()) {
105        expectedChar = buffer_.GetU16(ptr + 1);
106        if (expectedChar > UINT8_MAX) {
107            expectedChar = 0;
108        }
109    }
110
111    // dynbuffer head assignments
112    buffer_.PutU32(0, buffer_.size_);
113    buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
114    buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
115    buffer_.PutU32(FLAGS_OFFSET, flags_);
116    buffer_.PutU32(PREFILTER_OFFSET, expectedChar);
117#ifndef _NO_DEBUG_
118    RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
119#endif
120}
121
122void RegExpParser::ParseDisjunction(bool isBackward)
123{
124    // check stack overflow because infinite recursion may occur
125    DoParserStackOverflowCheck("invalid regular expression.");
126    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
127    PrintF("Parse Disjunction------\n");
128    if (c0_ == ')') {
129        isEmpty_ = true;
130        return;
131    }
132    size_t start = buffer_.size_;
133    ParseAlternative(isBackward);
134    if (isError_) {
135        return;
136    }
137    uint32_t para = 0;
138    do {
139        if (c0_ == '|') {
140            SplitNextOpCode splitOp;
141            uint32_t len = buffer_.size_ - start;
142            GotoOpCode gotoOp;
143            splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
144            uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
145            gotoOp.UpdateOpPara(&buffer_, pos, para);
146            Advance();
147            ParseAlternative(isBackward);
148            para = buffer_.size_ - pos - gotoOp.GetSize();
149            if (c0_ != '|') {
150                uint16_t cnt = 0;
151                uint32_t opCharSize =
152                    static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_CHAR)->GetSize());
153                uint32_t opSplitSize =
154                    static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_SPLIT_NEXT)->GetSize());
155                std::vector<uint16_t> chars;
156                std::vector<uint32_t> offsets;
157                std::set<uint16_t> checkSet;
158                uint32_t ptr = start;
159                bool isSparseable = true;
160                do {
161                    uint8_t opCode = buffer_.GetU8(ptr);
162                    uint32_t offset = 0;
163                    uint32_t branch = ptr;
164                    bool isLastBranch = false;
165                    if (opCode == RegExpOpCode::OP_SPLIT_NEXT) {
166                        offset = buffer_.GetU32(ptr + 1);
167                        branch = ptr + offset + opSplitSize;
168                    } else {
169                        isLastBranch = true;
170                    }
171                    uint8_t opCodeChar = buffer_.GetU8(branch);
172                    if (opCodeChar == RegExpOpCode::OP_CHAR) {
173                        chars.push_back(buffer_.GetU16(branch + 1));
174                        offsets.push_back(offset);
175                        if (checkSet.find(chars[cnt]) != checkSet.end()) {
176                            isSparseable = false;
177                            break;
178                        }
179                        checkSet.insert(chars[cnt]);
180                    } else {
181                        isSparseable = false;
182                        break;
183                    }
184                    cnt++;
185                    if (isLastBranch) {
186                        break;
187                    }
188                    ptr += opSplitSize;
189                } while (true);
190
191                if (isSparseable) {
192                    uint32_t sparseLen = SPARSE_HEAD_OFFSET + static_cast<uint32_t>(cnt) * SPARSE_MAX_OFFSET;
193                    uint32_t splitsLen = static_cast<uint32_t>(cnt - 1) * opSplitSize;
194                    ptr = start;
195                    buffer_.Insert(start, sparseLen - splitsLen);
196                    pos += sparseLen - splitsLen;
197                    buffer_.PutU8(ptr, RegExpOpCode::OP_SPARSE);
198                    buffer_.PutU16(ptr + 1, cnt);
199                    ptr += SPARSE_HEAD_OFFSET;
200                    ASSERT(chars.size() > 0);
201                    for (int32_t i = static_cast<int32_t>(chars.size() - 1); i >= 0; i--) {
202                        buffer_.PutU16(ptr, chars[i]);
203                        // 2: cnt = count of splits + 1, for invert index should be extra - 1, so -1-1=-2
204                        offsets[i] += opCharSize - opSplitSize * std::max(0, cnt - i -2);
205                        buffer_.PutU32(ptr + SPARSE_OFF_OFFSET, offsets[i]);
206                        ptr += SPARSE_MAX_OFFSET;
207                    }
208                }
209                bool isEnd = false;
210                do {
211                    uint32_t paraTmp = buffer_.GetU32(pos + 1);
212                    if (paraTmp == 0) {
213                        isEnd = true;
214                    }
215                    buffer_.PutU32(pos + 1, para);
216                    para += paraTmp + gotoOp.GetSize();
217                    pos -= paraTmp + gotoOp.GetSize();
218                } while (!isEnd);
219            }
220            if (isError_) {
221                return;
222            }
223        }
224    } while (c0_ != KEY_EOF && c0_ != ')');
225}
226
227uint32_t RegExpParser::ParseOctalLiteral()
228{
229    // For compatibility with some other browsers (not all), we parse
230    // up to three octal digits with a value below 256.
231    // ES#prod-annexB-LegacyOctalEscapeSequence
232    uint32_t value = c0_ - '0';
233    Advance();
234    if (c0_ >= '0' && c0_ <= '7') {
235        value = value * OCTAL_VALUE + c0_ - '0';
236        Advance();
237        if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
238            value = value * OCTAL_VALUE + c0_ - '0';
239            Advance();
240        }
241    }
242    return value;
243}
244
245bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
246{
247    uint32_t x = 0;
248    int d = static_cast<int>(HexValue(c0_));
249    if (d < 0) {
250        return false;
251    }
252    while (d >= 0) {
253        if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
254            LOG_FULL(FATAL) << "value overflow";
255            return false;
256        }
257        x = x * HEX_VALUE + static_cast<uint32_t>(d);
258        if (x > maxValue) {
259            return false;
260        }
261        Advance();
262        d = static_cast<int>(HexValue(c0_));
263    }
264    *value = x;
265    return true;
266}
267
268// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
269bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
270{
271    // Accept both \uxxxx and \u{xxxxxx} (if allowed).
272    // In the latter case, the number of hex digits between { } is arbitrary.
273    // \ and u have already been read.
274    if (c0_ == '{' && IsUtf16()) {
275        uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
276        Advance();
277        if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINTNEXTLINE(readability-magic-numbers)
278            if (c0_ == '}') {
279                Advance();
280                return true;
281            }
282        }
283        pc_ = start;
284        Advance();
285        return false;
286    }
287    // \u but no {, or \u{...} escapes not allowed.
288    bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
289    if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
290        // Attempt to read trail surrogate.
291        uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
292        if (*pc_ == 'u') {
293            Advance(UNICODE_HEX_ADVANCE);
294            uint32_t trail = 0;
295            if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
296                *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
297                return true;
298            }
299        }
300        pc_ = start;
301        Advance();
302    }
303    return result;
304}
305
306bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
307{
308    uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
309    uint32_t val = 0;
310    for (int i = 0; i < length; ++i) {
311        uint32_t c = c0_;
312        int d = static_cast<int>(HexValue(c));
313        if (d < 0) {
314            pc_ = start;
315            Advance();
316            return false;
317        }
318        val = val * HEX_VALUE + static_cast<uint32_t>(d);
319        Advance();
320    }
321    *value = val;
322    return true;
323}
324
325// NOLINTNEXTLINE(readability-function-size)
326void RegExpParser::ParseAlternative(bool isBackward)
327{
328    size_t start = buffer_.size_;
329    while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
330        if (isError_) {
331            return;
332        }
333        size_t atomBcStart = buffer_.GetSize();
334        int captureIndex = 0;
335        bool isAtom = false;
336        switch (c0_) {
337            case '^': {
338                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
339                PrintF("Assertion %c line start \n", c0_);
340                LineStartOpCode lineStartOp;
341                lineStartOp.EmitOpCode(&buffer_, 0);
342                Advance();
343                break;
344            }
345            case '$': {
346                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
347                PrintF("Assertion %c line end \n", c0_);
348                LineEndOpCode lineEndOp;
349                lineEndOp.EmitOpCode(&buffer_, 0);
350                Advance();
351                break;
352            }
353            case '\\': {
354                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
355                PrintF("Escape %c \n", c0_);
356                Advance();
357                switch (c0_) {
358                    case 'b': {
359                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
360                        PrintF("Assertion %c \n", c0_);
361                        WordBoundaryOpCode wordBoundaryOp;
362                        wordBoundaryOp.EmitOpCode(&buffer_, 0);
363                        Advance();
364                        break;
365                    }
366                    case 'B': {
367                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
368                        PrintF("Assertion %c \n", c0_);
369                        NotWordBoundaryOpCode notWordBoundaryOp;
370                        notWordBoundaryOp.EmitOpCode(&buffer_, 0);
371                        Advance();
372                        break;
373                    }
374                    default: {
375                        isAtom = true;
376                        int atomValue = ParseAtomEscape(isBackward);
377                        if (atomValue != -1) {
378                            PrevOpCode prevOp;
379                            if (isBackward) {
380                                prevOp.EmitOpCode(&buffer_, 0);
381                            }
382                            if (IsIgnoreCase()) {
383                                if (!IsUtf16()) {
384                                    atomValue = Canonicalize(atomValue, false);
385                                } else {
386                                    icu::UnicodeSet set(atomValue, atomValue);
387                                    set.closeOver(USET_CASE_INSENSITIVE);
388                                    set.removeAllStrings();
389                                    uint32_t size = static_cast<uint32_t>(set.size());
390                                    RangeOpCode rangeOp;
391                                    RangeSet rangeResult;
392                                    for (uint32_t idx = 0; idx < size; idx++) {
393                                        int32_t uc = set.charAt(idx);
394                                        RangeSet curRange(uc);
395                                        rangeResult.Insert(curRange);
396                                    }
397                                    rangeOp.InsertOpCode(&buffer_, rangeResult);
398                                    break;
399                                }
400                            }
401                            if (atomValue <= UINT16_MAX) {
402                                CharOpCode charOp;
403                                charOp.EmitOpCode(&buffer_, atomValue);
404                            } else {
405                                Char32OpCode charOp;
406                                charOp.EmitOpCode(&buffer_, atomValue);
407                            }
408                            if (isBackward) {
409                                prevOp.EmitOpCode(&buffer_, 0);
410                            }
411                        }
412                        break;
413                    }
414                }
415                break;
416            }
417            case '(': {
418                Advance();
419                isAtom = ParseAssertionCapture(&captureIndex, isBackward);
420                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
421                Advance();
422                break;
423            }
424            case '.': {
425                PrevOpCode prevOp;
426                if (isBackward) {
427                    prevOp.EmitOpCode(&buffer_, 0);
428                }
429                if (IsDotAll()) {
430                    AllOpCode allOp;
431                    allOp.EmitOpCode(&buffer_, 0);
432                } else {
433                    DotsOpCode dotsOp;
434                    dotsOp.EmitOpCode(&buffer_, 0);
435                }
436                if (isBackward) {
437                    prevOp.EmitOpCode(&buffer_, 0);
438                }
439                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
440                PrintF("Atom %c match any \n", c0_);
441                isAtom = true;
442                Advance();
443                break;
444            }
445            case '[': {
446                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
447                PrintF("Atom %c match range \n", c0_);
448                isAtom = true;
449                PrevOpCode prevOp;
450                Advance();
451                if (isBackward) {
452                    prevOp.EmitOpCode(&buffer_, 0);
453                }
454                bool isInvert = false;
455                if (c0_ == '^') {
456                    isInvert = true;
457                    Advance();
458                }
459                RangeSet rangeResult;
460                if (!ParseClassRanges(&rangeResult)) {
461                    break;
462                }
463                if (isInvert) {
464                    rangeResult.Invert(IsUtf16());
465                }
466                uint32_t highValue = rangeResult.HighestValue();
467                if (highValue <= UINT16_MAX) {
468                    RangeOpCode rangeOp;
469                    rangeOp.InsertOpCode(&buffer_, rangeResult);
470                } else {
471                    Range32OpCode rangeOp;
472                    rangeOp.InsertOpCode(&buffer_, rangeResult);
473                }
474
475                if (isBackward) {
476                    prevOp.EmitOpCode(&buffer_, 0);
477                }
478                break;
479            }
480            case '*':
481            case '+':
482            case '?':
483                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
484                ParseError("nothing to repeat");
485                return;
486            case '{': {
487                uint8_t *begin = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488                int dummy;
489                if (ParserIntervalQuantifier(&dummy, &dummy)) {
490                    ParseError("nothing to repeat");
491                    return;
492                }
493                pc_ = begin;
494                Advance();
495            }
496                [[fallthrough]];
497            case '}':
498            case ']':
499                if (IsUtf16()) {
500                    ParseError("syntax error");
501                    return;
502                }
503                [[fallthrough]];
504            default: {
505                // PatternCharacter
506                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
507                PrintF("PatternCharacter %c\n", c0_);
508                isAtom = true;
509                {
510                    PrevOpCode prevOp;
511                    if (isBackward) {
512                        prevOp.EmitOpCode(&buffer_, 0);
513                    }
514                    uint32_t matchedChar = c0_;
515                    if (c0_ > (INT8_MAX + 1)) {
516                        Prev();
517                        UChar32 c;
518                        int32_t length = end_ - pc_ + 1;
519                        // NOLINTNEXTLINE(hicpp-signed-bitwise)
520                        auto unicodeChar = base::utf_helper::ConvertUtf8ToUnicodeChar(pc_, length);
521                        c = unicodeChar.first;
522                        matchedChar = static_cast<uint32_t>(c);
523                        pc_ += unicodeChar.second;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
524                    }
525                    if (IsIgnoreCase()) {
526                        matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
527                    }
528                    if (matchedChar > UINT16_MAX) {
529                        Char32OpCode charOp;
530                        charOp.EmitOpCode(&buffer_, matchedChar);
531                    } else {
532                        CharOpCode charOp;
533                        charOp.EmitOpCode(&buffer_, matchedChar);
534                    }
535                    if (isBackward) {
536                        prevOp.EmitOpCode(&buffer_, 0);
537                    }
538                }
539                Advance();
540                break;
541            }
542        }
543        if (isAtom && !isError_) {
544            ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
545        }
546        if (isBackward) {
547            size_t end = buffer_.GetSize();
548            size_t termSize = end - atomBcStart;
549            size_t moveSize = end - start;
550            buffer_.Expand(end + termSize);
551            if (memmove_s(buffer_.buf_ + start +  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
552                              termSize,           // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
553                          moveSize,
554                          buffer_.buf_ + start,  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
555                          moveSize) != EOK) {
556                LOG_FULL(FATAL) << "memmove_s failed";
557                UNREACHABLE();
558            }
559            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
560            if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
561                LOG_FULL(FATAL) << "memcpy_s failed";
562                UNREACHABLE();
563            }
564        }
565    }
566}
567
568int RegExpParser::FindGroupName(const CString &name)
569{
570    size_t len = 0;
571    size_t nameLen = name.size();
572    const char *p = reinterpret_cast<char *>(groupNames_.buf_);
573    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
574    const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
575    int captureIndex = 1;
576    while (p < bufEnd) {
577        len = strlen(p);
578        if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
579            return captureIndex;
580        }
581        p += len + 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
582        captureIndex++;
583    }
584    return -1;
585}
586
587bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
588{
589    bool isAtom = false;
590    do {
591        if (c0_ == '?') {
592            Advance();
593            switch (c0_) {
594                // (?=Disjunction[?U, ?N])
595                case '=': {
596                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
597                    PrintF("Assertion(?= Disjunction)\n");
598                    Advance();
599                    uint32_t start = buffer_.size_;
600                    ParseDisjunction(isBackward);
601                    MatchOpCode matchOp;
602                    matchOp.EmitOpCode(&buffer_, 0);
603                    MatchAheadOpCode matchAheadOp;
604                    uint32_t len = buffer_.size_ - start;
605                    matchAheadOp.InsertOpCode(&buffer_, start, len);
606                    break;
607                }
608                // (?!Disjunction[?U, ?N])
609                case '!': {
610                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
611                    PrintF("Assertion(?! Disjunction)\n");
612                    uint32_t start = buffer_.size_;
613                    Advance();
614                    ParseDisjunction(isBackward);
615                    MatchOpCode matchOp;
616                    matchOp.EmitOpCode(&buffer_, 0);
617                    NegativeMatchAheadOpCode matchAheadOp;
618                    uint32_t len = buffer_.size_ - start;
619                    matchAheadOp.InsertOpCode(&buffer_, start, len);
620                    break;
621                }
622                case '<': {
623                    Advance();
624                    // (?<=Disjunction[?U, ?N])
625                    if (c0_ == '=') {
626                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
627                        PrintF("Assertion(?<= Disjunction)\n");
628                        Advance();
629                        uint32_t start = buffer_.size_;
630                        ParseDisjunction(true);
631                        MatchOpCode matchOp;
632                        matchOp.EmitOpCode(&buffer_, 0);
633                        MatchAheadOpCode matchAheadOp;
634                        uint32_t len = buffer_.size_ - start;
635                        matchAheadOp.InsertOpCode(&buffer_, start, len);
636                        // (?<!Disjunction[?U, ?N])
637                    } else if (c0_ == '!') {
638                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
639                        PrintF("Assertion(?<! Disjunction)\n");
640                        Advance();
641                        uint32_t start = buffer_.size_;
642                        ParseDisjunction(true);
643                        MatchOpCode matchOp;
644                        matchOp.EmitOpCode(&buffer_, 0);
645                        NegativeMatchAheadOpCode matchAheadOp;
646                        uint32_t len = buffer_.size_ - start;
647                        matchAheadOp.InsertOpCode(&buffer_, start, len);
648                    } else {
649                        Prev();
650                        CString name;
651                        auto **pp = const_cast<const uint8_t **>(&pc_);
652                        if (!ParseGroupSpecifier(pp, name)) {
653                            ParseError("GroupName Syntax error.");
654                            return false;
655                        }
656                        if (FindGroupName(name) > 0) {
657                            ParseError("Duplicate GroupName error.");
658                            return false;
659                        }
660                        groupNames_.EmitStr(name.c_str());
661                        newGroupNames_.push_back(name);
662                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
663                        PrintF("group name %s", name.c_str());
664                        Advance();
665                        goto parseCapture;  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
666                    }
667                    break;
668                }
669                // (?:Disjunction[?U, ?N])
670                case ':':
671                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
672                    PrintF("Atom(?<: Disjunction)\n");
673                    isAtom = true;
674                    Advance();
675                    ParseDisjunction(isBackward);
676                    break;
677                default:
678                    Advance();
679                    ParseError("? Syntax error.");
680                    return false;
681            }
682            if (isError_) {
683                return false;
684            }
685        } else {
686            groupNames_.EmitChar(0);
687        parseCapture:
688            isAtom = true;
689            *captureIndex = captureCount_++;
690            SaveEndOpCode saveEndOp;
691            SaveStartOpCode saveStartOp;
692            if (isBackward) {
693                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
694            } else {
695                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
696            }
697            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
698            PrintF("capture start %d \n", *captureIndex);
699            ParseDisjunction(isBackward);
700            if (isError_) {
701                return false;
702            }
703            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
704            PrintF("capture end %d \n", *captureIndex);
705            if (isBackward) {
706                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
707            } else {
708                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
709            }
710        }
711    } while (c0_ != ')' && c0_ != KEY_EOF);
712    if (c0_ != ')') {
713        ParseError("capture syntax error");
714        return false;
715    }
716    return isAtom;
717}
718
719int RegExpParser::ParseDecimalDigits()
720{
721    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
722    PrintF("Parse DecimalDigits------\n");
723    uint32_t result = 0;
724    bool overflow = false;
725    while (true) {
726        if (c0_ < '0' || c0_ > '9') {
727            break;
728        }
729        if (!overflow) {
730            if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
731                overflow = true;
732            } else {
733                result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
734            }
735        }
736        Advance();
737    }
738    if (overflow) {
739        return INT32_MAX;
740    }
741    return result;
742}
743
744bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
745{
746    // Quantifier::
747    //     QuantifierPrefix
748    //     QuantifierPrefix?
749    // QuantifierPrefix::
750    // *
751    // +
752    // ?
753    // {DecimalDigits}
754    // {DecimalDigits,}
755    // {DecimalDigits,DecimalDigits}
756    Advance();
757    *pmin = ParseDecimalDigits();
758    *pmax = *pmin;
759    switch (c0_) {
760        case ',': {
761            Advance();
762            if (c0_ == '}') {
763                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
764                PrintF("QuantifierPrefix{DecimalDigits,}\n");
765                *pmax = INT32_MAX;
766                Advance();
767            } else {
768                *pmax = ParseDecimalDigits();
769                if (c0_ == '}') {
770                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
771                    PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
772                    Advance();
773                } else {
774                    return false;
775                }
776            }
777            break;
778        }
779        case '}':
780            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
781            PrintF("QuantifierPrefix{DecimalDigits}\n");
782            Advance();
783            break;
784        default:
785            Advance();
786            return false;
787    }
788    return true;
789}
790
// Parses an optional quantifier following an atom and rewrites the bytecode of
// that atom accordingly.
//   atomBcStart  - offset in buffer_ where the bytecode of the quantified atom starts
//   captureStart - passed to SaveResetOpCode together with captureEnd; a value of 0
//                  suppresses the save-reset op
//   captureEnd   - see captureStart
// Recognised prefixes are '*', '+', '?' and the brace form; a following '?' makes
// the quantifier non-greedy. Errors are reported through ParseError.
// isEmpty_ is cleared on every path that reaches the end of the function.
void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
{
    // -1 means "no quantifier seen"; both stay -1 when c0_ is not a quantifier.
    int min = -1;
    int max = -1;
    bool isGreedy = true;
    switch (c0_) {
        case '*':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("QuantifierPrefix %c\n", c0_);
            min = 0;
            max = INT32_MAX;
            Advance();
            break;
        case '+':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("QuantifierPrefix %c\n", c0_);
            min = 1;
            max = INT32_MAX;
            Advance();
            break;
        case '?':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("QuantifierPrefix %c\n", c0_);
            Advance();
            min = 0;
            max = 1;
            break;
        case '{': {
            // Remember the cursor so a malformed "{...}" can be re-read as ordinary text.
            uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            if (!ParserIntervalQuantifier(&min, &max)) {
                pc_ = start;
                Advance();  // back to '{'
                // Not an error: return without touching the bytecode (isEmpty_ is left as is).
                return;
            }
            if (min > max) {
                ParseError("Invalid repetition count");
                return;
            }
            break;
        }
        default:
            break;
    }
    if (c0_ == '?') {
        isGreedy = false;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
        PrintF("Quantifier::QuantifierPrefix?\n");
        Advance();
    // NOTE(review): the c0_ == '?' test below can never be true here because the
    // branch above already handled it.
    } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
        ParseError("nothing to repeat");
        return;
    }

    if (max == 0) {
        buffer_.size_ = atomBcStart; // Drop all unnecessary bytecode
    } else if (min != -1 && max != -1 && !isEmpty_) {
        bool isLoopOp = false;
        // SIZE_MAX means "no CheckChar op was emitted, nothing to patch later".
        size_t checkCharPara = SIZE_MAX;

        if (captureStart != 0) {
            SaveResetOpCode saveResetOp;
            saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
        }

        // zero advance check
        // Only added for unbounded repetition of an atom whose first opcode is not one
        // of the character/range/dot opcodes listed below.
        uint8_t firstOp = buffer_.GetU8(atomBcStart);
        if (max == INT32_MAX && firstOp != RegExpOpCode::OP_CHAR && firstOp != RegExpOpCode::OP_CHAR32 &&
                                firstOp != RegExpOpCode::OP_RANGE && firstOp != RegExpOpCode::OP_RANGE32 &&
                                firstOp != RegExpOpCode::OP_ALL && firstOp != RegExpOpCode::OP_DOTS &&
                                firstOp != RegExpOpCode::OP_SPARSE) {
            stackCount_++;
            PushCharOpCode pushCharOp;
            pushCharOp.InsertOpCode(&buffer_, atomBcStart);
            CheckCharOpCode checkCharOp;
            // Presumably the operand position of the CheckChar op (one byte past its
            // opcode); it is patched below once the following opcode is known.
            checkCharPara = buffer_.GetSize() + 1;
            // NOLINTNEXTLINE(readability-magic-numbers)
            checkCharOp.EmitOpCode(&buffer_, 0);
        }

        if (min <= 1 && max == INT32_MAX) {
            // Unbounded repetition: a split op placed after the atom jumps back to its start.
            if (checkCharPara != SIZE_MAX) {
                buffer_.PutU32(checkCharPara, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_SPLIT_NEXT)->GetSize());
            }
            if (isGreedy) {
                SplitFirstOpCode splitOp;
                splitOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - splitOp.GetSize());
            } else {
                SplitNextOpCode splitOp;
                splitOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - splitOp.GetSize());
            }
        } else if (max > 1) {
            // Counted repetition: a loop op carries min and max.
            if (checkCharPara != SIZE_MAX) {
                buffer_.PutU32(checkCharPara, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
            }
            if (isGreedy) {
                LoopGreedyOpCode loopOp;
                loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
                isLoopOp = true;
            } else {
                LoopOpCode loopOp;
                loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
                isLoopOp = true;
            }
        }

        if (min == 0) {
            // The atom is optional: a split inserted in front of it spans everything
            // emitted for the atom so far (buffer_.GetSize() - atomBcStart).
            if (isGreedy) {
                SplitNextOpCode splitNextOp;
                splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
            } else {
                SplitFirstOpCode splitFirstOp;
                splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
            }
        }
        if (isLoopOp) {
            // Loop ops are bracketed by a push before the atom and a pop after it.
            stackCount_++;
            PushOpCode pushOp;
            pushOp.InsertOpCode(&buffer_, atomBcStart);
            PopOpCode popOp;
            popOp.EmitOpCode(&buffer_);
        }
    }
    isEmpty_ = false;
}
915
916bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
917{
918    const uint8_t *p = *pp;
919    uint32_t c = 0;
920    char buffer[CACHE_SIZE] = {0};
921    char *q = buffer;
922    while (true) {
923        if (p <= end_) {
924            c = *p;
925        } else {
926            c = KEY_EOF;
927        }
928        if (c == '\\') {
929            p++;
930            if (*p != 'u') {
931                return false;
932            }
933            if (!ParseUnicodeEscape(&c)) {
934                return false;
935            }
936        } else if (c == '>') {
937            break;
938        } else if (c > CACHE_SIZE && c != KEY_EOF) {
939            c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
940        } else if (c != KEY_EOF) {
941            p++;
942        } else {
943            return false;
944        }
945        if (q == buffer) {
946            if (!IsIdentFirst(c)) {
947                return false;
948            }
949        } else {
950            if (!u_isIDPart(c)) {
951                return false;
952            }
953        }
954        if (q != nullptr) {
955            *q++ = c;
956        }
957    } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
958    p++;
959    *pp = p;
960    name = buffer;
961    return true;
962}
963
964int RegExpParser::ParseCaptureCount(const char *groupName)
965{
966    const uint8_t *p = nullptr;
967    int captureIndex = 1;
968    CString name;
969    hasNamedCaptures_ = 0;
970    for (p = base_; p < end_; p++) {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
971        switch (*p) {
972            case '(': {
973                if (p[1] == '?') {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
974                    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
975                    if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
976                        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
977                        p[CAPTURE_CONUT_ADVANCE] != '=') {
978                        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
979                        hasNamedCaptures_ = 1;
980                        p += CAPTURE_CONUT_ADVANCE;
981                        if (groupName != nullptr) {
982                            if (ParseGroupSpecifier(&p, name)) {
983                                if (strcmp(name.c_str(), groupName) == 0) {
984                                    return captureIndex;
985                                }
986                            }
987                        }
988                        captureIndex++;
989                    }
990                } else {
991                    captureIndex++;
992                }
993                break;
994            }
995            case '\\':
996                p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
997                break;
998            case '[': {
999                while (p < end_ && *p != ']') {
1000                    if (*p == '\\') {
1001                        p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1002                    }
1003                    p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1004                }
1005                break;
1006            }
1007            default:
1008                break;
1009        }
1010    }
1011    return captureIndex;
1012}
1013
1014// NOLINTNEXTLINE(readability-function-size)
1015int RegExpParser::ParseAtomEscape(bool isBackward)
1016{
1017    // AtomEscape[U, N]::
1018    //     DecimalEscape
1019    //     CharacterClassEscape[?U]
1020    //     CharacterEscape[?U]
1021    //     [+N]kGroupName[?U]
1022    int result = -1;
1023    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1024    PrintF("Parse AtomEscape------\n");
1025    PrevOpCode prevOp;
1026    switch (c0_) {
1027        case KEY_EOF:
1028            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1029            ParseError("unexpected end");
1030            break;
1031        // DecimalEscape
1032        case '1':
1033        case '2':
1034        case '3':
1035        case '4':
1036        case '5':
1037        case '6':
1038        case '7':
1039        case '8':
1040        case '9': {
1041            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1042            PrintF("NonZeroDigit %c\n", c0_);
1043            int capture = ParseDecimalDigits();
1044            if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
1045                ParseError("invalid backreference count");
1046                break;
1047            }
1048            if (isBackward) {
1049                BackwardBackReferenceOpCode backReferenceOp;
1050                backReferenceOp.EmitOpCode(&buffer_, capture);
1051            } else {
1052                BackReferenceOpCode backReferenceOp;
1053                backReferenceOp.EmitOpCode(&buffer_, capture);
1054            }
1055            break;
1056        }
1057        // CharacterClassEscape
1058        case 'd': {
1059            // [0-9]
1060            RangeOpCode rangeOp;
1061            if (isBackward) {
1062                prevOp.EmitOpCode(&buffer_, 0);
1063            }
1064            rangeOp.InsertOpCode(&buffer_, g_rangeD);
1065            goto parseLookBehind;
1066        }
1067        case 'D': {
1068            // [^0-9]
1069            RangeSet atomRange(g_rangeD);
1070            atomRange.Invert(IsUtf16());
1071            Range32OpCode rangeOp;
1072            if (isBackward) {
1073                prevOp.EmitOpCode(&buffer_, 0);
1074            }
1075            rangeOp.InsertOpCode(&buffer_, atomRange);
1076            goto parseLookBehind;
1077        }
1078        case 's': {
1079            // [\f\n\r\t\v]
1080            RangeOpCode rangeOp;
1081            if (isBackward) {
1082                prevOp.EmitOpCode(&buffer_, 0);
1083            }
1084            rangeOp.InsertOpCode(&buffer_, g_rangeS);
1085            goto parseLookBehind;
1086        }
1087        case 'S': {
1088            RangeSet atomRange(g_rangeS);
1089            Range32OpCode rangeOp;
1090            atomRange.Invert(IsUtf16());
1091            if (isBackward) {
1092                prevOp.EmitOpCode(&buffer_, 0);
1093            }
1094            rangeOp.InsertOpCode(&buffer_, atomRange);
1095            goto parseLookBehind;
1096        }
1097        case 'w': {
1098            // [A-Za-z0-9]
1099            RangeOpCode rangeOp;
1100            if (isBackward) {
1101                prevOp.EmitOpCode(&buffer_, 0);
1102            }
1103            rangeOp.InsertOpCode(&buffer_, g_rangeW);
1104            goto parseLookBehind;
1105        }
1106        case 'W': {
1107            // [^A-Za-z0-9]
1108            RangeSet atomRange(g_rangeW);
1109            atomRange.Invert(IsUtf16());
1110            Range32OpCode rangeOp;
1111            if (isBackward) {
1112                prevOp.EmitOpCode(&buffer_, 0);
1113            }
1114            rangeOp.InsertOpCode(&buffer_, atomRange);
1115            goto parseLookBehind;
1116        }
1117        case 'P':
1118        case 'p': {
1119            //CharacterClassStrings
1120            RangeSet atomRange;
1121            Range32OpCode rangeOp;
1122            ParseClassEscape(&atomRange);
1123            if (isBackward) {
1124                prevOp.EmitOpCode(&buffer_, 0);
1125            }
1126            rangeOp.InsertOpCode(&buffer_, atomRange);
1127            break;
1128        }
1129        // [+N]kGroupName[?U]
1130        case 'k': {
1131            Advance();
1132            if (c0_ != '<') {
1133                if (!IsUtf16() || HasNamedCaptures()) {
1134                    ParseError("expecting group name.");
1135                    break;
1136                }
1137            }
1138            Advance();
1139            Prev();
1140            CString name;
1141            auto **pp = const_cast<const uint8_t **>(&pc_);
1142            if (!ParseGroupSpecifier(pp, name)) {
1143                ParseError("GroupName Syntax error.");
1144                break;
1145            }
1146            int postion = FindGroupName(name);
1147            if (postion < 0) {
1148                postion = ParseCaptureCount(name.c_str());
1149                if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1150                    ParseError("group name not defined");
1151                    break;
1152                }
1153            }
1154            if (isBackward) {
1155                BackwardBackReferenceOpCode backReferenceOp;
1156                backReferenceOp.EmitOpCode(&buffer_, postion);
1157            } else {
1158                BackReferenceOpCode backReferenceOp;
1159                backReferenceOp.EmitOpCode(&buffer_, postion);
1160            }
1161            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1162            Advance();
1163            break;
1164        }
1165        parseLookBehind: {
1166            if (isBackward) {
1167                prevOp.EmitOpCode(&buffer_, 0);
1168            }
1169            Advance();
1170            break;
1171        }
1172        default:
1173            result = ParseCharacterEscape();
1174            break;
1175    }
1176    return result;
1177}
1178
1179int RegExpParser::RecountCaptures()
1180{
1181    if (totalCaptureCount_ < 0) {
1182        const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1183        totalCaptureCount_ = ParseCaptureCount(name);
1184    }
1185    return totalCaptureCount_;
1186}
1187bool RegExpParser::HasNamedCaptures()
1188{
1189    if (hasNamedCaptures_ < 0) {
1190        RecountCaptures();
1191    }
1192    return false;
1193}
1194
// Parses a character escape (the text after the backslash) and returns the
// character value it denotes. The escape is consumed from the input.
// IsUtf16() selects the strict behaviour: invalid escapes raise ParseError
// instead of being taken literally.
// NOTE(review): the error paths are not uniform - some return -1 and others 0
// after calling ParseError, so callers cannot rely on the return value alone to
// detect a failure.
int RegExpParser::ParseCharacterEscape()
{
    // CharacterEscape[U]::
    //     ControlEscape
    //     c ControlLetter
    //     0 [lookahead ? DecimalDigit]
    //     HexEscapeSequence
    //     RegExpUnicodeEscapeSequence[?U]
    //     IdentityEscape[?U]
    uint32_t result = 0;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    switch (c0_) {
        // ControlEscape
        case 'f':
            result = '\f';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 'n':
            result = '\n';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 'r':
            result = '\r';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 't':
            result = '\t';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 'v':
            result = '\v';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        // c ControlLetter
        case 'c': {
            Advance();
            if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("ControlLetter %c\n", c0_);
                // The control character is the low five bits of the letter.
                result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINTNEXTLINE(readability-magic-numbers)
                Advance();
            } else {
                if (!IsUtf16()) {
                    // Not a control letter: rewind the cursor by one byte and yield a
                    // literal backslash.
                    pc_--;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    result = '\\';
                } else {
                    ParseError("Invalid control letter");
                    return -1;
                }
            }
            break;
        }
        case '0': {
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("CharacterEscape 0 [lookahead ? DecimalDigit]\n");
            // In the strict mode "\0" not followed by a digit is the NUL character.
            if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINTNEXTLINE(readability-magic-numbers)
                Advance();
                result = 0;
                break;
            }
            // Otherwise handled together with the octal digits below.
            [[fallthrough]];
        }
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7': {
            if (IsUtf16()) {
                // With /u, decimal escape is not interpreted as octal character code.
                ParseError("Invalid class escape");
                return 0;
            }
            result = ParseOctalLiteral();
            break;
        }
        // ParseHexEscapeSequence
        // ParseRegExpUnicodeEscapeSequence
        case 'x': {
            Advance();
            if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
                return result;
            }
            if (IsUtf16()) {
                ParseError("Invalid class escape");
                return -1;
            }
            // No valid hex digits followed: treat "\x" as the letter 'x'.
            result = 'x';
            break;
        }
        case 'u': {
            Advance();
            if (ParseUnicodeEscape(&result)) {
                return result;
            }
            if (IsUtf16()) {
                // With /u, invalid escapes are not treated as identity escapes.
                ParseError("Invalid unicode escape");
                return 0;
            }
            // If \u is not followed by a two-digit hexadecimal, treat it
            // as an identity escape.
            result = 'u';
            break;
        }
        // IdentityEscape[?U]
        case '$':
        case '(':
        case ')':
        case '*':
        case '+':
        case '.':
        case '/':
        case '?':
        case '[':
        case '\\':
        case ']':
        case '^':
        case '{':
        case '|':
        case '}':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("IdentityEscape %c\n", c0_);
            result = c0_;
            Advance();
            break;
        default: {
            if (IsUtf16()) {
                ParseError("Invalid unicode escape");
                return 0;
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("SourceCharacter %c\n", c0_);
            result = c0_;
            if (result < CHAR_MAXS) {
                Advance();
            } else {
                // Value at or above CHAR_MAXS: step back and decode the UTF-8 sequence
                // from the cursor, then advance past all the bytes it used.
                Prev();
                const uint8_t *p = pc_;
                result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
                int offset = static_cast<int>(p - pc_);
                Advance(offset + 1);
            }
            break;
        }
    }
    return static_cast<int>(result);
}
1354
1355bool RegExpParser::ParseClassRanges(RangeSet *result)
1356{
1357    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1358    PrintF("Parse ClassRanges------\n");
1359    while (c0_ != ']') {
1360        RangeSet s1;
1361        bool needInter = false;
1362        uint32_t c1 = ParseClassAtom(&s1);
1363        if (c1 == UINT32_MAX) {
1364            ParseError("invalid class range");
1365            return false;
1366        }
1367        needInter = NeedIntersection(c1);
1368        int next_c0 = *pc_;
1369        if (c0_ == '-' && next_c0 != ']') {
1370            if (c1 == CLASS_RANGE_BASE) {
1371                if (IsUtf16()) {
1372                    ParseError("invalid class range");
1373                    return false;
1374                }
1375                result->Insert(s1);
1376                continue;
1377            }
1378            Advance();
1379            RangeSet s2;
1380            uint32_t c2 = ParseClassAtom(&s2);
1381            if (c2 == UINT32_MAX) {
1382                ParseError("invalid class range");
1383                return false;
1384            }
1385            if (c2 == CLASS_RANGE_BASE) {
1386                if (IsUtf16()) {
1387                    ParseError("invalid class range");
1388                    return false;
1389                }
1390                result->Insert(s2);
1391                continue;
1392            }
1393            if (c1 < INT8_MAX) {
1394                if (c1 > c2) {
1395                    ParseError("invalid class range");
1396                    return false;
1397                }
1398            }
1399            needInter = NeedIntersection(c2);
1400            result->Insert(c1, c2);
1401            if (IsIgnoreCase() && needInter) {
1402                ProcessIntersection(result);
1403            }
1404        } else {
1405            result->Insert(s1);
1406            if (!(IsIgnoreCase() && needInter)) {
1407                continue;
1408            }
1409            if (c1 <= 'z' && c1 >= 'a') {
1410                result->Insert(RangeSet(c1 - 'a' + 'A'));
1411            } else {
1412                result->Insert(RangeSet(c1 - 'A' + 'a'));
1413            }
1414        }
1415    }
1416    Advance();
1417    return true;
1418}
1419
1420uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1421{
1422    uint32_t ret = UINT32_MAX;
1423    switch (c0_) {
1424        case '\\': {
1425            Advance();
1426            ret = static_cast<uint32_t>(ParseClassEscape(atom));
1427            break;
1428        }
1429        case KEY_EOF:
1430            break;
1431        case 0: {
1432            if (pc_ >= end_) {
1433                return UINT32_MAX;
1434            }
1435            [[fallthrough]];
1436        }
1437        default: {
1438            uint32_t value = c0_;
1439            size_t u16_size = 0;
1440            if (c0_ > INT8_MAX) {  // NOLINTNEXTLINE(readability-magic-numbers)
1441                pc_ -= 1;          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1442                auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1443                value = u16_result.first;
1444                u16_size = u16_result.second;
1445                Advance(u16_size + 1);
1446            } else {
1447                Advance();
1448            }
1449            atom->Insert(RangeSet(value));
1450            ret = value;
1451            break;
1452        }
1453    }
1454    return ret;
1455}
1456
1457int RegExpParser::ParseClassEscape(RangeSet *atom)
1458{
1459    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1460    PrintF("Parse ClassEscape------\n");
1461    int result = -1;
1462    switch (c0_) {
1463        case 'b':
1464            Advance();
1465            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1466            PrintF("ClassEscape %c", 'b');
1467            result = '\b';
1468            atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1469            break;
1470        case '-':
1471            Advance();
1472            result = '-';
1473            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1474            PrintF("ClassEscape %c", '-');
1475            atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1476            break;
1477        // CharacterClassEscape
1478        case 'd':
1479        case 'D':
1480            result = CLASS_RANGE_BASE;
1481            atom->Insert(g_rangeD);
1482            if (c0_ == 'D') {
1483                atom->Invert(IsUtf16());
1484            }
1485            Advance();
1486            break;
1487        case 's':
1488        case 'S':
1489            result = CLASS_RANGE_BASE;
1490            atom->Insert(g_rangeS);
1491            if (c0_ == 'S') {
1492                atom->Invert(IsUtf16());
1493            }
1494            Advance();
1495            break;
1496        case 'w':
1497        case 'W':
1498            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1499            PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1500            result = CLASS_RANGE_BASE;
1501            atom->Insert(g_rangeW);
1502            if (c0_ == 'W') {
1503                atom->Invert(IsUtf16());
1504            }
1505            Advance();
1506            break;
1507        case 'P':
1508        case 'p': {
1509            bool negate = (c0_ == 'P');
1510            CString propertyName;
1511            CString valueName;
1512            if (!ParseUnicodePropertyValueCharacters(propertyName, valueName) ||
1513                !ParseUnicodePropertyClassRange(propertyName, valueName, atom, negate)) {
1514                char *originExpression = (char *)base_;
1515                CString expression(originExpression);
1516                CString msg = "Invalid regular expression :" + expression;
1517                ParseError(msg.c_str());
1518            }
1519            result = CLASS_RANGE_BASE;
1520            break;
1521        }
1522        default:
1523            result = ParseCharacterEscape();
1524            int value = result;
1525            if (IsIgnoreCase()) {
1526                value = Canonicalize(value, IsUtf16());
1527            }
1528            atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1529            break;
1530    }
1531    return result;
1532}
1533
1534bool RegExpParser::ParseUnicodePropertyValueCharacters(CString &propertyName, CString &valueName)
1535{
1536    Advance();
1537    if (c0_ == '{') {
1538        if (!GetUnicodePropertyName(propertyName)) {
1539            return false;
1540        }
1541
1542        if (!GetUnicodePropertyValueName(valueName)) {
1543            return false;
1544        }
1545    } else {
1546        return false;
1547    }
1548    Advance();
1549    return true;
1550}
1551
1552bool RegExpParser::GetUnicodePropertyName(CString &propertyName)
1553{
1554    Advance();
1555    while (c0_ != '}' && c0_ != '=') {
1556        if (IsUnicodePropertyValueCharacter(c0_)) {
1557            propertyName += c0_;
1558        } else {
1559            return false;
1560        }
1561        Advance();
1562    }
1563    return true;
1564}
1565
1566bool RegExpParser::GetUnicodePropertyValueName(CString &valueName)
1567{
1568    if (c0_ == '=') {
1569        Advance();
1570        while (c0_ != '}') {
1571            if (IsUnicodePropertyValueCharacter(c0_)) {
1572                valueName += c0_;
1573            } else {
1574                return false;
1575            }
1576            Advance();
1577        }
1578    }
1579    return true;
1580}
1581
1582// NOLINTNEXTLINE(cert-dcl50-cpp)
1583void RegExpParser::PrintF(const char *fmt, ...)
1584{
1585#ifndef _NO_DEBUG_
1586    va_list args;
1587    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1588    va_start(args, fmt);
1589    vprintf(fmt, args);
1590    va_end(args);
1591#else
1592    (void)fmt;
1593#endif
1594}
1595
1596void RegExpParser::ParseError(const char *errorMessage)
1597{
1598    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1599    PrintF("error: ");
1600    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1601    PrintF(errorMessage);
1602    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1603    PrintF("\n");
1604    SetIsError();
1605    size_t length = strlen(errorMessage) + 1;
1606    if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1607        LOG_FULL(FATAL) << "memcpy_s failed";
1608        UNREACHABLE();
1609    }
1610}
1611
1612int RegExpParser::IsIdentFirst(uint32_t c)
1613{
1614    if (c < CACHE_SIZE) {
1615        return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1616    } else {
1617        auto uchar = static_cast<UChar32>(c);
1618        return static_cast<int>(u_isIDStart(uchar));
1619    }
1620}
1621
1622int RegExpParser::Canonicalize(int c, bool isUnicode)
1623{
1624    if (c < TMP_BUF_SIZE) {  // NOLINTNEXTLINE(readability-magic-numbers)
1625        if (c >= 'a' && c <= 'z') {
1626            c = c - 'a' + 'A';
1627        }
1628    } else {
1629        int cur = c;
1630        if (isUnicode) {
1631            c = u_tolower(static_cast<UChar32>(c));
1632            if (c >= 'a' && c <= 'z') {
1633                c = cur;
1634            }
1635        } else {
1636            c = u_toupper(static_cast<UChar32>(c));
1637            if (c >= 'A' && c <= 'Z') {
1638                c = cur;
1639            }
1640        }
1641    }
1642    return c;
1643}
1644
1645bool RegExpParser::NeedIntersection(uint32_t c)
1646{
1647    return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A');
1648}
1649
1650void RegExpParser::DoParserStackOverflowCheck(const char *errorMessage)
1651{
1652    if (UNLIKELY(thread_->GetCurrentStackPosition() < thread_->GetStackLimit())) {
1653        LOG_ECMA(ERROR) << "Stack overflow! current:" << thread_->GetCurrentStackPosition() <<
1654            " limit:" << thread_->GetStackLimit();
1655        ParseError(errorMessage);
1656        return;
1657    }
1658}
1659
1660bool RegExpParser::ParseUnicodePropertyClassRange(CString &propertyName, CString &valueName,
1661                                                  RangeSet *atom, bool negate)
1662{
1663    const char *name = propertyName.c_str();
1664    if (valueName.size() == 0) {
1665        if (MatchUnicodeProperty(UCHAR_GENERAL_CATEGORY_MASK, name, atom, negate)) {
1666            return true;
1667        }
1668        if (MatchSepcialUnicodeProperty(propertyName, negate, atom)) {
1669            return true;
1670        }
1671        UProperty property = u_getPropertyEnum(name);
1672        if (!IsSupportedBinaryProperty(property)) {
1673            return false;
1674        }
1675        if (!IsExactPropertyAlias(name, property)) {
1676            return false;
1677        }
1678        if (negate && IsBinaryPropertyOfStrings(property)) {
1679            return false;
1680        }
1681        return MatchUnicodeProperty(property, negate ? "N" : "Y", atom, false);
1682    } else {
1683        UProperty property = u_getPropertyEnum(propertyName.c_str());
1684        if (property == UCHAR_GENERAL_CATEGORY) {
1685            property = UCHAR_GENERAL_CATEGORY_MASK;
1686        } else if (property != UCHAR_SCRIPT && property != UCHAR_SCRIPT_EXTENSIONS) {
1687            return false;
1688        }
1689        return MatchUnicodeProperty(property, valueName.c_str(), atom, negate);
1690    }
1691}
1692
1693bool RegExpParser::MatchUnicodeProperty(UProperty property, const char* propertyName, RangeSet *atom, bool negate)
1694{
1695    UProperty propertyForMatch = property;
1696    if (propertyForMatch == UCHAR_SCRIPT_EXTENSIONS) {
1697        propertyForMatch = UCHAR_SCRIPT;
1698    }
1699    int32_t propertyValue = u_getPropertyValueEnum(propertyForMatch, propertyName);
1700    if (propertyValue == UCHAR_INVALID_CODE) {
1701        return false;
1702    }
1703    if (!IsExactPropertyValueAlis(propertyName, propertyForMatch, propertyValue)) {
1704        return false;
1705    }
1706    UErrorCode ec = U_ZERO_ERROR;
1707    icu::UnicodeSet set;
1708    set.applyIntPropertyValue(property, propertyValue, ec);
1709    bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1710    if (success) {
1711        const bool caseFolding = IsIgnoreCase();
1712        if (negate) {
1713            set.complement();
1714        }
1715        if (caseFolding) {
1716            set.closeOver(USET_CASE_INSENSITIVE);
1717        }
1718        set.removeAllStrings();
1719        for (int i = 0; i < set.getRangeCount(); i++) {
1720            atom->Insert(set.getRangeStart(i),  set.getRangeEnd(i));
1721        }
1722    }
1723    return success;
1724}
1725
1726bool RegExpParser::IsExactPropertyValueAlis(const char *valueName, UProperty property, int32_t propertyValue)
1727{
1728    const char *shortName = u_getPropertyValueName(property, propertyValue, U_SHORT_PROPERTY_NAME);
1729    if (shortName != nullptr && strcmp(valueName, shortName) == 0) {
1730        return true;
1731    }
1732    int i = 0;
1733    bool flag = true;
1734    while (flag) {
1735        const char *longName = u_getPropertyValueName(property, propertyValue,
1736            static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1737        if (longName == nullptr) {
1738            flag = false;
1739            break;
1740        }
1741        if (strcmp(valueName, longName) == 0) {
1742            return true;
1743        }
1744        i++;
1745    }
1746    return false;
1747}
1748
1749bool RegExpParser::IsExactPropertyAlias(const char* propertyName, UProperty property)
1750{
1751    const char* shortName = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1752    if (shortName != nullptr && strcmp(propertyName, shortName) == 0) {
1753        return true;
1754    }
1755    int i = 0;
1756    bool flag = true;
1757    while (flag) {
1758        const char* longName = u_getPropertyName(property,
1759            static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1760        if (longName == nullptr) {
1761            flag = false;
1762            break;
1763        }
1764        if (strcmp(propertyName, longName) == 0) {
1765            return true;
1766        }
1767        i++;
1768    }
1769    return false;
1770}
1771
1772bool RegExpParser::MatchSepcialUnicodeProperty(CString &name, bool negate, RangeSet *atom)
1773{
1774    if (name == "Any") {
1775        if (!negate) {
1776            atom->Insert(0, 0x10FFFF);
1777        }
1778    } else if (name == "ASCII") {
1779        if (negate) {
1780            atom->Insert(0x80, 0x10FFFF);
1781        } else {
1782            atom->Insert(0x0, 0x7F);
1783        }
1784    } else if (name == "Assigned") {
1785        return MatchUnicodeProperty(UCHAR_GENERAL_CATEGORY, "Unassigned", atom, !negate);
1786    } else {
1787        return false;
1788    }
1789    return true;
1790}
1791
1792bool RegExpParser::IsSupportedBinaryProperty(UProperty property)
1793{
1794    switch (property) {
1795        case UCHAR_ALPHABETIC:
1796        case UCHAR_ASCII_HEX_DIGIT:
1797        case UCHAR_BIDI_CONTROL:
1798        case UCHAR_BIDI_MIRRORED:
1799        case UCHAR_DASH:
1800        case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1801        case UCHAR_DEPRECATED:
1802        case UCHAR_DIACRITIC:
1803        case UCHAR_JOIN_CONTROL:
1804        case UCHAR_IDS_TRINARY_OPERATOR:
1805        case UCHAR_IDS_BINARY_OPERATOR:
1806        case UCHAR_IDEOGRAPHIC:
1807        case UCHAR_S_TERM:
1808        case UCHAR_ID_START:
1809        case UCHAR_ID_CONTINUE:
1810        case UCHAR_HEX_DIGIT:
1811        case UCHAR_GRAPHEME_EXTEND:
1812        case UCHAR_GRAPHEME_BASE:
1813        case UCHAR_EXTENDER:
1814        case UCHAR_LOGICAL_ORDER_EXCEPTION:
1815        case UCHAR_LOWERCASE:
1816        case UCHAR_MATH:
1817        case UCHAR_NONCHARACTER_CODE_POINT:
1818        case UCHAR_QUOTATION_MARK:
1819        case UCHAR_RADICAL:
1820        case UCHAR_SOFT_DOTTED:
1821        case UCHAR_TERMINAL_PUNCTUATION:
1822        case UCHAR_UNIFIED_IDEOGRAPH:
1823        case UCHAR_UPPERCASE:
1824        case UCHAR_WHITE_SPACE:
1825        case UCHAR_XID_CONTINUE:
1826        case UCHAR_XID_START:
1827        case UCHAR_VARIATION_SELECTOR:
1828        case UCHAR_PATTERN_SYNTAX:
1829        case UCHAR_PATTERN_WHITE_SPACE:
1830        case UCHAR_CASED:
1831        case UCHAR_CASE_IGNORABLE:
1832        case UCHAR_CHANGES_WHEN_LOWERCASED:
1833        case UCHAR_CHANGES_WHEN_UPPERCASED:
1834        case UCHAR_CHANGES_WHEN_TITLECASED:
1835        case UCHAR_CHANGES_WHEN_CASEFOLDED:
1836        case UCHAR_CHANGES_WHEN_CASEMAPPED:
1837        case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1838        case UCHAR_REGIONAL_INDICATOR:
1839        case UCHAR_EMOJI:
1840        case UCHAR_EMOJI_PRESENTATION:
1841        case UCHAR_EMOJI_MODIFIER:
1842        case UCHAR_EMOJI_MODIFIER_BASE:
1843        case UCHAR_EMOJI_COMPONENT:
1844        case UCHAR_EXTENDED_PICTOGRAPHIC:
1845            return true;
1846        case UCHAR_BASIC_EMOJI:
1847        case UCHAR_EMOJI_KEYCAP_SEQUENCE:
1848        case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
1849        case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
1850        case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
1851        case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
1852        case UCHAR_RGI_EMOJI:
1853            return false;
1854        default:
1855            break;
1856    }
1857    return false;
1858}
1859
1860bool RegExpParser::IsBinaryPropertyOfStrings(UProperty property)
1861{
1862    switch (property) {
1863        case UCHAR_BASIC_EMOJI:
1864        case UCHAR_EMOJI_KEYCAP_SEQUENCE:
1865        case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
1866        case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
1867        case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
1868        case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
1869        case UCHAR_RGI_EMOJI:
1870            return true;
1871        default:
1872            break;
1873    }
1874    return false;
1875}
1876}  // namespace panda::ecmascript
1877