1/*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef ECMASCRIPT_REGEXP_PARSER_H
17#define ECMASCRIPT_REGEXP_PARSER_H
18
19#include <cstdarg>
20#include <cstdio>
21#include <cstdint>
22#include "ecmascript/js_thread.h"
23#include "ecmascript/ecma_macros.h"
24#include "ecmascript/mem/chunk.h"
25#include "ecmascript/mem/c_containers.h"
26#include "ecmascript/mem/c_string.h"
27#include "ecmascript/mem/dyn_chunk.h"
28#include "ecmascript/regexp/regexp_opcode.h"
29#include "unicode/stringpiece.h"
30#include "unicode/uchar.h"
31#include "unicode/utf16.h"
32#include "unicode/utf8.h"
33#include "unicode/utypes.h"
34#include "unicode/udata.h"
35#include "unicode/uniset.h"
36
37namespace panda::ecmascript {
38class RegExpParser {
39public:
40    static constexpr auto FLAG_GLOBAL = (1U << 0U);
41    static constexpr auto FLAG_IGNORECASE = (1U << 1U);
42    static constexpr auto FLAG_MULTILINE = (1U << 2U);
43    static constexpr auto FLAG_DOTALL = (1U << 3U);
44    static constexpr auto FLAG_UTF16 = (1U << 4U);
45    static constexpr auto FLAG_STICKY = (1U << 5U);
46    static constexpr auto FLAG_HASINDICES = (1U << 6U);
47    static constexpr uint32_t FLAG_NUM = 7;
48    static const uint32_t KEY_EOF = UINT32_MAX;
49    static constexpr int CLASS_RANGE_BASE = 0x40000000;
50    static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
51    static constexpr uint32_t NUM_STACK_OFFSET = 8;
52    static constexpr uint32_t OCTAL_VALUE = 8;
53    static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
54    static constexpr uint32_t HEX_VALUE = 16;
55    static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10;
56    static constexpr uint32_t FLAGS_OFFSET = 12;
57    static constexpr uint32_t PREFILTER_OFFSET = 16;
58    static constexpr uint32_t OP_START_OFFSET = 20;
59    static constexpr uint32_t UNICODE_HEX_VALUE = 4;
60    static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
61    static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
62    static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
63    static constexpr size_t SPARSE_HEAD_OFFSET = 3;
64    static constexpr size_t SPARSE_OFF_OFFSET = 2;
65    static constexpr size_t SPARSE_MAX_OFFSET = 6;
66    static int Canonicalize(int c, bool isUnicode);
67
68    explicit RegExpParser(JSThread *thread, Chunk *chunk)
69        : thread_(thread),
70          base_(nullptr),
71          pc_(nullptr),
72          end_(nullptr),
73          flags_(0),
74          c0_(KEY_EOF),
75          captureCount_(0),
76          stackCount_(0),
77          isError_(false),
78          isEmpty_(false),
79          buffer_(chunk),
80          groupNames_(chunk)
81    {
82    }
83
84    ~RegExpParser()
85    {
86        Clear();
87    }
88
89    NO_COPY_SEMANTIC(RegExpParser);
90    NO_MOVE_SEMANTIC(RegExpParser);
91
92    inline void Init(char *source, size_t length, uint32_t flags)
93    {
94        pc_ = reinterpret_cast<uint8_t *>(source);
95        base_ = pc_;
96        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
97        end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
98        flags_ = flags;
99    }
100
101    void Parse();
102    void ParseDisjunction(bool isBackward);
103    void ParseAlternative(bool isBackward);
104    bool ParseAssertionCapture(int *captureIndex, bool isBackward);
105    void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
106    int ParseDecimalDigits();
107    int ParseAtomEscape(bool isBackward);
108    int ParseCharacterEscape();
109    bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
110    int ParseCaptureCount(const char *groupName);
111    bool ParseClassRanges(RangeSet *result);
112    void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
113    uint32_t ParseClassAtom(RangeSet *atom);
114    int ParseClassEscape(RangeSet *atom);
115    void ParseError(const char *errorMessage);
116    bool ParseUnicodePropertyValueCharacters(CString &categoryName, CString &valueName);
117    int FindGroupName(const CString &name);
118    uint32_t ParseOctalLiteral();
119    bool ParseHexEscape(int length, uint32_t *value);
120    bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
121    bool ParseUnicodeEscape(uint32_t *value);
122    bool ParserIntervalQuantifier(int *pmin, int *pmax);
123    bool HasNamedCaptures();
124    int ParseEscape(const uint8_t **pp, int isUtf16);
125    int RecountCaptures();
126    int IsIdentFirst(uint32_t c);
127    bool NeedIntersection(uint32_t c);
128    void DoParserStackOverflowCheck(const char *errorMessage);
129    bool MatchUnicodeProperty(UProperty property, const char *propertyName, RangeSet *atom, bool negate);
130    bool IsExactPropertyValueAlis(const char *valueName, UProperty property, int32_t propertyValue);
131    bool ParseUnicodePropertyClassRange(CString &propertyName, CString &valueName, RangeSet *atom, bool negate);
132    bool GetUnicodePropertyName(CString &propertyName);
133    bool GetUnicodePropertyValueName(CString &valueName);
134    bool IsExactPropertyAlias(const char *propertyName, UProperty property);
135    bool MatchSepcialUnicodeProperty(CString &name, bool negate, RangeSet *atom);
136    bool IsSupportedBinaryProperty(UProperty property);
137    bool IsBinaryPropertyOfStrings(UProperty property);
138    inline CVector<CString> GetGroupNames() const
139    {
140        return newGroupNames_;
141    }
142
143    inline size_t GetGroupNamesSize() const
144    {
145        return groupNames_.size_;
146    }
147
148    inline bool IsError() const
149    {
150        return isError_;
151    }
152
153    inline uint8_t *GetOriginBuffer() const
154    {
155        return buffer_.buf_;
156    }
157
158    inline size_t GetOriginBufferSize() const
159    {
160        return buffer_.size_;
161    }
162
163    inline CString GetErrorMsg() const
164    {
165        if (isError_) {
166            return CString(errorMsg_);
167        }
168        return CString("");
169    }
170
171    inline bool IsGlobal() const
172    {
173        return (flags_ & FLAG_GLOBAL) != 0;
174    }
175
176    inline bool IsIgnoreCase() const
177    {
178        return (flags_ & FLAG_IGNORECASE) != 0;
179    }
180
181    inline bool IsMultiline() const
182    {
183        return (flags_ & FLAG_MULTILINE) != 0;
184    }
185
186    inline bool IsDotAll() const
187    {
188        return (flags_ & FLAG_DOTALL) != 0;
189    }
190
191    inline bool IsUtf16() const
192    {
193        return (flags_ & FLAG_UTF16) != 0;
194    }
195
196    inline bool IsStick() const
197    {
198        return (flags_ & FLAG_STICKY) != 0;
199    }
200
201    inline bool IsUnicodePropertyValueCharacter(char c) const
202    {
203        if (c >= 'a' && c <= 'z') {
204            return true;
205        }
206        if (c >= 'A' && c <= 'Z') {
207            return true;
208        }
209        if (c >= '0' && c <= '9') {
210            return true;
211        }
212        return (c == '_');
213    }
214
215    inline static int GetcurrentCharNext(int c)
216    {
217        int cur = c;
218        c = u_tolower(static_cast<UChar32>(c));
219        if (c == cur) {
220            c = u_toupper(static_cast<UChar32>(c));
221        }
222        if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) &&
223            !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) {
224            c = cur;
225        }
226        return c;
227    }
228    inline static void ProcessIntersection(RangeSet *result)
229    {
230        RangeSet cr;
231        RangeSet cr1;
232        const uint32_t MINLOWERCHAR = 'a';
233        const uint32_t MAXLOWERCHAR = 'z' + 1;
234        const uint32_t MINUPPERCHAR = 'A';
235        const uint32_t MAXUPPERCHAR = 'Z' + 1;
236        // Range values for a and z + 1
237        cr.Insert(MINLOWERCHAR, MAXLOWERCHAR);
238        // Range values for A and Z + 1
239        cr.Insert(MINUPPERCHAR, MAXUPPERCHAR);
240        result->Inter(cr1, cr);
241        result->Insert(cr1);
242    }
243private:
244    friend class RegExpExecutor;
245    static constexpr int TMP_BUF_SIZE = 128;
246    void Clear()
247    {
248        base_ = nullptr;
249        pc_ = nullptr;
250        end_ = nullptr;
251        c0_ = KEY_EOF;
252        isError_ = false;
253        isEmpty_ = false;
254    }
255
256    void Advance()
257    {
258        if (pc_ <= end_) {
259            DoParserStackOverflowCheck("Advance stack overflow!");
260            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
261            c0_ = *pc_++;
262        } else {
263            c0_ = KEY_EOF;
264        }
265    }
266
267    void Advance(int offset)
268    {
269        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
270        pc_ += offset - 1;
271        Advance();
272    }
273
274    void Prev()
275    {
276        if (pc_ >= base_) {
277            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
278            c0_ = *pc_--;
279        } else {
280            c0_ = KEY_EOF;
281        }
282    }
283
284    void SetIsError()
285    {
286        isError_ = true;
287    }
288
289    void PrintF(const char *fmt, ...);
290    JSThread *thread_;
291    uint8_t *base_;
292    uint8_t *pc_;
293    uint8_t *end_;
294    uint32_t flags_;
295    uint32_t c0_;
296    int captureCount_;
297    int stackCount_;
298    bool isError_;
299    bool isEmpty_;
300    char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINTNEXTLINE(modernize-avoid-c-arrays)
301    int hasNamedCaptures_ = -1;
302    int totalCaptureCount_ = -1;
303    DynChunk buffer_;
304    DynChunk groupNames_;
305    CVector<CString> newGroupNames_;
306};
307}  // namespace panda::ecmascript
308#endif  // ECMASCRIPT_REGEXP_PARSER_H
309