1/*
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "ecmascript/regexp/regexp_executor.h"
17
18namespace panda::ecmascript {
19using RegExpState = RegExpExecutor::RegExpState;
20using RegExpGlobalResult = builtins::RegExpGlobalResult;
21bool RegExpExecutor::Execute(const uint8_t *input, uint32_t lastIndex, uint32_t length, uint8_t *buf, bool isWideChar)
22{
23    DynChunk buffer(buf, chunk_);
24    input_ = const_cast<uint8_t *>(input);
25    inputEnd_ = const_cast<uint8_t *>(input + length * (isWideChar ? WIDE_CHAR_SIZE : CHAR_SIZE));
26    uint32_t size = buffer.GetU32(0);
27    nCapture_ = buffer.GetU32(RegExpParser::NUM_CAPTURE__OFFSET);
28    nStack_ = buffer.GetU32(RegExpParser::NUM_STACK_OFFSET);
29    flags_ = buffer.GetU32(RegExpParser::FLAGS_OFFSET);
30    prefilter_ = buffer.GetU32(RegExpParser::PREFILTER_OFFSET);
31    isWideChar_ = isWideChar;
32
33    uint32_t captureResultSize = sizeof(CaptureState) * nCapture_;
34    uint32_t stackSize = sizeof(uintptr_t) * nStack_;
35    stateStackLen_ = 0;
36    currentStack_ = 0;
37
38    if (captureResultSize != 0) {
39        if (captureResultList_ == nullptr) {
40            captureResultList_ = chunk_->NewArray<CaptureState>(nCapture_);
41        }
42        if (memset_s(captureResultList_, captureResultSize, 0, captureResultSize) != EOK) {
43            LOG_FULL(FATAL) << "memset_s failed";
44            UNREACHABLE();
45        }
46    }
47    if (stackSize != 0 && stack_ == nullptr) {
48        stack_ = chunk_->NewArray<uintptr_t>(nStack_);
49        if (memset_s(stack_, stackSize, 0, stackSize) != EOK) {
50            LOG_FULL(FATAL) << "memset_s failed";
51            UNREACHABLE();
52        }
53    }
54    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
55    SetCurrentPtr(input + lastIndex * (isWideChar ? WIDE_CHAR_SIZE : CHAR_SIZE));
56    SetCurrentPC(RegExpParser::OP_START_OFFSET);
57
58    // first split
59    if ((flags_ & RegExpParser::FLAG_STICKY) == 0) {
60        PushRegExpState(STATE_SPLIT, RegExpParser::OP_START_OFFSET);
61    }
62    return ExecuteInternal(buffer, size);
63}
64
65bool RegExpExecutor::MatchFailed(bool isMatched)
66{
67    if (isMatched) {
68        stateStackLen_ = 0;
69        return true;
70    }
71    while (stateStackLen_ > 0) {
72        // StateType::STATE_SPLIT or STATE_NEGATIVE_MATCH_AHEAD
73        if (PopRegExpState() <= StateType::STATE_NEGATIVE_MATCH_AHEAD) {
74            return false;
75        }
76    }
77    return true;
78}
79
// Interpreter loop for the compiled regexp program.
// Repeatedly reads the opcode at the current PC from `byteCode` and dispatches it until the PC
// reaches `pcEnd`, an OP_MATCH_END is executed (returns true), or a handler reports failure
// (returns false). Falling out of the loop because the PC reached `pcEnd` also returns true.
// NOLINTNEXTLINE(readability-function-size)
bool RegExpExecutor::ExecuteInternal(const DynChunk &byteCode, uint32_t pcEnd)
{
    while (GetCurrentPC() < pcEnd) {
        // first split
        // HandleFirstSplit is defined outside this view; a false result ends the run as "no match".
        if (!HandleFirstSplit()) {
            return false;
        }
        uint8_t opCode = byteCode.GetU8(GetCurrentPC());
        switch (opCode) {
            case RegExpOpCode::OP_DOTS:
            case RegExpOpCode::OP_ALL: {
                if (!HandleOpAll(opCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_CHAR32:
            case RegExpOpCode::OP_CHAR: {
                if (!HandleOpChar(byteCode, opCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_NOT_WORD_BOUNDARY:
            case RegExpOpCode::OP_WORD_BOUNDARY: {
                if (!HandleOpWordBoundary(opCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_LINE_START: {
                if (!HandleOpLineStart(opCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_LINE_END: {
                if (!HandleOpLineEnd(opCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_SAVE_START:
                HandleOpSaveStart(byteCode, opCode);
                break;
            case RegExpOpCode::OP_SAVE_END:
                HandleOpSaveEnd(byteCode, opCode);
                break;
            case RegExpOpCode::OP_GOTO: {
                // The 32-bit operand stored right after the opcode byte is handed to Advance.
                uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
                Advance(opCode, offset);
                break;
            }
            case RegExpOpCode::OP_MATCH: {
                ASSERT(stateStackLen_ > 0);
                // jump to match ahead
                // Scan from the top of the state stack downwards for the nearest (negative) match-ahead
                // entry; the scan stops at index 0 even if that entry is of another type.
                uint32_t ahead = stateStackLen_ - 1;
                auto stateStack = reinterpret_cast<RegExpState *>(stateStack_);
                while (ahead != 0 && stateStack[ahead].type_ != StateType::STATE_MATCH_AHEAD &&
                    stateStack[ahead].type_ != StateType::STATE_NEGATIVE_MATCH_AHEAD) {
                    --ahead;
                }
                bool isNegative = stateStack[ahead].type_ == StateType::STATE_NEGATIVE_MATCH_AHEAD;
                // Pop every entry from the top down to and including index `ahead`. `isNegative` is
                // passed as PopRegExpState's copyCapture argument.
                while (stateStackLen_ > ahead) {
                    PopRegExpState(isNegative);
                }
                // For a negative match-ahead, run the failure path; if it reports that nothing is
                // left to resume from, the whole run fails.
                if (isNegative && MatchFailed(false)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_MATCH_END:
                return true;
            case RegExpOpCode::OP_SAVE_RESET:
                HandleOpSaveReset(byteCode, opCode);
                break;
            case RegExpOpCode::OP_SPLIT_NEXT:
            case RegExpOpCode::OP_MATCH_AHEAD:
            case RegExpOpCode::OP_NEGATIVE_MATCH_AHEAD:
                HandleOpMatch(byteCode, opCode);
                break;
            case RegExpOpCode::OP_SPLIT_FIRST:
                HandleOpSplitFirst(byteCode, opCode);
                break;
            case RegExpOpCode::OP_PREV: {
                if (!HandleOpPrev(opCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_LOOP_GREEDY:
            case RegExpOpCode::OP_LOOP:
                HandleOpLoop(byteCode, opCode);
                break;
            case RegExpOpCode::OP_PUSH_CHAR: {
                // Record a STATE_PUSH entry, then push the current input position onto the value stack.
                PushRegExpState(StateType::STATE_PUSH, 0, 0);
                PushStack(reinterpret_cast<uintptr_t>(GetCurrentPtr()));
                Advance(opCode);
                break;
            }
            case RegExpOpCode::OP_CHECK_CHAR: {
                // If the top state is the STATE_PUSH entry, drop it; otherwise record a STATE_POP
                // entry carrying the value that is about to be popped from the value stack.
                if (stateStackLen_ > 0 && PeekRegExpState()->type_ == StateType::STATE_PUSH) {
                    DropRegExpState();
                } else {
                    ASSERT(currentStack_ > 0);
                    PushRegExpState(StateType::STATE_POP, 0, stack_[currentStack_ - 1]);
                }
                // Compare the popped position with the current one: when they differ, step to the
                // next instruction; when they are equal, hand the 32-bit operand to Advance instead.
                if (PopStack() != reinterpret_cast<uintptr_t>(GetCurrentPtr())) {
                    Advance(opCode);
                } else {
                    uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
                    Advance(opCode, offset);
                }
                break;
            }
            case RegExpOpCode::OP_PUSH: {
                // Record a STATE_PUSH entry, then push a zero onto the value stack.
                PushRegExpState(StateType::STATE_PUSH, 0, 0);
                PushStack(0);
                Advance(opCode);
                break;
            }
            case RegExpOpCode::OP_POP: {
                ASSERT(currentStack_ > 0);
                // Record the value being discarded in a STATE_POP entry before popping it.
                PushRegExpState(StateType::STATE_POP, 0, stack_[currentStack_ - 1]);
                PopStack();
                Advance(opCode);
                break;
            }
            case RegExpOpCode::OP_RANGE32: {
                if (!HandleOpRange32(byteCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_RANGE: {
                if (!HandleOpRange(byteCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_SPARSE: {
                if (!HandleOpSparse(byteCode)) {
                    return false;
                }
                break;
            }
            case RegExpOpCode::OP_BACKREFERENCE:
            case RegExpOpCode::OP_BACKWARD_BACKREFERENCE: {
                if (!HandleOpBackReference(byteCode, opCode)) {
                    return false;
                }
                break;
            }
            default:
                // Any opcode not listed above is treated as impossible.
                UNREACHABLE();
        }
    }
    // for loop match
    return true;
}
241
242void RegExpExecutor::DumpResult(std::ostream &out) const
243{
244    out << "captures:" << std::endl;
245    for (uint32_t i = 0; i < nCapture_; i++) {
246        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
247        CaptureState *captureState = &captureResultList_[i];
248        int32_t len = captureState->captureEnd - captureState->captureStart;
249        if ((captureState->captureStart != nullptr && captureState->captureEnd != nullptr) && (len >= 0)) {
250            out << i << ":\t" << CString(reinterpret_cast<const char *>(captureState->captureStart), len) << std::endl;
251        } else {
252            out << i << ":\t"
253                << "undefined" << std::endl;
254        }
255    }
256}
257
258void RegExpExecutor::GetResult(JSThread *thread)
259{
260    JSHandle<RegExpGlobalResult> matchResult(thread->GetCurrentEcmaContext()->GetRegExpGlobalResult());
261    matchResult->SetTotalCaptureCounts(thread, JSTaggedValue(nCapture_));
262    uint32_t firstIndex = RegExpGlobalResult::FIRST_CAPTURE_INDEX;
263    uint32_t availableCaptureSlot = matchResult->GetLength() - firstIndex;
264    uint32_t requiredLength =  nCapture_ * 2;
265    if (requiredLength > availableCaptureSlot) {
266        matchResult = RegExpGlobalResult::GrowCapturesCapacity(thread, matchResult, requiredLength + firstIndex);
267    }
268    for (uint32_t i = 0; i < nCapture_; i++) {
269        CaptureState *captureState = &captureResultList_[i];
270        int32_t len = captureState->captureEnd - captureState->captureStart;
271        if ((captureState->captureStart != nullptr && captureState->captureEnd != nullptr) && (len >= 0)) {
272            if (isWideChar_) {
273                matchResult->SetStartOfCaptureIndex(thread, i, JSTaggedValue(
274                    static_cast<int32_t>((captureState->captureStart - input_) / WIDE_CHAR_SIZE)));
275                matchResult->SetEndOfCaptureIndex(thread, i, JSTaggedValue(
276                    static_cast<int32_t>((captureState->captureEnd - input_) / WIDE_CHAR_SIZE)));
277            } else {
278                matchResult->SetStartOfCaptureIndex(thread, i, JSTaggedValue(
279                    static_cast<int32_t>(captureState->captureStart - input_)));
280                matchResult->SetEndOfCaptureIndex(thread, i, JSTaggedValue(
281                    static_cast<int32_t>(captureState->captureEnd - input_)));
282            }
283        } else {
284            // undefined
285            matchResult->SetStartOfCaptureIndex(thread, i, JSTaggedValue(0));
286            matchResult->SetEndOfCaptureIndex(thread, i, JSTaggedValue(-1));
287        }
288    }
289    uint32_t endIndex = currentPtr_ - input_;
290    if (isWideChar_) {
291        endIndex /= WIDE_CHAR_SIZE;
292    }
293    matchResult->SetEndIndex(thread, JSTaggedValue(endIndex));
294}
295
296void RegExpExecutor::PushRegExpState(StateType type, uint32_t pc)
297{
298    ReAllocStack(stateStackLen_ + 1);
299    auto state = reinterpret_cast<RegExpState *>(
300        stateStack_ +  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
301        stateStackLen_ * sizeof(RegExpState));
302    state->type_ = type;
303    state->currentPc_ = pc;
304    state->currentPtr_ = GetCurrentPtr();
305    stateStackLen_++;
306}
307
308void RegExpExecutor::PushRegExpState(StateType type, uint32_t pc, uintptr_t ptr)
309{
310    ReAllocStack(stateStackLen_ + 1);
311    auto state = reinterpret_cast<RegExpState *>(
312        stateStack_ + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
313        stateStackLen_ * sizeof(RegExpState));
314    state->type_ = type;
315    state->currentPc_ = pc;
316    state->currentPtr_ = reinterpret_cast<const uint8_t *>(ptr);
317    stateStackLen_++;
318}
319
320RegExpExecutor::StateType RegExpExecutor::PopRegExpState(bool copyCapture)
321{
322    if (stateStackLen_ != 0) {
323        auto state = PeekRegExpState();
324        stateStackLen_--;
325        switch (state->type_) {
326            case StateType::STATE_SPLIT:
327            case StateType::STATE_NEGATIVE_MATCH_AHEAD:
328            case StateType::STATE_MATCH_AHEAD:
329                SetCurrentPC(state->currentPc_);
330                SetCurrentPtr(state->currentPtr_);
331                break;
332            case StateType::STATE_SAVE:
333                if (copyCapture) {
334                    *(reinterpret_cast<const uint8_t **>(GetCaptureResultList()) + state->currentPc_) =
335                        state->currentPtr_;
336                }
337                break;
338            case StateType::STATE_PUSH:
339                PopStack();
340                break;
341            case StateType::STATE_POP:
342                PushStack((uintptr_t)state->currentPtr_);
343                break;
344            case StateType::STATE_SET:
345                SetStackValue((uintptr_t)state->currentPtr_);
346                break;
347            default:
348                UNREACHABLE();
349                break;
350        }
351        return state->type_;
352    }
353    return StateType::STATE_INVALID;
354}
355
356void RegExpExecutor::ReAllocStack(uint32_t stackLen)
357{
358    if (stackLen > stateStackSize_) {
359        ASSERT((static_cast<size_t>(stateStackSize_) * 2) <= static_cast<size_t>(UINT32_MAX)); // 2: double the size
360        uint32_t newStackSize = std::max(stateStackSize_ * 2, MIN_STACK_SIZE);  // 2: double the size
361        ASSERT((static_cast<size_t>(newStackSize) * static_cast<size_t>(sizeof(RegExpState))) <=
362            static_cast<size_t>(UINT32_MAX));
363        uint32_t stackByteSize = newStackSize * sizeof(RegExpState);
364        auto newStack = chunk_->NewArray<uint8_t>(stackByteSize);
365        if (memset_s(newStack, stackByteSize, 0, stackByteSize) != EOK) {
366            LOG_FULL(FATAL) << "memset_s failed";
367            UNREACHABLE();
368        }
369        if (stateStack_ != nullptr) {
370            auto stackSize = stateStackSize_ * sizeof(RegExpState);
371            if (memcpy_s(newStack, stackSize, stateStack_, stackSize) != EOK) {
372                return;
373            }
374        }
375        stateStack_ = newStack;
376        stateStackSize_ = newStackSize;
377    }
378}
379}  // namespace panda::ecmascript
380