1/**
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "regexp.h"
17
18#include <lexer/token/letters.h>
19#include <unicode/uchar.h>
20
21#include <iostream>
22
23namespace panda::es2panda::lexer {
24
25RegExpError::RegExpError(const std::string_view &m) : message(m) {}
26
27RegExp::RegExp(util::StringView p, util::StringView f, RegExpFlags reFlags) : patternStr(p), flagsStr(f), flags(reFlags)
28{
29}
30
31RegExpParser::RegExpParser(const RegExp &re, ArenaAllocator *allocator)
32    : re_(re), allocator_ {allocator}, iter_(re_.patternStr), capturingGroupCount_(0)
33{
34}
35
36bool RegExpParser::Unicode() const
37{
38    return (re_.flags & RegExpFlags::UNICODE) != 0;
39}
40
41char32_t RegExpParser::Peek() const
42{
43    return iter_.Peek();
44}
45
46char32_t RegExpParser::Next()
47{
48    return iter_.Next();
49}
50
51static bool IsDecimalDigit(char32_t cp)
52{
53    return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
54}
55
56static bool IsOctalDigit(char32_t cp)
57{
58    return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_7);
59}
60
61static bool IsHexDigit(char32_t cp)
62{
63    return IsDecimalDigit(cp) || (cp >= LEX_CHAR_LOWERCASE_A && cp <= LEX_CHAR_LOWERCASE_F) ||
64           (cp >= LEX_CHAR_UPPERCASE_A && cp <= LEX_CHAR_UPPERCASE_F);
65}
66
67static uint32_t DigitValue(char32_t cp)
68{
69    return (cp - LEX_CHAR_0);
70}
71
72static void ThrowError(const std::string_view &message)
73{
74    throw RegExpError(message);
75}
76
77static uint32_t HexValue(char32_t cp)
78{
79    if (IsDecimalDigit(cp)) {
80        return DigitValue(cp);
81    }
82
83    constexpr auto OFFSET = 10;
84
85    if (cp < LEX_CHAR_LOWERCASE_A) {
86        ASSERT(cp >= LEX_CHAR_UPPERCASE_A);
87        return cp - LEX_CHAR_UPPERCASE_A + OFFSET;
88    }
89
90    return (cp - LEX_CHAR_LOWERCASE_A + OFFSET);
91}
92
93void RegExpParser::ParsePattern()
94{
95    ParseDisjunction();
96
97    if (iter_.HasNext()) {
98        ThrowError("Invalid closing parenthesis");
99    }
100    ValidateNamedGroupReferences();
101}
102
103void RegExpParser::ParseDisjunction()
104{
105    while (true) {
106        ParseAlternatives();
107
108        if (Peek() != LEX_CHAR_VLINE) {
109            break;
110        }
111
112        Next();
113    };
114}
115
116void RegExpParser::ParseAlternative()
117{
118    switch (Peek()) {
119        case LEX_CHAR_BACKSLASH: {
120            Next();
121            char32_t cp = Peek();
122            if (cp == LEX_CHAR_LOWERCASE_B || cp == LEX_CHAR_UPPERCASE_B) {
123                /* assertion */
124                Next();
125                return;
126            }
127
128            ParseAtomEscape();
129            break;
130        }
131        case LEX_CHAR_CIRCUMFLEX:
132        case LEX_CHAR_DOLLAR_SIGN: {
133            /* assertion */
134            Next();
135            return;
136        }
137        case LEX_CHAR_LEFT_PAREN: {
138            Next();
139
140            if (Peek() != LEX_CHAR_QUESTION) {
141                ParseCapturingGroup();
142                break;
143            }
144
145            Next();  // eat '?'
146
147            char32_t cp = Next();
148            if (cp == LEX_CHAR_COLON) {
149                ParseNonCapturingGroup();
150                break;
151            }
152
153            if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
154                ParseAssertion();
155
156                if (Unicode()) {
157                    return;
158                }
159
160                break;
161            }
162
163            if (cp != LEX_CHAR_LESS_THAN) {
164                ThrowError("Invalid group");
165            }
166
167            cp = Peek();
168            if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
169                Next();
170                ParseAssertion();
171                return;
172            }
173
174            ParseNamedCapturingGroup();
175            break;
176        }
177        case LEX_CHAR_LEFT_SQUARE: {
178            Next();
179            ParseCharacterClass();
180            break;
181        }
182        case LEX_CHAR_DOT: {
183            Next();
184            break;
185        }
186        default: {
187            if (ParseBracedQuantifier()) {
188                ThrowError("Invalid quantifier, nothing to repeat");
189            }
190
191            if (!ParsePatternCharacter()) {
192                ThrowError("Invalid character");
193            }
194
195            break;
196        }
197    }
198
199    ParseQuantifier();
200}
201
202void RegExpParser::ParseAlternatives()
203{
204    while (true) {
205        switch (Peek()) {
206            case util::StringView::Iterator::INVALID_CP:
207            case LEX_CHAR_RIGHT_PAREN:
208            case LEX_CHAR_VLINE: {
209                return;
210            }
211            default: {
212                ParseAlternative();
213            }
214        }
215    }
216}
217
218void RegExpParser::ParseNonCapturingGroup()
219{
220    ParseDisjunction();
221
222    if (Peek() != LEX_CHAR_RIGHT_PAREN) {
223        ThrowError("Invalid non-capturing group");
224    }
225
226    Next();
227}
228
229void RegExpParser::ParseNamedCapturingGroup()
230{
231    util::StringView name = ParseIdent();
232
233    auto result = groupNames_.insert(name);
234    if (!result.second) {
235        ThrowError("Duplicate group name");
236    }
237
238    ParseCapturingGroup();
239}
240
241void RegExpParser::ParseCapturingGroup()
242{
243    capturingGroupCount_++;
244
245    ParseDisjunction();
246
247    if (Peek() != LEX_CHAR_RIGHT_PAREN) {
248        ThrowError("Invalid capturing group");
249    }
250
251    Next();
252}
253
254void RegExpParser::ParseAssertion()
255{
256    ParseDisjunction();
257
258    if (Peek() != LEX_CHAR_RIGHT_PAREN) {
259        ThrowError("Invalid assertion");
260    }
261
262    Next();
263}
264
265uint32_t RegExpParser::ParseControlEscape()
266{
267    char32_t cp = Peek();
268    if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
269        (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
270        if (Unicode()) {
271            ThrowError("Invalid control escape");
272        }
273
274        if (cp < LEX_CHAR_0 || cp > LEX_CHAR_9) {
275            return LEX_CHAR_LOWERCASE_C;
276        }
277    }
278
279    Next();
280    constexpr auto MODULO = 32;
281    return cp % MODULO;
282}
283
284char32_t RegExpParser::ParseClassAtom()
285{
286    char32_t cp = Next();
287    if (cp != LEX_CHAR_BACKSLASH) {
288        return cp;
289    }
290
291    cp = Peek();
292    if (cp == LEX_CHAR_0) {
293        if (!Unicode()) {
294            return ParseDecimalEscape();
295        }
296
297        Next();
298
299        if (IsDecimalDigit(Peek())) {
300            ThrowError("Invalid class escape");
301        }
302
303        return LEX_CHAR_NULL;
304    }
305
306    Next();
307
308    switch (cp) {
309        case LEX_CHAR_LOWERCASE_C: {
310            return ParseControlEscape();
311        }
312        case LEX_CHAR_LOWERCASE_X: {
313            return ParseHexEscape();
314        }
315        case LEX_CHAR_LOWERCASE_U: {
316            if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
317                return cp;
318            }
319
320            return ParseUnicodeEscape();
321        }
322        case LEX_CHAR_LOWERCASE_P:
323        case LEX_CHAR_UPPERCASE_P: {
324            if (!Unicode()) {
325                return cp;
326            }
327
328            ParseUnicodePropertyEscape();
329            [[fallthrough]];
330        }
331        case LEX_CHAR_LOWERCASE_D:
332        case LEX_CHAR_UPPERCASE_D:
333        case LEX_CHAR_LOWERCASE_S:
334        case LEX_CHAR_UPPERCASE_S:
335        case LEX_CHAR_LOWERCASE_W:
336        case LEX_CHAR_UPPERCASE_W: {
337            return std::numeric_limits<uint32_t>::max();
338        }
339        case LEX_CHAR_LOWERCASE_B: {
340            return LEX_CHAR_BS;
341        }
342        case LEX_CHAR_LOWERCASE_F: {
343            return LEX_CHAR_FF;
344        }
345        case LEX_CHAR_LOWERCASE_N: {
346            return LEX_CHAR_LF;
347        }
348        case LEX_CHAR_LOWERCASE_R: {
349            return LEX_CHAR_CR;
350        }
351        case LEX_CHAR_LOWERCASE_T: {
352            return LEX_CHAR_TAB;
353        }
354        case LEX_CHAR_LOWERCASE_V: {
355            return LEX_CHAR_VT;
356        }
357        case LEX_CHAR_MINUS: {
358            return cp;
359        }
360        default: {
361            if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
362                ThrowError("Invalid escape");
363            }
364
365            return cp;
366        }
367    }
368
369    return cp;
370}
371
372static bool IsClassEscape(uint32_t cp)
373{
374    return cp == std::numeric_limits<uint32_t>::max();
375}
376
377void RegExpParser::ParseCharacterClass()
378{
379    if (Peek() == LEX_CHAR_CIRCUMFLEX) {
380        Next();
381    }
382
383    while (true) {
384        if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
385            Next();
386            break;
387        }
388
389        uint32_t left = ParseClassAtom();
390
391        if (Peek() != LEX_CHAR_MINUS) {
392            continue;
393        }
394
395        Next();
396
397        if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
398            Next();
399            break;
400        }
401
402        uint32_t right = ParseClassAtom();
403        if ((IsClassEscape(left) || IsClassEscape(right))) {
404            if (Unicode()) {
405                ThrowError("Invalid character class");
406            }
407
408            continue;
409        }
410
411        if (left > right) {
412            ThrowError("Class range out of order");
413        }
414    }
415}
416
417bool RegExpParser::IsSyntaxCharacter(char32_t cp) const
418{
419    switch (cp) {
420        case LEX_CHAR_RIGHT_SQUARE:
421        case LEX_CHAR_LEFT_BRACE:
422        case LEX_CHAR_RIGHT_BRACE: {
423            if (!Unicode()) {
424                return false;
425            }
426
427            [[fallthrough]];
428        }
429        case LEX_CHAR_CIRCUMFLEX:
430        case LEX_CHAR_DOLLAR_SIGN:
431        case LEX_CHAR_BACKSLASH:
432        case LEX_CHAR_DOT:
433        case LEX_CHAR_ASTERISK:
434        case LEX_CHAR_PLUS:
435        case LEX_CHAR_QUESTION:
436        case LEX_CHAR_LEFT_PAREN:
437        case LEX_CHAR_RIGHT_PAREN:
438        case LEX_CHAR_LEFT_SQUARE:
439        case LEX_CHAR_VLINE: {
440            return true;
441        }
442        default: {
443            return false;
444        }
445    }
446}
447
448void RegExpParser::ParseAtomEscape()
449{
450    char32_t cp = Peek();
451    if (IsDecimalDigit(cp)) {
452        ParseDecimalEscape();
453        return;
454    }
455
456    Next();
457
458    switch (cp) {
459        case LEX_CHAR_LOWERCASE_X: {
460            if (Unicode()) {
461                ParseHexEscape();
462            }
463            break;
464        }
465        case LEX_CHAR_LOWERCASE_U: {
466            if (Unicode()) {
467                ParseUnicodeEscape();
468            }
469            break;
470        }
471        case LEX_CHAR_LOWERCASE_K: {
472            ParseNamedBackreference();
473            break;
474        }
475        /* ControlEscape */
476        case LEX_CHAR_LOWERCASE_F:
477        case LEX_CHAR_LOWERCASE_N:
478        case LEX_CHAR_LOWERCASE_R:
479        case LEX_CHAR_LOWERCASE_T:
480        case LEX_CHAR_LOWERCASE_V:
481        /* CharacterClassEscape */
482        case LEX_CHAR_LOWERCASE_D:
483        case LEX_CHAR_UPPERCASE_D:
484        case LEX_CHAR_LOWERCASE_S:
485        case LEX_CHAR_UPPERCASE_S:
486        case LEX_CHAR_LOWERCASE_W:
487        case LEX_CHAR_UPPERCASE_W: {
488            break;
489        }
490        case LEX_CHAR_LOWERCASE_P:
491        case LEX_CHAR_UPPERCASE_P: {
492            ParseUnicodePropertyEscape();
493            break;
494        }
495        case LEX_CHAR_LOWERCASE_C: {
496            cp = Peek();
497            if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
498                (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
499                ThrowError("Invalid control escape");
500            }
501
502            Next();
503            break;
504        }
505        default: {
506            /* IdentityEscape */
507            if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
508                ThrowError("Invalid escape");
509            }
510        }
511    }
512}
513
514uint32_t RegExpParser::ParseDecimalEscape()
515{
516    ASSERT(IsDecimalDigit(Peek()));
517
518    auto digitStart = iter_;
519    uint32_t decimalValue = DigitValue(Next());
520    if (decimalValue == 0) {
521        if (!IsDecimalDigit(Peek())) {
522            /* \0 */
523            return decimalValue;
524        }
525
526        if (Unicode()) {
527            ThrowError("Invalid decimal escape");
528        }
529
530        iter_ = digitStart;
531        return ParseLegacyOctalEscape();
532    }
533
534    constexpr auto MULTIPLIER = 10;
535
536    while (IsDecimalDigit(Peek())) {
537        uint32_t newValue = decimalValue * MULTIPLIER + DigitValue(Next());
538        if (newValue < decimalValue) {
539            ThrowError("Invalid decimal escape");
540        }
541
542        decimalValue = newValue;
543    }
544
545    if (decimalValue <= capturingGroupCount_) {
546        return decimalValue;
547    }
548
549    if (Unicode()) {
550        ThrowError("Invalid decimal escape");
551    }
552
553    iter_ = digitStart;
554
555    if (!IsOctalDigit(Peek())) {
556        /* \8 or \9 */
557        return DigitValue(Next());
558    }
559
560    return ParseLegacyOctalEscape();
561}
562
563uint32_t RegExpParser::ParseLegacyOctalEscape()
564{
565    ASSERT(IsOctalDigit(Peek()));
566    uint32_t octalValue = DigitValue(Next());
567
568    if (!IsOctalDigit(Peek())) {
569        return octalValue;
570    }
571
572    octalValue = octalValue * 8 + DigitValue(Next());
573
574    if (!IsOctalDigit(Peek())) {
575        return octalValue;
576    }
577
578    // 8 is to left shift octalValue by three bits
579    uint32_t newValue = octalValue * 8 + DigitValue(Peek());
580    constexpr uint32_t MAX_OCTAL_VALUE = 0xFF;
581
582    if (newValue <= MAX_OCTAL_VALUE) {
583        octalValue = newValue;
584        Next();
585    }
586
587    return octalValue;
588}
589
590uint32_t RegExpParser::ParseHexEscape()
591{
592    // two hexadecimal digits after x in the regular expression
593    char32_t digit = Next();
594    if (!IsHexDigit(digit)) {
595        ThrowError("Invalid hex escape");
596    }
597
598    constexpr auto MULTIPLIER = 16;
599    uint32_t cpValue = HexValue(digit) * MULTIPLIER;
600
601    digit = Next();
602    if (!IsHexDigit(digit)) {
603        ThrowError("Invalid hex escape");
604    }
605
606    cpValue += HexValue(digit);
607    return cpValue;
608}
609
610uint32_t RegExpParser::ParseUnicodeDigits()
611{
612    uint32_t value = 0;
613    uint32_t count = 4;
614
615    while (count--) {
616        char32_t digit = Next();
617        if (!IsHexDigit(digit)) {
618            ThrowError("Invalid Unicode escape");
619        }
620
621        constexpr auto MULTIPLIER = 16;
622        value = value * MULTIPLIER + HexValue(digit);
623    }
624
625    return value;
626}
627
628uint32_t RegExpParser::ParseUnicodeEscape()
629{
630    uint32_t value = 0;
631
632    if (Peek() == LEX_CHAR_LEFT_BRACE) {
633        Next();
634        if (!IsHexDigit(Peek())) {
635            ThrowError("Invalid Unicode escape");
636        }
637
638        while (IsHexDigit(Peek())) {
639            constexpr auto MULTIPLIER = 16;
640            value = value * MULTIPLIER + HexValue(Next());
641            constexpr uint32_t CODE_POINT_MAX = 0x10FFFF;
642
643            if (value > CODE_POINT_MAX) {
644                ThrowError("Invalid Unicode escape");
645            }
646        }
647
648        if (Peek() != LEX_CHAR_RIGHT_BRACE) {
649            ThrowError("Invalid Unicode escape");
650        }
651
652        Next();
653    } else {
654        value = ParseUnicodeDigits();
655        if (Unicode() && util::StringView::IsHighSurrogate(value)) {
656            auto pos = iter_;
657
658            if (Next() == LEX_CHAR_BACKSLASH && Next() == LEX_CHAR_LOWERCASE_U) {
659                uint32_t next = ParseUnicodeDigits();
660                if (util::StringView::IsLowSurrogate(next)) {
661                    return util::StringView::DecodeSurrogates(value, next);
662                }
663            }
664
665            iter_ = pos;
666        }
667    }
668
669    return value;
670}
671
672void RegExpParser::ParseUnicodePropertyEscape()
673{
674    if (!Unicode()) {
675        return;
676    }
677
678    if (Peek() != LEX_CHAR_LEFT_BRACE) {
679        ThrowError("Invalid Unicode property escape");
680    }
681
682    Next();
683
684    while (true) {
685        if (!iter_.HasNext()) {
686            ThrowError("Unterminated Unicode property escape");
687        }
688
689        char32_t ch = Next();
690        if (ch == LEX_CHAR_RIGHT_BRACE) {
691            break;
692        }
693
694        /* TODO(dbatyai): Parse and valide Unicode property names */
695    }
696}
697
698void RegExpParser::ParseNamedBackreference()
699{
700    if (groupNames_.empty()) {
701        /* Identity escape */
702        return;
703    }
704
705    if (Next() != LEX_CHAR_LESS_THAN) {
706        ThrowError("Invalid named backreference");
707    }
708
709    util::StringView name = ParseIdent();
710    namedGroupReferences_.insert(name);
711}
712
713void RegExpParser::ValidateNamedGroupReferences()
714{
715    for (auto& ref : namedGroupReferences_) {
716        auto result = groupNames_.find(ref);
717        if (result == groupNames_.end()) {
718            ThrowError("Invalid named capture referenced");
719        }
720    }
721}
722
723void RegExpParser::ParseQuantifier()
724{
725    switch (Peek()) {
726        case LEX_CHAR_ASTERISK:
727        case LEX_CHAR_PLUS:
728        case LEX_CHAR_QUESTION: {
729            Next();
730            break;
731        }
732        case LEX_CHAR_LEFT_BRACE: {
733            if (!ParseBracedQuantifier()) {
734                return;
735            }
736
737            break;
738        }
739        default: {
740            return;
741        }
742    }
743
744    if (Peek() == LEX_CHAR_QUESTION) {
745        Next();
746    }
747}
748
749bool RegExpParser::ParseBracedQuantifier()
750{
751    if (Peek() != LEX_CHAR_LEFT_BRACE) {
752        return false;
753    }
754
755    auto startPos = iter_;
756    Next();
757
758    if (!IsDecimalDigit(Peek())) {
759        iter_ = startPos;
760        return false;
761    }
762
763    uint32_t leftValue = 0;
764    constexpr auto MULTIPLIER = 10;
765
766    while (IsDecimalDigit(Peek())) {
767        uint32_t newValue = leftValue * MULTIPLIER + DigitValue(Next());
768        if (newValue < leftValue) {
769            leftValue = std::numeric_limits<uint32_t>::max();
770            continue;
771        }
772
773        leftValue = newValue;
774    }
775
776    if (Peek() == LEX_CHAR_COMMA) {
777        Next();
778    }
779
780    if (Peek() == LEX_CHAR_RIGHT_BRACE) {
781        Next();
782        return true;
783    }
784
785    if (IsDecimalDigit(Peek())) {
786        uint32_t rightValue = 0;
787        while (IsDecimalDigit(Peek())) {
788            uint32_t newValue = rightValue * MULTIPLIER + DigitValue(Next());
789            if (newValue < rightValue) {
790                rightValue = std::numeric_limits<uint32_t>::max();
791                continue;
792            }
793
794            rightValue = newValue;
795        }
796
797        if (Peek() == LEX_CHAR_RIGHT_BRACE) {
798            if (rightValue < leftValue) {
799                ThrowError("Quantifier range out of order");
800            }
801
802            Next();
803            return true;
804        }
805    }
806
807    iter_ = startPos;
808    return false;
809}
810
811bool RegExpParser::ParsePatternCharacter()
812{
813    char32_t cp = Peek();
814    if (IsSyntaxCharacter(cp)) {
815        return false;
816    }
817
818    Next();
819    return true;
820}
821
822static bool IsIdStart(uint32_t cp)
823{
824    auto uchar = static_cast<UChar32>(cp);
825    return u_isIDStart(uchar) != 0 || cp == LEX_CHAR_DOLLAR_SIGN || cp == LEX_CHAR_UNDERSCORE;
826}
827
828static bool IsIdCont(uint32_t cp)
829{
830    auto uchar = static_cast<UChar32>(cp);
831    return u_isIDPart(uchar) != 0 || cp == LEX_CHAR_DOLLAR_SIGN || cp == LEX_CHAR_ZWNJ || cp == LEX_CHAR_ZWJ;
832}
833
834util::StringView RegExpParser::ParseIdent()
835{
836    char32_t cp = Next();
837    if (cp == LEX_CHAR_BACKSLASH) {
838        if (Next() != LEX_CHAR_LOWERCASE_U) {
839            ThrowError("Invalid group name");
840        }
841
842        if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
843            ThrowError("Invalid Unicode escape");
844        }
845
846        cp = ParseUnicodeEscape();
847    }
848
849    if (!IsIdStart(cp)) {
850        ThrowError("Invalid group name");
851    }
852
853    util::UString ident(allocator_);
854    ident.Append(cp);
855
856    while (true) {
857        cp = Next();
858        if (cp == LEX_CHAR_GREATER_THAN) {
859            break;
860        }
861
862        if (cp == LEX_CHAR_BACKSLASH) {
863            if (Next() != LEX_CHAR_LOWERCASE_U) {
864                ThrowError("Invalid group name");
865            }
866
867            if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
868                ThrowError("Invalid Unicode escape");
869            }
870
871            cp = ParseUnicodeEscape();
872        }
873
874        if (!IsIdCont(cp)) {
875            ThrowError("Invalid group name");
876        }
877
878        ident.Append(cp);
879    }
880
881    return ident.View();
882}
883
884}  // namespace panda::es2panda::lexer
885