1/** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16#include "regexp.h" 17 18#include "lexer/token/letters.h" 19#include "unicode/uchar.h" 20 21#include <iostream> 22 23namespace ark::es2panda::lexer { 24RegExp::RegExp(util::StringView p, util::StringView f, RegExpFlags reFlags) : patternStr(p), flagsStr(f), flags(reFlags) 25{ 26} 27 28RegExpParser::RegExpParser(const RegExp &re, ArenaAllocator *allocator, const parser::ParserImpl &parser) 29 : re_(re), allocator_ {allocator}, iter_(re_.patternStr), parser_(parser) 30{ 31} 32 33bool RegExpParser::Unicode() const 34{ 35 return (re_.flags & RegExpFlags::UNICODE) != 0; 36} 37 38char32_t RegExpParser::Peek() const 39{ 40 return iter_.Peek(); 41} 42 43char32_t RegExpParser::Next() 44{ 45 return iter_.Next(); 46} 47 48static bool IsDecimalDigit(char32_t cp) 49{ 50 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9); 51} 52 53static bool IsOctalDigit(char32_t cp) 54{ 55 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_7); 56} 57 58static bool IsHexDigit(char32_t cp) 59{ 60 return IsDecimalDigit(cp) || (cp >= LEX_CHAR_LOWERCASE_A && cp <= LEX_CHAR_LOWERCASE_F) || 61 (cp >= LEX_CHAR_UPPERCASE_A && cp <= LEX_CHAR_UPPERCASE_F); 62} 63 64static uint32_t DigitValue(char32_t cp) 65{ 66 return (cp - LEX_CHAR_0); 67} 68 69static uint32_t HexValue(char32_t cp) 70{ 71 if (IsDecimalDigit(cp)) { 72 return DigitValue(cp); 73 } 74 75 constexpr auto OFFSET = 10; 76 77 if (cp < LEX_CHAR_LOWERCASE_A) { 78 return cp - LEX_CHAR_UPPERCASE_A + OFFSET; 79 } 80 81 return (cp - LEX_CHAR_LOWERCASE_A + OFFSET); 82} 83 84void RegExpParser::ParsePattern() 85{ 86 ParseDisjunction(); 87 88 if (iter_.HasNext()) { 89 parser_.ThrowSyntaxError("Invalid closing parenthesis"); 90 } 91 92 if (!backReferences_.empty() && !groupNames_.empty()) { 93 for (const auto it : backReferences_) { 94 auto result = groupNames_.find(it); 95 if (result == groupNames_.end()) { 96 parser_.ThrowSyntaxError("Invalid capturing group"); 97 } 98 } 99 } 100} 101 102void RegExpParser::ParseDisjunction() 103{ 104 while (true) { 105 ParseAlternatives(); 106 107 if (Peek() != LEX_CHAR_VLINE) { 108 break; 109 } 110 111 Next(); 112 }; 113} 114 115void RegExpParser::ParseAlternative() 116{ 117 switch (Peek()) { 118 case LEX_CHAR_BACKSLASH: { 119 Next(); 120 char32_t cp = Peek(); 121 if (cp == LEX_CHAR_LOWERCASE_B || cp == LEX_CHAR_UPPERCASE_B) { 122 /* assertion */ 123 Next(); 124 return; 125 } 126 127 ParseAtomEscape(); 128 break; 129 } 130 case LEX_CHAR_CIRCUMFLEX: 131 case LEX_CHAR_DOLLAR_SIGN: { 132 /* assertion */ 133 Next(); 134 return; 135 } 136 case LEX_CHAR_LEFT_PAREN: { 137 if (ParseAlternativeCharLeftParen()) { 138 return; 139 } 140 break; 141 } 142 case LEX_CHAR_LEFT_SQUARE: { 143 Next(); 144 ParseCharacterClass(); 145 break; 146 } 147 case LEX_CHAR_DOT: { 148 Next(); 149 break; 150 } 151 default: { 152 if (ParseBracedQuantifier()) { 153 parser_.ThrowSyntaxError("Invalid quantifier, nothing to repeat"); 154 } 155 156 if (!ParsePatternCharacter()) { 157 parser_.ThrowSyntaxError("Invalid character"); 158 } 159 160 break; 161 } 162 } 163 164 ParseQuantifier(); 165} 166 167bool RegExpParser::ParseAlternativeCharLeftParen() 168{ 169 Next(); 170 171 if (Peek() != LEX_CHAR_QUESTION) { 172 ParseCapturingGroup(); 173 return false; 174 } 175 176 Next(); // eat '?' 177 178 char32_t cp = Next(); 179 if (cp == LEX_CHAR_COLON) { 180 ParseNonCapturingGroup(); 181 return false; 182 } 183 184 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) { 185 ParseAssertion(); 186 187 return Unicode(); 188 } 189 190 if (cp != LEX_CHAR_LESS_THAN) { 191 parser_.ThrowSyntaxError("Invalid group"); 192 } 193 194 cp = Peek(); 195 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) { 196 Next(); 197 ParseAssertion(); 198 return true; 199 } 200 201 ParseNamedCapturingGroup(); 202 return false; 203} 204 205void RegExpParser::ParseAlternatives() 206{ 207 while (true) { 208 switch (Peek()) { 209 case util::StringView::Iterator::INVALID_CP: 210 case LEX_CHAR_RIGHT_PAREN: 211 case LEX_CHAR_VLINE: { 212 return; 213 } 214 default: { 215 ParseAlternative(); 216 } 217 } 218 } 219} 220 221void RegExpParser::ParseNonCapturingGroup() 222{ 223 ParseDisjunction(); 224 225 if (Peek() != LEX_CHAR_RIGHT_PAREN) { 226 parser_.ThrowSyntaxError("Invalid non-capturing group"); 227 } 228 229 Next(); 230} 231 232void RegExpParser::ParseNamedCapturingGroup() 233{ 234 util::StringView name = ParseIdent(); 235 236 auto result = groupNames_.insert(name); 237 if (!result.second) { 238 parser_.ThrowSyntaxError("Duplicate group name"); 239 } 240 241 ParseCapturingGroup(); 242} 243 244void RegExpParser::ParseCapturingGroup() 245{ 246 capturingGroupCount_++; 247 248 ParseDisjunction(); 249 250 if (Peek() != LEX_CHAR_RIGHT_PAREN) { 251 parser_.ThrowSyntaxError("Invalid capturing group"); 252 } 253 254 Next(); 255} 256 257void RegExpParser::ParseAssertion() 258{ 259 ParseDisjunction(); 260 261 if (Peek() != LEX_CHAR_RIGHT_PAREN) { 262 parser_.ThrowSyntaxError("Invalid assertion"); 263 } 264 265 Next(); 266} 267 268uint32_t RegExpParser::ParseControlEscape() 269{ 270 char32_t cp = Peek(); 271 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) && 272 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) { 273 if (Unicode()) { 274 parser_.ThrowSyntaxError("Invalid control escape"); 275 } 276 277 if (cp < LEX_CHAR_0 || cp > LEX_CHAR_9) { 278 return LEX_CHAR_LOWERCASE_C; 279 } 280 } 281 282 Next(); 283 constexpr auto MODULO = 32; 284 return cp % MODULO; 285} 286 287char32_t RegExpParser::ParseClassAtom() 288{ 289 char32_t cp = Next(); 290 if (cp != LEX_CHAR_BACKSLASH) { 291 return cp; 292 } 293 294 cp = Peek(); 295 if (cp == LEX_CHAR_0) { 296 if (!Unicode()) { 297 return ParseDecimalEscape(); 298 } 299 300 Next(); 301 302 if (IsDecimalDigit(Peek())) { 303 parser_.ThrowSyntaxError("Invalid class escape"); 304 } 305 306 return LEX_CHAR_NULL; 307 } 308 309 Next(); 310 311 switch (cp) { 312 case LEX_CHAR_LOWERCASE_C: { 313 return ParseControlEscape(); 314 } 315 case LEX_CHAR_LOWERCASE_X: { 316 return ParseHexEscape(); 317 } 318 case LEX_CHAR_LOWERCASE_U: { 319 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) { 320 return cp; 321 } 322 323 return ParseUnicodeEscape(); 324 } 325 case LEX_CHAR_LOWERCASE_P: 326 case LEX_CHAR_UPPERCASE_P: { 327 if (!Unicode()) { 328 return cp; 329 } 330 331 ParseUnicodePropertyEscape(); 332 [[fallthrough]]; 333 } 334 case LEX_CHAR_LOWERCASE_D: 335 case LEX_CHAR_UPPERCASE_D: 336 case LEX_CHAR_LOWERCASE_S: 337 case LEX_CHAR_UPPERCASE_S: 338 case LEX_CHAR_LOWERCASE_W: 339 case LEX_CHAR_UPPERCASE_W: { 340 return std::numeric_limits<uint32_t>::max(); 341 } 342 case LEX_CHAR_LOWERCASE_B: { 343 return LEX_CHAR_BS; 344 } 345 case LEX_CHAR_LOWERCASE_F: { 346 return LEX_CHAR_FF; 347 } 348 case LEX_CHAR_LOWERCASE_N: { 349 return LEX_CHAR_LF; 350 } 351 case LEX_CHAR_LOWERCASE_R: { 352 return LEX_CHAR_CR; 353 } 354 case LEX_CHAR_LOWERCASE_T: { 355 return LEX_CHAR_TAB; 356 } 357 case LEX_CHAR_LOWERCASE_V: { 358 return LEX_CHAR_VT; 359 } 360 case LEX_CHAR_MINUS: { 361 return cp; 362 } 363 default: { 364 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) { 365 parser_.ThrowSyntaxError("Invalid escape"); 366 } 367 368 return cp; 369 } 370 } 371 372 return cp; 373} 374 375static bool IsClassEscape(uint32_t cp) 376{ 377 return cp == std::numeric_limits<uint32_t>::max(); 378} 379 380void RegExpParser::ParseCharacterClass() 381{ 382 if (Peek() == LEX_CHAR_CIRCUMFLEX) { 383 Next(); 384 } 385 386 while (true) { 387 if (Peek() == LEX_CHAR_RIGHT_SQUARE) { 388 Next(); 389 break; 390 } 391 392 uint32_t left = ParseClassAtom(); 393 394 if (Peek() != LEX_CHAR_MINUS) { 395 continue; 396 } 397 398 Next(); 399 400 if (Peek() == LEX_CHAR_RIGHT_SQUARE) { 401 Next(); 402 break; 403 } 404 405 uint32_t right = ParseClassAtom(); 406 if ((IsClassEscape(left) || IsClassEscape(right))) { 407 if (Unicode()) { 408 parser_.ThrowSyntaxError("Invalid character class"); 409 } 410 411 continue; 412 } 413 414 if (left > right) { 415 parser_.ThrowSyntaxError("Class range out of order"); 416 } 417 } 418} 419 420bool RegExpParser::IsSyntaxCharacter(char32_t cp) const 421{ 422 switch (cp) { 423 case LEX_CHAR_RIGHT_SQUARE: 424 case LEX_CHAR_LEFT_BRACE: 425 case LEX_CHAR_RIGHT_BRACE: { 426 if (!Unicode()) { 427 return false; 428 } 429 430 [[fallthrough]]; 431 } 432 case LEX_CHAR_CIRCUMFLEX: 433 case LEX_CHAR_DOLLAR_SIGN: 434 case LEX_CHAR_BACKSLASH: 435 case LEX_CHAR_DOT: 436 case LEX_CHAR_ASTERISK: 437 case LEX_CHAR_PLUS: 438 case LEX_CHAR_QUESTION: 439 case LEX_CHAR_LEFT_PAREN: 440 case LEX_CHAR_RIGHT_PAREN: 441 case LEX_CHAR_LEFT_SQUARE: 442 case LEX_CHAR_VLINE: { 443 return true; 444 } 445 default: { 446 return false; 447 } 448 } 449} 450 451void RegExpParser::ParseAtomEscape() 452{ 453 char32_t cp = Peek(); 454 if (IsDecimalDigit(cp)) { 455 ParseDecimalEscape(); 456 return; 457 } 458 459 Next(); 460 461 ParseAtomEscapeSwitch(cp); 462} 463 464void RegExpParser::ParseAtomEscapeSwitch(char32_t cp) 465{ 466 switch (cp) { 467 case LEX_CHAR_LOWERCASE_X: { 468 ParseHexEscape(); 469 break; 470 } 471 case LEX_CHAR_LOWERCASE_U: { 472 ParseUnicodeEscape(); 473 break; 474 } 475 case LEX_CHAR_LOWERCASE_K: { 476 ParseNamedBackreference(); 477 return; 478 } 479 /* ControlEscape */ 480 case LEX_CHAR_LOWERCASE_F: 481 case LEX_CHAR_LOWERCASE_N: 482 case LEX_CHAR_LOWERCASE_R: 483 case LEX_CHAR_LOWERCASE_T: 484 case LEX_CHAR_LOWERCASE_V: 485 /* CharacterClassEscape */ 486 case LEX_CHAR_LOWERCASE_D: 487 case LEX_CHAR_UPPERCASE_D: 488 case LEX_CHAR_LOWERCASE_S: 489 case LEX_CHAR_UPPERCASE_S: 490 case LEX_CHAR_LOWERCASE_W: 491 case LEX_CHAR_UPPERCASE_W: { 492 return; 493 } 494 case LEX_CHAR_LOWERCASE_P: 495 case LEX_CHAR_UPPERCASE_P: { 496 ParseUnicodePropertyEscape(); 497 return; 498 } 499 case LEX_CHAR_LOWERCASE_C: { 500 cp = Peek(); 501 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) && 502 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) { 503 parser_.ThrowSyntaxError("Invalid control escape"); 504 } 505 506 Next(); 507 return; 508 } 509 default: { 510 /* IdentityEscape */ 511 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) { 512 parser_.ThrowSyntaxError("Invalid escape"); 513 } 514 } 515 } 516} 517 518uint32_t RegExpParser::ParseDecimalEscape() 519{ 520 ASSERT(IsDecimalDigit(Peek())); 521 522 auto digitStart = iter_; 523 uint32_t decimalValue = DigitValue(Next()); 524 if (decimalValue == 0) { 525 if (!IsDecimalDigit(Peek())) { 526 /* \0 */ 527 return decimalValue; 528 } 529 530 if (Unicode()) { 531 parser_.ThrowSyntaxError("Invalid decimal escape"); 532 } 533 534 iter_ = digitStart; 535 return ParseLegacyOctalEscape(); 536 } 537 538 constexpr auto MULTIPLIER = 10; 539 540 while (IsDecimalDigit(Peek())) { 541 uint32_t newValue = decimalValue * MULTIPLIER + DigitValue(Next()); 542 if (newValue < decimalValue) { 543 parser_.ThrowSyntaxError("Invalid decimal escape"); 544 } 545 546 decimalValue = newValue; 547 } 548 549 if (decimalValue <= capturingGroupCount_) { 550 return decimalValue; 551 } 552 553 if (Unicode()) { 554 parser_.ThrowSyntaxError("Invalid decimal escape"); 555 } 556 557 iter_ = digitStart; 558 559 if (!IsOctalDigit(Peek())) { 560 /* \8 or \9 */ 561 return DigitValue(Next()); 562 } 563 564 return ParseLegacyOctalEscape(); 565} 566 567uint32_t RegExpParser::ParseLegacyOctalEscape() 568{ 569 ASSERT(IsOctalDigit(Peek())); 570 uint32_t octalValue = DigitValue(Next()); 571 572 if (!IsOctalDigit(Peek())) { 573 return octalValue; 574 } 575 576 octalValue = octalValue * 8U + DigitValue(Next()); 577 578 if (!IsOctalDigit(Peek())) { 579 return octalValue; 580 } 581 582 uint32_t newValue = octalValue * 8 + DigitValue(Peek()); 583 constexpr uint32_t MAX_OCTAL_VALUE = 0xFF; 584 585 if (newValue <= MAX_OCTAL_VALUE) { 586 octalValue = newValue; 587 Next(); 588 } 589 590 return octalValue; 591} 592 593uint32_t RegExpParser::ParseHexEscape() 594{ 595 char32_t digit = Next(); 596 if (!IsHexDigit(digit)) { 597 parser_.ThrowSyntaxError("Invalid hex escape"); 598 } 599 600 constexpr auto MULTIPLIER = 16; 601 uint32_t cpValue = HexValue(digit) * MULTIPLIER; 602 603 digit = Next(); 604 if (!IsHexDigit(digit)) { 605 parser_.ThrowSyntaxError("Invalid hex escape"); 606 } 607 608 cpValue += HexValue(digit); 609 return cpValue; 610} 611 612uint32_t RegExpParser::ParseUnicodeDigits() 613{ 614 uint32_t value = 0; 615 uint32_t count = 4; 616 617 while ((count--) != 0U) { 618 char32_t digit = Next(); 619 if (!IsHexDigit(digit)) { 620 parser_.ThrowSyntaxError("Invalid Unicode escape"); 621 } 622 623 constexpr auto MULTIPLIER = 16; 624 value = value * MULTIPLIER + HexValue(digit); 625 } 626 627 return value; 628} 629 630uint32_t RegExpParser::ParseUnicodeEscape() 631{ 632 uint32_t value = 0; 633 634 if (Peek() == LEX_CHAR_LEFT_BRACE) { 635 Next(); 636 if (!IsHexDigit(Peek())) { 637 parser_.ThrowSyntaxError("Invalid Unicode escape"); 638 } 639 640 while (IsHexDigit(Peek())) { 641 constexpr auto MULTIPLIER = 16; 642 value = value * MULTIPLIER + HexValue(Next()); 643 constexpr uint32_t CODE_POINT_MAX = 0x10FFFF; 644 645 if (value > CODE_POINT_MAX) { 646 parser_.ThrowSyntaxError("Invalid Unicode escape"); 647 } 648 } 649 650 if (Peek() != LEX_CHAR_RIGHT_BRACE) { 651 parser_.ThrowSyntaxError("Invalid Unicode escape"); 652 } 653 654 Next(); 655 } else { 656 value = ParseUnicodeDigits(); 657 if (!util::StringView::IsHighSurrogate(value)) { 658 return value; 659 } 660 661 auto pos = iter_; 662 if (Next() == LEX_CHAR_BACKSLASH && Next() == LEX_CHAR_LOWERCASE_U) { 663 uint32_t next = ParseUnicodeDigits(); 664 if (util::StringView::IsLowSurrogate(next)) { 665 return util::StringView::DecodeSurrogates(value, next); 666 } 667 } 668 iter_ = pos; 669 } 670 671 return value; 672} 673 674void RegExpParser::ParseUnicodePropertyEscape() 675{ 676 if (!Unicode()) { 677 return; 678 } 679 680 if (Peek() != LEX_CHAR_LEFT_BRACE) { 681 parser_.ThrowSyntaxError("Invalid Unicode property escape"); 682 } 683 684 Next(); 685 686 while (true) { 687 if (!iter_.HasNext()) { 688 parser_.ThrowSyntaxError("Unterminated Unicode property escape"); 689 } 690 691 char32_t ch = Next(); 692 if (ch == LEX_CHAR_LEFT_BRACE) { 693 break; 694 } 695 696 /* NOTE: Parse and validate Unicode property names */ 697 } 698} 699 700void RegExpParser::ParseNamedBackreference() 701{ 702 if (Next() != LEX_CHAR_LESS_THAN) { 703 if (!Unicode() && groupNames_.empty()) { 704 return; 705 } 706 707 parser_.ThrowSyntaxError("Invalid named backreference"); 708 } 709 710 if (IsDecimalDigit(Peek())) { 711 return; 712 } 713 714 util::StringView name = ParseIdent(); 715 backReferences_.insert(name); 716 717 ValidateNamedBackreference(Unicode()); 718} 719 720void RegExpParser::ValidateNamedBackreference(bool isUnicode) 721{ 722 if (Peek() != LEX_CHAR_LEFT_PAREN || Peek() != LEX_CHAR_BACKSLASH || Peek() != UNICODE_INVALID_CP) { 723 if (!isUnicode) { 724 /* Identity escape */ 725 return; 726 } 727 728 if (groupNames_.empty()) { 729 parser_.ThrowSyntaxError("Invalid named backreference"); 730 } 731 } 732} 733 734void RegExpParser::ValidateGroupNameElement(char32_t cp) 735{ 736 if (IsDecimalDigit(cp) && !backReferences_.empty()) { 737 parser_.ThrowSyntaxError("Invalid group name"); 738 } 739 if (cp == UNICODE_INVALID_CP && !groupNames_.empty()) { 740 parser_.ThrowSyntaxError("Invalid group name"); 741 } 742} 743 744void RegExpParser::ParseQuantifier() 745{ 746 switch (Peek()) { 747 case LEX_CHAR_ASTERISK: 748 case LEX_CHAR_PLUS: 749 case LEX_CHAR_QUESTION: { 750 Next(); 751 break; 752 } 753 case LEX_CHAR_LEFT_BRACE: { 754 if (!ParseBracedQuantifier()) { 755 return; 756 } 757 758 break; 759 } 760 default: { 761 return; 762 } 763 } 764 765 if (Peek() == LEX_CHAR_QUESTION) { 766 Next(); 767 } 768} 769 770bool RegExpParser::ParseBracedQuantifier() 771{ 772 if (Peek() != LEX_CHAR_LEFT_BRACE) { 773 return false; 774 } 775 776 auto startPos = iter_; 777 Next(); 778 779 if (!IsDecimalDigit(Peek())) { 780 iter_ = startPos; 781 return false; 782 } 783 784 uint32_t leftValue = 0; 785 constexpr auto MULTIPLIER = 10; 786 787 while (IsDecimalDigit(Peek())) { 788 uint32_t newValue = leftValue * MULTIPLIER + DigitValue(Next()); 789 if (newValue < leftValue) { 790 leftValue = std::numeric_limits<uint32_t>::max(); 791 continue; 792 } 793 794 leftValue = newValue; 795 } 796 797 if (Peek() == LEX_CHAR_COMMA) { 798 Next(); 799 } 800 801 if (Peek() == LEX_CHAR_RIGHT_BRACE) { 802 Next(); 803 return true; 804 } 805 806 if (IsDecimalDigit(Peek())) { 807 uint32_t rightValue = 0; 808 while (IsDecimalDigit(Peek())) { 809 uint32_t newValue = rightValue * MULTIPLIER + DigitValue(Next()); 810 if (newValue < rightValue) { 811 rightValue = std::numeric_limits<uint32_t>::max(); 812 continue; 813 } 814 815 rightValue = newValue; 816 } 817 818 if (Peek() == LEX_CHAR_RIGHT_BRACE) { 819 if (rightValue < leftValue) { 820 parser_.ThrowSyntaxError("Quantifier range out of order"); 821 } 822 823 Next(); 824 return true; 825 } 826 } 827 828 iter_ = startPos; 829 return false; 830} 831 832bool RegExpParser::ParsePatternCharacter() 833{ 834 char32_t cp = Peek(); 835 if (IsSyntaxCharacter(cp)) { 836 return false; 837 } 838 839 Next(); 840 return true; 841} 842 843static bool IsIdStart(uint32_t cp) 844{ 845 auto uchar = static_cast<UChar>(cp); 846 return u_isIDStart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE || 847 uchar == LEX_CHAR_BACKSLASH; 848} 849 850static bool IsIdCont(uint32_t cp) 851{ 852 auto uchar = static_cast<UChar>(cp); 853 return u_isIDPart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE || 854 uchar == LEX_CHAR_BACKSLASH || uchar == LEX_CHAR_ZWNJ || uchar == LEX_CHAR_ZWJ; 855} 856 857util::StringView RegExpParser::ParseIdent() 858{ 859 char32_t cp = Next(); 860 if (cp == LEX_CHAR_BACKSLASH) { 861 if (Next() != LEX_CHAR_LOWERCASE_U) { 862 parser_.ThrowSyntaxError("Invalid group name"); 863 } 864 865 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) { 866 parser_.ThrowSyntaxError("Invalid Unicode escape"); 867 } 868 869 cp = ParseUnicodeEscape(); 870 } 871 872 if (!IsIdStart(cp) && cp != UNICODE_INVALID_CP && backReferences_.empty()) { 873 parser_.ThrowSyntaxError("Invalid group name"); 874 } 875 876 util::UString ident(allocator_); 877 ident.Append(cp); 878 879 while (true) { 880 cp = Next(); 881 if (cp == LEX_CHAR_GREATER_THAN) { 882 break; 883 } 884 885 if (cp == LEX_CHAR_BACKSLASH) { 886 if (Next() != LEX_CHAR_LOWERCASE_U) { 887 parser_.ThrowSyntaxError("Invalid group name"); 888 } 889 890 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) { 891 parser_.ThrowSyntaxError("Invalid Unicode escape"); 892 } 893 894 cp = ParseUnicodeEscape(); 895 } 896 897 ValidateGroupNameElement(cp); 898 899 if (cp == UNICODE_INVALID_CP) { 900 break; 901 } 902 903 if (!IsIdCont(cp)) { 904 parser_.ThrowSyntaxError("Invalid group name"); 905 } 906 907 ident.Append(cp); 908 } 909 910 return ident.View(); 911} 912} // namespace ark::es2panda::lexer 913