1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_BYTECODES_H_
6#define V8_REGEXP_REGEXP_BYTECODES_H_
7
8#include "src/base/bounds.h"
9#include "src/base/macros.h"
10#include "src/base/strings.h"
11#include "src/common/globals.h"
12
13namespace v8 {
14namespace internal {
15
16// Maximum number of bytecodes that will be used (next power of 2 of actually
17// defined bytecodes).
18// All slots between the last actually defined bytecode and maximum id will be
19// filled with BREAKs, indicating an invalid operation. This way using
20// BYTECODE_MASK guarantees no OOB access to the dispatch table.
21constexpr int kRegExpPaddedBytecodeCount = 1 << 6;
22constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
23// The first argument is packed in with the byte code in one word, but so it
24// has 24 bits, but it can be positive and negative so only use 23 bits for
25// positive values.
26const unsigned int MAX_FIRST_ARG = 0x7fffffu;
27const int BYTECODE_SHIFT = 8;
28STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
29
30// The list of bytecodes, in format: V(Name, Code, ByteLength).
31// TODO(pthier): Argument offsets of bytecodes should be easily accessible by
32// name or at least by position.
33// TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32).
34#define BYTECODE_ITERATOR(V)                                                   \
35  V(BREAK, 0, 4)              /* bc8                                        */ \
36  V(PUSH_CP, 1, 4)            /* bc8 pad24                                  */ \
37  V(PUSH_BT, 2, 8)            /* bc8 pad24 offset32                         */ \
38  V(PUSH_REGISTER, 3, 4)      /* bc8 reg_idx24                              */ \
39  V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32                     */ \
40  V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24                              */ \
41  V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24                              */ \
42  V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24                              */ \
43  V(SET_REGISTER, 8, 8)       /* bc8 reg_idx24 value32                      */ \
44  V(ADVANCE_REGISTER, 9, 8)   /* bc8 reg_idx24 value32                      */ \
45  V(POP_CP, 10, 4)            /* bc8 pad24                                  */ \
46  V(POP_BT, 11, 4)            /* bc8 pad24                                  */ \
47  V(POP_REGISTER, 12, 4)      /* bc8 reg_idx24                              */ \
48  V(FAIL, 13, 4)              /* bc8 pad24                                  */ \
49  V(SUCCEED, 14, 4)           /* bc8 pad24                                  */ \
50  V(ADVANCE_CP, 15, 4)        /* bc8 offset24                               */ \
51  /* Jump to another bytecode given its offset.                             */ \
52  /* Bit Layout:                                                            */ \
53  /* 0x00 - 0x07:   0x10 (fixed) Bytecode                                   */ \
54  /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
55  /* 0x20 - 0x3F:   Address of bytecode to jump to                          */ \
56  V(GOTO, 16, 8) /* bc8 pad24 addr32                           */              \
57  /* Check if offset is in range and load character at given offset.        */ \
58  /* Bit Layout:                                                            */ \
59  /* 0x00 - 0x07:   0x11 (fixed) Bytecode                                   */ \
60  /* 0x08 - 0x1F:   Offset from current position                            */ \
61  /* 0x20 - 0x3F:   Address of bytecode when load is out of range           */ \
62  V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32                        */ \
63  /* Load character at given offset without range checks.                   */ \
64  /* Bit Layout:                                                            */ \
65  /* 0x00 - 0x07:   0x12 (fixed) Bytecode                                   */ \
66  /* 0x08 - 0x1F:   Offset from current position                            */ \
67  V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4)    /* bc8 offset24 */                  \
68  V(LOAD_2_CURRENT_CHARS, 19, 8)           /* bc8 offset24 addr32 */           \
69  V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */                  \
70  V(LOAD_4_CURRENT_CHARS, 21, 8)           /* bc8 offset24 addr32 */           \
71  V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */                  \
72  V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32                    */    \
73  /* Check if current character is equal to a given character               */ \
74  /* Bit Layout:                                                            */ \
75  /* 0x00 - 0x07:   0x19 (fixed) Bytecode                                   */ \
76  /* 0x08 - 0x0F:   0x00 (unused) Padding                                   */ \
77  /* 0x10 - 0x1F:   Character to check                                      */ \
78  /* 0x20 - 0x3F:   Address of bytecode when matched                        */ \
79  V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32                     */        \
80  V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */                   \
81  V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32                     */    \
82  V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */            \
83  /* Checks if the current character combined with mask (bitwise and)       */ \
84  /* matches a character (e.g. used when two characters in a disjunction    */ \
85  /* differ by only a single bit                                            */ \
86  /* Bit Layout:                                                            */ \
87  /* 0x00 - 0x07:   0x1c (fixed) Bytecode                                   */ \
88  /* 0x08 - 0x0F:   0x00 (unused) Padding                                   */ \
89  /* 0x10 - 0x1F:   Character to match against (after mask aplied)          */ \
90  /* 0x20 - 0x3F:   Bitmask bitwise and combined with current character     */ \
91  /* 0x40 - 0x5F:   Address of bytecode when matched                        */ \
92  V(AND_CHECK_CHAR, 28, 12)        /* bc8 pad8 uint16 uint32 addr32      */    \
93  V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */        \
94  V(AND_CHECK_NOT_CHAR, 30, 12)    /* bc8 pad8 uint16 uint32 addr32 */         \
95  V(MINUS_AND_CHECK_NOT_CHAR, 31,                                              \
96    12) /* bc8 pad8 base::uc16 base::uc16 base::uc16 addr32 */                 \
97  V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */  \
98  V(CHECK_CHAR_NOT_IN_RANGE, 33,                                               \
99    12) /* bc8 pad24 base::uc16 base::uc16 addr32 */                           \
100  /* Checks if the current character matches any of the characters encoded  */ \
101  /* in a bit table. Similar to/inspired by boyer moore string search       */ \
102  /* Bit Layout:                                                            */ \
103  /* 0x00 - 0x07:   0x22 (fixed) Bytecode                                   */ \
104  /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
105  /* 0x20 - 0x3F:   Address of bytecode when bit is set                     */ \
106  /* 0x40 - 0xBF:   Bit table                                               */ \
107  V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128           */       \
108  V(CHECK_LT, 35, 8) /* bc8 pad8 base::uc16 addr32                       */    \
109  V(CHECK_GT, 36, 8) /* bc8 pad8 base::uc16 addr32                       */    \
110  V(CHECK_NOT_BACK_REF, 37, 8)         /* bc8 reg_idx24 addr32 */              \
111  V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */              \
112  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8)                                 \
113  V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8)         /* bc8 reg_idx24 addr32 */     \
114  V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */     \
115  V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8)                        \
116  V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */          \
117  V(CHECK_REGISTER_LT, 44, 12)    /* bc8 reg_idx24 value32 addr32 */           \
118  V(CHECK_REGISTER_GE, 45, 12)    /* bc8 reg_idx24 value32 addr32 */           \
119  V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */                   \
120  V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32                           */    \
121  V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */                       \
122  /* Checks if the current position matches top of backtrack stack          */ \
123  /* Bit Layout:                                                            */ \
124  /* 0x00 - 0x07:   0x31 (fixed) Bytecode                                   */ \
125  /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
126  /* 0x20 - 0x3F:   Address of bytecode when current matches tos            */ \
127  V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32                           */      \
128  /* Advance character pointer by given offset and jump to another bytecode.*/ \
129  /* Bit Layout:                                                            */ \
130  /* 0x00 - 0x07:   0x32 (fixed) Bytecode                                   */ \
131  /* 0x08 - 0x1F:   Number of characters to advance                         */ \
132  /* 0x20 - 0x3F:   Address of bytecode to jump to                          */ \
133  V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32                    */   \
134  V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */                      \
135  /* Checks if current position + given offset is in range.                 */ \
136  /* Bit Layout:                                                            */ \
137  /* 0x00 - 0x07:   0x34 (fixed) Bytecode                                   */ \
138  /* 0x08 - 0x1F:   Offset from current position                            */ \
139  /* 0x20 - 0x3F:   Address of bytecode when position is out of range       */ \
140  V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32                     */  \
141  /* Combination of:                                                        */ \
142  /* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO          */ \
143  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
144  /* Bit Layout:                                                            */ \
145  /* 0x00 - 0x07    0x35 (fixed) Bytecode                                   */ \
146  /* 0x08 - 0x1F    Load character offset from current position             */ \
147  /* 0x20 - 0x3F    Number of characters to advance                         */ \
148  /* 0x40 - 0xBF    Bit Table                                               */ \
149  /* 0xC0 - 0xDF    Address of bytecode when character is matched           */ \
150  /* 0xE0 - 0xFF    Address of bytecode when no match                       */ \
151  V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32)                                           \
152  /* Combination of:                                                        */ \
153  /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR    */ \
154  /* and ADVANCE_CP_AND_GOTO                                                */ \
155  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
156  /* Bit Layout:                                                            */ \
157  /* 0x00 - 0x07    0x36 (fixed) Bytecode                                   */ \
158  /* 0x08 - 0x1F    Load character offset from current position             */ \
159  /* 0x20 - 0x2F    Number of characters to advance                         */ \
160  /* 0x30 - 0x3F    Character to match against (after mask applied)         */ \
161  /* 0x40 - 0x5F:   Bitmask bitwise and combined with current character     */ \
162  /* 0x60 - 0x7F    Minimum number of characters this pattern consumes      */ \
163  /* 0x80 - 0x9F    Address of bytecode when character is matched           */ \
164  /* 0xA0 - 0xBF    Address of bytecode when no match                       */ \
165  V(SKIP_UNTIL_CHAR_AND, 54, 24)                                               \
166  /* Combination of:                                                        */ \
167  /* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO                  */ \
168  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
169  /* Bit Layout:                                                            */ \
170  /* 0x00 - 0x07    0x37 (fixed) Bytecode                                   */ \
171  /* 0x08 - 0x1F    Load character offset from current position             */ \
172  /* 0x20 - 0x2F    Number of characters to advance                         */ \
173  /* 0x30 - 0x3F    Character to match                                      */ \
174  /* 0x40 - 0x5F    Address of bytecode when character is matched           */ \
175  /* 0x60 - 0x7F    Address of bytecode when no match                       */ \
176  V(SKIP_UNTIL_CHAR, 55, 16)                                                   \
177  /* Combination of:                                                        */ \
178  /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR        */ \
179  /* and ADVANCE_CP_AND_GOTO                                                */ \
180  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
181  /* Bit Layout:                                                            */ \
182  /* 0x00 - 0x07    0x38 (fixed) Bytecode                                   */ \
183  /* 0x08 - 0x1F    Load character offset from current position             */ \
184  /* 0x20 - 0x2F    Number of characters to advance                         */ \
185  /* 0x30 - 0x3F    Character to match                                      */ \
186  /* 0x40 - 0x5F    Minimum number of characters this pattern consumes      */ \
187  /* 0x60 - 0x7F    Address of bytecode when character is matched           */ \
188  /* 0x80 - 0x9F    Address of bytecode when no match                       */ \
189  V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20)                                       \
190  /* Combination of:                                                        */ \
191  /* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO      */ \
192  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
193  /* Bit Layout:                                                            */ \
194  /* 0x00 - 0x07    0x39 (fixed) Bytecode                                   */ \
195  /* 0x08 - 0x1F    Load character offset from current position             */ \
196  /* 0x20 - 0x3F    Number of characters to advance                         */ \
197  /* 0x40 - 0x4F    Character to match                                      */ \
198  /* 0x50 - 0x5F    Other Character to match                                */ \
199  /* 0x60 - 0x7F    Address of bytecode when either character is matched    */ \
200  /* 0x80 - 0x9F    Address of bytecode when no match                       */ \
201  V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20)                                           \
202  /* Combination of:                                                        */ \
203  /* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO and              */ \
204  /* and ADVANCE_CP_AND_GOTO                                                */ \
205  /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
206  /* Bit Layout:                                                            */ \
207  /* 0x00 - 0x07    0x3A (fixed) Bytecode                                   */ \
208  /* 0x08 - 0x1F    Load character offset from current position             */ \
209  /* 0x20 - 0x2F    Number of characters to advance                         */ \
210  /* 0x30 - 0x3F    Character to check if it is less than current char      */ \
211  /* 0x40 - 0xBF    Bit Table                                               */ \
212  /* 0xC0 - 0xDF    Address of bytecode when character is matched           */ \
213  /* 0xE0 - 0xFF    Address of bytecode when no match                       */ \
214  V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32)
215
216#define COUNT(...) +1
217static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
218#undef COUNT
219
220// Just making sure we assigned values above properly. They should be
221// contiguous, strictly increasing, and start at 0.
222// TODO(jgruber): Do not explicitly assign values, instead generate them
223// implicitly from the list order.
224STATIC_ASSERT(kRegExpBytecodeCount == 59);
225
226#define DECLARE_BYTECODES(name, code, length) \
227  static constexpr int BC_##name = code;
228BYTECODE_ITERATOR(DECLARE_BYTECODES)
229#undef DECLARE_BYTECODES
230
231static constexpr int kRegExpBytecodeLengths[] = {
232#define DECLARE_BYTECODE_LENGTH(name, code, length) length,
233    BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH)
234#undef DECLARE_BYTECODE_LENGTH
235};
236
237inline constexpr int RegExpBytecodeLength(int bytecode) {
238  DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
239  return kRegExpBytecodeLengths[bytecode];
240}
241
242static constexpr const char* const kRegExpBytecodeNames[] = {
243#define DECLARE_BYTECODE_NAME(name, ...) #name,
244    BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME)
245#undef DECLARE_BYTECODE_NAME
246};
247
248inline constexpr const char* RegExpBytecodeName(int bytecode) {
249  DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
250  return kRegExpBytecodeNames[bytecode];
251}
252
253void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc);
254void RegExpBytecodeDisassemble(const byte* code_base, int length,
255                               const char* pattern);
256
257}  // namespace internal
258}  // namespace v8
259
260#endif  // V8_REGEXP_REGEXP_BYTECODES_H_
261