Commit | Line | Data |
a0d0e21e |
1 | /* regcomp.h |
a687059c |
2 | */ |
3 | |
c277df42 |
4 | typedef OP OP_4tree; /* Will be redefined later. */ |
5 | |
a687059c |
6 | /* |
7 | * The "internal use only" fields in regexp.h are present to pass info from |
8 | * compile to execute that permits the execute phase to run lots faster on |
9 | * simple cases. They are: |
10 | * |
79072805 |
11 | * regstart sv that must begin a match; Nullch if none obvious |
a687059c |
12 | * reganch is the match anchored (at beginning-of-line only)? |
13 | * regmust string (pointer into program) that match must include, or NULL |
79072805 |
14 | * [regmust changed to SV* for bminstr()--law] |
a687059c |
15 | * regmlen length of regmust string |
16 | * [regmlen not used currently] |
17 | * |
18 | * Regstart and reganch permit very fast decisions on suitable starting points |
19 | * for a match, cutting down the work a lot. Regmust permits fast rejection |
20 | * of lines that cannot possibly match. The regmust tests are costly enough |
e50aee73 |
21 | * that pregcomp() supplies a regmust only if the r.e. contains something |
a687059c |
22 | * potentially expensive (at present, the only such thing detected is * or + |
23 | * at the start of the r.e., which can involve a lot of backup). Regmlen is |
e50aee73 |
24 | * supplied because the test in pregexec() needs it and pregcomp() is computing |
a687059c |
25 | * it anyway. |
26 | * [regmust is now supplied always. The tests that use regmust have a |
27 | * heuristic that disables the test if it usually matches.] |
28 | * |
29 | * [In fact, we now use regmust in many cases to locate where the search |
30 | * starts in the string, so if regback is >= 0, the regmust search is never |
31 | * wasted effort. The regback variable says how many characters back from |
32 | * where regmust matched is the earliest possible start of the match. |
33 | * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.] |
34 | */ |
35 | |
c277df42 |
36 | /* #ifndef gould */ |
37 | /* #ifndef cray */ |
38 | /* #ifndef eta10 */ |
39 | #define REGALIGN |
40 | /* #endif */ |
41 | /* #endif */ |
42 | /* #endif */ |
43 | |
44 | #ifdef REGALIGN |
45 | # define REGALIGN_STRUCT |
46 | #endif |
47 | |
a687059c |
48 | /* |
49 | * Structure for regexp "program". This is essentially a linear encoding |
50 | * of a nondeterministic finite-state machine (aka syntax charts or |
51 | * "railroad normal form" in parsing technology). Each node is an opcode |
52 | * plus a "next" pointer, possibly plus an operand. "Next" pointers of |
53 | * all nodes except BRANCH implement concatenation; a "next" pointer with |
54 | * a BRANCH on both ends of it is connecting two alternatives. (Here we |
55 | * have one of the subtle syntax dependencies: an individual BRANCH (as |
56 | * opposed to a collection of them) is never concatenated with anything |
57 | * because of operator precedence.) The operand of some types of node is |
58 | * a literal string; for others, it is a node leading into a sub-FSM. In |
59 | * particular, the operand of a BRANCH node is the first node of the branch. |
60 | * (NB this is *not* a tree structure: the tail of the branch connects |
61 | * to the thing following the set of BRANCHes.) The opcodes are: |
62 | */ |
63 | |
64 | /* definition number opnd? meaning */ |
bbce6d69 |
65 | #define END 0 /* no End of program. */ |
66 | #define BOL 1 /* no Match "" at beginning of line. */ |
67 | #define MBOL 2 /* no Same, assuming multiline. */ |
68 | #define SBOL 3 /* no Same, assuming singleline. */ |
69 | #define EOL 4 /* no Match "" at end of line. */ |
70 | #define MEOL 5 /* no Same, assuming multiline. */ |
71 | #define SEOL 6 /* no Same, assuming singleline. */ |
72 | #define ANY 7 /* no Match any one character (except newline). */ |
73 | #define SANY 8 /* no Match any one character. */ |
74 | #define ANYOF 9 /* sv Match character in (or not in) this class. */ |
a0d0e21e |
75 | #define CURLY 10 /* sv Match this simple thing {n,m} times. */ |
76 | #define CURLYX 11 /* sv Match this complex thing {n,m} times. */ |
77 | #define BRANCH 12 /* node Match this alternative, or the next... */ |
78 | #define BACK 13 /* no Match "", "next" ptr points backward. */ |
bbce6d69 |
79 | #define EXACT 14 /* sv Match this string (preceded by length). */ |
80 | #define EXACTF 15 /* sv Match this string, folded (prec. by length). */ |
81 | #define EXACTFL 16 /* sv Match this string, folded in locale (w/len). */ |
82 | #define NOTHING 17 /* no Match empty string. */ |
83 | #define STAR 18 /* node Match this (simple) thing 0 or more times. */ |
84 | #define PLUS 19 /* node Match this (simple) thing 1 or more times. */ |
a0d0e21e |
85 | #define BOUND 20 /* no Match "" at any word boundary */ |
bbce6d69 |
86 | #define BOUNDL 21 /* no Match "" at any word boundary */ |
87 | #define NBOUND 22 /* no Match "" at any word non-boundary */ |
88 | #define NBOUNDL 23 /* no Match "" at any word non-boundary */ |
c277df42 |
89 | #define REF 24 /* num Match some already matched string */ |
90 | #define OPEN 25 /* num Mark this point in input as start of #n. */ |
91 | #define CLOSE 26 /* num Analogous to OPEN. */ |
92 | #define MINMOD 27 /* no Next operator is not greedy. */ |
93 | #define GPOS 28 /* no Matches where last m//g left off. */ |
94 | #define IFMATCH 29 /* off Succeeds if the following matches. */ |
95 | #define UNLESSM 30 /* off Fails if the following matches. */ |
96 | #define SUCCEED 31 /* no Return from a subroutine, basically. */ |
97 | #define WHILEM 32 /* no Do curly processing and see if rest matches. */ |
98 | #define ALNUM 33 /* no Match any alphanumeric character */ |
99 | #define ALNUML 34 /* no Match any alphanumeric char in locale */ |
100 | #define NALNUM 35 /* no Match any non-alphanumeric character */ |
101 | #define NALNUML 36 /* no Match any non-alphanumeric char in locale */ |
102 | #define SPACE 37 /* no Match any whitespace character */ |
103 | #define SPACEL 38 /* no Match any whitespace char in locale */ |
104 | #define NSPACE 39 /* no Match any non-whitespace character */ |
105 | #define NSPACEL 40 /* no Match any non-whitespace char in locale */ |
106 | #define DIGIT 41 /* no Match any numeric character */ |
107 | #define NDIGIT 42 /* no Match any non-numeric character */ |
108 | #define CURLYM 43 /* no Match this medium-complex thing {n,m} times. */ |
109 | #define CURLYN 44 /* no Match next-after-this simple thing |
110 | {n,m} times, set parenths. */ |
111 | #define TAIL 45 /* no Match empty string. Can jump here from outside. */ |
112 | #define REFF 46 /* num Match already matched string, folded */ |
113 | #define REFFL 47 /* num Match already matched string, folded in loc. */ |
114 | #define EVAL 48 /* evl Execute some Perl code. */ |
115 | #define LONGJMP 49 /* off Jump far away, requires REGALIGN_STRUCT. */ |
116 | #define BRANCHJ 50 /* off BRANCH with long offset, requires REGALIGN_STRUCT. */ |
117 | #define IFTHEN 51 /* off Switch, should be preceeded by switcher . */ |
118 | #define GROUPP 52 /* num Whether the group matched. */ |
119 | #define LOGICAL 53 /* no Next opcode should set the flag only. */ |
120 | #define SUSPEND 54 /* off "Independent" sub-RE. */ |
121 | #define RENUM 55 /* off Group with independently numbered parens. */ |
122 | #define OPTIMIZED 56 /* off Placeholder for dump. */ |
a687059c |
123 | |
124 | /* |
125 | * Opcode notes: |
126 | * |
127 | * BRANCH The set of branches constituting a single choice are hooked |
128 | * together with their "next" pointers, since precedence prevents |
129 | * anything being concatenated to any individual branch. The |
130 | * "next" pointer of the last BRANCH in a choice points to the |
131 | * thing following the whole choice. This is also where the |
132 | * final "next" pointer of each individual branch points; each |
133 | * branch starts with the operand node of a BRANCH node. |
134 | * |
135 | * BACK Normal "next" pointers all implicitly point forward; BACK |
136 | * exists to make loop structures possible. |
137 | * |
138 | * STAR,PLUS '?', and complex '*' and '+', are implemented as circular |
139 | * BRANCH structures using BACK. Simple cases (one character |
140 | * per match) are implemented with STAR and PLUS for speed |
141 | * and to minimize recursive plunges. |
142 | * |
c277df42 |
143 | * OPEN,CLOSE,GROUPP ...are numbered at compile time. |
a687059c |
144 | */ |
145 | |
fe14fcc3 |
146 | #ifndef DOINIT |
161b471a |
147 | EXTCONST U8 regkind[]; |
a0d0e21e |
148 | #else |
161b471a |
149 | EXTCONST U8 regkind[] = { |
a0d0e21e |
150 | END, |
151 | BOL, |
152 | BOL, |
153 | BOL, |
154 | EOL, |
155 | EOL, |
156 | EOL, |
157 | ANY, |
158 | ANY, |
159 | ANYOF, |
160 | CURLY, |
161 | CURLY, |
162 | BRANCH, |
163 | BACK, |
bbce6d69 |
164 | EXACT, |
165 | EXACT, |
166 | EXACT, |
a0d0e21e |
167 | NOTHING, |
168 | STAR, |
169 | PLUS, |
bbce6d69 |
170 | BOUND, |
a0d0e21e |
171 | BOUND, |
172 | NBOUND, |
bbce6d69 |
173 | NBOUND, |
a0d0e21e |
174 | REF, |
175 | OPEN, |
176 | CLOSE, |
177 | MINMOD, |
774d564b |
178 | GPOS, |
c277df42 |
179 | BRANCHJ, |
180 | BRANCHJ, |
a0d0e21e |
181 | END, |
bbce6d69 |
182 | WHILEM, |
183 | ALNUM, |
184 | ALNUM, |
185 | NALNUM, |
186 | NALNUM, |
187 | SPACE, |
188 | SPACE, |
189 | NSPACE, |
190 | NSPACE, |
191 | DIGIT, |
192 | NDIGIT, |
c277df42 |
193 | CURLY, |
194 | CURLY, |
195 | NOTHING, |
196 | REF, |
197 | REF, |
198 | EVAL, |
199 | LONGJMP, |
200 | BRANCHJ, |
201 | BRANCHJ, |
202 | GROUPP, |
203 | LOGICAL, |
204 | BRANCHJ, |
205 | BRANCHJ, |
206 | NOTHING, |
a0d0e21e |
207 | }; |
fe14fcc3 |
208 | #endif |
209 | |
c277df42 |
210 | /* The following have no fixed length. char* since we do strchr on it. */ |
a687059c |
211 | #ifndef DOINIT |
fc1ce8cc |
212 | EXTCONST char varies[]; |
a687059c |
213 | #else |
fc1ce8cc |
214 | EXTCONST char varies[] = { |
c277df42 |
215 | BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, |
216 | WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, 0 |
bbce6d69 |
217 | }; |
a687059c |
218 | #endif |
219 | |
c277df42 |
220 | /* The following always have a length of 1. char* since we do strchr on it. */ |
a687059c |
221 | #ifndef DOINIT |
fc1ce8cc |
222 | EXTCONST char simple[]; |
a687059c |
223 | #else |
fc1ce8cc |
224 | EXTCONST char simple[] = { |
bbce6d69 |
225 | ANY, SANY, ANYOF, |
226 | ALNUM, ALNUML, NALNUM, NALNUML, |
227 | SPACE, SPACEL, NSPACE, NSPACEL, |
228 | DIGIT, NDIGIT, 0 |
229 | }; |
a687059c |
230 | #endif |
231 | |
a687059c |
232 | /* |
233 | * A node is one char of opcode followed by two chars of "next" pointer. |
234 | * "Next" pointers are stored as two 8-bit pieces, high order first. The |
235 | * value is a positive offset from the opcode of the node containing it. |
236 | * An operand, if any, simply follows the node. (Note that much of the |
237 | * code generation knows about this implicit relationship.) |
238 | * |
239 | * Using two bytes for the "next" pointer is vast overkill for most things, |
240 | * but allows patterns to get big without disasters. |
241 | * |
242 | * [If REGALIGN is defined, the "next" pointer is always aligned on an even |
243 | * boundary, and reads the offset directly as a short. Also, there is no |
244 | * special test to reverse the sign of BACK pointers since the offset is |
245 | * stored negative.] |
246 | */ |
247 | |
c277df42 |
248 | #ifdef REGALIGN_STRUCT |
249 | |
250 | struct regnode_string { |
251 | U8 flags; |
252 | U8 type; |
253 | U16 next_off; |
254 | U8 string[1]; |
255 | }; |
a687059c |
256 | |
c277df42 |
257 | struct regnode_1 { |
258 | U8 flags; |
259 | U8 type; |
260 | U16 next_off; |
261 | U32 arg1; |
262 | }; |
263 | |
264 | struct regnode_2 { |
265 | U8 flags; |
266 | U8 type; |
267 | U16 next_off; |
268 | U16 arg1; |
269 | U16 arg2; |
270 | }; |
271 | |
272 | #endif |
273 | |
274 | #define REG_INFTY I16_MAX |
a687059c |
275 | |
a687059c |
276 | #ifdef REGALIGN |
c277df42 |
277 | # define ARG_VALUE(arg) (arg) |
278 | # define ARG__SET(arg,val) ((arg) = (val)) |
a687059c |
279 | #else |
c277df42 |
280 | # define ARG_VALUE(arg) (((*((char*)&arg)&0377)<<8) + (*(((char*)&arg)+1)&0377)) |
281 | # define ARG__SET(arg,val) (((char*)&arg)[0] = (val) >> 8; ((char*)&arg)[1] = (val) & 0377;) |
a687059c |
282 | #endif |
c277df42 |
283 | |
284 | #define ARG(p) ARG_VALUE(ARG_LOC(p)) |
285 | #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) |
286 | #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) |
287 | #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) |
288 | #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) |
289 | #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) |
290 | |
291 | #ifndef lint |
292 | # ifdef REGALIGN |
293 | # ifdef REGALIGN_STRUCT |
294 | # define NEXT_OFF(p) ((p)->next_off) |
295 | # define NODE_ALIGN(node) |
296 | # define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */ |
297 | # else |
298 | # define NEXT_OFF(p) (*(short*)(p+1)) |
299 | # define NODE_ALIGN(node) ((!((long)node & 1)) ? node++ : 0) |
300 | # define NODE_ALIGN_FILL(node) ((!((long)node & 1)) ? *node++ = 127 : 0) |
301 | # endif |
302 | # else |
303 | # define NEXT_OFF(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) |
304 | # define NODE_ALIGN(node) |
305 | # define NODE_ALIGN_FILL(node) |
306 | # endif |
a687059c |
307 | #else /* lint */ |
c277df42 |
308 | # define NEXT_OFF(p) 0 |
309 | # define NODE_ALIGN(node) |
310 | # define NODE_ALIGN_FILL(node) |
a687059c |
311 | #endif /* lint */ |
312 | |
c277df42 |
313 | #define SIZE_ALIGN NODE_ALIGN |
314 | |
315 | #ifdef REGALIGN_STRUCT |
316 | # define OP(p) ((p)->type) |
317 | # define OPERAND(p) (((struct regnode_string *)p)->string) |
318 | # define NODE_ALIGN(node) |
319 | # define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) |
320 | # define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) |
321 | # define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) |
322 | # define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ |
323 | # define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) |
324 | #else |
325 | # define OP(p) (*(p)) |
326 | # define OPERAND(p) ((p) + 3) |
327 | # define ARG_LOC(p) (*(unsigned short*)(p+3)) |
328 | # define ARG1_LOC(p) (*(unsigned short*)(p+3)) |
329 | # define ARG2_LOC(p) (*(unsigned short*)(p+5)) |
330 | typedef char* regnode; |
331 | # define NODE_STEP_REGNODE NODE_STEP_B |
332 | # define EXTRA_STEP_2ARGS 4 |
333 | #endif |
a687059c |
334 | |
335 | #ifdef REGALIGN |
c277df42 |
336 | # define NODE_STEP_B 4 |
a687059c |
337 | #else |
c277df42 |
338 | # define NODE_STEP_B 3 |
339 | #endif |
340 | |
341 | #define NEXTOPER(p) ((p) + NODE_STEP_REGNODE) |
342 | #define PREVOPER(p) ((p) - NODE_STEP_REGNODE) |
343 | |
344 | #ifdef REGALIGN_STRUCT |
345 | # define FILL_ADVANCE_NODE(ptr, op) STMT_START { \ |
346 | (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END |
347 | # define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \ |
348 | ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END |
349 | #else |
350 | # define FILL_ADVANCE_NODE(ptr, op) STMT_START { \ |
351 | *(ptr)++ = op; *(ptr)++ = '\0'; *(ptr)++ = '\0'; } STMT_END |
352 | # define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \ |
353 | ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 2; } STMT_END |
a687059c |
354 | #endif |
355 | |
356 | #define MAGIC 0234 |
357 | |
c277df42 |
358 | #define SIZE_ONLY (regcode == ®dummy) |
359 | |
bbce6d69 |
360 | /* Flags for first parameter byte of ANYOF */ |
361 | #define ANYOF_INVERT 0x40 |
362 | #define ANYOF_FOLD 0x20 |
363 | #define ANYOF_LOCALE 0x10 |
364 | #define ANYOF_ISA 0x0F |
365 | #define ANYOF_ALNUML 0x08 |
366 | #define ANYOF_NALNUML 0x04 |
367 | #define ANYOF_SPACEL 0x02 |
368 | #define ANYOF_NSPACEL 0x01 |
369 | |
c277df42 |
370 | #ifdef REGALIGN_STRUCT |
371 | #define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1) |
372 | #else |
373 | #define ANY_SKIP 32 /* overwrite the first byte of |
374 | * the next guy. */ |
375 | #endif |
376 | |
a687059c |
377 | /* |
378 | * Utility definitions. |
379 | */ |
380 | #ifndef lint |
2304df62 |
381 | #ifndef CHARMASK |
a687059c |
382 | #define UCHARAT(p) ((int)*(unsigned char *)(p)) |
383 | #else |
2304df62 |
384 | #define UCHARAT(p) ((int)*(p)&CHARMASK) |
a687059c |
385 | #endif |
386 | #else /* lint */ |
387 | #define UCHARAT(p) regdummy |
388 | #endif /* lint */ |
389 | |
c277df42 |
390 | #define FAIL(m) croak ("/%.127s/: %s", regprecomp,m) |
391 | #define FAIL2(pat,m) re_croak2("/%.127s/: ",pat,regprecomp,m) |
392 | |
393 | #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode)) |
394 | |
395 | #ifdef REG_COMP_C |
396 | const static U8 regarglen[] = { |
397 | # ifdef REGALIGN_STRUCT |
398 | 0,0,0,0,0,0,0,0,0,0, |
399 | /*CURLY*/ EXTRA_SIZE(struct regnode_2), |
400 | /*CURLYX*/ EXTRA_SIZE(struct regnode_2), |
401 | 0,0,0,0,0,0,0,0,0,0,0,0, |
402 | /*REF*/ EXTRA_SIZE(struct regnode_1), |
403 | /*OPEN*/ EXTRA_SIZE(struct regnode_1), |
404 | /*CLOSE*/ EXTRA_SIZE(struct regnode_1), |
405 | 0,0, |
406 | /*IFMATCH*/ EXTRA_SIZE(struct regnode_1), |
407 | /*UNLESSM*/ EXTRA_SIZE(struct regnode_1), |
408 | 0,0,0,0,0,0,0,0,0,0,0,0, |
409 | /*CURLYM*/ EXTRA_SIZE(struct regnode_2), |
410 | /*CURLYN*/ EXTRA_SIZE(struct regnode_2), |
411 | 0, |
412 | /*REFF*/ EXTRA_SIZE(struct regnode_1), |
413 | /*REFFL*/ EXTRA_SIZE(struct regnode_1), |
414 | /*EVAL*/ EXTRA_SIZE(struct regnode_1), |
415 | /*LONGJMP*/ EXTRA_SIZE(struct regnode_1), |
416 | /*BRANCHJ*/ EXTRA_SIZE(struct regnode_1), |
417 | /*IFTHEN*/ EXTRA_SIZE(struct regnode_1), |
418 | /*GROUPP*/ EXTRA_SIZE(struct regnode_1), |
419 | /*LOGICAL*/ 0, |
420 | /*SUSPEND*/ EXTRA_SIZE(struct regnode_1), |
421 | /*RENUM*/ EXTRA_SIZE(struct regnode_1), 0, |
422 | # else |
423 | 0,0,0,0,0,0,0,0,0,0, |
424 | /*CURLY*/ 4, /*CURLYX*/ 4, |
425 | 0,0,0,0,0,0,0,0,0,0,0,0, |
426 | /*REF*/ 2, /*OPEN*/ 2, /*CLOSE*/ 2, |
427 | 0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2, |
428 | 0,0,0,0,0,0,0,0,0,0,0,0,/*CURLYM*/ 4,/*CURLYN*/ 4, |
429 | 0, /*REFF*/ 2, /*REFFL*/ 2, /*EVAL*/ 2, /*LONGJMP*/ 2, /*BRANCHJ*/ 2, |
430 | /*IFTHEN*/ 2, /*GROUPP*/ 2, /*LOGICAL*/ 0, /*RENUM*/ 2, /*RENUM*/ 2, 0, |
431 | # endif |
432 | }; |
433 | |
434 | const static char reg_off_by_arg[] = { |
435 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0 .. 15 */ |
436 | 0,0,0,0,0,0,0,0,0,0,0,0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2, 0, |
437 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 32 .. 47 */ |
438 | 0, /*LONGJMP*/ 1, /*BRANCHJ*/ 1, /*IFTHEN*/ 1, 0, 0, |
439 | /*RENUM*/ 1, /*RENUM*/ 1,0, |
440 | }; |
441 | #endif |
442 | |
c277df42 |
443 | #define REG_SEEN_ZERO_LEN 1 |
444 | #define REG_SEEN_LOOKBEHIND 2 |
445 | #define REG_SEEN_GPOS 4 |
446 | |
447 | #ifdef DEBUGGING |
448 | extern char *colors[4]; |
449 | #endif |
450 | |
451 | void re_croak2 _((const char* pat1,const char* pat2,...)) __attribute__((noreturn)); |