4 typedef OP OP_4tree; /* Will be redefined later. */
7 * The "internal use only" fields in regexp.h are present to pass info from
8 * compile to execute that permits the execute phase to run lots faster on
9 * simple cases. They are:
11 * regstart sv that must begin a match; Nullch if none obvious
12 * reganch is the match anchored (at beginning-of-line only)?
13 * regmust string (pointer into program) that match must include, or NULL
14 * [regmust changed to SV* for bminstr()--law]
15 * regmlen length of regmust string
16 * [regmlen not used currently]
18 * Regstart and reganch permit very fast decisions on suitable starting points
19 * for a match, cutting down the work a lot. Regmust permits fast rejection
20 * of lines that cannot possibly match. The regmust tests are costly enough
21 * that pregcomp() supplies a regmust only if the r.e. contains something
22 * potentially expensive (at present, the only such thing detected is * or +
23 * at the start of the r.e., which can involve a lot of backup). Regmlen is
24 * supplied because the test in pregexec() needs it and pregcomp() is computing
26 * [regmust is now supplied always. The tests that use regmust have a
27 * heuristic that disables the test if it usually matches.]
29 * [In fact, we now use regmust in many cases to locate where the search
30 * starts in the string, so if regback is >= 0, the regmust search is never
31 * wasted effort. The regback variable says how many characters back from
32 * where regmust matched is the earliest possible start of the match.
33 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
37 * Structure for regexp "program". This is essentially a linear encoding
38 * of a nondeterministic finite-state machine (aka syntax charts or
39 * "railroad normal form" in parsing technology). Each node is an opcode
40 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
41 * all nodes except BRANCH implement concatenation; a "next" pointer with
42 * a BRANCH on both ends of it is connecting two alternatives. (Here we
43 * have one of the subtle syntax dependencies: an individual BRANCH (as
44 * opposed to a collection of them) is never concatenated with anything
45 * because of operator precedence.) The operand of some types of node is
46 * a literal string; for others, it is a node leading into a sub-FSM. In
47 * particular, the operand of a BRANCH node is the first node of the branch.
48 * (NB this is *not* a tree structure: the tail of the branch connects
49 * to the thing following the set of BRANCHes.) The opcodes are:
52 /* definition number opnd? meaning */
53 #define END 0 /* no End of program. */
54 #define BOL 1 /* no Match "" at beginning of line. */
55 #define MBOL 2 /* no Same, assuming multiline. */
56 #define SBOL 3 /* no Same, assuming singleline. */
57 #define EOL 4 /* no Match "" at end of line. */
58 #define MEOL 5 /* no Same, assuming multiline. */
59 #define SEOL 6 /* no Same, assuming singleline. */
60 #define ANY 7 /* no Match any one character (except newline). */
61 #define SANY 8 /* no Match any one character. */
62 #define ANYOF 9 /* sv Match character in (or not in) this class. */
63 #define CURLY 10 /* sv Match this simple thing {n,m} times. */
64 #define CURLYX 11 /* sv Match this complex thing {n,m} times. */
65 #define BRANCH 12 /* node Match this alternative, or the next... */
66 #define BACK 13 /* no Match "", "next" ptr points backward. */
67 #define EXACT 14 /* sv Match this string (preceded by length). */
68 #define EXACTF 15 /* sv Match this string, folded (prec. by length). */
69 #define EXACTFL 16 /* sv Match this string, folded in locale (w/len). */
70 #define NOTHING 17 /* no Match empty string. */
71 #define STAR 18 /* node Match this (simple) thing 0 or more times. */
72 #define PLUS 19 /* node Match this (simple) thing 1 or more times. */
73 #define BOUND 20 /* no Match "" at any word boundary */
74 #define BOUNDL 21 /* no Match "" at any word boundary */
75 #define NBOUND 22 /* no Match "" at any word non-boundary */
76 #define NBOUNDL 23 /* no Match "" at any word non-boundary */
77 #define REF 24 /* num Match some already matched string */
78 #define OPEN 25 /* num Mark this point in input as start of #n. */
79 #define CLOSE 26 /* num Analogous to OPEN. */
80 #define MINMOD 27 /* no Next operator is not greedy. */
81 #define GPOS 28 /* no Matches where last m//g left off. */
82 #define IFMATCH 29 /* off Succeeds if the following matches. */
83 #define UNLESSM 30 /* off Fails if the following matches. */
84 #define SUCCEED 31 /* no Return from a subroutine, basically. */
85 #define WHILEM 32 /* no Do curly processing and see if rest matches. */
86 #define ALNUM 33 /* no Match any alphanumeric character */
87 #define ALNUML 34 /* no Match any alphanumeric char in locale */
88 #define NALNUM 35 /* no Match any non-alphanumeric character */
89 #define NALNUML 36 /* no Match any non-alphanumeric char in locale */
90 #define SPACE 37 /* no Match any whitespace character */
91 #define SPACEL 38 /* no Match any whitespace char in locale */
92 #define NSPACE 39 /* no Match any non-whitespace character */
93 #define NSPACEL 40 /* no Match any non-whitespace char in locale */
94 #define DIGIT 41 /* no Match any numeric character */
95 #define NDIGIT 42 /* no Match any non-numeric character */
96 #define CURLYM 43 /* no Match this medium-complex thing {n,m} times. */
97 #define CURLYN 44 /* no Match next-after-this simple thing
98 {n,m} times, set parenths. */
99 #define TAIL 45 /* no Match empty string. Can jump here from outside. */
100 #define REFF 46 /* num Match already matched string, folded */
101 #define REFFL 47 /* num Match already matched string, folded in loc. */
102 #define EVAL 48 /* evl Execute some Perl code. */
103 #define LONGJMP 49 /* off Jump far away. */
104 #define BRANCHJ 50 /* off BRANCH with long offset. */
105 #define IFTHEN 51 /* off Switch, should be preceeded by switcher . */
106 #define GROUPP 52 /* num Whether the group matched. */
107 #define LOGICAL 53 /* no Next opcode should set the flag only. */
108 #define SUSPEND 54 /* off "Independent" sub-RE. */
109 #define RENUM 55 /* off Group with independently numbered parens. */
110 #define OPTIMIZED 56 /* off Placeholder for dump. */
115 * BRANCH The set of branches constituting a single choice are hooked
116 * together with their "next" pointers, since precedence prevents
117 * anything being concatenated to any individual branch. The
118 * "next" pointer of the last BRANCH in a choice points to the
119 * thing following the whole choice. This is also where the
120 * final "next" pointer of each individual branch points; each
121 * branch starts with the operand node of a BRANCH node.
123 * BACK Normal "next" pointers all implicitly point forward; BACK
124 * exists to make loop structures possible.
126 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
127 * BRANCH structures using BACK. Simple cases (one character
128 * per match) are implemented with STAR and PLUS for speed
129 * and to minimize recursive plunges.
131 * OPEN,CLOSE,GROUPP ...are numbered at compile time.
135 EXTCONST U8 regkind[];
137 EXTCONST U8 regkind[] = {
198 /* The following have no fixed length. char* since we do strchr on it. */
200 EXTCONST char varies[];
202 EXTCONST char varies[] = {
203 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
204 WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, 0
208 /* The following always have a length of 1. char* since we do strchr on it. */
210 EXTCONST char simple[];
212 EXTCONST char simple[] = {
214 ALNUM, ALNUML, NALNUM, NALNUML,
215 SPACE, SPACEL, NSPACE, NSPACEL,
221 * A node is one char of opcode followed by two chars of "next" pointer.
222 * "Next" pointers are stored as two 8-bit pieces, high order first. The
223 * value is a positive offset from the opcode of the node containing it.
224 * An operand, if any, simply follows the node. (Note that much of the
225 * code generation knows about this implicit relationship.)
227 * Using two bytes for the "next" pointer is vast overkill for most things,
228 * but allows patterns to get big without disasters.
230 * [The "next" pointer is always aligned on an even
231 * boundary, and reads the offset directly as a short. Also, there is no
232 * special test to reverse the sign of BACK pointers since the offset is
236 struct regnode_string {
258 /* XXX fix this description.
259 Impose a limit of REG_INFTY on various pattern matching operations
260 to limit stack growth and to avoid "infinite" recursions.
262 /* The default size for REG_INFTY is I16_MAX, which is the same as
263 SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits
264 (see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is
265 ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is
266 ((1<<63)-1). To limit stack growth to reasonable sizes, supply a
268 --Andy Dougherty 11 June 1998
272 # define REG_INFTY ((1<<15)-1)
277 # define REG_INFTY I16_MAX
280 #define ARG_VALUE(arg) (arg)
281 #define ARG__SET(arg,val) ((arg) = (val))
283 #define ARG(p) ARG_VALUE(ARG_LOC(p))
284 #define ARG1(p) ARG_VALUE(ARG1_LOC(p))
285 #define ARG2(p) ARG_VALUE(ARG2_LOC(p))
286 #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
287 #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
288 #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
291 # define NEXT_OFF(p) ((p)->next_off)
292 # define NODE_ALIGN(node)
293 # define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
295 # define NEXT_OFF(p) 0
296 # define NODE_ALIGN(node)
297 # define NODE_ALIGN_FILL(node)
300 #define SIZE_ALIGN NODE_ALIGN
302 #define OP(p) ((p)->type)
303 #define OPERAND(p) (((struct regnode_string *)p)->string)
304 #define NODE_ALIGN(node)
305 #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
306 #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
307 #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
308 #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
309 #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
311 #define NODE_STEP_B 4
313 #define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
314 #define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
316 #define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
317 (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
318 #define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
319 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
323 #define SIZE_ONLY (regcode == ®dummy)
325 /* Flags for first parameter byte of ANYOF */
326 #define ANYOF_INVERT 0x40
327 #define ANYOF_FOLD 0x20
328 #define ANYOF_LOCALE 0x10
329 #define ANYOF_ISA 0x0F
330 #define ANYOF_ALNUML 0x08
331 #define ANYOF_NALNUML 0x04
332 #define ANYOF_SPACEL 0x02
333 #define ANYOF_NSPACEL 0x01
335 /* Utility macros for bitmap of ANYOF */
336 #define ANYOF_BYTE(p,c) (p)[1 + (((c) >> 3) & 31)]
337 #define ANYOF_BIT(c) (1 << ((c) & 7))
338 #define ANYOF_SET(p,c) (ANYOF_BYTE(p,c) |= ANYOF_BIT(c))
339 #define ANYOF_CLEAR(p,c) (ANYOF_BYTE(p,c) &= ~ANYOF_BIT(c))
340 #define ANYOF_TEST(p,c) (ANYOF_BYTE(p,c) & ANYOF_BIT(c))
342 #define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1)
345 * Utility definitions.
349 #define UCHARAT(p) ((int)*(unsigned char *)(p))
351 #define UCHARAT(p) ((int)*(p)&CHARMASK)
354 #define UCHARAT(p) regdummy
357 #define FAIL(m) croak ("/%.127s/: %s", regprecomp,m)
358 #define FAIL2(pat,m) re_croak2("/%.127s/: ",pat,regprecomp,m)
360 #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
363 const static U8 regarglen[] = {
364 0,0,0,0,0,0,0,0,0,0,
365 /*CURLY*/ EXTRA_SIZE(struct regnode_2),
366 /*CURLYX*/ EXTRA_SIZE(struct regnode_2),
367 0,0,0,0,0,0,0,0,0,0,0,0,
368 /*REF*/ EXTRA_SIZE(struct regnode_1),
369 /*OPEN*/ EXTRA_SIZE(struct regnode_1),
370 /*CLOSE*/ EXTRA_SIZE(struct regnode_1),
372 /*IFMATCH*/ EXTRA_SIZE(struct regnode_1),
373 /*UNLESSM*/ EXTRA_SIZE(struct regnode_1),
374 0,0,0,0,0,0,0,0,0,0,0,0,
375 /*CURLYM*/ EXTRA_SIZE(struct regnode_2),
376 /*CURLYN*/ EXTRA_SIZE(struct regnode_2),
378 /*REFF*/ EXTRA_SIZE(struct regnode_1),
379 /*REFFL*/ EXTRA_SIZE(struct regnode_1),
380 /*EVAL*/ EXTRA_SIZE(struct regnode_1),
381 /*LONGJMP*/ EXTRA_SIZE(struct regnode_1),
382 /*BRANCHJ*/ EXTRA_SIZE(struct regnode_1),
383 /*IFTHEN*/ EXTRA_SIZE(struct regnode_1),
384 /*GROUPP*/ EXTRA_SIZE(struct regnode_1),
386 /*SUSPEND*/ EXTRA_SIZE(struct regnode_1),
387 /*RENUM*/ EXTRA_SIZE(struct regnode_1), 0,
390 const static char reg_off_by_arg[] = {
391 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0 .. 15 */
392 0,0,0,0,0,0,0,0,0,0,0,0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2, 0,
393 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 32 .. 47 */
394 0, /*LONGJMP*/ 1, /*BRANCHJ*/ 1, /*IFTHEN*/ 1, 0, 0,
395 /*RENUM*/ 1, /*RENUM*/ 1,0,
399 #define REG_SEEN_ZERO_LEN 1
400 #define REG_SEEN_LOOKBEHIND 2
401 #define REG_SEEN_GPOS 4
402 #define REG_SEEN_EVAL 8