undo goofed change 1157 (backed out the fix instead of keeping it)
[p5sagit/p5-mst-13.2.git] / regcomp.h
CommitLineData
a0d0e21e 1/* regcomp.h
a687059c 2 */
3
c277df42 4typedef OP OP_4tree; /* Will be redefined later. */
5
a687059c 6/*
7 * The "internal use only" fields in regexp.h are present to pass info from
8 * compile to execute that permits the execute phase to run lots faster on
9 * simple cases. They are:
10 *
79072805 11 * regstart sv that must begin a match; Nullch if none obvious
a687059c 12 * reganch is the match anchored (at beginning-of-line only)?
13 * regmust string (pointer into program) that match must include, or NULL
79072805 14 * [regmust changed to SV* for bminstr()--law]
a687059c 15 * regmlen length of regmust string
16 * [regmlen not used currently]
17 *
18 * Regstart and reganch permit very fast decisions on suitable starting points
19 * for a match, cutting down the work a lot. Regmust permits fast rejection
20 * of lines that cannot possibly match. The regmust tests are costly enough
e50aee73 21 * that pregcomp() supplies a regmust only if the r.e. contains something
a687059c 22 * potentially expensive (at present, the only such thing detected is * or +
23 * at the start of the r.e., which can involve a lot of backup). Regmlen is
e50aee73 24 * supplied because the test in pregexec() needs it and pregcomp() is computing
a687059c 25 * it anyway.
26 * [regmust is now supplied always. The tests that use regmust have a
27 * heuristic that disables the test if it usually matches.]
28 *
29 * [In fact, we now use regmust in many cases to locate where the search
30 * starts in the string, so if regback is >= 0, the regmust search is never
31 * wasted effort. The regback variable says how many characters back from
32 * where regmust matched is the earliest possible start of the match.
33 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
34 */
35
36/*
37 * Structure for regexp "program". This is essentially a linear encoding
38 * of a nondeterministic finite-state machine (aka syntax charts or
39 * "railroad normal form" in parsing technology). Each node is an opcode
40 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
41 * all nodes except BRANCH implement concatenation; a "next" pointer with
42 * a BRANCH on both ends of it is connecting two alternatives. (Here we
43 * have one of the subtle syntax dependencies: an individual BRANCH (as
44 * opposed to a collection of them) is never concatenated with anything
45 * because of operator precedence.) The operand of some types of node is
46 * a literal string; for others, it is a node leading into a sub-FSM. In
47 * particular, the operand of a BRANCH node is the first node of the branch.
48 * (NB this is *not* a tree structure: the tail of the branch connects
49 * to the thing following the set of BRANCHes.) The opcodes are:
50 */
51
52/* definition number opnd? meaning */
bbce6d69 53#define END 0 /* no End of program. */
54#define BOL 1 /* no Match "" at beginning of line. */
55#define MBOL 2 /* no Same, assuming multiline. */
56#define SBOL 3 /* no Same, assuming singleline. */
57#define EOL 4 /* no Match "" at end of line. */
58#define MEOL 5 /* no Same, assuming multiline. */
59#define SEOL 6 /* no Same, assuming singleline. */
60#define ANY 7 /* no Match any one character (except newline). */
61#define SANY 8 /* no Match any one character. */
62#define ANYOF 9 /* sv Match character in (or not in) this class. */
a0d0e21e 63#define CURLY 10 /* sv Match this simple thing {n,m} times. */
64#define CURLYX 11 /* sv Match this complex thing {n,m} times. */
65#define BRANCH 12 /* node Match this alternative, or the next... */
66#define BACK 13 /* no Match "", "next" ptr points backward. */
bbce6d69 67#define EXACT 14 /* sv Match this string (preceded by length). */
68#define EXACTF 15 /* sv Match this string, folded (prec. by length). */
69#define EXACTFL 16 /* sv Match this string, folded in locale (w/len). */
70#define NOTHING 17 /* no Match empty string. */
71#define STAR 18 /* node Match this (simple) thing 0 or more times. */
72#define PLUS 19 /* node Match this (simple) thing 1 or more times. */
a0d0e21e 73#define BOUND 20 /* no Match "" at any word boundary */
bbce6d69 74#define BOUNDL 21 /* no Match "" at any word boundary */
75#define NBOUND 22 /* no Match "" at any word non-boundary */
76#define NBOUNDL 23 /* no Match "" at any word non-boundary */
c277df42 77#define REF 24 /* num Match some already matched string */
78#define OPEN 25 /* num Mark this point in input as start of #n. */
79#define CLOSE 26 /* num Analogous to OPEN. */
80#define MINMOD 27 /* no Next operator is not greedy. */
81#define GPOS 28 /* no Matches where last m//g left off. */
82#define IFMATCH 29 /* off Succeeds if the following matches. */
83#define UNLESSM 30 /* off Fails if the following matches. */
84#define SUCCEED 31 /* no Return from a subroutine, basically. */
85#define WHILEM 32 /* no Do curly processing and see if rest matches. */
86#define ALNUM 33 /* no Match any alphanumeric character */
87#define ALNUML 34 /* no Match any alphanumeric char in locale */
88#define NALNUM 35 /* no Match any non-alphanumeric character */
89#define NALNUML 36 /* no Match any non-alphanumeric char in locale */
90#define SPACE 37 /* no Match any whitespace character */
91#define SPACEL 38 /* no Match any whitespace char in locale */
92#define NSPACE 39 /* no Match any non-whitespace character */
93#define NSPACEL 40 /* no Match any non-whitespace char in locale */
94#define DIGIT 41 /* no Match any numeric character */
95#define NDIGIT 42 /* no Match any non-numeric character */
96#define CURLYM 43 /* no Match this medium-complex thing {n,m} times. */
97#define CURLYN 44 /* no Match next-after-this simple thing
98 {n,m} times, set parenths. */
99#define TAIL 45 /* no Match empty string. Can jump here from outside. */
100#define REFF 46 /* num Match already matched string, folded */
101#define REFFL 47 /* num Match already matched string, folded in loc. */
102#define EVAL 48 /* evl Execute some Perl code. */
e91177ed 103#define LONGJMP 49 /* off Jump far away. */
104#define BRANCHJ 50 /* off BRANCH with long offset. */
c277df42 105#define IFTHEN 51 /* off Switch, should be preceeded by switcher . */
106#define GROUPP 52 /* num Whether the group matched. */
107#define LOGICAL 53 /* no Next opcode should set the flag only. */
108#define SUSPEND 54 /* off "Independent" sub-RE. */
109#define RENUM 55 /* off Group with independently numbered parens. */
110#define OPTIMIZED 56 /* off Placeholder for dump. */
a687059c 111
112/*
113 * Opcode notes:
114 *
115 * BRANCH The set of branches constituting a single choice are hooked
116 * together with their "next" pointers, since precedence prevents
117 * anything being concatenated to any individual branch. The
118 * "next" pointer of the last BRANCH in a choice points to the
119 * thing following the whole choice. This is also where the
120 * final "next" pointer of each individual branch points; each
121 * branch starts with the operand node of a BRANCH node.
122 *
123 * BACK Normal "next" pointers all implicitly point forward; BACK
124 * exists to make loop structures possible.
125 *
126 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
127 * BRANCH structures using BACK. Simple cases (one character
128 * per match) are implemented with STAR and PLUS for speed
129 * and to minimize recursive plunges.
130 *
c277df42 131 * OPEN,CLOSE,GROUPP ...are numbered at compile time.
a687059c 132 */
133
fe14fcc3 134#ifndef DOINIT
161b471a 135EXTCONST U8 regkind[];
a0d0e21e 136#else
161b471a 137EXTCONST U8 regkind[] = {
a0d0e21e 138 END,
139 BOL,
140 BOL,
141 BOL,
142 EOL,
143 EOL,
144 EOL,
145 ANY,
146 ANY,
147 ANYOF,
148 CURLY,
149 CURLY,
150 BRANCH,
151 BACK,
bbce6d69 152 EXACT,
153 EXACT,
154 EXACT,
a0d0e21e 155 NOTHING,
156 STAR,
157 PLUS,
bbce6d69 158 BOUND,
a0d0e21e 159 BOUND,
160 NBOUND,
bbce6d69 161 NBOUND,
a0d0e21e 162 REF,
163 OPEN,
164 CLOSE,
165 MINMOD,
774d564b 166 GPOS,
c277df42 167 BRANCHJ,
168 BRANCHJ,
a0d0e21e 169 END,
bbce6d69 170 WHILEM,
171 ALNUM,
172 ALNUM,
173 NALNUM,
174 NALNUM,
175 SPACE,
176 SPACE,
177 NSPACE,
178 NSPACE,
179 DIGIT,
180 NDIGIT,
c277df42 181 CURLY,
182 CURLY,
183 NOTHING,
184 REF,
185 REF,
186 EVAL,
187 LONGJMP,
188 BRANCHJ,
189 BRANCHJ,
190 GROUPP,
191 LOGICAL,
192 BRANCHJ,
193 BRANCHJ,
194 NOTHING,
a0d0e21e 195};
fe14fcc3 196#endif
197
c277df42 198/* The following have no fixed length. char* since we do strchr on it. */
a687059c 199#ifndef DOINIT
fc1ce8cc 200EXTCONST char varies[];
a687059c 201#else
fc1ce8cc 202EXTCONST char varies[] = {
c277df42 203 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
204 WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, 0
bbce6d69 205};
a687059c 206#endif
207
c277df42 208/* The following always have a length of 1. char* since we do strchr on it. */
a687059c 209#ifndef DOINIT
fc1ce8cc 210EXTCONST char simple[];
a687059c 211#else
fc1ce8cc 212EXTCONST char simple[] = {
bbce6d69 213 ANY, SANY, ANYOF,
214 ALNUM, ALNUML, NALNUM, NALNUML,
215 SPACE, SPACEL, NSPACE, NSPACEL,
216 DIGIT, NDIGIT, 0
217};
a687059c 218#endif
219
a687059c 220/*
221 * A node is one char of opcode followed by two chars of "next" pointer.
222 * "Next" pointers are stored as two 8-bit pieces, high order first. The
223 * value is a positive offset from the opcode of the node containing it.
224 * An operand, if any, simply follows the node. (Note that much of the
225 * code generation knows about this implicit relationship.)
226 *
227 * Using two bytes for the "next" pointer is vast overkill for most things,
228 * but allows patterns to get big without disasters.
229 *
e91177ed 230 * [The "next" pointer is always aligned on an even
a687059c 231 * boundary, and reads the offset directly as a short. Also, there is no
232 * special test to reverse the sign of BACK pointers since the offset is
233 * stored negative.]
234 */
235
c277df42 236struct regnode_string {
237 U8 flags;
238 U8 type;
239 U16 next_off;
240 U8 string[1];
241};
a687059c 242
c277df42 243struct regnode_1 {
244 U8 flags;
245 U8 type;
246 U16 next_off;
247 U32 arg1;
248};
249
250struct regnode_2 {
251 U8 flags;
252 U8 type;
253 U16 next_off;
254 U16 arg1;
255 U16 arg2;
256};
257
9c7e81e8 258/* XXX fix this description.
259 Impose a limit of REG_INFTY on various pattern matching operations
260 to limit stack growth and to avoid "infinite" recursions.
261*/
262/* The default size for REG_INFTY is I16_MAX, which is the same as
263 SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits
264 (see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is
265 ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is
266 ((1<<63)-1). To limit stack growth to reasonable sizes, supply a
267 smaller default.
268 --Andy Dougherty 11 June 1998
269*/
270#if SHORTSIZE > 2
271# ifndef REG_INFTY
272# define REG_INFTY ((1<<15)-1)
273# endif
274#endif
275
276#ifndef REG_INFTY
83cfe48c 277# define REG_INFTY I16_MAX
278#endif
a687059c 279
e91177ed 280#define ARG_VALUE(arg) (arg)
281#define ARG__SET(arg,val) ((arg) = (val))
c277df42 282
283#define ARG(p) ARG_VALUE(ARG_LOC(p))
284#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
285#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
286#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
287#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
288#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
289
290#ifndef lint
e91177ed 291# define NEXT_OFF(p) ((p)->next_off)
292# define NODE_ALIGN(node)
293# define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
a687059c 294#else /* lint */
c277df42 295# define NEXT_OFF(p) 0
e91177ed 296# define NODE_ALIGN(node)
297# define NODE_ALIGN_FILL(node)
a687059c 298#endif /* lint */
299
c277df42 300#define SIZE_ALIGN NODE_ALIGN
301
e91177ed 302#define OP(p) ((p)->type)
303#define OPERAND(p) (((struct regnode_string *)p)->string)
304#define NODE_ALIGN(node)
305#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
306#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
307#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
308#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
309#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
310
311#define NODE_STEP_B 4
c277df42 312
313#define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
314#define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
315
e91177ed 316#define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
c277df42 317 (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
e91177ed 318#define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
c277df42 319 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
a687059c 320
321#define MAGIC 0234
322
c277df42 323#define SIZE_ONLY (regcode == &regdummy)
324
bbce6d69 325/* Flags for first parameter byte of ANYOF */
326#define ANYOF_INVERT 0x40
327#define ANYOF_FOLD 0x20
328#define ANYOF_LOCALE 0x10
329#define ANYOF_ISA 0x0F
330#define ANYOF_ALNUML 0x08
331#define ANYOF_NALNUML 0x04
332#define ANYOF_SPACEL 0x02
333#define ANYOF_NSPACEL 0x01
334
ae5c130c 335/* Utility macros for bitmap of ANYOF */
336#define ANYOF_BYTE(p,c) (p)[1 + (((c) >> 3) & 31)]
337#define ANYOF_BIT(c) (1 << ((c) & 7))
338#define ANYOF_SET(p,c) (ANYOF_BYTE(p,c) |= ANYOF_BIT(c))
339#define ANYOF_CLEAR(p,c) (ANYOF_BYTE(p,c) &= ~ANYOF_BIT(c))
340#define ANYOF_TEST(p,c) (ANYOF_BYTE(p,c) & ANYOF_BIT(c))
341
c277df42 342#define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1)
c277df42 343
a687059c 344/*
345 * Utility definitions.
346 */
347#ifndef lint
2304df62 348#ifndef CHARMASK
a687059c 349#define UCHARAT(p) ((int)*(unsigned char *)(p))
350#else
2304df62 351#define UCHARAT(p) ((int)*(p)&CHARMASK)
a687059c 352#endif
353#else /* lint */
354#define UCHARAT(p) regdummy
355#endif /* lint */
356
c277df42 357#define FAIL(m) croak ("/%.127s/: %s", regprecomp,m)
358#define FAIL2(pat,m) re_croak2("/%.127s/: ",pat,regprecomp,m)
359
360#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
361
362#ifdef REG_COMP_C
363const static U8 regarglen[] = {
c277df42 364 0,0,0,0,0,0,0,0,0,0,
365 /*CURLY*/ EXTRA_SIZE(struct regnode_2),
366 /*CURLYX*/ EXTRA_SIZE(struct regnode_2),
367 0,0,0,0,0,0,0,0,0,0,0,0,
368 /*REF*/ EXTRA_SIZE(struct regnode_1),
369 /*OPEN*/ EXTRA_SIZE(struct regnode_1),
370 /*CLOSE*/ EXTRA_SIZE(struct regnode_1),
371 0,0,
372 /*IFMATCH*/ EXTRA_SIZE(struct regnode_1),
373 /*UNLESSM*/ EXTRA_SIZE(struct regnode_1),
374 0,0,0,0,0,0,0,0,0,0,0,0,
375 /*CURLYM*/ EXTRA_SIZE(struct regnode_2),
376 /*CURLYN*/ EXTRA_SIZE(struct regnode_2),
377 0,
378 /*REFF*/ EXTRA_SIZE(struct regnode_1),
379 /*REFFL*/ EXTRA_SIZE(struct regnode_1),
380 /*EVAL*/ EXTRA_SIZE(struct regnode_1),
381 /*LONGJMP*/ EXTRA_SIZE(struct regnode_1),
382 /*BRANCHJ*/ EXTRA_SIZE(struct regnode_1),
383 /*IFTHEN*/ EXTRA_SIZE(struct regnode_1),
384 /*GROUPP*/ EXTRA_SIZE(struct regnode_1),
385 /*LOGICAL*/ 0,
386 /*SUSPEND*/ EXTRA_SIZE(struct regnode_1),
387 /*RENUM*/ EXTRA_SIZE(struct regnode_1), 0,
c277df42 388};
389
390const static char reg_off_by_arg[] = {
391 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0 .. 15 */
392 0,0,0,0,0,0,0,0,0,0,0,0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2, 0,
393 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 32 .. 47 */
394 0, /*LONGJMP*/ 1, /*BRANCHJ*/ 1, /*IFTHEN*/ 1, 0, 0,
395 /*RENUM*/ 1, /*RENUM*/ 1,0,
396};
397#endif
398
c277df42 399#define REG_SEEN_ZERO_LEN 1
400#define REG_SEEN_LOOKBEHIND 2
401#define REG_SEEN_GPOS 4
ce862d02 402#define REG_SEEN_EVAL 8