4 typedef OP OP_4tree; /* Will be redefined later. */
7 * The "internal use only" fields in regexp.h are present to pass info from
8 * compile to execute that permits the execute phase to run lots faster on
9 * simple cases. They are:
11 * regstart sv that must begin a match; Nullch if none obvious
12 * reganch is the match anchored (at beginning-of-line only)?
13 * regmust string (pointer into program) that match must include, or NULL
14 * [regmust changed to SV* for bminstr()--law]
15 * regmlen length of regmust string
16 * [regmlen not used currently]
18 * Regstart and reganch permit very fast decisions on suitable starting points
19 * for a match, cutting down the work a lot. Regmust permits fast rejection
20 * of lines that cannot possibly match. The regmust tests are costly enough
21 * that pregcomp() supplies a regmust only if the r.e. contains something
22 * potentially expensive (at present, the only such thing detected is * or +
23 * at the start of the r.e., which can involve a lot of backup). Regmlen is
24 * supplied because the test in pregexec() needs it and pregcomp() is computing
26 * [regmust is now supplied always. The tests that use regmust have a
27 * heuristic that disables the test if it usually matches.]
29 * [In fact, we now use regmust in many cases to locate where the search
30 * starts in the string, so if regback is >= 0, the regmust search is never
31 * wasted effort. The regback variable says how many characters back from
32 * where regmust matched is the earliest possible start of the match.
33 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
37 * Structure for regexp "program". This is essentially a linear encoding
38 * of a nondeterministic finite-state machine (aka syntax charts or
39 * "railroad normal form" in parsing technology). Each node is an opcode
40 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
41 * all nodes except BRANCH implement concatenation; a "next" pointer with
42 * a BRANCH on both ends of it is connecting two alternatives. (Here we
43 * have one of the subtle syntax dependencies: an individual BRANCH (as
44 * opposed to a collection of them) is never concatenated with anything
45 * because of operator precedence.) The operand of some types of node is
46 * a literal string; for others, it is a node leading into a sub-FSM. In
47 * particular, the operand of a BRANCH node is the first node of the branch.
48 * (NB this is *not* a tree structure: the tail of the branch connects
49 * to the thing following the set of BRANCHes.) The opcodes are:
53 * A node is one char of opcode followed by two chars of "next" pointer.
54 * "Next" pointers are stored as two 8-bit pieces, high order first. The
55 * value is a positive offset from the opcode of the node containing it.
56 * An operand, if any, simply follows the node. (Note that much of the
57 * code generation knows about this implicit relationship.)
59 * Using two bytes for the "next" pointer is vast overkill for most things,
60 * but allows patterns to get big without disasters.
62 * [The "next" pointer is always aligned on an even
63 * boundary, and reads the offset directly as a short. Also, there is no
64 * special test to reverse the sign of BACK pointers since the offset is
68 struct regnode_string {
90 /* XXX fix this description.
91 Impose a limit of REG_INFTY on various pattern matching operations
92 to limit stack growth and to avoid "infinite" recursions.
94 /* The default size for REG_INFTY is I16_MAX, which is the same as
95 SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits
96 (see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is
97 ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is
98 ((1<<63)-1). To limit stack growth to reasonable sizes, supply a
100 --Andy Dougherty 11 June 1998
104 # define REG_INFTY ((1<<15)-1)
109 # define REG_INFTY I16_MAX
112 #define ARG_VALUE(arg) (arg)
113 #define ARG__SET(arg,val) ((arg) = (val))
115 #define ARG(p) ARG_VALUE(ARG_LOC(p))
116 #define ARG1(p) ARG_VALUE(ARG1_LOC(p))
117 #define ARG2(p) ARG_VALUE(ARG2_LOC(p))
118 #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
119 #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
120 #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
123 # define NEXT_OFF(p) ((p)->next_off)
124 # define NODE_ALIGN(node)
125 # define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
127 # define NEXT_OFF(p) 0
128 # define NODE_ALIGN(node)
129 # define NODE_ALIGN_FILL(node)
132 #define SIZE_ALIGN NODE_ALIGN
134 #define OP(p) ((p)->type)
135 #define OPERAND(p) (((struct regnode_string *)p)->string)
136 #define NODE_ALIGN(node)
137 #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
138 #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
139 #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
140 #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
141 #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
143 #define NODE_STEP_B 4
145 #define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
146 #define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
148 #define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
149 (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
150 #define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
151 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
155 #define SIZE_ONLY (PL_regcode == &PL_regdummy)
157 /* Flags for first parameter byte of ANYOF */
158 #define ANYOF_INVERT 0x40
159 #define ANYOF_FOLD 0x20
160 #define ANYOF_LOCALE 0x10
161 #define ANYOF_ISA 0x0F
162 #define ANYOF_ALNUML 0x08
163 #define ANYOF_NALNUML 0x04
164 #define ANYOF_SPACEL 0x02
165 #define ANYOF_NSPACEL 0x01
167 /* Utility macros for bitmap of ANYOF */
168 #define ANYOF_BYTE(p,c) (p)[1 + (((c) >> 3) & 31)]
169 #define ANYOF_BIT(c) (1 << ((c) & 7))
170 #define ANYOF_SET(p,c) (ANYOF_BYTE(p,c) |= ANYOF_BIT(c))
171 #define ANYOF_CLEAR(p,c) (ANYOF_BYTE(p,c) &= ~ANYOF_BIT(c))
172 #define ANYOF_TEST(p,c) (ANYOF_BYTE(p,c) & ANYOF_BIT(c))
174 #define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1)
177 * Utility definitions.
181 #define UCHARAT(p) ((int)*(unsigned char *)(p))
183 #define UCHARAT(p) ((int)*(p)&CHARMASK)
186 #define UCHARAT(p) PL_regdummy
189 #define FAIL(m) croak ("/%.127s/: %s", PL_regprecomp,m)
190 #define FAIL2(pat,m) re_croak2("/%.127s/: ",pat,PL_regprecomp,m)
192 #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
194 #define REG_SEEN_ZERO_LEN 1
195 #define REG_SEEN_LOOKBEHIND 2
196 #define REG_SEEN_GPOS 4
197 #define REG_SEEN_EVAL 8
199 #include "regnodes.h"
201 /* The following have no fixed length. char* since we do strchr on it. */
203 EXTCONST char varies[];
205 EXTCONST char varies[] = {
206 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
207 WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, 0
211 /* The following always have a length of 1. char* since we do strchr on it. */
212 /* (Note that lenght 1 means "one character" under UTF8, not "one octet".) */
214 EXTCONST char simple[];
216 EXTCONST char simple[] = {
217 ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8,
218 ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8,
219 NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8,
220 SPACE, SPACEUTF8, SPACEL, SPACELUTF8,
221 NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8,
222 DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0