Introduce pp_lock.
[p5sagit/p5-mst-13.2.git] / regcomp.h
CommitLineData
a0d0e21e 1/* regcomp.h
a687059c 2 */
3
4/*
5 * The "internal use only" fields in regexp.h are present to pass info from
6 * compile to execute that permits the execute phase to run lots faster on
7 * simple cases. They are:
8 *
79072805 9 * regstart sv that must begin a match; Nullch if none obvious
a687059c 10 * reganch is the match anchored (at beginning-of-line only)?
11 * regmust string (pointer into program) that match must include, or NULL
79072805 12 * [regmust changed to SV* for bminstr()--law]
a687059c 13 * regmlen length of regmust string
14 * [regmlen not used currently]
15 *
16 * Regstart and reganch permit very fast decisions on suitable starting points
17 * for a match, cutting down the work a lot. Regmust permits fast rejection
18 * of lines that cannot possibly match. The regmust tests are costly enough
e50aee73 19 * that pregcomp() supplies a regmust only if the r.e. contains something
a687059c 20 * potentially expensive (at present, the only such thing detected is * or +
21 * at the start of the r.e., which can involve a lot of backup). Regmlen is
e50aee73 22 * supplied because the test in pregexec() needs it and pregcomp() is computing
a687059c 23 * it anyway.
24 * [regmust is now supplied always. The tests that use regmust have a
25 * heuristic that disables the test if it usually matches.]
26 *
27 * [In fact, we now use regmust in many cases to locate where the search
28 * starts in the string, so if regback is >= 0, the regmust search is never
29 * wasted effort. The regback variable says how many characters back from
30 * where regmust matched is the earliest possible start of the match.
31 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
32 */
33
34/*
35 * Structure for regexp "program". This is essentially a linear encoding
36 * of a nondeterministic finite-state machine (aka syntax charts or
37 * "railroad normal form" in parsing technology). Each node is an opcode
38 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
39 * all nodes except BRANCH implement concatenation; a "next" pointer with
40 * a BRANCH on both ends of it is connecting two alternatives. (Here we
41 * have one of the subtle syntax dependencies: an individual BRANCH (as
42 * opposed to a collection of them) is never concatenated with anything
43 * because of operator precedence.) The operand of some types of node is
44 * a literal string; for others, it is a node leading into a sub-FSM. In
45 * particular, the operand of a BRANCH node is the first node of the branch.
46 * (NB this is *not* a tree structure: the tail of the branch connects
47 * to the thing following the set of BRANCHes.) The opcodes are:
48 */
49
50/* definition number opnd? meaning */
bbce6d69 51#define END 0 /* no End of program. */
52#define BOL 1 /* no Match "" at beginning of line. */
53#define MBOL 2 /* no Same, assuming multiline. */
54#define SBOL 3 /* no Same, assuming singleline. */
55#define EOL 4 /* no Match "" at end of line. */
56#define MEOL 5 /* no Same, assuming multiline. */
57#define SEOL 6 /* no Same, assuming singleline. */
58#define ANY 7 /* no Match any one character (except newline). */
59#define SANY 8 /* no Match any one character. */
60#define ANYOF 9 /* sv Match character in (or not in) this class. */
a0d0e21e 61#define CURLY 10 /* sv Match this simple thing {n,m} times. */
62#define CURLYX 11 /* sv Match this complex thing {n,m} times. */
63#define BRANCH 12 /* node Match this alternative, or the next... */
64#define BACK 13 /* no Match "", "next" ptr points backward. */
bbce6d69 65#define EXACT 14 /* sv Match this string (preceded by length). */
66#define EXACTF 15 /* sv Match this string, folded (prec. by length). */
67#define EXACTFL 16 /* sv Match this string, folded in locale (w/len). */
68#define NOTHING 17 /* no Match empty string. */
69#define STAR 18 /* node Match this (simple) thing 0 or more times. */
70#define PLUS 19 /* node Match this (simple) thing 1 or more times. */
a0d0e21e 71#define BOUND 20 /* no Match "" at any word boundary */
bbce6d69 72#define BOUNDL 21 /* no Match "" at any word boundary */
73#define NBOUND 22 /* no Match "" at any word non-boundary */
74#define NBOUNDL 23 /* no Match "" at any word non-boundary */
75#define REF 24 /* num Match some already matched string */
76#define OPEN 25 /* num Mark this point in input as start of #n. */
77#define CLOSE 26 /* num Analogous to OPEN. */
78#define MINMOD 27 /* no Next operator is not greedy. */
774d564b 79#define GPOS 28 /* no Matches where last m//g left off. */
bbce6d69 80#define IFMATCH 29 /* no Succeeds if the following matches. */
81#define UNLESSM 30 /* no Fails if the following matches. */
82#define SUCCEED 31 /* no Return from a subroutine, basically. */
83#define WHILEM 32 /* no Do curly processing and see if rest matches. */
84#define ALNUM 33 /* no Match any alphanumeric character */
85#define ALNUML 34 /* no Match any alphanumeric char in locale */
86#define NALNUM 35 /* no Match any non-alphanumeric character */
87#define NALNUML 36 /* no Match any non-alphanumeric char in locale */
88#define SPACE 37 /* no Match any whitespace character */
89#define SPACEL 38 /* no Match any whitespace char in locale */
90#define NSPACE 39 /* no Match any non-whitespace character */
91#define NSPACEL 40 /* no Match any non-whitespace char in locale */
92#define DIGIT 41 /* no Match any numeric character */
93#define NDIGIT 42 /* no Match any non-numeric character */
a687059c 94
95/*
96 * Opcode notes:
97 *
98 * BRANCH The set of branches constituting a single choice are hooked
99 * together with their "next" pointers, since precedence prevents
100 * anything being concatenated to any individual branch. The
101 * "next" pointer of the last BRANCH in a choice points to the
102 * thing following the whole choice. This is also where the
103 * final "next" pointer of each individual branch points; each
104 * branch starts with the operand node of a BRANCH node.
105 *
106 * BACK Normal "next" pointers all implicitly point forward; BACK
107 * exists to make loop structures possible.
108 *
109 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
110 * BRANCH structures using BACK. Simple cases (one character
111 * per match) are implemented with STAR and PLUS for speed
112 * and to minimize recursive plunges.
113 *
114 * OPEN,CLOSE ...are numbered at compile time.
115 */
116
fe14fcc3 117#ifndef DOINIT
a0d0e21e 118EXT char regarglen[];
119#else
bbce6d69 120EXT char regarglen[] = {
121 0,0,0,0,0,0,0,0,0,0,
122 /*CURLY*/ 4, /*CURLYX*/ 4,
123 0,0,0,0,0,0,0,0,0,0,0,0,
124 /*REF*/ 2, /*OPEN*/ 2, /*CLOSE*/ 2,
125 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
126};
a0d0e21e 127#endif
128
129#ifndef DOINIT
130EXT char regkind[];
fe14fcc3 131#else
a0d0e21e 132EXT char regkind[] = {
133 END,
134 BOL,
135 BOL,
136 BOL,
137 EOL,
138 EOL,
139 EOL,
140 ANY,
141 ANY,
142 ANYOF,
143 CURLY,
144 CURLY,
145 BRANCH,
146 BACK,
bbce6d69 147 EXACT,
148 EXACT,
149 EXACT,
a0d0e21e 150 NOTHING,
151 STAR,
152 PLUS,
bbce6d69 153 BOUND,
a0d0e21e 154 BOUND,
155 NBOUND,
bbce6d69 156 NBOUND,
a0d0e21e 157 REF,
158 OPEN,
159 CLOSE,
160 MINMOD,
774d564b 161 GPOS,
a0d0e21e 162 BRANCH,
163 BRANCH,
164 END,
bbce6d69 165 WHILEM,
166 ALNUM,
167 ALNUM,
168 NALNUM,
169 NALNUM,
170 SPACE,
171 SPACE,
172 NSPACE,
173 NSPACE,
174 DIGIT,
175 NDIGIT,
a0d0e21e 176};
fe14fcc3 177#endif
178
a687059c 179/* The following have no fixed length. */
180#ifndef DOINIT
a0d0e21e 181EXT char varies[];
a687059c 182#else
bbce6d69 183EXT char varies[] = {
184 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, WHILEM, 0
185};
a687059c 186#endif
187
188/* The following always have a length of 1. */
189#ifndef DOINIT
a0d0e21e 190EXT char simple[];
a687059c 191#else
bbce6d69 192EXT char simple[] = {
193 ANY, SANY, ANYOF,
194 ALNUM, ALNUML, NALNUM, NALNUML,
195 SPACE, SPACEL, NSPACE, NSPACEL,
196 DIGIT, NDIGIT, 0
197};
a687059c 198#endif
199
200EXT char regdummy;
201
202/*
203 * A node is one char of opcode followed by two chars of "next" pointer.
204 * "Next" pointers are stored as two 8-bit pieces, high order first. The
205 * value is a positive offset from the opcode of the node containing it.
206 * An operand, if any, simply follows the node. (Note that much of the
207 * code generation knows about this implicit relationship.)
208 *
209 * Using two bytes for the "next" pointer is vast overkill for most things,
210 * but allows patterns to get big without disasters.
211 *
212 * [If REGALIGN is defined, the "next" pointer is always aligned on an even
213 * boundary, and reads the offset directly as a short. Also, there is no
214 * special test to reverse the sign of BACK pointers since the offset is
215 * stored negative.]
216 */
217
218#ifndef gould
219#ifndef cray
34de22dd 220#ifndef eta10
a687059c 221#define REGALIGN
222#endif
223#endif
34de22dd 224#endif
a687059c 225
226#define OP(p) (*(p))
227
228#ifndef lint
229#ifdef REGALIGN
230#define NEXT(p) (*(short*)(p+1))
00bf170e 231#define ARG1(p) (*(unsigned short*)(p+3))
232#define ARG2(p) (*(unsigned short*)(p+5))
a687059c 233#else
234#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
00bf170e 235#define ARG1(p) (((*((p)+3)&0377)<<8) + (*((p)+4)&0377))
236#define ARG2(p) (((*((p)+5)&0377)<<8) + (*((p)+6)&0377))
a687059c 237#endif
238#else /* lint */
239#define NEXT(p) 0
240#endif /* lint */
241
242#define OPERAND(p) ((p) + 3)
243
244#ifdef REGALIGN
245#define NEXTOPER(p) ((p) + 4)
a0d0e21e 246#define PREVOPER(p) ((p) - 4)
a687059c 247#else
248#define NEXTOPER(p) ((p) + 3)
a0d0e21e 249#define PREVOPER(p) ((p) - 3)
a687059c 250#endif
251
252#define MAGIC 0234
253
bbce6d69 254/* Flags for first parameter byte of ANYOF */
255#define ANYOF_INVERT 0x40
256#define ANYOF_FOLD 0x20
257#define ANYOF_LOCALE 0x10
258#define ANYOF_ISA 0x0F
259#define ANYOF_ALNUML 0x08
260#define ANYOF_NALNUML 0x04
261#define ANYOF_SPACEL 0x02
262#define ANYOF_NSPACEL 0x01
263
a687059c 264/*
265 * Utility definitions.
266 */
267#ifndef lint
2304df62 268#ifndef CHARMASK
a687059c 269#define UCHARAT(p) ((int)*(unsigned char *)(p))
270#else
2304df62 271#define UCHARAT(p) ((int)*(p)&CHARMASK)
a687059c 272#endif
273#else /* lint */
274#define UCHARAT(p) regdummy
275#endif /* lint */
276
a0d0e21e 277#define FAIL(m) croak("/%.127s/: %s",regprecomp,m)