perl 4.0 patch 19: (combined patch)
[p5sagit/p5-mst-13.2.git] / regcomp.c
CommitLineData
a687059c 1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
55204971 10/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.4 $$Date: 91/11/05 22:55:14 $
a687059c 11 *
12 * $Log: regcomp.c,v $
55204971 13 * Revision 4.0.1.4 91/11/05 22:55:14 lwall
14 * patch11: Erratum
15 *
f0fcb552 16 * Revision 4.0.1.3 91/11/05 18:22:28 lwall
17 * patch11: minimum match length calculation in regexp is now cumulative
18 * patch11: initial .* in pattern had dependency on value of $*
19 * patch11: certain patterns made use of garbage pointers from uncleared memory
20 * patch11: prepared for ctype implementations that don't define isascii()
21 *
9ef589d8 22 * Revision 4.0.1.2 91/06/07 11:48:24 lwall
23 * patch4: new copyright notice
24 * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
25 * patch4: // wouldn't use previous pattern if it started with a null character
26 *
35c8bce7 27 * Revision 4.0.1.1 91/04/12 09:04:45 lwall
28 * patch1: random cleanup in cpp namespace
29 *
fe14fcc3 30 * Revision 4.0 91/03/20 01:39:01 lwall
31 * 4.0 baseline.
a687059c 32 *
33 */
f0fcb552 34/*SUPPRESS 112*/
a687059c 35/*
36 * regcomp and regexec -- regsub and regerror are not used in perl
37 *
38 * Copyright (c) 1986 by University of Toronto.
39 * Written by Henry Spencer. Not derived from licensed software.
40 *
41 * Permission is granted to anyone to use this software for any
42 * purpose on any computer system, and to redistribute it freely,
43 * subject to the following restrictions:
44 *
45 * 1. The author is not responsible for the consequences of use of
46 * this software, no matter how awful, even if they arise
47 * from defects in it.
48 *
49 * 2. The origin of this software must not be misrepresented, either
50 * by explicit claim or by omission.
51 *
52 * 3. Altered versions must be plainly marked as such, and must not
53 * be misrepresented as being the original software.
54 *
55 *
56 **** Alterations to Henry's code are...
57 ****
9ef589d8 58 **** Copyright (c) 1991, Larry Wall
a687059c 59 ****
9ef589d8 60 **** You may distribute under the terms of either the GNU General Public
61 **** License or the Artistic License, as specified in the README file.
62
a687059c 63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
69#include "perl.h"
70#include "INTERN.h"
71#include "regcomp.h"
72
fe14fcc3 73#ifdef MSDOS
74# if defined(BUGGY_MSC6)
75 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
76 # pragma optimize("a",off)
77 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
78 # pragma optimize("w",on )
79# endif /* BUGGY_MSC6 */
80#endif /* MSDOS */
81
a687059c 82#ifndef STATIC
83#define STATIC static
84#endif
85
86#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
87#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
88 ((*s) == '{' && regcurly(s)))
89#define META "^$.[()|?+*\\"
90
35c8bce7 91#ifdef SPSTART
92#undef SPSTART /* dratted cpp namespace... */
93#endif
a687059c 94/*
95 * Flags to be passed up and down.
96 */
97#define HASWIDTH 01 /* Known never to match null string. */
98#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
99#define SPSTART 04 /* Starts with * or +. */
100#define WORST 0 /* Worst case. */
101
102/*
103 * Global work variables for regcomp().
104 */
105static char *regprecomp; /* uncompiled string. */
106static char *regparse; /* Input-scan pointer. */
107static char *regxend; /* End of input for compile */
108static int regnpar; /* () count. */
109static char *regcode; /* Code-emit pointer; &regdummy = don't. */
110static long regsize; /* Code size. */
111static int regfold;
112static int regsawbracket; /* Did we do {d,d} trick? */
9ef589d8 113static int regsawback; /* Did we see \1, ...? */
a687059c 114
115/*
116 * Forward declarations for regcomp()'s friends.
117 */
118STATIC int regcurly();
119STATIC char *reg();
120STATIC char *regbranch();
121STATIC char *regpiece();
122STATIC char *regatom();
123STATIC char *regclass();
124STATIC char *regnode();
fe14fcc3 125STATIC char *reganode();
a687059c 126STATIC void regc();
127STATIC void reginsert();
128STATIC void regtail();
129STATIC void regoptail();
130
131/*
132 - regcomp - compile a regular expression into internal code
133 *
134 * We can't allocate space until we know how big the compiled form will be,
135 * but we can't compile it (and thus know how big it is) until we've got a
136 * place to put the code. So we cheat: we compile it twice, once with code
137 * generation turned off and size counting turned on, and once "for real".
138 * This also means that we don't allocate space until we are sure that the
139 * thing really will compile successfully, and we never have to move the
140 * code and thus invalidate pointers into it. (Note that it has to be in
141 * one piece because free() must be able to free it all.) [NB: not true in perl]
142 *
143 * Beware that the optimization-preparation code in here knows about some
144 * of the structure of the compiled regexp. [I'll say.]
145 */
146regexp *
00bf170e 147regcomp(exp,xend,fold)
a687059c 148char *exp;
149char *xend;
150int fold;
a687059c 151{
152 register regexp *r;
153 register char *scan;
0a12ae7d 154 register STR *longish;
155 STR *longest;
a687059c 156 register int len;
157 register char *first;
158 int flags;
34de22dd 159 int backish;
160 int backest;
a687059c 161 int curback;
f0fcb552 162 int minlen;
55204971 163#ifndef safemalloc
a687059c 164 extern char *safemalloc();
55204971 165#endif
a687059c 166 extern char *savestr();
00bf170e 167 int sawplus = 0;
9ef589d8 168 int sawopen = 0;
a687059c 169
170 if (exp == NULL)
171 fatal("NULL regexp argument");
172
173 /* First pass: determine size, legality. */
174 regfold = fold;
175 regparse = exp;
176 regxend = xend;
177 regprecomp = nsavestr(exp,xend-exp);
178 regsawbracket = 0;
9ef589d8 179 regsawback = 0;
a687059c 180 regnpar = 1;
181 regsize = 0L;
182 regcode = &regdummy;
f0fcb552 183 regc((char)MAGIC);
a687059c 184 if (reg(0, &flags) == NULL) {
185 Safefree(regprecomp);
fe14fcc3 186 regprecomp = Nullch;
a687059c 187 return(NULL);
188 }
189
190 /* Small enough for pointer-storage convention? */
191 if (regsize >= 32767L) /* Probably could be 65535L. */
192 FAIL("regexp too big");
193
194 /* Allocate space. */
195 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
196 if (r == NULL)
197 FAIL("regexp out of space");
198
199 /* Second pass: emit code. */
200 if (regsawbracket)
201 bcopy(regprecomp,exp,xend-exp);
9ef589d8 202 r->prelen = xend-exp;
a687059c 203 r->precomp = regprecomp;
9ef589d8 204 r->subbeg = r->subbase = NULL;
a687059c 205 regparse = exp;
206 regnpar = 1;
207 regcode = r->program;
f0fcb552 208 regc((char)MAGIC);
a687059c 209 if (reg(0, &flags) == NULL)
210 return(NULL);
211
212 /* Dig out information for optimizations. */
213 r->regstart = Nullstr; /* Worst-case defaults. */
214 r->reganch = 0;
215 r->regmust = Nullstr;
216 r->regback = -1;
217 r->regstclass = Nullch;
218 scan = r->program+1; /* First BRANCH. */
219 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
220 scan = NEXTOPER(scan);
221
222 first = scan;
9ef589d8 223 while ((OP(first) == OPEN && (sawopen = 1)) ||
a687059c 224 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
00bf170e 225 (OP(first) == PLUS) ||
226 (OP(first) == CURLY && ARG1(first) > 0) ) {
fe14fcc3 227 if (OP(first) == PLUS)
9ef589d8 228 sawplus = 1;
fe14fcc3 229 else
230 first += regarglen[OP(first)];
a687059c 231 first = NEXTOPER(first);
00bf170e 232 }
a687059c 233
234 /* Starting-point info. */
9ef589d8 235 again:
a687059c 236 if (OP(first) == EXACTLY) {
237 r->regstart =
238 str_make(OPERAND(first)+1,*OPERAND(first));
239 if (r->regstart->str_cur > !(sawstudy|fold))
240 fbmcompile(r->regstart,fold);
241 }
242 else if ((exp = index(simple,OP(first))) && exp > simple)
243 r->regstclass = first;
244 else if (OP(first) == BOUND || OP(first) == NBOUND)
245 r->regstclass = first;
00bf170e 246 else if (OP(first) == BOL ||
9ef589d8 247 (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
f0fcb552 248 /* kinda turn .* into ^.* */
249 r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
9ef589d8 250 first = NEXTOPER(first);
251 goto again;
252 }
253 if (sawplus && (!sawopen || !regsawback))
254 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
a687059c 255
256#ifdef DEBUGGING
257 if (debug & 512)
258 fprintf(stderr,"first %d next %d offset %d\n",
259 OP(first), OP(NEXTOPER(first)), first - scan);
260#endif
261 /*
262 * If there's something expensive in the r.e., find the
263 * longest literal string that must appear and make it the
264 * regmust. Resolve ties in favor of later strings, since
265 * the regstart check works with the beginning of the r.e.
266 * and avoiding duplication strengthens checking. Not a
267 * strong reason, but sufficient in the absence of others.
268 * [Now we resolve ties in favor of the earlier string if
269 * it happens that curback has been invalidated, since the
270 * earlier string may buy us something the later one won't.]
271 */
0a12ae7d 272 longish = str_make("",0);
a687059c 273 longest = str_make("",0);
274 len = 0;
f0fcb552 275 minlen = 0;
a687059c 276 curback = 0;
34de22dd 277 backish = 0;
278 backest = 0;
e5d73d77 279 while (OP(scan) != END) {
a687059c 280 if (OP(scan) == BRANCH) {
281 if (OP(regnext(scan)) == BRANCH) {
282 curback = -30000;
283 while (OP(scan) == BRANCH)
284 scan = regnext(scan);
285 }
286 else /* single branch is ok */
287 scan = NEXTOPER(scan);
288 }
289 if (OP(scan) == EXACTLY) {
fe14fcc3 290 char *t;
291
a687059c 292 first = scan;
fe14fcc3 293 while (OP(t = regnext(scan)) == CLOSE)
294 scan = t;
f0fcb552 295 minlen += *OPERAND(first);
34de22dd 296 if (curback - backish == len) {
0a12ae7d 297 str_ncat(longish, OPERAND(first)+1,
a687059c 298 *OPERAND(first));
299 len += *OPERAND(first);
300 curback += *OPERAND(first);
301 first = regnext(scan);
302 }
303 else if (*OPERAND(first) >= len + (curback >= 0)) {
304 len = *OPERAND(first);
0a12ae7d 305 str_nset(longish, OPERAND(first)+1,len);
34de22dd 306 backish = curback;
a687059c 307 curback += len;
308 first = regnext(scan);
309 }
310 else
311 curback += *OPERAND(first);
312 }
0a12ae7d 313 else if (index(varies,OP(scan))) {
314 curback = -30000;
315 len = 0;
34de22dd 316 if (longish->str_cur > longest->str_cur) {
0a12ae7d 317 str_sset(longest,longish);
34de22dd 318 backest = backish;
319 }
0a12ae7d 320 str_nset(longish,"",0);
f0fcb552 321 if (OP(scan) == PLUS &&
322 index(simple,OP(NEXTOPER(scan))))
323 minlen++;
324 else if (OP(scan) == CURLY &&
325 index(simple,OP(NEXTOPER(scan)+4)))
326 minlen += ARG1(scan);
0a12ae7d 327 }
e5d73d77 328 else if (index(simple,OP(scan))) {
0a12ae7d 329 curback++;
f0fcb552 330 minlen++;
e5d73d77 331 len = 0;
34de22dd 332 if (longish->str_cur > longest->str_cur) {
e5d73d77 333 str_sset(longest,longish);
34de22dd 334 backest = backish;
335 }
e5d73d77 336 str_nset(longish,"",0);
337 }
a687059c 338 scan = regnext(scan);
339 }
e5d73d77 340
341 /* Prefer earlier on tie, unless we can tail match latter */
342
34de22dd 343 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
0a12ae7d 344 str_sset(longest,longish);
34de22dd 345 backest = backish;
346 }
e5d73d77 347 else
348 str_nset(longish,"",0);
34de22dd 349 if (longest->str_cur
350 &&
351 (!r->regstart
352 ||
f0fcb552 353 !fbminstr((unsigned char*) r->regstart->str_ptr,
354 (unsigned char *) r->regstart->str_ptr
355 + r->regstart->str_cur,
34de22dd 356 longest)
357 )
358 )
359 {
a687059c 360 r->regmust = longest;
34de22dd 361 if (backest < 0)
362 backest = -1;
363 r->regback = backest;
0a12ae7d 364 if (longest->str_cur
365 > !(sawstudy || fold || OP(first) == EOL) )
a687059c 366 fbmcompile(r->regmust,fold);
367 r->regmust->str_u.str_useful = 100;
e5d73d77 368 if (OP(first) == EOL && longish->str_cur)
a687059c 369 r->regmust->str_pok |= SP_TAIL;
370 }
fe14fcc3 371 else {
a687059c 372 str_free(longest);
fe14fcc3 373 longest = Nullstr;
374 }
e5d73d77 375 str_free(longish);
a687059c 376 }
377
378 r->do_folding = fold;
379 r->nparens = regnpar - 1;
f0fcb552 380 r->minlen = minlen;
381 Newz(1002, r->startp, regnpar, char*);
382 Newz(1002, r->endp, regnpar, char*);
a687059c 383#ifdef DEBUGGING
384 if (debug & 512)
385 regdump(r);
386#endif
387 return(r);
388}
389
390/*
391 - reg - regular expression, i.e. main body or parenthesized thing
392 *
393 * Caller must absorb opening parenthesis.
394 *
395 * Combining parenthesis handling with the base level of regular expression
396 * is a trifle forced, but the need to tie the tails of the branches to what
397 * follows makes it hard to avoid.
398 */
399static char *
400reg(paren, flagp)
401int paren; /* Parenthesized? */
402int *flagp;
403{
404 register char *ret;
405 register char *br;
406 register char *ender;
407 register int parno;
408 int flags;
409
410 *flagp = HASWIDTH; /* Tentatively. */
411
412 /* Make an OPEN node, if parenthesized. */
413 if (paren) {
a687059c 414 parno = regnpar;
415 regnpar++;
fe14fcc3 416 ret = reganode(OPEN, parno);
a687059c 417 } else
418 ret = NULL;
419
420 /* Pick up the branches, linking them together. */
421 br = regbranch(&flags);
422 if (br == NULL)
423 return(NULL);
424 if (ret != NULL)
425 regtail(ret, br); /* OPEN -> first. */
426 else
427 ret = br;
428 if (!(flags&HASWIDTH))
429 *flagp &= ~HASWIDTH;
430 *flagp |= flags&SPSTART;
431 while (*regparse == '|') {
432 regparse++;
433 br = regbranch(&flags);
434 if (br == NULL)
435 return(NULL);
436 regtail(ret, br); /* BRANCH -> BRANCH. */
437 if (!(flags&HASWIDTH))
438 *flagp &= ~HASWIDTH;
439 *flagp |= flags&SPSTART;
440 }
441
442 /* Make a closing node, and hook it on the end. */
fe14fcc3 443 if (paren)
444 ender = reganode(CLOSE, parno);
445 else
446 ender = regnode(END);
a687059c 447 regtail(ret, ender);
448
449 /* Hook the tails of the branches to the closing node. */
450 for (br = ret; br != NULL; br = regnext(br))
451 regoptail(br, ender);
452
453 /* Check for proper termination. */
454 if (paren && *regparse++ != ')') {
455 FAIL("unmatched () in regexp");
456 } else if (!paren && regparse < regxend) {
457 if (*regparse == ')') {
458 FAIL("unmatched () in regexp");
459 } else
460 FAIL("junk on end of regexp"); /* "Can't happen". */
461 /* NOTREACHED */
462 }
463
464 return(ret);
465}
466
467/*
468 - regbranch - one alternative of an | operator
469 *
470 * Implements the concatenation operator.
471 */
472static char *
473regbranch(flagp)
474int *flagp;
475{
476 register char *ret;
477 register char *chain;
478 register char *latest;
479 int flags;
480
481 *flagp = WORST; /* Tentatively. */
482
483 ret = regnode(BRANCH);
484 chain = NULL;
485 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
486 latest = regpiece(&flags);
487 if (latest == NULL)
488 return(NULL);
489 *flagp |= flags&HASWIDTH;
490 if (chain == NULL) /* First piece. */
491 *flagp |= flags&SPSTART;
492 else
493 regtail(chain, latest);
494 chain = latest;
495 }
496 if (chain == NULL) /* Loop ran zero times. */
497 (void) regnode(NOTHING);
498
499 return(ret);
500}
501
502/*
503 - regpiece - something followed by possible [*+?]
504 *
505 * Note that the branching code sequences used for ? and the general cases
506 * of * and + are somewhat optimized: they use the same NOTHING node as
507 * both the endmarker for their branch list and the body of the last branch.
508 * It might seem that this node could be dispensed with entirely, but the
509 * endmarker role is not redundant.
510 */
511static char *
512regpiece(flagp)
513int *flagp;
514{
515 register char *ret;
516 register char op;
517 register char *next;
518 int flags;
519 char *origparse = regparse;
520 int orignpar = regnpar;
521 char *max;
522 int iter;
523 char ch;
524
525 ret = regatom(&flags);
526 if (ret == NULL)
527 return(NULL);
528
529 op = *regparse;
530
531 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
532 * then we decrement the first number by one and reset our
533 * parsing back to the beginning of the same atom. If the first number
534 * is down to 0, decrement the second number instead and fake up
535 * a ? after it. Given the way this compiler doesn't keep track
536 * of offsets on the first pass, this is the only way to replicate
537 * a piece of code. Sigh.
538 */
539 if (op == '{' && regcurly(regparse)) {
540 next = regparse + 1;
541 max = Nullch;
f0fcb552 542 while (isDIGIT(*next) || *next == ',') {
a687059c 543 if (*next == ',') {
544 if (max)
545 break;
546 else
547 max = next;
548 }
549 next++;
550 }
551 if (*next == '}') { /* got one */
a687059c 552 if (!max)
553 max = next;
554 regparse++;
555 iter = atoi(regparse);
00bf170e 556 if (flags&SIMPLE) { /* we can do it right after all */
557 int tmp;
558
559 reginsert(CURLY, ret);
fe14fcc3 560 if (iter > 0)
561 *flagp = (WORST|HASWIDTH);
00bf170e 562 if (*max == ',')
563 max++;
6eb13c3b 564 else
565 max = regparse;
00bf170e 566 tmp = atoi(max);
567 if (tmp && tmp < iter)
568 fatal("Can't do {n,m} with n > m");
569 if (regcode != &regdummy) {
570#ifdef REGALIGN
571 *(unsigned short *)(ret+3) = iter;
572 *(unsigned short *)(ret+5) = tmp;
573#else
574 ret[3] = iter >> 8; ret[4] = iter & 0377;
575 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
576#endif
577 }
578 regparse = next;
579 goto nest_check;
580 }
581 regsawbracket++; /* remember we clobbered exp */
a687059c 582 if (iter > 0) {
583 ch = *max;
584 sprintf(regparse,"%.*d", max-regparse, iter - 1);
585 *max = ch;
00bf170e 586 if (*max == ',' && max[1] != '}') {
587 if (atoi(max+1) <= 0)
588 fatal("Can't do {n,m} with n > m");
a687059c 589 ch = *next;
590 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
591 *next = ch;
592 }
00bf170e 593 if (iter != 1 || *max == ',') {
a687059c 594 regparse = origparse; /* back up input pointer */
595 regnpar = orignpar; /* don't make more parens */
596 }
597 else {
598 regparse = next;
599 goto nest_check;
600 }
601 *flagp = flags;
602 return ret;
603 }
604 if (*max == ',') {
605 max++;
606 iter = atoi(max);
607 if (max == next) { /* any number more? */
608 regparse = next;
609 op = '*'; /* fake up one with a star */
610 }
611 else if (iter > 0) {
612 op = '?'; /* fake up optional atom */
613 ch = *next;
614 sprintf(max,"%.*d", next-max, iter - 1);
615 *next = ch;
616 if (iter == 1)
617 regparse = next;
618 else {
619 regparse = origparse - 1; /* offset ++ below */
620 regnpar = orignpar;
621 }
622 }
623 else
624 fatal("Can't do {n,0}");
625 }
626 else
627 fatal("Can't do {0}");
628 }
629 }
630
631 if (!ISMULT1(op)) {
632 *flagp = flags;
633 return(ret);
634 }
635
636 if (!(flags&HASWIDTH) && op != '?')
637 FAIL("regexp *+ operand could be empty");
638 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
639
640 if (op == '*' && (flags&SIMPLE))
641 reginsert(STAR, ret);
642 else if (op == '*') {
643 /* Emit x* as (x&|), where & means "self". */
644 reginsert(BRANCH, ret); /* Either x */
645 regoptail(ret, regnode(BACK)); /* and loop */
646 regoptail(ret, ret); /* back */
647 regtail(ret, regnode(BRANCH)); /* or */
648 regtail(ret, regnode(NOTHING)); /* null. */
649 } else if (op == '+' && (flags&SIMPLE))
650 reginsert(PLUS, ret);
651 else if (op == '+') {
652 /* Emit x+ as x(&|), where & means "self". */
653 next = regnode(BRANCH); /* Either */
654 regtail(ret, next);
655 regtail(regnode(BACK), ret); /* loop back */
656 regtail(next, regnode(BRANCH)); /* or */
657 regtail(ret, regnode(NOTHING)); /* null. */
658 } else if (op == '?') {
659 /* Emit x? as (x|) */
660 reginsert(BRANCH, ret); /* Either x */
661 regtail(ret, regnode(BRANCH)); /* or */
662 next = regnode(NOTHING); /* null. */
663 regtail(ret, next);
664 regoptail(ret, next);
665 }
666 nest_check:
667 regparse++;
668 if (ISMULT2(regparse))
669 FAIL("nested *?+ in regexp");
670
671 return(ret);
672}
673
674/*
675 - regatom - the lowest level
676 *
677 * Optimization: gobbles an entire sequence of ordinary characters so that
678 * it can turn them into a single node, which is smaller to store and
679 * faster to run. Backslashed characters are exceptions, each becoming a
680 * separate node; the code is simpler that way and it's not worth fixing.
681 *
682 * [Yes, it is worth fixing, some scripts can run twice the speed.]
683 */
684static char *
685regatom(flagp)
686int *flagp;
687{
688 register char *ret;
689 int flags;
690
691 *flagp = WORST; /* Tentatively. */
692
693 switch (*regparse++) {
694 case '^':
695 ret = regnode(BOL);
696 break;
697 case '$':
698 ret = regnode(EOL);
699 break;
700 case '.':
701 ret = regnode(ANY);
702 *flagp |= HASWIDTH|SIMPLE;
703 break;
704 case '[':
705 ret = regclass();
706 *flagp |= HASWIDTH|SIMPLE;
707 break;
708 case '(':
709 ret = reg(1, &flags);
710 if (ret == NULL)
711 return(NULL);
712 *flagp |= flags&(HASWIDTH|SPSTART);
713 break;
714 case '|':
715 case ')':
716 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
717 break;
718 case '?':
719 case '+':
720 case '*':
721 FAIL("?+* follows nothing in regexp");
722 break;
723 case '\\':
724 switch (*regparse) {
725 case 'w':
726 ret = regnode(ALNUM);
727 *flagp |= HASWIDTH|SIMPLE;
728 regparse++;
729 break;
730 case 'W':
731 ret = regnode(NALNUM);
732 *flagp |= HASWIDTH|SIMPLE;
733 regparse++;
734 break;
735 case 'b':
736 ret = regnode(BOUND);
737 *flagp |= SIMPLE;
738 regparse++;
739 break;
740 case 'B':
741 ret = regnode(NBOUND);
742 *flagp |= SIMPLE;
743 regparse++;
744 break;
745 case 's':
746 ret = regnode(SPACE);
747 *flagp |= HASWIDTH|SIMPLE;
748 regparse++;
749 break;
750 case 'S':
751 ret = regnode(NSPACE);
752 *flagp |= HASWIDTH|SIMPLE;
753 regparse++;
754 break;
755 case 'd':
756 ret = regnode(DIGIT);
757 *flagp |= HASWIDTH|SIMPLE;
758 regparse++;
759 break;
760 case 'D':
761 ret = regnode(NDIGIT);
762 *flagp |= HASWIDTH|SIMPLE;
763 regparse++;
764 break;
765 case 'n':
766 case 'r':
767 case 't':
768 case 'f':
fe14fcc3 769 case 'e':
770 case 'a':
771 case 'x':
772 case 'c':
773 case '0':
a687059c 774 goto defchar;
fe14fcc3 775 case '1': case '2': case '3': case '4':
a687059c 776 case '5': case '6': case '7': case '8': case '9':
fe14fcc3 777 {
778 int num = atoi(regparse);
779
780 if (num > 9 && num >= regnpar)
a687059c 781 goto defchar;
fe14fcc3 782 else {
9ef589d8 783 regsawback = 1;
fe14fcc3 784 ret = reganode(REF, num);
f0fcb552 785 while (isDIGIT(*regparse))
fe14fcc3 786 regparse++;
a687059c 787 *flagp |= SIMPLE;
fe14fcc3 788 }
a687059c 789 }
790 break;
791 case '\0':
792 if (regparse >= regxend)
793 FAIL("trailing \\ in regexp");
794 /* FALL THROUGH */
795 default:
796 goto defchar;
797 }
798 break;
799 default: {
800 register int len;
801 register char ender;
802 register char *p;
803 char *oldp;
fe14fcc3 804 int numlen;
a687059c 805
806 defchar:
807 ret = regnode(EXACTLY);
808 regc(0); /* save spot for len */
809 for (len=0, p=regparse-1;
810 len < 127 && p < regxend;
811 len++)
812 {
813 oldp = p;
814 switch (*p) {
815 case '^':
816 case '$':
817 case '.':
818 case '[':
819 case '(':
820 case ')':
821 case '|':
822 goto loopdone;
823 case '\\':
824 switch (*++p) {
825 case 'w':
826 case 'W':
827 case 'b':
828 case 'B':
829 case 's':
830 case 'S':
831 case 'd':
832 case 'D':
833 --p;
834 goto loopdone;
835 case 'n':
836 ender = '\n';
837 p++;
838 break;
839 case 'r':
840 ender = '\r';
841 p++;
842 break;
843 case 't':
844 ender = '\t';
845 p++;
846 break;
847 case 'f':
848 ender = '\f';
849 p++;
850 break;
fe14fcc3 851 case 'e':
852 ender = '\033';
853 p++;
854 break;
855 case 'a':
856 ender = '\007';
857 p++;
858 break;
859 case 'x':
860 ender = scanhex(++p, 2, &numlen);
861 p += numlen;
862 break;
863 case 'c':
864 p++;
865 ender = *p++;
f0fcb552 866 if (isLOWER(ender))
fe14fcc3 867 ender = toupper(ender);
868 ender ^= 64;
869 break;
a687059c 870 case '0': case '1': case '2': case '3':case '4':
871 case '5': case '6': case '7': case '8':case '9':
fe14fcc3 872 if (*p == '0' ||
f0fcb552 873 (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
fe14fcc3 874 ender = scanoct(p, 3, &numlen);
875 p += numlen;
a687059c 876 }
877 else {
878 --p;
879 goto loopdone;
880 }
881 break;
882 case '\0':
883 if (p >= regxend)
884 FAIL("trailing \\ in regexp");
885 /* FALL THROUGH */
886 default:
887 ender = *p++;
888 break;
889 }
890 break;
891 default:
892 ender = *p++;
893 break;
894 }
f0fcb552 895 if (regfold && isUPPER(ender))
a687059c 896 ender = tolower(ender);
897 if (ISMULT2(p)) { /* Back off on ?+*. */
898 if (len)
899 p = oldp;
900 else {
901 len++;
902 regc(ender);
903 }
904 break;
905 }
906 regc(ender);
907 }
908 loopdone:
909 regparse = p;
910 if (len <= 0)
911 FAIL("internal disaster in regexp");
912 *flagp |= HASWIDTH;
913 if (len == 1)
914 *flagp |= SIMPLE;
915 if (regcode != &regdummy)
916 *OPERAND(ret) = len;
917 regc('\0');
918 }
919 break;
920 }
921
922 return(ret);
923}
924
925static void
926regset(bits,def,c)
927char *bits;
928int def;
929register int c;
930{
931 if (regcode == &regdummy)
932 return;
ac58e20f 933 c &= 255;
a687059c 934 if (def)
935 bits[c >> 3] &= ~(1 << (c & 7));
936 else
937 bits[c >> 3] |= (1 << (c & 7));
938}
939
940static char *
941regclass()
942{
943 register char *bits;
944 register int class;
945 register int lastclass;
946 register int range = 0;
947 register char *ret;
948 register int def;
fe14fcc3 949 int numlen;
a687059c 950
00bf170e 951 ret = regnode(ANYOF);
a687059c 952 if (*regparse == '^') { /* Complement of range. */
a687059c 953 regparse++;
954 def = 0;
955 } else {
a687059c 956 def = 255;
957 }
958 bits = regcode;
959 for (class = 0; class < 32; class++)
960 regc(def);
961 if (*regparse == ']' || *regparse == '-')
00bf170e 962 goto skipcond; /* allow 1st char to be ] or - */
a687059c 963 while (regparse < regxend && *regparse != ']') {
00bf170e 964 skipcond:
a687059c 965 class = UCHARAT(regparse++);
966 if (class == '\\') {
967 class = UCHARAT(regparse++);
968 switch (class) {
969 case 'w':
970 for (class = 'a'; class <= 'z'; class++)
971 regset(bits,def,class);
972 for (class = 'A'; class <= 'Z'; class++)
973 regset(bits,def,class);
974 for (class = '0'; class <= '9'; class++)
975 regset(bits,def,class);
976 regset(bits,def,'_');
977 lastclass = 1234;
978 continue;
979 case 's':
980 regset(bits,def,' ');
981 regset(bits,def,'\t');
982 regset(bits,def,'\r');
983 regset(bits,def,'\f');
984 regset(bits,def,'\n');
985 lastclass = 1234;
986 continue;
987 case 'd':
988 for (class = '0'; class <= '9'; class++)
989 regset(bits,def,class);
990 lastclass = 1234;
991 continue;
992 case 'n':
993 class = '\n';
994 break;
995 case 'r':
996 class = '\r';
997 break;
998 case 't':
999 class = '\t';
1000 break;
1001 case 'f':
1002 class = '\f';
1003 break;
1004 case 'b':
1005 class = '\b';
1006 break;
fe14fcc3 1007 case 'e':
1008 class = '\033';
1009 break;
1010 case 'a':
1011 class = '\007';
1012 break;
1013 case 'x':
1014 class = scanhex(regparse, 2, &numlen);
1015 regparse += numlen;
1016 break;
1017 case 'c':
1018 class = *regparse++;
f0fcb552 1019 if (isLOWER(class))
fe14fcc3 1020 class = toupper(class);
1021 class ^= 64;
1022 break;
a687059c 1023 case '0': case '1': case '2': case '3': case '4':
1024 case '5': case '6': case '7': case '8': case '9':
fe14fcc3 1025 class = scanoct(--regparse, 3, &numlen);
1026 regparse += numlen;
a687059c 1027 break;
1028 }
1029 }
a687059c 1030 if (range) {
1031 if (lastclass > class)
1032 FAIL("invalid [] range in regexp");
00bf170e 1033 range = 0;
a687059c 1034 }
00bf170e 1035 else {
1036 lastclass = class;
1037 if (*regparse == '-' && regparse+1 < regxend &&
1038 regparse[1] != ']') {
1039 regparse++;
1040 range = 1;
1041 continue; /* do it next time */
1042 }
1043 }
1044 for ( ; lastclass <= class; lastclass++) {
a687059c 1045 regset(bits,def,lastclass);
f0fcb552 1046 if (regfold && isUPPER(lastclass))
a687059c 1047 regset(bits,def,tolower(lastclass));
1048 }
1049 lastclass = class;
1050 }
1051 if (*regparse != ']')
1052 FAIL("unmatched [] in regexp");
a687059c 1053 regparse++;
1054 return ret;
1055}
1056
1057/*
1058 - regnode - emit a node
1059 */
1060static char * /* Location. */
1061regnode(op)
1062char op;
1063{
1064 register char *ret;
1065 register char *ptr;
1066
1067 ret = regcode;
1068 if (ret == &regdummy) {
1069#ifdef REGALIGN
1070 if (!(regsize & 1))
1071 regsize++;
1072#endif
1073 regsize += 3;
1074 return(ret);
1075 }
1076
1077#ifdef REGALIGN
1078#ifndef lint
1079 if (!((long)ret & 1))
1080 *ret++ = 127;
1081#endif
1082#endif
1083 ptr = ret;
1084 *ptr++ = op;
1085 *ptr++ = '\0'; /* Null "next" pointer. */
1086 *ptr++ = '\0';
1087 regcode = ptr;
1088
1089 return(ret);
1090}
1091
1092/*
fe14fcc3 1093 - reganode - emit a node with an argument
1094 */
1095static char * /* Location. */
1096reganode(op, arg)
1097char op;
1098unsigned short arg;
1099{
1100 register char *ret;
1101 register char *ptr;
1102
1103 ret = regcode;
1104 if (ret == &regdummy) {
1105#ifdef REGALIGN
1106 if (!(regsize & 1))
1107 regsize++;
1108#endif
1109 regsize += 5;
1110 return(ret);
1111 }
1112
1113#ifdef REGALIGN
1114#ifndef lint
1115 if (!((long)ret & 1))
1116 *ret++ = 127;
1117#endif
1118#endif
1119 ptr = ret;
1120 *ptr++ = op;
1121 *ptr++ = '\0'; /* Null "next" pointer. */
1122 *ptr++ = '\0';
1123#ifdef REGALIGN
1124 *(unsigned short *)(ret+3) = arg;
1125#else
1126 ret[3] = arg >> 8; ret[4] = arg & 0377;
1127#endif
1128 ptr += 2;
1129 regcode = ptr;
1130
1131 return(ret);
1132}
1133
1134/*
a687059c 1135 - regc - emit (if appropriate) a byte of code
1136 */
1137static void
1138regc(b)
1139char b;
1140{
1141 if (regcode != &regdummy)
1142 *regcode++ = b;
1143 else
1144 regsize++;
1145}
1146
1147/*
1148 - reginsert - insert an operator in front of already-emitted operand
1149 *
1150 * Means relocating the operand.
1151 */
1152static void
1153reginsert(op, opnd)
1154char op;
1155char *opnd;
1156{
1157 register char *src;
1158 register char *dst;
1159 register char *place;
00bf170e 1160 register offset = (op == CURLY ? 4 : 0);
a687059c 1161
1162 if (regcode == &regdummy) {
1163#ifdef REGALIGN
00bf170e 1164 regsize += 4 + offset;
a687059c 1165#else
00bf170e 1166 regsize += 3 + offset;
a687059c 1167#endif
1168 return;
1169 }
1170
1171 src = regcode;
1172#ifdef REGALIGN
00bf170e 1173 regcode += 4 + offset;
a687059c 1174#else
00bf170e 1175 regcode += 3 + offset;
a687059c 1176#endif
1177 dst = regcode;
1178 while (src > opnd)
1179 *--dst = *--src;
1180
1181 place = opnd; /* Op node, where operand used to be. */
1182 *place++ = op;
1183 *place++ = '\0';
1184 *place++ = '\0';
00bf170e 1185 while (offset-- > 0)
1186 *place++ = '\0';
a687059c 1187}
1188
1189/*
1190 - regtail - set the next-pointer at the end of a node chain
1191 */
1192static void
1193regtail(p, val)
1194char *p;
1195char *val;
1196{
1197 register char *scan;
1198 register char *temp;
1199 register int offset;
1200
1201 if (p == &regdummy)
1202 return;
1203
1204 /* Find last node. */
1205 scan = p;
1206 for (;;) {
1207 temp = regnext(scan);
1208 if (temp == NULL)
1209 break;
1210 scan = temp;
1211 }
1212
1213#ifdef REGALIGN
1214 offset = val - scan;
1215#ifndef lint
1216 *(short*)(scan+1) = offset;
1217#else
1218 offset = offset;
1219#endif
1220#else
1221 if (OP(scan) == BACK)
1222 offset = scan - val;
1223 else
1224 offset = val - scan;
1225 *(scan+1) = (offset>>8)&0377;
1226 *(scan+2) = offset&0377;
1227#endif
1228}
1229
1230/*
1231 - regoptail - regtail on operand of first argument; nop if operandless
1232 */
1233static void
1234regoptail(p, val)
1235char *p;
1236char *val;
1237{
1238 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1239 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1240 return;
1241 regtail(NEXTOPER(p), val);
1242}
1243
1244/*
1245 - regcurly - a little FSA that accepts {\d+,?\d*}
1246 */
1247STATIC int
1248regcurly(s)
1249register char *s;
1250{
1251 if (*s++ != '{')
1252 return FALSE;
f0fcb552 1253 if (!isDIGIT(*s))
a687059c 1254 return FALSE;
f0fcb552 1255 while (isDIGIT(*s))
a687059c 1256 s++;
1257 if (*s == ',')
1258 s++;
f0fcb552 1259 while (isDIGIT(*s))
a687059c 1260 s++;
1261 if (*s != '}')
1262 return FALSE;
1263 return TRUE;
1264}
1265
1266#ifdef DEBUGGING
1267
1268/*
1269 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1270 */
1271void
1272regdump(r)
1273regexp *r;
1274{
1275 register char *s;
1276 register char op = EXACTLY; /* Arbitrary non-END op. */
1277 register char *next;
a687059c 1278
1279
1280 s = r->program + 1;
1281 while (op != END) { /* While that wasn't END last time... */
1282#ifdef REGALIGN
1283 if (!((long)s & 1))
1284 s++;
1285#endif
1286 op = OP(s);
1287 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1288 next = regnext(s);
fe14fcc3 1289 s += regarglen[op];
a687059c 1290 if (next == NULL) /* Next ptr. */
1291 fprintf(stderr,"(0)");
1292 else
1293 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1294 s += 3;
00bf170e 1295 if (op == ANYOF) {
a687059c 1296 s += 32;
1297 }
1298 if (op == EXACTLY) {
1299 /* Literal string, where present. */
1300 s++;
1301 while (*s != '\0') {
1302 (void)putchar(*s);
1303 s++;
1304 }
1305 s++;
1306 }
1307 (void)putchar('\n');
1308 }
1309
1310 /* Header fields of interest. */
1311 if (r->regstart)
1312 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1313 if (r->regstclass)
1314 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
9ef589d8 1315 if (r->reganch & ROPT_ANCH)
a687059c 1316 fprintf(stderr,"anchored ");
9ef589d8 1317 if (r->reganch & ROPT_SKIP)
00bf170e 1318 fprintf(stderr,"plus ");
f0fcb552 1319 if (r->reganch & ROPT_IMPLICIT)
1320 fprintf(stderr,"implicit ");
a687059c 1321 if (r->regmust != NULL)
1322 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1323 r->regback);
f0fcb552 1324 fprintf(stderr, "minlen %d ", r->minlen);
a687059c 1325 fprintf(stderr,"\n");
1326}
1327
1328/*
1329 - regprop - printable representation of opcode
1330 */
1331char *
1332regprop(op)
1333char *op;
1334{
1335 register char *p;
1336
1337 (void) strcpy(buf, ":");
1338
1339 switch (OP(op)) {
1340 case BOL:
1341 p = "BOL";
1342 break;
1343 case EOL:
1344 p = "EOL";
1345 break;
1346 case ANY:
1347 p = "ANY";
1348 break;
1349 case ANYOF:
1350 p = "ANYOF";
1351 break;
a687059c 1352 case BRANCH:
1353 p = "BRANCH";
1354 break;
1355 case EXACTLY:
1356 p = "EXACTLY";
1357 break;
1358 case NOTHING:
1359 p = "NOTHING";
1360 break;
1361 case BACK:
1362 p = "BACK";
1363 break;
1364 case END:
1365 p = "END";
1366 break;
1367 case ALNUM:
1368 p = "ALNUM";
1369 break;
1370 case NALNUM:
1371 p = "NALNUM";
1372 break;
1373 case BOUND:
1374 p = "BOUND";
1375 break;
1376 case NBOUND:
1377 p = "NBOUND";
1378 break;
1379 case SPACE:
1380 p = "SPACE";
1381 break;
1382 case NSPACE:
1383 p = "NSPACE";
1384 break;
1385 case DIGIT:
1386 p = "DIGIT";
1387 break;
1388 case NDIGIT:
1389 p = "NDIGIT";
1390 break;
00bf170e 1391 case CURLY:
1392 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1393 ARG1(op),ARG2(op));
1394 p = NULL;
1395 break;
a687059c 1396 case REF:
fe14fcc3 1397 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
a687059c 1398 p = NULL;
1399 break;
fe14fcc3 1400 case OPEN:
1401 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
a687059c 1402 p = NULL;
1403 break;
fe14fcc3 1404 case CLOSE:
1405 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
a687059c 1406 p = NULL;
1407 break;
1408 case STAR:
1409 p = "STAR";
1410 break;
1411 case PLUS:
1412 p = "PLUS";
1413 break;
1414 default:
1415 FAIL("corrupted regexp opcode");
1416 }
1417 if (p != NULL)
1418 (void) strcat(buf, p);
1419 return(buf);
1420}
1421#endif /* DEBUGGING */
1422
1423regfree(r)
1424struct regexp *r;
1425{
fe14fcc3 1426 if (r->precomp) {
a687059c 1427 Safefree(r->precomp);
fe14fcc3 1428 r->precomp = Nullch;
1429 }
1430 if (r->subbase) {
a687059c 1431 Safefree(r->subbase);
fe14fcc3 1432 r->subbase = Nullch;
1433 }
1434 if (r->regmust) {
a687059c 1435 str_free(r->regmust);
fe14fcc3 1436 r->regmust = Nullstr;
1437 }
1438 if (r->regstart) {
a687059c 1439 str_free(r->regstart);
fe14fcc3 1440 r->regstart = Nullstr;
1441 }
1442 Safefree(r->startp);
1443 Safefree(r->endp);
a687059c 1444 Safefree(r);
1445}