perl 4.0 patch 11: (combined patch)
[p5sagit/p5-mst-13.2.git] / regcomp.c
CommitLineData
a687059c 1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
9ef589d8 10/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.2 $$Date: 91/06/07 11:48:24 $
a687059c 11 *
12 * $Log: regcomp.c,v $
9ef589d8 13 * Revision 4.0.1.2 91/06/07 11:48:24 lwall
14 * patch4: new copyright notice
15 * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
16 * patch4: // wouldn't use previous pattern if it started with a null character
17 *
35c8bce7 18 * Revision 4.0.1.1 91/04/12 09:04:45 lwall
19 * patch1: random cleanup in cpp namespace
20 *
fe14fcc3 21 * Revision 4.0 91/03/20 01:39:01 lwall
22 * 4.0 baseline.
a687059c 23 *
24 */
25
26/*
27 * regcomp and regexec -- regsub and regerror are not used in perl
28 *
29 * Copyright (c) 1986 by University of Toronto.
30 * Written by Henry Spencer. Not derived from licensed software.
31 *
32 * Permission is granted to anyone to use this software for any
33 * purpose on any computer system, and to redistribute it freely,
34 * subject to the following restrictions:
35 *
36 * 1. The author is not responsible for the consequences of use of
37 * this software, no matter how awful, even if they arise
38 * from defects in it.
39 *
40 * 2. The origin of this software must not be misrepresented, either
41 * by explicit claim or by omission.
42 *
43 * 3. Altered versions must be plainly marked as such, and must not
44 * be misrepresented as being the original software.
45 *
46 *
47 **** Alterations to Henry's code are...
48 ****
9ef589d8 49 **** Copyright (c) 1991, Larry Wall
a687059c 50 ****
9ef589d8 51 **** You may distribute under the terms of either the GNU General Public
52 **** License or the Artistic License, as specified in the README file.
53
a687059c 54 *
55 * Beware that some of this code is subtly aware of the way operator
56 * precedence is structured in regular expressions. Serious changes in
57 * regular-expression syntax might require a total rethink.
58 */
59#include "EXTERN.h"
60#include "perl.h"
61#include "INTERN.h"
62#include "regcomp.h"
63
fe14fcc3 64#ifdef MSDOS
65# if defined(BUGGY_MSC6)
66 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
67 # pragma optimize("a",off)
68 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
69 # pragma optimize("w",on )
70# endif /* BUGGY_MSC6 */
71#endif /* MSDOS */
72
a687059c 73#ifndef STATIC
74#define STATIC static
75#endif
76
77#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
78#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
79 ((*s) == '{' && regcurly(s)))
80#define META "^$.[()|?+*\\"
81
35c8bce7 82#ifdef SPSTART
83#undef SPSTART /* dratted cpp namespace... */
84#endif
a687059c 85/*
86 * Flags to be passed up and down.
87 */
88#define HASWIDTH 01 /* Known never to match null string. */
89#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
90#define SPSTART 04 /* Starts with * or +. */
91#define WORST 0 /* Worst case. */
92
93/*
94 * Global work variables for regcomp().
95 */
96static char *regprecomp; /* uncompiled string. */
97static char *regparse; /* Input-scan pointer. */
98static char *regxend; /* End of input for compile */
99static int regnpar; /* () count. */
100static char *regcode; /* Code-emit pointer; &regdummy = don't. */
101static long regsize; /* Code size. */
102static int regfold;
103static int regsawbracket; /* Did we do {d,d} trick? */
9ef589d8 104static int regsawback; /* Did we see \1, ...? */
a687059c 105
106/*
107 * Forward declarations for regcomp()'s friends.
108 */
109STATIC int regcurly();
110STATIC char *reg();
111STATIC char *regbranch();
112STATIC char *regpiece();
113STATIC char *regatom();
114STATIC char *regclass();
115STATIC char *regnode();
fe14fcc3 116STATIC char *reganode();
a687059c 117STATIC void regc();
118STATIC void reginsert();
119STATIC void regtail();
120STATIC void regoptail();
121
122/*
123 - regcomp - compile a regular expression into internal code
124 *
125 * We can't allocate space until we know how big the compiled form will be,
126 * but we can't compile it (and thus know how big it is) until we've got a
127 * place to put the code. So we cheat: we compile it twice, once with code
128 * generation turned off and size counting turned on, and once "for real".
129 * This also means that we don't allocate space until we are sure that the
130 * thing really will compile successfully, and we never have to move the
131 * code and thus invalidate pointers into it. (Note that it has to be in
132 * one piece because free() must be able to free it all.) [NB: not true in perl]
133 *
134 * Beware that the optimization-preparation code in here knows about some
135 * of the structure of the compiled regexp. [I'll say.]
136 */
137regexp *
00bf170e 138regcomp(exp,xend,fold)
a687059c 139char *exp;
140char *xend;
141int fold;
a687059c 142{
143 register regexp *r;
144 register char *scan;
0a12ae7d 145 register STR *longish;
146 STR *longest;
a687059c 147 register int len;
148 register char *first;
149 int flags;
34de22dd 150 int backish;
151 int backest;
a687059c 152 int curback;
153 extern char *safemalloc();
154 extern char *savestr();
00bf170e 155 int sawplus = 0;
9ef589d8 156 int sawopen = 0;
a687059c 157
158 if (exp == NULL)
159 fatal("NULL regexp argument");
160
161 /* First pass: determine size, legality. */
162 regfold = fold;
163 regparse = exp;
164 regxend = xend;
165 regprecomp = nsavestr(exp,xend-exp);
166 regsawbracket = 0;
9ef589d8 167 regsawback = 0;
a687059c 168 regnpar = 1;
169 regsize = 0L;
170 regcode = &regdummy;
171 regc(MAGIC);
172 if (reg(0, &flags) == NULL) {
173 Safefree(regprecomp);
fe14fcc3 174 regprecomp = Nullch;
a687059c 175 return(NULL);
176 }
177
178 /* Small enough for pointer-storage convention? */
179 if (regsize >= 32767L) /* Probably could be 65535L. */
180 FAIL("regexp too big");
181
182 /* Allocate space. */
183 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
184 if (r == NULL)
185 FAIL("regexp out of space");
186
187 /* Second pass: emit code. */
188 if (regsawbracket)
189 bcopy(regprecomp,exp,xend-exp);
9ef589d8 190 r->prelen = xend-exp;
a687059c 191 r->precomp = regprecomp;
9ef589d8 192 r->subbeg = r->subbase = NULL;
a687059c 193 regparse = exp;
194 regnpar = 1;
195 regcode = r->program;
196 regc(MAGIC);
197 if (reg(0, &flags) == NULL)
198 return(NULL);
199
200 /* Dig out information for optimizations. */
201 r->regstart = Nullstr; /* Worst-case defaults. */
202 r->reganch = 0;
203 r->regmust = Nullstr;
204 r->regback = -1;
205 r->regstclass = Nullch;
206 scan = r->program+1; /* First BRANCH. */
207 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
208 scan = NEXTOPER(scan);
209
210 first = scan;
9ef589d8 211 while ((OP(first) == OPEN && (sawopen = 1)) ||
a687059c 212 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
00bf170e 213 (OP(first) == PLUS) ||
214 (OP(first) == CURLY && ARG1(first) > 0) ) {
fe14fcc3 215 if (OP(first) == PLUS)
9ef589d8 216 sawplus = 1;
fe14fcc3 217 else
218 first += regarglen[OP(first)];
a687059c 219 first = NEXTOPER(first);
00bf170e 220 }
a687059c 221
222 /* Starting-point info. */
9ef589d8 223 again:
a687059c 224 if (OP(first) == EXACTLY) {
225 r->regstart =
226 str_make(OPERAND(first)+1,*OPERAND(first));
227 if (r->regstart->str_cur > !(sawstudy|fold))
228 fbmcompile(r->regstart,fold);
229 }
230 else if ((exp = index(simple,OP(first))) && exp > simple)
231 r->regstclass = first;
232 else if (OP(first) == BOUND || OP(first) == NBOUND)
233 r->regstclass = first;
00bf170e 234 else if (OP(first) == BOL ||
9ef589d8 235 (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
236 r->reganch = ROPT_ANCH; /* kinda turn .* into ^.* */
237 first = NEXTOPER(first);
238 goto again;
239 }
240 if (sawplus && (!sawopen || !regsawback))
241 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
a687059c 242
243#ifdef DEBUGGING
244 if (debug & 512)
245 fprintf(stderr,"first %d next %d offset %d\n",
246 OP(first), OP(NEXTOPER(first)), first - scan);
247#endif
248 /*
249 * If there's something expensive in the r.e., find the
250 * longest literal string that must appear and make it the
251 * regmust. Resolve ties in favor of later strings, since
252 * the regstart check works with the beginning of the r.e.
253 * and avoiding duplication strengthens checking. Not a
254 * strong reason, but sufficient in the absence of others.
255 * [Now we resolve ties in favor of the earlier string if
256 * it happens that curback has been invalidated, since the
257 * earlier string may buy us something the later one won't.]
258 */
0a12ae7d 259 longish = str_make("",0);
a687059c 260 longest = str_make("",0);
261 len = 0;
262 curback = 0;
34de22dd 263 backish = 0;
264 backest = 0;
e5d73d77 265 while (OP(scan) != END) {
a687059c 266 if (OP(scan) == BRANCH) {
267 if (OP(regnext(scan)) == BRANCH) {
268 curback = -30000;
269 while (OP(scan) == BRANCH)
270 scan = regnext(scan);
271 }
272 else /* single branch is ok */
273 scan = NEXTOPER(scan);
274 }
275 if (OP(scan) == EXACTLY) {
fe14fcc3 276 char *t;
277
a687059c 278 first = scan;
fe14fcc3 279 while (OP(t = regnext(scan)) == CLOSE)
280 scan = t;
34de22dd 281 if (curback - backish == len) {
0a12ae7d 282 str_ncat(longish, OPERAND(first)+1,
a687059c 283 *OPERAND(first));
284 len += *OPERAND(first);
285 curback += *OPERAND(first);
286 first = regnext(scan);
287 }
288 else if (*OPERAND(first) >= len + (curback >= 0)) {
289 len = *OPERAND(first);
0a12ae7d 290 str_nset(longish, OPERAND(first)+1,len);
34de22dd 291 backish = curback;
a687059c 292 curback += len;
293 first = regnext(scan);
294 }
295 else
296 curback += *OPERAND(first);
297 }
0a12ae7d 298 else if (index(varies,OP(scan))) {
299 curback = -30000;
300 len = 0;
34de22dd 301 if (longish->str_cur > longest->str_cur) {
0a12ae7d 302 str_sset(longest,longish);
34de22dd 303 backest = backish;
304 }
0a12ae7d 305 str_nset(longish,"",0);
306 }
e5d73d77 307 else if (index(simple,OP(scan))) {
0a12ae7d 308 curback++;
e5d73d77 309 len = 0;
34de22dd 310 if (longish->str_cur > longest->str_cur) {
e5d73d77 311 str_sset(longest,longish);
34de22dd 312 backest = backish;
313 }
e5d73d77 314 str_nset(longish,"",0);
315 }
a687059c 316 scan = regnext(scan);
317 }
e5d73d77 318
319 /* Prefer earlier on tie, unless we can tail match latter */
320
34de22dd 321 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
0a12ae7d 322 str_sset(longest,longish);
34de22dd 323 backest = backish;
324 }
e5d73d77 325 else
326 str_nset(longish,"",0);
34de22dd 327 if (longest->str_cur
328 &&
329 (!r->regstart
330 ||
331 !fbminstr(r->regstart->str_ptr,
332 r->regstart->str_ptr + r->regstart->str_cur,
333 longest)
334 )
335 )
336 {
a687059c 337 r->regmust = longest;
34de22dd 338 if (backest < 0)
339 backest = -1;
340 r->regback = backest;
0a12ae7d 341 if (longest->str_cur
342 > !(sawstudy || fold || OP(first) == EOL) )
a687059c 343 fbmcompile(r->regmust,fold);
344 r->regmust->str_u.str_useful = 100;
e5d73d77 345 if (OP(first) == EOL && longish->str_cur)
a687059c 346 r->regmust->str_pok |= SP_TAIL;
347 }
fe14fcc3 348 else {
a687059c 349 str_free(longest);
fe14fcc3 350 longest = Nullstr;
351 }
e5d73d77 352 str_free(longish);
a687059c 353 }
354
355 r->do_folding = fold;
356 r->nparens = regnpar - 1;
fe14fcc3 357 New(1002, r->startp, regnpar, char*);
358 New(1002, r->endp, regnpar, char*);
a687059c 359#ifdef DEBUGGING
360 if (debug & 512)
361 regdump(r);
362#endif
363 return(r);
364}
365
366/*
367 - reg - regular expression, i.e. main body or parenthesized thing
368 *
369 * Caller must absorb opening parenthesis.
370 *
371 * Combining parenthesis handling with the base level of regular expression
372 * is a trifle forced, but the need to tie the tails of the branches to what
373 * follows makes it hard to avoid.
374 */
375static char *
376reg(paren, flagp)
377int paren; /* Parenthesized? */
378int *flagp;
379{
380 register char *ret;
381 register char *br;
382 register char *ender;
383 register int parno;
384 int flags;
385
386 *flagp = HASWIDTH; /* Tentatively. */
387
388 /* Make an OPEN node, if parenthesized. */
389 if (paren) {
a687059c 390 parno = regnpar;
391 regnpar++;
fe14fcc3 392 ret = reganode(OPEN, parno);
a687059c 393 } else
394 ret = NULL;
395
396 /* Pick up the branches, linking them together. */
397 br = regbranch(&flags);
398 if (br == NULL)
399 return(NULL);
400 if (ret != NULL)
401 regtail(ret, br); /* OPEN -> first. */
402 else
403 ret = br;
404 if (!(flags&HASWIDTH))
405 *flagp &= ~HASWIDTH;
406 *flagp |= flags&SPSTART;
407 while (*regparse == '|') {
408 regparse++;
409 br = regbranch(&flags);
410 if (br == NULL)
411 return(NULL);
412 regtail(ret, br); /* BRANCH -> BRANCH. */
413 if (!(flags&HASWIDTH))
414 *flagp &= ~HASWIDTH;
415 *flagp |= flags&SPSTART;
416 }
417
418 /* Make a closing node, and hook it on the end. */
fe14fcc3 419 if (paren)
420 ender = reganode(CLOSE, parno);
421 else
422 ender = regnode(END);
a687059c 423 regtail(ret, ender);
424
425 /* Hook the tails of the branches to the closing node. */
426 for (br = ret; br != NULL; br = regnext(br))
427 regoptail(br, ender);
428
429 /* Check for proper termination. */
430 if (paren && *regparse++ != ')') {
431 FAIL("unmatched () in regexp");
432 } else if (!paren && regparse < regxend) {
433 if (*regparse == ')') {
434 FAIL("unmatched () in regexp");
435 } else
436 FAIL("junk on end of regexp"); /* "Can't happen". */
437 /* NOTREACHED */
438 }
439
440 return(ret);
441}
442
443/*
444 - regbranch - one alternative of an | operator
445 *
446 * Implements the concatenation operator.
447 */
448static char *
449regbranch(flagp)
450int *flagp;
451{
452 register char *ret;
453 register char *chain;
454 register char *latest;
455 int flags;
456
457 *flagp = WORST; /* Tentatively. */
458
459 ret = regnode(BRANCH);
460 chain = NULL;
461 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
462 latest = regpiece(&flags);
463 if (latest == NULL)
464 return(NULL);
465 *flagp |= flags&HASWIDTH;
466 if (chain == NULL) /* First piece. */
467 *flagp |= flags&SPSTART;
468 else
469 regtail(chain, latest);
470 chain = latest;
471 }
472 if (chain == NULL) /* Loop ran zero times. */
473 (void) regnode(NOTHING);
474
475 return(ret);
476}
477
478/*
479 - regpiece - something followed by possible [*+?]
480 *
481 * Note that the branching code sequences used for ? and the general cases
482 * of * and + are somewhat optimized: they use the same NOTHING node as
483 * both the endmarker for their branch list and the body of the last branch.
484 * It might seem that this node could be dispensed with entirely, but the
485 * endmarker role is not redundant.
486 */
487static char *
488regpiece(flagp)
489int *flagp;
490{
491 register char *ret;
492 register char op;
493 register char *next;
494 int flags;
495 char *origparse = regparse;
496 int orignpar = regnpar;
497 char *max;
498 int iter;
499 char ch;
500
501 ret = regatom(&flags);
502 if (ret == NULL)
503 return(NULL);
504
505 op = *regparse;
506
507 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
508 * then we decrement the first number by one and reset our
509 * parsing back to the beginning of the same atom. If the first number
510 * is down to 0, decrement the second number instead and fake up
511 * a ? after it. Given the way this compiler doesn't keep track
512 * of offsets on the first pass, this is the only way to replicate
513 * a piece of code. Sigh.
514 */
515 if (op == '{' && regcurly(regparse)) {
516 next = regparse + 1;
517 max = Nullch;
518 while (isdigit(*next) || *next == ',') {
519 if (*next == ',') {
520 if (max)
521 break;
522 else
523 max = next;
524 }
525 next++;
526 }
527 if (*next == '}') { /* got one */
a687059c 528 if (!max)
529 max = next;
530 regparse++;
531 iter = atoi(regparse);
00bf170e 532 if (flags&SIMPLE) { /* we can do it right after all */
533 int tmp;
534
535 reginsert(CURLY, ret);
fe14fcc3 536 if (iter > 0)
537 *flagp = (WORST|HASWIDTH);
00bf170e 538 if (*max == ',')
539 max++;
6eb13c3b 540 else
541 max = regparse;
00bf170e 542 tmp = atoi(max);
543 if (tmp && tmp < iter)
544 fatal("Can't do {n,m} with n > m");
545 if (regcode != &regdummy) {
546#ifdef REGALIGN
547 *(unsigned short *)(ret+3) = iter;
548 *(unsigned short *)(ret+5) = tmp;
549#else
550 ret[3] = iter >> 8; ret[4] = iter & 0377;
551 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
552#endif
553 }
554 regparse = next;
555 goto nest_check;
556 }
557 regsawbracket++; /* remember we clobbered exp */
a687059c 558 if (iter > 0) {
559 ch = *max;
560 sprintf(regparse,"%.*d", max-regparse, iter - 1);
561 *max = ch;
00bf170e 562 if (*max == ',' && max[1] != '}') {
563 if (atoi(max+1) <= 0)
564 fatal("Can't do {n,m} with n > m");
a687059c 565 ch = *next;
566 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
567 *next = ch;
568 }
00bf170e 569 if (iter != 1 || *max == ',') {
a687059c 570 regparse = origparse; /* back up input pointer */
571 regnpar = orignpar; /* don't make more parens */
572 }
573 else {
574 regparse = next;
575 goto nest_check;
576 }
577 *flagp = flags;
578 return ret;
579 }
580 if (*max == ',') {
581 max++;
582 iter = atoi(max);
583 if (max == next) { /* any number more? */
584 regparse = next;
585 op = '*'; /* fake up one with a star */
586 }
587 else if (iter > 0) {
588 op = '?'; /* fake up optional atom */
589 ch = *next;
590 sprintf(max,"%.*d", next-max, iter - 1);
591 *next = ch;
592 if (iter == 1)
593 regparse = next;
594 else {
595 regparse = origparse - 1; /* offset ++ below */
596 regnpar = orignpar;
597 }
598 }
599 else
600 fatal("Can't do {n,0}");
601 }
602 else
603 fatal("Can't do {0}");
604 }
605 }
606
607 if (!ISMULT1(op)) {
608 *flagp = flags;
609 return(ret);
610 }
611
612 if (!(flags&HASWIDTH) && op != '?')
613 FAIL("regexp *+ operand could be empty");
614 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
615
616 if (op == '*' && (flags&SIMPLE))
617 reginsert(STAR, ret);
618 else if (op == '*') {
619 /* Emit x* as (x&|), where & means "self". */
620 reginsert(BRANCH, ret); /* Either x */
621 regoptail(ret, regnode(BACK)); /* and loop */
622 regoptail(ret, ret); /* back */
623 regtail(ret, regnode(BRANCH)); /* or */
624 regtail(ret, regnode(NOTHING)); /* null. */
625 } else if (op == '+' && (flags&SIMPLE))
626 reginsert(PLUS, ret);
627 else if (op == '+') {
628 /* Emit x+ as x(&|), where & means "self". */
629 next = regnode(BRANCH); /* Either */
630 regtail(ret, next);
631 regtail(regnode(BACK), ret); /* loop back */
632 regtail(next, regnode(BRANCH)); /* or */
633 regtail(ret, regnode(NOTHING)); /* null. */
634 } else if (op == '?') {
635 /* Emit x? as (x|) */
636 reginsert(BRANCH, ret); /* Either x */
637 regtail(ret, regnode(BRANCH)); /* or */
638 next = regnode(NOTHING); /* null. */
639 regtail(ret, next);
640 regoptail(ret, next);
641 }
642 nest_check:
643 regparse++;
644 if (ISMULT2(regparse))
645 FAIL("nested *?+ in regexp");
646
647 return(ret);
648}
649
650/*
651 - regatom - the lowest level
652 *
653 * Optimization: gobbles an entire sequence of ordinary characters so that
654 * it can turn them into a single node, which is smaller to store and
655 * faster to run. Backslashed characters are exceptions, each becoming a
656 * separate node; the code is simpler that way and it's not worth fixing.
657 *
658 * [Yes, it is worth fixing, some scripts can run twice the speed.]
659 */
660static char *
661regatom(flagp)
662int *flagp;
663{
664 register char *ret;
665 int flags;
666
667 *flagp = WORST; /* Tentatively. */
668
669 switch (*regparse++) {
670 case '^':
671 ret = regnode(BOL);
672 break;
673 case '$':
674 ret = regnode(EOL);
675 break;
676 case '.':
677 ret = regnode(ANY);
678 *flagp |= HASWIDTH|SIMPLE;
679 break;
680 case '[':
681 ret = regclass();
682 *flagp |= HASWIDTH|SIMPLE;
683 break;
684 case '(':
685 ret = reg(1, &flags);
686 if (ret == NULL)
687 return(NULL);
688 *flagp |= flags&(HASWIDTH|SPSTART);
689 break;
690 case '|':
691 case ')':
692 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
693 break;
694 case '?':
695 case '+':
696 case '*':
697 FAIL("?+* follows nothing in regexp");
698 break;
699 case '\\':
700 switch (*regparse) {
701 case 'w':
702 ret = regnode(ALNUM);
703 *flagp |= HASWIDTH|SIMPLE;
704 regparse++;
705 break;
706 case 'W':
707 ret = regnode(NALNUM);
708 *flagp |= HASWIDTH|SIMPLE;
709 regparse++;
710 break;
711 case 'b':
712 ret = regnode(BOUND);
713 *flagp |= SIMPLE;
714 regparse++;
715 break;
716 case 'B':
717 ret = regnode(NBOUND);
718 *flagp |= SIMPLE;
719 regparse++;
720 break;
721 case 's':
722 ret = regnode(SPACE);
723 *flagp |= HASWIDTH|SIMPLE;
724 regparse++;
725 break;
726 case 'S':
727 ret = regnode(NSPACE);
728 *flagp |= HASWIDTH|SIMPLE;
729 regparse++;
730 break;
731 case 'd':
732 ret = regnode(DIGIT);
733 *flagp |= HASWIDTH|SIMPLE;
734 regparse++;
735 break;
736 case 'D':
737 ret = regnode(NDIGIT);
738 *flagp |= HASWIDTH|SIMPLE;
739 regparse++;
740 break;
741 case 'n':
742 case 'r':
743 case 't':
744 case 'f':
fe14fcc3 745 case 'e':
746 case 'a':
747 case 'x':
748 case 'c':
749 case '0':
a687059c 750 goto defchar;
fe14fcc3 751 case '1': case '2': case '3': case '4':
a687059c 752 case '5': case '6': case '7': case '8': case '9':
fe14fcc3 753 {
754 int num = atoi(regparse);
755
756 if (num > 9 && num >= regnpar)
a687059c 757 goto defchar;
fe14fcc3 758 else {
9ef589d8 759 regsawback = 1;
fe14fcc3 760 ret = reganode(REF, num);
761 while (isascii(*regparse) && isdigit(*regparse))
762 regparse++;
a687059c 763 *flagp |= SIMPLE;
fe14fcc3 764 }
a687059c 765 }
766 break;
767 case '\0':
768 if (regparse >= regxend)
769 FAIL("trailing \\ in regexp");
770 /* FALL THROUGH */
771 default:
772 goto defchar;
773 }
774 break;
775 default: {
776 register int len;
777 register char ender;
778 register char *p;
779 char *oldp;
fe14fcc3 780 int numlen;
a687059c 781
782 defchar:
783 ret = regnode(EXACTLY);
784 regc(0); /* save spot for len */
785 for (len=0, p=regparse-1;
786 len < 127 && p < regxend;
787 len++)
788 {
789 oldp = p;
790 switch (*p) {
791 case '^':
792 case '$':
793 case '.':
794 case '[':
795 case '(':
796 case ')':
797 case '|':
798 goto loopdone;
799 case '\\':
800 switch (*++p) {
801 case 'w':
802 case 'W':
803 case 'b':
804 case 'B':
805 case 's':
806 case 'S':
807 case 'd':
808 case 'D':
809 --p;
810 goto loopdone;
811 case 'n':
812 ender = '\n';
813 p++;
814 break;
815 case 'r':
816 ender = '\r';
817 p++;
818 break;
819 case 't':
820 ender = '\t';
821 p++;
822 break;
823 case 'f':
824 ender = '\f';
825 p++;
826 break;
fe14fcc3 827 case 'e':
828 ender = '\033';
829 p++;
830 break;
831 case 'a':
832 ender = '\007';
833 p++;
834 break;
835 case 'x':
836 ender = scanhex(++p, 2, &numlen);
837 p += numlen;
838 break;
839 case 'c':
840 p++;
841 ender = *p++;
842 if (islower(ender))
843 ender = toupper(ender);
844 ender ^= 64;
845 break;
a687059c 846 case '0': case '1': case '2': case '3':case '4':
847 case '5': case '6': case '7': case '8':case '9':
fe14fcc3 848 if (*p == '0' ||
849 (isdigit(p[1]) && atoi(p) >= regnpar) ) {
850 ender = scanoct(p, 3, &numlen);
851 p += numlen;
a687059c 852 }
853 else {
854 --p;
855 goto loopdone;
856 }
857 break;
858 case '\0':
859 if (p >= regxend)
860 FAIL("trailing \\ in regexp");
861 /* FALL THROUGH */
862 default:
863 ender = *p++;
864 break;
865 }
866 break;
867 default:
868 ender = *p++;
869 break;
870 }
871 if (regfold && isupper(ender))
872 ender = tolower(ender);
873 if (ISMULT2(p)) { /* Back off on ?+*. */
874 if (len)
875 p = oldp;
876 else {
877 len++;
878 regc(ender);
879 }
880 break;
881 }
882 regc(ender);
883 }
884 loopdone:
885 regparse = p;
886 if (len <= 0)
887 FAIL("internal disaster in regexp");
888 *flagp |= HASWIDTH;
889 if (len == 1)
890 *flagp |= SIMPLE;
891 if (regcode != &regdummy)
892 *OPERAND(ret) = len;
893 regc('\0');
894 }
895 break;
896 }
897
898 return(ret);
899}
900
901static void
902regset(bits,def,c)
903char *bits;
904int def;
905register int c;
906{
907 if (regcode == &regdummy)
908 return;
ac58e20f 909 c &= 255;
a687059c 910 if (def)
911 bits[c >> 3] &= ~(1 << (c & 7));
912 else
913 bits[c >> 3] |= (1 << (c & 7));
914}
915
916static char *
917regclass()
918{
919 register char *bits;
920 register int class;
921 register int lastclass;
922 register int range = 0;
923 register char *ret;
924 register int def;
fe14fcc3 925 int numlen;
a687059c 926
00bf170e 927 ret = regnode(ANYOF);
a687059c 928 if (*regparse == '^') { /* Complement of range. */
a687059c 929 regparse++;
930 def = 0;
931 } else {
a687059c 932 def = 255;
933 }
934 bits = regcode;
935 for (class = 0; class < 32; class++)
936 regc(def);
937 if (*regparse == ']' || *regparse == '-')
00bf170e 938 goto skipcond; /* allow 1st char to be ] or - */
a687059c 939 while (regparse < regxend && *regparse != ']') {
00bf170e 940 skipcond:
a687059c 941 class = UCHARAT(regparse++);
942 if (class == '\\') {
943 class = UCHARAT(regparse++);
944 switch (class) {
945 case 'w':
946 for (class = 'a'; class <= 'z'; class++)
947 regset(bits,def,class);
948 for (class = 'A'; class <= 'Z'; class++)
949 regset(bits,def,class);
950 for (class = '0'; class <= '9'; class++)
951 regset(bits,def,class);
952 regset(bits,def,'_');
953 lastclass = 1234;
954 continue;
955 case 's':
956 regset(bits,def,' ');
957 regset(bits,def,'\t');
958 regset(bits,def,'\r');
959 regset(bits,def,'\f');
960 regset(bits,def,'\n');
961 lastclass = 1234;
962 continue;
963 case 'd':
964 for (class = '0'; class <= '9'; class++)
965 regset(bits,def,class);
966 lastclass = 1234;
967 continue;
968 case 'n':
969 class = '\n';
970 break;
971 case 'r':
972 class = '\r';
973 break;
974 case 't':
975 class = '\t';
976 break;
977 case 'f':
978 class = '\f';
979 break;
980 case 'b':
981 class = '\b';
982 break;
fe14fcc3 983 case 'e':
984 class = '\033';
985 break;
986 case 'a':
987 class = '\007';
988 break;
989 case 'x':
990 class = scanhex(regparse, 2, &numlen);
991 regparse += numlen;
992 break;
993 case 'c':
994 class = *regparse++;
995 if (islower(class))
996 class = toupper(class);
997 class ^= 64;
998 break;
a687059c 999 case '0': case '1': case '2': case '3': case '4':
1000 case '5': case '6': case '7': case '8': case '9':
fe14fcc3 1001 class = scanoct(--regparse, 3, &numlen);
1002 regparse += numlen;
a687059c 1003 break;
1004 }
1005 }
a687059c 1006 if (range) {
1007 if (lastclass > class)
1008 FAIL("invalid [] range in regexp");
00bf170e 1009 range = 0;
a687059c 1010 }
00bf170e 1011 else {
1012 lastclass = class;
1013 if (*regparse == '-' && regparse+1 < regxend &&
1014 regparse[1] != ']') {
1015 regparse++;
1016 range = 1;
1017 continue; /* do it next time */
1018 }
1019 }
1020 for ( ; lastclass <= class; lastclass++) {
a687059c 1021 regset(bits,def,lastclass);
1022 if (regfold && isupper(lastclass))
1023 regset(bits,def,tolower(lastclass));
1024 }
1025 lastclass = class;
1026 }
1027 if (*regparse != ']')
1028 FAIL("unmatched [] in regexp");
a687059c 1029 regparse++;
1030 return ret;
1031}
1032
1033/*
1034 - regnode - emit a node
1035 */
1036static char * /* Location. */
1037regnode(op)
1038char op;
1039{
1040 register char *ret;
1041 register char *ptr;
1042
1043 ret = regcode;
1044 if (ret == &regdummy) {
1045#ifdef REGALIGN
1046 if (!(regsize & 1))
1047 regsize++;
1048#endif
1049 regsize += 3;
1050 return(ret);
1051 }
1052
1053#ifdef REGALIGN
1054#ifndef lint
1055 if (!((long)ret & 1))
1056 *ret++ = 127;
1057#endif
1058#endif
1059 ptr = ret;
1060 *ptr++ = op;
1061 *ptr++ = '\0'; /* Null "next" pointer. */
1062 *ptr++ = '\0';
1063 regcode = ptr;
1064
1065 return(ret);
1066}
1067
1068/*
fe14fcc3 1069 - reganode - emit a node with an argument
1070 */
1071static char * /* Location. */
1072reganode(op, arg)
1073char op;
1074unsigned short arg;
1075{
1076 register char *ret;
1077 register char *ptr;
1078
1079 ret = regcode;
1080 if (ret == &regdummy) {
1081#ifdef REGALIGN
1082 if (!(regsize & 1))
1083 regsize++;
1084#endif
1085 regsize += 5;
1086 return(ret);
1087 }
1088
1089#ifdef REGALIGN
1090#ifndef lint
1091 if (!((long)ret & 1))
1092 *ret++ = 127;
1093#endif
1094#endif
1095 ptr = ret;
1096 *ptr++ = op;
1097 *ptr++ = '\0'; /* Null "next" pointer. */
1098 *ptr++ = '\0';
1099#ifdef REGALIGN
1100 *(unsigned short *)(ret+3) = arg;
1101#else
1102 ret[3] = arg >> 8; ret[4] = arg & 0377;
1103#endif
1104 ptr += 2;
1105 regcode = ptr;
1106
1107 return(ret);
1108}
1109
1110/*
a687059c 1111 - regc - emit (if appropriate) a byte of code
1112 */
1113static void
1114regc(b)
1115char b;
1116{
1117 if (regcode != &regdummy)
1118 *regcode++ = b;
1119 else
1120 regsize++;
1121}
1122
1123/*
1124 - reginsert - insert an operator in front of already-emitted operand
1125 *
1126 * Means relocating the operand.
1127 */
1128static void
1129reginsert(op, opnd)
1130char op;
1131char *opnd;
1132{
1133 register char *src;
1134 register char *dst;
1135 register char *place;
00bf170e 1136 register offset = (op == CURLY ? 4 : 0);
a687059c 1137
1138 if (regcode == &regdummy) {
1139#ifdef REGALIGN
00bf170e 1140 regsize += 4 + offset;
a687059c 1141#else
00bf170e 1142 regsize += 3 + offset;
a687059c 1143#endif
1144 return;
1145 }
1146
1147 src = regcode;
1148#ifdef REGALIGN
00bf170e 1149 regcode += 4 + offset;
a687059c 1150#else
00bf170e 1151 regcode += 3 + offset;
a687059c 1152#endif
1153 dst = regcode;
1154 while (src > opnd)
1155 *--dst = *--src;
1156
1157 place = opnd; /* Op node, where operand used to be. */
1158 *place++ = op;
1159 *place++ = '\0';
1160 *place++ = '\0';
00bf170e 1161 while (offset-- > 0)
1162 *place++ = '\0';
a687059c 1163}
1164
1165/*
1166 - regtail - set the next-pointer at the end of a node chain
1167 */
1168static void
1169regtail(p, val)
1170char *p;
1171char *val;
1172{
1173 register char *scan;
1174 register char *temp;
1175 register int offset;
1176
1177 if (p == &regdummy)
1178 return;
1179
1180 /* Find last node. */
1181 scan = p;
1182 for (;;) {
1183 temp = regnext(scan);
1184 if (temp == NULL)
1185 break;
1186 scan = temp;
1187 }
1188
1189#ifdef REGALIGN
1190 offset = val - scan;
1191#ifndef lint
1192 *(short*)(scan+1) = offset;
1193#else
1194 offset = offset;
1195#endif
1196#else
1197 if (OP(scan) == BACK)
1198 offset = scan - val;
1199 else
1200 offset = val - scan;
1201 *(scan+1) = (offset>>8)&0377;
1202 *(scan+2) = offset&0377;
1203#endif
1204}
1205
1206/*
1207 - regoptail - regtail on operand of first argument; nop if operandless
1208 */
1209static void
1210regoptail(p, val)
1211char *p;
1212char *val;
1213{
1214 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1215 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1216 return;
1217 regtail(NEXTOPER(p), val);
1218}
1219
1220/*
1221 - regcurly - a little FSA that accepts {\d+,?\d*}
1222 */
1223STATIC int
1224regcurly(s)
1225register char *s;
1226{
1227 if (*s++ != '{')
1228 return FALSE;
1229 if (!isdigit(*s))
1230 return FALSE;
1231 while (isdigit(*s))
1232 s++;
1233 if (*s == ',')
1234 s++;
1235 while (isdigit(*s))
1236 s++;
1237 if (*s != '}')
1238 return FALSE;
1239 return TRUE;
1240}
1241
1242#ifdef DEBUGGING
1243
1244/*
1245 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1246 */
1247void
1248regdump(r)
1249regexp *r;
1250{
1251 register char *s;
1252 register char op = EXACTLY; /* Arbitrary non-END op. */
1253 register char *next;
a687059c 1254
1255
1256 s = r->program + 1;
1257 while (op != END) { /* While that wasn't END last time... */
1258#ifdef REGALIGN
1259 if (!((long)s & 1))
1260 s++;
1261#endif
1262 op = OP(s);
1263 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1264 next = regnext(s);
fe14fcc3 1265 s += regarglen[op];
a687059c 1266 if (next == NULL) /* Next ptr. */
1267 fprintf(stderr,"(0)");
1268 else
1269 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1270 s += 3;
00bf170e 1271 if (op == ANYOF) {
a687059c 1272 s += 32;
1273 }
1274 if (op == EXACTLY) {
1275 /* Literal string, where present. */
1276 s++;
1277 while (*s != '\0') {
1278 (void)putchar(*s);
1279 s++;
1280 }
1281 s++;
1282 }
1283 (void)putchar('\n');
1284 }
1285
1286 /* Header fields of interest. */
1287 if (r->regstart)
1288 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1289 if (r->regstclass)
1290 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
9ef589d8 1291 if (r->reganch & ROPT_ANCH)
a687059c 1292 fprintf(stderr,"anchored ");
9ef589d8 1293 if (r->reganch & ROPT_SKIP)
00bf170e 1294 fprintf(stderr,"plus ");
a687059c 1295 if (r->regmust != NULL)
1296 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1297 r->regback);
1298 fprintf(stderr,"\n");
1299}
1300
1301/*
1302 - regprop - printable representation of opcode
1303 */
1304char *
1305regprop(op)
1306char *op;
1307{
1308 register char *p;
1309
1310 (void) strcpy(buf, ":");
1311
1312 switch (OP(op)) {
1313 case BOL:
1314 p = "BOL";
1315 break;
1316 case EOL:
1317 p = "EOL";
1318 break;
1319 case ANY:
1320 p = "ANY";
1321 break;
1322 case ANYOF:
1323 p = "ANYOF";
1324 break;
a687059c 1325 case BRANCH:
1326 p = "BRANCH";
1327 break;
1328 case EXACTLY:
1329 p = "EXACTLY";
1330 break;
1331 case NOTHING:
1332 p = "NOTHING";
1333 break;
1334 case BACK:
1335 p = "BACK";
1336 break;
1337 case END:
1338 p = "END";
1339 break;
1340 case ALNUM:
1341 p = "ALNUM";
1342 break;
1343 case NALNUM:
1344 p = "NALNUM";
1345 break;
1346 case BOUND:
1347 p = "BOUND";
1348 break;
1349 case NBOUND:
1350 p = "NBOUND";
1351 break;
1352 case SPACE:
1353 p = "SPACE";
1354 break;
1355 case NSPACE:
1356 p = "NSPACE";
1357 break;
1358 case DIGIT:
1359 p = "DIGIT";
1360 break;
1361 case NDIGIT:
1362 p = "NDIGIT";
1363 break;
00bf170e 1364 case CURLY:
1365 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1366 ARG1(op),ARG2(op));
1367 p = NULL;
1368 break;
a687059c 1369 case REF:
fe14fcc3 1370 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
a687059c 1371 p = NULL;
1372 break;
fe14fcc3 1373 case OPEN:
1374 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
a687059c 1375 p = NULL;
1376 break;
fe14fcc3 1377 case CLOSE:
1378 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
a687059c 1379 p = NULL;
1380 break;
1381 case STAR:
1382 p = "STAR";
1383 break;
1384 case PLUS:
1385 p = "PLUS";
1386 break;
1387 default:
1388 FAIL("corrupted regexp opcode");
1389 }
1390 if (p != NULL)
1391 (void) strcat(buf, p);
1392 return(buf);
1393}
1394#endif /* DEBUGGING */
1395
1396regfree(r)
1397struct regexp *r;
1398{
fe14fcc3 1399 if (r->precomp) {
a687059c 1400 Safefree(r->precomp);
fe14fcc3 1401 r->precomp = Nullch;
1402 }
1403 if (r->subbase) {
a687059c 1404 Safefree(r->subbase);
fe14fcc3 1405 r->subbase = Nullch;
1406 }
1407 if (r->regmust) {
a687059c 1408 str_free(r->regmust);
fe14fcc3 1409 r->regmust = Nullstr;
1410 }
1411 if (r->regstart) {
a687059c 1412 str_free(r->regstart);
fe14fcc3 1413 r->regstart = Nullstr;
1414 }
1415 Safefree(r->startp);
1416 Safefree(r->endp);
a687059c 1417 Safefree(r);
1418}