perl 3.0 patch #24 patch #19, continued
[p5sagit/p5-mst-13.2.git] / regcomp.c
CommitLineData
a687059c 1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
79a0689e 10/* $Header: regcomp.c,v 3.0.1.3 90/03/12 16:59:22 lwall Locked $
a687059c 11 *
12 * $Log: regcomp.c,v $
79a0689e 13 * Revision 3.0.1.3 90/03/12 16:59:22 lwall
14 * patch13: pattern matches can now use \0 to mean \000
15 *
ac58e20f 16 * Revision 3.0.1.2 90/02/28 18:08:35 lwall
17 * patch9: /[\200-\377]/ didn't work on machines with signed chars
18 *
ae986130 19 * Revision 3.0.1.1 89/11/11 04:51:04 lwall
20 * patch2: /[\000]/ didn't work
21 *
a687059c 22 * Revision 3.0 89/10/18 15:22:29 lwall
23 * 3.0 baseline
24 *
25 */
26
27/*
28 * regcomp and regexec -- regsub and regerror are not used in perl
29 *
30 * Copyright (c) 1986 by University of Toronto.
31 * Written by Henry Spencer. Not derived from licensed software.
32 *
33 * Permission is granted to anyone to use this software for any
34 * purpose on any computer system, and to redistribute it freely,
35 * subject to the following restrictions:
36 *
37 * 1. The author is not responsible for the consequences of use of
38 * this software, no matter how awful, even if they arise
39 * from defects in it.
40 *
41 * 2. The origin of this software must not be misrepresented, either
42 * by explicit claim or by omission.
43 *
44 * 3. Altered versions must be plainly marked as such, and must not
45 * be misrepresented as being the original software.
46 *
47 *
48 **** Alterations to Henry's code are...
49 ****
50 **** Copyright (c) 1989, Larry Wall
51 ****
52 **** You may distribute under the terms of the GNU General Public License
53 **** as specified in the README file that comes with the perl 3.0 kit.
54 *
55 * Beware that some of this code is subtly aware of the way operator
56 * precedence is structured in regular expressions. Serious changes in
57 * regular-expression syntax might require a total rethink.
58 */
59#include "EXTERN.h"
60#include "perl.h"
61#include "INTERN.h"
62#include "regcomp.h"
63
64#ifndef STATIC
65#define STATIC static
66#endif
67
68#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
69#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
70 ((*s) == '{' && regcurly(s)))
71#define META "^$.[()|?+*\\"
72
73/*
74 * Flags to be passed up and down.
75 */
76#define HASWIDTH 01 /* Known never to match null string. */
77#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
78#define SPSTART 04 /* Starts with * or +. */
79#define WORST 0 /* Worst case. */
80
81/*
82 * Global work variables for regcomp().
83 */
84static char *regprecomp; /* uncompiled string. */
85static char *regparse; /* Input-scan pointer. */
86static char *regxend; /* End of input for compile */
87static int regnpar; /* () count. */
88static char *regcode; /* Code-emit pointer; &regdummy = don't. */
89static long regsize; /* Code size. */
90static int regfold;
91static int regsawbracket; /* Did we do {d,d} trick? */
92
93/*
94 * Forward declarations for regcomp()'s friends.
95 */
96STATIC int regcurly();
97STATIC char *reg();
98STATIC char *regbranch();
99STATIC char *regpiece();
100STATIC char *regatom();
101STATIC char *regclass();
102STATIC char *regnode();
103STATIC void regc();
104STATIC void reginsert();
105STATIC void regtail();
106STATIC void regoptail();
107
108/*
109 - regcomp - compile a regular expression into internal code
110 *
111 * We can't allocate space until we know how big the compiled form will be,
112 * but we can't compile it (and thus know how big it is) until we've got a
113 * place to put the code. So we cheat: we compile it twice, once with code
114 * generation turned off and size counting turned on, and once "for real".
115 * This also means that we don't allocate space until we are sure that the
116 * thing really will compile successfully, and we never have to move the
117 * code and thus invalidate pointers into it. (Note that it has to be in
118 * one piece because free() must be able to free it all.) [NB: not true in perl]
119 *
120 * Beware that the optimization-preparation code in here knows about some
121 * of the structure of the compiled regexp. [I'll say.]
122 */
123regexp *
124regcomp(exp,xend,fold,rare)
125char *exp;
126char *xend;
127int fold;
128int rare;
129{
130 register regexp *r;
131 register char *scan;
132 register STR *longest;
133 register int len;
134 register char *first;
135 int flags;
136 int back;
137 int curback;
138 extern char *safemalloc();
139 extern char *savestr();
140
141 if (exp == NULL)
142 fatal("NULL regexp argument");
143
144 /* First pass: determine size, legality. */
145 regfold = fold;
146 regparse = exp;
147 regxend = xend;
148 regprecomp = nsavestr(exp,xend-exp);
149 regsawbracket = 0;
150 regnpar = 1;
151 regsize = 0L;
152 regcode = &regdummy;
153 regc(MAGIC);
154 if (reg(0, &flags) == NULL) {
155 Safefree(regprecomp);
156 return(NULL);
157 }
158
159 /* Small enough for pointer-storage convention? */
160 if (regsize >= 32767L) /* Probably could be 65535L. */
161 FAIL("regexp too big");
162
163 /* Allocate space. */
164 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
165 if (r == NULL)
166 FAIL("regexp out of space");
167
168 /* Second pass: emit code. */
169 if (regsawbracket)
170 bcopy(regprecomp,exp,xend-exp);
171 r->precomp = regprecomp;
172 r->subbase = NULL;
173 regparse = exp;
174 regnpar = 1;
175 regcode = r->program;
176 regc(MAGIC);
177 if (reg(0, &flags) == NULL)
178 return(NULL);
179
180 /* Dig out information for optimizations. */
181 r->regstart = Nullstr; /* Worst-case defaults. */
182 r->reganch = 0;
183 r->regmust = Nullstr;
184 r->regback = -1;
185 r->regstclass = Nullch;
186 scan = r->program+1; /* First BRANCH. */
187 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
188 scan = NEXTOPER(scan);
189
190 first = scan;
191 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
192 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
193 (OP(first) == PLUS) )
194 first = NEXTOPER(first);
195
196 /* Starting-point info. */
197 if (OP(first) == EXACTLY) {
198 r->regstart =
199 str_make(OPERAND(first)+1,*OPERAND(first));
200 if (r->regstart->str_cur > !(sawstudy|fold))
201 fbmcompile(r->regstart,fold);
202 }
203 else if ((exp = index(simple,OP(first))) && exp > simple)
204 r->regstclass = first;
205 else if (OP(first) == BOUND || OP(first) == NBOUND)
206 r->regstclass = first;
207 else if (OP(first) == BOL)
208 r->reganch++;
209
210#ifdef DEBUGGING
211 if (debug & 512)
212 fprintf(stderr,"first %d next %d offset %d\n",
213 OP(first), OP(NEXTOPER(first)), first - scan);
214#endif
215 /*
216 * If there's something expensive in the r.e., find the
217 * longest literal string that must appear and make it the
218 * regmust. Resolve ties in favor of later strings, since
219 * the regstart check works with the beginning of the r.e.
220 * and avoiding duplication strengthens checking. Not a
221 * strong reason, but sufficient in the absence of others.
222 * [Now we resolve ties in favor of the earlier string if
223 * it happens that curback has been invalidated, since the
224 * earlier string may buy us something the later one won't.]
225 */
226 longest = str_make("",0);
227 len = 0;
228 curback = 0;
229 back = 0;
230 while (scan != NULL) {
231 if (OP(scan) == BRANCH) {
232 if (OP(regnext(scan)) == BRANCH) {
233 curback = -30000;
234 while (OP(scan) == BRANCH)
235 scan = regnext(scan);
236 }
237 else /* single branch is ok */
238 scan = NEXTOPER(scan);
239 }
240 if (OP(scan) == EXACTLY) {
241 first = scan;
242 while (OP(regnext(scan)) >= CLOSE)
243 scan = regnext(scan);
244 if (curback - back == len) {
245 str_ncat(longest, OPERAND(first)+1,
246 *OPERAND(first));
247 len += *OPERAND(first);
248 curback += *OPERAND(first);
249 first = regnext(scan);
250 }
251 else if (*OPERAND(first) >= len + (curback >= 0)) {
252 len = *OPERAND(first);
253 str_nset(longest, OPERAND(first)+1,len);
254 back = curback;
255 curback += len;
256 first = regnext(scan);
257 }
258 else
259 curback += *OPERAND(first);
260 }
261 else if (index(varies,OP(scan)))
262 curback = -30000;
263 else if (index(simple,OP(scan)))
264 curback++;
265 scan = regnext(scan);
266 }
267 if (len) {
268 r->regmust = longest;
269 if (back < 0)
270 back = -1;
271 r->regback = back;
272 if (len > !(sawstudy||fold||OP(first)==EOL))
273 fbmcompile(r->regmust,fold);
274 r->regmust->str_u.str_useful = 100;
275 if (OP(first) == EOL) /* is match anchored to EOL? */
276 r->regmust->str_pok |= SP_TAIL;
277 }
278 else
279 str_free(longest);
280 }
281
282 r->do_folding = fold;
283 r->nparens = regnpar - 1;
284#ifdef DEBUGGING
285 if (debug & 512)
286 regdump(r);
287#endif
288 return(r);
289}
290
291/*
292 - reg - regular expression, i.e. main body or parenthesized thing
293 *
294 * Caller must absorb opening parenthesis.
295 *
296 * Combining parenthesis handling with the base level of regular expression
297 * is a trifle forced, but the need to tie the tails of the branches to what
298 * follows makes it hard to avoid.
299 */
300static char *
301reg(paren, flagp)
302int paren; /* Parenthesized? */
303int *flagp;
304{
305 register char *ret;
306 register char *br;
307 register char *ender;
308 register int parno;
309 int flags;
310
311 *flagp = HASWIDTH; /* Tentatively. */
312
313 /* Make an OPEN node, if parenthesized. */
314 if (paren) {
315 if (regnpar >= NSUBEXP)
316 FAIL("too many () in regexp");
317 parno = regnpar;
318 regnpar++;
319 ret = regnode(OPEN+parno);
320 } else
321 ret = NULL;
322
323 /* Pick up the branches, linking them together. */
324 br = regbranch(&flags);
325 if (br == NULL)
326 return(NULL);
327 if (ret != NULL)
328 regtail(ret, br); /* OPEN -> first. */
329 else
330 ret = br;
331 if (!(flags&HASWIDTH))
332 *flagp &= ~HASWIDTH;
333 *flagp |= flags&SPSTART;
334 while (*regparse == '|') {
335 regparse++;
336 br = regbranch(&flags);
337 if (br == NULL)
338 return(NULL);
339 regtail(ret, br); /* BRANCH -> BRANCH. */
340 if (!(flags&HASWIDTH))
341 *flagp &= ~HASWIDTH;
342 *flagp |= flags&SPSTART;
343 }
344
345 /* Make a closing node, and hook it on the end. */
346 ender = regnode((paren) ? CLOSE+parno : END);
347 regtail(ret, ender);
348
349 /* Hook the tails of the branches to the closing node. */
350 for (br = ret; br != NULL; br = regnext(br))
351 regoptail(br, ender);
352
353 /* Check for proper termination. */
354 if (paren && *regparse++ != ')') {
355 FAIL("unmatched () in regexp");
356 } else if (!paren && regparse < regxend) {
357 if (*regparse == ')') {
358 FAIL("unmatched () in regexp");
359 } else
360 FAIL("junk on end of regexp"); /* "Can't happen". */
361 /* NOTREACHED */
362 }
363
364 return(ret);
365}
366
367/*
368 - regbranch - one alternative of an | operator
369 *
370 * Implements the concatenation operator.
371 */
372static char *
373regbranch(flagp)
374int *flagp;
375{
376 register char *ret;
377 register char *chain;
378 register char *latest;
379 int flags;
380
381 *flagp = WORST; /* Tentatively. */
382
383 ret = regnode(BRANCH);
384 chain = NULL;
385 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
386 latest = regpiece(&flags);
387 if (latest == NULL)
388 return(NULL);
389 *flagp |= flags&HASWIDTH;
390 if (chain == NULL) /* First piece. */
391 *flagp |= flags&SPSTART;
392 else
393 regtail(chain, latest);
394 chain = latest;
395 }
396 if (chain == NULL) /* Loop ran zero times. */
397 (void) regnode(NOTHING);
398
399 return(ret);
400}
401
402/*
403 - regpiece - something followed by possible [*+?]
404 *
405 * Note that the branching code sequences used for ? and the general cases
406 * of * and + are somewhat optimized: they use the same NOTHING node as
407 * both the endmarker for their branch list and the body of the last branch.
408 * It might seem that this node could be dispensed with entirely, but the
409 * endmarker role is not redundant.
410 */
411static char *
412regpiece(flagp)
413int *flagp;
414{
415 register char *ret;
416 register char op;
417 register char *next;
418 int flags;
419 char *origparse = regparse;
420 int orignpar = regnpar;
421 char *max;
422 int iter;
423 char ch;
424
425 ret = regatom(&flags);
426 if (ret == NULL)
427 return(NULL);
428
429 op = *regparse;
430
431 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
432 * then we decrement the first number by one and reset our
433 * parsing back to the beginning of the same atom. If the first number
434 * is down to 0, decrement the second number instead and fake up
435 * a ? after it. Given the way this compiler doesn't keep track
436 * of offsets on the first pass, this is the only way to replicate
437 * a piece of code. Sigh.
438 */
439 if (op == '{' && regcurly(regparse)) {
440 next = regparse + 1;
441 max = Nullch;
442 while (isdigit(*next) || *next == ',') {
443 if (*next == ',') {
444 if (max)
445 break;
446 else
447 max = next;
448 }
449 next++;
450 }
451 if (*next == '}') { /* got one */
452 regsawbracket++; /* remember we clobbered exp */
453 if (!max)
454 max = next;
455 regparse++;
456 iter = atoi(regparse);
457 if (iter > 0) {
458 ch = *max;
459 sprintf(regparse,"%.*d", max-regparse, iter - 1);
460 *max = ch;
461 if (*max == ',' && atoi(max+1) > 0) {
462 ch = *next;
463 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
464 *next = ch;
465 }
466 if (iter != 1 || (*max == ',' || atoi(max+1))) {
467 regparse = origparse; /* back up input pointer */
468 regnpar = orignpar; /* don't make more parens */
469 }
470 else {
471 regparse = next;
472 goto nest_check;
473 }
474 *flagp = flags;
475 return ret;
476 }
477 if (*max == ',') {
478 max++;
479 iter = atoi(max);
480 if (max == next) { /* any number more? */
481 regparse = next;
482 op = '*'; /* fake up one with a star */
483 }
484 else if (iter > 0) {
485 op = '?'; /* fake up optional atom */
486 ch = *next;
487 sprintf(max,"%.*d", next-max, iter - 1);
488 *next = ch;
489 if (iter == 1)
490 regparse = next;
491 else {
492 regparse = origparse - 1; /* offset ++ below */
493 regnpar = orignpar;
494 }
495 }
496 else
497 fatal("Can't do {n,0}");
498 }
499 else
500 fatal("Can't do {0}");
501 }
502 }
503
504 if (!ISMULT1(op)) {
505 *flagp = flags;
506 return(ret);
507 }
508
509 if (!(flags&HASWIDTH) && op != '?')
510 FAIL("regexp *+ operand could be empty");
511 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
512
513 if (op == '*' && (flags&SIMPLE))
514 reginsert(STAR, ret);
515 else if (op == '*') {
516 /* Emit x* as (x&|), where & means "self". */
517 reginsert(BRANCH, ret); /* Either x */
518 regoptail(ret, regnode(BACK)); /* and loop */
519 regoptail(ret, ret); /* back */
520 regtail(ret, regnode(BRANCH)); /* or */
521 regtail(ret, regnode(NOTHING)); /* null. */
522 } else if (op == '+' && (flags&SIMPLE))
523 reginsert(PLUS, ret);
524 else if (op == '+') {
525 /* Emit x+ as x(&|), where & means "self". */
526 next = regnode(BRANCH); /* Either */
527 regtail(ret, next);
528 regtail(regnode(BACK), ret); /* loop back */
529 regtail(next, regnode(BRANCH)); /* or */
530 regtail(ret, regnode(NOTHING)); /* null. */
531 } else if (op == '?') {
532 /* Emit x? as (x|) */
533 reginsert(BRANCH, ret); /* Either x */
534 regtail(ret, regnode(BRANCH)); /* or */
535 next = regnode(NOTHING); /* null. */
536 regtail(ret, next);
537 regoptail(ret, next);
538 }
539 nest_check:
540 regparse++;
541 if (ISMULT2(regparse))
542 FAIL("nested *?+ in regexp");
543
544 return(ret);
545}
546
547/*
548 - regatom - the lowest level
549 *
550 * Optimization: gobbles an entire sequence of ordinary characters so that
551 * it can turn them into a single node, which is smaller to store and
552 * faster to run. Backslashed characters are exceptions, each becoming a
553 * separate node; the code is simpler that way and it's not worth fixing.
554 *
555 * [Yes, it is worth fixing, some scripts can run twice the speed.]
556 */
557static char *
558regatom(flagp)
559int *flagp;
560{
561 register char *ret;
562 int flags;
563
564 *flagp = WORST; /* Tentatively. */
565
566 switch (*regparse++) {
567 case '^':
568 ret = regnode(BOL);
569 break;
570 case '$':
571 ret = regnode(EOL);
572 break;
573 case '.':
574 ret = regnode(ANY);
575 *flagp |= HASWIDTH|SIMPLE;
576 break;
577 case '[':
578 ret = regclass();
579 *flagp |= HASWIDTH|SIMPLE;
580 break;
581 case '(':
582 ret = reg(1, &flags);
583 if (ret == NULL)
584 return(NULL);
585 *flagp |= flags&(HASWIDTH|SPSTART);
586 break;
587 case '|':
588 case ')':
589 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
590 break;
591 case '?':
592 case '+':
593 case '*':
594 FAIL("?+* follows nothing in regexp");
595 break;
596 case '\\':
597 switch (*regparse) {
598 case 'w':
599 ret = regnode(ALNUM);
600 *flagp |= HASWIDTH|SIMPLE;
601 regparse++;
602 break;
603 case 'W':
604 ret = regnode(NALNUM);
605 *flagp |= HASWIDTH|SIMPLE;
606 regparse++;
607 break;
608 case 'b':
609 ret = regnode(BOUND);
610 *flagp |= SIMPLE;
611 regparse++;
612 break;
613 case 'B':
614 ret = regnode(NBOUND);
615 *flagp |= SIMPLE;
616 regparse++;
617 break;
618 case 's':
619 ret = regnode(SPACE);
620 *flagp |= HASWIDTH|SIMPLE;
621 regparse++;
622 break;
623 case 'S':
624 ret = regnode(NSPACE);
625 *flagp |= HASWIDTH|SIMPLE;
626 regparse++;
627 break;
628 case 'd':
629 ret = regnode(DIGIT);
630 *flagp |= HASWIDTH|SIMPLE;
631 regparse++;
632 break;
633 case 'D':
634 ret = regnode(NDIGIT);
635 *flagp |= HASWIDTH|SIMPLE;
636 regparse++;
637 break;
638 case 'n':
639 case 'r':
640 case 't':
641 case 'f':
642 goto defchar;
643 case '0': case '1': case '2': case '3': case '4':
644 case '5': case '6': case '7': case '8': case '9':
79a0689e 645 if (isdigit(regparse[1]) || *regparse == '0')
a687059c 646 goto defchar;
647 else {
648 ret = regnode(REF + *regparse++ - '0');
649 *flagp |= SIMPLE;
650 }
651 break;
652 case '\0':
653 if (regparse >= regxend)
654 FAIL("trailing \\ in regexp");
655 /* FALL THROUGH */
656 default:
657 goto defchar;
658 }
659 break;
660 default: {
661 register int len;
662 register char ender;
663 register char *p;
664 char *oldp;
665 int foo;
666
667 defchar:
668 ret = regnode(EXACTLY);
669 regc(0); /* save spot for len */
670 for (len=0, p=regparse-1;
671 len < 127 && p < regxend;
672 len++)
673 {
674 oldp = p;
675 switch (*p) {
676 case '^':
677 case '$':
678 case '.':
679 case '[':
680 case '(':
681 case ')':
682 case '|':
683 goto loopdone;
684 case '\\':
685 switch (*++p) {
686 case 'w':
687 case 'W':
688 case 'b':
689 case 'B':
690 case 's':
691 case 'S':
692 case 'd':
693 case 'D':
694 --p;
695 goto loopdone;
696 case 'n':
697 ender = '\n';
698 p++;
699 break;
700 case 'r':
701 ender = '\r';
702 p++;
703 break;
704 case 't':
705 ender = '\t';
706 p++;
707 break;
708 case 'f':
709 ender = '\f';
710 p++;
711 break;
712 case '0': case '1': case '2': case '3':case '4':
713 case '5': case '6': case '7': case '8':case '9':
79a0689e 714 if (isdigit(p[1]) || *p == '0') {
715 foo = *p - '0';
716 if (isdigit(p[1]))
717 foo = (foo<<3) + *++p - '0';
a687059c 718 if (isdigit(p[1]))
719 foo = (foo<<3) + *++p - '0';
720 ender = foo;
721 p++;
722 }
723 else {
724 --p;
725 goto loopdone;
726 }
727 break;
728 case '\0':
729 if (p >= regxend)
730 FAIL("trailing \\ in regexp");
731 /* FALL THROUGH */
732 default:
733 ender = *p++;
734 break;
735 }
736 break;
737 default:
738 ender = *p++;
739 break;
740 }
741 if (regfold && isupper(ender))
742 ender = tolower(ender);
743 if (ISMULT2(p)) { /* Back off on ?+*. */
744 if (len)
745 p = oldp;
746 else {
747 len++;
748 regc(ender);
749 }
750 break;
751 }
752 regc(ender);
753 }
754 loopdone:
755 regparse = p;
756 if (len <= 0)
757 FAIL("internal disaster in regexp");
758 *flagp |= HASWIDTH;
759 if (len == 1)
760 *flagp |= SIMPLE;
761 if (regcode != &regdummy)
762 *OPERAND(ret) = len;
763 regc('\0');
764 }
765 break;
766 }
767
768 return(ret);
769}
770
771static void
772regset(bits,def,c)
773char *bits;
774int def;
775register int c;
776{
777 if (regcode == &regdummy)
778 return;
ac58e20f 779 c &= 255;
a687059c 780 if (def)
781 bits[c >> 3] &= ~(1 << (c & 7));
782 else
783 bits[c >> 3] |= (1 << (c & 7));
784}
785
786static char *
787regclass()
788{
789 register char *bits;
790 register int class;
791 register int lastclass;
792 register int range = 0;
793 register char *ret;
794 register int def;
795
796 if (*regparse == '^') { /* Complement of range. */
797 ret = regnode(ANYBUT);
798 regparse++;
799 def = 0;
800 } else {
801 ret = regnode(ANYOF);
802 def = 255;
803 }
804 bits = regcode;
805 for (class = 0; class < 32; class++)
806 regc(def);
807 if (*regparse == ']' || *regparse == '-')
808 regset(bits,def,lastclass = *regparse++);
809 while (regparse < regxend && *regparse != ']') {
810 class = UCHARAT(regparse++);
811 if (class == '\\') {
812 class = UCHARAT(regparse++);
813 switch (class) {
814 case 'w':
815 for (class = 'a'; class <= 'z'; class++)
816 regset(bits,def,class);
817 for (class = 'A'; class <= 'Z'; class++)
818 regset(bits,def,class);
819 for (class = '0'; class <= '9'; class++)
820 regset(bits,def,class);
821 regset(bits,def,'_');
822 lastclass = 1234;
823 continue;
824 case 's':
825 regset(bits,def,' ');
826 regset(bits,def,'\t');
827 regset(bits,def,'\r');
828 regset(bits,def,'\f');
829 regset(bits,def,'\n');
830 lastclass = 1234;
831 continue;
832 case 'd':
833 for (class = '0'; class <= '9'; class++)
834 regset(bits,def,class);
835 lastclass = 1234;
836 continue;
837 case 'n':
838 class = '\n';
839 break;
840 case 'r':
841 class = '\r';
842 break;
843 case 't':
844 class = '\t';
845 break;
846 case 'f':
847 class = '\f';
848 break;
849 case 'b':
850 class = '\b';
851 break;
852 case '0': case '1': case '2': case '3': case '4':
853 case '5': case '6': case '7': case '8': case '9':
854 class -= '0';
855 if (isdigit(*regparse)) {
856 class <<= 3;
857 class += *regparse++ - '0';
858 }
859 if (isdigit(*regparse)) {
860 class <<= 3;
861 class += *regparse++ - '0';
862 }
863 break;
864 }
865 }
866 if (!range && class == '-' && regparse < regxend &&
867 *regparse != ']') {
868 range = 1;
869 continue;
870 }
871 if (range) {
872 if (lastclass > class)
873 FAIL("invalid [] range in regexp");
874 }
875 else
876 lastclass = class - 1;
877 range = 0;
878 for (lastclass++; lastclass <= class; lastclass++) {
879 regset(bits,def,lastclass);
880 if (regfold && isupper(lastclass))
881 regset(bits,def,tolower(lastclass));
882 }
883 lastclass = class;
884 }
885 if (*regparse != ']')
886 FAIL("unmatched [] in regexp");
a687059c 887 regparse++;
888 return ret;
889}
890
891/*
892 - regnode - emit a node
893 */
894static char * /* Location. */
895regnode(op)
896char op;
897{
898 register char *ret;
899 register char *ptr;
900
901 ret = regcode;
902 if (ret == &regdummy) {
903#ifdef REGALIGN
904 if (!(regsize & 1))
905 regsize++;
906#endif
907 regsize += 3;
908 return(ret);
909 }
910
911#ifdef REGALIGN
912#ifndef lint
913 if (!((long)ret & 1))
914 *ret++ = 127;
915#endif
916#endif
917 ptr = ret;
918 *ptr++ = op;
919 *ptr++ = '\0'; /* Null "next" pointer. */
920 *ptr++ = '\0';
921 regcode = ptr;
922
923 return(ret);
924}
925
926/*
927 - regc - emit (if appropriate) a byte of code
928 */
929static void
930regc(b)
931char b;
932{
933 if (regcode != &regdummy)
934 *regcode++ = b;
935 else
936 regsize++;
937}
938
939/*
940 - reginsert - insert an operator in front of already-emitted operand
941 *
942 * Means relocating the operand.
943 */
944static void
945reginsert(op, opnd)
946char op;
947char *opnd;
948{
949 register char *src;
950 register char *dst;
951 register char *place;
952
953 if (regcode == &regdummy) {
954#ifdef REGALIGN
955 regsize += 4;
956#else
957 regsize += 3;
958#endif
959 return;
960 }
961
962 src = regcode;
963#ifdef REGALIGN
964 regcode += 4;
965#else
966 regcode += 3;
967#endif
968 dst = regcode;
969 while (src > opnd)
970 *--dst = *--src;
971
972 place = opnd; /* Op node, where operand used to be. */
973 *place++ = op;
974 *place++ = '\0';
975 *place++ = '\0';
976}
977
978/*
979 - regtail - set the next-pointer at the end of a node chain
980 */
981static void
982regtail(p, val)
983char *p;
984char *val;
985{
986 register char *scan;
987 register char *temp;
988 register int offset;
989
990 if (p == &regdummy)
991 return;
992
993 /* Find last node. */
994 scan = p;
995 for (;;) {
996 temp = regnext(scan);
997 if (temp == NULL)
998 break;
999 scan = temp;
1000 }
1001
1002#ifdef REGALIGN
1003 offset = val - scan;
1004#ifndef lint
1005 *(short*)(scan+1) = offset;
1006#else
1007 offset = offset;
1008#endif
1009#else
1010 if (OP(scan) == BACK)
1011 offset = scan - val;
1012 else
1013 offset = val - scan;
1014 *(scan+1) = (offset>>8)&0377;
1015 *(scan+2) = offset&0377;
1016#endif
1017}
1018
1019/*
1020 - regoptail - regtail on operand of first argument; nop if operandless
1021 */
1022static void
1023regoptail(p, val)
1024char *p;
1025char *val;
1026{
1027 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1028 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1029 return;
1030 regtail(NEXTOPER(p), val);
1031}
1032
1033/*
1034 - regcurly - a little FSA that accepts {\d+,?\d*}
1035 */
1036STATIC int
1037regcurly(s)
1038register char *s;
1039{
1040 if (*s++ != '{')
1041 return FALSE;
1042 if (!isdigit(*s))
1043 return FALSE;
1044 while (isdigit(*s))
1045 s++;
1046 if (*s == ',')
1047 s++;
1048 while (isdigit(*s))
1049 s++;
1050 if (*s != '}')
1051 return FALSE;
1052 return TRUE;
1053}
1054
1055#ifdef DEBUGGING
1056
1057/*
1058 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1059 */
1060void
1061regdump(r)
1062regexp *r;
1063{
1064 register char *s;
1065 register char op = EXACTLY; /* Arbitrary non-END op. */
1066 register char *next;
1067 extern char *index();
1068
1069
1070 s = r->program + 1;
1071 while (op != END) { /* While that wasn't END last time... */
1072#ifdef REGALIGN
1073 if (!((long)s & 1))
1074 s++;
1075#endif
1076 op = OP(s);
1077 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1078 next = regnext(s);
1079 if (next == NULL) /* Next ptr. */
1080 fprintf(stderr,"(0)");
1081 else
1082 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1083 s += 3;
1084 if (op == ANYOF || op == ANYBUT) {
1085 s += 32;
1086 }
1087 if (op == EXACTLY) {
1088 /* Literal string, where present. */
1089 s++;
1090 while (*s != '\0') {
1091 (void)putchar(*s);
1092 s++;
1093 }
1094 s++;
1095 }
1096 (void)putchar('\n');
1097 }
1098
1099 /* Header fields of interest. */
1100 if (r->regstart)
1101 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1102 if (r->regstclass)
1103 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1104 if (r->reganch)
1105 fprintf(stderr,"anchored ");
1106 if (r->regmust != NULL)
1107 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1108 r->regback);
1109 fprintf(stderr,"\n");
1110}
1111
1112/*
1113 - regprop - printable representation of opcode
1114 */
1115char *
1116regprop(op)
1117char *op;
1118{
1119 register char *p;
1120
1121 (void) strcpy(buf, ":");
1122
1123 switch (OP(op)) {
1124 case BOL:
1125 p = "BOL";
1126 break;
1127 case EOL:
1128 p = "EOL";
1129 break;
1130 case ANY:
1131 p = "ANY";
1132 break;
1133 case ANYOF:
1134 p = "ANYOF";
1135 break;
1136 case ANYBUT:
1137 p = "ANYBUT";
1138 break;
1139 case BRANCH:
1140 p = "BRANCH";
1141 break;
1142 case EXACTLY:
1143 p = "EXACTLY";
1144 break;
1145 case NOTHING:
1146 p = "NOTHING";
1147 break;
1148 case BACK:
1149 p = "BACK";
1150 break;
1151 case END:
1152 p = "END";
1153 break;
1154 case ALNUM:
1155 p = "ALNUM";
1156 break;
1157 case NALNUM:
1158 p = "NALNUM";
1159 break;
1160 case BOUND:
1161 p = "BOUND";
1162 break;
1163 case NBOUND:
1164 p = "NBOUND";
1165 break;
1166 case SPACE:
1167 p = "SPACE";
1168 break;
1169 case NSPACE:
1170 p = "NSPACE";
1171 break;
1172 case DIGIT:
1173 p = "DIGIT";
1174 break;
1175 case NDIGIT:
1176 p = "NDIGIT";
1177 break;
1178 case REF:
1179 case REF+1:
1180 case REF+2:
1181 case REF+3:
1182 case REF+4:
1183 case REF+5:
1184 case REF+6:
1185 case REF+7:
1186 case REF+8:
1187 case REF+9:
1188 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1189 p = NULL;
1190 break;
1191 case OPEN+1:
1192 case OPEN+2:
1193 case OPEN+3:
1194 case OPEN+4:
1195 case OPEN+5:
1196 case OPEN+6:
1197 case OPEN+7:
1198 case OPEN+8:
1199 case OPEN+9:
1200 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1201 p = NULL;
1202 break;
1203 case CLOSE+1:
1204 case CLOSE+2:
1205 case CLOSE+3:
1206 case CLOSE+4:
1207 case CLOSE+5:
1208 case CLOSE+6:
1209 case CLOSE+7:
1210 case CLOSE+8:
1211 case CLOSE+9:
1212 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1213 p = NULL;
1214 break;
1215 case STAR:
1216 p = "STAR";
1217 break;
1218 case PLUS:
1219 p = "PLUS";
1220 break;
1221 default:
1222 FAIL("corrupted regexp opcode");
1223 }
1224 if (p != NULL)
1225 (void) strcat(buf, p);
1226 return(buf);
1227}
1228#endif /* DEBUGGING */
1229
1230regfree(r)
1231struct regexp *r;
1232{
1233 if (r->precomp)
1234 Safefree(r->precomp);
1235 if (r->subbase)
1236 Safefree(r->subbase);
1237 if (r->regmust)
1238 str_free(r->regmust);
1239 if (r->regstart)
1240 str_free(r->regstart);
1241 Safefree(r);
1242}