perl 3.0 patch #13 (combined patch)
[p5sagit/p5-mst-13.2.git] / regcomp.c
CommitLineData
a687059c 1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
ac58e20f 10/* $Header: regcomp.c,v 3.0.1.2 90/02/28 18:08:35 lwall Locked $
a687059c 11 *
12 * $Log: regcomp.c,v $
ac58e20f 13 * Revision 3.0.1.2 90/02/28 18:08:35 lwall
14 * patch9: /[\200-\377]/ didn't work on machines with signed chars
15 *
ae986130 16 * Revision 3.0.1.1 89/11/11 04:51:04 lwall
17 * patch2: /[\000]/ didn't work
18 *
a687059c 19 * Revision 3.0 89/10/18 15:22:29 lwall
20 * 3.0 baseline
21 *
22 */
23
24/*
25 * regcomp and regexec -- regsub and regerror are not used in perl
26 *
27 * Copyright (c) 1986 by University of Toronto.
28 * Written by Henry Spencer. Not derived from licensed software.
29 *
30 * Permission is granted to anyone to use this software for any
31 * purpose on any computer system, and to redistribute it freely,
32 * subject to the following restrictions:
33 *
34 * 1. The author is not responsible for the consequences of use of
35 * this software, no matter how awful, even if they arise
36 * from defects in it.
37 *
38 * 2. The origin of this software must not be misrepresented, either
39 * by explicit claim or by omission.
40 *
41 * 3. Altered versions must be plainly marked as such, and must not
42 * be misrepresented as being the original software.
43 *
44 *
45 **** Alterations to Henry's code are...
46 ****
47 **** Copyright (c) 1989, Larry Wall
48 ****
49 **** You may distribute under the terms of the GNU General Public License
50 **** as specified in the README file that comes with the perl 3.0 kit.
51 *
52 * Beware that some of this code is subtly aware of the way operator
53 * precedence is structured in regular expressions. Serious changes in
54 * regular-expression syntax might require a total rethink.
55 */
56#include "EXTERN.h"
57#include "perl.h"
58#include "INTERN.h"
59#include "regcomp.h"
60
61#ifndef STATIC
62#define STATIC static
63#endif
64
65#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
66#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
67 ((*s) == '{' && regcurly(s)))
68#define META "^$.[()|?+*\\"
69
70/*
71 * Flags to be passed up and down.
72 */
73#define HASWIDTH 01 /* Known never to match null string. */
74#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
75#define SPSTART 04 /* Starts with * or +. */
76#define WORST 0 /* Worst case. */
77
78/*
79 * Global work variables for regcomp().
80 */
81static char *regprecomp; /* uncompiled string. */
82static char *regparse; /* Input-scan pointer. */
83static char *regxend; /* End of input for compile */
84static int regnpar; /* () count. */
85static char *regcode; /* Code-emit pointer; &regdummy = don't. */
86static long regsize; /* Code size. */
87static int regfold;
88static int regsawbracket; /* Did we do {d,d} trick? */
89
90/*
91 * Forward declarations for regcomp()'s friends.
92 */
93STATIC int regcurly();
94STATIC char *reg();
95STATIC char *regbranch();
96STATIC char *regpiece();
97STATIC char *regatom();
98STATIC char *regclass();
99STATIC char *regnode();
100STATIC void regc();
101STATIC void reginsert();
102STATIC void regtail();
103STATIC void regoptail();
104
105/*
106 - regcomp - compile a regular expression into internal code
107 *
108 * We can't allocate space until we know how big the compiled form will be,
109 * but we can't compile it (and thus know how big it is) until we've got a
110 * place to put the code. So we cheat: we compile it twice, once with code
111 * generation turned off and size counting turned on, and once "for real".
112 * This also means that we don't allocate space until we are sure that the
113 * thing really will compile successfully, and we never have to move the
114 * code and thus invalidate pointers into it. (Note that it has to be in
115 * one piece because free() must be able to free it all.) [NB: not true in perl]
116 *
117 * Beware that the optimization-preparation code in here knows about some
118 * of the structure of the compiled regexp. [I'll say.]
119 */
120regexp *
121regcomp(exp,xend,fold,rare)
122char *exp;
123char *xend;
124int fold;
125int rare;
126{
127 register regexp *r;
128 register char *scan;
129 register STR *longest;
130 register int len;
131 register char *first;
132 int flags;
133 int back;
134 int curback;
135 extern char *safemalloc();
136 extern char *savestr();
137
138 if (exp == NULL)
139 fatal("NULL regexp argument");
140
141 /* First pass: determine size, legality. */
142 regfold = fold;
143 regparse = exp;
144 regxend = xend;
145 regprecomp = nsavestr(exp,xend-exp);
146 regsawbracket = 0;
147 regnpar = 1;
148 regsize = 0L;
149 regcode = &regdummy;
150 regc(MAGIC);
151 if (reg(0, &flags) == NULL) {
152 Safefree(regprecomp);
153 return(NULL);
154 }
155
156 /* Small enough for pointer-storage convention? */
157 if (regsize >= 32767L) /* Probably could be 65535L. */
158 FAIL("regexp too big");
159
160 /* Allocate space. */
161 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
162 if (r == NULL)
163 FAIL("regexp out of space");
164
165 /* Second pass: emit code. */
166 if (regsawbracket)
167 bcopy(regprecomp,exp,xend-exp);
168 r->precomp = regprecomp;
169 r->subbase = NULL;
170 regparse = exp;
171 regnpar = 1;
172 regcode = r->program;
173 regc(MAGIC);
174 if (reg(0, &flags) == NULL)
175 return(NULL);
176
177 /* Dig out information for optimizations. */
178 r->regstart = Nullstr; /* Worst-case defaults. */
179 r->reganch = 0;
180 r->regmust = Nullstr;
181 r->regback = -1;
182 r->regstclass = Nullch;
183 scan = r->program+1; /* First BRANCH. */
184 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
185 scan = NEXTOPER(scan);
186
187 first = scan;
188 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
189 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
190 (OP(first) == PLUS) )
191 first = NEXTOPER(first);
192
193 /* Starting-point info. */
194 if (OP(first) == EXACTLY) {
195 r->regstart =
196 str_make(OPERAND(first)+1,*OPERAND(first));
197 if (r->regstart->str_cur > !(sawstudy|fold))
198 fbmcompile(r->regstart,fold);
199 }
200 else if ((exp = index(simple,OP(first))) && exp > simple)
201 r->regstclass = first;
202 else if (OP(first) == BOUND || OP(first) == NBOUND)
203 r->regstclass = first;
204 else if (OP(first) == BOL)
205 r->reganch++;
206
207#ifdef DEBUGGING
208 if (debug & 512)
209 fprintf(stderr,"first %d next %d offset %d\n",
210 OP(first), OP(NEXTOPER(first)), first - scan);
211#endif
212 /*
213 * If there's something expensive in the r.e., find the
214 * longest literal string that must appear and make it the
215 * regmust. Resolve ties in favor of later strings, since
216 * the regstart check works with the beginning of the r.e.
217 * and avoiding duplication strengthens checking. Not a
218 * strong reason, but sufficient in the absence of others.
219 * [Now we resolve ties in favor of the earlier string if
220 * it happens that curback has been invalidated, since the
221 * earlier string may buy us something the later one won't.]
222 */
223 longest = str_make("",0);
224 len = 0;
225 curback = 0;
226 back = 0;
227 while (scan != NULL) {
228 if (OP(scan) == BRANCH) {
229 if (OP(regnext(scan)) == BRANCH) {
230 curback = -30000;
231 while (OP(scan) == BRANCH)
232 scan = regnext(scan);
233 }
234 else /* single branch is ok */
235 scan = NEXTOPER(scan);
236 }
237 if (OP(scan) == EXACTLY) {
238 first = scan;
239 while (OP(regnext(scan)) >= CLOSE)
240 scan = regnext(scan);
241 if (curback - back == len) {
242 str_ncat(longest, OPERAND(first)+1,
243 *OPERAND(first));
244 len += *OPERAND(first);
245 curback += *OPERAND(first);
246 first = regnext(scan);
247 }
248 else if (*OPERAND(first) >= len + (curback >= 0)) {
249 len = *OPERAND(first);
250 str_nset(longest, OPERAND(first)+1,len);
251 back = curback;
252 curback += len;
253 first = regnext(scan);
254 }
255 else
256 curback += *OPERAND(first);
257 }
258 else if (index(varies,OP(scan)))
259 curback = -30000;
260 else if (index(simple,OP(scan)))
261 curback++;
262 scan = regnext(scan);
263 }
264 if (len) {
265 r->regmust = longest;
266 if (back < 0)
267 back = -1;
268 r->regback = back;
269 if (len > !(sawstudy||fold||OP(first)==EOL))
270 fbmcompile(r->regmust,fold);
271 r->regmust->str_u.str_useful = 100;
272 if (OP(first) == EOL) /* is match anchored to EOL? */
273 r->regmust->str_pok |= SP_TAIL;
274 }
275 else
276 str_free(longest);
277 }
278
279 r->do_folding = fold;
280 r->nparens = regnpar - 1;
281#ifdef DEBUGGING
282 if (debug & 512)
283 regdump(r);
284#endif
285 return(r);
286}
287
288/*
289 - reg - regular expression, i.e. main body or parenthesized thing
290 *
291 * Caller must absorb opening parenthesis.
292 *
293 * Combining parenthesis handling with the base level of regular expression
294 * is a trifle forced, but the need to tie the tails of the branches to what
295 * follows makes it hard to avoid.
296 */
297static char *
298reg(paren, flagp)
299int paren; /* Parenthesized? */
300int *flagp;
301{
302 register char *ret;
303 register char *br;
304 register char *ender;
305 register int parno;
306 int flags;
307
308 *flagp = HASWIDTH; /* Tentatively. */
309
310 /* Make an OPEN node, if parenthesized. */
311 if (paren) {
312 if (regnpar >= NSUBEXP)
313 FAIL("too many () in regexp");
314 parno = regnpar;
315 regnpar++;
316 ret = regnode(OPEN+parno);
317 } else
318 ret = NULL;
319
320 /* Pick up the branches, linking them together. */
321 br = regbranch(&flags);
322 if (br == NULL)
323 return(NULL);
324 if (ret != NULL)
325 regtail(ret, br); /* OPEN -> first. */
326 else
327 ret = br;
328 if (!(flags&HASWIDTH))
329 *flagp &= ~HASWIDTH;
330 *flagp |= flags&SPSTART;
331 while (*regparse == '|') {
332 regparse++;
333 br = regbranch(&flags);
334 if (br == NULL)
335 return(NULL);
336 regtail(ret, br); /* BRANCH -> BRANCH. */
337 if (!(flags&HASWIDTH))
338 *flagp &= ~HASWIDTH;
339 *flagp |= flags&SPSTART;
340 }
341
342 /* Make a closing node, and hook it on the end. */
343 ender = regnode((paren) ? CLOSE+parno : END);
344 regtail(ret, ender);
345
346 /* Hook the tails of the branches to the closing node. */
347 for (br = ret; br != NULL; br = regnext(br))
348 regoptail(br, ender);
349
350 /* Check for proper termination. */
351 if (paren && *regparse++ != ')') {
352 FAIL("unmatched () in regexp");
353 } else if (!paren && regparse < regxend) {
354 if (*regparse == ')') {
355 FAIL("unmatched () in regexp");
356 } else
357 FAIL("junk on end of regexp"); /* "Can't happen". */
358 /* NOTREACHED */
359 }
360
361 return(ret);
362}
363
364/*
365 - regbranch - one alternative of an | operator
366 *
367 * Implements the concatenation operator.
368 */
369static char *
370regbranch(flagp)
371int *flagp;
372{
373 register char *ret;
374 register char *chain;
375 register char *latest;
376 int flags;
377
378 *flagp = WORST; /* Tentatively. */
379
380 ret = regnode(BRANCH);
381 chain = NULL;
382 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
383 latest = regpiece(&flags);
384 if (latest == NULL)
385 return(NULL);
386 *flagp |= flags&HASWIDTH;
387 if (chain == NULL) /* First piece. */
388 *flagp |= flags&SPSTART;
389 else
390 regtail(chain, latest);
391 chain = latest;
392 }
393 if (chain == NULL) /* Loop ran zero times. */
394 (void) regnode(NOTHING);
395
396 return(ret);
397}
398
399/*
400 - regpiece - something followed by possible [*+?]
401 *
402 * Note that the branching code sequences used for ? and the general cases
403 * of * and + are somewhat optimized: they use the same NOTHING node as
404 * both the endmarker for their branch list and the body of the last branch.
405 * It might seem that this node could be dispensed with entirely, but the
406 * endmarker role is not redundant.
407 */
408static char *
409regpiece(flagp)
410int *flagp;
411{
412 register char *ret;
413 register char op;
414 register char *next;
415 int flags;
416 char *origparse = regparse;
417 int orignpar = regnpar;
418 char *max;
419 int iter;
420 char ch;
421
422 ret = regatom(&flags);
423 if (ret == NULL)
424 return(NULL);
425
426 op = *regparse;
427
428 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
429 * then we decrement the first number by one and reset our
430 * parsing back to the beginning of the same atom. If the first number
431 * is down to 0, decrement the second number instead and fake up
432 * a ? after it. Given the way this compiler doesn't keep track
433 * of offsets on the first pass, this is the only way to replicate
434 * a piece of code. Sigh.
435 */
436 if (op == '{' && regcurly(regparse)) {
437 next = regparse + 1;
438 max = Nullch;
439 while (isdigit(*next) || *next == ',') {
440 if (*next == ',') {
441 if (max)
442 break;
443 else
444 max = next;
445 }
446 next++;
447 }
448 if (*next == '}') { /* got one */
449 regsawbracket++; /* remember we clobbered exp */
450 if (!max)
451 max = next;
452 regparse++;
453 iter = atoi(regparse);
454 if (iter > 0) {
455 ch = *max;
456 sprintf(regparse,"%.*d", max-regparse, iter - 1);
457 *max = ch;
458 if (*max == ',' && atoi(max+1) > 0) {
459 ch = *next;
460 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
461 *next = ch;
462 }
463 if (iter != 1 || (*max == ',' || atoi(max+1))) {
464 regparse = origparse; /* back up input pointer */
465 regnpar = orignpar; /* don't make more parens */
466 }
467 else {
468 regparse = next;
469 goto nest_check;
470 }
471 *flagp = flags;
472 return ret;
473 }
474 if (*max == ',') {
475 max++;
476 iter = atoi(max);
477 if (max == next) { /* any number more? */
478 regparse = next;
479 op = '*'; /* fake up one with a star */
480 }
481 else if (iter > 0) {
482 op = '?'; /* fake up optional atom */
483 ch = *next;
484 sprintf(max,"%.*d", next-max, iter - 1);
485 *next = ch;
486 if (iter == 1)
487 regparse = next;
488 else {
489 regparse = origparse - 1; /* offset ++ below */
490 regnpar = orignpar;
491 }
492 }
493 else
494 fatal("Can't do {n,0}");
495 }
496 else
497 fatal("Can't do {0}");
498 }
499 }
500
501 if (!ISMULT1(op)) {
502 *flagp = flags;
503 return(ret);
504 }
505
506 if (!(flags&HASWIDTH) && op != '?')
507 FAIL("regexp *+ operand could be empty");
508 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
509
510 if (op == '*' && (flags&SIMPLE))
511 reginsert(STAR, ret);
512 else if (op == '*') {
513 /* Emit x* as (x&|), where & means "self". */
514 reginsert(BRANCH, ret); /* Either x */
515 regoptail(ret, regnode(BACK)); /* and loop */
516 regoptail(ret, ret); /* back */
517 regtail(ret, regnode(BRANCH)); /* or */
518 regtail(ret, regnode(NOTHING)); /* null. */
519 } else if (op == '+' && (flags&SIMPLE))
520 reginsert(PLUS, ret);
521 else if (op == '+') {
522 /* Emit x+ as x(&|), where & means "self". */
523 next = regnode(BRANCH); /* Either */
524 regtail(ret, next);
525 regtail(regnode(BACK), ret); /* loop back */
526 regtail(next, regnode(BRANCH)); /* or */
527 regtail(ret, regnode(NOTHING)); /* null. */
528 } else if (op == '?') {
529 /* Emit x? as (x|) */
530 reginsert(BRANCH, ret); /* Either x */
531 regtail(ret, regnode(BRANCH)); /* or */
532 next = regnode(NOTHING); /* null. */
533 regtail(ret, next);
534 regoptail(ret, next);
535 }
536 nest_check:
537 regparse++;
538 if (ISMULT2(regparse))
539 FAIL("nested *?+ in regexp");
540
541 return(ret);
542}
543
544/*
545 - regatom - the lowest level
546 *
547 * Optimization: gobbles an entire sequence of ordinary characters so that
548 * it can turn them into a single node, which is smaller to store and
549 * faster to run. Backslashed characters are exceptions, each becoming a
550 * separate node; the code is simpler that way and it's not worth fixing.
551 *
552 * [Yes, it is worth fixing, some scripts can run twice the speed.]
553 */
554static char *
555regatom(flagp)
556int *flagp;
557{
558 register char *ret;
559 int flags;
560
561 *flagp = WORST; /* Tentatively. */
562
563 switch (*regparse++) {
564 case '^':
565 ret = regnode(BOL);
566 break;
567 case '$':
568 ret = regnode(EOL);
569 break;
570 case '.':
571 ret = regnode(ANY);
572 *flagp |= HASWIDTH|SIMPLE;
573 break;
574 case '[':
575 ret = regclass();
576 *flagp |= HASWIDTH|SIMPLE;
577 break;
578 case '(':
579 ret = reg(1, &flags);
580 if (ret == NULL)
581 return(NULL);
582 *flagp |= flags&(HASWIDTH|SPSTART);
583 break;
584 case '|':
585 case ')':
586 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
587 break;
588 case '?':
589 case '+':
590 case '*':
591 FAIL("?+* follows nothing in regexp");
592 break;
593 case '\\':
594 switch (*regparse) {
595 case 'w':
596 ret = regnode(ALNUM);
597 *flagp |= HASWIDTH|SIMPLE;
598 regparse++;
599 break;
600 case 'W':
601 ret = regnode(NALNUM);
602 *flagp |= HASWIDTH|SIMPLE;
603 regparse++;
604 break;
605 case 'b':
606 ret = regnode(BOUND);
607 *flagp |= SIMPLE;
608 regparse++;
609 break;
610 case 'B':
611 ret = regnode(NBOUND);
612 *flagp |= SIMPLE;
613 regparse++;
614 break;
615 case 's':
616 ret = regnode(SPACE);
617 *flagp |= HASWIDTH|SIMPLE;
618 regparse++;
619 break;
620 case 'S':
621 ret = regnode(NSPACE);
622 *flagp |= HASWIDTH|SIMPLE;
623 regparse++;
624 break;
625 case 'd':
626 ret = regnode(DIGIT);
627 *flagp |= HASWIDTH|SIMPLE;
628 regparse++;
629 break;
630 case 'D':
631 ret = regnode(NDIGIT);
632 *flagp |= HASWIDTH|SIMPLE;
633 regparse++;
634 break;
635 case 'n':
636 case 'r':
637 case 't':
638 case 'f':
639 goto defchar;
640 case '0': case '1': case '2': case '3': case '4':
641 case '5': case '6': case '7': case '8': case '9':
642 if (isdigit(regparse[1]))
643 goto defchar;
644 else {
645 ret = regnode(REF + *regparse++ - '0');
646 *flagp |= SIMPLE;
647 }
648 break;
649 case '\0':
650 if (regparse >= regxend)
651 FAIL("trailing \\ in regexp");
652 /* FALL THROUGH */
653 default:
654 goto defchar;
655 }
656 break;
657 default: {
658 register int len;
659 register char ender;
660 register char *p;
661 char *oldp;
662 int foo;
663
664 defchar:
665 ret = regnode(EXACTLY);
666 regc(0); /* save spot for len */
667 for (len=0, p=regparse-1;
668 len < 127 && p < regxend;
669 len++)
670 {
671 oldp = p;
672 switch (*p) {
673 case '^':
674 case '$':
675 case '.':
676 case '[':
677 case '(':
678 case ')':
679 case '|':
680 goto loopdone;
681 case '\\':
682 switch (*++p) {
683 case 'w':
684 case 'W':
685 case 'b':
686 case 'B':
687 case 's':
688 case 'S':
689 case 'd':
690 case 'D':
691 --p;
692 goto loopdone;
693 case 'n':
694 ender = '\n';
695 p++;
696 break;
697 case 'r':
698 ender = '\r';
699 p++;
700 break;
701 case 't':
702 ender = '\t';
703 p++;
704 break;
705 case 'f':
706 ender = '\f';
707 p++;
708 break;
709 case '0': case '1': case '2': case '3':case '4':
710 case '5': case '6': case '7': case '8':case '9':
711 if (isdigit(p[1])) {
712 foo = *p++ - '0';
713 foo <<= 3;
714 foo += *p - '0';
715 if (isdigit(p[1]))
716 foo = (foo<<3) + *++p - '0';
717 ender = foo;
718 p++;
719 }
720 else {
721 --p;
722 goto loopdone;
723 }
724 break;
725 case '\0':
726 if (p >= regxend)
727 FAIL("trailing \\ in regexp");
728 /* FALL THROUGH */
729 default:
730 ender = *p++;
731 break;
732 }
733 break;
734 default:
735 ender = *p++;
736 break;
737 }
738 if (regfold && isupper(ender))
739 ender = tolower(ender);
740 if (ISMULT2(p)) { /* Back off on ?+*. */
741 if (len)
742 p = oldp;
743 else {
744 len++;
745 regc(ender);
746 }
747 break;
748 }
749 regc(ender);
750 }
751 loopdone:
752 regparse = p;
753 if (len <= 0)
754 FAIL("internal disaster in regexp");
755 *flagp |= HASWIDTH;
756 if (len == 1)
757 *flagp |= SIMPLE;
758 if (regcode != &regdummy)
759 *OPERAND(ret) = len;
760 regc('\0');
761 }
762 break;
763 }
764
765 return(ret);
766}
767
768static void
769regset(bits,def,c)
770char *bits;
771int def;
772register int c;
773{
774 if (regcode == &regdummy)
775 return;
ac58e20f 776 c &= 255;
a687059c 777 if (def)
778 bits[c >> 3] &= ~(1 << (c & 7));
779 else
780 bits[c >> 3] |= (1 << (c & 7));
781}
782
783static char *
784regclass()
785{
786 register char *bits;
787 register int class;
788 register int lastclass;
789 register int range = 0;
790 register char *ret;
791 register int def;
792
793 if (*regparse == '^') { /* Complement of range. */
794 ret = regnode(ANYBUT);
795 regparse++;
796 def = 0;
797 } else {
798 ret = regnode(ANYOF);
799 def = 255;
800 }
801 bits = regcode;
802 for (class = 0; class < 32; class++)
803 regc(def);
804 if (*regparse == ']' || *regparse == '-')
805 regset(bits,def,lastclass = *regparse++);
806 while (regparse < regxend && *regparse != ']') {
807 class = UCHARAT(regparse++);
808 if (class == '\\') {
809 class = UCHARAT(regparse++);
810 switch (class) {
811 case 'w':
812 for (class = 'a'; class <= 'z'; class++)
813 regset(bits,def,class);
814 for (class = 'A'; class <= 'Z'; class++)
815 regset(bits,def,class);
816 for (class = '0'; class <= '9'; class++)
817 regset(bits,def,class);
818 regset(bits,def,'_');
819 lastclass = 1234;
820 continue;
821 case 's':
822 regset(bits,def,' ');
823 regset(bits,def,'\t');
824 regset(bits,def,'\r');
825 regset(bits,def,'\f');
826 regset(bits,def,'\n');
827 lastclass = 1234;
828 continue;
829 case 'd':
830 for (class = '0'; class <= '9'; class++)
831 regset(bits,def,class);
832 lastclass = 1234;
833 continue;
834 case 'n':
835 class = '\n';
836 break;
837 case 'r':
838 class = '\r';
839 break;
840 case 't':
841 class = '\t';
842 break;
843 case 'f':
844 class = '\f';
845 break;
846 case 'b':
847 class = '\b';
848 break;
849 case '0': case '1': case '2': case '3': case '4':
850 case '5': case '6': case '7': case '8': case '9':
851 class -= '0';
852 if (isdigit(*regparse)) {
853 class <<= 3;
854 class += *regparse++ - '0';
855 }
856 if (isdigit(*regparse)) {
857 class <<= 3;
858 class += *regparse++ - '0';
859 }
860 break;
861 }
862 }
863 if (!range && class == '-' && regparse < regxend &&
864 *regparse != ']') {
865 range = 1;
866 continue;
867 }
868 if (range) {
869 if (lastclass > class)
870 FAIL("invalid [] range in regexp");
871 }
872 else
873 lastclass = class - 1;
874 range = 0;
875 for (lastclass++; lastclass <= class; lastclass++) {
876 regset(bits,def,lastclass);
877 if (regfold && isupper(lastclass))
878 regset(bits,def,tolower(lastclass));
879 }
880 lastclass = class;
881 }
882 if (*regparse != ']')
883 FAIL("unmatched [] in regexp");
a687059c 884 regparse++;
885 return ret;
886}
887
888/*
889 - regnode - emit a node
890 */
891static char * /* Location. */
892regnode(op)
893char op;
894{
895 register char *ret;
896 register char *ptr;
897
898 ret = regcode;
899 if (ret == &regdummy) {
900#ifdef REGALIGN
901 if (!(regsize & 1))
902 regsize++;
903#endif
904 regsize += 3;
905 return(ret);
906 }
907
908#ifdef REGALIGN
909#ifndef lint
910 if (!((long)ret & 1))
911 *ret++ = 127;
912#endif
913#endif
914 ptr = ret;
915 *ptr++ = op;
916 *ptr++ = '\0'; /* Null "next" pointer. */
917 *ptr++ = '\0';
918 regcode = ptr;
919
920 return(ret);
921}
922
923/*
924 - regc - emit (if appropriate) a byte of code
925 */
926static void
927regc(b)
928char b;
929{
930 if (regcode != &regdummy)
931 *regcode++ = b;
932 else
933 regsize++;
934}
935
936/*
937 - reginsert - insert an operator in front of already-emitted operand
938 *
939 * Means relocating the operand.
940 */
941static void
942reginsert(op, opnd)
943char op;
944char *opnd;
945{
946 register char *src;
947 register char *dst;
948 register char *place;
949
950 if (regcode == &regdummy) {
951#ifdef REGALIGN
952 regsize += 4;
953#else
954 regsize += 3;
955#endif
956 return;
957 }
958
959 src = regcode;
960#ifdef REGALIGN
961 regcode += 4;
962#else
963 regcode += 3;
964#endif
965 dst = regcode;
966 while (src > opnd)
967 *--dst = *--src;
968
969 place = opnd; /* Op node, where operand used to be. */
970 *place++ = op;
971 *place++ = '\0';
972 *place++ = '\0';
973}
974
975/*
976 - regtail - set the next-pointer at the end of a node chain
977 */
978static void
979regtail(p, val)
980char *p;
981char *val;
982{
983 register char *scan;
984 register char *temp;
985 register int offset;
986
987 if (p == &regdummy)
988 return;
989
990 /* Find last node. */
991 scan = p;
992 for (;;) {
993 temp = regnext(scan);
994 if (temp == NULL)
995 break;
996 scan = temp;
997 }
998
999#ifdef REGALIGN
1000 offset = val - scan;
1001#ifndef lint
1002 *(short*)(scan+1) = offset;
1003#else
1004 offset = offset;
1005#endif
1006#else
1007 if (OP(scan) == BACK)
1008 offset = scan - val;
1009 else
1010 offset = val - scan;
1011 *(scan+1) = (offset>>8)&0377;
1012 *(scan+2) = offset&0377;
1013#endif
1014}
1015
1016/*
1017 - regoptail - regtail on operand of first argument; nop if operandless
1018 */
1019static void
1020regoptail(p, val)
1021char *p;
1022char *val;
1023{
1024 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1025 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1026 return;
1027 regtail(NEXTOPER(p), val);
1028}
1029
1030/*
1031 - regcurly - a little FSA that accepts {\d+,?\d*}
1032 */
1033STATIC int
1034regcurly(s)
1035register char *s;
1036{
1037 if (*s++ != '{')
1038 return FALSE;
1039 if (!isdigit(*s))
1040 return FALSE;
1041 while (isdigit(*s))
1042 s++;
1043 if (*s == ',')
1044 s++;
1045 while (isdigit(*s))
1046 s++;
1047 if (*s != '}')
1048 return FALSE;
1049 return TRUE;
1050}
1051
1052#ifdef DEBUGGING
1053
1054/*
1055 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1056 */
1057void
1058regdump(r)
1059regexp *r;
1060{
1061 register char *s;
1062 register char op = EXACTLY; /* Arbitrary non-END op. */
1063 register char *next;
1064 extern char *index();
1065
1066
1067 s = r->program + 1;
1068 while (op != END) { /* While that wasn't END last time... */
1069#ifdef REGALIGN
1070 if (!((long)s & 1))
1071 s++;
1072#endif
1073 op = OP(s);
1074 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1075 next = regnext(s);
1076 if (next == NULL) /* Next ptr. */
1077 fprintf(stderr,"(0)");
1078 else
1079 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1080 s += 3;
1081 if (op == ANYOF || op == ANYBUT) {
1082 s += 32;
1083 }
1084 if (op == EXACTLY) {
1085 /* Literal string, where present. */
1086 s++;
1087 while (*s != '\0') {
1088 (void)putchar(*s);
1089 s++;
1090 }
1091 s++;
1092 }
1093 (void)putchar('\n');
1094 }
1095
1096 /* Header fields of interest. */
1097 if (r->regstart)
1098 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1099 if (r->regstclass)
1100 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1101 if (r->reganch)
1102 fprintf(stderr,"anchored ");
1103 if (r->regmust != NULL)
1104 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1105 r->regback);
1106 fprintf(stderr,"\n");
1107}
1108
1109/*
1110 - regprop - printable representation of opcode
1111 */
1112char *
1113regprop(op)
1114char *op;
1115{
1116 register char *p;
1117
1118 (void) strcpy(buf, ":");
1119
1120 switch (OP(op)) {
1121 case BOL:
1122 p = "BOL";
1123 break;
1124 case EOL:
1125 p = "EOL";
1126 break;
1127 case ANY:
1128 p = "ANY";
1129 break;
1130 case ANYOF:
1131 p = "ANYOF";
1132 break;
1133 case ANYBUT:
1134 p = "ANYBUT";
1135 break;
1136 case BRANCH:
1137 p = "BRANCH";
1138 break;
1139 case EXACTLY:
1140 p = "EXACTLY";
1141 break;
1142 case NOTHING:
1143 p = "NOTHING";
1144 break;
1145 case BACK:
1146 p = "BACK";
1147 break;
1148 case END:
1149 p = "END";
1150 break;
1151 case ALNUM:
1152 p = "ALNUM";
1153 break;
1154 case NALNUM:
1155 p = "NALNUM";
1156 break;
1157 case BOUND:
1158 p = "BOUND";
1159 break;
1160 case NBOUND:
1161 p = "NBOUND";
1162 break;
1163 case SPACE:
1164 p = "SPACE";
1165 break;
1166 case NSPACE:
1167 p = "NSPACE";
1168 break;
1169 case DIGIT:
1170 p = "DIGIT";
1171 break;
1172 case NDIGIT:
1173 p = "NDIGIT";
1174 break;
1175 case REF:
1176 case REF+1:
1177 case REF+2:
1178 case REF+3:
1179 case REF+4:
1180 case REF+5:
1181 case REF+6:
1182 case REF+7:
1183 case REF+8:
1184 case REF+9:
1185 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1186 p = NULL;
1187 break;
1188 case OPEN+1:
1189 case OPEN+2:
1190 case OPEN+3:
1191 case OPEN+4:
1192 case OPEN+5:
1193 case OPEN+6:
1194 case OPEN+7:
1195 case OPEN+8:
1196 case OPEN+9:
1197 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1198 p = NULL;
1199 break;
1200 case CLOSE+1:
1201 case CLOSE+2:
1202 case CLOSE+3:
1203 case CLOSE+4:
1204 case CLOSE+5:
1205 case CLOSE+6:
1206 case CLOSE+7:
1207 case CLOSE+8:
1208 case CLOSE+9:
1209 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1210 p = NULL;
1211 break;
1212 case STAR:
1213 p = "STAR";
1214 break;
1215 case PLUS:
1216 p = "PLUS";
1217 break;
1218 default:
1219 FAIL("corrupted regexp opcode");
1220 }
1221 if (p != NULL)
1222 (void) strcat(buf, p);
1223 return(buf);
1224}
1225#endif /* DEBUGGING */
1226
1227regfree(r)
1228struct regexp *r;
1229{
1230 if (r->precomp)
1231 Safefree(r->precomp);
1232 if (r->subbase)
1233 Safefree(r->subbase);
1234 if (r->regmust)
1235 str_free(r->regmust);
1236 if (r->regstart)
1237 str_free(r->regstart);
1238 Safefree(r);
1239}