perl 3.0 patch #14 patch #13, continued
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0.1.3 90/03/12 16:59:22 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0.1.3  90/03/12  16:59:22  lwall
14  * patch13: pattern matches can now use \0 to mean \000
15  * 
16  * Revision 3.0.1.2  90/02/28  18:08:35  lwall
17  * patch9: /[\200-\377]/ didn't work on machines with signed chars
18  * 
19  * Revision 3.0.1.1  89/11/11  04:51:04  lwall
20  * patch2: /[\000]/ didn't work
21  * 
22  * Revision 3.0  89/10/18  15:22:29  lwall
23  * 3.0 baseline
24  * 
25  */
26
27 /*
28  * regcomp and regexec -- regsub and regerror are not used in perl
29  *
30  *      Copyright (c) 1986 by University of Toronto.
31  *      Written by Henry Spencer.  Not derived from licensed software.
32  *
33  *      Permission is granted to anyone to use this software for any
34  *      purpose on any computer system, and to redistribute it freely,
35  *      subject to the following restrictions:
36  *
37  *      1. The author is not responsible for the consequences of use of
38  *              this software, no matter how awful, even if they arise
39  *              from defects in it.
40  *
41  *      2. The origin of this software must not be misrepresented, either
42  *              by explicit claim or by omission.
43  *
44  *      3. Altered versions must be plainly marked as such, and must not
45  *              be misrepresented as being the original software.
46  *
47  *
48  ****    Alterations to Henry's code are...
49  ****
50  ****    Copyright (c) 1989, Larry Wall
51  ****
52  ****    You may distribute under the terms of the GNU General Public License
53  ****    as specified in the README file that comes with the perl 3.0 kit.
54  *
55  * Beware that some of this code is subtly aware of the way operator
56  * precedence is structured in regular expressions.  Serious changes in
57  * regular-expression syntax might require a total rethink.
58  */
59 #include "EXTERN.h"
60 #include "perl.h"
61 #include "INTERN.h"
62 #include "regcomp.h"
63
64 #ifndef STATIC
65 #define STATIC  static
66 #endif
67
68 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
69 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
70         ((*s) == '{' && regcurly(s)))
71 #define META    "^$.[()|?+*\\"
72
73 /*
74  * Flags to be passed up and down.
75  */
76 #define HASWIDTH        01      /* Known never to match null string. */
77 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
78 #define SPSTART         04      /* Starts with * or +. */
79 #define WORST           0       /* Worst case. */
80
81 /*
82  * Global work variables for regcomp().
83  */
84 static char *regprecomp;                /* uncompiled string. */
85 static char *regparse;          /* Input-scan pointer. */
86 static char *regxend;           /* End of input for compile */
87 static int regnpar;             /* () count. */
88 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
89 static long regsize;            /* Code size. */
90 static int regfold;
91 static int regsawbracket;       /* Did we do {d,d} trick? */
92
93 /*
94  * Forward declarations for regcomp()'s friends.
95  */
96 STATIC int regcurly();
97 STATIC char *reg();
98 STATIC char *regbranch();
99 STATIC char *regpiece();
100 STATIC char *regatom();
101 STATIC char *regclass();
102 STATIC char *regnode();
103 STATIC void regc();
104 STATIC void reginsert();
105 STATIC void regtail();
106 STATIC void regoptail();
107
108 /*
109  - regcomp - compile a regular expression into internal code
110  *
111  * We can't allocate space until we know how big the compiled form will be,
112  * but we can't compile it (and thus know how big it is) until we've got a
113  * place to put the code.  So we cheat:  we compile it twice, once with code
114  * generation turned off and size counting turned on, and once "for real".
115  * This also means that we don't allocate space until we are sure that the
116  * thing really will compile successfully, and we never have to move the
117  * code and thus invalidate pointers into it.  (Note that it has to be in
118  * one piece because free() must be able to free it all.) [NB: not true in perl]
119  *
120  * Beware that the optimization-preparation code in here knows about some
121  * of the structure of the compiled regexp.  [I'll say.]
122  */
123 regexp *
124 regcomp(exp,xend,fold,rare)
125 char *exp;
126 char *xend;
127 int fold;
128 int rare;
129 {
130         register regexp *r;
131         register char *scan;
132         register STR *longest;
133         register int len;
134         register char *first;
135         int flags;
136         int back;
137         int curback;
138         extern char *safemalloc();
139         extern char *savestr();
140
141         if (exp == NULL)
142                 fatal("NULL regexp argument");
143
144         /* First pass: determine size, legality. */
145         regfold = fold;
146         regparse = exp;
147         regxend = xend;
148         regprecomp = nsavestr(exp,xend-exp);
149         regsawbracket = 0;
150         regnpar = 1;
151         regsize = 0L;
152         regcode = &regdummy;
153         regc(MAGIC);
154         if (reg(0, &flags) == NULL) {
155                 Safefree(regprecomp);
156                 return(NULL);
157         }
158
159         /* Small enough for pointer-storage convention? */
160         if (regsize >= 32767L)          /* Probably could be 65535L. */
161                 FAIL("regexp too big");
162
163         /* Allocate space. */
164         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
165         if (r == NULL)
166                 FAIL("regexp out of space");
167
168         /* Second pass: emit code. */
169         if (regsawbracket)
170             bcopy(regprecomp,exp,xend-exp);
171         r->precomp = regprecomp;
172         r->subbase = NULL;
173         regparse = exp;
174         regnpar = 1;
175         regcode = r->program;
176         regc(MAGIC);
177         if (reg(0, &flags) == NULL)
178                 return(NULL);
179
180         /* Dig out information for optimizations. */
181         r->regstart = Nullstr;  /* Worst-case defaults. */
182         r->reganch = 0;
183         r->regmust = Nullstr;
184         r->regback = -1;
185         r->regstclass = Nullch;
186         scan = r->program+1;                    /* First BRANCH. */
187         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
188                 scan = NEXTOPER(scan);
189
190                 first = scan;
191                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
192                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
193                     (OP(first) == PLUS) )
194                         first = NEXTOPER(first);
195
196                 /* Starting-point info. */
197                 if (OP(first) == EXACTLY) {
198                         r->regstart =
199                             str_make(OPERAND(first)+1,*OPERAND(first));
200                         if (r->regstart->str_cur > !(sawstudy|fold))
201                                 fbmcompile(r->regstart,fold);
202                 }
203                 else if ((exp = index(simple,OP(first))) && exp > simple)
204                         r->regstclass = first;
205                 else if (OP(first) == BOUND || OP(first) == NBOUND)
206                         r->regstclass = first;
207                 else if (OP(first) == BOL)
208                         r->reganch++;
209
210 #ifdef DEBUGGING
211                 if (debug & 512)
212                     fprintf(stderr,"first %d next %d offset %d\n",
213                       OP(first), OP(NEXTOPER(first)), first - scan);
214 #endif
215                 /*
216                  * If there's something expensive in the r.e., find the
217                  * longest literal string that must appear and make it the
218                  * regmust.  Resolve ties in favor of later strings, since
219                  * the regstart check works with the beginning of the r.e.
220                  * and avoiding duplication strengthens checking.  Not a
221                  * strong reason, but sufficient in the absence of others.
222                  * [Now we resolve ties in favor of the earlier string if
223                  * it happens that curback has been invalidated, since the
224                  * earlier string may buy us something the later one won't.]
225                  */
226                 longest = str_make("",0);
227                 len = 0;
228                 curback = 0;
229                 back = 0;
230                 while (scan != NULL) {
231                         if (OP(scan) == BRANCH) {
232                             if (OP(regnext(scan)) == BRANCH) {
233                                 curback = -30000;
234                                 while (OP(scan) == BRANCH)
235                                     scan = regnext(scan);
236                             }
237                             else        /* single branch is ok */
238                                 scan = NEXTOPER(scan);
239                         }
240                         if (OP(scan) == EXACTLY) {
241                             first = scan;
242                             while (OP(regnext(scan)) >= CLOSE)
243                                 scan = regnext(scan);
244                             if (curback - back == len) {
245                                 str_ncat(longest, OPERAND(first)+1,
246                                     *OPERAND(first));
247                                 len += *OPERAND(first);
248                                 curback += *OPERAND(first);
249                                 first = regnext(scan);
250                             }
251                             else if (*OPERAND(first) >= len + (curback >= 0)) {
252                                 len = *OPERAND(first);
253                                 str_nset(longest, OPERAND(first)+1,len);
254                                 back = curback;
255                                 curback += len;
256                                 first = regnext(scan);
257                             }
258                             else
259                                 curback += *OPERAND(first);
260                         }
261                         else if (index(varies,OP(scan)))
262                                 curback = -30000;
263                         else if (index(simple,OP(scan)))
264                                 curback++;
265                         scan = regnext(scan);
266                 }
267                 if (len) {
268                         r->regmust = longest;
269                         if (back < 0)
270                                 back = -1;
271                         r->regback = back;
272                         if (len > !(sawstudy||fold||OP(first)==EOL))
273                                 fbmcompile(r->regmust,fold);
274                         r->regmust->str_u.str_useful = 100;
275                         if (OP(first) == EOL) /* is match anchored to EOL? */
276                             r->regmust->str_pok |= SP_TAIL;
277                 }
278                 else
279                         str_free(longest);
280         }
281
282         r->do_folding = fold;
283         r->nparens = regnpar - 1;
284 #ifdef DEBUGGING
285         if (debug & 512)
286                 regdump(r);
287 #endif
288         return(r);
289 }
290
291 /*
292  - reg - regular expression, i.e. main body or parenthesized thing
293  *
294  * Caller must absorb opening parenthesis.
295  *
296  * Combining parenthesis handling with the base level of regular expression
297  * is a trifle forced, but the need to tie the tails of the branches to what
298  * follows makes it hard to avoid.
299  */
300 static char *
301 reg(paren, flagp)
302 int paren;                      /* Parenthesized? */
303 int *flagp;
304 {
305         register char *ret;
306         register char *br;
307         register char *ender;
308         register int parno;
309         int flags;
310
311         *flagp = HASWIDTH;      /* Tentatively. */
312
313         /* Make an OPEN node, if parenthesized. */
314         if (paren) {
315                 if (regnpar >= NSUBEXP)
316                         FAIL("too many () in regexp");
317                 parno = regnpar;
318                 regnpar++;
319                 ret = regnode(OPEN+parno);
320         } else
321                 ret = NULL;
322
323         /* Pick up the branches, linking them together. */
324         br = regbranch(&flags);
325         if (br == NULL)
326                 return(NULL);
327         if (ret != NULL)
328                 regtail(ret, br);       /* OPEN -> first. */
329         else
330                 ret = br;
331         if (!(flags&HASWIDTH))
332                 *flagp &= ~HASWIDTH;
333         *flagp |= flags&SPSTART;
334         while (*regparse == '|') {
335                 regparse++;
336                 br = regbranch(&flags);
337                 if (br == NULL)
338                         return(NULL);
339                 regtail(ret, br);       /* BRANCH -> BRANCH. */
340                 if (!(flags&HASWIDTH))
341                         *flagp &= ~HASWIDTH;
342                 *flagp |= flags&SPSTART;
343         }
344
345         /* Make a closing node, and hook it on the end. */
346         ender = regnode((paren) ? CLOSE+parno : END);   
347         regtail(ret, ender);
348
349         /* Hook the tails of the branches to the closing node. */
350         for (br = ret; br != NULL; br = regnext(br))
351                 regoptail(br, ender);
352
353         /* Check for proper termination. */
354         if (paren && *regparse++ != ')') {
355                 FAIL("unmatched () in regexp");
356         } else if (!paren && regparse < regxend) {
357                 if (*regparse == ')') {
358                         FAIL("unmatched () in regexp");
359                 } else
360                         FAIL("junk on end of regexp");  /* "Can't happen". */
361                 /* NOTREACHED */
362         }
363
364         return(ret);
365 }
366
367 /*
368  - regbranch - one alternative of an | operator
369  *
370  * Implements the concatenation operator.
371  */
372 static char *
373 regbranch(flagp)
374 int *flagp;
375 {
376         register char *ret;
377         register char *chain;
378         register char *latest;
379         int flags;
380
381         *flagp = WORST;         /* Tentatively. */
382
383         ret = regnode(BRANCH);
384         chain = NULL;
385         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
386                 latest = regpiece(&flags);
387                 if (latest == NULL)
388                         return(NULL);
389                 *flagp |= flags&HASWIDTH;
390                 if (chain == NULL)      /* First piece. */
391                         *flagp |= flags&SPSTART;
392                 else
393                         regtail(chain, latest);
394                 chain = latest;
395         }
396         if (chain == NULL)      /* Loop ran zero times. */
397                 (void) regnode(NOTHING);
398
399         return(ret);
400 }
401
402 /*
403  - regpiece - something followed by possible [*+?]
404  *
405  * Note that the branching code sequences used for ? and the general cases
406  * of * and + are somewhat optimized:  they use the same NOTHING node as
407  * both the endmarker for their branch list and the body of the last branch.
408  * It might seem that this node could be dispensed with entirely, but the
409  * endmarker role is not redundant.
410  */
411 static char *
412 regpiece(flagp)
413 int *flagp;
414 {
415         register char *ret;
416         register char op;
417         register char *next;
418         int flags;
419         char *origparse = regparse;
420         int orignpar = regnpar;
421         char *max;
422         int iter;
423         char ch;
424
425         ret = regatom(&flags);
426         if (ret == NULL)
427                 return(NULL);
428
429         op = *regparse;
430
431         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
432          * then we decrement the first number by one and reset our
433          * parsing back to the beginning of the same atom.  If the first number
434          * is down to 0, decrement the second number instead and fake up
435          * a ? after it.  Given the way this compiler doesn't keep track
436          * of offsets on the first pass, this is the only way to replicate
437          * a piece of code.  Sigh.
438          */
439         if (op == '{' && regcurly(regparse)) {
440             next = regparse + 1;
441             max = Nullch;
442             while (isdigit(*next) || *next == ',') {
443                 if (*next == ',') {
444                     if (max)
445                         break;
446                     else
447                         max = next;
448                 }
449                 next++;
450             }
451             if (*next == '}') {         /* got one */
452                 regsawbracket++;        /* remember we clobbered exp */
453                 if (!max)
454                     max = next;
455                 regparse++;
456                 iter = atoi(regparse);
457                 if (iter > 0) {
458                     ch = *max;
459                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
460                     *max = ch;
461                     if (*max == ',' && atoi(max+1) > 0) {
462                         ch = *next;
463                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
464                         *next = ch;
465                     }
466                     if (iter != 1 || (*max == ',' || atoi(max+1))) {
467                         regparse = origparse;   /* back up input pointer */
468                         regnpar = orignpar;     /* don't make more parens */
469                     }
470                     else {
471                         regparse = next;
472                         goto nest_check;
473                     }
474                     *flagp = flags;
475                     return ret;
476                 }
477                 if (*max == ',') {
478                     max++;
479                     iter = atoi(max);
480                     if (max == next) {          /* any number more? */
481                         regparse = next;
482                         op = '*';               /* fake up one with a star */
483                     }
484                     else if (iter > 0) {
485                         op = '?';               /* fake up optional atom */
486                         ch = *next;
487                         sprintf(max,"%.*d", next-max, iter - 1);
488                         *next = ch;
489                         if (iter == 1)
490                             regparse = next;
491                         else {
492                             regparse = origparse - 1; /* offset ++ below */
493                             regnpar = orignpar;
494                         }
495                     }
496                     else
497                         fatal("Can't do {n,0}");
498                 }
499                 else
500                     fatal("Can't do {0}");
501             }
502         }
503
504         if (!ISMULT1(op)) {
505                 *flagp = flags;
506                 return(ret);
507         }
508
509         if (!(flags&HASWIDTH) && op != '?')
510                 FAIL("regexp *+ operand could be empty");
511         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
512
513         if (op == '*' && (flags&SIMPLE))
514                 reginsert(STAR, ret);
515         else if (op == '*') {
516                 /* Emit x* as (x&|), where & means "self". */
517                 reginsert(BRANCH, ret);                 /* Either x */
518                 regoptail(ret, regnode(BACK));          /* and loop */
519                 regoptail(ret, ret);                    /* back */
520                 regtail(ret, regnode(BRANCH));          /* or */
521                 regtail(ret, regnode(NOTHING));         /* null. */
522         } else if (op == '+' && (flags&SIMPLE))
523                 reginsert(PLUS, ret);
524         else if (op == '+') {
525                 /* Emit x+ as x(&|), where & means "self". */
526                 next = regnode(BRANCH);                 /* Either */
527                 regtail(ret, next);
528                 regtail(regnode(BACK), ret);            /* loop back */
529                 regtail(next, regnode(BRANCH));         /* or */
530                 regtail(ret, regnode(NOTHING));         /* null. */
531         } else if (op == '?') {
532                 /* Emit x? as (x|) */
533                 reginsert(BRANCH, ret);                 /* Either x */
534                 regtail(ret, regnode(BRANCH));          /* or */
535                 next = regnode(NOTHING);                /* null. */
536                 regtail(ret, next);
537                 regoptail(ret, next);
538         }
539       nest_check:
540         regparse++;
541         if (ISMULT2(regparse))
542                 FAIL("nested *?+ in regexp");
543
544         return(ret);
545 }
546
547 /*
548  - regatom - the lowest level
549  *
550  * Optimization:  gobbles an entire sequence of ordinary characters so that
551  * it can turn them into a single node, which is smaller to store and
552  * faster to run.  Backslashed characters are exceptions, each becoming a
553  * separate node; the code is simpler that way and it's not worth fixing.
554  *
555  * [Yes, it is worth fixing, some scripts can run twice the speed.]
556  */
557 static char *
558 regatom(flagp)
559 int *flagp;
560 {
561         register char *ret;
562         int flags;
563
564         *flagp = WORST;         /* Tentatively. */
565
566         switch (*regparse++) {
567         case '^':
568                 ret = regnode(BOL);
569                 break;
570         case '$':
571                 ret = regnode(EOL);
572                 break;
573         case '.':
574                 ret = regnode(ANY);
575                 *flagp |= HASWIDTH|SIMPLE;
576                 break;
577         case '[':
578                 ret = regclass();
579                 *flagp |= HASWIDTH|SIMPLE;
580                 break;
581         case '(':
582                 ret = reg(1, &flags);
583                 if (ret == NULL)
584                         return(NULL);
585                 *flagp |= flags&(HASWIDTH|SPSTART);
586                 break;
587         case '|':
588         case ')':
589                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
590                 break;
591         case '?':
592         case '+':
593         case '*':
594                 FAIL("?+* follows nothing in regexp");
595                 break;
596         case '\\':
597                 switch (*regparse) {
598                 case 'w':
599                         ret = regnode(ALNUM);
600                         *flagp |= HASWIDTH|SIMPLE;
601                         regparse++;
602                         break;
603                 case 'W':
604                         ret = regnode(NALNUM);
605                         *flagp |= HASWIDTH|SIMPLE;
606                         regparse++;
607                         break;
608                 case 'b':
609                         ret = regnode(BOUND);
610                         *flagp |= SIMPLE;
611                         regparse++;
612                         break;
613                 case 'B':
614                         ret = regnode(NBOUND);
615                         *flagp |= SIMPLE;
616                         regparse++;
617                         break;
618                 case 's':
619                         ret = regnode(SPACE);
620                         *flagp |= HASWIDTH|SIMPLE;
621                         regparse++;
622                         break;
623                 case 'S':
624                         ret = regnode(NSPACE);
625                         *flagp |= HASWIDTH|SIMPLE;
626                         regparse++;
627                         break;
628                 case 'd':
629                         ret = regnode(DIGIT);
630                         *flagp |= HASWIDTH|SIMPLE;
631                         regparse++;
632                         break;
633                 case 'D':
634                         ret = regnode(NDIGIT);
635                         *flagp |= HASWIDTH|SIMPLE;
636                         regparse++;
637                         break;
638                 case 'n':
639                 case 'r':
640                 case 't':
641                 case 'f':
642                         goto defchar;
643                 case '0': case '1': case '2': case '3': case '4':
644                 case '5': case '6': case '7': case '8': case '9':
645                         if (isdigit(regparse[1]) || *regparse == '0')
646                                 goto defchar;
647                         else {
648                                 ret = regnode(REF + *regparse++ - '0');
649                                 *flagp |= SIMPLE;
650                         }
651                         break;
652                 case '\0':
653                         if (regparse >= regxend)
654                             FAIL("trailing \\ in regexp");
655                         /* FALL THROUGH */
656                 default:
657                         goto defchar;
658                 }
659                 break;
660         default: {
661                         register int len;
662                         register char ender;
663                         register char *p;
664                         char *oldp;
665                         int foo;
666
667                     defchar:
668                         ret = regnode(EXACTLY);
669                         regc(0);                /* save spot for len */
670                         for (len=0, p=regparse-1;
671                           len < 127 && p < regxend;
672                           len++)
673                         {
674                             oldp = p;
675                             switch (*p) {
676                             case '^':
677                             case '$':
678                             case '.':
679                             case '[':
680                             case '(':
681                             case ')':
682                             case '|':
683                                 goto loopdone;
684                             case '\\':
685                                 switch (*++p) {
686                                 case 'w':
687                                 case 'W':
688                                 case 'b':
689                                 case 'B':
690                                 case 's':
691                                 case 'S':
692                                 case 'd':
693                                 case 'D':
694                                     --p;
695                                     goto loopdone;
696                                 case 'n':
697                                         ender = '\n';
698                                         p++;
699                                         break;
700                                 case 'r':
701                                         ender = '\r';
702                                         p++;
703                                         break;
704                                 case 't':
705                                         ender = '\t';
706                                         p++;
707                                         break;
708                                 case 'f':
709                                         ender = '\f';
710                                         p++;
711                                         break;
712                                 case '0': case '1': case '2': case '3':case '4':
713                                 case '5': case '6': case '7': case '8':case '9':
714                                     if (isdigit(p[1]) || *p == '0') {
715                                         foo = *p - '0';
716                                         if (isdigit(p[1]))
717                                             foo = (foo<<3) + *++p - '0';
718                                         if (isdigit(p[1]))
719                                             foo = (foo<<3) + *++p - '0';
720                                         ender = foo;
721                                         p++;
722                                     }
723                                     else {
724                                         --p;
725                                         goto loopdone;
726                                     }
727                                     break;
728                                 case '\0':
729                                     if (p >= regxend)
730                                         FAIL("trailing \\ in regexp");
731                                     /* FALL THROUGH */
732                                 default:
733                                     ender = *p++;
734                                     break;
735                                 }
736                                 break;
737                             default:
738                                 ender = *p++;
739                                 break;
740                             }
741                             if (regfold && isupper(ender))
742                                     ender = tolower(ender);
743                             if (ISMULT2(p)) { /* Back off on ?+*. */
744                                 if (len)
745                                     p = oldp;
746                                 else {
747                                     len++;
748                                     regc(ender);
749                                 }
750                                 break;
751                             }
752                             regc(ender);
753                         }
754                     loopdone:
755                         regparse = p;
756                         if (len <= 0)
757                                 FAIL("internal disaster in regexp");
758                         *flagp |= HASWIDTH;
759                         if (len == 1)
760                                 *flagp |= SIMPLE;
761                         if (regcode != &regdummy)
762                             *OPERAND(ret) = len;
763                         regc('\0');
764                 }
765                 break;
766         }
767
768         return(ret);
769 }
770
771 static void
772 regset(bits,def,c)
773 char *bits;
774 int def;
775 register int c;
776 {
777         if (regcode == &regdummy)
778             return;
779         c &= 255;
780         if (def)
781                 bits[c >> 3] &= ~(1 << (c & 7));
782         else
783                 bits[c >> 3] |=  (1 << (c & 7));
784 }
785
786 static char *
787 regclass()
788 {
789         register char *bits;
790         register int class;
791         register int lastclass;
792         register int range = 0;
793         register char *ret;
794         register int def;
795
796         if (*regparse == '^') { /* Complement of range. */
797                 ret = regnode(ANYBUT);
798                 regparse++;
799                 def = 0;
800         } else {
801                 ret = regnode(ANYOF);
802                 def = 255;
803         }
804         bits = regcode;
805         for (class = 0; class < 32; class++)
806             regc(def);
807         if (*regparse == ']' || *regparse == '-')
808                 regset(bits,def,lastclass = *regparse++);
809         while (regparse < regxend && *regparse != ']') {
810                 class = UCHARAT(regparse++);
811                 if (class == '\\') {
812                         class = UCHARAT(regparse++);
813                         switch (class) {
814                         case 'w':
815                                 for (class = 'a'; class <= 'z'; class++)
816                                         regset(bits,def,class);
817                                 for (class = 'A'; class <= 'Z'; class++)
818                                         regset(bits,def,class);
819                                 for (class = '0'; class <= '9'; class++)
820                                         regset(bits,def,class);
821                                 regset(bits,def,'_');
822                                 lastclass = 1234;
823                                 continue;
824                         case 's':
825                                 regset(bits,def,' ');
826                                 regset(bits,def,'\t');
827                                 regset(bits,def,'\r');
828                                 regset(bits,def,'\f');
829                                 regset(bits,def,'\n');
830                                 lastclass = 1234;
831                                 continue;
832                         case 'd':
833                                 for (class = '0'; class <= '9'; class++)
834                                         regset(bits,def,class);
835                                 lastclass = 1234;
836                                 continue;
837                         case 'n':
838                                 class = '\n';
839                                 break;
840                         case 'r':
841                                 class = '\r';
842                                 break;
843                         case 't':
844                                 class = '\t';
845                                 break;
846                         case 'f':
847                                 class = '\f';
848                                 break;
849                         case 'b':
850                                 class = '\b';
851                                 break;
852                         case '0': case '1': case '2': case '3': case '4':
853                         case '5': case '6': case '7': case '8': case '9':
854                                 class -= '0';
855                                 if (isdigit(*regparse)) {
856                                         class <<= 3;
857                                         class += *regparse++ - '0';
858                                 }
859                                 if (isdigit(*regparse)) {
860                                         class <<= 3;
861                                         class += *regparse++ - '0';
862                                 }
863                                 break;
864                         }
865                 }
866                 if (!range && class == '-' && regparse < regxend &&
867                     *regparse != ']') {
868                         range = 1;
869                         continue;
870                 }
871                 if (range) {
872                         if (lastclass > class)
873                                 FAIL("invalid [] range in regexp");
874                 }
875                 else
876                         lastclass = class - 1;
877                 range = 0;
878                 for (lastclass++; lastclass <= class; lastclass++) {
879                         regset(bits,def,lastclass);
880                         if (regfold && isupper(lastclass))
881                                 regset(bits,def,tolower(lastclass));
882                 }
883                 lastclass = class;
884         }
885         if (*regparse != ']')
886                 FAIL("unmatched [] in regexp");
887         regparse++;
888         return ret;
889 }
890
891 /*
892  - regnode - emit a node
893  */
894 static char *                   /* Location. */
895 regnode(op)
896 char op;
897 {
898         register char *ret;
899         register char *ptr;
900
901         ret = regcode;
902         if (ret == &regdummy) {
903 #ifdef REGALIGN
904                 if (!(regsize & 1))
905                         regsize++;
906 #endif
907                 regsize += 3;
908                 return(ret);
909         }
910
911 #ifdef REGALIGN
912 #ifndef lint
913         if (!((long)ret & 1))
914             *ret++ = 127;
915 #endif
916 #endif
917         ptr = ret;
918         *ptr++ = op;
919         *ptr++ = '\0';          /* Null "next" pointer. */
920         *ptr++ = '\0';
921         regcode = ptr;
922
923         return(ret);
924 }
925
926 /*
927  - regc - emit (if appropriate) a byte of code
928  */
929 static void
930 regc(b)
931 char b;
932 {
933         if (regcode != &regdummy)
934                 *regcode++ = b;
935         else
936                 regsize++;
937 }
938
939 /*
940  - reginsert - insert an operator in front of already-emitted operand
941  *
942  * Means relocating the operand.
943  */
944 static void
945 reginsert(op, opnd)
946 char op;
947 char *opnd;
948 {
949         register char *src;
950         register char *dst;
951         register char *place;
952
953         if (regcode == &regdummy) {
954 #ifdef REGALIGN
955                 regsize += 4;
956 #else
957                 regsize += 3;
958 #endif
959                 return;
960         }
961
962         src = regcode;
963 #ifdef REGALIGN
964         regcode += 4;
965 #else
966         regcode += 3;
967 #endif
968         dst = regcode;
969         while (src > opnd)
970                 *--dst = *--src;
971
972         place = opnd;           /* Op node, where operand used to be. */
973         *place++ = op;
974         *place++ = '\0';
975         *place++ = '\0';
976 }
977
978 /*
979  - regtail - set the next-pointer at the end of a node chain
980  */
981 static void
982 regtail(p, val)
983 char *p;
984 char *val;
985 {
986         register char *scan;
987         register char *temp;
988         register int offset;
989
990         if (p == &regdummy)
991                 return;
992
993         /* Find last node. */
994         scan = p;
995         for (;;) {
996                 temp = regnext(scan);
997                 if (temp == NULL)
998                         break;
999                 scan = temp;
1000         }
1001
1002 #ifdef REGALIGN
1003         offset = val - scan;
1004 #ifndef lint
1005         *(short*)(scan+1) = offset;
1006 #else
1007         offset = offset;
1008 #endif
1009 #else
1010         if (OP(scan) == BACK)
1011                 offset = scan - val;
1012         else
1013                 offset = val - scan;
1014         *(scan+1) = (offset>>8)&0377;
1015         *(scan+2) = offset&0377;
1016 #endif
1017 }
1018
1019 /*
1020  - regoptail - regtail on operand of first argument; nop if operandless
1021  */
1022 static void
1023 regoptail(p, val)
1024 char *p;
1025 char *val;
1026 {
1027         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1028         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1029                 return;
1030         regtail(NEXTOPER(p), val);
1031 }
1032
1033 /*
1034  - regcurly - a little FSA that accepts {\d+,?\d*}
1035  */
1036 STATIC int
1037 regcurly(s)
1038 register char *s;
1039 {
1040     if (*s++ != '{')
1041         return FALSE;
1042     if (!isdigit(*s))
1043         return FALSE;
1044     while (isdigit(*s))
1045         s++;
1046     if (*s == ',')
1047         s++;
1048     while (isdigit(*s))
1049         s++;
1050     if (*s != '}')
1051         return FALSE;
1052     return TRUE;
1053 }
1054
1055 #ifdef DEBUGGING
1056
1057 /*
1058  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1059  */
1060 void
1061 regdump(r)
1062 regexp *r;
1063 {
1064         register char *s;
1065         register char op = EXACTLY;     /* Arbitrary non-END op. */
1066         register char *next;
1067         extern char *index();
1068
1069
1070         s = r->program + 1;
1071         while (op != END) {     /* While that wasn't END last time... */
1072 #ifdef REGALIGN
1073                 if (!((long)s & 1))
1074                         s++;
1075 #endif
1076                 op = OP(s);
1077                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1078                 next = regnext(s);
1079                 if (next == NULL)               /* Next ptr. */
1080                         fprintf(stderr,"(0)");
1081                 else 
1082                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1083                 s += 3;
1084                 if (op == ANYOF || op == ANYBUT) {
1085                         s += 32;
1086                 }
1087                 if (op == EXACTLY) {
1088                         /* Literal string, where present. */
1089                         s++;
1090                         while (*s != '\0') {
1091                                 (void)putchar(*s);
1092                                 s++;
1093                         }
1094                         s++;
1095                 }
1096                 (void)putchar('\n');
1097         }
1098
1099         /* Header fields of interest. */
1100         if (r->regstart)
1101                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1102         if (r->regstclass)
1103                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1104         if (r->reganch)
1105                 fprintf(stderr,"anchored ");
1106         if (r->regmust != NULL)
1107                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1108                   r->regback);
1109         fprintf(stderr,"\n");
1110 }
1111
1112 /*
1113  - regprop - printable representation of opcode
1114  */
1115 char *
1116 regprop(op)
1117 char *op;
1118 {
1119         register char *p;
1120
1121         (void) strcpy(buf, ":");
1122
1123         switch (OP(op)) {
1124         case BOL:
1125                 p = "BOL";
1126                 break;
1127         case EOL:
1128                 p = "EOL";
1129                 break;
1130         case ANY:
1131                 p = "ANY";
1132                 break;
1133         case ANYOF:
1134                 p = "ANYOF";
1135                 break;
1136         case ANYBUT:
1137                 p = "ANYBUT";
1138                 break;
1139         case BRANCH:
1140                 p = "BRANCH";
1141                 break;
1142         case EXACTLY:
1143                 p = "EXACTLY";
1144                 break;
1145         case NOTHING:
1146                 p = "NOTHING";
1147                 break;
1148         case BACK:
1149                 p = "BACK";
1150                 break;
1151         case END:
1152                 p = "END";
1153                 break;
1154         case ALNUM:
1155                 p = "ALNUM";
1156                 break;
1157         case NALNUM:
1158                 p = "NALNUM";
1159                 break;
1160         case BOUND:
1161                 p = "BOUND";
1162                 break;
1163         case NBOUND:
1164                 p = "NBOUND";
1165                 break;
1166         case SPACE:
1167                 p = "SPACE";
1168                 break;
1169         case NSPACE:
1170                 p = "NSPACE";
1171                 break;
1172         case DIGIT:
1173                 p = "DIGIT";
1174                 break;
1175         case NDIGIT:
1176                 p = "NDIGIT";
1177                 break;
1178         case REF:
1179         case REF+1:
1180         case REF+2:
1181         case REF+3:
1182         case REF+4:
1183         case REF+5:
1184         case REF+6:
1185         case REF+7:
1186         case REF+8:
1187         case REF+9:
1188                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1189                 p = NULL;
1190                 break;
1191         case OPEN+1:
1192         case OPEN+2:
1193         case OPEN+3:
1194         case OPEN+4:
1195         case OPEN+5:
1196         case OPEN+6:
1197         case OPEN+7:
1198         case OPEN+8:
1199         case OPEN+9:
1200                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1201                 p = NULL;
1202                 break;
1203         case CLOSE+1:
1204         case CLOSE+2:
1205         case CLOSE+3:
1206         case CLOSE+4:
1207         case CLOSE+5:
1208         case CLOSE+6:
1209         case CLOSE+7:
1210         case CLOSE+8:
1211         case CLOSE+9:
1212                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1213                 p = NULL;
1214                 break;
1215         case STAR:
1216                 p = "STAR";
1217                 break;
1218         case PLUS:
1219                 p = "PLUS";
1220                 break;
1221         default:
1222                 FAIL("corrupted regexp opcode");
1223         }
1224         if (p != NULL)
1225                 (void) strcat(buf, p);
1226         return(buf);
1227 }
1228 #endif /* DEBUGGING */
1229
1230 regfree(r)
1231 struct regexp *r;
1232 {
1233         if (r->precomp)
1234                 Safefree(r->precomp);
1235         if (r->subbase)
1236                 Safefree(r->subbase);
1237         if (r->regmust)
1238                 str_free(r->regmust);
1239         if (r->regstart)
1240                 str_free(r->regstart);
1241         Safefree(r);
1242 }