perl 3.0: (no announcement message available)
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 3.0 89/10/18 15:22:29 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 3.0  89/10/18  15:22:29  lwall
14  * 3.0 baseline
15  * 
16  */
17
18 /*
19  * regcomp and regexec -- regsub and regerror are not used in perl
20  *
21  *      Copyright (c) 1986 by University of Toronto.
22  *      Written by Henry Spencer.  Not derived from licensed software.
23  *
24  *      Permission is granted to anyone to use this software for any
25  *      purpose on any computer system, and to redistribute it freely,
26  *      subject to the following restrictions:
27  *
28  *      1. The author is not responsible for the consequences of use of
29  *              this software, no matter how awful, even if they arise
30  *              from defects in it.
31  *
32  *      2. The origin of this software must not be misrepresented, either
33  *              by explicit claim or by omission.
34  *
35  *      3. Altered versions must be plainly marked as such, and must not
36  *              be misrepresented as being the original software.
37  *
38  *
39  ****    Alterations to Henry's code are...
40  ****
41  ****    Copyright (c) 1989, Larry Wall
42  ****
43  ****    You may distribute under the terms of the GNU General Public License
44  ****    as specified in the README file that comes with the perl 3.0 kit.
45  *
46  * Beware that some of this code is subtly aware of the way operator
47  * precedence is structured in regular expressions.  Serious changes in
48  * regular-expression syntax might require a total rethink.
49  */
50 #include "EXTERN.h"
51 #include "perl.h"
52 #include "INTERN.h"
53 #include "regcomp.h"
54
55 #ifndef STATIC
56 #define STATIC  static
57 #endif
58
59 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
60 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
61         ((*s) == '{' && regcurly(s)))
62 #define META    "^$.[()|?+*\\"
63
64 /*
65  * Flags to be passed up and down.
66  */
67 #define HASWIDTH        01      /* Known never to match null string. */
68 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
69 #define SPSTART         04      /* Starts with * or +. */
70 #define WORST           0       /* Worst case. */
71
72 /*
73  * Global work variables for regcomp().
74  */
75 static char *regprecomp;                /* uncompiled string. */
76 static char *regparse;          /* Input-scan pointer. */
77 static char *regxend;           /* End of input for compile */
78 static int regnpar;             /* () count. */
79 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
80 static long regsize;            /* Code size. */
81 static int regfold;
82 static int regsawbracket;       /* Did we do {d,d} trick? */
83
84 /*
85  * Forward declarations for regcomp()'s friends.
86  */
87 STATIC int regcurly();
88 STATIC char *reg();
89 STATIC char *regbranch();
90 STATIC char *regpiece();
91 STATIC char *regatom();
92 STATIC char *regclass();
93 STATIC char *regnode();
94 STATIC void regc();
95 STATIC void reginsert();
96 STATIC void regtail();
97 STATIC void regoptail();
98
99 /*
100  - regcomp - compile a regular expression into internal code
101  *
102  * We can't allocate space until we know how big the compiled form will be,
103  * but we can't compile it (and thus know how big it is) until we've got a
104  * place to put the code.  So we cheat:  we compile it twice, once with code
105  * generation turned off and size counting turned on, and once "for real".
106  * This also means that we don't allocate space until we are sure that the
107  * thing really will compile successfully, and we never have to move the
108  * code and thus invalidate pointers into it.  (Note that it has to be in
109  * one piece because free() must be able to free it all.) [NB: not true in perl]
110  *
111  * Beware that the optimization-preparation code in here knows about some
112  * of the structure of the compiled regexp.  [I'll say.]
113  */
114 regexp *
115 regcomp(exp,xend,fold,rare)
116 char *exp;
117 char *xend;
118 int fold;
119 int rare;
120 {
121         register regexp *r;
122         register char *scan;
123         register STR *longest;
124         register int len;
125         register char *first;
126         int flags;
127         int back;
128         int curback;
129         extern char *safemalloc();
130         extern char *savestr();
131
132         if (exp == NULL)
133                 fatal("NULL regexp argument");
134
135         /* First pass: determine size, legality. */
136         regfold = fold;
137         regparse = exp;
138         regxend = xend;
139         regprecomp = nsavestr(exp,xend-exp);
140         regsawbracket = 0;
141         regnpar = 1;
142         regsize = 0L;
143         regcode = &regdummy;
144         regc(MAGIC);
145         if (reg(0, &flags) == NULL) {
146                 Safefree(regprecomp);
147                 return(NULL);
148         }
149
150         /* Small enough for pointer-storage convention? */
151         if (regsize >= 32767L)          /* Probably could be 65535L. */
152                 FAIL("regexp too big");
153
154         /* Allocate space. */
155         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
156         if (r == NULL)
157                 FAIL("regexp out of space");
158
159         /* Second pass: emit code. */
160         if (regsawbracket)
161             bcopy(regprecomp,exp,xend-exp);
162         r->precomp = regprecomp;
163         r->subbase = NULL;
164         regparse = exp;
165         regnpar = 1;
166         regcode = r->program;
167         regc(MAGIC);
168         if (reg(0, &flags) == NULL)
169                 return(NULL);
170
171         /* Dig out information for optimizations. */
172         r->regstart = Nullstr;  /* Worst-case defaults. */
173         r->reganch = 0;
174         r->regmust = Nullstr;
175         r->regback = -1;
176         r->regstclass = Nullch;
177         scan = r->program+1;                    /* First BRANCH. */
178         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
179                 scan = NEXTOPER(scan);
180
181                 first = scan;
182                 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
183                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
184                     (OP(first) == PLUS) )
185                         first = NEXTOPER(first);
186
187                 /* Starting-point info. */
188                 if (OP(first) == EXACTLY) {
189                         r->regstart =
190                             str_make(OPERAND(first)+1,*OPERAND(first));
191                         if (r->regstart->str_cur > !(sawstudy|fold))
192                                 fbmcompile(r->regstart,fold);
193                 }
194                 else if ((exp = index(simple,OP(first))) && exp > simple)
195                         r->regstclass = first;
196                 else if (OP(first) == BOUND || OP(first) == NBOUND)
197                         r->regstclass = first;
198                 else if (OP(first) == BOL)
199                         r->reganch++;
200
201 #ifdef DEBUGGING
202                 if (debug & 512)
203                     fprintf(stderr,"first %d next %d offset %d\n",
204                       OP(first), OP(NEXTOPER(first)), first - scan);
205 #endif
206                 /*
207                  * If there's something expensive in the r.e., find the
208                  * longest literal string that must appear and make it the
209                  * regmust.  Resolve ties in favor of later strings, since
210                  * the regstart check works with the beginning of the r.e.
211                  * and avoiding duplication strengthens checking.  Not a
212                  * strong reason, but sufficient in the absence of others.
213                  * [Now we resolve ties in favor of the earlier string if
214                  * it happens that curback has been invalidated, since the
215                  * earlier string may buy us something the later one won't.]
216                  */
217                 longest = str_make("",0);
218                 len = 0;
219                 curback = 0;
220                 back = 0;
221                 while (scan != NULL) {
222                         if (OP(scan) == BRANCH) {
223                             if (OP(regnext(scan)) == BRANCH) {
224                                 curback = -30000;
225                                 while (OP(scan) == BRANCH)
226                                     scan = regnext(scan);
227                             }
228                             else        /* single branch is ok */
229                                 scan = NEXTOPER(scan);
230                         }
231                         if (OP(scan) == EXACTLY) {
232                             first = scan;
233                             while (OP(regnext(scan)) >= CLOSE)
234                                 scan = regnext(scan);
235                             if (curback - back == len) {
236                                 str_ncat(longest, OPERAND(first)+1,
237                                     *OPERAND(first));
238                                 len += *OPERAND(first);
239                                 curback += *OPERAND(first);
240                                 first = regnext(scan);
241                             }
242                             else if (*OPERAND(first) >= len + (curback >= 0)) {
243                                 len = *OPERAND(first);
244                                 str_nset(longest, OPERAND(first)+1,len);
245                                 back = curback;
246                                 curback += len;
247                                 first = regnext(scan);
248                             }
249                             else
250                                 curback += *OPERAND(first);
251                         }
252                         else if (index(varies,OP(scan)))
253                                 curback = -30000;
254                         else if (index(simple,OP(scan)))
255                                 curback++;
256                         scan = regnext(scan);
257                 }
258                 if (len) {
259                         r->regmust = longest;
260                         if (back < 0)
261                                 back = -1;
262                         r->regback = back;
263                         if (len > !(sawstudy||fold||OP(first)==EOL))
264                                 fbmcompile(r->regmust,fold);
265                         r->regmust->str_u.str_useful = 100;
266                         if (OP(first) == EOL) /* is match anchored to EOL? */
267                             r->regmust->str_pok |= SP_TAIL;
268                 }
269                 else
270                         str_free(longest);
271         }
272
273         r->do_folding = fold;
274         r->nparens = regnpar - 1;
275 #ifdef DEBUGGING
276         if (debug & 512)
277                 regdump(r);
278 #endif
279         return(r);
280 }
281
282 /*
283  - reg - regular expression, i.e. main body or parenthesized thing
284  *
285  * Caller must absorb opening parenthesis.
286  *
287  * Combining parenthesis handling with the base level of regular expression
288  * is a trifle forced, but the need to tie the tails of the branches to what
289  * follows makes it hard to avoid.
290  */
291 static char *
292 reg(paren, flagp)
293 int paren;                      /* Parenthesized? */
294 int *flagp;
295 {
296         register char *ret;
297         register char *br;
298         register char *ender;
299         register int parno;
300         int flags;
301
302         *flagp = HASWIDTH;      /* Tentatively. */
303
304         /* Make an OPEN node, if parenthesized. */
305         if (paren) {
306                 if (regnpar >= NSUBEXP)
307                         FAIL("too many () in regexp");
308                 parno = regnpar;
309                 regnpar++;
310                 ret = regnode(OPEN+parno);
311         } else
312                 ret = NULL;
313
314         /* Pick up the branches, linking them together. */
315         br = regbranch(&flags);
316         if (br == NULL)
317                 return(NULL);
318         if (ret != NULL)
319                 regtail(ret, br);       /* OPEN -> first. */
320         else
321                 ret = br;
322         if (!(flags&HASWIDTH))
323                 *flagp &= ~HASWIDTH;
324         *flagp |= flags&SPSTART;
325         while (*regparse == '|') {
326                 regparse++;
327                 br = regbranch(&flags);
328                 if (br == NULL)
329                         return(NULL);
330                 regtail(ret, br);       /* BRANCH -> BRANCH. */
331                 if (!(flags&HASWIDTH))
332                         *flagp &= ~HASWIDTH;
333                 *flagp |= flags&SPSTART;
334         }
335
336         /* Make a closing node, and hook it on the end. */
337         ender = regnode((paren) ? CLOSE+parno : END);   
338         regtail(ret, ender);
339
340         /* Hook the tails of the branches to the closing node. */
341         for (br = ret; br != NULL; br = regnext(br))
342                 regoptail(br, ender);
343
344         /* Check for proper termination. */
345         if (paren && *regparse++ != ')') {
346                 FAIL("unmatched () in regexp");
347         } else if (!paren && regparse < regxend) {
348                 if (*regparse == ')') {
349                         FAIL("unmatched () in regexp");
350                 } else
351                         FAIL("junk on end of regexp");  /* "Can't happen". */
352                 /* NOTREACHED */
353         }
354
355         return(ret);
356 }
357
358 /*
359  - regbranch - one alternative of an | operator
360  *
361  * Implements the concatenation operator.
362  */
363 static char *
364 regbranch(flagp)
365 int *flagp;
366 {
367         register char *ret;
368         register char *chain;
369         register char *latest;
370         int flags;
371
372         *flagp = WORST;         /* Tentatively. */
373
374         ret = regnode(BRANCH);
375         chain = NULL;
376         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
377                 latest = regpiece(&flags);
378                 if (latest == NULL)
379                         return(NULL);
380                 *flagp |= flags&HASWIDTH;
381                 if (chain == NULL)      /* First piece. */
382                         *flagp |= flags&SPSTART;
383                 else
384                         regtail(chain, latest);
385                 chain = latest;
386         }
387         if (chain == NULL)      /* Loop ran zero times. */
388                 (void) regnode(NOTHING);
389
390         return(ret);
391 }
392
393 /*
394  - regpiece - something followed by possible [*+?]
395  *
396  * Note that the branching code sequences used for ? and the general cases
397  * of * and + are somewhat optimized:  they use the same NOTHING node as
398  * both the endmarker for their branch list and the body of the last branch.
399  * It might seem that this node could be dispensed with entirely, but the
400  * endmarker role is not redundant.
401  */
402 static char *
403 regpiece(flagp)
404 int *flagp;
405 {
406         register char *ret;
407         register char op;
408         register char *next;
409         int flags;
410         char *origparse = regparse;
411         int orignpar = regnpar;
412         char *max;
413         int iter;
414         char ch;
415
416         ret = regatom(&flags);
417         if (ret == NULL)
418                 return(NULL);
419
420         op = *regparse;
421
422         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
423          * then we decrement the first number by one and reset our
424          * parsing back to the beginning of the same atom.  If the first number
425          * is down to 0, decrement the second number instead and fake up
426          * a ? after it.  Given the way this compiler doesn't keep track
427          * of offsets on the first pass, this is the only way to replicate
428          * a piece of code.  Sigh.
429          */
430         if (op == '{' && regcurly(regparse)) {
431             next = regparse + 1;
432             max = Nullch;
433             while (isdigit(*next) || *next == ',') {
434                 if (*next == ',') {
435                     if (max)
436                         break;
437                     else
438                         max = next;
439                 }
440                 next++;
441             }
442             if (*next == '}') {         /* got one */
443                 regsawbracket++;        /* remember we clobbered exp */
444                 if (!max)
445                     max = next;
446                 regparse++;
447                 iter = atoi(regparse);
448                 if (iter > 0) {
449                     ch = *max;
450                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
451                     *max = ch;
452                     if (*max == ',' && atoi(max+1) > 0) {
453                         ch = *next;
454                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
455                         *next = ch;
456                     }
457                     if (iter != 1 || (*max == ',' || atoi(max+1))) {
458                         regparse = origparse;   /* back up input pointer */
459                         regnpar = orignpar;     /* don't make more parens */
460                     }
461                     else {
462                         regparse = next;
463                         goto nest_check;
464                     }
465                     *flagp = flags;
466                     return ret;
467                 }
468                 if (*max == ',') {
469                     max++;
470                     iter = atoi(max);
471                     if (max == next) {          /* any number more? */
472                         regparse = next;
473                         op = '*';               /* fake up one with a star */
474                     }
475                     else if (iter > 0) {
476                         op = '?';               /* fake up optional atom */
477                         ch = *next;
478                         sprintf(max,"%.*d", next-max, iter - 1);
479                         *next = ch;
480                         if (iter == 1)
481                             regparse = next;
482                         else {
483                             regparse = origparse - 1; /* offset ++ below */
484                             regnpar = orignpar;
485                         }
486                     }
487                     else
488                         fatal("Can't do {n,0}");
489                 }
490                 else
491                     fatal("Can't do {0}");
492             }
493         }
494
495         if (!ISMULT1(op)) {
496                 *flagp = flags;
497                 return(ret);
498         }
499
500         if (!(flags&HASWIDTH) && op != '?')
501                 FAIL("regexp *+ operand could be empty");
502         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
503
504         if (op == '*' && (flags&SIMPLE))
505                 reginsert(STAR, ret);
506         else if (op == '*') {
507                 /* Emit x* as (x&|), where & means "self". */
508                 reginsert(BRANCH, ret);                 /* Either x */
509                 regoptail(ret, regnode(BACK));          /* and loop */
510                 regoptail(ret, ret);                    /* back */
511                 regtail(ret, regnode(BRANCH));          /* or */
512                 regtail(ret, regnode(NOTHING));         /* null. */
513         } else if (op == '+' && (flags&SIMPLE))
514                 reginsert(PLUS, ret);
515         else if (op == '+') {
516                 /* Emit x+ as x(&|), where & means "self". */
517                 next = regnode(BRANCH);                 /* Either */
518                 regtail(ret, next);
519                 regtail(regnode(BACK), ret);            /* loop back */
520                 regtail(next, regnode(BRANCH));         /* or */
521                 regtail(ret, regnode(NOTHING));         /* null. */
522         } else if (op == '?') {
523                 /* Emit x? as (x|) */
524                 reginsert(BRANCH, ret);                 /* Either x */
525                 regtail(ret, regnode(BRANCH));          /* or */
526                 next = regnode(NOTHING);                /* null. */
527                 regtail(ret, next);
528                 regoptail(ret, next);
529         }
530       nest_check:
531         regparse++;
532         if (ISMULT2(regparse))
533                 FAIL("nested *?+ in regexp");
534
535         return(ret);
536 }
537
538 /*
539  - regatom - the lowest level
540  *
541  * Optimization:  gobbles an entire sequence of ordinary characters so that
542  * it can turn them into a single node, which is smaller to store and
543  * faster to run.  Backslashed characters are exceptions, each becoming a
544  * separate node; the code is simpler that way and it's not worth fixing.
545  *
546  * [Yes, it is worth fixing, some scripts can run twice the speed.]
547  */
548 static char *
549 regatom(flagp)
550 int *flagp;
551 {
552         register char *ret;
553         int flags;
554
555         *flagp = WORST;         /* Tentatively. */
556
557         switch (*regparse++) {
558         case '^':
559                 ret = regnode(BOL);
560                 break;
561         case '$':
562                 ret = regnode(EOL);
563                 break;
564         case '.':
565                 ret = regnode(ANY);
566                 *flagp |= HASWIDTH|SIMPLE;
567                 break;
568         case '[':
569                 ret = regclass();
570                 *flagp |= HASWIDTH|SIMPLE;
571                 break;
572         case '(':
573                 ret = reg(1, &flags);
574                 if (ret == NULL)
575                         return(NULL);
576                 *flagp |= flags&(HASWIDTH|SPSTART);
577                 break;
578         case '|':
579         case ')':
580                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
581                 break;
582         case '?':
583         case '+':
584         case '*':
585                 FAIL("?+* follows nothing in regexp");
586                 break;
587         case '\\':
588                 switch (*regparse) {
589                 case 'w':
590                         ret = regnode(ALNUM);
591                         *flagp |= HASWIDTH|SIMPLE;
592                         regparse++;
593                         break;
594                 case 'W':
595                         ret = regnode(NALNUM);
596                         *flagp |= HASWIDTH|SIMPLE;
597                         regparse++;
598                         break;
599                 case 'b':
600                         ret = regnode(BOUND);
601                         *flagp |= SIMPLE;
602                         regparse++;
603                         break;
604                 case 'B':
605                         ret = regnode(NBOUND);
606                         *flagp |= SIMPLE;
607                         regparse++;
608                         break;
609                 case 's':
610                         ret = regnode(SPACE);
611                         *flagp |= HASWIDTH|SIMPLE;
612                         regparse++;
613                         break;
614                 case 'S':
615                         ret = regnode(NSPACE);
616                         *flagp |= HASWIDTH|SIMPLE;
617                         regparse++;
618                         break;
619                 case 'd':
620                         ret = regnode(DIGIT);
621                         *flagp |= HASWIDTH|SIMPLE;
622                         regparse++;
623                         break;
624                 case 'D':
625                         ret = regnode(NDIGIT);
626                         *flagp |= HASWIDTH|SIMPLE;
627                         regparse++;
628                         break;
629                 case 'n':
630                 case 'r':
631                 case 't':
632                 case 'f':
633                         goto defchar;
634                 case '0': case '1': case '2': case '3': case '4':
635                 case '5': case '6': case '7': case '8': case '9':
636                         if (isdigit(regparse[1]))
637                                 goto defchar;
638                         else {
639                                 ret = regnode(REF + *regparse++ - '0');
640                                 *flagp |= SIMPLE;
641                         }
642                         break;
643                 case '\0':
644                         if (regparse >= regxend)
645                             FAIL("trailing \\ in regexp");
646                         /* FALL THROUGH */
647                 default:
648                         goto defchar;
649                 }
650                 break;
651         default: {
652                         register int len;
653                         register char ender;
654                         register char *p;
655                         char *oldp;
656                         int foo;
657
658                     defchar:
659                         ret = regnode(EXACTLY);
660                         regc(0);                /* save spot for len */
661                         for (len=0, p=regparse-1;
662                           len < 127 && p < regxend;
663                           len++)
664                         {
665                             oldp = p;
666                             switch (*p) {
667                             case '^':
668                             case '$':
669                             case '.':
670                             case '[':
671                             case '(':
672                             case ')':
673                             case '|':
674                                 goto loopdone;
675                             case '\\':
676                                 switch (*++p) {
677                                 case 'w':
678                                 case 'W':
679                                 case 'b':
680                                 case 'B':
681                                 case 's':
682                                 case 'S':
683                                 case 'd':
684                                 case 'D':
685                                     --p;
686                                     goto loopdone;
687                                 case 'n':
688                                         ender = '\n';
689                                         p++;
690                                         break;
691                                 case 'r':
692                                         ender = '\r';
693                                         p++;
694                                         break;
695                                 case 't':
696                                         ender = '\t';
697                                         p++;
698                                         break;
699                                 case 'f':
700                                         ender = '\f';
701                                         p++;
702                                         break;
703                                 case '0': case '1': case '2': case '3':case '4':
704                                 case '5': case '6': case '7': case '8':case '9':
705                                     if (isdigit(p[1])) {
706                                         foo = *p++ - '0';
707                                         foo <<= 3;
708                                         foo += *p - '0';
709                                         if (isdigit(p[1]))
710                                             foo = (foo<<3) + *++p - '0';
711                                         ender = foo;
712                                         p++;
713                                     }
714                                     else {
715                                         --p;
716                                         goto loopdone;
717                                     }
718                                     break;
719                                 case '\0':
720                                     if (p >= regxend)
721                                         FAIL("trailing \\ in regexp");
722                                     /* FALL THROUGH */
723                                 default:
724                                     ender = *p++;
725                                     break;
726                                 }
727                                 break;
728                             default:
729                                 ender = *p++;
730                                 break;
731                             }
732                             if (regfold && isupper(ender))
733                                     ender = tolower(ender);
734                             if (ISMULT2(p)) { /* Back off on ?+*. */
735                                 if (len)
736                                     p = oldp;
737                                 else {
738                                     len++;
739                                     regc(ender);
740                                 }
741                                 break;
742                             }
743                             regc(ender);
744                         }
745                     loopdone:
746                         regparse = p;
747                         if (len <= 0)
748                                 FAIL("internal disaster in regexp");
749                         *flagp |= HASWIDTH;
750                         if (len == 1)
751                                 *flagp |= SIMPLE;
752                         if (regcode != &regdummy)
753                             *OPERAND(ret) = len;
754                         regc('\0');
755                 }
756                 break;
757         }
758
759         return(ret);
760 }
761
762 static void
763 regset(bits,def,c)
764 char *bits;
765 int def;
766 register int c;
767 {
768         if (regcode == &regdummy)
769             return;
770         if (def)
771                 bits[c >> 3] &= ~(1 << (c & 7));
772         else
773                 bits[c >> 3] |=  (1 << (c & 7));
774 }
775
776 static char *
777 regclass()
778 {
779         register char *bits;
780         register int class;
781         register int lastclass;
782         register int range = 0;
783         register char *ret;
784         register int def;
785
786         if (*regparse == '^') { /* Complement of range. */
787                 ret = regnode(ANYBUT);
788                 regparse++;
789                 def = 0;
790         } else {
791                 ret = regnode(ANYOF);
792                 def = 255;
793         }
794         bits = regcode;
795         for (class = 0; class < 32; class++)
796             regc(def);
797         if (*regparse == ']' || *regparse == '-')
798                 regset(bits,def,lastclass = *regparse++);
799         while (regparse < regxend && *regparse != ']') {
800                 class = UCHARAT(regparse++);
801                 if (class == '\\') {
802                         class = UCHARAT(regparse++);
803                         switch (class) {
804                         case 'w':
805                                 for (class = 'a'; class <= 'z'; class++)
806                                         regset(bits,def,class);
807                                 for (class = 'A'; class <= 'Z'; class++)
808                                         regset(bits,def,class);
809                                 for (class = '0'; class <= '9'; class++)
810                                         regset(bits,def,class);
811                                 regset(bits,def,'_');
812                                 lastclass = 1234;
813                                 continue;
814                         case 's':
815                                 regset(bits,def,' ');
816                                 regset(bits,def,'\t');
817                                 regset(bits,def,'\r');
818                                 regset(bits,def,'\f');
819                                 regset(bits,def,'\n');
820                                 lastclass = 1234;
821                                 continue;
822                         case 'd':
823                                 for (class = '0'; class <= '9'; class++)
824                                         regset(bits,def,class);
825                                 lastclass = 1234;
826                                 continue;
827                         case 'n':
828                                 class = '\n';
829                                 break;
830                         case 'r':
831                                 class = '\r';
832                                 break;
833                         case 't':
834                                 class = '\t';
835                                 break;
836                         case 'f':
837                                 class = '\f';
838                                 break;
839                         case 'b':
840                                 class = '\b';
841                                 break;
842                         case '0': case '1': case '2': case '3': case '4':
843                         case '5': case '6': case '7': case '8': case '9':
844                                 class -= '0';
845                                 if (isdigit(*regparse)) {
846                                         class <<= 3;
847                                         class += *regparse++ - '0';
848                                 }
849                                 if (isdigit(*regparse)) {
850                                         class <<= 3;
851                                         class += *regparse++ - '0';
852                                 }
853                                 break;
854                         }
855                 }
856                 if (!range && class == '-' && regparse < regxend &&
857                     *regparse != ']') {
858                         range = 1;
859                         continue;
860                 }
861                 if (range) {
862                         if (lastclass > class)
863                                 FAIL("invalid [] range in regexp");
864                 }
865                 else
866                         lastclass = class - 1;
867                 range = 0;
868                 for (lastclass++; lastclass <= class; lastclass++) {
869                         regset(bits,def,lastclass);
870                         if (regfold && isupper(lastclass))
871                                 regset(bits,def,tolower(lastclass));
872                 }
873                 lastclass = class;
874         }
875         if (*regparse != ']')
876                 FAIL("unmatched [] in regexp");
877         regset(bits,0,0);               /* always bomb out on null */
878         regparse++;
879         return ret;
880 }
881
882 /*
883  - regnode - emit a node
884  */
885 static char *                   /* Location. */
886 regnode(op)
887 char op;
888 {
889         register char *ret;
890         register char *ptr;
891
892         ret = regcode;
893         if (ret == &regdummy) {
894 #ifdef REGALIGN
895                 if (!(regsize & 1))
896                         regsize++;
897 #endif
898                 regsize += 3;
899                 return(ret);
900         }
901
902 #ifdef REGALIGN
903 #ifndef lint
904         if (!((long)ret & 1))
905             *ret++ = 127;
906 #endif
907 #endif
908         ptr = ret;
909         *ptr++ = op;
910         *ptr++ = '\0';          /* Null "next" pointer. */
911         *ptr++ = '\0';
912         regcode = ptr;
913
914         return(ret);
915 }
916
917 /*
918  - regc - emit (if appropriate) a byte of code
919  */
920 static void
921 regc(b)
922 char b;
923 {
924         if (regcode != &regdummy)
925                 *regcode++ = b;
926         else
927                 regsize++;
928 }
929
930 /*
931  - reginsert - insert an operator in front of already-emitted operand
932  *
933  * Means relocating the operand.
934  */
935 static void
936 reginsert(op, opnd)
937 char op;
938 char *opnd;
939 {
940         register char *src;
941         register char *dst;
942         register char *place;
943
944         if (regcode == &regdummy) {
945 #ifdef REGALIGN
946                 regsize += 4;
947 #else
948                 regsize += 3;
949 #endif
950                 return;
951         }
952
953         src = regcode;
954 #ifdef REGALIGN
955         regcode += 4;
956 #else
957         regcode += 3;
958 #endif
959         dst = regcode;
960         while (src > opnd)
961                 *--dst = *--src;
962
963         place = opnd;           /* Op node, where operand used to be. */
964         *place++ = op;
965         *place++ = '\0';
966         *place++ = '\0';
967 }
968
969 /*
970  - regtail - set the next-pointer at the end of a node chain
971  */
972 static void
973 regtail(p, val)
974 char *p;
975 char *val;
976 {
977         register char *scan;
978         register char *temp;
979         register int offset;
980
981         if (p == &regdummy)
982                 return;
983
984         /* Find last node. */
985         scan = p;
986         for (;;) {
987                 temp = regnext(scan);
988                 if (temp == NULL)
989                         break;
990                 scan = temp;
991         }
992
993 #ifdef REGALIGN
994         offset = val - scan;
995 #ifndef lint
996         *(short*)(scan+1) = offset;
997 #else
998         offset = offset;
999 #endif
1000 #else
1001         if (OP(scan) == BACK)
1002                 offset = scan - val;
1003         else
1004                 offset = val - scan;
1005         *(scan+1) = (offset>>8)&0377;
1006         *(scan+2) = offset&0377;
1007 #endif
1008 }
1009
1010 /*
1011  - regoptail - regtail on operand of first argument; nop if operandless
1012  */
1013 static void
1014 regoptail(p, val)
1015 char *p;
1016 char *val;
1017 {
1018         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1019         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1020                 return;
1021         regtail(NEXTOPER(p), val);
1022 }
1023
1024 /*
1025  - regcurly - a little FSA that accepts {\d+,?\d*}
1026  */
1027 STATIC int
1028 regcurly(s)
1029 register char *s;
1030 {
1031     if (*s++ != '{')
1032         return FALSE;
1033     if (!isdigit(*s))
1034         return FALSE;
1035     while (isdigit(*s))
1036         s++;
1037     if (*s == ',')
1038         s++;
1039     while (isdigit(*s))
1040         s++;
1041     if (*s != '}')
1042         return FALSE;
1043     return TRUE;
1044 }
1045
1046 #ifdef DEBUGGING
1047
1048 /*
1049  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1050  */
1051 void
1052 regdump(r)
1053 regexp *r;
1054 {
1055         register char *s;
1056         register char op = EXACTLY;     /* Arbitrary non-END op. */
1057         register char *next;
1058         extern char *index();
1059
1060
1061         s = r->program + 1;
1062         while (op != END) {     /* While that wasn't END last time... */
1063 #ifdef REGALIGN
1064                 if (!((long)s & 1))
1065                         s++;
1066 #endif
1067                 op = OP(s);
1068                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1069                 next = regnext(s);
1070                 if (next == NULL)               /* Next ptr. */
1071                         fprintf(stderr,"(0)");
1072                 else 
1073                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1074                 s += 3;
1075                 if (op == ANYOF || op == ANYBUT) {
1076                         s += 32;
1077                 }
1078                 if (op == EXACTLY) {
1079                         /* Literal string, where present. */
1080                         s++;
1081                         while (*s != '\0') {
1082                                 (void)putchar(*s);
1083                                 s++;
1084                         }
1085                         s++;
1086                 }
1087                 (void)putchar('\n');
1088         }
1089
1090         /* Header fields of interest. */
1091         if (r->regstart)
1092                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1093         if (r->regstclass)
1094                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1095         if (r->reganch)
1096                 fprintf(stderr,"anchored ");
1097         if (r->regmust != NULL)
1098                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1099                   r->regback);
1100         fprintf(stderr,"\n");
1101 }
1102
1103 /*
1104  - regprop - printable representation of opcode
1105  */
1106 char *
1107 regprop(op)
1108 char *op;
1109 {
1110         register char *p;
1111
1112         (void) strcpy(buf, ":");
1113
1114         switch (OP(op)) {
1115         case BOL:
1116                 p = "BOL";
1117                 break;
1118         case EOL:
1119                 p = "EOL";
1120                 break;
1121         case ANY:
1122                 p = "ANY";
1123                 break;
1124         case ANYOF:
1125                 p = "ANYOF";
1126                 break;
1127         case ANYBUT:
1128                 p = "ANYBUT";
1129                 break;
1130         case BRANCH:
1131                 p = "BRANCH";
1132                 break;
1133         case EXACTLY:
1134                 p = "EXACTLY";
1135                 break;
1136         case NOTHING:
1137                 p = "NOTHING";
1138                 break;
1139         case BACK:
1140                 p = "BACK";
1141                 break;
1142         case END:
1143                 p = "END";
1144                 break;
1145         case ALNUM:
1146                 p = "ALNUM";
1147                 break;
1148         case NALNUM:
1149                 p = "NALNUM";
1150                 break;
1151         case BOUND:
1152                 p = "BOUND";
1153                 break;
1154         case NBOUND:
1155                 p = "NBOUND";
1156                 break;
1157         case SPACE:
1158                 p = "SPACE";
1159                 break;
1160         case NSPACE:
1161                 p = "NSPACE";
1162                 break;
1163         case DIGIT:
1164                 p = "DIGIT";
1165                 break;
1166         case NDIGIT:
1167                 p = "NDIGIT";
1168                 break;
1169         case REF:
1170         case REF+1:
1171         case REF+2:
1172         case REF+3:
1173         case REF+4:
1174         case REF+5:
1175         case REF+6:
1176         case REF+7:
1177         case REF+8:
1178         case REF+9:
1179                 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1180                 p = NULL;
1181                 break;
1182         case OPEN+1:
1183         case OPEN+2:
1184         case OPEN+3:
1185         case OPEN+4:
1186         case OPEN+5:
1187         case OPEN+6:
1188         case OPEN+7:
1189         case OPEN+8:
1190         case OPEN+9:
1191                 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1192                 p = NULL;
1193                 break;
1194         case CLOSE+1:
1195         case CLOSE+2:
1196         case CLOSE+3:
1197         case CLOSE+4:
1198         case CLOSE+5:
1199         case CLOSE+6:
1200         case CLOSE+7:
1201         case CLOSE+8:
1202         case CLOSE+9:
1203                 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1204                 p = NULL;
1205                 break;
1206         case STAR:
1207                 p = "STAR";
1208                 break;
1209         case PLUS:
1210                 p = "PLUS";
1211                 break;
1212         default:
1213                 FAIL("corrupted regexp opcode");
1214         }
1215         if (p != NULL)
1216                 (void) strcat(buf, p);
1217         return(buf);
1218 }
1219 #endif /* DEBUGGING */
1220
1221 regfree(r)
1222 struct regexp *r;
1223 {
1224         if (r->precomp)
1225                 Safefree(r->precomp);
1226         if (r->subbase)
1227                 Safefree(r->subbase);
1228         if (r->regmust)
1229                 str_free(r->regmust);
1230         if (r->regstart)
1231                 str_free(r->regstart);
1232         Safefree(r);
1233 }