perl 4.0 patch 8: patch #4, continued
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $RCSfile: regcomp.c,v $$Revision: 4.0.1.2 $$Date: 91/06/07 11:48:24 $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 4.0.1.2  91/06/07  11:48:24  lwall
14  * patch4: new copyright notice
15  * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
16  * patch4: // wouldn't use previous pattern if it started with a null character
17  * 
18  * Revision 4.0.1.1  91/04/12  09:04:45  lwall
19  * patch1: random cleanup in cpp namespace
20  * 
21  * Revision 4.0  91/03/20  01:39:01  lwall
22  * 4.0 baseline.
23  * 
24  */
25
26 /*
27  * regcomp and regexec -- regsub and regerror are not used in perl
28  *
29  *      Copyright (c) 1986 by University of Toronto.
30  *      Written by Henry Spencer.  Not derived from licensed software.
31  *
32  *      Permission is granted to anyone to use this software for any
33  *      purpose on any computer system, and to redistribute it freely,
34  *      subject to the following restrictions:
35  *
36  *      1. The author is not responsible for the consequences of use of
37  *              this software, no matter how awful, even if they arise
38  *              from defects in it.
39  *
40  *      2. The origin of this software must not be misrepresented, either
41  *              by explicit claim or by omission.
42  *
43  *      3. Altered versions must be plainly marked as such, and must not
44  *              be misrepresented as being the original software.
45  *
46  *
47  ****    Alterations to Henry's code are...
48  ****
49  ****    Copyright (c) 1991, Larry Wall
50  ****
51  ****    You may distribute under the terms of either the GNU General Public
52  ****    License or the Artistic License, as specified in the README file.
53
54  *
55  * Beware that some of this code is subtly aware of the way operator
56  * precedence is structured in regular expressions.  Serious changes in
57  * regular-expression syntax might require a total rethink.
58  */
59 #include "EXTERN.h"
60 #include "perl.h"
61 #include "INTERN.h"
62 #include "regcomp.h"
63
64 #ifdef MSDOS
65 # if defined(BUGGY_MSC6)
66  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
67  # pragma optimize("a",off)
68  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
69  # pragma optimize("w",on )
70 # endif /* BUGGY_MSC6 */
71 #endif /* MSDOS */
72
73 #ifndef STATIC
74 #define STATIC  static
75 #endif
76
77 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
78 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
79         ((*s) == '{' && regcurly(s)))
80 #define META    "^$.[()|?+*\\"
81
82 #ifdef SPSTART
83 #undef SPSTART          /* dratted cpp namespace... */
84 #endif
85 /*
86  * Flags to be passed up and down.
87  */
88 #define HASWIDTH        01      /* Known never to match null string. */
89 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
90 #define SPSTART         04      /* Starts with * or +. */
91 #define WORST           0       /* Worst case. */
92
93 /*
94  * Global work variables for regcomp().
95  */
96 static char *regprecomp;                /* uncompiled string. */
97 static char *regparse;          /* Input-scan pointer. */
98 static char *regxend;           /* End of input for compile */
99 static int regnpar;             /* () count. */
100 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
101 static long regsize;            /* Code size. */
102 static int regfold;
103 static int regsawbracket;       /* Did we do {d,d} trick? */
104 static int regsawback;          /* Did we see \1, ...? */
105
106 /*
107  * Forward declarations for regcomp()'s friends.
108  */
109 STATIC int regcurly();
110 STATIC char *reg();
111 STATIC char *regbranch();
112 STATIC char *regpiece();
113 STATIC char *regatom();
114 STATIC char *regclass();
115 STATIC char *regnode();
116 STATIC char *reganode();
117 STATIC void regc();
118 STATIC void reginsert();
119 STATIC void regtail();
120 STATIC void regoptail();
121
122 /*
123  - regcomp - compile a regular expression into internal code
124  *
125  * We can't allocate space until we know how big the compiled form will be,
126  * but we can't compile it (and thus know how big it is) until we've got a
127  * place to put the code.  So we cheat:  we compile it twice, once with code
128  * generation turned off and size counting turned on, and once "for real".
129  * This also means that we don't allocate space until we are sure that the
130  * thing really will compile successfully, and we never have to move the
131  * code and thus invalidate pointers into it.  (Note that it has to be in
132  * one piece because free() must be able to free it all.) [NB: not true in perl]
133  *
134  * Beware that the optimization-preparation code in here knows about some
135  * of the structure of the compiled regexp.  [I'll say.]
136  */
137 regexp *
138 regcomp(exp,xend,fold)
139 char *exp;
140 char *xend;
141 int fold;
142 {
143         register regexp *r;
144         register char *scan;
145         register STR *longish;
146         STR *longest;
147         register int len;
148         register char *first;
149         int flags;
150         int backish;
151         int backest;
152         int curback;
153         extern char *safemalloc();
154         extern char *savestr();
155         int sawplus = 0;
156         int sawopen = 0;
157
158         if (exp == NULL)
159                 fatal("NULL regexp argument");
160
161         /* First pass: determine size, legality. */
162         regfold = fold;
163         regparse = exp;
164         regxend = xend;
165         regprecomp = nsavestr(exp,xend-exp);
166         regsawbracket = 0;
167         regsawback = 0;
168         regnpar = 1;
169         regsize = 0L;
170         regcode = &regdummy;
171         regc(MAGIC);
172         if (reg(0, &flags) == NULL) {
173                 Safefree(regprecomp);
174                 regprecomp = Nullch;
175                 return(NULL);
176         }
177
178         /* Small enough for pointer-storage convention? */
179         if (regsize >= 32767L)          /* Probably could be 65535L. */
180                 FAIL("regexp too big");
181
182         /* Allocate space. */
183         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
184         if (r == NULL)
185                 FAIL("regexp out of space");
186
187         /* Second pass: emit code. */
188         if (regsawbracket)
189             bcopy(regprecomp,exp,xend-exp);
190         r->prelen = xend-exp;
191         r->precomp = regprecomp;
192         r->subbeg = r->subbase = NULL;
193         regparse = exp;
194         regnpar = 1;
195         regcode = r->program;
196         regc(MAGIC);
197         if (reg(0, &flags) == NULL)
198                 return(NULL);
199
200         /* Dig out information for optimizations. */
201         r->regstart = Nullstr;  /* Worst-case defaults. */
202         r->reganch = 0;
203         r->regmust = Nullstr;
204         r->regback = -1;
205         r->regstclass = Nullch;
206         scan = r->program+1;                    /* First BRANCH. */
207         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
208                 scan = NEXTOPER(scan);
209
210                 first = scan;
211                 while ((OP(first) == OPEN && (sawopen = 1)) ||
212                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
213                     (OP(first) == PLUS) ||
214                     (OP(first) == CURLY && ARG1(first) > 0) ) {
215                         if (OP(first) == PLUS)
216                             sawplus = 1;
217                         else
218                             first += regarglen[OP(first)];
219                         first = NEXTOPER(first);
220                 }
221
222                 /* Starting-point info. */
223             again:
224                 if (OP(first) == EXACTLY) {
225                         r->regstart =
226                             str_make(OPERAND(first)+1,*OPERAND(first));
227                         if (r->regstart->str_cur > !(sawstudy|fold))
228                                 fbmcompile(r->regstart,fold);
229                 }
230                 else if ((exp = index(simple,OP(first))) && exp > simple)
231                         r->regstclass = first;
232                 else if (OP(first) == BOUND || OP(first) == NBOUND)
233                         r->regstclass = first;
234                 else if (OP(first) == BOL ||
235                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
236                         r->reganch = ROPT_ANCH; /* kinda turn .* into ^.* */
237                         first = NEXTOPER(first);
238                         goto again;
239                 }
240                 if (sawplus && (!sawopen || !regsawback))
241                     r->reganch |= ROPT_SKIP;    /* x+ must match 1st of run */
242
243 #ifdef DEBUGGING
244                 if (debug & 512)
245                     fprintf(stderr,"first %d next %d offset %d\n",
246                       OP(first), OP(NEXTOPER(first)), first - scan);
247 #endif
248                 /*
249                  * If there's something expensive in the r.e., find the
250                  * longest literal string that must appear and make it the
251                  * regmust.  Resolve ties in favor of later strings, since
252                  * the regstart check works with the beginning of the r.e.
253                  * and avoiding duplication strengthens checking.  Not a
254                  * strong reason, but sufficient in the absence of others.
255                  * [Now we resolve ties in favor of the earlier string if
256                  * it happens that curback has been invalidated, since the
257                  * earlier string may buy us something the later one won't.]
258                  */
259                 longish = str_make("",0);
260                 longest = str_make("",0);
261                 len = 0;
262                 curback = 0;
263                 backish = 0;
264                 backest = 0;
265                 while (OP(scan) != END) {
266                         if (OP(scan) == BRANCH) {
267                             if (OP(regnext(scan)) == BRANCH) {
268                                 curback = -30000;
269                                 while (OP(scan) == BRANCH)
270                                     scan = regnext(scan);
271                             }
272                             else        /* single branch is ok */
273                                 scan = NEXTOPER(scan);
274                         }
275                         if (OP(scan) == EXACTLY) {
276                             char *t;
277
278                             first = scan;
279                             while (OP(t = regnext(scan)) == CLOSE)
280                                 scan = t;
281                             if (curback - backish == len) {
282                                 str_ncat(longish, OPERAND(first)+1,
283                                     *OPERAND(first));
284                                 len += *OPERAND(first);
285                                 curback += *OPERAND(first);
286                                 first = regnext(scan);
287                             }
288                             else if (*OPERAND(first) >= len + (curback >= 0)) {
289                                 len = *OPERAND(first);
290                                 str_nset(longish, OPERAND(first)+1,len);
291                                 backish = curback;
292                                 curback += len;
293                                 first = regnext(scan);
294                             }
295                             else
296                                 curback += *OPERAND(first);
297                         }
298                         else if (index(varies,OP(scan))) {
299                             curback = -30000;
300                             len = 0;
301                             if (longish->str_cur > longest->str_cur) {
302                                 str_sset(longest,longish);
303                                 backest = backish;
304                             }
305                             str_nset(longish,"",0);
306                         }
307                         else if (index(simple,OP(scan))) {
308                             curback++;
309                             len = 0;
310                             if (longish->str_cur > longest->str_cur) {
311                                 str_sset(longest,longish);
312                                 backest = backish;
313                             }
314                             str_nset(longish,"",0);
315                         }
316                         scan = regnext(scan);
317                 }
318
319                 /* Prefer earlier on tie, unless we can tail match latter */
320
321                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
322                     str_sset(longest,longish);
323                     backest = backish;
324                 }
325                 else
326                     str_nset(longish,"",0);
327                 if (longest->str_cur
328                     &&
329                     (!r->regstart
330                      ||
331                      !fbminstr(r->regstart->str_ptr,
332                           r->regstart->str_ptr + r->regstart->str_cur,
333                           longest)
334                     )
335                    )
336                 {
337                         r->regmust = longest;
338                         if (backest < 0)
339                                 backest = -1;
340                         r->regback = backest;
341                         if (longest->str_cur
342                           > !(sawstudy || fold || OP(first) == EOL) )
343                                 fbmcompile(r->regmust,fold);
344                         r->regmust->str_u.str_useful = 100;
345                         if (OP(first) == EOL && longish->str_cur)
346                             r->regmust->str_pok |= SP_TAIL;
347                 }
348                 else {
349                         str_free(longest);
350                         longest = Nullstr;
351                 }
352                 str_free(longish);
353         }
354
355         r->do_folding = fold;
356         r->nparens = regnpar - 1;
357         New(1002, r->startp, regnpar, char*);
358         New(1002, r->endp, regnpar, char*);
359 #ifdef DEBUGGING
360         if (debug & 512)
361                 regdump(r);
362 #endif
363         return(r);
364 }
365
366 /*
367  - reg - regular expression, i.e. main body or parenthesized thing
368  *
369  * Caller must absorb opening parenthesis.
370  *
371  * Combining parenthesis handling with the base level of regular expression
372  * is a trifle forced, but the need to tie the tails of the branches to what
373  * follows makes it hard to avoid.
374  */
375 static char *
376 reg(paren, flagp)
377 int paren;                      /* Parenthesized? */
378 int *flagp;
379 {
380         register char *ret;
381         register char *br;
382         register char *ender;
383         register int parno;
384         int flags;
385
386         *flagp = HASWIDTH;      /* Tentatively. */
387
388         /* Make an OPEN node, if parenthesized. */
389         if (paren) {
390                 parno = regnpar;
391                 regnpar++;
392                 ret = reganode(OPEN, parno);
393         } else
394                 ret = NULL;
395
396         /* Pick up the branches, linking them together. */
397         br = regbranch(&flags);
398         if (br == NULL)
399                 return(NULL);
400         if (ret != NULL)
401                 regtail(ret, br);       /* OPEN -> first. */
402         else
403                 ret = br;
404         if (!(flags&HASWIDTH))
405                 *flagp &= ~HASWIDTH;
406         *flagp |= flags&SPSTART;
407         while (*regparse == '|') {
408                 regparse++;
409                 br = regbranch(&flags);
410                 if (br == NULL)
411                         return(NULL);
412                 regtail(ret, br);       /* BRANCH -> BRANCH. */
413                 if (!(flags&HASWIDTH))
414                         *flagp &= ~HASWIDTH;
415                 *flagp |= flags&SPSTART;
416         }
417
418         /* Make a closing node, and hook it on the end. */
419         if (paren)
420             ender = reganode(CLOSE, parno);
421         else
422             ender = regnode(END);
423         regtail(ret, ender);
424
425         /* Hook the tails of the branches to the closing node. */
426         for (br = ret; br != NULL; br = regnext(br))
427                 regoptail(br, ender);
428
429         /* Check for proper termination. */
430         if (paren && *regparse++ != ')') {
431                 FAIL("unmatched () in regexp");
432         } else if (!paren && regparse < regxend) {
433                 if (*regparse == ')') {
434                         FAIL("unmatched () in regexp");
435                 } else
436                         FAIL("junk on end of regexp");  /* "Can't happen". */
437                 /* NOTREACHED */
438         }
439
440         return(ret);
441 }
442
443 /*
444  - regbranch - one alternative of an | operator
445  *
446  * Implements the concatenation operator.
447  */
448 static char *
449 regbranch(flagp)
450 int *flagp;
451 {
452         register char *ret;
453         register char *chain;
454         register char *latest;
455         int flags;
456
457         *flagp = WORST;         /* Tentatively. */
458
459         ret = regnode(BRANCH);
460         chain = NULL;
461         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
462                 latest = regpiece(&flags);
463                 if (latest == NULL)
464                         return(NULL);
465                 *flagp |= flags&HASWIDTH;
466                 if (chain == NULL)      /* First piece. */
467                         *flagp |= flags&SPSTART;
468                 else
469                         regtail(chain, latest);
470                 chain = latest;
471         }
472         if (chain == NULL)      /* Loop ran zero times. */
473                 (void) regnode(NOTHING);
474
475         return(ret);
476 }
477
478 /*
479  - regpiece - something followed by possible [*+?]
480  *
481  * Note that the branching code sequences used for ? and the general cases
482  * of * and + are somewhat optimized:  they use the same NOTHING node as
483  * both the endmarker for their branch list and the body of the last branch.
484  * It might seem that this node could be dispensed with entirely, but the
485  * endmarker role is not redundant.
486  */
487 static char *
488 regpiece(flagp)
489 int *flagp;
490 {
491         register char *ret;
492         register char op;
493         register char *next;
494         int flags;
495         char *origparse = regparse;
496         int orignpar = regnpar;
497         char *max;
498         int iter;
499         char ch;
500
501         ret = regatom(&flags);
502         if (ret == NULL)
503                 return(NULL);
504
505         op = *regparse;
506
507         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
508          * then we decrement the first number by one and reset our
509          * parsing back to the beginning of the same atom.  If the first number
510          * is down to 0, decrement the second number instead and fake up
511          * a ? after it.  Given the way this compiler doesn't keep track
512          * of offsets on the first pass, this is the only way to replicate
513          * a piece of code.  Sigh.
514          */
515         if (op == '{' && regcurly(regparse)) {
516             next = regparse + 1;
517             max = Nullch;
518             while (isdigit(*next) || *next == ',') {
519                 if (*next == ',') {
520                     if (max)
521                         break;
522                     else
523                         max = next;
524                 }
525                 next++;
526             }
527             if (*next == '}') {         /* got one */
528                 if (!max)
529                     max = next;
530                 regparse++;
531                 iter = atoi(regparse);
532                 if (flags&SIMPLE) {     /* we can do it right after all */
533                     int tmp;
534
535                     reginsert(CURLY, ret);
536                     if (iter > 0)
537                         *flagp = (WORST|HASWIDTH);
538                     if (*max == ',')
539                         max++;
540                     else
541                         max = regparse;
542                     tmp = atoi(max);
543                     if (tmp && tmp < iter)
544                         fatal("Can't do {n,m} with n > m");
545                     if (regcode != &regdummy) {
546 #ifdef REGALIGN
547                         *(unsigned short *)(ret+3) = iter;
548                         *(unsigned short *)(ret+5) = tmp;
549 #else
550                         ret[3] = iter >> 8; ret[4] = iter & 0377;
551                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
552 #endif
553                     }
554                     regparse = next;
555                     goto nest_check;
556                 }
557                 regsawbracket++;        /* remember we clobbered exp */
558                 if (iter > 0) {
559                     ch = *max;
560                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
561                     *max = ch;
562                     if (*max == ',' && max[1] != '}') {
563                         if (atoi(max+1) <= 0)
564                             fatal("Can't do {n,m} with n > m");
565                         ch = *next;
566                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
567                         *next = ch;
568                     }
569                     if (iter != 1 || *max == ',') {
570                         regparse = origparse;   /* back up input pointer */
571                         regnpar = orignpar;     /* don't make more parens */
572                     }
573                     else {
574                         regparse = next;
575                         goto nest_check;
576                     }
577                     *flagp = flags;
578                     return ret;
579                 }
580                 if (*max == ',') {
581                     max++;
582                     iter = atoi(max);
583                     if (max == next) {          /* any number more? */
584                         regparse = next;
585                         op = '*';               /* fake up one with a star */
586                     }
587                     else if (iter > 0) {
588                         op = '?';               /* fake up optional atom */
589                         ch = *next;
590                         sprintf(max,"%.*d", next-max, iter - 1);
591                         *next = ch;
592                         if (iter == 1)
593                             regparse = next;
594                         else {
595                             regparse = origparse - 1; /* offset ++ below */
596                             regnpar = orignpar;
597                         }
598                     }
599                     else
600                         fatal("Can't do {n,0}");
601                 }
602                 else
603                     fatal("Can't do {0}");
604             }
605         }
606
607         if (!ISMULT1(op)) {
608                 *flagp = flags;
609                 return(ret);
610         }
611
612         if (!(flags&HASWIDTH) && op != '?')
613                 FAIL("regexp *+ operand could be empty");
614         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
615
616         if (op == '*' && (flags&SIMPLE))
617                 reginsert(STAR, ret);
618         else if (op == '*') {
619                 /* Emit x* as (x&|), where & means "self". */
620                 reginsert(BRANCH, ret);                 /* Either x */
621                 regoptail(ret, regnode(BACK));          /* and loop */
622                 regoptail(ret, ret);                    /* back */
623                 regtail(ret, regnode(BRANCH));          /* or */
624                 regtail(ret, regnode(NOTHING));         /* null. */
625         } else if (op == '+' && (flags&SIMPLE))
626                 reginsert(PLUS, ret);
627         else if (op == '+') {
628                 /* Emit x+ as x(&|), where & means "self". */
629                 next = regnode(BRANCH);                 /* Either */
630                 regtail(ret, next);
631                 regtail(regnode(BACK), ret);            /* loop back */
632                 regtail(next, regnode(BRANCH));         /* or */
633                 regtail(ret, regnode(NOTHING));         /* null. */
634         } else if (op == '?') {
635                 /* Emit x? as (x|) */
636                 reginsert(BRANCH, ret);                 /* Either x */
637                 regtail(ret, regnode(BRANCH));          /* or */
638                 next = regnode(NOTHING);                /* null. */
639                 regtail(ret, next);
640                 regoptail(ret, next);
641         }
642       nest_check:
643         regparse++;
644         if (ISMULT2(regparse))
645                 FAIL("nested *?+ in regexp");
646
647         return(ret);
648 }
649
650 /*
651  - regatom - the lowest level
652  *
653  * Optimization:  gobbles an entire sequence of ordinary characters so that
654  * it can turn them into a single node, which is smaller to store and
655  * faster to run.  Backslashed characters are exceptions, each becoming a
656  * separate node; the code is simpler that way and it's not worth fixing.
657  *
658  * [Yes, it is worth fixing, some scripts can run twice the speed.]
659  */
660 static char *
661 regatom(flagp)
662 int *flagp;
663 {
664         register char *ret;
665         int flags;
666
667         *flagp = WORST;         /* Tentatively. */
668
669         switch (*regparse++) {
670         case '^':
671                 ret = regnode(BOL);
672                 break;
673         case '$':
674                 ret = regnode(EOL);
675                 break;
676         case '.':
677                 ret = regnode(ANY);
678                 *flagp |= HASWIDTH|SIMPLE;
679                 break;
680         case '[':
681                 ret = regclass();
682                 *flagp |= HASWIDTH|SIMPLE;
683                 break;
684         case '(':
685                 ret = reg(1, &flags);
686                 if (ret == NULL)
687                         return(NULL);
688                 *flagp |= flags&(HASWIDTH|SPSTART);
689                 break;
690         case '|':
691         case ')':
692                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
693                 break;
694         case '?':
695         case '+':
696         case '*':
697                 FAIL("?+* follows nothing in regexp");
698                 break;
699         case '\\':
700                 switch (*regparse) {
701                 case 'w':
702                         ret = regnode(ALNUM);
703                         *flagp |= HASWIDTH|SIMPLE;
704                         regparse++;
705                         break;
706                 case 'W':
707                         ret = regnode(NALNUM);
708                         *flagp |= HASWIDTH|SIMPLE;
709                         regparse++;
710                         break;
711                 case 'b':
712                         ret = regnode(BOUND);
713                         *flagp |= SIMPLE;
714                         regparse++;
715                         break;
716                 case 'B':
717                         ret = regnode(NBOUND);
718                         *flagp |= SIMPLE;
719                         regparse++;
720                         break;
721                 case 's':
722                         ret = regnode(SPACE);
723                         *flagp |= HASWIDTH|SIMPLE;
724                         regparse++;
725                         break;
726                 case 'S':
727                         ret = regnode(NSPACE);
728                         *flagp |= HASWIDTH|SIMPLE;
729                         regparse++;
730                         break;
731                 case 'd':
732                         ret = regnode(DIGIT);
733                         *flagp |= HASWIDTH|SIMPLE;
734                         regparse++;
735                         break;
736                 case 'D':
737                         ret = regnode(NDIGIT);
738                         *flagp |= HASWIDTH|SIMPLE;
739                         regparse++;
740                         break;
741                 case 'n':
742                 case 'r':
743                 case 't':
744                 case 'f':
745                 case 'e':
746                 case 'a':
747                 case 'x':
748                 case 'c':
749                 case '0':
750                         goto defchar;
751                 case '1': case '2': case '3': case '4':
752                 case '5': case '6': case '7': case '8': case '9':
753                         {
754                             int num = atoi(regparse);
755
756                             if (num > 9 && num >= regnpar)
757                                 goto defchar;
758                             else {
759                                 regsawback = 1;
760                                 ret = reganode(REF, num);
761                                 while (isascii(*regparse) && isdigit(*regparse))
762                                     regparse++;
763                                 *flagp |= SIMPLE;
764                             }
765                         }
766                         break;
767                 case '\0':
768                         if (regparse >= regxend)
769                             FAIL("trailing \\ in regexp");
770                         /* FALL THROUGH */
771                 default:
772                         goto defchar;
773                 }
774                 break;
775         default: {
776                         register int len;
777                         register char ender;
778                         register char *p;
779                         char *oldp;
780                         int numlen;
781
782                     defchar:
783                         ret = regnode(EXACTLY);
784                         regc(0);                /* save spot for len */
785                         for (len=0, p=regparse-1;
786                           len < 127 && p < regxend;
787                           len++)
788                         {
789                             oldp = p;
790                             switch (*p) {
791                             case '^':
792                             case '$':
793                             case '.':
794                             case '[':
795                             case '(':
796                             case ')':
797                             case '|':
798                                 goto loopdone;
799                             case '\\':
800                                 switch (*++p) {
801                                 case 'w':
802                                 case 'W':
803                                 case 'b':
804                                 case 'B':
805                                 case 's':
806                                 case 'S':
807                                 case 'd':
808                                 case 'D':
809                                     --p;
810                                     goto loopdone;
811                                 case 'n':
812                                         ender = '\n';
813                                         p++;
814                                         break;
815                                 case 'r':
816                                         ender = '\r';
817                                         p++;
818                                         break;
819                                 case 't':
820                                         ender = '\t';
821                                         p++;
822                                         break;
823                                 case 'f':
824                                         ender = '\f';
825                                         p++;
826                                         break;
827                                 case 'e':
828                                         ender = '\033';
829                                         p++;
830                                         break;
831                                 case 'a':
832                                         ender = '\007';
833                                         p++;
834                                         break;
835                                 case 'x':
836                                     ender = scanhex(++p, 2, &numlen);
837                                     p += numlen;
838                                     break;
839                                 case 'c':
840                                     p++;
841                                     ender = *p++;
842                                     if (islower(ender))
843                                         ender = toupper(ender);
844                                     ender ^= 64;
845                                     break;
846                                 case '0': case '1': case '2': case '3':case '4':
847                                 case '5': case '6': case '7': case '8':case '9':
848                                     if (*p == '0' ||
849                                       (isdigit(p[1]) && atoi(p) >= regnpar) ) {
850                                         ender = scanoct(p, 3, &numlen);
851                                         p += numlen;
852                                     }
853                                     else {
854                                         --p;
855                                         goto loopdone;
856                                     }
857                                     break;
858                                 case '\0':
859                                     if (p >= regxend)
860                                         FAIL("trailing \\ in regexp");
861                                     /* FALL THROUGH */
862                                 default:
863                                     ender = *p++;
864                                     break;
865                                 }
866                                 break;
867                             default:
868                                 ender = *p++;
869                                 break;
870                             }
871                             if (regfold && isupper(ender))
872                                     ender = tolower(ender);
873                             if (ISMULT2(p)) { /* Back off on ?+*. */
874                                 if (len)
875                                     p = oldp;
876                                 else {
877                                     len++;
878                                     regc(ender);
879                                 }
880                                 break;
881                             }
882                             regc(ender);
883                         }
884                     loopdone:
885                         regparse = p;
886                         if (len <= 0)
887                                 FAIL("internal disaster in regexp");
888                         *flagp |= HASWIDTH;
889                         if (len == 1)
890                                 *flagp |= SIMPLE;
891                         if (regcode != &regdummy)
892                             *OPERAND(ret) = len;
893                         regc('\0');
894                 }
895                 break;
896         }
897
898         return(ret);
899 }
900
901 static void
902 regset(bits,def,c)
903 char *bits;
904 int def;
905 register int c;
906 {
907         if (regcode == &regdummy)
908             return;
909         c &= 255;
910         if (def)
911                 bits[c >> 3] &= ~(1 << (c & 7));
912         else
913                 bits[c >> 3] |=  (1 << (c & 7));
914 }
915
916 static char *
917 regclass()
918 {
919         register char *bits;
920         register int class;
921         register int lastclass;
922         register int range = 0;
923         register char *ret;
924         register int def;
925         int numlen;
926
927         ret = regnode(ANYOF);
928         if (*regparse == '^') { /* Complement of range. */
929                 regparse++;
930                 def = 0;
931         } else {
932                 def = 255;
933         }
934         bits = regcode;
935         for (class = 0; class < 32; class++)
936             regc(def);
937         if (*regparse == ']' || *regparse == '-')
938                 goto skipcond;          /* allow 1st char to be ] or - */
939         while (regparse < regxend && *regparse != ']') {
940               skipcond:
941                 class = UCHARAT(regparse++);
942                 if (class == '\\') {
943                         class = UCHARAT(regparse++);
944                         switch (class) {
945                         case 'w':
946                                 for (class = 'a'; class <= 'z'; class++)
947                                         regset(bits,def,class);
948                                 for (class = 'A'; class <= 'Z'; class++)
949                                         regset(bits,def,class);
950                                 for (class = '0'; class <= '9'; class++)
951                                         regset(bits,def,class);
952                                 regset(bits,def,'_');
953                                 lastclass = 1234;
954                                 continue;
955                         case 's':
956                                 regset(bits,def,' ');
957                                 regset(bits,def,'\t');
958                                 regset(bits,def,'\r');
959                                 regset(bits,def,'\f');
960                                 regset(bits,def,'\n');
961                                 lastclass = 1234;
962                                 continue;
963                         case 'd':
964                                 for (class = '0'; class <= '9'; class++)
965                                         regset(bits,def,class);
966                                 lastclass = 1234;
967                                 continue;
968                         case 'n':
969                                 class = '\n';
970                                 break;
971                         case 'r':
972                                 class = '\r';
973                                 break;
974                         case 't':
975                                 class = '\t';
976                                 break;
977                         case 'f':
978                                 class = '\f';
979                                 break;
980                         case 'b':
981                                 class = '\b';
982                                 break;
983                         case 'e':
984                                 class = '\033';
985                                 break;
986                         case 'a':
987                                 class = '\007';
988                                 break;
989                         case 'x':
990                                 class = scanhex(regparse, 2, &numlen);
991                                 regparse += numlen;
992                                 break;
993                         case 'c':
994                                 class = *regparse++;
995                                 if (islower(class))
996                                     class = toupper(class);
997                                 class ^= 64;
998                                 break;
999                         case '0': case '1': case '2': case '3': case '4':
1000                         case '5': case '6': case '7': case '8': case '9':
1001                                 class = scanoct(--regparse, 3, &numlen);
1002                                 regparse += numlen;
1003                                 break;
1004                         }
1005                 }
1006                 if (range) {
1007                         if (lastclass > class)
1008                                 FAIL("invalid [] range in regexp");
1009                         range = 0;
1010                 }
1011                 else {
1012                         lastclass = class;
1013                         if (*regparse == '-' && regparse+1 < regxend &&
1014                             regparse[1] != ']') {
1015                                 regparse++;
1016                                 range = 1;
1017                                 continue;       /* do it next time */
1018                         }
1019                 }
1020                 for ( ; lastclass <= class; lastclass++) {
1021                         regset(bits,def,lastclass);
1022                         if (regfold && isupper(lastclass))
1023                                 regset(bits,def,tolower(lastclass));
1024                 }
1025                 lastclass = class;
1026         }
1027         if (*regparse != ']')
1028                 FAIL("unmatched [] in regexp");
1029         regparse++;
1030         return ret;
1031 }
1032
1033 /*
1034  - regnode - emit a node
1035  */
1036 static char *                   /* Location. */
1037 regnode(op)
1038 char op;
1039 {
1040         register char *ret;
1041         register char *ptr;
1042
1043         ret = regcode;
1044         if (ret == &regdummy) {
1045 #ifdef REGALIGN
1046                 if (!(regsize & 1))
1047                         regsize++;
1048 #endif
1049                 regsize += 3;
1050                 return(ret);
1051         }
1052
1053 #ifdef REGALIGN
1054 #ifndef lint
1055         if (!((long)ret & 1))
1056             *ret++ = 127;
1057 #endif
1058 #endif
1059         ptr = ret;
1060         *ptr++ = op;
1061         *ptr++ = '\0';          /* Null "next" pointer. */
1062         *ptr++ = '\0';
1063         regcode = ptr;
1064
1065         return(ret);
1066 }
1067
1068 /*
1069  - reganode - emit a node with an argument
1070  */
1071 static char *                   /* Location. */
1072 reganode(op, arg)
1073 char op;
1074 unsigned short arg;
1075 {
1076         register char *ret;
1077         register char *ptr;
1078
1079         ret = regcode;
1080         if (ret == &regdummy) {
1081 #ifdef REGALIGN
1082                 if (!(regsize & 1))
1083                         regsize++;
1084 #endif
1085                 regsize += 5;
1086                 return(ret);
1087         }
1088
1089 #ifdef REGALIGN
1090 #ifndef lint
1091         if (!((long)ret & 1))
1092             *ret++ = 127;
1093 #endif
1094 #endif
1095         ptr = ret;
1096         *ptr++ = op;
1097         *ptr++ = '\0';          /* Null "next" pointer. */
1098         *ptr++ = '\0';
1099 #ifdef REGALIGN
1100         *(unsigned short *)(ret+3) = arg;
1101 #else
1102         ret[3] = arg >> 8; ret[4] = arg & 0377;
1103 #endif
1104         ptr += 2;
1105         regcode = ptr;
1106
1107         return(ret);
1108 }
1109
1110 /*
1111  - regc - emit (if appropriate) a byte of code
1112  */
1113 static void
1114 regc(b)
1115 char b;
1116 {
1117         if (regcode != &regdummy)
1118                 *regcode++ = b;
1119         else
1120                 regsize++;
1121 }
1122
1123 /*
1124  - reginsert - insert an operator in front of already-emitted operand
1125  *
1126  * Means relocating the operand.
1127  */
1128 static void
1129 reginsert(op, opnd)
1130 char op;
1131 char *opnd;
1132 {
1133         register char *src;
1134         register char *dst;
1135         register char *place;
1136         register offset = (op == CURLY ? 4 : 0);
1137
1138         if (regcode == &regdummy) {
1139 #ifdef REGALIGN
1140                 regsize += 4 + offset;
1141 #else
1142                 regsize += 3 + offset;
1143 #endif
1144                 return;
1145         }
1146
1147         src = regcode;
1148 #ifdef REGALIGN
1149         regcode += 4 + offset;
1150 #else
1151         regcode += 3 + offset;
1152 #endif
1153         dst = regcode;
1154         while (src > opnd)
1155                 *--dst = *--src;
1156
1157         place = opnd;           /* Op node, where operand used to be. */
1158         *place++ = op;
1159         *place++ = '\0';
1160         *place++ = '\0';
1161         while (offset-- > 0)
1162             *place++ = '\0';
1163 }
1164
1165 /*
1166  - regtail - set the next-pointer at the end of a node chain
1167  */
1168 static void
1169 regtail(p, val)
1170 char *p;
1171 char *val;
1172 {
1173         register char *scan;
1174         register char *temp;
1175         register int offset;
1176
1177         if (p == &regdummy)
1178                 return;
1179
1180         /* Find last node. */
1181         scan = p;
1182         for (;;) {
1183                 temp = regnext(scan);
1184                 if (temp == NULL)
1185                         break;
1186                 scan = temp;
1187         }
1188
1189 #ifdef REGALIGN
1190         offset = val - scan;
1191 #ifndef lint
1192         *(short*)(scan+1) = offset;
1193 #else
1194         offset = offset;
1195 #endif
1196 #else
1197         if (OP(scan) == BACK)
1198                 offset = scan - val;
1199         else
1200                 offset = val - scan;
1201         *(scan+1) = (offset>>8)&0377;
1202         *(scan+2) = offset&0377;
1203 #endif
1204 }
1205
1206 /*
1207  - regoptail - regtail on operand of first argument; nop if operandless
1208  */
1209 static void
1210 regoptail(p, val)
1211 char *p;
1212 char *val;
1213 {
1214         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1215         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1216                 return;
1217         regtail(NEXTOPER(p), val);
1218 }
1219
1220 /*
1221  - regcurly - a little FSA that accepts {\d+,?\d*}
1222  */
1223 STATIC int
1224 regcurly(s)
1225 register char *s;
1226 {
1227     if (*s++ != '{')
1228         return FALSE;
1229     if (!isdigit(*s))
1230         return FALSE;
1231     while (isdigit(*s))
1232         s++;
1233     if (*s == ',')
1234         s++;
1235     while (isdigit(*s))
1236         s++;
1237     if (*s != '}')
1238         return FALSE;
1239     return TRUE;
1240 }
1241
1242 #ifdef DEBUGGING
1243
1244 /*
1245  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1246  */
1247 void
1248 regdump(r)
1249 regexp *r;
1250 {
1251         register char *s;
1252         register char op = EXACTLY;     /* Arbitrary non-END op. */
1253         register char *next;
1254
1255
1256         s = r->program + 1;
1257         while (op != END) {     /* While that wasn't END last time... */
1258 #ifdef REGALIGN
1259                 if (!((long)s & 1))
1260                         s++;
1261 #endif
1262                 op = OP(s);
1263                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1264                 next = regnext(s);
1265                 s += regarglen[op];
1266                 if (next == NULL)               /* Next ptr. */
1267                         fprintf(stderr,"(0)");
1268                 else 
1269                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1270                 s += 3;
1271                 if (op == ANYOF) {
1272                         s += 32;
1273                 }
1274                 if (op == EXACTLY) {
1275                         /* Literal string, where present. */
1276                         s++;
1277                         while (*s != '\0') {
1278                                 (void)putchar(*s);
1279                                 s++;
1280                         }
1281                         s++;
1282                 }
1283                 (void)putchar('\n');
1284         }
1285
1286         /* Header fields of interest. */
1287         if (r->regstart)
1288                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1289         if (r->regstclass)
1290                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1291         if (r->reganch & ROPT_ANCH)
1292                 fprintf(stderr,"anchored ");
1293         if (r->reganch & ROPT_SKIP)
1294                 fprintf(stderr,"plus ");
1295         if (r->regmust != NULL)
1296                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1297                   r->regback);
1298         fprintf(stderr,"\n");
1299 }
1300
1301 /*
1302  - regprop - printable representation of opcode
1303  */
1304 char *
1305 regprop(op)
1306 char *op;
1307 {
1308         register char *p;
1309
1310         (void) strcpy(buf, ":");
1311
1312         switch (OP(op)) {
1313         case BOL:
1314                 p = "BOL";
1315                 break;
1316         case EOL:
1317                 p = "EOL";
1318                 break;
1319         case ANY:
1320                 p = "ANY";
1321                 break;
1322         case ANYOF:
1323                 p = "ANYOF";
1324                 break;
1325         case BRANCH:
1326                 p = "BRANCH";
1327                 break;
1328         case EXACTLY:
1329                 p = "EXACTLY";
1330                 break;
1331         case NOTHING:
1332                 p = "NOTHING";
1333                 break;
1334         case BACK:
1335                 p = "BACK";
1336                 break;
1337         case END:
1338                 p = "END";
1339                 break;
1340         case ALNUM:
1341                 p = "ALNUM";
1342                 break;
1343         case NALNUM:
1344                 p = "NALNUM";
1345                 break;
1346         case BOUND:
1347                 p = "BOUND";
1348                 break;
1349         case NBOUND:
1350                 p = "NBOUND";
1351                 break;
1352         case SPACE:
1353                 p = "SPACE";
1354                 break;
1355         case NSPACE:
1356                 p = "NSPACE";
1357                 break;
1358         case DIGIT:
1359                 p = "DIGIT";
1360                 break;
1361         case NDIGIT:
1362                 p = "NDIGIT";
1363                 break;
1364         case CURLY:
1365                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1366                     ARG1(op),ARG2(op));
1367                 p = NULL;
1368                 break;
1369         case REF:
1370                 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1371                 p = NULL;
1372                 break;
1373         case OPEN:
1374                 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1375                 p = NULL;
1376                 break;
1377         case CLOSE:
1378                 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1379                 p = NULL;
1380                 break;
1381         case STAR:
1382                 p = "STAR";
1383                 break;
1384         case PLUS:
1385                 p = "PLUS";
1386                 break;
1387         default:
1388                 FAIL("corrupted regexp opcode");
1389         }
1390         if (p != NULL)
1391                 (void) strcat(buf, p);
1392         return(buf);
1393 }
1394 #endif /* DEBUGGING */
1395
1396 regfree(r)
1397 struct regexp *r;
1398 {
1399         if (r->precomp) {
1400                 Safefree(r->precomp);
1401                 r->precomp = Nullch;
1402         }
1403         if (r->subbase) {
1404                 Safefree(r->subbase);
1405                 r->subbase = Nullch;
1406         }
1407         if (r->regmust) {
1408                 str_free(r->regmust);
1409                 r->regmust = Nullstr;
1410         }
1411         if (r->regstart) {
1412                 str_free(r->regstart);
1413                 r->regstart = Nullstr;
1414         }
1415         Safefree(r->startp);
1416         Safefree(r->endp);
1417         Safefree(r);
1418 }