perl 4.0 patch 31: patch #20, continued
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $RCSfile: regcomp.c,v $$Revision: 4.0.1.5 $$Date: 92/06/08 15:23:36 $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 4.0.1.5  92/06/08  15:23:36  lwall
14  * patch20: Perl now distinguishes overlapped copies from non-overlapped
15  * patch20: /^stuff/ wrongly assumed an implicit $* == 1
16  * patch20: /x{0}/ was wrongly interpreted as /x{0,}/
17  * patch20: added \W, \S and \D inside /[...]/
18  * 
19  * Revision 4.0.1.4  91/11/05  22:55:14  lwall
20  * patch11: Erratum
21  * 
22  * Revision 4.0.1.3  91/11/05  18:22:28  lwall
23  * patch11: minimum match length calculation in regexp is now cumulative
24  * patch11: initial .* in pattern had dependency on value of $*
25  * patch11: certain patterns made use of garbage pointers from uncleared memory
26  * patch11: prepared for ctype implementations that don't define isascii()
27  * 
28  * Revision 4.0.1.2  91/06/07  11:48:24  lwall
29  * patch4: new copyright notice
30  * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
31  * patch4: // wouldn't use previous pattern if it started with a null character
32  * 
33  * Revision 4.0.1.1  91/04/12  09:04:45  lwall
34  * patch1: random cleanup in cpp namespace
35  * 
36  * Revision 4.0  91/03/20  01:39:01  lwall
37  * 4.0 baseline.
38  * 
39  */
40 /*SUPPRESS 112*/
41 /*
42  * regcomp and regexec -- regsub and regerror are not used in perl
43  *
44  *      Copyright (c) 1986 by University of Toronto.
45  *      Written by Henry Spencer.  Not derived from licensed software.
46  *
47  *      Permission is granted to anyone to use this software for any
48  *      purpose on any computer system, and to redistribute it freely,
49  *      subject to the following restrictions:
50  *
51  *      1. The author is not responsible for the consequences of use of
52  *              this software, no matter how awful, even if they arise
53  *              from defects in it.
54  *
55  *      2. The origin of this software must not be misrepresented, either
56  *              by explicit claim or by omission.
57  *
58  *      3. Altered versions must be plainly marked as such, and must not
59  *              be misrepresented as being the original software.
60  *
61  *
62  ****    Alterations to Henry's code are...
63  ****
64  ****    Copyright (c) 1991, Larry Wall
65  ****
66  ****    You may distribute under the terms of either the GNU General Public
67  ****    License or the Artistic License, as specified in the README file.
68
69  *
70  * Beware that some of this code is subtly aware of the way operator
71  * precedence is structured in regular expressions.  Serious changes in
72  * regular-expression syntax might require a total rethink.
73  */
74 #include "EXTERN.h"
75 #include "perl.h"
76 #include "INTERN.h"
77 #include "regcomp.h"
78
79 #ifdef MSDOS
80 # if defined(BUGGY_MSC6)
81  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
82  # pragma optimize("a",off)
83  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
84  # pragma optimize("w",on )
85 # endif /* BUGGY_MSC6 */
86 #endif /* MSDOS */
87
88 #ifndef STATIC
89 #define STATIC  static
90 #endif
91
92 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
93 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
94         ((*s) == '{' && regcurly(s)))
95 #ifdef atarist
96 #define PERL_META       "^$.[()|?+*\\"
97 #else
98 #define META    "^$.[()|?+*\\"
99 #endif
100
101 #ifdef SPSTART
102 #undef SPSTART          /* dratted cpp namespace... */
103 #endif
104 /*
105  * Flags to be passed up and down.
106  */
107 #define HASWIDTH        01      /* Known never to match null string. */
108 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
109 #define SPSTART         04      /* Starts with * or +. */
110 #define WORST           0       /* Worst case. */
111
112 /*
113  * Global work variables for regcomp().
114  */
115 static char *regprecomp;                /* uncompiled string. */
116 static char *regparse;          /* Input-scan pointer. */
117 static char *regxend;           /* End of input for compile */
118 static int regnpar;             /* () count. */
119 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
120 static long regsize;            /* Code size. */
121 static int regfold;
122 static int regsawbracket;       /* Did we do {d,d} trick? */
123 static int regsawback;          /* Did we see \1, ...? */
124
125 /*
126  * Forward declarations for regcomp()'s friends.
127  */
128 STATIC int regcurly();
129 STATIC char *reg();
130 STATIC char *regbranch();
131 STATIC char *regpiece();
132 STATIC char *regatom();
133 STATIC char *regclass();
134 STATIC char *regnode();
135 STATIC char *reganode();
136 STATIC void regc();
137 STATIC void reginsert();
138 STATIC void regtail();
139 STATIC void regoptail();
140
141 /*
142  - regcomp - compile a regular expression into internal code
143  *
144  * We can't allocate space until we know how big the compiled form will be,
145  * but we can't compile it (and thus know how big it is) until we've got a
146  * place to put the code.  So we cheat:  we compile it twice, once with code
147  * generation turned off and size counting turned on, and once "for real".
148  * This also means that we don't allocate space until we are sure that the
149  * thing really will compile successfully, and we never have to move the
150  * code and thus invalidate pointers into it.  (Note that it has to be in
151  * one piece because free() must be able to free it all.) [NB: not true in perl]
152  *
153  * Beware that the optimization-preparation code in here knows about some
154  * of the structure of the compiled regexp.  [I'll say.]
155  */
156 regexp *
157 regcomp(exp,xend,fold)
158 char *exp;
159 char *xend;
160 int fold;
161 {
162         register regexp *r;
163         register char *scan;
164         register STR *longish;
165         STR *longest;
166         register int len;
167         register char *first;
168         int flags;
169         int backish;
170         int backest;
171         int curback;
172         int minlen;
173         int sawplus = 0;
174         int sawopen = 0;
175
176         if (exp == NULL)
177                 fatal("NULL regexp argument");
178
179         /* First pass: determine size, legality. */
180         regfold = fold;
181         regparse = exp;
182         regxend = xend;
183         regprecomp = nsavestr(exp,xend-exp);
184         regsawbracket = 0;
185         regsawback = 0;
186         regnpar = 1;
187         regsize = 0L;
188         regcode = &regdummy;
189         regc((char)MAGIC);
190         if (reg(0, &flags) == NULL) {
191                 Safefree(regprecomp);
192                 regprecomp = Nullch;
193                 return(NULL);
194         }
195
196         /* Small enough for pointer-storage convention? */
197         if (regsize >= 32767L)          /* Probably could be 65535L. */
198                 FAIL("regexp too big");
199
200         /* Allocate space. */
201         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
202         if (r == NULL)
203                 FAIL("regexp out of space");
204
205         /* Second pass: emit code. */
206         if (regsawbracket)
207             Copy(regprecomp,exp,xend-exp,char);
208         r->prelen = xend-exp;
209         r->precomp = regprecomp;
210         r->subbeg = r->subbase = NULL;
211         regparse = exp;
212         regnpar = 1;
213         regcode = r->program;
214         regc((char)MAGIC);
215         if (reg(0, &flags) == NULL)
216                 return(NULL);
217
218         /* Dig out information for optimizations. */
219         r->regstart = Nullstr;  /* Worst-case defaults. */
220         r->reganch = 0;
221         r->regmust = Nullstr;
222         r->regback = -1;
223         r->regstclass = Nullch;
224         scan = r->program+1;                    /* First BRANCH. */
225         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
226                 scan = NEXTOPER(scan);
227
228                 first = scan;
229                 while ((OP(first) == OPEN && (sawopen = 1)) ||
230                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
231                     (OP(first) == PLUS) ||
232                     (OP(first) == CURLY && ARG1(first) > 0) ) {
233                         if (OP(first) == PLUS)
234                             sawplus = 1;
235                         else
236                             first += regarglen[OP(first)];
237                         first = NEXTOPER(first);
238                 }
239
240                 /* Starting-point info. */
241             again:
242                 if (OP(first) == EXACTLY) {
243                         r->regstart =
244                             str_make(OPERAND(first)+1,*OPERAND(first));
245                         if (r->regstart->str_cur > !(sawstudy|fold))
246                                 fbmcompile(r->regstart,fold);
247                 }
248                 else if ((exp = index(simple,OP(first))) && exp > simple)
249                         r->regstclass = first;
250                 else if (OP(first) == BOUND || OP(first) == NBOUND)
251                         r->regstclass = first;
252                 else if (OP(first) == BOL) {
253                         r->reganch = ROPT_ANCH;
254                         first = NEXTOPER(first);
255                         goto again;
256                 }
257                 else if ((OP(first) == STAR && OP(NEXTOPER(first)) == ANY) &&
258                          !(r->reganch & ROPT_ANCH) ) {
259                         /* turn .* into ^.* with an implied $*=1 */
260                         r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
261                         first = NEXTOPER(first);
262                         goto again;
263                 }
264                 if (sawplus && (!sawopen || !regsawback))
265                     r->reganch |= ROPT_SKIP;    /* x+ must match 1st of run */
266
267 #ifdef DEBUGGING
268                 if (debug & 512)
269                     fprintf(stderr,"first %d next %d offset %d\n",
270                       OP(first), OP(NEXTOPER(first)), first - scan);
271 #endif
272                 /*
273                  * If there's something expensive in the r.e., find the
274                  * longest literal string that must appear and make it the
275                  * regmust.  Resolve ties in favor of later strings, since
276                  * the regstart check works with the beginning of the r.e.
277                  * and avoiding duplication strengthens checking.  Not a
278                  * strong reason, but sufficient in the absence of others.
279                  * [Now we resolve ties in favor of the earlier string if
280                  * it happens that curback has been invalidated, since the
281                  * earlier string may buy us something the later one won't.]
282                  */
283                 longish = str_make("",0);
284                 longest = str_make("",0);
285                 len = 0;
286                 minlen = 0;
287                 curback = 0;
288                 backish = 0;
289                 backest = 0;
290                 while (OP(scan) != END) {
291                         if (OP(scan) == BRANCH) {
292                             if (OP(regnext(scan)) == BRANCH) {
293                                 curback = -30000;
294                                 while (OP(scan) == BRANCH)
295                                     scan = regnext(scan);
296                             }
297                             else        /* single branch is ok */
298                                 scan = NEXTOPER(scan);
299                         }
300                         if (OP(scan) == EXACTLY) {
301                             char *t;
302
303                             first = scan;
304                             while (OP(t = regnext(scan)) == CLOSE)
305                                 scan = t;
306                             minlen += *OPERAND(first);
307                             if (curback - backish == len) {
308                                 str_ncat(longish, OPERAND(first)+1,
309                                     *OPERAND(first));
310                                 len += *OPERAND(first);
311                                 curback += *OPERAND(first);
312                                 first = regnext(scan);
313                             }
314                             else if (*OPERAND(first) >= len + (curback >= 0)) {
315                                 len = *OPERAND(first);
316                                 str_nset(longish, OPERAND(first)+1,len);
317                                 backish = curback;
318                                 curback += len;
319                                 first = regnext(scan);
320                             }
321                             else
322                                 curback += *OPERAND(first);
323                         }
324                         else if (index(varies,OP(scan))) {
325                             curback = -30000;
326                             len = 0;
327                             if (longish->str_cur > longest->str_cur) {
328                                 str_sset(longest,longish);
329                                 backest = backish;
330                             }
331                             str_nset(longish,"",0);
332                             if (OP(scan) == PLUS &&
333                               index(simple,OP(NEXTOPER(scan))))
334                                 minlen++;
335                             else if (OP(scan) == CURLY &&
336                               index(simple,OP(NEXTOPER(scan)+4)))
337                                 minlen += ARG1(scan);
338                         }
339                         else if (index(simple,OP(scan))) {
340                             curback++;
341                             minlen++;
342                             len = 0;
343                             if (longish->str_cur > longest->str_cur) {
344                                 str_sset(longest,longish);
345                                 backest = backish;
346                             }
347                             str_nset(longish,"",0);
348                         }
349                         scan = regnext(scan);
350                 }
351
352                 /* Prefer earlier on tie, unless we can tail match latter */
353
354                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
355                     str_sset(longest,longish);
356                     backest = backish;
357                 }
358                 else
359                     str_nset(longish,"",0);
360                 if (longest->str_cur
361                     &&
362                     (!r->regstart
363                      ||
364                      !fbminstr((unsigned char*) r->regstart->str_ptr,
365                           (unsigned char *) r->regstart->str_ptr
366                             + r->regstart->str_cur,
367                           longest)
368                     )
369                    )
370                 {
371                         r->regmust = longest;
372                         if (backest < 0)
373                                 backest = -1;
374                         r->regback = backest;
375                         if (longest->str_cur
376                           > !(sawstudy || fold || OP(first) == EOL) )
377                                 fbmcompile(r->regmust,fold);
378                         r->regmust->str_u.str_useful = 100;
379                         if (OP(first) == EOL && longish->str_cur)
380                             r->regmust->str_pok |= SP_TAIL;
381                 }
382                 else {
383                         str_free(longest);
384                         longest = Nullstr;
385                 }
386                 str_free(longish);
387         }
388
389         r->do_folding = fold;
390         r->nparens = regnpar - 1;
391         r->minlen = minlen;
392         Newz(1002, r->startp, regnpar, char*);
393         Newz(1002, r->endp, regnpar, char*);
394 #ifdef DEBUGGING
395         if (debug & 512)
396                 regdump(r);
397 #endif
398         return(r);
399 }
400
401 /*
402  - reg - regular expression, i.e. main body or parenthesized thing
403  *
404  * Caller must absorb opening parenthesis.
405  *
406  * Combining parenthesis handling with the base level of regular expression
407  * is a trifle forced, but the need to tie the tails of the branches to what
408  * follows makes it hard to avoid.
409  */
410 static char *
411 reg(paren, flagp)
412 int paren;                      /* Parenthesized? */
413 int *flagp;
414 {
415         register char *ret;
416         register char *br;
417         register char *ender;
418         register int parno;
419         int flags;
420
421         *flagp = HASWIDTH;      /* Tentatively. */
422
423         /* Make an OPEN node, if parenthesized. */
424         if (paren) {
425                 parno = regnpar;
426                 regnpar++;
427                 ret = reganode(OPEN, parno);
428         } else
429                 ret = NULL;
430
431         /* Pick up the branches, linking them together. */
432         br = regbranch(&flags);
433         if (br == NULL)
434                 return(NULL);
435         if (ret != NULL)
436                 regtail(ret, br);       /* OPEN -> first. */
437         else
438                 ret = br;
439         if (!(flags&HASWIDTH))
440                 *flagp &= ~HASWIDTH;
441         *flagp |= flags&SPSTART;
442         while (*regparse == '|') {
443                 regparse++;
444                 br = regbranch(&flags);
445                 if (br == NULL)
446                         return(NULL);
447                 regtail(ret, br);       /* BRANCH -> BRANCH. */
448                 if (!(flags&HASWIDTH))
449                         *flagp &= ~HASWIDTH;
450                 *flagp |= flags&SPSTART;
451         }
452
453         /* Make a closing node, and hook it on the end. */
454         if (paren)
455             ender = reganode(CLOSE, parno);
456         else
457             ender = regnode(END);
458         regtail(ret, ender);
459
460         /* Hook the tails of the branches to the closing node. */
461         for (br = ret; br != NULL; br = regnext(br))
462                 regoptail(br, ender);
463
464         /* Check for proper termination. */
465         if (paren && *regparse++ != ')') {
466                 FAIL("unmatched () in regexp");
467         } else if (!paren && regparse < regxend) {
468                 if (*regparse == ')') {
469                         FAIL("unmatched () in regexp");
470                 } else
471                         FAIL("junk on end of regexp");  /* "Can't happen". */
472                 /* NOTREACHED */
473         }
474
475         return(ret);
476 }
477
478 /*
479  - regbranch - one alternative of an | operator
480  *
481  * Implements the concatenation operator.
482  */
483 static char *
484 regbranch(flagp)
485 int *flagp;
486 {
487         register char *ret;
488         register char *chain;
489         register char *latest;
490         int flags;
491
492         *flagp = WORST;         /* Tentatively. */
493
494         ret = regnode(BRANCH);
495         chain = NULL;
496         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
497                 latest = regpiece(&flags);
498                 if (latest == NULL)
499                         return(NULL);
500                 *flagp |= flags&HASWIDTH;
501                 if (chain == NULL)      /* First piece. */
502                         *flagp |= flags&SPSTART;
503                 else
504                         regtail(chain, latest);
505                 chain = latest;
506         }
507         if (chain == NULL)      /* Loop ran zero times. */
508                 (void) regnode(NOTHING);
509
510         return(ret);
511 }
512
513 /*
514  - regpiece - something followed by possible [*+?]
515  *
516  * Note that the branching code sequences used for ? and the general cases
517  * of * and + are somewhat optimized:  they use the same NOTHING node as
518  * both the endmarker for their branch list and the body of the last branch.
519  * It might seem that this node could be dispensed with entirely, but the
520  * endmarker role is not redundant.
521  */
522 static char *
523 regpiece(flagp)
524 int *flagp;
525 {
526         register char *ret;
527         register char op;
528         register char *next;
529         int flags;
530         char *origparse = regparse;
531         int orignpar = regnpar;
532         char *max;
533         int iter;
534         char ch;
535
536         ret = regatom(&flags);
537         if (ret == NULL)
538                 return(NULL);
539
540         op = *regparse;
541
542         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
543          * then we decrement the first number by one and reset our
544          * parsing back to the beginning of the same atom.  If the first number
545          * is down to 0, decrement the second number instead and fake up
546          * a ? after it.  Given the way this compiler doesn't keep track
547          * of offsets on the first pass, this is the only way to replicate
548          * a piece of code.  Sigh.
549          */
550         if (op == '{' && regcurly(regparse)) {
551             next = regparse + 1;
552             max = Nullch;
553             while (isDIGIT(*next) || *next == ',') {
554                 if (*next == ',') {
555                     if (max)
556                         break;
557                     else
558                         max = next;
559                 }
560                 next++;
561             }
562             if (*next == '}') {         /* got one */
563                 if (!max)
564                     max = next;
565                 regparse++;
566                 iter = atoi(regparse);
567                 if (flags&SIMPLE) {     /* we can do it right after all */
568                     int tmp;
569
570                     reginsert(CURLY, ret);
571                     if (iter > 0)
572                         *flagp = (WORST|HASWIDTH);
573                     if (*max == ',')
574                         max++;
575                     else
576                         max = regparse;
577                     tmp = atoi(max);
578                     if (!tmp && *max != '0')
579                         tmp = 32767;            /* meaning "infinity" */
580                     if (tmp && tmp < iter)
581                         fatal("Can't do {n,m} with n > m");
582                     if (regcode != &regdummy) {
583 #ifdef REGALIGN
584                         *(unsigned short *)(ret+3) = iter;
585                         *(unsigned short *)(ret+5) = tmp;
586 #else
587                         ret[3] = iter >> 8; ret[4] = iter & 0377;
588                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
589 #endif
590                     }
591                     regparse = next;
592                     goto nest_check;
593                 }
594                 regsawbracket++;        /* remember we clobbered exp */
595                 if (iter > 0) {
596                     ch = *max;
597                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
598                     *max = ch;
599                     if (*max == ',' && max[1] != '}') {
600                         if (atoi(max+1) <= 0)
601                             fatal("Can't do {n,m} with n > m");
602                         ch = *next;
603                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
604                         *next = ch;
605                     }
606                     if (iter != 1 || *max == ',') {
607                         regparse = origparse;   /* back up input pointer */
608                         regnpar = orignpar;     /* don't make more parens */
609                     }
610                     else {
611                         regparse = next;
612                         goto nest_check;
613                     }
614                     *flagp = flags;
615                     return ret;
616                 }
617                 if (*max == ',') {
618                     max++;
619                     iter = atoi(max);
620                     if (max == next) {          /* any number more? */
621                         regparse = next;
622                         op = '*';               /* fake up one with a star */
623                     }
624                     else if (iter > 0) {
625                         op = '?';               /* fake up optional atom */
626                         ch = *next;
627                         sprintf(max,"%.*d", next-max, iter - 1);
628                         *next = ch;
629                         if (iter == 1)
630                             regparse = next;
631                         else {
632                             regparse = origparse - 1; /* offset ++ below */
633                             regnpar = orignpar;
634                         }
635                     }
636                     else
637                         fatal("Can't do {n,0}");
638                 }
639                 else
640                     fatal("Can't do {0}");
641             }
642         }
643
644         if (!ISMULT1(op)) {
645                 *flagp = flags;
646                 return(ret);
647         }
648
649         if (!(flags&HASWIDTH) && op != '?')
650                 FAIL("regexp *+ operand could be empty");
651         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
652
653         if (op == '*' && (flags&SIMPLE))
654                 reginsert(STAR, ret);
655         else if (op == '*') {
656                 /* Emit x* as (x&|), where & means "self". */
657                 reginsert(BRANCH, ret);                 /* Either x */
658                 regoptail(ret, regnode(BACK));          /* and loop */
659                 regoptail(ret, ret);                    /* back */
660                 regtail(ret, regnode(BRANCH));          /* or */
661                 regtail(ret, regnode(NOTHING));         /* null. */
662         } else if (op == '+' && (flags&SIMPLE))
663                 reginsert(PLUS, ret);
664         else if (op == '+') {
665                 /* Emit x+ as x(&|), where & means "self". */
666                 next = regnode(BRANCH);                 /* Either */
667                 regtail(ret, next);
668                 regtail(regnode(BACK), ret);            /* loop back */
669                 regtail(next, regnode(BRANCH));         /* or */
670                 regtail(ret, regnode(NOTHING));         /* null. */
671         } else if (op == '?') {
672                 /* Emit x? as (x|) */
673                 reginsert(BRANCH, ret);                 /* Either x */
674                 regtail(ret, regnode(BRANCH));          /* or */
675                 next = regnode(NOTHING);                /* null. */
676                 regtail(ret, next);
677                 regoptail(ret, next);
678         }
679       nest_check:
680         regparse++;
681         if (ISMULT2(regparse))
682                 FAIL("nested *?+ in regexp");
683
684         return(ret);
685 }
686
687 /*
688  - regatom - the lowest level
689  *
690  * Optimization:  gobbles an entire sequence of ordinary characters so that
691  * it can turn them into a single node, which is smaller to store and
692  * faster to run.  Backslashed characters are exceptions, each becoming a
693  * separate node; the code is simpler that way and it's not worth fixing.
694  *
695  * [Yes, it is worth fixing, some scripts can run twice the speed.]
696  */
697 static char *
698 regatom(flagp)
699 int *flagp;
700 {
701         register char *ret;
702         int flags;
703
704         *flagp = WORST;         /* Tentatively. */
705
706         switch (*regparse++) {
707         case '^':
708                 ret = regnode(BOL);
709                 break;
710         case '$':
711                 ret = regnode(EOL);
712                 break;
713         case '.':
714                 ret = regnode(ANY);
715                 *flagp |= HASWIDTH|SIMPLE;
716                 break;
717         case '[':
718                 ret = regclass();
719                 *flagp |= HASWIDTH|SIMPLE;
720                 break;
721         case '(':
722                 ret = reg(1, &flags);
723                 if (ret == NULL)
724                         return(NULL);
725                 *flagp |= flags&(HASWIDTH|SPSTART);
726                 break;
727         case '|':
728         case ')':
729                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
730                 break;
731         case '?':
732         case '+':
733         case '*':
734                 FAIL("?+* follows nothing in regexp");
735                 break;
736         case '\\':
737                 switch (*regparse) {
738                 case 'w':
739                         ret = regnode(ALNUM);
740                         *flagp |= HASWIDTH|SIMPLE;
741                         regparse++;
742                         break;
743                 case 'W':
744                         ret = regnode(NALNUM);
745                         *flagp |= HASWIDTH|SIMPLE;
746                         regparse++;
747                         break;
748                 case 'b':
749                         ret = regnode(BOUND);
750                         *flagp |= SIMPLE;
751                         regparse++;
752                         break;
753                 case 'B':
754                         ret = regnode(NBOUND);
755                         *flagp |= SIMPLE;
756                         regparse++;
757                         break;
758                 case 's':
759                         ret = regnode(SPACE);
760                         *flagp |= HASWIDTH|SIMPLE;
761                         regparse++;
762                         break;
763                 case 'S':
764                         ret = regnode(NSPACE);
765                         *flagp |= HASWIDTH|SIMPLE;
766                         regparse++;
767                         break;
768                 case 'd':
769                         ret = regnode(DIGIT);
770                         *flagp |= HASWIDTH|SIMPLE;
771                         regparse++;
772                         break;
773                 case 'D':
774                         ret = regnode(NDIGIT);
775                         *flagp |= HASWIDTH|SIMPLE;
776                         regparse++;
777                         break;
778                 case 'n':
779                 case 'r':
780                 case 't':
781                 case 'f':
782                 case 'e':
783                 case 'a':
784                 case 'x':
785                 case 'c':
786                 case '0':
787                         goto defchar;
788                 case '1': case '2': case '3': case '4':
789                 case '5': case '6': case '7': case '8': case '9':
790                         {
791                             int num = atoi(regparse);
792
793                             if (num > 9 && num >= regnpar)
794                                 goto defchar;
795                             else {
796                                 regsawback = 1;
797                                 ret = reganode(REF, num);
798                                 while (isDIGIT(*regparse))
799                                     regparse++;
800                                 *flagp |= SIMPLE;
801                             }
802                         }
803                         break;
804                 case '\0':
805                         if (regparse >= regxend)
806                             FAIL("trailing \\ in regexp");
807                         /* FALL THROUGH */
808                 default:
809                         goto defchar;
810                 }
811                 break;
812         default: {
813                         register int len;
814                         register char ender;
815                         register char *p;
816                         char *oldp;
817                         int numlen;
818
819                     defchar:
820                         ret = regnode(EXACTLY);
821                         regc(0);                /* save spot for len */
822                         for (len=0, p=regparse-1;
823                           len < 127 && p < regxend;
824                           len++)
825                         {
826                             oldp = p;
827                             switch (*p) {
828                             case '^':
829                             case '$':
830                             case '.':
831                             case '[':
832                             case '(':
833                             case ')':
834                             case '|':
835                                 goto loopdone;
836                             case '\\':
837                                 switch (*++p) {
838                                 case 'w':
839                                 case 'W':
840                                 case 'b':
841                                 case 'B':
842                                 case 's':
843                                 case 'S':
844                                 case 'd':
845                                 case 'D':
846                                     --p;
847                                     goto loopdone;
848                                 case 'n':
849                                         ender = '\n';
850                                         p++;
851                                         break;
852                                 case 'r':
853                                         ender = '\r';
854                                         p++;
855                                         break;
856                                 case 't':
857                                         ender = '\t';
858                                         p++;
859                                         break;
860                                 case 'f':
861                                         ender = '\f';
862                                         p++;
863                                         break;
864                                 case 'e':
865                                         ender = '\033';
866                                         p++;
867                                         break;
868                                 case 'a':
869                                         ender = '\007';
870                                         p++;
871                                         break;
872                                 case 'x':
873                                     ender = scanhex(++p, 2, &numlen);
874                                     p += numlen;
875                                     break;
876                                 case 'c':
877                                     p++;
878                                     ender = *p++;
879                                     if (isLOWER(ender))
880                                         ender = toupper(ender);
881                                     ender ^= 64;
882                                     break;
883                                 case '0': case '1': case '2': case '3':case '4':
884                                 case '5': case '6': case '7': case '8':case '9':
885                                     if (*p == '0' ||
886                                       (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
887                                         ender = scanoct(p, 3, &numlen);
888                                         p += numlen;
889                                     }
890                                     else {
891                                         --p;
892                                         goto loopdone;
893                                     }
894                                     break;
895                                 case '\0':
896                                     if (p >= regxend)
897                                         FAIL("trailing \\ in regexp");
898                                     /* FALL THROUGH */
899                                 default:
900                                     ender = *p++;
901                                     break;
902                                 }
903                                 break;
904                             default:
905                                 ender = *p++;
906                                 break;
907                             }
908                             if (regfold && isUPPER(ender))
909                                     ender = tolower(ender);
910                             if (ISMULT2(p)) { /* Back off on ?+*. */
911                                 if (len)
912                                     p = oldp;
913                                 else {
914                                     len++;
915                                     regc(ender);
916                                 }
917                                 break;
918                             }
919                             regc(ender);
920                         }
921                     loopdone:
922                         regparse = p;
923                         if (len <= 0)
924                                 FAIL("internal disaster in regexp");
925                         *flagp |= HASWIDTH;
926                         if (len == 1)
927                                 *flagp |= SIMPLE;
928                         if (regcode != &regdummy)
929                             *OPERAND(ret) = len;
930                         regc('\0');
931                 }
932                 break;
933         }
934
935         return(ret);
936 }
937
938 static void
939 regset(bits,def,c)
940 char *bits;
941 int def;
942 register int c;
943 {
944         if (regcode == &regdummy)
945             return;
946         c &= 255;
947         if (def)
948                 bits[c >> 3] &= ~(1 << (c & 7));
949         else
950                 bits[c >> 3] |=  (1 << (c & 7));
951 }
952
953 static char *
954 regclass()
955 {
956         register char *bits;
957         register int class;
958         register int lastclass;
959         register int range = 0;
960         register char *ret;
961         register int def;
962         int numlen;
963
964         ret = regnode(ANYOF);
965         if (*regparse == '^') { /* Complement of range. */
966                 regparse++;
967                 def = 0;
968         } else {
969                 def = 255;
970         }
971         bits = regcode;
972         for (class = 0; class < 32; class++)
973             regc(def);
974         if (*regparse == ']' || *regparse == '-')
975                 goto skipcond;          /* allow 1st char to be ] or - */
976         while (regparse < regxend && *regparse != ']') {
977               skipcond:
978                 class = UCHARAT(regparse++);
979                 if (class == '\\') {
980                         class = UCHARAT(regparse++);
981                         switch (class) {
982                         case 'w':
983                                 for (class = 0; class < 256; class++)
984                                     if (isALNUM(class))
985                                         regset(bits,def,class);
986                                 lastclass = 1234;
987                                 continue;
988                         case 'W':
989                                 for (class = 0; class < 256; class++)
990                                     if (!isALNUM(class))
991                                         regset(bits,def,class);
992                                 lastclass = 1234;
993                                 continue;
994                         case 's':
995                                 for (class = 0; class < 256; class++)
996                                     if (isSPACE(class))
997                                         regset(bits,def,class);
998                                 lastclass = 1234;
999                                 continue;
1000                         case 'S':
1001                                 for (class = 0; class < 256; class++)
1002                                     if (!isSPACE(class))
1003                                         regset(bits,def,class);
1004                                 lastclass = 1234;
1005                                 continue;
1006                         case 'd':
1007                                 for (class = '0'; class <= '9'; class++)
1008                                         regset(bits,def,class);
1009                                 lastclass = 1234;
1010                                 continue;
1011                         case 'D':
1012                                 for (class = 0; class < '0'; class++)
1013                                         regset(bits,def,class);
1014                                 for (class = '9' + 1; class < 256; class++)
1015                                         regset(bits,def,class);
1016                                 lastclass = 1234;
1017                                 continue;
1018                         case 'n':
1019                                 class = '\n';
1020                                 break;
1021                         case 'r':
1022                                 class = '\r';
1023                                 break;
1024                         case 't':
1025                                 class = '\t';
1026                                 break;
1027                         case 'f':
1028                                 class = '\f';
1029                                 break;
1030                         case 'b':
1031                                 class = '\b';
1032                                 break;
1033                         case 'e':
1034                                 class = '\033';
1035                                 break;
1036                         case 'a':
1037                                 class = '\007';
1038                                 break;
1039                         case 'x':
1040                                 class = scanhex(regparse, 2, &numlen);
1041                                 regparse += numlen;
1042                                 break;
1043                         case 'c':
1044                                 class = *regparse++;
1045                                 if (isLOWER(class))
1046                                     class = toupper(class);
1047                                 class ^= 64;
1048                                 break;
1049                         case '0': case '1': case '2': case '3': case '4':
1050                         case '5': case '6': case '7': case '8': case '9':
1051                                 class = scanoct(--regparse, 3, &numlen);
1052                                 regparse += numlen;
1053                                 break;
1054                         }
1055                 }
1056                 if (range) {
1057                         if (lastclass > class)
1058                                 FAIL("invalid [] range in regexp");
1059                         range = 0;
1060                 }
1061                 else {
1062                         lastclass = class;
1063                         if (*regparse == '-' && regparse+1 < regxend &&
1064                             regparse[1] != ']') {
1065                                 regparse++;
1066                                 range = 1;
1067                                 continue;       /* do it next time */
1068                         }
1069                 }
1070                 for ( ; lastclass <= class; lastclass++) {
1071                         regset(bits,def,lastclass);
1072                         if (regfold && isUPPER(lastclass))
1073                                 regset(bits,def,tolower(lastclass));
1074                 }
1075                 lastclass = class;
1076         }
1077         if (*regparse != ']')
1078                 FAIL("unmatched [] in regexp");
1079         regparse++;
1080         return ret;
1081 }
1082
1083 /*
1084  - regnode - emit a node
1085  */
1086 static char *                   /* Location. */
1087 regnode(op)
1088 char op;
1089 {
1090         register char *ret;
1091         register char *ptr;
1092
1093         ret = regcode;
1094         if (ret == &regdummy) {
1095 #ifdef REGALIGN
1096                 if (!(regsize & 1))
1097                         regsize++;
1098 #endif
1099                 regsize += 3;
1100                 return(ret);
1101         }
1102
1103 #ifdef REGALIGN
1104 #ifndef lint
1105         if (!((long)ret & 1))
1106             *ret++ = 127;
1107 #endif
1108 #endif
1109         ptr = ret;
1110         *ptr++ = op;
1111         *ptr++ = '\0';          /* Null "next" pointer. */
1112         *ptr++ = '\0';
1113         regcode = ptr;
1114
1115         return(ret);
1116 }
1117
1118 /*
1119  - reganode - emit a node with an argument
1120  */
1121 static char *                   /* Location. */
1122 reganode(op, arg)
1123 char op;
1124 unsigned short arg;
1125 {
1126         register char *ret;
1127         register char *ptr;
1128
1129         ret = regcode;
1130         if (ret == &regdummy) {
1131 #ifdef REGALIGN
1132                 if (!(regsize & 1))
1133                         regsize++;
1134 #endif
1135                 regsize += 5;
1136                 return(ret);
1137         }
1138
1139 #ifdef REGALIGN
1140 #ifndef lint
1141         if (!((long)ret & 1))
1142             *ret++ = 127;
1143 #endif
1144 #endif
1145         ptr = ret;
1146         *ptr++ = op;
1147         *ptr++ = '\0';          /* Null "next" pointer. */
1148         *ptr++ = '\0';
1149 #ifdef REGALIGN
1150         *(unsigned short *)(ret+3) = arg;
1151 #else
1152         ret[3] = arg >> 8; ret[4] = arg & 0377;
1153 #endif
1154         ptr += 2;
1155         regcode = ptr;
1156
1157         return(ret);
1158 }
1159
1160 /*
1161  - regc - emit (if appropriate) a byte of code
1162  */
1163 static void
1164 regc(b)
1165 char b;
1166 {
1167         if (regcode != &regdummy)
1168                 *regcode++ = b;
1169         else
1170                 regsize++;
1171 }
1172
1173 /*
1174  - reginsert - insert an operator in front of already-emitted operand
1175  *
1176  * Means relocating the operand.
1177  */
1178 static void
1179 reginsert(op, opnd)
1180 char op;
1181 char *opnd;
1182 {
1183         register char *src;
1184         register char *dst;
1185         register char *place;
1186         register offset = (op == CURLY ? 4 : 0);
1187
1188         if (regcode == &regdummy) {
1189 #ifdef REGALIGN
1190                 regsize += 4 + offset;
1191 #else
1192                 regsize += 3 + offset;
1193 #endif
1194                 return;
1195         }
1196
1197         src = regcode;
1198 #ifdef REGALIGN
1199         regcode += 4 + offset;
1200 #else
1201         regcode += 3 + offset;
1202 #endif
1203         dst = regcode;
1204         while (src > opnd)
1205                 *--dst = *--src;
1206
1207         place = opnd;           /* Op node, where operand used to be. */
1208         *place++ = op;
1209         *place++ = '\0';
1210         *place++ = '\0';
1211         while (offset-- > 0)
1212             *place++ = '\0';
1213 #ifdef REGALIGN
1214         *place++ = '\177';
1215 #endif
1216 }
1217
1218 /*
1219  - regtail - set the next-pointer at the end of a node chain
1220  */
1221 static void
1222 regtail(p, val)
1223 char *p;
1224 char *val;
1225 {
1226         register char *scan;
1227         register char *temp;
1228         register int offset;
1229
1230         if (p == &regdummy)
1231                 return;
1232
1233         /* Find last node. */
1234         scan = p;
1235         for (;;) {
1236                 temp = regnext(scan);
1237                 if (temp == NULL)
1238                         break;
1239                 scan = temp;
1240         }
1241
1242 #ifdef REGALIGN
1243         offset = val - scan;
1244 #ifndef lint
1245         *(short*)(scan+1) = offset;
1246 #else
1247         offset = offset;
1248 #endif
1249 #else
1250         if (OP(scan) == BACK)
1251                 offset = scan - val;
1252         else
1253                 offset = val - scan;
1254         *(scan+1) = (offset>>8)&0377;
1255         *(scan+2) = offset&0377;
1256 #endif
1257 }
1258
1259 /*
1260  - regoptail - regtail on operand of first argument; nop if operandless
1261  */
1262 static void
1263 regoptail(p, val)
1264 char *p;
1265 char *val;
1266 {
1267         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1268         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1269                 return;
1270         regtail(NEXTOPER(p), val);
1271 }
1272
1273 /*
1274  - regcurly - a little FSA that accepts {\d+,?\d*}
1275  */
1276 STATIC int
1277 regcurly(s)
1278 register char *s;
1279 {
1280     if (*s++ != '{')
1281         return FALSE;
1282     if (!isDIGIT(*s))
1283         return FALSE;
1284     while (isDIGIT(*s))
1285         s++;
1286     if (*s == ',')
1287         s++;
1288     while (isDIGIT(*s))
1289         s++;
1290     if (*s != '}')
1291         return FALSE;
1292     return TRUE;
1293 }
1294
1295 #ifdef DEBUGGING
1296
1297 /*
1298  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1299  */
1300 void
1301 regdump(r)
1302 regexp *r;
1303 {
1304         register char *s;
1305         register char op = EXACTLY;     /* Arbitrary non-END op. */
1306         register char *next;
1307
1308
1309         s = r->program + 1;
1310         while (op != END) {     /* While that wasn't END last time... */
1311 #ifdef REGALIGN
1312                 if (!((long)s & 1))
1313                         s++;
1314 #endif
1315                 op = OP(s);
1316                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1317                 next = regnext(s);
1318                 s += regarglen[op];
1319                 if (next == NULL)               /* Next ptr. */
1320                         fprintf(stderr,"(0)");
1321                 else 
1322                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1323                 s += 3;
1324                 if (op == ANYOF) {
1325                         s += 32;
1326                 }
1327                 if (op == EXACTLY) {
1328                         /* Literal string, where present. */
1329                         s++;
1330                         while (*s != '\0') {
1331                                 (void)putchar(*s);
1332                                 s++;
1333                         }
1334                         s++;
1335                 }
1336                 (void)putchar('\n');
1337         }
1338
1339         /* Header fields of interest. */
1340         if (r->regstart)
1341                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1342         if (r->regstclass)
1343                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1344         if (r->reganch & ROPT_ANCH)
1345                 fprintf(stderr,"anchored ");
1346         if (r->reganch & ROPT_SKIP)
1347                 fprintf(stderr,"plus ");
1348         if (r->reganch & ROPT_IMPLICIT)
1349                 fprintf(stderr,"implicit ");
1350         if (r->regmust != NULL)
1351                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1352                   r->regback);
1353         fprintf(stderr, "minlen %d ", r->minlen);
1354         fprintf(stderr,"\n");
1355 }
1356
1357 /*
1358  - regprop - printable representation of opcode
1359  */
1360 char *
1361 regprop(op)
1362 char *op;
1363 {
1364         register char *p;
1365
1366         (void) strcpy(buf, ":");
1367
1368         switch (OP(op)) {
1369         case BOL:
1370                 p = "BOL";
1371                 break;
1372         case EOL:
1373                 p = "EOL";
1374                 break;
1375         case ANY:
1376                 p = "ANY";
1377                 break;
1378         case ANYOF:
1379                 p = "ANYOF";
1380                 break;
1381         case BRANCH:
1382                 p = "BRANCH";
1383                 break;
1384         case EXACTLY:
1385                 p = "EXACTLY";
1386                 break;
1387         case NOTHING:
1388                 p = "NOTHING";
1389                 break;
1390         case BACK:
1391                 p = "BACK";
1392                 break;
1393         case END:
1394                 p = "END";
1395                 break;
1396         case ALNUM:
1397                 p = "ALNUM";
1398                 break;
1399         case NALNUM:
1400                 p = "NALNUM";
1401                 break;
1402         case BOUND:
1403                 p = "BOUND";
1404                 break;
1405         case NBOUND:
1406                 p = "NBOUND";
1407                 break;
1408         case SPACE:
1409                 p = "SPACE";
1410                 break;
1411         case NSPACE:
1412                 p = "NSPACE";
1413                 break;
1414         case DIGIT:
1415                 p = "DIGIT";
1416                 break;
1417         case NDIGIT:
1418                 p = "NDIGIT";
1419                 break;
1420         case CURLY:
1421                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1422                     ARG1(op),ARG2(op));
1423                 p = NULL;
1424                 break;
1425         case REF:
1426                 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1427                 p = NULL;
1428                 break;
1429         case OPEN:
1430                 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1431                 p = NULL;
1432                 break;
1433         case CLOSE:
1434                 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1435                 p = NULL;
1436                 break;
1437         case STAR:
1438                 p = "STAR";
1439                 break;
1440         case PLUS:
1441                 p = "PLUS";
1442                 break;
1443         default:
1444                 FAIL("corrupted regexp opcode");
1445         }
1446         if (p != NULL)
1447                 (void) strcat(buf, p);
1448         return(buf);
1449 }
1450 #endif /* DEBUGGING */
1451
1452 void
1453 regfree(r)
1454 struct regexp *r;
1455 {
1456         if (r->precomp) {
1457                 Safefree(r->precomp);
1458                 r->precomp = Nullch;
1459         }
1460         if (r->subbase) {
1461                 Safefree(r->subbase);
1462                 r->subbase = Nullch;
1463         }
1464         if (r->regmust) {
1465                 str_free(r->regmust);
1466                 r->regmust = Nullstr;
1467         }
1468         if (r->regstart) {
1469                 str_free(r->regstart);
1470                 r->regstart = Nullstr;
1471         }
1472         Safefree(r->startp);
1473         Safefree(r->endp);
1474         Safefree(r);
1475 }