perl 4.0.00: (no release announcement available)
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $Header: regcomp.c,v 4.0 91/03/20 01:39:01 lwall Locked $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 4.0  91/03/20  01:39:01  lwall
14  * 4.0 baseline.
15  * 
16  */
17
18 /*
19  * regcomp and regexec -- regsub and regerror are not used in perl
20  *
21  *      Copyright (c) 1986 by University of Toronto.
22  *      Written by Henry Spencer.  Not derived from licensed software.
23  *
24  *      Permission is granted to anyone to use this software for any
25  *      purpose on any computer system, and to redistribute it freely,
26  *      subject to the following restrictions:
27  *
28  *      1. The author is not responsible for the consequences of use of
29  *              this software, no matter how awful, even if they arise
30  *              from defects in it.
31  *
32  *      2. The origin of this software must not be misrepresented, either
33  *              by explicit claim or by omission.
34  *
35  *      3. Altered versions must be plainly marked as such, and must not
36  *              be misrepresented as being the original software.
37  *
38  *
39  ****    Alterations to Henry's code are...
40  ****
41  ****    Copyright (c) 1989, Larry Wall
42  ****
43  ****    You may distribute under the terms of the GNU General Public License
44  ****    as specified in the README file that comes with the perl 3.0 kit.
45  *
46  * Beware that some of this code is subtly aware of the way operator
47  * precedence is structured in regular expressions.  Serious changes in
48  * regular-expression syntax might require a total rethink.
49  */
50 #include "EXTERN.h"
51 #include "perl.h"
52 #include "INTERN.h"
53 #include "regcomp.h"
54
55 #ifdef MSDOS
56 # if defined(BUGGY_MSC6)
57  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
58  # pragma optimize("a",off)
59  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
60  # pragma optimize("w",on )
61 # endif /* BUGGY_MSC6 */
62 #endif /* MSDOS */
63
64 #ifndef STATIC
65 #define STATIC  static
66 #endif
67
68 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
69 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
70         ((*s) == '{' && regcurly(s)))
71 #define META    "^$.[()|?+*\\"
72
73 /*
74  * Flags to be passed up and down.
75  */
76 #define HASWIDTH        01      /* Known never to match null string. */
77 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
78 #define SPSTART         04      /* Starts with * or +. */
79 #define WORST           0       /* Worst case. */
80
81 /*
82  * Global work variables for regcomp().
83  */
84 static char *regprecomp;                /* uncompiled string. */
85 static char *regparse;          /* Input-scan pointer. */
86 static char *regxend;           /* End of input for compile */
87 static int regnpar;             /* () count. */
88 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
89 static long regsize;            /* Code size. */
90 static int regfold;
91 static int regsawbracket;       /* Did we do {d,d} trick? */
92
93 /*
94  * Forward declarations for regcomp()'s friends.
95  */
96 STATIC int regcurly();
97 STATIC char *reg();
98 STATIC char *regbranch();
99 STATIC char *regpiece();
100 STATIC char *regatom();
101 STATIC char *regclass();
102 STATIC char *regnode();
103 STATIC char *reganode();
104 STATIC void regc();
105 STATIC void reginsert();
106 STATIC void regtail();
107 STATIC void regoptail();
108
109 /*
110  - regcomp - compile a regular expression into internal code
111  *
112  * We can't allocate space until we know how big the compiled form will be,
113  * but we can't compile it (and thus know how big it is) until we've got a
114  * place to put the code.  So we cheat:  we compile it twice, once with code
115  * generation turned off and size counting turned on, and once "for real".
116  * This also means that we don't allocate space until we are sure that the
117  * thing really will compile successfully, and we never have to move the
118  * code and thus invalidate pointers into it.  (Note that it has to be in
119  * one piece because free() must be able to free it all.) [NB: not true in perl]
120  *
121  * Beware that the optimization-preparation code in here knows about some
122  * of the structure of the compiled regexp.  [I'll say.]
123  */
124 regexp *
125 regcomp(exp,xend,fold)
126 char *exp;
127 char *xend;
128 int fold;
129 {
130         register regexp *r;
131         register char *scan;
132         register STR *longish;
133         STR *longest;
134         register int len;
135         register char *first;
136         int flags;
137         int backish;
138         int backest;
139         int curback;
140         extern char *safemalloc();
141         extern char *savestr();
142         int sawplus = 0;
143
144         if (exp == NULL)
145                 fatal("NULL regexp argument");
146
147         /* First pass: determine size, legality. */
148         regfold = fold;
149         regparse = exp;
150         regxend = xend;
151         regprecomp = nsavestr(exp,xend-exp);
152         regsawbracket = 0;
153         regnpar = 1;
154         regsize = 0L;
155         regcode = &regdummy;
156         regc(MAGIC);
157         if (reg(0, &flags) == NULL) {
158                 Safefree(regprecomp);
159                 regprecomp = Nullch;
160                 return(NULL);
161         }
162
163         /* Small enough for pointer-storage convention? */
164         if (regsize >= 32767L)          /* Probably could be 65535L. */
165                 FAIL("regexp too big");
166
167         /* Allocate space. */
168         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
169         if (r == NULL)
170                 FAIL("regexp out of space");
171
172         /* Second pass: emit code. */
173         if (regsawbracket)
174             bcopy(regprecomp,exp,xend-exp);
175         r->precomp = regprecomp;
176         r->subbase = NULL;
177         regparse = exp;
178         regnpar = 1;
179         regcode = r->program;
180         regc(MAGIC);
181         if (reg(0, &flags) == NULL)
182                 return(NULL);
183
184         /* Dig out information for optimizations. */
185         r->regstart = Nullstr;  /* Worst-case defaults. */
186         r->reganch = 0;
187         r->regmust = Nullstr;
188         r->regback = -1;
189         r->regstclass = Nullch;
190         scan = r->program+1;                    /* First BRANCH. */
191         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
192                 scan = NEXTOPER(scan);
193
194                 first = scan;
195                 while (OP(first) == OPEN ||
196                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
197                     (OP(first) == PLUS) ||
198                     (OP(first) == CURLY && ARG1(first) > 0) ) {
199                         if (OP(first) == PLUS)
200                             sawplus = 2;
201                         else
202                             first += regarglen[OP(first)];
203                         first = NEXTOPER(first);
204                 }
205
206                 /* Starting-point info. */
207                 if (OP(first) == EXACTLY) {
208                         r->regstart =
209                             str_make(OPERAND(first)+1,*OPERAND(first));
210                         if (r->regstart->str_cur > !(sawstudy|fold))
211                                 fbmcompile(r->regstart,fold);
212                 }
213                 else if ((exp = index(simple,OP(first))) && exp > simple)
214                         r->regstclass = first;
215                 else if (OP(first) == BOUND || OP(first) == NBOUND)
216                         r->regstclass = first;
217                 else if (OP(first) == BOL ||
218                     (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
219                         r->reganch = 1;         /* kinda turn .* into ^.* */
220                 r->reganch |= sawplus;
221
222 #ifdef DEBUGGING
223                 if (debug & 512)
224                     fprintf(stderr,"first %d next %d offset %d\n",
225                       OP(first), OP(NEXTOPER(first)), first - scan);
226 #endif
227                 /*
228                  * If there's something expensive in the r.e., find the
229                  * longest literal string that must appear and make it the
230                  * regmust.  Resolve ties in favor of later strings, since
231                  * the regstart check works with the beginning of the r.e.
232                  * and avoiding duplication strengthens checking.  Not a
233                  * strong reason, but sufficient in the absence of others.
234                  * [Now we resolve ties in favor of the earlier string if
235                  * it happens that curback has been invalidated, since the
236                  * earlier string may buy us something the later one won't.]
237                  */
238                 longish = str_make("",0);
239                 longest = str_make("",0);
240                 len = 0;
241                 curback = 0;
242                 backish = 0;
243                 backest = 0;
244                 while (OP(scan) != END) {
245                         if (OP(scan) == BRANCH) {
246                             if (OP(regnext(scan)) == BRANCH) {
247                                 curback = -30000;
248                                 while (OP(scan) == BRANCH)
249                                     scan = regnext(scan);
250                             }
251                             else        /* single branch is ok */
252                                 scan = NEXTOPER(scan);
253                         }
254                         if (OP(scan) == EXACTLY) {
255                             char *t;
256
257                             first = scan;
258                             while (OP(t = regnext(scan)) == CLOSE)
259                                 scan = t;
260                             if (curback - backish == len) {
261                                 str_ncat(longish, OPERAND(first)+1,
262                                     *OPERAND(first));
263                                 len += *OPERAND(first);
264                                 curback += *OPERAND(first);
265                                 first = regnext(scan);
266                             }
267                             else if (*OPERAND(first) >= len + (curback >= 0)) {
268                                 len = *OPERAND(first);
269                                 str_nset(longish, OPERAND(first)+1,len);
270                                 backish = curback;
271                                 curback += len;
272                                 first = regnext(scan);
273                             }
274                             else
275                                 curback += *OPERAND(first);
276                         }
277                         else if (index(varies,OP(scan))) {
278                             curback = -30000;
279                             len = 0;
280                             if (longish->str_cur > longest->str_cur) {
281                                 str_sset(longest,longish);
282                                 backest = backish;
283                             }
284                             str_nset(longish,"",0);
285                         }
286                         else if (index(simple,OP(scan))) {
287                             curback++;
288                             len = 0;
289                             if (longish->str_cur > longest->str_cur) {
290                                 str_sset(longest,longish);
291                                 backest = backish;
292                             }
293                             str_nset(longish,"",0);
294                         }
295                         scan = regnext(scan);
296                 }
297
298                 /* Prefer earlier on tie, unless we can tail match latter */
299
300                 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
301                     str_sset(longest,longish);
302                     backest = backish;
303                 }
304                 else
305                     str_nset(longish,"",0);
306                 if (longest->str_cur
307                     &&
308                     (!r->regstart
309                      ||
310                      !fbminstr(r->regstart->str_ptr,
311                           r->regstart->str_ptr + r->regstart->str_cur,
312                           longest)
313                     )
314                    )
315                 {
316                         r->regmust = longest;
317                         if (backest < 0)
318                                 backest = -1;
319                         r->regback = backest;
320                         if (longest->str_cur
321                           > !(sawstudy || fold || OP(first) == EOL) )
322                                 fbmcompile(r->regmust,fold);
323                         r->regmust->str_u.str_useful = 100;
324                         if (OP(first) == EOL && longish->str_cur)
325                             r->regmust->str_pok |= SP_TAIL;
326                 }
327                 else {
328                         str_free(longest);
329                         longest = Nullstr;
330                 }
331                 str_free(longish);
332         }
333
334         r->do_folding = fold;
335         r->nparens = regnpar - 1;
336         New(1002, r->startp, regnpar, char*);
337         New(1002, r->endp, regnpar, char*);
338 #ifdef DEBUGGING
339         if (debug & 512)
340                 regdump(r);
341 #endif
342         return(r);
343 }
344
345 /*
346  - reg - regular expression, i.e. main body or parenthesized thing
347  *
348  * Caller must absorb opening parenthesis.
349  *
350  * Combining parenthesis handling with the base level of regular expression
351  * is a trifle forced, but the need to tie the tails of the branches to what
352  * follows makes it hard to avoid.
353  */
354 static char *
355 reg(paren, flagp)
356 int paren;                      /* Parenthesized? */
357 int *flagp;
358 {
359         register char *ret;
360         register char *br;
361         register char *ender;
362         register int parno;
363         int flags;
364
365         *flagp = HASWIDTH;      /* Tentatively. */
366
367         /* Make an OPEN node, if parenthesized. */
368         if (paren) {
369                 parno = regnpar;
370                 regnpar++;
371                 ret = reganode(OPEN, parno);
372         } else
373                 ret = NULL;
374
375         /* Pick up the branches, linking them together. */
376         br = regbranch(&flags);
377         if (br == NULL)
378                 return(NULL);
379         if (ret != NULL)
380                 regtail(ret, br);       /* OPEN -> first. */
381         else
382                 ret = br;
383         if (!(flags&HASWIDTH))
384                 *flagp &= ~HASWIDTH;
385         *flagp |= flags&SPSTART;
386         while (*regparse == '|') {
387                 regparse++;
388                 br = regbranch(&flags);
389                 if (br == NULL)
390                         return(NULL);
391                 regtail(ret, br);       /* BRANCH -> BRANCH. */
392                 if (!(flags&HASWIDTH))
393                         *flagp &= ~HASWIDTH;
394                 *flagp |= flags&SPSTART;
395         }
396
397         /* Make a closing node, and hook it on the end. */
398         if (paren)
399             ender = reganode(CLOSE, parno);
400         else
401             ender = regnode(END);
402         regtail(ret, ender);
403
404         /* Hook the tails of the branches to the closing node. */
405         for (br = ret; br != NULL; br = regnext(br))
406                 regoptail(br, ender);
407
408         /* Check for proper termination. */
409         if (paren && *regparse++ != ')') {
410                 FAIL("unmatched () in regexp");
411         } else if (!paren && regparse < regxend) {
412                 if (*regparse == ')') {
413                         FAIL("unmatched () in regexp");
414                 } else
415                         FAIL("junk on end of regexp");  /* "Can't happen". */
416                 /* NOTREACHED */
417         }
418
419         return(ret);
420 }
421
422 /*
423  - regbranch - one alternative of an | operator
424  *
425  * Implements the concatenation operator.
426  */
427 static char *
428 regbranch(flagp)
429 int *flagp;
430 {
431         register char *ret;
432         register char *chain;
433         register char *latest;
434         int flags;
435
436         *flagp = WORST;         /* Tentatively. */
437
438         ret = regnode(BRANCH);
439         chain = NULL;
440         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
441                 latest = regpiece(&flags);
442                 if (latest == NULL)
443                         return(NULL);
444                 *flagp |= flags&HASWIDTH;
445                 if (chain == NULL)      /* First piece. */
446                         *flagp |= flags&SPSTART;
447                 else
448                         regtail(chain, latest);
449                 chain = latest;
450         }
451         if (chain == NULL)      /* Loop ran zero times. */
452                 (void) regnode(NOTHING);
453
454         return(ret);
455 }
456
457 /*
458  - regpiece - something followed by possible [*+?]
459  *
460  * Note that the branching code sequences used for ? and the general cases
461  * of * and + are somewhat optimized:  they use the same NOTHING node as
462  * both the endmarker for their branch list and the body of the last branch.
463  * It might seem that this node could be dispensed with entirely, but the
464  * endmarker role is not redundant.
465  */
466 static char *
467 regpiece(flagp)
468 int *flagp;
469 {
470         register char *ret;
471         register char op;
472         register char *next;
473         int flags;
474         char *origparse = regparse;
475         int orignpar = regnpar;
476         char *max;
477         int iter;
478         char ch;
479
480         ret = regatom(&flags);
481         if (ret == NULL)
482                 return(NULL);
483
484         op = *regparse;
485
486         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
487          * then we decrement the first number by one and reset our
488          * parsing back to the beginning of the same atom.  If the first number
489          * is down to 0, decrement the second number instead and fake up
490          * a ? after it.  Given the way this compiler doesn't keep track
491          * of offsets on the first pass, this is the only way to replicate
492          * a piece of code.  Sigh.
493          */
494         if (op == '{' && regcurly(regparse)) {
495             next = regparse + 1;
496             max = Nullch;
497             while (isdigit(*next) || *next == ',') {
498                 if (*next == ',') {
499                     if (max)
500                         break;
501                     else
502                         max = next;
503                 }
504                 next++;
505             }
506             if (*next == '}') {         /* got one */
507                 if (!max)
508                     max = next;
509                 regparse++;
510                 iter = atoi(regparse);
511                 if (flags&SIMPLE) {     /* we can do it right after all */
512                     int tmp;
513
514                     reginsert(CURLY, ret);
515                     if (iter > 0)
516                         *flagp = (WORST|HASWIDTH);
517                     if (*max == ',')
518                         max++;
519                     else
520                         max = regparse;
521                     tmp = atoi(max);
522                     if (tmp && tmp < iter)
523                         fatal("Can't do {n,m} with n > m");
524                     if (regcode != &regdummy) {
525 #ifdef REGALIGN
526                         *(unsigned short *)(ret+3) = iter;
527                         *(unsigned short *)(ret+5) = tmp;
528 #else
529                         ret[3] = iter >> 8; ret[4] = iter & 0377;
530                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
531 #endif
532                     }
533                     regparse = next;
534                     goto nest_check;
535                 }
536                 regsawbracket++;        /* remember we clobbered exp */
537                 if (iter > 0) {
538                     ch = *max;
539                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
540                     *max = ch;
541                     if (*max == ',' && max[1] != '}') {
542                         if (atoi(max+1) <= 0)
543                             fatal("Can't do {n,m} with n > m");
544                         ch = *next;
545                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
546                         *next = ch;
547                     }
548                     if (iter != 1 || *max == ',') {
549                         regparse = origparse;   /* back up input pointer */
550                         regnpar = orignpar;     /* don't make more parens */
551                     }
552                     else {
553                         regparse = next;
554                         goto nest_check;
555                     }
556                     *flagp = flags;
557                     return ret;
558                 }
559                 if (*max == ',') {
560                     max++;
561                     iter = atoi(max);
562                     if (max == next) {          /* any number more? */
563                         regparse = next;
564                         op = '*';               /* fake up one with a star */
565                     }
566                     else if (iter > 0) {
567                         op = '?';               /* fake up optional atom */
568                         ch = *next;
569                         sprintf(max,"%.*d", next-max, iter - 1);
570                         *next = ch;
571                         if (iter == 1)
572                             regparse = next;
573                         else {
574                             regparse = origparse - 1; /* offset ++ below */
575                             regnpar = orignpar;
576                         }
577                     }
578                     else
579                         fatal("Can't do {n,0}");
580                 }
581                 else
582                     fatal("Can't do {0}");
583             }
584         }
585
586         if (!ISMULT1(op)) {
587                 *flagp = flags;
588                 return(ret);
589         }
590
591         if (!(flags&HASWIDTH) && op != '?')
592                 FAIL("regexp *+ operand could be empty");
593         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
594
595         if (op == '*' && (flags&SIMPLE))
596                 reginsert(STAR, ret);
597         else if (op == '*') {
598                 /* Emit x* as (x&|), where & means "self". */
599                 reginsert(BRANCH, ret);                 /* Either x */
600                 regoptail(ret, regnode(BACK));          /* and loop */
601                 regoptail(ret, ret);                    /* back */
602                 regtail(ret, regnode(BRANCH));          /* or */
603                 regtail(ret, regnode(NOTHING));         /* null. */
604         } else if (op == '+' && (flags&SIMPLE))
605                 reginsert(PLUS, ret);
606         else if (op == '+') {
607                 /* Emit x+ as x(&|), where & means "self". */
608                 next = regnode(BRANCH);                 /* Either */
609                 regtail(ret, next);
610                 regtail(regnode(BACK), ret);            /* loop back */
611                 regtail(next, regnode(BRANCH));         /* or */
612                 regtail(ret, regnode(NOTHING));         /* null. */
613         } else if (op == '?') {
614                 /* Emit x? as (x|) */
615                 reginsert(BRANCH, ret);                 /* Either x */
616                 regtail(ret, regnode(BRANCH));          /* or */
617                 next = regnode(NOTHING);                /* null. */
618                 regtail(ret, next);
619                 regoptail(ret, next);
620         }
621       nest_check:
622         regparse++;
623         if (ISMULT2(regparse))
624                 FAIL("nested *?+ in regexp");
625
626         return(ret);
627 }
628
629 /*
630  - regatom - the lowest level
631  *
632  * Optimization:  gobbles an entire sequence of ordinary characters so that
633  * it can turn them into a single node, which is smaller to store and
634  * faster to run.  Backslashed characters are exceptions, each becoming a
635  * separate node; the code is simpler that way and it's not worth fixing.
636  *
637  * [Yes, it is worth fixing, some scripts can run twice the speed.]
638  */
639 static char *
640 regatom(flagp)
641 int *flagp;
642 {
643         register char *ret;
644         int flags;
645
646         *flagp = WORST;         /* Tentatively. */
647
648         switch (*regparse++) {
649         case '^':
650                 ret = regnode(BOL);
651                 break;
652         case '$':
653                 ret = regnode(EOL);
654                 break;
655         case '.':
656                 ret = regnode(ANY);
657                 *flagp |= HASWIDTH|SIMPLE;
658                 break;
659         case '[':
660                 ret = regclass();
661                 *flagp |= HASWIDTH|SIMPLE;
662                 break;
663         case '(':
664                 ret = reg(1, &flags);
665                 if (ret == NULL)
666                         return(NULL);
667                 *flagp |= flags&(HASWIDTH|SPSTART);
668                 break;
669         case '|':
670         case ')':
671                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
672                 break;
673         case '?':
674         case '+':
675         case '*':
676                 FAIL("?+* follows nothing in regexp");
677                 break;
678         case '\\':
679                 switch (*regparse) {
680                 case 'w':
681                         ret = regnode(ALNUM);
682                         *flagp |= HASWIDTH|SIMPLE;
683                         regparse++;
684                         break;
685                 case 'W':
686                         ret = regnode(NALNUM);
687                         *flagp |= HASWIDTH|SIMPLE;
688                         regparse++;
689                         break;
690                 case 'b':
691                         ret = regnode(BOUND);
692                         *flagp |= SIMPLE;
693                         regparse++;
694                         break;
695                 case 'B':
696                         ret = regnode(NBOUND);
697                         *flagp |= SIMPLE;
698                         regparse++;
699                         break;
700                 case 's':
701                         ret = regnode(SPACE);
702                         *flagp |= HASWIDTH|SIMPLE;
703                         regparse++;
704                         break;
705                 case 'S':
706                         ret = regnode(NSPACE);
707                         *flagp |= HASWIDTH|SIMPLE;
708                         regparse++;
709                         break;
710                 case 'd':
711                         ret = regnode(DIGIT);
712                         *flagp |= HASWIDTH|SIMPLE;
713                         regparse++;
714                         break;
715                 case 'D':
716                         ret = regnode(NDIGIT);
717                         *flagp |= HASWIDTH|SIMPLE;
718                         regparse++;
719                         break;
720                 case 'n':
721                 case 'r':
722                 case 't':
723                 case 'f':
724                 case 'e':
725                 case 'a':
726                 case 'x':
727                 case 'c':
728                 case '0':
729                         goto defchar;
730                 case '1': case '2': case '3': case '4':
731                 case '5': case '6': case '7': case '8': case '9':
732                         {
733                             int num = atoi(regparse);
734
735                             if (num > 9 && num >= regnpar)
736                                 goto defchar;
737                             else {
738                                 ret = reganode(REF, num);
739                                 while (isascii(*regparse) && isdigit(*regparse))
740                                     regparse++;
741                                 *flagp |= SIMPLE;
742                             }
743                         }
744                         break;
745                 case '\0':
746                         if (regparse >= regxend)
747                             FAIL("trailing \\ in regexp");
748                         /* FALL THROUGH */
749                 default:
750                         goto defchar;
751                 }
752                 break;
753         default: {
754                         register int len;
755                         register char ender;
756                         register char *p;
757                         char *oldp;
758                         int numlen;
759
760                     defchar:
761                         ret = regnode(EXACTLY);
762                         regc(0);                /* save spot for len */
763                         for (len=0, p=regparse-1;
764                           len < 127 && p < regxend;
765                           len++)
766                         {
767                             oldp = p;
768                             switch (*p) {
769                             case '^':
770                             case '$':
771                             case '.':
772                             case '[':
773                             case '(':
774                             case ')':
775                             case '|':
776                                 goto loopdone;
777                             case '\\':
778                                 switch (*++p) {
779                                 case 'w':
780                                 case 'W':
781                                 case 'b':
782                                 case 'B':
783                                 case 's':
784                                 case 'S':
785                                 case 'd':
786                                 case 'D':
787                                     --p;
788                                     goto loopdone;
789                                 case 'n':
790                                         ender = '\n';
791                                         p++;
792                                         break;
793                                 case 'r':
794                                         ender = '\r';
795                                         p++;
796                                         break;
797                                 case 't':
798                                         ender = '\t';
799                                         p++;
800                                         break;
801                                 case 'f':
802                                         ender = '\f';
803                                         p++;
804                                         break;
805                                 case 'e':
806                                         ender = '\033';
807                                         p++;
808                                         break;
809                                 case 'a':
810                                         ender = '\007';
811                                         p++;
812                                         break;
813                                 case 'x':
814                                     ender = scanhex(++p, 2, &numlen);
815                                     p += numlen;
816                                     break;
817                                 case 'c':
818                                     p++;
819                                     ender = *p++;
820                                     if (islower(ender))
821                                         ender = toupper(ender);
822                                     ender ^= 64;
823                                     break;
824                                 case '0': case '1': case '2': case '3':case '4':
825                                 case '5': case '6': case '7': case '8':case '9':
826                                     if (*p == '0' ||
827                                       (isdigit(p[1]) && atoi(p) >= regnpar) ) {
828                                         ender = scanoct(p, 3, &numlen);
829                                         p += numlen;
830                                     }
831                                     else {
832                                         --p;
833                                         goto loopdone;
834                                     }
835                                     break;
836                                 case '\0':
837                                     if (p >= regxend)
838                                         FAIL("trailing \\ in regexp");
839                                     /* FALL THROUGH */
840                                 default:
841                                     ender = *p++;
842                                     break;
843                                 }
844                                 break;
845                             default:
846                                 ender = *p++;
847                                 break;
848                             }
849                             if (regfold && isupper(ender))
850                                     ender = tolower(ender);
851                             if (ISMULT2(p)) { /* Back off on ?+*. */
852                                 if (len)
853                                     p = oldp;
854                                 else {
855                                     len++;
856                                     regc(ender);
857                                 }
858                                 break;
859                             }
860                             regc(ender);
861                         }
862                     loopdone:
863                         regparse = p;
864                         if (len <= 0)
865                                 FAIL("internal disaster in regexp");
866                         *flagp |= HASWIDTH;
867                         if (len == 1)
868                                 *flagp |= SIMPLE;
869                         if (regcode != &regdummy)
870                             *OPERAND(ret) = len;
871                         regc('\0');
872                 }
873                 break;
874         }
875
876         return(ret);
877 }
878
879 static void
880 regset(bits,def,c)
881 char *bits;
882 int def;
883 register int c;
884 {
885         if (regcode == &regdummy)
886             return;
887         c &= 255;
888         if (def)
889                 bits[c >> 3] &= ~(1 << (c & 7));
890         else
891                 bits[c >> 3] |=  (1 << (c & 7));
892 }
893
894 static char *
895 regclass()
896 {
897         register char *bits;
898         register int class;
899         register int lastclass;
900         register int range = 0;
901         register char *ret;
902         register int def;
903         int numlen;
904
905         ret = regnode(ANYOF);
906         if (*regparse == '^') { /* Complement of range. */
907                 regparse++;
908                 def = 0;
909         } else {
910                 def = 255;
911         }
912         bits = regcode;
913         for (class = 0; class < 32; class++)
914             regc(def);
915         if (*regparse == ']' || *regparse == '-')
916                 goto skipcond;          /* allow 1st char to be ] or - */
917         while (regparse < regxend && *regparse != ']') {
918               skipcond:
919                 class = UCHARAT(regparse++);
920                 if (class == '\\') {
921                         class = UCHARAT(regparse++);
922                         switch (class) {
923                         case 'w':
924                                 for (class = 'a'; class <= 'z'; class++)
925                                         regset(bits,def,class);
926                                 for (class = 'A'; class <= 'Z'; class++)
927                                         regset(bits,def,class);
928                                 for (class = '0'; class <= '9'; class++)
929                                         regset(bits,def,class);
930                                 regset(bits,def,'_');
931                                 lastclass = 1234;
932                                 continue;
933                         case 's':
934                                 regset(bits,def,' ');
935                                 regset(bits,def,'\t');
936                                 regset(bits,def,'\r');
937                                 regset(bits,def,'\f');
938                                 regset(bits,def,'\n');
939                                 lastclass = 1234;
940                                 continue;
941                         case 'd':
942                                 for (class = '0'; class <= '9'; class++)
943                                         regset(bits,def,class);
944                                 lastclass = 1234;
945                                 continue;
946                         case 'n':
947                                 class = '\n';
948                                 break;
949                         case 'r':
950                                 class = '\r';
951                                 break;
952                         case 't':
953                                 class = '\t';
954                                 break;
955                         case 'f':
956                                 class = '\f';
957                                 break;
958                         case 'b':
959                                 class = '\b';
960                                 break;
961                         case 'e':
962                                 class = '\033';
963                                 break;
964                         case 'a':
965                                 class = '\007';
966                                 break;
967                         case 'x':
968                                 class = scanhex(regparse, 2, &numlen);
969                                 regparse += numlen;
970                                 break;
971                         case 'c':
972                                 class = *regparse++;
973                                 if (islower(class))
974                                     class = toupper(class);
975                                 class ^= 64;
976                                 break;
977                         case '0': case '1': case '2': case '3': case '4':
978                         case '5': case '6': case '7': case '8': case '9':
979                                 class = scanoct(--regparse, 3, &numlen);
980                                 regparse += numlen;
981                                 break;
982                         }
983                 }
984                 if (range) {
985                         if (lastclass > class)
986                                 FAIL("invalid [] range in regexp");
987                         range = 0;
988                 }
989                 else {
990                         lastclass = class;
991                         if (*regparse == '-' && regparse+1 < regxend &&
992                             regparse[1] != ']') {
993                                 regparse++;
994                                 range = 1;
995                                 continue;       /* do it next time */
996                         }
997                 }
998                 for ( ; lastclass <= class; lastclass++) {
999                         regset(bits,def,lastclass);
1000                         if (regfold && isupper(lastclass))
1001                                 regset(bits,def,tolower(lastclass));
1002                 }
1003                 lastclass = class;
1004         }
1005         if (*regparse != ']')
1006                 FAIL("unmatched [] in regexp");
1007         regparse++;
1008         return ret;
1009 }
1010
1011 /*
1012  - regnode - emit a node
1013  */
1014 static char *                   /* Location. */
1015 regnode(op)
1016 char op;
1017 {
1018         register char *ret;
1019         register char *ptr;
1020
1021         ret = regcode;
1022         if (ret == &regdummy) {
1023 #ifdef REGALIGN
1024                 if (!(regsize & 1))
1025                         regsize++;
1026 #endif
1027                 regsize += 3;
1028                 return(ret);
1029         }
1030
1031 #ifdef REGALIGN
1032 #ifndef lint
1033         if (!((long)ret & 1))
1034             *ret++ = 127;
1035 #endif
1036 #endif
1037         ptr = ret;
1038         *ptr++ = op;
1039         *ptr++ = '\0';          /* Null "next" pointer. */
1040         *ptr++ = '\0';
1041         regcode = ptr;
1042
1043         return(ret);
1044 }
1045
1046 /*
1047  - reganode - emit a node with an argument
1048  */
1049 static char *                   /* Location. */
1050 reganode(op, arg)
1051 char op;
1052 unsigned short arg;
1053 {
1054         register char *ret;
1055         register char *ptr;
1056
1057         ret = regcode;
1058         if (ret == &regdummy) {
1059 #ifdef REGALIGN
1060                 if (!(regsize & 1))
1061                         regsize++;
1062 #endif
1063                 regsize += 5;
1064                 return(ret);
1065         }
1066
1067 #ifdef REGALIGN
1068 #ifndef lint
1069         if (!((long)ret & 1))
1070             *ret++ = 127;
1071 #endif
1072 #endif
1073         ptr = ret;
1074         *ptr++ = op;
1075         *ptr++ = '\0';          /* Null "next" pointer. */
1076         *ptr++ = '\0';
1077 #ifdef REGALIGN
1078         *(unsigned short *)(ret+3) = arg;
1079 #else
1080         ret[3] = arg >> 8; ret[4] = arg & 0377;
1081 #endif
1082         ptr += 2;
1083         regcode = ptr;
1084
1085         return(ret);
1086 }
1087
1088 /*
1089  - regc - emit (if appropriate) a byte of code
1090  */
1091 static void
1092 regc(b)
1093 char b;
1094 {
1095         if (regcode != &regdummy)
1096                 *regcode++ = b;
1097         else
1098                 regsize++;
1099 }
1100
1101 /*
1102  - reginsert - insert an operator in front of already-emitted operand
1103  *
1104  * Means relocating the operand.
1105  */
1106 static void
1107 reginsert(op, opnd)
1108 char op;
1109 char *opnd;
1110 {
1111         register char *src;
1112         register char *dst;
1113         register char *place;
1114         register offset = (op == CURLY ? 4 : 0);
1115
1116         if (regcode == &regdummy) {
1117 #ifdef REGALIGN
1118                 regsize += 4 + offset;
1119 #else
1120                 regsize += 3 + offset;
1121 #endif
1122                 return;
1123         }
1124
1125         src = regcode;
1126 #ifdef REGALIGN
1127         regcode += 4 + offset;
1128 #else
1129         regcode += 3 + offset;
1130 #endif
1131         dst = regcode;
1132         while (src > opnd)
1133                 *--dst = *--src;
1134
1135         place = opnd;           /* Op node, where operand used to be. */
1136         *place++ = op;
1137         *place++ = '\0';
1138         *place++ = '\0';
1139         while (offset-- > 0)
1140             *place++ = '\0';
1141 }
1142
1143 /*
1144  - regtail - set the next-pointer at the end of a node chain
1145  */
1146 static void
1147 regtail(p, val)
1148 char *p;
1149 char *val;
1150 {
1151         register char *scan;
1152         register char *temp;
1153         register int offset;
1154
1155         if (p == &regdummy)
1156                 return;
1157
1158         /* Find last node. */
1159         scan = p;
1160         for (;;) {
1161                 temp = regnext(scan);
1162                 if (temp == NULL)
1163                         break;
1164                 scan = temp;
1165         }
1166
1167 #ifdef REGALIGN
1168         offset = val - scan;
1169 #ifndef lint
1170         *(short*)(scan+1) = offset;
1171 #else
1172         offset = offset;
1173 #endif
1174 #else
1175         if (OP(scan) == BACK)
1176                 offset = scan - val;
1177         else
1178                 offset = val - scan;
1179         *(scan+1) = (offset>>8)&0377;
1180         *(scan+2) = offset&0377;
1181 #endif
1182 }
1183
1184 /*
1185  - regoptail - regtail on operand of first argument; nop if operandless
1186  */
1187 static void
1188 regoptail(p, val)
1189 char *p;
1190 char *val;
1191 {
1192         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1193         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1194                 return;
1195         regtail(NEXTOPER(p), val);
1196 }
1197
1198 /*
1199  - regcurly - a little FSA that accepts {\d+,?\d*}
1200  */
1201 STATIC int
1202 regcurly(s)
1203 register char *s;
1204 {
1205     if (*s++ != '{')
1206         return FALSE;
1207     if (!isdigit(*s))
1208         return FALSE;
1209     while (isdigit(*s))
1210         s++;
1211     if (*s == ',')
1212         s++;
1213     while (isdigit(*s))
1214         s++;
1215     if (*s != '}')
1216         return FALSE;
1217     return TRUE;
1218 }
1219
1220 #ifdef DEBUGGING
1221
1222 /*
1223  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1224  */
1225 void
1226 regdump(r)
1227 regexp *r;
1228 {
1229         register char *s;
1230         register char op = EXACTLY;     /* Arbitrary non-END op. */
1231         register char *next;
1232
1233
1234         s = r->program + 1;
1235         while (op != END) {     /* While that wasn't END last time... */
1236 #ifdef REGALIGN
1237                 if (!((long)s & 1))
1238                         s++;
1239 #endif
1240                 op = OP(s);
1241                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1242                 next = regnext(s);
1243                 s += regarglen[op];
1244                 if (next == NULL)               /* Next ptr. */
1245                         fprintf(stderr,"(0)");
1246                 else 
1247                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1248                 s += 3;
1249                 if (op == ANYOF) {
1250                         s += 32;
1251                 }
1252                 if (op == EXACTLY) {
1253                         /* Literal string, where present. */
1254                         s++;
1255                         while (*s != '\0') {
1256                                 (void)putchar(*s);
1257                                 s++;
1258                         }
1259                         s++;
1260                 }
1261                 (void)putchar('\n');
1262         }
1263
1264         /* Header fields of interest. */
1265         if (r->regstart)
1266                 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1267         if (r->regstclass)
1268                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1269         if (r->reganch & 1)
1270                 fprintf(stderr,"anchored ");
1271         if (r->reganch & 2)
1272                 fprintf(stderr,"plus ");
1273         if (r->regmust != NULL)
1274                 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1275                   r->regback);
1276         fprintf(stderr,"\n");
1277 }
1278
1279 /*
1280  - regprop - printable representation of opcode
1281  */
1282 char *
1283 regprop(op)
1284 char *op;
1285 {
1286         register char *p;
1287
1288         (void) strcpy(buf, ":");
1289
1290         switch (OP(op)) {
1291         case BOL:
1292                 p = "BOL";
1293                 break;
1294         case EOL:
1295                 p = "EOL";
1296                 break;
1297         case ANY:
1298                 p = "ANY";
1299                 break;
1300         case ANYOF:
1301                 p = "ANYOF";
1302                 break;
1303         case BRANCH:
1304                 p = "BRANCH";
1305                 break;
1306         case EXACTLY:
1307                 p = "EXACTLY";
1308                 break;
1309         case NOTHING:
1310                 p = "NOTHING";
1311                 break;
1312         case BACK:
1313                 p = "BACK";
1314                 break;
1315         case END:
1316                 p = "END";
1317                 break;
1318         case ALNUM:
1319                 p = "ALNUM";
1320                 break;
1321         case NALNUM:
1322                 p = "NALNUM";
1323                 break;
1324         case BOUND:
1325                 p = "BOUND";
1326                 break;
1327         case NBOUND:
1328                 p = "NBOUND";
1329                 break;
1330         case SPACE:
1331                 p = "SPACE";
1332                 break;
1333         case NSPACE:
1334                 p = "NSPACE";
1335                 break;
1336         case DIGIT:
1337                 p = "DIGIT";
1338                 break;
1339         case NDIGIT:
1340                 p = "NDIGIT";
1341                 break;
1342         case CURLY:
1343                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1344                     ARG1(op),ARG2(op));
1345                 p = NULL;
1346                 break;
1347         case REF:
1348                 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1349                 p = NULL;
1350                 break;
1351         case OPEN:
1352                 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1353                 p = NULL;
1354                 break;
1355         case CLOSE:
1356                 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1357                 p = NULL;
1358                 break;
1359         case STAR:
1360                 p = "STAR";
1361                 break;
1362         case PLUS:
1363                 p = "PLUS";
1364                 break;
1365         default:
1366                 FAIL("corrupted regexp opcode");
1367         }
1368         if (p != NULL)
1369                 (void) strcat(buf, p);
1370         return(buf);
1371 }
1372 #endif /* DEBUGGING */
1373
1374 regfree(r)
1375 struct regexp *r;
1376 {
1377         if (r->precomp) {
1378                 Safefree(r->precomp);
1379                 r->precomp = Nullch;
1380         }
1381         if (r->subbase) {
1382                 Safefree(r->subbase);
1383                 r->subbase = Nullch;
1384         }
1385         if (r->regmust) {
1386                 str_free(r->regmust);
1387                 r->regmust = Nullstr;
1388         }
1389         if (r->regstart) {
1390                 str_free(r->regstart);
1391                 r->regstart = Nullstr;
1392         }
1393         Safefree(r->startp);
1394         Safefree(r->endp);
1395         Safefree(r);
1396 }