perl 5.0 alpha 3
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2  * confused with the original package (see point 3 below).  Thanks, Henry!
3  */
4
5 /* Additional note: this code is very heavily munged from Henry's version
6  * in places.  In some spots I've traded clarity for efficiency, so don't
7  * blame Henry for some of the lack of readability.
8  */
9
10 /* $RCSfile: regcomp.c,v $$Revision: 4.1 $$Date: 92/08/07 18:26:28 $
11  *
12  * $Log:        regcomp.c,v $
13  * Revision 4.1  92/08/07  18:26:28  lwall
14  * 
15  * Revision 4.0.1.5  92/06/08  15:23:36  lwall
16  * patch20: Perl now distinguishes overlapped copies from non-overlapped
17  * patch20: /^stuff/ wrongly assumed an implicit $* == 1
18  * patch20: /x{0}/ was wrongly interpreted as /x{0,}/
19  * patch20: added \W, \S and \D inside /[...]/
20  * 
21  * Revision 4.0.1.4  91/11/05  22:55:14  lwall
22  * patch11: Erratum
23  * 
24  * Revision 4.0.1.3  91/11/05  18:22:28  lwall
25  * patch11: minimum match length calculation in regexp is now cumulative
26  * patch11: initial .* in pattern had dependency on value of $*
27  * patch11: certain patterns made use of garbage pointers from uncleared memory
28  * patch11: prepared for ctype implementations that don't define isascii()
29  * 
30  * Revision 4.0.1.2  91/06/07  11:48:24  lwall
31  * patch4: new copyright notice
32  * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
33  * patch4: // wouldn't use previous pattern if it started with a null character
34  * 
35  * Revision 4.0.1.1  91/04/12  09:04:45  lwall
36  * patch1: random cleanup in cpp namespace
37  * 
38  * Revision 4.0  91/03/20  01:39:01  lwall
39  * 4.0 baseline.
40  * 
41  */
42 /*SUPPRESS 112*/
43 /*
44  * regcomp and regexec -- regsub and regerror are not used in perl
45  *
46  *      Copyright (c) 1986 by University of Toronto.
47  *      Written by Henry Spencer.  Not derived from licensed software.
48  *
49  *      Permission is granted to anyone to use this software for any
50  *      purpose on any computer system, and to redistribute it freely,
51  *      subject to the following restrictions:
52  *
53  *      1. The author is not responsible for the consequences of use of
54  *              this software, no matter how awful, even if they arise
55  *              from defects in it.
56  *
57  *      2. The origin of this software must not be misrepresented, either
58  *              by explicit claim or by omission.
59  *
60  *      3. Altered versions must be plainly marked as such, and must not
61  *              be misrepresented as being the original software.
62  *
63  *
64  ****    Alterations to Henry's code are...
65  ****
66  ****    Copyright (c) 1991, Larry Wall
67  ****
68  ****    You may distribute under the terms of either the GNU General Public
69  ****    License or the Artistic License, as specified in the README file.
70
71  *
72  * Beware that some of this code is subtly aware of the way operator
73  * precedence is structured in regular expressions.  Serious changes in
74  * regular-expression syntax might require a total rethink.
75  */
76 #include "EXTERN.h"
77 #include "perl.h"
78 #include "INTERN.h"
79 #include "regcomp.h"
80
81 #ifdef MSDOS
82 # if defined(BUGGY_MSC6)
83  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
84  # pragma optimize("a",off)
85  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
86  # pragma optimize("w",on )
87 # endif /* BUGGY_MSC6 */
88 #endif /* MSDOS */
89
90 #ifndef STATIC
91 #define STATIC  static
92 #endif
93
94 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
95 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
96         ((*s) == '{' && regcurly(s)))
97 #ifdef atarist
98 #define PERL_META       "^$.[()|?+*\\"
99 #else
100 #define META    "^$.[()|?+*\\"
101 #endif
102
103 #ifdef SPSTART
104 #undef SPSTART          /* dratted cpp namespace... */
105 #endif
106 /*
107  * Flags to be passed up and down.
108  */
109 #define HASWIDTH        01      /* Known never to match null string. */
110 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
111 #define SPSTART         04      /* Starts with * or +. */
112 #define WORST           0       /* Worst case. */
113
114 /*
115  * Forward declarations for regcomp()'s friends.
116  */
117 STATIC I32 regcurly();
118 STATIC char *reg();
119 STATIC char *regbranch();
120 STATIC char *regpiece();
121 STATIC char *regatom();
122 STATIC char *regclass();
123 STATIC char *regnode();
124 STATIC char *reganode();
125 STATIC void regc();
126 STATIC void reginsert();
127 STATIC void regtail();
128 STATIC void regoptail();
129
130 /*
131  - regcomp - compile a regular expression into internal code
132  *
133  * We can't allocate space until we know how big the compiled form will be,
134  * but we can't compile it (and thus know how big it is) until we've got a
135  * place to put the code.  So we cheat:  we compile it twice, once with code
136  * generation turned off and size counting turned on, and once "for real".
137  * This also means that we don't allocate space until we are sure that the
138  * thing really will compile successfully, and we never have to move the
139  * code and thus invalidate pointers into it.  (Note that it has to be in
140  * one piece because free() must be able to free it all.) [NB: not true in perl]
141  *
142  * Beware that the optimization-preparation code in here knows about some
143  * of the structure of the compiled regexp.  [I'll say.]
144  */
145 regexp *
146 regcomp(exp,xend,fold)
147 char *exp;
148 char *xend;
149 I32 fold;
150 {
151         register regexp *r;
152         register char *scan;
153         register SV *longish;
154         SV *longest;
155         register I32 len;
156         register char *first;
157         I32 flags;
158         I32 backish;
159         I32 backest;
160         I32 curback;
161         I32 minlen;
162         I32 sawplus = 0;
163         I32 sawopen = 0;
164
165         if (exp == NULL)
166                 fatal("NULL regexp argument");
167
168         /* First pass: determine size, legality. */
169         regfold = fold;
170         regparse = exp;
171         regxend = xend;
172         regprecomp = nsavestr(exp,xend-exp);
173         regsawbracket = 0;
174         regsawback = 0;
175         regnpar = 1;
176         regsize = 0L;
177         regcode = &regdummy;
178         regc((char)MAGIC);
179         if (reg(0, &flags) == NULL) {
180                 Safefree(regprecomp);
181                 regprecomp = Nullch;
182                 return(NULL);
183         }
184
185         /* Small enough for pointer-storage convention? */
186         if (regsize >= 32767L)          /* Probably could be 65535L. */
187                 FAIL("regexp too big");
188
189         /* Allocate space. */
190         Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
191         if (r == NULL)
192                 FAIL("regexp out of space");
193
194         /* Second pass: emit code. */
195         if (regsawbracket)
196             Copy(regprecomp,exp,xend-exp,char);
197         r->prelen = xend-exp;
198         r->precomp = regprecomp;
199         r->subbeg = r->subbase = NULL;
200         regparse = exp;
201         regnpar = 1;
202         regcode = r->program;
203         regc((char)MAGIC);
204         if (reg(0, &flags) == NULL)
205                 return(NULL);
206
207         /* Dig out information for optimizations. */
208         r->regstart = Nullsv;   /* Worst-case defaults. */
209         r->reganch = 0;
210         r->regmust = Nullsv;
211         r->regback = -1;
212         r->regstclass = Nullch;
213         scan = r->program+1;                    /* First BRANCH. */
214         if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
215                 scan = NEXTOPER(scan);
216
217                 first = scan;
218                 while ((OP(first) == OPEN && (sawopen = 1)) ||
219                     (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
220                     (OP(first) == PLUS) ||
221                     (OP(first) == CURLY && ARG1(first) > 0) ) {
222                         if (OP(first) == PLUS)
223                             sawplus = 1;
224                         else
225                             first += regarglen[OP(first)];
226                         first = NEXTOPER(first);
227                 }
228
229                 /* Starting-point info. */
230             again:
231                 if (OP(first) == EXACTLY) {
232                         r->regstart =
233                             newSVpv(OPERAND(first)+1,*OPERAND(first));
234                         if (SvCUR(r->regstart) > !(sawstudy|fold))
235                                 fbm_compile(r->regstart,fold);
236                         else
237                                 sv_upgrade(r->regstart, SVt_PVBM);
238                 }
239                 else if ((exp = strchr(simple,OP(first))) && exp > simple)
240                         r->regstclass = first;
241                 else if (OP(first) == BOUND || OP(first) == NBOUND)
242                         r->regstclass = first;
243                 else if (OP(first) == BOL) {
244                         r->reganch = ROPT_ANCH;
245                         first = NEXTOPER(first);
246                         goto again;
247                 }
248                 else if ((OP(first) == STAR && OP(NEXTOPER(first)) == ANY) &&
249                          !(r->reganch & ROPT_ANCH) ) {
250                         /* turn .* into ^.* with an implied $*=1 */
251                         r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
252                         first = NEXTOPER(first);
253                         goto again;
254                 }
255                 if (sawplus && (!sawopen || !regsawback))
256                     r->reganch |= ROPT_SKIP;    /* x+ must match 1st of run */
257
258                 DEBUG_r(fprintf(stderr,"first %d next %d offset %d\n",
259                       OP(first), OP(NEXTOPER(first)), first - scan));
260                 /*
261                  * If there's something expensive in the r.e., find the
262                  * longest literal string that must appear and make it the
263                  * regmust.  Resolve ties in favor of later strings, since
264                  * the regstart check works with the beginning of the r.e.
265                  * and avoiding duplication strengthens checking.  Not a
266                  * strong reason, but sufficient in the absence of others.
267                  * [Now we resolve ties in favor of the earlier string if
268                  * it happens that curback has been invalidated, since the
269                  * earlier string may buy us something the later one won't.]
270                  */
271                 longish = newSVpv("",0);
272                 longest = newSVpv("",0);
273                 len = 0;
274                 minlen = 0;
275                 curback = 0;
276                 backish = 0;
277                 backest = 0;
278                 while (OP(scan) != END) {
279                         if (OP(scan) == BRANCH) {
280                             if (OP(regnext(scan)) == BRANCH) {
281                                 curback = -30000;
282                                 while (OP(scan) == BRANCH)
283                                     scan = regnext(scan);
284                             }
285                             else        /* single branch is ok */
286                                 scan = NEXTOPER(scan);
287                         }
288                         if (OP(scan) == EXACTLY) {
289                             char *t;
290
291                             first = scan;
292                             while (OP(t = regnext(scan)) == CLOSE)
293                                 scan = t;
294                             minlen += *OPERAND(first);
295                             if (curback - backish == len) {
296                                 sv_catpvn(longish, OPERAND(first)+1,
297                                     *OPERAND(first));
298                                 len += *OPERAND(first);
299                                 curback += *OPERAND(first);
300                                 first = regnext(scan);
301                             }
302                             else if (*OPERAND(first) >= len + (curback >= 0)) {
303                                 len = *OPERAND(first);
304                                 sv_setpvn(longish, OPERAND(first)+1,len);
305                                 backish = curback;
306                                 curback += len;
307                                 first = regnext(scan);
308                             }
309                             else
310                                 curback += *OPERAND(first);
311                         }
312                         else if (strchr(varies,OP(scan))) {
313                             curback = -30000;
314                             len = 0;
315                             if (SvCUR(longish) > SvCUR(longest)) {
316                                 sv_setsv(longest,longish);
317                                 backest = backish;
318                             }
319                             sv_setpvn(longish,"",0);
320                             if (OP(scan) == PLUS &&
321                               strchr(simple,OP(NEXTOPER(scan))))
322                                 minlen++;
323                             else if (OP(scan) == CURLY &&
324                               strchr(simple,OP(NEXTOPER(scan)+4)))
325                                 minlen += ARG1(scan);
326                         }
327                         else if (strchr(simple,OP(scan))) {
328                             curback++;
329                             minlen++;
330                             len = 0;
331                             if (SvCUR(longish) > SvCUR(longest)) {
332                                 sv_setsv(longest,longish);
333                                 backest = backish;
334                             }
335                             sv_setpvn(longish,"",0);
336                         }
337                         scan = regnext(scan);
338                 }
339
340                 /* Prefer earlier on tie, unless we can tail match latter */
341
342                 if (SvCUR(longish) + (OP(first) == EOL) > SvCUR(longest)) {
343                     sv_setsv(longest,longish);
344                     backest = backish;
345                 }
346                 else
347                     sv_setpvn(longish,"",0);
348                 if (SvCUR(longest)
349                     &&
350                     (!r->regstart
351                      ||
352                      !fbm_instr((unsigned char*) SvPV(r->regstart),
353                           (unsigned char *) SvPV(r->regstart)
354                             + SvCUR(r->regstart),
355                           longest)
356                     )
357                    )
358                 {
359                         r->regmust = longest;
360                         if (backest < 0)
361                                 backest = -1;
362                         r->regback = backest;
363                         if (SvCUR(longest)
364                           > !(sawstudy || fold || OP(first) == EOL) )
365                                 fbm_compile(r->regmust,fold);
366                         SvUPGRADE(r->regmust, SVt_PVBM);
367                         BmUSEFUL(r->regmust) = 100;
368                         if (OP(first) == EOL && SvCUR(longish))
369                             SvTAIL_on(r->regmust);
370                 }
371                 else {
372                         sv_free(longest);
373                         longest = Nullsv;
374                 }
375                 sv_free(longish);
376         }
377
378         r->do_folding = fold;
379         r->nparens = regnpar - 1;
380         r->minlen = minlen;
381         Newz(1002, r->startp, regnpar, char*);
382         Newz(1002, r->endp, regnpar, char*);
383         DEBUG_r(regdump(r));
384         return(r);
385 }
386
387 /*
388  - reg - regular expression, i.e. main body or parenthesized thing
389  *
390  * Caller must absorb opening parenthesis.
391  *
392  * Combining parenthesis handling with the base level of regular expression
393  * is a trifle forced, but the need to tie the tails of the branches to what
394  * follows makes it hard to avoid.
395  */
396 static char *
397 reg(paren, flagp)
398 I32 paren;                      /* Parenthesized? */
399 I32 *flagp;
400 {
401         register char *ret;
402         register char *br;
403         register char *ender;
404         register I32 parno;
405         I32 flags;
406
407         *flagp = HASWIDTH;      /* Tentatively. */
408
409         /* Make an OPEN node, if parenthesized. */
410         if (paren) {
411                 parno = regnpar;
412                 regnpar++;
413                 ret = reganode(OPEN, parno);
414         } else
415                 ret = NULL;
416
417         /* Pick up the branches, linking them together. */
418         br = regbranch(&flags);
419         if (br == NULL)
420                 return(NULL);
421         if (ret != NULL)
422                 regtail(ret, br);       /* OPEN -> first. */
423         else
424                 ret = br;
425         if (!(flags&HASWIDTH))
426                 *flagp &= ~HASWIDTH;
427         *flagp |= flags&SPSTART;
428         while (*regparse == '|') {
429                 regparse++;
430                 br = regbranch(&flags);
431                 if (br == NULL)
432                         return(NULL);
433                 regtail(ret, br);       /* BRANCH -> BRANCH. */
434                 if (!(flags&HASWIDTH))
435                         *flagp &= ~HASWIDTH;
436                 *flagp |= flags&SPSTART;
437         }
438
439         /* Make a closing node, and hook it on the end. */
440         if (paren)
441             ender = reganode(CLOSE, parno);
442         else
443             ender = regnode(END);
444         regtail(ret, ender);
445
446         /* Hook the tails of the branches to the closing node. */
447         for (br = ret; br != NULL; br = regnext(br))
448                 regoptail(br, ender);
449
450         /* Check for proper termination. */
451         if (paren && *regparse++ != ')') {
452                 FAIL("unmatched () in regexp");
453         } else if (!paren && regparse < regxend) {
454                 if (*regparse == ')') {
455                         FAIL("unmatched () in regexp");
456                 } else
457                         FAIL("junk on end of regexp");  /* "Can't happen". */
458                 /* NOTREACHED */
459         }
460
461         return(ret);
462 }
463
464 /*
465  - regbranch - one alternative of an | operator
466  *
467  * Implements the concatenation operator.
468  */
469 static char *
470 regbranch(flagp)
471 I32 *flagp;
472 {
473         register char *ret;
474         register char *chain;
475         register char *latest;
476         I32 flags;
477
478         *flagp = WORST;         /* Tentatively. */
479
480         ret = regnode(BRANCH);
481         chain = NULL;
482         while (regparse < regxend && *regparse != '|' && *regparse != ')') {
483                 latest = regpiece(&flags);
484                 if (latest == NULL)
485                         return(NULL);
486                 *flagp |= flags&HASWIDTH;
487                 if (chain == NULL)      /* First piece. */
488                         *flagp |= flags&SPSTART;
489                 else
490                         regtail(chain, latest);
491                 chain = latest;
492         }
493         if (chain == NULL)      /* Loop ran zero times. */
494                 (void) regnode(NOTHING);
495
496         return(ret);
497 }
498
499 /*
500  - regpiece - something followed by possible [*+?]
501  *
502  * Note that the branching code sequences used for ? and the general cases
503  * of * and + are somewhat optimized:  they use the same NOTHING node as
504  * both the endmarker for their branch list and the body of the last branch.
505  * It might seem that this node could be dispensed with entirely, but the
506  * endmarker role is not redundant.
507  */
508 static char *
509 regpiece(flagp)
510 I32 *flagp;
511 {
512         register char *ret;
513         register char op;
514         register char *next;
515         I32 flags;
516         char *origparse = regparse;
517         I32 orignpar = regnpar;
518         char *max;
519         I32 iter;
520         char ch;
521
522         ret = regatom(&flags);
523         if (ret == NULL)
524                 return(NULL);
525
526         op = *regparse;
527
528         /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
529          * then we decrement the first number by one and reset our
530          * parsing back to the beginning of the same atom.  If the first number
531          * is down to 0, decrement the second number instead and fake up
532          * a ? after it.  Given the way this compiler doesn't keep track
533          * of offsets on the first pass, this is the only way to replicate
534          * a piece of code.  Sigh.
535          */
536         if (op == '{' && regcurly(regparse)) {
537             next = regparse + 1;
538             max = Nullch;
539             while (isDIGIT(*next) || *next == ',') {
540                 if (*next == ',') {
541                     if (max)
542                         break;
543                     else
544                         max = next;
545                 }
546                 next++;
547             }
548             if (*next == '}') {         /* got one */
549                 if (!max)
550                     max = next;
551                 regparse++;
552                 iter = atoi(regparse);
553                 if (flags&SIMPLE) {     /* we can do it right after all */
554                     I32 tmp;
555
556                     reginsert(CURLY, ret);
557                     if (iter > 0)
558                         *flagp = (WORST|HASWIDTH);
559                     if (*max == ',')
560                         max++;
561                     else
562                         max = regparse;
563                     tmp = atoi(max);
564                     if (!tmp && *max != '0')
565                         tmp = 32767;            /* meaning "infinity" */
566                     if (tmp && tmp < iter)
567                         fatal("Can't do {n,m} with n > m");
568                     if (regcode != &regdummy) {
569 #ifdef REGALIGN
570                         *(unsigned short *)(ret+3) = iter;
571                         *(unsigned short *)(ret+5) = tmp;
572 #else
573                         ret[3] = iter >> 8; ret[4] = iter & 0377;
574                         ret[5] = tmp  >> 8; ret[6] = tmp  & 0377;
575 #endif
576                     }
577                     regparse = next;
578                     goto nest_check;
579                 }
580                 regsawbracket++;        /* remember we clobbered exp */
581                 if (iter > 0) {
582                     ch = *max;
583                     sprintf(regparse,"%.*d", max-regparse, iter - 1);
584                     *max = ch;
585                     if (*max == ',' && max[1] != '}') {
586                         if (atoi(max+1) <= 0)
587                             fatal("Can't do {n,m} with n > m");
588                         ch = *next;
589                         sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
590                         *next = ch;
591                     }
592                     if (iter != 1 || *max == ',') {
593                         regparse = origparse;   /* back up input pointer */
594                         regnpar = orignpar;     /* don't make more parens */
595                     }
596                     else {
597                         regparse = next;
598                         goto nest_check;
599                     }
600                     *flagp = flags;
601                     return ret;
602                 }
603                 if (*max == ',') {
604                     max++;
605                     iter = atoi(max);
606                     if (max == next) {          /* any number more? */
607                         regparse = next;
608                         op = '*';               /* fake up one with a star */
609                     }
610                     else if (iter > 0) {
611                         op = '?';               /* fake up optional atom */
612                         ch = *next;
613                         sprintf(max,"%.*d", next-max, iter - 1);
614                         *next = ch;
615                         if (iter == 1)
616                             regparse = next;
617                         else {
618                             regparse = origparse - 1; /* offset ++ below */
619                             regnpar = orignpar;
620                         }
621                     }
622                     else
623                         fatal("Can't do {n,0}");
624                 }
625                 else
626                     fatal("Can't do {0}");
627             }
628         }
629
630         if (!ISMULT1(op)) {
631                 *flagp = flags;
632                 return(ret);
633         }
634
635         if (!(flags&HASWIDTH) && op != '?')
636                 FAIL("regexp *+ operand could be empty");
637         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
638
639         if (op == '*' && (flags&SIMPLE))
640                 reginsert(STAR, ret);
641         else if (op == '*') {
642                 /* Emit x* as (x&|), where & means "self". */
643                 reginsert(BRANCH, ret);                 /* Either x */
644                 regoptail(ret, regnode(BACK));          /* and loop */
645                 regoptail(ret, ret);                    /* back */
646                 regtail(ret, regnode(BRANCH));          /* or */
647                 regtail(ret, regnode(NOTHING));         /* null. */
648         } else if (op == '+' && (flags&SIMPLE))
649                 reginsert(PLUS, ret);
650         else if (op == '+') {
651                 /* Emit x+ as x(&|), where & means "self". */
652                 next = regnode(BRANCH);                 /* Either */
653                 regtail(ret, next);
654                 regtail(regnode(BACK), ret);            /* loop back */
655                 regtail(next, regnode(BRANCH));         /* or */
656                 regtail(ret, regnode(NOTHING));         /* null. */
657         } else if (op == '?') {
658                 /* Emit x? as (x|) */
659                 reginsert(BRANCH, ret);                 /* Either x */
660                 regtail(ret, regnode(BRANCH));          /* or */
661                 next = regnode(NOTHING);                /* null. */
662                 regtail(ret, next);
663                 regoptail(ret, next);
664         }
665       nest_check:
666         regparse++;
667         if (ISMULT2(regparse))
668                 FAIL("nested *?+ in regexp");
669
670         return(ret);
671 }
672
673 /*
674  - regatom - the lowest level
675  *
676  * Optimization:  gobbles an entire sequence of ordinary characters so that
677  * it can turn them into a single node, which is smaller to store and
678  * faster to run.  Backslashed characters are exceptions, each becoming a
679  * separate node; the code is simpler that way and it's not worth fixing.
680  *
681  * [Yes, it is worth fixing, some scripts can run twice the speed.]
682  */
683 static char *
684 regatom(flagp)
685 I32 *flagp;
686 {
687         register char *ret;
688         I32 flags;
689
690         *flagp = WORST;         /* Tentatively. */
691
692         switch (*regparse++) {
693         case '^':
694                 ret = regnode(BOL);
695                 break;
696         case '$':
697                 ret = regnode(EOL);
698                 break;
699         case '.':
700                 ret = regnode(ANY);
701                 *flagp |= HASWIDTH|SIMPLE;
702                 break;
703         case '[':
704                 ret = regclass();
705                 *flagp |= HASWIDTH|SIMPLE;
706                 break;
707         case '(':
708                 ret = reg(1, &flags);
709                 if (ret == NULL)
710                         return(NULL);
711                 *flagp |= flags&(HASWIDTH|SPSTART);
712                 break;
713         case '|':
714         case ')':
715                 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
716                 break;
717         case '?':
718         case '+':
719         case '*':
720                 FAIL("?+* follows nothing in regexp");
721                 break;
722         case '\\':
723                 switch (*regparse) {
724                 case 'w':
725                         ret = regnode(ALNUM);
726                         *flagp |= HASWIDTH|SIMPLE;
727                         regparse++;
728                         break;
729                 case 'W':
730                         ret = regnode(NALNUM);
731                         *flagp |= HASWIDTH|SIMPLE;
732                         regparse++;
733                         break;
734                 case 'b':
735                         ret = regnode(BOUND);
736                         *flagp |= SIMPLE;
737                         regparse++;
738                         break;
739                 case 'B':
740                         ret = regnode(NBOUND);
741                         *flagp |= SIMPLE;
742                         regparse++;
743                         break;
744                 case 's':
745                         ret = regnode(SPACE);
746                         *flagp |= HASWIDTH|SIMPLE;
747                         regparse++;
748                         break;
749                 case 'S':
750                         ret = regnode(NSPACE);
751                         *flagp |= HASWIDTH|SIMPLE;
752                         regparse++;
753                         break;
754                 case 'd':
755                         ret = regnode(DIGIT);
756                         *flagp |= HASWIDTH|SIMPLE;
757                         regparse++;
758                         break;
759                 case 'D':
760                         ret = regnode(NDIGIT);
761                         *flagp |= HASWIDTH|SIMPLE;
762                         regparse++;
763                         break;
764                 case 'n':
765                 case 'r':
766                 case 't':
767                 case 'f':
768                 case 'e':
769                 case 'a':
770                 case 'x':
771                 case 'c':
772                 case '0':
773                         goto defchar;
774                 case '1': case '2': case '3': case '4':
775                 case '5': case '6': case '7': case '8': case '9':
776                         {
777                             I32 num = atoi(regparse);
778
779                             if (num > 9 && num >= regnpar)
780                                 goto defchar;
781                             else {
782                                 regsawback = 1;
783                                 ret = reganode(REF, num);
784                                 while (isDIGIT(*regparse))
785                                     regparse++;
786                                 *flagp |= SIMPLE;
787                             }
788                         }
789                         break;
790                 case '\0':
791                         if (regparse >= regxend)
792                             FAIL("trailing \\ in regexp");
793                         /* FALL THROUGH */
794                 default:
795                         goto defchar;
796                 }
797                 break;
798         default: {
799                         register I32 len;
800                         register char ender;
801                         register char *p;
802                         char *oldp;
803                         I32 numlen;
804
805                     defchar:
806                         ret = regnode(EXACTLY);
807                         regc(0);                /* save spot for len */
808                         for (len=0, p=regparse-1;
809                           len < 127 && p < regxend;
810                           len++)
811                         {
812                             oldp = p;
813                             switch (*p) {
814                             case '^':
815                             case '$':
816                             case '.':
817                             case '[':
818                             case '(':
819                             case ')':
820                             case '|':
821                                 goto loopdone;
822                             case '\\':
823                                 switch (*++p) {
824                                 case 'w':
825                                 case 'W':
826                                 case 'b':
827                                 case 'B':
828                                 case 's':
829                                 case 'S':
830                                 case 'd':
831                                 case 'D':
832                                     --p;
833                                     goto loopdone;
834                                 case 'n':
835                                         ender = '\n';
836                                         p++;
837                                         break;
838                                 case 'r':
839                                         ender = '\r';
840                                         p++;
841                                         break;
842                                 case 't':
843                                         ender = '\t';
844                                         p++;
845                                         break;
846                                 case 'f':
847                                         ender = '\f';
848                                         p++;
849                                         break;
850                                 case 'e':
851                                         ender = '\033';
852                                         p++;
853                                         break;
854                                 case 'a':
855                                         ender = '\007';
856                                         p++;
857                                         break;
858                                 case 'x':
859                                     ender = scan_hex(++p, 2, &numlen);
860                                     p += numlen;
861                                     break;
862                                 case 'c':
863                                     p++;
864                                     ender = *p++;
865                                     if (isLOWER(ender))
866                                         ender = toupper(ender);
867                                     ender ^= 64;
868                                     break;
869                                 case '0': case '1': case '2': case '3':case '4':
870                                 case '5': case '6': case '7': case '8':case '9':
871                                     if (*p == '0' ||
872                                       (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
873                                         ender = scan_oct(p, 3, &numlen);
874                                         p += numlen;
875                                     }
876                                     else {
877                                         --p;
878                                         goto loopdone;
879                                     }
880                                     break;
881                                 case '\0':
882                                     if (p >= regxend)
883                                         FAIL("trailing \\ in regexp");
884                                     /* FALL THROUGH */
885                                 default:
886                                     ender = *p++;
887                                     break;
888                                 }
889                                 break;
890                             default:
891                                 ender = *p++;
892                                 break;
893                             }
894                             if (regfold && isUPPER(ender))
895                                     ender = tolower(ender);
896                             if (ISMULT2(p)) { /* Back off on ?+*. */
897                                 if (len)
898                                     p = oldp;
899                                 else {
900                                     len++;
901                                     regc(ender);
902                                 }
903                                 break;
904                             }
905                             regc(ender);
906                         }
907                     loopdone:
908                         regparse = p;
909                         if (len <= 0)
910                                 FAIL("internal disaster in regexp");
911                         *flagp |= HASWIDTH;
912                         if (len == 1)
913                                 *flagp |= SIMPLE;
914                         if (regcode != &regdummy)
915                             *OPERAND(ret) = len;
916                         regc('\0');
917                 }
918                 break;
919         }
920
921         return(ret);
922 }
923
924 static void
925 regset(bits,def,c)
926 char *bits;
927 I32 def;
928 register I32 c;
929 {
930         if (regcode == &regdummy)
931             return;
932         c &= 255;
933         if (def)
934                 bits[c >> 3] &= ~(1 << (c & 7));
935         else
936                 bits[c >> 3] |=  (1 << (c & 7));
937 }
938
939 static char *
940 regclass()
941 {
942         register char *bits;
943         register I32 class;
944         register I32 lastclass;
945         register I32 range = 0;
946         register char *ret;
947         register I32 def;
948         I32 numlen;
949
950         ret = regnode(ANYOF);
951         if (*regparse == '^') { /* Complement of range. */
952                 regparse++;
953                 def = 0;
954         } else {
955                 def = 255;
956         }
957         bits = regcode;
958         for (class = 0; class < 32; class++)
959             regc(def);
960         if (*regparse == ']' || *regparse == '-')
961                 goto skipcond;          /* allow 1st char to be ] or - */
962         while (regparse < regxend && *regparse != ']') {
963               skipcond:
964                 class = UCHARAT(regparse++);
965                 if (class == '\\') {
966                         class = UCHARAT(regparse++);
967                         switch (class) {
968                         case 'w':
969                                 for (class = 0; class < 256; class++)
970                                     if (isALNUM(class))
971                                         regset(bits,def,class);
972                                 lastclass = 1234;
973                                 continue;
974                         case 'W':
975                                 for (class = 0; class < 256; class++)
976                                     if (!isALNUM(class))
977                                         regset(bits,def,class);
978                                 lastclass = 1234;
979                                 continue;
980                         case 's':
981                                 for (class = 0; class < 256; class++)
982                                     if (isSPACE(class))
983                                         regset(bits,def,class);
984                                 lastclass = 1234;
985                                 continue;
986                         case 'S':
987                                 for (class = 0; class < 256; class++)
988                                     if (!isSPACE(class))
989                                         regset(bits,def,class);
990                                 lastclass = 1234;
991                                 continue;
992                         case 'd':
993                                 for (class = '0'; class <= '9'; class++)
994                                         regset(bits,def,class);
995                                 lastclass = 1234;
996                                 continue;
997                         case 'D':
998                                 for (class = 0; class < '0'; class++)
999                                         regset(bits,def,class);
1000                                 for (class = '9' + 1; class < 256; class++)
1001                                         regset(bits,def,class);
1002                                 lastclass = 1234;
1003                                 continue;
1004                         case 'n':
1005                                 class = '\n';
1006                                 break;
1007                         case 'r':
1008                                 class = '\r';
1009                                 break;
1010                         case 't':
1011                                 class = '\t';
1012                                 break;
1013                         case 'f':
1014                                 class = '\f';
1015                                 break;
1016                         case 'b':
1017                                 class = '\b';
1018                                 break;
1019                         case 'e':
1020                                 class = '\033';
1021                                 break;
1022                         case 'a':
1023                                 class = '\007';
1024                                 break;
1025                         case 'x':
1026                                 class = scan_hex(regparse, 2, &numlen);
1027                                 regparse += numlen;
1028                                 break;
1029                         case 'c':
1030                                 class = *regparse++;
1031                                 if (isLOWER(class))
1032                                     class = toupper(class);
1033                                 class ^= 64;
1034                                 break;
1035                         case '0': case '1': case '2': case '3': case '4':
1036                         case '5': case '6': case '7': case '8': case '9':
1037                                 class = scan_oct(--regparse, 3, &numlen);
1038                                 regparse += numlen;
1039                                 break;
1040                         }
1041                 }
1042                 if (range) {
1043                         if (lastclass > class)
1044                                 FAIL("invalid [] range in regexp");
1045                         range = 0;
1046                 }
1047                 else {
1048                         lastclass = class;
1049                         if (*regparse == '-' && regparse+1 < regxend &&
1050                             regparse[1] != ']') {
1051                                 regparse++;
1052                                 range = 1;
1053                                 continue;       /* do it next time */
1054                         }
1055                 }
1056                 for ( ; lastclass <= class; lastclass++) {
1057                         regset(bits,def,lastclass);
1058                         if (regfold && isUPPER(lastclass))
1059                                 regset(bits,def,tolower(lastclass));
1060                 }
1061                 lastclass = class;
1062         }
1063         if (*regparse != ']')
1064                 FAIL("unmatched [] in regexp");
1065         regparse++;
1066         return ret;
1067 }
1068
1069 /*
1070  - regnode - emit a node
1071  */
1072 static char *                   /* Location. */
1073 regnode(op)
1074 char op;
1075 {
1076         register char *ret;
1077         register char *ptr;
1078
1079         ret = regcode;
1080         if (ret == &regdummy) {
1081 #ifdef REGALIGN
1082                 if (!(regsize & 1))
1083                         regsize++;
1084 #endif
1085                 regsize += 3;
1086                 return(ret);
1087         }
1088
1089 #ifdef REGALIGN
1090 #ifndef lint
1091         if (!((long)ret & 1))
1092             *ret++ = 127;
1093 #endif
1094 #endif
1095         ptr = ret;
1096         *ptr++ = op;
1097         *ptr++ = '\0';          /* Null "next" pointer. */
1098         *ptr++ = '\0';
1099         regcode = ptr;
1100
1101         return(ret);
1102 }
1103
1104 /*
1105  - reganode - emit a node with an argument
1106  */
1107 static char *                   /* Location. */
1108 reganode(op, arg)
1109 char op;
1110 unsigned short arg;
1111 {
1112         register char *ret;
1113         register char *ptr;
1114
1115         ret = regcode;
1116         if (ret == &regdummy) {
1117 #ifdef REGALIGN
1118                 if (!(regsize & 1))
1119                         regsize++;
1120 #endif
1121                 regsize += 5;
1122                 return(ret);
1123         }
1124
1125 #ifdef REGALIGN
1126 #ifndef lint
1127         if (!((long)ret & 1))
1128             *ret++ = 127;
1129 #endif
1130 #endif
1131         ptr = ret;
1132         *ptr++ = op;
1133         *ptr++ = '\0';          /* Null "next" pointer. */
1134         *ptr++ = '\0';
1135 #ifdef REGALIGN
1136         *(unsigned short *)(ret+3) = arg;
1137 #else
1138         ret[3] = arg >> 8; ret[4] = arg & 0377;
1139 #endif
1140         ptr += 2;
1141         regcode = ptr;
1142
1143         return(ret);
1144 }
1145
1146 /*
1147  - regc - emit (if appropriate) a byte of code
1148  */
1149 static void
1150 regc(b)
1151 char b;
1152 {
1153         if (regcode != &regdummy)
1154                 *regcode++ = b;
1155         else
1156                 regsize++;
1157 }
1158
1159 /*
1160  - reginsert - insert an operator in front of already-emitted operand
1161  *
1162  * Means relocating the operand.
1163  */
1164 static void
1165 reginsert(op, opnd)
1166 char op;
1167 char *opnd;
1168 {
1169         register char *src;
1170         register char *dst;
1171         register char *place;
1172         register offset = (op == CURLY ? 4 : 0);
1173
1174         if (regcode == &regdummy) {
1175 #ifdef REGALIGN
1176                 regsize += 4 + offset;
1177 #else
1178                 regsize += 3 + offset;
1179 #endif
1180                 return;
1181         }
1182
1183         src = regcode;
1184 #ifdef REGALIGN
1185         regcode += 4 + offset;
1186 #else
1187         regcode += 3 + offset;
1188 #endif
1189         dst = regcode;
1190         while (src > opnd)
1191                 *--dst = *--src;
1192
1193         place = opnd;           /* Op node, where operand used to be. */
1194         *place++ = op;
1195         *place++ = '\0';
1196         *place++ = '\0';
1197         while (offset-- > 0)
1198             *place++ = '\0';
1199 #ifdef REGALIGN
1200         *place++ = '\177';
1201 #endif
1202 }
1203
1204 /*
1205  - regtail - set the next-pointer at the end of a node chain
1206  */
1207 static void
1208 regtail(p, val)
1209 char *p;
1210 char *val;
1211 {
1212         register char *scan;
1213         register char *temp;
1214         register I32 offset;
1215
1216         if (p == &regdummy)
1217                 return;
1218
1219         /* Find last node. */
1220         scan = p;
1221         for (;;) {
1222                 temp = regnext(scan);
1223                 if (temp == NULL)
1224                         break;
1225                 scan = temp;
1226         }
1227
1228 #ifdef REGALIGN
1229         offset = val - scan;
1230 #ifndef lint
1231         *(short*)(scan+1) = offset;
1232 #else
1233         offset = offset;
1234 #endif
1235 #else
1236         if (OP(scan) == BACK)
1237                 offset = scan - val;
1238         else
1239                 offset = val - scan;
1240         *(scan+1) = (offset>>8)&0377;
1241         *(scan+2) = offset&0377;
1242 #endif
1243 }
1244
1245 /*
1246  - regoptail - regtail on operand of first argument; nop if operandless
1247  */
1248 static void
1249 regoptail(p, val)
1250 char *p;
1251 char *val;
1252 {
1253         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1254         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1255                 return;
1256         regtail(NEXTOPER(p), val);
1257 }
1258
1259 /*
1260  - regcurly - a little FSA that accepts {\d+,?\d*}
1261  */
1262 STATIC I32
1263 regcurly(s)
1264 register char *s;
1265 {
1266     if (*s++ != '{')
1267         return FALSE;
1268     if (!isDIGIT(*s))
1269         return FALSE;
1270     while (isDIGIT(*s))
1271         s++;
1272     if (*s == ',')
1273         s++;
1274     while (isDIGIT(*s))
1275         s++;
1276     if (*s != '}')
1277         return FALSE;
1278     return TRUE;
1279 }
1280
1281 #ifdef DEBUGGING
1282
1283 /*
1284  - regdump - dump a regexp onto stderr in vaguely comprehensible form
1285  */
1286 void
1287 regdump(r)
1288 regexp *r;
1289 {
1290         register char *s;
1291         register char op = EXACTLY;     /* Arbitrary non-END op. */
1292         register char *next;
1293
1294
1295         s = r->program + 1;
1296         while (op != END) {     /* While that wasn't END last time... */
1297 #ifdef REGALIGN
1298                 if (!((long)s & 1))
1299                         s++;
1300 #endif
1301                 op = OP(s);
1302                 fprintf(stderr,"%2d%s", s-r->program, regprop(s));      /* Where, what. */
1303                 next = regnext(s);
1304                 s += regarglen[op];
1305                 if (next == NULL)               /* Next ptr. */
1306                         fprintf(stderr,"(0)");
1307                 else 
1308                         fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1309                 s += 3;
1310                 if (op == ANYOF) {
1311                         s += 32;
1312                 }
1313                 if (op == EXACTLY) {
1314                         /* Literal string, where present. */
1315                         s++;
1316                         while (*s != '\0') {
1317                                 (void)putc(*s, stderr);
1318                                 s++;
1319                         }
1320                         s++;
1321                 }
1322                 (void)putc('\n', stderr);
1323         }
1324
1325         /* Header fields of interest. */
1326         if (r->regstart)
1327                 fprintf(stderr,"start `%s' ", SvPV(r->regstart));
1328         if (r->regstclass)
1329                 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1330         if (r->reganch & ROPT_ANCH)
1331                 fprintf(stderr,"anchored ");
1332         if (r->reganch & ROPT_SKIP)
1333                 fprintf(stderr,"plus ");
1334         if (r->reganch & ROPT_IMPLICIT)
1335                 fprintf(stderr,"implicit ");
1336         if (r->regmust != NULL)
1337                 fprintf(stderr,"must have \"%s\" back %d ", SvPV(r->regmust),
1338                   r->regback);
1339         fprintf(stderr, "minlen %d ", r->minlen);
1340         fprintf(stderr,"\n");
1341 }
1342
1343 /*
1344  - regprop - printable representation of opcode
1345  */
1346 char *
1347 regprop(op)
1348 char *op;
1349 {
1350         register char *p;
1351
1352         (void) strcpy(buf, ":");
1353
1354         switch (OP(op)) {
1355         case BOL:
1356                 p = "BOL";
1357                 break;
1358         case EOL:
1359                 p = "EOL";
1360                 break;
1361         case ANY:
1362                 p = "ANY";
1363                 break;
1364         case ANYOF:
1365                 p = "ANYOF";
1366                 break;
1367         case BRANCH:
1368                 p = "BRANCH";
1369                 break;
1370         case EXACTLY:
1371                 p = "EXACTLY";
1372                 break;
1373         case NOTHING:
1374                 p = "NOTHING";
1375                 break;
1376         case BACK:
1377                 p = "BACK";
1378                 break;
1379         case END:
1380                 p = "END";
1381                 break;
1382         case ALNUM:
1383                 p = "ALNUM";
1384                 break;
1385         case NALNUM:
1386                 p = "NALNUM";
1387                 break;
1388         case BOUND:
1389                 p = "BOUND";
1390                 break;
1391         case NBOUND:
1392                 p = "NBOUND";
1393                 break;
1394         case SPACE:
1395                 p = "SPACE";
1396                 break;
1397         case NSPACE:
1398                 p = "NSPACE";
1399                 break;
1400         case DIGIT:
1401                 p = "DIGIT";
1402                 break;
1403         case NDIGIT:
1404                 p = "NDIGIT";
1405                 break;
1406         case CURLY:
1407                 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1408                     ARG1(op),ARG2(op));
1409                 p = NULL;
1410                 break;
1411         case REF:
1412                 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1413                 p = NULL;
1414                 break;
1415         case OPEN:
1416                 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1417                 p = NULL;
1418                 break;
1419         case CLOSE:
1420                 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1421                 p = NULL;
1422                 break;
1423         case STAR:
1424                 p = "STAR";
1425                 break;
1426         case PLUS:
1427                 p = "PLUS";
1428                 break;
1429         default:
1430                 FAIL("corrupted regexp opcode");
1431         }
1432         if (p != NULL)
1433                 (void) strcat(buf, p);
1434         return(buf);
1435 }
1436 #endif /* DEBUGGING */
1437
1438 void
1439 regfree(r)
1440 struct regexp *r;
1441 {
1442         if (r->precomp) {
1443                 Safefree(r->precomp);
1444                 r->precomp = Nullch;
1445         }
1446         if (r->subbase) {
1447                 Safefree(r->subbase);
1448                 r->subbase = Nullch;
1449         }
1450         if (r->regmust) {
1451                 sv_free(r->regmust);
1452                 r->regmust = Nullsv;
1453         }
1454         if (r->regstart) {
1455                 sv_free(r->regstart);
1456                 r->regstart = Nullsv;
1457         }
1458         Safefree(r->startp);
1459         Safefree(r->endp);
1460         Safefree(r);
1461 }