perl 3.0 patch #38 (combined patch)
[p5sagit/p5-mst-13.2.git] / regcomp.c
CommitLineData
a687059c 1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
e5d73d77 10/* $Header: regcomp.c,v 3.0.1.7 90/10/20 02:18:32 lwall Locked $
a687059c 11 *
12 * $Log: regcomp.c,v $
e5d73d77 13 * Revision 3.0.1.7 90/10/20 02:18:32 lwall
14 * patch37: /foo.*bar$/ wrongly optimized to do tail matching on "foo"
15 *
0a12ae7d 16 * Revision 3.0.1.6 90/10/16 10:17:33 lwall
17 * patch29: patterns with multiple short literal strings sometimes failed
18 *
6eb13c3b 19 * Revision 3.0.1.5 90/08/13 22:23:29 lwall
20 * patch28: /x{m}/ didn't work right
21 *
00bf170e 22 * Revision 3.0.1.4 90/08/09 05:05:33 lwall
23 * patch19: sped up /x+y/ patterns greatly by not retrying on every x
24 * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
25 * patch19: sped up {m,n} on simple items
26 * patch19: optimized /.*whatever/ to /^.*whatever/
27 * patch19: fixed character classes to allow backslashing hyphen
28 *
79a0689e 29 * Revision 3.0.1.3 90/03/12 16:59:22 lwall
30 * patch13: pattern matches can now use \0 to mean \000
31 *
ac58e20f 32 * Revision 3.0.1.2 90/02/28 18:08:35 lwall
33 * patch9: /[\200-\377]/ didn't work on machines with signed chars
34 *
ae986130 35 * Revision 3.0.1.1 89/11/11 04:51:04 lwall
36 * patch2: /[\000]/ didn't work
37 *
a687059c 38 * Revision 3.0 89/10/18 15:22:29 lwall
39 * 3.0 baseline
40 *
41 */
42
43/*
44 * regcomp and regexec -- regsub and regerror are not used in perl
45 *
46 * Copyright (c) 1986 by University of Toronto.
47 * Written by Henry Spencer. Not derived from licensed software.
48 *
49 * Permission is granted to anyone to use this software for any
50 * purpose on any computer system, and to redistribute it freely,
51 * subject to the following restrictions:
52 *
53 * 1. The author is not responsible for the consequences of use of
54 * this software, no matter how awful, even if they arise
55 * from defects in it.
56 *
57 * 2. The origin of this software must not be misrepresented, either
58 * by explicit claim or by omission.
59 *
60 * 3. Altered versions must be plainly marked as such, and must not
61 * be misrepresented as being the original software.
62 *
63 *
64 **** Alterations to Henry's code are...
65 ****
66 **** Copyright (c) 1989, Larry Wall
67 ****
68 **** You may distribute under the terms of the GNU General Public License
69 **** as specified in the README file that comes with the perl 3.0 kit.
70 *
71 * Beware that some of this code is subtly aware of the way operator
72 * precedence is structured in regular expressions. Serious changes in
73 * regular-expression syntax might require a total rethink.
74 */
75#include "EXTERN.h"
76#include "perl.h"
77#include "INTERN.h"
78#include "regcomp.h"
79
80#ifndef STATIC
81#define STATIC static
82#endif
83
84#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
85#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
86 ((*s) == '{' && regcurly(s)))
87#define META "^$.[()|?+*\\"
88
89/*
90 * Flags to be passed up and down.
91 */
92#define HASWIDTH 01 /* Known never to match null string. */
93#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
94#define SPSTART 04 /* Starts with * or +. */
95#define WORST 0 /* Worst case. */
96
97/*
98 * Global work variables for regcomp().
99 */
100static char *regprecomp; /* uncompiled string. */
101static char *regparse; /* Input-scan pointer. */
102static char *regxend; /* End of input for compile */
103static int regnpar; /* () count. */
104static char *regcode; /* Code-emit pointer; &regdummy = don't. */
105static long regsize; /* Code size. */
106static int regfold;
107static int regsawbracket; /* Did we do {d,d} trick? */
108
109/*
110 * Forward declarations for regcomp()'s friends.
111 */
112STATIC int regcurly();
113STATIC char *reg();
114STATIC char *regbranch();
115STATIC char *regpiece();
116STATIC char *regatom();
117STATIC char *regclass();
118STATIC char *regnode();
119STATIC void regc();
120STATIC void reginsert();
121STATIC void regtail();
122STATIC void regoptail();
123
124/*
125 - regcomp - compile a regular expression into internal code
126 *
127 * We can't allocate space until we know how big the compiled form will be,
128 * but we can't compile it (and thus know how big it is) until we've got a
129 * place to put the code. So we cheat: we compile it twice, once with code
130 * generation turned off and size counting turned on, and once "for real".
131 * This also means that we don't allocate space until we are sure that the
132 * thing really will compile successfully, and we never have to move the
133 * code and thus invalidate pointers into it. (Note that it has to be in
134 * one piece because free() must be able to free it all.) [NB: not true in perl]
135 *
136 * Beware that the optimization-preparation code in here knows about some
137 * of the structure of the compiled regexp. [I'll say.]
138 */
139regexp *
00bf170e 140regcomp(exp,xend,fold)
a687059c 141char *exp;
142char *xend;
143int fold;
a687059c 144{
145 register regexp *r;
146 register char *scan;
0a12ae7d 147 register STR *longish;
148 STR *longest;
a687059c 149 register int len;
150 register char *first;
151 int flags;
152 int back;
153 int curback;
154 extern char *safemalloc();
155 extern char *savestr();
00bf170e 156 int sawplus = 0;
a687059c 157
158 if (exp == NULL)
159 fatal("NULL regexp argument");
160
161 /* First pass: determine size, legality. */
162 regfold = fold;
163 regparse = exp;
164 regxend = xend;
165 regprecomp = nsavestr(exp,xend-exp);
166 regsawbracket = 0;
167 regnpar = 1;
168 regsize = 0L;
169 regcode = &regdummy;
170 regc(MAGIC);
171 if (reg(0, &flags) == NULL) {
172 Safefree(regprecomp);
173 return(NULL);
174 }
175
176 /* Small enough for pointer-storage convention? */
177 if (regsize >= 32767L) /* Probably could be 65535L. */
178 FAIL("regexp too big");
179
180 /* Allocate space. */
181 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
182 if (r == NULL)
183 FAIL("regexp out of space");
184
185 /* Second pass: emit code. */
186 if (regsawbracket)
187 bcopy(regprecomp,exp,xend-exp);
188 r->precomp = regprecomp;
189 r->subbase = NULL;
190 regparse = exp;
191 regnpar = 1;
192 regcode = r->program;
193 regc(MAGIC);
194 if (reg(0, &flags) == NULL)
195 return(NULL);
196
197 /* Dig out information for optimizations. */
198 r->regstart = Nullstr; /* Worst-case defaults. */
199 r->reganch = 0;
200 r->regmust = Nullstr;
201 r->regback = -1;
202 r->regstclass = Nullch;
203 scan = r->program+1; /* First BRANCH. */
204 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
205 scan = NEXTOPER(scan);
206
207 first = scan;
208 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
209 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
00bf170e 210 (OP(first) == PLUS) ||
211 (OP(first) == CURLY && ARG1(first) > 0) ) {
212 if (OP(first) == CURLY)
213 first += 4;
214 else if (OP(first) == PLUS)
215 sawplus = 2;
a687059c 216 first = NEXTOPER(first);
00bf170e 217 }
a687059c 218
219 /* Starting-point info. */
220 if (OP(first) == EXACTLY) {
221 r->regstart =
222 str_make(OPERAND(first)+1,*OPERAND(first));
223 if (r->regstart->str_cur > !(sawstudy|fold))
224 fbmcompile(r->regstart,fold);
225 }
226 else if ((exp = index(simple,OP(first))) && exp > simple)
227 r->regstclass = first;
228 else if (OP(first) == BOUND || OP(first) == NBOUND)
229 r->regstclass = first;
00bf170e 230 else if (OP(first) == BOL ||
231 (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
232 r->reganch = 1; /* kinda turn .* into ^.* */
233 r->reganch |= sawplus;
a687059c 234
235#ifdef DEBUGGING
236 if (debug & 512)
237 fprintf(stderr,"first %d next %d offset %d\n",
238 OP(first), OP(NEXTOPER(first)), first - scan);
239#endif
240 /*
241 * If there's something expensive in the r.e., find the
242 * longest literal string that must appear and make it the
243 * regmust. Resolve ties in favor of later strings, since
244 * the regstart check works with the beginning of the r.e.
245 * and avoiding duplication strengthens checking. Not a
246 * strong reason, but sufficient in the absence of others.
247 * [Now we resolve ties in favor of the earlier string if
248 * it happens that curback has been invalidated, since the
249 * earlier string may buy us something the later one won't.]
250 */
0a12ae7d 251 longish = str_make("",0);
a687059c 252 longest = str_make("",0);
253 len = 0;
254 curback = 0;
255 back = 0;
e5d73d77 256 while (OP(scan) != END) {
a687059c 257 if (OP(scan) == BRANCH) {
258 if (OP(regnext(scan)) == BRANCH) {
259 curback = -30000;
260 while (OP(scan) == BRANCH)
261 scan = regnext(scan);
262 }
263 else /* single branch is ok */
264 scan = NEXTOPER(scan);
265 }
266 if (OP(scan) == EXACTLY) {
267 first = scan;
268 while (OP(regnext(scan)) >= CLOSE)
269 scan = regnext(scan);
270 if (curback - back == len) {
0a12ae7d 271 str_ncat(longish, OPERAND(first)+1,
a687059c 272 *OPERAND(first));
273 len += *OPERAND(first);
274 curback += *OPERAND(first);
275 first = regnext(scan);
276 }
277 else if (*OPERAND(first) >= len + (curback >= 0)) {
278 len = *OPERAND(first);
0a12ae7d 279 str_nset(longish, OPERAND(first)+1,len);
a687059c 280 back = curback;
281 curback += len;
282 first = regnext(scan);
283 }
284 else
285 curback += *OPERAND(first);
286 }
0a12ae7d 287 else if (index(varies,OP(scan))) {
288 curback = -30000;
289 len = 0;
290 if (longish->str_cur > longest->str_cur)
291 str_sset(longest,longish);
292 str_nset(longish,"",0);
293 }
e5d73d77 294 else if (index(simple,OP(scan))) {
0a12ae7d 295 curback++;
e5d73d77 296 len = 0;
297 if (longish->str_cur > longest->str_cur)
298 str_sset(longest,longish);
299 str_nset(longish,"",0);
300 }
a687059c 301 scan = regnext(scan);
302 }
e5d73d77 303
304 /* Prefer earlier on tie, unless we can tail match latter */
305
306 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur)
0a12ae7d 307 str_sset(longest,longish);
e5d73d77 308 else
309 str_nset(longish,"",0);
0a12ae7d 310 if (longest->str_cur) {
a687059c 311 r->regmust = longest;
312 if (back < 0)
313 back = -1;
314 r->regback = back;
0a12ae7d 315 if (longest->str_cur
316 > !(sawstudy || fold || OP(first) == EOL) )
a687059c 317 fbmcompile(r->regmust,fold);
318 r->regmust->str_u.str_useful = 100;
e5d73d77 319 if (OP(first) == EOL && longish->str_cur)
a687059c 320 r->regmust->str_pok |= SP_TAIL;
321 }
322 else
323 str_free(longest);
e5d73d77 324 str_free(longish);
a687059c 325 }
326
327 r->do_folding = fold;
328 r->nparens = regnpar - 1;
329#ifdef DEBUGGING
330 if (debug & 512)
331 regdump(r);
332#endif
333 return(r);
334}
335
336/*
337 - reg - regular expression, i.e. main body or parenthesized thing
338 *
339 * Caller must absorb opening parenthesis.
340 *
341 * Combining parenthesis handling with the base level of regular expression
342 * is a trifle forced, but the need to tie the tails of the branches to what
343 * follows makes it hard to avoid.
344 */
345static char *
346reg(paren, flagp)
347int paren; /* Parenthesized? */
348int *flagp;
349{
350 register char *ret;
351 register char *br;
352 register char *ender;
353 register int parno;
354 int flags;
355
356 *flagp = HASWIDTH; /* Tentatively. */
357
358 /* Make an OPEN node, if parenthesized. */
359 if (paren) {
360 if (regnpar >= NSUBEXP)
361 FAIL("too many () in regexp");
362 parno = regnpar;
363 regnpar++;
364 ret = regnode(OPEN+parno);
365 } else
366 ret = NULL;
367
368 /* Pick up the branches, linking them together. */
369 br = regbranch(&flags);
370 if (br == NULL)
371 return(NULL);
372 if (ret != NULL)
373 regtail(ret, br); /* OPEN -> first. */
374 else
375 ret = br;
376 if (!(flags&HASWIDTH))
377 *flagp &= ~HASWIDTH;
378 *flagp |= flags&SPSTART;
379 while (*regparse == '|') {
380 regparse++;
381 br = regbranch(&flags);
382 if (br == NULL)
383 return(NULL);
384 regtail(ret, br); /* BRANCH -> BRANCH. */
385 if (!(flags&HASWIDTH))
386 *flagp &= ~HASWIDTH;
387 *flagp |= flags&SPSTART;
388 }
389
390 /* Make a closing node, and hook it on the end. */
391 ender = regnode((paren) ? CLOSE+parno : END);
392 regtail(ret, ender);
393
394 /* Hook the tails of the branches to the closing node. */
395 for (br = ret; br != NULL; br = regnext(br))
396 regoptail(br, ender);
397
398 /* Check for proper termination. */
399 if (paren && *regparse++ != ')') {
400 FAIL("unmatched () in regexp");
401 } else if (!paren && regparse < regxend) {
402 if (*regparse == ')') {
403 FAIL("unmatched () in regexp");
404 } else
405 FAIL("junk on end of regexp"); /* "Can't happen". */
406 /* NOTREACHED */
407 }
408
409 return(ret);
410}
411
412/*
413 - regbranch - one alternative of an | operator
414 *
415 * Implements the concatenation operator.
416 */
417static char *
418regbranch(flagp)
419int *flagp;
420{
421 register char *ret;
422 register char *chain;
423 register char *latest;
424 int flags;
425
426 *flagp = WORST; /* Tentatively. */
427
428 ret = regnode(BRANCH);
429 chain = NULL;
430 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
431 latest = regpiece(&flags);
432 if (latest == NULL)
433 return(NULL);
434 *flagp |= flags&HASWIDTH;
435 if (chain == NULL) /* First piece. */
436 *flagp |= flags&SPSTART;
437 else
438 regtail(chain, latest);
439 chain = latest;
440 }
441 if (chain == NULL) /* Loop ran zero times. */
442 (void) regnode(NOTHING);
443
444 return(ret);
445}
446
447/*
448 - regpiece - something followed by possible [*+?]
449 *
450 * Note that the branching code sequences used for ? and the general cases
451 * of * and + are somewhat optimized: they use the same NOTHING node as
452 * both the endmarker for their branch list and the body of the last branch.
453 * It might seem that this node could be dispensed with entirely, but the
454 * endmarker role is not redundant.
455 */
456static char *
457regpiece(flagp)
458int *flagp;
459{
460 register char *ret;
461 register char op;
462 register char *next;
463 int flags;
464 char *origparse = regparse;
465 int orignpar = regnpar;
466 char *max;
467 int iter;
468 char ch;
469
470 ret = regatom(&flags);
471 if (ret == NULL)
472 return(NULL);
473
474 op = *regparse;
475
476 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
477 * then we decrement the first number by one and reset our
478 * parsing back to the beginning of the same atom. If the first number
479 * is down to 0, decrement the second number instead and fake up
480 * a ? after it. Given the way this compiler doesn't keep track
481 * of offsets on the first pass, this is the only way to replicate
482 * a piece of code. Sigh.
483 */
484 if (op == '{' && regcurly(regparse)) {
485 next = regparse + 1;
486 max = Nullch;
487 while (isdigit(*next) || *next == ',') {
488 if (*next == ',') {
489 if (max)
490 break;
491 else
492 max = next;
493 }
494 next++;
495 }
496 if (*next == '}') { /* got one */
a687059c 497 if (!max)
498 max = next;
499 regparse++;
500 iter = atoi(regparse);
00bf170e 501 if (flags&SIMPLE) { /* we can do it right after all */
502 int tmp;
503
504 reginsert(CURLY, ret);
505 if (*max == ',')
506 max++;
6eb13c3b 507 else
508 max = regparse;
00bf170e 509 tmp = atoi(max);
510 if (tmp && tmp < iter)
511 fatal("Can't do {n,m} with n > m");
512 if (regcode != &regdummy) {
513#ifdef REGALIGN
514 *(unsigned short *)(ret+3) = iter;
515 *(unsigned short *)(ret+5) = tmp;
516#else
517 ret[3] = iter >> 8; ret[4] = iter & 0377;
518 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
519#endif
520 }
521 regparse = next;
522 goto nest_check;
523 }
524 regsawbracket++; /* remember we clobbered exp */
a687059c 525 if (iter > 0) {
526 ch = *max;
527 sprintf(regparse,"%.*d", max-regparse, iter - 1);
528 *max = ch;
00bf170e 529 if (*max == ',' && max[1] != '}') {
530 if (atoi(max+1) <= 0)
531 fatal("Can't do {n,m} with n > m");
a687059c 532 ch = *next;
533 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
534 *next = ch;
535 }
00bf170e 536 if (iter != 1 || *max == ',') {
a687059c 537 regparse = origparse; /* back up input pointer */
538 regnpar = orignpar; /* don't make more parens */
539 }
540 else {
541 regparse = next;
542 goto nest_check;
543 }
544 *flagp = flags;
545 return ret;
546 }
547 if (*max == ',') {
548 max++;
549 iter = atoi(max);
550 if (max == next) { /* any number more? */
551 regparse = next;
552 op = '*'; /* fake up one with a star */
553 }
554 else if (iter > 0) {
555 op = '?'; /* fake up optional atom */
556 ch = *next;
557 sprintf(max,"%.*d", next-max, iter - 1);
558 *next = ch;
559 if (iter == 1)
560 regparse = next;
561 else {
562 regparse = origparse - 1; /* offset ++ below */
563 regnpar = orignpar;
564 }
565 }
566 else
567 fatal("Can't do {n,0}");
568 }
569 else
570 fatal("Can't do {0}");
571 }
572 }
573
574 if (!ISMULT1(op)) {
575 *flagp = flags;
576 return(ret);
577 }
578
579 if (!(flags&HASWIDTH) && op != '?')
580 FAIL("regexp *+ operand could be empty");
581 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
582
583 if (op == '*' && (flags&SIMPLE))
584 reginsert(STAR, ret);
585 else if (op == '*') {
586 /* Emit x* as (x&|), where & means "self". */
587 reginsert(BRANCH, ret); /* Either x */
588 regoptail(ret, regnode(BACK)); /* and loop */
589 regoptail(ret, ret); /* back */
590 regtail(ret, regnode(BRANCH)); /* or */
591 regtail(ret, regnode(NOTHING)); /* null. */
592 } else if (op == '+' && (flags&SIMPLE))
593 reginsert(PLUS, ret);
594 else if (op == '+') {
595 /* Emit x+ as x(&|), where & means "self". */
596 next = regnode(BRANCH); /* Either */
597 regtail(ret, next);
598 regtail(regnode(BACK), ret); /* loop back */
599 regtail(next, regnode(BRANCH)); /* or */
600 regtail(ret, regnode(NOTHING)); /* null. */
601 } else if (op == '?') {
602 /* Emit x? as (x|) */
603 reginsert(BRANCH, ret); /* Either x */
604 regtail(ret, regnode(BRANCH)); /* or */
605 next = regnode(NOTHING); /* null. */
606 regtail(ret, next);
607 regoptail(ret, next);
608 }
609 nest_check:
610 regparse++;
611 if (ISMULT2(regparse))
612 FAIL("nested *?+ in regexp");
613
614 return(ret);
615}
616
617/*
618 - regatom - the lowest level
619 *
620 * Optimization: gobbles an entire sequence of ordinary characters so that
621 * it can turn them into a single node, which is smaller to store and
622 * faster to run. Backslashed characters are exceptions, each becoming a
623 * separate node; the code is simpler that way and it's not worth fixing.
624 *
625 * [Yes, it is worth fixing, some scripts can run twice the speed.]
626 */
627static char *
628regatom(flagp)
629int *flagp;
630{
631 register char *ret;
632 int flags;
633
634 *flagp = WORST; /* Tentatively. */
635
636 switch (*regparse++) {
637 case '^':
638 ret = regnode(BOL);
639 break;
640 case '$':
641 ret = regnode(EOL);
642 break;
643 case '.':
644 ret = regnode(ANY);
645 *flagp |= HASWIDTH|SIMPLE;
646 break;
647 case '[':
648 ret = regclass();
649 *flagp |= HASWIDTH|SIMPLE;
650 break;
651 case '(':
652 ret = reg(1, &flags);
653 if (ret == NULL)
654 return(NULL);
655 *flagp |= flags&(HASWIDTH|SPSTART);
656 break;
657 case '|':
658 case ')':
659 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
660 break;
661 case '?':
662 case '+':
663 case '*':
664 FAIL("?+* follows nothing in regexp");
665 break;
666 case '\\':
667 switch (*regparse) {
668 case 'w':
669 ret = regnode(ALNUM);
670 *flagp |= HASWIDTH|SIMPLE;
671 regparse++;
672 break;
673 case 'W':
674 ret = regnode(NALNUM);
675 *flagp |= HASWIDTH|SIMPLE;
676 regparse++;
677 break;
678 case 'b':
679 ret = regnode(BOUND);
680 *flagp |= SIMPLE;
681 regparse++;
682 break;
683 case 'B':
684 ret = regnode(NBOUND);
685 *flagp |= SIMPLE;
686 regparse++;
687 break;
688 case 's':
689 ret = regnode(SPACE);
690 *flagp |= HASWIDTH|SIMPLE;
691 regparse++;
692 break;
693 case 'S':
694 ret = regnode(NSPACE);
695 *flagp |= HASWIDTH|SIMPLE;
696 regparse++;
697 break;
698 case 'd':
699 ret = regnode(DIGIT);
700 *flagp |= HASWIDTH|SIMPLE;
701 regparse++;
702 break;
703 case 'D':
704 ret = regnode(NDIGIT);
705 *flagp |= HASWIDTH|SIMPLE;
706 regparse++;
707 break;
708 case 'n':
709 case 'r':
710 case 't':
711 case 'f':
712 goto defchar;
713 case '0': case '1': case '2': case '3': case '4':
714 case '5': case '6': case '7': case '8': case '9':
79a0689e 715 if (isdigit(regparse[1]) || *regparse == '0')
a687059c 716 goto defchar;
717 else {
718 ret = regnode(REF + *regparse++ - '0');
719 *flagp |= SIMPLE;
720 }
721 break;
722 case '\0':
723 if (regparse >= regxend)
724 FAIL("trailing \\ in regexp");
725 /* FALL THROUGH */
726 default:
727 goto defchar;
728 }
729 break;
730 default: {
731 register int len;
732 register char ender;
733 register char *p;
734 char *oldp;
735 int foo;
736
737 defchar:
738 ret = regnode(EXACTLY);
739 regc(0); /* save spot for len */
740 for (len=0, p=regparse-1;
741 len < 127 && p < regxend;
742 len++)
743 {
744 oldp = p;
745 switch (*p) {
746 case '^':
747 case '$':
748 case '.':
749 case '[':
750 case '(':
751 case ')':
752 case '|':
753 goto loopdone;
754 case '\\':
755 switch (*++p) {
756 case 'w':
757 case 'W':
758 case 'b':
759 case 'B':
760 case 's':
761 case 'S':
762 case 'd':
763 case 'D':
764 --p;
765 goto loopdone;
766 case 'n':
767 ender = '\n';
768 p++;
769 break;
770 case 'r':
771 ender = '\r';
772 p++;
773 break;
774 case 't':
775 ender = '\t';
776 p++;
777 break;
778 case 'f':
779 ender = '\f';
780 p++;
781 break;
782 case '0': case '1': case '2': case '3':case '4':
783 case '5': case '6': case '7': case '8':case '9':
79a0689e 784 if (isdigit(p[1]) || *p == '0') {
785 foo = *p - '0';
786 if (isdigit(p[1]))
787 foo = (foo<<3) + *++p - '0';
a687059c 788 if (isdigit(p[1]))
789 foo = (foo<<3) + *++p - '0';
790 ender = foo;
791 p++;
792 }
793 else {
794 --p;
795 goto loopdone;
796 }
797 break;
798 case '\0':
799 if (p >= regxend)
800 FAIL("trailing \\ in regexp");
801 /* FALL THROUGH */
802 default:
803 ender = *p++;
804 break;
805 }
806 break;
807 default:
808 ender = *p++;
809 break;
810 }
811 if (regfold && isupper(ender))
812 ender = tolower(ender);
813 if (ISMULT2(p)) { /* Back off on ?+*. */
814 if (len)
815 p = oldp;
816 else {
817 len++;
818 regc(ender);
819 }
820 break;
821 }
822 regc(ender);
823 }
824 loopdone:
825 regparse = p;
826 if (len <= 0)
827 FAIL("internal disaster in regexp");
828 *flagp |= HASWIDTH;
829 if (len == 1)
830 *flagp |= SIMPLE;
831 if (regcode != &regdummy)
832 *OPERAND(ret) = len;
833 regc('\0');
834 }
835 break;
836 }
837
838 return(ret);
839}
840
841static void
842regset(bits,def,c)
843char *bits;
844int def;
845register int c;
846{
847 if (regcode == &regdummy)
848 return;
ac58e20f 849 c &= 255;
a687059c 850 if (def)
851 bits[c >> 3] &= ~(1 << (c & 7));
852 else
853 bits[c >> 3] |= (1 << (c & 7));
854}
855
856static char *
857regclass()
858{
859 register char *bits;
860 register int class;
861 register int lastclass;
862 register int range = 0;
863 register char *ret;
864 register int def;
865
00bf170e 866 ret = regnode(ANYOF);
a687059c 867 if (*regparse == '^') { /* Complement of range. */
a687059c 868 regparse++;
869 def = 0;
870 } else {
a687059c 871 def = 255;
872 }
873 bits = regcode;
874 for (class = 0; class < 32; class++)
875 regc(def);
876 if (*regparse == ']' || *regparse == '-')
00bf170e 877 goto skipcond; /* allow 1st char to be ] or - */
a687059c 878 while (regparse < regxend && *regparse != ']') {
00bf170e 879 skipcond:
a687059c 880 class = UCHARAT(regparse++);
881 if (class == '\\') {
882 class = UCHARAT(regparse++);
883 switch (class) {
884 case 'w':
885 for (class = 'a'; class <= 'z'; class++)
886 regset(bits,def,class);
887 for (class = 'A'; class <= 'Z'; class++)
888 regset(bits,def,class);
889 for (class = '0'; class <= '9'; class++)
890 regset(bits,def,class);
891 regset(bits,def,'_');
892 lastclass = 1234;
893 continue;
894 case 's':
895 regset(bits,def,' ');
896 regset(bits,def,'\t');
897 regset(bits,def,'\r');
898 regset(bits,def,'\f');
899 regset(bits,def,'\n');
900 lastclass = 1234;
901 continue;
902 case 'd':
903 for (class = '0'; class <= '9'; class++)
904 regset(bits,def,class);
905 lastclass = 1234;
906 continue;
907 case 'n':
908 class = '\n';
909 break;
910 case 'r':
911 class = '\r';
912 break;
913 case 't':
914 class = '\t';
915 break;
916 case 'f':
917 class = '\f';
918 break;
919 case 'b':
920 class = '\b';
921 break;
922 case '0': case '1': case '2': case '3': case '4':
923 case '5': case '6': case '7': case '8': case '9':
924 class -= '0';
925 if (isdigit(*regparse)) {
926 class <<= 3;
927 class += *regparse++ - '0';
928 }
929 if (isdigit(*regparse)) {
930 class <<= 3;
931 class += *regparse++ - '0';
932 }
933 break;
934 }
935 }
a687059c 936 if (range) {
937 if (lastclass > class)
938 FAIL("invalid [] range in regexp");
00bf170e 939 range = 0;
a687059c 940 }
00bf170e 941 else {
942 lastclass = class;
943 if (*regparse == '-' && regparse+1 < regxend &&
944 regparse[1] != ']') {
945 regparse++;
946 range = 1;
947 continue; /* do it next time */
948 }
949 }
950 for ( ; lastclass <= class; lastclass++) {
a687059c 951 regset(bits,def,lastclass);
952 if (regfold && isupper(lastclass))
953 regset(bits,def,tolower(lastclass));
954 }
955 lastclass = class;
956 }
957 if (*regparse != ']')
958 FAIL("unmatched [] in regexp");
a687059c 959 regparse++;
960 return ret;
961}
962
963/*
964 - regnode - emit a node
965 */
966static char * /* Location. */
967regnode(op)
968char op;
969{
970 register char *ret;
971 register char *ptr;
972
973 ret = regcode;
974 if (ret == &regdummy) {
975#ifdef REGALIGN
976 if (!(regsize & 1))
977 regsize++;
978#endif
979 regsize += 3;
980 return(ret);
981 }
982
983#ifdef REGALIGN
984#ifndef lint
985 if (!((long)ret & 1))
986 *ret++ = 127;
987#endif
988#endif
989 ptr = ret;
990 *ptr++ = op;
991 *ptr++ = '\0'; /* Null "next" pointer. */
992 *ptr++ = '\0';
993 regcode = ptr;
994
995 return(ret);
996}
997
998/*
999 - regc - emit (if appropriate) a byte of code
1000 */
1001static void
1002regc(b)
1003char b;
1004{
1005 if (regcode != &regdummy)
1006 *regcode++ = b;
1007 else
1008 regsize++;
1009}
1010
1011/*
1012 - reginsert - insert an operator in front of already-emitted operand
1013 *
1014 * Means relocating the operand.
1015 */
1016static void
1017reginsert(op, opnd)
1018char op;
1019char *opnd;
1020{
1021 register char *src;
1022 register char *dst;
1023 register char *place;
00bf170e 1024 register offset = (op == CURLY ? 4 : 0);
a687059c 1025
1026 if (regcode == &regdummy) {
1027#ifdef REGALIGN
00bf170e 1028 regsize += 4 + offset;
a687059c 1029#else
00bf170e 1030 regsize += 3 + offset;
a687059c 1031#endif
1032 return;
1033 }
1034
1035 src = regcode;
1036#ifdef REGALIGN
00bf170e 1037 regcode += 4 + offset;
a687059c 1038#else
00bf170e 1039 regcode += 3 + offset;
a687059c 1040#endif
1041 dst = regcode;
1042 while (src > opnd)
1043 *--dst = *--src;
1044
1045 place = opnd; /* Op node, where operand used to be. */
1046 *place++ = op;
1047 *place++ = '\0';
1048 *place++ = '\0';
00bf170e 1049 while (offset-- > 0)
1050 *place++ = '\0';
a687059c 1051}
1052
1053/*
1054 - regtail - set the next-pointer at the end of a node chain
1055 */
1056static void
1057regtail(p, val)
1058char *p;
1059char *val;
1060{
1061 register char *scan;
1062 register char *temp;
1063 register int offset;
1064
1065 if (p == &regdummy)
1066 return;
1067
1068 /* Find last node. */
1069 scan = p;
1070 for (;;) {
1071 temp = regnext(scan);
1072 if (temp == NULL)
1073 break;
1074 scan = temp;
1075 }
1076
1077#ifdef REGALIGN
1078 offset = val - scan;
1079#ifndef lint
1080 *(short*)(scan+1) = offset;
1081#else
1082 offset = offset;
1083#endif
1084#else
1085 if (OP(scan) == BACK)
1086 offset = scan - val;
1087 else
1088 offset = val - scan;
1089 *(scan+1) = (offset>>8)&0377;
1090 *(scan+2) = offset&0377;
1091#endif
1092}
1093
1094/*
1095 - regoptail - regtail on operand of first argument; nop if operandless
1096 */
1097static void
1098regoptail(p, val)
1099char *p;
1100char *val;
1101{
1102 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1103 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1104 return;
1105 regtail(NEXTOPER(p), val);
1106}
1107
1108/*
1109 - regcurly - a little FSA that accepts {\d+,?\d*}
1110 */
1111STATIC int
1112regcurly(s)
1113register char *s;
1114{
1115 if (*s++ != '{')
1116 return FALSE;
1117 if (!isdigit(*s))
1118 return FALSE;
1119 while (isdigit(*s))
1120 s++;
1121 if (*s == ',')
1122 s++;
1123 while (isdigit(*s))
1124 s++;
1125 if (*s != '}')
1126 return FALSE;
1127 return TRUE;
1128}
1129
1130#ifdef DEBUGGING
1131
1132/*
1133 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1134 */
1135void
1136regdump(r)
1137regexp *r;
1138{
1139 register char *s;
1140 register char op = EXACTLY; /* Arbitrary non-END op. */
1141 register char *next;
1142 extern char *index();
1143
1144
1145 s = r->program + 1;
1146 while (op != END) { /* While that wasn't END last time... */
1147#ifdef REGALIGN
1148 if (!((long)s & 1))
1149 s++;
1150#endif
1151 op = OP(s);
1152 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
0a12ae7d 1153 if (op == CURLY)
1154 s += 4;
a687059c 1155 next = regnext(s);
1156 if (next == NULL) /* Next ptr. */
1157 fprintf(stderr,"(0)");
1158 else
1159 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1160 s += 3;
00bf170e 1161 if (op == ANYOF) {
a687059c 1162 s += 32;
1163 }
1164 if (op == EXACTLY) {
1165 /* Literal string, where present. */
1166 s++;
1167 while (*s != '\0') {
1168 (void)putchar(*s);
1169 s++;
1170 }
1171 s++;
1172 }
1173 (void)putchar('\n');
1174 }
1175
1176 /* Header fields of interest. */
1177 if (r->regstart)
1178 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1179 if (r->regstclass)
1180 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
00bf170e 1181 if (r->reganch & 1)
a687059c 1182 fprintf(stderr,"anchored ");
00bf170e 1183 if (r->reganch & 2)
1184 fprintf(stderr,"plus ");
a687059c 1185 if (r->regmust != NULL)
1186 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1187 r->regback);
1188 fprintf(stderr,"\n");
1189}
1190
1191/*
1192 - regprop - printable representation of opcode
1193 */
1194char *
1195regprop(op)
1196char *op;
1197{
1198 register char *p;
1199
1200 (void) strcpy(buf, ":");
1201
1202 switch (OP(op)) {
1203 case BOL:
1204 p = "BOL";
1205 break;
1206 case EOL:
1207 p = "EOL";
1208 break;
1209 case ANY:
1210 p = "ANY";
1211 break;
1212 case ANYOF:
1213 p = "ANYOF";
1214 break;
a687059c 1215 case BRANCH:
1216 p = "BRANCH";
1217 break;
1218 case EXACTLY:
1219 p = "EXACTLY";
1220 break;
1221 case NOTHING:
1222 p = "NOTHING";
1223 break;
1224 case BACK:
1225 p = "BACK";
1226 break;
1227 case END:
1228 p = "END";
1229 break;
1230 case ALNUM:
1231 p = "ALNUM";
1232 break;
1233 case NALNUM:
1234 p = "NALNUM";
1235 break;
1236 case BOUND:
1237 p = "BOUND";
1238 break;
1239 case NBOUND:
1240 p = "NBOUND";
1241 break;
1242 case SPACE:
1243 p = "SPACE";
1244 break;
1245 case NSPACE:
1246 p = "NSPACE";
1247 break;
1248 case DIGIT:
1249 p = "DIGIT";
1250 break;
1251 case NDIGIT:
1252 p = "NDIGIT";
1253 break;
00bf170e 1254 case CURLY:
1255 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1256 ARG1(op),ARG2(op));
1257 p = NULL;
1258 break;
a687059c 1259 case REF:
1260 case REF+1:
1261 case REF+2:
1262 case REF+3:
1263 case REF+4:
1264 case REF+5:
1265 case REF+6:
1266 case REF+7:
1267 case REF+8:
1268 case REF+9:
1269 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1270 p = NULL;
1271 break;
1272 case OPEN+1:
1273 case OPEN+2:
1274 case OPEN+3:
1275 case OPEN+4:
1276 case OPEN+5:
1277 case OPEN+6:
1278 case OPEN+7:
1279 case OPEN+8:
1280 case OPEN+9:
1281 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1282 p = NULL;
1283 break;
1284 case CLOSE+1:
1285 case CLOSE+2:
1286 case CLOSE+3:
1287 case CLOSE+4:
1288 case CLOSE+5:
1289 case CLOSE+6:
1290 case CLOSE+7:
1291 case CLOSE+8:
1292 case CLOSE+9:
1293 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1294 p = NULL;
1295 break;
1296 case STAR:
1297 p = "STAR";
1298 break;
1299 case PLUS:
1300 p = "PLUS";
1301 break;
1302 default:
1303 FAIL("corrupted regexp opcode");
1304 }
1305 if (p != NULL)
1306 (void) strcat(buf, p);
1307 return(buf);
1308}
1309#endif /* DEBUGGING */
1310
1311regfree(r)
1312struct regexp *r;
1313{
1314 if (r->precomp)
1315 Safefree(r->precomp);
1316 if (r->subbase)
1317 Safefree(r->subbase);
1318 if (r->regmust)
1319 str_free(r->regmust);
1320 if (r->regstart)
1321 str_free(r->regstart);
1322 Safefree(r);
1323}