perl 5.003_01: mg.c
[p5sagit/p5-mst-13.2.git] / regexec.c
CommitLineData
a0d0e21e 1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c 8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73 17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
f0fcb552 22/*SUPPRESS 112*/
a687059c 23/*
e50aee73 24 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c 25 *
26 * Copyright (c) 1986 by University of Toronto.
27 * Written by Henry Spencer. Not derived from licensed software.
28 *
29 * Permission is granted to anyone to use this software for any
30 * purpose on any computer system, and to redistribute it freely,
31 * subject to the following restrictions:
32 *
33 * 1. The author is not responsible for the consequences of use of
34 * this software, no matter how awful, even if they arise
35 * from defects in it.
36 *
37 * 2. The origin of this software must not be misrepresented, either
38 * by explicit claim or by omission.
39 *
40 * 3. Altered versions must be plainly marked as such, and must not
41 * be misrepresented as being the original software.
42 *
43 **** Alterations to Henry's code are...
44 ****
a0d0e21e 45 **** Copyright (c) 1991-1994, Larry Wall
a687059c 46 ****
9ef589d8 47 **** You may distribute under the terms of either the GNU General Public
48 **** License or the Artistic License, as specified in the README file.
a687059c 49 *
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
53 */
54#include "EXTERN.h"
55#include "perl.h"
56#include "regcomp.h"
57
58#ifndef STATIC
59#define STATIC static
60#endif
61
62#ifdef DEBUGGING
a0d0e21e 63static I32 regnarrate = 0;
64static char* regprogram = 0;
a687059c 65#endif
66
a0d0e21e 67/* Current curly descriptor */
68typedef struct curcur CURCUR;
69struct curcur {
70 int parenfloor; /* how far back to strip paren data */
71 int cur; /* how many instances of scan we've matched */
72 int min; /* the minimal number of scans to match */
73 int max; /* the maximal number of scans to match */
74 int minmod; /* whether to work our way up or down */
75 char * scan; /* the thing to match */
76 char * next; /* what has to match after it */
77 char * lastloc; /* where we started matching this scan */
78 CURCUR * oldcc; /* current curly before we started this one */
79};
80
81static CURCUR* regcc;
82
83typedef I32 CHECKPOINT;
84
85CHECKPOINT regcppush _((I32 parenfloor));
86char * regcppop _((void));
87
88CHECKPOINT
89regcppush(parenfloor)
90I32 parenfloor;
91{
92 int retval = savestack_ix;
93 int i = (regsize - parenfloor) * 3;
94 int p;
95
96 SSCHECK(i + 5);
97 for (p = regsize; p > parenfloor; p--) {
98 SSPUSHPTR(regendp[p]);
99 SSPUSHPTR(regstartp[p]);
100 SSPUSHINT(p);
101 }
102 SSPUSHINT(regsize);
103 SSPUSHINT(*reglastparen);
104 SSPUSHPTR(reginput);
105 SSPUSHINT(i + 3);
106 SSPUSHINT(SAVEt_REGCONTEXT);
107 return retval;
108}
109
110char*
111regcppop()
112{
113 I32 i = SSPOPINT;
114 U32 paren = 0;
115 char *input;
116 char *tmps;
117 assert(i == SAVEt_REGCONTEXT);
118 i = SSPOPINT;
119 input = (char *) SSPOPPTR;
120 *reglastparen = SSPOPINT;
121 regsize = SSPOPINT;
122 for (i -= 3; i > 0; i -= 3) {
123 paren = (U32)SSPOPINT;
124 regstartp[paren] = (char *) SSPOPPTR;
125 tmps = (char*)SSPOPPTR;
126 if (paren <= *reglastparen)
127 regendp[paren] = tmps;
128 }
129 for (paren = *reglastparen + 1; paren <= regnpar; paren++) {
130 if (paren > regsize)
131 regstartp[paren] = Nullch;
132 regendp[paren] = Nullch;
133 }
134 return input;
135}
136
137#define regcpblow(cp) leave_scope(cp)
138
a687059c 139/*
e50aee73 140 * pregexec and friends
a687059c 141 */
142
143/*
a687059c 144 * Forwards.
145 */
a0d0e21e 146
147static I32 regmatch _((char *prog));
148static I32 regrepeat _((char *p, I32 max));
149static I32 regtry _((regexp *prog, char *startpos));
a687059c 150
151/*
e50aee73 152 - pregexec - match a regexp against a string
a687059c 153 */
79072805 154I32
e50aee73 155pregexec(prog, stringarg, strend, strbeg, minend, screamer, safebase)
a687059c 156register regexp *prog;
157char *stringarg;
158register char *strend; /* pointer to null at end of string */
159char *strbeg; /* real beginning of string */
79072805 160I32 minend; /* end of match must be at least minend after stringarg */
161SV *screamer;
162I32 safebase; /* no need to remember string in subbase */
a687059c 163{
a0d0e21e 164 register char *s;
165 register I32 i;
166 register char *c;
167 register char *startpos = stringarg;
168 register I32 tmp;
169 I32 minlen = 0; /* must match at least this many chars */
170 I32 dontbother = 0; /* how many characters not to try at end */
171 CURCUR cc;
a687059c 172
a0d0e21e 173 cc.cur = 0;
4633a7c4 174 cc.oldcc = 0;
a0d0e21e 175 regcc = &cc;
176
177#ifdef DEBUGGING
178 regnarrate = debug & 512;
179 regprogram = prog->program;
180#endif
181
182 /* Be paranoid... */
183 if (prog == NULL || startpos == NULL) {
184 croak("NULL regexp parameter");
185 return 0;
186 }
187
188 if (startpos == strbeg) /* is ^ valid at stringarg? */
189 regprev = '\n';
190 else {
191 regprev = stringarg[-1];
192 if (!multiline && regprev == '\n')
193 regprev = '\0'; /* force ^ to NOT match */
194 }
195 regprecomp = prog->precomp;
196 regnpar = prog->nparens;
197 /* Check validity of program. */
198 if (UCHARAT(prog->program) != MAGIC) {
199 FAIL("corrupted regexp program");
200 }
201
202 if (prog->do_folding) {
203 i = strend - startpos;
204 New(1101,c,i+1,char);
205 Copy(startpos, c, i+1, char);
206 startpos = c;
207 strend = startpos + i;
208 for (s = startpos; s < strend; s++)
209 if (isUPPER(*s))
210 *s = toLOWER(*s);
211 }
212
213 /* If there is a "must appear" string, look for it. */
214 s = startpos;
215 if (prog->regmust != Nullsv &&
216 (!(prog->reganch & ROPT_ANCH)
217 || (multiline && prog->regback >= 0)) )
218 {
219 if (stringarg == strbeg && screamer) {
220 if (screamfirst[BmRARE(prog->regmust)] >= 0)
221 s = screaminstr(screamer,prog->regmust);
222 else
223 s = Nullch;
0a12ae7d 224 }
a0d0e21e 225 else
226 s = fbm_instr((unsigned char*)s, (unsigned char*)strend,
227 prog->regmust);
228 if (!s) {
229 ++BmUSEFUL(prog->regmust); /* hooray */
230 goto phooey; /* not present */
a687059c 231 }
a0d0e21e 232 else if (prog->regback >= 0) {
233 s -= prog->regback;
234 if (s < startpos)
235 s = startpos;
236 minlen = prog->regback + SvCUR(prog->regmust);
a687059c 237 }
a0d0e21e 238 else if (!prog->naughty && --BmUSEFUL(prog->regmust) < 0) { /* boo */
239 SvREFCNT_dec(prog->regmust);
240 prog->regmust = Nullsv; /* disable regmust */
241 s = startpos;
242 }
243 else {
244 s = startpos;
245 minlen = SvCUR(prog->regmust);
a687059c 246 }
a0d0e21e 247 }
a687059c 248
a0d0e21e 249 /* Mark beginning of line for ^ . */
250 regbol = startpos;
a687059c 251
a0d0e21e 252 /* Mark end of line for $ (and such) */
253 regeol = strend;
a687059c 254
a0d0e21e 255 /* see how far we have to get to not match where we matched before */
256 regtill = startpos+minend;
a687059c 257
a0d0e21e 258 /* Simplest case: anchored match need be tried only once. */
259 /* [unless multiline is set] */
260 if (prog->reganch & ROPT_ANCH) {
261 if (regtry(prog, startpos))
262 goto got_it;
263 else if (multiline || (prog->reganch & ROPT_IMPLICIT)) {
264 if (minlen)
265 dontbother = minlen - 1;
266 strend -= dontbother;
267 /* for multiline we only have to try after newlines */
268 if (s > startpos)
269 s--;
270 while (s < strend) {
271 if (*s++ == '\n') {
272 if (s < strend && regtry(prog, s))
273 goto got_it;
274 }
35c8bce7 275 }
35c8bce7 276 }
a0d0e21e 277 goto phooey;
278 }
35c8bce7 279
a0d0e21e 280 /* Messy cases: unanchored match. */
281 if (prog->regstart) {
282 if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */
283 /* it must be a one character string */
284 i = SvPVX(prog->regstart)[0];
285 while (s < strend) {
286 if (*s == i) {
287 if (regtry(prog, s))
a687059c 288 goto got_it;
a0d0e21e 289 s++;
290 while (s < strend && *s == i)
291 s++;
a687059c 292 }
a0d0e21e 293 s++;
294 }
a687059c 295 }
a0d0e21e 296 else if (SvPOK(prog->regstart) == 3) {
297 /* We know what string it must start with. */
298 while ((s = fbm_instr((unsigned char*)s,
299 (unsigned char*)strend, prog->regstart)) != NULL)
300 {
301 if (regtry(prog, s))
302 goto got_it;
303 s++;
304 }
305 }
306 else {
307 c = SvPVX(prog->regstart);
308 while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL)
309 {
310 if (regtry(prog, s))
311 goto got_it;
312 s++;
313 }
314 }
315 goto phooey;
316 }
317 /*SUPPRESS 560*/
318 if (c = prog->regstclass) {
319 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
a687059c 320
a0d0e21e 321 if (minlen)
322 dontbother = minlen - 1;
323 strend -= dontbother; /* don't bother with what can't match */
324 tmp = 1;
325 /* We know what class it must start with. */
326 switch (OP(c)) {
327 case ANYOF:
328 c = OPERAND(c);
329 while (s < strend) {
330 i = UCHARAT(s);
331 if (!(c[i >> 3] & (1 << (i&7)))) {
332 if (tmp && regtry(prog, s))
333 goto got_it;
334 else
335 tmp = doevery;
a687059c 336 }
a0d0e21e 337 else
338 tmp = 1;
339 s++;
340 }
341 break;
342 case BOUND:
343 if (minlen)
344 dontbother++,strend--;
345 if (s != startpos) {
346 i = s[-1];
347 tmp = isALNUM(i);
348 }
349 else
350 tmp = isALNUM(regprev); /* assume not alphanumeric */
351 while (s < strend) {
352 i = *s;
353 if (tmp != isALNUM(i)) {
354 tmp = !tmp;
355 if (regtry(prog, s))
356 goto got_it;
a687059c 357 }
a0d0e21e 358 s++;
359 }
360 if ((minlen || tmp) && regtry(prog,s))
361 goto got_it;
362 break;
363 case NBOUND:
364 if (minlen)
365 dontbother++,strend--;
366 if (s != startpos) {
367 i = s[-1];
368 tmp = isALNUM(i);
369 }
370 else
371 tmp = isALNUM(regprev); /* assume not alphanumeric */
372 while (s < strend) {
373 i = *s;
374 if (tmp != isALNUM(i))
375 tmp = !tmp;
376 else if (regtry(prog, s))
377 goto got_it;
378 s++;
379 }
380 if ((minlen || !tmp) && regtry(prog,s))
381 goto got_it;
382 break;
383 case ALNUM:
384 while (s < strend) {
385 i = *s;
386 if (isALNUM(i)) {
387 if (tmp && regtry(prog, s))
388 goto got_it;
a687059c 389 else
a0d0e21e 390 tmp = doevery;
391 }
392 else
393 tmp = 1;
394 s++;
395 }
396 break;
397 case NALNUM:
398 while (s < strend) {
399 i = *s;
400 if (!isALNUM(i)) {
401 if (tmp && regtry(prog, s))
402 goto got_it;
a687059c 403 else
a0d0e21e 404 tmp = doevery;
a687059c 405 }
a0d0e21e 406 else
407 tmp = 1;
408 s++;
409 }
410 break;
411 case SPACE:
412 while (s < strend) {
413 if (isSPACE(*s)) {
414 if (tmp && regtry(prog, s))
415 goto got_it;
416 else
417 tmp = doevery;
2304df62 418 }
a0d0e21e 419 else
420 tmp = 1;
421 s++;
422 }
423 break;
424 case NSPACE:
425 while (s < strend) {
426 if (!isSPACE(*s)) {
427 if (tmp && regtry(prog, s))
428 goto got_it;
429 else
430 tmp = doevery;
a687059c 431 }
a0d0e21e 432 else
433 tmp = 1;
434 s++;
435 }
436 break;
437 case DIGIT:
438 while (s < strend) {
439 if (isDIGIT(*s)) {
440 if (tmp && regtry(prog, s))
441 goto got_it;
442 else
443 tmp = doevery;
2b69d0c2 444 }
a0d0e21e 445 else
446 tmp = 1;
447 s++;
448 }
449 break;
450 case NDIGIT:
451 while (s < strend) {
452 if (!isDIGIT(*s)) {
453 if (tmp && regtry(prog, s))
454 goto got_it;
455 else
456 tmp = doevery;
a687059c 457 }
a0d0e21e 458 else
459 tmp = 1;
460 s++;
461 }
462 break;
a687059c 463 }
a0d0e21e 464 }
465 else {
466 if (minlen)
467 dontbother = minlen - 1;
468 strend -= dontbother;
469 /* We don't know much -- general case. */
470 do {
471 if (regtry(prog, s))
472 goto got_it;
473 } while (s++ < strend);
474 }
475
476 /* Failure. */
477 goto phooey;
a687059c 478
a0d0e21e 479got_it:
420218e7 480 strend += dontbother; /* uncheat */
a0d0e21e 481 prog->subbeg = strbeg;
482 prog->subend = strend;
483 if ((!safebase && (prog->nparens || sawampersand)) || prog->do_folding) {
a0d0e21e 484 i = strend - startpos + (stringarg - strbeg);
485 if (safebase) { /* no need for $digit later */
486 s = strbeg;
487 prog->subend = s+i;
488 }
489 else if (strbeg != prog->subbase) {
490 s = savepvn(strbeg,i); /* so $digit will work later */
491 if (prog->subbase)
492 Safefree(prog->subbase);
493 prog->subbeg = prog->subbase = s;
494 prog->subend = s+i;
495 }
496 else {
497 prog->subbeg = s = prog->subbase;
498 prog->subend = s+i;
499 }
500 s += (stringarg - strbeg);
501 for (i = 0; i <= prog->nparens; i++) {
502 if (prog->endp[i]) {
503 prog->startp[i] = s + (prog->startp[i] - startpos);
504 prog->endp[i] = s + (prog->endp[i] - startpos);
505 }
506 }
a687059c 507 if (prog->do_folding)
a0d0e21e 508 Safefree(startpos);
509 }
510 return 1;
511
512phooey:
513 if (prog->do_folding)
514 Safefree(startpos);
515 return 0;
a687059c 516}
517
518/*
519 - regtry - try match at specific point
520 */
79072805 521static I32 /* 0 failure, 1 success */
a0d0e21e 522regtry(prog, startpos)
a687059c 523regexp *prog;
a0d0e21e 524char *startpos;
a687059c 525{
a0d0e21e 526 register I32 i;
527 register char **sp;
528 register char **ep;
529
530 reginput = startpos;
531 regstartp = prog->startp;
532 regendp = prog->endp;
533 reglastparen = &prog->lastparen;
534 prog->lastparen = 0;
535 regsize = 0;
536
537 sp = prog->startp;
538 ep = prog->endp;
539 if (prog->nparens) {
540 for (i = prog->nparens; i >= 0; i--) {
541 *sp++ = NULL;
542 *ep++ = NULL;
a687059c 543 }
a0d0e21e 544 }
545 if (regmatch(prog->program + 1) && reginput >= regtill) {
546 prog->startp[0] = startpos;
547 prog->endp[0] = reginput;
548 return 1;
549 }
550 else
551 return 0;
a687059c 552}
553
554/*
555 - regmatch - main matching routine
556 *
557 * Conceptually the strategy is simple: check to see whether the current
558 * node matches, call self recursively to see whether the rest matches,
559 * and then act accordingly. In practice we make some effort to avoid
560 * recursion, in particular by going through "ordinary" nodes (that don't
561 * need to know whether the rest of the match failed) by a loop instead of
562 * by recursion.
563 */
564/* [lwall] I've hoisted the register declarations to the outer block in order to
565 * maybe save a little bit of pushing and popping on the stack. It also takes
566 * advantage of machines that use a register save mask on subroutine entry.
567 */
79072805 568static I32 /* 0 failure, 1 success */
a687059c 569regmatch(prog)
570char *prog;
571{
a0d0e21e 572 register char *scan; /* Current node. */
573 char *next; /* Next node. */
574 register I32 nextchar;
575 register I32 n; /* no or next */
576 register I32 ln; /* len or last */
577 register char *s; /* operand or save */
578 register char *locinput = reginput;
579 int minmod = 0;
4633a7c4 580#ifdef DEBUGGING
581 static int regindent = 0;
582 regindent++;
583#endif
a0d0e21e 584
585 nextchar = *locinput;
586 scan = prog;
587 while (scan != NULL) {
a687059c 588#ifdef DEBUGGING
4633a7c4 589#define sayYES goto yes
590#define sayNO goto no
591#define saySAME(x) if (x) goto yes; else goto no
592 if (regnarrate) {
593 fprintf(stderr, "%*s%2d%-8.8s\t<%.10s>\n", regindent*2, "",
a0d0e21e 594 scan - regprogram, regprop(scan), locinput);
4633a7c4 595 }
596#else
597#define sayYES return 1
598#define sayNO return 0
599#define saySAME(x) return x
a687059c 600#endif
601
602#ifdef REGALIGN
a0d0e21e 603 next = scan + NEXT(scan);
604 if (next == scan)
605 next = NULL;
a687059c 606#else
a0d0e21e 607 next = regnext(scan);
a687059c 608#endif
609
a0d0e21e 610 switch (OP(scan)) {
611 case BOL:
612 if (locinput == regbol
613 ? regprev == '\n'
614 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
615 {
616 /* regtill = regbol; */
617 break;
618 }
4633a7c4 619 sayNO;
a0d0e21e 620 case MBOL:
621 if (locinput == regbol
622 ? regprev == '\n'
623 : ((nextchar || locinput < regeol) && locinput[-1] == '\n') )
624 {
625 break;
626 }
4633a7c4 627 sayNO;
a0d0e21e 628 case SBOL:
629 if (locinput == regbol && regprev == '\n')
630 break;
4633a7c4 631 sayNO;
a0d0e21e 632 case GBOL:
633 if (locinput == regbol)
634 break;
4633a7c4 635 sayNO;
a0d0e21e 636 case EOL:
637 if (multiline)
638 goto meol;
639 else
640 goto seol;
641 case MEOL:
642 meol:
643 if ((nextchar || locinput < regeol) && nextchar != '\n')
4633a7c4 644 sayNO;
a0d0e21e 645 break;
646 case SEOL:
647 seol:
648 if ((nextchar || locinput < regeol) && nextchar != '\n')
4633a7c4 649 sayNO;
a0d0e21e 650 if (regeol - locinput > 1)
4633a7c4 651 sayNO;
a0d0e21e 652 break;
653 case SANY:
654 if (!nextchar && locinput >= regeol)
4633a7c4 655 sayNO;
a0d0e21e 656 nextchar = *++locinput;
657 break;
658 case ANY:
659 if (!nextchar && locinput >= regeol || nextchar == '\n')
4633a7c4 660 sayNO;
a0d0e21e 661 nextchar = *++locinput;
662 break;
663 case EXACTLY:
664 s = OPERAND(scan);
665 ln = *s++;
666 /* Inline the first character, for speed. */
667 if (*s != nextchar)
4633a7c4 668 sayNO;
a0d0e21e 669 if (regeol - locinput < ln)
4633a7c4 670 sayNO;
a0d0e21e 671 if (ln > 1 && bcmp(s, locinput, ln) != 0)
4633a7c4 672 sayNO;
a0d0e21e 673 locinput += ln;
674 nextchar = *locinput;
675 break;
676 case ANYOF:
677 s = OPERAND(scan);
678 if (nextchar < 0)
679 nextchar = UCHARAT(locinput);
680 if (s[nextchar >> 3] & (1 << (nextchar&7)))
4633a7c4 681 sayNO;
a0d0e21e 682 if (!nextchar && locinput >= regeol)
4633a7c4 683 sayNO;
a0d0e21e 684 nextchar = *++locinput;
685 break;
686 case ALNUM:
687 if (!nextchar)
4633a7c4 688 sayNO;
a0d0e21e 689 if (!isALNUM(nextchar))
4633a7c4 690 sayNO;
a0d0e21e 691 nextchar = *++locinput;
692 break;
693 case NALNUM:
694 if (!nextchar && locinput >= regeol)
4633a7c4 695 sayNO;
a0d0e21e 696 if (isALNUM(nextchar))
4633a7c4 697 sayNO;
a0d0e21e 698 nextchar = *++locinput;
699 break;
700 case NBOUND:
701 case BOUND:
702 if (locinput == regbol) /* was last char in word? */
703 ln = isALNUM(regprev);
704 else
705 ln = isALNUM(locinput[-1]);
706 n = isALNUM(nextchar); /* is next char in word? */
707 if ((ln == n) == (OP(scan) == BOUND))
4633a7c4 708 sayNO;
a0d0e21e 709 break;
710 case SPACE:
711 if (!nextchar && locinput >= regeol)
4633a7c4 712 sayNO;
a0d0e21e 713 if (!isSPACE(nextchar))
4633a7c4 714 sayNO;
a0d0e21e 715 nextchar = *++locinput;
716 break;
717 case NSPACE:
718 if (!nextchar)
4633a7c4 719 sayNO;
a0d0e21e 720 if (isSPACE(nextchar))
4633a7c4 721 sayNO;
a0d0e21e 722 nextchar = *++locinput;
723 break;
724 case DIGIT:
725 if (!isDIGIT(nextchar))
4633a7c4 726 sayNO;
a0d0e21e 727 nextchar = *++locinput;
728 break;
729 case NDIGIT:
730 if (!nextchar && locinput >= regeol)
4633a7c4 731 sayNO;
a0d0e21e 732 if (isDIGIT(nextchar))
4633a7c4 733 sayNO;
a0d0e21e 734 nextchar = *++locinput;
735 break;
736 case REF:
737 n = ARG1(scan); /* which paren pair */
738 s = regstartp[n];
739 if (!s)
4633a7c4 740 sayNO;
a0d0e21e 741 if (!regendp[n])
4633a7c4 742 sayNO;
a0d0e21e 743 if (s == regendp[n])
744 break;
745 /* Inline the first character, for speed. */
746 if (*s != nextchar)
4633a7c4 747 sayNO;
a0d0e21e 748 ln = regendp[n] - s;
749 if (locinput + ln > regeol)
4633a7c4 750 sayNO;
a0d0e21e 751 if (ln > 1 && bcmp(s, locinput, ln) != 0)
4633a7c4 752 sayNO;
a0d0e21e 753 locinput += ln;
754 nextchar = *locinput;
755 break;
756
757 case NOTHING:
758 break;
759 case BACK:
760 break;
761 case OPEN:
762 n = ARG1(scan); /* which paren pair */
763 regstartp[n] = locinput;
764 if (n > regsize)
765 regsize = n;
766 break;
767 case CLOSE:
768 n = ARG1(scan); /* which paren pair */
769 regendp[n] = locinput;
770 if (n > *reglastparen)
771 *reglastparen = n;
772 break;
773 case CURLYX: {
774 CURCUR cc;
775 CHECKPOINT cp = savestack_ix;
776 cc.oldcc = regcc;
777 regcc = &cc;
778 cc.parenfloor = *reglastparen;
779 cc.cur = -1;
780 cc.min = ARG1(scan);
781 cc.max = ARG2(scan);
782 cc.scan = NEXTOPER(scan) + 4;
783 cc.next = next;
784 cc.minmod = minmod;
785 cc.lastloc = 0;
786 reginput = locinput;
787 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
788 regcpblow(cp);
789 regcc = cc.oldcc;
4633a7c4 790 saySAME(n);
a0d0e21e 791 }
792 /* NOT REACHED */
793 case WHILEM: {
794 /*
795 * This is really hard to understand, because after we match
796 * what we're trying to match, we must make sure the rest of
797 * the RE is going to match for sure, and to do that we have
798 * to go back UP the parse tree by recursing ever deeper. And
799 * if it fails, we have to reset our parent's current state
800 * that we can try again after backing off.
801 */
802
803 CURCUR* cc = regcc;
4633a7c4 804 n = cc->cur + 1; /* how many we know we matched */
a0d0e21e 805 reginput = locinput;
806
4633a7c4 807#ifdef DEBUGGING
808 if (regnarrate)
809 fprintf(stderr, "%*s %d %lx\n", regindent*2, "",
810 n, (long)cc);
811#endif
812
a0d0e21e 813 /* If degenerate scan matches "", assume scan done. */
814
815 if (locinput == cc->lastloc) {
816 regcc = cc->oldcc;
817 ln = regcc->cur;
818 if (regmatch(cc->next))
4633a7c4 819 sayYES;
a0d0e21e 820 regcc->cur = ln;
821 regcc = cc;
4633a7c4 822 sayNO;
a0d0e21e 823 }
824
825 /* First just match a string of min scans. */
826
827 if (n < cc->min) {
828 cc->cur = n;
829 cc->lastloc = locinput;
4633a7c4 830 if (regmatch(cc->scan))
831 sayYES;
832 cc->cur = n - 1;
833 sayNO;
a0d0e21e 834 }
835
836 /* Prefer next over scan for minimal matching. */
837
838 if (cc->minmod) {
839 regcc = cc->oldcc;
840 ln = regcc->cur;
841 if (regmatch(cc->next))
4633a7c4 842 sayYES; /* All done. */
a0d0e21e 843 regcc->cur = ln;
844 regcc = cc;
845
846 if (n >= cc->max) /* Maximum greed exceeded? */
4633a7c4 847 sayNO;
a687059c 848
a0d0e21e 849 /* Try scanning more and see if it helps. */
850 reginput = locinput;
851 cc->cur = n;
852 cc->lastloc = locinput;
4633a7c4 853 if (regmatch(cc->scan))
854 sayYES;
855 cc->cur = n - 1;
856 sayNO;
a0d0e21e 857 }
858
859 /* Prefer scan over next for maximal matching. */
860
861 if (n < cc->max) { /* More greed allowed? */
862 regcppush(cc->parenfloor);
863 cc->cur = n;
864 cc->lastloc = locinput;
865 if (regmatch(cc->scan))
4633a7c4 866 sayYES;
a0d0e21e 867 regcppop(); /* Restore some previous $<digit>s? */
868 reginput = locinput;
869 }
870
871 /* Failed deeper matches of scan, so see if this one works. */
872 regcc = cc->oldcc;
873 ln = regcc->cur;
874 if (regmatch(cc->next))
4633a7c4 875 sayYES;
a0d0e21e 876 regcc->cur = ln;
877 regcc = cc;
4633a7c4 878 cc->cur = n - 1;
879 sayNO;
a0d0e21e 880 }
881 /* NOT REACHED */
882 case BRANCH: {
883 if (OP(next) != BRANCH) /* No choice. */
884 next = NEXTOPER(scan);/* Avoid recursion. */
885 else {
748a9306 886 int lastparen = *reglastparen;
a0d0e21e 887 do {
888 reginput = locinput;
889 if (regmatch(NEXTOPER(scan)))
4633a7c4 890 sayYES;
748a9306 891 for (n = *reglastparen; n > lastparen; n--)
892 regendp[n] = 0;
893 *reglastparen = n;
894
a687059c 895#ifdef REGALIGN
a0d0e21e 896 /*SUPPRESS 560*/
897 if (n = NEXT(scan))
898 scan += n;
899 else
900 scan = NULL;
a687059c 901#else
a0d0e21e 902 scan = regnext(scan);
79072805 903#endif
a0d0e21e 904 } while (scan != NULL && OP(scan) == BRANCH);
4633a7c4 905 sayNO;
a0d0e21e 906 /* NOTREACHED */
a687059c 907 }
a0d0e21e 908 }
909 break;
910 case MINMOD:
911 minmod = 1;
912 break;
913 case CURLY:
914 ln = ARG1(scan); /* min to match */
915 n = ARG2(scan); /* max to match */
916 scan = NEXTOPER(scan) + 4;
917 goto repeat;
918 case STAR:
919 ln = 0;
920 n = 32767;
921 scan = NEXTOPER(scan);
922 goto repeat;
923 case PLUS:
924 /*
925 * Lookahead to avoid useless match attempts
926 * when we know what character comes next.
927 */
928 ln = 1;
929 n = 32767;
930 scan = NEXTOPER(scan);
931 repeat:
932 if (OP(next) == EXACTLY)
933 nextchar = *(OPERAND(next)+1);
934 else
935 nextchar = -1000;
936 reginput = locinput;
937 if (minmod) {
938 minmod = 0;
939 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 940 sayNO;
8e07c86e 941 while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */
a0d0e21e 942 /* If it could work, try it. */
943 if (nextchar == -1000 || *reginput == nextchar)
944 if (regmatch(next))
4633a7c4 945 sayYES;
a0d0e21e 946 /* Couldn't or didn't -- back up. */
748a9306 947 reginput = locinput + ln;
a0d0e21e 948 if (regrepeat(scan, 1)) {
949 ln++;
950 reginput = locinput + ln;
951 }
952 else
4633a7c4 953 sayNO;
a0d0e21e 954 }
955 }
956 else {
957 n = regrepeat(scan, n);
958 if (ln < n && regkind[(U8)OP(next)] == EOL &&
959 (!multiline || OP(next) == SEOL))
960 ln = n; /* why back off? */
961 while (n >= ln) {
962 /* If it could work, try it. */
963 if (nextchar == -1000 || *reginput == nextchar)
964 if (regmatch(next))
4633a7c4 965 sayYES;
a0d0e21e 966 /* Couldn't or didn't -- back up. */
967 n--;
968 reginput = locinput + n;
969 }
970 }
4633a7c4 971 sayNO;
a0d0e21e 972 case SUCCEED:
973 case END:
974 reginput = locinput; /* put where regtry can find it */
4633a7c4 975 sayYES; /* Success! */
a0d0e21e 976 case IFMATCH:
977 reginput = locinput;
978 scan = NEXTOPER(scan);
979 if (!regmatch(scan))
4633a7c4 980 sayNO;
a0d0e21e 981 break;
982 case UNLESSM:
983 reginput = locinput;
984 scan = NEXTOPER(scan);
985 if (regmatch(scan))
4633a7c4 986 sayNO;
a0d0e21e 987 break;
988 default:
989 fprintf(stderr, "%x %d\n",(unsigned)scan,scan[1]);
990 FAIL("regexp memory corruption");
a687059c 991 }
a0d0e21e 992 scan = next;
993 }
a687059c 994
a0d0e21e 995 /*
996 * We get here only if there's trouble -- normally "case END" is
997 * the terminating point.
998 */
999 FAIL("corrupted regexp pointers");
1000 /*NOTREACHED*/
4633a7c4 1001 sayNO;
1002
1003yes:
1004#ifdef DEBUGGING
1005 regindent--;
1006#endif
1007 return 1;
1008
1009no:
1010#ifdef DEBUGGING
1011 regindent--;
1012#endif
a0d0e21e 1013 return 0;
a687059c 1014}
1015
1016/*
1017 - regrepeat - repeatedly match something simple, report how many
1018 */
1019/*
1020 * [This routine now assumes that it will only match on things of length 1.
1021 * That was true before, but now we assume scan - reginput is the count,
1022 * rather than incrementing count on every character.]
1023 */
79072805 1024static I32
00bf170e 1025regrepeat(p, max)
a687059c 1026char *p;
79072805 1027I32 max;
a687059c 1028{
a0d0e21e 1029 register char *scan;
1030 register char *opnd;
1031 register I32 c;
1032 register char *loceol = regeol;
1033
1034 scan = reginput;
1035 if (max != 32767 && max < loceol - scan)
1036 loceol = scan + max;
1037 opnd = OPERAND(p);
1038 switch (OP(p)) {
1039 case ANY:
1040 while (scan < loceol && *scan != '\n')
1041 scan++;
1042 break;
1043 case SANY:
1044 scan = loceol;
1045 break;
1046 case EXACTLY: /* length of string is 1 */
1047 opnd++;
1048 while (scan < loceol && *opnd == *scan)
1049 scan++;
1050 break;
1051 case ANYOF:
1052 c = UCHARAT(scan);
1053 while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
1054 scan++;
1055 c = UCHARAT(scan);
a687059c 1056 }
a0d0e21e 1057 break;
1058 case ALNUM:
1059 while (scan < loceol && isALNUM(*scan))
1060 scan++;
1061 break;
1062 case NALNUM:
1063 while (scan < loceol && !isALNUM(*scan))
1064 scan++;
1065 break;
1066 case SPACE:
1067 while (scan < loceol && isSPACE(*scan))
1068 scan++;
1069 break;
1070 case NSPACE:
1071 while (scan < loceol && !isSPACE(*scan))
1072 scan++;
1073 break;
1074 case DIGIT:
1075 while (scan < loceol && isDIGIT(*scan))
1076 scan++;
1077 break;
1078 case NDIGIT:
1079 while (scan < loceol && !isDIGIT(*scan))
1080 scan++;
1081 break;
1082 default: /* Called on something of 0 width. */
1083 break; /* So match right here or not at all. */
1084 }
a687059c 1085
a0d0e21e 1086 c = scan - reginput;
1087 reginput = scan;
a687059c 1088
a0d0e21e 1089 return(c);
a687059c 1090}
1091
1092/*
1093 - regnext - dig the "next" pointer out of a node
1094 *
1095 * [Note, when REGALIGN is defined there are two places in regmatch()
1096 * that bypass this code for speed.]
1097 */
1098char *
1099regnext(p)
1100register char *p;
1101{
a0d0e21e 1102 register I32 offset;
a687059c 1103
a0d0e21e 1104 if (p == &regdummy)
1105 return(NULL);
a687059c 1106
a0d0e21e 1107 offset = NEXT(p);
1108 if (offset == 0)
1109 return(NULL);
a687059c 1110
1111#ifdef REGALIGN
a0d0e21e 1112 return(p+offset);
a687059c 1113#else
a0d0e21e 1114 if (OP(p) == BACK)
1115 return(p-offset);
1116 else
1117 return(p+offset);
a687059c 1118#endif
1119}