5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%i\n", \
150 PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%i..%i\n", \
155 lastcp, PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%d..\\%d to undef\n",
190 *PL_reglastparen + 1, PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* A failure to find a constant substring means that there is no need to make
279 an expensive call to REx engine, thus we celebrate a failure. Similarly,
280 finding a substring too deep into the string means that less calls to
281 regtry() should be needed.
283 REx compiler's optimizer found 4 possible hints:
284 a) Anchored substring;
286 c) Whether we are anchored (beginning-of-line or \G);
287 d) First node (of those at offset 0) which may distingush positions;
288 We use 'a', 'b', multiline-part of 'c', and try to find a position in the
289 string which does not contradict any of them.
293 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
294 char *strend, U32 flags, re_scream_pos_data *data)
296 register I32 start_shift;
297 /* Should be nonnegative! */
298 register I32 end_shift;
304 register char *other_last = Nullch;
306 char *i_strpos = strpos;
309 DEBUG_r( if (!PL_colorset) reginitcolors() );
310 DEBUG_r(PerlIO_printf(Perl_debug_log,
311 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
312 PL_colors[4],PL_colors[5],PL_colors[0],
315 (strlen(prog->precomp) > 60 ? "..." : ""),
317 (int)(strend - strpos > 60 ? 60 : strend - strpos),
318 strpos, PL_colors[1],
319 (strend - strpos > 60 ? "..." : ""))
322 if (prog->minlen > strend - strpos) {
323 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
326 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
327 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
328 || ( (prog->reganch & ROPT_ANCH_BOL)
329 && !PL_multiline ) ); /* Check after \n? */
331 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
332 /* Substring at constant offset from beg-of-str... */
335 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
336 && (sv && (strpos + SvCUR(sv) != strend)) ) {
337 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
340 PL_regeol = strend; /* Used in HOP() */
341 s = HOPc(strpos, prog->check_offset_min);
342 if (SvTAIL(prog->check_substr)) {
343 slen = SvCUR(prog->check_substr); /* >= 1 */
345 if ( strend - s > slen || strend - s < slen - 1
346 || (strend - s == slen && strend[-1] != '\n')) {
347 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
350 /* Now should match s[0..slen-2] */
352 if (slen && (*SvPVX(prog->check_substr) != *s
354 && memNE(SvPVX(prog->check_substr), s, slen)))) {
356 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
360 else if (*SvPVX(prog->check_substr) != *s
361 || ((slen = SvCUR(prog->check_substr)) > 1
362 && memNE(SvPVX(prog->check_substr), s, slen)))
364 goto success_at_start;
366 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
368 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
369 /* Should be nonnegative! */
370 end_shift = prog->minlen - start_shift -
371 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
373 I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr)
374 - (SvTAIL(prog->check_substr) != 0);
375 I32 eshift = strend - s - end;
377 if (end_shift < eshift)
381 else { /* Can match at random position */
384 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
385 /* Should be nonnegative! */
386 end_shift = prog->minlen - start_shift -
387 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
390 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
392 Perl_croak(aTHX_ "panic: end_shift");
395 check = prog->check_substr;
397 /* Find a possible match in the region s..strend by looking for
398 the "check" substring in the region corrected by start/end_shift. */
399 if (flags & REXEC_SCREAM) {
400 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
401 I32 p = -1; /* Internal iterator of scream. */
402 I32 *pp = data ? data->scream_pos : &p;
404 if (PL_screamfirst[BmRARE(check)] >= 0
405 || ( BmRARE(check) == '\n'
406 && (BmPREVIOUS(check) == SvCUR(check) - 1)
408 s = screaminstr(sv, check,
409 start_shift + (s - strbeg), end_shift, pp, 0);
413 *data->scream_olds = s;
416 s = fbm_instr((unsigned char*)s + start_shift,
417 (unsigned char*)strend - end_shift,
418 check, PL_multiline ? FBMrf_MULTILINE : 0);
420 /* Update the count-of-usability, remove useless subpatterns,
423 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
424 (s ? "Found" : "Did not find"),
425 ((check == prog->anchored_substr) ? "anchored" : "floating"),
427 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
429 PL_colors[1], (SvTAIL(check) ? "$" : ""),
430 (s ? " at offset " : "...\n") ) );
435 /* Finish the diagnostic message */
436 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
438 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
439 Start with the other substr.
440 XXXX no SCREAM optimization yet - and a very coarse implementation
441 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
442 *always* match. Probably should be marked during compile...
443 Probably it is right to do no SCREAM here...
446 if (prog->float_substr && prog->anchored_substr) {
447 /* Take into account the "other" substring. */
448 /* XXXX May be hopelessly wrong for UTF... */
450 other_last = strpos - 1;
451 if (check == prog->float_substr) {
454 char *last = s - start_shift, *last1, *last2;
458 t = s - prog->check_offset_max;
459 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
460 && (!(prog->reganch & ROPT_UTF8)
461 || (PL_bostr = strpos, /* Used in regcopmaybe() */
462 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
467 t += prog->anchored_offset;
471 last2 = last1 = strend - prog->minlen;
474 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
475 /* On end-of-str: see comment below. */
476 s = fbm_instr((unsigned char*)t,
477 (unsigned char*)last1 + prog->anchored_offset
478 + SvCUR(prog->anchored_substr)
479 - (SvTAIL(prog->anchored_substr)!=0),
480 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
481 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
482 (s ? "Found" : "Contradicts"),
484 (int)(SvCUR(prog->anchored_substr)
485 - (SvTAIL(prog->anchored_substr)!=0)),
486 SvPVX(prog->anchored_substr),
487 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
489 if (last1 >= last2) {
490 DEBUG_r(PerlIO_printf(Perl_debug_log,
491 ", giving up...\n"));
494 DEBUG_r(PerlIO_printf(Perl_debug_log,
495 ", trying floating at offset %ld...\n",
496 (long)(s1 + 1 - i_strpos)));
497 PL_regeol = strend; /* Used in HOP() */
498 other_last = last1 + prog->anchored_offset;
503 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
504 (long)(s - i_strpos)));
505 t = s - prog->anchored_offset;
514 else { /* Take into account the floating substring. */
519 last1 = last = strend - prog->minlen + prog->float_min_offset;
520 if (last - t > prog->float_max_offset)
521 last = t + prog->float_max_offset;
522 s = t + prog->float_min_offset;
525 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
526 /* fbm_instr() takes into account exact value of end-of-str
527 if the check is SvTAIL(ed). Since false positives are OK,
528 and end-of-str is not later than strend we are OK. */
529 s = fbm_instr((unsigned char*)s,
530 (unsigned char*)last + SvCUR(prog->float_substr)
531 - (SvTAIL(prog->float_substr)!=0),
532 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
533 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
534 (s ? "Found" : "Contradicts"),
536 (int)(SvCUR(prog->float_substr)
537 - (SvTAIL(prog->float_substr)!=0)),
538 SvPVX(prog->float_substr),
539 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
542 DEBUG_r(PerlIO_printf(Perl_debug_log,
543 ", giving up...\n"));
546 DEBUG_r(PerlIO_printf(Perl_debug_log,
547 ", trying anchored starting at offset %ld...\n",
548 (long)(s1 + 1 - i_strpos)));
550 PL_regeol = strend; /* Used in HOP() */
555 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
556 (long)(s - i_strpos)));
566 t = s - prog->check_offset_max;
568 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
569 && (!(prog->reganch & ROPT_UTF8)
570 || (PL_bostr = strpos, /* Used in regcopmaybe() */
571 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
574 /* Fixed substring is found far enough so that the match
575 cannot start at strpos. */
577 if (ml_anch && t[-1] != '\n') {
578 /* Eventually fbm_*() should handle this, but often
579 anchored_offset is not 0, so this check will not be wasted. */
580 /* XXXX In the code below we prefer to look for "^" even in
581 presence of anchored substrings. And we search even
582 beyond the found float position. These pessimizations
583 are historical artefacts only. */
585 while (t < strend - prog->minlen) {
587 if (t < s - prog->check_offset_min) {
588 if (prog->anchored_substr) {
589 /* We definitely contradict the found anchored
590 substr. Due to the above check we do not
591 contradict "check" substr.
592 Thus we can arrive here only if check substr
593 is float. Redo checking for "other"=="fixed".
596 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
597 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
598 goto do_other_anchored;
601 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
602 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
605 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
606 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
612 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
613 PL_colors[0],PL_colors[1]));
618 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
622 /* The found string does not prohibit matching at beg-of-str
623 - no optimization of calling REx engine can be performed,
624 unless it was an MBOL and we are not after MBOL. */
626 /* Even in this situation we may use MBOL flag if strpos is offset
627 wrt the start of the string. */
629 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
633 DEBUG_r( if (ml_anch)
634 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
635 PL_colors[0],PL_colors[1]);
638 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
639 && --BmUSEFUL(prog->check_substr) < 0
640 && prog->check_substr == prog->float_substr) { /* boo */
641 /* If flags & SOMETHING - do not do it many times on the same match */
642 SvREFCNT_dec(prog->check_substr);
643 prog->check_substr = Nullsv; /* disable */
644 prog->float_substr = Nullsv; /* clear */
646 /* XXXX This is a remnant of the old implementation. It
647 looks wasteful, since now INTUIT can use many
648 other heuristics too. */
649 prog->reganch &= ~RE_USE_INTUIT;
655 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
656 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
659 fail_finish: /* Substring not found */
660 BmUSEFUL(prog->check_substr) += 5; /* hooray */
662 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
663 PL_colors[4],PL_colors[5]));
668 - regexec_flags - match a regexp against a string
671 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
672 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
673 /* strend: pointer to null at end of string */
674 /* strbeg: real beginning of string */
675 /* minend: end of match must be >=minend after stringarg. */
676 /* data: May be used for some additional optimizations. */
677 /* nosave: For optimizations. */
682 register char *startpos = stringarg;
684 I32 minlen; /* must match at least this many chars */
685 I32 dontbother = 0; /* how many characters not to try at end */
686 I32 start_shift = 0; /* Offset of the start to find
687 constant substr. */ /* CC */
688 I32 end_shift = 0; /* Same for the end. */ /* CC */
689 I32 scream_pos = -1; /* Internal iterator of scream. */
691 SV* oreplsv = GvSV(PL_replgv);
697 PL_regnarrate = PL_debug & 512;
701 if (prog == NULL || startpos == NULL) {
702 Perl_croak(aTHX_ "NULL regexp parameter");
706 minlen = prog->minlen;
707 if (strend - startpos < minlen) goto phooey;
709 if (startpos == strbeg) /* is ^ valid at stringarg? */
712 PL_regprev = (U32)stringarg[-1];
713 if (!PL_multiline && PL_regprev == '\n')
714 PL_regprev = '\0'; /* force ^ to NOT match */
717 /* Check validity of program. */
718 if (UCHARAT(prog->program) != REG_MAGIC) {
719 Perl_croak(aTHX_ "corrupted regexp program");
726 if (prog->reganch & ROPT_UTF8)
727 PL_reg_flags |= RF_utf8;
729 /* Mark beginning of line for ^ and lookbehind. */
730 PL_regbol = startpos;
734 /* Mark end of line for $ (and such) */
737 /* see how far we have to get to not match where we matched before */
738 PL_regtill = startpos+minend;
740 /* We start without call_cc context. */
743 /* If there is a "must appear" string, look for it. */
746 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
749 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
750 PL_reg_ganch = startpos;
751 else if (sv && SvTYPE(sv) >= SVt_PVMG
753 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
754 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
755 if (prog->reganch & ROPT_ANCH_GPOS) {
756 if (s > PL_reg_ganch)
761 else /* pos() not defined */
762 PL_reg_ganch = strbeg;
765 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
766 re_scream_pos_data d;
768 d.scream_olds = &scream_olds;
769 d.scream_pos = &scream_pos;
770 s = re_intuit_start(prog, sv, s, strend, flags, &d);
772 goto phooey; /* not present */
775 DEBUG_r( if (!PL_colorset) reginitcolors() );
776 DEBUG_r(PerlIO_printf(Perl_debug_log,
777 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
778 PL_colors[4],PL_colors[5],PL_colors[0],
781 (strlen(prog->precomp) > 60 ? "..." : ""),
783 (int)(strend - startpos > 60 ? 60 : strend - startpos),
784 startpos, PL_colors[1],
785 (strend - startpos > 60 ? "..." : ""))
788 /* Simplest case: anchored match need be tried only once. */
789 /* [unless only anchor is BOL and multiline is set] */
790 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
791 if (s == startpos && regtry(prog, startpos))
793 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
794 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
799 dontbother = minlen - 1;
800 end = HOPc(strend, -dontbother) - 1;
801 /* for multiline we only have to try after newlines */
802 if (prog->check_substr) {
811 if (prog->reganch & RE_USE_INTUIT) {
812 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
823 if (*s++ == '\n') { /* don't need PL_utf8skip here */
831 } else if (prog->reganch & ROPT_ANCH_GPOS) {
832 if (regtry(prog, PL_reg_ganch))
837 /* Messy cases: unanchored match. */
838 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
839 /* we have /x+whatever/ */
840 /* it must be a one character string (XXXX Except UTF?) */
841 char ch = SvPVX(prog->anchored_substr)[0];
845 if (regtry(prog, s)) goto got_it;
847 while (s < strend && *s == ch)
856 if (regtry(prog, s)) goto got_it;
858 while (s < strend && *s == ch)
866 else if (prog->anchored_substr != Nullsv
867 || (prog->float_substr != Nullsv
868 && prog->float_max_offset < strend - s)) {
869 SV *must = prog->anchored_substr
870 ? prog->anchored_substr : prog->float_substr;
872 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
874 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
875 I32 delta = back_max - back_min;
876 char *last = HOPc(strend, /* Cannot start after this */
877 -(I32)(CHR_SVLEN(must)
878 - (SvTAIL(must) != 0) + back_min));
879 char *last1; /* Last position checked before */
884 last1 = s - 1; /* bogus */
886 /* XXXX check_substr already used to find `s', can optimize if
887 check_substr==must. */
889 dontbother = end_shift;
890 strend = HOPc(strend, -dontbother);
891 while ( (s <= last) &&
892 ((flags & REXEC_SCREAM)
893 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
894 end_shift, &scream_pos, 0))
895 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
896 (unsigned char*)strend, must,
897 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
898 if (HOPc(s, -back_max) > last1) {
899 last1 = HOPc(s, -back_min);
900 s = HOPc(s, -back_max);
903 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
905 last1 = HOPc(s, -back_min);
925 else if (c = prog->regstclass) {
926 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
934 dontbother = minlen - 1;
935 strend = HOPc(strend, -dontbother); /* don't bother with what can't match */
937 /* We know what class it must start with. */
941 if (REGINCLASSUTF8(c, (U8*)s)) {
942 if (tmp && regtry(prog, s))
954 if (REGINCLASS(c, *s)) {
955 if (tmp && regtry(prog, s))
975 c2 = PL_fold_locale[c1];
979 /* Here it is NOT UTF! */
983 && (ln == 1 || (OP(c) == EXACTF
985 : ibcmp_locale(s, m, ln)))
992 if ( (*s == c1 || *s == c2)
993 && (ln == 1 || (OP(c) == EXACTF
995 : ibcmp_locale(s, m, ln)))
1003 PL_reg_flags |= RF_tainted;
1010 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
1011 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1012 while (s < strend) {
1013 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1015 if (regtry(prog, s))
1020 if ((minlen || tmp) && regtry(prog,s))
1024 PL_reg_flags |= RF_tainted;
1029 strend = reghop_c(strend, -1);
1031 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
1032 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
1033 while (s < strend) {
1034 if (tmp == !(OP(c) == BOUND ?
1035 swash_fetch(PL_utf8_alnum, (U8*)s) :
1036 isALNUM_LC_utf8((U8*)s)))
1039 if (regtry(prog, s))
1044 if ((minlen || tmp) && regtry(prog,s))
1048 PL_reg_flags |= RF_tainted;
1055 tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev;
1056 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1057 while (s < strend) {
1058 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1060 else if (regtry(prog, s))
1064 if ((minlen || !tmp) && regtry(prog,s))
1068 PL_reg_flags |= RF_tainted;
1073 strend = reghop_c(strend, -1);
1075 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev;
1076 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
1077 while (s < strend) {
1078 if (tmp == !(OP(c) == NBOUND ?
1079 swash_fetch(PL_utf8_alnum, (U8*)s) :
1080 isALNUM_LC_utf8((U8*)s)))
1082 else if (regtry(prog, s))
1086 if ((minlen || !tmp) && regtry(prog,s))
1090 while (s < strend) {
1092 if (tmp && regtry(prog, s))
1103 while (s < strend) {
1104 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
1105 if (tmp && regtry(prog, s))
1116 PL_reg_flags |= RF_tainted;
1117 while (s < strend) {
1118 if (isALNUM_LC(*s)) {
1119 if (tmp && regtry(prog, s))
1130 PL_reg_flags |= RF_tainted;
1131 while (s < strend) {
1132 if (isALNUM_LC_utf8((U8*)s)) {
1133 if (tmp && regtry(prog, s))
1144 while (s < strend) {
1146 if (tmp && regtry(prog, s))
1157 while (s < strend) {
1158 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1159 if (tmp && regtry(prog, s))
1170 PL_reg_flags |= RF_tainted;
1171 while (s < strend) {
1172 if (!isALNUM_LC(*s)) {
1173 if (tmp && regtry(prog, s))
1184 PL_reg_flags |= RF_tainted;
1185 while (s < strend) {
1186 if (!isALNUM_LC_utf8((U8*)s)) {
1187 if (tmp && regtry(prog, s))
1198 while (s < strend) {
1200 if (tmp && regtry(prog, s))
1211 while (s < strend) {
1212 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1213 if (tmp && regtry(prog, s))
1224 PL_reg_flags |= RF_tainted;
1225 while (s < strend) {
1226 if (isSPACE_LC(*s)) {
1227 if (tmp && regtry(prog, s))
1238 PL_reg_flags |= RF_tainted;
1239 while (s < strend) {
1240 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1241 if (tmp && regtry(prog, s))
1252 while (s < strend) {
1254 if (tmp && regtry(prog, s))
1265 while (s < strend) {
1266 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1267 if (tmp && regtry(prog, s))
1278 PL_reg_flags |= RF_tainted;
1279 while (s < strend) {
1280 if (!isSPACE_LC(*s)) {
1281 if (tmp && regtry(prog, s))
1292 PL_reg_flags |= RF_tainted;
1293 while (s < strend) {
1294 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1295 if (tmp && regtry(prog, s))
1306 while (s < strend) {
1308 if (tmp && regtry(prog, s))
1319 while (s < strend) {
1320 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1321 if (tmp && regtry(prog, s))
1332 PL_reg_flags |= RF_tainted;
1333 while (s < strend) {
1334 if (isDIGIT_LC(*s)) {
1335 if (tmp && regtry(prog, s))
1346 PL_reg_flags |= RF_tainted;
1347 while (s < strend) {
1348 if (isDIGIT_LC_utf8((U8*)s)) {
1349 if (tmp && regtry(prog, s))
1360 while (s < strend) {
1362 if (tmp && regtry(prog, s))
1373 while (s < strend) {
1374 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1375 if (tmp && regtry(prog, s))
1386 PL_reg_flags |= RF_tainted;
1387 while (s < strend) {
1388 if (!isDIGIT_LC(*s)) {
1389 if (tmp && regtry(prog, s))
1400 PL_reg_flags |= RF_tainted;
1401 while (s < strend) {
1402 if (!isDIGIT_LC_utf8((U8*)s)) {
1403 if (tmp && regtry(prog, s))
1414 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1420 if (prog->float_substr != Nullsv) { /* Trim the end. */
1422 I32 oldpos = scream_pos;
1424 if (flags & REXEC_SCREAM) {
1425 last = screaminstr(sv, prog->float_substr, s - strbeg,
1426 end_shift, &scream_pos, 1); /* last one */
1428 last = scream_olds; /* Only one occurence. */
1432 char *little = SvPV(prog->float_substr, len);
1434 if (SvTAIL(prog->float_substr)) {
1435 if (memEQ(strend - len + 1, little, len - 1))
1436 last = strend - len + 1;
1437 else if (!PL_multiline)
1438 last = memEQ(strend - len, little, len)
1439 ? strend - len : Nullch;
1445 last = rninstr(s, strend, little, little + len);
1447 last = strend; /* matching `$' */
1450 if (last == NULL) goto phooey; /* Should not happen! */
1451 dontbother = strend - last + prog->float_min_offset;
1453 if (minlen && (dontbother < minlen))
1454 dontbother = minlen - 1;
1455 strend -= dontbother; /* this one's always in bytes! */
1456 /* We don't know much -- general case. */
1459 if (regtry(prog, s))
1468 if (regtry(prog, s))
1470 } while (s++ < strend);
1478 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1480 if (PL_reg_eval_set) {
1481 /* Preserve the current value of $^R */
1482 if (oreplsv != GvSV(PL_replgv))
1483 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1484 restored, the value remains
1486 restore_pos(aTHXo_ 0);
1489 /* make sure $`, $&, $', and $digit will work later */
1490 if ( !(flags & REXEC_NOT_FIRST) ) {
1491 if (RX_MATCH_COPIED(prog)) {
1492 Safefree(prog->subbeg);
1493 RX_MATCH_COPIED_off(prog);
1495 if (flags & REXEC_COPY_STR) {
1496 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1498 s = savepvn(strbeg, i);
1501 RX_MATCH_COPIED_on(prog);
1504 prog->subbeg = strbeg;
1505 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1512 if (PL_reg_eval_set)
1513 restore_pos(aTHXo_ 0);
1518 - regtry - try match at specific point
1520 STATIC I32 /* 0 failure, 1 success */
1521 S_regtry(pTHX_ regexp *prog, char *startpos)
1529 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1532 PL_reg_eval_set = RS_init;
1534 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1535 (IV)(PL_stack_sp - PL_stack_base));
1537 SAVEINT(cxstack[cxstack_ix].blk_oldsp);
1538 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1539 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1541 /* Apparently this is not needed, judging by wantarray. */
1542 /* SAVEINT(cxstack[cxstack_ix].blk_gimme);
1543 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1546 /* Make $_ available to executed code. */
1547 if (PL_reg_sv != DEFSV) {
1548 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1553 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1554 && (mg = mg_find(PL_reg_sv, 'g')))) {
1555 /* prepare for quick setting of pos */
1556 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1557 mg = mg_find(PL_reg_sv, 'g');
1561 PL_reg_oldpos = mg->mg_len;
1562 SAVEDESTRUCTOR_X(restore_pos, 0);
1565 New(22,PL_reg_curpm, 1, PMOP);
1566 PL_reg_curpm->op_pmregexp = prog;
1567 PL_reg_oldcurpm = PL_curpm;
1568 PL_curpm = PL_reg_curpm;
1569 if (RX_MATCH_COPIED(prog)) {
1570 /* Here is a serious problem: we cannot rewrite subbeg,
1571 since it may be needed if this match fails. Thus
1572 $` inside (?{}) could fail... */
1573 PL_reg_oldsaved = prog->subbeg;
1574 PL_reg_oldsavedlen = prog->sublen;
1575 RX_MATCH_COPIED_off(prog);
1578 PL_reg_oldsaved = Nullch;
1579 prog->subbeg = PL_bostr;
1580 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1582 prog->startp[0] = startpos - PL_bostr;
1583 PL_reginput = startpos;
1584 PL_regstartp = prog->startp;
1585 PL_regendp = prog->endp;
1586 PL_reglastparen = &prog->lastparen;
1587 prog->lastparen = 0;
1589 DEBUG_r(PL_reg_starttry = startpos);
1590 if (PL_reg_start_tmpl <= prog->nparens) {
1591 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1592 if(PL_reg_start_tmp)
1593 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1595 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1598 /* XXXX What this code is doing here?!!! There should be no need
1599 to do this again and again, PL_reglastparen should take care of
1603 if (prog->nparens) {
1604 for (i = prog->nparens; i >= 1; i--) {
1610 if (regmatch(prog->program + 1)) {
1611 prog->endp[0] = PL_reginput - PL_bostr;
1619 - regmatch - main matching routine
1621 * Conceptually the strategy is simple: check to see whether the current
1622 * node matches, call self recursively to see whether the rest matches,
1623 * and then act accordingly. In practice we make some effort to avoid
1624 * recursion, in particular by going through "ordinary" nodes (that don't
1625 * need to know whether the rest of the match failed) by a loop instead of
1628 /* [lwall] I've hoisted the register declarations to the outer block in order to
1629 * maybe save a little bit of pushing and popping on the stack. It also takes
1630 * advantage of machines that use a register save mask on subroutine entry.
1632 STATIC I32 /* 0 failure, 1 success */
1633 S_regmatch(pTHX_ regnode *prog)
1636 register regnode *scan; /* Current node. */
1637 regnode *next; /* Next node. */
1638 regnode *inner; /* Next node in internal branch. */
1639 register I32 nextchr; /* renamed nextchr - nextchar colides with
1640 function of same name */
1641 register I32 n; /* no or next */
1642 register I32 ln; /* len or last */
1643 register char *s; /* operand or save */
1644 register char *locinput = PL_reginput;
1645 register I32 c1, c2, paren; /* case fold search, parenth */
1646 int minmod = 0, sw = 0, logical = 0;
1651 /* Note that nextchr is a byte even in UTF */
1652 nextchr = UCHARAT(locinput);
1654 while (scan != NULL) {
1655 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1657 # define sayYES goto yes
1658 # define sayNO goto no
1659 # define sayYES_FINAL goto yes_final
1660 # define sayYES_LOUD goto yes_loud
1661 # define sayNO_FINAL goto no_final
1662 # define sayNO_SILENT goto do_no
1663 # define saySAME(x) if (x) goto yes; else goto no
1664 # define REPORT_CODE_OFF 24
1666 # define sayYES return 1
1667 # define sayNO return 0
1668 # define sayYES_FINAL return 1
1669 # define sayYES_LOUD return 1
1670 # define sayNO_FINAL return 0
1671 # define sayNO_SILENT return 0
1672 # define saySAME(x) return x
1675 SV *prop = sv_newmortal();
1676 int docolor = *PL_colors[0];
1677 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1678 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1679 /* The part of the string before starttry has one color
1680 (pref0_len chars), between starttry and current
1681 position another one (pref_len - pref0_len chars),
1682 after the current position the third one.
1683 We assume that pref0_len <= pref_len, otherwise we
1684 decrease pref0_len. */
1685 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1686 ? (5 + taill) - l : locinput - PL_bostr);
1687 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1689 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1690 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1691 ? (5 + taill) - pref_len : PL_regeol - locinput);
1694 if (pref0_len > pref_len)
1695 pref0_len = pref_len;
1696 regprop(prop, scan);
1697 PerlIO_printf(Perl_debug_log,
1698 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1699 (IV)(locinput - PL_bostr),
1700 PL_colors[4], pref0_len,
1701 locinput - pref_len, PL_colors[5],
1702 PL_colors[2], pref_len - pref0_len,
1703 locinput - pref_len + pref0_len, PL_colors[3],
1704 (docolor ? "" : "> <"),
1705 PL_colors[0], l, locinput, PL_colors[1],
1706 15 - l - pref_len + 1,
1708 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1712 next = scan + NEXT_OFF(scan);
1718 if (locinput == PL_bostr
1719 ? PL_regprev == '\n'
1721 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1723 /* regtill = regbol; */
1728 if (locinput == PL_bostr
1729 ? PL_regprev == '\n'
1730 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1736 if (locinput == PL_regbol && PL_regprev == '\n')
1740 if (locinput == PL_reg_ganch)
1750 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1755 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1757 if (PL_regeol - locinput > 1)
1761 if (PL_regeol != locinput)
1765 if (nextchr & 0x80) {
1766 locinput += PL_utf8skip[nextchr];
1767 if (locinput > PL_regeol)
1769 nextchr = UCHARAT(locinput);
1772 if (!nextchr && locinput >= PL_regeol)
1774 nextchr = UCHARAT(++locinput);
1777 if (!nextchr && locinput >= PL_regeol)
1779 nextchr = UCHARAT(++locinput);
1782 if (nextchr & 0x80) {
1783 locinput += PL_utf8skip[nextchr];
1784 if (locinput > PL_regeol)
1786 nextchr = UCHARAT(locinput);
1789 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1791 nextchr = UCHARAT(++locinput);
1794 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1796 nextchr = UCHARAT(++locinput);
1801 /* Inline the first character, for speed. */
1802 if (UCHARAT(s) != nextchr)
1804 if (PL_regeol - locinput < ln)
1806 if (ln > 1 && memNE(s, locinput, ln))
1809 nextchr = UCHARAT(locinput);
1812 PL_reg_flags |= RF_tainted;
1821 c1 = OP(scan) == EXACTF;
1825 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1826 toLOWER_utf8((U8*)l) :
1827 toLOWER_LC_utf8((U8*)l)))
1835 nextchr = UCHARAT(locinput);
1839 /* Inline the first character, for speed. */
1840 if (UCHARAT(s) != nextchr &&
1841 UCHARAT(s) != ((OP(scan) == EXACTF)
1842 ? PL_fold : PL_fold_locale)[nextchr])
1844 if (PL_regeol - locinput < ln)
1846 if (ln > 1 && (OP(scan) == EXACTF
1847 ? ibcmp(s, locinput, ln)
1848 : ibcmp_locale(s, locinput, ln)))
1851 nextchr = UCHARAT(locinput);
1854 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1856 if (locinput >= PL_regeol)
1858 locinput += PL_utf8skip[nextchr];
1859 nextchr = UCHARAT(locinput);
1863 nextchr = UCHARAT(locinput);
1864 if (!REGINCLASS(scan, nextchr))
1866 if (!nextchr && locinput >= PL_regeol)
1868 nextchr = UCHARAT(++locinput);
1871 PL_reg_flags |= RF_tainted;
1876 if (!(OP(scan) == ALNUM
1877 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1879 nextchr = UCHARAT(++locinput);
1882 PL_reg_flags |= RF_tainted;
1887 if (nextchr & 0x80) {
1888 if (!(OP(scan) == ALNUMUTF8
1889 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1890 : isALNUM_LC_utf8((U8*)locinput)))
1894 locinput += PL_utf8skip[nextchr];
1895 nextchr = UCHARAT(locinput);
1898 if (!(OP(scan) == ALNUMUTF8
1899 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1901 nextchr = UCHARAT(++locinput);
1904 PL_reg_flags |= RF_tainted;
1907 if (!nextchr && locinput >= PL_regeol)
1909 if (OP(scan) == NALNUM
1910 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1912 nextchr = UCHARAT(++locinput);
1915 PL_reg_flags |= RF_tainted;
1918 if (!nextchr && locinput >= PL_regeol)
1920 if (nextchr & 0x80) {
1921 if (OP(scan) == NALNUMUTF8
1922 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1923 : isALNUM_LC_utf8((U8*)locinput))
1927 locinput += PL_utf8skip[nextchr];
1928 nextchr = UCHARAT(locinput);
1931 if (OP(scan) == NALNUMUTF8
1932 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1934 nextchr = UCHARAT(++locinput);
1938 PL_reg_flags |= RF_tainted;
1942 /* was last char in word? */
1943 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
1944 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
1946 n = isALNUM(nextchr);
1949 ln = isALNUM_LC(ln);
1950 n = isALNUM_LC(nextchr);
1952 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
1957 PL_reg_flags |= RF_tainted;
1961 /* was last char in word? */
1962 ln = (locinput != PL_regbol)
1963 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
1964 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
1965 ln = isALNUM_uni(ln);
1966 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
1969 ln = isALNUM_LC_uni(ln);
1970 n = isALNUM_LC_utf8((U8*)locinput);
1972 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
1976 PL_reg_flags |= RF_tainted;
1979 if (!nextchr && locinput >= PL_regeol)
1981 if (!(OP(scan) == SPACE
1982 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
1984 nextchr = UCHARAT(++locinput);
1987 PL_reg_flags |= RF_tainted;
1990 if (!nextchr && locinput >= PL_regeol)
1992 if (nextchr & 0x80) {
1993 if (!(OP(scan) == SPACEUTF8
1994 ? swash_fetch(PL_utf8_space,(U8*)locinput)
1995 : isSPACE_LC_utf8((U8*)locinput)))
1999 locinput += PL_utf8skip[nextchr];
2000 nextchr = UCHARAT(locinput);
2003 if (!(OP(scan) == SPACEUTF8
2004 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2006 nextchr = UCHARAT(++locinput);
2009 PL_reg_flags |= RF_tainted;
2014 if (OP(scan) == SPACE
2015 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2017 nextchr = UCHARAT(++locinput);
2020 PL_reg_flags |= RF_tainted;
2025 if (nextchr & 0x80) {
2026 if (OP(scan) == NSPACEUTF8
2027 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2028 : isSPACE_LC_utf8((U8*)locinput))
2032 locinput += PL_utf8skip[nextchr];
2033 nextchr = UCHARAT(locinput);
2036 if (OP(scan) == NSPACEUTF8
2037 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2039 nextchr = UCHARAT(++locinput);
2042 PL_reg_flags |= RF_tainted;
2045 if (!nextchr && locinput >= PL_regeol)
2047 if (!(OP(scan) == DIGIT
2048 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2050 nextchr = UCHARAT(++locinput);
2053 PL_reg_flags |= RF_tainted;
2058 if (nextchr & 0x80) {
2059 if (OP(scan) == NDIGITUTF8
2060 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2061 : isDIGIT_LC_utf8((U8*)locinput))
2065 locinput += PL_utf8skip[nextchr];
2066 nextchr = UCHARAT(locinput);
2069 if (!isDIGIT(nextchr))
2071 nextchr = UCHARAT(++locinput);
2074 PL_reg_flags |= RF_tainted;
2079 if (OP(scan) == DIGIT
2080 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2082 nextchr = UCHARAT(++locinput);
2085 PL_reg_flags |= RF_tainted;
2088 if (!nextchr && locinput >= PL_regeol)
2090 if (nextchr & 0x80) {
2091 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2093 locinput += PL_utf8skip[nextchr];
2094 nextchr = UCHARAT(locinput);
2097 if (isDIGIT(nextchr))
2099 nextchr = UCHARAT(++locinput);
2102 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2104 locinput += PL_utf8skip[nextchr];
2105 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2106 locinput += UTF8SKIP(locinput);
2107 if (locinput > PL_regeol)
2109 nextchr = UCHARAT(locinput);
2112 PL_reg_flags |= RF_tainted;
2116 n = ARG(scan); /* which paren pair */
2117 ln = PL_regstartp[n];
2118 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2119 if (*PL_reglastparen < n || ln == -1)
2120 sayNO; /* Do not match unless seen CLOSEn. */
2121 if (ln == PL_regendp[n])
2125 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2127 char *e = PL_bostr + PL_regendp[n];
2129 * Note that we can't do the "other character" lookup trick as
2130 * in the 8-bit case (no pun intended) because in Unicode we
2131 * have to map both upper and title case to lower case.
2133 if (OP(scan) == REFF) {
2137 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2147 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2154 nextchr = UCHARAT(locinput);
2158 /* Inline the first character, for speed. */
2159 if (UCHARAT(s) != nextchr &&
2161 (UCHARAT(s) != ((OP(scan) == REFF
2162 ? PL_fold : PL_fold_locale)[nextchr]))))
2164 ln = PL_regendp[n] - ln;
2165 if (locinput + ln > PL_regeol)
2167 if (ln > 1 && (OP(scan) == REF
2168 ? memNE(s, locinput, ln)
2170 ? ibcmp(s, locinput, ln)
2171 : ibcmp_locale(s, locinput, ln))))
2174 nextchr = UCHARAT(locinput);
2185 OP_4tree *oop = PL_op;
2186 COP *ocurcop = PL_curcop;
2187 SV **ocurpad = PL_curpad;
2191 PL_op = (OP_4tree*)PL_regdata->data[n];
2192 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2193 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2194 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2196 CALLRUNOPS(aTHX); /* Scalar context. */
2202 PL_curpad = ocurpad;
2203 PL_curcop = ocurcop;
2205 if (logical == 2) { /* Postponed subexpression. */
2207 MAGIC *mg = Null(MAGIC*);
2209 CHECKPOINT cp, lastcp;
2211 if(SvROK(ret) || SvRMAGICAL(ret)) {
2212 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2215 mg = mg_find(sv, 'r');
2218 re = (regexp *)mg->mg_obj;
2219 (void)ReREFCNT_inc(re);
2223 char *t = SvPV(ret, len);
2225 char *oprecomp = PL_regprecomp;
2226 I32 osize = PL_regsize;
2227 I32 onpar = PL_regnpar;
2230 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2232 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2233 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2234 PL_regprecomp = oprecomp;
2239 PerlIO_printf(Perl_debug_log,
2240 "Entering embedded `%s%.60s%s%s'\n",
2244 (strlen(re->precomp) > 60 ? "..." : ""))
2247 state.prev = PL_reg_call_cc;
2248 state.cc = PL_regcc;
2249 state.re = PL_reg_re;
2253 cp = regcppush(0); /* Save *all* the positions. */
2256 state.ss = PL_savestack_ix;
2257 *PL_reglastparen = 0;
2258 PL_reg_call_cc = &state;
2259 PL_reginput = locinput;
2261 /* XXXX This is too dramatic a measure... */
2264 if (regmatch(re->program + 1)) {
2265 /* Even though we succeeded, we need to restore
2266 global variables, since we may be wrapped inside
2267 SUSPEND, thus the match may be not finished yet. */
2269 /* XXXX Do this only if SUSPENDed? */
2270 PL_reg_call_cc = state.prev;
2271 PL_regcc = state.cc;
2272 PL_reg_re = state.re;
2273 cache_re(PL_reg_re);
2275 /* XXXX This is too dramatic a measure... */
2278 /* These are needed even if not SUSPEND. */
2286 PL_reg_call_cc = state.prev;
2287 PL_regcc = state.cc;
2288 PL_reg_re = state.re;
2289 cache_re(PL_reg_re);
2291 /* XXXX This is too dramatic a measure... */
2300 sv_setsv(save_scalar(PL_replgv), ret);
2304 n = ARG(scan); /* which paren pair */
2305 PL_reg_start_tmp[n] = locinput;
2310 n = ARG(scan); /* which paren pair */
2311 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2312 PL_regendp[n] = locinput - PL_bostr;
2313 if (n > *PL_reglastparen)
2314 *PL_reglastparen = n;
2317 n = ARG(scan); /* which paren pair */
2318 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2321 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2323 next = NEXTOPER(NEXTOPER(scan));
2325 next = scan + ARG(scan);
2326 if (OP(next) == IFTHEN) /* Fake one. */
2327 next = NEXTOPER(NEXTOPER(next));
2331 logical = scan->flags;
2333 /*******************************************************************
2334 PL_regcc contains infoblock about the innermost (...)* loop, and
2335 a pointer to the next outer infoblock.
2337 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2339 1) After matching X, regnode for CURLYX is processed;
2341 2) This regnode creates infoblock on the stack, and calls
2342 regmatch() recursively with the starting point at WHILEM node;
2344 3) Each hit of WHILEM node tries to match A and Z (in the order
2345 depending on the current iteration, min/max of {min,max} and
2346 greediness). The information about where are nodes for "A"
2347 and "Z" is read from the infoblock, as is info on how many times "A"
2348 was already matched, and greediness.
2350 4) After A matches, the same WHILEM node is hit again.
2352 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2353 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2354 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2355 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2356 of the external loop.
2358 Currently present infoblocks form a tree with a stem formed by PL_curcc
2359 and whatever it mentions via ->next, and additional attached trees
2360 corresponding to temporarily unset infoblocks as in "5" above.
2362 In the following picture infoblocks for outer loop of
2363 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2364 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2365 infoblocks are drawn below the "reset" infoblock.
2367 In fact in the picture below we do not show failed matches for Z and T
2368 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2369 more obvious *why* one needs to *temporary* unset infoblocks.]
2371 Matched REx position InfoBlocks Comment
2375 Y A)*?Z)*?T x <- O <- I
2376 YA )*?Z)*?T x <- O <- I
2377 YA A)*?Z)*?T x <- O <- I
2378 YAA )*?Z)*?T x <- O <- I
2379 YAA Z)*?T x <- O # Temporary unset I
2382 YAAZ Y(A)*?Z)*?T x <- O
2385 YAAZY (A)*?Z)*?T x <- O
2388 YAAZY A)*?Z)*?T x <- O <- I
2391 YAAZYA )*?Z)*?T x <- O <- I
2394 YAAZYA Z)*?T x <- O # Temporary unset I
2400 YAAZYAZ T x # Temporary unset O
2407 *******************************************************************/
2410 CHECKPOINT cp = PL_savestack_ix;
2412 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2414 cc.oldcc = PL_regcc;
2416 cc.parenfloor = *PL_reglastparen;
2418 cc.min = ARG1(scan);
2419 cc.max = ARG2(scan);
2420 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2424 PL_reginput = locinput;
2425 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2427 PL_regcc = cc.oldcc;
2433 * This is really hard to understand, because after we match
2434 * what we're trying to match, we must make sure the rest of
2435 * the REx is going to match for sure, and to do that we have
2436 * to go back UP the parse tree by recursing ever deeper. And
2437 * if it fails, we have to reset our parent's current state
2438 * that we can try again after backing off.
2441 CHECKPOINT cp, lastcp;
2442 CURCUR* cc = PL_regcc;
2443 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2445 n = cc->cur + 1; /* how many we know we matched */
2446 PL_reginput = locinput;
2449 PerlIO_printf(Perl_debug_log,
2450 "%*s %ld out of %ld..%ld cc=%lx\n",
2451 REPORT_CODE_OFF+PL_regindent*2, "",
2452 (long)n, (long)cc->min,
2453 (long)cc->max, (long)cc)
2456 /* If degenerate scan matches "", assume scan done. */
2458 if (locinput == cc->lastloc && n >= cc->min) {
2459 PL_regcc = cc->oldcc;
2463 PerlIO_printf(Perl_debug_log,
2464 "%*s empty match detected, try continuation...\n",
2465 REPORT_CODE_OFF+PL_regindent*2, "")
2467 if (regmatch(cc->next))
2475 /* First just match a string of min scans. */
2479 cc->lastloc = locinput;
2480 if (regmatch(cc->scan))
2483 cc->lastloc = lastloc;
2488 /* Check whether we already were at this position.
2489 Postpone detection until we know the match is not
2490 *that* much linear. */
2491 if (!PL_reg_maxiter) {
2492 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2493 PL_reg_leftiter = PL_reg_maxiter;
2495 if (PL_reg_leftiter-- == 0) {
2496 I32 size = (PL_reg_maxiter + 7)/8;
2497 if (PL_reg_poscache) {
2498 if (PL_reg_poscache_size < size) {
2499 Renew(PL_reg_poscache, size, char);
2500 PL_reg_poscache_size = size;
2502 Zero(PL_reg_poscache, size, char);
2505 PL_reg_poscache_size = size;
2506 Newz(29, PL_reg_poscache, size, char);
2509 PerlIO_printf(Perl_debug_log,
2510 "%sDetected a super-linear match, switching on caching%s...\n",
2511 PL_colors[4], PL_colors[5])
2514 if (PL_reg_leftiter < 0) {
2515 I32 o = locinput - PL_bostr, b;
2517 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2520 if (PL_reg_poscache[o] & (1<<b)) {
2522 PerlIO_printf(Perl_debug_log,
2523 "%*s already tried at this position...\n",
2524 REPORT_CODE_OFF+PL_regindent*2, "")
2528 PL_reg_poscache[o] |= (1<<b);
2532 /* Prefer next over scan for minimal matching. */
2535 PL_regcc = cc->oldcc;
2538 cp = regcppush(cc->parenfloor);
2540 if (regmatch(cc->next)) {
2542 sayYES; /* All done. */
2550 if (n >= cc->max) { /* Maximum greed exceeded? */
2551 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2552 && !(PL_reg_flags & RF_warned)) {
2553 PL_reg_flags |= RF_warned;
2554 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2555 "Complex regular subexpression recursion",
2562 PerlIO_printf(Perl_debug_log,
2563 "%*s trying longer...\n",
2564 REPORT_CODE_OFF+PL_regindent*2, "")
2566 /* Try scanning more and see if it helps. */
2567 PL_reginput = locinput;
2569 cc->lastloc = locinput;
2570 cp = regcppush(cc->parenfloor);
2572 if (regmatch(cc->scan)) {
2579 cc->lastloc = lastloc;
2583 /* Prefer scan over next for maximal matching. */
2585 if (n < cc->max) { /* More greed allowed? */
2586 cp = regcppush(cc->parenfloor);
2588 cc->lastloc = locinput;
2590 if (regmatch(cc->scan)) {
2595 regcppop(); /* Restore some previous $<digit>s? */
2596 PL_reginput = locinput;
2598 PerlIO_printf(Perl_debug_log,
2599 "%*s failed, try continuation...\n",
2600 REPORT_CODE_OFF+PL_regindent*2, "")
2603 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2604 && !(PL_reg_flags & RF_warned)) {
2605 PL_reg_flags |= RF_warned;
2606 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2607 "Complex regular subexpression recursion",
2611 /* Failed deeper matches of scan, so see if this one works. */
2612 PL_regcc = cc->oldcc;
2615 if (regmatch(cc->next))
2621 cc->lastloc = lastloc;
2626 next = scan + ARG(scan);
2629 inner = NEXTOPER(NEXTOPER(scan));
2632 inner = NEXTOPER(scan);
2637 if (OP(next) != c1) /* No choice. */
2638 next = inner; /* Avoid recursion. */
2640 int lastparen = *PL_reglastparen;
2644 PL_reginput = locinput;
2645 if (regmatch(inner))
2648 for (n = *PL_reglastparen; n > lastparen; n--)
2650 *PL_reglastparen = n;
2653 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2657 inner = NEXTOPER(scan);
2658 if (c1 == BRANCHJ) {
2659 inner = NEXTOPER(inner);
2661 } while (scan != NULL && OP(scan) == c1);
2675 /* We suppose that the next guy does not need
2676 backtracking: in particular, it is of constant length,
2677 and has no parenths to influence future backrefs. */
2678 ln = ARG1(scan); /* min to match */
2679 n = ARG2(scan); /* max to match */
2680 paren = scan->flags;
2682 if (paren > PL_regsize)
2684 if (paren > *PL_reglastparen)
2685 *PL_reglastparen = paren;
2687 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2689 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2690 PL_reginput = locinput;
2693 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2695 if (ln && l == 0 && n >= ln
2696 /* In fact, this is tricky. If paren, then the
2697 fact that we did/didnot match may influence
2698 future execution. */
2699 && !(paren && ln == 0))
2701 locinput = PL_reginput;
2702 if (PL_regkind[(U8)OP(next)] == EXACT) {
2703 c1 = (U8)*STRING(next);
2704 if (OP(next) == EXACTF)
2706 else if (OP(next) == EXACTFL)
2707 c2 = PL_fold_locale[c1];
2714 /* This may be improved if l == 0. */
2715 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2716 /* If it could work, try it. */
2718 UCHARAT(PL_reginput) == c1 ||
2719 UCHARAT(PL_reginput) == c2)
2723 PL_regstartp[paren] =
2724 HOPc(PL_reginput, -l) - PL_bostr;
2725 PL_regendp[paren] = PL_reginput - PL_bostr;
2728 PL_regendp[paren] = -1;
2734 /* Couldn't or didn't -- move forward. */
2735 PL_reginput = locinput;
2736 if (regrepeat_hard(scan, 1, &l)) {
2738 locinput = PL_reginput;
2745 n = regrepeat_hard(scan, n, &l);
2746 if (n != 0 && l == 0
2747 /* In fact, this is tricky. If paren, then the
2748 fact that we did/didnot match may influence
2749 future execution. */
2750 && !(paren && ln == 0))
2752 locinput = PL_reginput;
2754 PerlIO_printf(Perl_debug_log,
2755 "%*s matched %d times, len=%"IVdf"...\n",
2756 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2760 if (PL_regkind[(U8)OP(next)] == EXACT) {
2761 c1 = (U8)*STRING(next);
2762 if (OP(next) == EXACTF)
2764 else if (OP(next) == EXACTFL)
2765 c2 = PL_fold_locale[c1];
2774 /* If it could work, try it. */
2776 UCHARAT(PL_reginput) == c1 ||
2777 UCHARAT(PL_reginput) == c2)
2780 PerlIO_printf(Perl_debug_log,
2781 "%*s trying tail with n=%"IVdf"...\n",
2782 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2786 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2787 PL_regendp[paren] = PL_reginput - PL_bostr;
2790 PL_regendp[paren] = -1;
2796 /* Couldn't or didn't -- back up. */
2798 locinput = HOPc(locinput, -l);
2799 PL_reginput = locinput;
2806 paren = scan->flags; /* Which paren to set */
2807 if (paren > PL_regsize)
2809 if (paren > *PL_reglastparen)
2810 *PL_reglastparen = paren;
2811 ln = ARG1(scan); /* min to match */
2812 n = ARG2(scan); /* max to match */
2813 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2817 ln = ARG1(scan); /* min to match */
2818 n = ARG2(scan); /* max to match */
2819 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2824 scan = NEXTOPER(scan);
2830 scan = NEXTOPER(scan);
2834 * Lookahead to avoid useless match attempts
2835 * when we know what character comes next.
2837 if (PL_regkind[(U8)OP(next)] == EXACT) {
2838 c1 = (U8)*STRING(next);
2839 if (OP(next) == EXACTF)
2841 else if (OP(next) == EXACTFL)
2842 c2 = PL_fold_locale[c1];
2848 PL_reginput = locinput;
2852 if (ln && regrepeat(scan, ln) < ln)
2854 locinput = PL_reginput;
2857 char *e = locinput + n - ln; /* Should not check after this */
2858 char *old = locinput;
2860 if (e >= PL_regeol || (n == REG_INFTY))
2863 /* Find place 'next' could work */
2865 while (locinput <= e && *locinput != c1)
2868 while (locinput <= e
2875 /* PL_reginput == old now */
2876 if (locinput != old) {
2877 ln = 1; /* Did some */
2878 if (regrepeat(scan, locinput - old) <
2882 /* PL_reginput == locinput now */
2885 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2886 PL_regendp[paren] = locinput - PL_bostr;
2889 PL_regendp[paren] = -1;
2893 PL_reginput = locinput; /* Could be reset... */
2895 /* Couldn't or didn't -- move forward. */
2900 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
2901 /* If it could work, try it. */
2903 UCHARAT(PL_reginput) == c1 ||
2904 UCHARAT(PL_reginput) == c2)
2908 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2909 PL_regendp[paren] = PL_reginput - PL_bostr;
2912 PL_regendp[paren] = -1;
2918 /* Couldn't or didn't -- move forward. */
2919 PL_reginput = locinput;
2920 if (regrepeat(scan, 1)) {
2922 locinput = PL_reginput;
2930 n = regrepeat(scan, n);
2931 locinput = PL_reginput;
2932 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
2933 (!PL_multiline || OP(next) == SEOL))
2934 ln = n; /* why back off? */
2938 /* If it could work, try it. */
2940 UCHARAT(PL_reginput) == c1 ||
2941 UCHARAT(PL_reginput) == c2)
2945 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2946 PL_regendp[paren] = PL_reginput - PL_bostr;
2949 PL_regendp[paren] = -1;
2955 /* Couldn't or didn't -- back up. */
2957 PL_reginput = locinput = HOPc(locinput, -1);
2962 /* If it could work, try it. */
2964 UCHARAT(PL_reginput) == c1 ||
2965 UCHARAT(PL_reginput) == c2)
2971 /* Couldn't or didn't -- back up. */
2973 PL_reginput = locinput = HOPc(locinput, -1);
2980 if (PL_reg_call_cc) {
2981 re_cc_state *cur_call_cc = PL_reg_call_cc;
2982 CURCUR *cctmp = PL_regcc;
2983 regexp *re = PL_reg_re;
2984 CHECKPOINT cp, lastcp;
2986 cp = regcppush(0); /* Save *all* the positions. */
2988 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
2990 PL_reginput = locinput; /* Make position available to
2992 cache_re(PL_reg_call_cc->re);
2993 PL_regcc = PL_reg_call_cc->cc;
2994 PL_reg_call_cc = PL_reg_call_cc->prev;
2995 if (regmatch(cur_call_cc->node)) {
2996 PL_reg_call_cc = cur_call_cc;
3002 PL_reg_call_cc = cur_call_cc;
3008 PerlIO_printf(Perl_debug_log,
3009 "%*s continuation failed...\n",
3010 REPORT_CODE_OFF+PL_regindent*2, "")
3014 if (locinput < PL_regtill) {
3015 DEBUG_r(PerlIO_printf(Perl_debug_log,
3016 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3018 (long)(locinput - PL_reg_starttry),
3019 (long)(PL_regtill - PL_reg_starttry),
3021 sayNO_FINAL; /* Cannot match: too short. */
3023 PL_reginput = locinput; /* put where regtry can find it */
3024 sayYES_FINAL; /* Success! */
3026 PL_reginput = locinput; /* put where regtry can find it */
3027 sayYES_LOUD; /* Success! */
3030 PL_reginput = locinput;
3035 if (UTF) { /* XXXX This is absolutely
3036 broken, we read before
3038 s = HOPMAYBEc(locinput, -scan->flags);
3044 if (locinput < PL_bostr + scan->flags)
3046 PL_reginput = locinput - scan->flags;
3051 PL_reginput = locinput;
3056 if (UTF) { /* XXXX This is absolutely
3057 broken, we read before
3059 s = HOPMAYBEc(locinput, -scan->flags);
3060 if (!s || s < PL_bostr)
3065 if (locinput < PL_bostr + scan->flags)
3067 PL_reginput = locinput - scan->flags;
3072 PL_reginput = locinput;
3075 inner = NEXTOPER(NEXTOPER(scan));
3076 if (regmatch(inner) != n) {
3091 if (OP(scan) == SUSPEND) {
3092 locinput = PL_reginput;
3093 nextchr = UCHARAT(locinput);
3098 next = scan + ARG(scan);
3103 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3104 PTR2UV(scan), OP(scan));
3105 Perl_croak(aTHX_ "regexp memory corruption");
3111 * We get here only if there's trouble -- normally "case END" is
3112 * the terminating point.
3114 Perl_croak(aTHX_ "corrupted regexp pointers");
3120 PerlIO_printf(Perl_debug_log,
3121 "%*s %scould match...%s\n",
3122 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3126 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3127 PL_colors[4],PL_colors[5]));
3136 PerlIO_printf(Perl_debug_log,
3137 "%*s %sfailed...%s\n",
3138 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3150 - regrepeat - repeatedly match something simple, report how many
3153 * [This routine now assumes that it will only match on things of length 1.
3154 * That was true before, but now we assume scan - reginput is the count,
3155 * rather than incrementing count on every character. [Er, except utf8.]]
3158 S_regrepeat(pTHX_ regnode *p, I32 max)
3161 register char *scan;
3163 register char *loceol = PL_regeol;
3164 register I32 hardcount = 0;
3167 if (max != REG_INFTY && max < loceol - scan)
3168 loceol = scan + max;
3171 while (scan < loceol && *scan != '\n')
3179 while (scan < loceol && *scan != '\n') {
3180 scan += UTF8SKIP(scan);
3186 while (scan < loceol) {
3187 scan += UTF8SKIP(scan);
3191 case EXACT: /* length of string is 1 */
3193 while (scan < loceol && UCHARAT(scan) == c)
3196 case EXACTF: /* length of string is 1 */
3198 while (scan < loceol &&
3199 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3202 case EXACTFL: /* length of string is 1 */
3203 PL_reg_flags |= RF_tainted;
3205 while (scan < loceol &&
3206 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3211 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3212 scan += UTF8SKIP(scan);
3217 while (scan < loceol && REGINCLASS(p, *scan))
3221 while (scan < loceol && isALNUM(*scan))
3226 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3227 scan += UTF8SKIP(scan);
3232 PL_reg_flags |= RF_tainted;
3233 while (scan < loceol && isALNUM_LC(*scan))
3237 PL_reg_flags |= RF_tainted;
3239 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3240 scan += UTF8SKIP(scan);
3246 while (scan < loceol && !isALNUM(*scan))
3251 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3252 scan += UTF8SKIP(scan);
3257 PL_reg_flags |= RF_tainted;
3258 while (scan < loceol && !isALNUM_LC(*scan))
3262 PL_reg_flags |= RF_tainted;
3264 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3265 scan += UTF8SKIP(scan);
3270 while (scan < loceol && isSPACE(*scan))
3275 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3276 scan += UTF8SKIP(scan);
3281 PL_reg_flags |= RF_tainted;
3282 while (scan < loceol && isSPACE_LC(*scan))
3286 PL_reg_flags |= RF_tainted;
3288 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3289 scan += UTF8SKIP(scan);
3294 while (scan < loceol && !isSPACE(*scan))
3299 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3300 scan += UTF8SKIP(scan);
3305 PL_reg_flags |= RF_tainted;
3306 while (scan < loceol && !isSPACE_LC(*scan))
3310 PL_reg_flags |= RF_tainted;
3312 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3313 scan += UTF8SKIP(scan);
3318 while (scan < loceol && isDIGIT(*scan))
3323 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3324 scan += UTF8SKIP(scan);
3330 while (scan < loceol && !isDIGIT(*scan))
3335 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3336 scan += UTF8SKIP(scan);
3340 default: /* Called on something of 0 width. */
3341 break; /* So match right here or not at all. */
3347 c = scan - PL_reginput;
3352 SV *prop = sv_newmortal();
3355 PerlIO_printf(Perl_debug_log,
3356 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3357 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3364 - regrepeat_hard - repeatedly match something, report total lenth and length
3366 * The repeater is supposed to have constant length.
3370 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3373 register char *scan;
3374 register char *start;
3375 register char *loceol = PL_regeol;
3377 I32 count = 0, res = 1;
3382 start = PL_reginput;
3384 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3387 while (start < PL_reginput) {
3389 start += UTF8SKIP(start);
3400 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3402 *lp = l = PL_reginput - start;
3403 if (max != REG_INFTY && l*max < loceol - scan)
3404 loceol = scan + l*max;
3417 - reginclass - determine if a character falls into a character class
3421 S_reginclass(pTHX_ register regnode *p, register I32 c)
3424 char flags = ANYOF_FLAGS(p);
3428 if (ANYOF_BITMAP_TEST(p, c))
3430 else if (flags & ANYOF_FOLD) {
3432 if (flags & ANYOF_LOCALE) {
3433 PL_reg_flags |= RF_tainted;
3434 cf = PL_fold_locale[c];
3438 if (ANYOF_BITMAP_TEST(p, cf))
3442 if (!match && (flags & ANYOF_CLASS)) {
3443 PL_reg_flags |= RF_tainted;
3445 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3446 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3447 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3448 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3449 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3450 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3451 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3452 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3453 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3454 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3455 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3456 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3457 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3458 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3459 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3460 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3461 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3462 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3463 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3464 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3465 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3466 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3467 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3468 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3469 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3470 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3471 ) /* How's that for a conditional? */
3477 return (flags & ANYOF_INVERT) ? !match : match;
3481 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3484 char flags = ARG1(f);
3486 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3488 if (swash_fetch(sv, p))
3490 else if (flags & ANYOF_FOLD) {
3493 if (flags & ANYOF_LOCALE) {
3494 PL_reg_flags |= RF_tainted;
3495 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3498 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3499 if (swash_fetch(sv, tmpbuf))
3503 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3505 return (flags & ANYOF_INVERT) ? !match : match;
3509 S_reghop(pTHX_ U8 *s, I32 off)
3513 while (off-- && s < (U8*)PL_regeol)
3518 if (s > (U8*)PL_bostr) {
3521 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3523 } /* XXX could check well-formedness here */
3531 S_reghopmaybe(pTHX_ U8* s, I32 off)
3535 while (off-- && s < (U8*)PL_regeol)
3542 if (s > (U8*)PL_bostr) {
3545 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3547 } /* XXX could check well-formedness here */
3564 restore_pos(pTHXo_ void *arg)
3567 if (PL_reg_eval_set) {
3568 if (PL_reg_oldsaved) {
3569 PL_reg_re->subbeg = PL_reg_oldsaved;
3570 PL_reg_re->sublen = PL_reg_oldsavedlen;
3571 RX_MATCH_COPIED_on(PL_reg_re);
3573 PL_reg_magic->mg_len = PL_reg_oldpos;
3574 PL_reg_eval_set = 0;
3575 PL_curpm = PL_reg_oldcurpm;