5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 && (sv && (strpos + SvCUR(sv) != strend)) ) {
350 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
353 PL_regeol = strend; /* Used in HOP() */
354 s = HOPc(strpos, prog->check_offset_min);
356 slen = SvCUR(check); /* >= 1 */
358 if ( strend - s > slen || strend - s < slen - 1
359 || (strend - s == slen && strend[-1] != '\n')) {
360 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
363 /* Now should match s[0..slen-2] */
365 if (slen && (*SvPVX(check) != *s
367 && memNE(SvPVX(check), s, slen)))) {
369 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
373 else if (*SvPVX(check) != *s
374 || ((slen = SvCUR(check)) > 1
375 && memNE(SvPVX(check), s, slen)))
377 goto success_at_start;
379 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
381 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
382 end_shift = prog->minlen - start_shift -
383 CHR_SVLEN(check) + (SvTAIL(check) != 0);
385 I32 end = prog->check_offset_max + CHR_SVLEN(check)
386 - (SvTAIL(check) != 0);
387 I32 eshift = strend - s - end;
389 if (end_shift < eshift)
393 else { /* Can match at random position */
396 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
397 /* Should be nonnegative! */
398 end_shift = prog->minlen - start_shift -
399 CHR_SVLEN(check) + (SvTAIL(check) != 0);
402 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
404 Perl_croak(aTHX_ "panic: end_shift");
408 /* Find a possible match in the region s..strend by looking for
409 the "check" substring in the region corrected by start/end_shift. */
410 if (flags & REXEC_SCREAM) {
411 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
412 I32 p = -1; /* Internal iterator of scream. */
413 I32 *pp = data ? data->scream_pos : &p;
415 if (PL_screamfirst[BmRARE(check)] >= 0
416 || ( BmRARE(check) == '\n'
417 && (BmPREVIOUS(check) == SvCUR(check) - 1)
419 s = screaminstr(sv, check,
420 start_shift + (s - strbeg), end_shift, pp, 0);
424 *data->scream_olds = s;
427 s = fbm_instr((unsigned char*)s + start_shift,
428 (unsigned char*)strend - end_shift,
429 check, PL_multiline ? FBMrf_MULTILINE : 0);
431 /* Update the count-of-usability, remove useless subpatterns,
434 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
435 (s ? "Found" : "Did not find"),
436 ((check == prog->anchored_substr) ? "anchored" : "floating"),
438 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
440 PL_colors[1], (SvTAIL(check) ? "$" : ""),
441 (s ? " at offset " : "...\n") ) );
448 /* Finish the diagnostic message */
449 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
451 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
452 Start with the other substr.
453 XXXX no SCREAM optimization yet - and a very coarse implementation
454 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
455 *always* match. Probably should be marked during compile...
456 Probably it is right to do no SCREAM here...
459 if (prog->float_substr && prog->anchored_substr) {
460 /* Take into account the "other" substring. */
461 /* XXXX May be hopelessly wrong for UTF... */
464 if (check == prog->float_substr) {
467 char *last = s - start_shift, *last1, *last2;
471 t = s - prog->check_offset_max;
472 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
473 && (!(prog->reganch & ROPT_UTF8)
474 || (PL_bostr = strpos, /* Used in regcopmaybe() */
475 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
480 t += prog->anchored_offset;
481 if (t < other_last) /* These positions already checked */
484 last2 = last1 = strend - prog->minlen;
487 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
488 /* On end-of-str: see comment below. */
489 s = fbm_instr((unsigned char*)t,
490 (unsigned char*)last1 + prog->anchored_offset
491 + SvCUR(prog->anchored_substr)
492 - (SvTAIL(prog->anchored_substr)!=0),
493 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
494 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
495 (s ? "Found" : "Contradicts"),
497 (int)(SvCUR(prog->anchored_substr)
498 - (SvTAIL(prog->anchored_substr)!=0)),
499 SvPVX(prog->anchored_substr),
500 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
502 if (last1 >= last2) {
503 DEBUG_r(PerlIO_printf(Perl_debug_log,
504 ", giving up...\n"));
507 DEBUG_r(PerlIO_printf(Perl_debug_log,
508 ", trying floating at offset %ld...\n",
509 (long)(s1 + 1 - i_strpos)));
510 PL_regeol = strend; /* Used in HOP() */
511 other_last = last1 + prog->anchored_offset + 1;
516 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
517 (long)(s - i_strpos)));
518 t = s - prog->anchored_offset;
527 else { /* Take into account the floating substring. */
532 last1 = last = strend - prog->minlen + prog->float_min_offset;
533 if (last - t > prog->float_max_offset)
534 last = t + prog->float_max_offset;
535 s = t + prog->float_min_offset;
538 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
539 /* fbm_instr() takes into account exact value of end-of-str
540 if the check is SvTAIL(ed). Since false positives are OK,
541 and end-of-str is not later than strend we are OK. */
542 s = fbm_instr((unsigned char*)s,
543 (unsigned char*)last + SvCUR(prog->float_substr)
544 - (SvTAIL(prog->float_substr)!=0),
545 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
546 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
547 (s ? "Found" : "Contradicts"),
549 (int)(SvCUR(prog->float_substr)
550 - (SvTAIL(prog->float_substr)!=0)),
551 SvPVX(prog->float_substr),
552 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
555 DEBUG_r(PerlIO_printf(Perl_debug_log,
556 ", giving up...\n"));
559 DEBUG_r(PerlIO_printf(Perl_debug_log,
560 ", trying anchored starting at offset %ld...\n",
561 (long)(s1 + 1 - i_strpos)));
562 other_last = last + 1;
563 PL_regeol = strend; /* Used in HOP() */
568 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
569 (long)(s - i_strpos)));
579 t = s - prog->check_offset_max;
581 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
582 && (!(prog->reganch & ROPT_UTF8)
583 || (PL_bostr = strpos, /* Used in regcopmaybe() */
584 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
587 /* Fixed substring is found far enough so that the match
588 cannot start at strpos. */
590 if (ml_anch && t[-1] != '\n') {
591 /* Eventually fbm_*() should handle this, but often
592 anchored_offset is not 0, so this check will not be wasted. */
593 /* XXXX In the code below we prefer to look for "^" even in
594 presence of anchored substrings. And we search even
595 beyond the found float position. These pessimizations
596 are historical artefacts only. */
598 while (t < strend - prog->minlen) {
600 if (t < s - prog->check_offset_min) {
601 if (prog->anchored_substr) {
602 /* We definitely contradict the found anchored
603 substr. Due to the above check we do not
604 contradict "check" substr.
605 Thus we can arrive here only if check substr
606 is float. Redo checking for "other"=="fixed".
609 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
610 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
611 goto do_other_anchored;
614 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
615 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
618 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
619 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
625 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
626 PL_colors[0],PL_colors[1]));
631 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
635 /* The found string does not prohibit matching at beg-of-str
636 - no optimization of calling REx engine can be performed,
637 unless it was an MBOL and we are not after MBOL. */
639 /* Even in this situation we may use MBOL flag if strpos is offset
640 wrt the start of the string. */
642 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
646 DEBUG_r( if (ml_anch)
647 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
648 PL_colors[0],PL_colors[1]);
651 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
652 && prog->check_substr /* Could be deleted already */
653 && --BmUSEFUL(prog->check_substr) < 0
654 && prog->check_substr == prog->float_substr)
656 /* If flags & SOMETHING - do not do it many times on the same match */
657 SvREFCNT_dec(prog->check_substr);
658 prog->check_substr = Nullsv; /* disable */
659 prog->float_substr = Nullsv; /* clear */
661 /* XXXX This is a remnant of the old implementation. It
662 looks wasteful, since now INTUIT can use many
664 prog->reganch &= ~RE_USE_INTUIT;
671 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
672 if (prog->regstclass) {
673 /* minlen == 0 is possible if regstclass is \b or \B,
674 and the fixed substr is ''$.
675 Since minlen is already taken into account, s+1 is before strend;
676 accidentally, minlen >= 1 guaranties no false positives at s + 1
677 even for \b or \B. But (minlen? 1 : 0) below assumes that
678 regstclass does not come from lookahead... */
679 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
680 This leaves EXACTF only, which is dealt with in find_byclass(). */
681 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
682 ? STR_LEN(prog->regstclass)
684 char *endpos = (prog->anchored_substr || ml_anch)
685 ? s + (prog->minlen? cl_l : 0)
686 : (prog->float_substr ? check_at - start_shift + cl_l
688 char *startpos = sv ? strend - SvCUR(sv) : s;
691 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
696 if (endpos == strend) {
697 DEBUG_r( PerlIO_printf(Perl_debug_log,
698 "Could not match STCLASS...\n") );
701 DEBUG_r( PerlIO_printf(Perl_debug_log,
702 "This position contradicts STCLASS...\n") );
703 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
705 /* Contradict one of substrings */
706 if (prog->anchored_substr) {
707 if (prog->anchored_substr == check) {
708 DEBUG_r( what = "anchored" );
710 PL_regeol = strend; /* Used in HOP() */
712 if (s + start_shift + end_shift > strend) {
713 /* XXXX Should be taken into account earlier? */
714 DEBUG_r( PerlIO_printf(Perl_debug_log,
715 "Could not match STCLASS...\n") );
718 DEBUG_r( PerlIO_printf(Perl_debug_log,
719 "Trying %s substr starting at offset %ld...\n",
720 what, (long)(s + start_shift - i_strpos)) );
723 /* Have both, check_string is floating */
724 if (t + start_shift >= check_at) /* Contradicts floating=check */
725 goto retry_floating_check;
726 /* Recheck anchored substring, but not floating... */
728 DEBUG_r( PerlIO_printf(Perl_debug_log,
729 "Trying anchored substr starting at offset %ld...\n",
730 (long)(other_last - i_strpos)) );
731 goto do_other_anchored;
733 /* Another way we could have checked stclass at the
734 current position only: */
737 DEBUG_r( PerlIO_printf(Perl_debug_log,
738 "Trying /^/m starting at offset %ld...\n",
739 (long)(t - i_strpos)) );
742 if (!prog->float_substr) /* Could have been deleted */
744 /* Check is floating subtring. */
745 retry_floating_check:
746 t = check_at - start_shift;
747 DEBUG_r( what = "floating" );
748 goto hop_and_restart;
751 PerlIO_printf(Perl_debug_log,
752 "By STCLASS: moving %ld --> %ld\n",
753 (long)(t - i_strpos), (long)(s - i_strpos));
755 PerlIO_printf(Perl_debug_log,
756 "Does not contradict STCLASS...\n") );
758 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
759 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
762 fail_finish: /* Substring not found */
763 if (prog->check_substr) /* could be removed already */
764 BmUSEFUL(prog->check_substr) += 5; /* hooray */
766 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
767 PL_colors[4],PL_colors[5]));
771 /* We know what class REx starts with. Try to find this position... */
773 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
775 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
781 register I32 tmp = 1; /* Scratch variable? */
783 /* We know what class it must start with. */
787 if (REGINCLASSUTF8(c, (U8*)s)) {
788 if (tmp && (norun || regtry(prog, s)))
800 if (REGINCLASS(c, *s)) {
801 if (tmp && (norun || regtry(prog, s)))
821 c2 = PL_fold_locale[c1];
826 e = s; /* Due to minlen logic of intuit() */
827 /* Here it is NOT UTF! */
831 && (ln == 1 || !(OP(c) == EXACTF
833 : ibcmp_locale(s, m, ln)))
834 && (norun || regtry(prog, s)) )
840 if ( (*s == c1 || *s == c2)
841 && (ln == 1 || !(OP(c) == EXACTF
843 : ibcmp_locale(s, m, ln)))
844 && (norun || regtry(prog, s)) )
851 PL_reg_flags |= RF_tainted;
854 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
855 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
857 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
859 if ((norun || regtry(prog, s)))
864 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
868 PL_reg_flags |= RF_tainted;
871 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
872 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
874 if (tmp == !(OP(c) == BOUND ?
875 swash_fetch(PL_utf8_alnum, (U8*)s) :
876 isALNUM_LC_utf8((U8*)s)))
879 if ((norun || regtry(prog, s)))
884 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
888 PL_reg_flags |= RF_tainted;
891 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
892 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
894 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
896 else if ((norun || regtry(prog, s)))
900 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
904 PL_reg_flags |= RF_tainted;
908 strend = reghop_c(strend, -1);
909 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
910 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
912 if (tmp == !(OP(c) == NBOUND ?
913 swash_fetch(PL_utf8_alnum, (U8*)s) :
914 isALNUM_LC_utf8((U8*)s)))
916 else if ((norun || regtry(prog, s)))
920 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
926 if (tmp && (norun || regtry(prog, s)))
938 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
939 if (tmp && (norun || regtry(prog, s)))
950 PL_reg_flags |= RF_tainted;
952 if (isALNUM_LC(*s)) {
953 if (tmp && (norun || regtry(prog, s)))
964 PL_reg_flags |= RF_tainted;
966 if (isALNUM_LC_utf8((U8*)s)) {
967 if (tmp && (norun || regtry(prog, s)))
980 if (tmp && (norun || regtry(prog, s)))
992 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
993 if (tmp && (norun || regtry(prog, s)))
1004 PL_reg_flags |= RF_tainted;
1005 while (s < strend) {
1006 if (!isALNUM_LC(*s)) {
1007 if (tmp && (norun || regtry(prog, s)))
1018 PL_reg_flags |= RF_tainted;
1019 while (s < strend) {
1020 if (!isALNUM_LC_utf8((U8*)s)) {
1021 if (tmp && (norun || regtry(prog, s)))
1032 while (s < strend) {
1034 if (tmp && (norun || regtry(prog, s)))
1045 while (s < strend) {
1046 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1047 if (tmp && (norun || regtry(prog, s)))
1058 PL_reg_flags |= RF_tainted;
1059 while (s < strend) {
1060 if (isSPACE_LC(*s)) {
1061 if (tmp && (norun || regtry(prog, s)))
1072 PL_reg_flags |= RF_tainted;
1073 while (s < strend) {
1074 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1075 if (tmp && (norun || regtry(prog, s)))
1086 while (s < strend) {
1088 if (tmp && (norun || regtry(prog, s)))
1099 while (s < strend) {
1100 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1101 if (tmp && (norun || regtry(prog, s)))
1112 PL_reg_flags |= RF_tainted;
1113 while (s < strend) {
1114 if (!isSPACE_LC(*s)) {
1115 if (tmp && (norun || regtry(prog, s)))
1126 PL_reg_flags |= RF_tainted;
1127 while (s < strend) {
1128 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1129 if (tmp && (norun || regtry(prog, s)))
1140 while (s < strend) {
1142 if (tmp && (norun || regtry(prog, s)))
1153 while (s < strend) {
1154 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1155 if (tmp && (norun || regtry(prog, s)))
1166 PL_reg_flags |= RF_tainted;
1167 while (s < strend) {
1168 if (isDIGIT_LC(*s)) {
1169 if (tmp && (norun || regtry(prog, s)))
1180 PL_reg_flags |= RF_tainted;
1181 while (s < strend) {
1182 if (isDIGIT_LC_utf8((U8*)s)) {
1183 if (tmp && (norun || regtry(prog, s)))
1194 while (s < strend) {
1196 if (tmp && (norun || regtry(prog, s)))
1207 while (s < strend) {
1208 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1209 if (tmp && (norun || regtry(prog, s)))
1220 PL_reg_flags |= RF_tainted;
1221 while (s < strend) {
1222 if (!isDIGIT_LC(*s)) {
1223 if (tmp && (norun || regtry(prog, s)))
1234 PL_reg_flags |= RF_tainted;
1235 while (s < strend) {
1236 if (!isDIGIT_LC_utf8((U8*)s)) {
1237 if (tmp && (norun || regtry(prog, s)))
1248 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1257 - regexec_flags - match a regexp against a string
1260 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1261 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1262 /* strend: pointer to null at end of string */
1263 /* strbeg: real beginning of string */
1264 /* minend: end of match must be >=minend after stringarg. */
1265 /* data: May be used for some additional optimizations. */
1266 /* nosave: For optimizations. */
1270 register regnode *c;
1271 register char *startpos = stringarg;
1273 I32 minlen; /* must match at least this many chars */
1274 I32 dontbother = 0; /* how many characters not to try at end */
1275 I32 start_shift = 0; /* Offset of the start to find
1276 constant substr. */ /* CC */
1277 I32 end_shift = 0; /* Same for the end. */ /* CC */
1278 I32 scream_pos = -1; /* Internal iterator of scream. */
1280 SV* oreplsv = GvSV(PL_replgv);
1286 PL_regnarrate = PL_debug & 512;
1289 /* Be paranoid... */
1290 if (prog == NULL || startpos == NULL) {
1291 Perl_croak(aTHX_ "NULL regexp parameter");
1295 minlen = prog->minlen;
1296 if (strend - startpos < minlen) goto phooey;
1298 if (startpos == strbeg) /* is ^ valid at stringarg? */
1301 PL_regprev = (U32)stringarg[-1];
1302 if (!PL_multiline && PL_regprev == '\n')
1303 PL_regprev = '\0'; /* force ^ to NOT match */
1306 /* Check validity of program. */
1307 if (UCHARAT(prog->program) != REG_MAGIC) {
1308 Perl_croak(aTHX_ "corrupted regexp program");
1312 PL_reg_eval_set = 0;
1315 if (prog->reganch & ROPT_UTF8)
1316 PL_reg_flags |= RF_utf8;
1318 /* Mark beginning of line for ^ and lookbehind. */
1319 PL_regbol = startpos;
1323 /* Mark end of line for $ (and such) */
1326 /* see how far we have to get to not match where we matched before */
1327 PL_regtill = startpos+minend;
1329 /* We start without call_cc context. */
1332 /* If there is a "must appear" string, look for it. */
1335 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1338 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1339 PL_reg_ganch = startpos;
1340 else if (sv && SvTYPE(sv) >= SVt_PVMG
1342 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1343 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1344 if (prog->reganch & ROPT_ANCH_GPOS) {
1345 if (s > PL_reg_ganch)
1350 else /* pos() not defined */
1351 PL_reg_ganch = strbeg;
1354 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1355 re_scream_pos_data d;
1357 d.scream_olds = &scream_olds;
1358 d.scream_pos = &scream_pos;
1359 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1361 goto phooey; /* not present */
1364 DEBUG_r( if (!PL_colorset) reginitcolors() );
1365 DEBUG_r(PerlIO_printf(Perl_debug_log,
1366 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1367 PL_colors[4],PL_colors[5],PL_colors[0],
1370 (strlen(prog->precomp) > 60 ? "..." : ""),
1372 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1373 startpos, PL_colors[1],
1374 (strend - startpos > 60 ? "..." : ""))
1377 /* Simplest case: anchored match need be tried only once. */
1378 /* [unless only anchor is BOL and multiline is set] */
1379 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1380 if (s == startpos && regtry(prog, startpos))
1382 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1383 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1388 dontbother = minlen - 1;
1389 end = HOPc(strend, -dontbother) - 1;
1390 /* for multiline we only have to try after newlines */
1391 if (prog->check_substr) {
1395 if (regtry(prog, s))
1400 if (prog->reganch & RE_USE_INTUIT) {
1401 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1412 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1413 if (regtry(prog, s))
1420 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1421 if (regtry(prog, PL_reg_ganch))
1426 /* Messy cases: unanchored match. */
1427 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1428 /* we have /x+whatever/ */
1429 /* it must be a one character string (XXXX Except UTF?) */
1430 char ch = SvPVX(prog->anchored_substr)[0];
1432 while (s < strend) {
1434 if (regtry(prog, s)) goto got_it;
1436 while (s < strend && *s == ch)
1443 while (s < strend) {
1445 if (regtry(prog, s)) goto got_it;
1447 while (s < strend && *s == ch)
1455 else if (prog->anchored_substr != Nullsv
1456 || (prog->float_substr != Nullsv
1457 && prog->float_max_offset < strend - s)) {
1458 SV *must = prog->anchored_substr
1459 ? prog->anchored_substr : prog->float_substr;
1461 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1463 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1464 I32 delta = back_max - back_min;
1465 char *last = HOPc(strend, /* Cannot start after this */
1466 -(I32)(CHR_SVLEN(must)
1467 - (SvTAIL(must) != 0) + back_min));
1468 char *last1; /* Last position checked before */
1471 last1 = HOPc(s, -1);
1473 last1 = s - 1; /* bogus */
1475 /* XXXX check_substr already used to find `s', can optimize if
1476 check_substr==must. */
1478 dontbother = end_shift;
1479 strend = HOPc(strend, -dontbother);
1480 while ( (s <= last) &&
1481 ((flags & REXEC_SCREAM)
1482 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1483 end_shift, &scream_pos, 0))
1484 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1485 (unsigned char*)strend, must,
1486 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1487 if (HOPc(s, -back_max) > last1) {
1488 last1 = HOPc(s, -back_min);
1489 s = HOPc(s, -back_max);
1492 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1494 last1 = HOPc(s, -back_min);
1498 while (s <= last1) {
1499 if (regtry(prog, s))
1505 while (s <= last1) {
1506 if (regtry(prog, s))
1514 else if (c = prog->regstclass) {
1515 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1516 /* don't bother with what can't match */
1517 strend = HOPc(strend, -(minlen - 1));
1518 if (find_byclass(prog, c, s, strend, startpos, 0))
1523 if (prog->float_substr != Nullsv) { /* Trim the end. */
1525 I32 oldpos = scream_pos;
1527 if (flags & REXEC_SCREAM) {
1528 last = screaminstr(sv, prog->float_substr, s - strbeg,
1529 end_shift, &scream_pos, 1); /* last one */
1531 last = scream_olds; /* Only one occurence. */
1535 char *little = SvPV(prog->float_substr, len);
1537 if (SvTAIL(prog->float_substr)) {
1538 if (memEQ(strend - len + 1, little, len - 1))
1539 last = strend - len + 1;
1540 else if (!PL_multiline)
1541 last = memEQ(strend - len, little, len)
1542 ? strend - len : Nullch;
1548 last = rninstr(s, strend, little, little + len);
1550 last = strend; /* matching `$' */
1553 if (last == NULL) goto phooey; /* Should not happen! */
1554 dontbother = strend - last + prog->float_min_offset;
1556 if (minlen && (dontbother < minlen))
1557 dontbother = minlen - 1;
1558 strend -= dontbother; /* this one's always in bytes! */
1559 /* We don't know much -- general case. */
1562 if (regtry(prog, s))
1571 if (regtry(prog, s))
1573 } while (s++ < strend);
1581 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1583 if (PL_reg_eval_set) {
1584 /* Preserve the current value of $^R */
1585 if (oreplsv != GvSV(PL_replgv))
1586 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1587 restored, the value remains
1589 restore_pos(aTHXo_ 0);
1592 /* make sure $`, $&, $', and $digit will work later */
1593 if ( !(flags & REXEC_NOT_FIRST) ) {
1594 if (RX_MATCH_COPIED(prog)) {
1595 Safefree(prog->subbeg);
1596 RX_MATCH_COPIED_off(prog);
1598 if (flags & REXEC_COPY_STR) {
1599 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1601 s = savepvn(strbeg, i);
1604 RX_MATCH_COPIED_on(prog);
1607 prog->subbeg = strbeg;
1608 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1615 if (PL_reg_eval_set)
1616 restore_pos(aTHXo_ 0);
1621 - regtry - try match at specific point
1623 STATIC I32 /* 0 failure, 1 success */
1624 S_regtry(pTHX_ regexp *prog, char *startpos)
1632 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1635 PL_reg_eval_set = RS_init;
1637 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1638 (IV)(PL_stack_sp - PL_stack_base));
1640 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1641 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1642 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1644 /* Apparently this is not needed, judging by wantarray. */
1645 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1646 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1649 /* Make $_ available to executed code. */
1650 if (PL_reg_sv != DEFSV) {
1651 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1656 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1657 && (mg = mg_find(PL_reg_sv, 'g')))) {
1658 /* prepare for quick setting of pos */
1659 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1660 mg = mg_find(PL_reg_sv, 'g');
1664 PL_reg_oldpos = mg->mg_len;
1665 SAVEDESTRUCTOR_X(restore_pos, 0);
1668 New(22,PL_reg_curpm, 1, PMOP);
1669 PL_reg_curpm->op_pmregexp = prog;
1670 PL_reg_oldcurpm = PL_curpm;
1671 PL_curpm = PL_reg_curpm;
1672 if (RX_MATCH_COPIED(prog)) {
1673 /* Here is a serious problem: we cannot rewrite subbeg,
1674 since it may be needed if this match fails. Thus
1675 $` inside (?{}) could fail... */
1676 PL_reg_oldsaved = prog->subbeg;
1677 PL_reg_oldsavedlen = prog->sublen;
1678 RX_MATCH_COPIED_off(prog);
1681 PL_reg_oldsaved = Nullch;
1682 prog->subbeg = PL_bostr;
1683 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1685 prog->startp[0] = startpos - PL_bostr;
1686 PL_reginput = startpos;
1687 PL_regstartp = prog->startp;
1688 PL_regendp = prog->endp;
1689 PL_reglastparen = &prog->lastparen;
1690 prog->lastparen = 0;
1692 DEBUG_r(PL_reg_starttry = startpos);
1693 if (PL_reg_start_tmpl <= prog->nparens) {
1694 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1695 if(PL_reg_start_tmp)
1696 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1698 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1701 /* XXXX What this code is doing here?!!! There should be no need
1702 to do this again and again, PL_reglastparen should take care of
1706 if (prog->nparens) {
1707 for (i = prog->nparens; i >= 1; i--) {
1713 if (regmatch(prog->program + 1)) {
1714 prog->endp[0] = PL_reginput - PL_bostr;
1722 - regmatch - main matching routine
1724 * Conceptually the strategy is simple: check to see whether the current
1725 * node matches, call self recursively to see whether the rest matches,
1726 * and then act accordingly. In practice we make some effort to avoid
1727 * recursion, in particular by going through "ordinary" nodes (that don't
1728 * need to know whether the rest of the match failed) by a loop instead of
1731 /* [lwall] I've hoisted the register declarations to the outer block in order to
1732 * maybe save a little bit of pushing and popping on the stack. It also takes
1733 * advantage of machines that use a register save mask on subroutine entry.
1735 STATIC I32 /* 0 failure, 1 success */
1736 S_regmatch(pTHX_ regnode *prog)
1739 register regnode *scan; /* Current node. */
1740 regnode *next; /* Next node. */
1741 regnode *inner; /* Next node in internal branch. */
1742 register I32 nextchr; /* renamed nextchr - nextchar colides with
1743 function of same name */
1744 register I32 n; /* no or next */
1745 register I32 ln; /* len or last */
1746 register char *s; /* operand or save */
1747 register char *locinput = PL_reginput;
1748 register I32 c1, c2, paren; /* case fold search, parenth */
1749 int minmod = 0, sw = 0, logical = 0;
1754 /* Note that nextchr is a byte even in UTF */
1755 nextchr = UCHARAT(locinput);
1757 while (scan != NULL) {
1758 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1760 # define sayYES goto yes
1761 # define sayNO goto no
1762 # define sayYES_FINAL goto yes_final
1763 # define sayYES_LOUD goto yes_loud
1764 # define sayNO_FINAL goto no_final
1765 # define sayNO_SILENT goto do_no
1766 # define saySAME(x) if (x) goto yes; else goto no
1767 # define REPORT_CODE_OFF 24
1769 # define sayYES return 1
1770 # define sayNO return 0
1771 # define sayYES_FINAL return 1
1772 # define sayYES_LOUD return 1
1773 # define sayNO_FINAL return 0
1774 # define sayNO_SILENT return 0
1775 # define saySAME(x) return x
1778 SV *prop = sv_newmortal();
1779 int docolor = *PL_colors[0];
1780 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1781 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1782 /* The part of the string before starttry has one color
1783 (pref0_len chars), between starttry and current
1784 position another one (pref_len - pref0_len chars),
1785 after the current position the third one.
1786 We assume that pref0_len <= pref_len, otherwise we
1787 decrease pref0_len. */
1788 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1789 ? (5 + taill) - l : locinput - PL_bostr);
1790 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1792 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1793 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1794 ? (5 + taill) - pref_len : PL_regeol - locinput);
1797 if (pref0_len > pref_len)
1798 pref0_len = pref_len;
1799 regprop(prop, scan);
1800 PerlIO_printf(Perl_debug_log,
1801 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1802 (IV)(locinput - PL_bostr),
1803 PL_colors[4], pref0_len,
1804 locinput - pref_len, PL_colors[5],
1805 PL_colors[2], pref_len - pref0_len,
1806 locinput - pref_len + pref0_len, PL_colors[3],
1807 (docolor ? "" : "> <"),
1808 PL_colors[0], l, locinput, PL_colors[1],
1809 15 - l - pref_len + 1,
1811 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1815 next = scan + NEXT_OFF(scan);
1821 if (locinput == PL_bostr
1822 ? PL_regprev == '\n'
1824 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1826 /* regtill = regbol; */
1831 if (locinput == PL_bostr
1832 ? PL_regprev == '\n'
1833 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1839 if (locinput == PL_regbol && PL_regprev == '\n')
1843 if (locinput == PL_reg_ganch)
1853 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1858 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1860 if (PL_regeol - locinput > 1)
1864 if (PL_regeol != locinput)
1868 if (nextchr & 0x80) {
1869 locinput += PL_utf8skip[nextchr];
1870 if (locinput > PL_regeol)
1872 nextchr = UCHARAT(locinput);
1875 if (!nextchr && locinput >= PL_regeol)
1877 nextchr = UCHARAT(++locinput);
1880 if (!nextchr && locinput >= PL_regeol)
1882 nextchr = UCHARAT(++locinput);
1885 if (nextchr & 0x80) {
1886 locinput += PL_utf8skip[nextchr];
1887 if (locinput > PL_regeol)
1889 nextchr = UCHARAT(locinput);
1892 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1894 nextchr = UCHARAT(++locinput);
1897 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1899 nextchr = UCHARAT(++locinput);
1904 /* Inline the first character, for speed. */
1905 if (UCHARAT(s) != nextchr)
1907 if (PL_regeol - locinput < ln)
1909 if (ln > 1 && memNE(s, locinput, ln))
1912 nextchr = UCHARAT(locinput);
1915 PL_reg_flags |= RF_tainted;
1924 c1 = OP(scan) == EXACTF;
1928 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1929 toLOWER_utf8((U8*)l) :
1930 toLOWER_LC_utf8((U8*)l)))
1938 nextchr = UCHARAT(locinput);
1942 /* Inline the first character, for speed. */
1943 if (UCHARAT(s) != nextchr &&
1944 UCHARAT(s) != ((OP(scan) == EXACTF)
1945 ? PL_fold : PL_fold_locale)[nextchr])
1947 if (PL_regeol - locinput < ln)
1949 if (ln > 1 && (OP(scan) == EXACTF
1950 ? ibcmp(s, locinput, ln)
1951 : ibcmp_locale(s, locinput, ln)))
1954 nextchr = UCHARAT(locinput);
1957 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1959 if (locinput >= PL_regeol)
1961 locinput += PL_utf8skip[nextchr];
1962 nextchr = UCHARAT(locinput);
1966 nextchr = UCHARAT(locinput);
1967 if (!REGINCLASS(scan, nextchr))
1969 if (!nextchr && locinput >= PL_regeol)
1971 nextchr = UCHARAT(++locinput);
1974 PL_reg_flags |= RF_tainted;
1979 if (!(OP(scan) == ALNUM
1980 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1982 nextchr = UCHARAT(++locinput);
1985 PL_reg_flags |= RF_tainted;
1990 if (nextchr & 0x80) {
1991 if (!(OP(scan) == ALNUMUTF8
1992 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1993 : isALNUM_LC_utf8((U8*)locinput)))
1997 locinput += PL_utf8skip[nextchr];
1998 nextchr = UCHARAT(locinput);
2001 if (!(OP(scan) == ALNUMUTF8
2002 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2004 nextchr = UCHARAT(++locinput);
2007 PL_reg_flags |= RF_tainted;
2010 if (!nextchr && locinput >= PL_regeol)
2012 if (OP(scan) == NALNUM
2013 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2015 nextchr = UCHARAT(++locinput);
2018 PL_reg_flags |= RF_tainted;
2021 if (!nextchr && locinput >= PL_regeol)
2023 if (nextchr & 0x80) {
2024 if (OP(scan) == NALNUMUTF8
2025 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2026 : isALNUM_LC_utf8((U8*)locinput))
2030 locinput += PL_utf8skip[nextchr];
2031 nextchr = UCHARAT(locinput);
2034 if (OP(scan) == NALNUMUTF8
2035 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2037 nextchr = UCHARAT(++locinput);
2041 PL_reg_flags |= RF_tainted;
2045 /* was last char in word? */
2046 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2047 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2049 n = isALNUM(nextchr);
2052 ln = isALNUM_LC(ln);
2053 n = isALNUM_LC(nextchr);
2055 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2060 PL_reg_flags |= RF_tainted;
2064 /* was last char in word? */
2065 ln = (locinput != PL_regbol)
2066 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2067 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2068 ln = isALNUM_uni(ln);
2069 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2072 ln = isALNUM_LC_uni(ln);
2073 n = isALNUM_LC_utf8((U8*)locinput);
2075 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2079 PL_reg_flags |= RF_tainted;
2082 if (!nextchr && locinput >= PL_regeol)
2084 if (!(OP(scan) == SPACE
2085 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2087 nextchr = UCHARAT(++locinput);
2090 PL_reg_flags |= RF_tainted;
2093 if (!nextchr && locinput >= PL_regeol)
2095 if (nextchr & 0x80) {
2096 if (!(OP(scan) == SPACEUTF8
2097 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2098 : isSPACE_LC_utf8((U8*)locinput)))
2102 locinput += PL_utf8skip[nextchr];
2103 nextchr = UCHARAT(locinput);
2106 if (!(OP(scan) == SPACEUTF8
2107 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2109 nextchr = UCHARAT(++locinput);
2112 PL_reg_flags |= RF_tainted;
2117 if (OP(scan) == SPACE
2118 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2120 nextchr = UCHARAT(++locinput);
2123 PL_reg_flags |= RF_tainted;
2128 if (nextchr & 0x80) {
2129 if (OP(scan) == NSPACEUTF8
2130 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2131 : isSPACE_LC_utf8((U8*)locinput))
2135 locinput += PL_utf8skip[nextchr];
2136 nextchr = UCHARAT(locinput);
2139 if (OP(scan) == NSPACEUTF8
2140 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2142 nextchr = UCHARAT(++locinput);
2145 PL_reg_flags |= RF_tainted;
2148 if (!nextchr && locinput >= PL_regeol)
2150 if (!(OP(scan) == DIGIT
2151 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2153 nextchr = UCHARAT(++locinput);
2156 PL_reg_flags |= RF_tainted;
2161 if (nextchr & 0x80) {
2162 if (OP(scan) == NDIGITUTF8
2163 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2164 : isDIGIT_LC_utf8((U8*)locinput))
2168 locinput += PL_utf8skip[nextchr];
2169 nextchr = UCHARAT(locinput);
2172 if (!isDIGIT(nextchr))
2174 nextchr = UCHARAT(++locinput);
2177 PL_reg_flags |= RF_tainted;
2182 if (OP(scan) == DIGIT
2183 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2185 nextchr = UCHARAT(++locinput);
2188 PL_reg_flags |= RF_tainted;
2191 if (!nextchr && locinput >= PL_regeol)
2193 if (nextchr & 0x80) {
2194 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2196 locinput += PL_utf8skip[nextchr];
2197 nextchr = UCHARAT(locinput);
2200 if (isDIGIT(nextchr))
2202 nextchr = UCHARAT(++locinput);
2205 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2207 locinput += PL_utf8skip[nextchr];
2208 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2209 locinput += UTF8SKIP(locinput);
2210 if (locinput > PL_regeol)
2212 nextchr = UCHARAT(locinput);
2215 PL_reg_flags |= RF_tainted;
2219 n = ARG(scan); /* which paren pair */
2220 ln = PL_regstartp[n];
2221 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2222 if (*PL_reglastparen < n || ln == -1)
2223 sayNO; /* Do not match unless seen CLOSEn. */
2224 if (ln == PL_regendp[n])
2228 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2230 char *e = PL_bostr + PL_regendp[n];
2232 * Note that we can't do the "other character" lookup trick as
2233 * in the 8-bit case (no pun intended) because in Unicode we
2234 * have to map both upper and title case to lower case.
2236 if (OP(scan) == REFF) {
2240 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2250 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2257 nextchr = UCHARAT(locinput);
2261 /* Inline the first character, for speed. */
2262 if (UCHARAT(s) != nextchr &&
2264 (UCHARAT(s) != ((OP(scan) == REFF
2265 ? PL_fold : PL_fold_locale)[nextchr]))))
2267 ln = PL_regendp[n] - ln;
2268 if (locinput + ln > PL_regeol)
2270 if (ln > 1 && (OP(scan) == REF
2271 ? memNE(s, locinput, ln)
2273 ? ibcmp(s, locinput, ln)
2274 : ibcmp_locale(s, locinput, ln))))
2277 nextchr = UCHARAT(locinput);
2288 OP_4tree *oop = PL_op;
2289 COP *ocurcop = PL_curcop;
2290 SV **ocurpad = PL_curpad;
2294 PL_op = (OP_4tree*)PL_regdata->data[n];
2295 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2296 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2297 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2299 CALLRUNOPS(aTHX); /* Scalar context. */
2305 PL_curpad = ocurpad;
2306 PL_curcop = ocurcop;
2308 if (logical == 2) { /* Postponed subexpression. */
2310 MAGIC *mg = Null(MAGIC*);
2312 CHECKPOINT cp, lastcp;
2314 if(SvROK(ret) || SvRMAGICAL(ret)) {
2315 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2318 mg = mg_find(sv, 'r');
2321 re = (regexp *)mg->mg_obj;
2322 (void)ReREFCNT_inc(re);
2326 char *t = SvPV(ret, len);
2328 char *oprecomp = PL_regprecomp;
2329 I32 osize = PL_regsize;
2330 I32 onpar = PL_regnpar;
2333 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2335 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2336 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2337 PL_regprecomp = oprecomp;
2342 PerlIO_printf(Perl_debug_log,
2343 "Entering embedded `%s%.60s%s%s'\n",
2347 (strlen(re->precomp) > 60 ? "..." : ""))
2350 state.prev = PL_reg_call_cc;
2351 state.cc = PL_regcc;
2352 state.re = PL_reg_re;
2356 cp = regcppush(0); /* Save *all* the positions. */
2359 state.ss = PL_savestack_ix;
2360 *PL_reglastparen = 0;
2361 PL_reg_call_cc = &state;
2362 PL_reginput = locinput;
2364 /* XXXX This is too dramatic a measure... */
2367 if (regmatch(re->program + 1)) {
2368 /* Even though we succeeded, we need to restore
2369 global variables, since we may be wrapped inside
2370 SUSPEND, thus the match may be not finished yet. */
2372 /* XXXX Do this only if SUSPENDed? */
2373 PL_reg_call_cc = state.prev;
2374 PL_regcc = state.cc;
2375 PL_reg_re = state.re;
2376 cache_re(PL_reg_re);
2378 /* XXXX This is too dramatic a measure... */
2381 /* These are needed even if not SUSPEND. */
2389 PL_reg_call_cc = state.prev;
2390 PL_regcc = state.cc;
2391 PL_reg_re = state.re;
2392 cache_re(PL_reg_re);
2394 /* XXXX This is too dramatic a measure... */
2403 sv_setsv(save_scalar(PL_replgv), ret);
2407 n = ARG(scan); /* which paren pair */
2408 PL_reg_start_tmp[n] = locinput;
2413 n = ARG(scan); /* which paren pair */
2414 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2415 PL_regendp[n] = locinput - PL_bostr;
2416 if (n > *PL_reglastparen)
2417 *PL_reglastparen = n;
2420 n = ARG(scan); /* which paren pair */
2421 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2424 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2426 next = NEXTOPER(NEXTOPER(scan));
2428 next = scan + ARG(scan);
2429 if (OP(next) == IFTHEN) /* Fake one. */
2430 next = NEXTOPER(NEXTOPER(next));
2434 logical = scan->flags;
2436 /*******************************************************************
2437 PL_regcc contains infoblock about the innermost (...)* loop, and
2438 a pointer to the next outer infoblock.
2440 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2442 1) After matching X, regnode for CURLYX is processed;
2444 2) This regnode creates infoblock on the stack, and calls
2445 regmatch() recursively with the starting point at WHILEM node;
2447 3) Each hit of WHILEM node tries to match A and Z (in the order
2448 depending on the current iteration, min/max of {min,max} and
2449 greediness). The information about where are nodes for "A"
2450 and "Z" is read from the infoblock, as is info on how many times "A"
2451 was already matched, and greediness.
2453 4) After A matches, the same WHILEM node is hit again.
2455 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2456 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2457 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2458 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2459 of the external loop.
2461 Currently present infoblocks form a tree with a stem formed by PL_curcc
2462 and whatever it mentions via ->next, and additional attached trees
2463 corresponding to temporarily unset infoblocks as in "5" above.
2465 In the following picture infoblocks for outer loop of
2466 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2467 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2468 infoblocks are drawn below the "reset" infoblock.
2470 In fact in the picture below we do not show failed matches for Z and T
2471 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2472 more obvious *why* one needs to *temporary* unset infoblocks.]
2474 Matched REx position InfoBlocks Comment
2478 Y A)*?Z)*?T x <- O <- I
2479 YA )*?Z)*?T x <- O <- I
2480 YA A)*?Z)*?T x <- O <- I
2481 YAA )*?Z)*?T x <- O <- I
2482 YAA Z)*?T x <- O # Temporary unset I
2485 YAAZ Y(A)*?Z)*?T x <- O
2488 YAAZY (A)*?Z)*?T x <- O
2491 YAAZY A)*?Z)*?T x <- O <- I
2494 YAAZYA )*?Z)*?T x <- O <- I
2497 YAAZYA Z)*?T x <- O # Temporary unset I
2503 YAAZYAZ T x # Temporary unset O
2510 *******************************************************************/
2513 CHECKPOINT cp = PL_savestack_ix;
2515 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2517 cc.oldcc = PL_regcc;
2519 cc.parenfloor = *PL_reglastparen;
2521 cc.min = ARG1(scan);
2522 cc.max = ARG2(scan);
2523 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2527 PL_reginput = locinput;
2528 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2530 PL_regcc = cc.oldcc;
2536 * This is really hard to understand, because after we match
2537 * what we're trying to match, we must make sure the rest of
2538 * the REx is going to match for sure, and to do that we have
2539 * to go back UP the parse tree by recursing ever deeper. And
2540 * if it fails, we have to reset our parent's current state
2541 * that we can try again after backing off.
2544 CHECKPOINT cp, lastcp;
2545 CURCUR* cc = PL_regcc;
2546 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2548 n = cc->cur + 1; /* how many we know we matched */
2549 PL_reginput = locinput;
2552 PerlIO_printf(Perl_debug_log,
2553 "%*s %ld out of %ld..%ld cc=%lx\n",
2554 REPORT_CODE_OFF+PL_regindent*2, "",
2555 (long)n, (long)cc->min,
2556 (long)cc->max, (long)cc)
2559 /* If degenerate scan matches "", assume scan done. */
2561 if (locinput == cc->lastloc && n >= cc->min) {
2562 PL_regcc = cc->oldcc;
2566 PerlIO_printf(Perl_debug_log,
2567 "%*s empty match detected, try continuation...\n",
2568 REPORT_CODE_OFF+PL_regindent*2, "")
2570 if (regmatch(cc->next))
2578 /* First just match a string of min scans. */
2582 cc->lastloc = locinput;
2583 if (regmatch(cc->scan))
2586 cc->lastloc = lastloc;
2591 /* Check whether we already were at this position.
2592 Postpone detection until we know the match is not
2593 *that* much linear. */
2594 if (!PL_reg_maxiter) {
2595 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2596 PL_reg_leftiter = PL_reg_maxiter;
2598 if (PL_reg_leftiter-- == 0) {
2599 I32 size = (PL_reg_maxiter + 7)/8;
2600 if (PL_reg_poscache) {
2601 if (PL_reg_poscache_size < size) {
2602 Renew(PL_reg_poscache, size, char);
2603 PL_reg_poscache_size = size;
2605 Zero(PL_reg_poscache, size, char);
2608 PL_reg_poscache_size = size;
2609 Newz(29, PL_reg_poscache, size, char);
2612 PerlIO_printf(Perl_debug_log,
2613 "%sDetected a super-linear match, switching on caching%s...\n",
2614 PL_colors[4], PL_colors[5])
2617 if (PL_reg_leftiter < 0) {
2618 I32 o = locinput - PL_bostr, b;
2620 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2623 if (PL_reg_poscache[o] & (1<<b)) {
2625 PerlIO_printf(Perl_debug_log,
2626 "%*s already tried at this position...\n",
2627 REPORT_CODE_OFF+PL_regindent*2, "")
2631 PL_reg_poscache[o] |= (1<<b);
2635 /* Prefer next over scan for minimal matching. */
2638 PL_regcc = cc->oldcc;
2641 cp = regcppush(cc->parenfloor);
2643 if (regmatch(cc->next)) {
2645 sayYES; /* All done. */
2653 if (n >= cc->max) { /* Maximum greed exceeded? */
2654 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2655 && !(PL_reg_flags & RF_warned)) {
2656 PL_reg_flags |= RF_warned;
2657 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2658 "Complex regular subexpression recursion",
2665 PerlIO_printf(Perl_debug_log,
2666 "%*s trying longer...\n",
2667 REPORT_CODE_OFF+PL_regindent*2, "")
2669 /* Try scanning more and see if it helps. */
2670 PL_reginput = locinput;
2672 cc->lastloc = locinput;
2673 cp = regcppush(cc->parenfloor);
2675 if (regmatch(cc->scan)) {
2682 cc->lastloc = lastloc;
2686 /* Prefer scan over next for maximal matching. */
2688 if (n < cc->max) { /* More greed allowed? */
2689 cp = regcppush(cc->parenfloor);
2691 cc->lastloc = locinput;
2693 if (regmatch(cc->scan)) {
2698 regcppop(); /* Restore some previous $<digit>s? */
2699 PL_reginput = locinput;
2701 PerlIO_printf(Perl_debug_log,
2702 "%*s failed, try continuation...\n",
2703 REPORT_CODE_OFF+PL_regindent*2, "")
2706 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2707 && !(PL_reg_flags & RF_warned)) {
2708 PL_reg_flags |= RF_warned;
2709 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2710 "Complex regular subexpression recursion",
2714 /* Failed deeper matches of scan, so see if this one works. */
2715 PL_regcc = cc->oldcc;
2718 if (regmatch(cc->next))
2724 cc->lastloc = lastloc;
2729 next = scan + ARG(scan);
2732 inner = NEXTOPER(NEXTOPER(scan));
2735 inner = NEXTOPER(scan);
2740 if (OP(next) != c1) /* No choice. */
2741 next = inner; /* Avoid recursion. */
2743 int lastparen = *PL_reglastparen;
2747 PL_reginput = locinput;
2748 if (regmatch(inner))
2751 for (n = *PL_reglastparen; n > lastparen; n--)
2753 *PL_reglastparen = n;
2756 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2760 inner = NEXTOPER(scan);
2761 if (c1 == BRANCHJ) {
2762 inner = NEXTOPER(inner);
2764 } while (scan != NULL && OP(scan) == c1);
2778 /* We suppose that the next guy does not need
2779 backtracking: in particular, it is of constant length,
2780 and has no parenths to influence future backrefs. */
2781 ln = ARG1(scan); /* min to match */
2782 n = ARG2(scan); /* max to match */
2783 paren = scan->flags;
2785 if (paren > PL_regsize)
2787 if (paren > *PL_reglastparen)
2788 *PL_reglastparen = paren;
2790 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2792 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2793 PL_reginput = locinput;
2796 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2798 if (ln && l == 0 && n >= ln
2799 /* In fact, this is tricky. If paren, then the
2800 fact that we did/didnot match may influence
2801 future execution. */
2802 && !(paren && ln == 0))
2804 locinput = PL_reginput;
2805 if (PL_regkind[(U8)OP(next)] == EXACT) {
2806 c1 = (U8)*STRING(next);
2807 if (OP(next) == EXACTF)
2809 else if (OP(next) == EXACTFL)
2810 c2 = PL_fold_locale[c1];
2817 /* This may be improved if l == 0. */
2818 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2819 /* If it could work, try it. */
2821 UCHARAT(PL_reginput) == c1 ||
2822 UCHARAT(PL_reginput) == c2)
2826 PL_regstartp[paren] =
2827 HOPc(PL_reginput, -l) - PL_bostr;
2828 PL_regendp[paren] = PL_reginput - PL_bostr;
2831 PL_regendp[paren] = -1;
2837 /* Couldn't or didn't -- move forward. */
2838 PL_reginput = locinput;
2839 if (regrepeat_hard(scan, 1, &l)) {
2841 locinput = PL_reginput;
2848 n = regrepeat_hard(scan, n, &l);
2849 if (n != 0 && l == 0
2850 /* In fact, this is tricky. If paren, then the
2851 fact that we did/didnot match may influence
2852 future execution. */
2853 && !(paren && ln == 0))
2855 locinput = PL_reginput;
2857 PerlIO_printf(Perl_debug_log,
2858 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2859 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2863 if (PL_regkind[(U8)OP(next)] == EXACT) {
2864 c1 = (U8)*STRING(next);
2865 if (OP(next) == EXACTF)
2867 else if (OP(next) == EXACTFL)
2868 c2 = PL_fold_locale[c1];
2877 /* If it could work, try it. */
2879 UCHARAT(PL_reginput) == c1 ||
2880 UCHARAT(PL_reginput) == c2)
2883 PerlIO_printf(Perl_debug_log,
2884 "%*s trying tail with n=%"IVdf"...\n",
2885 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2889 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2890 PL_regendp[paren] = PL_reginput - PL_bostr;
2893 PL_regendp[paren] = -1;
2899 /* Couldn't or didn't -- back up. */
2901 locinput = HOPc(locinput, -l);
2902 PL_reginput = locinput;
2909 paren = scan->flags; /* Which paren to set */
2910 if (paren > PL_regsize)
2912 if (paren > *PL_reglastparen)
2913 *PL_reglastparen = paren;
2914 ln = ARG1(scan); /* min to match */
2915 n = ARG2(scan); /* max to match */
2916 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2920 ln = ARG1(scan); /* min to match */
2921 n = ARG2(scan); /* max to match */
2922 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2927 scan = NEXTOPER(scan);
2933 scan = NEXTOPER(scan);
2937 * Lookahead to avoid useless match attempts
2938 * when we know what character comes next.
2940 if (PL_regkind[(U8)OP(next)] == EXACT) {
2941 c1 = (U8)*STRING(next);
2942 if (OP(next) == EXACTF)
2944 else if (OP(next) == EXACTFL)
2945 c2 = PL_fold_locale[c1];
2951 PL_reginput = locinput;
2955 if (ln && regrepeat(scan, ln) < ln)
2957 locinput = PL_reginput;
2960 char *e = locinput + n - ln; /* Should not check after this */
2961 char *old = locinput;
2963 if (e >= PL_regeol || (n == REG_INFTY))
2966 /* Find place 'next' could work */
2968 while (locinput <= e && *locinput != c1)
2971 while (locinput <= e
2978 /* PL_reginput == old now */
2979 if (locinput != old) {
2980 ln = 1; /* Did some */
2981 if (regrepeat(scan, locinput - old) <
2985 /* PL_reginput == locinput now */
2988 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2989 PL_regendp[paren] = locinput - PL_bostr;
2992 PL_regendp[paren] = -1;
2996 PL_reginput = locinput; /* Could be reset... */
2998 /* Couldn't or didn't -- move forward. */
3003 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3004 /* If it could work, try it. */
3006 UCHARAT(PL_reginput) == c1 ||
3007 UCHARAT(PL_reginput) == c2)
3011 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3012 PL_regendp[paren] = PL_reginput - PL_bostr;
3015 PL_regendp[paren] = -1;
3021 /* Couldn't or didn't -- move forward. */
3022 PL_reginput = locinput;
3023 if (regrepeat(scan, 1)) {
3025 locinput = PL_reginput;
3033 n = regrepeat(scan, n);
3034 locinput = PL_reginput;
3035 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3036 (!PL_multiline || OP(next) == SEOL))
3037 ln = n; /* why back off? */
3041 /* If it could work, try it. */
3043 UCHARAT(PL_reginput) == c1 ||
3044 UCHARAT(PL_reginput) == c2)
3048 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3049 PL_regendp[paren] = PL_reginput - PL_bostr;
3052 PL_regendp[paren] = -1;
3058 /* Couldn't or didn't -- back up. */
3060 PL_reginput = locinput = HOPc(locinput, -1);
3065 /* If it could work, try it. */
3067 UCHARAT(PL_reginput) == c1 ||
3068 UCHARAT(PL_reginput) == c2)
3074 /* Couldn't or didn't -- back up. */
3076 PL_reginput = locinput = HOPc(locinput, -1);
3083 if (PL_reg_call_cc) {
3084 re_cc_state *cur_call_cc = PL_reg_call_cc;
3085 CURCUR *cctmp = PL_regcc;
3086 regexp *re = PL_reg_re;
3087 CHECKPOINT cp, lastcp;
3089 cp = regcppush(0); /* Save *all* the positions. */
3091 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3093 PL_reginput = locinput; /* Make position available to
3095 cache_re(PL_reg_call_cc->re);
3096 PL_regcc = PL_reg_call_cc->cc;
3097 PL_reg_call_cc = PL_reg_call_cc->prev;
3098 if (regmatch(cur_call_cc->node)) {
3099 PL_reg_call_cc = cur_call_cc;
3105 PL_reg_call_cc = cur_call_cc;
3111 PerlIO_printf(Perl_debug_log,
3112 "%*s continuation failed...\n",
3113 REPORT_CODE_OFF+PL_regindent*2, "")
3117 if (locinput < PL_regtill) {
3118 DEBUG_r(PerlIO_printf(Perl_debug_log,
3119 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3121 (long)(locinput - PL_reg_starttry),
3122 (long)(PL_regtill - PL_reg_starttry),
3124 sayNO_FINAL; /* Cannot match: too short. */
3126 PL_reginput = locinput; /* put where regtry can find it */
3127 sayYES_FINAL; /* Success! */
3129 PL_reginput = locinput; /* put where regtry can find it */
3130 sayYES_LOUD; /* Success! */
3133 PL_reginput = locinput;
3138 if (UTF) { /* XXXX This is absolutely
3139 broken, we read before
3141 s = HOPMAYBEc(locinput, -scan->flags);
3147 if (locinput < PL_bostr + scan->flags)
3149 PL_reginput = locinput - scan->flags;
3154 PL_reginput = locinput;
3159 if (UTF) { /* XXXX This is absolutely
3160 broken, we read before
3162 s = HOPMAYBEc(locinput, -scan->flags);
3163 if (!s || s < PL_bostr)
3168 if (locinput < PL_bostr + scan->flags)
3170 PL_reginput = locinput - scan->flags;
3175 PL_reginput = locinput;
3178 inner = NEXTOPER(NEXTOPER(scan));
3179 if (regmatch(inner) != n) {
3194 if (OP(scan) == SUSPEND) {
3195 locinput = PL_reginput;
3196 nextchr = UCHARAT(locinput);
3201 next = scan + ARG(scan);
3206 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3207 PTR2UV(scan), OP(scan));
3208 Perl_croak(aTHX_ "regexp memory corruption");
3214 * We get here only if there's trouble -- normally "case END" is
3215 * the terminating point.
3217 Perl_croak(aTHX_ "corrupted regexp pointers");
3223 PerlIO_printf(Perl_debug_log,
3224 "%*s %scould match...%s\n",
3225 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3229 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3230 PL_colors[4],PL_colors[5]));
3239 PerlIO_printf(Perl_debug_log,
3240 "%*s %sfailed...%s\n",
3241 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3253 - regrepeat - repeatedly match something simple, report how many
3256 * [This routine now assumes that it will only match on things of length 1.
3257 * That was true before, but now we assume scan - reginput is the count,
3258 * rather than incrementing count on every character. [Er, except utf8.]]
3261 S_regrepeat(pTHX_ regnode *p, I32 max)
3264 register char *scan;
3266 register char *loceol = PL_regeol;
3267 register I32 hardcount = 0;
3270 if (max != REG_INFTY && max < loceol - scan)
3271 loceol = scan + max;
3274 while (scan < loceol && *scan != '\n')
3282 while (scan < loceol && *scan != '\n') {
3283 scan += UTF8SKIP(scan);
3289 while (scan < loceol) {
3290 scan += UTF8SKIP(scan);
3294 case EXACT: /* length of string is 1 */
3296 while (scan < loceol && UCHARAT(scan) == c)
3299 case EXACTF: /* length of string is 1 */
3301 while (scan < loceol &&
3302 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3305 case EXACTFL: /* length of string is 1 */
3306 PL_reg_flags |= RF_tainted;
3308 while (scan < loceol &&
3309 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3314 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3315 scan += UTF8SKIP(scan);
3320 while (scan < loceol && REGINCLASS(p, *scan))
3324 while (scan < loceol && isALNUM(*scan))
3329 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3330 scan += UTF8SKIP(scan);
3335 PL_reg_flags |= RF_tainted;
3336 while (scan < loceol && isALNUM_LC(*scan))
3340 PL_reg_flags |= RF_tainted;
3342 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3343 scan += UTF8SKIP(scan);
3349 while (scan < loceol && !isALNUM(*scan))
3354 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3355 scan += UTF8SKIP(scan);
3360 PL_reg_flags |= RF_tainted;
3361 while (scan < loceol && !isALNUM_LC(*scan))
3365 PL_reg_flags |= RF_tainted;
3367 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3368 scan += UTF8SKIP(scan);
3373 while (scan < loceol && isSPACE(*scan))
3378 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3379 scan += UTF8SKIP(scan);
3384 PL_reg_flags |= RF_tainted;
3385 while (scan < loceol && isSPACE_LC(*scan))
3389 PL_reg_flags |= RF_tainted;
3391 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3392 scan += UTF8SKIP(scan);
3397 while (scan < loceol && !isSPACE(*scan))
3402 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3403 scan += UTF8SKIP(scan);
3408 PL_reg_flags |= RF_tainted;
3409 while (scan < loceol && !isSPACE_LC(*scan))
3413 PL_reg_flags |= RF_tainted;
3415 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3416 scan += UTF8SKIP(scan);
3421 while (scan < loceol && isDIGIT(*scan))
3426 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3427 scan += UTF8SKIP(scan);
3433 while (scan < loceol && !isDIGIT(*scan))
3438 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3439 scan += UTF8SKIP(scan);
3443 default: /* Called on something of 0 width. */
3444 break; /* So match right here or not at all. */
3450 c = scan - PL_reginput;
3455 SV *prop = sv_newmortal();
3458 PerlIO_printf(Perl_debug_log,
3459 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3460 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3467 - regrepeat_hard - repeatedly match something, report total lenth and length
3469 * The repeater is supposed to have constant length.
3473 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3476 register char *scan;
3477 register char *start;
3478 register char *loceol = PL_regeol;
3480 I32 count = 0, res = 1;
3485 start = PL_reginput;
3487 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3490 while (start < PL_reginput) {
3492 start += UTF8SKIP(start);
3503 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3505 *lp = l = PL_reginput - start;
3506 if (max != REG_INFTY && l*max < loceol - scan)
3507 loceol = scan + l*max;
3520 - reginclass - determine if a character falls into a character class
3524 S_reginclass(pTHX_ register regnode *p, register I32 c)
3527 char flags = ANYOF_FLAGS(p);
3531 if (ANYOF_BITMAP_TEST(p, c))
3533 else if (flags & ANYOF_FOLD) {
3535 if (flags & ANYOF_LOCALE) {
3536 PL_reg_flags |= RF_tainted;
3537 cf = PL_fold_locale[c];
3541 if (ANYOF_BITMAP_TEST(p, cf))
3545 if (!match && (flags & ANYOF_CLASS)) {
3546 PL_reg_flags |= RF_tainted;
3548 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3549 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3550 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3551 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3552 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3553 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3554 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3555 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3556 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3557 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3558 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3559 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3560 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3561 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3562 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3563 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3564 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3565 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3566 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3567 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3568 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3569 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3570 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3571 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3572 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3573 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3574 ) /* How's that for a conditional? */
3580 return (flags & ANYOF_INVERT) ? !match : match;
3584 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3587 char flags = ARG1(f);
3589 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3591 if (swash_fetch(sv, p))
3593 else if (flags & ANYOF_FOLD) {
3596 if (flags & ANYOF_LOCALE) {
3597 PL_reg_flags |= RF_tainted;
3598 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3601 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3602 if (swash_fetch(sv, tmpbuf))
3606 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3608 return (flags & ANYOF_INVERT) ? !match : match;
3612 S_reghop(pTHX_ U8 *s, I32 off)
3616 while (off-- && s < (U8*)PL_regeol)
3621 if (s > (U8*)PL_bostr) {
3624 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3626 } /* XXX could check well-formedness here */
3634 S_reghopmaybe(pTHX_ U8* s, I32 off)
3638 while (off-- && s < (U8*)PL_regeol)
3645 if (s > (U8*)PL_bostr) {
3648 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3650 } /* XXX could check well-formedness here */
3666 restore_pos(pTHXo_ void *arg)
3669 if (PL_reg_eval_set) {
3670 if (PL_reg_oldsaved) {
3671 PL_reg_re->subbeg = PL_reg_oldsaved;
3672 PL_reg_re->sublen = PL_reg_oldsavedlen;
3673 RX_MATCH_COPIED_on(PL_reg_re);
3675 PL_reg_magic->mg_len = PL_reg_oldpos;
3676 PL_reg_eval_set = 0;
3677 PL_curpm = PL_reg_oldcurpm;