5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 && (sv && (strpos + SvCUR(sv) != strend)) ) {
350 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
353 PL_regeol = strend; /* Used in HOP() */
354 s = HOPc(strpos, prog->check_offset_min);
356 slen = SvCUR(check); /* >= 1 */
358 if ( strend - s > slen || strend - s < slen - 1
359 || (strend - s == slen && strend[-1] != '\n')) {
360 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
363 /* Now should match s[0..slen-2] */
365 if (slen && (*SvPVX(check) != *s
367 && memNE(SvPVX(check), s, slen)))) {
369 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
373 else if (*SvPVX(check) != *s
374 || ((slen = SvCUR(check)) > 1
375 && memNE(SvPVX(check), s, slen)))
377 goto success_at_start;
379 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
381 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
382 end_shift = prog->minlen - start_shift -
383 CHR_SVLEN(check) + (SvTAIL(check) != 0);
385 I32 end = prog->check_offset_max + CHR_SVLEN(check)
386 - (SvTAIL(check) != 0);
387 I32 eshift = strend - s - end;
389 if (end_shift < eshift)
393 else { /* Can match at random position */
396 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
397 /* Should be nonnegative! */
398 end_shift = prog->minlen - start_shift -
399 CHR_SVLEN(check) + (SvTAIL(check) != 0);
402 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
404 Perl_croak(aTHX_ "panic: end_shift");
408 /* Find a possible match in the region s..strend by looking for
409 the "check" substring in the region corrected by start/end_shift. */
410 if (flags & REXEC_SCREAM) {
411 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
412 I32 p = -1; /* Internal iterator of scream. */
413 I32 *pp = data ? data->scream_pos : &p;
415 if (PL_screamfirst[BmRARE(check)] >= 0
416 || ( BmRARE(check) == '\n'
417 && (BmPREVIOUS(check) == SvCUR(check) - 1)
419 s = screaminstr(sv, check,
420 start_shift + (s - strbeg), end_shift, pp, 0);
424 *data->scream_olds = s;
427 s = fbm_instr((unsigned char*)s + start_shift,
428 (unsigned char*)strend - end_shift,
429 check, PL_multiline ? FBMrf_MULTILINE : 0);
431 /* Update the count-of-usability, remove useless subpatterns,
434 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
435 (s ? "Found" : "Did not find"),
436 ((check == prog->anchored_substr) ? "anchored" : "floating"),
438 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
440 PL_colors[1], (SvTAIL(check) ? "$" : ""),
441 (s ? " at offset " : "...\n") ) );
448 /* Finish the diagnostic message */
449 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
451 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
452 Start with the other substr.
453 XXXX no SCREAM optimization yet - and a very coarse implementation
454 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
455 *always* match. Probably should be marked during compile...
456 Probably it is right to do no SCREAM here...
459 if (prog->float_substr && prog->anchored_substr) {
460 /* Take into account the "other" substring. */
461 /* XXXX May be hopelessly wrong for UTF... */
464 if (check == prog->float_substr) {
467 char *last = s - start_shift, *last1, *last2;
471 t = s - prog->check_offset_max;
472 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
473 && (!(prog->reganch & ROPT_UTF8)
474 || (PL_bostr = strpos, /* Used in regcopmaybe() */
475 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
480 t += prog->anchored_offset;
481 if (t < other_last) /* These positions already checked */
484 last2 = last1 = strend - prog->minlen;
487 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
488 /* On end-of-str: see comment below. */
489 s = fbm_instr((unsigned char*)t,
490 (unsigned char*)last1 + prog->anchored_offset
491 + SvCUR(prog->anchored_substr)
492 - (SvTAIL(prog->anchored_substr)!=0),
493 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
494 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
495 (s ? "Found" : "Contradicts"),
497 (int)(SvCUR(prog->anchored_substr)
498 - (SvTAIL(prog->anchored_substr)!=0)),
499 SvPVX(prog->anchored_substr),
500 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
502 if (last1 >= last2) {
503 DEBUG_r(PerlIO_printf(Perl_debug_log,
504 ", giving up...\n"));
507 DEBUG_r(PerlIO_printf(Perl_debug_log,
508 ", trying floating at offset %ld...\n",
509 (long)(s1 + 1 - i_strpos)));
510 PL_regeol = strend; /* Used in HOP() */
511 other_last = last1 + prog->anchored_offset + 1;
516 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
517 (long)(s - i_strpos)));
518 t = s - prog->anchored_offset;
527 else { /* Take into account the floating substring. */
532 last1 = last = strend - prog->minlen + prog->float_min_offset;
533 if (last - t > prog->float_max_offset)
534 last = t + prog->float_max_offset;
535 s = t + prog->float_min_offset;
538 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
539 /* fbm_instr() takes into account exact value of end-of-str
540 if the check is SvTAIL(ed). Since false positives are OK,
541 and end-of-str is not later than strend we are OK. */
542 s = fbm_instr((unsigned char*)s,
543 (unsigned char*)last + SvCUR(prog->float_substr)
544 - (SvTAIL(prog->float_substr)!=0),
545 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
546 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
547 (s ? "Found" : "Contradicts"),
549 (int)(SvCUR(prog->float_substr)
550 - (SvTAIL(prog->float_substr)!=0)),
551 SvPVX(prog->float_substr),
552 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
555 DEBUG_r(PerlIO_printf(Perl_debug_log,
556 ", giving up...\n"));
559 DEBUG_r(PerlIO_printf(Perl_debug_log,
560 ", trying anchored starting at offset %ld...\n",
561 (long)(s1 + 1 - i_strpos)));
562 other_last = last + 1;
563 PL_regeol = strend; /* Used in HOP() */
568 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
569 (long)(s - i_strpos)));
579 t = s - prog->check_offset_max;
581 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
582 && (!(prog->reganch & ROPT_UTF8)
583 || (PL_bostr = strpos, /* Used in regcopmaybe() */
584 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
587 /* Fixed substring is found far enough so that the match
588 cannot start at strpos. */
590 if (ml_anch && t[-1] != '\n') {
591 /* Eventually fbm_*() should handle this, but often
592 anchored_offset is not 0, so this check will not be wasted. */
593 /* XXXX In the code below we prefer to look for "^" even in
594 presence of anchored substrings. And we search even
595 beyond the found float position. These pessimizations
596 are historical artefacts only. */
598 while (t < strend - prog->minlen) {
600 if (t < s - prog->check_offset_min) {
601 if (prog->anchored_substr) {
602 /* We definitely contradict the found anchored
603 substr. Due to the above check we do not
604 contradict "check" substr.
605 Thus we can arrive here only if check substr
606 is float. Redo checking for "other"=="fixed".
609 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
610 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
611 goto do_other_anchored;
614 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
615 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
618 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
619 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
625 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
626 PL_colors[0],PL_colors[1]));
631 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
635 /* The found string does not prohibit matching at beg-of-str
636 - no optimization of calling REx engine can be performed,
637 unless it was an MBOL and we are not after MBOL. */
639 /* Even in this situation we may use MBOL flag if strpos is offset
640 wrt the start of the string. */
642 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
643 /* May be due to an implicit anchor of m{.*foo} */
644 && !(prog->reganch & ROPT_IMPLICIT))
649 DEBUG_r( if (ml_anch)
650 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
651 PL_colors[0],PL_colors[1]);
654 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
655 && prog->check_substr /* Could be deleted already */
656 && --BmUSEFUL(prog->check_substr) < 0
657 && prog->check_substr == prog->float_substr)
659 /* If flags & SOMETHING - do not do it many times on the same match */
660 SvREFCNT_dec(prog->check_substr);
661 prog->check_substr = Nullsv; /* disable */
662 prog->float_substr = Nullsv; /* clear */
664 /* XXXX This is a remnant of the old implementation. It
665 looks wasteful, since now INTUIT can use many
667 prog->reganch &= ~RE_USE_INTUIT;
674 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
675 if (prog->regstclass) {
676 /* minlen == 0 is possible if regstclass is \b or \B,
677 and the fixed substr is ''$.
678 Since minlen is already taken into account, s+1 is before strend;
679 accidentally, minlen >= 1 guaranties no false positives at s + 1
680 even for \b or \B. But (minlen? 1 : 0) below assumes that
681 regstclass does not come from lookahead... */
682 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
683 This leaves EXACTF only, which is dealt with in find_byclass(). */
684 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
685 ? STR_LEN(prog->regstclass)
687 char *endpos = (prog->anchored_substr || ml_anch)
688 ? s + (prog->minlen? cl_l : 0)
689 : (prog->float_substr ? check_at - start_shift + cl_l
691 char *startpos = sv && SvPOK(sv) ? strend - SvCUR(sv) : s;
694 if (prog->reganch & ROPT_UTF8) {
695 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
698 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
703 if (endpos == strend) {
704 DEBUG_r( PerlIO_printf(Perl_debug_log,
705 "Could not match STCLASS...\n") );
708 DEBUG_r( PerlIO_printf(Perl_debug_log,
709 "This position contradicts STCLASS...\n") );
710 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
712 /* Contradict one of substrings */
713 if (prog->anchored_substr) {
714 if (prog->anchored_substr == check) {
715 DEBUG_r( what = "anchored" );
717 PL_regeol = strend; /* Used in HOP() */
719 if (s + start_shift + end_shift > strend) {
720 /* XXXX Should be taken into account earlier? */
721 DEBUG_r( PerlIO_printf(Perl_debug_log,
722 "Could not match STCLASS...\n") );
725 DEBUG_r( PerlIO_printf(Perl_debug_log,
726 "Trying %s substr starting at offset %ld...\n",
727 what, (long)(s + start_shift - i_strpos)) );
730 /* Have both, check_string is floating */
731 if (t + start_shift >= check_at) /* Contradicts floating=check */
732 goto retry_floating_check;
733 /* Recheck anchored substring, but not floating... */
735 DEBUG_r( PerlIO_printf(Perl_debug_log,
736 "Trying anchored substr starting at offset %ld...\n",
737 (long)(other_last - i_strpos)) );
738 goto do_other_anchored;
740 /* Another way we could have checked stclass at the
741 current position only: */
744 DEBUG_r( PerlIO_printf(Perl_debug_log,
745 "Trying /^/m starting at offset %ld...\n",
746 (long)(t - i_strpos)) );
749 if (!prog->float_substr) /* Could have been deleted */
751 /* Check is floating subtring. */
752 retry_floating_check:
753 t = check_at - start_shift;
754 DEBUG_r( what = "floating" );
755 goto hop_and_restart;
758 PerlIO_printf(Perl_debug_log,
759 "By STCLASS: moving %ld --> %ld\n",
760 (long)(t - i_strpos), (long)(s - i_strpos));
762 PerlIO_printf(Perl_debug_log,
763 "Does not contradict STCLASS...\n") );
765 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
766 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
769 fail_finish: /* Substring not found */
770 if (prog->check_substr) /* could be removed already */
771 BmUSEFUL(prog->check_substr) += 5; /* hooray */
773 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
774 PL_colors[4],PL_colors[5]));
778 /* We know what class REx starts with. Try to find this position... */
780 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
782 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
788 register I32 tmp = 1; /* Scratch variable? */
790 /* We know what class it must start with. */
794 if (REGINCLASSUTF8(c, (U8*)s)) {
795 if (tmp && (norun || regtry(prog, s)))
807 if (REGINCLASS(c, *(U8*)s)) {
808 if (tmp && (norun || regtry(prog, s)))
828 c2 = PL_fold_locale[c1];
833 e = s; /* Due to minlen logic of intuit() */
834 /* Here it is NOT UTF! */
838 && (ln == 1 || !(OP(c) == EXACTF
840 : ibcmp_locale(s, m, ln)))
841 && (norun || regtry(prog, s)) )
847 if ( (*(U8*)s == c1 || *(U8*)s == c2)
848 && (ln == 1 || !(OP(c) == EXACTF
850 : ibcmp_locale(s, m, ln)))
851 && (norun || regtry(prog, s)) )
858 PL_reg_flags |= RF_tainted;
861 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
862 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
864 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
866 if ((norun || regtry(prog, s)))
871 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
875 PL_reg_flags |= RF_tainted;
878 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
879 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
881 if (tmp == !(OP(c) == BOUNDUTF8 ?
882 swash_fetch(PL_utf8_alnum, (U8*)s) :
883 isALNUM_LC_utf8((U8*)s)))
886 if ((norun || regtry(prog, s)))
891 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
895 PL_reg_flags |= RF_tainted;
898 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
899 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
901 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
903 else if ((norun || regtry(prog, s)))
907 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
911 PL_reg_flags |= RF_tainted;
914 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
915 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
917 if (tmp == !(OP(c) == NBOUNDUTF8 ?
918 swash_fetch(PL_utf8_alnum, (U8*)s) :
919 isALNUM_LC_utf8((U8*)s)))
921 else if ((norun || regtry(prog, s)))
925 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
931 if (tmp && (norun || regtry(prog, s)))
943 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
944 if (tmp && (norun || regtry(prog, s)))
955 PL_reg_flags |= RF_tainted;
957 if (isALNUM_LC(*s)) {
958 if (tmp && (norun || regtry(prog, s)))
969 PL_reg_flags |= RF_tainted;
971 if (isALNUM_LC_utf8((U8*)s)) {
972 if (tmp && (norun || regtry(prog, s)))
985 if (tmp && (norun || regtry(prog, s)))
997 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
998 if (tmp && (norun || regtry(prog, s)))
1009 PL_reg_flags |= RF_tainted;
1010 while (s < strend) {
1011 if (!isALNUM_LC(*s)) {
1012 if (tmp && (norun || regtry(prog, s)))
1023 PL_reg_flags |= RF_tainted;
1024 while (s < strend) {
1025 if (!isALNUM_LC_utf8((U8*)s)) {
1026 if (tmp && (norun || regtry(prog, s)))
1037 while (s < strend) {
1039 if (tmp && (norun || regtry(prog, s)))
1050 while (s < strend) {
1051 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1052 if (tmp && (norun || regtry(prog, s)))
1063 PL_reg_flags |= RF_tainted;
1064 while (s < strend) {
1065 if (isSPACE_LC(*s)) {
1066 if (tmp && (norun || regtry(prog, s)))
1077 PL_reg_flags |= RF_tainted;
1078 while (s < strend) {
1079 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1080 if (tmp && (norun || regtry(prog, s)))
1091 while (s < strend) {
1093 if (tmp && (norun || regtry(prog, s)))
1104 while (s < strend) {
1105 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1106 if (tmp && (norun || regtry(prog, s)))
1117 PL_reg_flags |= RF_tainted;
1118 while (s < strend) {
1119 if (!isSPACE_LC(*s)) {
1120 if (tmp && (norun || regtry(prog, s)))
1131 PL_reg_flags |= RF_tainted;
1132 while (s < strend) {
1133 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1134 if (tmp && (norun || regtry(prog, s)))
1145 while (s < strend) {
1147 if (tmp && (norun || regtry(prog, s)))
1158 while (s < strend) {
1159 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1160 if (tmp && (norun || regtry(prog, s)))
1171 PL_reg_flags |= RF_tainted;
1172 while (s < strend) {
1173 if (isDIGIT_LC(*s)) {
1174 if (tmp && (norun || regtry(prog, s)))
1185 PL_reg_flags |= RF_tainted;
1186 while (s < strend) {
1187 if (isDIGIT_LC_utf8((U8*)s)) {
1188 if (tmp && (norun || regtry(prog, s)))
1199 while (s < strend) {
1201 if (tmp && (norun || regtry(prog, s)))
1212 while (s < strend) {
1213 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1214 if (tmp && (norun || regtry(prog, s)))
1225 PL_reg_flags |= RF_tainted;
1226 while (s < strend) {
1227 if (!isDIGIT_LC(*s)) {
1228 if (tmp && (norun || regtry(prog, s)))
1239 PL_reg_flags |= RF_tainted;
1240 while (s < strend) {
1241 if (!isDIGIT_LC_utf8((U8*)s)) {
1242 if (tmp && (norun || regtry(prog, s)))
1253 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1262 - regexec_flags - match a regexp against a string
1265 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1266 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1267 /* strend: pointer to null at end of string */
1268 /* strbeg: real beginning of string */
1269 /* minend: end of match must be >=minend after stringarg. */
1270 /* data: May be used for some additional optimizations. */
1271 /* nosave: For optimizations. */
1275 register regnode *c;
1276 register char *startpos = stringarg;
1277 I32 minlen; /* must match at least this many chars */
1278 I32 dontbother = 0; /* how many characters not to try at end */
1279 /* I32 start_shift = 0; */ /* Offset of the start to find
1280 constant substr. */ /* CC */
1281 I32 end_shift = 0; /* Same for the end. */ /* CC */
1282 I32 scream_pos = -1; /* Internal iterator of scream. */
1284 SV* oreplsv = GvSV(PL_replgv);
1290 PL_regnarrate = PL_debug & 512;
1293 /* Be paranoid... */
1294 if (prog == NULL || startpos == NULL) {
1295 Perl_croak(aTHX_ "NULL regexp parameter");
1299 minlen = prog->minlen;
1300 if (strend - startpos < minlen) goto phooey;
1302 if (startpos == strbeg) /* is ^ valid at stringarg? */
1305 PL_regprev = (U32)stringarg[-1];
1306 if (!PL_multiline && PL_regprev == '\n')
1307 PL_regprev = '\0'; /* force ^ to NOT match */
1310 /* Check validity of program. */
1311 if (UCHARAT(prog->program) != REG_MAGIC) {
1312 Perl_croak(aTHX_ "corrupted regexp program");
1316 PL_reg_eval_set = 0;
1319 if (prog->reganch & ROPT_UTF8)
1320 PL_reg_flags |= RF_utf8;
1322 /* Mark beginning of line for ^ and lookbehind. */
1323 PL_regbol = startpos;
1327 /* Mark end of line for $ (and such) */
1330 /* see how far we have to get to not match where we matched before */
1331 PL_regtill = startpos+minend;
1333 /* We start without call_cc context. */
1336 /* If there is a "must appear" string, look for it. */
1339 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1342 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1343 PL_reg_ganch = startpos;
1344 else if (sv && SvTYPE(sv) >= SVt_PVMG
1346 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1347 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1348 if (prog->reganch & ROPT_ANCH_GPOS) {
1349 if (s > PL_reg_ganch)
1354 else /* pos() not defined */
1355 PL_reg_ganch = strbeg;
1358 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1359 re_scream_pos_data d;
1361 d.scream_olds = &scream_olds;
1362 d.scream_pos = &scream_pos;
1363 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1365 goto phooey; /* not present */
1368 DEBUG_r( if (!PL_colorset) reginitcolors() );
1369 DEBUG_r(PerlIO_printf(Perl_debug_log,
1370 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1371 PL_colors[4],PL_colors[5],PL_colors[0],
1374 (strlen(prog->precomp) > 60 ? "..." : ""),
1376 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1377 startpos, PL_colors[1],
1378 (strend - startpos > 60 ? "..." : ""))
1381 /* Simplest case: anchored match need be tried only once. */
1382 /* [unless only anchor is BOL and multiline is set] */
1383 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1384 if (s == startpos && regtry(prog, startpos))
1386 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1387 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1392 dontbother = minlen - 1;
1393 end = HOPc(strend, -dontbother) - 1;
1394 /* for multiline we only have to try after newlines */
1395 if (prog->check_substr) {
1399 if (regtry(prog, s))
1404 if (prog->reganch & RE_USE_INTUIT) {
1405 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1416 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1417 if (regtry(prog, s))
1424 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1425 if (regtry(prog, PL_reg_ganch))
1430 /* Messy cases: unanchored match. */
1431 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1432 /* we have /x+whatever/ */
1433 /* it must be a one character string (XXXX Except UTF?) */
1434 char ch = SvPVX(prog->anchored_substr)[0];
1436 while (s < strend) {
1438 if (regtry(prog, s)) goto got_it;
1440 while (s < strend && *s == ch)
1447 while (s < strend) {
1449 if (regtry(prog, s)) goto got_it;
1451 while (s < strend && *s == ch)
1459 else if (prog->anchored_substr != Nullsv
1460 || (prog->float_substr != Nullsv
1461 && prog->float_max_offset < strend - s)) {
1462 SV *must = prog->anchored_substr
1463 ? prog->anchored_substr : prog->float_substr;
1465 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1467 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1468 char *last = HOPc(strend, /* Cannot start after this */
1469 -(I32)(CHR_SVLEN(must)
1470 - (SvTAIL(must) != 0) + back_min));
1471 char *last1; /* Last position checked before */
1474 last1 = HOPc(s, -1);
1476 last1 = s - 1; /* bogus */
1478 /* XXXX check_substr already used to find `s', can optimize if
1479 check_substr==must. */
1481 dontbother = end_shift;
1482 strend = HOPc(strend, -dontbother);
1483 while ( (s <= last) &&
1484 ((flags & REXEC_SCREAM)
1485 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1486 end_shift, &scream_pos, 0))
1487 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1488 (unsigned char*)strend, must,
1489 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1490 if (HOPc(s, -back_max) > last1) {
1491 last1 = HOPc(s, -back_min);
1492 s = HOPc(s, -back_max);
1495 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1497 last1 = HOPc(s, -back_min);
1501 while (s <= last1) {
1502 if (regtry(prog, s))
1508 while (s <= last1) {
1509 if (regtry(prog, s))
1517 else if ((c = prog->regstclass)) {
1518 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1519 /* don't bother with what can't match */
1520 strend = HOPc(strend, -(minlen - 1));
1521 if (find_byclass(prog, c, s, strend, startpos, 0))
1526 if (prog->float_substr != Nullsv) { /* Trim the end. */
1529 if (flags & REXEC_SCREAM) {
1530 last = screaminstr(sv, prog->float_substr, s - strbeg,
1531 end_shift, &scream_pos, 1); /* last one */
1533 last = scream_olds; /* Only one occurence. */
1537 char *little = SvPV(prog->float_substr, len);
1539 if (SvTAIL(prog->float_substr)) {
1540 if (memEQ(strend - len + 1, little, len - 1))
1541 last = strend - len + 1;
1542 else if (!PL_multiline)
1543 last = memEQ(strend - len, little, len)
1544 ? strend - len : Nullch;
1550 last = rninstr(s, strend, little, little + len);
1552 last = strend; /* matching `$' */
1555 if (last == NULL) goto phooey; /* Should not happen! */
1556 dontbother = strend - last + prog->float_min_offset;
1558 if (minlen && (dontbother < minlen))
1559 dontbother = minlen - 1;
1560 strend -= dontbother; /* this one's always in bytes! */
1561 /* We don't know much -- general case. */
1564 if (regtry(prog, s))
1573 if (regtry(prog, s))
1575 } while (s++ < strend);
1583 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1585 if (PL_reg_eval_set) {
1586 /* Preserve the current value of $^R */
1587 if (oreplsv != GvSV(PL_replgv))
1588 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1589 restored, the value remains
1591 restore_pos(aTHXo_ 0);
1594 /* make sure $`, $&, $', and $digit will work later */
1595 if ( !(flags & REXEC_NOT_FIRST) ) {
1596 if (RX_MATCH_COPIED(prog)) {
1597 Safefree(prog->subbeg);
1598 RX_MATCH_COPIED_off(prog);
1600 if (flags & REXEC_COPY_STR) {
1601 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1603 s = savepvn(strbeg, i);
1606 RX_MATCH_COPIED_on(prog);
1609 prog->subbeg = strbeg;
1610 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1617 if (PL_reg_eval_set)
1618 restore_pos(aTHXo_ 0);
1623 - regtry - try match at specific point
1625 STATIC I32 /* 0 failure, 1 success */
1626 S_regtry(pTHX_ regexp *prog, char *startpos)
1634 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1637 PL_reg_eval_set = RS_init;
1639 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1640 (IV)(PL_stack_sp - PL_stack_base));
1642 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1643 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1644 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1646 /* Apparently this is not needed, judging by wantarray. */
1647 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1648 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1651 /* Make $_ available to executed code. */
1652 if (PL_reg_sv != DEFSV) {
1653 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1658 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1659 && (mg = mg_find(PL_reg_sv, 'g')))) {
1660 /* prepare for quick setting of pos */
1661 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1662 mg = mg_find(PL_reg_sv, 'g');
1666 PL_reg_oldpos = mg->mg_len;
1667 SAVEDESTRUCTOR_X(restore_pos, 0);
1670 Newz(22,PL_reg_curpm, 1, PMOP);
1671 PL_reg_curpm->op_pmregexp = prog;
1672 PL_reg_oldcurpm = PL_curpm;
1673 PL_curpm = PL_reg_curpm;
1674 if (RX_MATCH_COPIED(prog)) {
1675 /* Here is a serious problem: we cannot rewrite subbeg,
1676 since it may be needed if this match fails. Thus
1677 $` inside (?{}) could fail... */
1678 PL_reg_oldsaved = prog->subbeg;
1679 PL_reg_oldsavedlen = prog->sublen;
1680 RX_MATCH_COPIED_off(prog);
1683 PL_reg_oldsaved = Nullch;
1684 prog->subbeg = PL_bostr;
1685 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1687 prog->startp[0] = startpos - PL_bostr;
1688 PL_reginput = startpos;
1689 PL_regstartp = prog->startp;
1690 PL_regendp = prog->endp;
1691 PL_reglastparen = &prog->lastparen;
1692 prog->lastparen = 0;
1694 DEBUG_r(PL_reg_starttry = startpos);
1695 if (PL_reg_start_tmpl <= prog->nparens) {
1696 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1697 if(PL_reg_start_tmp)
1698 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1700 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1703 /* XXXX What this code is doing here?!!! There should be no need
1704 to do this again and again, PL_reglastparen should take care of
1708 if (prog->nparens) {
1709 for (i = prog->nparens; i >= 1; i--) {
1715 if (regmatch(prog->program + 1)) {
1716 prog->endp[0] = PL_reginput - PL_bostr;
1724 - regmatch - main matching routine
1726 * Conceptually the strategy is simple: check to see whether the current
1727 * node matches, call self recursively to see whether the rest matches,
1728 * and then act accordingly. In practice we make some effort to avoid
1729 * recursion, in particular by going through "ordinary" nodes (that don't
1730 * need to know whether the rest of the match failed) by a loop instead of
1733 /* [lwall] I've hoisted the register declarations to the outer block in order to
1734 * maybe save a little bit of pushing and popping on the stack. It also takes
1735 * advantage of machines that use a register save mask on subroutine entry.
1737 STATIC I32 /* 0 failure, 1 success */
1738 S_regmatch(pTHX_ regnode *prog)
1741 register regnode *scan; /* Current node. */
1742 regnode *next; /* Next node. */
1743 regnode *inner; /* Next node in internal branch. */
1744 register I32 nextchr; /* renamed nextchr - nextchar colides with
1745 function of same name */
1746 register I32 n; /* no or next */
1747 register I32 ln; /* len or last */
1748 register char *s; /* operand or save */
1749 register char *locinput = PL_reginput;
1750 register I32 c1, c2, paren; /* case fold search, parenth */
1751 int minmod = 0, sw = 0, logical = 0;
1756 /* Note that nextchr is a byte even in UTF */
1757 nextchr = UCHARAT(locinput);
1759 while (scan != NULL) {
1760 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1762 # define sayYES goto yes
1763 # define sayNO goto no
1764 # define sayYES_FINAL goto yes_final
1765 # define sayYES_LOUD goto yes_loud
1766 # define sayNO_FINAL goto no_final
1767 # define sayNO_SILENT goto do_no
1768 # define saySAME(x) if (x) goto yes; else goto no
1769 # define REPORT_CODE_OFF 24
1771 # define sayYES return 1
1772 # define sayNO return 0
1773 # define sayYES_FINAL return 1
1774 # define sayYES_LOUD return 1
1775 # define sayNO_FINAL return 0
1776 # define sayNO_SILENT return 0
1777 # define saySAME(x) return x
1780 SV *prop = sv_newmortal();
1781 int docolor = *PL_colors[0];
1782 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1783 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1784 /* The part of the string before starttry has one color
1785 (pref0_len chars), between starttry and current
1786 position another one (pref_len - pref0_len chars),
1787 after the current position the third one.
1788 We assume that pref0_len <= pref_len, otherwise we
1789 decrease pref0_len. */
1790 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1791 ? (5 + taill) - l : locinput - PL_bostr);
1792 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1794 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1795 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1796 ? (5 + taill) - pref_len : PL_regeol - locinput);
1799 if (pref0_len > pref_len)
1800 pref0_len = pref_len;
1801 regprop(prop, scan);
1802 PerlIO_printf(Perl_debug_log,
1803 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1804 (IV)(locinput - PL_bostr),
1805 PL_colors[4], pref0_len,
1806 locinput - pref_len, PL_colors[5],
1807 PL_colors[2], pref_len - pref0_len,
1808 locinput - pref_len + pref0_len, PL_colors[3],
1809 (docolor ? "" : "> <"),
1810 PL_colors[0], l, locinput, PL_colors[1],
1811 15 - l - pref_len + 1,
1813 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1817 next = scan + NEXT_OFF(scan);
1823 if (locinput == PL_bostr
1824 ? PL_regprev == '\n'
1826 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1828 /* regtill = regbol; */
1833 if (locinput == PL_bostr
1834 ? PL_regprev == '\n'
1835 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1841 if (locinput == PL_regbol && PL_regprev == '\n')
1845 if (locinput == PL_reg_ganch)
1855 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1860 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1862 if (PL_regeol - locinput > 1)
1866 if (PL_regeol != locinput)
1870 if (nextchr & 0x80) {
1871 locinput += PL_utf8skip[nextchr];
1872 if (locinput > PL_regeol)
1874 nextchr = UCHARAT(locinput);
1877 if (!nextchr && locinput >= PL_regeol)
1879 nextchr = UCHARAT(++locinput);
1882 if (!nextchr && locinput >= PL_regeol)
1884 nextchr = UCHARAT(++locinput);
1887 if (nextchr & 0x80) {
1888 locinput += PL_utf8skip[nextchr];
1889 if (locinput > PL_regeol)
1891 nextchr = UCHARAT(locinput);
1894 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1896 nextchr = UCHARAT(++locinput);
1899 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1901 nextchr = UCHARAT(++locinput);
1906 /* Inline the first character, for speed. */
1907 if (UCHARAT(s) != nextchr)
1909 if (PL_regeol - locinput < ln)
1911 if (ln > 1 && memNE(s, locinput, ln))
1914 nextchr = UCHARAT(locinput);
1917 PL_reg_flags |= RF_tainted;
1926 c1 = OP(scan) == EXACTF;
1930 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1931 toLOWER_utf8((U8*)l) :
1932 toLOWER_LC_utf8((U8*)l)))
1940 nextchr = UCHARAT(locinput);
1944 /* Inline the first character, for speed. */
1945 if (UCHARAT(s) != nextchr &&
1946 UCHARAT(s) != ((OP(scan) == EXACTF)
1947 ? PL_fold : PL_fold_locale)[nextchr])
1949 if (PL_regeol - locinput < ln)
1951 if (ln > 1 && (OP(scan) == EXACTF
1952 ? ibcmp(s, locinput, ln)
1953 : ibcmp_locale(s, locinput, ln)))
1956 nextchr = UCHARAT(locinput);
1959 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1961 if (locinput >= PL_regeol)
1963 locinput += PL_utf8skip[nextchr];
1964 nextchr = UCHARAT(locinput);
1968 nextchr = UCHARAT(locinput);
1969 if (!REGINCLASS(scan, nextchr))
1971 if (!nextchr && locinput >= PL_regeol)
1973 nextchr = UCHARAT(++locinput);
1976 PL_reg_flags |= RF_tainted;
1981 if (!(OP(scan) == ALNUM
1982 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1984 nextchr = UCHARAT(++locinput);
1987 PL_reg_flags |= RF_tainted;
1992 if (nextchr & 0x80) {
1993 if (!(OP(scan) == ALNUMUTF8
1994 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1995 : isALNUM_LC_utf8((U8*)locinput)))
1999 locinput += PL_utf8skip[nextchr];
2000 nextchr = UCHARAT(locinput);
2003 if (!(OP(scan) == ALNUMUTF8
2004 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2006 nextchr = UCHARAT(++locinput);
2009 PL_reg_flags |= RF_tainted;
2012 if (!nextchr && locinput >= PL_regeol)
2014 if (OP(scan) == NALNUM
2015 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2017 nextchr = UCHARAT(++locinput);
2020 PL_reg_flags |= RF_tainted;
2023 if (!nextchr && locinput >= PL_regeol)
2025 if (nextchr & 0x80) {
2026 if (OP(scan) == NALNUMUTF8
2027 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2028 : isALNUM_LC_utf8((U8*)locinput))
2032 locinput += PL_utf8skip[nextchr];
2033 nextchr = UCHARAT(locinput);
2036 if (OP(scan) == NALNUMUTF8
2037 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2039 nextchr = UCHARAT(++locinput);
2043 PL_reg_flags |= RF_tainted;
2047 /* was last char in word? */
2048 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2049 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2051 n = isALNUM(nextchr);
2054 ln = isALNUM_LC(ln);
2055 n = isALNUM_LC(nextchr);
2057 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2062 PL_reg_flags |= RF_tainted;
2066 /* was last char in word? */
2067 ln = (locinput != PL_regbol)
2068 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2069 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2070 ln = isALNUM_uni(ln);
2071 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2074 ln = isALNUM_LC_uni(ln);
2075 n = isALNUM_LC_utf8((U8*)locinput);
2077 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2081 PL_reg_flags |= RF_tainted;
2086 if (!(OP(scan) == SPACE
2087 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2089 nextchr = UCHARAT(++locinput);
2092 PL_reg_flags |= RF_tainted;
2097 if (nextchr & 0x80) {
2098 if (!(OP(scan) == SPACEUTF8
2099 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2100 : isSPACE_LC_utf8((U8*)locinput)))
2104 locinput += PL_utf8skip[nextchr];
2105 nextchr = UCHARAT(locinput);
2108 if (!(OP(scan) == SPACEUTF8
2109 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2111 nextchr = UCHARAT(++locinput);
2114 PL_reg_flags |= RF_tainted;
2117 if (!nextchr && locinput >= PL_regeol)
2119 if (OP(scan) == NSPACE
2120 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2122 nextchr = UCHARAT(++locinput);
2125 PL_reg_flags |= RF_tainted;
2128 if (!nextchr && locinput >= PL_regeol)
2130 if (nextchr & 0x80) {
2131 if (OP(scan) == NSPACEUTF8
2132 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2133 : isSPACE_LC_utf8((U8*)locinput))
2137 locinput += PL_utf8skip[nextchr];
2138 nextchr = UCHARAT(locinput);
2141 if (OP(scan) == NSPACEUTF8
2142 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2144 nextchr = UCHARAT(++locinput);
2147 PL_reg_flags |= RF_tainted;
2152 if (!(OP(scan) == DIGIT
2153 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2155 nextchr = UCHARAT(++locinput);
2158 PL_reg_flags |= RF_tainted;
2163 if (nextchr & 0x80) {
2164 if (!(OP(scan) == DIGITUTF8
2165 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2166 : isDIGIT_LC_utf8((U8*)locinput)))
2170 locinput += PL_utf8skip[nextchr];
2171 nextchr = UCHARAT(locinput);
2174 if (!(OP(scan) == DIGITUTF8
2175 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2177 nextchr = UCHARAT(++locinput);
2180 PL_reg_flags |= RF_tainted;
2183 if (!nextchr && locinput >= PL_regeol)
2185 if (OP(scan) == NDIGIT
2186 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2188 nextchr = UCHARAT(++locinput);
2191 PL_reg_flags |= RF_tainted;
2194 if (!nextchr && locinput >= PL_regeol)
2196 if (nextchr & 0x80) {
2197 if (OP(scan) == NDIGITUTF8
2198 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2199 : isDIGIT_LC_utf8((U8*)locinput))
2203 locinput += PL_utf8skip[nextchr];
2204 nextchr = UCHARAT(locinput);
2207 if (OP(scan) == NDIGITUTF8
2208 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2210 nextchr = UCHARAT(++locinput);
2213 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2215 locinput += PL_utf8skip[nextchr];
2216 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2217 locinput += UTF8SKIP(locinput);
2218 if (locinput > PL_regeol)
2220 nextchr = UCHARAT(locinput);
2223 PL_reg_flags |= RF_tainted;
2227 n = ARG(scan); /* which paren pair */
2228 ln = PL_regstartp[n];
2229 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2230 if (*PL_reglastparen < n || ln == -1)
2231 sayNO; /* Do not match unless seen CLOSEn. */
2232 if (ln == PL_regendp[n])
2236 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2238 char *e = PL_bostr + PL_regendp[n];
2240 * Note that we can't do the "other character" lookup trick as
2241 * in the 8-bit case (no pun intended) because in Unicode we
2242 * have to map both upper and title case to lower case.
2244 if (OP(scan) == REFF) {
2248 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2258 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2265 nextchr = UCHARAT(locinput);
2269 /* Inline the first character, for speed. */
2270 if (UCHARAT(s) != nextchr &&
2272 (UCHARAT(s) != ((OP(scan) == REFF
2273 ? PL_fold : PL_fold_locale)[nextchr]))))
2275 ln = PL_regendp[n] - ln;
2276 if (locinput + ln > PL_regeol)
2278 if (ln > 1 && (OP(scan) == REF
2279 ? memNE(s, locinput, ln)
2281 ? ibcmp(s, locinput, ln)
2282 : ibcmp_locale(s, locinput, ln))))
2285 nextchr = UCHARAT(locinput);
2296 OP_4tree *oop = PL_op;
2297 COP *ocurcop = PL_curcop;
2298 SV **ocurpad = PL_curpad;
2302 PL_op = (OP_4tree*)PL_regdata->data[n];
2303 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2304 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2305 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2307 CALLRUNOPS(aTHX); /* Scalar context. */
2313 PL_curpad = ocurpad;
2314 PL_curcop = ocurcop;
2316 if (logical == 2) { /* Postponed subexpression. */
2318 MAGIC *mg = Null(MAGIC*);
2320 CHECKPOINT cp, lastcp;
2322 if(SvROK(ret) || SvRMAGICAL(ret)) {
2323 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2326 mg = mg_find(sv, 'r');
2329 re = (regexp *)mg->mg_obj;
2330 (void)ReREFCNT_inc(re);
2334 char *t = SvPV(ret, len);
2336 char *oprecomp = PL_regprecomp;
2337 I32 osize = PL_regsize;
2338 I32 onpar = PL_regnpar;
2341 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2342 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2344 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2345 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2346 PL_regprecomp = oprecomp;
2351 PerlIO_printf(Perl_debug_log,
2352 "Entering embedded `%s%.60s%s%s'\n",
2356 (strlen(re->precomp) > 60 ? "..." : ""))
2359 state.prev = PL_reg_call_cc;
2360 state.cc = PL_regcc;
2361 state.re = PL_reg_re;
2365 cp = regcppush(0); /* Save *all* the positions. */
2368 state.ss = PL_savestack_ix;
2369 *PL_reglastparen = 0;
2370 PL_reg_call_cc = &state;
2371 PL_reginput = locinput;
2373 /* XXXX This is too dramatic a measure... */
2376 if (regmatch(re->program + 1)) {
2377 /* Even though we succeeded, we need to restore
2378 global variables, since we may be wrapped inside
2379 SUSPEND, thus the match may be not finished yet. */
2381 /* XXXX Do this only if SUSPENDed? */
2382 PL_reg_call_cc = state.prev;
2383 PL_regcc = state.cc;
2384 PL_reg_re = state.re;
2385 cache_re(PL_reg_re);
2387 /* XXXX This is too dramatic a measure... */
2390 /* These are needed even if not SUSPEND. */
2398 PL_reg_call_cc = state.prev;
2399 PL_regcc = state.cc;
2400 PL_reg_re = state.re;
2401 cache_re(PL_reg_re);
2403 /* XXXX This is too dramatic a measure... */
2412 sv_setsv(save_scalar(PL_replgv), ret);
2416 n = ARG(scan); /* which paren pair */
2417 PL_reg_start_tmp[n] = locinput;
2422 n = ARG(scan); /* which paren pair */
2423 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2424 PL_regendp[n] = locinput - PL_bostr;
2425 if (n > *PL_reglastparen)
2426 *PL_reglastparen = n;
2429 n = ARG(scan); /* which paren pair */
2430 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2433 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2435 next = NEXTOPER(NEXTOPER(scan));
2437 next = scan + ARG(scan);
2438 if (OP(next) == IFTHEN) /* Fake one. */
2439 next = NEXTOPER(NEXTOPER(next));
2443 logical = scan->flags;
2445 /*******************************************************************
2446 PL_regcc contains infoblock about the innermost (...)* loop, and
2447 a pointer to the next outer infoblock.
2449 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2451 1) After matching X, regnode for CURLYX is processed;
2453 2) This regnode creates infoblock on the stack, and calls
2454 regmatch() recursively with the starting point at WHILEM node;
2456 3) Each hit of WHILEM node tries to match A and Z (in the order
2457 depending on the current iteration, min/max of {min,max} and
2458 greediness). The information about where are nodes for "A"
2459 and "Z" is read from the infoblock, as is info on how many times "A"
2460 was already matched, and greediness.
2462 4) After A matches, the same WHILEM node is hit again.
2464 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2465 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2466 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2467 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2468 of the external loop.
2470 Currently present infoblocks form a tree with a stem formed by PL_curcc
2471 and whatever it mentions via ->next, and additional attached trees
2472 corresponding to temporarily unset infoblocks as in "5" above.
2474 In the following picture infoblocks for outer loop of
2475 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2476 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2477 infoblocks are drawn below the "reset" infoblock.
2479 In fact in the picture below we do not show failed matches for Z and T
2480 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2481 more obvious *why* one needs to *temporary* unset infoblocks.]
2483 Matched REx position InfoBlocks Comment
2487 Y A)*?Z)*?T x <- O <- I
2488 YA )*?Z)*?T x <- O <- I
2489 YA A)*?Z)*?T x <- O <- I
2490 YAA )*?Z)*?T x <- O <- I
2491 YAA Z)*?T x <- O # Temporary unset I
2494 YAAZ Y(A)*?Z)*?T x <- O
2497 YAAZY (A)*?Z)*?T x <- O
2500 YAAZY A)*?Z)*?T x <- O <- I
2503 YAAZYA )*?Z)*?T x <- O <- I
2506 YAAZYA Z)*?T x <- O # Temporary unset I
2512 YAAZYAZ T x # Temporary unset O
2519 *******************************************************************/
2522 CHECKPOINT cp = PL_savestack_ix;
2524 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2526 cc.oldcc = PL_regcc;
2528 cc.parenfloor = *PL_reglastparen;
2530 cc.min = ARG1(scan);
2531 cc.max = ARG2(scan);
2532 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2536 PL_reginput = locinput;
2537 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2539 PL_regcc = cc.oldcc;
2545 * This is really hard to understand, because after we match
2546 * what we're trying to match, we must make sure the rest of
2547 * the REx is going to match for sure, and to do that we have
2548 * to go back UP the parse tree by recursing ever deeper. And
2549 * if it fails, we have to reset our parent's current state
2550 * that we can try again after backing off.
2553 CHECKPOINT cp, lastcp;
2554 CURCUR* cc = PL_regcc;
2555 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2557 n = cc->cur + 1; /* how many we know we matched */
2558 PL_reginput = locinput;
2561 PerlIO_printf(Perl_debug_log,
2562 "%*s %ld out of %ld..%ld cc=%lx\n",
2563 REPORT_CODE_OFF+PL_regindent*2, "",
2564 (long)n, (long)cc->min,
2565 (long)cc->max, (long)cc)
2568 /* If degenerate scan matches "", assume scan done. */
2570 if (locinput == cc->lastloc && n >= cc->min) {
2571 PL_regcc = cc->oldcc;
2575 PerlIO_printf(Perl_debug_log,
2576 "%*s empty match detected, try continuation...\n",
2577 REPORT_CODE_OFF+PL_regindent*2, "")
2579 if (regmatch(cc->next))
2587 /* First just match a string of min scans. */
2591 cc->lastloc = locinput;
2592 if (regmatch(cc->scan))
2595 cc->lastloc = lastloc;
2600 /* Check whether we already were at this position.
2601 Postpone detection until we know the match is not
2602 *that* much linear. */
2603 if (!PL_reg_maxiter) {
2604 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2605 PL_reg_leftiter = PL_reg_maxiter;
2607 if (PL_reg_leftiter-- == 0) {
2608 I32 size = (PL_reg_maxiter + 7)/8;
2609 if (PL_reg_poscache) {
2610 if (PL_reg_poscache_size < size) {
2611 Renew(PL_reg_poscache, size, char);
2612 PL_reg_poscache_size = size;
2614 Zero(PL_reg_poscache, size, char);
2617 PL_reg_poscache_size = size;
2618 Newz(29, PL_reg_poscache, size, char);
2621 PerlIO_printf(Perl_debug_log,
2622 "%sDetected a super-linear match, switching on caching%s...\n",
2623 PL_colors[4], PL_colors[5])
2626 if (PL_reg_leftiter < 0) {
2627 I32 o = locinput - PL_bostr, b;
2629 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2632 if (PL_reg_poscache[o] & (1<<b)) {
2634 PerlIO_printf(Perl_debug_log,
2635 "%*s already tried at this position...\n",
2636 REPORT_CODE_OFF+PL_regindent*2, "")
2640 PL_reg_poscache[o] |= (1<<b);
2644 /* Prefer next over scan for minimal matching. */
2647 PL_regcc = cc->oldcc;
2650 cp = regcppush(cc->parenfloor);
2652 if (regmatch(cc->next)) {
2654 sayYES; /* All done. */
2662 if (n >= cc->max) { /* Maximum greed exceeded? */
2663 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2664 && !(PL_reg_flags & RF_warned)) {
2665 PL_reg_flags |= RF_warned;
2666 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2667 "Complex regular subexpression recursion",
2674 PerlIO_printf(Perl_debug_log,
2675 "%*s trying longer...\n",
2676 REPORT_CODE_OFF+PL_regindent*2, "")
2678 /* Try scanning more and see if it helps. */
2679 PL_reginput = locinput;
2681 cc->lastloc = locinput;
2682 cp = regcppush(cc->parenfloor);
2684 if (regmatch(cc->scan)) {
2691 cc->lastloc = lastloc;
2695 /* Prefer scan over next for maximal matching. */
2697 if (n < cc->max) { /* More greed allowed? */
2698 cp = regcppush(cc->parenfloor);
2700 cc->lastloc = locinput;
2702 if (regmatch(cc->scan)) {
2707 regcppop(); /* Restore some previous $<digit>s? */
2708 PL_reginput = locinput;
2710 PerlIO_printf(Perl_debug_log,
2711 "%*s failed, try continuation...\n",
2712 REPORT_CODE_OFF+PL_regindent*2, "")
2715 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2716 && !(PL_reg_flags & RF_warned)) {
2717 PL_reg_flags |= RF_warned;
2718 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2719 "Complex regular subexpression recursion",
2723 /* Failed deeper matches of scan, so see if this one works. */
2724 PL_regcc = cc->oldcc;
2727 if (regmatch(cc->next))
2733 cc->lastloc = lastloc;
2738 next = scan + ARG(scan);
2741 inner = NEXTOPER(NEXTOPER(scan));
2744 inner = NEXTOPER(scan);
2749 if (OP(next) != c1) /* No choice. */
2750 next = inner; /* Avoid recursion. */
2752 int lastparen = *PL_reglastparen;
2756 PL_reginput = locinput;
2757 if (regmatch(inner))
2760 for (n = *PL_reglastparen; n > lastparen; n--)
2762 *PL_reglastparen = n;
2765 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2769 inner = NEXTOPER(scan);
2770 if (c1 == BRANCHJ) {
2771 inner = NEXTOPER(inner);
2773 } while (scan != NULL && OP(scan) == c1);
2787 /* We suppose that the next guy does not need
2788 backtracking: in particular, it is of constant length,
2789 and has no parenths to influence future backrefs. */
2790 ln = ARG1(scan); /* min to match */
2791 n = ARG2(scan); /* max to match */
2792 paren = scan->flags;
2794 if (paren > PL_regsize)
2796 if (paren > *PL_reglastparen)
2797 *PL_reglastparen = paren;
2799 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2801 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2802 PL_reginput = locinput;
2805 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2807 if (ln && l == 0 && n >= ln
2808 /* In fact, this is tricky. If paren, then the
2809 fact that we did/didnot match may influence
2810 future execution. */
2811 && !(paren && ln == 0))
2813 locinput = PL_reginput;
2814 if (PL_regkind[(U8)OP(next)] == EXACT) {
2815 c1 = (U8)*STRING(next);
2816 if (OP(next) == EXACTF)
2818 else if (OP(next) == EXACTFL)
2819 c2 = PL_fold_locale[c1];
2826 /* This may be improved if l == 0. */
2827 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2828 /* If it could work, try it. */
2830 UCHARAT(PL_reginput) == c1 ||
2831 UCHARAT(PL_reginput) == c2)
2835 PL_regstartp[paren] =
2836 HOPc(PL_reginput, -l) - PL_bostr;
2837 PL_regendp[paren] = PL_reginput - PL_bostr;
2840 PL_regendp[paren] = -1;
2846 /* Couldn't or didn't -- move forward. */
2847 PL_reginput = locinput;
2848 if (regrepeat_hard(scan, 1, &l)) {
2850 locinput = PL_reginput;
2857 n = regrepeat_hard(scan, n, &l);
2858 if (n != 0 && l == 0
2859 /* In fact, this is tricky. If paren, then the
2860 fact that we did/didnot match may influence
2861 future execution. */
2862 && !(paren && ln == 0))
2864 locinput = PL_reginput;
2866 PerlIO_printf(Perl_debug_log,
2867 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2868 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2872 if (PL_regkind[(U8)OP(next)] == EXACT) {
2873 c1 = (U8)*STRING(next);
2874 if (OP(next) == EXACTF)
2876 else if (OP(next) == EXACTFL)
2877 c2 = PL_fold_locale[c1];
2886 /* If it could work, try it. */
2888 UCHARAT(PL_reginput) == c1 ||
2889 UCHARAT(PL_reginput) == c2)
2892 PerlIO_printf(Perl_debug_log,
2893 "%*s trying tail with n=%"IVdf"...\n",
2894 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2898 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2899 PL_regendp[paren] = PL_reginput - PL_bostr;
2902 PL_regendp[paren] = -1;
2908 /* Couldn't or didn't -- back up. */
2910 locinput = HOPc(locinput, -l);
2911 PL_reginput = locinput;
2918 paren = scan->flags; /* Which paren to set */
2919 if (paren > PL_regsize)
2921 if (paren > *PL_reglastparen)
2922 *PL_reglastparen = paren;
2923 ln = ARG1(scan); /* min to match */
2924 n = ARG2(scan); /* max to match */
2925 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2929 ln = ARG1(scan); /* min to match */
2930 n = ARG2(scan); /* max to match */
2931 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2936 scan = NEXTOPER(scan);
2942 scan = NEXTOPER(scan);
2946 * Lookahead to avoid useless match attempts
2947 * when we know what character comes next.
2949 if (PL_regkind[(U8)OP(next)] == EXACT) {
2950 c1 = (U8)*STRING(next);
2951 if (OP(next) == EXACTF)
2953 else if (OP(next) == EXACTFL)
2954 c2 = PL_fold_locale[c1];
2960 PL_reginput = locinput;
2964 if (ln && regrepeat(scan, ln) < ln)
2966 locinput = PL_reginput;
2969 char *e = locinput + n - ln; /* Should not check after this */
2970 char *old = locinput;
2972 if (e >= PL_regeol || (n == REG_INFTY))
2975 /* Find place 'next' could work */
2977 while (locinput <= e && *locinput != c1)
2980 while (locinput <= e
2987 /* PL_reginput == old now */
2988 if (locinput != old) {
2989 ln = 1; /* Did some */
2990 if (regrepeat(scan, locinput - old) <
2994 /* PL_reginput == locinput now */
2997 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2998 PL_regendp[paren] = locinput - PL_bostr;
3001 PL_regendp[paren] = -1;
3005 PL_reginput = locinput; /* Could be reset... */
3007 /* Couldn't or didn't -- move forward. */
3012 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3013 /* If it could work, try it. */
3015 UCHARAT(PL_reginput) == c1 ||
3016 UCHARAT(PL_reginput) == c2)
3020 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3021 PL_regendp[paren] = PL_reginput - PL_bostr;
3024 PL_regendp[paren] = -1;
3030 /* Couldn't or didn't -- move forward. */
3031 PL_reginput = locinput;
3032 if (regrepeat(scan, 1)) {
3034 locinput = PL_reginput;
3042 n = regrepeat(scan, n);
3043 locinput = PL_reginput;
3044 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3045 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3046 ln = n; /* why back off? */
3047 /* ...because $ and \Z can match before *and* after
3048 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3049 We should back off by one in this case. */
3050 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3056 /* If it could work, try it. */
3058 UCHARAT(PL_reginput) == c1 ||
3059 UCHARAT(PL_reginput) == c2)
3063 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3064 PL_regendp[paren] = PL_reginput - PL_bostr;
3067 PL_regendp[paren] = -1;
3073 /* Couldn't or didn't -- back up. */
3075 PL_reginput = locinput = HOPc(locinput, -1);
3080 /* If it could work, try it. */
3082 UCHARAT(PL_reginput) == c1 ||
3083 UCHARAT(PL_reginput) == c2)
3089 /* Couldn't or didn't -- back up. */
3091 PL_reginput = locinput = HOPc(locinput, -1);
3098 if (PL_reg_call_cc) {
3099 re_cc_state *cur_call_cc = PL_reg_call_cc;
3100 CURCUR *cctmp = PL_regcc;
3101 regexp *re = PL_reg_re;
3102 CHECKPOINT cp, lastcp;
3104 cp = regcppush(0); /* Save *all* the positions. */
3106 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3108 PL_reginput = locinput; /* Make position available to
3110 cache_re(PL_reg_call_cc->re);
3111 PL_regcc = PL_reg_call_cc->cc;
3112 PL_reg_call_cc = PL_reg_call_cc->prev;
3113 if (regmatch(cur_call_cc->node)) {
3114 PL_reg_call_cc = cur_call_cc;
3120 PL_reg_call_cc = cur_call_cc;
3126 PerlIO_printf(Perl_debug_log,
3127 "%*s continuation failed...\n",
3128 REPORT_CODE_OFF+PL_regindent*2, "")
3132 if (locinput < PL_regtill) {
3133 DEBUG_r(PerlIO_printf(Perl_debug_log,
3134 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3136 (long)(locinput - PL_reg_starttry),
3137 (long)(PL_regtill - PL_reg_starttry),
3139 sayNO_FINAL; /* Cannot match: too short. */
3141 PL_reginput = locinput; /* put where regtry can find it */
3142 sayYES_FINAL; /* Success! */
3144 PL_reginput = locinput; /* put where regtry can find it */
3145 sayYES_LOUD; /* Success! */
3148 PL_reginput = locinput;
3153 if (UTF) { /* XXXX This is absolutely
3154 broken, we read before
3156 s = HOPMAYBEc(locinput, -scan->flags);
3162 if (locinput < PL_bostr + scan->flags)
3164 PL_reginput = locinput - scan->flags;
3169 PL_reginput = locinput;
3174 if (UTF) { /* XXXX This is absolutely
3175 broken, we read before
3177 s = HOPMAYBEc(locinput, -scan->flags);
3178 if (!s || s < PL_bostr)
3183 if (locinput < PL_bostr + scan->flags)
3185 PL_reginput = locinput - scan->flags;
3190 PL_reginput = locinput;
3193 inner = NEXTOPER(NEXTOPER(scan));
3194 if (regmatch(inner) != n) {
3209 if (OP(scan) == SUSPEND) {
3210 locinput = PL_reginput;
3211 nextchr = UCHARAT(locinput);
3216 next = scan + ARG(scan);
3221 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3222 PTR2UV(scan), OP(scan));
3223 Perl_croak(aTHX_ "regexp memory corruption");
3229 * We get here only if there's trouble -- normally "case END" is
3230 * the terminating point.
3232 Perl_croak(aTHX_ "corrupted regexp pointers");
3238 PerlIO_printf(Perl_debug_log,
3239 "%*s %scould match...%s\n",
3240 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3244 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3245 PL_colors[4],PL_colors[5]));
3254 PerlIO_printf(Perl_debug_log,
3255 "%*s %sfailed...%s\n",
3256 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3268 - regrepeat - repeatedly match something simple, report how many
3271 * [This routine now assumes that it will only match on things of length 1.
3272 * That was true before, but now we assume scan - reginput is the count,
3273 * rather than incrementing count on every character. [Er, except utf8.]]
3276 S_regrepeat(pTHX_ regnode *p, I32 max)
3279 register char *scan;
3281 register char *loceol = PL_regeol;
3282 register I32 hardcount = 0;
3285 if (max != REG_INFTY && max < loceol - scan)
3286 loceol = scan + max;
3289 while (scan < loceol && *scan != '\n')
3297 while (scan < loceol && *scan != '\n') {
3298 scan += UTF8SKIP(scan);
3304 while (scan < loceol) {
3305 scan += UTF8SKIP(scan);
3309 case EXACT: /* length of string is 1 */
3311 while (scan < loceol && UCHARAT(scan) == c)
3314 case EXACTF: /* length of string is 1 */
3316 while (scan < loceol &&
3317 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3320 case EXACTFL: /* length of string is 1 */
3321 PL_reg_flags |= RF_tainted;
3323 while (scan < loceol &&
3324 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3329 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3330 scan += UTF8SKIP(scan);
3335 while (scan < loceol && REGINCLASS(p, *scan))
3339 while (scan < loceol && isALNUM(*scan))
3344 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3345 scan += UTF8SKIP(scan);
3350 PL_reg_flags |= RF_tainted;
3351 while (scan < loceol && isALNUM_LC(*scan))
3355 PL_reg_flags |= RF_tainted;
3357 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3358 scan += UTF8SKIP(scan);
3364 while (scan < loceol && !isALNUM(*scan))
3369 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3370 scan += UTF8SKIP(scan);
3375 PL_reg_flags |= RF_tainted;
3376 while (scan < loceol && !isALNUM_LC(*scan))
3380 PL_reg_flags |= RF_tainted;
3382 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3383 scan += UTF8SKIP(scan);
3388 while (scan < loceol && isSPACE(*scan))
3393 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3394 scan += UTF8SKIP(scan);
3399 PL_reg_flags |= RF_tainted;
3400 while (scan < loceol && isSPACE_LC(*scan))
3404 PL_reg_flags |= RF_tainted;
3406 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3407 scan += UTF8SKIP(scan);
3412 while (scan < loceol && !isSPACE(*scan))
3417 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3418 scan += UTF8SKIP(scan);
3423 PL_reg_flags |= RF_tainted;
3424 while (scan < loceol && !isSPACE_LC(*scan))
3428 PL_reg_flags |= RF_tainted;
3430 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3431 scan += UTF8SKIP(scan);
3436 while (scan < loceol && isDIGIT(*scan))
3441 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3442 scan += UTF8SKIP(scan);
3448 while (scan < loceol && !isDIGIT(*scan))
3453 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3454 scan += UTF8SKIP(scan);
3458 default: /* Called on something of 0 width. */
3459 break; /* So match right here or not at all. */
3465 c = scan - PL_reginput;
3470 SV *prop = sv_newmortal();
3473 PerlIO_printf(Perl_debug_log,
3474 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3475 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3482 - regrepeat_hard - repeatedly match something, report total lenth and length
3484 * The repeater is supposed to have constant length.
3488 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3491 register char *scan;
3492 register char *start;
3493 register char *loceol = PL_regeol;
3495 I32 count = 0, res = 1;
3500 start = PL_reginput;
3502 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3505 while (start < PL_reginput) {
3507 start += UTF8SKIP(start);
3518 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3520 *lp = l = PL_reginput - start;
3521 if (max != REG_INFTY && l*max < loceol - scan)
3522 loceol = scan + l*max;
3535 - reginclass - determine if a character falls into a character class
3539 S_reginclass(pTHX_ register regnode *p, register I32 c)
3542 char flags = ANYOF_FLAGS(p);
3546 if (ANYOF_BITMAP_TEST(p, c))
3548 else if (flags & ANYOF_FOLD) {
3550 if (flags & ANYOF_LOCALE) {
3551 PL_reg_flags |= RF_tainted;
3552 cf = PL_fold_locale[c];
3556 if (ANYOF_BITMAP_TEST(p, cf))
3560 if (!match && (flags & ANYOF_CLASS)) {
3561 PL_reg_flags |= RF_tainted;
3563 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3564 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3565 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3566 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3567 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3568 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3569 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3570 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3571 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3572 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3573 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3574 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3575 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3576 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3577 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3578 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3579 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3580 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3581 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3582 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3583 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3584 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3585 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3586 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3587 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3588 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3589 ) /* How's that for a conditional? */
3595 return (flags & ANYOF_INVERT) ? !match : match;
3599 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3602 char flags = ARG1(f);
3604 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3606 if (swash_fetch(sv, p))
3608 else if (flags & ANYOF_FOLD) {
3609 U8 tmpbuf[UTF8_MAXLEN];
3610 if (flags & ANYOF_LOCALE) {
3611 PL_reg_flags |= RF_tainted;
3612 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3615 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3616 if (swash_fetch(sv, tmpbuf))
3620 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3622 return (flags & ANYOF_INVERT) ? !match : match;
3626 S_reghop(pTHX_ U8 *s, I32 off)
3630 while (off-- && s < (U8*)PL_regeol)
3635 if (s > (U8*)PL_bostr) {
3638 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3640 } /* XXX could check well-formedness here */
3648 S_reghopmaybe(pTHX_ U8* s, I32 off)
3652 while (off-- && s < (U8*)PL_regeol)
3659 if (s > (U8*)PL_bostr) {
3662 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3664 } /* XXX could check well-formedness here */
3680 restore_pos(pTHXo_ void *arg)
3683 if (PL_reg_eval_set) {
3684 if (PL_reg_oldsaved) {
3685 PL_reg_re->subbeg = PL_reg_oldsaved;
3686 PL_reg_re->sublen = PL_reg_oldsavedlen;
3687 RX_MATCH_COPIED_on(PL_reg_re);
3689 PL_reg_magic->mg_len = PL_reg_oldpos;
3690 PL_reg_eval_set = 0;
3691 PL_curpm = PL_reg_oldcurpm;