5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 && (sv && (strpos + SvCUR(sv) != strend)) ) {
350 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
353 PL_regeol = strend; /* Used in HOP() */
354 s = HOPc(strpos, prog->check_offset_min);
356 slen = SvCUR(check); /* >= 1 */
358 if ( strend - s > slen || strend - s < slen - 1
359 || (strend - s == slen && strend[-1] != '\n')) {
360 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
363 /* Now should match s[0..slen-2] */
365 if (slen && (*SvPVX(check) != *s
367 && memNE(SvPVX(check), s, slen)))) {
369 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
373 else if (*SvPVX(check) != *s
374 || ((slen = SvCUR(check)) > 1
375 && memNE(SvPVX(check), s, slen)))
377 goto success_at_start;
379 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
381 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
382 end_shift = prog->minlen - start_shift -
383 CHR_SVLEN(check) + (SvTAIL(check) != 0);
385 I32 end = prog->check_offset_max + CHR_SVLEN(check)
386 - (SvTAIL(check) != 0);
387 I32 eshift = strend - s - end;
389 if (end_shift < eshift)
393 else { /* Can match at random position */
396 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
397 /* Should be nonnegative! */
398 end_shift = prog->minlen - start_shift -
399 CHR_SVLEN(check) + (SvTAIL(check) != 0);
402 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
404 Perl_croak(aTHX_ "panic: end_shift");
408 /* Find a possible match in the region s..strend by looking for
409 the "check" substring in the region corrected by start/end_shift. */
410 if (flags & REXEC_SCREAM) {
411 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
412 I32 p = -1; /* Internal iterator of scream. */
413 I32 *pp = data ? data->scream_pos : &p;
415 if (PL_screamfirst[BmRARE(check)] >= 0
416 || ( BmRARE(check) == '\n'
417 && (BmPREVIOUS(check) == SvCUR(check) - 1)
419 s = screaminstr(sv, check,
420 start_shift + (s - strbeg), end_shift, pp, 0);
424 *data->scream_olds = s;
427 s = fbm_instr((unsigned char*)s + start_shift,
428 (unsigned char*)strend - end_shift,
429 check, PL_multiline ? FBMrf_MULTILINE : 0);
431 /* Update the count-of-usability, remove useless subpatterns,
434 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
435 (s ? "Found" : "Did not find"),
436 ((check == prog->anchored_substr) ? "anchored" : "floating"),
438 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
440 PL_colors[1], (SvTAIL(check) ? "$" : ""),
441 (s ? " at offset " : "...\n") ) );
448 /* Finish the diagnostic message */
449 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
451 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
452 Start with the other substr.
453 XXXX no SCREAM optimization yet - and a very coarse implementation
454 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
455 *always* match. Probably should be marked during compile...
456 Probably it is right to do no SCREAM here...
459 if (prog->float_substr && prog->anchored_substr) {
460 /* Take into account the "other" substring. */
461 /* XXXX May be hopelessly wrong for UTF... */
464 if (check == prog->float_substr) {
467 char *last = s - start_shift, *last1, *last2;
471 t = s - prog->check_offset_max;
472 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
473 && (!(prog->reganch & ROPT_UTF8)
474 || (PL_bostr = strpos, /* Used in regcopmaybe() */
475 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
480 t += prog->anchored_offset;
481 if (t < other_last) /* These positions already checked */
484 last2 = last1 = strend - prog->minlen;
487 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
488 /* On end-of-str: see comment below. */
489 s = fbm_instr((unsigned char*)t,
490 (unsigned char*)last1 + prog->anchored_offset
491 + SvCUR(prog->anchored_substr)
492 - (SvTAIL(prog->anchored_substr)!=0),
493 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
494 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
495 (s ? "Found" : "Contradicts"),
497 (int)(SvCUR(prog->anchored_substr)
498 - (SvTAIL(prog->anchored_substr)!=0)),
499 SvPVX(prog->anchored_substr),
500 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
502 if (last1 >= last2) {
503 DEBUG_r(PerlIO_printf(Perl_debug_log,
504 ", giving up...\n"));
507 DEBUG_r(PerlIO_printf(Perl_debug_log,
508 ", trying floating at offset %ld...\n",
509 (long)(s1 + 1 - i_strpos)));
510 PL_regeol = strend; /* Used in HOP() */
511 other_last = last1 + prog->anchored_offset + 1;
516 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
517 (long)(s - i_strpos)));
518 t = s - prog->anchored_offset;
527 else { /* Take into account the floating substring. */
532 last1 = last = strend - prog->minlen + prog->float_min_offset;
533 if (last - t > prog->float_max_offset)
534 last = t + prog->float_max_offset;
535 s = t + prog->float_min_offset;
538 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
539 /* fbm_instr() takes into account exact value of end-of-str
540 if the check is SvTAIL(ed). Since false positives are OK,
541 and end-of-str is not later than strend we are OK. */
542 s = fbm_instr((unsigned char*)s,
543 (unsigned char*)last + SvCUR(prog->float_substr)
544 - (SvTAIL(prog->float_substr)!=0),
545 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
546 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
547 (s ? "Found" : "Contradicts"),
549 (int)(SvCUR(prog->float_substr)
550 - (SvTAIL(prog->float_substr)!=0)),
551 SvPVX(prog->float_substr),
552 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
555 DEBUG_r(PerlIO_printf(Perl_debug_log,
556 ", giving up...\n"));
559 DEBUG_r(PerlIO_printf(Perl_debug_log,
560 ", trying anchored starting at offset %ld...\n",
561 (long)(s1 + 1 - i_strpos)));
562 other_last = last + 1;
563 PL_regeol = strend; /* Used in HOP() */
568 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
569 (long)(s - i_strpos)));
579 t = s - prog->check_offset_max;
581 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
582 && (!(prog->reganch & ROPT_UTF8)
583 || (PL_bostr = strpos, /* Used in regcopmaybe() */
584 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
587 /* Fixed substring is found far enough so that the match
588 cannot start at strpos. */
590 if (ml_anch && t[-1] != '\n') {
591 /* Eventually fbm_*() should handle this, but often
592 anchored_offset is not 0, so this check will not be wasted. */
593 /* XXXX In the code below we prefer to look for "^" even in
594 presence of anchored substrings. And we search even
595 beyond the found float position. These pessimizations
596 are historical artefacts only. */
598 while (t < strend - prog->minlen) {
600 if (t < s - prog->check_offset_min) {
601 if (prog->anchored_substr) {
602 /* We definitely contradict the found anchored
603 substr. Due to the above check we do not
604 contradict "check" substr.
605 Thus we can arrive here only if check substr
606 is float. Redo checking for "other"=="fixed".
609 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
610 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
611 goto do_other_anchored;
614 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
615 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
618 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
619 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
625 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
626 PL_colors[0],PL_colors[1]));
631 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
635 /* The found string does not prohibit matching at beg-of-str
636 - no optimization of calling REx engine can be performed,
637 unless it was an MBOL and we are not after MBOL. */
639 /* Even in this situation we may use MBOL flag if strpos is offset
640 wrt the start of the string. */
642 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
643 /* May be due to an implicit anchor of m{.*foo} */
644 && !(prog->reganch & ROPT_IMPLICIT))
649 DEBUG_r( if (ml_anch)
650 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
651 PL_colors[0],PL_colors[1]);
654 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
655 && prog->check_substr /* Could be deleted already */
656 && --BmUSEFUL(prog->check_substr) < 0
657 && prog->check_substr == prog->float_substr)
659 /* If flags & SOMETHING - do not do it many times on the same match */
660 SvREFCNT_dec(prog->check_substr);
661 prog->check_substr = Nullsv; /* disable */
662 prog->float_substr = Nullsv; /* clear */
664 /* XXXX This is a remnant of the old implementation. It
665 looks wasteful, since now INTUIT can use many
667 prog->reganch &= ~RE_USE_INTUIT;
674 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
675 if (prog->regstclass) {
676 /* minlen == 0 is possible if regstclass is \b or \B,
677 and the fixed substr is ''$.
678 Since minlen is already taken into account, s+1 is before strend;
679 accidentally, minlen >= 1 guaranties no false positives at s + 1
680 even for \b or \B. But (minlen? 1 : 0) below assumes that
681 regstclass does not come from lookahead... */
682 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
683 This leaves EXACTF only, which is dealt with in find_byclass(). */
684 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
685 ? STR_LEN(prog->regstclass)
687 char *endpos = (prog->anchored_substr || ml_anch)
688 ? s + (prog->minlen? cl_l : 0)
689 : (prog->float_substr ? check_at - start_shift + cl_l
691 char *startpos = sv ? strend - SvCUR(sv) : s;
694 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
699 if (endpos == strend) {
700 DEBUG_r( PerlIO_printf(Perl_debug_log,
701 "Could not match STCLASS...\n") );
704 DEBUG_r( PerlIO_printf(Perl_debug_log,
705 "This position contradicts STCLASS...\n") );
706 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
708 /* Contradict one of substrings */
709 if (prog->anchored_substr) {
710 if (prog->anchored_substr == check) {
711 DEBUG_r( what = "anchored" );
713 PL_regeol = strend; /* Used in HOP() */
715 if (s + start_shift + end_shift > strend) {
716 /* XXXX Should be taken into account earlier? */
717 DEBUG_r( PerlIO_printf(Perl_debug_log,
718 "Could not match STCLASS...\n") );
721 DEBUG_r( PerlIO_printf(Perl_debug_log,
722 "Trying %s substr starting at offset %ld...\n",
723 what, (long)(s + start_shift - i_strpos)) );
726 /* Have both, check_string is floating */
727 if (t + start_shift >= check_at) /* Contradicts floating=check */
728 goto retry_floating_check;
729 /* Recheck anchored substring, but not floating... */
731 DEBUG_r( PerlIO_printf(Perl_debug_log,
732 "Trying anchored substr starting at offset %ld...\n",
733 (long)(other_last - i_strpos)) );
734 goto do_other_anchored;
736 /* Another way we could have checked stclass at the
737 current position only: */
740 DEBUG_r( PerlIO_printf(Perl_debug_log,
741 "Trying /^/m starting at offset %ld...\n",
742 (long)(t - i_strpos)) );
745 if (!prog->float_substr) /* Could have been deleted */
747 /* Check is floating subtring. */
748 retry_floating_check:
749 t = check_at - start_shift;
750 DEBUG_r( what = "floating" );
751 goto hop_and_restart;
754 PerlIO_printf(Perl_debug_log,
755 "By STCLASS: moving %ld --> %ld\n",
756 (long)(t - i_strpos), (long)(s - i_strpos));
758 PerlIO_printf(Perl_debug_log,
759 "Does not contradict STCLASS...\n") );
761 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
762 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
765 fail_finish: /* Substring not found */
766 if (prog->check_substr) /* could be removed already */
767 BmUSEFUL(prog->check_substr) += 5; /* hooray */
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
770 PL_colors[4],PL_colors[5]));
774 /* We know what class REx starts with. Try to find this position... */
776 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
778 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
784 register I32 tmp = 1; /* Scratch variable? */
786 /* We know what class it must start with. */
790 if (REGINCLASSUTF8(c, (U8*)s)) {
791 if (tmp && (norun || regtry(prog, s)))
803 if (REGINCLASS(c, *s)) {
804 if (tmp && (norun || regtry(prog, s)))
824 c2 = PL_fold_locale[c1];
829 e = s; /* Due to minlen logic of intuit() */
830 /* Here it is NOT UTF! */
834 && (ln == 1 || !(OP(c) == EXACTF
836 : ibcmp_locale(s, m, ln)))
837 && (norun || regtry(prog, s)) )
843 if ( (*s == c1 || *s == c2)
844 && (ln == 1 || !(OP(c) == EXACTF
846 : ibcmp_locale(s, m, ln)))
847 && (norun || regtry(prog, s)) )
854 PL_reg_flags |= RF_tainted;
857 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
858 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
860 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
862 if ((norun || regtry(prog, s)))
867 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
871 PL_reg_flags |= RF_tainted;
874 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
875 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
877 if (tmp == !(OP(c) == BOUND ?
878 swash_fetch(PL_utf8_alnum, (U8*)s) :
879 isALNUM_LC_utf8((U8*)s)))
882 if ((norun || regtry(prog, s)))
887 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
891 PL_reg_flags |= RF_tainted;
894 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
895 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
897 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
899 else if ((norun || regtry(prog, s)))
903 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
907 PL_reg_flags |= RF_tainted;
911 strend = reghop_c(strend, -1);
912 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
913 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
915 if (tmp == !(OP(c) == NBOUND ?
916 swash_fetch(PL_utf8_alnum, (U8*)s) :
917 isALNUM_LC_utf8((U8*)s)))
919 else if ((norun || regtry(prog, s)))
923 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
929 if (tmp && (norun || regtry(prog, s)))
941 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
942 if (tmp && (norun || regtry(prog, s)))
953 PL_reg_flags |= RF_tainted;
955 if (isALNUM_LC(*s)) {
956 if (tmp && (norun || regtry(prog, s)))
967 PL_reg_flags |= RF_tainted;
969 if (isALNUM_LC_utf8((U8*)s)) {
970 if (tmp && (norun || regtry(prog, s)))
983 if (tmp && (norun || regtry(prog, s)))
995 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
996 if (tmp && (norun || regtry(prog, s)))
1007 PL_reg_flags |= RF_tainted;
1008 while (s < strend) {
1009 if (!isALNUM_LC(*s)) {
1010 if (tmp && (norun || regtry(prog, s)))
1021 PL_reg_flags |= RF_tainted;
1022 while (s < strend) {
1023 if (!isALNUM_LC_utf8((U8*)s)) {
1024 if (tmp && (norun || regtry(prog, s)))
1035 while (s < strend) {
1037 if (tmp && (norun || regtry(prog, s)))
1048 while (s < strend) {
1049 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1050 if (tmp && (norun || regtry(prog, s)))
1061 PL_reg_flags |= RF_tainted;
1062 while (s < strend) {
1063 if (isSPACE_LC(*s)) {
1064 if (tmp && (norun || regtry(prog, s)))
1075 PL_reg_flags |= RF_tainted;
1076 while (s < strend) {
1077 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1078 if (tmp && (norun || regtry(prog, s)))
1089 while (s < strend) {
1091 if (tmp && (norun || regtry(prog, s)))
1102 while (s < strend) {
1103 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1104 if (tmp && (norun || regtry(prog, s)))
1115 PL_reg_flags |= RF_tainted;
1116 while (s < strend) {
1117 if (!isSPACE_LC(*s)) {
1118 if (tmp && (norun || regtry(prog, s)))
1129 PL_reg_flags |= RF_tainted;
1130 while (s < strend) {
1131 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1132 if (tmp && (norun || regtry(prog, s)))
1143 while (s < strend) {
1145 if (tmp && (norun || regtry(prog, s)))
1156 while (s < strend) {
1157 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1158 if (tmp && (norun || regtry(prog, s)))
1169 PL_reg_flags |= RF_tainted;
1170 while (s < strend) {
1171 if (isDIGIT_LC(*s)) {
1172 if (tmp && (norun || regtry(prog, s)))
1183 PL_reg_flags |= RF_tainted;
1184 while (s < strend) {
1185 if (isDIGIT_LC_utf8((U8*)s)) {
1186 if (tmp && (norun || regtry(prog, s)))
1197 while (s < strend) {
1199 if (tmp && (norun || regtry(prog, s)))
1210 while (s < strend) {
1211 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1212 if (tmp && (norun || regtry(prog, s)))
1223 PL_reg_flags |= RF_tainted;
1224 while (s < strend) {
1225 if (!isDIGIT_LC(*s)) {
1226 if (tmp && (norun || regtry(prog, s)))
1237 PL_reg_flags |= RF_tainted;
1238 while (s < strend) {
1239 if (!isDIGIT_LC_utf8((U8*)s)) {
1240 if (tmp && (norun || regtry(prog, s)))
1251 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1260 - regexec_flags - match a regexp against a string
1263 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1264 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1265 /* strend: pointer to null at end of string */
1266 /* strbeg: real beginning of string */
1267 /* minend: end of match must be >=minend after stringarg. */
1268 /* data: May be used for some additional optimizations. */
1269 /* nosave: For optimizations. */
1273 register regnode *c;
1274 register char *startpos = stringarg;
1276 I32 minlen; /* must match at least this many chars */
1277 I32 dontbother = 0; /* how many characters not to try at end */
1278 I32 start_shift = 0; /* Offset of the start to find
1279 constant substr. */ /* CC */
1280 I32 end_shift = 0; /* Same for the end. */ /* CC */
1281 I32 scream_pos = -1; /* Internal iterator of scream. */
1283 SV* oreplsv = GvSV(PL_replgv);
1289 PL_regnarrate = PL_debug & 512;
1292 /* Be paranoid... */
1293 if (prog == NULL || startpos == NULL) {
1294 Perl_croak(aTHX_ "NULL regexp parameter");
1298 minlen = prog->minlen;
1299 if (strend - startpos < minlen) goto phooey;
1301 if (startpos == strbeg) /* is ^ valid at stringarg? */
1304 PL_regprev = (U32)stringarg[-1];
1305 if (!PL_multiline && PL_regprev == '\n')
1306 PL_regprev = '\0'; /* force ^ to NOT match */
1309 /* Check validity of program. */
1310 if (UCHARAT(prog->program) != REG_MAGIC) {
1311 Perl_croak(aTHX_ "corrupted regexp program");
1315 PL_reg_eval_set = 0;
1318 if (prog->reganch & ROPT_UTF8)
1319 PL_reg_flags |= RF_utf8;
1321 /* Mark beginning of line for ^ and lookbehind. */
1322 PL_regbol = startpos;
1326 /* Mark end of line for $ (and such) */
1329 /* see how far we have to get to not match where we matched before */
1330 PL_regtill = startpos+minend;
1332 /* We start without call_cc context. */
1335 /* If there is a "must appear" string, look for it. */
1338 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1341 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1342 PL_reg_ganch = startpos;
1343 else if (sv && SvTYPE(sv) >= SVt_PVMG
1345 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1346 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1347 if (prog->reganch & ROPT_ANCH_GPOS) {
1348 if (s > PL_reg_ganch)
1353 else /* pos() not defined */
1354 PL_reg_ganch = strbeg;
1357 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1358 re_scream_pos_data d;
1360 d.scream_olds = &scream_olds;
1361 d.scream_pos = &scream_pos;
1362 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1364 goto phooey; /* not present */
1367 DEBUG_r( if (!PL_colorset) reginitcolors() );
1368 DEBUG_r(PerlIO_printf(Perl_debug_log,
1369 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1370 PL_colors[4],PL_colors[5],PL_colors[0],
1373 (strlen(prog->precomp) > 60 ? "..." : ""),
1375 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1376 startpos, PL_colors[1],
1377 (strend - startpos > 60 ? "..." : ""))
1380 /* Simplest case: anchored match need be tried only once. */
1381 /* [unless only anchor is BOL and multiline is set] */
1382 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1383 if (s == startpos && regtry(prog, startpos))
1385 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1386 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1391 dontbother = minlen - 1;
1392 end = HOPc(strend, -dontbother) - 1;
1393 /* for multiline we only have to try after newlines */
1394 if (prog->check_substr) {
1398 if (regtry(prog, s))
1403 if (prog->reganch & RE_USE_INTUIT) {
1404 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1415 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1416 if (regtry(prog, s))
1423 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1424 if (regtry(prog, PL_reg_ganch))
1429 /* Messy cases: unanchored match. */
1430 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1431 /* we have /x+whatever/ */
1432 /* it must be a one character string (XXXX Except UTF?) */
1433 char ch = SvPVX(prog->anchored_substr)[0];
1435 while (s < strend) {
1437 if (regtry(prog, s)) goto got_it;
1439 while (s < strend && *s == ch)
1446 while (s < strend) {
1448 if (regtry(prog, s)) goto got_it;
1450 while (s < strend && *s == ch)
1458 else if (prog->anchored_substr != Nullsv
1459 || (prog->float_substr != Nullsv
1460 && prog->float_max_offset < strend - s)) {
1461 SV *must = prog->anchored_substr
1462 ? prog->anchored_substr : prog->float_substr;
1464 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1466 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1467 I32 delta = back_max - back_min;
1468 char *last = HOPc(strend, /* Cannot start after this */
1469 -(I32)(CHR_SVLEN(must)
1470 - (SvTAIL(must) != 0) + back_min));
1471 char *last1; /* Last position checked before */
1474 last1 = HOPc(s, -1);
1476 last1 = s - 1; /* bogus */
1478 /* XXXX check_substr already used to find `s', can optimize if
1479 check_substr==must. */
1481 dontbother = end_shift;
1482 strend = HOPc(strend, -dontbother);
1483 while ( (s <= last) &&
1484 ((flags & REXEC_SCREAM)
1485 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1486 end_shift, &scream_pos, 0))
1487 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1488 (unsigned char*)strend, must,
1489 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1490 if (HOPc(s, -back_max) > last1) {
1491 last1 = HOPc(s, -back_min);
1492 s = HOPc(s, -back_max);
1495 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1497 last1 = HOPc(s, -back_min);
1501 while (s <= last1) {
1502 if (regtry(prog, s))
1508 while (s <= last1) {
1509 if (regtry(prog, s))
1517 else if (c = prog->regstclass) {
1518 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1519 /* don't bother with what can't match */
1520 strend = HOPc(strend, -(minlen - 1));
1521 if (find_byclass(prog, c, s, strend, startpos, 0))
1526 if (prog->float_substr != Nullsv) { /* Trim the end. */
1528 I32 oldpos = scream_pos;
1530 if (flags & REXEC_SCREAM) {
1531 last = screaminstr(sv, prog->float_substr, s - strbeg,
1532 end_shift, &scream_pos, 1); /* last one */
1534 last = scream_olds; /* Only one occurence. */
1538 char *little = SvPV(prog->float_substr, len);
1540 if (SvTAIL(prog->float_substr)) {
1541 if (memEQ(strend - len + 1, little, len - 1))
1542 last = strend - len + 1;
1543 else if (!PL_multiline)
1544 last = memEQ(strend - len, little, len)
1545 ? strend - len : Nullch;
1551 last = rninstr(s, strend, little, little + len);
1553 last = strend; /* matching `$' */
1556 if (last == NULL) goto phooey; /* Should not happen! */
1557 dontbother = strend - last + prog->float_min_offset;
1559 if (minlen && (dontbother < minlen))
1560 dontbother = minlen - 1;
1561 strend -= dontbother; /* this one's always in bytes! */
1562 /* We don't know much -- general case. */
1565 if (regtry(prog, s))
1574 if (regtry(prog, s))
1576 } while (s++ < strend);
1584 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1586 if (PL_reg_eval_set) {
1587 /* Preserve the current value of $^R */
1588 if (oreplsv != GvSV(PL_replgv))
1589 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1590 restored, the value remains
1592 restore_pos(aTHXo_ 0);
1595 /* make sure $`, $&, $', and $digit will work later */
1596 if ( !(flags & REXEC_NOT_FIRST) ) {
1597 if (RX_MATCH_COPIED(prog)) {
1598 Safefree(prog->subbeg);
1599 RX_MATCH_COPIED_off(prog);
1601 if (flags & REXEC_COPY_STR) {
1602 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1604 s = savepvn(strbeg, i);
1607 RX_MATCH_COPIED_on(prog);
1610 prog->subbeg = strbeg;
1611 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1618 if (PL_reg_eval_set)
1619 restore_pos(aTHXo_ 0);
1624 - regtry - try match at specific point
1626 STATIC I32 /* 0 failure, 1 success */
1627 S_regtry(pTHX_ regexp *prog, char *startpos)
1635 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1638 PL_reg_eval_set = RS_init;
1640 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1641 (IV)(PL_stack_sp - PL_stack_base));
1643 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1644 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1645 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1647 /* Apparently this is not needed, judging by wantarray. */
1648 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1649 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1652 /* Make $_ available to executed code. */
1653 if (PL_reg_sv != DEFSV) {
1654 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1659 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1660 && (mg = mg_find(PL_reg_sv, 'g')))) {
1661 /* prepare for quick setting of pos */
1662 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1663 mg = mg_find(PL_reg_sv, 'g');
1667 PL_reg_oldpos = mg->mg_len;
1668 SAVEDESTRUCTOR_X(restore_pos, 0);
1671 New(22,PL_reg_curpm, 1, PMOP);
1672 PL_reg_curpm->op_pmregexp = prog;
1673 PL_reg_oldcurpm = PL_curpm;
1674 PL_curpm = PL_reg_curpm;
1675 if (RX_MATCH_COPIED(prog)) {
1676 /* Here is a serious problem: we cannot rewrite subbeg,
1677 since it may be needed if this match fails. Thus
1678 $` inside (?{}) could fail... */
1679 PL_reg_oldsaved = prog->subbeg;
1680 PL_reg_oldsavedlen = prog->sublen;
1681 RX_MATCH_COPIED_off(prog);
1684 PL_reg_oldsaved = Nullch;
1685 prog->subbeg = PL_bostr;
1686 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1688 prog->startp[0] = startpos - PL_bostr;
1689 PL_reginput = startpos;
1690 PL_regstartp = prog->startp;
1691 PL_regendp = prog->endp;
1692 PL_reglastparen = &prog->lastparen;
1693 prog->lastparen = 0;
1695 DEBUG_r(PL_reg_starttry = startpos);
1696 if (PL_reg_start_tmpl <= prog->nparens) {
1697 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1698 if(PL_reg_start_tmp)
1699 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1701 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1704 /* XXXX What this code is doing here?!!! There should be no need
1705 to do this again and again, PL_reglastparen should take care of
1709 if (prog->nparens) {
1710 for (i = prog->nparens; i >= 1; i--) {
1716 if (regmatch(prog->program + 1)) {
1717 prog->endp[0] = PL_reginput - PL_bostr;
1725 - regmatch - main matching routine
1727 * Conceptually the strategy is simple: check to see whether the current
1728 * node matches, call self recursively to see whether the rest matches,
1729 * and then act accordingly. In practice we make some effort to avoid
1730 * recursion, in particular by going through "ordinary" nodes (that don't
1731 * need to know whether the rest of the match failed) by a loop instead of
1734 /* [lwall] I've hoisted the register declarations to the outer block in order to
1735 * maybe save a little bit of pushing and popping on the stack. It also takes
1736 * advantage of machines that use a register save mask on subroutine entry.
1738 STATIC I32 /* 0 failure, 1 success */
1739 S_regmatch(pTHX_ regnode *prog)
1742 register regnode *scan; /* Current node. */
1743 regnode *next; /* Next node. */
1744 regnode *inner; /* Next node in internal branch. */
1745 register I32 nextchr; /* renamed nextchr - nextchar colides with
1746 function of same name */
1747 register I32 n; /* no or next */
1748 register I32 ln; /* len or last */
1749 register char *s; /* operand or save */
1750 register char *locinput = PL_reginput;
1751 register I32 c1, c2, paren; /* case fold search, parenth */
1752 int minmod = 0, sw = 0, logical = 0;
1757 /* Note that nextchr is a byte even in UTF */
1758 nextchr = UCHARAT(locinput);
1760 while (scan != NULL) {
1761 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1763 # define sayYES goto yes
1764 # define sayNO goto no
1765 # define sayYES_FINAL goto yes_final
1766 # define sayYES_LOUD goto yes_loud
1767 # define sayNO_FINAL goto no_final
1768 # define sayNO_SILENT goto do_no
1769 # define saySAME(x) if (x) goto yes; else goto no
1770 # define REPORT_CODE_OFF 24
1772 # define sayYES return 1
1773 # define sayNO return 0
1774 # define sayYES_FINAL return 1
1775 # define sayYES_LOUD return 1
1776 # define sayNO_FINAL return 0
1777 # define sayNO_SILENT return 0
1778 # define saySAME(x) return x
1781 SV *prop = sv_newmortal();
1782 int docolor = *PL_colors[0];
1783 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1784 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1785 /* The part of the string before starttry has one color
1786 (pref0_len chars), between starttry and current
1787 position another one (pref_len - pref0_len chars),
1788 after the current position the third one.
1789 We assume that pref0_len <= pref_len, otherwise we
1790 decrease pref0_len. */
1791 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1792 ? (5 + taill) - l : locinput - PL_bostr);
1793 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1795 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1796 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1797 ? (5 + taill) - pref_len : PL_regeol - locinput);
1800 if (pref0_len > pref_len)
1801 pref0_len = pref_len;
1802 regprop(prop, scan);
1803 PerlIO_printf(Perl_debug_log,
1804 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1805 (IV)(locinput - PL_bostr),
1806 PL_colors[4], pref0_len,
1807 locinput - pref_len, PL_colors[5],
1808 PL_colors[2], pref_len - pref0_len,
1809 locinput - pref_len + pref0_len, PL_colors[3],
1810 (docolor ? "" : "> <"),
1811 PL_colors[0], l, locinput, PL_colors[1],
1812 15 - l - pref_len + 1,
1814 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1818 next = scan + NEXT_OFF(scan);
1824 if (locinput == PL_bostr
1825 ? PL_regprev == '\n'
1827 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1829 /* regtill = regbol; */
1834 if (locinput == PL_bostr
1835 ? PL_regprev == '\n'
1836 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1842 if (locinput == PL_regbol && PL_regprev == '\n')
1846 if (locinput == PL_reg_ganch)
1856 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1861 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1863 if (PL_regeol - locinput > 1)
1867 if (PL_regeol != locinput)
1871 if (nextchr & 0x80) {
1872 locinput += PL_utf8skip[nextchr];
1873 if (locinput > PL_regeol)
1875 nextchr = UCHARAT(locinput);
1878 if (!nextchr && locinput >= PL_regeol)
1880 nextchr = UCHARAT(++locinput);
1883 if (!nextchr && locinput >= PL_regeol)
1885 nextchr = UCHARAT(++locinput);
1888 if (nextchr & 0x80) {
1889 locinput += PL_utf8skip[nextchr];
1890 if (locinput > PL_regeol)
1892 nextchr = UCHARAT(locinput);
1895 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1897 nextchr = UCHARAT(++locinput);
1900 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1902 nextchr = UCHARAT(++locinput);
1907 /* Inline the first character, for speed. */
1908 if (UCHARAT(s) != nextchr)
1910 if (PL_regeol - locinput < ln)
1912 if (ln > 1 && memNE(s, locinput, ln))
1915 nextchr = UCHARAT(locinput);
1918 PL_reg_flags |= RF_tainted;
1927 c1 = OP(scan) == EXACTF;
1931 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1932 toLOWER_utf8((U8*)l) :
1933 toLOWER_LC_utf8((U8*)l)))
1941 nextchr = UCHARAT(locinput);
1945 /* Inline the first character, for speed. */
1946 if (UCHARAT(s) != nextchr &&
1947 UCHARAT(s) != ((OP(scan) == EXACTF)
1948 ? PL_fold : PL_fold_locale)[nextchr])
1950 if (PL_regeol - locinput < ln)
1952 if (ln > 1 && (OP(scan) == EXACTF
1953 ? ibcmp(s, locinput, ln)
1954 : ibcmp_locale(s, locinput, ln)))
1957 nextchr = UCHARAT(locinput);
1960 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1962 if (locinput >= PL_regeol)
1964 locinput += PL_utf8skip[nextchr];
1965 nextchr = UCHARAT(locinput);
1969 nextchr = UCHARAT(locinput);
1970 if (!REGINCLASS(scan, nextchr))
1972 if (!nextchr && locinput >= PL_regeol)
1974 nextchr = UCHARAT(++locinput);
1977 PL_reg_flags |= RF_tainted;
1982 if (!(OP(scan) == ALNUM
1983 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1985 nextchr = UCHARAT(++locinput);
1988 PL_reg_flags |= RF_tainted;
1993 if (nextchr & 0x80) {
1994 if (!(OP(scan) == ALNUMUTF8
1995 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1996 : isALNUM_LC_utf8((U8*)locinput)))
2000 locinput += PL_utf8skip[nextchr];
2001 nextchr = UCHARAT(locinput);
2004 if (!(OP(scan) == ALNUMUTF8
2005 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2007 nextchr = UCHARAT(++locinput);
2010 PL_reg_flags |= RF_tainted;
2013 if (!nextchr && locinput >= PL_regeol)
2015 if (OP(scan) == NALNUM
2016 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2018 nextchr = UCHARAT(++locinput);
2021 PL_reg_flags |= RF_tainted;
2024 if (!nextchr && locinput >= PL_regeol)
2026 if (nextchr & 0x80) {
2027 if (OP(scan) == NALNUMUTF8
2028 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2029 : isALNUM_LC_utf8((U8*)locinput))
2033 locinput += PL_utf8skip[nextchr];
2034 nextchr = UCHARAT(locinput);
2037 if (OP(scan) == NALNUMUTF8
2038 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2040 nextchr = UCHARAT(++locinput);
2044 PL_reg_flags |= RF_tainted;
2048 /* was last char in word? */
2049 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2050 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2052 n = isALNUM(nextchr);
2055 ln = isALNUM_LC(ln);
2056 n = isALNUM_LC(nextchr);
2058 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2063 PL_reg_flags |= RF_tainted;
2067 /* was last char in word? */
2068 ln = (locinput != PL_regbol)
2069 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2070 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2071 ln = isALNUM_uni(ln);
2072 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2075 ln = isALNUM_LC_uni(ln);
2076 n = isALNUM_LC_utf8((U8*)locinput);
2078 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2082 PL_reg_flags |= RF_tainted;
2085 if (!nextchr && locinput >= PL_regeol)
2087 if (!(OP(scan) == SPACE
2088 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2090 nextchr = UCHARAT(++locinput);
2093 PL_reg_flags |= RF_tainted;
2096 if (!nextchr && locinput >= PL_regeol)
2098 if (nextchr & 0x80) {
2099 if (!(OP(scan) == SPACEUTF8
2100 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2101 : isSPACE_LC_utf8((U8*)locinput)))
2105 locinput += PL_utf8skip[nextchr];
2106 nextchr = UCHARAT(locinput);
2109 if (!(OP(scan) == SPACEUTF8
2110 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2112 nextchr = UCHARAT(++locinput);
2115 PL_reg_flags |= RF_tainted;
2120 if (OP(scan) == SPACE
2121 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2123 nextchr = UCHARAT(++locinput);
2126 PL_reg_flags |= RF_tainted;
2131 if (nextchr & 0x80) {
2132 if (OP(scan) == NSPACEUTF8
2133 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2134 : isSPACE_LC_utf8((U8*)locinput))
2138 locinput += PL_utf8skip[nextchr];
2139 nextchr = UCHARAT(locinput);
2142 if (OP(scan) == NSPACEUTF8
2143 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2145 nextchr = UCHARAT(++locinput);
2148 PL_reg_flags |= RF_tainted;
2151 if (!nextchr && locinput >= PL_regeol)
2153 if (!(OP(scan) == DIGIT
2154 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2156 nextchr = UCHARAT(++locinput);
2159 PL_reg_flags |= RF_tainted;
2164 if (nextchr & 0x80) {
2165 if (OP(scan) == NDIGITUTF8
2166 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2167 : isDIGIT_LC_utf8((U8*)locinput))
2171 locinput += PL_utf8skip[nextchr];
2172 nextchr = UCHARAT(locinput);
2175 if (!isDIGIT(nextchr))
2177 nextchr = UCHARAT(++locinput);
2180 PL_reg_flags |= RF_tainted;
2185 if (OP(scan) == DIGIT
2186 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2188 nextchr = UCHARAT(++locinput);
2191 PL_reg_flags |= RF_tainted;
2194 if (!nextchr && locinput >= PL_regeol)
2196 if (nextchr & 0x80) {
2197 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2199 locinput += PL_utf8skip[nextchr];
2200 nextchr = UCHARAT(locinput);
2203 if (isDIGIT(nextchr))
2205 nextchr = UCHARAT(++locinput);
2208 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2210 locinput += PL_utf8skip[nextchr];
2211 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2212 locinput += UTF8SKIP(locinput);
2213 if (locinput > PL_regeol)
2215 nextchr = UCHARAT(locinput);
2218 PL_reg_flags |= RF_tainted;
2222 n = ARG(scan); /* which paren pair */
2223 ln = PL_regstartp[n];
2224 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2225 if (*PL_reglastparen < n || ln == -1)
2226 sayNO; /* Do not match unless seen CLOSEn. */
2227 if (ln == PL_regendp[n])
2231 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2233 char *e = PL_bostr + PL_regendp[n];
2235 * Note that we can't do the "other character" lookup trick as
2236 * in the 8-bit case (no pun intended) because in Unicode we
2237 * have to map both upper and title case to lower case.
2239 if (OP(scan) == REFF) {
2243 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2253 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2260 nextchr = UCHARAT(locinput);
2264 /* Inline the first character, for speed. */
2265 if (UCHARAT(s) != nextchr &&
2267 (UCHARAT(s) != ((OP(scan) == REFF
2268 ? PL_fold : PL_fold_locale)[nextchr]))))
2270 ln = PL_regendp[n] - ln;
2271 if (locinput + ln > PL_regeol)
2273 if (ln > 1 && (OP(scan) == REF
2274 ? memNE(s, locinput, ln)
2276 ? ibcmp(s, locinput, ln)
2277 : ibcmp_locale(s, locinput, ln))))
2280 nextchr = UCHARAT(locinput);
2291 OP_4tree *oop = PL_op;
2292 COP *ocurcop = PL_curcop;
2293 SV **ocurpad = PL_curpad;
2297 PL_op = (OP_4tree*)PL_regdata->data[n];
2298 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2299 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2300 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2302 CALLRUNOPS(aTHX); /* Scalar context. */
2308 PL_curpad = ocurpad;
2309 PL_curcop = ocurcop;
2311 if (logical == 2) { /* Postponed subexpression. */
2313 MAGIC *mg = Null(MAGIC*);
2315 CHECKPOINT cp, lastcp;
2317 if(SvROK(ret) || SvRMAGICAL(ret)) {
2318 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2321 mg = mg_find(sv, 'r');
2324 re = (regexp *)mg->mg_obj;
2325 (void)ReREFCNT_inc(re);
2329 char *t = SvPV(ret, len);
2331 char *oprecomp = PL_regprecomp;
2332 I32 osize = PL_regsize;
2333 I32 onpar = PL_regnpar;
2336 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2338 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2339 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2340 PL_regprecomp = oprecomp;
2345 PerlIO_printf(Perl_debug_log,
2346 "Entering embedded `%s%.60s%s%s'\n",
2350 (strlen(re->precomp) > 60 ? "..." : ""))
2353 state.prev = PL_reg_call_cc;
2354 state.cc = PL_regcc;
2355 state.re = PL_reg_re;
2359 cp = regcppush(0); /* Save *all* the positions. */
2362 state.ss = PL_savestack_ix;
2363 *PL_reglastparen = 0;
2364 PL_reg_call_cc = &state;
2365 PL_reginput = locinput;
2367 /* XXXX This is too dramatic a measure... */
2370 if (regmatch(re->program + 1)) {
2371 /* Even though we succeeded, we need to restore
2372 global variables, since we may be wrapped inside
2373 SUSPEND, thus the match may be not finished yet. */
2375 /* XXXX Do this only if SUSPENDed? */
2376 PL_reg_call_cc = state.prev;
2377 PL_regcc = state.cc;
2378 PL_reg_re = state.re;
2379 cache_re(PL_reg_re);
2381 /* XXXX This is too dramatic a measure... */
2384 /* These are needed even if not SUSPEND. */
2392 PL_reg_call_cc = state.prev;
2393 PL_regcc = state.cc;
2394 PL_reg_re = state.re;
2395 cache_re(PL_reg_re);
2397 /* XXXX This is too dramatic a measure... */
2406 sv_setsv(save_scalar(PL_replgv), ret);
2410 n = ARG(scan); /* which paren pair */
2411 PL_reg_start_tmp[n] = locinput;
2416 n = ARG(scan); /* which paren pair */
2417 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2418 PL_regendp[n] = locinput - PL_bostr;
2419 if (n > *PL_reglastparen)
2420 *PL_reglastparen = n;
2423 n = ARG(scan); /* which paren pair */
2424 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2427 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2429 next = NEXTOPER(NEXTOPER(scan));
2431 next = scan + ARG(scan);
2432 if (OP(next) == IFTHEN) /* Fake one. */
2433 next = NEXTOPER(NEXTOPER(next));
2437 logical = scan->flags;
2439 /*******************************************************************
2440 PL_regcc contains infoblock about the innermost (...)* loop, and
2441 a pointer to the next outer infoblock.
2443 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2445 1) After matching X, regnode for CURLYX is processed;
2447 2) This regnode creates infoblock on the stack, and calls
2448 regmatch() recursively with the starting point at WHILEM node;
2450 3) Each hit of WHILEM node tries to match A and Z (in the order
2451 depending on the current iteration, min/max of {min,max} and
2452 greediness). The information about where are nodes for "A"
2453 and "Z" is read from the infoblock, as is info on how many times "A"
2454 was already matched, and greediness.
2456 4) After A matches, the same WHILEM node is hit again.
2458 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2459 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2460 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2461 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2462 of the external loop.
2464 Currently present infoblocks form a tree with a stem formed by PL_curcc
2465 and whatever it mentions via ->next, and additional attached trees
2466 corresponding to temporarily unset infoblocks as in "5" above.
2468 In the following picture infoblocks for outer loop of
2469 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2470 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2471 infoblocks are drawn below the "reset" infoblock.
2473 In fact in the picture below we do not show failed matches for Z and T
2474 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2475 more obvious *why* one needs to *temporary* unset infoblocks.]
2477 Matched REx position InfoBlocks Comment
2481 Y A)*?Z)*?T x <- O <- I
2482 YA )*?Z)*?T x <- O <- I
2483 YA A)*?Z)*?T x <- O <- I
2484 YAA )*?Z)*?T x <- O <- I
2485 YAA Z)*?T x <- O # Temporary unset I
2488 YAAZ Y(A)*?Z)*?T x <- O
2491 YAAZY (A)*?Z)*?T x <- O
2494 YAAZY A)*?Z)*?T x <- O <- I
2497 YAAZYA )*?Z)*?T x <- O <- I
2500 YAAZYA Z)*?T x <- O # Temporary unset I
2506 YAAZYAZ T x # Temporary unset O
2513 *******************************************************************/
2516 CHECKPOINT cp = PL_savestack_ix;
2518 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2520 cc.oldcc = PL_regcc;
2522 cc.parenfloor = *PL_reglastparen;
2524 cc.min = ARG1(scan);
2525 cc.max = ARG2(scan);
2526 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2530 PL_reginput = locinput;
2531 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2533 PL_regcc = cc.oldcc;
2539 * This is really hard to understand, because after we match
2540 * what we're trying to match, we must make sure the rest of
2541 * the REx is going to match for sure, and to do that we have
2542 * to go back UP the parse tree by recursing ever deeper. And
2543 * if it fails, we have to reset our parent's current state
2544 * that we can try again after backing off.
2547 CHECKPOINT cp, lastcp;
2548 CURCUR* cc = PL_regcc;
2549 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2551 n = cc->cur + 1; /* how many we know we matched */
2552 PL_reginput = locinput;
2555 PerlIO_printf(Perl_debug_log,
2556 "%*s %ld out of %ld..%ld cc=%lx\n",
2557 REPORT_CODE_OFF+PL_regindent*2, "",
2558 (long)n, (long)cc->min,
2559 (long)cc->max, (long)cc)
2562 /* If degenerate scan matches "", assume scan done. */
2564 if (locinput == cc->lastloc && n >= cc->min) {
2565 PL_regcc = cc->oldcc;
2569 PerlIO_printf(Perl_debug_log,
2570 "%*s empty match detected, try continuation...\n",
2571 REPORT_CODE_OFF+PL_regindent*2, "")
2573 if (regmatch(cc->next))
2581 /* First just match a string of min scans. */
2585 cc->lastloc = locinput;
2586 if (regmatch(cc->scan))
2589 cc->lastloc = lastloc;
2594 /* Check whether we already were at this position.
2595 Postpone detection until we know the match is not
2596 *that* much linear. */
2597 if (!PL_reg_maxiter) {
2598 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2599 PL_reg_leftiter = PL_reg_maxiter;
2601 if (PL_reg_leftiter-- == 0) {
2602 I32 size = (PL_reg_maxiter + 7)/8;
2603 if (PL_reg_poscache) {
2604 if (PL_reg_poscache_size < size) {
2605 Renew(PL_reg_poscache, size, char);
2606 PL_reg_poscache_size = size;
2608 Zero(PL_reg_poscache, size, char);
2611 PL_reg_poscache_size = size;
2612 Newz(29, PL_reg_poscache, size, char);
2615 PerlIO_printf(Perl_debug_log,
2616 "%sDetected a super-linear match, switching on caching%s...\n",
2617 PL_colors[4], PL_colors[5])
2620 if (PL_reg_leftiter < 0) {
2621 I32 o = locinput - PL_bostr, b;
2623 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2626 if (PL_reg_poscache[o] & (1<<b)) {
2628 PerlIO_printf(Perl_debug_log,
2629 "%*s already tried at this position...\n",
2630 REPORT_CODE_OFF+PL_regindent*2, "")
2634 PL_reg_poscache[o] |= (1<<b);
2638 /* Prefer next over scan for minimal matching. */
2641 PL_regcc = cc->oldcc;
2644 cp = regcppush(cc->parenfloor);
2646 if (regmatch(cc->next)) {
2648 sayYES; /* All done. */
2656 if (n >= cc->max) { /* Maximum greed exceeded? */
2657 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2658 && !(PL_reg_flags & RF_warned)) {
2659 PL_reg_flags |= RF_warned;
2660 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2661 "Complex regular subexpression recursion",
2668 PerlIO_printf(Perl_debug_log,
2669 "%*s trying longer...\n",
2670 REPORT_CODE_OFF+PL_regindent*2, "")
2672 /* Try scanning more and see if it helps. */
2673 PL_reginput = locinput;
2675 cc->lastloc = locinput;
2676 cp = regcppush(cc->parenfloor);
2678 if (regmatch(cc->scan)) {
2685 cc->lastloc = lastloc;
2689 /* Prefer scan over next for maximal matching. */
2691 if (n < cc->max) { /* More greed allowed? */
2692 cp = regcppush(cc->parenfloor);
2694 cc->lastloc = locinput;
2696 if (regmatch(cc->scan)) {
2701 regcppop(); /* Restore some previous $<digit>s? */
2702 PL_reginput = locinput;
2704 PerlIO_printf(Perl_debug_log,
2705 "%*s failed, try continuation...\n",
2706 REPORT_CODE_OFF+PL_regindent*2, "")
2709 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2710 && !(PL_reg_flags & RF_warned)) {
2711 PL_reg_flags |= RF_warned;
2712 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2713 "Complex regular subexpression recursion",
2717 /* Failed deeper matches of scan, so see if this one works. */
2718 PL_regcc = cc->oldcc;
2721 if (regmatch(cc->next))
2727 cc->lastloc = lastloc;
2732 next = scan + ARG(scan);
2735 inner = NEXTOPER(NEXTOPER(scan));
2738 inner = NEXTOPER(scan);
2743 if (OP(next) != c1) /* No choice. */
2744 next = inner; /* Avoid recursion. */
2746 int lastparen = *PL_reglastparen;
2750 PL_reginput = locinput;
2751 if (regmatch(inner))
2754 for (n = *PL_reglastparen; n > lastparen; n--)
2756 *PL_reglastparen = n;
2759 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2763 inner = NEXTOPER(scan);
2764 if (c1 == BRANCHJ) {
2765 inner = NEXTOPER(inner);
2767 } while (scan != NULL && OP(scan) == c1);
2781 /* We suppose that the next guy does not need
2782 backtracking: in particular, it is of constant length,
2783 and has no parenths to influence future backrefs. */
2784 ln = ARG1(scan); /* min to match */
2785 n = ARG2(scan); /* max to match */
2786 paren = scan->flags;
2788 if (paren > PL_regsize)
2790 if (paren > *PL_reglastparen)
2791 *PL_reglastparen = paren;
2793 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2795 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2796 PL_reginput = locinput;
2799 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2801 if (ln && l == 0 && n >= ln
2802 /* In fact, this is tricky. If paren, then the
2803 fact that we did/didnot match may influence
2804 future execution. */
2805 && !(paren && ln == 0))
2807 locinput = PL_reginput;
2808 if (PL_regkind[(U8)OP(next)] == EXACT) {
2809 c1 = (U8)*STRING(next);
2810 if (OP(next) == EXACTF)
2812 else if (OP(next) == EXACTFL)
2813 c2 = PL_fold_locale[c1];
2820 /* This may be improved if l == 0. */
2821 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2822 /* If it could work, try it. */
2824 UCHARAT(PL_reginput) == c1 ||
2825 UCHARAT(PL_reginput) == c2)
2829 PL_regstartp[paren] =
2830 HOPc(PL_reginput, -l) - PL_bostr;
2831 PL_regendp[paren] = PL_reginput - PL_bostr;
2834 PL_regendp[paren] = -1;
2840 /* Couldn't or didn't -- move forward. */
2841 PL_reginput = locinput;
2842 if (regrepeat_hard(scan, 1, &l)) {
2844 locinput = PL_reginput;
2851 n = regrepeat_hard(scan, n, &l);
2852 if (n != 0 && l == 0
2853 /* In fact, this is tricky. If paren, then the
2854 fact that we did/didnot match may influence
2855 future execution. */
2856 && !(paren && ln == 0))
2858 locinput = PL_reginput;
2860 PerlIO_printf(Perl_debug_log,
2861 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2862 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2866 if (PL_regkind[(U8)OP(next)] == EXACT) {
2867 c1 = (U8)*STRING(next);
2868 if (OP(next) == EXACTF)
2870 else if (OP(next) == EXACTFL)
2871 c2 = PL_fold_locale[c1];
2880 /* If it could work, try it. */
2882 UCHARAT(PL_reginput) == c1 ||
2883 UCHARAT(PL_reginput) == c2)
2886 PerlIO_printf(Perl_debug_log,
2887 "%*s trying tail with n=%"IVdf"...\n",
2888 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2892 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2893 PL_regendp[paren] = PL_reginput - PL_bostr;
2896 PL_regendp[paren] = -1;
2902 /* Couldn't or didn't -- back up. */
2904 locinput = HOPc(locinput, -l);
2905 PL_reginput = locinput;
2912 paren = scan->flags; /* Which paren to set */
2913 if (paren > PL_regsize)
2915 if (paren > *PL_reglastparen)
2916 *PL_reglastparen = paren;
2917 ln = ARG1(scan); /* min to match */
2918 n = ARG2(scan); /* max to match */
2919 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2923 ln = ARG1(scan); /* min to match */
2924 n = ARG2(scan); /* max to match */
2925 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2930 scan = NEXTOPER(scan);
2936 scan = NEXTOPER(scan);
2940 * Lookahead to avoid useless match attempts
2941 * when we know what character comes next.
2943 if (PL_regkind[(U8)OP(next)] == EXACT) {
2944 c1 = (U8)*STRING(next);
2945 if (OP(next) == EXACTF)
2947 else if (OP(next) == EXACTFL)
2948 c2 = PL_fold_locale[c1];
2954 PL_reginput = locinput;
2958 if (ln && regrepeat(scan, ln) < ln)
2960 locinput = PL_reginput;
2963 char *e = locinput + n - ln; /* Should not check after this */
2964 char *old = locinput;
2966 if (e >= PL_regeol || (n == REG_INFTY))
2969 /* Find place 'next' could work */
2971 while (locinput <= e && *locinput != c1)
2974 while (locinput <= e
2981 /* PL_reginput == old now */
2982 if (locinput != old) {
2983 ln = 1; /* Did some */
2984 if (regrepeat(scan, locinput - old) <
2988 /* PL_reginput == locinput now */
2991 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2992 PL_regendp[paren] = locinput - PL_bostr;
2995 PL_regendp[paren] = -1;
2999 PL_reginput = locinput; /* Could be reset... */
3001 /* Couldn't or didn't -- move forward. */
3006 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3007 /* If it could work, try it. */
3009 UCHARAT(PL_reginput) == c1 ||
3010 UCHARAT(PL_reginput) == c2)
3014 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3015 PL_regendp[paren] = PL_reginput - PL_bostr;
3018 PL_regendp[paren] = -1;
3024 /* Couldn't or didn't -- move forward. */
3025 PL_reginput = locinput;
3026 if (regrepeat(scan, 1)) {
3028 locinput = PL_reginput;
3036 n = regrepeat(scan, n);
3037 locinput = PL_reginput;
3038 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3039 (!PL_multiline || OP(next) == SEOL))
3040 ln = n; /* why back off? */
3044 /* If it could work, try it. */
3046 UCHARAT(PL_reginput) == c1 ||
3047 UCHARAT(PL_reginput) == c2)
3051 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3052 PL_regendp[paren] = PL_reginput - PL_bostr;
3055 PL_regendp[paren] = -1;
3061 /* Couldn't or didn't -- back up. */
3063 PL_reginput = locinput = HOPc(locinput, -1);
3068 /* If it could work, try it. */
3070 UCHARAT(PL_reginput) == c1 ||
3071 UCHARAT(PL_reginput) == c2)
3077 /* Couldn't or didn't -- back up. */
3079 PL_reginput = locinput = HOPc(locinput, -1);
3086 if (PL_reg_call_cc) {
3087 re_cc_state *cur_call_cc = PL_reg_call_cc;
3088 CURCUR *cctmp = PL_regcc;
3089 regexp *re = PL_reg_re;
3090 CHECKPOINT cp, lastcp;
3092 cp = regcppush(0); /* Save *all* the positions. */
3094 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3096 PL_reginput = locinput; /* Make position available to
3098 cache_re(PL_reg_call_cc->re);
3099 PL_regcc = PL_reg_call_cc->cc;
3100 PL_reg_call_cc = PL_reg_call_cc->prev;
3101 if (regmatch(cur_call_cc->node)) {
3102 PL_reg_call_cc = cur_call_cc;
3108 PL_reg_call_cc = cur_call_cc;
3114 PerlIO_printf(Perl_debug_log,
3115 "%*s continuation failed...\n",
3116 REPORT_CODE_OFF+PL_regindent*2, "")
3120 if (locinput < PL_regtill) {
3121 DEBUG_r(PerlIO_printf(Perl_debug_log,
3122 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3124 (long)(locinput - PL_reg_starttry),
3125 (long)(PL_regtill - PL_reg_starttry),
3127 sayNO_FINAL; /* Cannot match: too short. */
3129 PL_reginput = locinput; /* put where regtry can find it */
3130 sayYES_FINAL; /* Success! */
3132 PL_reginput = locinput; /* put where regtry can find it */
3133 sayYES_LOUD; /* Success! */
3136 PL_reginput = locinput;
3141 if (UTF) { /* XXXX This is absolutely
3142 broken, we read before
3144 s = HOPMAYBEc(locinput, -scan->flags);
3150 if (locinput < PL_bostr + scan->flags)
3152 PL_reginput = locinput - scan->flags;
3157 PL_reginput = locinput;
3162 if (UTF) { /* XXXX This is absolutely
3163 broken, we read before
3165 s = HOPMAYBEc(locinput, -scan->flags);
3166 if (!s || s < PL_bostr)
3171 if (locinput < PL_bostr + scan->flags)
3173 PL_reginput = locinput - scan->flags;
3178 PL_reginput = locinput;
3181 inner = NEXTOPER(NEXTOPER(scan));
3182 if (regmatch(inner) != n) {
3197 if (OP(scan) == SUSPEND) {
3198 locinput = PL_reginput;
3199 nextchr = UCHARAT(locinput);
3204 next = scan + ARG(scan);
3209 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3210 PTR2UV(scan), OP(scan));
3211 Perl_croak(aTHX_ "regexp memory corruption");
3217 * We get here only if there's trouble -- normally "case END" is
3218 * the terminating point.
3220 Perl_croak(aTHX_ "corrupted regexp pointers");
3226 PerlIO_printf(Perl_debug_log,
3227 "%*s %scould match...%s\n",
3228 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3232 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3233 PL_colors[4],PL_colors[5]));
3242 PerlIO_printf(Perl_debug_log,
3243 "%*s %sfailed...%s\n",
3244 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3256 - regrepeat - repeatedly match something simple, report how many
3259 * [This routine now assumes that it will only match on things of length 1.
3260 * That was true before, but now we assume scan - reginput is the count,
3261 * rather than incrementing count on every character. [Er, except utf8.]]
3264 S_regrepeat(pTHX_ regnode *p, I32 max)
3267 register char *scan;
3269 register char *loceol = PL_regeol;
3270 register I32 hardcount = 0;
3273 if (max != REG_INFTY && max < loceol - scan)
3274 loceol = scan + max;
3277 while (scan < loceol && *scan != '\n')
3285 while (scan < loceol && *scan != '\n') {
3286 scan += UTF8SKIP(scan);
3292 while (scan < loceol) {
3293 scan += UTF8SKIP(scan);
3297 case EXACT: /* length of string is 1 */
3299 while (scan < loceol && UCHARAT(scan) == c)
3302 case EXACTF: /* length of string is 1 */
3304 while (scan < loceol &&
3305 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3308 case EXACTFL: /* length of string is 1 */
3309 PL_reg_flags |= RF_tainted;
3311 while (scan < loceol &&
3312 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3317 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3318 scan += UTF8SKIP(scan);
3323 while (scan < loceol && REGINCLASS(p, *scan))
3327 while (scan < loceol && isALNUM(*scan))
3332 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3333 scan += UTF8SKIP(scan);
3338 PL_reg_flags |= RF_tainted;
3339 while (scan < loceol && isALNUM_LC(*scan))
3343 PL_reg_flags |= RF_tainted;
3345 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3346 scan += UTF8SKIP(scan);
3352 while (scan < loceol && !isALNUM(*scan))
3357 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3358 scan += UTF8SKIP(scan);
3363 PL_reg_flags |= RF_tainted;
3364 while (scan < loceol && !isALNUM_LC(*scan))
3368 PL_reg_flags |= RF_tainted;
3370 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3371 scan += UTF8SKIP(scan);
3376 while (scan < loceol && isSPACE(*scan))
3381 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3382 scan += UTF8SKIP(scan);
3387 PL_reg_flags |= RF_tainted;
3388 while (scan < loceol && isSPACE_LC(*scan))
3392 PL_reg_flags |= RF_tainted;
3394 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3395 scan += UTF8SKIP(scan);
3400 while (scan < loceol && !isSPACE(*scan))
3405 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3406 scan += UTF8SKIP(scan);
3411 PL_reg_flags |= RF_tainted;
3412 while (scan < loceol && !isSPACE_LC(*scan))
3416 PL_reg_flags |= RF_tainted;
3418 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3419 scan += UTF8SKIP(scan);
3424 while (scan < loceol && isDIGIT(*scan))
3429 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3430 scan += UTF8SKIP(scan);
3436 while (scan < loceol && !isDIGIT(*scan))
3441 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3442 scan += UTF8SKIP(scan);
3446 default: /* Called on something of 0 width. */
3447 break; /* So match right here or not at all. */
3453 c = scan - PL_reginput;
3458 SV *prop = sv_newmortal();
3461 PerlIO_printf(Perl_debug_log,
3462 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3463 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3470 - regrepeat_hard - repeatedly match something, report total lenth and length
3472 * The repeater is supposed to have constant length.
3476 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3479 register char *scan;
3480 register char *start;
3481 register char *loceol = PL_regeol;
3483 I32 count = 0, res = 1;
3488 start = PL_reginput;
3490 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3493 while (start < PL_reginput) {
3495 start += UTF8SKIP(start);
3506 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3508 *lp = l = PL_reginput - start;
3509 if (max != REG_INFTY && l*max < loceol - scan)
3510 loceol = scan + l*max;
3523 - reginclass - determine if a character falls into a character class
3527 S_reginclass(pTHX_ register regnode *p, register I32 c)
3530 char flags = ANYOF_FLAGS(p);
3534 if (ANYOF_BITMAP_TEST(p, c))
3536 else if (flags & ANYOF_FOLD) {
3538 if (flags & ANYOF_LOCALE) {
3539 PL_reg_flags |= RF_tainted;
3540 cf = PL_fold_locale[c];
3544 if (ANYOF_BITMAP_TEST(p, cf))
3548 if (!match && (flags & ANYOF_CLASS)) {
3549 PL_reg_flags |= RF_tainted;
3551 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3552 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3553 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3554 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3555 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3556 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3557 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3558 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3559 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3560 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3561 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3562 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3563 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3564 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3565 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3566 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3567 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3568 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3569 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3570 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3571 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3572 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3573 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3574 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3575 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3576 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3577 ) /* How's that for a conditional? */
3583 return (flags & ANYOF_INVERT) ? !match : match;
3587 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3590 char flags = ARG1(f);
3592 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3594 if (swash_fetch(sv, p))
3596 else if (flags & ANYOF_FOLD) {
3599 if (flags & ANYOF_LOCALE) {
3600 PL_reg_flags |= RF_tainted;
3601 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3604 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3605 if (swash_fetch(sv, tmpbuf))
3609 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3611 return (flags & ANYOF_INVERT) ? !match : match;
3615 S_reghop(pTHX_ U8 *s, I32 off)
3619 while (off-- && s < (U8*)PL_regeol)
3624 if (s > (U8*)PL_bostr) {
3627 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3629 } /* XXX could check well-formedness here */
3637 S_reghopmaybe(pTHX_ U8* s, I32 off)
3641 while (off-- && s < (U8*)PL_regeol)
3648 if (s > (U8*)PL_bostr) {
3651 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3653 } /* XXX could check well-formedness here */
3669 restore_pos(pTHXo_ void *arg)
3672 if (PL_reg_eval_set) {
3673 if (PL_reg_oldsaved) {
3674 PL_reg_re->subbeg = PL_reg_oldsaved;
3675 PL_reg_re->sublen = PL_reg_oldsavedlen;
3676 RX_MATCH_COPIED_on(PL_reg_re);
3678 PL_reg_magic->mg_len = PL_reg_oldpos;
3679 PL_reg_eval_set = 0;
3680 PL_curpm = PL_reg_oldcurpm;