5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 && (sv && (strpos + SvCUR(sv) != strend)) ) {
350 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
353 PL_regeol = strend; /* Used in HOP() */
354 s = HOPc(strpos, prog->check_offset_min);
356 slen = SvCUR(check); /* >= 1 */
358 if ( strend - s > slen || strend - s < slen - 1
359 || (strend - s == slen && strend[-1] != '\n')) {
360 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
363 /* Now should match s[0..slen-2] */
365 if (slen && (*SvPVX(check) != *s
367 && memNE(SvPVX(check), s, slen)))) {
369 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
373 else if (*SvPVX(check) != *s
374 || ((slen = SvCUR(check)) > 1
375 && memNE(SvPVX(check), s, slen)))
377 goto success_at_start;
379 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
381 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
382 end_shift = prog->minlen - start_shift -
383 CHR_SVLEN(check) + (SvTAIL(check) != 0);
385 I32 end = prog->check_offset_max + CHR_SVLEN(check)
386 - (SvTAIL(check) != 0);
387 I32 eshift = strend - s - end;
389 if (end_shift < eshift)
393 else { /* Can match at random position */
396 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
397 /* Should be nonnegative! */
398 end_shift = prog->minlen - start_shift -
399 CHR_SVLEN(check) + (SvTAIL(check) != 0);
402 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
404 Perl_croak(aTHX_ "panic: end_shift");
408 /* Find a possible match in the region s..strend by looking for
409 the "check" substring in the region corrected by start/end_shift. */
410 if (flags & REXEC_SCREAM) {
411 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
412 I32 p = -1; /* Internal iterator of scream. */
413 I32 *pp = data ? data->scream_pos : &p;
415 if (PL_screamfirst[BmRARE(check)] >= 0
416 || ( BmRARE(check) == '\n'
417 && (BmPREVIOUS(check) == SvCUR(check) - 1)
419 s = screaminstr(sv, check,
420 start_shift + (s - strbeg), end_shift, pp, 0);
424 *data->scream_olds = s;
427 s = fbm_instr((unsigned char*)s + start_shift,
428 (unsigned char*)strend - end_shift,
429 check, PL_multiline ? FBMrf_MULTILINE : 0);
431 /* Update the count-of-usability, remove useless subpatterns,
434 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
435 (s ? "Found" : "Did not find"),
436 ((check == prog->anchored_substr) ? "anchored" : "floating"),
438 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
440 PL_colors[1], (SvTAIL(check) ? "$" : ""),
441 (s ? " at offset " : "...\n") ) );
448 /* Finish the diagnostic message */
449 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
451 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
452 Start with the other substr.
453 XXXX no SCREAM optimization yet - and a very coarse implementation
454 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
455 *always* match. Probably should be marked during compile...
456 Probably it is right to do no SCREAM here...
459 if (prog->float_substr && prog->anchored_substr) {
460 /* Take into account the "other" substring. */
461 /* XXXX May be hopelessly wrong for UTF... */
464 if (check == prog->float_substr) {
467 char *last = s - start_shift, *last1, *last2;
471 t = s - prog->check_offset_max;
472 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
473 && (!(prog->reganch & ROPT_UTF8)
474 || (PL_bostr = strpos, /* Used in regcopmaybe() */
475 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
480 t += prog->anchored_offset;
481 if (t < other_last) /* These positions already checked */
484 last2 = last1 = strend - prog->minlen;
487 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
488 /* On end-of-str: see comment below. */
489 s = fbm_instr((unsigned char*)t,
490 (unsigned char*)last1 + prog->anchored_offset
491 + SvCUR(prog->anchored_substr)
492 - (SvTAIL(prog->anchored_substr)!=0),
493 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
494 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
495 (s ? "Found" : "Contradicts"),
497 (int)(SvCUR(prog->anchored_substr)
498 - (SvTAIL(prog->anchored_substr)!=0)),
499 SvPVX(prog->anchored_substr),
500 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
502 if (last1 >= last2) {
503 DEBUG_r(PerlIO_printf(Perl_debug_log,
504 ", giving up...\n"));
507 DEBUG_r(PerlIO_printf(Perl_debug_log,
508 ", trying floating at offset %ld...\n",
509 (long)(s1 + 1 - i_strpos)));
510 PL_regeol = strend; /* Used in HOP() */
511 other_last = last1 + prog->anchored_offset + 1;
516 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
517 (long)(s - i_strpos)));
518 t = s - prog->anchored_offset;
527 else { /* Take into account the floating substring. */
532 last1 = last = strend - prog->minlen + prog->float_min_offset;
533 if (last - t > prog->float_max_offset)
534 last = t + prog->float_max_offset;
535 s = t + prog->float_min_offset;
538 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
539 /* fbm_instr() takes into account exact value of end-of-str
540 if the check is SvTAIL(ed). Since false positives are OK,
541 and end-of-str is not later than strend we are OK. */
542 s = fbm_instr((unsigned char*)s,
543 (unsigned char*)last + SvCUR(prog->float_substr)
544 - (SvTAIL(prog->float_substr)!=0),
545 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
546 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
547 (s ? "Found" : "Contradicts"),
549 (int)(SvCUR(prog->float_substr)
550 - (SvTAIL(prog->float_substr)!=0)),
551 SvPVX(prog->float_substr),
552 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
555 DEBUG_r(PerlIO_printf(Perl_debug_log,
556 ", giving up...\n"));
559 DEBUG_r(PerlIO_printf(Perl_debug_log,
560 ", trying anchored starting at offset %ld...\n",
561 (long)(s1 + 1 - i_strpos)));
562 other_last = last + 1;
563 PL_regeol = strend; /* Used in HOP() */
568 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
569 (long)(s - i_strpos)));
579 t = s - prog->check_offset_max;
581 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
582 && (!(prog->reganch & ROPT_UTF8)
583 || (PL_bostr = strpos, /* Used in regcopmaybe() */
584 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
587 /* Fixed substring is found far enough so that the match
588 cannot start at strpos. */
590 if (ml_anch && t[-1] != '\n') {
591 /* Eventually fbm_*() should handle this, but often
592 anchored_offset is not 0, so this check will not be wasted. */
593 /* XXXX In the code below we prefer to look for "^" even in
594 presence of anchored substrings. And we search even
595 beyond the found float position. These pessimizations
596 are historical artefacts only. */
598 while (t < strend - prog->minlen) {
600 if (t < s - prog->check_offset_min) {
601 if (prog->anchored_substr) {
602 /* We definitely contradict the found anchored
603 substr. Due to the above check we do not
604 contradict "check" substr.
605 Thus we can arrive here only if check substr
606 is float. Redo checking for "other"=="fixed".
609 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
610 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
611 goto do_other_anchored;
614 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
615 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
618 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
619 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
625 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
626 PL_colors[0],PL_colors[1]));
631 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
635 /* The found string does not prohibit matching at beg-of-str
636 - no optimization of calling REx engine can be performed,
637 unless it was an MBOL and we are not after MBOL. */
639 /* Even in this situation we may use MBOL flag if strpos is offset
640 wrt the start of the string. */
642 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
646 DEBUG_r( if (ml_anch)
647 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
648 PL_colors[0],PL_colors[1]);
651 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
652 && prog->check_substr /* Could be deleted already */
653 && --BmUSEFUL(prog->check_substr) < 0
654 && prog->check_substr == prog->float_substr)
656 /* If flags & SOMETHING - do not do it many times on the same match */
657 SvREFCNT_dec(prog->check_substr);
658 prog->check_substr = Nullsv; /* disable */
659 prog->float_substr = Nullsv; /* clear */
661 /* XXXX This is a remnant of the old implementation. It
662 looks wasteful, since now INTUIT can use many
664 prog->reganch &= ~RE_USE_INTUIT;
671 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
672 if (prog->regstclass) {
673 /* minlen == 0 is possible if regstclass is \b or \B,
674 and the fixed substr is ''$.
675 Since minlen is already taken into account, s+1 is before strend;
676 accidentally, minlen >= 1 guaranties no false positives at s + 1
677 even for \b or \B. But (minlen? 1 : 0) below assumes that
678 regstclass does not come from lookahead... */
679 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
680 This leaves EXACTF only, which is dealt with in find_byclass(). */
681 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
682 ? STR_LEN(prog->regstclass)
684 char *endpos = (prog->anchored_substr || ml_anch)
685 ? s + (prog->minlen? cl_l : 0)
686 : (prog->float_substr ? check_at - start_shift + cl_l
688 char *startpos = sv ? strend - SvCUR(sv) : s;
691 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
696 if (endpos == strend) {
697 DEBUG_r( PerlIO_printf(Perl_debug_log,
698 "Could not match STCLASS...\n") );
701 DEBUG_r( PerlIO_printf(Perl_debug_log,
702 "This position contradicts STCLASS...\n") );
703 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
705 /* Contradict one of substrings */
706 if (prog->anchored_substr) {
707 if (prog->anchored_substr == check) {
708 DEBUG_r( what = "anchored" );
710 PL_regeol = strend; /* Used in HOP() */
712 if (s + start_shift + end_shift > strend) {
713 /* XXXX Should be taken into account earlier? */
714 DEBUG_r( PerlIO_printf(Perl_debug_log,
715 "Could not match STCLASS...\n") );
718 DEBUG_r( PerlIO_printf(Perl_debug_log,
719 "Trying %s substr starting at offset %ld...\n",
720 what, (long)(s + start_shift - i_strpos)) );
723 /* Have both, check_string is floating */
724 if (t + start_shift >= check_at) /* Contradicts floating=check */
725 goto retry_floating_check;
726 /* Recheck anchored substring, but not floating... */
728 DEBUG_r( PerlIO_printf(Perl_debug_log,
729 "Trying anchored substr starting at offset %ld...\n",
730 (long)(other_last - i_strpos)) );
731 goto do_other_anchored;
733 if (!prog->float_substr) { /* Could have been deleted */
740 /* Check is floating subtring. */
741 retry_floating_check:
742 t = check_at - start_shift;
743 DEBUG_r( what = "floating" );
744 goto hop_and_restart;
747 PerlIO_printf(Perl_debug_log,
748 "By STCLASS: moving %ld --> %ld\n",
749 (long)(t - i_strpos), (long)(s - i_strpos));
751 PerlIO_printf(Perl_debug_log,
752 "Does not contradict STCLASS...\n") );
754 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
755 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
758 fail_finish: /* Substring not found */
759 if (prog->check_substr) /* could be removed already */
760 BmUSEFUL(prog->check_substr) += 5; /* hooray */
762 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
763 PL_colors[4],PL_colors[5]));
767 /* We know what class REx starts with. Try to find this position... */
769 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
771 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
777 register I32 tmp = 1; /* Scratch variable? */
779 /* We know what class it must start with. */
783 if (REGINCLASSUTF8(c, (U8*)s)) {
784 if (tmp && (norun || regtry(prog, s)))
796 if (REGINCLASS(c, *s)) {
797 if (tmp && (norun || regtry(prog, s)))
817 c2 = PL_fold_locale[c1];
822 e = s; /* Due to minlen logic of intuit() */
823 /* Here it is NOT UTF! */
827 && (ln == 1 || !(OP(c) == EXACTF
829 : ibcmp_locale(s, m, ln)))
830 && (norun || regtry(prog, s)) )
836 if ( (*s == c1 || *s == c2)
837 && (ln == 1 || !(OP(c) == EXACTF
839 : ibcmp_locale(s, m, ln)))
840 && (norun || regtry(prog, s)) )
847 PL_reg_flags |= RF_tainted;
850 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
851 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
853 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
855 if ((norun || regtry(prog, s)))
860 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
864 PL_reg_flags |= RF_tainted;
867 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
868 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
870 if (tmp == !(OP(c) == BOUND ?
871 swash_fetch(PL_utf8_alnum, (U8*)s) :
872 isALNUM_LC_utf8((U8*)s)))
875 if ((norun || regtry(prog, s)))
880 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
884 PL_reg_flags |= RF_tainted;
887 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
888 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
890 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
892 else if ((norun || regtry(prog, s)))
896 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
900 PL_reg_flags |= RF_tainted;
904 strend = reghop_c(strend, -1);
905 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
906 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
908 if (tmp == !(OP(c) == NBOUND ?
909 swash_fetch(PL_utf8_alnum, (U8*)s) :
910 isALNUM_LC_utf8((U8*)s)))
912 else if ((norun || regtry(prog, s)))
916 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
922 if (tmp && (norun || regtry(prog, s)))
934 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
935 if (tmp && (norun || regtry(prog, s)))
946 PL_reg_flags |= RF_tainted;
948 if (isALNUM_LC(*s)) {
949 if (tmp && (norun || regtry(prog, s)))
960 PL_reg_flags |= RF_tainted;
962 if (isALNUM_LC_utf8((U8*)s)) {
963 if (tmp && (norun || regtry(prog, s)))
976 if (tmp && (norun || regtry(prog, s)))
988 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
989 if (tmp && (norun || regtry(prog, s)))
1000 PL_reg_flags |= RF_tainted;
1001 while (s < strend) {
1002 if (!isALNUM_LC(*s)) {
1003 if (tmp && (norun || regtry(prog, s)))
1014 PL_reg_flags |= RF_tainted;
1015 while (s < strend) {
1016 if (!isALNUM_LC_utf8((U8*)s)) {
1017 if (tmp && (norun || regtry(prog, s)))
1028 while (s < strend) {
1030 if (tmp && (norun || regtry(prog, s)))
1041 while (s < strend) {
1042 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1043 if (tmp && (norun || regtry(prog, s)))
1054 PL_reg_flags |= RF_tainted;
1055 while (s < strend) {
1056 if (isSPACE_LC(*s)) {
1057 if (tmp && (norun || regtry(prog, s)))
1068 PL_reg_flags |= RF_tainted;
1069 while (s < strend) {
1070 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1071 if (tmp && (norun || regtry(prog, s)))
1082 while (s < strend) {
1084 if (tmp && (norun || regtry(prog, s)))
1095 while (s < strend) {
1096 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1097 if (tmp && (norun || regtry(prog, s)))
1108 PL_reg_flags |= RF_tainted;
1109 while (s < strend) {
1110 if (!isSPACE_LC(*s)) {
1111 if (tmp && (norun || regtry(prog, s)))
1122 PL_reg_flags |= RF_tainted;
1123 while (s < strend) {
1124 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1125 if (tmp && (norun || regtry(prog, s)))
1136 while (s < strend) {
1138 if (tmp && (norun || regtry(prog, s)))
1149 while (s < strend) {
1150 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1151 if (tmp && (norun || regtry(prog, s)))
1162 PL_reg_flags |= RF_tainted;
1163 while (s < strend) {
1164 if (isDIGIT_LC(*s)) {
1165 if (tmp && (norun || regtry(prog, s)))
1176 PL_reg_flags |= RF_tainted;
1177 while (s < strend) {
1178 if (isDIGIT_LC_utf8((U8*)s)) {
1179 if (tmp && (norun || regtry(prog, s)))
1190 while (s < strend) {
1192 if (tmp && (norun || regtry(prog, s)))
1203 while (s < strend) {
1204 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1205 if (tmp && (norun || regtry(prog, s)))
1216 PL_reg_flags |= RF_tainted;
1217 while (s < strend) {
1218 if (!isDIGIT_LC(*s)) {
1219 if (tmp && (norun || regtry(prog, s)))
1230 PL_reg_flags |= RF_tainted;
1231 while (s < strend) {
1232 if (!isDIGIT_LC_utf8((U8*)s)) {
1233 if (tmp && (norun || regtry(prog, s)))
1244 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1253 - regexec_flags - match a regexp against a string
1256 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1257 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1258 /* strend: pointer to null at end of string */
1259 /* strbeg: real beginning of string */
1260 /* minend: end of match must be >=minend after stringarg. */
1261 /* data: May be used for some additional optimizations. */
1262 /* nosave: For optimizations. */
1266 register regnode *c;
1267 register char *startpos = stringarg;
1269 I32 minlen; /* must match at least this many chars */
1270 I32 dontbother = 0; /* how many characters not to try at end */
1271 I32 start_shift = 0; /* Offset of the start to find
1272 constant substr. */ /* CC */
1273 I32 end_shift = 0; /* Same for the end. */ /* CC */
1274 I32 scream_pos = -1; /* Internal iterator of scream. */
1276 SV* oreplsv = GvSV(PL_replgv);
1282 PL_regnarrate = PL_debug & 512;
1285 /* Be paranoid... */
1286 if (prog == NULL || startpos == NULL) {
1287 Perl_croak(aTHX_ "NULL regexp parameter");
1291 minlen = prog->minlen;
1292 if (strend - startpos < minlen) goto phooey;
1294 if (startpos == strbeg) /* is ^ valid at stringarg? */
1297 PL_regprev = (U32)stringarg[-1];
1298 if (!PL_multiline && PL_regprev == '\n')
1299 PL_regprev = '\0'; /* force ^ to NOT match */
1302 /* Check validity of program. */
1303 if (UCHARAT(prog->program) != REG_MAGIC) {
1304 Perl_croak(aTHX_ "corrupted regexp program");
1308 PL_reg_eval_set = 0;
1311 if (prog->reganch & ROPT_UTF8)
1312 PL_reg_flags |= RF_utf8;
1314 /* Mark beginning of line for ^ and lookbehind. */
1315 PL_regbol = startpos;
1319 /* Mark end of line for $ (and such) */
1322 /* see how far we have to get to not match where we matched before */
1323 PL_regtill = startpos+minend;
1325 /* We start without call_cc context. */
1328 /* If there is a "must appear" string, look for it. */
1331 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1334 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1335 PL_reg_ganch = startpos;
1336 else if (sv && SvTYPE(sv) >= SVt_PVMG
1338 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1339 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1340 if (prog->reganch & ROPT_ANCH_GPOS) {
1341 if (s > PL_reg_ganch)
1346 else /* pos() not defined */
1347 PL_reg_ganch = strbeg;
1350 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1351 re_scream_pos_data d;
1353 d.scream_olds = &scream_olds;
1354 d.scream_pos = &scream_pos;
1355 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1357 goto phooey; /* not present */
1360 DEBUG_r( if (!PL_colorset) reginitcolors() );
1361 DEBUG_r(PerlIO_printf(Perl_debug_log,
1362 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1363 PL_colors[4],PL_colors[5],PL_colors[0],
1366 (strlen(prog->precomp) > 60 ? "..." : ""),
1368 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1369 startpos, PL_colors[1],
1370 (strend - startpos > 60 ? "..." : ""))
1373 /* Simplest case: anchored match need be tried only once. */
1374 /* [unless only anchor is BOL and multiline is set] */
1375 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1376 if (s == startpos && regtry(prog, startpos))
1378 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1379 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1384 dontbother = minlen - 1;
1385 end = HOPc(strend, -dontbother) - 1;
1386 /* for multiline we only have to try after newlines */
1387 if (prog->check_substr) {
1391 if (regtry(prog, s))
1396 if (prog->reganch & RE_USE_INTUIT) {
1397 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1408 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1409 if (regtry(prog, s))
1416 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1417 if (regtry(prog, PL_reg_ganch))
1422 /* Messy cases: unanchored match. */
1423 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1424 /* we have /x+whatever/ */
1425 /* it must be a one character string (XXXX Except UTF?) */
1426 char ch = SvPVX(prog->anchored_substr)[0];
1428 while (s < strend) {
1430 if (regtry(prog, s)) goto got_it;
1432 while (s < strend && *s == ch)
1439 while (s < strend) {
1441 if (regtry(prog, s)) goto got_it;
1443 while (s < strend && *s == ch)
1451 else if (prog->anchored_substr != Nullsv
1452 || (prog->float_substr != Nullsv
1453 && prog->float_max_offset < strend - s)) {
1454 SV *must = prog->anchored_substr
1455 ? prog->anchored_substr : prog->float_substr;
1457 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1459 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1460 I32 delta = back_max - back_min;
1461 char *last = HOPc(strend, /* Cannot start after this */
1462 -(I32)(CHR_SVLEN(must)
1463 - (SvTAIL(must) != 0) + back_min));
1464 char *last1; /* Last position checked before */
1467 last1 = HOPc(s, -1);
1469 last1 = s - 1; /* bogus */
1471 /* XXXX check_substr already used to find `s', can optimize if
1472 check_substr==must. */
1474 dontbother = end_shift;
1475 strend = HOPc(strend, -dontbother);
1476 while ( (s <= last) &&
1477 ((flags & REXEC_SCREAM)
1478 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1479 end_shift, &scream_pos, 0))
1480 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1481 (unsigned char*)strend, must,
1482 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1483 if (HOPc(s, -back_max) > last1) {
1484 last1 = HOPc(s, -back_min);
1485 s = HOPc(s, -back_max);
1488 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1490 last1 = HOPc(s, -back_min);
1494 while (s <= last1) {
1495 if (regtry(prog, s))
1501 while (s <= last1) {
1502 if (regtry(prog, s))
1510 else if (c = prog->regstclass) {
1511 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1512 /* don't bother with what can't match */
1513 strend = HOPc(strend, -(minlen - 1));
1514 if (find_byclass(prog, c, s, strend, startpos, 0))
1519 if (prog->float_substr != Nullsv) { /* Trim the end. */
1521 I32 oldpos = scream_pos;
1523 if (flags & REXEC_SCREAM) {
1524 last = screaminstr(sv, prog->float_substr, s - strbeg,
1525 end_shift, &scream_pos, 1); /* last one */
1527 last = scream_olds; /* Only one occurence. */
1531 char *little = SvPV(prog->float_substr, len);
1533 if (SvTAIL(prog->float_substr)) {
1534 if (memEQ(strend - len + 1, little, len - 1))
1535 last = strend - len + 1;
1536 else if (!PL_multiline)
1537 last = memEQ(strend - len, little, len)
1538 ? strend - len : Nullch;
1544 last = rninstr(s, strend, little, little + len);
1546 last = strend; /* matching `$' */
1549 if (last == NULL) goto phooey; /* Should not happen! */
1550 dontbother = strend - last + prog->float_min_offset;
1552 if (minlen && (dontbother < minlen))
1553 dontbother = minlen - 1;
1554 strend -= dontbother; /* this one's always in bytes! */
1555 /* We don't know much -- general case. */
1558 if (regtry(prog, s))
1567 if (regtry(prog, s))
1569 } while (s++ < strend);
1577 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1579 if (PL_reg_eval_set) {
1580 /* Preserve the current value of $^R */
1581 if (oreplsv != GvSV(PL_replgv))
1582 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1583 restored, the value remains
1585 restore_pos(aTHXo_ 0);
1588 /* make sure $`, $&, $', and $digit will work later */
1589 if ( !(flags & REXEC_NOT_FIRST) ) {
1590 if (RX_MATCH_COPIED(prog)) {
1591 Safefree(prog->subbeg);
1592 RX_MATCH_COPIED_off(prog);
1594 if (flags & REXEC_COPY_STR) {
1595 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1597 s = savepvn(strbeg, i);
1600 RX_MATCH_COPIED_on(prog);
1603 prog->subbeg = strbeg;
1604 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1611 if (PL_reg_eval_set)
1612 restore_pos(aTHXo_ 0);
1617 - regtry - try match at specific point
1619 STATIC I32 /* 0 failure, 1 success */
1620 S_regtry(pTHX_ regexp *prog, char *startpos)
1628 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1631 PL_reg_eval_set = RS_init;
1633 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1634 (IV)(PL_stack_sp - PL_stack_base));
1636 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1637 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1638 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1640 /* Apparently this is not needed, judging by wantarray. */
1641 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1642 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1645 /* Make $_ available to executed code. */
1646 if (PL_reg_sv != DEFSV) {
1647 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1652 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1653 && (mg = mg_find(PL_reg_sv, 'g')))) {
1654 /* prepare for quick setting of pos */
1655 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1656 mg = mg_find(PL_reg_sv, 'g');
1660 PL_reg_oldpos = mg->mg_len;
1661 SAVEDESTRUCTOR_X(restore_pos, 0);
1664 New(22,PL_reg_curpm, 1, PMOP);
1665 PL_reg_curpm->op_pmregexp = prog;
1666 PL_reg_oldcurpm = PL_curpm;
1667 PL_curpm = PL_reg_curpm;
1668 if (RX_MATCH_COPIED(prog)) {
1669 /* Here is a serious problem: we cannot rewrite subbeg,
1670 since it may be needed if this match fails. Thus
1671 $` inside (?{}) could fail... */
1672 PL_reg_oldsaved = prog->subbeg;
1673 PL_reg_oldsavedlen = prog->sublen;
1674 RX_MATCH_COPIED_off(prog);
1677 PL_reg_oldsaved = Nullch;
1678 prog->subbeg = PL_bostr;
1679 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1681 prog->startp[0] = startpos - PL_bostr;
1682 PL_reginput = startpos;
1683 PL_regstartp = prog->startp;
1684 PL_regendp = prog->endp;
1685 PL_reglastparen = &prog->lastparen;
1686 prog->lastparen = 0;
1688 DEBUG_r(PL_reg_starttry = startpos);
1689 if (PL_reg_start_tmpl <= prog->nparens) {
1690 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1691 if(PL_reg_start_tmp)
1692 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1694 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1697 /* XXXX What this code is doing here?!!! There should be no need
1698 to do this again and again, PL_reglastparen should take care of
1702 if (prog->nparens) {
1703 for (i = prog->nparens; i >= 1; i--) {
1709 if (regmatch(prog->program + 1)) {
1710 prog->endp[0] = PL_reginput - PL_bostr;
1718 - regmatch - main matching routine
1720 * Conceptually the strategy is simple: check to see whether the current
1721 * node matches, call self recursively to see whether the rest matches,
1722 * and then act accordingly. In practice we make some effort to avoid
1723 * recursion, in particular by going through "ordinary" nodes (that don't
1724 * need to know whether the rest of the match failed) by a loop instead of
1727 /* [lwall] I've hoisted the register declarations to the outer block in order to
1728 * maybe save a little bit of pushing and popping on the stack. It also takes
1729 * advantage of machines that use a register save mask on subroutine entry.
1731 STATIC I32 /* 0 failure, 1 success */
1732 S_regmatch(pTHX_ regnode *prog)
1735 register regnode *scan; /* Current node. */
1736 regnode *next; /* Next node. */
1737 regnode *inner; /* Next node in internal branch. */
1738 register I32 nextchr; /* renamed nextchr - nextchar colides with
1739 function of same name */
1740 register I32 n; /* no or next */
1741 register I32 ln; /* len or last */
1742 register char *s; /* operand or save */
1743 register char *locinput = PL_reginput;
1744 register I32 c1, c2, paren; /* case fold search, parenth */
1745 int minmod = 0, sw = 0, logical = 0;
1750 /* Note that nextchr is a byte even in UTF */
1751 nextchr = UCHARAT(locinput);
1753 while (scan != NULL) {
1754 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1756 # define sayYES goto yes
1757 # define sayNO goto no
1758 # define sayYES_FINAL goto yes_final
1759 # define sayYES_LOUD goto yes_loud
1760 # define sayNO_FINAL goto no_final
1761 # define sayNO_SILENT goto do_no
1762 # define saySAME(x) if (x) goto yes; else goto no
1763 # define REPORT_CODE_OFF 24
1765 # define sayYES return 1
1766 # define sayNO return 0
1767 # define sayYES_FINAL return 1
1768 # define sayYES_LOUD return 1
1769 # define sayNO_FINAL return 0
1770 # define sayNO_SILENT return 0
1771 # define saySAME(x) return x
1774 SV *prop = sv_newmortal();
1775 int docolor = *PL_colors[0];
1776 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1777 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1778 /* The part of the string before starttry has one color
1779 (pref0_len chars), between starttry and current
1780 position another one (pref_len - pref0_len chars),
1781 after the current position the third one.
1782 We assume that pref0_len <= pref_len, otherwise we
1783 decrease pref0_len. */
1784 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1785 ? (5 + taill) - l : locinput - PL_bostr);
1786 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1788 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1789 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1790 ? (5 + taill) - pref_len : PL_regeol - locinput);
1793 if (pref0_len > pref_len)
1794 pref0_len = pref_len;
1795 regprop(prop, scan);
1796 PerlIO_printf(Perl_debug_log,
1797 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1798 (IV)(locinput - PL_bostr),
1799 PL_colors[4], pref0_len,
1800 locinput - pref_len, PL_colors[5],
1801 PL_colors[2], pref_len - pref0_len,
1802 locinput - pref_len + pref0_len, PL_colors[3],
1803 (docolor ? "" : "> <"),
1804 PL_colors[0], l, locinput, PL_colors[1],
1805 15 - l - pref_len + 1,
1807 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1811 next = scan + NEXT_OFF(scan);
1817 if (locinput == PL_bostr
1818 ? PL_regprev == '\n'
1820 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1822 /* regtill = regbol; */
1827 if (locinput == PL_bostr
1828 ? PL_regprev == '\n'
1829 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1835 if (locinput == PL_regbol && PL_regprev == '\n')
1839 if (locinput == PL_reg_ganch)
1849 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1854 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1856 if (PL_regeol - locinput > 1)
1860 if (PL_regeol != locinput)
1864 if (nextchr & 0x80) {
1865 locinput += PL_utf8skip[nextchr];
1866 if (locinput > PL_regeol)
1868 nextchr = UCHARAT(locinput);
1871 if (!nextchr && locinput >= PL_regeol)
1873 nextchr = UCHARAT(++locinput);
1876 if (!nextchr && locinput >= PL_regeol)
1878 nextchr = UCHARAT(++locinput);
1881 if (nextchr & 0x80) {
1882 locinput += PL_utf8skip[nextchr];
1883 if (locinput > PL_regeol)
1885 nextchr = UCHARAT(locinput);
1888 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1890 nextchr = UCHARAT(++locinput);
1893 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1895 nextchr = UCHARAT(++locinput);
1900 /* Inline the first character, for speed. */
1901 if (UCHARAT(s) != nextchr)
1903 if (PL_regeol - locinput < ln)
1905 if (ln > 1 && memNE(s, locinput, ln))
1908 nextchr = UCHARAT(locinput);
1911 PL_reg_flags |= RF_tainted;
1920 c1 = OP(scan) == EXACTF;
1924 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1925 toLOWER_utf8((U8*)l) :
1926 toLOWER_LC_utf8((U8*)l)))
1934 nextchr = UCHARAT(locinput);
1938 /* Inline the first character, for speed. */
1939 if (UCHARAT(s) != nextchr &&
1940 UCHARAT(s) != ((OP(scan) == EXACTF)
1941 ? PL_fold : PL_fold_locale)[nextchr])
1943 if (PL_regeol - locinput < ln)
1945 if (ln > 1 && (OP(scan) == EXACTF
1946 ? ibcmp(s, locinput, ln)
1947 : ibcmp_locale(s, locinput, ln)))
1950 nextchr = UCHARAT(locinput);
1953 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1955 if (locinput >= PL_regeol)
1957 locinput += PL_utf8skip[nextchr];
1958 nextchr = UCHARAT(locinput);
1962 nextchr = UCHARAT(locinput);
1963 if (!REGINCLASS(scan, nextchr))
1965 if (!nextchr && locinput >= PL_regeol)
1967 nextchr = UCHARAT(++locinput);
1970 PL_reg_flags |= RF_tainted;
1975 if (!(OP(scan) == ALNUM
1976 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1978 nextchr = UCHARAT(++locinput);
1981 PL_reg_flags |= RF_tainted;
1986 if (nextchr & 0x80) {
1987 if (!(OP(scan) == ALNUMUTF8
1988 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1989 : isALNUM_LC_utf8((U8*)locinput)))
1993 locinput += PL_utf8skip[nextchr];
1994 nextchr = UCHARAT(locinput);
1997 if (!(OP(scan) == ALNUMUTF8
1998 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2000 nextchr = UCHARAT(++locinput);
2003 PL_reg_flags |= RF_tainted;
2006 if (!nextchr && locinput >= PL_regeol)
2008 if (OP(scan) == NALNUM
2009 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2011 nextchr = UCHARAT(++locinput);
2014 PL_reg_flags |= RF_tainted;
2017 if (!nextchr && locinput >= PL_regeol)
2019 if (nextchr & 0x80) {
2020 if (OP(scan) == NALNUMUTF8
2021 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2022 : isALNUM_LC_utf8((U8*)locinput))
2026 locinput += PL_utf8skip[nextchr];
2027 nextchr = UCHARAT(locinput);
2030 if (OP(scan) == NALNUMUTF8
2031 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2033 nextchr = UCHARAT(++locinput);
2037 PL_reg_flags |= RF_tainted;
2041 /* was last char in word? */
2042 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2043 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2045 n = isALNUM(nextchr);
2048 ln = isALNUM_LC(ln);
2049 n = isALNUM_LC(nextchr);
2051 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2056 PL_reg_flags |= RF_tainted;
2060 /* was last char in word? */
2061 ln = (locinput != PL_regbol)
2062 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2063 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2064 ln = isALNUM_uni(ln);
2065 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2068 ln = isALNUM_LC_uni(ln);
2069 n = isALNUM_LC_utf8((U8*)locinput);
2071 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2075 PL_reg_flags |= RF_tainted;
2078 if (!nextchr && locinput >= PL_regeol)
2080 if (!(OP(scan) == SPACE
2081 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2083 nextchr = UCHARAT(++locinput);
2086 PL_reg_flags |= RF_tainted;
2089 if (!nextchr && locinput >= PL_regeol)
2091 if (nextchr & 0x80) {
2092 if (!(OP(scan) == SPACEUTF8
2093 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2094 : isSPACE_LC_utf8((U8*)locinput)))
2098 locinput += PL_utf8skip[nextchr];
2099 nextchr = UCHARAT(locinput);
2102 if (!(OP(scan) == SPACEUTF8
2103 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2105 nextchr = UCHARAT(++locinput);
2108 PL_reg_flags |= RF_tainted;
2113 if (OP(scan) == SPACE
2114 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2116 nextchr = UCHARAT(++locinput);
2119 PL_reg_flags |= RF_tainted;
2124 if (nextchr & 0x80) {
2125 if (OP(scan) == NSPACEUTF8
2126 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2127 : isSPACE_LC_utf8((U8*)locinput))
2131 locinput += PL_utf8skip[nextchr];
2132 nextchr = UCHARAT(locinput);
2135 if (OP(scan) == NSPACEUTF8
2136 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2138 nextchr = UCHARAT(++locinput);
2141 PL_reg_flags |= RF_tainted;
2144 if (!nextchr && locinput >= PL_regeol)
2146 if (!(OP(scan) == DIGIT
2147 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2149 nextchr = UCHARAT(++locinput);
2152 PL_reg_flags |= RF_tainted;
2157 if (nextchr & 0x80) {
2158 if (OP(scan) == NDIGITUTF8
2159 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2160 : isDIGIT_LC_utf8((U8*)locinput))
2164 locinput += PL_utf8skip[nextchr];
2165 nextchr = UCHARAT(locinput);
2168 if (!isDIGIT(nextchr))
2170 nextchr = UCHARAT(++locinput);
2173 PL_reg_flags |= RF_tainted;
2178 if (OP(scan) == DIGIT
2179 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2181 nextchr = UCHARAT(++locinput);
2184 PL_reg_flags |= RF_tainted;
2187 if (!nextchr && locinput >= PL_regeol)
2189 if (nextchr & 0x80) {
2190 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2192 locinput += PL_utf8skip[nextchr];
2193 nextchr = UCHARAT(locinput);
2196 if (isDIGIT(nextchr))
2198 nextchr = UCHARAT(++locinput);
2201 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2203 locinput += PL_utf8skip[nextchr];
2204 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2205 locinput += UTF8SKIP(locinput);
2206 if (locinput > PL_regeol)
2208 nextchr = UCHARAT(locinput);
2211 PL_reg_flags |= RF_tainted;
2215 n = ARG(scan); /* which paren pair */
2216 ln = PL_regstartp[n];
2217 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2218 if (*PL_reglastparen < n || ln == -1)
2219 sayNO; /* Do not match unless seen CLOSEn. */
2220 if (ln == PL_regendp[n])
2224 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2226 char *e = PL_bostr + PL_regendp[n];
2228 * Note that we can't do the "other character" lookup trick as
2229 * in the 8-bit case (no pun intended) because in Unicode we
2230 * have to map both upper and title case to lower case.
2232 if (OP(scan) == REFF) {
2236 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2246 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2253 nextchr = UCHARAT(locinput);
2257 /* Inline the first character, for speed. */
2258 if (UCHARAT(s) != nextchr &&
2260 (UCHARAT(s) != ((OP(scan) == REFF
2261 ? PL_fold : PL_fold_locale)[nextchr]))))
2263 ln = PL_regendp[n] - ln;
2264 if (locinput + ln > PL_regeol)
2266 if (ln > 1 && (OP(scan) == REF
2267 ? memNE(s, locinput, ln)
2269 ? ibcmp(s, locinput, ln)
2270 : ibcmp_locale(s, locinput, ln))))
2273 nextchr = UCHARAT(locinput);
2284 OP_4tree *oop = PL_op;
2285 COP *ocurcop = PL_curcop;
2286 SV **ocurpad = PL_curpad;
2290 PL_op = (OP_4tree*)PL_regdata->data[n];
2291 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2292 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2293 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2295 CALLRUNOPS(aTHX); /* Scalar context. */
2301 PL_curpad = ocurpad;
2302 PL_curcop = ocurcop;
2304 if (logical == 2) { /* Postponed subexpression. */
2306 MAGIC *mg = Null(MAGIC*);
2308 CHECKPOINT cp, lastcp;
2310 if(SvROK(ret) || SvRMAGICAL(ret)) {
2311 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2314 mg = mg_find(sv, 'r');
2317 re = (regexp *)mg->mg_obj;
2318 (void)ReREFCNT_inc(re);
2322 char *t = SvPV(ret, len);
2324 char *oprecomp = PL_regprecomp;
2325 I32 osize = PL_regsize;
2326 I32 onpar = PL_regnpar;
2329 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2331 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2332 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2333 PL_regprecomp = oprecomp;
2338 PerlIO_printf(Perl_debug_log,
2339 "Entering embedded `%s%.60s%s%s'\n",
2343 (strlen(re->precomp) > 60 ? "..." : ""))
2346 state.prev = PL_reg_call_cc;
2347 state.cc = PL_regcc;
2348 state.re = PL_reg_re;
2352 cp = regcppush(0); /* Save *all* the positions. */
2355 state.ss = PL_savestack_ix;
2356 *PL_reglastparen = 0;
2357 PL_reg_call_cc = &state;
2358 PL_reginput = locinput;
2360 /* XXXX This is too dramatic a measure... */
2363 if (regmatch(re->program + 1)) {
2364 /* Even though we succeeded, we need to restore
2365 global variables, since we may be wrapped inside
2366 SUSPEND, thus the match may be not finished yet. */
2368 /* XXXX Do this only if SUSPENDed? */
2369 PL_reg_call_cc = state.prev;
2370 PL_regcc = state.cc;
2371 PL_reg_re = state.re;
2372 cache_re(PL_reg_re);
2374 /* XXXX This is too dramatic a measure... */
2377 /* These are needed even if not SUSPEND. */
2385 PL_reg_call_cc = state.prev;
2386 PL_regcc = state.cc;
2387 PL_reg_re = state.re;
2388 cache_re(PL_reg_re);
2390 /* XXXX This is too dramatic a measure... */
2399 sv_setsv(save_scalar(PL_replgv), ret);
2403 n = ARG(scan); /* which paren pair */
2404 PL_reg_start_tmp[n] = locinput;
2409 n = ARG(scan); /* which paren pair */
2410 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2411 PL_regendp[n] = locinput - PL_bostr;
2412 if (n > *PL_reglastparen)
2413 *PL_reglastparen = n;
2416 n = ARG(scan); /* which paren pair */
2417 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2420 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2422 next = NEXTOPER(NEXTOPER(scan));
2424 next = scan + ARG(scan);
2425 if (OP(next) == IFTHEN) /* Fake one. */
2426 next = NEXTOPER(NEXTOPER(next));
2430 logical = scan->flags;
2432 /*******************************************************************
2433 PL_regcc contains infoblock about the innermost (...)* loop, and
2434 a pointer to the next outer infoblock.
2436 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2438 1) After matching X, regnode for CURLYX is processed;
2440 2) This regnode creates infoblock on the stack, and calls
2441 regmatch() recursively with the starting point at WHILEM node;
2443 3) Each hit of WHILEM node tries to match A and Z (in the order
2444 depending on the current iteration, min/max of {min,max} and
2445 greediness). The information about where are nodes for "A"
2446 and "Z" is read from the infoblock, as is info on how many times "A"
2447 was already matched, and greediness.
2449 4) After A matches, the same WHILEM node is hit again.
2451 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2452 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2453 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2454 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2455 of the external loop.
2457 Currently present infoblocks form a tree with a stem formed by PL_curcc
2458 and whatever it mentions via ->next, and additional attached trees
2459 corresponding to temporarily unset infoblocks as in "5" above.
2461 In the following picture infoblocks for outer loop of
2462 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2463 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2464 infoblocks are drawn below the "reset" infoblock.
2466 In fact in the picture below we do not show failed matches for Z and T
2467 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2468 more obvious *why* one needs to *temporary* unset infoblocks.]
2470 Matched REx position InfoBlocks Comment
2474 Y A)*?Z)*?T x <- O <- I
2475 YA )*?Z)*?T x <- O <- I
2476 YA A)*?Z)*?T x <- O <- I
2477 YAA )*?Z)*?T x <- O <- I
2478 YAA Z)*?T x <- O # Temporary unset I
2481 YAAZ Y(A)*?Z)*?T x <- O
2484 YAAZY (A)*?Z)*?T x <- O
2487 YAAZY A)*?Z)*?T x <- O <- I
2490 YAAZYA )*?Z)*?T x <- O <- I
2493 YAAZYA Z)*?T x <- O # Temporary unset I
2499 YAAZYAZ T x # Temporary unset O
2506 *******************************************************************/
2509 CHECKPOINT cp = PL_savestack_ix;
2511 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2513 cc.oldcc = PL_regcc;
2515 cc.parenfloor = *PL_reglastparen;
2517 cc.min = ARG1(scan);
2518 cc.max = ARG2(scan);
2519 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2523 PL_reginput = locinput;
2524 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2526 PL_regcc = cc.oldcc;
2532 * This is really hard to understand, because after we match
2533 * what we're trying to match, we must make sure the rest of
2534 * the REx is going to match for sure, and to do that we have
2535 * to go back UP the parse tree by recursing ever deeper. And
2536 * if it fails, we have to reset our parent's current state
2537 * that we can try again after backing off.
2540 CHECKPOINT cp, lastcp;
2541 CURCUR* cc = PL_regcc;
2542 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2544 n = cc->cur + 1; /* how many we know we matched */
2545 PL_reginput = locinput;
2548 PerlIO_printf(Perl_debug_log,
2549 "%*s %ld out of %ld..%ld cc=%lx\n",
2550 REPORT_CODE_OFF+PL_regindent*2, "",
2551 (long)n, (long)cc->min,
2552 (long)cc->max, (long)cc)
2555 /* If degenerate scan matches "", assume scan done. */
2557 if (locinput == cc->lastloc && n >= cc->min) {
2558 PL_regcc = cc->oldcc;
2562 PerlIO_printf(Perl_debug_log,
2563 "%*s empty match detected, try continuation...\n",
2564 REPORT_CODE_OFF+PL_regindent*2, "")
2566 if (regmatch(cc->next))
2574 /* First just match a string of min scans. */
2578 cc->lastloc = locinput;
2579 if (regmatch(cc->scan))
2582 cc->lastloc = lastloc;
2587 /* Check whether we already were at this position.
2588 Postpone detection until we know the match is not
2589 *that* much linear. */
2590 if (!PL_reg_maxiter) {
2591 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2592 PL_reg_leftiter = PL_reg_maxiter;
2594 if (PL_reg_leftiter-- == 0) {
2595 I32 size = (PL_reg_maxiter + 7)/8;
2596 if (PL_reg_poscache) {
2597 if (PL_reg_poscache_size < size) {
2598 Renew(PL_reg_poscache, size, char);
2599 PL_reg_poscache_size = size;
2601 Zero(PL_reg_poscache, size, char);
2604 PL_reg_poscache_size = size;
2605 Newz(29, PL_reg_poscache, size, char);
2608 PerlIO_printf(Perl_debug_log,
2609 "%sDetected a super-linear match, switching on caching%s...\n",
2610 PL_colors[4], PL_colors[5])
2613 if (PL_reg_leftiter < 0) {
2614 I32 o = locinput - PL_bostr, b;
2616 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2619 if (PL_reg_poscache[o] & (1<<b)) {
2621 PerlIO_printf(Perl_debug_log,
2622 "%*s already tried at this position...\n",
2623 REPORT_CODE_OFF+PL_regindent*2, "")
2627 PL_reg_poscache[o] |= (1<<b);
2631 /* Prefer next over scan for minimal matching. */
2634 PL_regcc = cc->oldcc;
2637 cp = regcppush(cc->parenfloor);
2639 if (regmatch(cc->next)) {
2641 sayYES; /* All done. */
2649 if (n >= cc->max) { /* Maximum greed exceeded? */
2650 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2651 && !(PL_reg_flags & RF_warned)) {
2652 PL_reg_flags |= RF_warned;
2653 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2654 "Complex regular subexpression recursion",
2661 PerlIO_printf(Perl_debug_log,
2662 "%*s trying longer...\n",
2663 REPORT_CODE_OFF+PL_regindent*2, "")
2665 /* Try scanning more and see if it helps. */
2666 PL_reginput = locinput;
2668 cc->lastloc = locinput;
2669 cp = regcppush(cc->parenfloor);
2671 if (regmatch(cc->scan)) {
2678 cc->lastloc = lastloc;
2682 /* Prefer scan over next for maximal matching. */
2684 if (n < cc->max) { /* More greed allowed? */
2685 cp = regcppush(cc->parenfloor);
2687 cc->lastloc = locinput;
2689 if (regmatch(cc->scan)) {
2694 regcppop(); /* Restore some previous $<digit>s? */
2695 PL_reginput = locinput;
2697 PerlIO_printf(Perl_debug_log,
2698 "%*s failed, try continuation...\n",
2699 REPORT_CODE_OFF+PL_regindent*2, "")
2702 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2703 && !(PL_reg_flags & RF_warned)) {
2704 PL_reg_flags |= RF_warned;
2705 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2706 "Complex regular subexpression recursion",
2710 /* Failed deeper matches of scan, so see if this one works. */
2711 PL_regcc = cc->oldcc;
2714 if (regmatch(cc->next))
2720 cc->lastloc = lastloc;
2725 next = scan + ARG(scan);
2728 inner = NEXTOPER(NEXTOPER(scan));
2731 inner = NEXTOPER(scan);
2736 if (OP(next) != c1) /* No choice. */
2737 next = inner; /* Avoid recursion. */
2739 int lastparen = *PL_reglastparen;
2743 PL_reginput = locinput;
2744 if (regmatch(inner))
2747 for (n = *PL_reglastparen; n > lastparen; n--)
2749 *PL_reglastparen = n;
2752 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2756 inner = NEXTOPER(scan);
2757 if (c1 == BRANCHJ) {
2758 inner = NEXTOPER(inner);
2760 } while (scan != NULL && OP(scan) == c1);
2774 /* We suppose that the next guy does not need
2775 backtracking: in particular, it is of constant length,
2776 and has no parenths to influence future backrefs. */
2777 ln = ARG1(scan); /* min to match */
2778 n = ARG2(scan); /* max to match */
2779 paren = scan->flags;
2781 if (paren > PL_regsize)
2783 if (paren > *PL_reglastparen)
2784 *PL_reglastparen = paren;
2786 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2788 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2789 PL_reginput = locinput;
2792 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2794 if (ln && l == 0 && n >= ln
2795 /* In fact, this is tricky. If paren, then the
2796 fact that we did/didnot match may influence
2797 future execution. */
2798 && !(paren && ln == 0))
2800 locinput = PL_reginput;
2801 if (PL_regkind[(U8)OP(next)] == EXACT) {
2802 c1 = (U8)*STRING(next);
2803 if (OP(next) == EXACTF)
2805 else if (OP(next) == EXACTFL)
2806 c2 = PL_fold_locale[c1];
2813 /* This may be improved if l == 0. */
2814 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2815 /* If it could work, try it. */
2817 UCHARAT(PL_reginput) == c1 ||
2818 UCHARAT(PL_reginput) == c2)
2822 PL_regstartp[paren] =
2823 HOPc(PL_reginput, -l) - PL_bostr;
2824 PL_regendp[paren] = PL_reginput - PL_bostr;
2827 PL_regendp[paren] = -1;
2833 /* Couldn't or didn't -- move forward. */
2834 PL_reginput = locinput;
2835 if (regrepeat_hard(scan, 1, &l)) {
2837 locinput = PL_reginput;
2844 n = regrepeat_hard(scan, n, &l);
2845 if (n != 0 && l == 0
2846 /* In fact, this is tricky. If paren, then the
2847 fact that we did/didnot match may influence
2848 future execution. */
2849 && !(paren && ln == 0))
2851 locinput = PL_reginput;
2853 PerlIO_printf(Perl_debug_log,
2854 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2855 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2859 if (PL_regkind[(U8)OP(next)] == EXACT) {
2860 c1 = (U8)*STRING(next);
2861 if (OP(next) == EXACTF)
2863 else if (OP(next) == EXACTFL)
2864 c2 = PL_fold_locale[c1];
2873 /* If it could work, try it. */
2875 UCHARAT(PL_reginput) == c1 ||
2876 UCHARAT(PL_reginput) == c2)
2879 PerlIO_printf(Perl_debug_log,
2880 "%*s trying tail with n=%"IVdf"...\n",
2881 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2885 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2886 PL_regendp[paren] = PL_reginput - PL_bostr;
2889 PL_regendp[paren] = -1;
2895 /* Couldn't or didn't -- back up. */
2897 locinput = HOPc(locinput, -l);
2898 PL_reginput = locinput;
2905 paren = scan->flags; /* Which paren to set */
2906 if (paren > PL_regsize)
2908 if (paren > *PL_reglastparen)
2909 *PL_reglastparen = paren;
2910 ln = ARG1(scan); /* min to match */
2911 n = ARG2(scan); /* max to match */
2912 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2916 ln = ARG1(scan); /* min to match */
2917 n = ARG2(scan); /* max to match */
2918 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2923 scan = NEXTOPER(scan);
2929 scan = NEXTOPER(scan);
2933 * Lookahead to avoid useless match attempts
2934 * when we know what character comes next.
2936 if (PL_regkind[(U8)OP(next)] == EXACT) {
2937 c1 = (U8)*STRING(next);
2938 if (OP(next) == EXACTF)
2940 else if (OP(next) == EXACTFL)
2941 c2 = PL_fold_locale[c1];
2947 PL_reginput = locinput;
2951 if (ln && regrepeat(scan, ln) < ln)
2953 locinput = PL_reginput;
2956 char *e = locinput + n - ln; /* Should not check after this */
2957 char *old = locinput;
2959 if (e >= PL_regeol || (n == REG_INFTY))
2962 /* Find place 'next' could work */
2964 while (locinput <= e && *locinput != c1)
2967 while (locinput <= e
2974 /* PL_reginput == old now */
2975 if (locinput != old) {
2976 ln = 1; /* Did some */
2977 if (regrepeat(scan, locinput - old) <
2981 /* PL_reginput == locinput now */
2984 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2985 PL_regendp[paren] = locinput - PL_bostr;
2988 PL_regendp[paren] = -1;
2992 PL_reginput = locinput; /* Could be reset... */
2994 /* Couldn't or didn't -- move forward. */
2999 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3000 /* If it could work, try it. */
3002 UCHARAT(PL_reginput) == c1 ||
3003 UCHARAT(PL_reginput) == c2)
3007 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3008 PL_regendp[paren] = PL_reginput - PL_bostr;
3011 PL_regendp[paren] = -1;
3017 /* Couldn't or didn't -- move forward. */
3018 PL_reginput = locinput;
3019 if (regrepeat(scan, 1)) {
3021 locinput = PL_reginput;
3029 n = regrepeat(scan, n);
3030 locinput = PL_reginput;
3031 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3032 (!PL_multiline || OP(next) == SEOL))
3033 ln = n; /* why back off? */
3037 /* If it could work, try it. */
3039 UCHARAT(PL_reginput) == c1 ||
3040 UCHARAT(PL_reginput) == c2)
3044 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3045 PL_regendp[paren] = PL_reginput - PL_bostr;
3048 PL_regendp[paren] = -1;
3054 /* Couldn't or didn't -- back up. */
3056 PL_reginput = locinput = HOPc(locinput, -1);
3061 /* If it could work, try it. */
3063 UCHARAT(PL_reginput) == c1 ||
3064 UCHARAT(PL_reginput) == c2)
3070 /* Couldn't or didn't -- back up. */
3072 PL_reginput = locinput = HOPc(locinput, -1);
3079 if (PL_reg_call_cc) {
3080 re_cc_state *cur_call_cc = PL_reg_call_cc;
3081 CURCUR *cctmp = PL_regcc;
3082 regexp *re = PL_reg_re;
3083 CHECKPOINT cp, lastcp;
3085 cp = regcppush(0); /* Save *all* the positions. */
3087 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3089 PL_reginput = locinput; /* Make position available to
3091 cache_re(PL_reg_call_cc->re);
3092 PL_regcc = PL_reg_call_cc->cc;
3093 PL_reg_call_cc = PL_reg_call_cc->prev;
3094 if (regmatch(cur_call_cc->node)) {
3095 PL_reg_call_cc = cur_call_cc;
3101 PL_reg_call_cc = cur_call_cc;
3107 PerlIO_printf(Perl_debug_log,
3108 "%*s continuation failed...\n",
3109 REPORT_CODE_OFF+PL_regindent*2, "")
3113 if (locinput < PL_regtill) {
3114 DEBUG_r(PerlIO_printf(Perl_debug_log,
3115 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3117 (long)(locinput - PL_reg_starttry),
3118 (long)(PL_regtill - PL_reg_starttry),
3120 sayNO_FINAL; /* Cannot match: too short. */
3122 PL_reginput = locinput; /* put where regtry can find it */
3123 sayYES_FINAL; /* Success! */
3125 PL_reginput = locinput; /* put where regtry can find it */
3126 sayYES_LOUD; /* Success! */
3129 PL_reginput = locinput;
3134 if (UTF) { /* XXXX This is absolutely
3135 broken, we read before
3137 s = HOPMAYBEc(locinput, -scan->flags);
3143 if (locinput < PL_bostr + scan->flags)
3145 PL_reginput = locinput - scan->flags;
3150 PL_reginput = locinput;
3155 if (UTF) { /* XXXX This is absolutely
3156 broken, we read before
3158 s = HOPMAYBEc(locinput, -scan->flags);
3159 if (!s || s < PL_bostr)
3164 if (locinput < PL_bostr + scan->flags)
3166 PL_reginput = locinput - scan->flags;
3171 PL_reginput = locinput;
3174 inner = NEXTOPER(NEXTOPER(scan));
3175 if (regmatch(inner) != n) {
3190 if (OP(scan) == SUSPEND) {
3191 locinput = PL_reginput;
3192 nextchr = UCHARAT(locinput);
3197 next = scan + ARG(scan);
3202 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3203 PTR2UV(scan), OP(scan));
3204 Perl_croak(aTHX_ "regexp memory corruption");
3210 * We get here only if there's trouble -- normally "case END" is
3211 * the terminating point.
3213 Perl_croak(aTHX_ "corrupted regexp pointers");
3219 PerlIO_printf(Perl_debug_log,
3220 "%*s %scould match...%s\n",
3221 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3225 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3226 PL_colors[4],PL_colors[5]));
3235 PerlIO_printf(Perl_debug_log,
3236 "%*s %sfailed...%s\n",
3237 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3249 - regrepeat - repeatedly match something simple, report how many
3252 * [This routine now assumes that it will only match on things of length 1.
3253 * That was true before, but now we assume scan - reginput is the count,
3254 * rather than incrementing count on every character. [Er, except utf8.]]
3257 S_regrepeat(pTHX_ regnode *p, I32 max)
3260 register char *scan;
3262 register char *loceol = PL_regeol;
3263 register I32 hardcount = 0;
3266 if (max != REG_INFTY && max < loceol - scan)
3267 loceol = scan + max;
3270 while (scan < loceol && *scan != '\n')
3278 while (scan < loceol && *scan != '\n') {
3279 scan += UTF8SKIP(scan);
3285 while (scan < loceol) {
3286 scan += UTF8SKIP(scan);
3290 case EXACT: /* length of string is 1 */
3292 while (scan < loceol && UCHARAT(scan) == c)
3295 case EXACTF: /* length of string is 1 */
3297 while (scan < loceol &&
3298 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3301 case EXACTFL: /* length of string is 1 */
3302 PL_reg_flags |= RF_tainted;
3304 while (scan < loceol &&
3305 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3310 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3311 scan += UTF8SKIP(scan);
3316 while (scan < loceol && REGINCLASS(p, *scan))
3320 while (scan < loceol && isALNUM(*scan))
3325 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3326 scan += UTF8SKIP(scan);
3331 PL_reg_flags |= RF_tainted;
3332 while (scan < loceol && isALNUM_LC(*scan))
3336 PL_reg_flags |= RF_tainted;
3338 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3339 scan += UTF8SKIP(scan);
3345 while (scan < loceol && !isALNUM(*scan))
3350 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3351 scan += UTF8SKIP(scan);
3356 PL_reg_flags |= RF_tainted;
3357 while (scan < loceol && !isALNUM_LC(*scan))
3361 PL_reg_flags |= RF_tainted;
3363 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3364 scan += UTF8SKIP(scan);
3369 while (scan < loceol && isSPACE(*scan))
3374 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3375 scan += UTF8SKIP(scan);
3380 PL_reg_flags |= RF_tainted;
3381 while (scan < loceol && isSPACE_LC(*scan))
3385 PL_reg_flags |= RF_tainted;
3387 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3388 scan += UTF8SKIP(scan);
3393 while (scan < loceol && !isSPACE(*scan))
3398 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3399 scan += UTF8SKIP(scan);
3404 PL_reg_flags |= RF_tainted;
3405 while (scan < loceol && !isSPACE_LC(*scan))
3409 PL_reg_flags |= RF_tainted;
3411 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3412 scan += UTF8SKIP(scan);
3417 while (scan < loceol && isDIGIT(*scan))
3422 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3423 scan += UTF8SKIP(scan);
3429 while (scan < loceol && !isDIGIT(*scan))
3434 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3435 scan += UTF8SKIP(scan);
3439 default: /* Called on something of 0 width. */
3440 break; /* So match right here or not at all. */
3446 c = scan - PL_reginput;
3451 SV *prop = sv_newmortal();
3454 PerlIO_printf(Perl_debug_log,
3455 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3456 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3463 - regrepeat_hard - repeatedly match something, report total lenth and length
3465 * The repeater is supposed to have constant length.
3469 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3472 register char *scan;
3473 register char *start;
3474 register char *loceol = PL_regeol;
3476 I32 count = 0, res = 1;
3481 start = PL_reginput;
3483 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3486 while (start < PL_reginput) {
3488 start += UTF8SKIP(start);
3499 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3501 *lp = l = PL_reginput - start;
3502 if (max != REG_INFTY && l*max < loceol - scan)
3503 loceol = scan + l*max;
3516 - reginclass - determine if a character falls into a character class
3520 S_reginclass(pTHX_ register regnode *p, register I32 c)
3523 char flags = ANYOF_FLAGS(p);
3527 if (ANYOF_BITMAP_TEST(p, c))
3529 else if (flags & ANYOF_FOLD) {
3531 if (flags & ANYOF_LOCALE) {
3532 PL_reg_flags |= RF_tainted;
3533 cf = PL_fold_locale[c];
3537 if (ANYOF_BITMAP_TEST(p, cf))
3541 if (!match && (flags & ANYOF_CLASS)) {
3542 PL_reg_flags |= RF_tainted;
3544 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3545 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3546 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3547 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3548 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3549 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3550 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3551 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3552 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3553 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3554 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3555 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3556 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3557 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3558 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3559 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3560 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3561 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3562 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3563 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3564 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3565 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3566 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3567 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3568 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3569 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3570 ) /* How's that for a conditional? */
3576 return (flags & ANYOF_INVERT) ? !match : match;
3580 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3583 char flags = ARG1(f);
3585 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3587 if (swash_fetch(sv, p))
3589 else if (flags & ANYOF_FOLD) {
3592 if (flags & ANYOF_LOCALE) {
3593 PL_reg_flags |= RF_tainted;
3594 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3597 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3598 if (swash_fetch(sv, tmpbuf))
3602 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3604 return (flags & ANYOF_INVERT) ? !match : match;
3608 S_reghop(pTHX_ U8 *s, I32 off)
3612 while (off-- && s < (U8*)PL_regeol)
3617 if (s > (U8*)PL_bostr) {
3620 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3622 } /* XXX could check well-formedness here */
3630 S_reghopmaybe(pTHX_ U8* s, I32 off)
3634 while (off-- && s < (U8*)PL_regeol)
3641 if (s > (U8*)PL_bostr) {
3644 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3646 } /* XXX could check well-formedness here */
3662 restore_pos(pTHXo_ void *arg)
3665 if (PL_reg_eval_set) {
3666 if (PL_reg_oldsaved) {
3667 PL_reg_re->subbeg = PL_reg_oldsaved;
3668 PL_reg_re->sublen = PL_reg_oldsavedlen;
3669 RX_MATCH_COPIED_on(PL_reg_re);
3671 PL_reg_magic->mg_len = PL_reg_oldpos;
3672 PL_reg_eval_set = 0;
3673 PL_curpm = PL_reg_oldcurpm;