5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
224 #define TRYPAREN(paren, n, input) { \
227 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
228 PL_regendp[paren] = input - PL_bostr; \
231 PL_regendp[paren] = -1; \
233 if (regmatch(next)) \
236 PL_regendp[paren] = -1; \
241 * pregexec and friends
245 - pregexec - match a regexp against a string
248 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
249 char *strbeg, I32 minend, SV *screamer, U32 nosave)
250 /* strend: pointer to null at end of string */
251 /* strbeg: real beginning of string */
252 /* minend: end of match must be >=minend after stringarg. */
253 /* nosave: For optimizations. */
256 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
257 nosave ? 0 : REXEC_COPY_STR);
261 S_cache_re(pTHX_ regexp *prog)
264 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
266 PL_regprogram = prog->program;
268 PL_regnpar = prog->nparens;
269 PL_regdata = prog->data;
274 * Need to implement the following flags for reg_anch:
276 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
278 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
279 * INTUIT_AUTORITATIVE_ML
280 * INTUIT_ONCE_NOML - Intuit can match in one location only.
283 * Another flag for this function: SECOND_TIME (so that float substrs
284 * with giant delta may be not rechecked).
287 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
289 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
290 Otherwise, only SvCUR(sv) is used to get strbeg. */
292 /* XXXX We assume that strpos is strbeg unless sv. */
294 /* XXXX Some places assume that there is a fixed substring.
295 An update may be needed if optimizer marks as "INTUITable"
296 RExen without fixed substrings. Similarly, it is assumed that
297 lengths of all the strings are no more than minlen, thus they
298 cannot come from lookahead.
299 (Or minlen should take into account lookahead.) */
301 /* A failure to find a constant substring means that there is no need to make
302 an expensive call to REx engine, thus we celebrate a failure. Similarly,
303 finding a substring too deep into the string means that less calls to
304 regtry() should be needed.
306 REx compiler's optimizer found 4 possible hints:
307 a) Anchored substring;
309 c) Whether we are anchored (beginning-of-line or \G);
310 d) First node (of those at offset 0) which may distingush positions;
311 We use a)b)d) and multiline-part of c), and try to find a position in the
312 string which does not contradict any of them.
315 /* Most of decisions we do here should have been done at compile time.
316 The nodes of the REx which we used for the search should have been
317 deleted from the finite automaton. */
320 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
321 char *strend, U32 flags, re_scream_pos_data *data)
323 register I32 start_shift;
324 /* Should be nonnegative! */
325 register I32 end_shift;
331 register char *other_last = Nullch; /* other substr checked before this */
332 char *check_at; /* check substr found at this pos */
334 char *i_strpos = strpos;
337 DEBUG_r( if (!PL_colorset) reginitcolors() );
338 DEBUG_r(PerlIO_printf(Perl_debug_log,
339 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
340 PL_colors[4],PL_colors[5],PL_colors[0],
343 (strlen(prog->precomp) > 60 ? "..." : ""),
345 (int)(strend - strpos > 60 ? 60 : strend - strpos),
346 strpos, PL_colors[1],
347 (strend - strpos > 60 ? "..." : ""))
350 if (prog->minlen > strend - strpos) {
351 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
354 check = prog->check_substr;
355 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
356 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
357 || ( (prog->reganch & ROPT_ANCH_BOL)
358 && !PL_multiline ) ); /* Check after \n? */
360 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
361 /* Substring at constant offset from beg-of-str... */
364 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
365 /* SvCUR is not set on references: SvRV and SvPVX overlap */
367 && (strpos + SvCUR(sv) != strend)) {
368 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
371 PL_regeol = strend; /* Used in HOP() */
372 s = HOPc(strpos, prog->check_offset_min);
374 slen = SvCUR(check); /* >= 1 */
376 if ( strend - s > slen || strend - s < slen - 1
377 || (strend - s == slen && strend[-1] != '\n')) {
378 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
381 /* Now should match s[0..slen-2] */
383 if (slen && (*SvPVX(check) != *s
385 && memNE(SvPVX(check), s, slen)))) {
387 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
391 else if (*SvPVX(check) != *s
392 || ((slen = SvCUR(check)) > 1
393 && memNE(SvPVX(check), s, slen)))
395 goto success_at_start;
397 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
399 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
400 end_shift = prog->minlen - start_shift -
401 CHR_SVLEN(check) + (SvTAIL(check) != 0);
403 I32 end = prog->check_offset_max + CHR_SVLEN(check)
404 - (SvTAIL(check) != 0);
405 I32 eshift = strend - s - end;
407 if (end_shift < eshift)
411 else { /* Can match at random position */
414 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
415 /* Should be nonnegative! */
416 end_shift = prog->minlen - start_shift -
417 CHR_SVLEN(check) + (SvTAIL(check) != 0);
420 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
422 Perl_croak(aTHX_ "panic: end_shift");
426 /* Find a possible match in the region s..strend by looking for
427 the "check" substring in the region corrected by start/end_shift. */
428 if (flags & REXEC_SCREAM) {
429 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
430 I32 p = -1; /* Internal iterator of scream. */
431 I32 *pp = data ? data->scream_pos : &p;
433 if (PL_screamfirst[BmRARE(check)] >= 0
434 || ( BmRARE(check) == '\n'
435 && (BmPREVIOUS(check) == SvCUR(check) - 1)
437 s = screaminstr(sv, check,
438 start_shift + (s - strbeg), end_shift, pp, 0);
442 *data->scream_olds = s;
445 s = fbm_instr((unsigned char*)s + start_shift,
446 (unsigned char*)strend - end_shift,
447 check, PL_multiline ? FBMrf_MULTILINE : 0);
449 /* Update the count-of-usability, remove useless subpatterns,
452 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
453 (s ? "Found" : "Did not find"),
454 ((check == prog->anchored_substr) ? "anchored" : "floating"),
456 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
458 PL_colors[1], (SvTAIL(check) ? "$" : ""),
459 (s ? " at offset " : "...\n") ) );
466 /* Finish the diagnostic message */
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
469 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
470 Start with the other substr.
471 XXXX no SCREAM optimization yet - and a very coarse implementation
472 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
473 *always* match. Probably should be marked during compile...
474 Probably it is right to do no SCREAM here...
477 if (prog->float_substr && prog->anchored_substr) {
478 /* Take into account the "other" substring. */
479 /* XXXX May be hopelessly wrong for UTF... */
482 if (check == prog->float_substr) {
485 char *last = s - start_shift, *last1, *last2;
489 t = s - prog->check_offset_max;
490 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
491 && (!(prog->reganch & ROPT_UTF8)
492 || (PL_bostr = strpos, /* Used in regcopmaybe() */
493 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
498 t += prog->anchored_offset;
499 if (t < other_last) /* These positions already checked */
502 last2 = last1 = strend - prog->minlen;
505 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
506 /* On end-of-str: see comment below. */
507 s = fbm_instr((unsigned char*)t,
508 (unsigned char*)last1 + prog->anchored_offset
509 + SvCUR(prog->anchored_substr)
510 - (SvTAIL(prog->anchored_substr)!=0),
511 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
512 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
513 (s ? "Found" : "Contradicts"),
515 (int)(SvCUR(prog->anchored_substr)
516 - (SvTAIL(prog->anchored_substr)!=0)),
517 SvPVX(prog->anchored_substr),
518 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
520 if (last1 >= last2) {
521 DEBUG_r(PerlIO_printf(Perl_debug_log,
522 ", giving up...\n"));
525 DEBUG_r(PerlIO_printf(Perl_debug_log,
526 ", trying floating at offset %ld...\n",
527 (long)(s1 + 1 - i_strpos)));
528 PL_regeol = strend; /* Used in HOP() */
529 other_last = last1 + prog->anchored_offset + 1;
534 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
535 (long)(s - i_strpos)));
536 t = s - prog->anchored_offset;
545 else { /* Take into account the floating substring. */
550 last1 = last = strend - prog->minlen + prog->float_min_offset;
551 if (last - t > prog->float_max_offset)
552 last = t + prog->float_max_offset;
553 s = t + prog->float_min_offset;
556 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
557 /* fbm_instr() takes into account exact value of end-of-str
558 if the check is SvTAIL(ed). Since false positives are OK,
559 and end-of-str is not later than strend we are OK. */
560 s = fbm_instr((unsigned char*)s,
561 (unsigned char*)last + SvCUR(prog->float_substr)
562 - (SvTAIL(prog->float_substr)!=0),
563 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
564 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
565 (s ? "Found" : "Contradicts"),
567 (int)(SvCUR(prog->float_substr)
568 - (SvTAIL(prog->float_substr)!=0)),
569 SvPVX(prog->float_substr),
570 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
573 DEBUG_r(PerlIO_printf(Perl_debug_log,
574 ", giving up...\n"));
577 DEBUG_r(PerlIO_printf(Perl_debug_log,
578 ", trying anchored starting at offset %ld...\n",
579 (long)(s1 + 1 - i_strpos)));
580 other_last = last + 1;
581 PL_regeol = strend; /* Used in HOP() */
586 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
587 (long)(s - i_strpos)));
597 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
600 && (!(prog->reganch & ROPT_UTF8)
601 || (PL_bostr = strpos, /* Used in regcopmaybe() */
602 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
605 /* Fixed substring is found far enough so that the match
606 cannot start at strpos. */
608 if (ml_anch && t[-1] != '\n') {
609 /* Eventually fbm_*() should handle this, but often
610 anchored_offset is not 0, so this check will not be wasted. */
611 /* XXXX In the code below we prefer to look for "^" even in
612 presence of anchored substrings. And we search even
613 beyond the found float position. These pessimizations
614 are historical artefacts only. */
616 while (t < strend - prog->minlen) {
618 if (t < check_at - prog->check_offset_min) {
619 if (prog->anchored_substr) {
620 /* Since we moved from the found position,
621 we definitely contradict the found anchored
622 substr. Due to the above check we do not
623 contradict "check" substr.
624 Thus we can arrive here only if check substr
625 is float. Redo checking for "other"=="fixed".
628 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
629 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
630 goto do_other_anchored;
632 /* We don't contradict the found floating substring. */
633 /* XXXX Why not check for STCLASS? */
635 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
636 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
639 /* Position contradicts check-string */
640 /* XXXX probably better to look for check-string
641 than for "\n", so one should lower the limit for t? */
642 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
643 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
644 other_last = strpos = s = t + 1;
649 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
650 PL_colors[0],PL_colors[1]));
654 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
655 PL_colors[0],PL_colors[1]));
659 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
663 /* The found string does not prohibit matching at strpos,
664 - no optimization of calling REx engine can be performed,
665 unless it was an MBOL and we are not after MBOL,
666 or a future STCLASS check will fail this. */
668 /* Even in this situation we may use MBOL flag if strpos is offset
669 wrt the start of the string. */
670 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
671 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
672 /* May be due to an implicit anchor of m{.*foo} */
673 && !(prog->reganch & ROPT_IMPLICIT))
678 DEBUG_r( if (ml_anch)
679 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
680 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
683 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
684 && prog->check_substr /* Could be deleted already */
685 && --BmUSEFUL(prog->check_substr) < 0
686 && prog->check_substr == prog->float_substr)
688 /* If flags & SOMETHING - do not do it many times on the same match */
689 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
690 SvREFCNT_dec(prog->check_substr);
691 prog->check_substr = Nullsv; /* disable */
692 prog->float_substr = Nullsv; /* clear */
694 /* XXXX This is a remnant of the old implementation. It
695 looks wasteful, since now INTUIT can use many
697 prog->reganch &= ~RE_USE_INTUIT;
704 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
705 if (prog->regstclass) {
706 /* minlen == 0 is possible if regstclass is \b or \B,
707 and the fixed substr is ''$.
708 Since minlen is already taken into account, s+1 is before strend;
709 accidentally, minlen >= 1 guaranties no false positives at s + 1
710 even for \b or \B. But (minlen? 1 : 0) below assumes that
711 regstclass does not come from lookahead... */
712 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
713 This leaves EXACTF only, which is dealt with in find_byclass(). */
714 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
715 ? STR_LEN(prog->regstclass)
717 char *endpos = (prog->anchored_substr || ml_anch)
718 ? s + (prog->minlen? cl_l : 0)
719 : (prog->float_substr ? check_at - start_shift + cl_l
721 char *startpos = sv && SvPOK(sv) ? strend - SvCUR(sv) : s;
724 if (prog->reganch & ROPT_UTF8) {
725 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
728 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
733 if (endpos == strend) {
734 DEBUG_r( PerlIO_printf(Perl_debug_log,
735 "Could not match STCLASS...\n") );
738 DEBUG_r( PerlIO_printf(Perl_debug_log,
739 "This position contradicts STCLASS...\n") );
740 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
742 /* Contradict one of substrings */
743 if (prog->anchored_substr) {
744 if (prog->anchored_substr == check) {
745 DEBUG_r( what = "anchored" );
747 PL_regeol = strend; /* Used in HOP() */
749 if (s + start_shift + end_shift > strend) {
750 /* XXXX Should be taken into account earlier? */
751 DEBUG_r( PerlIO_printf(Perl_debug_log,
752 "Could not match STCLASS...\n") );
755 DEBUG_r( PerlIO_printf(Perl_debug_log,
756 "Looking for %s substr starting at offset %ld...\n",
757 what, (long)(s + start_shift - i_strpos)) );
760 /* Have both, check_string is floating */
761 if (t + start_shift >= check_at) /* Contradicts floating=check */
762 goto retry_floating_check;
763 /* Recheck anchored substring, but not floating... */
765 DEBUG_r( PerlIO_printf(Perl_debug_log,
766 "Looking for anchored substr starting at offset %ld...\n",
767 (long)(other_last - i_strpos)) );
768 goto do_other_anchored;
770 /* Another way we could have checked stclass at the
771 current position only: */
774 DEBUG_r( PerlIO_printf(Perl_debug_log,
775 "Looking for /%s^%s/m starting at offset %ld...\n",
776 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
779 if (!prog->float_substr) /* Could have been deleted */
781 /* Check is floating subtring. */
782 retry_floating_check:
783 t = check_at - start_shift;
784 DEBUG_r( what = "floating" );
785 goto hop_and_restart;
788 PerlIO_printf(Perl_debug_log,
789 "By STCLASS: moving %ld --> %ld\n",
790 (long)(t - i_strpos), (long)(s - i_strpos));
792 PerlIO_printf(Perl_debug_log,
793 "Does not contradict STCLASS...\n") );
795 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
796 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
799 fail_finish: /* Substring not found */
800 if (prog->check_substr) /* could be removed already */
801 BmUSEFUL(prog->check_substr) += 5; /* hooray */
803 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
804 PL_colors[4],PL_colors[5]));
808 /* We know what class REx starts with. Try to find this position... */
810 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
812 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
818 register I32 tmp = 1; /* Scratch variable? */
820 /* We know what class it must start with. */
824 if (REGINCLASSUTF8(c, (U8*)s)) {
825 if (tmp && (norun || regtry(prog, s)))
837 if (REGINCLASS(c, *(U8*)s)) {
838 if (tmp && (norun || regtry(prog, s)))
858 c2 = PL_fold_locale[c1];
863 e = s; /* Due to minlen logic of intuit() */
864 /* Here it is NOT UTF! */
868 && (ln == 1 || !(OP(c) == EXACTF
870 : ibcmp_locale(s, m, ln)))
871 && (norun || regtry(prog, s)) )
877 if ( (*(U8*)s == c1 || *(U8*)s == c2)
878 && (ln == 1 || !(OP(c) == EXACTF
880 : ibcmp_locale(s, m, ln)))
881 && (norun || regtry(prog, s)) )
888 PL_reg_flags |= RF_tainted;
891 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
892 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
894 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
896 if ((norun || regtry(prog, s)))
901 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
905 PL_reg_flags |= RF_tainted;
908 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
909 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
911 if (tmp == !(OP(c) == BOUNDUTF8 ?
912 swash_fetch(PL_utf8_alnum, (U8*)s) :
913 isALNUM_LC_utf8((U8*)s)))
916 if ((norun || regtry(prog, s)))
921 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
925 PL_reg_flags |= RF_tainted;
928 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
929 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
931 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
933 else if ((norun || regtry(prog, s)))
937 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
941 PL_reg_flags |= RF_tainted;
944 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
945 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
947 if (tmp == !(OP(c) == NBOUNDUTF8 ?
948 swash_fetch(PL_utf8_alnum, (U8*)s) :
949 isALNUM_LC_utf8((U8*)s)))
951 else if ((norun || regtry(prog, s)))
955 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
961 if (tmp && (norun || regtry(prog, s)))
973 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
974 if (tmp && (norun || regtry(prog, s)))
985 PL_reg_flags |= RF_tainted;
987 if (isALNUM_LC(*s)) {
988 if (tmp && (norun || regtry(prog, s)))
999 PL_reg_flags |= RF_tainted;
1000 while (s < strend) {
1001 if (isALNUM_LC_utf8((U8*)s)) {
1002 if (tmp && (norun || regtry(prog, s)))
1013 while (s < strend) {
1015 if (tmp && (norun || regtry(prog, s)))
1026 while (s < strend) {
1027 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1028 if (tmp && (norun || regtry(prog, s)))
1039 PL_reg_flags |= RF_tainted;
1040 while (s < strend) {
1041 if (!isALNUM_LC(*s)) {
1042 if (tmp && (norun || regtry(prog, s)))
1053 PL_reg_flags |= RF_tainted;
1054 while (s < strend) {
1055 if (!isALNUM_LC_utf8((U8*)s)) {
1056 if (tmp && (norun || regtry(prog, s)))
1067 while (s < strend) {
1069 if (tmp && (norun || regtry(prog, s)))
1080 while (s < strend) {
1081 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1082 if (tmp && (norun || regtry(prog, s)))
1093 PL_reg_flags |= RF_tainted;
1094 while (s < strend) {
1095 if (isSPACE_LC(*s)) {
1096 if (tmp && (norun || regtry(prog, s)))
1107 PL_reg_flags |= RF_tainted;
1108 while (s < strend) {
1109 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1110 if (tmp && (norun || regtry(prog, s)))
1121 while (s < strend) {
1123 if (tmp && (norun || regtry(prog, s)))
1134 while (s < strend) {
1135 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1136 if (tmp && (norun || regtry(prog, s)))
1147 PL_reg_flags |= RF_tainted;
1148 while (s < strend) {
1149 if (!isSPACE_LC(*s)) {
1150 if (tmp && (norun || regtry(prog, s)))
1161 PL_reg_flags |= RF_tainted;
1162 while (s < strend) {
1163 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1164 if (tmp && (norun || regtry(prog, s)))
1175 while (s < strend) {
1177 if (tmp && (norun || regtry(prog, s)))
1188 while (s < strend) {
1189 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1190 if (tmp && (norun || regtry(prog, s)))
1201 PL_reg_flags |= RF_tainted;
1202 while (s < strend) {
1203 if (isDIGIT_LC(*s)) {
1204 if (tmp && (norun || regtry(prog, s)))
1215 PL_reg_flags |= RF_tainted;
1216 while (s < strend) {
1217 if (isDIGIT_LC_utf8((U8*)s)) {
1218 if (tmp && (norun || regtry(prog, s)))
1229 while (s < strend) {
1231 if (tmp && (norun || regtry(prog, s)))
1242 while (s < strend) {
1243 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1244 if (tmp && (norun || regtry(prog, s)))
1255 PL_reg_flags |= RF_tainted;
1256 while (s < strend) {
1257 if (!isDIGIT_LC(*s)) {
1258 if (tmp && (norun || regtry(prog, s)))
1269 PL_reg_flags |= RF_tainted;
1270 while (s < strend) {
1271 if (!isDIGIT_LC_utf8((U8*)s)) {
1272 if (tmp && (norun || regtry(prog, s)))
1283 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1292 - regexec_flags - match a regexp against a string
1295 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1296 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1297 /* strend: pointer to null at end of string */
1298 /* strbeg: real beginning of string */
1299 /* minend: end of match must be >=minend after stringarg. */
1300 /* data: May be used for some additional optimizations. */
1301 /* nosave: For optimizations. */
1305 register regnode *c;
1306 register char *startpos = stringarg;
1307 I32 minlen; /* must match at least this many chars */
1308 I32 dontbother = 0; /* how many characters not to try at end */
1309 /* I32 start_shift = 0; */ /* Offset of the start to find
1310 constant substr. */ /* CC */
1311 I32 end_shift = 0; /* Same for the end. */ /* CC */
1312 I32 scream_pos = -1; /* Internal iterator of scream. */
1314 SV* oreplsv = GvSV(PL_replgv);
1320 PL_regnarrate = PL_debug & 512;
1323 /* Be paranoid... */
1324 if (prog == NULL || startpos == NULL) {
1325 Perl_croak(aTHX_ "NULL regexp parameter");
1329 minlen = prog->minlen;
1330 if (strend - startpos < minlen) goto phooey;
1332 if (startpos == strbeg) /* is ^ valid at stringarg? */
1335 PL_regprev = (U32)stringarg[-1];
1336 if (!PL_multiline && PL_regprev == '\n')
1337 PL_regprev = '\0'; /* force ^ to NOT match */
1340 /* Check validity of program. */
1341 if (UCHARAT(prog->program) != REG_MAGIC) {
1342 Perl_croak(aTHX_ "corrupted regexp program");
1346 PL_reg_eval_set = 0;
1349 if (prog->reganch & ROPT_UTF8)
1350 PL_reg_flags |= RF_utf8;
1352 /* Mark beginning of line for ^ and lookbehind. */
1353 PL_regbol = startpos;
1357 /* Mark end of line for $ (and such) */
1360 /* see how far we have to get to not match where we matched before */
1361 PL_regtill = startpos+minend;
1363 /* We start without call_cc context. */
1366 /* If there is a "must appear" string, look for it. */
1369 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1372 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1373 PL_reg_ganch = startpos;
1374 else if (sv && SvTYPE(sv) >= SVt_PVMG
1376 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1377 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1378 if (prog->reganch & ROPT_ANCH_GPOS) {
1379 if (s > PL_reg_ganch)
1384 else /* pos() not defined */
1385 PL_reg_ganch = strbeg;
1388 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1389 re_scream_pos_data d;
1391 d.scream_olds = &scream_olds;
1392 d.scream_pos = &scream_pos;
1393 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1395 goto phooey; /* not present */
1398 DEBUG_r( if (!PL_colorset) reginitcolors() );
1399 DEBUG_r(PerlIO_printf(Perl_debug_log,
1400 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1401 PL_colors[4],PL_colors[5],PL_colors[0],
1404 (strlen(prog->precomp) > 60 ? "..." : ""),
1406 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1407 startpos, PL_colors[1],
1408 (strend - startpos > 60 ? "..." : ""))
1411 /* Simplest case: anchored match need be tried only once. */
1412 /* [unless only anchor is BOL and multiline is set] */
1413 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1414 if (s == startpos && regtry(prog, startpos))
1416 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1417 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1422 dontbother = minlen - 1;
1423 end = HOPc(strend, -dontbother) - 1;
1424 /* for multiline we only have to try after newlines */
1425 if (prog->check_substr) {
1429 if (regtry(prog, s))
1434 if (prog->reganch & RE_USE_INTUIT) {
1435 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1446 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1447 if (regtry(prog, s))
1454 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1455 if (regtry(prog, PL_reg_ganch))
1460 /* Messy cases: unanchored match. */
1461 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1462 /* we have /x+whatever/ */
1463 /* it must be a one character string (XXXX Except UTF?) */
1464 char ch = SvPVX(prog->anchored_substr)[0];
1470 while (s < strend) {
1472 DEBUG_r( did_match = 1 );
1473 if (regtry(prog, s)) goto got_it;
1475 while (s < strend && *s == ch)
1482 while (s < strend) {
1484 DEBUG_r( did_match = 1 );
1485 if (regtry(prog, s)) goto got_it;
1487 while (s < strend && *s == ch)
1493 DEBUG_r(did_match ||
1494 PerlIO_printf(Perl_debug_log,
1495 "Did not find anchored character...\n"));
1498 else if (prog->anchored_substr != Nullsv
1499 || (prog->float_substr != Nullsv
1500 && prog->float_max_offset < strend - s)) {
1501 SV *must = prog->anchored_substr
1502 ? prog->anchored_substr : prog->float_substr;
1504 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1506 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1507 char *last = HOPc(strend, /* Cannot start after this */
1508 -(I32)(CHR_SVLEN(must)
1509 - (SvTAIL(must) != 0) + back_min));
1510 char *last1; /* Last position checked before */
1516 last1 = HOPc(s, -1);
1518 last1 = s - 1; /* bogus */
1520 /* XXXX check_substr already used to find `s', can optimize if
1521 check_substr==must. */
1523 dontbother = end_shift;
1524 strend = HOPc(strend, -dontbother);
1525 while ( (s <= last) &&
1526 ((flags & REXEC_SCREAM)
1527 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1528 end_shift, &scream_pos, 0))
1529 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1530 (unsigned char*)strend, must,
1531 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1532 DEBUG_r( did_match = 1 );
1533 if (HOPc(s, -back_max) > last1) {
1534 last1 = HOPc(s, -back_min);
1535 s = HOPc(s, -back_max);
1538 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1540 last1 = HOPc(s, -back_min);
1544 while (s <= last1) {
1545 if (regtry(prog, s))
1551 while (s <= last1) {
1552 if (regtry(prog, s))
1558 DEBUG_r(did_match ||
1559 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1560 ((must == prog->anchored_substr)
1561 ? "anchored" : "floating"),
1563 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1565 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1568 else if ((c = prog->regstclass)) {
1569 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1570 /* don't bother with what can't match */
1571 strend = HOPc(strend, -(minlen - 1));
1572 if (find_byclass(prog, c, s, strend, startpos, 0))
1574 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1578 if (prog->float_substr != Nullsv) { /* Trim the end. */
1581 if (flags & REXEC_SCREAM) {
1582 last = screaminstr(sv, prog->float_substr, s - strbeg,
1583 end_shift, &scream_pos, 1); /* last one */
1585 last = scream_olds; /* Only one occurence. */
1589 char *little = SvPV(prog->float_substr, len);
1591 if (SvTAIL(prog->float_substr)) {
1592 if (memEQ(strend - len + 1, little, len - 1))
1593 last = strend - len + 1;
1594 else if (!PL_multiline)
1595 last = memEQ(strend - len, little, len)
1596 ? strend - len : Nullch;
1602 last = rninstr(s, strend, little, little + len);
1604 last = strend; /* matching `$' */
1608 DEBUG_r(PerlIO_printf(Perl_debug_log,
1609 "%sCan't trim the tail, match fails (should not happen)%s\n",
1610 PL_colors[4],PL_colors[5]));
1611 goto phooey; /* Should not happen! */
1613 dontbother = strend - last + prog->float_min_offset;
1615 if (minlen && (dontbother < minlen))
1616 dontbother = minlen - 1;
1617 strend -= dontbother; /* this one's always in bytes! */
1618 /* We don't know much -- general case. */
1621 if (regtry(prog, s))
1630 if (regtry(prog, s))
1632 } while (s++ < strend);
1640 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1642 if (PL_reg_eval_set) {
1643 /* Preserve the current value of $^R */
1644 if (oreplsv != GvSV(PL_replgv))
1645 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1646 restored, the value remains
1648 restore_pos(aTHXo_ 0);
1651 /* make sure $`, $&, $', and $digit will work later */
1652 if ( !(flags & REXEC_NOT_FIRST) ) {
1653 if (RX_MATCH_COPIED(prog)) {
1654 Safefree(prog->subbeg);
1655 RX_MATCH_COPIED_off(prog);
1657 if (flags & REXEC_COPY_STR) {
1658 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1660 s = savepvn(strbeg, i);
1663 RX_MATCH_COPIED_on(prog);
1666 prog->subbeg = strbeg;
1667 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1674 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1675 PL_colors[4],PL_colors[5]));
1676 if (PL_reg_eval_set)
1677 restore_pos(aTHXo_ 0);
1682 - regtry - try match at specific point
1684 STATIC I32 /* 0 failure, 1 success */
1685 S_regtry(pTHX_ regexp *prog, char *startpos)
1693 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1696 PL_reg_eval_set = RS_init;
1698 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1699 (IV)(PL_stack_sp - PL_stack_base));
1701 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1702 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1703 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1705 /* Apparently this is not needed, judging by wantarray. */
1706 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1707 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1710 /* Make $_ available to executed code. */
1711 if (PL_reg_sv != DEFSV) {
1712 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1717 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1718 && (mg = mg_find(PL_reg_sv, 'g')))) {
1719 /* prepare for quick setting of pos */
1720 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1721 mg = mg_find(PL_reg_sv, 'g');
1725 PL_reg_oldpos = mg->mg_len;
1726 SAVEDESTRUCTOR_X(restore_pos, 0);
1729 Newz(22,PL_reg_curpm, 1, PMOP);
1730 PL_reg_curpm->op_pmregexp = prog;
1731 PL_reg_oldcurpm = PL_curpm;
1732 PL_curpm = PL_reg_curpm;
1733 if (RX_MATCH_COPIED(prog)) {
1734 /* Here is a serious problem: we cannot rewrite subbeg,
1735 since it may be needed if this match fails. Thus
1736 $` inside (?{}) could fail... */
1737 PL_reg_oldsaved = prog->subbeg;
1738 PL_reg_oldsavedlen = prog->sublen;
1739 RX_MATCH_COPIED_off(prog);
1742 PL_reg_oldsaved = Nullch;
1743 prog->subbeg = PL_bostr;
1744 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1746 prog->startp[0] = startpos - PL_bostr;
1747 PL_reginput = startpos;
1748 PL_regstartp = prog->startp;
1749 PL_regendp = prog->endp;
1750 PL_reglastparen = &prog->lastparen;
1751 prog->lastparen = 0;
1753 DEBUG_r(PL_reg_starttry = startpos);
1754 if (PL_reg_start_tmpl <= prog->nparens) {
1755 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1756 if(PL_reg_start_tmp)
1757 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1759 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1762 /* XXXX What this code is doing here?!!! There should be no need
1763 to do this again and again, PL_reglastparen should take care of
1767 if (prog->nparens) {
1768 for (i = prog->nparens; i >= 1; i--) {
1774 if (regmatch(prog->program + 1)) {
1775 prog->endp[0] = PL_reginput - PL_bostr;
1783 - regmatch - main matching routine
1785 * Conceptually the strategy is simple: check to see whether the current
1786 * node matches, call self recursively to see whether the rest matches,
1787 * and then act accordingly. In practice we make some effort to avoid
1788 * recursion, in particular by going through "ordinary" nodes (that don't
1789 * need to know whether the rest of the match failed) by a loop instead of
1792 /* [lwall] I've hoisted the register declarations to the outer block in order to
1793 * maybe save a little bit of pushing and popping on the stack. It also takes
1794 * advantage of machines that use a register save mask on subroutine entry.
1796 STATIC I32 /* 0 failure, 1 success */
1797 S_regmatch(pTHX_ regnode *prog)
1800 register regnode *scan; /* Current node. */
1801 regnode *next; /* Next node. */
1802 regnode *inner; /* Next node in internal branch. */
1803 register I32 nextchr; /* renamed nextchr - nextchar colides with
1804 function of same name */
1805 register I32 n; /* no or next */
1806 register I32 ln; /* len or last */
1807 register char *s; /* operand or save */
1808 register char *locinput = PL_reginput;
1809 register I32 c1, c2, paren; /* case fold search, parenth */
1810 int minmod = 0, sw = 0, logical = 0;
1815 /* Note that nextchr is a byte even in UTF */
1816 nextchr = UCHARAT(locinput);
1818 while (scan != NULL) {
1819 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1821 # define sayYES goto yes
1822 # define sayNO goto no
1823 # define sayYES_FINAL goto yes_final
1824 # define sayYES_LOUD goto yes_loud
1825 # define sayNO_FINAL goto no_final
1826 # define sayNO_SILENT goto do_no
1827 # define saySAME(x) if (x) goto yes; else goto no
1828 # define REPORT_CODE_OFF 24
1830 # define sayYES return 1
1831 # define sayNO return 0
1832 # define sayYES_FINAL return 1
1833 # define sayYES_LOUD return 1
1834 # define sayNO_FINAL return 0
1835 # define sayNO_SILENT return 0
1836 # define saySAME(x) return x
1839 SV *prop = sv_newmortal();
1840 int docolor = *PL_colors[0];
1841 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1842 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1843 /* The part of the string before starttry has one color
1844 (pref0_len chars), between starttry and current
1845 position another one (pref_len - pref0_len chars),
1846 after the current position the third one.
1847 We assume that pref0_len <= pref_len, otherwise we
1848 decrease pref0_len. */
1849 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1850 ? (5 + taill) - l : locinput - PL_bostr);
1851 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1853 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1854 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1855 ? (5 + taill) - pref_len : PL_regeol - locinput);
1858 if (pref0_len > pref_len)
1859 pref0_len = pref_len;
1860 regprop(prop, scan);
1861 PerlIO_printf(Perl_debug_log,
1862 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1863 (IV)(locinput - PL_bostr),
1864 PL_colors[4], pref0_len,
1865 locinput - pref_len, PL_colors[5],
1866 PL_colors[2], pref_len - pref0_len,
1867 locinput - pref_len + pref0_len, PL_colors[3],
1868 (docolor ? "" : "> <"),
1869 PL_colors[0], l, locinput, PL_colors[1],
1870 15 - l - pref_len + 1,
1872 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1876 next = scan + NEXT_OFF(scan);
1882 if (locinput == PL_bostr
1883 ? PL_regprev == '\n'
1885 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1887 /* regtill = regbol; */
1892 if (locinput == PL_bostr
1893 ? PL_regprev == '\n'
1894 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1900 if (locinput == PL_bostr)
1904 if (locinput == PL_reg_ganch)
1914 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1919 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1921 if (PL_regeol - locinput > 1)
1925 if (PL_regeol != locinput)
1929 if (nextchr & 0x80) {
1930 locinput += PL_utf8skip[nextchr];
1931 if (locinput > PL_regeol)
1933 nextchr = UCHARAT(locinput);
1936 if (!nextchr && locinput >= PL_regeol)
1938 nextchr = UCHARAT(++locinput);
1941 if (!nextchr && locinput >= PL_regeol)
1943 nextchr = UCHARAT(++locinput);
1946 if (nextchr & 0x80) {
1947 locinput += PL_utf8skip[nextchr];
1948 if (locinput > PL_regeol)
1950 nextchr = UCHARAT(locinput);
1953 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1955 nextchr = UCHARAT(++locinput);
1958 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1960 nextchr = UCHARAT(++locinput);
1965 /* Inline the first character, for speed. */
1966 if (UCHARAT(s) != nextchr)
1968 if (PL_regeol - locinput < ln)
1970 if (ln > 1 && memNE(s, locinput, ln))
1973 nextchr = UCHARAT(locinput);
1976 PL_reg_flags |= RF_tainted;
1985 c1 = OP(scan) == EXACTF;
1989 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1990 toLOWER_utf8((U8*)l) :
1991 toLOWER_LC_utf8((U8*)l)))
1999 nextchr = UCHARAT(locinput);
2003 /* Inline the first character, for speed. */
2004 if (UCHARAT(s) != nextchr &&
2005 UCHARAT(s) != ((OP(scan) == EXACTF)
2006 ? PL_fold : PL_fold_locale)[nextchr])
2008 if (PL_regeol - locinput < ln)
2010 if (ln > 1 && (OP(scan) == EXACTF
2011 ? ibcmp(s, locinput, ln)
2012 : ibcmp_locale(s, locinput, ln)))
2015 nextchr = UCHARAT(locinput);
2018 if (!REGINCLASSUTF8(scan, (U8*)locinput))
2020 if (locinput >= PL_regeol)
2022 locinput += PL_utf8skip[nextchr];
2023 nextchr = UCHARAT(locinput);
2027 nextchr = UCHARAT(locinput);
2028 if (!REGINCLASS(scan, nextchr))
2030 if (!nextchr && locinput >= PL_regeol)
2032 nextchr = UCHARAT(++locinput);
2035 PL_reg_flags |= RF_tainted;
2040 if (!(OP(scan) == ALNUM
2041 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2043 nextchr = UCHARAT(++locinput);
2046 PL_reg_flags |= RF_tainted;
2051 if (nextchr & 0x80) {
2052 if (!(OP(scan) == ALNUMUTF8
2053 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2054 : isALNUM_LC_utf8((U8*)locinput)))
2058 locinput += PL_utf8skip[nextchr];
2059 nextchr = UCHARAT(locinput);
2062 if (!(OP(scan) == ALNUMUTF8
2063 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2065 nextchr = UCHARAT(++locinput);
2068 PL_reg_flags |= RF_tainted;
2071 if (!nextchr && locinput >= PL_regeol)
2073 if (OP(scan) == NALNUM
2074 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2076 nextchr = UCHARAT(++locinput);
2079 PL_reg_flags |= RF_tainted;
2082 if (!nextchr && locinput >= PL_regeol)
2084 if (nextchr & 0x80) {
2085 if (OP(scan) == NALNUMUTF8
2086 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2087 : isALNUM_LC_utf8((U8*)locinput))
2091 locinput += PL_utf8skip[nextchr];
2092 nextchr = UCHARAT(locinput);
2095 if (OP(scan) == NALNUMUTF8
2096 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2098 nextchr = UCHARAT(++locinput);
2102 PL_reg_flags |= RF_tainted;
2106 /* was last char in word? */
2107 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2108 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2110 n = isALNUM(nextchr);
2113 ln = isALNUM_LC(ln);
2114 n = isALNUM_LC(nextchr);
2116 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2121 PL_reg_flags |= RF_tainted;
2125 /* was last char in word? */
2126 ln = (locinput != PL_regbol)
2127 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2128 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2129 ln = isALNUM_uni(ln);
2130 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2133 ln = isALNUM_LC_uni(ln);
2134 n = isALNUM_LC_utf8((U8*)locinput);
2136 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2140 PL_reg_flags |= RF_tainted;
2145 if (!(OP(scan) == SPACE
2146 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2148 nextchr = UCHARAT(++locinput);
2151 PL_reg_flags |= RF_tainted;
2156 if (nextchr & 0x80) {
2157 if (!(OP(scan) == SPACEUTF8
2158 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2159 : isSPACE_LC_utf8((U8*)locinput)))
2163 locinput += PL_utf8skip[nextchr];
2164 nextchr = UCHARAT(locinput);
2167 if (!(OP(scan) == SPACEUTF8
2168 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2170 nextchr = UCHARAT(++locinput);
2173 PL_reg_flags |= RF_tainted;
2176 if (!nextchr && locinput >= PL_regeol)
2178 if (OP(scan) == NSPACE
2179 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2181 nextchr = UCHARAT(++locinput);
2184 PL_reg_flags |= RF_tainted;
2187 if (!nextchr && locinput >= PL_regeol)
2189 if (nextchr & 0x80) {
2190 if (OP(scan) == NSPACEUTF8
2191 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2192 : isSPACE_LC_utf8((U8*)locinput))
2196 locinput += PL_utf8skip[nextchr];
2197 nextchr = UCHARAT(locinput);
2200 if (OP(scan) == NSPACEUTF8
2201 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2203 nextchr = UCHARAT(++locinput);
2206 PL_reg_flags |= RF_tainted;
2211 if (!(OP(scan) == DIGIT
2212 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2214 nextchr = UCHARAT(++locinput);
2217 PL_reg_flags |= RF_tainted;
2222 if (nextchr & 0x80) {
2223 if (!(OP(scan) == DIGITUTF8
2224 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2225 : isDIGIT_LC_utf8((U8*)locinput)))
2229 locinput += PL_utf8skip[nextchr];
2230 nextchr = UCHARAT(locinput);
2233 if (!(OP(scan) == DIGITUTF8
2234 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2236 nextchr = UCHARAT(++locinput);
2239 PL_reg_flags |= RF_tainted;
2242 if (!nextchr && locinput >= PL_regeol)
2244 if (OP(scan) == NDIGIT
2245 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2247 nextchr = UCHARAT(++locinput);
2250 PL_reg_flags |= RF_tainted;
2253 if (!nextchr && locinput >= PL_regeol)
2255 if (nextchr & 0x80) {
2256 if (OP(scan) == NDIGITUTF8
2257 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2258 : isDIGIT_LC_utf8((U8*)locinput))
2262 locinput += PL_utf8skip[nextchr];
2263 nextchr = UCHARAT(locinput);
2266 if (OP(scan) == NDIGITUTF8
2267 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2269 nextchr = UCHARAT(++locinput);
2272 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2274 locinput += PL_utf8skip[nextchr];
2275 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2276 locinput += UTF8SKIP(locinput);
2277 if (locinput > PL_regeol)
2279 nextchr = UCHARAT(locinput);
2282 PL_reg_flags |= RF_tainted;
2286 n = ARG(scan); /* which paren pair */
2287 ln = PL_regstartp[n];
2288 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2289 if (*PL_reglastparen < n || ln == -1)
2290 sayNO; /* Do not match unless seen CLOSEn. */
2291 if (ln == PL_regendp[n])
2295 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2297 char *e = PL_bostr + PL_regendp[n];
2299 * Note that we can't do the "other character" lookup trick as
2300 * in the 8-bit case (no pun intended) because in Unicode we
2301 * have to map both upper and title case to lower case.
2303 if (OP(scan) == REFF) {
2307 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2317 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2324 nextchr = UCHARAT(locinput);
2328 /* Inline the first character, for speed. */
2329 if (UCHARAT(s) != nextchr &&
2331 (UCHARAT(s) != ((OP(scan) == REFF
2332 ? PL_fold : PL_fold_locale)[nextchr]))))
2334 ln = PL_regendp[n] - ln;
2335 if (locinput + ln > PL_regeol)
2337 if (ln > 1 && (OP(scan) == REF
2338 ? memNE(s, locinput, ln)
2340 ? ibcmp(s, locinput, ln)
2341 : ibcmp_locale(s, locinput, ln))))
2344 nextchr = UCHARAT(locinput);
2355 OP_4tree *oop = PL_op;
2356 COP *ocurcop = PL_curcop;
2357 SV **ocurpad = PL_curpad;
2361 PL_op = (OP_4tree*)PL_regdata->data[n];
2362 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2363 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2364 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2366 CALLRUNOPS(aTHX); /* Scalar context. */
2372 PL_curpad = ocurpad;
2373 PL_curcop = ocurcop;
2375 if (logical == 2) { /* Postponed subexpression. */
2377 MAGIC *mg = Null(MAGIC*);
2379 CHECKPOINT cp, lastcp;
2381 if(SvROK(ret) || SvRMAGICAL(ret)) {
2382 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2385 mg = mg_find(sv, 'r');
2388 re = (regexp *)mg->mg_obj;
2389 (void)ReREFCNT_inc(re);
2393 char *t = SvPV(ret, len);
2395 char *oprecomp = PL_regprecomp;
2396 I32 osize = PL_regsize;
2397 I32 onpar = PL_regnpar;
2400 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2401 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2403 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2404 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2405 PL_regprecomp = oprecomp;
2410 PerlIO_printf(Perl_debug_log,
2411 "Entering embedded `%s%.60s%s%s'\n",
2415 (strlen(re->precomp) > 60 ? "..." : ""))
2418 state.prev = PL_reg_call_cc;
2419 state.cc = PL_regcc;
2420 state.re = PL_reg_re;
2424 cp = regcppush(0); /* Save *all* the positions. */
2427 state.ss = PL_savestack_ix;
2428 *PL_reglastparen = 0;
2429 PL_reg_call_cc = &state;
2430 PL_reginput = locinput;
2432 /* XXXX This is too dramatic a measure... */
2435 if (regmatch(re->program + 1)) {
2436 /* Even though we succeeded, we need to restore
2437 global variables, since we may be wrapped inside
2438 SUSPEND, thus the match may be not finished yet. */
2440 /* XXXX Do this only if SUSPENDed? */
2441 PL_reg_call_cc = state.prev;
2442 PL_regcc = state.cc;
2443 PL_reg_re = state.re;
2444 cache_re(PL_reg_re);
2446 /* XXXX This is too dramatic a measure... */
2449 /* These are needed even if not SUSPEND. */
2457 PL_reg_call_cc = state.prev;
2458 PL_regcc = state.cc;
2459 PL_reg_re = state.re;
2460 cache_re(PL_reg_re);
2462 /* XXXX This is too dramatic a measure... */
2471 sv_setsv(save_scalar(PL_replgv), ret);
2475 n = ARG(scan); /* which paren pair */
2476 PL_reg_start_tmp[n] = locinput;
2481 n = ARG(scan); /* which paren pair */
2482 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2483 PL_regendp[n] = locinput - PL_bostr;
2484 if (n > *PL_reglastparen)
2485 *PL_reglastparen = n;
2488 n = ARG(scan); /* which paren pair */
2489 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2492 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2494 next = NEXTOPER(NEXTOPER(scan));
2496 next = scan + ARG(scan);
2497 if (OP(next) == IFTHEN) /* Fake one. */
2498 next = NEXTOPER(NEXTOPER(next));
2502 logical = scan->flags;
2504 /*******************************************************************
2505 PL_regcc contains infoblock about the innermost (...)* loop, and
2506 a pointer to the next outer infoblock.
2508 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2510 1) After matching X, regnode for CURLYX is processed;
2512 2) This regnode creates infoblock on the stack, and calls
2513 regmatch() recursively with the starting point at WHILEM node;
2515 3) Each hit of WHILEM node tries to match A and Z (in the order
2516 depending on the current iteration, min/max of {min,max} and
2517 greediness). The information about where are nodes for "A"
2518 and "Z" is read from the infoblock, as is info on how many times "A"
2519 was already matched, and greediness.
2521 4) After A matches, the same WHILEM node is hit again.
2523 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2524 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2525 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2526 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2527 of the external loop.
2529 Currently present infoblocks form a tree with a stem formed by PL_curcc
2530 and whatever it mentions via ->next, and additional attached trees
2531 corresponding to temporarily unset infoblocks as in "5" above.
2533 In the following picture infoblocks for outer loop of
2534 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2535 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2536 infoblocks are drawn below the "reset" infoblock.
2538 In fact in the picture below we do not show failed matches for Z and T
2539 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2540 more obvious *why* one needs to *temporary* unset infoblocks.]
2542 Matched REx position InfoBlocks Comment
2546 Y A)*?Z)*?T x <- O <- I
2547 YA )*?Z)*?T x <- O <- I
2548 YA A)*?Z)*?T x <- O <- I
2549 YAA )*?Z)*?T x <- O <- I
2550 YAA Z)*?T x <- O # Temporary unset I
2553 YAAZ Y(A)*?Z)*?T x <- O
2556 YAAZY (A)*?Z)*?T x <- O
2559 YAAZY A)*?Z)*?T x <- O <- I
2562 YAAZYA )*?Z)*?T x <- O <- I
2565 YAAZYA Z)*?T x <- O # Temporary unset I
2571 YAAZYAZ T x # Temporary unset O
2578 *******************************************************************/
2581 CHECKPOINT cp = PL_savestack_ix;
2583 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2585 cc.oldcc = PL_regcc;
2587 cc.parenfloor = *PL_reglastparen;
2589 cc.min = ARG1(scan);
2590 cc.max = ARG2(scan);
2591 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2595 PL_reginput = locinput;
2596 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2598 PL_regcc = cc.oldcc;
2604 * This is really hard to understand, because after we match
2605 * what we're trying to match, we must make sure the rest of
2606 * the REx is going to match for sure, and to do that we have
2607 * to go back UP the parse tree by recursing ever deeper. And
2608 * if it fails, we have to reset our parent's current state
2609 * that we can try again after backing off.
2612 CHECKPOINT cp, lastcp;
2613 CURCUR* cc = PL_regcc;
2614 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2616 n = cc->cur + 1; /* how many we know we matched */
2617 PL_reginput = locinput;
2620 PerlIO_printf(Perl_debug_log,
2621 "%*s %ld out of %ld..%ld cc=%lx\n",
2622 REPORT_CODE_OFF+PL_regindent*2, "",
2623 (long)n, (long)cc->min,
2624 (long)cc->max, (long)cc)
2627 /* If degenerate scan matches "", assume scan done. */
2629 if (locinput == cc->lastloc && n >= cc->min) {
2630 PL_regcc = cc->oldcc;
2634 PerlIO_printf(Perl_debug_log,
2635 "%*s empty match detected, try continuation...\n",
2636 REPORT_CODE_OFF+PL_regindent*2, "")
2638 if (regmatch(cc->next))
2646 /* First just match a string of min scans. */
2650 cc->lastloc = locinput;
2651 if (regmatch(cc->scan))
2654 cc->lastloc = lastloc;
2659 /* Check whether we already were at this position.
2660 Postpone detection until we know the match is not
2661 *that* much linear. */
2662 if (!PL_reg_maxiter) {
2663 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2664 PL_reg_leftiter = PL_reg_maxiter;
2666 if (PL_reg_leftiter-- == 0) {
2667 I32 size = (PL_reg_maxiter + 7)/8;
2668 if (PL_reg_poscache) {
2669 if (PL_reg_poscache_size < size) {
2670 Renew(PL_reg_poscache, size, char);
2671 PL_reg_poscache_size = size;
2673 Zero(PL_reg_poscache, size, char);
2676 PL_reg_poscache_size = size;
2677 Newz(29, PL_reg_poscache, size, char);
2680 PerlIO_printf(Perl_debug_log,
2681 "%sDetected a super-linear match, switching on caching%s...\n",
2682 PL_colors[4], PL_colors[5])
2685 if (PL_reg_leftiter < 0) {
2686 I32 o = locinput - PL_bostr, b;
2688 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2691 if (PL_reg_poscache[o] & (1<<b)) {
2693 PerlIO_printf(Perl_debug_log,
2694 "%*s already tried at this position...\n",
2695 REPORT_CODE_OFF+PL_regindent*2, "")
2699 PL_reg_poscache[o] |= (1<<b);
2703 /* Prefer next over scan for minimal matching. */
2706 PL_regcc = cc->oldcc;
2709 cp = regcppush(cc->parenfloor);
2711 if (regmatch(cc->next)) {
2713 sayYES; /* All done. */
2721 if (n >= cc->max) { /* Maximum greed exceeded? */
2722 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2723 && !(PL_reg_flags & RF_warned)) {
2724 PL_reg_flags |= RF_warned;
2725 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2726 "Complex regular subexpression recursion",
2733 PerlIO_printf(Perl_debug_log,
2734 "%*s trying longer...\n",
2735 REPORT_CODE_OFF+PL_regindent*2, "")
2737 /* Try scanning more and see if it helps. */
2738 PL_reginput = locinput;
2740 cc->lastloc = locinput;
2741 cp = regcppush(cc->parenfloor);
2743 if (regmatch(cc->scan)) {
2750 cc->lastloc = lastloc;
2754 /* Prefer scan over next for maximal matching. */
2756 if (n < cc->max) { /* More greed allowed? */
2757 cp = regcppush(cc->parenfloor);
2759 cc->lastloc = locinput;
2761 if (regmatch(cc->scan)) {
2766 regcppop(); /* Restore some previous $<digit>s? */
2767 PL_reginput = locinput;
2769 PerlIO_printf(Perl_debug_log,
2770 "%*s failed, try continuation...\n",
2771 REPORT_CODE_OFF+PL_regindent*2, "")
2774 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2775 && !(PL_reg_flags & RF_warned)) {
2776 PL_reg_flags |= RF_warned;
2777 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2778 "Complex regular subexpression recursion",
2782 /* Failed deeper matches of scan, so see if this one works. */
2783 PL_regcc = cc->oldcc;
2786 if (regmatch(cc->next))
2792 cc->lastloc = lastloc;
2797 next = scan + ARG(scan);
2800 inner = NEXTOPER(NEXTOPER(scan));
2803 inner = NEXTOPER(scan);
2808 if (OP(next) != c1) /* No choice. */
2809 next = inner; /* Avoid recursion. */
2811 int lastparen = *PL_reglastparen;
2815 PL_reginput = locinput;
2816 if (regmatch(inner))
2819 for (n = *PL_reglastparen; n > lastparen; n--)
2821 *PL_reglastparen = n;
2824 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2828 inner = NEXTOPER(scan);
2829 if (c1 == BRANCHJ) {
2830 inner = NEXTOPER(inner);
2832 } while (scan != NULL && OP(scan) == c1);
2846 /* We suppose that the next guy does not need
2847 backtracking: in particular, it is of constant length,
2848 and has no parenths to influence future backrefs. */
2849 ln = ARG1(scan); /* min to match */
2850 n = ARG2(scan); /* max to match */
2851 paren = scan->flags;
2853 if (paren > PL_regsize)
2855 if (paren > *PL_reglastparen)
2856 *PL_reglastparen = paren;
2858 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2860 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2861 PL_reginput = locinput;
2864 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2866 if (ln && l == 0 && n >= ln
2867 /* In fact, this is tricky. If paren, then the
2868 fact that we did/didnot match may influence
2869 future execution. */
2870 && !(paren && ln == 0))
2872 locinput = PL_reginput;
2873 if (PL_regkind[(U8)OP(next)] == EXACT) {
2874 c1 = (U8)*STRING(next);
2875 if (OP(next) == EXACTF)
2877 else if (OP(next) == EXACTFL)
2878 c2 = PL_fold_locale[c1];
2885 /* This may be improved if l == 0. */
2886 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2887 /* If it could work, try it. */
2889 UCHARAT(PL_reginput) == c1 ||
2890 UCHARAT(PL_reginput) == c2)
2894 PL_regstartp[paren] =
2895 HOPc(PL_reginput, -l) - PL_bostr;
2896 PL_regendp[paren] = PL_reginput - PL_bostr;
2899 PL_regendp[paren] = -1;
2905 /* Couldn't or didn't -- move forward. */
2906 PL_reginput = locinput;
2907 if (regrepeat_hard(scan, 1, &l)) {
2909 locinput = PL_reginput;
2916 n = regrepeat_hard(scan, n, &l);
2917 if (n != 0 && l == 0
2918 /* In fact, this is tricky. If paren, then the
2919 fact that we did/didnot match may influence
2920 future execution. */
2921 && !(paren && ln == 0))
2923 locinput = PL_reginput;
2925 PerlIO_printf(Perl_debug_log,
2926 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2927 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2931 if (PL_regkind[(U8)OP(next)] == EXACT) {
2932 c1 = (U8)*STRING(next);
2933 if (OP(next) == EXACTF)
2935 else if (OP(next) == EXACTFL)
2936 c2 = PL_fold_locale[c1];
2945 /* If it could work, try it. */
2947 UCHARAT(PL_reginput) == c1 ||
2948 UCHARAT(PL_reginput) == c2)
2951 PerlIO_printf(Perl_debug_log,
2952 "%*s trying tail with n=%"IVdf"...\n",
2953 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2957 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2958 PL_regendp[paren] = PL_reginput - PL_bostr;
2961 PL_regendp[paren] = -1;
2967 /* Couldn't or didn't -- back up. */
2969 locinput = HOPc(locinput, -l);
2970 PL_reginput = locinput;
2977 paren = scan->flags; /* Which paren to set */
2978 if (paren > PL_regsize)
2980 if (paren > *PL_reglastparen)
2981 *PL_reglastparen = paren;
2982 ln = ARG1(scan); /* min to match */
2983 n = ARG2(scan); /* max to match */
2984 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2988 ln = ARG1(scan); /* min to match */
2989 n = ARG2(scan); /* max to match */
2990 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2995 scan = NEXTOPER(scan);
3001 scan = NEXTOPER(scan);
3005 * Lookahead to avoid useless match attempts
3006 * when we know what character comes next.
3008 if (PL_regkind[(U8)OP(next)] == EXACT) {
3009 c1 = (U8)*STRING(next);
3010 if (OP(next) == EXACTF)
3012 else if (OP(next) == EXACTFL)
3013 c2 = PL_fold_locale[c1];
3019 PL_reginput = locinput;
3023 if (ln && regrepeat(scan, ln) < ln)
3025 locinput = PL_reginput;
3028 char *e = locinput + n - ln; /* Should not check after this */
3029 char *old = locinput;
3031 if (e >= PL_regeol || (n == REG_INFTY))
3034 /* Find place 'next' could work */
3036 while (locinput <= e && *locinput != c1)
3039 while (locinput <= e
3046 /* PL_reginput == old now */
3047 if (locinput != old) {
3048 ln = 1; /* Did some */
3049 if (regrepeat(scan, locinput - old) <
3053 /* PL_reginput == locinput now */
3054 TRYPAREN(paren, ln, locinput);
3055 PL_reginput = locinput; /* Could be reset... */
3057 /* Couldn't or didn't -- move forward. */
3062 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3063 /* If it could work, try it. */
3065 UCHARAT(PL_reginput) == c1 ||
3066 UCHARAT(PL_reginput) == c2)
3068 TRYPAREN(paren, n, PL_reginput);
3071 /* Couldn't or didn't -- move forward. */
3072 PL_reginput = locinput;
3073 if (regrepeat(scan, 1)) {
3075 locinput = PL_reginput;
3083 n = regrepeat(scan, n);
3084 locinput = PL_reginput;
3085 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3086 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3087 ln = n; /* why back off? */
3088 /* ...because $ and \Z can match before *and* after
3089 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3090 We should back off by one in this case. */
3091 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3097 /* If it could work, try it. */
3099 UCHARAT(PL_reginput) == c1 ||
3100 UCHARAT(PL_reginput) == c2)
3102 TRYPAREN(paren, n, PL_reginput);
3105 /* Couldn't or didn't -- back up. */
3107 PL_reginput = locinput = HOPc(locinput, -1);
3112 /* If it could work, try it. */
3114 UCHARAT(PL_reginput) == c1 ||
3115 UCHARAT(PL_reginput) == c2)
3117 TRYPAREN(paren, n, PL_reginput);
3120 /* Couldn't or didn't -- back up. */
3122 PL_reginput = locinput = HOPc(locinput, -1);
3129 if (PL_reg_call_cc) {
3130 re_cc_state *cur_call_cc = PL_reg_call_cc;
3131 CURCUR *cctmp = PL_regcc;
3132 regexp *re = PL_reg_re;
3133 CHECKPOINT cp, lastcp;
3135 cp = regcppush(0); /* Save *all* the positions. */
3137 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3139 PL_reginput = locinput; /* Make position available to
3141 cache_re(PL_reg_call_cc->re);
3142 PL_regcc = PL_reg_call_cc->cc;
3143 PL_reg_call_cc = PL_reg_call_cc->prev;
3144 if (regmatch(cur_call_cc->node)) {
3145 PL_reg_call_cc = cur_call_cc;
3151 PL_reg_call_cc = cur_call_cc;
3157 PerlIO_printf(Perl_debug_log,
3158 "%*s continuation failed...\n",
3159 REPORT_CODE_OFF+PL_regindent*2, "")
3163 if (locinput < PL_regtill) {
3164 DEBUG_r(PerlIO_printf(Perl_debug_log,
3165 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3167 (long)(locinput - PL_reg_starttry),
3168 (long)(PL_regtill - PL_reg_starttry),
3170 sayNO_FINAL; /* Cannot match: too short. */
3172 PL_reginput = locinput; /* put where regtry can find it */
3173 sayYES_FINAL; /* Success! */
3175 PL_reginput = locinput; /* put where regtry can find it */
3176 sayYES_LOUD; /* Success! */
3179 PL_reginput = locinput;
3184 if (UTF) { /* XXXX This is absolutely
3185 broken, we read before
3187 s = HOPMAYBEc(locinput, -scan->flags);
3193 if (locinput < PL_bostr + scan->flags)
3195 PL_reginput = locinput - scan->flags;
3200 PL_reginput = locinput;
3205 if (UTF) { /* XXXX This is absolutely
3206 broken, we read before
3208 s = HOPMAYBEc(locinput, -scan->flags);
3209 if (!s || s < PL_bostr)
3214 if (locinput < PL_bostr + scan->flags)
3216 PL_reginput = locinput - scan->flags;
3221 PL_reginput = locinput;
3224 inner = NEXTOPER(NEXTOPER(scan));
3225 if (regmatch(inner) != n) {
3240 if (OP(scan) == SUSPEND) {
3241 locinput = PL_reginput;
3242 nextchr = UCHARAT(locinput);
3247 next = scan + ARG(scan);
3252 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3253 PTR2UV(scan), OP(scan));
3254 Perl_croak(aTHX_ "regexp memory corruption");
3260 * We get here only if there's trouble -- normally "case END" is
3261 * the terminating point.
3263 Perl_croak(aTHX_ "corrupted regexp pointers");
3269 PerlIO_printf(Perl_debug_log,
3270 "%*s %scould match...%s\n",
3271 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3275 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3276 PL_colors[4],PL_colors[5]));
3285 PerlIO_printf(Perl_debug_log,
3286 "%*s %sfailed...%s\n",
3287 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3299 - regrepeat - repeatedly match something simple, report how many
3302 * [This routine now assumes that it will only match on things of length 1.
3303 * That was true before, but now we assume scan - reginput is the count,
3304 * rather than incrementing count on every character. [Er, except utf8.]]
3307 S_regrepeat(pTHX_ regnode *p, I32 max)
3310 register char *scan;
3312 register char *loceol = PL_regeol;
3313 register I32 hardcount = 0;
3316 if (max != REG_INFTY && max < loceol - scan)
3317 loceol = scan + max;
3320 while (scan < loceol && *scan != '\n')
3328 while (scan < loceol && *scan != '\n') {
3329 scan += UTF8SKIP(scan);
3335 while (scan < loceol) {
3336 scan += UTF8SKIP(scan);
3340 case EXACT: /* length of string is 1 */
3342 while (scan < loceol && UCHARAT(scan) == c)
3345 case EXACTF: /* length of string is 1 */
3347 while (scan < loceol &&
3348 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3351 case EXACTFL: /* length of string is 1 */
3352 PL_reg_flags |= RF_tainted;
3354 while (scan < loceol &&
3355 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3360 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3361 scan += UTF8SKIP(scan);
3366 while (scan < loceol && REGINCLASS(p, *scan))
3370 while (scan < loceol && isALNUM(*scan))
3375 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3376 scan += UTF8SKIP(scan);
3381 PL_reg_flags |= RF_tainted;
3382 while (scan < loceol && isALNUM_LC(*scan))
3386 PL_reg_flags |= RF_tainted;
3388 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3389 scan += UTF8SKIP(scan);
3395 while (scan < loceol && !isALNUM(*scan))
3400 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3401 scan += UTF8SKIP(scan);
3406 PL_reg_flags |= RF_tainted;
3407 while (scan < loceol && !isALNUM_LC(*scan))
3411 PL_reg_flags |= RF_tainted;
3413 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3414 scan += UTF8SKIP(scan);
3419 while (scan < loceol && isSPACE(*scan))
3424 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3425 scan += UTF8SKIP(scan);
3430 PL_reg_flags |= RF_tainted;
3431 while (scan < loceol && isSPACE_LC(*scan))
3435 PL_reg_flags |= RF_tainted;
3437 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3438 scan += UTF8SKIP(scan);
3443 while (scan < loceol && !isSPACE(*scan))
3448 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3449 scan += UTF8SKIP(scan);
3454 PL_reg_flags |= RF_tainted;
3455 while (scan < loceol && !isSPACE_LC(*scan))
3459 PL_reg_flags |= RF_tainted;
3461 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3462 scan += UTF8SKIP(scan);
3467 while (scan < loceol && isDIGIT(*scan))
3472 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3473 scan += UTF8SKIP(scan);
3479 while (scan < loceol && !isDIGIT(*scan))
3484 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3485 scan += UTF8SKIP(scan);
3489 default: /* Called on something of 0 width. */
3490 break; /* So match right here or not at all. */
3496 c = scan - PL_reginput;
3501 SV *prop = sv_newmortal();
3504 PerlIO_printf(Perl_debug_log,
3505 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3506 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3513 - regrepeat_hard - repeatedly match something, report total lenth and length
3515 * The repeater is supposed to have constant length.
3519 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3522 register char *scan;
3523 register char *start;
3524 register char *loceol = PL_regeol;
3526 I32 count = 0, res = 1;
3531 start = PL_reginput;
3533 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3536 while (start < PL_reginput) {
3538 start += UTF8SKIP(start);
3549 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3551 *lp = l = PL_reginput - start;
3552 if (max != REG_INFTY && l*max < loceol - scan)
3553 loceol = scan + l*max;
3566 - reginclass - determine if a character falls into a character class
3570 S_reginclass(pTHX_ register regnode *p, register I32 c)
3573 char flags = ANYOF_FLAGS(p);
3577 if (ANYOF_BITMAP_TEST(p, c))
3579 else if (flags & ANYOF_FOLD) {
3581 if (flags & ANYOF_LOCALE) {
3582 PL_reg_flags |= RF_tainted;
3583 cf = PL_fold_locale[c];
3587 if (ANYOF_BITMAP_TEST(p, cf))
3591 if (!match && (flags & ANYOF_CLASS)) {
3592 PL_reg_flags |= RF_tainted;
3594 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3595 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3596 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3597 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3598 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3599 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3600 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3601 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3602 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3603 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3604 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3605 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3606 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3607 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3608 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3609 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3610 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3611 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3612 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3613 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3614 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3615 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3616 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3617 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3618 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3619 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3620 ) /* How's that for a conditional? */
3626 return (flags & ANYOF_INVERT) ? !match : match;
3630 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3633 char flags = ARG1(f);
3635 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3637 if (swash_fetch(sv, p))
3639 else if (flags & ANYOF_FOLD) {
3640 U8 tmpbuf[UTF8_MAXLEN];
3641 if (flags & ANYOF_LOCALE) {
3642 PL_reg_flags |= RF_tainted;
3643 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3646 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3647 if (swash_fetch(sv, tmpbuf))
3651 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3653 return (flags & ANYOF_INVERT) ? !match : match;
3657 S_reghop(pTHX_ U8 *s, I32 off)
3661 while (off-- && s < (U8*)PL_regeol)
3666 if (s > (U8*)PL_bostr) {
3669 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3671 } /* XXX could check well-formedness here */
3679 S_reghopmaybe(pTHX_ U8* s, I32 off)
3683 while (off-- && s < (U8*)PL_regeol)
3690 if (s > (U8*)PL_bostr) {
3693 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3695 } /* XXX could check well-formedness here */
3711 restore_pos(pTHXo_ void *arg)
3714 if (PL_reg_eval_set) {
3715 if (PL_reg_oldsaved) {
3716 PL_reg_re->subbeg = PL_reg_oldsaved;
3717 PL_reg_re->sublen = PL_reg_oldsavedlen;
3718 RX_MATCH_COPIED_on(PL_reg_re);
3720 PL_reg_magic->mg_len = PL_reg_oldpos;
3721 PL_reg_eval_set = 0;
3722 PL_curpm = PL_reg_oldcurpm;