5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
224 #define TRYPAREN(paren, n, input) { \
227 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
228 PL_regendp[paren] = input - PL_bostr; \
231 PL_regendp[paren] = -1; \
233 if (regmatch(next)) \
236 PL_regendp[paren] = -1; \
241 * pregexec and friends
245 - pregexec - match a regexp against a string
248 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
249 char *strbeg, I32 minend, SV *screamer, U32 nosave)
250 /* strend: pointer to null at end of string */
251 /* strbeg: real beginning of string */
252 /* minend: end of match must be >=minend after stringarg. */
253 /* nosave: For optimizations. */
256 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
257 nosave ? 0 : REXEC_COPY_STR);
261 S_cache_re(pTHX_ regexp *prog)
264 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
266 PL_regprogram = prog->program;
268 PL_regnpar = prog->nparens;
269 PL_regdata = prog->data;
274 * Need to implement the following flags for reg_anch:
276 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
278 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
279 * INTUIT_AUTORITATIVE_ML
280 * INTUIT_ONCE_NOML - Intuit can match in one location only.
283 * Another flag for this function: SECOND_TIME (so that float substrs
284 * with giant delta may be not rechecked).
287 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
289 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
290 Otherwise, only SvCUR(sv) is used to get strbeg. */
292 /* XXXX We assume that strpos is strbeg unless sv. */
294 /* XXXX Some places assume that there is a fixed substring.
295 An update may be needed if optimizer marks as "INTUITable"
296 RExen without fixed substrings. Similarly, it is assumed that
297 lengths of all the strings are no more than minlen, thus they
298 cannot come from lookahead.
299 (Or minlen should take into account lookahead.) */
301 /* A failure to find a constant substring means that there is no need to make
302 an expensive call to REx engine, thus we celebrate a failure. Similarly,
303 finding a substring too deep into the string means that less calls to
304 regtry() should be needed.
306 REx compiler's optimizer found 4 possible hints:
307 a) Anchored substring;
309 c) Whether we are anchored (beginning-of-line or \G);
310 d) First node (of those at offset 0) which may distingush positions;
311 We use a)b)d) and multiline-part of c), and try to find a position in the
312 string which does not contradict any of them.
315 /* Most of decisions we do here should have been done at compile time.
316 The nodes of the REx which we used for the search should have been
317 deleted from the finite automaton. */
320 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
321 char *strend, U32 flags, re_scream_pos_data *data)
323 register I32 start_shift;
324 /* Should be nonnegative! */
325 register I32 end_shift;
331 register char *other_last = Nullch; /* other substr checked before this */
332 char *check_at; /* check substr found at this pos */
334 char *i_strpos = strpos;
337 DEBUG_r( if (!PL_colorset) reginitcolors() );
338 DEBUG_r(PerlIO_printf(Perl_debug_log,
339 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
340 PL_colors[4],PL_colors[5],PL_colors[0],
343 (strlen(prog->precomp) > 60 ? "..." : ""),
345 (int)(strend - strpos > 60 ? 60 : strend - strpos),
346 strpos, PL_colors[1],
347 (strend - strpos > 60 ? "..." : ""))
350 if (prog->minlen > strend - strpos) {
351 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
354 check = prog->check_substr;
355 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
356 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
357 || ( (prog->reganch & ROPT_ANCH_BOL)
358 && !PL_multiline ) ); /* Check after \n? */
360 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
361 /* Substring at constant offset from beg-of-str... */
364 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
365 /* SvCUR is not set on references: SvRV and SvPVX overlap */
367 && (strpos + SvCUR(sv) != strend)) {
368 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
371 PL_regeol = strend; /* Used in HOP() */
372 s = HOPc(strpos, prog->check_offset_min);
374 slen = SvCUR(check); /* >= 1 */
376 if ( strend - s > slen || strend - s < slen - 1
377 || (strend - s == slen && strend[-1] != '\n')) {
378 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
381 /* Now should match s[0..slen-2] */
383 if (slen && (*SvPVX(check) != *s
385 && memNE(SvPVX(check), s, slen)))) {
387 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
391 else if (*SvPVX(check) != *s
392 || ((slen = SvCUR(check)) > 1
393 && memNE(SvPVX(check), s, slen)))
395 goto success_at_start;
397 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
399 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
400 end_shift = prog->minlen - start_shift -
401 CHR_SVLEN(check) + (SvTAIL(check) != 0);
403 I32 end = prog->check_offset_max + CHR_SVLEN(check)
404 - (SvTAIL(check) != 0);
405 I32 eshift = strend - s - end;
407 if (end_shift < eshift)
411 else { /* Can match at random position */
414 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
415 /* Should be nonnegative! */
416 end_shift = prog->minlen - start_shift -
417 CHR_SVLEN(check) + (SvTAIL(check) != 0);
420 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
422 Perl_croak(aTHX_ "panic: end_shift");
426 /* Find a possible match in the region s..strend by looking for
427 the "check" substring in the region corrected by start/end_shift. */
428 if (flags & REXEC_SCREAM) {
429 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
430 I32 p = -1; /* Internal iterator of scream. */
431 I32 *pp = data ? data->scream_pos : &p;
433 if (PL_screamfirst[BmRARE(check)] >= 0
434 || ( BmRARE(check) == '\n'
435 && (BmPREVIOUS(check) == SvCUR(check) - 1)
437 s = screaminstr(sv, check,
438 start_shift + (s - strbeg), end_shift, pp, 0);
442 *data->scream_olds = s;
445 s = fbm_instr((unsigned char*)s + start_shift,
446 (unsigned char*)strend - end_shift,
447 check, PL_multiline ? FBMrf_MULTILINE : 0);
449 /* Update the count-of-usability, remove useless subpatterns,
452 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
453 (s ? "Found" : "Did not find"),
454 ((check == prog->anchored_substr) ? "anchored" : "floating"),
456 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
458 PL_colors[1], (SvTAIL(check) ? "$" : ""),
459 (s ? " at offset " : "...\n") ) );
466 /* Finish the diagnostic message */
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
469 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
470 Start with the other substr.
471 XXXX no SCREAM optimization yet - and a very coarse implementation
472 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
473 *always* match. Probably should be marked during compile...
474 Probably it is right to do no SCREAM here...
477 if (prog->float_substr && prog->anchored_substr) {
478 /* Take into account the "other" substring. */
479 /* XXXX May be hopelessly wrong for UTF... */
482 if (check == prog->float_substr) {
485 char *last = s - start_shift, *last1, *last2;
489 t = s - prog->check_offset_max;
490 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
491 && (!(prog->reganch & ROPT_UTF8)
492 || (PL_bostr = strpos, /* Used in regcopmaybe() */
493 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
498 t += prog->anchored_offset;
499 if (t < other_last) /* These positions already checked */
502 last2 = last1 = strend - prog->minlen;
505 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
506 /* On end-of-str: see comment below. */
507 s = fbm_instr((unsigned char*)t,
508 (unsigned char*)last1 + prog->anchored_offset
509 + SvCUR(prog->anchored_substr)
510 - (SvTAIL(prog->anchored_substr)!=0),
511 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
512 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
513 (s ? "Found" : "Contradicts"),
515 (int)(SvCUR(prog->anchored_substr)
516 - (SvTAIL(prog->anchored_substr)!=0)),
517 SvPVX(prog->anchored_substr),
518 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
520 if (last1 >= last2) {
521 DEBUG_r(PerlIO_printf(Perl_debug_log,
522 ", giving up...\n"));
525 DEBUG_r(PerlIO_printf(Perl_debug_log,
526 ", trying floating at offset %ld...\n",
527 (long)(s1 + 1 - i_strpos)));
528 PL_regeol = strend; /* Used in HOP() */
529 other_last = last1 + prog->anchored_offset + 1;
534 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
535 (long)(s - i_strpos)));
536 t = s - prog->anchored_offset;
545 else { /* Take into account the floating substring. */
550 last1 = last = strend - prog->minlen + prog->float_min_offset;
551 if (last - t > prog->float_max_offset)
552 last = t + prog->float_max_offset;
553 s = t + prog->float_min_offset;
556 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
557 /* fbm_instr() takes into account exact value of end-of-str
558 if the check is SvTAIL(ed). Since false positives are OK,
559 and end-of-str is not later than strend we are OK. */
560 s = fbm_instr((unsigned char*)s,
561 (unsigned char*)last + SvCUR(prog->float_substr)
562 - (SvTAIL(prog->float_substr)!=0),
563 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
564 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
565 (s ? "Found" : "Contradicts"),
567 (int)(SvCUR(prog->float_substr)
568 - (SvTAIL(prog->float_substr)!=0)),
569 SvPVX(prog->float_substr),
570 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
573 DEBUG_r(PerlIO_printf(Perl_debug_log,
574 ", giving up...\n"));
577 DEBUG_r(PerlIO_printf(Perl_debug_log,
578 ", trying anchored starting at offset %ld...\n",
579 (long)(s1 + 1 - i_strpos)));
580 other_last = last + 1;
581 PL_regeol = strend; /* Used in HOP() */
586 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
587 (long)(s - i_strpos)));
597 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
600 && (!(prog->reganch & ROPT_UTF8)
601 || (PL_bostr = strpos, /* Used in regcopmaybe() */
602 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
605 /* Fixed substring is found far enough so that the match
606 cannot start at strpos. */
608 if (ml_anch && t[-1] != '\n') {
609 /* Eventually fbm_*() should handle this, but often
610 anchored_offset is not 0, so this check will not be wasted. */
611 /* XXXX In the code below we prefer to look for "^" even in
612 presence of anchored substrings. And we search even
613 beyond the found float position. These pessimizations
614 are historical artefacts only. */
616 while (t < strend - prog->minlen) {
618 if (t < check_at - prog->check_offset_min) {
619 if (prog->anchored_substr) {
620 /* Since we moved from the found position,
621 we definitely contradict the found anchored
622 substr. Due to the above check we do not
623 contradict "check" substr.
624 Thus we can arrive here only if check substr
625 is float. Redo checking for "other"=="fixed".
628 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
629 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
630 goto do_other_anchored;
632 /* We don't contradict the found floating substring. */
633 /* XXXX Why not check for STCLASS? */
635 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
636 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
639 /* Position contradicts check-string */
640 /* XXXX probably better to look for check-string
641 than for "\n", so one should lower the limit for t? */
642 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
643 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
644 other_last = strpos = s = t + 1;
649 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
650 PL_colors[0],PL_colors[1]));
654 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
655 PL_colors[0],PL_colors[1]));
659 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
663 /* The found string does not prohibit matching at strpos,
664 - no optimization of calling REx engine can be performed,
665 unless it was an MBOL and we are not after MBOL,
666 or a future STCLASS check will fail this. */
668 /* Even in this situation we may use MBOL flag if strpos is offset
669 wrt the start of the string. */
670 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
671 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
672 /* May be due to an implicit anchor of m{.*foo} */
673 && !(prog->reganch & ROPT_IMPLICIT))
678 DEBUG_r( if (ml_anch)
679 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
680 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
683 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
684 && prog->check_substr /* Could be deleted already */
685 && --BmUSEFUL(prog->check_substr) < 0
686 && prog->check_substr == prog->float_substr)
688 /* If flags & SOMETHING - do not do it many times on the same match */
689 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
690 SvREFCNT_dec(prog->check_substr);
691 prog->check_substr = Nullsv; /* disable */
692 prog->float_substr = Nullsv; /* clear */
693 check = Nullsv; /* abort */
695 /* XXXX This is a remnant of the old implementation. It
696 looks wasteful, since now INTUIT can use many
698 prog->reganch &= ~RE_USE_INTUIT;
705 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
706 if (prog->regstclass) {
707 /* minlen == 0 is possible if regstclass is \b or \B,
708 and the fixed substr is ''$.
709 Since minlen is already taken into account, s+1 is before strend;
710 accidentally, minlen >= 1 guaranties no false positives at s + 1
711 even for \b or \B. But (minlen? 1 : 0) below assumes that
712 regstclass does not come from lookahead... */
713 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
714 This leaves EXACTF only, which is dealt with in find_byclass(). */
715 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
716 ? STR_LEN(prog->regstclass)
718 char *endpos = (prog->anchored_substr || ml_anch)
719 ? s + (prog->minlen? cl_l : 0)
720 : (prog->float_substr ? check_at - start_shift + cl_l
722 char *startpos = sv && SvPOK(sv) ? strend - SvCUR(sv) : s;
725 if (prog->reganch & ROPT_UTF8) {
726 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
729 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
734 if (endpos == strend) {
735 DEBUG_r( PerlIO_printf(Perl_debug_log,
736 "Could not match STCLASS...\n") );
739 DEBUG_r( PerlIO_printf(Perl_debug_log,
740 "This position contradicts STCLASS...\n") );
741 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
743 /* Contradict one of substrings */
744 if (prog->anchored_substr) {
745 if (prog->anchored_substr == check) {
746 DEBUG_r( what = "anchored" );
748 PL_regeol = strend; /* Used in HOP() */
750 if (s + start_shift + end_shift > strend) {
751 /* XXXX Should be taken into account earlier? */
752 DEBUG_r( PerlIO_printf(Perl_debug_log,
753 "Could not match STCLASS...\n") );
758 DEBUG_r( PerlIO_printf(Perl_debug_log,
759 "Looking for %s substr starting at offset %ld...\n",
760 what, (long)(s + start_shift - i_strpos)) );
763 /* Have both, check_string is floating */
764 if (t + start_shift >= check_at) /* Contradicts floating=check */
765 goto retry_floating_check;
766 /* Recheck anchored substring, but not floating... */
770 DEBUG_r( PerlIO_printf(Perl_debug_log,
771 "Looking for anchored substr starting at offset %ld...\n",
772 (long)(other_last - i_strpos)) );
773 goto do_other_anchored;
775 /* Another way we could have checked stclass at the
776 current position only: */
781 DEBUG_r( PerlIO_printf(Perl_debug_log,
782 "Looking for /%s^%s/m starting at offset %ld...\n",
783 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
786 if (!prog->float_substr) /* Could have been deleted */
788 /* Check is floating subtring. */
789 retry_floating_check:
790 t = check_at - start_shift;
791 DEBUG_r( what = "floating" );
792 goto hop_and_restart;
795 PerlIO_printf(Perl_debug_log,
796 "By STCLASS: moving %ld --> %ld\n",
797 (long)(t - i_strpos), (long)(s - i_strpos));
799 PerlIO_printf(Perl_debug_log,
800 "Does not contradict STCLASS...\n") );
803 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
804 PL_colors[4], (check ? "Guessed" : "Giving up"),
805 PL_colors[5], (long)(s - i_strpos)) );
808 fail_finish: /* Substring not found */
809 if (prog->check_substr) /* could be removed already */
810 BmUSEFUL(prog->check_substr) += 5; /* hooray */
812 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
813 PL_colors[4],PL_colors[5]));
817 /* We know what class REx starts with. Try to find this position... */
819 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
821 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
827 register I32 tmp = 1; /* Scratch variable? */
829 /* We know what class it must start with. */
833 if (REGINCLASSUTF8(c, (U8*)s)) {
834 if (tmp && (norun || regtry(prog, s)))
846 if (REGINCLASS(c, *(U8*)s)) {
847 if (tmp && (norun || regtry(prog, s)))
867 c2 = PL_fold_locale[c1];
872 e = s; /* Due to minlen logic of intuit() */
873 /* Here it is NOT UTF! */
877 && (ln == 1 || !(OP(c) == EXACTF
879 : ibcmp_locale(s, m, ln)))
880 && (norun || regtry(prog, s)) )
886 if ( (*(U8*)s == c1 || *(U8*)s == c2)
887 && (ln == 1 || !(OP(c) == EXACTF
889 : ibcmp_locale(s, m, ln)))
890 && (norun || regtry(prog, s)) )
897 PL_reg_flags |= RF_tainted;
900 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
901 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
903 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
905 if ((norun || regtry(prog, s)))
910 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
914 PL_reg_flags |= RF_tainted;
917 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
918 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
920 if (tmp == !(OP(c) == BOUNDUTF8 ?
921 swash_fetch(PL_utf8_alnum, (U8*)s) :
922 isALNUM_LC_utf8((U8*)s)))
925 if ((norun || regtry(prog, s)))
930 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
934 PL_reg_flags |= RF_tainted;
937 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
938 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
940 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
942 else if ((norun || regtry(prog, s)))
946 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
950 PL_reg_flags |= RF_tainted;
953 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
954 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
956 if (tmp == !(OP(c) == NBOUNDUTF8 ?
957 swash_fetch(PL_utf8_alnum, (U8*)s) :
958 isALNUM_LC_utf8((U8*)s)))
960 else if ((norun || regtry(prog, s)))
964 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
970 if (tmp && (norun || regtry(prog, s)))
982 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
983 if (tmp && (norun || regtry(prog, s)))
994 PL_reg_flags |= RF_tainted;
996 if (isALNUM_LC(*s)) {
997 if (tmp && (norun || regtry(prog, s)))
1008 PL_reg_flags |= RF_tainted;
1009 while (s < strend) {
1010 if (isALNUM_LC_utf8((U8*)s)) {
1011 if (tmp && (norun || regtry(prog, s)))
1022 while (s < strend) {
1024 if (tmp && (norun || regtry(prog, s)))
1035 while (s < strend) {
1036 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1037 if (tmp && (norun || regtry(prog, s)))
1048 PL_reg_flags |= RF_tainted;
1049 while (s < strend) {
1050 if (!isALNUM_LC(*s)) {
1051 if (tmp && (norun || regtry(prog, s)))
1062 PL_reg_flags |= RF_tainted;
1063 while (s < strend) {
1064 if (!isALNUM_LC_utf8((U8*)s)) {
1065 if (tmp && (norun || regtry(prog, s)))
1076 while (s < strend) {
1078 if (tmp && (norun || regtry(prog, s)))
1089 while (s < strend) {
1090 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1091 if (tmp && (norun || regtry(prog, s)))
1102 PL_reg_flags |= RF_tainted;
1103 while (s < strend) {
1104 if (isSPACE_LC(*s)) {
1105 if (tmp && (norun || regtry(prog, s)))
1116 PL_reg_flags |= RF_tainted;
1117 while (s < strend) {
1118 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1119 if (tmp && (norun || regtry(prog, s)))
1130 while (s < strend) {
1132 if (tmp && (norun || regtry(prog, s)))
1143 while (s < strend) {
1144 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1145 if (tmp && (norun || regtry(prog, s)))
1156 PL_reg_flags |= RF_tainted;
1157 while (s < strend) {
1158 if (!isSPACE_LC(*s)) {
1159 if (tmp && (norun || regtry(prog, s)))
1170 PL_reg_flags |= RF_tainted;
1171 while (s < strend) {
1172 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1173 if (tmp && (norun || regtry(prog, s)))
1184 while (s < strend) {
1186 if (tmp && (norun || regtry(prog, s)))
1197 while (s < strend) {
1198 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1199 if (tmp && (norun || regtry(prog, s)))
1210 PL_reg_flags |= RF_tainted;
1211 while (s < strend) {
1212 if (isDIGIT_LC(*s)) {
1213 if (tmp && (norun || regtry(prog, s)))
1224 PL_reg_flags |= RF_tainted;
1225 while (s < strend) {
1226 if (isDIGIT_LC_utf8((U8*)s)) {
1227 if (tmp && (norun || regtry(prog, s)))
1238 while (s < strend) {
1240 if (tmp && (norun || regtry(prog, s)))
1251 while (s < strend) {
1252 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1253 if (tmp && (norun || regtry(prog, s)))
1264 PL_reg_flags |= RF_tainted;
1265 while (s < strend) {
1266 if (!isDIGIT_LC(*s)) {
1267 if (tmp && (norun || regtry(prog, s)))
1278 PL_reg_flags |= RF_tainted;
1279 while (s < strend) {
1280 if (!isDIGIT_LC_utf8((U8*)s)) {
1281 if (tmp && (norun || regtry(prog, s)))
1292 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1301 - regexec_flags - match a regexp against a string
1304 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1305 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1306 /* strend: pointer to null at end of string */
1307 /* strbeg: real beginning of string */
1308 /* minend: end of match must be >=minend after stringarg. */
1309 /* data: May be used for some additional optimizations. */
1310 /* nosave: For optimizations. */
1314 register regnode *c;
1315 register char *startpos = stringarg;
1316 I32 minlen; /* must match at least this many chars */
1317 I32 dontbother = 0; /* how many characters not to try at end */
1318 /* I32 start_shift = 0; */ /* Offset of the start to find
1319 constant substr. */ /* CC */
1320 I32 end_shift = 0; /* Same for the end. */ /* CC */
1321 I32 scream_pos = -1; /* Internal iterator of scream. */
1323 SV* oreplsv = GvSV(PL_replgv);
1329 PL_regnarrate = PL_debug & 512;
1332 /* Be paranoid... */
1333 if (prog == NULL || startpos == NULL) {
1334 Perl_croak(aTHX_ "NULL regexp parameter");
1338 minlen = prog->minlen;
1339 if (strend - startpos < minlen) goto phooey;
1341 if (startpos == strbeg) /* is ^ valid at stringarg? */
1344 PL_regprev = (U32)stringarg[-1];
1345 if (!PL_multiline && PL_regprev == '\n')
1346 PL_regprev = '\0'; /* force ^ to NOT match */
1349 /* Check validity of program. */
1350 if (UCHARAT(prog->program) != REG_MAGIC) {
1351 Perl_croak(aTHX_ "corrupted regexp program");
1355 PL_reg_eval_set = 0;
1358 if (prog->reganch & ROPT_UTF8)
1359 PL_reg_flags |= RF_utf8;
1361 /* Mark beginning of line for ^ and lookbehind. */
1362 PL_regbol = startpos;
1366 /* Mark end of line for $ (and such) */
1369 /* see how far we have to get to not match where we matched before */
1370 PL_regtill = startpos+minend;
1372 /* We start without call_cc context. */
1375 /* If there is a "must appear" string, look for it. */
1378 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1381 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1382 PL_reg_ganch = startpos;
1383 else if (sv && SvTYPE(sv) >= SVt_PVMG
1385 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1386 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1387 if (prog->reganch & ROPT_ANCH_GPOS) {
1388 if (s > PL_reg_ganch)
1393 else /* pos() not defined */
1394 PL_reg_ganch = strbeg;
1397 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1398 re_scream_pos_data d;
1400 d.scream_olds = &scream_olds;
1401 d.scream_pos = &scream_pos;
1402 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1404 goto phooey; /* not present */
1407 DEBUG_r( if (!PL_colorset) reginitcolors() );
1408 DEBUG_r(PerlIO_printf(Perl_debug_log,
1409 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1410 PL_colors[4],PL_colors[5],PL_colors[0],
1413 (strlen(prog->precomp) > 60 ? "..." : ""),
1415 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1416 startpos, PL_colors[1],
1417 (strend - startpos > 60 ? "..." : ""))
1420 /* Simplest case: anchored match need be tried only once. */
1421 /* [unless only anchor is BOL and multiline is set] */
1422 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1423 if (s == startpos && regtry(prog, startpos))
1425 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1426 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1431 dontbother = minlen - 1;
1432 end = HOPc(strend, -dontbother) - 1;
1433 /* for multiline we only have to try after newlines */
1434 if (prog->check_substr) {
1438 if (regtry(prog, s))
1443 if (prog->reganch & RE_USE_INTUIT) {
1444 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1455 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1456 if (regtry(prog, s))
1463 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1464 if (regtry(prog, PL_reg_ganch))
1469 /* Messy cases: unanchored match. */
1470 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1471 /* we have /x+whatever/ */
1472 /* it must be a one character string (XXXX Except UTF?) */
1473 char ch = SvPVX(prog->anchored_substr)[0];
1479 while (s < strend) {
1481 DEBUG_r( did_match = 1 );
1482 if (regtry(prog, s)) goto got_it;
1484 while (s < strend && *s == ch)
1491 while (s < strend) {
1493 DEBUG_r( did_match = 1 );
1494 if (regtry(prog, s)) goto got_it;
1496 while (s < strend && *s == ch)
1502 DEBUG_r(did_match ||
1503 PerlIO_printf(Perl_debug_log,
1504 "Did not find anchored character...\n"));
1507 else if (prog->anchored_substr != Nullsv
1508 || (prog->float_substr != Nullsv
1509 && prog->float_max_offset < strend - s)) {
1510 SV *must = prog->anchored_substr
1511 ? prog->anchored_substr : prog->float_substr;
1513 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1515 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1516 char *last = HOPc(strend, /* Cannot start after this */
1517 -(I32)(CHR_SVLEN(must)
1518 - (SvTAIL(must) != 0) + back_min));
1519 char *last1; /* Last position checked before */
1525 last1 = HOPc(s, -1);
1527 last1 = s - 1; /* bogus */
1529 /* XXXX check_substr already used to find `s', can optimize if
1530 check_substr==must. */
1532 dontbother = end_shift;
1533 strend = HOPc(strend, -dontbother);
1534 while ( (s <= last) &&
1535 ((flags & REXEC_SCREAM)
1536 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1537 end_shift, &scream_pos, 0))
1538 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1539 (unsigned char*)strend, must,
1540 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1541 DEBUG_r( did_match = 1 );
1542 if (HOPc(s, -back_max) > last1) {
1543 last1 = HOPc(s, -back_min);
1544 s = HOPc(s, -back_max);
1547 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1549 last1 = HOPc(s, -back_min);
1553 while (s <= last1) {
1554 if (regtry(prog, s))
1560 while (s <= last1) {
1561 if (regtry(prog, s))
1567 DEBUG_r(did_match ||
1568 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1569 ((must == prog->anchored_substr)
1570 ? "anchored" : "floating"),
1572 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1574 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1577 else if ((c = prog->regstclass)) {
1578 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1579 /* don't bother with what can't match */
1580 strend = HOPc(strend, -(minlen - 1));
1581 if (find_byclass(prog, c, s, strend, startpos, 0))
1583 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1587 if (prog->float_substr != Nullsv) { /* Trim the end. */
1590 if (flags & REXEC_SCREAM) {
1591 last = screaminstr(sv, prog->float_substr, s - strbeg,
1592 end_shift, &scream_pos, 1); /* last one */
1594 last = scream_olds; /* Only one occurence. */
1598 char *little = SvPV(prog->float_substr, len);
1600 if (SvTAIL(prog->float_substr)) {
1601 if (memEQ(strend - len + 1, little, len - 1))
1602 last = strend - len + 1;
1603 else if (!PL_multiline)
1604 last = memEQ(strend - len, little, len)
1605 ? strend - len : Nullch;
1611 last = rninstr(s, strend, little, little + len);
1613 last = strend; /* matching `$' */
1617 DEBUG_r(PerlIO_printf(Perl_debug_log,
1618 "%sCan't trim the tail, match fails (should not happen)%s\n",
1619 PL_colors[4],PL_colors[5]));
1620 goto phooey; /* Should not happen! */
1622 dontbother = strend - last + prog->float_min_offset;
1624 if (minlen && (dontbother < minlen))
1625 dontbother = minlen - 1;
1626 strend -= dontbother; /* this one's always in bytes! */
1627 /* We don't know much -- general case. */
1630 if (regtry(prog, s))
1639 if (regtry(prog, s))
1641 } while (s++ < strend);
1649 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1651 if (PL_reg_eval_set) {
1652 /* Preserve the current value of $^R */
1653 if (oreplsv != GvSV(PL_replgv))
1654 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1655 restored, the value remains
1657 restore_pos(aTHXo_ 0);
1660 /* make sure $`, $&, $', and $digit will work later */
1661 if ( !(flags & REXEC_NOT_FIRST) ) {
1662 if (RX_MATCH_COPIED(prog)) {
1663 Safefree(prog->subbeg);
1664 RX_MATCH_COPIED_off(prog);
1666 if (flags & REXEC_COPY_STR) {
1667 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1669 s = savepvn(strbeg, i);
1672 RX_MATCH_COPIED_on(prog);
1675 prog->subbeg = strbeg;
1676 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1683 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1684 PL_colors[4],PL_colors[5]));
1685 if (PL_reg_eval_set)
1686 restore_pos(aTHXo_ 0);
1691 - regtry - try match at specific point
1693 STATIC I32 /* 0 failure, 1 success */
1694 S_regtry(pTHX_ regexp *prog, char *startpos)
1702 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1705 PL_reg_eval_set = RS_init;
1707 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1708 (IV)(PL_stack_sp - PL_stack_base));
1710 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1711 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1712 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1714 /* Apparently this is not needed, judging by wantarray. */
1715 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1716 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1719 /* Make $_ available to executed code. */
1720 if (PL_reg_sv != DEFSV) {
1721 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1726 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1727 && (mg = mg_find(PL_reg_sv, 'g')))) {
1728 /* prepare for quick setting of pos */
1729 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1730 mg = mg_find(PL_reg_sv, 'g');
1734 PL_reg_oldpos = mg->mg_len;
1735 SAVEDESTRUCTOR_X(restore_pos, 0);
1738 Newz(22,PL_reg_curpm, 1, PMOP);
1739 PL_reg_curpm->op_pmregexp = prog;
1740 PL_reg_oldcurpm = PL_curpm;
1741 PL_curpm = PL_reg_curpm;
1742 if (RX_MATCH_COPIED(prog)) {
1743 /* Here is a serious problem: we cannot rewrite subbeg,
1744 since it may be needed if this match fails. Thus
1745 $` inside (?{}) could fail... */
1746 PL_reg_oldsaved = prog->subbeg;
1747 PL_reg_oldsavedlen = prog->sublen;
1748 RX_MATCH_COPIED_off(prog);
1751 PL_reg_oldsaved = Nullch;
1752 prog->subbeg = PL_bostr;
1753 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1755 prog->startp[0] = startpos - PL_bostr;
1756 PL_reginput = startpos;
1757 PL_regstartp = prog->startp;
1758 PL_regendp = prog->endp;
1759 PL_reglastparen = &prog->lastparen;
1760 prog->lastparen = 0;
1762 DEBUG_r(PL_reg_starttry = startpos);
1763 if (PL_reg_start_tmpl <= prog->nparens) {
1764 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1765 if(PL_reg_start_tmp)
1766 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1768 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1771 /* XXXX What this code is doing here?!!! There should be no need
1772 to do this again and again, PL_reglastparen should take care of
1776 if (prog->nparens) {
1777 for (i = prog->nparens; i >= 1; i--) {
1783 if (regmatch(prog->program + 1)) {
1784 prog->endp[0] = PL_reginput - PL_bostr;
1792 - regmatch - main matching routine
1794 * Conceptually the strategy is simple: check to see whether the current
1795 * node matches, call self recursively to see whether the rest matches,
1796 * and then act accordingly. In practice we make some effort to avoid
1797 * recursion, in particular by going through "ordinary" nodes (that don't
1798 * need to know whether the rest of the match failed) by a loop instead of
1801 /* [lwall] I've hoisted the register declarations to the outer block in order to
1802 * maybe save a little bit of pushing and popping on the stack. It also takes
1803 * advantage of machines that use a register save mask on subroutine entry.
1805 STATIC I32 /* 0 failure, 1 success */
1806 S_regmatch(pTHX_ regnode *prog)
1809 register regnode *scan; /* Current node. */
1810 regnode *next; /* Next node. */
1811 regnode *inner; /* Next node in internal branch. */
1812 register I32 nextchr; /* renamed nextchr - nextchar colides with
1813 function of same name */
1814 register I32 n; /* no or next */
1815 register I32 ln; /* len or last */
1816 register char *s; /* operand or save */
1817 register char *locinput = PL_reginput;
1818 register I32 c1, c2, paren; /* case fold search, parenth */
1819 int minmod = 0, sw = 0, logical = 0;
1824 /* Note that nextchr is a byte even in UTF */
1825 nextchr = UCHARAT(locinput);
1827 while (scan != NULL) {
1828 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1830 # define sayYES goto yes
1831 # define sayNO goto no
1832 # define sayYES_FINAL goto yes_final
1833 # define sayYES_LOUD goto yes_loud
1834 # define sayNO_FINAL goto no_final
1835 # define sayNO_SILENT goto do_no
1836 # define saySAME(x) if (x) goto yes; else goto no
1837 # define REPORT_CODE_OFF 24
1839 # define sayYES return 1
1840 # define sayNO return 0
1841 # define sayYES_FINAL return 1
1842 # define sayYES_LOUD return 1
1843 # define sayNO_FINAL return 0
1844 # define sayNO_SILENT return 0
1845 # define saySAME(x) return x
1848 SV *prop = sv_newmortal();
1849 int docolor = *PL_colors[0];
1850 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1851 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1852 /* The part of the string before starttry has one color
1853 (pref0_len chars), between starttry and current
1854 position another one (pref_len - pref0_len chars),
1855 after the current position the third one.
1856 We assume that pref0_len <= pref_len, otherwise we
1857 decrease pref0_len. */
1858 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1859 ? (5 + taill) - l : locinput - PL_bostr);
1860 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1862 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1863 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1864 ? (5 + taill) - pref_len : PL_regeol - locinput);
1867 if (pref0_len > pref_len)
1868 pref0_len = pref_len;
1869 regprop(prop, scan);
1870 PerlIO_printf(Perl_debug_log,
1871 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1872 (IV)(locinput - PL_bostr),
1873 PL_colors[4], pref0_len,
1874 locinput - pref_len, PL_colors[5],
1875 PL_colors[2], pref_len - pref0_len,
1876 locinput - pref_len + pref0_len, PL_colors[3],
1877 (docolor ? "" : "> <"),
1878 PL_colors[0], l, locinput, PL_colors[1],
1879 15 - l - pref_len + 1,
1881 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1885 next = scan + NEXT_OFF(scan);
1891 if (locinput == PL_bostr
1892 ? PL_regprev == '\n'
1894 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1896 /* regtill = regbol; */
1901 if (locinput == PL_bostr
1902 ? PL_regprev == '\n'
1903 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1909 if (locinput == PL_bostr)
1913 if (locinput == PL_reg_ganch)
1923 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1928 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1930 if (PL_regeol - locinput > 1)
1934 if (PL_regeol != locinput)
1938 if (nextchr & 0x80) {
1939 locinput += PL_utf8skip[nextchr];
1940 if (locinput > PL_regeol)
1942 nextchr = UCHARAT(locinput);
1945 if (!nextchr && locinput >= PL_regeol)
1947 nextchr = UCHARAT(++locinput);
1950 if (!nextchr && locinput >= PL_regeol)
1952 nextchr = UCHARAT(++locinput);
1955 if (nextchr & 0x80) {
1956 locinput += PL_utf8skip[nextchr];
1957 if (locinput > PL_regeol)
1959 nextchr = UCHARAT(locinput);
1962 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1964 nextchr = UCHARAT(++locinput);
1967 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1969 nextchr = UCHARAT(++locinput);
1974 /* Inline the first character, for speed. */
1975 if (UCHARAT(s) != nextchr)
1977 if (PL_regeol - locinput < ln)
1979 if (ln > 1 && memNE(s, locinput, ln))
1982 nextchr = UCHARAT(locinput);
1985 PL_reg_flags |= RF_tainted;
1994 c1 = OP(scan) == EXACTF;
1998 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1999 toLOWER_utf8((U8*)l) :
2000 toLOWER_LC_utf8((U8*)l)))
2008 nextchr = UCHARAT(locinput);
2012 /* Inline the first character, for speed. */
2013 if (UCHARAT(s) != nextchr &&
2014 UCHARAT(s) != ((OP(scan) == EXACTF)
2015 ? PL_fold : PL_fold_locale)[nextchr])
2017 if (PL_regeol - locinput < ln)
2019 if (ln > 1 && (OP(scan) == EXACTF
2020 ? ibcmp(s, locinput, ln)
2021 : ibcmp_locale(s, locinput, ln)))
2024 nextchr = UCHARAT(locinput);
2027 if (!REGINCLASSUTF8(scan, (U8*)locinput))
2029 if (locinput >= PL_regeol)
2031 locinput += PL_utf8skip[nextchr];
2032 nextchr = UCHARAT(locinput);
2036 nextchr = UCHARAT(locinput);
2037 if (!REGINCLASS(scan, nextchr))
2039 if (!nextchr && locinput >= PL_regeol)
2041 nextchr = UCHARAT(++locinput);
2044 PL_reg_flags |= RF_tainted;
2049 if (!(OP(scan) == ALNUM
2050 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2052 nextchr = UCHARAT(++locinput);
2055 PL_reg_flags |= RF_tainted;
2060 if (nextchr & 0x80) {
2061 if (!(OP(scan) == ALNUMUTF8
2062 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2063 : isALNUM_LC_utf8((U8*)locinput)))
2067 locinput += PL_utf8skip[nextchr];
2068 nextchr = UCHARAT(locinput);
2071 if (!(OP(scan) == ALNUMUTF8
2072 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2074 nextchr = UCHARAT(++locinput);
2077 PL_reg_flags |= RF_tainted;
2080 if (!nextchr && locinput >= PL_regeol)
2082 if (OP(scan) == NALNUM
2083 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2085 nextchr = UCHARAT(++locinput);
2088 PL_reg_flags |= RF_tainted;
2091 if (!nextchr && locinput >= PL_regeol)
2093 if (nextchr & 0x80) {
2094 if (OP(scan) == NALNUMUTF8
2095 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2096 : isALNUM_LC_utf8((U8*)locinput))
2100 locinput += PL_utf8skip[nextchr];
2101 nextchr = UCHARAT(locinput);
2104 if (OP(scan) == NALNUMUTF8
2105 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2107 nextchr = UCHARAT(++locinput);
2111 PL_reg_flags |= RF_tainted;
2115 /* was last char in word? */
2116 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2117 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2119 n = isALNUM(nextchr);
2122 ln = isALNUM_LC(ln);
2123 n = isALNUM_LC(nextchr);
2125 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2130 PL_reg_flags |= RF_tainted;
2134 /* was last char in word? */
2135 ln = (locinput != PL_regbol)
2136 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2137 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2138 ln = isALNUM_uni(ln);
2139 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2142 ln = isALNUM_LC_uni(ln);
2143 n = isALNUM_LC_utf8((U8*)locinput);
2145 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2149 PL_reg_flags |= RF_tainted;
2154 if (!(OP(scan) == SPACE
2155 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2157 nextchr = UCHARAT(++locinput);
2160 PL_reg_flags |= RF_tainted;
2165 if (nextchr & 0x80) {
2166 if (!(OP(scan) == SPACEUTF8
2167 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2168 : isSPACE_LC_utf8((U8*)locinput)))
2172 locinput += PL_utf8skip[nextchr];
2173 nextchr = UCHARAT(locinput);
2176 if (!(OP(scan) == SPACEUTF8
2177 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2179 nextchr = UCHARAT(++locinput);
2182 PL_reg_flags |= RF_tainted;
2185 if (!nextchr && locinput >= PL_regeol)
2187 if (OP(scan) == NSPACE
2188 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2190 nextchr = UCHARAT(++locinput);
2193 PL_reg_flags |= RF_tainted;
2196 if (!nextchr && locinput >= PL_regeol)
2198 if (nextchr & 0x80) {
2199 if (OP(scan) == NSPACEUTF8
2200 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2201 : isSPACE_LC_utf8((U8*)locinput))
2205 locinput += PL_utf8skip[nextchr];
2206 nextchr = UCHARAT(locinput);
2209 if (OP(scan) == NSPACEUTF8
2210 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2212 nextchr = UCHARAT(++locinput);
2215 PL_reg_flags |= RF_tainted;
2220 if (!(OP(scan) == DIGIT
2221 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2223 nextchr = UCHARAT(++locinput);
2226 PL_reg_flags |= RF_tainted;
2231 if (nextchr & 0x80) {
2232 if (!(OP(scan) == DIGITUTF8
2233 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2234 : isDIGIT_LC_utf8((U8*)locinput)))
2238 locinput += PL_utf8skip[nextchr];
2239 nextchr = UCHARAT(locinput);
2242 if (!(OP(scan) == DIGITUTF8
2243 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2245 nextchr = UCHARAT(++locinput);
2248 PL_reg_flags |= RF_tainted;
2251 if (!nextchr && locinput >= PL_regeol)
2253 if (OP(scan) == NDIGIT
2254 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2256 nextchr = UCHARAT(++locinput);
2259 PL_reg_flags |= RF_tainted;
2262 if (!nextchr && locinput >= PL_regeol)
2264 if (nextchr & 0x80) {
2265 if (OP(scan) == NDIGITUTF8
2266 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2267 : isDIGIT_LC_utf8((U8*)locinput))
2271 locinput += PL_utf8skip[nextchr];
2272 nextchr = UCHARAT(locinput);
2275 if (OP(scan) == NDIGITUTF8
2276 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2278 nextchr = UCHARAT(++locinput);
2281 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2283 locinput += PL_utf8skip[nextchr];
2284 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2285 locinput += UTF8SKIP(locinput);
2286 if (locinput > PL_regeol)
2288 nextchr = UCHARAT(locinput);
2291 PL_reg_flags |= RF_tainted;
2295 n = ARG(scan); /* which paren pair */
2296 ln = PL_regstartp[n];
2297 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2298 if (*PL_reglastparen < n || ln == -1)
2299 sayNO; /* Do not match unless seen CLOSEn. */
2300 if (ln == PL_regendp[n])
2304 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2306 char *e = PL_bostr + PL_regendp[n];
2308 * Note that we can't do the "other character" lookup trick as
2309 * in the 8-bit case (no pun intended) because in Unicode we
2310 * have to map both upper and title case to lower case.
2312 if (OP(scan) == REFF) {
2316 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2326 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2333 nextchr = UCHARAT(locinput);
2337 /* Inline the first character, for speed. */
2338 if (UCHARAT(s) != nextchr &&
2340 (UCHARAT(s) != ((OP(scan) == REFF
2341 ? PL_fold : PL_fold_locale)[nextchr]))))
2343 ln = PL_regendp[n] - ln;
2344 if (locinput + ln > PL_regeol)
2346 if (ln > 1 && (OP(scan) == REF
2347 ? memNE(s, locinput, ln)
2349 ? ibcmp(s, locinput, ln)
2350 : ibcmp_locale(s, locinput, ln))))
2353 nextchr = UCHARAT(locinput);
2364 OP_4tree *oop = PL_op;
2365 COP *ocurcop = PL_curcop;
2366 SV **ocurpad = PL_curpad;
2370 PL_op = (OP_4tree*)PL_regdata->data[n];
2371 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2372 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2373 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2375 CALLRUNOPS(aTHX); /* Scalar context. */
2381 PL_curpad = ocurpad;
2382 PL_curcop = ocurcop;
2384 if (logical == 2) { /* Postponed subexpression. */
2386 MAGIC *mg = Null(MAGIC*);
2388 CHECKPOINT cp, lastcp;
2390 if(SvROK(ret) || SvRMAGICAL(ret)) {
2391 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2394 mg = mg_find(sv, 'r');
2397 re = (regexp *)mg->mg_obj;
2398 (void)ReREFCNT_inc(re);
2402 char *t = SvPV(ret, len);
2404 char *oprecomp = PL_regprecomp;
2405 I32 osize = PL_regsize;
2406 I32 onpar = PL_regnpar;
2409 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2410 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2412 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2413 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2414 PL_regprecomp = oprecomp;
2419 PerlIO_printf(Perl_debug_log,
2420 "Entering embedded `%s%.60s%s%s'\n",
2424 (strlen(re->precomp) > 60 ? "..." : ""))
2427 state.prev = PL_reg_call_cc;
2428 state.cc = PL_regcc;
2429 state.re = PL_reg_re;
2433 cp = regcppush(0); /* Save *all* the positions. */
2436 state.ss = PL_savestack_ix;
2437 *PL_reglastparen = 0;
2438 PL_reg_call_cc = &state;
2439 PL_reginput = locinput;
2441 /* XXXX This is too dramatic a measure... */
2444 if (regmatch(re->program + 1)) {
2445 /* Even though we succeeded, we need to restore
2446 global variables, since we may be wrapped inside
2447 SUSPEND, thus the match may be not finished yet. */
2449 /* XXXX Do this only if SUSPENDed? */
2450 PL_reg_call_cc = state.prev;
2451 PL_regcc = state.cc;
2452 PL_reg_re = state.re;
2453 cache_re(PL_reg_re);
2455 /* XXXX This is too dramatic a measure... */
2458 /* These are needed even if not SUSPEND. */
2466 PL_reg_call_cc = state.prev;
2467 PL_regcc = state.cc;
2468 PL_reg_re = state.re;
2469 cache_re(PL_reg_re);
2471 /* XXXX This is too dramatic a measure... */
2480 sv_setsv(save_scalar(PL_replgv), ret);
2484 n = ARG(scan); /* which paren pair */
2485 PL_reg_start_tmp[n] = locinput;
2490 n = ARG(scan); /* which paren pair */
2491 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2492 PL_regendp[n] = locinput - PL_bostr;
2493 if (n > *PL_reglastparen)
2494 *PL_reglastparen = n;
2497 n = ARG(scan); /* which paren pair */
2498 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2501 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2503 next = NEXTOPER(NEXTOPER(scan));
2505 next = scan + ARG(scan);
2506 if (OP(next) == IFTHEN) /* Fake one. */
2507 next = NEXTOPER(NEXTOPER(next));
2511 logical = scan->flags;
2513 /*******************************************************************
2514 PL_regcc contains infoblock about the innermost (...)* loop, and
2515 a pointer to the next outer infoblock.
2517 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2519 1) After matching X, regnode for CURLYX is processed;
2521 2) This regnode creates infoblock on the stack, and calls
2522 regmatch() recursively with the starting point at WHILEM node;
2524 3) Each hit of WHILEM node tries to match A and Z (in the order
2525 depending on the current iteration, min/max of {min,max} and
2526 greediness). The information about where are nodes for "A"
2527 and "Z" is read from the infoblock, as is info on how many times "A"
2528 was already matched, and greediness.
2530 4) After A matches, the same WHILEM node is hit again.
2532 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2533 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2534 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2535 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2536 of the external loop.
2538 Currently present infoblocks form a tree with a stem formed by PL_curcc
2539 and whatever it mentions via ->next, and additional attached trees
2540 corresponding to temporarily unset infoblocks as in "5" above.
2542 In the following picture infoblocks for outer loop of
2543 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2544 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2545 infoblocks are drawn below the "reset" infoblock.
2547 In fact in the picture below we do not show failed matches for Z and T
2548 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2549 more obvious *why* one needs to *temporary* unset infoblocks.]
2551 Matched REx position InfoBlocks Comment
2555 Y A)*?Z)*?T x <- O <- I
2556 YA )*?Z)*?T x <- O <- I
2557 YA A)*?Z)*?T x <- O <- I
2558 YAA )*?Z)*?T x <- O <- I
2559 YAA Z)*?T x <- O # Temporary unset I
2562 YAAZ Y(A)*?Z)*?T x <- O
2565 YAAZY (A)*?Z)*?T x <- O
2568 YAAZY A)*?Z)*?T x <- O <- I
2571 YAAZYA )*?Z)*?T x <- O <- I
2574 YAAZYA Z)*?T x <- O # Temporary unset I
2580 YAAZYAZ T x # Temporary unset O
2587 *******************************************************************/
2590 CHECKPOINT cp = PL_savestack_ix;
2592 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2594 cc.oldcc = PL_regcc;
2596 cc.parenfloor = *PL_reglastparen;
2598 cc.min = ARG1(scan);
2599 cc.max = ARG2(scan);
2600 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2604 PL_reginput = locinput;
2605 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2607 PL_regcc = cc.oldcc;
2613 * This is really hard to understand, because after we match
2614 * what we're trying to match, we must make sure the rest of
2615 * the REx is going to match for sure, and to do that we have
2616 * to go back UP the parse tree by recursing ever deeper. And
2617 * if it fails, we have to reset our parent's current state
2618 * that we can try again after backing off.
2621 CHECKPOINT cp, lastcp;
2622 CURCUR* cc = PL_regcc;
2623 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2625 n = cc->cur + 1; /* how many we know we matched */
2626 PL_reginput = locinput;
2629 PerlIO_printf(Perl_debug_log,
2630 "%*s %ld out of %ld..%ld cc=%lx\n",
2631 REPORT_CODE_OFF+PL_regindent*2, "",
2632 (long)n, (long)cc->min,
2633 (long)cc->max, (long)cc)
2636 /* If degenerate scan matches "", assume scan done. */
2638 if (locinput == cc->lastloc && n >= cc->min) {
2639 PL_regcc = cc->oldcc;
2643 PerlIO_printf(Perl_debug_log,
2644 "%*s empty match detected, try continuation...\n",
2645 REPORT_CODE_OFF+PL_regindent*2, "")
2647 if (regmatch(cc->next))
2655 /* First just match a string of min scans. */
2659 cc->lastloc = locinput;
2660 if (regmatch(cc->scan))
2663 cc->lastloc = lastloc;
2668 /* Check whether we already were at this position.
2669 Postpone detection until we know the match is not
2670 *that* much linear. */
2671 if (!PL_reg_maxiter) {
2672 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2673 PL_reg_leftiter = PL_reg_maxiter;
2675 if (PL_reg_leftiter-- == 0) {
2676 I32 size = (PL_reg_maxiter + 7)/8;
2677 if (PL_reg_poscache) {
2678 if (PL_reg_poscache_size < size) {
2679 Renew(PL_reg_poscache, size, char);
2680 PL_reg_poscache_size = size;
2682 Zero(PL_reg_poscache, size, char);
2685 PL_reg_poscache_size = size;
2686 Newz(29, PL_reg_poscache, size, char);
2689 PerlIO_printf(Perl_debug_log,
2690 "%sDetected a super-linear match, switching on caching%s...\n",
2691 PL_colors[4], PL_colors[5])
2694 if (PL_reg_leftiter < 0) {
2695 I32 o = locinput - PL_bostr, b;
2697 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2700 if (PL_reg_poscache[o] & (1<<b)) {
2702 PerlIO_printf(Perl_debug_log,
2703 "%*s already tried at this position...\n",
2704 REPORT_CODE_OFF+PL_regindent*2, "")
2708 PL_reg_poscache[o] |= (1<<b);
2712 /* Prefer next over scan for minimal matching. */
2715 PL_regcc = cc->oldcc;
2718 cp = regcppush(cc->parenfloor);
2720 if (regmatch(cc->next)) {
2722 sayYES; /* All done. */
2730 if (n >= cc->max) { /* Maximum greed exceeded? */
2731 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2732 && !(PL_reg_flags & RF_warned)) {
2733 PL_reg_flags |= RF_warned;
2734 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2735 "Complex regular subexpression recursion",
2742 PerlIO_printf(Perl_debug_log,
2743 "%*s trying longer...\n",
2744 REPORT_CODE_OFF+PL_regindent*2, "")
2746 /* Try scanning more and see if it helps. */
2747 PL_reginput = locinput;
2749 cc->lastloc = locinput;
2750 cp = regcppush(cc->parenfloor);
2752 if (regmatch(cc->scan)) {
2759 cc->lastloc = lastloc;
2763 /* Prefer scan over next for maximal matching. */
2765 if (n < cc->max) { /* More greed allowed? */
2766 cp = regcppush(cc->parenfloor);
2768 cc->lastloc = locinput;
2770 if (regmatch(cc->scan)) {
2775 regcppop(); /* Restore some previous $<digit>s? */
2776 PL_reginput = locinput;
2778 PerlIO_printf(Perl_debug_log,
2779 "%*s failed, try continuation...\n",
2780 REPORT_CODE_OFF+PL_regindent*2, "")
2783 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2784 && !(PL_reg_flags & RF_warned)) {
2785 PL_reg_flags |= RF_warned;
2786 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2787 "Complex regular subexpression recursion",
2791 /* Failed deeper matches of scan, so see if this one works. */
2792 PL_regcc = cc->oldcc;
2795 if (regmatch(cc->next))
2801 cc->lastloc = lastloc;
2806 next = scan + ARG(scan);
2809 inner = NEXTOPER(NEXTOPER(scan));
2812 inner = NEXTOPER(scan);
2817 if (OP(next) != c1) /* No choice. */
2818 next = inner; /* Avoid recursion. */
2820 int lastparen = *PL_reglastparen;
2824 PL_reginput = locinput;
2825 if (regmatch(inner))
2828 for (n = *PL_reglastparen; n > lastparen; n--)
2830 *PL_reglastparen = n;
2833 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2837 inner = NEXTOPER(scan);
2838 if (c1 == BRANCHJ) {
2839 inner = NEXTOPER(inner);
2841 } while (scan != NULL && OP(scan) == c1);
2855 /* We suppose that the next guy does not need
2856 backtracking: in particular, it is of constant length,
2857 and has no parenths to influence future backrefs. */
2858 ln = ARG1(scan); /* min to match */
2859 n = ARG2(scan); /* max to match */
2860 paren = scan->flags;
2862 if (paren > PL_regsize)
2864 if (paren > *PL_reglastparen)
2865 *PL_reglastparen = paren;
2867 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2869 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2870 PL_reginput = locinput;
2873 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2875 if (ln && l == 0 && n >= ln
2876 /* In fact, this is tricky. If paren, then the
2877 fact that we did/didnot match may influence
2878 future execution. */
2879 && !(paren && ln == 0))
2881 locinput = PL_reginput;
2882 if (PL_regkind[(U8)OP(next)] == EXACT) {
2883 c1 = (U8)*STRING(next);
2884 if (OP(next) == EXACTF)
2886 else if (OP(next) == EXACTFL)
2887 c2 = PL_fold_locale[c1];
2894 /* This may be improved if l == 0. */
2895 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2896 /* If it could work, try it. */
2898 UCHARAT(PL_reginput) == c1 ||
2899 UCHARAT(PL_reginput) == c2)
2903 PL_regstartp[paren] =
2904 HOPc(PL_reginput, -l) - PL_bostr;
2905 PL_regendp[paren] = PL_reginput - PL_bostr;
2908 PL_regendp[paren] = -1;
2914 /* Couldn't or didn't -- move forward. */
2915 PL_reginput = locinput;
2916 if (regrepeat_hard(scan, 1, &l)) {
2918 locinput = PL_reginput;
2925 n = regrepeat_hard(scan, n, &l);
2926 if (n != 0 && l == 0
2927 /* In fact, this is tricky. If paren, then the
2928 fact that we did/didnot match may influence
2929 future execution. */
2930 && !(paren && ln == 0))
2932 locinput = PL_reginput;
2934 PerlIO_printf(Perl_debug_log,
2935 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2936 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2940 if (PL_regkind[(U8)OP(next)] == EXACT) {
2941 c1 = (U8)*STRING(next);
2942 if (OP(next) == EXACTF)
2944 else if (OP(next) == EXACTFL)
2945 c2 = PL_fold_locale[c1];
2954 /* If it could work, try it. */
2956 UCHARAT(PL_reginput) == c1 ||
2957 UCHARAT(PL_reginput) == c2)
2960 PerlIO_printf(Perl_debug_log,
2961 "%*s trying tail with n=%"IVdf"...\n",
2962 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2966 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2967 PL_regendp[paren] = PL_reginput - PL_bostr;
2970 PL_regendp[paren] = -1;
2976 /* Couldn't or didn't -- back up. */
2978 locinput = HOPc(locinput, -l);
2979 PL_reginput = locinput;
2986 paren = scan->flags; /* Which paren to set */
2987 if (paren > PL_regsize)
2989 if (paren > *PL_reglastparen)
2990 *PL_reglastparen = paren;
2991 ln = ARG1(scan); /* min to match */
2992 n = ARG2(scan); /* max to match */
2993 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2997 ln = ARG1(scan); /* min to match */
2998 n = ARG2(scan); /* max to match */
2999 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3004 scan = NEXTOPER(scan);
3010 scan = NEXTOPER(scan);
3014 * Lookahead to avoid useless match attempts
3015 * when we know what character comes next.
3017 if (PL_regkind[(U8)OP(next)] == EXACT) {
3018 c1 = (U8)*STRING(next);
3019 if (OP(next) == EXACTF)
3021 else if (OP(next) == EXACTFL)
3022 c2 = PL_fold_locale[c1];
3028 PL_reginput = locinput;
3032 if (ln && regrepeat(scan, ln) < ln)
3034 locinput = PL_reginput;
3037 char *e = locinput + n - ln; /* Should not check after this */
3038 char *old = locinput;
3040 if (e >= PL_regeol || (n == REG_INFTY))
3043 /* Find place 'next' could work */
3045 while (locinput <= e && *locinput != c1)
3048 while (locinput <= e
3055 /* PL_reginput == old now */
3056 if (locinput != old) {
3057 ln = 1; /* Did some */
3058 if (regrepeat(scan, locinput - old) <
3062 /* PL_reginput == locinput now */
3063 TRYPAREN(paren, ln, locinput);
3064 PL_reginput = locinput; /* Could be reset... */
3066 /* Couldn't or didn't -- move forward. */
3071 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3072 /* If it could work, try it. */
3074 UCHARAT(PL_reginput) == c1 ||
3075 UCHARAT(PL_reginput) == c2)
3077 TRYPAREN(paren, n, PL_reginput);
3080 /* Couldn't or didn't -- move forward. */
3081 PL_reginput = locinput;
3082 if (regrepeat(scan, 1)) {
3084 locinput = PL_reginput;
3092 n = regrepeat(scan, n);
3093 locinput = PL_reginput;
3094 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3095 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3096 ln = n; /* why back off? */
3097 /* ...because $ and \Z can match before *and* after
3098 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3099 We should back off by one in this case. */
3100 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3106 /* If it could work, try it. */
3108 UCHARAT(PL_reginput) == c1 ||
3109 UCHARAT(PL_reginput) == c2)
3111 TRYPAREN(paren, n, PL_reginput);
3114 /* Couldn't or didn't -- back up. */
3116 PL_reginput = locinput = HOPc(locinput, -1);
3121 /* If it could work, try it. */
3123 UCHARAT(PL_reginput) == c1 ||
3124 UCHARAT(PL_reginput) == c2)
3126 TRYPAREN(paren, n, PL_reginput);
3129 /* Couldn't or didn't -- back up. */
3131 PL_reginput = locinput = HOPc(locinput, -1);
3138 if (PL_reg_call_cc) {
3139 re_cc_state *cur_call_cc = PL_reg_call_cc;
3140 CURCUR *cctmp = PL_regcc;
3141 regexp *re = PL_reg_re;
3142 CHECKPOINT cp, lastcp;
3144 cp = regcppush(0); /* Save *all* the positions. */
3146 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3148 PL_reginput = locinput; /* Make position available to
3150 cache_re(PL_reg_call_cc->re);
3151 PL_regcc = PL_reg_call_cc->cc;
3152 PL_reg_call_cc = PL_reg_call_cc->prev;
3153 if (regmatch(cur_call_cc->node)) {
3154 PL_reg_call_cc = cur_call_cc;
3160 PL_reg_call_cc = cur_call_cc;
3166 PerlIO_printf(Perl_debug_log,
3167 "%*s continuation failed...\n",
3168 REPORT_CODE_OFF+PL_regindent*2, "")
3172 if (locinput < PL_regtill) {
3173 DEBUG_r(PerlIO_printf(Perl_debug_log,
3174 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3176 (long)(locinput - PL_reg_starttry),
3177 (long)(PL_regtill - PL_reg_starttry),
3179 sayNO_FINAL; /* Cannot match: too short. */
3181 PL_reginput = locinput; /* put where regtry can find it */
3182 sayYES_FINAL; /* Success! */
3184 PL_reginput = locinput; /* put where regtry can find it */
3185 sayYES_LOUD; /* Success! */
3188 PL_reginput = locinput;
3193 if (UTF) { /* XXXX This is absolutely
3194 broken, we read before
3196 s = HOPMAYBEc(locinput, -scan->flags);
3202 if (locinput < PL_bostr + scan->flags)
3204 PL_reginput = locinput - scan->flags;
3209 PL_reginput = locinput;
3214 if (UTF) { /* XXXX This is absolutely
3215 broken, we read before
3217 s = HOPMAYBEc(locinput, -scan->flags);
3218 if (!s || s < PL_bostr)
3223 if (locinput < PL_bostr + scan->flags)
3225 PL_reginput = locinput - scan->flags;
3230 PL_reginput = locinput;
3233 inner = NEXTOPER(NEXTOPER(scan));
3234 if (regmatch(inner) != n) {
3249 if (OP(scan) == SUSPEND) {
3250 locinput = PL_reginput;
3251 nextchr = UCHARAT(locinput);
3256 next = scan + ARG(scan);
3261 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3262 PTR2UV(scan), OP(scan));
3263 Perl_croak(aTHX_ "regexp memory corruption");
3269 * We get here only if there's trouble -- normally "case END" is
3270 * the terminating point.
3272 Perl_croak(aTHX_ "corrupted regexp pointers");
3278 PerlIO_printf(Perl_debug_log,
3279 "%*s %scould match...%s\n",
3280 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3284 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3285 PL_colors[4],PL_colors[5]));
3294 PerlIO_printf(Perl_debug_log,
3295 "%*s %sfailed...%s\n",
3296 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3308 - regrepeat - repeatedly match something simple, report how many
3311 * [This routine now assumes that it will only match on things of length 1.
3312 * That was true before, but now we assume scan - reginput is the count,
3313 * rather than incrementing count on every character. [Er, except utf8.]]
3316 S_regrepeat(pTHX_ regnode *p, I32 max)
3319 register char *scan;
3321 register char *loceol = PL_regeol;
3322 register I32 hardcount = 0;
3325 if (max != REG_INFTY && max < loceol - scan)
3326 loceol = scan + max;
3329 while (scan < loceol && *scan != '\n')
3337 while (scan < loceol && *scan != '\n') {
3338 scan += UTF8SKIP(scan);
3344 while (scan < loceol) {
3345 scan += UTF8SKIP(scan);
3349 case EXACT: /* length of string is 1 */
3351 while (scan < loceol && UCHARAT(scan) == c)
3354 case EXACTF: /* length of string is 1 */
3356 while (scan < loceol &&
3357 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3360 case EXACTFL: /* length of string is 1 */
3361 PL_reg_flags |= RF_tainted;
3363 while (scan < loceol &&
3364 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3369 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3370 scan += UTF8SKIP(scan);
3375 while (scan < loceol && REGINCLASS(p, *scan))
3379 while (scan < loceol && isALNUM(*scan))
3384 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3385 scan += UTF8SKIP(scan);
3390 PL_reg_flags |= RF_tainted;
3391 while (scan < loceol && isALNUM_LC(*scan))
3395 PL_reg_flags |= RF_tainted;
3397 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3398 scan += UTF8SKIP(scan);
3404 while (scan < loceol && !isALNUM(*scan))
3409 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3410 scan += UTF8SKIP(scan);
3415 PL_reg_flags |= RF_tainted;
3416 while (scan < loceol && !isALNUM_LC(*scan))
3420 PL_reg_flags |= RF_tainted;
3422 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3423 scan += UTF8SKIP(scan);
3428 while (scan < loceol && isSPACE(*scan))
3433 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3434 scan += UTF8SKIP(scan);
3439 PL_reg_flags |= RF_tainted;
3440 while (scan < loceol && isSPACE_LC(*scan))
3444 PL_reg_flags |= RF_tainted;
3446 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3447 scan += UTF8SKIP(scan);
3452 while (scan < loceol && !isSPACE(*scan))
3457 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3458 scan += UTF8SKIP(scan);
3463 PL_reg_flags |= RF_tainted;
3464 while (scan < loceol && !isSPACE_LC(*scan))
3468 PL_reg_flags |= RF_tainted;
3470 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3471 scan += UTF8SKIP(scan);
3476 while (scan < loceol && isDIGIT(*scan))
3481 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3482 scan += UTF8SKIP(scan);
3488 while (scan < loceol && !isDIGIT(*scan))
3493 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3494 scan += UTF8SKIP(scan);
3498 default: /* Called on something of 0 width. */
3499 break; /* So match right here or not at all. */
3505 c = scan - PL_reginput;
3510 SV *prop = sv_newmortal();
3513 PerlIO_printf(Perl_debug_log,
3514 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3515 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3522 - regrepeat_hard - repeatedly match something, report total lenth and length
3524 * The repeater is supposed to have constant length.
3528 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3531 register char *scan;
3532 register char *start;
3533 register char *loceol = PL_regeol;
3535 I32 count = 0, res = 1;
3540 start = PL_reginput;
3542 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3545 while (start < PL_reginput) {
3547 start += UTF8SKIP(start);
3558 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3560 *lp = l = PL_reginput - start;
3561 if (max != REG_INFTY && l*max < loceol - scan)
3562 loceol = scan + l*max;
3575 - reginclass - determine if a character falls into a character class
3579 S_reginclass(pTHX_ register regnode *p, register I32 c)
3582 char flags = ANYOF_FLAGS(p);
3586 if (ANYOF_BITMAP_TEST(p, c))
3588 else if (flags & ANYOF_FOLD) {
3590 if (flags & ANYOF_LOCALE) {
3591 PL_reg_flags |= RF_tainted;
3592 cf = PL_fold_locale[c];
3596 if (ANYOF_BITMAP_TEST(p, cf))
3600 if (!match && (flags & ANYOF_CLASS)) {
3601 PL_reg_flags |= RF_tainted;
3603 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3604 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3605 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3606 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3607 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3608 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3609 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3610 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3611 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3612 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3613 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3614 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3615 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3616 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3617 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3618 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3619 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3620 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3621 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3622 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3623 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3624 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3625 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3626 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3627 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3628 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3629 ) /* How's that for a conditional? */
3635 return (flags & ANYOF_INVERT) ? !match : match;
3639 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3642 char flags = ARG1(f);
3644 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3646 if (swash_fetch(sv, p))
3648 else if (flags & ANYOF_FOLD) {
3649 U8 tmpbuf[UTF8_MAXLEN];
3650 if (flags & ANYOF_LOCALE) {
3651 PL_reg_flags |= RF_tainted;
3652 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3655 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3656 if (swash_fetch(sv, tmpbuf))
3660 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3662 return (flags & ANYOF_INVERT) ? !match : match;
3666 S_reghop(pTHX_ U8 *s, I32 off)
3670 while (off-- && s < (U8*)PL_regeol)
3675 if (s > (U8*)PL_bostr) {
3678 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3680 } /* XXX could check well-formedness here */
3688 S_reghopmaybe(pTHX_ U8* s, I32 off)
3692 while (off-- && s < (U8*)PL_regeol)
3699 if (s > (U8*)PL_bostr) {
3702 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3704 } /* XXX could check well-formedness here */
3720 restore_pos(pTHXo_ void *arg)
3723 if (PL_reg_eval_set) {
3724 if (PL_reg_oldsaved) {
3725 PL_reg_re->subbeg = PL_reg_oldsaved;
3726 PL_reg_re->sublen = PL_reg_oldsavedlen;
3727 RX_MATCH_COPIED_on(PL_reg_re);
3729 PL_reg_magic->mg_len = PL_reg_oldpos;
3730 PL_reg_eval_set = 0;
3731 PL_curpm = PL_reg_oldcurpm;