5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 /* SvCUR is not set on references: SvRV and SvPVX overlap */
351 && (strpos + SvCUR(sv) != strend)) {
352 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
355 PL_regeol = strend; /* Used in HOP() */
356 s = HOPc(strpos, prog->check_offset_min);
358 slen = SvCUR(check); /* >= 1 */
360 if ( strend - s > slen || strend - s < slen - 1
361 || (strend - s == slen && strend[-1] != '\n')) {
362 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
365 /* Now should match s[0..slen-2] */
367 if (slen && (*SvPVX(check) != *s
369 && memNE(SvPVX(check), s, slen)))) {
371 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
375 else if (*SvPVX(check) != *s
376 || ((slen = SvCUR(check)) > 1
377 && memNE(SvPVX(check), s, slen)))
379 goto success_at_start;
381 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
383 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
384 end_shift = prog->minlen - start_shift -
385 CHR_SVLEN(check) + (SvTAIL(check) != 0);
387 I32 end = prog->check_offset_max + CHR_SVLEN(check)
388 - (SvTAIL(check) != 0);
389 I32 eshift = strend - s - end;
391 if (end_shift < eshift)
395 else { /* Can match at random position */
398 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
399 /* Should be nonnegative! */
400 end_shift = prog->minlen - start_shift -
401 CHR_SVLEN(check) + (SvTAIL(check) != 0);
404 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
406 Perl_croak(aTHX_ "panic: end_shift");
410 /* Find a possible match in the region s..strend by looking for
411 the "check" substring in the region corrected by start/end_shift. */
412 if (flags & REXEC_SCREAM) {
413 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
414 I32 p = -1; /* Internal iterator of scream. */
415 I32 *pp = data ? data->scream_pos : &p;
417 if (PL_screamfirst[BmRARE(check)] >= 0
418 || ( BmRARE(check) == '\n'
419 && (BmPREVIOUS(check) == SvCUR(check) - 1)
421 s = screaminstr(sv, check,
422 start_shift + (s - strbeg), end_shift, pp, 0);
426 *data->scream_olds = s;
429 s = fbm_instr((unsigned char*)s + start_shift,
430 (unsigned char*)strend - end_shift,
431 check, PL_multiline ? FBMrf_MULTILINE : 0);
433 /* Update the count-of-usability, remove useless subpatterns,
436 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
437 (s ? "Found" : "Did not find"),
438 ((check == prog->anchored_substr) ? "anchored" : "floating"),
440 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
442 PL_colors[1], (SvTAIL(check) ? "$" : ""),
443 (s ? " at offset " : "...\n") ) );
450 /* Finish the diagnostic message */
451 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
453 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
454 Start with the other substr.
455 XXXX no SCREAM optimization yet - and a very coarse implementation
456 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
457 *always* match. Probably should be marked during compile...
458 Probably it is right to do no SCREAM here...
461 if (prog->float_substr && prog->anchored_substr) {
462 /* Take into account the "other" substring. */
463 /* XXXX May be hopelessly wrong for UTF... */
466 if (check == prog->float_substr) {
469 char *last = s - start_shift, *last1, *last2;
473 t = s - prog->check_offset_max;
474 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
475 && (!(prog->reganch & ROPT_UTF8)
476 || (PL_bostr = strpos, /* Used in regcopmaybe() */
477 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
482 t += prog->anchored_offset;
483 if (t < other_last) /* These positions already checked */
486 last2 = last1 = strend - prog->minlen;
489 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
490 /* On end-of-str: see comment below. */
491 s = fbm_instr((unsigned char*)t,
492 (unsigned char*)last1 + prog->anchored_offset
493 + SvCUR(prog->anchored_substr)
494 - (SvTAIL(prog->anchored_substr)!=0),
495 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
496 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
497 (s ? "Found" : "Contradicts"),
499 (int)(SvCUR(prog->anchored_substr)
500 - (SvTAIL(prog->anchored_substr)!=0)),
501 SvPVX(prog->anchored_substr),
502 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
504 if (last1 >= last2) {
505 DEBUG_r(PerlIO_printf(Perl_debug_log,
506 ", giving up...\n"));
509 DEBUG_r(PerlIO_printf(Perl_debug_log,
510 ", trying floating at offset %ld...\n",
511 (long)(s1 + 1 - i_strpos)));
512 PL_regeol = strend; /* Used in HOP() */
513 other_last = last1 + prog->anchored_offset + 1;
518 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
519 (long)(s - i_strpos)));
520 t = s - prog->anchored_offset;
529 else { /* Take into account the floating substring. */
534 last1 = last = strend - prog->minlen + prog->float_min_offset;
535 if (last - t > prog->float_max_offset)
536 last = t + prog->float_max_offset;
537 s = t + prog->float_min_offset;
540 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
541 /* fbm_instr() takes into account exact value of end-of-str
542 if the check is SvTAIL(ed). Since false positives are OK,
543 and end-of-str is not later than strend we are OK. */
544 s = fbm_instr((unsigned char*)s,
545 (unsigned char*)last + SvCUR(prog->float_substr)
546 - (SvTAIL(prog->float_substr)!=0),
547 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
548 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
549 (s ? "Found" : "Contradicts"),
551 (int)(SvCUR(prog->float_substr)
552 - (SvTAIL(prog->float_substr)!=0)),
553 SvPVX(prog->float_substr),
554 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 ", giving up...\n"));
561 DEBUG_r(PerlIO_printf(Perl_debug_log,
562 ", trying anchored starting at offset %ld...\n",
563 (long)(s1 + 1 - i_strpos)));
564 other_last = last + 1;
565 PL_regeol = strend; /* Used in HOP() */
570 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
571 (long)(s - i_strpos)));
581 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
584 && (!(prog->reganch & ROPT_UTF8)
585 || (PL_bostr = strpos, /* Used in regcopmaybe() */
586 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
589 /* Fixed substring is found far enough so that the match
590 cannot start at strpos. */
592 if (ml_anch && t[-1] != '\n') {
593 /* Eventually fbm_*() should handle this, but often
594 anchored_offset is not 0, so this check will not be wasted. */
595 /* XXXX In the code below we prefer to look for "^" even in
596 presence of anchored substrings. And we search even
597 beyond the found float position. These pessimizations
598 are historical artefacts only. */
600 while (t < strend - prog->minlen) {
602 if (t < s - prog->check_offset_min) {
603 if (prog->anchored_substr) {
604 /* We definitely contradict the found anchored
605 substr. Due to the above check we do not
606 contradict "check" substr.
607 Thus we can arrive here only if check substr
608 is float. Redo checking for "other"=="fixed".
611 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
612 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
613 goto do_other_anchored;
616 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
617 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
620 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
621 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
627 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
628 PL_colors[0],PL_colors[1]));
633 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
637 /* The found string does not prohibit matching at beg-of-str
638 - no optimization of calling REx engine can be performed,
639 unless it was an MBOL and we are not after MBOL. */
641 /* Even in this situation we may use MBOL flag if strpos is offset
642 wrt the start of the string. */
643 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
644 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
645 /* May be due to an implicit anchor of m{.*foo} */
646 && !(prog->reganch & ROPT_IMPLICIT))
651 DEBUG_r( if (ml_anch)
652 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
653 PL_colors[0],PL_colors[1]);
656 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
657 && prog->check_substr /* Could be deleted already */
658 && --BmUSEFUL(prog->check_substr) < 0
659 && prog->check_substr == prog->float_substr)
661 /* If flags & SOMETHING - do not do it many times on the same match */
662 SvREFCNT_dec(prog->check_substr);
663 prog->check_substr = Nullsv; /* disable */
664 prog->float_substr = Nullsv; /* clear */
666 /* XXXX This is a remnant of the old implementation. It
667 looks wasteful, since now INTUIT can use many
669 prog->reganch &= ~RE_USE_INTUIT;
676 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
677 if (prog->regstclass) {
678 /* minlen == 0 is possible if regstclass is \b or \B,
679 and the fixed substr is ''$.
680 Since minlen is already taken into account, s+1 is before strend;
681 accidentally, minlen >= 1 guaranties no false positives at s + 1
682 even for \b or \B. But (minlen? 1 : 0) below assumes that
683 regstclass does not come from lookahead... */
684 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
685 This leaves EXACTF only, which is dealt with in find_byclass(). */
686 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
687 ? STR_LEN(prog->regstclass)
689 char *endpos = (prog->anchored_substr || ml_anch)
690 ? s + (prog->minlen? cl_l : 0)
691 : (prog->float_substr ? check_at - start_shift + cl_l
693 char *startpos = sv && SvPOK(sv) ? strend - SvCUR(sv) : s;
696 if (prog->reganch & ROPT_UTF8) {
697 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
700 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
705 if (endpos == strend) {
706 DEBUG_r( PerlIO_printf(Perl_debug_log,
707 "Could not match STCLASS...\n") );
710 DEBUG_r( PerlIO_printf(Perl_debug_log,
711 "This position contradicts STCLASS...\n") );
712 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
714 /* Contradict one of substrings */
715 if (prog->anchored_substr) {
716 if (prog->anchored_substr == check) {
717 DEBUG_r( what = "anchored" );
719 PL_regeol = strend; /* Used in HOP() */
721 if (s + start_shift + end_shift > strend) {
722 /* XXXX Should be taken into account earlier? */
723 DEBUG_r( PerlIO_printf(Perl_debug_log,
724 "Could not match STCLASS...\n") );
727 DEBUG_r( PerlIO_printf(Perl_debug_log,
728 "Trying %s substr starting at offset %ld...\n",
729 what, (long)(s + start_shift - i_strpos)) );
732 /* Have both, check_string is floating */
733 if (t + start_shift >= check_at) /* Contradicts floating=check */
734 goto retry_floating_check;
735 /* Recheck anchored substring, but not floating... */
737 DEBUG_r( PerlIO_printf(Perl_debug_log,
738 "Trying anchored substr starting at offset %ld...\n",
739 (long)(other_last - i_strpos)) );
740 goto do_other_anchored;
742 /* Another way we could have checked stclass at the
743 current position only: */
746 DEBUG_r( PerlIO_printf(Perl_debug_log,
747 "Trying /^/m starting at offset %ld...\n",
748 (long)(t - i_strpos)) );
751 if (!prog->float_substr) /* Could have been deleted */
753 /* Check is floating subtring. */
754 retry_floating_check:
755 t = check_at - start_shift;
756 DEBUG_r( what = "floating" );
757 goto hop_and_restart;
760 PerlIO_printf(Perl_debug_log,
761 "By STCLASS: moving %ld --> %ld\n",
762 (long)(t - i_strpos), (long)(s - i_strpos));
764 PerlIO_printf(Perl_debug_log,
765 "Does not contradict STCLASS...\n") );
767 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
768 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
771 fail_finish: /* Substring not found */
772 if (prog->check_substr) /* could be removed already */
773 BmUSEFUL(prog->check_substr) += 5; /* hooray */
775 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
776 PL_colors[4],PL_colors[5]));
780 /* We know what class REx starts with. Try to find this position... */
782 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
784 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
790 register I32 tmp = 1; /* Scratch variable? */
792 /* We know what class it must start with. */
796 if (REGINCLASSUTF8(c, (U8*)s)) {
797 if (tmp && (norun || regtry(prog, s)))
809 if (REGINCLASS(c, *(U8*)s)) {
810 if (tmp && (norun || regtry(prog, s)))
830 c2 = PL_fold_locale[c1];
835 e = s; /* Due to minlen logic of intuit() */
836 /* Here it is NOT UTF! */
840 && (ln == 1 || !(OP(c) == EXACTF
842 : ibcmp_locale(s, m, ln)))
843 && (norun || regtry(prog, s)) )
849 if ( (*(U8*)s == c1 || *(U8*)s == c2)
850 && (ln == 1 || !(OP(c) == EXACTF
852 : ibcmp_locale(s, m, ln)))
853 && (norun || regtry(prog, s)) )
860 PL_reg_flags |= RF_tainted;
863 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
864 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
866 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
868 if ((norun || regtry(prog, s)))
873 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
877 PL_reg_flags |= RF_tainted;
880 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
881 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
883 if (tmp == !(OP(c) == BOUNDUTF8 ?
884 swash_fetch(PL_utf8_alnum, (U8*)s) :
885 isALNUM_LC_utf8((U8*)s)))
888 if ((norun || regtry(prog, s)))
893 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
897 PL_reg_flags |= RF_tainted;
900 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
901 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
903 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
905 else if ((norun || regtry(prog, s)))
909 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
913 PL_reg_flags |= RF_tainted;
916 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
917 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
919 if (tmp == !(OP(c) == NBOUNDUTF8 ?
920 swash_fetch(PL_utf8_alnum, (U8*)s) :
921 isALNUM_LC_utf8((U8*)s)))
923 else if ((norun || regtry(prog, s)))
927 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
933 if (tmp && (norun || regtry(prog, s)))
945 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
946 if (tmp && (norun || regtry(prog, s)))
957 PL_reg_flags |= RF_tainted;
959 if (isALNUM_LC(*s)) {
960 if (tmp && (norun || regtry(prog, s)))
971 PL_reg_flags |= RF_tainted;
973 if (isALNUM_LC_utf8((U8*)s)) {
974 if (tmp && (norun || regtry(prog, s)))
987 if (tmp && (norun || regtry(prog, s)))
999 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1000 if (tmp && (norun || regtry(prog, s)))
1011 PL_reg_flags |= RF_tainted;
1012 while (s < strend) {
1013 if (!isALNUM_LC(*s)) {
1014 if (tmp && (norun || regtry(prog, s)))
1025 PL_reg_flags |= RF_tainted;
1026 while (s < strend) {
1027 if (!isALNUM_LC_utf8((U8*)s)) {
1028 if (tmp && (norun || regtry(prog, s)))
1039 while (s < strend) {
1041 if (tmp && (norun || regtry(prog, s)))
1052 while (s < strend) {
1053 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1054 if (tmp && (norun || regtry(prog, s)))
1065 PL_reg_flags |= RF_tainted;
1066 while (s < strend) {
1067 if (isSPACE_LC(*s)) {
1068 if (tmp && (norun || regtry(prog, s)))
1079 PL_reg_flags |= RF_tainted;
1080 while (s < strend) {
1081 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1082 if (tmp && (norun || regtry(prog, s)))
1093 while (s < strend) {
1095 if (tmp && (norun || regtry(prog, s)))
1106 while (s < strend) {
1107 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1108 if (tmp && (norun || regtry(prog, s)))
1119 PL_reg_flags |= RF_tainted;
1120 while (s < strend) {
1121 if (!isSPACE_LC(*s)) {
1122 if (tmp && (norun || regtry(prog, s)))
1133 PL_reg_flags |= RF_tainted;
1134 while (s < strend) {
1135 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1136 if (tmp && (norun || regtry(prog, s)))
1147 while (s < strend) {
1149 if (tmp && (norun || regtry(prog, s)))
1160 while (s < strend) {
1161 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1162 if (tmp && (norun || regtry(prog, s)))
1173 PL_reg_flags |= RF_tainted;
1174 while (s < strend) {
1175 if (isDIGIT_LC(*s)) {
1176 if (tmp && (norun || regtry(prog, s)))
1187 PL_reg_flags |= RF_tainted;
1188 while (s < strend) {
1189 if (isDIGIT_LC_utf8((U8*)s)) {
1190 if (tmp && (norun || regtry(prog, s)))
1201 while (s < strend) {
1203 if (tmp && (norun || regtry(prog, s)))
1214 while (s < strend) {
1215 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1216 if (tmp && (norun || regtry(prog, s)))
1227 PL_reg_flags |= RF_tainted;
1228 while (s < strend) {
1229 if (!isDIGIT_LC(*s)) {
1230 if (tmp && (norun || regtry(prog, s)))
1241 PL_reg_flags |= RF_tainted;
1242 while (s < strend) {
1243 if (!isDIGIT_LC_utf8((U8*)s)) {
1244 if (tmp && (norun || regtry(prog, s)))
1255 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1264 - regexec_flags - match a regexp against a string
1267 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1268 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1269 /* strend: pointer to null at end of string */
1270 /* strbeg: real beginning of string */
1271 /* minend: end of match must be >=minend after stringarg. */
1272 /* data: May be used for some additional optimizations. */
1273 /* nosave: For optimizations. */
1277 register regnode *c;
1278 register char *startpos = stringarg;
1279 I32 minlen; /* must match at least this many chars */
1280 I32 dontbother = 0; /* how many characters not to try at end */
1281 /* I32 start_shift = 0; */ /* Offset of the start to find
1282 constant substr. */ /* CC */
1283 I32 end_shift = 0; /* Same for the end. */ /* CC */
1284 I32 scream_pos = -1; /* Internal iterator of scream. */
1286 SV* oreplsv = GvSV(PL_replgv);
1292 PL_regnarrate = PL_debug & 512;
1295 /* Be paranoid... */
1296 if (prog == NULL || startpos == NULL) {
1297 Perl_croak(aTHX_ "NULL regexp parameter");
1301 minlen = prog->minlen;
1302 if (strend - startpos < minlen) goto phooey;
1304 if (startpos == strbeg) /* is ^ valid at stringarg? */
1307 PL_regprev = (U32)stringarg[-1];
1308 if (!PL_multiline && PL_regprev == '\n')
1309 PL_regprev = '\0'; /* force ^ to NOT match */
1312 /* Check validity of program. */
1313 if (UCHARAT(prog->program) != REG_MAGIC) {
1314 Perl_croak(aTHX_ "corrupted regexp program");
1318 PL_reg_eval_set = 0;
1321 if (prog->reganch & ROPT_UTF8)
1322 PL_reg_flags |= RF_utf8;
1324 /* Mark beginning of line for ^ and lookbehind. */
1325 PL_regbol = startpos;
1329 /* Mark end of line for $ (and such) */
1332 /* see how far we have to get to not match where we matched before */
1333 PL_regtill = startpos+minend;
1335 /* We start without call_cc context. */
1338 /* If there is a "must appear" string, look for it. */
1341 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1344 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1345 PL_reg_ganch = startpos;
1346 else if (sv && SvTYPE(sv) >= SVt_PVMG
1348 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1349 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1350 if (prog->reganch & ROPT_ANCH_GPOS) {
1351 if (s > PL_reg_ganch)
1356 else /* pos() not defined */
1357 PL_reg_ganch = strbeg;
1360 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1361 re_scream_pos_data d;
1363 d.scream_olds = &scream_olds;
1364 d.scream_pos = &scream_pos;
1365 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1367 goto phooey; /* not present */
1370 DEBUG_r( if (!PL_colorset) reginitcolors() );
1371 DEBUG_r(PerlIO_printf(Perl_debug_log,
1372 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1373 PL_colors[4],PL_colors[5],PL_colors[0],
1376 (strlen(prog->precomp) > 60 ? "..." : ""),
1378 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1379 startpos, PL_colors[1],
1380 (strend - startpos > 60 ? "..." : ""))
1383 /* Simplest case: anchored match need be tried only once. */
1384 /* [unless only anchor is BOL and multiline is set] */
1385 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1386 if (s == startpos && regtry(prog, startpos))
1388 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1389 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1394 dontbother = minlen - 1;
1395 end = HOPc(strend, -dontbother) - 1;
1396 /* for multiline we only have to try after newlines */
1397 if (prog->check_substr) {
1401 if (regtry(prog, s))
1406 if (prog->reganch & RE_USE_INTUIT) {
1407 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1418 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1419 if (regtry(prog, s))
1426 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1427 if (regtry(prog, PL_reg_ganch))
1432 /* Messy cases: unanchored match. */
1433 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1434 /* we have /x+whatever/ */
1435 /* it must be a one character string (XXXX Except UTF?) */
1436 char ch = SvPVX(prog->anchored_substr)[0];
1438 while (s < strend) {
1440 if (regtry(prog, s)) goto got_it;
1442 while (s < strend && *s == ch)
1449 while (s < strend) {
1451 if (regtry(prog, s)) goto got_it;
1453 while (s < strend && *s == ch)
1461 else if (prog->anchored_substr != Nullsv
1462 || (prog->float_substr != Nullsv
1463 && prog->float_max_offset < strend - s)) {
1464 SV *must = prog->anchored_substr
1465 ? prog->anchored_substr : prog->float_substr;
1467 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1469 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1470 char *last = HOPc(strend, /* Cannot start after this */
1471 -(I32)(CHR_SVLEN(must)
1472 - (SvTAIL(must) != 0) + back_min));
1473 char *last1; /* Last position checked before */
1476 last1 = HOPc(s, -1);
1478 last1 = s - 1; /* bogus */
1480 /* XXXX check_substr already used to find `s', can optimize if
1481 check_substr==must. */
1483 dontbother = end_shift;
1484 strend = HOPc(strend, -dontbother);
1485 while ( (s <= last) &&
1486 ((flags & REXEC_SCREAM)
1487 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1488 end_shift, &scream_pos, 0))
1489 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1490 (unsigned char*)strend, must,
1491 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1492 if (HOPc(s, -back_max) > last1) {
1493 last1 = HOPc(s, -back_min);
1494 s = HOPc(s, -back_max);
1497 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1499 last1 = HOPc(s, -back_min);
1503 while (s <= last1) {
1504 if (regtry(prog, s))
1510 while (s <= last1) {
1511 if (regtry(prog, s))
1519 else if ((c = prog->regstclass)) {
1520 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1521 /* don't bother with what can't match */
1522 strend = HOPc(strend, -(minlen - 1));
1523 if (find_byclass(prog, c, s, strend, startpos, 0))
1528 if (prog->float_substr != Nullsv) { /* Trim the end. */
1531 if (flags & REXEC_SCREAM) {
1532 last = screaminstr(sv, prog->float_substr, s - strbeg,
1533 end_shift, &scream_pos, 1); /* last one */
1535 last = scream_olds; /* Only one occurence. */
1539 char *little = SvPV(prog->float_substr, len);
1541 if (SvTAIL(prog->float_substr)) {
1542 if (memEQ(strend - len + 1, little, len - 1))
1543 last = strend - len + 1;
1544 else if (!PL_multiline)
1545 last = memEQ(strend - len, little, len)
1546 ? strend - len : Nullch;
1552 last = rninstr(s, strend, little, little + len);
1554 last = strend; /* matching `$' */
1557 if (last == NULL) goto phooey; /* Should not happen! */
1558 dontbother = strend - last + prog->float_min_offset;
1560 if (minlen && (dontbother < minlen))
1561 dontbother = minlen - 1;
1562 strend -= dontbother; /* this one's always in bytes! */
1563 /* We don't know much -- general case. */
1566 if (regtry(prog, s))
1575 if (regtry(prog, s))
1577 } while (s++ < strend);
1585 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1587 if (PL_reg_eval_set) {
1588 /* Preserve the current value of $^R */
1589 if (oreplsv != GvSV(PL_replgv))
1590 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1591 restored, the value remains
1593 restore_pos(aTHXo_ 0);
1596 /* make sure $`, $&, $', and $digit will work later */
1597 if ( !(flags & REXEC_NOT_FIRST) ) {
1598 if (RX_MATCH_COPIED(prog)) {
1599 Safefree(prog->subbeg);
1600 RX_MATCH_COPIED_off(prog);
1602 if (flags & REXEC_COPY_STR) {
1603 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1605 s = savepvn(strbeg, i);
1608 RX_MATCH_COPIED_on(prog);
1611 prog->subbeg = strbeg;
1612 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1619 if (PL_reg_eval_set)
1620 restore_pos(aTHXo_ 0);
1625 - regtry - try match at specific point
1627 STATIC I32 /* 0 failure, 1 success */
1628 S_regtry(pTHX_ regexp *prog, char *startpos)
1636 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1639 PL_reg_eval_set = RS_init;
1641 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1642 (IV)(PL_stack_sp - PL_stack_base));
1644 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1645 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1646 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1648 /* Apparently this is not needed, judging by wantarray. */
1649 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1650 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1653 /* Make $_ available to executed code. */
1654 if (PL_reg_sv != DEFSV) {
1655 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1660 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1661 && (mg = mg_find(PL_reg_sv, 'g')))) {
1662 /* prepare for quick setting of pos */
1663 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1664 mg = mg_find(PL_reg_sv, 'g');
1668 PL_reg_oldpos = mg->mg_len;
1669 SAVEDESTRUCTOR_X(restore_pos, 0);
1672 Newz(22,PL_reg_curpm, 1, PMOP);
1673 PL_reg_curpm->op_pmregexp = prog;
1674 PL_reg_oldcurpm = PL_curpm;
1675 PL_curpm = PL_reg_curpm;
1676 if (RX_MATCH_COPIED(prog)) {
1677 /* Here is a serious problem: we cannot rewrite subbeg,
1678 since it may be needed if this match fails. Thus
1679 $` inside (?{}) could fail... */
1680 PL_reg_oldsaved = prog->subbeg;
1681 PL_reg_oldsavedlen = prog->sublen;
1682 RX_MATCH_COPIED_off(prog);
1685 PL_reg_oldsaved = Nullch;
1686 prog->subbeg = PL_bostr;
1687 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1689 prog->startp[0] = startpos - PL_bostr;
1690 PL_reginput = startpos;
1691 PL_regstartp = prog->startp;
1692 PL_regendp = prog->endp;
1693 PL_reglastparen = &prog->lastparen;
1694 prog->lastparen = 0;
1696 DEBUG_r(PL_reg_starttry = startpos);
1697 if (PL_reg_start_tmpl <= prog->nparens) {
1698 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1699 if(PL_reg_start_tmp)
1700 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1702 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1705 /* XXXX What this code is doing here?!!! There should be no need
1706 to do this again and again, PL_reglastparen should take care of
1710 if (prog->nparens) {
1711 for (i = prog->nparens; i >= 1; i--) {
1717 if (regmatch(prog->program + 1)) {
1718 prog->endp[0] = PL_reginput - PL_bostr;
1726 - regmatch - main matching routine
1728 * Conceptually the strategy is simple: check to see whether the current
1729 * node matches, call self recursively to see whether the rest matches,
1730 * and then act accordingly. In practice we make some effort to avoid
1731 * recursion, in particular by going through "ordinary" nodes (that don't
1732 * need to know whether the rest of the match failed) by a loop instead of
1735 /* [lwall] I've hoisted the register declarations to the outer block in order to
1736 * maybe save a little bit of pushing and popping on the stack. It also takes
1737 * advantage of machines that use a register save mask on subroutine entry.
1739 STATIC I32 /* 0 failure, 1 success */
1740 S_regmatch(pTHX_ regnode *prog)
1743 register regnode *scan; /* Current node. */
1744 regnode *next; /* Next node. */
1745 regnode *inner; /* Next node in internal branch. */
1746 register I32 nextchr; /* renamed nextchr - nextchar colides with
1747 function of same name */
1748 register I32 n; /* no or next */
1749 register I32 ln; /* len or last */
1750 register char *s; /* operand or save */
1751 register char *locinput = PL_reginput;
1752 register I32 c1, c2, paren; /* case fold search, parenth */
1753 int minmod = 0, sw = 0, logical = 0;
1758 /* Note that nextchr is a byte even in UTF */
1759 nextchr = UCHARAT(locinput);
1761 while (scan != NULL) {
1762 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1764 # define sayYES goto yes
1765 # define sayNO goto no
1766 # define sayYES_FINAL goto yes_final
1767 # define sayYES_LOUD goto yes_loud
1768 # define sayNO_FINAL goto no_final
1769 # define sayNO_SILENT goto do_no
1770 # define saySAME(x) if (x) goto yes; else goto no
1771 # define REPORT_CODE_OFF 24
1773 # define sayYES return 1
1774 # define sayNO return 0
1775 # define sayYES_FINAL return 1
1776 # define sayYES_LOUD return 1
1777 # define sayNO_FINAL return 0
1778 # define sayNO_SILENT return 0
1779 # define saySAME(x) return x
1782 SV *prop = sv_newmortal();
1783 int docolor = *PL_colors[0];
1784 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1785 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1786 /* The part of the string before starttry has one color
1787 (pref0_len chars), between starttry and current
1788 position another one (pref_len - pref0_len chars),
1789 after the current position the third one.
1790 We assume that pref0_len <= pref_len, otherwise we
1791 decrease pref0_len. */
1792 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1793 ? (5 + taill) - l : locinput - PL_bostr);
1794 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1796 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1797 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1798 ? (5 + taill) - pref_len : PL_regeol - locinput);
1801 if (pref0_len > pref_len)
1802 pref0_len = pref_len;
1803 regprop(prop, scan);
1804 PerlIO_printf(Perl_debug_log,
1805 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1806 (IV)(locinput - PL_bostr),
1807 PL_colors[4], pref0_len,
1808 locinput - pref_len, PL_colors[5],
1809 PL_colors[2], pref_len - pref0_len,
1810 locinput - pref_len + pref0_len, PL_colors[3],
1811 (docolor ? "" : "> <"),
1812 PL_colors[0], l, locinput, PL_colors[1],
1813 15 - l - pref_len + 1,
1815 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1819 next = scan + NEXT_OFF(scan);
1825 if (locinput == PL_bostr
1826 ? PL_regprev == '\n'
1828 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1830 /* regtill = regbol; */
1835 if (locinput == PL_bostr
1836 ? PL_regprev == '\n'
1837 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1843 if (locinput == PL_regbol && PL_regprev == '\n')
1847 if (locinput == PL_reg_ganch)
1857 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1862 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1864 if (PL_regeol - locinput > 1)
1868 if (PL_regeol != locinput)
1872 if (nextchr & 0x80) {
1873 locinput += PL_utf8skip[nextchr];
1874 if (locinput > PL_regeol)
1876 nextchr = UCHARAT(locinput);
1879 if (!nextchr && locinput >= PL_regeol)
1881 nextchr = UCHARAT(++locinput);
1884 if (!nextchr && locinput >= PL_regeol)
1886 nextchr = UCHARAT(++locinput);
1889 if (nextchr & 0x80) {
1890 locinput += PL_utf8skip[nextchr];
1891 if (locinput > PL_regeol)
1893 nextchr = UCHARAT(locinput);
1896 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1898 nextchr = UCHARAT(++locinput);
1901 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1903 nextchr = UCHARAT(++locinput);
1908 /* Inline the first character, for speed. */
1909 if (UCHARAT(s) != nextchr)
1911 if (PL_regeol - locinput < ln)
1913 if (ln > 1 && memNE(s, locinput, ln))
1916 nextchr = UCHARAT(locinput);
1919 PL_reg_flags |= RF_tainted;
1928 c1 = OP(scan) == EXACTF;
1932 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1933 toLOWER_utf8((U8*)l) :
1934 toLOWER_LC_utf8((U8*)l)))
1942 nextchr = UCHARAT(locinput);
1946 /* Inline the first character, for speed. */
1947 if (UCHARAT(s) != nextchr &&
1948 UCHARAT(s) != ((OP(scan) == EXACTF)
1949 ? PL_fold : PL_fold_locale)[nextchr])
1951 if (PL_regeol - locinput < ln)
1953 if (ln > 1 && (OP(scan) == EXACTF
1954 ? ibcmp(s, locinput, ln)
1955 : ibcmp_locale(s, locinput, ln)))
1958 nextchr = UCHARAT(locinput);
1961 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1963 if (locinput >= PL_regeol)
1965 locinput += PL_utf8skip[nextchr];
1966 nextchr = UCHARAT(locinput);
1970 nextchr = UCHARAT(locinput);
1971 if (!REGINCLASS(scan, nextchr))
1973 if (!nextchr && locinput >= PL_regeol)
1975 nextchr = UCHARAT(++locinput);
1978 PL_reg_flags |= RF_tainted;
1983 if (!(OP(scan) == ALNUM
1984 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1986 nextchr = UCHARAT(++locinput);
1989 PL_reg_flags |= RF_tainted;
1994 if (nextchr & 0x80) {
1995 if (!(OP(scan) == ALNUMUTF8
1996 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1997 : isALNUM_LC_utf8((U8*)locinput)))
2001 locinput += PL_utf8skip[nextchr];
2002 nextchr = UCHARAT(locinput);
2005 if (!(OP(scan) == ALNUMUTF8
2006 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2008 nextchr = UCHARAT(++locinput);
2011 PL_reg_flags |= RF_tainted;
2014 if (!nextchr && locinput >= PL_regeol)
2016 if (OP(scan) == NALNUM
2017 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2019 nextchr = UCHARAT(++locinput);
2022 PL_reg_flags |= RF_tainted;
2025 if (!nextchr && locinput >= PL_regeol)
2027 if (nextchr & 0x80) {
2028 if (OP(scan) == NALNUMUTF8
2029 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2030 : isALNUM_LC_utf8((U8*)locinput))
2034 locinput += PL_utf8skip[nextchr];
2035 nextchr = UCHARAT(locinput);
2038 if (OP(scan) == NALNUMUTF8
2039 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2041 nextchr = UCHARAT(++locinput);
2045 PL_reg_flags |= RF_tainted;
2049 /* was last char in word? */
2050 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2051 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2053 n = isALNUM(nextchr);
2056 ln = isALNUM_LC(ln);
2057 n = isALNUM_LC(nextchr);
2059 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2064 PL_reg_flags |= RF_tainted;
2068 /* was last char in word? */
2069 ln = (locinput != PL_regbol)
2070 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2071 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2072 ln = isALNUM_uni(ln);
2073 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2076 ln = isALNUM_LC_uni(ln);
2077 n = isALNUM_LC_utf8((U8*)locinput);
2079 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2083 PL_reg_flags |= RF_tainted;
2088 if (!(OP(scan) == SPACE
2089 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2091 nextchr = UCHARAT(++locinput);
2094 PL_reg_flags |= RF_tainted;
2099 if (nextchr & 0x80) {
2100 if (!(OP(scan) == SPACEUTF8
2101 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2102 : isSPACE_LC_utf8((U8*)locinput)))
2106 locinput += PL_utf8skip[nextchr];
2107 nextchr = UCHARAT(locinput);
2110 if (!(OP(scan) == SPACEUTF8
2111 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2113 nextchr = UCHARAT(++locinput);
2116 PL_reg_flags |= RF_tainted;
2119 if (!nextchr && locinput >= PL_regeol)
2121 if (OP(scan) == NSPACE
2122 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2124 nextchr = UCHARAT(++locinput);
2127 PL_reg_flags |= RF_tainted;
2130 if (!nextchr && locinput >= PL_regeol)
2132 if (nextchr & 0x80) {
2133 if (OP(scan) == NSPACEUTF8
2134 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2135 : isSPACE_LC_utf8((U8*)locinput))
2139 locinput += PL_utf8skip[nextchr];
2140 nextchr = UCHARAT(locinput);
2143 if (OP(scan) == NSPACEUTF8
2144 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2146 nextchr = UCHARAT(++locinput);
2149 PL_reg_flags |= RF_tainted;
2154 if (!(OP(scan) == DIGIT
2155 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2157 nextchr = UCHARAT(++locinput);
2160 PL_reg_flags |= RF_tainted;
2165 if (nextchr & 0x80) {
2166 if (!(OP(scan) == DIGITUTF8
2167 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2168 : isDIGIT_LC_utf8((U8*)locinput)))
2172 locinput += PL_utf8skip[nextchr];
2173 nextchr = UCHARAT(locinput);
2176 if (!(OP(scan) == DIGITUTF8
2177 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2179 nextchr = UCHARAT(++locinput);
2182 PL_reg_flags |= RF_tainted;
2185 if (!nextchr && locinput >= PL_regeol)
2187 if (OP(scan) == NDIGIT
2188 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2190 nextchr = UCHARAT(++locinput);
2193 PL_reg_flags |= RF_tainted;
2196 if (!nextchr && locinput >= PL_regeol)
2198 if (nextchr & 0x80) {
2199 if (OP(scan) == NDIGITUTF8
2200 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2201 : isDIGIT_LC_utf8((U8*)locinput))
2205 locinput += PL_utf8skip[nextchr];
2206 nextchr = UCHARAT(locinput);
2209 if (OP(scan) == NDIGITUTF8
2210 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2212 nextchr = UCHARAT(++locinput);
2215 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2217 locinput += PL_utf8skip[nextchr];
2218 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2219 locinput += UTF8SKIP(locinput);
2220 if (locinput > PL_regeol)
2222 nextchr = UCHARAT(locinput);
2225 PL_reg_flags |= RF_tainted;
2229 n = ARG(scan); /* which paren pair */
2230 ln = PL_regstartp[n];
2231 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2232 if (*PL_reglastparen < n || ln == -1)
2233 sayNO; /* Do not match unless seen CLOSEn. */
2234 if (ln == PL_regendp[n])
2238 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2240 char *e = PL_bostr + PL_regendp[n];
2242 * Note that we can't do the "other character" lookup trick as
2243 * in the 8-bit case (no pun intended) because in Unicode we
2244 * have to map both upper and title case to lower case.
2246 if (OP(scan) == REFF) {
2250 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2260 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2267 nextchr = UCHARAT(locinput);
2271 /* Inline the first character, for speed. */
2272 if (UCHARAT(s) != nextchr &&
2274 (UCHARAT(s) != ((OP(scan) == REFF
2275 ? PL_fold : PL_fold_locale)[nextchr]))))
2277 ln = PL_regendp[n] - ln;
2278 if (locinput + ln > PL_regeol)
2280 if (ln > 1 && (OP(scan) == REF
2281 ? memNE(s, locinput, ln)
2283 ? ibcmp(s, locinput, ln)
2284 : ibcmp_locale(s, locinput, ln))))
2287 nextchr = UCHARAT(locinput);
2298 OP_4tree *oop = PL_op;
2299 COP *ocurcop = PL_curcop;
2300 SV **ocurpad = PL_curpad;
2304 PL_op = (OP_4tree*)PL_regdata->data[n];
2305 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2306 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2307 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2309 CALLRUNOPS(aTHX); /* Scalar context. */
2315 PL_curpad = ocurpad;
2316 PL_curcop = ocurcop;
2318 if (logical == 2) { /* Postponed subexpression. */
2320 MAGIC *mg = Null(MAGIC*);
2322 CHECKPOINT cp, lastcp;
2324 if(SvROK(ret) || SvRMAGICAL(ret)) {
2325 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2328 mg = mg_find(sv, 'r');
2331 re = (regexp *)mg->mg_obj;
2332 (void)ReREFCNT_inc(re);
2336 char *t = SvPV(ret, len);
2338 char *oprecomp = PL_regprecomp;
2339 I32 osize = PL_regsize;
2340 I32 onpar = PL_regnpar;
2343 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2344 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2346 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2347 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2348 PL_regprecomp = oprecomp;
2353 PerlIO_printf(Perl_debug_log,
2354 "Entering embedded `%s%.60s%s%s'\n",
2358 (strlen(re->precomp) > 60 ? "..." : ""))
2361 state.prev = PL_reg_call_cc;
2362 state.cc = PL_regcc;
2363 state.re = PL_reg_re;
2367 cp = regcppush(0); /* Save *all* the positions. */
2370 state.ss = PL_savestack_ix;
2371 *PL_reglastparen = 0;
2372 PL_reg_call_cc = &state;
2373 PL_reginput = locinput;
2375 /* XXXX This is too dramatic a measure... */
2378 if (regmatch(re->program + 1)) {
2379 /* Even though we succeeded, we need to restore
2380 global variables, since we may be wrapped inside
2381 SUSPEND, thus the match may be not finished yet. */
2383 /* XXXX Do this only if SUSPENDed? */
2384 PL_reg_call_cc = state.prev;
2385 PL_regcc = state.cc;
2386 PL_reg_re = state.re;
2387 cache_re(PL_reg_re);
2389 /* XXXX This is too dramatic a measure... */
2392 /* These are needed even if not SUSPEND. */
2400 PL_reg_call_cc = state.prev;
2401 PL_regcc = state.cc;
2402 PL_reg_re = state.re;
2403 cache_re(PL_reg_re);
2405 /* XXXX This is too dramatic a measure... */
2414 sv_setsv(save_scalar(PL_replgv), ret);
2418 n = ARG(scan); /* which paren pair */
2419 PL_reg_start_tmp[n] = locinput;
2424 n = ARG(scan); /* which paren pair */
2425 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2426 PL_regendp[n] = locinput - PL_bostr;
2427 if (n > *PL_reglastparen)
2428 *PL_reglastparen = n;
2431 n = ARG(scan); /* which paren pair */
2432 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2435 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2437 next = NEXTOPER(NEXTOPER(scan));
2439 next = scan + ARG(scan);
2440 if (OP(next) == IFTHEN) /* Fake one. */
2441 next = NEXTOPER(NEXTOPER(next));
2445 logical = scan->flags;
2447 /*******************************************************************
2448 PL_regcc contains infoblock about the innermost (...)* loop, and
2449 a pointer to the next outer infoblock.
2451 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2453 1) After matching X, regnode for CURLYX is processed;
2455 2) This regnode creates infoblock on the stack, and calls
2456 regmatch() recursively with the starting point at WHILEM node;
2458 3) Each hit of WHILEM node tries to match A and Z (in the order
2459 depending on the current iteration, min/max of {min,max} and
2460 greediness). The information about where are nodes for "A"
2461 and "Z" is read from the infoblock, as is info on how many times "A"
2462 was already matched, and greediness.
2464 4) After A matches, the same WHILEM node is hit again.
2466 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2467 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2468 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2469 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2470 of the external loop.
2472 Currently present infoblocks form a tree with a stem formed by PL_curcc
2473 and whatever it mentions via ->next, and additional attached trees
2474 corresponding to temporarily unset infoblocks as in "5" above.
2476 In the following picture infoblocks for outer loop of
2477 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2478 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2479 infoblocks are drawn below the "reset" infoblock.
2481 In fact in the picture below we do not show failed matches for Z and T
2482 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2483 more obvious *why* one needs to *temporary* unset infoblocks.]
2485 Matched REx position InfoBlocks Comment
2489 Y A)*?Z)*?T x <- O <- I
2490 YA )*?Z)*?T x <- O <- I
2491 YA A)*?Z)*?T x <- O <- I
2492 YAA )*?Z)*?T x <- O <- I
2493 YAA Z)*?T x <- O # Temporary unset I
2496 YAAZ Y(A)*?Z)*?T x <- O
2499 YAAZY (A)*?Z)*?T x <- O
2502 YAAZY A)*?Z)*?T x <- O <- I
2505 YAAZYA )*?Z)*?T x <- O <- I
2508 YAAZYA Z)*?T x <- O # Temporary unset I
2514 YAAZYAZ T x # Temporary unset O
2521 *******************************************************************/
2524 CHECKPOINT cp = PL_savestack_ix;
2526 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2528 cc.oldcc = PL_regcc;
2530 cc.parenfloor = *PL_reglastparen;
2532 cc.min = ARG1(scan);
2533 cc.max = ARG2(scan);
2534 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2538 PL_reginput = locinput;
2539 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2541 PL_regcc = cc.oldcc;
2547 * This is really hard to understand, because after we match
2548 * what we're trying to match, we must make sure the rest of
2549 * the REx is going to match for sure, and to do that we have
2550 * to go back UP the parse tree by recursing ever deeper. And
2551 * if it fails, we have to reset our parent's current state
2552 * that we can try again after backing off.
2555 CHECKPOINT cp, lastcp;
2556 CURCUR* cc = PL_regcc;
2557 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2559 n = cc->cur + 1; /* how many we know we matched */
2560 PL_reginput = locinput;
2563 PerlIO_printf(Perl_debug_log,
2564 "%*s %ld out of %ld..%ld cc=%lx\n",
2565 REPORT_CODE_OFF+PL_regindent*2, "",
2566 (long)n, (long)cc->min,
2567 (long)cc->max, (long)cc)
2570 /* If degenerate scan matches "", assume scan done. */
2572 if (locinput == cc->lastloc && n >= cc->min) {
2573 PL_regcc = cc->oldcc;
2577 PerlIO_printf(Perl_debug_log,
2578 "%*s empty match detected, try continuation...\n",
2579 REPORT_CODE_OFF+PL_regindent*2, "")
2581 if (regmatch(cc->next))
2589 /* First just match a string of min scans. */
2593 cc->lastloc = locinput;
2594 if (regmatch(cc->scan))
2597 cc->lastloc = lastloc;
2602 /* Check whether we already were at this position.
2603 Postpone detection until we know the match is not
2604 *that* much linear. */
2605 if (!PL_reg_maxiter) {
2606 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2607 PL_reg_leftiter = PL_reg_maxiter;
2609 if (PL_reg_leftiter-- == 0) {
2610 I32 size = (PL_reg_maxiter + 7)/8;
2611 if (PL_reg_poscache) {
2612 if (PL_reg_poscache_size < size) {
2613 Renew(PL_reg_poscache, size, char);
2614 PL_reg_poscache_size = size;
2616 Zero(PL_reg_poscache, size, char);
2619 PL_reg_poscache_size = size;
2620 Newz(29, PL_reg_poscache, size, char);
2623 PerlIO_printf(Perl_debug_log,
2624 "%sDetected a super-linear match, switching on caching%s...\n",
2625 PL_colors[4], PL_colors[5])
2628 if (PL_reg_leftiter < 0) {
2629 I32 o = locinput - PL_bostr, b;
2631 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2634 if (PL_reg_poscache[o] & (1<<b)) {
2636 PerlIO_printf(Perl_debug_log,
2637 "%*s already tried at this position...\n",
2638 REPORT_CODE_OFF+PL_regindent*2, "")
2642 PL_reg_poscache[o] |= (1<<b);
2646 /* Prefer next over scan for minimal matching. */
2649 PL_regcc = cc->oldcc;
2652 cp = regcppush(cc->parenfloor);
2654 if (regmatch(cc->next)) {
2656 sayYES; /* All done. */
2664 if (n >= cc->max) { /* Maximum greed exceeded? */
2665 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2666 && !(PL_reg_flags & RF_warned)) {
2667 PL_reg_flags |= RF_warned;
2668 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2669 "Complex regular subexpression recursion",
2676 PerlIO_printf(Perl_debug_log,
2677 "%*s trying longer...\n",
2678 REPORT_CODE_OFF+PL_regindent*2, "")
2680 /* Try scanning more and see if it helps. */
2681 PL_reginput = locinput;
2683 cc->lastloc = locinput;
2684 cp = regcppush(cc->parenfloor);
2686 if (regmatch(cc->scan)) {
2693 cc->lastloc = lastloc;
2697 /* Prefer scan over next for maximal matching. */
2699 if (n < cc->max) { /* More greed allowed? */
2700 cp = regcppush(cc->parenfloor);
2702 cc->lastloc = locinput;
2704 if (regmatch(cc->scan)) {
2709 regcppop(); /* Restore some previous $<digit>s? */
2710 PL_reginput = locinput;
2712 PerlIO_printf(Perl_debug_log,
2713 "%*s failed, try continuation...\n",
2714 REPORT_CODE_OFF+PL_regindent*2, "")
2717 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2718 && !(PL_reg_flags & RF_warned)) {
2719 PL_reg_flags |= RF_warned;
2720 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2721 "Complex regular subexpression recursion",
2725 /* Failed deeper matches of scan, so see if this one works. */
2726 PL_regcc = cc->oldcc;
2729 if (regmatch(cc->next))
2735 cc->lastloc = lastloc;
2740 next = scan + ARG(scan);
2743 inner = NEXTOPER(NEXTOPER(scan));
2746 inner = NEXTOPER(scan);
2751 if (OP(next) != c1) /* No choice. */
2752 next = inner; /* Avoid recursion. */
2754 int lastparen = *PL_reglastparen;
2758 PL_reginput = locinput;
2759 if (regmatch(inner))
2762 for (n = *PL_reglastparen; n > lastparen; n--)
2764 *PL_reglastparen = n;
2767 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2771 inner = NEXTOPER(scan);
2772 if (c1 == BRANCHJ) {
2773 inner = NEXTOPER(inner);
2775 } while (scan != NULL && OP(scan) == c1);
2789 /* We suppose that the next guy does not need
2790 backtracking: in particular, it is of constant length,
2791 and has no parenths to influence future backrefs. */
2792 ln = ARG1(scan); /* min to match */
2793 n = ARG2(scan); /* max to match */
2794 paren = scan->flags;
2796 if (paren > PL_regsize)
2798 if (paren > *PL_reglastparen)
2799 *PL_reglastparen = paren;
2801 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2803 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2804 PL_reginput = locinput;
2807 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2809 if (ln && l == 0 && n >= ln
2810 /* In fact, this is tricky. If paren, then the
2811 fact that we did/didnot match may influence
2812 future execution. */
2813 && !(paren && ln == 0))
2815 locinput = PL_reginput;
2816 if (PL_regkind[(U8)OP(next)] == EXACT) {
2817 c1 = (U8)*STRING(next);
2818 if (OP(next) == EXACTF)
2820 else if (OP(next) == EXACTFL)
2821 c2 = PL_fold_locale[c1];
2828 /* This may be improved if l == 0. */
2829 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2830 /* If it could work, try it. */
2832 UCHARAT(PL_reginput) == c1 ||
2833 UCHARAT(PL_reginput) == c2)
2837 PL_regstartp[paren] =
2838 HOPc(PL_reginput, -l) - PL_bostr;
2839 PL_regendp[paren] = PL_reginput - PL_bostr;
2842 PL_regendp[paren] = -1;
2848 /* Couldn't or didn't -- move forward. */
2849 PL_reginput = locinput;
2850 if (regrepeat_hard(scan, 1, &l)) {
2852 locinput = PL_reginput;
2859 n = regrepeat_hard(scan, n, &l);
2860 if (n != 0 && l == 0
2861 /* In fact, this is tricky. If paren, then the
2862 fact that we did/didnot match may influence
2863 future execution. */
2864 && !(paren && ln == 0))
2866 locinput = PL_reginput;
2868 PerlIO_printf(Perl_debug_log,
2869 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2870 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2874 if (PL_regkind[(U8)OP(next)] == EXACT) {
2875 c1 = (U8)*STRING(next);
2876 if (OP(next) == EXACTF)
2878 else if (OP(next) == EXACTFL)
2879 c2 = PL_fold_locale[c1];
2888 /* If it could work, try it. */
2890 UCHARAT(PL_reginput) == c1 ||
2891 UCHARAT(PL_reginput) == c2)
2894 PerlIO_printf(Perl_debug_log,
2895 "%*s trying tail with n=%"IVdf"...\n",
2896 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2900 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2901 PL_regendp[paren] = PL_reginput - PL_bostr;
2904 PL_regendp[paren] = -1;
2910 /* Couldn't or didn't -- back up. */
2912 locinput = HOPc(locinput, -l);
2913 PL_reginput = locinput;
2920 paren = scan->flags; /* Which paren to set */
2921 if (paren > PL_regsize)
2923 if (paren > *PL_reglastparen)
2924 *PL_reglastparen = paren;
2925 ln = ARG1(scan); /* min to match */
2926 n = ARG2(scan); /* max to match */
2927 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2931 ln = ARG1(scan); /* min to match */
2932 n = ARG2(scan); /* max to match */
2933 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2938 scan = NEXTOPER(scan);
2944 scan = NEXTOPER(scan);
2948 * Lookahead to avoid useless match attempts
2949 * when we know what character comes next.
2951 if (PL_regkind[(U8)OP(next)] == EXACT) {
2952 c1 = (U8)*STRING(next);
2953 if (OP(next) == EXACTF)
2955 else if (OP(next) == EXACTFL)
2956 c2 = PL_fold_locale[c1];
2962 PL_reginput = locinput;
2966 if (ln && regrepeat(scan, ln) < ln)
2968 locinput = PL_reginput;
2971 char *e = locinput + n - ln; /* Should not check after this */
2972 char *old = locinput;
2974 if (e >= PL_regeol || (n == REG_INFTY))
2977 /* Find place 'next' could work */
2979 while (locinput <= e && *locinput != c1)
2982 while (locinput <= e
2989 /* PL_reginput == old now */
2990 if (locinput != old) {
2991 ln = 1; /* Did some */
2992 if (regrepeat(scan, locinput - old) <
2996 /* PL_reginput == locinput now */
2999 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
3000 PL_regendp[paren] = locinput - PL_bostr;
3003 PL_regendp[paren] = -1;
3007 PL_reginput = locinput; /* Could be reset... */
3009 /* Couldn't or didn't -- move forward. */
3014 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3015 /* If it could work, try it. */
3017 UCHARAT(PL_reginput) == c1 ||
3018 UCHARAT(PL_reginput) == c2)
3022 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3023 PL_regendp[paren] = PL_reginput - PL_bostr;
3026 PL_regendp[paren] = -1;
3032 /* Couldn't or didn't -- move forward. */
3033 PL_reginput = locinput;
3034 if (regrepeat(scan, 1)) {
3036 locinput = PL_reginput;
3044 n = regrepeat(scan, n);
3045 locinput = PL_reginput;
3046 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3047 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3048 ln = n; /* why back off? */
3049 /* ...because $ and \Z can match before *and* after
3050 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3051 We should back off by one in this case. */
3052 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3058 /* If it could work, try it. */
3060 UCHARAT(PL_reginput) == c1 ||
3061 UCHARAT(PL_reginput) == c2)
3065 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3066 PL_regendp[paren] = PL_reginput - PL_bostr;
3069 PL_regendp[paren] = -1;
3075 /* Couldn't or didn't -- back up. */
3077 PL_reginput = locinput = HOPc(locinput, -1);
3082 /* If it could work, try it. */
3084 UCHARAT(PL_reginput) == c1 ||
3085 UCHARAT(PL_reginput) == c2)
3091 /* Couldn't or didn't -- back up. */
3093 PL_reginput = locinput = HOPc(locinput, -1);
3100 if (PL_reg_call_cc) {
3101 re_cc_state *cur_call_cc = PL_reg_call_cc;
3102 CURCUR *cctmp = PL_regcc;
3103 regexp *re = PL_reg_re;
3104 CHECKPOINT cp, lastcp;
3106 cp = regcppush(0); /* Save *all* the positions. */
3108 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3110 PL_reginput = locinput; /* Make position available to
3112 cache_re(PL_reg_call_cc->re);
3113 PL_regcc = PL_reg_call_cc->cc;
3114 PL_reg_call_cc = PL_reg_call_cc->prev;
3115 if (regmatch(cur_call_cc->node)) {
3116 PL_reg_call_cc = cur_call_cc;
3122 PL_reg_call_cc = cur_call_cc;
3128 PerlIO_printf(Perl_debug_log,
3129 "%*s continuation failed...\n",
3130 REPORT_CODE_OFF+PL_regindent*2, "")
3134 if (locinput < PL_regtill) {
3135 DEBUG_r(PerlIO_printf(Perl_debug_log,
3136 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3138 (long)(locinput - PL_reg_starttry),
3139 (long)(PL_regtill - PL_reg_starttry),
3141 sayNO_FINAL; /* Cannot match: too short. */
3143 PL_reginput = locinput; /* put where regtry can find it */
3144 sayYES_FINAL; /* Success! */
3146 PL_reginput = locinput; /* put where regtry can find it */
3147 sayYES_LOUD; /* Success! */
3150 PL_reginput = locinput;
3155 if (UTF) { /* XXXX This is absolutely
3156 broken, we read before
3158 s = HOPMAYBEc(locinput, -scan->flags);
3164 if (locinput < PL_bostr + scan->flags)
3166 PL_reginput = locinput - scan->flags;
3171 PL_reginput = locinput;
3176 if (UTF) { /* XXXX This is absolutely
3177 broken, we read before
3179 s = HOPMAYBEc(locinput, -scan->flags);
3180 if (!s || s < PL_bostr)
3185 if (locinput < PL_bostr + scan->flags)
3187 PL_reginput = locinput - scan->flags;
3192 PL_reginput = locinput;
3195 inner = NEXTOPER(NEXTOPER(scan));
3196 if (regmatch(inner) != n) {
3211 if (OP(scan) == SUSPEND) {
3212 locinput = PL_reginput;
3213 nextchr = UCHARAT(locinput);
3218 next = scan + ARG(scan);
3223 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3224 PTR2UV(scan), OP(scan));
3225 Perl_croak(aTHX_ "regexp memory corruption");
3231 * We get here only if there's trouble -- normally "case END" is
3232 * the terminating point.
3234 Perl_croak(aTHX_ "corrupted regexp pointers");
3240 PerlIO_printf(Perl_debug_log,
3241 "%*s %scould match...%s\n",
3242 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3246 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3247 PL_colors[4],PL_colors[5]));
3256 PerlIO_printf(Perl_debug_log,
3257 "%*s %sfailed...%s\n",
3258 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3270 - regrepeat - repeatedly match something simple, report how many
3273 * [This routine now assumes that it will only match on things of length 1.
3274 * That was true before, but now we assume scan - reginput is the count,
3275 * rather than incrementing count on every character. [Er, except utf8.]]
3278 S_regrepeat(pTHX_ regnode *p, I32 max)
3281 register char *scan;
3283 register char *loceol = PL_regeol;
3284 register I32 hardcount = 0;
3287 if (max != REG_INFTY && max < loceol - scan)
3288 loceol = scan + max;
3291 while (scan < loceol && *scan != '\n')
3299 while (scan < loceol && *scan != '\n') {
3300 scan += UTF8SKIP(scan);
3306 while (scan < loceol) {
3307 scan += UTF8SKIP(scan);
3311 case EXACT: /* length of string is 1 */
3313 while (scan < loceol && UCHARAT(scan) == c)
3316 case EXACTF: /* length of string is 1 */
3318 while (scan < loceol &&
3319 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3322 case EXACTFL: /* length of string is 1 */
3323 PL_reg_flags |= RF_tainted;
3325 while (scan < loceol &&
3326 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3331 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3332 scan += UTF8SKIP(scan);
3337 while (scan < loceol && REGINCLASS(p, *scan))
3341 while (scan < loceol && isALNUM(*scan))
3346 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3347 scan += UTF8SKIP(scan);
3352 PL_reg_flags |= RF_tainted;
3353 while (scan < loceol && isALNUM_LC(*scan))
3357 PL_reg_flags |= RF_tainted;
3359 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3360 scan += UTF8SKIP(scan);
3366 while (scan < loceol && !isALNUM(*scan))
3371 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3372 scan += UTF8SKIP(scan);
3377 PL_reg_flags |= RF_tainted;
3378 while (scan < loceol && !isALNUM_LC(*scan))
3382 PL_reg_flags |= RF_tainted;
3384 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3385 scan += UTF8SKIP(scan);
3390 while (scan < loceol && isSPACE(*scan))
3395 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3396 scan += UTF8SKIP(scan);
3401 PL_reg_flags |= RF_tainted;
3402 while (scan < loceol && isSPACE_LC(*scan))
3406 PL_reg_flags |= RF_tainted;
3408 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3409 scan += UTF8SKIP(scan);
3414 while (scan < loceol && !isSPACE(*scan))
3419 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3420 scan += UTF8SKIP(scan);
3425 PL_reg_flags |= RF_tainted;
3426 while (scan < loceol && !isSPACE_LC(*scan))
3430 PL_reg_flags |= RF_tainted;
3432 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3433 scan += UTF8SKIP(scan);
3438 while (scan < loceol && isDIGIT(*scan))
3443 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3444 scan += UTF8SKIP(scan);
3450 while (scan < loceol && !isDIGIT(*scan))
3455 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3456 scan += UTF8SKIP(scan);
3460 default: /* Called on something of 0 width. */
3461 break; /* So match right here or not at all. */
3467 c = scan - PL_reginput;
3472 SV *prop = sv_newmortal();
3475 PerlIO_printf(Perl_debug_log,
3476 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3477 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3484 - regrepeat_hard - repeatedly match something, report total lenth and length
3486 * The repeater is supposed to have constant length.
3490 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3493 register char *scan;
3494 register char *start;
3495 register char *loceol = PL_regeol;
3497 I32 count = 0, res = 1;
3502 start = PL_reginput;
3504 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3507 while (start < PL_reginput) {
3509 start += UTF8SKIP(start);
3520 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3522 *lp = l = PL_reginput - start;
3523 if (max != REG_INFTY && l*max < loceol - scan)
3524 loceol = scan + l*max;
3537 - reginclass - determine if a character falls into a character class
3541 S_reginclass(pTHX_ register regnode *p, register I32 c)
3544 char flags = ANYOF_FLAGS(p);
3548 if (ANYOF_BITMAP_TEST(p, c))
3550 else if (flags & ANYOF_FOLD) {
3552 if (flags & ANYOF_LOCALE) {
3553 PL_reg_flags |= RF_tainted;
3554 cf = PL_fold_locale[c];
3558 if (ANYOF_BITMAP_TEST(p, cf))
3562 if (!match && (flags & ANYOF_CLASS)) {
3563 PL_reg_flags |= RF_tainted;
3565 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3566 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3567 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3568 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3569 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3570 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3571 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3572 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3573 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3574 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3575 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3576 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3577 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3578 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3579 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3580 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3581 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3582 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3583 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3584 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3585 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3586 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3587 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3588 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3589 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3590 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3591 ) /* How's that for a conditional? */
3597 return (flags & ANYOF_INVERT) ? !match : match;
3601 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3604 char flags = ARG1(f);
3606 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3608 if (swash_fetch(sv, p))
3610 else if (flags & ANYOF_FOLD) {
3611 U8 tmpbuf[UTF8_MAXLEN];
3612 if (flags & ANYOF_LOCALE) {
3613 PL_reg_flags |= RF_tainted;
3614 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3617 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3618 if (swash_fetch(sv, tmpbuf))
3622 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3624 return (flags & ANYOF_INVERT) ? !match : match;
3628 S_reghop(pTHX_ U8 *s, I32 off)
3632 while (off-- && s < (U8*)PL_regeol)
3637 if (s > (U8*)PL_bostr) {
3640 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3642 } /* XXX could check well-formedness here */
3650 S_reghopmaybe(pTHX_ U8* s, I32 off)
3654 while (off-- && s < (U8*)PL_regeol)
3661 if (s > (U8*)PL_bostr) {
3664 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3666 } /* XXX could check well-formedness here */
3682 restore_pos(pTHXo_ void *arg)
3685 if (PL_reg_eval_set) {
3686 if (PL_reg_oldsaved) {
3687 PL_reg_re->subbeg = PL_reg_oldsaved;
3688 PL_reg_re->sublen = PL_reg_oldsavedlen;
3689 RX_MATCH_COPIED_on(PL_reg_re);
3691 PL_reg_magic->mg_len = PL_reg_oldpos;
3692 PL_reg_eval_set = 0;
3693 PL_curpm = PL_reg_oldcurpm;