5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
224 #define TRYPAREN(paren, n, input) { \
227 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
228 PL_regendp[paren] = input - PL_bostr; \
231 PL_regendp[paren] = -1; \
233 if (regmatch(next)) \
236 PL_regendp[paren] = -1; \
241 * pregexec and friends
245 - pregexec - match a regexp against a string
248 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
249 char *strbeg, I32 minend, SV *screamer, U32 nosave)
250 /* strend: pointer to null at end of string */
251 /* strbeg: real beginning of string */
252 /* minend: end of match must be >=minend after stringarg. */
253 /* nosave: For optimizations. */
256 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
257 nosave ? 0 : REXEC_COPY_STR);
261 S_cache_re(pTHX_ regexp *prog)
264 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
266 PL_regprogram = prog->program;
268 PL_regnpar = prog->nparens;
269 PL_regdata = prog->data;
274 * Need to implement the following flags for reg_anch:
276 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
278 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
279 * INTUIT_AUTORITATIVE_ML
280 * INTUIT_ONCE_NOML - Intuit can match in one location only.
283 * Another flag for this function: SECOND_TIME (so that float substrs
284 * with giant delta may be not rechecked).
287 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
289 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
290 Otherwise, only SvCUR(sv) is used to get strbeg. */
292 /* XXXX We assume that strpos is strbeg unless sv. */
294 /* XXXX Some places assume that there is a fixed substring.
295 An update may be needed if optimizer marks as "INTUITable"
296 RExen without fixed substrings. Similarly, it is assumed that
297 lengths of all the strings are no more than minlen, thus they
298 cannot come from lookahead.
299 (Or minlen should take into account lookahead.) */
301 /* A failure to find a constant substring means that there is no need to make
302 an expensive call to REx engine, thus we celebrate a failure. Similarly,
303 finding a substring too deep into the string means that less calls to
304 regtry() should be needed.
306 REx compiler's optimizer found 4 possible hints:
307 a) Anchored substring;
309 c) Whether we are anchored (beginning-of-line or \G);
310 d) First node (of those at offset 0) which may distingush positions;
311 We use a)b)d) and multiline-part of c), and try to find a position in the
312 string which does not contradict any of them.
315 /* Most of decisions we do here should have been done at compile time.
316 The nodes of the REx which we used for the search should have been
317 deleted from the finite automaton. */
320 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
321 char *strend, U32 flags, re_scream_pos_data *data)
323 register I32 start_shift;
324 /* Should be nonnegative! */
325 register I32 end_shift;
332 register char *other_last = Nullch; /* other substr checked before this */
333 char *check_at; /* check substr found at this pos */
335 char *i_strpos = strpos;
338 DEBUG_r( if (!PL_colorset) reginitcolors() );
339 DEBUG_r(PerlIO_printf(Perl_debug_log,
340 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
341 PL_colors[4],PL_colors[5],PL_colors[0],
344 (strlen(prog->precomp) > 60 ? "..." : ""),
346 (int)(strend - strpos > 60 ? 60 : strend - strpos),
347 strpos, PL_colors[1],
348 (strend - strpos > 60 ? "..." : ""))
351 if (prog->minlen > strend - strpos) {
352 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
355 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
356 check = prog->check_substr;
357 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
358 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
359 || ( (prog->reganch & ROPT_ANCH_BOL)
360 && !PL_multiline ) ); /* Check after \n? */
363 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
364 /* SvCUR is not set on references: SvRV and SvPVX overlap */
366 && (strpos != strbeg)) {
367 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
370 if (prog->check_offset_min == prog->check_offset_max) {
371 /* Substring at constant offset from beg-of-str... */
374 PL_regeol = strend; /* Used in HOP() */
375 s = HOPc(strpos, prog->check_offset_min);
377 slen = SvCUR(check); /* >= 1 */
379 if ( strend - s > slen || strend - s < slen - 1
380 || (strend - s == slen && strend[-1] != '\n')) {
381 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
384 /* Now should match s[0..slen-2] */
386 if (slen && (*SvPVX(check) != *s
388 && memNE(SvPVX(check), s, slen)))) {
390 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
394 else if (*SvPVX(check) != *s
395 || ((slen = SvCUR(check)) > 1
396 && memNE(SvPVX(check), s, slen)))
398 goto success_at_start;
401 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
403 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
404 end_shift = prog->minlen - start_shift -
405 CHR_SVLEN(check) + (SvTAIL(check) != 0);
407 I32 end = prog->check_offset_max + CHR_SVLEN(check)
408 - (SvTAIL(check) != 0);
409 I32 eshift = strend - s - end;
411 if (end_shift < eshift)
415 else { /* Can match at random position */
418 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
419 /* Should be nonnegative! */
420 end_shift = prog->minlen - start_shift -
421 CHR_SVLEN(check) + (SvTAIL(check) != 0);
424 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
426 Perl_croak(aTHX_ "panic: end_shift");
430 /* Find a possible match in the region s..strend by looking for
431 the "check" substring in the region corrected by start/end_shift. */
432 if (flags & REXEC_SCREAM) {
433 I32 p = -1; /* Internal iterator of scream. */
434 I32 *pp = data ? data->scream_pos : &p;
436 if (PL_screamfirst[BmRARE(check)] >= 0
437 || ( BmRARE(check) == '\n'
438 && (BmPREVIOUS(check) == SvCUR(check) - 1)
440 s = screaminstr(sv, check,
441 start_shift + (s - strbeg), end_shift, pp, 0);
445 *data->scream_olds = s;
448 s = fbm_instr((unsigned char*)s + start_shift,
449 (unsigned char*)strend - end_shift,
450 check, PL_multiline ? FBMrf_MULTILINE : 0);
452 /* Update the count-of-usability, remove useless subpatterns,
455 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
456 (s ? "Found" : "Did not find"),
457 ((check == prog->anchored_substr) ? "anchored" : "floating"),
459 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
461 PL_colors[1], (SvTAIL(check) ? "$" : ""),
462 (s ? " at offset " : "...\n") ) );
469 /* Finish the diagnostic message */
470 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
472 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
473 Start with the other substr.
474 XXXX no SCREAM optimization yet - and a very coarse implementation
475 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
476 *always* match. Probably should be marked during compile...
477 Probably it is right to do no SCREAM here...
480 if (prog->float_substr && prog->anchored_substr) {
481 /* Take into account the "other" substring. */
482 /* XXXX May be hopelessly wrong for UTF... */
485 if (check == prog->float_substr) {
488 char *last = s - start_shift, *last1, *last2;
492 t = s - prog->check_offset_max;
493 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
494 && (!(prog->reganch & ROPT_UTF8)
495 || (PL_bostr = strpos, /* Used in regcopmaybe() */
496 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
501 t += prog->anchored_offset;
502 if (t < other_last) /* These positions already checked */
505 last2 = last1 = strend - prog->minlen;
508 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
509 /* On end-of-str: see comment below. */
510 s = fbm_instr((unsigned char*)t,
511 (unsigned char*)last1 + prog->anchored_offset
512 + SvCUR(prog->anchored_substr)
513 - (SvTAIL(prog->anchored_substr)!=0),
514 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
515 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
516 (s ? "Found" : "Contradicts"),
518 (int)(SvCUR(prog->anchored_substr)
519 - (SvTAIL(prog->anchored_substr)!=0)),
520 SvPVX(prog->anchored_substr),
521 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
523 if (last1 >= last2) {
524 DEBUG_r(PerlIO_printf(Perl_debug_log,
525 ", giving up...\n"));
528 DEBUG_r(PerlIO_printf(Perl_debug_log,
529 ", trying floating at offset %ld...\n",
530 (long)(s1 + 1 - i_strpos)));
531 PL_regeol = strend; /* Used in HOP() */
532 other_last = last1 + prog->anchored_offset + 1;
537 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
538 (long)(s - i_strpos)));
539 t = s - prog->anchored_offset;
548 else { /* Take into account the floating substring. */
553 last1 = last = strend - prog->minlen + prog->float_min_offset;
554 if (last - t > prog->float_max_offset)
555 last = t + prog->float_max_offset;
556 s = t + prog->float_min_offset;
559 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
560 /* fbm_instr() takes into account exact value of end-of-str
561 if the check is SvTAIL(ed). Since false positives are OK,
562 and end-of-str is not later than strend we are OK. */
563 s = fbm_instr((unsigned char*)s,
564 (unsigned char*)last + SvCUR(prog->float_substr)
565 - (SvTAIL(prog->float_substr)!=0),
566 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
567 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
568 (s ? "Found" : "Contradicts"),
570 (int)(SvCUR(prog->float_substr)
571 - (SvTAIL(prog->float_substr)!=0)),
572 SvPVX(prog->float_substr),
573 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
576 DEBUG_r(PerlIO_printf(Perl_debug_log,
577 ", giving up...\n"));
580 DEBUG_r(PerlIO_printf(Perl_debug_log,
581 ", trying anchored starting at offset %ld...\n",
582 (long)(s1 + 1 - i_strpos)));
583 other_last = last + 1;
584 PL_regeol = strend; /* Used in HOP() */
589 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
590 (long)(s - i_strpos)));
600 t = s - prog->check_offset_max;
602 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
603 && (!(prog->reganch & ROPT_UTF8)
604 || (PL_bostr = strpos, /* Used in regcopmaybe() */
605 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
608 /* Fixed substring is found far enough so that the match
609 cannot start at strpos. */
611 if (ml_anch && t[-1] != '\n') {
612 /* Eventually fbm_*() should handle this, but often
613 anchored_offset is not 0, so this check will not be wasted. */
614 /* XXXX In the code below we prefer to look for "^" even in
615 presence of anchored substrings. And we search even
616 beyond the found float position. These pessimizations
617 are historical artefacts only. */
619 while (t < strend - prog->minlen) {
621 if (t < check_at - prog->check_offset_min) {
622 if (prog->anchored_substr) {
623 /* Since we moved from the found position,
624 we definitely contradict the found anchored
625 substr. Due to the above check we do not
626 contradict "check" substr.
627 Thus we can arrive here only if check substr
628 is float. Redo checking for "other"=="fixed".
631 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
632 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
633 goto do_other_anchored;
635 /* We don't contradict the found floating substring. */
636 /* XXXX Why not check for STCLASS? */
638 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
639 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
642 /* Position contradicts check-string */
643 /* XXXX probably better to look for check-string
644 than for "\n", so one should lower the limit for t? */
645 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
646 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
647 other_last = strpos = s = t + 1;
652 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
653 PL_colors[0],PL_colors[1]));
657 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
658 PL_colors[0],PL_colors[1]));
662 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
666 /* The found string does not prohibit matching at strpos,
667 - no optimization of calling REx engine can be performed,
668 unless it was an MBOL and we are not after MBOL,
669 or a future STCLASS check will fail this. */
671 /* Even in this situation we may use MBOL flag if strpos is offset
672 wrt the start of the string. */
673 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
674 && (strpos != strbeg) && strpos[-1] != '\n'
675 /* May be due to an implicit anchor of m{.*foo} */
676 && !(prog->reganch & ROPT_IMPLICIT))
681 DEBUG_r( if (ml_anch)
682 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
683 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
686 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
687 && prog->check_substr /* Could be deleted already */
688 && --BmUSEFUL(prog->check_substr) < 0
689 && prog->check_substr == prog->float_substr)
691 /* If flags & SOMETHING - do not do it many times on the same match */
692 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
693 SvREFCNT_dec(prog->check_substr);
694 prog->check_substr = Nullsv; /* disable */
695 prog->float_substr = Nullsv; /* clear */
696 check = Nullsv; /* abort */
698 /* XXXX This is a remnant of the old implementation. It
699 looks wasteful, since now INTUIT can use many
701 prog->reganch &= ~RE_USE_INTUIT;
708 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
709 if (prog->regstclass) {
710 /* minlen == 0 is possible if regstclass is \b or \B,
711 and the fixed substr is ''$.
712 Since minlen is already taken into account, s+1 is before strend;
713 accidentally, minlen >= 1 guaranties no false positives at s + 1
714 even for \b or \B. But (minlen? 1 : 0) below assumes that
715 regstclass does not come from lookahead... */
716 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
717 This leaves EXACTF only, which is dealt with in find_byclass(). */
718 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
719 ? STR_LEN(prog->regstclass)
721 char *endpos = (prog->anchored_substr || ml_anch)
722 ? s + (prog->minlen? cl_l : 0)
723 : (prog->float_substr ? check_at - start_shift + cl_l
725 char *startpos = strbeg;
728 if (prog->reganch & ROPT_UTF8) {
729 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
732 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
737 if (endpos == strend) {
738 DEBUG_r( PerlIO_printf(Perl_debug_log,
739 "Could not match STCLASS...\n") );
742 DEBUG_r( PerlIO_printf(Perl_debug_log,
743 "This position contradicts STCLASS...\n") );
744 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
746 /* Contradict one of substrings */
747 if (prog->anchored_substr) {
748 if (prog->anchored_substr == check) {
749 DEBUG_r( what = "anchored" );
751 PL_regeol = strend; /* Used in HOP() */
753 if (s + start_shift + end_shift > strend) {
754 /* XXXX Should be taken into account earlier? */
755 DEBUG_r( PerlIO_printf(Perl_debug_log,
756 "Could not match STCLASS...\n") );
761 DEBUG_r( PerlIO_printf(Perl_debug_log,
762 "Looking for %s substr starting at offset %ld...\n",
763 what, (long)(s + start_shift - i_strpos)) );
766 /* Have both, check_string is floating */
767 if (t + start_shift >= check_at) /* Contradicts floating=check */
768 goto retry_floating_check;
769 /* Recheck anchored substring, but not floating... */
773 DEBUG_r( PerlIO_printf(Perl_debug_log,
774 "Looking for anchored substr starting at offset %ld...\n",
775 (long)(other_last - i_strpos)) );
776 goto do_other_anchored;
778 /* Another way we could have checked stclass at the
779 current position only: */
784 DEBUG_r( PerlIO_printf(Perl_debug_log,
785 "Looking for /%s^%s/m starting at offset %ld...\n",
786 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
789 if (!prog->float_substr) /* Could have been deleted */
791 /* Check is floating subtring. */
792 retry_floating_check:
793 t = check_at - start_shift;
794 DEBUG_r( what = "floating" );
795 goto hop_and_restart;
798 PerlIO_printf(Perl_debug_log,
799 "By STCLASS: moving %ld --> %ld\n",
800 (long)(t - i_strpos), (long)(s - i_strpos));
802 PerlIO_printf(Perl_debug_log,
803 "Does not contradict STCLASS...\n") );
806 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
807 PL_colors[4], (check ? "Guessed" : "Giving up"),
808 PL_colors[5], (long)(s - i_strpos)) );
811 fail_finish: /* Substring not found */
812 if (prog->check_substr) /* could be removed already */
813 BmUSEFUL(prog->check_substr) += 5; /* hooray */
815 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
816 PL_colors[4],PL_colors[5]));
820 /* We know what class REx starts with. Try to find this position... */
822 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
824 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
830 register I32 tmp = 1; /* Scratch variable? */
832 /* We know what class it must start with. */
836 if (REGINCLASSUTF8(c, (U8*)s)) {
837 if (tmp && (norun || regtry(prog, s)))
849 if (REGINCLASS(c, *(U8*)s)) {
850 if (tmp && (norun || regtry(prog, s)))
870 c2 = PL_fold_locale[c1];
875 e = s; /* Due to minlen logic of intuit() */
876 /* Here it is NOT UTF! */
880 && (ln == 1 || !(OP(c) == EXACTF
882 : ibcmp_locale(s, m, ln)))
883 && (norun || regtry(prog, s)) )
889 if ( (*(U8*)s == c1 || *(U8*)s == c2)
890 && (ln == 1 || !(OP(c) == EXACTF
892 : ibcmp_locale(s, m, ln)))
893 && (norun || regtry(prog, s)) )
900 PL_reg_flags |= RF_tainted;
903 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
904 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
906 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
908 if ((norun || regtry(prog, s)))
913 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
917 PL_reg_flags |= RF_tainted;
920 tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
921 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
923 if (tmp == !(OP(c) == BOUNDUTF8 ?
924 swash_fetch(PL_utf8_alnum, (U8*)s) :
925 isALNUM_LC_utf8((U8*)s)))
928 if ((norun || regtry(prog, s)))
933 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
937 PL_reg_flags |= RF_tainted;
940 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
941 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
943 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
945 else if ((norun || regtry(prog, s)))
949 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
953 PL_reg_flags |= RF_tainted;
956 tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
957 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
959 if (tmp == !(OP(c) == NBOUNDUTF8 ?
960 swash_fetch(PL_utf8_alnum, (U8*)s) :
961 isALNUM_LC_utf8((U8*)s)))
963 else if ((norun || regtry(prog, s)))
967 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
973 if (tmp && (norun || regtry(prog, s)))
985 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
986 if (tmp && (norun || regtry(prog, s)))
997 PL_reg_flags |= RF_tainted;
999 if (isALNUM_LC(*s)) {
1000 if (tmp && (norun || regtry(prog, s)))
1011 PL_reg_flags |= RF_tainted;
1012 while (s < strend) {
1013 if (isALNUM_LC_utf8((U8*)s)) {
1014 if (tmp && (norun || regtry(prog, s)))
1025 while (s < strend) {
1027 if (tmp && (norun || regtry(prog, s)))
1038 while (s < strend) {
1039 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1040 if (tmp && (norun || regtry(prog, s)))
1051 PL_reg_flags |= RF_tainted;
1052 while (s < strend) {
1053 if (!isALNUM_LC(*s)) {
1054 if (tmp && (norun || regtry(prog, s)))
1065 PL_reg_flags |= RF_tainted;
1066 while (s < strend) {
1067 if (!isALNUM_LC_utf8((U8*)s)) {
1068 if (tmp && (norun || regtry(prog, s)))
1079 while (s < strend) {
1081 if (tmp && (norun || regtry(prog, s)))
1092 while (s < strend) {
1093 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1094 if (tmp && (norun || regtry(prog, s)))
1105 PL_reg_flags |= RF_tainted;
1106 while (s < strend) {
1107 if (isSPACE_LC(*s)) {
1108 if (tmp && (norun || regtry(prog, s)))
1119 PL_reg_flags |= RF_tainted;
1120 while (s < strend) {
1121 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1122 if (tmp && (norun || regtry(prog, s)))
1133 while (s < strend) {
1135 if (tmp && (norun || regtry(prog, s)))
1146 while (s < strend) {
1147 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1148 if (tmp && (norun || regtry(prog, s)))
1159 PL_reg_flags |= RF_tainted;
1160 while (s < strend) {
1161 if (!isSPACE_LC(*s)) {
1162 if (tmp && (norun || regtry(prog, s)))
1173 PL_reg_flags |= RF_tainted;
1174 while (s < strend) {
1175 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1176 if (tmp && (norun || regtry(prog, s)))
1187 while (s < strend) {
1189 if (tmp && (norun || regtry(prog, s)))
1200 while (s < strend) {
1201 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1202 if (tmp && (norun || regtry(prog, s)))
1213 PL_reg_flags |= RF_tainted;
1214 while (s < strend) {
1215 if (isDIGIT_LC(*s)) {
1216 if (tmp && (norun || regtry(prog, s)))
1227 PL_reg_flags |= RF_tainted;
1228 while (s < strend) {
1229 if (isDIGIT_LC_utf8((U8*)s)) {
1230 if (tmp && (norun || regtry(prog, s)))
1241 while (s < strend) {
1243 if (tmp && (norun || regtry(prog, s)))
1254 while (s < strend) {
1255 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1256 if (tmp && (norun || regtry(prog, s)))
1267 PL_reg_flags |= RF_tainted;
1268 while (s < strend) {
1269 if (!isDIGIT_LC(*s)) {
1270 if (tmp && (norun || regtry(prog, s)))
1281 PL_reg_flags |= RF_tainted;
1282 while (s < strend) {
1283 if (!isDIGIT_LC_utf8((U8*)s)) {
1284 if (tmp && (norun || regtry(prog, s)))
1295 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1304 - regexec_flags - match a regexp against a string
1307 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1308 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1309 /* strend: pointer to null at end of string */
1310 /* strbeg: real beginning of string */
1311 /* minend: end of match must be >=minend after stringarg. */
1312 /* data: May be used for some additional optimizations. */
1313 /* nosave: For optimizations. */
1317 register regnode *c;
1318 register char *startpos = stringarg;
1319 I32 minlen; /* must match at least this many chars */
1320 I32 dontbother = 0; /* how many characters not to try at end */
1321 /* I32 start_shift = 0; */ /* Offset of the start to find
1322 constant substr. */ /* CC */
1323 I32 end_shift = 0; /* Same for the end. */ /* CC */
1324 I32 scream_pos = -1; /* Internal iterator of scream. */
1326 SV* oreplsv = GvSV(PL_replgv);
1332 PL_regnarrate = PL_debug & 512;
1335 /* Be paranoid... */
1336 if (prog == NULL || startpos == NULL) {
1337 Perl_croak(aTHX_ "NULL regexp parameter");
1341 minlen = prog->minlen;
1342 if (strend - startpos < minlen) goto phooey;
1344 if (startpos == strbeg) /* is ^ valid at stringarg? */
1347 PL_regprev = (U32)stringarg[-1];
1348 if (!PL_multiline && PL_regprev == '\n')
1349 PL_regprev = '\0'; /* force ^ to NOT match */
1352 /* Check validity of program. */
1353 if (UCHARAT(prog->program) != REG_MAGIC) {
1354 Perl_croak(aTHX_ "corrupted regexp program");
1358 PL_reg_eval_set = 0;
1361 if (prog->reganch & ROPT_UTF8)
1362 PL_reg_flags |= RF_utf8;
1364 /* Mark beginning of line for ^ and lookbehind. */
1365 PL_regbol = startpos;
1369 /* Mark end of line for $ (and such) */
1372 /* see how far we have to get to not match where we matched before */
1373 PL_regtill = startpos+minend;
1375 /* We start without call_cc context. */
1378 /* If there is a "must appear" string, look for it. */
1381 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1384 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1385 PL_reg_ganch = startpos;
1386 else if (sv && SvTYPE(sv) >= SVt_PVMG
1388 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1389 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1390 if (prog->reganch & ROPT_ANCH_GPOS) {
1391 if (s > PL_reg_ganch)
1396 else /* pos() not defined */
1397 PL_reg_ganch = strbeg;
1400 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1401 re_scream_pos_data d;
1403 d.scream_olds = &scream_olds;
1404 d.scream_pos = &scream_pos;
1405 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1407 goto phooey; /* not present */
1410 DEBUG_r( if (!PL_colorset) reginitcolors() );
1411 DEBUG_r(PerlIO_printf(Perl_debug_log,
1412 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1413 PL_colors[4],PL_colors[5],PL_colors[0],
1416 (strlen(prog->precomp) > 60 ? "..." : ""),
1418 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1419 startpos, PL_colors[1],
1420 (strend - startpos > 60 ? "..." : ""))
1423 /* Simplest case: anchored match need be tried only once. */
1424 /* [unless only anchor is BOL and multiline is set] */
1425 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1426 if (s == startpos && regtry(prog, startpos))
1428 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1429 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1434 dontbother = minlen - 1;
1435 end = HOPc(strend, -dontbother) - 1;
1436 /* for multiline we only have to try after newlines */
1437 if (prog->check_substr) {
1441 if (regtry(prog, s))
1446 if (prog->reganch & RE_USE_INTUIT) {
1447 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1458 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1459 if (regtry(prog, s))
1466 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1467 if (regtry(prog, PL_reg_ganch))
1472 /* Messy cases: unanchored match. */
1473 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1474 /* we have /x+whatever/ */
1475 /* it must be a one character string (XXXX Except UTF?) */
1476 char ch = SvPVX(prog->anchored_substr)[0];
1482 while (s < strend) {
1484 DEBUG_r( did_match = 1 );
1485 if (regtry(prog, s)) goto got_it;
1487 while (s < strend && *s == ch)
1494 while (s < strend) {
1496 DEBUG_r( did_match = 1 );
1497 if (regtry(prog, s)) goto got_it;
1499 while (s < strend && *s == ch)
1505 DEBUG_r(did_match ||
1506 PerlIO_printf(Perl_debug_log,
1507 "Did not find anchored character...\n"));
1510 else if (prog->anchored_substr != Nullsv
1511 || (prog->float_substr != Nullsv
1512 && prog->float_max_offset < strend - s)) {
1513 SV *must = prog->anchored_substr
1514 ? prog->anchored_substr : prog->float_substr;
1516 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1518 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1519 char *last = HOPc(strend, /* Cannot start after this */
1520 -(I32)(CHR_SVLEN(must)
1521 - (SvTAIL(must) != 0) + back_min));
1522 char *last1; /* Last position checked before */
1528 last1 = HOPc(s, -1);
1530 last1 = s - 1; /* bogus */
1532 /* XXXX check_substr already used to find `s', can optimize if
1533 check_substr==must. */
1535 dontbother = end_shift;
1536 strend = HOPc(strend, -dontbother);
1537 while ( (s <= last) &&
1538 ((flags & REXEC_SCREAM)
1539 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1540 end_shift, &scream_pos, 0))
1541 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1542 (unsigned char*)strend, must,
1543 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1544 DEBUG_r( did_match = 1 );
1545 if (HOPc(s, -back_max) > last1) {
1546 last1 = HOPc(s, -back_min);
1547 s = HOPc(s, -back_max);
1550 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1552 last1 = HOPc(s, -back_min);
1556 while (s <= last1) {
1557 if (regtry(prog, s))
1563 while (s <= last1) {
1564 if (regtry(prog, s))
1570 DEBUG_r(did_match ||
1571 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1572 ((must == prog->anchored_substr)
1573 ? "anchored" : "floating"),
1575 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1577 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1580 else if ((c = prog->regstclass)) {
1581 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1582 /* don't bother with what can't match */
1583 strend = HOPc(strend, -(minlen - 1));
1584 if (find_byclass(prog, c, s, strend, startpos, 0))
1586 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1590 if (prog->float_substr != Nullsv) { /* Trim the end. */
1593 if (flags & REXEC_SCREAM) {
1594 last = screaminstr(sv, prog->float_substr, s - strbeg,
1595 end_shift, &scream_pos, 1); /* last one */
1597 last = scream_olds; /* Only one occurence. */
1601 char *little = SvPV(prog->float_substr, len);
1603 if (SvTAIL(prog->float_substr)) {
1604 if (memEQ(strend - len + 1, little, len - 1))
1605 last = strend - len + 1;
1606 else if (!PL_multiline)
1607 last = memEQ(strend - len, little, len)
1608 ? strend - len : Nullch;
1614 last = rninstr(s, strend, little, little + len);
1616 last = strend; /* matching `$' */
1620 DEBUG_r(PerlIO_printf(Perl_debug_log,
1621 "%sCan't trim the tail, match fails (should not happen)%s\n",
1622 PL_colors[4],PL_colors[5]));
1623 goto phooey; /* Should not happen! */
1625 dontbother = strend - last + prog->float_min_offset;
1627 if (minlen && (dontbother < minlen))
1628 dontbother = minlen - 1;
1629 strend -= dontbother; /* this one's always in bytes! */
1630 /* We don't know much -- general case. */
1633 if (regtry(prog, s))
1642 if (regtry(prog, s))
1644 } while (s++ < strend);
1652 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1654 if (PL_reg_eval_set) {
1655 /* Preserve the current value of $^R */
1656 if (oreplsv != GvSV(PL_replgv))
1657 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1658 restored, the value remains
1660 restore_pos(aTHXo_ 0);
1663 /* make sure $`, $&, $', and $digit will work later */
1664 if ( !(flags & REXEC_NOT_FIRST) ) {
1665 if (RX_MATCH_COPIED(prog)) {
1666 Safefree(prog->subbeg);
1667 RX_MATCH_COPIED_off(prog);
1669 if (flags & REXEC_COPY_STR) {
1670 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1672 s = savepvn(strbeg, i);
1675 RX_MATCH_COPIED_on(prog);
1678 prog->subbeg = strbeg;
1679 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1686 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1687 PL_colors[4],PL_colors[5]));
1688 if (PL_reg_eval_set)
1689 restore_pos(aTHXo_ 0);
1694 - regtry - try match at specific point
1696 STATIC I32 /* 0 failure, 1 success */
1697 S_regtry(pTHX_ regexp *prog, char *startpos)
1705 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1708 PL_reg_eval_set = RS_init;
1710 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1711 (IV)(PL_stack_sp - PL_stack_base));
1713 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1714 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1715 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1717 /* Apparently this is not needed, judging by wantarray. */
1718 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1719 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1722 /* Make $_ available to executed code. */
1723 if (PL_reg_sv != DEFSV) {
1724 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1729 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1730 && (mg = mg_find(PL_reg_sv, 'g')))) {
1731 /* prepare for quick setting of pos */
1732 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1733 mg = mg_find(PL_reg_sv, 'g');
1737 PL_reg_oldpos = mg->mg_len;
1738 SAVEDESTRUCTOR_X(restore_pos, 0);
1741 Newz(22,PL_reg_curpm, 1, PMOP);
1742 PL_reg_curpm->op_pmregexp = prog;
1743 PL_reg_oldcurpm = PL_curpm;
1744 PL_curpm = PL_reg_curpm;
1745 if (RX_MATCH_COPIED(prog)) {
1746 /* Here is a serious problem: we cannot rewrite subbeg,
1747 since it may be needed if this match fails. Thus
1748 $` inside (?{}) could fail... */
1749 PL_reg_oldsaved = prog->subbeg;
1750 PL_reg_oldsavedlen = prog->sublen;
1751 RX_MATCH_COPIED_off(prog);
1754 PL_reg_oldsaved = Nullch;
1755 prog->subbeg = PL_bostr;
1756 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1758 prog->startp[0] = startpos - PL_bostr;
1759 PL_reginput = startpos;
1760 PL_regstartp = prog->startp;
1761 PL_regendp = prog->endp;
1762 PL_reglastparen = &prog->lastparen;
1763 prog->lastparen = 0;
1765 DEBUG_r(PL_reg_starttry = startpos);
1766 if (PL_reg_start_tmpl <= prog->nparens) {
1767 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1768 if(PL_reg_start_tmp)
1769 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1771 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1774 /* XXXX What this code is doing here?!!! There should be no need
1775 to do this again and again, PL_reglastparen should take care of
1779 if (prog->nparens) {
1780 for (i = prog->nparens; i >= 1; i--) {
1786 if (regmatch(prog->program + 1)) {
1787 prog->endp[0] = PL_reginput - PL_bostr;
1795 - regmatch - main matching routine
1797 * Conceptually the strategy is simple: check to see whether the current
1798 * node matches, call self recursively to see whether the rest matches,
1799 * and then act accordingly. In practice we make some effort to avoid
1800 * recursion, in particular by going through "ordinary" nodes (that don't
1801 * need to know whether the rest of the match failed) by a loop instead of
1804 /* [lwall] I've hoisted the register declarations to the outer block in order to
1805 * maybe save a little bit of pushing and popping on the stack. It also takes
1806 * advantage of machines that use a register save mask on subroutine entry.
1808 STATIC I32 /* 0 failure, 1 success */
1809 S_regmatch(pTHX_ regnode *prog)
1812 register regnode *scan; /* Current node. */
1813 regnode *next; /* Next node. */
1814 regnode *inner; /* Next node in internal branch. */
1815 register I32 nextchr; /* renamed nextchr - nextchar colides with
1816 function of same name */
1817 register I32 n; /* no or next */
1818 register I32 ln; /* len or last */
1819 register char *s; /* operand or save */
1820 register char *locinput = PL_reginput;
1821 register I32 c1, c2, paren; /* case fold search, parenth */
1822 int minmod = 0, sw = 0, logical = 0;
1827 /* Note that nextchr is a byte even in UTF */
1828 nextchr = UCHARAT(locinput);
1830 while (scan != NULL) {
1831 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1833 # define sayYES goto yes
1834 # define sayNO goto no
1835 # define sayYES_FINAL goto yes_final
1836 # define sayYES_LOUD goto yes_loud
1837 # define sayNO_FINAL goto no_final
1838 # define sayNO_SILENT goto do_no
1839 # define saySAME(x) if (x) goto yes; else goto no
1840 # define REPORT_CODE_OFF 24
1842 # define sayYES return 1
1843 # define sayNO return 0
1844 # define sayYES_FINAL return 1
1845 # define sayYES_LOUD return 1
1846 # define sayNO_FINAL return 0
1847 # define sayNO_SILENT return 0
1848 # define saySAME(x) return x
1851 SV *prop = sv_newmortal();
1852 int docolor = *PL_colors[0];
1853 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1854 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1855 /* The part of the string before starttry has one color
1856 (pref0_len chars), between starttry and current
1857 position another one (pref_len - pref0_len chars),
1858 after the current position the third one.
1859 We assume that pref0_len <= pref_len, otherwise we
1860 decrease pref0_len. */
1861 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1862 ? (5 + taill) - l : locinput - PL_bostr);
1863 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1865 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1866 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1867 ? (5 + taill) - pref_len : PL_regeol - locinput);
1870 if (pref0_len > pref_len)
1871 pref0_len = pref_len;
1872 regprop(prop, scan);
1873 PerlIO_printf(Perl_debug_log,
1874 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1875 (IV)(locinput - PL_bostr),
1876 PL_colors[4], pref0_len,
1877 locinput - pref_len, PL_colors[5],
1878 PL_colors[2], pref_len - pref0_len,
1879 locinput - pref_len + pref0_len, PL_colors[3],
1880 (docolor ? "" : "> <"),
1881 PL_colors[0], l, locinput, PL_colors[1],
1882 15 - l - pref_len + 1,
1884 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1888 next = scan + NEXT_OFF(scan);
1894 if (locinput == PL_bostr
1895 ? PL_regprev == '\n'
1897 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1899 /* regtill = regbol; */
1904 if (locinput == PL_bostr
1905 ? PL_regprev == '\n'
1906 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1912 if (locinput == PL_bostr)
1916 if (locinput == PL_reg_ganch)
1926 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1931 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1933 if (PL_regeol - locinput > 1)
1937 if (PL_regeol != locinput)
1941 if (nextchr & 0x80) {
1942 locinput += PL_utf8skip[nextchr];
1943 if (locinput > PL_regeol)
1945 nextchr = UCHARAT(locinput);
1948 if (!nextchr && locinput >= PL_regeol)
1950 nextchr = UCHARAT(++locinput);
1953 if (!nextchr && locinput >= PL_regeol)
1955 nextchr = UCHARAT(++locinput);
1958 if (nextchr & 0x80) {
1959 locinput += PL_utf8skip[nextchr];
1960 if (locinput > PL_regeol)
1962 nextchr = UCHARAT(locinput);
1965 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1967 nextchr = UCHARAT(++locinput);
1970 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1972 nextchr = UCHARAT(++locinput);
1977 /* Inline the first character, for speed. */
1978 if (UCHARAT(s) != nextchr)
1980 if (PL_regeol - locinput < ln)
1982 if (ln > 1 && memNE(s, locinput, ln))
1985 nextchr = UCHARAT(locinput);
1988 PL_reg_flags |= RF_tainted;
1997 c1 = OP(scan) == EXACTF;
2001 if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ?
2002 toLOWER_utf8((U8*)l) :
2003 toLOWER_LC_utf8((U8*)l)))
2011 nextchr = UCHARAT(locinput);
2015 /* Inline the first character, for speed. */
2016 if (UCHARAT(s) != nextchr &&
2017 UCHARAT(s) != ((OP(scan) == EXACTF)
2018 ? PL_fold : PL_fold_locale)[nextchr])
2020 if (PL_regeol - locinput < ln)
2022 if (ln > 1 && (OP(scan) == EXACTF
2023 ? ibcmp(s, locinput, ln)
2024 : ibcmp_locale(s, locinput, ln)))
2027 nextchr = UCHARAT(locinput);
2030 if (!REGINCLASSUTF8(scan, (U8*)locinput))
2032 if (locinput >= PL_regeol)
2034 locinput += PL_utf8skip[nextchr];
2035 nextchr = UCHARAT(locinput);
2039 nextchr = UCHARAT(locinput);
2040 if (!REGINCLASS(scan, nextchr))
2042 if (!nextchr && locinput >= PL_regeol)
2044 nextchr = UCHARAT(++locinput);
2047 PL_reg_flags |= RF_tainted;
2052 if (!(OP(scan) == ALNUM
2053 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2055 nextchr = UCHARAT(++locinput);
2058 PL_reg_flags |= RF_tainted;
2063 if (nextchr & 0x80) {
2064 if (!(OP(scan) == ALNUMUTF8
2065 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2066 : isALNUM_LC_utf8((U8*)locinput)))
2070 locinput += PL_utf8skip[nextchr];
2071 nextchr = UCHARAT(locinput);
2074 if (!(OP(scan) == ALNUMUTF8
2075 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2077 nextchr = UCHARAT(++locinput);
2080 PL_reg_flags |= RF_tainted;
2083 if (!nextchr && locinput >= PL_regeol)
2085 if (OP(scan) == NALNUM
2086 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2088 nextchr = UCHARAT(++locinput);
2091 PL_reg_flags |= RF_tainted;
2094 if (!nextchr && locinput >= PL_regeol)
2096 if (nextchr & 0x80) {
2097 if (OP(scan) == NALNUMUTF8
2098 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2099 : isALNUM_LC_utf8((U8*)locinput))
2103 locinput += PL_utf8skip[nextchr];
2104 nextchr = UCHARAT(locinput);
2107 if (OP(scan) == NALNUMUTF8
2108 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2110 nextchr = UCHARAT(++locinput);
2114 PL_reg_flags |= RF_tainted;
2118 /* was last char in word? */
2119 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2120 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2122 n = isALNUM(nextchr);
2125 ln = isALNUM_LC(ln);
2126 n = isALNUM_LC(nextchr);
2128 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2133 PL_reg_flags |= RF_tainted;
2137 /* was last char in word? */
2138 ln = (locinput != PL_regbol)
2139 ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
2140 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2141 ln = isALNUM_uni(ln);
2142 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2145 ln = isALNUM_LC_uni(ln);
2146 n = isALNUM_LC_utf8((U8*)locinput);
2148 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2152 PL_reg_flags |= RF_tainted;
2157 if (!(OP(scan) == SPACE
2158 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2160 nextchr = UCHARAT(++locinput);
2163 PL_reg_flags |= RF_tainted;
2168 if (nextchr & 0x80) {
2169 if (!(OP(scan) == SPACEUTF8
2170 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2171 : isSPACE_LC_utf8((U8*)locinput)))
2175 locinput += PL_utf8skip[nextchr];
2176 nextchr = UCHARAT(locinput);
2179 if (!(OP(scan) == SPACEUTF8
2180 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2182 nextchr = UCHARAT(++locinput);
2185 PL_reg_flags |= RF_tainted;
2188 if (!nextchr && locinput >= PL_regeol)
2190 if (OP(scan) == NSPACE
2191 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2193 nextchr = UCHARAT(++locinput);
2196 PL_reg_flags |= RF_tainted;
2199 if (!nextchr && locinput >= PL_regeol)
2201 if (nextchr & 0x80) {
2202 if (OP(scan) == NSPACEUTF8
2203 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2204 : isSPACE_LC_utf8((U8*)locinput))
2208 locinput += PL_utf8skip[nextchr];
2209 nextchr = UCHARAT(locinput);
2212 if (OP(scan) == NSPACEUTF8
2213 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2215 nextchr = UCHARAT(++locinput);
2218 PL_reg_flags |= RF_tainted;
2223 if (!(OP(scan) == DIGIT
2224 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2226 nextchr = UCHARAT(++locinput);
2229 PL_reg_flags |= RF_tainted;
2234 if (nextchr & 0x80) {
2235 if (!(OP(scan) == DIGITUTF8
2236 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2237 : isDIGIT_LC_utf8((U8*)locinput)))
2241 locinput += PL_utf8skip[nextchr];
2242 nextchr = UCHARAT(locinput);
2245 if (!(OP(scan) == DIGITUTF8
2246 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2248 nextchr = UCHARAT(++locinput);
2251 PL_reg_flags |= RF_tainted;
2254 if (!nextchr && locinput >= PL_regeol)
2256 if (OP(scan) == NDIGIT
2257 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2259 nextchr = UCHARAT(++locinput);
2262 PL_reg_flags |= RF_tainted;
2265 if (!nextchr && locinput >= PL_regeol)
2267 if (nextchr & 0x80) {
2268 if (OP(scan) == NDIGITUTF8
2269 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2270 : isDIGIT_LC_utf8((U8*)locinput))
2274 locinput += PL_utf8skip[nextchr];
2275 nextchr = UCHARAT(locinput);
2278 if (OP(scan) == NDIGITUTF8
2279 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2281 nextchr = UCHARAT(++locinput);
2284 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2286 locinput += PL_utf8skip[nextchr];
2287 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2288 locinput += UTF8SKIP(locinput);
2289 if (locinput > PL_regeol)
2291 nextchr = UCHARAT(locinput);
2294 PL_reg_flags |= RF_tainted;
2298 n = ARG(scan); /* which paren pair */
2299 ln = PL_regstartp[n];
2300 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2301 if (*PL_reglastparen < n || ln == -1)
2302 sayNO; /* Do not match unless seen CLOSEn. */
2303 if (ln == PL_regendp[n])
2307 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2309 char *e = PL_bostr + PL_regendp[n];
2311 * Note that we can't do the "other character" lookup trick as
2312 * in the 8-bit case (no pun intended) because in Unicode we
2313 * have to map both upper and title case to lower case.
2315 if (OP(scan) == REFF) {
2319 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2329 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2336 nextchr = UCHARAT(locinput);
2340 /* Inline the first character, for speed. */
2341 if (UCHARAT(s) != nextchr &&
2343 (UCHARAT(s) != ((OP(scan) == REFF
2344 ? PL_fold : PL_fold_locale)[nextchr]))))
2346 ln = PL_regendp[n] - ln;
2347 if (locinput + ln > PL_regeol)
2349 if (ln > 1 && (OP(scan) == REF
2350 ? memNE(s, locinput, ln)
2352 ? ibcmp(s, locinput, ln)
2353 : ibcmp_locale(s, locinput, ln))))
2356 nextchr = UCHARAT(locinput);
2367 OP_4tree *oop = PL_op;
2368 COP *ocurcop = PL_curcop;
2369 SV **ocurpad = PL_curpad;
2373 PL_op = (OP_4tree*)PL_regdata->data[n];
2374 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2375 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2376 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2378 CALLRUNOPS(aTHX); /* Scalar context. */
2384 PL_curpad = ocurpad;
2385 PL_curcop = ocurcop;
2387 if (logical == 2) { /* Postponed subexpression. */
2389 MAGIC *mg = Null(MAGIC*);
2391 CHECKPOINT cp, lastcp;
2393 if(SvROK(ret) || SvRMAGICAL(ret)) {
2394 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2397 mg = mg_find(sv, 'r');
2400 re = (regexp *)mg->mg_obj;
2401 (void)ReREFCNT_inc(re);
2405 char *t = SvPV(ret, len);
2407 char *oprecomp = PL_regprecomp;
2408 I32 osize = PL_regsize;
2409 I32 onpar = PL_regnpar;
2412 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2413 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2415 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2416 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2417 PL_regprecomp = oprecomp;
2422 PerlIO_printf(Perl_debug_log,
2423 "Entering embedded `%s%.60s%s%s'\n",
2427 (strlen(re->precomp) > 60 ? "..." : ""))
2430 state.prev = PL_reg_call_cc;
2431 state.cc = PL_regcc;
2432 state.re = PL_reg_re;
2436 cp = regcppush(0); /* Save *all* the positions. */
2439 state.ss = PL_savestack_ix;
2440 *PL_reglastparen = 0;
2441 PL_reg_call_cc = &state;
2442 PL_reginput = locinput;
2444 /* XXXX This is too dramatic a measure... */
2447 if (regmatch(re->program + 1)) {
2448 /* Even though we succeeded, we need to restore
2449 global variables, since we may be wrapped inside
2450 SUSPEND, thus the match may be not finished yet. */
2452 /* XXXX Do this only if SUSPENDed? */
2453 PL_reg_call_cc = state.prev;
2454 PL_regcc = state.cc;
2455 PL_reg_re = state.re;
2456 cache_re(PL_reg_re);
2458 /* XXXX This is too dramatic a measure... */
2461 /* These are needed even if not SUSPEND. */
2469 PL_reg_call_cc = state.prev;
2470 PL_regcc = state.cc;
2471 PL_reg_re = state.re;
2472 cache_re(PL_reg_re);
2474 /* XXXX This is too dramatic a measure... */
2483 sv_setsv(save_scalar(PL_replgv), ret);
2487 n = ARG(scan); /* which paren pair */
2488 PL_reg_start_tmp[n] = locinput;
2493 n = ARG(scan); /* which paren pair */
2494 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2495 PL_regendp[n] = locinput - PL_bostr;
2496 if (n > *PL_reglastparen)
2497 *PL_reglastparen = n;
2500 n = ARG(scan); /* which paren pair */
2501 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2504 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2506 next = NEXTOPER(NEXTOPER(scan));
2508 next = scan + ARG(scan);
2509 if (OP(next) == IFTHEN) /* Fake one. */
2510 next = NEXTOPER(NEXTOPER(next));
2514 logical = scan->flags;
2516 /*******************************************************************
2517 PL_regcc contains infoblock about the innermost (...)* loop, and
2518 a pointer to the next outer infoblock.
2520 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2522 1) After matching X, regnode for CURLYX is processed;
2524 2) This regnode creates infoblock on the stack, and calls
2525 regmatch() recursively with the starting point at WHILEM node;
2527 3) Each hit of WHILEM node tries to match A and Z (in the order
2528 depending on the current iteration, min/max of {min,max} and
2529 greediness). The information about where are nodes for "A"
2530 and "Z" is read from the infoblock, as is info on how many times "A"
2531 was already matched, and greediness.
2533 4) After A matches, the same WHILEM node is hit again.
2535 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2536 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2537 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2538 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2539 of the external loop.
2541 Currently present infoblocks form a tree with a stem formed by PL_curcc
2542 and whatever it mentions via ->next, and additional attached trees
2543 corresponding to temporarily unset infoblocks as in "5" above.
2545 In the following picture infoblocks for outer loop of
2546 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2547 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2548 infoblocks are drawn below the "reset" infoblock.
2550 In fact in the picture below we do not show failed matches for Z and T
2551 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2552 more obvious *why* one needs to *temporary* unset infoblocks.]
2554 Matched REx position InfoBlocks Comment
2558 Y A)*?Z)*?T x <- O <- I
2559 YA )*?Z)*?T x <- O <- I
2560 YA A)*?Z)*?T x <- O <- I
2561 YAA )*?Z)*?T x <- O <- I
2562 YAA Z)*?T x <- O # Temporary unset I
2565 YAAZ Y(A)*?Z)*?T x <- O
2568 YAAZY (A)*?Z)*?T x <- O
2571 YAAZY A)*?Z)*?T x <- O <- I
2574 YAAZYA )*?Z)*?T x <- O <- I
2577 YAAZYA Z)*?T x <- O # Temporary unset I
2583 YAAZYAZ T x # Temporary unset O
2590 *******************************************************************/
2593 CHECKPOINT cp = PL_savestack_ix;
2595 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2597 cc.oldcc = PL_regcc;
2599 cc.parenfloor = *PL_reglastparen;
2601 cc.min = ARG1(scan);
2602 cc.max = ARG2(scan);
2603 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2607 PL_reginput = locinput;
2608 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2610 PL_regcc = cc.oldcc;
2616 * This is really hard to understand, because after we match
2617 * what we're trying to match, we must make sure the rest of
2618 * the REx is going to match for sure, and to do that we have
2619 * to go back UP the parse tree by recursing ever deeper. And
2620 * if it fails, we have to reset our parent's current state
2621 * that we can try again after backing off.
2624 CHECKPOINT cp, lastcp;
2625 CURCUR* cc = PL_regcc;
2626 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2628 n = cc->cur + 1; /* how many we know we matched */
2629 PL_reginput = locinput;
2632 PerlIO_printf(Perl_debug_log,
2633 "%*s %ld out of %ld..%ld cc=%lx\n",
2634 REPORT_CODE_OFF+PL_regindent*2, "",
2635 (long)n, (long)cc->min,
2636 (long)cc->max, (long)cc)
2639 /* If degenerate scan matches "", assume scan done. */
2641 if (locinput == cc->lastloc && n >= cc->min) {
2642 PL_regcc = cc->oldcc;
2646 PerlIO_printf(Perl_debug_log,
2647 "%*s empty match detected, try continuation...\n",
2648 REPORT_CODE_OFF+PL_regindent*2, "")
2650 if (regmatch(cc->next))
2658 /* First just match a string of min scans. */
2662 cc->lastloc = locinput;
2663 if (regmatch(cc->scan))
2666 cc->lastloc = lastloc;
2671 /* Check whether we already were at this position.
2672 Postpone detection until we know the match is not
2673 *that* much linear. */
2674 if (!PL_reg_maxiter) {
2675 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2676 PL_reg_leftiter = PL_reg_maxiter;
2678 if (PL_reg_leftiter-- == 0) {
2679 I32 size = (PL_reg_maxiter + 7)/8;
2680 if (PL_reg_poscache) {
2681 if (PL_reg_poscache_size < size) {
2682 Renew(PL_reg_poscache, size, char);
2683 PL_reg_poscache_size = size;
2685 Zero(PL_reg_poscache, size, char);
2688 PL_reg_poscache_size = size;
2689 Newz(29, PL_reg_poscache, size, char);
2692 PerlIO_printf(Perl_debug_log,
2693 "%sDetected a super-linear match, switching on caching%s...\n",
2694 PL_colors[4], PL_colors[5])
2697 if (PL_reg_leftiter < 0) {
2698 I32 o = locinput - PL_bostr, b;
2700 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2703 if (PL_reg_poscache[o] & (1<<b)) {
2705 PerlIO_printf(Perl_debug_log,
2706 "%*s already tried at this position...\n",
2707 REPORT_CODE_OFF+PL_regindent*2, "")
2711 PL_reg_poscache[o] |= (1<<b);
2715 /* Prefer next over scan for minimal matching. */
2718 PL_regcc = cc->oldcc;
2721 cp = regcppush(cc->parenfloor);
2723 if (regmatch(cc->next)) {
2725 sayYES; /* All done. */
2733 if (n >= cc->max) { /* Maximum greed exceeded? */
2734 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2735 && !(PL_reg_flags & RF_warned)) {
2736 PL_reg_flags |= RF_warned;
2737 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2738 "Complex regular subexpression recursion",
2745 PerlIO_printf(Perl_debug_log,
2746 "%*s trying longer...\n",
2747 REPORT_CODE_OFF+PL_regindent*2, "")
2749 /* Try scanning more and see if it helps. */
2750 PL_reginput = locinput;
2752 cc->lastloc = locinput;
2753 cp = regcppush(cc->parenfloor);
2755 if (regmatch(cc->scan)) {
2762 cc->lastloc = lastloc;
2766 /* Prefer scan over next for maximal matching. */
2768 if (n < cc->max) { /* More greed allowed? */
2769 cp = regcppush(cc->parenfloor);
2771 cc->lastloc = locinput;
2773 if (regmatch(cc->scan)) {
2778 regcppop(); /* Restore some previous $<digit>s? */
2779 PL_reginput = locinput;
2781 PerlIO_printf(Perl_debug_log,
2782 "%*s failed, try continuation...\n",
2783 REPORT_CODE_OFF+PL_regindent*2, "")
2786 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2787 && !(PL_reg_flags & RF_warned)) {
2788 PL_reg_flags |= RF_warned;
2789 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2790 "Complex regular subexpression recursion",
2794 /* Failed deeper matches of scan, so see if this one works. */
2795 PL_regcc = cc->oldcc;
2798 if (regmatch(cc->next))
2804 cc->lastloc = lastloc;
2809 next = scan + ARG(scan);
2812 inner = NEXTOPER(NEXTOPER(scan));
2815 inner = NEXTOPER(scan);
2820 if (OP(next) != c1) /* No choice. */
2821 next = inner; /* Avoid recursion. */
2823 int lastparen = *PL_reglastparen;
2827 PL_reginput = locinput;
2828 if (regmatch(inner))
2831 for (n = *PL_reglastparen; n > lastparen; n--)
2833 *PL_reglastparen = n;
2836 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2840 inner = NEXTOPER(scan);
2841 if (c1 == BRANCHJ) {
2842 inner = NEXTOPER(inner);
2844 } while (scan != NULL && OP(scan) == c1);
2858 /* We suppose that the next guy does not need
2859 backtracking: in particular, it is of constant length,
2860 and has no parenths to influence future backrefs. */
2861 ln = ARG1(scan); /* min to match */
2862 n = ARG2(scan); /* max to match */
2863 paren = scan->flags;
2865 if (paren > PL_regsize)
2867 if (paren > *PL_reglastparen)
2868 *PL_reglastparen = paren;
2870 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2872 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2873 PL_reginput = locinput;
2876 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2878 if (ln && l == 0 && n >= ln
2879 /* In fact, this is tricky. If paren, then the
2880 fact that we did/didnot match may influence
2881 future execution. */
2882 && !(paren && ln == 0))
2884 locinput = PL_reginput;
2885 if (PL_regkind[(U8)OP(next)] == EXACT) {
2886 c1 = (U8)*STRING(next);
2887 if (OP(next) == EXACTF)
2889 else if (OP(next) == EXACTFL)
2890 c2 = PL_fold_locale[c1];
2897 /* This may be improved if l == 0. */
2898 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2899 /* If it could work, try it. */
2901 UCHARAT(PL_reginput) == c1 ||
2902 UCHARAT(PL_reginput) == c2)
2906 PL_regstartp[paren] =
2907 HOPc(PL_reginput, -l) - PL_bostr;
2908 PL_regendp[paren] = PL_reginput - PL_bostr;
2911 PL_regendp[paren] = -1;
2917 /* Couldn't or didn't -- move forward. */
2918 PL_reginput = locinput;
2919 if (regrepeat_hard(scan, 1, &l)) {
2921 locinput = PL_reginput;
2928 n = regrepeat_hard(scan, n, &l);
2929 if (n != 0 && l == 0
2930 /* In fact, this is tricky. If paren, then the
2931 fact that we did/didnot match may influence
2932 future execution. */
2933 && !(paren && ln == 0))
2935 locinput = PL_reginput;
2937 PerlIO_printf(Perl_debug_log,
2938 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2939 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2943 if (PL_regkind[(U8)OP(next)] == EXACT) {
2944 c1 = (U8)*STRING(next);
2945 if (OP(next) == EXACTF)
2947 else if (OP(next) == EXACTFL)
2948 c2 = PL_fold_locale[c1];
2957 /* If it could work, try it. */
2959 UCHARAT(PL_reginput) == c1 ||
2960 UCHARAT(PL_reginput) == c2)
2963 PerlIO_printf(Perl_debug_log,
2964 "%*s trying tail with n=%"IVdf"...\n",
2965 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2969 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2970 PL_regendp[paren] = PL_reginput - PL_bostr;
2973 PL_regendp[paren] = -1;
2979 /* Couldn't or didn't -- back up. */
2981 locinput = HOPc(locinput, -l);
2982 PL_reginput = locinput;
2989 paren = scan->flags; /* Which paren to set */
2990 if (paren > PL_regsize)
2992 if (paren > *PL_reglastparen)
2993 *PL_reglastparen = paren;
2994 ln = ARG1(scan); /* min to match */
2995 n = ARG2(scan); /* max to match */
2996 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3000 ln = ARG1(scan); /* min to match */
3001 n = ARG2(scan); /* max to match */
3002 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3007 scan = NEXTOPER(scan);
3013 scan = NEXTOPER(scan);
3017 * Lookahead to avoid useless match attempts
3018 * when we know what character comes next.
3020 if (PL_regkind[(U8)OP(next)] == EXACT) {
3021 c1 = (U8)*STRING(next);
3022 if (OP(next) == EXACTF)
3024 else if (OP(next) == EXACTFL)
3025 c2 = PL_fold_locale[c1];
3031 PL_reginput = locinput;
3035 if (ln && regrepeat(scan, ln) < ln)
3037 locinput = PL_reginput;
3040 char *e = locinput + n - ln; /* Should not check after this */
3041 char *old = locinput;
3043 if (e >= PL_regeol || (n == REG_INFTY))
3046 /* Find place 'next' could work */
3048 while (locinput <= e && *locinput != c1)
3051 while (locinput <= e
3058 /* PL_reginput == old now */
3059 if (locinput != old) {
3060 ln = 1; /* Did some */
3061 if (regrepeat(scan, locinput - old) <
3065 /* PL_reginput == locinput now */
3066 TRYPAREN(paren, ln, locinput);
3067 PL_reginput = locinput; /* Could be reset... */
3069 /* Couldn't or didn't -- move forward. */
3074 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3075 /* If it could work, try it. */
3077 UCHARAT(PL_reginput) == c1 ||
3078 UCHARAT(PL_reginput) == c2)
3080 TRYPAREN(paren, n, PL_reginput);
3083 /* Couldn't or didn't -- move forward. */
3084 PL_reginput = locinput;
3085 if (regrepeat(scan, 1)) {
3087 locinput = PL_reginput;
3095 n = regrepeat(scan, n);
3096 locinput = PL_reginput;
3097 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3098 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3099 ln = n; /* why back off? */
3100 /* ...because $ and \Z can match before *and* after
3101 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3102 We should back off by one in this case. */
3103 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3109 /* If it could work, try it. */
3111 UCHARAT(PL_reginput) == c1 ||
3112 UCHARAT(PL_reginput) == c2)
3114 TRYPAREN(paren, n, PL_reginput);
3117 /* Couldn't or didn't -- back up. */
3119 PL_reginput = locinput = HOPc(locinput, -1);
3124 /* If it could work, try it. */
3126 UCHARAT(PL_reginput) == c1 ||
3127 UCHARAT(PL_reginput) == c2)
3129 TRYPAREN(paren, n, PL_reginput);
3132 /* Couldn't or didn't -- back up. */
3134 PL_reginput = locinput = HOPc(locinput, -1);
3141 if (PL_reg_call_cc) {
3142 re_cc_state *cur_call_cc = PL_reg_call_cc;
3143 CURCUR *cctmp = PL_regcc;
3144 regexp *re = PL_reg_re;
3145 CHECKPOINT cp, lastcp;
3147 cp = regcppush(0); /* Save *all* the positions. */
3149 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3151 PL_reginput = locinput; /* Make position available to
3153 cache_re(PL_reg_call_cc->re);
3154 PL_regcc = PL_reg_call_cc->cc;
3155 PL_reg_call_cc = PL_reg_call_cc->prev;
3156 if (regmatch(cur_call_cc->node)) {
3157 PL_reg_call_cc = cur_call_cc;
3163 PL_reg_call_cc = cur_call_cc;
3169 PerlIO_printf(Perl_debug_log,
3170 "%*s continuation failed...\n",
3171 REPORT_CODE_OFF+PL_regindent*2, "")
3175 if (locinput < PL_regtill) {
3176 DEBUG_r(PerlIO_printf(Perl_debug_log,
3177 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3179 (long)(locinput - PL_reg_starttry),
3180 (long)(PL_regtill - PL_reg_starttry),
3182 sayNO_FINAL; /* Cannot match: too short. */
3184 PL_reginput = locinput; /* put where regtry can find it */
3185 sayYES_FINAL; /* Success! */
3187 PL_reginput = locinput; /* put where regtry can find it */
3188 sayYES_LOUD; /* Success! */
3191 PL_reginput = locinput;
3196 if (UTF) { /* XXXX This is absolutely
3197 broken, we read before
3199 s = HOPMAYBEc(locinput, -scan->flags);
3205 if (locinput < PL_bostr + scan->flags)
3207 PL_reginput = locinput - scan->flags;
3212 PL_reginput = locinput;
3217 if (UTF) { /* XXXX This is absolutely
3218 broken, we read before
3220 s = HOPMAYBEc(locinput, -scan->flags);
3221 if (!s || s < PL_bostr)
3226 if (locinput < PL_bostr + scan->flags)
3228 PL_reginput = locinput - scan->flags;
3233 PL_reginput = locinput;
3236 inner = NEXTOPER(NEXTOPER(scan));
3237 if (regmatch(inner) != n) {
3252 if (OP(scan) == SUSPEND) {
3253 locinput = PL_reginput;
3254 nextchr = UCHARAT(locinput);
3259 next = scan + ARG(scan);
3264 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3265 PTR2UV(scan), OP(scan));
3266 Perl_croak(aTHX_ "regexp memory corruption");
3272 * We get here only if there's trouble -- normally "case END" is
3273 * the terminating point.
3275 Perl_croak(aTHX_ "corrupted regexp pointers");
3281 PerlIO_printf(Perl_debug_log,
3282 "%*s %scould match...%s\n",
3283 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3287 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3288 PL_colors[4],PL_colors[5]));
3297 PerlIO_printf(Perl_debug_log,
3298 "%*s %sfailed...%s\n",
3299 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3311 - regrepeat - repeatedly match something simple, report how many
3314 * [This routine now assumes that it will only match on things of length 1.
3315 * That was true before, but now we assume scan - reginput is the count,
3316 * rather than incrementing count on every character. [Er, except utf8.]]
3319 S_regrepeat(pTHX_ regnode *p, I32 max)
3322 register char *scan;
3324 register char *loceol = PL_regeol;
3325 register I32 hardcount = 0;
3328 if (max != REG_INFTY && max < loceol - scan)
3329 loceol = scan + max;
3332 while (scan < loceol && *scan != '\n')
3340 while (scan < loceol && *scan != '\n') {
3341 scan += UTF8SKIP(scan);
3347 while (scan < loceol) {
3348 scan += UTF8SKIP(scan);
3352 case EXACT: /* length of string is 1 */
3354 while (scan < loceol && UCHARAT(scan) == c)
3357 case EXACTF: /* length of string is 1 */
3359 while (scan < loceol &&
3360 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3363 case EXACTFL: /* length of string is 1 */
3364 PL_reg_flags |= RF_tainted;
3366 while (scan < loceol &&
3367 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3372 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3373 scan += UTF8SKIP(scan);
3378 while (scan < loceol && REGINCLASS(p, *scan))
3382 while (scan < loceol && isALNUM(*scan))
3387 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3388 scan += UTF8SKIP(scan);
3393 PL_reg_flags |= RF_tainted;
3394 while (scan < loceol && isALNUM_LC(*scan))
3398 PL_reg_flags |= RF_tainted;
3400 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3401 scan += UTF8SKIP(scan);
3407 while (scan < loceol && !isALNUM(*scan))
3412 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3413 scan += UTF8SKIP(scan);
3418 PL_reg_flags |= RF_tainted;
3419 while (scan < loceol && !isALNUM_LC(*scan))
3423 PL_reg_flags |= RF_tainted;
3425 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3426 scan += UTF8SKIP(scan);
3431 while (scan < loceol && isSPACE(*scan))
3436 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3437 scan += UTF8SKIP(scan);
3442 PL_reg_flags |= RF_tainted;
3443 while (scan < loceol && isSPACE_LC(*scan))
3447 PL_reg_flags |= RF_tainted;
3449 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3450 scan += UTF8SKIP(scan);
3455 while (scan < loceol && !isSPACE(*scan))
3460 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3461 scan += UTF8SKIP(scan);
3466 PL_reg_flags |= RF_tainted;
3467 while (scan < loceol && !isSPACE_LC(*scan))
3471 PL_reg_flags |= RF_tainted;
3473 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3474 scan += UTF8SKIP(scan);
3479 while (scan < loceol && isDIGIT(*scan))
3484 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3485 scan += UTF8SKIP(scan);
3491 while (scan < loceol && !isDIGIT(*scan))
3496 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3497 scan += UTF8SKIP(scan);
3501 default: /* Called on something of 0 width. */
3502 break; /* So match right here or not at all. */
3508 c = scan - PL_reginput;
3513 SV *prop = sv_newmortal();
3516 PerlIO_printf(Perl_debug_log,
3517 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3518 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3525 - regrepeat_hard - repeatedly match something, report total lenth and length
3527 * The repeater is supposed to have constant length.
3531 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3534 register char *scan;
3535 register char *start;
3536 register char *loceol = PL_regeol;
3538 I32 count = 0, res = 1;
3543 start = PL_reginput;
3545 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3548 while (start < PL_reginput) {
3550 start += UTF8SKIP(start);
3561 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3563 *lp = l = PL_reginput - start;
3564 if (max != REG_INFTY && l*max < loceol - scan)
3565 loceol = scan + l*max;
3578 - reginclass - determine if a character falls into a character class
3582 S_reginclass(pTHX_ register regnode *p, register I32 c)
3585 char flags = ANYOF_FLAGS(p);
3589 if (ANYOF_BITMAP_TEST(p, c))
3591 else if (flags & ANYOF_FOLD) {
3593 if (flags & ANYOF_LOCALE) {
3594 PL_reg_flags |= RF_tainted;
3595 cf = PL_fold_locale[c];
3599 if (ANYOF_BITMAP_TEST(p, cf))
3603 if (!match && (flags & ANYOF_CLASS)) {
3604 PL_reg_flags |= RF_tainted;
3606 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3607 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3608 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3609 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3610 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3611 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3612 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3613 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3614 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3615 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3616 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3617 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3618 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3619 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3620 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3621 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3622 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3623 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3624 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3625 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3626 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3627 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3628 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3629 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3630 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3631 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3632 (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3633 (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3634 (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) ||
3635 (ANYOF_CLASS_TEST(p, ANYOF_NBLANK) && !isBLANK(c))
3636 ) /* How's that for a conditional? */
3642 return (flags & ANYOF_INVERT) ? !match : match;
3646 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3649 char flags = ARG1(f);
3651 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3653 if (swash_fetch(sv, p))
3655 else if (flags & ANYOF_FOLD) {
3656 U8 tmpbuf[UTF8_MAXLEN];
3657 if (flags & ANYOF_LOCALE) {
3658 PL_reg_flags |= RF_tainted;
3659 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3662 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3663 if (swash_fetch(sv, tmpbuf))
3667 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3669 return (flags & ANYOF_INVERT) ? !match : match;
3673 S_reghop(pTHX_ U8 *s, I32 off)
3677 while (off-- && s < (U8*)PL_regeol)
3682 if (s > (U8*)PL_bostr) {
3685 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3687 } /* XXX could check well-formedness here */
3695 S_reghopmaybe(pTHX_ U8* s, I32 off)
3699 while (off-- && s < (U8*)PL_regeol)
3706 if (s > (U8*)PL_bostr) {
3709 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3711 } /* XXX could check well-formedness here */
3727 restore_pos(pTHXo_ void *arg)
3730 if (PL_reg_eval_set) {
3731 if (PL_reg_oldsaved) {
3732 PL_reg_re->subbeg = PL_reg_oldsaved;
3733 PL_reg_re->sublen = PL_reg_oldsavedlen;
3734 RX_MATCH_COPIED_on(PL_reg_re);
3736 PL_reg_magic->mg_len = PL_reg_oldpos;
3737 PL_reg_eval_set = 0;
3738 PL_curpm = PL_reg_oldcurpm;