5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 /* SvCUR is not set on references: SvRV and SvPVX overlap */
351 && (strpos + SvCUR(sv) != strend)) {
352 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
355 PL_regeol = strend; /* Used in HOP() */
356 s = HOPc(strpos, prog->check_offset_min);
358 slen = SvCUR(check); /* >= 1 */
360 if ( strend - s > slen || strend - s < slen - 1
361 || (strend - s == slen && strend[-1] != '\n')) {
362 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
365 /* Now should match s[0..slen-2] */
367 if (slen && (*SvPVX(check) != *s
369 && memNE(SvPVX(check), s, slen)))) {
371 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
375 else if (*SvPVX(check) != *s
376 || ((slen = SvCUR(check)) > 1
377 && memNE(SvPVX(check), s, slen)))
379 goto success_at_start;
381 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
383 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
384 end_shift = prog->minlen - start_shift -
385 CHR_SVLEN(check) + (SvTAIL(check) != 0);
387 I32 end = prog->check_offset_max + CHR_SVLEN(check)
388 - (SvTAIL(check) != 0);
389 I32 eshift = strend - s - end;
391 if (end_shift < eshift)
395 else { /* Can match at random position */
398 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
399 /* Should be nonnegative! */
400 end_shift = prog->minlen - start_shift -
401 CHR_SVLEN(check) + (SvTAIL(check) != 0);
404 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
406 Perl_croak(aTHX_ "panic: end_shift");
410 /* Find a possible match in the region s..strend by looking for
411 the "check" substring in the region corrected by start/end_shift. */
412 if (flags & REXEC_SCREAM) {
413 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
414 I32 p = -1; /* Internal iterator of scream. */
415 I32 *pp = data ? data->scream_pos : &p;
417 if (PL_screamfirst[BmRARE(check)] >= 0
418 || ( BmRARE(check) == '\n'
419 && (BmPREVIOUS(check) == SvCUR(check) - 1)
421 s = screaminstr(sv, check,
422 start_shift + (s - strbeg), end_shift, pp, 0);
426 *data->scream_olds = s;
429 s = fbm_instr((unsigned char*)s + start_shift,
430 (unsigned char*)strend - end_shift,
431 check, PL_multiline ? FBMrf_MULTILINE : 0);
433 /* Update the count-of-usability, remove useless subpatterns,
436 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
437 (s ? "Found" : "Did not find"),
438 ((check == prog->anchored_substr) ? "anchored" : "floating"),
440 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
442 PL_colors[1], (SvTAIL(check) ? "$" : ""),
443 (s ? " at offset " : "...\n") ) );
450 /* Finish the diagnostic message */
451 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
453 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
454 Start with the other substr.
455 XXXX no SCREAM optimization yet - and a very coarse implementation
456 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
457 *always* match. Probably should be marked during compile...
458 Probably it is right to do no SCREAM here...
461 if (prog->float_substr && prog->anchored_substr) {
462 /* Take into account the "other" substring. */
463 /* XXXX May be hopelessly wrong for UTF... */
466 if (check == prog->float_substr) {
469 char *last = s - start_shift, *last1, *last2;
473 t = s - prog->check_offset_max;
474 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
475 && (!(prog->reganch & ROPT_UTF8)
476 || (PL_bostr = strpos, /* Used in regcopmaybe() */
477 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
482 t += prog->anchored_offset;
483 if (t < other_last) /* These positions already checked */
486 last2 = last1 = strend - prog->minlen;
489 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
490 /* On end-of-str: see comment below. */
491 s = fbm_instr((unsigned char*)t,
492 (unsigned char*)last1 + prog->anchored_offset
493 + SvCUR(prog->anchored_substr)
494 - (SvTAIL(prog->anchored_substr)!=0),
495 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
496 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
497 (s ? "Found" : "Contradicts"),
499 (int)(SvCUR(prog->anchored_substr)
500 - (SvTAIL(prog->anchored_substr)!=0)),
501 SvPVX(prog->anchored_substr),
502 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
504 if (last1 >= last2) {
505 DEBUG_r(PerlIO_printf(Perl_debug_log,
506 ", giving up...\n"));
509 DEBUG_r(PerlIO_printf(Perl_debug_log,
510 ", trying floating at offset %ld...\n",
511 (long)(s1 + 1 - i_strpos)));
512 PL_regeol = strend; /* Used in HOP() */
513 other_last = last1 + prog->anchored_offset + 1;
518 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
519 (long)(s - i_strpos)));
520 t = s - prog->anchored_offset;
529 else { /* Take into account the floating substring. */
534 last1 = last = strend - prog->minlen + prog->float_min_offset;
535 if (last - t > prog->float_max_offset)
536 last = t + prog->float_max_offset;
537 s = t + prog->float_min_offset;
540 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
541 /* fbm_instr() takes into account exact value of end-of-str
542 if the check is SvTAIL(ed). Since false positives are OK,
543 and end-of-str is not later than strend we are OK. */
544 s = fbm_instr((unsigned char*)s,
545 (unsigned char*)last + SvCUR(prog->float_substr)
546 - (SvTAIL(prog->float_substr)!=0),
547 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
548 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
549 (s ? "Found" : "Contradicts"),
551 (int)(SvCUR(prog->float_substr)
552 - (SvTAIL(prog->float_substr)!=0)),
553 SvPVX(prog->float_substr),
554 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 ", giving up...\n"));
561 DEBUG_r(PerlIO_printf(Perl_debug_log,
562 ", trying anchored starting at offset %ld...\n",
563 (long)(s1 + 1 - i_strpos)));
564 other_last = last + 1;
565 PL_regeol = strend; /* Used in HOP() */
570 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
571 (long)(s - i_strpos)));
581 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
584 && (!(prog->reganch & ROPT_UTF8)
585 || (PL_bostr = strpos, /* Used in regcopmaybe() */
586 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
589 /* Fixed substring is found far enough so that the match
590 cannot start at strpos. */
592 if (ml_anch && t[-1] != '\n') {
593 /* Eventually fbm_*() should handle this, but often
594 anchored_offset is not 0, so this check will not be wasted. */
595 /* XXXX In the code below we prefer to look for "^" even in
596 presence of anchored substrings. And we search even
597 beyond the found float position. These pessimizations
598 are historical artefacts only. */
600 while (t < strend - prog->minlen) {
602 if (t < s - prog->check_offset_min) {
603 if (prog->anchored_substr) {
604 /* We definitely contradict the found anchored
605 substr. Due to the above check we do not
606 contradict "check" substr.
607 Thus we can arrive here only if check substr
608 is float. Redo checking for "other"=="fixed".
611 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
612 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
613 goto do_other_anchored;
616 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
617 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
620 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
621 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
627 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
628 PL_colors[0],PL_colors[1]));
633 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
637 /* The found string does not prohibit matching at beg-of-str
638 - no optimization of calling REx engine can be performed,
639 unless it was an MBOL and we are not after MBOL. */
641 /* Even in this situation we may use MBOL flag if strpos is offset
642 wrt the start of the string. */
643 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
644 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
645 /* May be due to an implicit anchor of m{.*foo} */
646 && !(prog->reganch & ROPT_IMPLICIT))
651 DEBUG_r( if (ml_anch)
652 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
653 PL_colors[0],PL_colors[1]);
656 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
657 && prog->check_substr /* Could be deleted already */
658 && --BmUSEFUL(prog->check_substr) < 0
659 && prog->check_substr == prog->float_substr)
661 /* If flags & SOMETHING - do not do it many times on the same match */
662 SvREFCNT_dec(prog->check_substr);
663 prog->check_substr = Nullsv; /* disable */
664 prog->float_substr = Nullsv; /* clear */
666 /* XXXX This is a remnant of the old implementation. It
667 looks wasteful, since now INTUIT can use many
669 prog->reganch &= ~RE_USE_INTUIT;
676 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
677 if (prog->regstclass) {
678 /* minlen == 0 is possible if regstclass is \b or \B,
679 and the fixed substr is ''$.
680 Since minlen is already taken into account, s+1 is before strend;
681 accidentally, minlen >= 1 guaranties no false positives at s + 1
682 even for \b or \B. But (minlen? 1 : 0) below assumes that
683 regstclass does not come from lookahead... */
684 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
685 This leaves EXACTF only, which is dealt with in find_byclass(). */
686 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
687 ? STR_LEN(prog->regstclass)
689 char *endpos = (prog->anchored_substr || ml_anch)
690 ? s + (prog->minlen? cl_l : 0)
691 : (prog->float_substr ? check_at - start_shift + cl_l
693 char *startpos = sv && SvPOK(sv) ? strend - SvCUR(sv) : s;
696 if (prog->reganch & ROPT_UTF8) {
697 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
700 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
705 if (endpos == strend) {
706 DEBUG_r( PerlIO_printf(Perl_debug_log,
707 "Could not match STCLASS...\n") );
710 DEBUG_r( PerlIO_printf(Perl_debug_log,
711 "This position contradicts STCLASS...\n") );
712 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
714 /* Contradict one of substrings */
715 if (prog->anchored_substr) {
716 if (prog->anchored_substr == check) {
717 DEBUG_r( what = "anchored" );
719 PL_regeol = strend; /* Used in HOP() */
721 if (s + start_shift + end_shift > strend) {
722 /* XXXX Should be taken into account earlier? */
723 DEBUG_r( PerlIO_printf(Perl_debug_log,
724 "Could not match STCLASS...\n") );
727 DEBUG_r( PerlIO_printf(Perl_debug_log,
728 "Trying %s substr starting at offset %ld...\n",
729 what, (long)(s + start_shift - i_strpos)) );
732 /* Have both, check_string is floating */
733 if (t + start_shift >= check_at) /* Contradicts floating=check */
734 goto retry_floating_check;
735 /* Recheck anchored substring, but not floating... */
737 DEBUG_r( PerlIO_printf(Perl_debug_log,
738 "Trying anchored substr starting at offset %ld...\n",
739 (long)(other_last - i_strpos)) );
740 goto do_other_anchored;
742 /* Another way we could have checked stclass at the
743 current position only: */
746 DEBUG_r( PerlIO_printf(Perl_debug_log,
747 "Trying /^/m starting at offset %ld...\n",
748 (long)(t - i_strpos)) );
751 if (!prog->float_substr) /* Could have been deleted */
753 /* Check is floating subtring. */
754 retry_floating_check:
755 t = check_at - start_shift;
756 DEBUG_r( what = "floating" );
757 goto hop_and_restart;
760 PerlIO_printf(Perl_debug_log,
761 "By STCLASS: moving %ld --> %ld\n",
762 (long)(t - i_strpos), (long)(s - i_strpos));
764 PerlIO_printf(Perl_debug_log,
765 "Does not contradict STCLASS...\n") );
767 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
768 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
771 fail_finish: /* Substring not found */
772 if (prog->check_substr) /* could be removed already */
773 BmUSEFUL(prog->check_substr) += 5; /* hooray */
775 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
776 PL_colors[4],PL_colors[5]));
780 /* We know what class REx starts with. Try to find this position... */
782 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
784 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
790 register I32 tmp = 1; /* Scratch variable? */
792 /* We know what class it must start with. */
796 if (REGINCLASSUTF8(c, (U8*)s)) {
797 if (tmp && (norun || regtry(prog, s)))
809 if (REGINCLASS(c, *(U8*)s)) {
810 if (tmp && (norun || regtry(prog, s)))
830 c2 = PL_fold_locale[c1];
835 e = s; /* Due to minlen logic of intuit() */
836 /* Here it is NOT UTF! */
840 && (ln == 1 || !(OP(c) == EXACTF
842 : ibcmp_locale(s, m, ln)))
843 && (norun || regtry(prog, s)) )
849 if ( (*(U8*)s == c1 || *(U8*)s == c2)
850 && (ln == 1 || !(OP(c) == EXACTF
852 : ibcmp_locale(s, m, ln)))
853 && (norun || regtry(prog, s)) )
860 PL_reg_flags |= RF_tainted;
863 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
864 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
866 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
868 if ((norun || regtry(prog, s)))
873 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
877 PL_reg_flags |= RF_tainted;
880 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
881 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
883 if (tmp == !(OP(c) == BOUNDUTF8 ?
884 swash_fetch(PL_utf8_alnum, (U8*)s) :
885 isALNUM_LC_utf8((U8*)s)))
888 if ((norun || regtry(prog, s)))
893 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
897 PL_reg_flags |= RF_tainted;
900 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
901 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
903 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
905 else if ((norun || regtry(prog, s)))
909 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
913 PL_reg_flags |= RF_tainted;
916 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
917 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
919 if (tmp == !(OP(c) == NBOUNDUTF8 ?
920 swash_fetch(PL_utf8_alnum, (U8*)s) :
921 isALNUM_LC_utf8((U8*)s)))
923 else if ((norun || regtry(prog, s)))
927 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
933 if (tmp && (norun || regtry(prog, s)))
945 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
946 if (tmp && (norun || regtry(prog, s)))
957 PL_reg_flags |= RF_tainted;
959 if (isALNUM_LC(*s)) {
960 if (tmp && (norun || regtry(prog, s)))
971 PL_reg_flags |= RF_tainted;
973 if (isALNUM_LC_utf8((U8*)s)) {
974 if (tmp && (norun || regtry(prog, s)))
987 if (tmp && (norun || regtry(prog, s)))
999 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1000 if (tmp && (norun || regtry(prog, s)))
1011 PL_reg_flags |= RF_tainted;
1012 while (s < strend) {
1013 if (!isALNUM_LC(*s)) {
1014 if (tmp && (norun || regtry(prog, s)))
1025 PL_reg_flags |= RF_tainted;
1026 while (s < strend) {
1027 if (!isALNUM_LC_utf8((U8*)s)) {
1028 if (tmp && (norun || regtry(prog, s)))
1039 while (s < strend) {
1041 if (tmp && (norun || regtry(prog, s)))
1052 while (s < strend) {
1053 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1054 if (tmp && (norun || regtry(prog, s)))
1065 PL_reg_flags |= RF_tainted;
1066 while (s < strend) {
1067 if (isSPACE_LC(*s)) {
1068 if (tmp && (norun || regtry(prog, s)))
1079 PL_reg_flags |= RF_tainted;
1080 while (s < strend) {
1081 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1082 if (tmp && (norun || regtry(prog, s)))
1093 while (s < strend) {
1095 if (tmp && (norun || regtry(prog, s)))
1106 while (s < strend) {
1107 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1108 if (tmp && (norun || regtry(prog, s)))
1119 PL_reg_flags |= RF_tainted;
1120 while (s < strend) {
1121 if (!isSPACE_LC(*s)) {
1122 if (tmp && (norun || regtry(prog, s)))
1133 PL_reg_flags |= RF_tainted;
1134 while (s < strend) {
1135 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1136 if (tmp && (norun || regtry(prog, s)))
1147 while (s < strend) {
1149 if (tmp && (norun || regtry(prog, s)))
1160 while (s < strend) {
1161 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1162 if (tmp && (norun || regtry(prog, s)))
1173 PL_reg_flags |= RF_tainted;
1174 while (s < strend) {
1175 if (isDIGIT_LC(*s)) {
1176 if (tmp && (norun || regtry(prog, s)))
1187 PL_reg_flags |= RF_tainted;
1188 while (s < strend) {
1189 if (isDIGIT_LC_utf8((U8*)s)) {
1190 if (tmp && (norun || regtry(prog, s)))
1201 while (s < strend) {
1203 if (tmp && (norun || regtry(prog, s)))
1214 while (s < strend) {
1215 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1216 if (tmp && (norun || regtry(prog, s)))
1227 PL_reg_flags |= RF_tainted;
1228 while (s < strend) {
1229 if (!isDIGIT_LC(*s)) {
1230 if (tmp && (norun || regtry(prog, s)))
1241 PL_reg_flags |= RF_tainted;
1242 while (s < strend) {
1243 if (!isDIGIT_LC_utf8((U8*)s)) {
1244 if (tmp && (norun || regtry(prog, s)))
1255 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1264 - regexec_flags - match a regexp against a string
1267 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1268 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1269 /* strend: pointer to null at end of string */
1270 /* strbeg: real beginning of string */
1271 /* minend: end of match must be >=minend after stringarg. */
1272 /* data: May be used for some additional optimizations. */
1273 /* nosave: For optimizations. */
1277 register regnode *c;
1278 register char *startpos = stringarg;
1279 I32 minlen; /* must match at least this many chars */
1280 I32 dontbother = 0; /* how many characters not to try at end */
1281 /* I32 start_shift = 0; */ /* Offset of the start to find
1282 constant substr. */ /* CC */
1283 I32 end_shift = 0; /* Same for the end. */ /* CC */
1284 I32 scream_pos = -1; /* Internal iterator of scream. */
1286 SV* oreplsv = GvSV(PL_replgv);
1292 PL_regnarrate = PL_debug & 512;
1295 /* Be paranoid... */
1296 if (prog == NULL || startpos == NULL) {
1297 Perl_croak(aTHX_ "NULL regexp parameter");
1301 minlen = prog->minlen;
1302 if (strend - startpos < minlen) goto phooey;
1304 if (startpos == strbeg) /* is ^ valid at stringarg? */
1307 PL_regprev = (U32)stringarg[-1];
1308 if (!PL_multiline && PL_regprev == '\n')
1309 PL_regprev = '\0'; /* force ^ to NOT match */
1312 /* Check validity of program. */
1313 if (UCHARAT(prog->program) != REG_MAGIC) {
1314 Perl_croak(aTHX_ "corrupted regexp program");
1318 PL_reg_eval_set = 0;
1321 if (prog->reganch & ROPT_UTF8)
1322 PL_reg_flags |= RF_utf8;
1324 /* Mark beginning of line for ^ and lookbehind. */
1325 PL_regbol = startpos;
1329 /* Mark end of line for $ (and such) */
1332 /* see how far we have to get to not match where we matched before */
1333 PL_regtill = startpos+minend;
1335 /* We start without call_cc context. */
1338 /* If there is a "must appear" string, look for it. */
1341 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1344 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1345 PL_reg_ganch = startpos;
1346 else if (sv && SvTYPE(sv) >= SVt_PVMG
1348 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1349 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1350 if (prog->reganch & ROPT_ANCH_GPOS) {
1351 if (s > PL_reg_ganch)
1356 else /* pos() not defined */
1357 PL_reg_ganch = strbeg;
1360 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1361 re_scream_pos_data d;
1363 d.scream_olds = &scream_olds;
1364 d.scream_pos = &scream_pos;
1365 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1367 goto phooey; /* not present */
1370 DEBUG_r( if (!PL_colorset) reginitcolors() );
1371 DEBUG_r(PerlIO_printf(Perl_debug_log,
1372 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1373 PL_colors[4],PL_colors[5],PL_colors[0],
1376 (strlen(prog->precomp) > 60 ? "..." : ""),
1378 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1379 startpos, PL_colors[1],
1380 (strend - startpos > 60 ? "..." : ""))
1383 /* Simplest case: anchored match need be tried only once. */
1384 /* [unless only anchor is BOL and multiline is set] */
1385 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1386 if (s == startpos && regtry(prog, startpos))
1388 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1389 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1394 dontbother = minlen - 1;
1395 end = HOPc(strend, -dontbother) - 1;
1396 /* for multiline we only have to try after newlines */
1397 if (prog->check_substr) {
1401 if (regtry(prog, s))
1406 if (prog->reganch & RE_USE_INTUIT) {
1407 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1418 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1419 if (regtry(prog, s))
1426 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1427 if (regtry(prog, PL_reg_ganch))
1432 /* Messy cases: unanchored match. */
1433 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1434 /* we have /x+whatever/ */
1435 /* it must be a one character string (XXXX Except UTF?) */
1436 char ch = SvPVX(prog->anchored_substr)[0];
1442 while (s < strend) {
1444 DEBUG_r( did_match = 1 );
1445 if (regtry(prog, s)) goto got_it;
1447 while (s < strend && *s == ch)
1454 while (s < strend) {
1456 DEBUG_r( did_match = 1 );
1457 if (regtry(prog, s)) goto got_it;
1459 while (s < strend && *s == ch)
1465 DEBUG_r(did_match ||
1466 PerlIO_printf(Perl_debug_log,
1467 "Did not find anchored character...\n"));
1470 else if (prog->anchored_substr != Nullsv
1471 || (prog->float_substr != Nullsv
1472 && prog->float_max_offset < strend - s)) {
1473 SV *must = prog->anchored_substr
1474 ? prog->anchored_substr : prog->float_substr;
1476 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1478 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1479 char *last = HOPc(strend, /* Cannot start after this */
1480 -(I32)(CHR_SVLEN(must)
1481 - (SvTAIL(must) != 0) + back_min));
1482 char *last1; /* Last position checked before */
1488 last1 = HOPc(s, -1);
1490 last1 = s - 1; /* bogus */
1492 /* XXXX check_substr already used to find `s', can optimize if
1493 check_substr==must. */
1495 dontbother = end_shift;
1496 strend = HOPc(strend, -dontbother);
1497 while ( (s <= last) &&
1498 ((flags & REXEC_SCREAM)
1499 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1500 end_shift, &scream_pos, 0))
1501 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1502 (unsigned char*)strend, must,
1503 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1504 DEBUG_r( did_match = 1 );
1505 if (HOPc(s, -back_max) > last1) {
1506 last1 = HOPc(s, -back_min);
1507 s = HOPc(s, -back_max);
1510 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1512 last1 = HOPc(s, -back_min);
1516 while (s <= last1) {
1517 if (regtry(prog, s))
1523 while (s <= last1) {
1524 if (regtry(prog, s))
1530 DEBUG_r(did_match ||
1531 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1532 ((must == prog->anchored_substr)
1533 ? "anchored" : "floating"),
1535 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1537 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1540 else if ((c = prog->regstclass)) {
1541 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1542 /* don't bother with what can't match */
1543 strend = HOPc(strend, -(minlen - 1));
1544 if (find_byclass(prog, c, s, strend, startpos, 0))
1546 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1550 if (prog->float_substr != Nullsv) { /* Trim the end. */
1553 if (flags & REXEC_SCREAM) {
1554 last = screaminstr(sv, prog->float_substr, s - strbeg,
1555 end_shift, &scream_pos, 1); /* last one */
1557 last = scream_olds; /* Only one occurence. */
1561 char *little = SvPV(prog->float_substr, len);
1563 if (SvTAIL(prog->float_substr)) {
1564 if (memEQ(strend - len + 1, little, len - 1))
1565 last = strend - len + 1;
1566 else if (!PL_multiline)
1567 last = memEQ(strend - len, little, len)
1568 ? strend - len : Nullch;
1574 last = rninstr(s, strend, little, little + len);
1576 last = strend; /* matching `$' */
1580 DEBUG_r(PerlIO_printf(Perl_debug_log,
1581 "%sCan't trim the tail, match fails (should not happen)%s\n",
1582 PL_colors[4],PL_colors[5]));
1583 goto phooey; /* Should not happen! */
1585 dontbother = strend - last + prog->float_min_offset;
1587 if (minlen && (dontbother < minlen))
1588 dontbother = minlen - 1;
1589 strend -= dontbother; /* this one's always in bytes! */
1590 /* We don't know much -- general case. */
1593 if (regtry(prog, s))
1602 if (regtry(prog, s))
1604 } while (s++ < strend);
1612 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1614 if (PL_reg_eval_set) {
1615 /* Preserve the current value of $^R */
1616 if (oreplsv != GvSV(PL_replgv))
1617 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1618 restored, the value remains
1620 restore_pos(aTHXo_ 0);
1623 /* make sure $`, $&, $', and $digit will work later */
1624 if ( !(flags & REXEC_NOT_FIRST) ) {
1625 if (RX_MATCH_COPIED(prog)) {
1626 Safefree(prog->subbeg);
1627 RX_MATCH_COPIED_off(prog);
1629 if (flags & REXEC_COPY_STR) {
1630 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1632 s = savepvn(strbeg, i);
1635 RX_MATCH_COPIED_on(prog);
1638 prog->subbeg = strbeg;
1639 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1646 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1647 PL_colors[4],PL_colors[5]));
1648 if (PL_reg_eval_set)
1649 restore_pos(aTHXo_ 0);
1654 - regtry - try match at specific point
1656 STATIC I32 /* 0 failure, 1 success */
1657 S_regtry(pTHX_ regexp *prog, char *startpos)
1665 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1668 PL_reg_eval_set = RS_init;
1670 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1671 (IV)(PL_stack_sp - PL_stack_base));
1673 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1674 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1675 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1677 /* Apparently this is not needed, judging by wantarray. */
1678 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1679 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1682 /* Make $_ available to executed code. */
1683 if (PL_reg_sv != DEFSV) {
1684 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1689 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1690 && (mg = mg_find(PL_reg_sv, 'g')))) {
1691 /* prepare for quick setting of pos */
1692 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1693 mg = mg_find(PL_reg_sv, 'g');
1697 PL_reg_oldpos = mg->mg_len;
1698 SAVEDESTRUCTOR_X(restore_pos, 0);
1701 Newz(22,PL_reg_curpm, 1, PMOP);
1702 PL_reg_curpm->op_pmregexp = prog;
1703 PL_reg_oldcurpm = PL_curpm;
1704 PL_curpm = PL_reg_curpm;
1705 if (RX_MATCH_COPIED(prog)) {
1706 /* Here is a serious problem: we cannot rewrite subbeg,
1707 since it may be needed if this match fails. Thus
1708 $` inside (?{}) could fail... */
1709 PL_reg_oldsaved = prog->subbeg;
1710 PL_reg_oldsavedlen = prog->sublen;
1711 RX_MATCH_COPIED_off(prog);
1714 PL_reg_oldsaved = Nullch;
1715 prog->subbeg = PL_bostr;
1716 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1718 prog->startp[0] = startpos - PL_bostr;
1719 PL_reginput = startpos;
1720 PL_regstartp = prog->startp;
1721 PL_regendp = prog->endp;
1722 PL_reglastparen = &prog->lastparen;
1723 prog->lastparen = 0;
1725 DEBUG_r(PL_reg_starttry = startpos);
1726 if (PL_reg_start_tmpl <= prog->nparens) {
1727 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1728 if(PL_reg_start_tmp)
1729 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1731 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1734 /* XXXX What this code is doing here?!!! There should be no need
1735 to do this again and again, PL_reglastparen should take care of
1739 if (prog->nparens) {
1740 for (i = prog->nparens; i >= 1; i--) {
1746 if (regmatch(prog->program + 1)) {
1747 prog->endp[0] = PL_reginput - PL_bostr;
1755 - regmatch - main matching routine
1757 * Conceptually the strategy is simple: check to see whether the current
1758 * node matches, call self recursively to see whether the rest matches,
1759 * and then act accordingly. In practice we make some effort to avoid
1760 * recursion, in particular by going through "ordinary" nodes (that don't
1761 * need to know whether the rest of the match failed) by a loop instead of
1764 /* [lwall] I've hoisted the register declarations to the outer block in order to
1765 * maybe save a little bit of pushing and popping on the stack. It also takes
1766 * advantage of machines that use a register save mask on subroutine entry.
1768 STATIC I32 /* 0 failure, 1 success */
1769 S_regmatch(pTHX_ regnode *prog)
1772 register regnode *scan; /* Current node. */
1773 regnode *next; /* Next node. */
1774 regnode *inner; /* Next node in internal branch. */
1775 register I32 nextchr; /* renamed nextchr - nextchar colides with
1776 function of same name */
1777 register I32 n; /* no or next */
1778 register I32 ln; /* len or last */
1779 register char *s; /* operand or save */
1780 register char *locinput = PL_reginput;
1781 register I32 c1, c2, paren; /* case fold search, parenth */
1782 int minmod = 0, sw = 0, logical = 0;
1787 /* Note that nextchr is a byte even in UTF */
1788 nextchr = UCHARAT(locinput);
1790 while (scan != NULL) {
1791 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1793 # define sayYES goto yes
1794 # define sayNO goto no
1795 # define sayYES_FINAL goto yes_final
1796 # define sayYES_LOUD goto yes_loud
1797 # define sayNO_FINAL goto no_final
1798 # define sayNO_SILENT goto do_no
1799 # define saySAME(x) if (x) goto yes; else goto no
1800 # define REPORT_CODE_OFF 24
1802 # define sayYES return 1
1803 # define sayNO return 0
1804 # define sayYES_FINAL return 1
1805 # define sayYES_LOUD return 1
1806 # define sayNO_FINAL return 0
1807 # define sayNO_SILENT return 0
1808 # define saySAME(x) return x
1811 SV *prop = sv_newmortal();
1812 int docolor = *PL_colors[0];
1813 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1814 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1815 /* The part of the string before starttry has one color
1816 (pref0_len chars), between starttry and current
1817 position another one (pref_len - pref0_len chars),
1818 after the current position the third one.
1819 We assume that pref0_len <= pref_len, otherwise we
1820 decrease pref0_len. */
1821 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1822 ? (5 + taill) - l : locinput - PL_bostr);
1823 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1825 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1826 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1827 ? (5 + taill) - pref_len : PL_regeol - locinput);
1830 if (pref0_len > pref_len)
1831 pref0_len = pref_len;
1832 regprop(prop, scan);
1833 PerlIO_printf(Perl_debug_log,
1834 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1835 (IV)(locinput - PL_bostr),
1836 PL_colors[4], pref0_len,
1837 locinput - pref_len, PL_colors[5],
1838 PL_colors[2], pref_len - pref0_len,
1839 locinput - pref_len + pref0_len, PL_colors[3],
1840 (docolor ? "" : "> <"),
1841 PL_colors[0], l, locinput, PL_colors[1],
1842 15 - l - pref_len + 1,
1844 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1848 next = scan + NEXT_OFF(scan);
1854 if (locinput == PL_bostr
1855 ? PL_regprev == '\n'
1857 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1859 /* regtill = regbol; */
1864 if (locinput == PL_bostr
1865 ? PL_regprev == '\n'
1866 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1872 if (locinput == PL_regbol && PL_regprev == '\n')
1876 if (locinput == PL_reg_ganch)
1886 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1891 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1893 if (PL_regeol - locinput > 1)
1897 if (PL_regeol != locinput)
1901 if (nextchr & 0x80) {
1902 locinput += PL_utf8skip[nextchr];
1903 if (locinput > PL_regeol)
1905 nextchr = UCHARAT(locinput);
1908 if (!nextchr && locinput >= PL_regeol)
1910 nextchr = UCHARAT(++locinput);
1913 if (!nextchr && locinput >= PL_regeol)
1915 nextchr = UCHARAT(++locinput);
1918 if (nextchr & 0x80) {
1919 locinput += PL_utf8skip[nextchr];
1920 if (locinput > PL_regeol)
1922 nextchr = UCHARAT(locinput);
1925 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1927 nextchr = UCHARAT(++locinput);
1930 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1932 nextchr = UCHARAT(++locinput);
1937 /* Inline the first character, for speed. */
1938 if (UCHARAT(s) != nextchr)
1940 if (PL_regeol - locinput < ln)
1942 if (ln > 1 && memNE(s, locinput, ln))
1945 nextchr = UCHARAT(locinput);
1948 PL_reg_flags |= RF_tainted;
1957 c1 = OP(scan) == EXACTF;
1961 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1962 toLOWER_utf8((U8*)l) :
1963 toLOWER_LC_utf8((U8*)l)))
1971 nextchr = UCHARAT(locinput);
1975 /* Inline the first character, for speed. */
1976 if (UCHARAT(s) != nextchr &&
1977 UCHARAT(s) != ((OP(scan) == EXACTF)
1978 ? PL_fold : PL_fold_locale)[nextchr])
1980 if (PL_regeol - locinput < ln)
1982 if (ln > 1 && (OP(scan) == EXACTF
1983 ? ibcmp(s, locinput, ln)
1984 : ibcmp_locale(s, locinput, ln)))
1987 nextchr = UCHARAT(locinput);
1990 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1992 if (locinput >= PL_regeol)
1994 locinput += PL_utf8skip[nextchr];
1995 nextchr = UCHARAT(locinput);
1999 nextchr = UCHARAT(locinput);
2000 if (!REGINCLASS(scan, nextchr))
2002 if (!nextchr && locinput >= PL_regeol)
2004 nextchr = UCHARAT(++locinput);
2007 PL_reg_flags |= RF_tainted;
2012 if (!(OP(scan) == ALNUM
2013 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2015 nextchr = UCHARAT(++locinput);
2018 PL_reg_flags |= RF_tainted;
2023 if (nextchr & 0x80) {
2024 if (!(OP(scan) == ALNUMUTF8
2025 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2026 : isALNUM_LC_utf8((U8*)locinput)))
2030 locinput += PL_utf8skip[nextchr];
2031 nextchr = UCHARAT(locinput);
2034 if (!(OP(scan) == ALNUMUTF8
2035 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2037 nextchr = UCHARAT(++locinput);
2040 PL_reg_flags |= RF_tainted;
2043 if (!nextchr && locinput >= PL_regeol)
2045 if (OP(scan) == NALNUM
2046 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2048 nextchr = UCHARAT(++locinput);
2051 PL_reg_flags |= RF_tainted;
2054 if (!nextchr && locinput >= PL_regeol)
2056 if (nextchr & 0x80) {
2057 if (OP(scan) == NALNUMUTF8
2058 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2059 : isALNUM_LC_utf8((U8*)locinput))
2063 locinput += PL_utf8skip[nextchr];
2064 nextchr = UCHARAT(locinput);
2067 if (OP(scan) == NALNUMUTF8
2068 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2070 nextchr = UCHARAT(++locinput);
2074 PL_reg_flags |= RF_tainted;
2078 /* was last char in word? */
2079 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2080 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2082 n = isALNUM(nextchr);
2085 ln = isALNUM_LC(ln);
2086 n = isALNUM_LC(nextchr);
2088 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2093 PL_reg_flags |= RF_tainted;
2097 /* was last char in word? */
2098 ln = (locinput != PL_regbol)
2099 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2100 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2101 ln = isALNUM_uni(ln);
2102 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2105 ln = isALNUM_LC_uni(ln);
2106 n = isALNUM_LC_utf8((U8*)locinput);
2108 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2112 PL_reg_flags |= RF_tainted;
2117 if (!(OP(scan) == SPACE
2118 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2120 nextchr = UCHARAT(++locinput);
2123 PL_reg_flags |= RF_tainted;
2128 if (nextchr & 0x80) {
2129 if (!(OP(scan) == SPACEUTF8
2130 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2131 : isSPACE_LC_utf8((U8*)locinput)))
2135 locinput += PL_utf8skip[nextchr];
2136 nextchr = UCHARAT(locinput);
2139 if (!(OP(scan) == SPACEUTF8
2140 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2142 nextchr = UCHARAT(++locinput);
2145 PL_reg_flags |= RF_tainted;
2148 if (!nextchr && locinput >= PL_regeol)
2150 if (OP(scan) == NSPACE
2151 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2153 nextchr = UCHARAT(++locinput);
2156 PL_reg_flags |= RF_tainted;
2159 if (!nextchr && locinput >= PL_regeol)
2161 if (nextchr & 0x80) {
2162 if (OP(scan) == NSPACEUTF8
2163 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2164 : isSPACE_LC_utf8((U8*)locinput))
2168 locinput += PL_utf8skip[nextchr];
2169 nextchr = UCHARAT(locinput);
2172 if (OP(scan) == NSPACEUTF8
2173 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2175 nextchr = UCHARAT(++locinput);
2178 PL_reg_flags |= RF_tainted;
2183 if (!(OP(scan) == DIGIT
2184 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2186 nextchr = UCHARAT(++locinput);
2189 PL_reg_flags |= RF_tainted;
2194 if (nextchr & 0x80) {
2195 if (!(OP(scan) == DIGITUTF8
2196 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2197 : isDIGIT_LC_utf8((U8*)locinput)))
2201 locinput += PL_utf8skip[nextchr];
2202 nextchr = UCHARAT(locinput);
2205 if (!(OP(scan) == DIGITUTF8
2206 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2208 nextchr = UCHARAT(++locinput);
2211 PL_reg_flags |= RF_tainted;
2214 if (!nextchr && locinput >= PL_regeol)
2216 if (OP(scan) == NDIGIT
2217 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2219 nextchr = UCHARAT(++locinput);
2222 PL_reg_flags |= RF_tainted;
2225 if (!nextchr && locinput >= PL_regeol)
2227 if (nextchr & 0x80) {
2228 if (OP(scan) == NDIGITUTF8
2229 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2230 : isDIGIT_LC_utf8((U8*)locinput))
2234 locinput += PL_utf8skip[nextchr];
2235 nextchr = UCHARAT(locinput);
2238 if (OP(scan) == NDIGITUTF8
2239 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2241 nextchr = UCHARAT(++locinput);
2244 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2246 locinput += PL_utf8skip[nextchr];
2247 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2248 locinput += UTF8SKIP(locinput);
2249 if (locinput > PL_regeol)
2251 nextchr = UCHARAT(locinput);
2254 PL_reg_flags |= RF_tainted;
2258 n = ARG(scan); /* which paren pair */
2259 ln = PL_regstartp[n];
2260 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2261 if (*PL_reglastparen < n || ln == -1)
2262 sayNO; /* Do not match unless seen CLOSEn. */
2263 if (ln == PL_regendp[n])
2267 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2269 char *e = PL_bostr + PL_regendp[n];
2271 * Note that we can't do the "other character" lookup trick as
2272 * in the 8-bit case (no pun intended) because in Unicode we
2273 * have to map both upper and title case to lower case.
2275 if (OP(scan) == REFF) {
2279 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2289 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2296 nextchr = UCHARAT(locinput);
2300 /* Inline the first character, for speed. */
2301 if (UCHARAT(s) != nextchr &&
2303 (UCHARAT(s) != ((OP(scan) == REFF
2304 ? PL_fold : PL_fold_locale)[nextchr]))))
2306 ln = PL_regendp[n] - ln;
2307 if (locinput + ln > PL_regeol)
2309 if (ln > 1 && (OP(scan) == REF
2310 ? memNE(s, locinput, ln)
2312 ? ibcmp(s, locinput, ln)
2313 : ibcmp_locale(s, locinput, ln))))
2316 nextchr = UCHARAT(locinput);
2327 OP_4tree *oop = PL_op;
2328 COP *ocurcop = PL_curcop;
2329 SV **ocurpad = PL_curpad;
2333 PL_op = (OP_4tree*)PL_regdata->data[n];
2334 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2335 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2336 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2338 CALLRUNOPS(aTHX); /* Scalar context. */
2344 PL_curpad = ocurpad;
2345 PL_curcop = ocurcop;
2347 if (logical == 2) { /* Postponed subexpression. */
2349 MAGIC *mg = Null(MAGIC*);
2351 CHECKPOINT cp, lastcp;
2353 if(SvROK(ret) || SvRMAGICAL(ret)) {
2354 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2357 mg = mg_find(sv, 'r');
2360 re = (regexp *)mg->mg_obj;
2361 (void)ReREFCNT_inc(re);
2365 char *t = SvPV(ret, len);
2367 char *oprecomp = PL_regprecomp;
2368 I32 osize = PL_regsize;
2369 I32 onpar = PL_regnpar;
2372 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2373 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2375 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2376 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2377 PL_regprecomp = oprecomp;
2382 PerlIO_printf(Perl_debug_log,
2383 "Entering embedded `%s%.60s%s%s'\n",
2387 (strlen(re->precomp) > 60 ? "..." : ""))
2390 state.prev = PL_reg_call_cc;
2391 state.cc = PL_regcc;
2392 state.re = PL_reg_re;
2396 cp = regcppush(0); /* Save *all* the positions. */
2399 state.ss = PL_savestack_ix;
2400 *PL_reglastparen = 0;
2401 PL_reg_call_cc = &state;
2402 PL_reginput = locinput;
2404 /* XXXX This is too dramatic a measure... */
2407 if (regmatch(re->program + 1)) {
2408 /* Even though we succeeded, we need to restore
2409 global variables, since we may be wrapped inside
2410 SUSPEND, thus the match may be not finished yet. */
2412 /* XXXX Do this only if SUSPENDed? */
2413 PL_reg_call_cc = state.prev;
2414 PL_regcc = state.cc;
2415 PL_reg_re = state.re;
2416 cache_re(PL_reg_re);
2418 /* XXXX This is too dramatic a measure... */
2421 /* These are needed even if not SUSPEND. */
2429 PL_reg_call_cc = state.prev;
2430 PL_regcc = state.cc;
2431 PL_reg_re = state.re;
2432 cache_re(PL_reg_re);
2434 /* XXXX This is too dramatic a measure... */
2443 sv_setsv(save_scalar(PL_replgv), ret);
2447 n = ARG(scan); /* which paren pair */
2448 PL_reg_start_tmp[n] = locinput;
2453 n = ARG(scan); /* which paren pair */
2454 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2455 PL_regendp[n] = locinput - PL_bostr;
2456 if (n > *PL_reglastparen)
2457 *PL_reglastparen = n;
2460 n = ARG(scan); /* which paren pair */
2461 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2464 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2466 next = NEXTOPER(NEXTOPER(scan));
2468 next = scan + ARG(scan);
2469 if (OP(next) == IFTHEN) /* Fake one. */
2470 next = NEXTOPER(NEXTOPER(next));
2474 logical = scan->flags;
2476 /*******************************************************************
2477 PL_regcc contains infoblock about the innermost (...)* loop, and
2478 a pointer to the next outer infoblock.
2480 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2482 1) After matching X, regnode for CURLYX is processed;
2484 2) This regnode creates infoblock on the stack, and calls
2485 regmatch() recursively with the starting point at WHILEM node;
2487 3) Each hit of WHILEM node tries to match A and Z (in the order
2488 depending on the current iteration, min/max of {min,max} and
2489 greediness). The information about where are nodes for "A"
2490 and "Z" is read from the infoblock, as is info on how many times "A"
2491 was already matched, and greediness.
2493 4) After A matches, the same WHILEM node is hit again.
2495 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2496 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2497 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2498 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2499 of the external loop.
2501 Currently present infoblocks form a tree with a stem formed by PL_curcc
2502 and whatever it mentions via ->next, and additional attached trees
2503 corresponding to temporarily unset infoblocks as in "5" above.
2505 In the following picture infoblocks for outer loop of
2506 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2507 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2508 infoblocks are drawn below the "reset" infoblock.
2510 In fact in the picture below we do not show failed matches for Z and T
2511 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2512 more obvious *why* one needs to *temporary* unset infoblocks.]
2514 Matched REx position InfoBlocks Comment
2518 Y A)*?Z)*?T x <- O <- I
2519 YA )*?Z)*?T x <- O <- I
2520 YA A)*?Z)*?T x <- O <- I
2521 YAA )*?Z)*?T x <- O <- I
2522 YAA Z)*?T x <- O # Temporary unset I
2525 YAAZ Y(A)*?Z)*?T x <- O
2528 YAAZY (A)*?Z)*?T x <- O
2531 YAAZY A)*?Z)*?T x <- O <- I
2534 YAAZYA )*?Z)*?T x <- O <- I
2537 YAAZYA Z)*?T x <- O # Temporary unset I
2543 YAAZYAZ T x # Temporary unset O
2550 *******************************************************************/
2553 CHECKPOINT cp = PL_savestack_ix;
2555 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2557 cc.oldcc = PL_regcc;
2559 cc.parenfloor = *PL_reglastparen;
2561 cc.min = ARG1(scan);
2562 cc.max = ARG2(scan);
2563 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2567 PL_reginput = locinput;
2568 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2570 PL_regcc = cc.oldcc;
2576 * This is really hard to understand, because after we match
2577 * what we're trying to match, we must make sure the rest of
2578 * the REx is going to match for sure, and to do that we have
2579 * to go back UP the parse tree by recursing ever deeper. And
2580 * if it fails, we have to reset our parent's current state
2581 * that we can try again after backing off.
2584 CHECKPOINT cp, lastcp;
2585 CURCUR* cc = PL_regcc;
2586 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2588 n = cc->cur + 1; /* how many we know we matched */
2589 PL_reginput = locinput;
2592 PerlIO_printf(Perl_debug_log,
2593 "%*s %ld out of %ld..%ld cc=%lx\n",
2594 REPORT_CODE_OFF+PL_regindent*2, "",
2595 (long)n, (long)cc->min,
2596 (long)cc->max, (long)cc)
2599 /* If degenerate scan matches "", assume scan done. */
2601 if (locinput == cc->lastloc && n >= cc->min) {
2602 PL_regcc = cc->oldcc;
2606 PerlIO_printf(Perl_debug_log,
2607 "%*s empty match detected, try continuation...\n",
2608 REPORT_CODE_OFF+PL_regindent*2, "")
2610 if (regmatch(cc->next))
2618 /* First just match a string of min scans. */
2622 cc->lastloc = locinput;
2623 if (regmatch(cc->scan))
2626 cc->lastloc = lastloc;
2631 /* Check whether we already were at this position.
2632 Postpone detection until we know the match is not
2633 *that* much linear. */
2634 if (!PL_reg_maxiter) {
2635 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2636 PL_reg_leftiter = PL_reg_maxiter;
2638 if (PL_reg_leftiter-- == 0) {
2639 I32 size = (PL_reg_maxiter + 7)/8;
2640 if (PL_reg_poscache) {
2641 if (PL_reg_poscache_size < size) {
2642 Renew(PL_reg_poscache, size, char);
2643 PL_reg_poscache_size = size;
2645 Zero(PL_reg_poscache, size, char);
2648 PL_reg_poscache_size = size;
2649 Newz(29, PL_reg_poscache, size, char);
2652 PerlIO_printf(Perl_debug_log,
2653 "%sDetected a super-linear match, switching on caching%s...\n",
2654 PL_colors[4], PL_colors[5])
2657 if (PL_reg_leftiter < 0) {
2658 I32 o = locinput - PL_bostr, b;
2660 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2663 if (PL_reg_poscache[o] & (1<<b)) {
2665 PerlIO_printf(Perl_debug_log,
2666 "%*s already tried at this position...\n",
2667 REPORT_CODE_OFF+PL_regindent*2, "")
2671 PL_reg_poscache[o] |= (1<<b);
2675 /* Prefer next over scan for minimal matching. */
2678 PL_regcc = cc->oldcc;
2681 cp = regcppush(cc->parenfloor);
2683 if (regmatch(cc->next)) {
2685 sayYES; /* All done. */
2693 if (n >= cc->max) { /* Maximum greed exceeded? */
2694 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2695 && !(PL_reg_flags & RF_warned)) {
2696 PL_reg_flags |= RF_warned;
2697 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2698 "Complex regular subexpression recursion",
2705 PerlIO_printf(Perl_debug_log,
2706 "%*s trying longer...\n",
2707 REPORT_CODE_OFF+PL_regindent*2, "")
2709 /* Try scanning more and see if it helps. */
2710 PL_reginput = locinput;
2712 cc->lastloc = locinput;
2713 cp = regcppush(cc->parenfloor);
2715 if (regmatch(cc->scan)) {
2722 cc->lastloc = lastloc;
2726 /* Prefer scan over next for maximal matching. */
2728 if (n < cc->max) { /* More greed allowed? */
2729 cp = regcppush(cc->parenfloor);
2731 cc->lastloc = locinput;
2733 if (regmatch(cc->scan)) {
2738 regcppop(); /* Restore some previous $<digit>s? */
2739 PL_reginput = locinput;
2741 PerlIO_printf(Perl_debug_log,
2742 "%*s failed, try continuation...\n",
2743 REPORT_CODE_OFF+PL_regindent*2, "")
2746 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2747 && !(PL_reg_flags & RF_warned)) {
2748 PL_reg_flags |= RF_warned;
2749 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2750 "Complex regular subexpression recursion",
2754 /* Failed deeper matches of scan, so see if this one works. */
2755 PL_regcc = cc->oldcc;
2758 if (regmatch(cc->next))
2764 cc->lastloc = lastloc;
2769 next = scan + ARG(scan);
2772 inner = NEXTOPER(NEXTOPER(scan));
2775 inner = NEXTOPER(scan);
2780 if (OP(next) != c1) /* No choice. */
2781 next = inner; /* Avoid recursion. */
2783 int lastparen = *PL_reglastparen;
2787 PL_reginput = locinput;
2788 if (regmatch(inner))
2791 for (n = *PL_reglastparen; n > lastparen; n--)
2793 *PL_reglastparen = n;
2796 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2800 inner = NEXTOPER(scan);
2801 if (c1 == BRANCHJ) {
2802 inner = NEXTOPER(inner);
2804 } while (scan != NULL && OP(scan) == c1);
2818 /* We suppose that the next guy does not need
2819 backtracking: in particular, it is of constant length,
2820 and has no parenths to influence future backrefs. */
2821 ln = ARG1(scan); /* min to match */
2822 n = ARG2(scan); /* max to match */
2823 paren = scan->flags;
2825 if (paren > PL_regsize)
2827 if (paren > *PL_reglastparen)
2828 *PL_reglastparen = paren;
2830 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2832 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2833 PL_reginput = locinput;
2836 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2838 if (ln && l == 0 && n >= ln
2839 /* In fact, this is tricky. If paren, then the
2840 fact that we did/didnot match may influence
2841 future execution. */
2842 && !(paren && ln == 0))
2844 locinput = PL_reginput;
2845 if (PL_regkind[(U8)OP(next)] == EXACT) {
2846 c1 = (U8)*STRING(next);
2847 if (OP(next) == EXACTF)
2849 else if (OP(next) == EXACTFL)
2850 c2 = PL_fold_locale[c1];
2857 /* This may be improved if l == 0. */
2858 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2859 /* If it could work, try it. */
2861 UCHARAT(PL_reginput) == c1 ||
2862 UCHARAT(PL_reginput) == c2)
2866 PL_regstartp[paren] =
2867 HOPc(PL_reginput, -l) - PL_bostr;
2868 PL_regendp[paren] = PL_reginput - PL_bostr;
2871 PL_regendp[paren] = -1;
2877 /* Couldn't or didn't -- move forward. */
2878 PL_reginput = locinput;
2879 if (regrepeat_hard(scan, 1, &l)) {
2881 locinput = PL_reginput;
2888 n = regrepeat_hard(scan, n, &l);
2889 if (n != 0 && l == 0
2890 /* In fact, this is tricky. If paren, then the
2891 fact that we did/didnot match may influence
2892 future execution. */
2893 && !(paren && ln == 0))
2895 locinput = PL_reginput;
2897 PerlIO_printf(Perl_debug_log,
2898 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2899 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2903 if (PL_regkind[(U8)OP(next)] == EXACT) {
2904 c1 = (U8)*STRING(next);
2905 if (OP(next) == EXACTF)
2907 else if (OP(next) == EXACTFL)
2908 c2 = PL_fold_locale[c1];
2917 /* If it could work, try it. */
2919 UCHARAT(PL_reginput) == c1 ||
2920 UCHARAT(PL_reginput) == c2)
2923 PerlIO_printf(Perl_debug_log,
2924 "%*s trying tail with n=%"IVdf"...\n",
2925 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2929 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2930 PL_regendp[paren] = PL_reginput - PL_bostr;
2933 PL_regendp[paren] = -1;
2939 /* Couldn't or didn't -- back up. */
2941 locinput = HOPc(locinput, -l);
2942 PL_reginput = locinput;
2949 paren = scan->flags; /* Which paren to set */
2950 if (paren > PL_regsize)
2952 if (paren > *PL_reglastparen)
2953 *PL_reglastparen = paren;
2954 ln = ARG1(scan); /* min to match */
2955 n = ARG2(scan); /* max to match */
2956 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2960 ln = ARG1(scan); /* min to match */
2961 n = ARG2(scan); /* max to match */
2962 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2967 scan = NEXTOPER(scan);
2973 scan = NEXTOPER(scan);
2977 * Lookahead to avoid useless match attempts
2978 * when we know what character comes next.
2980 if (PL_regkind[(U8)OP(next)] == EXACT) {
2981 c1 = (U8)*STRING(next);
2982 if (OP(next) == EXACTF)
2984 else if (OP(next) == EXACTFL)
2985 c2 = PL_fold_locale[c1];
2991 PL_reginput = locinput;
2995 if (ln && regrepeat(scan, ln) < ln)
2997 locinput = PL_reginput;
3000 char *e = locinput + n - ln; /* Should not check after this */
3001 char *old = locinput;
3003 if (e >= PL_regeol || (n == REG_INFTY))
3006 /* Find place 'next' could work */
3008 while (locinput <= e && *locinput != c1)
3011 while (locinput <= e
3018 /* PL_reginput == old now */
3019 if (locinput != old) {
3020 ln = 1; /* Did some */
3021 if (regrepeat(scan, locinput - old) <
3025 /* PL_reginput == locinput now */
3028 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
3029 PL_regendp[paren] = locinput - PL_bostr;
3032 PL_regendp[paren] = -1;
3036 PL_reginput = locinput; /* Could be reset... */
3038 /* Couldn't or didn't -- move forward. */
3043 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3044 /* If it could work, try it. */
3046 UCHARAT(PL_reginput) == c1 ||
3047 UCHARAT(PL_reginput) == c2)
3051 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3052 PL_regendp[paren] = PL_reginput - PL_bostr;
3055 PL_regendp[paren] = -1;
3061 /* Couldn't or didn't -- move forward. */
3062 PL_reginput = locinput;
3063 if (regrepeat(scan, 1)) {
3065 locinput = PL_reginput;
3073 n = regrepeat(scan, n);
3074 locinput = PL_reginput;
3075 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3076 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3077 ln = n; /* why back off? */
3078 /* ...because $ and \Z can match before *and* after
3079 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3080 We should back off by one in this case. */
3081 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3087 /* If it could work, try it. */
3089 UCHARAT(PL_reginput) == c1 ||
3090 UCHARAT(PL_reginput) == c2)
3094 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3095 PL_regendp[paren] = PL_reginput - PL_bostr;
3098 PL_regendp[paren] = -1;
3104 /* Couldn't or didn't -- back up. */
3106 PL_reginput = locinput = HOPc(locinput, -1);
3111 /* If it could work, try it. */
3113 UCHARAT(PL_reginput) == c1 ||
3114 UCHARAT(PL_reginput) == c2)
3120 /* Couldn't or didn't -- back up. */
3122 PL_reginput = locinput = HOPc(locinput, -1);
3129 if (PL_reg_call_cc) {
3130 re_cc_state *cur_call_cc = PL_reg_call_cc;
3131 CURCUR *cctmp = PL_regcc;
3132 regexp *re = PL_reg_re;
3133 CHECKPOINT cp, lastcp;
3135 cp = regcppush(0); /* Save *all* the positions. */
3137 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3139 PL_reginput = locinput; /* Make position available to
3141 cache_re(PL_reg_call_cc->re);
3142 PL_regcc = PL_reg_call_cc->cc;
3143 PL_reg_call_cc = PL_reg_call_cc->prev;
3144 if (regmatch(cur_call_cc->node)) {
3145 PL_reg_call_cc = cur_call_cc;
3151 PL_reg_call_cc = cur_call_cc;
3157 PerlIO_printf(Perl_debug_log,
3158 "%*s continuation failed...\n",
3159 REPORT_CODE_OFF+PL_regindent*2, "")
3163 if (locinput < PL_regtill) {
3164 DEBUG_r(PerlIO_printf(Perl_debug_log,
3165 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3167 (long)(locinput - PL_reg_starttry),
3168 (long)(PL_regtill - PL_reg_starttry),
3170 sayNO_FINAL; /* Cannot match: too short. */
3172 PL_reginput = locinput; /* put where regtry can find it */
3173 sayYES_FINAL; /* Success! */
3175 PL_reginput = locinput; /* put where regtry can find it */
3176 sayYES_LOUD; /* Success! */
3179 PL_reginput = locinput;
3184 if (UTF) { /* XXXX This is absolutely
3185 broken, we read before
3187 s = HOPMAYBEc(locinput, -scan->flags);
3193 if (locinput < PL_bostr + scan->flags)
3195 PL_reginput = locinput - scan->flags;
3200 PL_reginput = locinput;
3205 if (UTF) { /* XXXX This is absolutely
3206 broken, we read before
3208 s = HOPMAYBEc(locinput, -scan->flags);
3209 if (!s || s < PL_bostr)
3214 if (locinput < PL_bostr + scan->flags)
3216 PL_reginput = locinput - scan->flags;
3221 PL_reginput = locinput;
3224 inner = NEXTOPER(NEXTOPER(scan));
3225 if (regmatch(inner) != n) {
3240 if (OP(scan) == SUSPEND) {
3241 locinput = PL_reginput;
3242 nextchr = UCHARAT(locinput);
3247 next = scan + ARG(scan);
3252 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3253 PTR2UV(scan), OP(scan));
3254 Perl_croak(aTHX_ "regexp memory corruption");
3260 * We get here only if there's trouble -- normally "case END" is
3261 * the terminating point.
3263 Perl_croak(aTHX_ "corrupted regexp pointers");
3269 PerlIO_printf(Perl_debug_log,
3270 "%*s %scould match...%s\n",
3271 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3275 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3276 PL_colors[4],PL_colors[5]));
3285 PerlIO_printf(Perl_debug_log,
3286 "%*s %sfailed...%s\n",
3287 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3299 - regrepeat - repeatedly match something simple, report how many
3302 * [This routine now assumes that it will only match on things of length 1.
3303 * That was true before, but now we assume scan - reginput is the count,
3304 * rather than incrementing count on every character. [Er, except utf8.]]
3307 S_regrepeat(pTHX_ regnode *p, I32 max)
3310 register char *scan;
3312 register char *loceol = PL_regeol;
3313 register I32 hardcount = 0;
3316 if (max != REG_INFTY && max < loceol - scan)
3317 loceol = scan + max;
3320 while (scan < loceol && *scan != '\n')
3328 while (scan < loceol && *scan != '\n') {
3329 scan += UTF8SKIP(scan);
3335 while (scan < loceol) {
3336 scan += UTF8SKIP(scan);
3340 case EXACT: /* length of string is 1 */
3342 while (scan < loceol && UCHARAT(scan) == c)
3345 case EXACTF: /* length of string is 1 */
3347 while (scan < loceol &&
3348 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3351 case EXACTFL: /* length of string is 1 */
3352 PL_reg_flags |= RF_tainted;
3354 while (scan < loceol &&
3355 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3360 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3361 scan += UTF8SKIP(scan);
3366 while (scan < loceol && REGINCLASS(p, *scan))
3370 while (scan < loceol && isALNUM(*scan))
3375 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3376 scan += UTF8SKIP(scan);
3381 PL_reg_flags |= RF_tainted;
3382 while (scan < loceol && isALNUM_LC(*scan))
3386 PL_reg_flags |= RF_tainted;
3388 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3389 scan += UTF8SKIP(scan);
3395 while (scan < loceol && !isALNUM(*scan))
3400 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3401 scan += UTF8SKIP(scan);
3406 PL_reg_flags |= RF_tainted;
3407 while (scan < loceol && !isALNUM_LC(*scan))
3411 PL_reg_flags |= RF_tainted;
3413 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3414 scan += UTF8SKIP(scan);
3419 while (scan < loceol && isSPACE(*scan))
3424 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3425 scan += UTF8SKIP(scan);
3430 PL_reg_flags |= RF_tainted;
3431 while (scan < loceol && isSPACE_LC(*scan))
3435 PL_reg_flags |= RF_tainted;
3437 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3438 scan += UTF8SKIP(scan);
3443 while (scan < loceol && !isSPACE(*scan))
3448 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3449 scan += UTF8SKIP(scan);
3454 PL_reg_flags |= RF_tainted;
3455 while (scan < loceol && !isSPACE_LC(*scan))
3459 PL_reg_flags |= RF_tainted;
3461 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3462 scan += UTF8SKIP(scan);
3467 while (scan < loceol && isDIGIT(*scan))
3472 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3473 scan += UTF8SKIP(scan);
3479 while (scan < loceol && !isDIGIT(*scan))
3484 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3485 scan += UTF8SKIP(scan);
3489 default: /* Called on something of 0 width. */
3490 break; /* So match right here or not at all. */
3496 c = scan - PL_reginput;
3501 SV *prop = sv_newmortal();
3504 PerlIO_printf(Perl_debug_log,
3505 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3506 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3513 - regrepeat_hard - repeatedly match something, report total lenth and length
3515 * The repeater is supposed to have constant length.
3519 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3522 register char *scan;
3523 register char *start;
3524 register char *loceol = PL_regeol;
3526 I32 count = 0, res = 1;
3531 start = PL_reginput;
3533 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3536 while (start < PL_reginput) {
3538 start += UTF8SKIP(start);
3549 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3551 *lp = l = PL_reginput - start;
3552 if (max != REG_INFTY && l*max < loceol - scan)
3553 loceol = scan + l*max;
3566 - reginclass - determine if a character falls into a character class
3570 S_reginclass(pTHX_ register regnode *p, register I32 c)
3573 char flags = ANYOF_FLAGS(p);
3577 if (ANYOF_BITMAP_TEST(p, c))
3579 else if (flags & ANYOF_FOLD) {
3581 if (flags & ANYOF_LOCALE) {
3582 PL_reg_flags |= RF_tainted;
3583 cf = PL_fold_locale[c];
3587 if (ANYOF_BITMAP_TEST(p, cf))
3591 if (!match && (flags & ANYOF_CLASS)) {
3592 PL_reg_flags |= RF_tainted;
3594 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3595 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3596 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3597 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3598 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3599 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3600 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3601 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3602 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3603 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3604 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3605 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3606 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3607 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3608 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3609 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3610 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3611 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3612 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3613 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3614 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3615 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3616 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3617 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3618 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3619 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3620 ) /* How's that for a conditional? */
3626 return (flags & ANYOF_INVERT) ? !match : match;
3630 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3633 char flags = ARG1(f);
3635 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3637 if (swash_fetch(sv, p))
3639 else if (flags & ANYOF_FOLD) {
3640 U8 tmpbuf[UTF8_MAXLEN];
3641 if (flags & ANYOF_LOCALE) {
3642 PL_reg_flags |= RF_tainted;
3643 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3646 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3647 if (swash_fetch(sv, tmpbuf))
3651 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3653 return (flags & ANYOF_INVERT) ? !match : match;
3657 S_reghop(pTHX_ U8 *s, I32 off)
3661 while (off-- && s < (U8*)PL_regeol)
3666 if (s > (U8*)PL_bostr) {
3669 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3671 } /* XXX could check well-formedness here */
3679 S_reghopmaybe(pTHX_ U8* s, I32 off)
3683 while (off-- && s < (U8*)PL_regeol)
3690 if (s > (U8*)PL_bostr) {
3693 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3695 } /* XXX could check well-formedness here */
3711 restore_pos(pTHXo_ void *arg)
3714 if (PL_reg_eval_set) {
3715 if (PL_reg_oldsaved) {
3716 PL_reg_re->subbeg = PL_reg_oldsaved;
3717 PL_reg_re->sublen = PL_reg_oldsavedlen;
3718 RX_MATCH_COPIED_on(PL_reg_re);
3720 PL_reg_magic->mg_len = PL_reg_oldpos;
3721 PL_reg_eval_set = 0;
3722 PL_curpm = PL_reg_oldcurpm;