5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 /* SvCUR is not set on references: SvRV and SvPVX overlap */
351 && (strpos + SvCUR(sv) != strend)) {
352 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
355 PL_regeol = strend; /* Used in HOP() */
356 s = HOPc(strpos, prog->check_offset_min);
358 slen = SvCUR(check); /* >= 1 */
360 if ( strend - s > slen || strend - s < slen - 1
361 || (strend - s == slen && strend[-1] != '\n')) {
362 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
365 /* Now should match s[0..slen-2] */
367 if (slen && (*SvPVX(check) != *s
369 && memNE(SvPVX(check), s, slen)))) {
371 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
375 else if (*SvPVX(check) != *s
376 || ((slen = SvCUR(check)) > 1
377 && memNE(SvPVX(check), s, slen)))
379 goto success_at_start;
381 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
383 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
384 end_shift = prog->minlen - start_shift -
385 CHR_SVLEN(check) + (SvTAIL(check) != 0);
387 I32 end = prog->check_offset_max + CHR_SVLEN(check)
388 - (SvTAIL(check) != 0);
389 I32 eshift = strend - s - end;
391 if (end_shift < eshift)
395 else { /* Can match at random position */
398 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
399 /* Should be nonnegative! */
400 end_shift = prog->minlen - start_shift -
401 CHR_SVLEN(check) + (SvTAIL(check) != 0);
404 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
406 Perl_croak(aTHX_ "panic: end_shift");
410 /* Find a possible match in the region s..strend by looking for
411 the "check" substring in the region corrected by start/end_shift. */
412 if (flags & REXEC_SCREAM) {
413 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
414 I32 p = -1; /* Internal iterator of scream. */
415 I32 *pp = data ? data->scream_pos : &p;
417 if (PL_screamfirst[BmRARE(check)] >= 0
418 || ( BmRARE(check) == '\n'
419 && (BmPREVIOUS(check) == SvCUR(check) - 1)
421 s = screaminstr(sv, check,
422 start_shift + (s - strbeg), end_shift, pp, 0);
426 *data->scream_olds = s;
429 s = fbm_instr((unsigned char*)s + start_shift,
430 (unsigned char*)strend - end_shift,
431 check, PL_multiline ? FBMrf_MULTILINE : 0);
433 /* Update the count-of-usability, remove useless subpatterns,
436 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
437 (s ? "Found" : "Did not find"),
438 ((check == prog->anchored_substr) ? "anchored" : "floating"),
440 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
442 PL_colors[1], (SvTAIL(check) ? "$" : ""),
443 (s ? " at offset " : "...\n") ) );
450 /* Finish the diagnostic message */
451 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
453 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
454 Start with the other substr.
455 XXXX no SCREAM optimization yet - and a very coarse implementation
456 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
457 *always* match. Probably should be marked during compile...
458 Probably it is right to do no SCREAM here...
461 if (prog->float_substr && prog->anchored_substr) {
462 /* Take into account the "other" substring. */
463 /* XXXX May be hopelessly wrong for UTF... */
466 if (check == prog->float_substr) {
469 char *last = s - start_shift, *last1, *last2;
473 t = s - prog->check_offset_max;
474 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
475 && (!(prog->reganch & ROPT_UTF8)
476 || (PL_bostr = strpos, /* Used in regcopmaybe() */
477 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
482 t += prog->anchored_offset;
483 if (t < other_last) /* These positions already checked */
486 last2 = last1 = strend - prog->minlen;
489 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
490 /* On end-of-str: see comment below. */
491 s = fbm_instr((unsigned char*)t,
492 (unsigned char*)last1 + prog->anchored_offset
493 + SvCUR(prog->anchored_substr)
494 - (SvTAIL(prog->anchored_substr)!=0),
495 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
496 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
497 (s ? "Found" : "Contradicts"),
499 (int)(SvCUR(prog->anchored_substr)
500 - (SvTAIL(prog->anchored_substr)!=0)),
501 SvPVX(prog->anchored_substr),
502 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
504 if (last1 >= last2) {
505 DEBUG_r(PerlIO_printf(Perl_debug_log,
506 ", giving up...\n"));
509 DEBUG_r(PerlIO_printf(Perl_debug_log,
510 ", trying floating at offset %ld...\n",
511 (long)(s1 + 1 - i_strpos)));
512 PL_regeol = strend; /* Used in HOP() */
513 other_last = last1 + prog->anchored_offset + 1;
518 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
519 (long)(s - i_strpos)));
520 t = s - prog->anchored_offset;
529 else { /* Take into account the floating substring. */
534 last1 = last = strend - prog->minlen + prog->float_min_offset;
535 if (last - t > prog->float_max_offset)
536 last = t + prog->float_max_offset;
537 s = t + prog->float_min_offset;
540 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
541 /* fbm_instr() takes into account exact value of end-of-str
542 if the check is SvTAIL(ed). Since false positives are OK,
543 and end-of-str is not later than strend we are OK. */
544 s = fbm_instr((unsigned char*)s,
545 (unsigned char*)last + SvCUR(prog->float_substr)
546 - (SvTAIL(prog->float_substr)!=0),
547 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
548 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
549 (s ? "Found" : "Contradicts"),
551 (int)(SvCUR(prog->float_substr)
552 - (SvTAIL(prog->float_substr)!=0)),
553 SvPVX(prog->float_substr),
554 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 ", giving up...\n"));
561 DEBUG_r(PerlIO_printf(Perl_debug_log,
562 ", trying anchored starting at offset %ld...\n",
563 (long)(s1 + 1 - i_strpos)));
564 other_last = last + 1;
565 PL_regeol = strend; /* Used in HOP() */
570 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
571 (long)(s - i_strpos)));
581 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
584 && (!(prog->reganch & ROPT_UTF8)
585 || (PL_bostr = strpos, /* Used in regcopmaybe() */
586 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
589 /* Fixed substring is found far enough so that the match
590 cannot start at strpos. */
592 if (ml_anch && t[-1] != '\n') {
593 /* Eventually fbm_*() should handle this, but often
594 anchored_offset is not 0, so this check will not be wasted. */
595 /* XXXX In the code below we prefer to look for "^" even in
596 presence of anchored substrings. And we search even
597 beyond the found float position. These pessimizations
598 are historical artefacts only. */
600 while (t < strend - prog->minlen) {
602 if (t < check_at - prog->check_offset_min) {
603 if (prog->anchored_substr) {
604 /* Since we moved from the found position,
605 we definitely contradict the found anchored
606 substr. Due to the above check we do not
607 contradict "check" substr.
608 Thus we can arrive here only if check substr
609 is float. Redo checking for "other"=="fixed".
612 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
613 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
614 goto do_other_anchored;
616 /* We don't contradict the found floating substring. */
617 /* XXXX Why not check for STCLASS? */
619 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
620 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
623 /* Position contradicts check-string */
624 /* XXXX probably better to look for check-string
625 than for "\n", so one should lower the limit for t? */
626 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
627 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
633 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
634 PL_colors[0],PL_colors[1]));
638 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
639 PL_colors[0],PL_colors[1]));
643 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
647 /* The found string does not prohibit matching at strpos,
648 - no optimization of calling REx engine can be performed,
649 unless it was an MBOL and we are not after MBOL,
650 or a future STCLASS check will fail this. */
652 /* Even in this situation we may use MBOL flag if strpos is offset
653 wrt the start of the string. */
654 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
655 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
656 /* May be due to an implicit anchor of m{.*foo} */
657 && !(prog->reganch & ROPT_IMPLICIT))
662 DEBUG_r( if (ml_anch)
663 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
664 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
667 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
668 && prog->check_substr /* Could be deleted already */
669 && --BmUSEFUL(prog->check_substr) < 0
670 && prog->check_substr == prog->float_substr)
672 /* If flags & SOMETHING - do not do it many times on the same match */
673 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
674 SvREFCNT_dec(prog->check_substr);
675 prog->check_substr = Nullsv; /* disable */
676 prog->float_substr = Nullsv; /* clear */
678 /* XXXX This is a remnant of the old implementation. It
679 looks wasteful, since now INTUIT can use many
681 prog->reganch &= ~RE_USE_INTUIT;
688 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
689 if (prog->regstclass) {
690 /* minlen == 0 is possible if regstclass is \b or \B,
691 and the fixed substr is ''$.
692 Since minlen is already taken into account, s+1 is before strend;
693 accidentally, minlen >= 1 guaranties no false positives at s + 1
694 even for \b or \B. But (minlen? 1 : 0) below assumes that
695 regstclass does not come from lookahead... */
696 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
697 This leaves EXACTF only, which is dealt with in find_byclass(). */
698 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
699 ? STR_LEN(prog->regstclass)
701 char *endpos = (prog->anchored_substr || ml_anch)
702 ? s + (prog->minlen? cl_l : 0)
703 : (prog->float_substr ? check_at - start_shift + cl_l
705 char *startpos = sv && SvPOK(sv) ? strend - SvCUR(sv) : s;
708 if (prog->reganch & ROPT_UTF8) {
709 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
712 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
717 if (endpos == strend) {
718 DEBUG_r( PerlIO_printf(Perl_debug_log,
719 "Could not match STCLASS...\n") );
722 DEBUG_r( PerlIO_printf(Perl_debug_log,
723 "This position contradicts STCLASS...\n") );
724 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
726 /* Contradict one of substrings */
727 if (prog->anchored_substr) {
728 if (prog->anchored_substr == check) {
729 DEBUG_r( what = "anchored" );
731 PL_regeol = strend; /* Used in HOP() */
733 if (s + start_shift + end_shift > strend) {
734 /* XXXX Should be taken into account earlier? */
735 DEBUG_r( PerlIO_printf(Perl_debug_log,
736 "Could not match STCLASS...\n") );
739 DEBUG_r( PerlIO_printf(Perl_debug_log,
740 "Looking for %s substr starting at offset %ld...\n",
741 what, (long)(s + start_shift - i_strpos)) );
744 /* Have both, check_string is floating */
745 if (t + start_shift >= check_at) /* Contradicts floating=check */
746 goto retry_floating_check;
747 /* Recheck anchored substring, but not floating... */
749 DEBUG_r( PerlIO_printf(Perl_debug_log,
750 "Looking for anchored substr starting at offset %ld...\n",
751 (long)(other_last - i_strpos)) );
752 goto do_other_anchored;
754 /* Another way we could have checked stclass at the
755 current position only: */
758 DEBUG_r( PerlIO_printf(Perl_debug_log,
759 "Looking for /%s^%s/m starting at offset %ld...\n",
760 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
763 if (!prog->float_substr) /* Could have been deleted */
765 /* Check is floating subtring. */
766 retry_floating_check:
767 t = check_at - start_shift;
768 DEBUG_r( what = "floating" );
769 goto hop_and_restart;
772 PerlIO_printf(Perl_debug_log,
773 "By STCLASS: moving %ld --> %ld\n",
774 (long)(t - i_strpos), (long)(s - i_strpos));
776 PerlIO_printf(Perl_debug_log,
777 "Does not contradict STCLASS...\n") );
779 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
780 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
783 fail_finish: /* Substring not found */
784 if (prog->check_substr) /* could be removed already */
785 BmUSEFUL(prog->check_substr) += 5; /* hooray */
787 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
788 PL_colors[4],PL_colors[5]));
792 /* We know what class REx starts with. Try to find this position... */
794 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
796 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
802 register I32 tmp = 1; /* Scratch variable? */
804 /* We know what class it must start with. */
808 if (REGINCLASSUTF8(c, (U8*)s)) {
809 if (tmp && (norun || regtry(prog, s)))
821 if (REGINCLASS(c, *(U8*)s)) {
822 if (tmp && (norun || regtry(prog, s)))
842 c2 = PL_fold_locale[c1];
847 e = s; /* Due to minlen logic of intuit() */
848 /* Here it is NOT UTF! */
852 && (ln == 1 || !(OP(c) == EXACTF
854 : ibcmp_locale(s, m, ln)))
855 && (norun || regtry(prog, s)) )
861 if ( (*(U8*)s == c1 || *(U8*)s == c2)
862 && (ln == 1 || !(OP(c) == EXACTF
864 : ibcmp_locale(s, m, ln)))
865 && (norun || regtry(prog, s)) )
872 PL_reg_flags |= RF_tainted;
875 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
876 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
878 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
880 if ((norun || regtry(prog, s)))
885 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
889 PL_reg_flags |= RF_tainted;
892 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
893 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
895 if (tmp == !(OP(c) == BOUNDUTF8 ?
896 swash_fetch(PL_utf8_alnum, (U8*)s) :
897 isALNUM_LC_utf8((U8*)s)))
900 if ((norun || regtry(prog, s)))
905 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
909 PL_reg_flags |= RF_tainted;
912 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
913 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
915 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
917 else if ((norun || regtry(prog, s)))
921 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
925 PL_reg_flags |= RF_tainted;
928 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
929 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
931 if (tmp == !(OP(c) == NBOUNDUTF8 ?
932 swash_fetch(PL_utf8_alnum, (U8*)s) :
933 isALNUM_LC_utf8((U8*)s)))
935 else if ((norun || regtry(prog, s)))
939 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
945 if (tmp && (norun || regtry(prog, s)))
957 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
958 if (tmp && (norun || regtry(prog, s)))
969 PL_reg_flags |= RF_tainted;
971 if (isALNUM_LC(*s)) {
972 if (tmp && (norun || regtry(prog, s)))
983 PL_reg_flags |= RF_tainted;
985 if (isALNUM_LC_utf8((U8*)s)) {
986 if (tmp && (norun || regtry(prog, s)))
999 if (tmp && (norun || regtry(prog, s)))
1010 while (s < strend) {
1011 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1012 if (tmp && (norun || regtry(prog, s)))
1023 PL_reg_flags |= RF_tainted;
1024 while (s < strend) {
1025 if (!isALNUM_LC(*s)) {
1026 if (tmp && (norun || regtry(prog, s)))
1037 PL_reg_flags |= RF_tainted;
1038 while (s < strend) {
1039 if (!isALNUM_LC_utf8((U8*)s)) {
1040 if (tmp && (norun || regtry(prog, s)))
1051 while (s < strend) {
1053 if (tmp && (norun || regtry(prog, s)))
1064 while (s < strend) {
1065 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1066 if (tmp && (norun || regtry(prog, s)))
1077 PL_reg_flags |= RF_tainted;
1078 while (s < strend) {
1079 if (isSPACE_LC(*s)) {
1080 if (tmp && (norun || regtry(prog, s)))
1091 PL_reg_flags |= RF_tainted;
1092 while (s < strend) {
1093 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1094 if (tmp && (norun || regtry(prog, s)))
1105 while (s < strend) {
1107 if (tmp && (norun || regtry(prog, s)))
1118 while (s < strend) {
1119 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1120 if (tmp && (norun || regtry(prog, s)))
1131 PL_reg_flags |= RF_tainted;
1132 while (s < strend) {
1133 if (!isSPACE_LC(*s)) {
1134 if (tmp && (norun || regtry(prog, s)))
1145 PL_reg_flags |= RF_tainted;
1146 while (s < strend) {
1147 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1148 if (tmp && (norun || regtry(prog, s)))
1159 while (s < strend) {
1161 if (tmp && (norun || regtry(prog, s)))
1172 while (s < strend) {
1173 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1174 if (tmp && (norun || regtry(prog, s)))
1185 PL_reg_flags |= RF_tainted;
1186 while (s < strend) {
1187 if (isDIGIT_LC(*s)) {
1188 if (tmp && (norun || regtry(prog, s)))
1199 PL_reg_flags |= RF_tainted;
1200 while (s < strend) {
1201 if (isDIGIT_LC_utf8((U8*)s)) {
1202 if (tmp && (norun || regtry(prog, s)))
1213 while (s < strend) {
1215 if (tmp && (norun || regtry(prog, s)))
1226 while (s < strend) {
1227 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1228 if (tmp && (norun || regtry(prog, s)))
1239 PL_reg_flags |= RF_tainted;
1240 while (s < strend) {
1241 if (!isDIGIT_LC(*s)) {
1242 if (tmp && (norun || regtry(prog, s)))
1253 PL_reg_flags |= RF_tainted;
1254 while (s < strend) {
1255 if (!isDIGIT_LC_utf8((U8*)s)) {
1256 if (tmp && (norun || regtry(prog, s)))
1267 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1276 - regexec_flags - match a regexp against a string
1279 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1280 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1281 /* strend: pointer to null at end of string */
1282 /* strbeg: real beginning of string */
1283 /* minend: end of match must be >=minend after stringarg. */
1284 /* data: May be used for some additional optimizations. */
1285 /* nosave: For optimizations. */
1289 register regnode *c;
1290 register char *startpos = stringarg;
1291 I32 minlen; /* must match at least this many chars */
1292 I32 dontbother = 0; /* how many characters not to try at end */
1293 /* I32 start_shift = 0; */ /* Offset of the start to find
1294 constant substr. */ /* CC */
1295 I32 end_shift = 0; /* Same for the end. */ /* CC */
1296 I32 scream_pos = -1; /* Internal iterator of scream. */
1298 SV* oreplsv = GvSV(PL_replgv);
1304 PL_regnarrate = PL_debug & 512;
1307 /* Be paranoid... */
1308 if (prog == NULL || startpos == NULL) {
1309 Perl_croak(aTHX_ "NULL regexp parameter");
1313 minlen = prog->minlen;
1314 if (strend - startpos < minlen) goto phooey;
1316 if (startpos == strbeg) /* is ^ valid at stringarg? */
1319 PL_regprev = (U32)stringarg[-1];
1320 if (!PL_multiline && PL_regprev == '\n')
1321 PL_regprev = '\0'; /* force ^ to NOT match */
1324 /* Check validity of program. */
1325 if (UCHARAT(prog->program) != REG_MAGIC) {
1326 Perl_croak(aTHX_ "corrupted regexp program");
1330 PL_reg_eval_set = 0;
1333 if (prog->reganch & ROPT_UTF8)
1334 PL_reg_flags |= RF_utf8;
1336 /* Mark beginning of line for ^ and lookbehind. */
1337 PL_regbol = startpos;
1341 /* Mark end of line for $ (and such) */
1344 /* see how far we have to get to not match where we matched before */
1345 PL_regtill = startpos+minend;
1347 /* We start without call_cc context. */
1350 /* If there is a "must appear" string, look for it. */
1353 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1356 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1357 PL_reg_ganch = startpos;
1358 else if (sv && SvTYPE(sv) >= SVt_PVMG
1360 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1361 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1362 if (prog->reganch & ROPT_ANCH_GPOS) {
1363 if (s > PL_reg_ganch)
1368 else /* pos() not defined */
1369 PL_reg_ganch = strbeg;
1372 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1373 re_scream_pos_data d;
1375 d.scream_olds = &scream_olds;
1376 d.scream_pos = &scream_pos;
1377 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1379 goto phooey; /* not present */
1382 DEBUG_r( if (!PL_colorset) reginitcolors() );
1383 DEBUG_r(PerlIO_printf(Perl_debug_log,
1384 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1385 PL_colors[4],PL_colors[5],PL_colors[0],
1388 (strlen(prog->precomp) > 60 ? "..." : ""),
1390 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1391 startpos, PL_colors[1],
1392 (strend - startpos > 60 ? "..." : ""))
1395 /* Simplest case: anchored match need be tried only once. */
1396 /* [unless only anchor is BOL and multiline is set] */
1397 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1398 if (s == startpos && regtry(prog, startpos))
1400 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1401 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1406 dontbother = minlen - 1;
1407 end = HOPc(strend, -dontbother) - 1;
1408 /* for multiline we only have to try after newlines */
1409 if (prog->check_substr) {
1413 if (regtry(prog, s))
1418 if (prog->reganch & RE_USE_INTUIT) {
1419 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1430 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1431 if (regtry(prog, s))
1438 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1439 if (regtry(prog, PL_reg_ganch))
1444 /* Messy cases: unanchored match. */
1445 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1446 /* we have /x+whatever/ */
1447 /* it must be a one character string (XXXX Except UTF?) */
1448 char ch = SvPVX(prog->anchored_substr)[0];
1454 while (s < strend) {
1456 DEBUG_r( did_match = 1 );
1457 if (regtry(prog, s)) goto got_it;
1459 while (s < strend && *s == ch)
1466 while (s < strend) {
1468 DEBUG_r( did_match = 1 );
1469 if (regtry(prog, s)) goto got_it;
1471 while (s < strend && *s == ch)
1477 DEBUG_r(did_match ||
1478 PerlIO_printf(Perl_debug_log,
1479 "Did not find anchored character...\n"));
1482 else if (prog->anchored_substr != Nullsv
1483 || (prog->float_substr != Nullsv
1484 && prog->float_max_offset < strend - s)) {
1485 SV *must = prog->anchored_substr
1486 ? prog->anchored_substr : prog->float_substr;
1488 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1490 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1491 char *last = HOPc(strend, /* Cannot start after this */
1492 -(I32)(CHR_SVLEN(must)
1493 - (SvTAIL(must) != 0) + back_min));
1494 char *last1; /* Last position checked before */
1500 last1 = HOPc(s, -1);
1502 last1 = s - 1; /* bogus */
1504 /* XXXX check_substr already used to find `s', can optimize if
1505 check_substr==must. */
1507 dontbother = end_shift;
1508 strend = HOPc(strend, -dontbother);
1509 while ( (s <= last) &&
1510 ((flags & REXEC_SCREAM)
1511 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1512 end_shift, &scream_pos, 0))
1513 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1514 (unsigned char*)strend, must,
1515 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1516 DEBUG_r( did_match = 1 );
1517 if (HOPc(s, -back_max) > last1) {
1518 last1 = HOPc(s, -back_min);
1519 s = HOPc(s, -back_max);
1522 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1524 last1 = HOPc(s, -back_min);
1528 while (s <= last1) {
1529 if (regtry(prog, s))
1535 while (s <= last1) {
1536 if (regtry(prog, s))
1542 DEBUG_r(did_match ||
1543 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1544 ((must == prog->anchored_substr)
1545 ? "anchored" : "floating"),
1547 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1549 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1552 else if ((c = prog->regstclass)) {
1553 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1554 /* don't bother with what can't match */
1555 strend = HOPc(strend, -(minlen - 1));
1556 if (find_byclass(prog, c, s, strend, startpos, 0))
1558 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1562 if (prog->float_substr != Nullsv) { /* Trim the end. */
1565 if (flags & REXEC_SCREAM) {
1566 last = screaminstr(sv, prog->float_substr, s - strbeg,
1567 end_shift, &scream_pos, 1); /* last one */
1569 last = scream_olds; /* Only one occurence. */
1573 char *little = SvPV(prog->float_substr, len);
1575 if (SvTAIL(prog->float_substr)) {
1576 if (memEQ(strend - len + 1, little, len - 1))
1577 last = strend - len + 1;
1578 else if (!PL_multiline)
1579 last = memEQ(strend - len, little, len)
1580 ? strend - len : Nullch;
1586 last = rninstr(s, strend, little, little + len);
1588 last = strend; /* matching `$' */
1592 DEBUG_r(PerlIO_printf(Perl_debug_log,
1593 "%sCan't trim the tail, match fails (should not happen)%s\n",
1594 PL_colors[4],PL_colors[5]));
1595 goto phooey; /* Should not happen! */
1597 dontbother = strend - last + prog->float_min_offset;
1599 if (minlen && (dontbother < minlen))
1600 dontbother = minlen - 1;
1601 strend -= dontbother; /* this one's always in bytes! */
1602 /* We don't know much -- general case. */
1605 if (regtry(prog, s))
1614 if (regtry(prog, s))
1616 } while (s++ < strend);
1624 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1626 if (PL_reg_eval_set) {
1627 /* Preserve the current value of $^R */
1628 if (oreplsv != GvSV(PL_replgv))
1629 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1630 restored, the value remains
1632 restore_pos(aTHXo_ 0);
1635 /* make sure $`, $&, $', and $digit will work later */
1636 if ( !(flags & REXEC_NOT_FIRST) ) {
1637 if (RX_MATCH_COPIED(prog)) {
1638 Safefree(prog->subbeg);
1639 RX_MATCH_COPIED_off(prog);
1641 if (flags & REXEC_COPY_STR) {
1642 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1644 s = savepvn(strbeg, i);
1647 RX_MATCH_COPIED_on(prog);
1650 prog->subbeg = strbeg;
1651 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1658 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1659 PL_colors[4],PL_colors[5]));
1660 if (PL_reg_eval_set)
1661 restore_pos(aTHXo_ 0);
1666 - regtry - try match at specific point
1668 STATIC I32 /* 0 failure, 1 success */
1669 S_regtry(pTHX_ regexp *prog, char *startpos)
1677 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1680 PL_reg_eval_set = RS_init;
1682 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1683 (IV)(PL_stack_sp - PL_stack_base));
1685 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1686 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1687 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1689 /* Apparently this is not needed, judging by wantarray. */
1690 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1691 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1694 /* Make $_ available to executed code. */
1695 if (PL_reg_sv != DEFSV) {
1696 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1701 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1702 && (mg = mg_find(PL_reg_sv, 'g')))) {
1703 /* prepare for quick setting of pos */
1704 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1705 mg = mg_find(PL_reg_sv, 'g');
1709 PL_reg_oldpos = mg->mg_len;
1710 SAVEDESTRUCTOR_X(restore_pos, 0);
1713 Newz(22,PL_reg_curpm, 1, PMOP);
1714 PL_reg_curpm->op_pmregexp = prog;
1715 PL_reg_oldcurpm = PL_curpm;
1716 PL_curpm = PL_reg_curpm;
1717 if (RX_MATCH_COPIED(prog)) {
1718 /* Here is a serious problem: we cannot rewrite subbeg,
1719 since it may be needed if this match fails. Thus
1720 $` inside (?{}) could fail... */
1721 PL_reg_oldsaved = prog->subbeg;
1722 PL_reg_oldsavedlen = prog->sublen;
1723 RX_MATCH_COPIED_off(prog);
1726 PL_reg_oldsaved = Nullch;
1727 prog->subbeg = PL_bostr;
1728 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1730 prog->startp[0] = startpos - PL_bostr;
1731 PL_reginput = startpos;
1732 PL_regstartp = prog->startp;
1733 PL_regendp = prog->endp;
1734 PL_reglastparen = &prog->lastparen;
1735 prog->lastparen = 0;
1737 DEBUG_r(PL_reg_starttry = startpos);
1738 if (PL_reg_start_tmpl <= prog->nparens) {
1739 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1740 if(PL_reg_start_tmp)
1741 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1743 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1746 /* XXXX What this code is doing here?!!! There should be no need
1747 to do this again and again, PL_reglastparen should take care of
1751 if (prog->nparens) {
1752 for (i = prog->nparens; i >= 1; i--) {
1758 if (regmatch(prog->program + 1)) {
1759 prog->endp[0] = PL_reginput - PL_bostr;
1767 - regmatch - main matching routine
1769 * Conceptually the strategy is simple: check to see whether the current
1770 * node matches, call self recursively to see whether the rest matches,
1771 * and then act accordingly. In practice we make some effort to avoid
1772 * recursion, in particular by going through "ordinary" nodes (that don't
1773 * need to know whether the rest of the match failed) by a loop instead of
1776 /* [lwall] I've hoisted the register declarations to the outer block in order to
1777 * maybe save a little bit of pushing and popping on the stack. It also takes
1778 * advantage of machines that use a register save mask on subroutine entry.
1780 STATIC I32 /* 0 failure, 1 success */
1781 S_regmatch(pTHX_ regnode *prog)
1784 register regnode *scan; /* Current node. */
1785 regnode *next; /* Next node. */
1786 regnode *inner; /* Next node in internal branch. */
1787 register I32 nextchr; /* renamed nextchr - nextchar colides with
1788 function of same name */
1789 register I32 n; /* no or next */
1790 register I32 ln; /* len or last */
1791 register char *s; /* operand or save */
1792 register char *locinput = PL_reginput;
1793 register I32 c1, c2, paren; /* case fold search, parenth */
1794 int minmod = 0, sw = 0, logical = 0;
1799 /* Note that nextchr is a byte even in UTF */
1800 nextchr = UCHARAT(locinput);
1802 while (scan != NULL) {
1803 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1805 # define sayYES goto yes
1806 # define sayNO goto no
1807 # define sayYES_FINAL goto yes_final
1808 # define sayYES_LOUD goto yes_loud
1809 # define sayNO_FINAL goto no_final
1810 # define sayNO_SILENT goto do_no
1811 # define saySAME(x) if (x) goto yes; else goto no
1812 # define REPORT_CODE_OFF 24
1814 # define sayYES return 1
1815 # define sayNO return 0
1816 # define sayYES_FINAL return 1
1817 # define sayYES_LOUD return 1
1818 # define sayNO_FINAL return 0
1819 # define sayNO_SILENT return 0
1820 # define saySAME(x) return x
1823 SV *prop = sv_newmortal();
1824 int docolor = *PL_colors[0];
1825 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1826 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1827 /* The part of the string before starttry has one color
1828 (pref0_len chars), between starttry and current
1829 position another one (pref_len - pref0_len chars),
1830 after the current position the third one.
1831 We assume that pref0_len <= pref_len, otherwise we
1832 decrease pref0_len. */
1833 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1834 ? (5 + taill) - l : locinput - PL_bostr);
1835 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1837 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1838 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1839 ? (5 + taill) - pref_len : PL_regeol - locinput);
1842 if (pref0_len > pref_len)
1843 pref0_len = pref_len;
1844 regprop(prop, scan);
1845 PerlIO_printf(Perl_debug_log,
1846 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1847 (IV)(locinput - PL_bostr),
1848 PL_colors[4], pref0_len,
1849 locinput - pref_len, PL_colors[5],
1850 PL_colors[2], pref_len - pref0_len,
1851 locinput - pref_len + pref0_len, PL_colors[3],
1852 (docolor ? "" : "> <"),
1853 PL_colors[0], l, locinput, PL_colors[1],
1854 15 - l - pref_len + 1,
1856 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1860 next = scan + NEXT_OFF(scan);
1866 if (locinput == PL_bostr
1867 ? PL_regprev == '\n'
1869 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1871 /* regtill = regbol; */
1876 if (locinput == PL_bostr
1877 ? PL_regprev == '\n'
1878 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1884 if (locinput == PL_bostr)
1888 if (locinput == PL_reg_ganch)
1898 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1903 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1905 if (PL_regeol - locinput > 1)
1909 if (PL_regeol != locinput)
1913 if (nextchr & 0x80) {
1914 locinput += PL_utf8skip[nextchr];
1915 if (locinput > PL_regeol)
1917 nextchr = UCHARAT(locinput);
1920 if (!nextchr && locinput >= PL_regeol)
1922 nextchr = UCHARAT(++locinput);
1925 if (!nextchr && locinput >= PL_regeol)
1927 nextchr = UCHARAT(++locinput);
1930 if (nextchr & 0x80) {
1931 locinput += PL_utf8skip[nextchr];
1932 if (locinput > PL_regeol)
1934 nextchr = UCHARAT(locinput);
1937 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1939 nextchr = UCHARAT(++locinput);
1942 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1944 nextchr = UCHARAT(++locinput);
1949 /* Inline the first character, for speed. */
1950 if (UCHARAT(s) != nextchr)
1952 if (PL_regeol - locinput < ln)
1954 if (ln > 1 && memNE(s, locinput, ln))
1957 nextchr = UCHARAT(locinput);
1960 PL_reg_flags |= RF_tainted;
1969 c1 = OP(scan) == EXACTF;
1973 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1974 toLOWER_utf8((U8*)l) :
1975 toLOWER_LC_utf8((U8*)l)))
1983 nextchr = UCHARAT(locinput);
1987 /* Inline the first character, for speed. */
1988 if (UCHARAT(s) != nextchr &&
1989 UCHARAT(s) != ((OP(scan) == EXACTF)
1990 ? PL_fold : PL_fold_locale)[nextchr])
1992 if (PL_regeol - locinput < ln)
1994 if (ln > 1 && (OP(scan) == EXACTF
1995 ? ibcmp(s, locinput, ln)
1996 : ibcmp_locale(s, locinput, ln)))
1999 nextchr = UCHARAT(locinput);
2002 if (!REGINCLASSUTF8(scan, (U8*)locinput))
2004 if (locinput >= PL_regeol)
2006 locinput += PL_utf8skip[nextchr];
2007 nextchr = UCHARAT(locinput);
2011 nextchr = UCHARAT(locinput);
2012 if (!REGINCLASS(scan, nextchr))
2014 if (!nextchr && locinput >= PL_regeol)
2016 nextchr = UCHARAT(++locinput);
2019 PL_reg_flags |= RF_tainted;
2024 if (!(OP(scan) == ALNUM
2025 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2027 nextchr = UCHARAT(++locinput);
2030 PL_reg_flags |= RF_tainted;
2035 if (nextchr & 0x80) {
2036 if (!(OP(scan) == ALNUMUTF8
2037 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2038 : isALNUM_LC_utf8((U8*)locinput)))
2042 locinput += PL_utf8skip[nextchr];
2043 nextchr = UCHARAT(locinput);
2046 if (!(OP(scan) == ALNUMUTF8
2047 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2049 nextchr = UCHARAT(++locinput);
2052 PL_reg_flags |= RF_tainted;
2055 if (!nextchr && locinput >= PL_regeol)
2057 if (OP(scan) == NALNUM
2058 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2060 nextchr = UCHARAT(++locinput);
2063 PL_reg_flags |= RF_tainted;
2066 if (!nextchr && locinput >= PL_regeol)
2068 if (nextchr & 0x80) {
2069 if (OP(scan) == NALNUMUTF8
2070 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2071 : isALNUM_LC_utf8((U8*)locinput))
2075 locinput += PL_utf8skip[nextchr];
2076 nextchr = UCHARAT(locinput);
2079 if (OP(scan) == NALNUMUTF8
2080 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2082 nextchr = UCHARAT(++locinput);
2086 PL_reg_flags |= RF_tainted;
2090 /* was last char in word? */
2091 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2092 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2094 n = isALNUM(nextchr);
2097 ln = isALNUM_LC(ln);
2098 n = isALNUM_LC(nextchr);
2100 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2105 PL_reg_flags |= RF_tainted;
2109 /* was last char in word? */
2110 ln = (locinput != PL_regbol)
2111 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2112 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2113 ln = isALNUM_uni(ln);
2114 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2117 ln = isALNUM_LC_uni(ln);
2118 n = isALNUM_LC_utf8((U8*)locinput);
2120 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2124 PL_reg_flags |= RF_tainted;
2129 if (!(OP(scan) == SPACE
2130 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2132 nextchr = UCHARAT(++locinput);
2135 PL_reg_flags |= RF_tainted;
2140 if (nextchr & 0x80) {
2141 if (!(OP(scan) == SPACEUTF8
2142 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2143 : isSPACE_LC_utf8((U8*)locinput)))
2147 locinput += PL_utf8skip[nextchr];
2148 nextchr = UCHARAT(locinput);
2151 if (!(OP(scan) == SPACEUTF8
2152 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2154 nextchr = UCHARAT(++locinput);
2157 PL_reg_flags |= RF_tainted;
2160 if (!nextchr && locinput >= PL_regeol)
2162 if (OP(scan) == NSPACE
2163 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2165 nextchr = UCHARAT(++locinput);
2168 PL_reg_flags |= RF_tainted;
2171 if (!nextchr && locinput >= PL_regeol)
2173 if (nextchr & 0x80) {
2174 if (OP(scan) == NSPACEUTF8
2175 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2176 : isSPACE_LC_utf8((U8*)locinput))
2180 locinput += PL_utf8skip[nextchr];
2181 nextchr = UCHARAT(locinput);
2184 if (OP(scan) == NSPACEUTF8
2185 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2187 nextchr = UCHARAT(++locinput);
2190 PL_reg_flags |= RF_tainted;
2195 if (!(OP(scan) == DIGIT
2196 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2198 nextchr = UCHARAT(++locinput);
2201 PL_reg_flags |= RF_tainted;
2206 if (nextchr & 0x80) {
2207 if (!(OP(scan) == DIGITUTF8
2208 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2209 : isDIGIT_LC_utf8((U8*)locinput)))
2213 locinput += PL_utf8skip[nextchr];
2214 nextchr = UCHARAT(locinput);
2217 if (!(OP(scan) == DIGITUTF8
2218 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2220 nextchr = UCHARAT(++locinput);
2223 PL_reg_flags |= RF_tainted;
2226 if (!nextchr && locinput >= PL_regeol)
2228 if (OP(scan) == NDIGIT
2229 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2231 nextchr = UCHARAT(++locinput);
2234 PL_reg_flags |= RF_tainted;
2237 if (!nextchr && locinput >= PL_regeol)
2239 if (nextchr & 0x80) {
2240 if (OP(scan) == NDIGITUTF8
2241 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2242 : isDIGIT_LC_utf8((U8*)locinput))
2246 locinput += PL_utf8skip[nextchr];
2247 nextchr = UCHARAT(locinput);
2250 if (OP(scan) == NDIGITUTF8
2251 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2253 nextchr = UCHARAT(++locinput);
2256 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2258 locinput += PL_utf8skip[nextchr];
2259 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2260 locinput += UTF8SKIP(locinput);
2261 if (locinput > PL_regeol)
2263 nextchr = UCHARAT(locinput);
2266 PL_reg_flags |= RF_tainted;
2270 n = ARG(scan); /* which paren pair */
2271 ln = PL_regstartp[n];
2272 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2273 if (*PL_reglastparen < n || ln == -1)
2274 sayNO; /* Do not match unless seen CLOSEn. */
2275 if (ln == PL_regendp[n])
2279 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2281 char *e = PL_bostr + PL_regendp[n];
2283 * Note that we can't do the "other character" lookup trick as
2284 * in the 8-bit case (no pun intended) because in Unicode we
2285 * have to map both upper and title case to lower case.
2287 if (OP(scan) == REFF) {
2291 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2301 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2308 nextchr = UCHARAT(locinput);
2312 /* Inline the first character, for speed. */
2313 if (UCHARAT(s) != nextchr &&
2315 (UCHARAT(s) != ((OP(scan) == REFF
2316 ? PL_fold : PL_fold_locale)[nextchr]))))
2318 ln = PL_regendp[n] - ln;
2319 if (locinput + ln > PL_regeol)
2321 if (ln > 1 && (OP(scan) == REF
2322 ? memNE(s, locinput, ln)
2324 ? ibcmp(s, locinput, ln)
2325 : ibcmp_locale(s, locinput, ln))))
2328 nextchr = UCHARAT(locinput);
2339 OP_4tree *oop = PL_op;
2340 COP *ocurcop = PL_curcop;
2341 SV **ocurpad = PL_curpad;
2345 PL_op = (OP_4tree*)PL_regdata->data[n];
2346 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2347 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2348 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2350 CALLRUNOPS(aTHX); /* Scalar context. */
2356 PL_curpad = ocurpad;
2357 PL_curcop = ocurcop;
2359 if (logical == 2) { /* Postponed subexpression. */
2361 MAGIC *mg = Null(MAGIC*);
2363 CHECKPOINT cp, lastcp;
2365 if(SvROK(ret) || SvRMAGICAL(ret)) {
2366 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2369 mg = mg_find(sv, 'r');
2372 re = (regexp *)mg->mg_obj;
2373 (void)ReREFCNT_inc(re);
2377 char *t = SvPV(ret, len);
2379 char *oprecomp = PL_regprecomp;
2380 I32 osize = PL_regsize;
2381 I32 onpar = PL_regnpar;
2384 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2385 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2387 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2388 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2389 PL_regprecomp = oprecomp;
2394 PerlIO_printf(Perl_debug_log,
2395 "Entering embedded `%s%.60s%s%s'\n",
2399 (strlen(re->precomp) > 60 ? "..." : ""))
2402 state.prev = PL_reg_call_cc;
2403 state.cc = PL_regcc;
2404 state.re = PL_reg_re;
2408 cp = regcppush(0); /* Save *all* the positions. */
2411 state.ss = PL_savestack_ix;
2412 *PL_reglastparen = 0;
2413 PL_reg_call_cc = &state;
2414 PL_reginput = locinput;
2416 /* XXXX This is too dramatic a measure... */
2419 if (regmatch(re->program + 1)) {
2420 /* Even though we succeeded, we need to restore
2421 global variables, since we may be wrapped inside
2422 SUSPEND, thus the match may be not finished yet. */
2424 /* XXXX Do this only if SUSPENDed? */
2425 PL_reg_call_cc = state.prev;
2426 PL_regcc = state.cc;
2427 PL_reg_re = state.re;
2428 cache_re(PL_reg_re);
2430 /* XXXX This is too dramatic a measure... */
2433 /* These are needed even if not SUSPEND. */
2441 PL_reg_call_cc = state.prev;
2442 PL_regcc = state.cc;
2443 PL_reg_re = state.re;
2444 cache_re(PL_reg_re);
2446 /* XXXX This is too dramatic a measure... */
2455 sv_setsv(save_scalar(PL_replgv), ret);
2459 n = ARG(scan); /* which paren pair */
2460 PL_reg_start_tmp[n] = locinput;
2465 n = ARG(scan); /* which paren pair */
2466 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2467 PL_regendp[n] = locinput - PL_bostr;
2468 if (n > *PL_reglastparen)
2469 *PL_reglastparen = n;
2472 n = ARG(scan); /* which paren pair */
2473 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2476 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2478 next = NEXTOPER(NEXTOPER(scan));
2480 next = scan + ARG(scan);
2481 if (OP(next) == IFTHEN) /* Fake one. */
2482 next = NEXTOPER(NEXTOPER(next));
2486 logical = scan->flags;
2488 /*******************************************************************
2489 PL_regcc contains infoblock about the innermost (...)* loop, and
2490 a pointer to the next outer infoblock.
2492 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2494 1) After matching X, regnode for CURLYX is processed;
2496 2) This regnode creates infoblock on the stack, and calls
2497 regmatch() recursively with the starting point at WHILEM node;
2499 3) Each hit of WHILEM node tries to match A and Z (in the order
2500 depending on the current iteration, min/max of {min,max} and
2501 greediness). The information about where are nodes for "A"
2502 and "Z" is read from the infoblock, as is info on how many times "A"
2503 was already matched, and greediness.
2505 4) After A matches, the same WHILEM node is hit again.
2507 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2508 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2509 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2510 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2511 of the external loop.
2513 Currently present infoblocks form a tree with a stem formed by PL_curcc
2514 and whatever it mentions via ->next, and additional attached trees
2515 corresponding to temporarily unset infoblocks as in "5" above.
2517 In the following picture infoblocks for outer loop of
2518 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2519 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2520 infoblocks are drawn below the "reset" infoblock.
2522 In fact in the picture below we do not show failed matches for Z and T
2523 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2524 more obvious *why* one needs to *temporary* unset infoblocks.]
2526 Matched REx position InfoBlocks Comment
2530 Y A)*?Z)*?T x <- O <- I
2531 YA )*?Z)*?T x <- O <- I
2532 YA A)*?Z)*?T x <- O <- I
2533 YAA )*?Z)*?T x <- O <- I
2534 YAA Z)*?T x <- O # Temporary unset I
2537 YAAZ Y(A)*?Z)*?T x <- O
2540 YAAZY (A)*?Z)*?T x <- O
2543 YAAZY A)*?Z)*?T x <- O <- I
2546 YAAZYA )*?Z)*?T x <- O <- I
2549 YAAZYA Z)*?T x <- O # Temporary unset I
2555 YAAZYAZ T x # Temporary unset O
2562 *******************************************************************/
2565 CHECKPOINT cp = PL_savestack_ix;
2567 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2569 cc.oldcc = PL_regcc;
2571 cc.parenfloor = *PL_reglastparen;
2573 cc.min = ARG1(scan);
2574 cc.max = ARG2(scan);
2575 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2579 PL_reginput = locinput;
2580 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2582 PL_regcc = cc.oldcc;
2588 * This is really hard to understand, because after we match
2589 * what we're trying to match, we must make sure the rest of
2590 * the REx is going to match for sure, and to do that we have
2591 * to go back UP the parse tree by recursing ever deeper. And
2592 * if it fails, we have to reset our parent's current state
2593 * that we can try again after backing off.
2596 CHECKPOINT cp, lastcp;
2597 CURCUR* cc = PL_regcc;
2598 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2600 n = cc->cur + 1; /* how many we know we matched */
2601 PL_reginput = locinput;
2604 PerlIO_printf(Perl_debug_log,
2605 "%*s %ld out of %ld..%ld cc=%lx\n",
2606 REPORT_CODE_OFF+PL_regindent*2, "",
2607 (long)n, (long)cc->min,
2608 (long)cc->max, (long)cc)
2611 /* If degenerate scan matches "", assume scan done. */
2613 if (locinput == cc->lastloc && n >= cc->min) {
2614 PL_regcc = cc->oldcc;
2618 PerlIO_printf(Perl_debug_log,
2619 "%*s empty match detected, try continuation...\n",
2620 REPORT_CODE_OFF+PL_regindent*2, "")
2622 if (regmatch(cc->next))
2630 /* First just match a string of min scans. */
2634 cc->lastloc = locinput;
2635 if (regmatch(cc->scan))
2638 cc->lastloc = lastloc;
2643 /* Check whether we already were at this position.
2644 Postpone detection until we know the match is not
2645 *that* much linear. */
2646 if (!PL_reg_maxiter) {
2647 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2648 PL_reg_leftiter = PL_reg_maxiter;
2650 if (PL_reg_leftiter-- == 0) {
2651 I32 size = (PL_reg_maxiter + 7)/8;
2652 if (PL_reg_poscache) {
2653 if (PL_reg_poscache_size < size) {
2654 Renew(PL_reg_poscache, size, char);
2655 PL_reg_poscache_size = size;
2657 Zero(PL_reg_poscache, size, char);
2660 PL_reg_poscache_size = size;
2661 Newz(29, PL_reg_poscache, size, char);
2664 PerlIO_printf(Perl_debug_log,
2665 "%sDetected a super-linear match, switching on caching%s...\n",
2666 PL_colors[4], PL_colors[5])
2669 if (PL_reg_leftiter < 0) {
2670 I32 o = locinput - PL_bostr, b;
2672 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2675 if (PL_reg_poscache[o] & (1<<b)) {
2677 PerlIO_printf(Perl_debug_log,
2678 "%*s already tried at this position...\n",
2679 REPORT_CODE_OFF+PL_regindent*2, "")
2683 PL_reg_poscache[o] |= (1<<b);
2687 /* Prefer next over scan for minimal matching. */
2690 PL_regcc = cc->oldcc;
2693 cp = regcppush(cc->parenfloor);
2695 if (regmatch(cc->next)) {
2697 sayYES; /* All done. */
2705 if (n >= cc->max) { /* Maximum greed exceeded? */
2706 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2707 && !(PL_reg_flags & RF_warned)) {
2708 PL_reg_flags |= RF_warned;
2709 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2710 "Complex regular subexpression recursion",
2717 PerlIO_printf(Perl_debug_log,
2718 "%*s trying longer...\n",
2719 REPORT_CODE_OFF+PL_regindent*2, "")
2721 /* Try scanning more and see if it helps. */
2722 PL_reginput = locinput;
2724 cc->lastloc = locinput;
2725 cp = regcppush(cc->parenfloor);
2727 if (regmatch(cc->scan)) {
2734 cc->lastloc = lastloc;
2738 /* Prefer scan over next for maximal matching. */
2740 if (n < cc->max) { /* More greed allowed? */
2741 cp = regcppush(cc->parenfloor);
2743 cc->lastloc = locinput;
2745 if (regmatch(cc->scan)) {
2750 regcppop(); /* Restore some previous $<digit>s? */
2751 PL_reginput = locinput;
2753 PerlIO_printf(Perl_debug_log,
2754 "%*s failed, try continuation...\n",
2755 REPORT_CODE_OFF+PL_regindent*2, "")
2758 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2759 && !(PL_reg_flags & RF_warned)) {
2760 PL_reg_flags |= RF_warned;
2761 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2762 "Complex regular subexpression recursion",
2766 /* Failed deeper matches of scan, so see if this one works. */
2767 PL_regcc = cc->oldcc;
2770 if (regmatch(cc->next))
2776 cc->lastloc = lastloc;
2781 next = scan + ARG(scan);
2784 inner = NEXTOPER(NEXTOPER(scan));
2787 inner = NEXTOPER(scan);
2792 if (OP(next) != c1) /* No choice. */
2793 next = inner; /* Avoid recursion. */
2795 int lastparen = *PL_reglastparen;
2799 PL_reginput = locinput;
2800 if (regmatch(inner))
2803 for (n = *PL_reglastparen; n > lastparen; n--)
2805 *PL_reglastparen = n;
2808 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2812 inner = NEXTOPER(scan);
2813 if (c1 == BRANCHJ) {
2814 inner = NEXTOPER(inner);
2816 } while (scan != NULL && OP(scan) == c1);
2830 /* We suppose that the next guy does not need
2831 backtracking: in particular, it is of constant length,
2832 and has no parenths to influence future backrefs. */
2833 ln = ARG1(scan); /* min to match */
2834 n = ARG2(scan); /* max to match */
2835 paren = scan->flags;
2837 if (paren > PL_regsize)
2839 if (paren > *PL_reglastparen)
2840 *PL_reglastparen = paren;
2842 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2844 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2845 PL_reginput = locinput;
2848 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2850 if (ln && l == 0 && n >= ln
2851 /* In fact, this is tricky. If paren, then the
2852 fact that we did/didnot match may influence
2853 future execution. */
2854 && !(paren && ln == 0))
2856 locinput = PL_reginput;
2857 if (PL_regkind[(U8)OP(next)] == EXACT) {
2858 c1 = (U8)*STRING(next);
2859 if (OP(next) == EXACTF)
2861 else if (OP(next) == EXACTFL)
2862 c2 = PL_fold_locale[c1];
2869 /* This may be improved if l == 0. */
2870 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2871 /* If it could work, try it. */
2873 UCHARAT(PL_reginput) == c1 ||
2874 UCHARAT(PL_reginput) == c2)
2878 PL_regstartp[paren] =
2879 HOPc(PL_reginput, -l) - PL_bostr;
2880 PL_regendp[paren] = PL_reginput - PL_bostr;
2883 PL_regendp[paren] = -1;
2889 /* Couldn't or didn't -- move forward. */
2890 PL_reginput = locinput;
2891 if (regrepeat_hard(scan, 1, &l)) {
2893 locinput = PL_reginput;
2900 n = regrepeat_hard(scan, n, &l);
2901 if (n != 0 && l == 0
2902 /* In fact, this is tricky. If paren, then the
2903 fact that we did/didnot match may influence
2904 future execution. */
2905 && !(paren && ln == 0))
2907 locinput = PL_reginput;
2909 PerlIO_printf(Perl_debug_log,
2910 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2911 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2915 if (PL_regkind[(U8)OP(next)] == EXACT) {
2916 c1 = (U8)*STRING(next);
2917 if (OP(next) == EXACTF)
2919 else if (OP(next) == EXACTFL)
2920 c2 = PL_fold_locale[c1];
2929 /* If it could work, try it. */
2931 UCHARAT(PL_reginput) == c1 ||
2932 UCHARAT(PL_reginput) == c2)
2935 PerlIO_printf(Perl_debug_log,
2936 "%*s trying tail with n=%"IVdf"...\n",
2937 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2941 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2942 PL_regendp[paren] = PL_reginput - PL_bostr;
2945 PL_regendp[paren] = -1;
2951 /* Couldn't or didn't -- back up. */
2953 locinput = HOPc(locinput, -l);
2954 PL_reginput = locinput;
2961 paren = scan->flags; /* Which paren to set */
2962 if (paren > PL_regsize)
2964 if (paren > *PL_reglastparen)
2965 *PL_reglastparen = paren;
2966 ln = ARG1(scan); /* min to match */
2967 n = ARG2(scan); /* max to match */
2968 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2972 ln = ARG1(scan); /* min to match */
2973 n = ARG2(scan); /* max to match */
2974 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2979 scan = NEXTOPER(scan);
2985 scan = NEXTOPER(scan);
2989 * Lookahead to avoid useless match attempts
2990 * when we know what character comes next.
2992 if (PL_regkind[(U8)OP(next)] == EXACT) {
2993 c1 = (U8)*STRING(next);
2994 if (OP(next) == EXACTF)
2996 else if (OP(next) == EXACTFL)
2997 c2 = PL_fold_locale[c1];
3003 PL_reginput = locinput;
3005 PL_regendp[paren] = -1;
3009 if (ln && regrepeat(scan, ln) < ln)
3011 locinput = PL_reginput;
3014 char *e = locinput + n - ln; /* Should not check after this */
3015 char *old = locinput;
3017 if (e >= PL_regeol || (n == REG_INFTY))
3020 /* Find place 'next' could work */
3022 while (locinput <= e && *locinput != c1)
3025 while (locinput <= e
3032 /* PL_reginput == old now */
3033 if (locinput != old) {
3034 ln = 1; /* Did some */
3035 if (regrepeat(scan, locinput - old) <
3039 /* PL_reginput == locinput now */
3042 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
3043 PL_regendp[paren] = locinput - PL_bostr;
3046 PL_regendp[paren] = -1;
3050 PL_reginput = locinput; /* Could be reset... */
3052 /* Couldn't or didn't -- move forward. */
3057 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3058 /* If it could work, try it. */
3060 UCHARAT(PL_reginput) == c1 ||
3061 UCHARAT(PL_reginput) == c2)
3065 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3066 PL_regendp[paren] = PL_reginput - PL_bostr;
3069 PL_regendp[paren] = -1;
3075 /* Couldn't or didn't -- move forward. */
3076 PL_reginput = locinput;
3077 if (regrepeat(scan, 1)) {
3079 locinput = PL_reginput;
3087 n = regrepeat(scan, n);
3088 locinput = PL_reginput;
3089 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3090 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3091 ln = n; /* why back off? */
3092 /* ...because $ and \Z can match before *and* after
3093 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3094 We should back off by one in this case. */
3095 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3101 /* If it could work, try it. */
3103 UCHARAT(PL_reginput) == c1 ||
3104 UCHARAT(PL_reginput) == c2)
3108 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3109 PL_regendp[paren] = PL_reginput - PL_bostr;
3112 PL_regendp[paren] = -1;
3118 /* Couldn't or didn't -- back up. */
3120 PL_reginput = locinput = HOPc(locinput, -1);
3125 /* If it could work, try it. */
3127 UCHARAT(PL_reginput) == c1 ||
3128 UCHARAT(PL_reginput) == c2)
3134 /* Couldn't or didn't -- back up. */
3136 PL_reginput = locinput = HOPc(locinput, -1);
3143 if (PL_reg_call_cc) {
3144 re_cc_state *cur_call_cc = PL_reg_call_cc;
3145 CURCUR *cctmp = PL_regcc;
3146 regexp *re = PL_reg_re;
3147 CHECKPOINT cp, lastcp;
3149 cp = regcppush(0); /* Save *all* the positions. */
3151 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3153 PL_reginput = locinput; /* Make position available to
3155 cache_re(PL_reg_call_cc->re);
3156 PL_regcc = PL_reg_call_cc->cc;
3157 PL_reg_call_cc = PL_reg_call_cc->prev;
3158 if (regmatch(cur_call_cc->node)) {
3159 PL_reg_call_cc = cur_call_cc;
3165 PL_reg_call_cc = cur_call_cc;
3171 PerlIO_printf(Perl_debug_log,
3172 "%*s continuation failed...\n",
3173 REPORT_CODE_OFF+PL_regindent*2, "")
3177 if (locinput < PL_regtill) {
3178 DEBUG_r(PerlIO_printf(Perl_debug_log,
3179 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3181 (long)(locinput - PL_reg_starttry),
3182 (long)(PL_regtill - PL_reg_starttry),
3184 sayNO_FINAL; /* Cannot match: too short. */
3186 PL_reginput = locinput; /* put where regtry can find it */
3187 sayYES_FINAL; /* Success! */
3189 PL_reginput = locinput; /* put where regtry can find it */
3190 sayYES_LOUD; /* Success! */
3193 PL_reginput = locinput;
3198 if (UTF) { /* XXXX This is absolutely
3199 broken, we read before
3201 s = HOPMAYBEc(locinput, -scan->flags);
3207 if (locinput < PL_bostr + scan->flags)
3209 PL_reginput = locinput - scan->flags;
3214 PL_reginput = locinput;
3219 if (UTF) { /* XXXX This is absolutely
3220 broken, we read before
3222 s = HOPMAYBEc(locinput, -scan->flags);
3223 if (!s || s < PL_bostr)
3228 if (locinput < PL_bostr + scan->flags)
3230 PL_reginput = locinput - scan->flags;
3235 PL_reginput = locinput;
3238 inner = NEXTOPER(NEXTOPER(scan));
3239 if (regmatch(inner) != n) {
3254 if (OP(scan) == SUSPEND) {
3255 locinput = PL_reginput;
3256 nextchr = UCHARAT(locinput);
3261 next = scan + ARG(scan);
3266 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3267 PTR2UV(scan), OP(scan));
3268 Perl_croak(aTHX_ "regexp memory corruption");
3274 * We get here only if there's trouble -- normally "case END" is
3275 * the terminating point.
3277 Perl_croak(aTHX_ "corrupted regexp pointers");
3283 PerlIO_printf(Perl_debug_log,
3284 "%*s %scould match...%s\n",
3285 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3289 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3290 PL_colors[4],PL_colors[5]));
3299 PerlIO_printf(Perl_debug_log,
3300 "%*s %sfailed...%s\n",
3301 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3313 - regrepeat - repeatedly match something simple, report how many
3316 * [This routine now assumes that it will only match on things of length 1.
3317 * That was true before, but now we assume scan - reginput is the count,
3318 * rather than incrementing count on every character. [Er, except utf8.]]
3321 S_regrepeat(pTHX_ regnode *p, I32 max)
3324 register char *scan;
3326 register char *loceol = PL_regeol;
3327 register I32 hardcount = 0;
3330 if (max != REG_INFTY && max < loceol - scan)
3331 loceol = scan + max;
3334 while (scan < loceol && *scan != '\n')
3342 while (scan < loceol && *scan != '\n') {
3343 scan += UTF8SKIP(scan);
3349 while (scan < loceol) {
3350 scan += UTF8SKIP(scan);
3354 case EXACT: /* length of string is 1 */
3356 while (scan < loceol && UCHARAT(scan) == c)
3359 case EXACTF: /* length of string is 1 */
3361 while (scan < loceol &&
3362 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3365 case EXACTFL: /* length of string is 1 */
3366 PL_reg_flags |= RF_tainted;
3368 while (scan < loceol &&
3369 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3374 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3375 scan += UTF8SKIP(scan);
3380 while (scan < loceol && REGINCLASS(p, *scan))
3384 while (scan < loceol && isALNUM(*scan))
3389 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3390 scan += UTF8SKIP(scan);
3395 PL_reg_flags |= RF_tainted;
3396 while (scan < loceol && isALNUM_LC(*scan))
3400 PL_reg_flags |= RF_tainted;
3402 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3403 scan += UTF8SKIP(scan);
3409 while (scan < loceol && !isALNUM(*scan))
3414 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3415 scan += UTF8SKIP(scan);
3420 PL_reg_flags |= RF_tainted;
3421 while (scan < loceol && !isALNUM_LC(*scan))
3425 PL_reg_flags |= RF_tainted;
3427 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3428 scan += UTF8SKIP(scan);
3433 while (scan < loceol && isSPACE(*scan))
3438 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3439 scan += UTF8SKIP(scan);
3444 PL_reg_flags |= RF_tainted;
3445 while (scan < loceol && isSPACE_LC(*scan))
3449 PL_reg_flags |= RF_tainted;
3451 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3452 scan += UTF8SKIP(scan);
3457 while (scan < loceol && !isSPACE(*scan))
3462 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3463 scan += UTF8SKIP(scan);
3468 PL_reg_flags |= RF_tainted;
3469 while (scan < loceol && !isSPACE_LC(*scan))
3473 PL_reg_flags |= RF_tainted;
3475 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3476 scan += UTF8SKIP(scan);
3481 while (scan < loceol && isDIGIT(*scan))
3486 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3487 scan += UTF8SKIP(scan);
3493 while (scan < loceol && !isDIGIT(*scan))
3498 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3499 scan += UTF8SKIP(scan);
3503 default: /* Called on something of 0 width. */
3504 break; /* So match right here or not at all. */
3510 c = scan - PL_reginput;
3515 SV *prop = sv_newmortal();
3518 PerlIO_printf(Perl_debug_log,
3519 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3520 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3527 - regrepeat_hard - repeatedly match something, report total lenth and length
3529 * The repeater is supposed to have constant length.
3533 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3536 register char *scan;
3537 register char *start;
3538 register char *loceol = PL_regeol;
3540 I32 count = 0, res = 1;
3545 start = PL_reginput;
3547 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3550 while (start < PL_reginput) {
3552 start += UTF8SKIP(start);
3563 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3565 *lp = l = PL_reginput - start;
3566 if (max != REG_INFTY && l*max < loceol - scan)
3567 loceol = scan + l*max;
3580 - reginclass - determine if a character falls into a character class
3584 S_reginclass(pTHX_ register regnode *p, register I32 c)
3587 char flags = ANYOF_FLAGS(p);
3591 if (ANYOF_BITMAP_TEST(p, c))
3593 else if (flags & ANYOF_FOLD) {
3595 if (flags & ANYOF_LOCALE) {
3596 PL_reg_flags |= RF_tainted;
3597 cf = PL_fold_locale[c];
3601 if (ANYOF_BITMAP_TEST(p, cf))
3605 if (!match && (flags & ANYOF_CLASS)) {
3606 PL_reg_flags |= RF_tainted;
3608 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3609 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3610 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3611 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3612 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3613 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3614 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3615 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3616 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3617 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3618 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3619 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3620 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3621 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3622 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3623 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3624 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3625 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3626 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3627 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3628 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3629 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3630 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3631 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3632 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3633 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3634 ) /* How's that for a conditional? */
3640 return (flags & ANYOF_INVERT) ? !match : match;
3644 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3647 char flags = ARG1(f);
3649 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3651 if (swash_fetch(sv, p))
3653 else if (flags & ANYOF_FOLD) {
3654 U8 tmpbuf[UTF8_MAXLEN];
3655 if (flags & ANYOF_LOCALE) {
3656 PL_reg_flags |= RF_tainted;
3657 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3660 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3661 if (swash_fetch(sv, tmpbuf))
3665 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3667 return (flags & ANYOF_INVERT) ? !match : match;
3671 S_reghop(pTHX_ U8 *s, I32 off)
3675 while (off-- && s < (U8*)PL_regeol)
3680 if (s > (U8*)PL_bostr) {
3683 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3685 } /* XXX could check well-formedness here */
3693 S_reghopmaybe(pTHX_ U8* s, I32 off)
3697 while (off-- && s < (U8*)PL_regeol)
3704 if (s > (U8*)PL_bostr) {
3707 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3709 } /* XXX could check well-formedness here */
3725 restore_pos(pTHXo_ void *arg)
3728 if (PL_reg_eval_set) {
3729 if (PL_reg_oldsaved) {
3730 PL_reg_re->subbeg = PL_reg_oldsaved;
3731 PL_reg_re->sublen = PL_reg_oldsavedlen;
3732 RX_MATCH_COPIED_on(PL_reg_re);
3734 PL_reg_magic->mg_len = PL_reg_oldpos;
3735 PL_reg_eval_set = 0;
3736 PL_curpm = PL_reg_oldcurpm;