5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-2000, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 check = prog->check_substr;
339 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
340 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
341 || ( (prog->reganch & ROPT_ANCH_BOL)
342 && !PL_multiline ) ); /* Check after \n? */
344 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
345 /* Substring at constant offset from beg-of-str... */
348 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
349 /* SvCUR is not set on references: SvRV and SvPVX overlap */
351 && (strpos + SvCUR(sv) != strend)) {
352 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
355 PL_regeol = strend; /* Used in HOP() */
356 s = HOPc(strpos, prog->check_offset_min);
358 slen = SvCUR(check); /* >= 1 */
360 if ( strend - s > slen || strend - s < slen - 1
361 || (strend - s == slen && strend[-1] != '\n')) {
362 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
365 /* Now should match s[0..slen-2] */
367 if (slen && (*SvPVX(check) != *s
369 && memNE(SvPVX(check), s, slen)))) {
371 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
375 else if (*SvPVX(check) != *s
376 || ((slen = SvCUR(check)) > 1
377 && memNE(SvPVX(check), s, slen)))
379 goto success_at_start;
381 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
383 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
384 end_shift = prog->minlen - start_shift -
385 CHR_SVLEN(check) + (SvTAIL(check) != 0);
387 I32 end = prog->check_offset_max + CHR_SVLEN(check)
388 - (SvTAIL(check) != 0);
389 I32 eshift = strend - s - end;
391 if (end_shift < eshift)
395 else { /* Can match at random position */
398 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
399 /* Should be nonnegative! */
400 end_shift = prog->minlen - start_shift -
401 CHR_SVLEN(check) + (SvTAIL(check) != 0);
404 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
406 Perl_croak(aTHX_ "panic: end_shift");
410 /* Find a possible match in the region s..strend by looking for
411 the "check" substring in the region corrected by start/end_shift. */
412 if (flags & REXEC_SCREAM) {
413 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
414 I32 p = -1; /* Internal iterator of scream. */
415 I32 *pp = data ? data->scream_pos : &p;
417 if (PL_screamfirst[BmRARE(check)] >= 0
418 || ( BmRARE(check) == '\n'
419 && (BmPREVIOUS(check) == SvCUR(check) - 1)
421 s = screaminstr(sv, check,
422 start_shift + (s - strbeg), end_shift, pp, 0);
426 *data->scream_olds = s;
429 s = fbm_instr((unsigned char*)s + start_shift,
430 (unsigned char*)strend - end_shift,
431 check, PL_multiline ? FBMrf_MULTILINE : 0);
433 /* Update the count-of-usability, remove useless subpatterns,
436 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
437 (s ? "Found" : "Did not find"),
438 ((check == prog->anchored_substr) ? "anchored" : "floating"),
440 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
442 PL_colors[1], (SvTAIL(check) ? "$" : ""),
443 (s ? " at offset " : "...\n") ) );
450 /* Finish the diagnostic message */
451 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
453 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
454 Start with the other substr.
455 XXXX no SCREAM optimization yet - and a very coarse implementation
456 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
457 *always* match. Probably should be marked during compile...
458 Probably it is right to do no SCREAM here...
461 if (prog->float_substr && prog->anchored_substr) {
462 /* Take into account the "other" substring. */
463 /* XXXX May be hopelessly wrong for UTF... */
466 if (check == prog->float_substr) {
469 char *last = s - start_shift, *last1, *last2;
473 t = s - prog->check_offset_max;
474 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
475 && (!(prog->reganch & ROPT_UTF8)
476 || (PL_bostr = strpos, /* Used in regcopmaybe() */
477 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
482 t += prog->anchored_offset;
483 if (t < other_last) /* These positions already checked */
486 last2 = last1 = strend - prog->minlen;
489 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
490 /* On end-of-str: see comment below. */
491 s = fbm_instr((unsigned char*)t,
492 (unsigned char*)last1 + prog->anchored_offset
493 + SvCUR(prog->anchored_substr)
494 - (SvTAIL(prog->anchored_substr)!=0),
495 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
496 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
497 (s ? "Found" : "Contradicts"),
499 (int)(SvCUR(prog->anchored_substr)
500 - (SvTAIL(prog->anchored_substr)!=0)),
501 SvPVX(prog->anchored_substr),
502 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
504 if (last1 >= last2) {
505 DEBUG_r(PerlIO_printf(Perl_debug_log,
506 ", giving up...\n"));
509 DEBUG_r(PerlIO_printf(Perl_debug_log,
510 ", trying floating at offset %ld...\n",
511 (long)(s1 + 1 - i_strpos)));
512 PL_regeol = strend; /* Used in HOP() */
513 other_last = last1 + prog->anchored_offset + 1;
518 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
519 (long)(s - i_strpos)));
520 t = s - prog->anchored_offset;
529 else { /* Take into account the floating substring. */
534 last1 = last = strend - prog->minlen + prog->float_min_offset;
535 if (last - t > prog->float_max_offset)
536 last = t + prog->float_max_offset;
537 s = t + prog->float_min_offset;
540 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
541 /* fbm_instr() takes into account exact value of end-of-str
542 if the check is SvTAIL(ed). Since false positives are OK,
543 and end-of-str is not later than strend we are OK. */
544 s = fbm_instr((unsigned char*)s,
545 (unsigned char*)last + SvCUR(prog->float_substr)
546 - (SvTAIL(prog->float_substr)!=0),
547 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
548 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
549 (s ? "Found" : "Contradicts"),
551 (int)(SvCUR(prog->float_substr)
552 - (SvTAIL(prog->float_substr)!=0)),
553 SvPVX(prog->float_substr),
554 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
557 DEBUG_r(PerlIO_printf(Perl_debug_log,
558 ", giving up...\n"));
561 DEBUG_r(PerlIO_printf(Perl_debug_log,
562 ", trying anchored starting at offset %ld...\n",
563 (long)(s1 + 1 - i_strpos)));
564 other_last = last + 1;
565 PL_regeol = strend; /* Used in HOP() */
570 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
571 (long)(s - i_strpos)));
581 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
584 && (!(prog->reganch & ROPT_UTF8)
585 || (PL_bostr = strpos, /* Used in regcopmaybe() */
586 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
589 /* Fixed substring is found far enough so that the match
590 cannot start at strpos. */
592 if (ml_anch && t[-1] != '\n') {
593 /* Eventually fbm_*() should handle this, but often
594 anchored_offset is not 0, so this check will not be wasted. */
595 /* XXXX In the code below we prefer to look for "^" even in
596 presence of anchored substrings. And we search even
597 beyond the found float position. These pessimizations
598 are historical artefacts only. */
600 while (t < strend - prog->minlen) {
602 if (t < check_at - prog->check_offset_min) {
603 if (prog->anchored_substr) {
604 /* Since we moved from the found position,
605 we definitely contradict the found anchored
606 substr. Due to the above check we do not
607 contradict "check" substr.
608 Thus we can arrive here only if check substr
609 is float. Redo checking for "other"=="fixed".
612 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
613 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
614 goto do_other_anchored;
616 /* We don't contradict the found floating substring. */
617 /* XXXX Why not check for STCLASS? */
619 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
620 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
623 /* Position contradicts check-string */
624 /* XXXX probably better to look for check-string
625 than for "\n", so one should lower the limit for t? */
626 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
627 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
633 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
634 PL_colors[0],PL_colors[1]));
639 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
643 /* The found string does not prohibit matching at beg-of-str
644 - no optimization of calling REx engine can be performed,
645 unless it was an MBOL and we are not after MBOL. */
647 /* Even in this situation we may use MBOL flag if strpos is offset
648 wrt the start of the string. */
649 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
650 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n'
651 /* May be due to an implicit anchor of m{.*foo} */
652 && !(prog->reganch & ROPT_IMPLICIT))
657 DEBUG_r( if (ml_anch)
658 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
659 PL_colors[0],PL_colors[1]);
662 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
663 && prog->check_substr /* Could be deleted already */
664 && --BmUSEFUL(prog->check_substr) < 0
665 && prog->check_substr == prog->float_substr)
667 /* If flags & SOMETHING - do not do it many times on the same match */
668 SvREFCNT_dec(prog->check_substr);
669 prog->check_substr = Nullsv; /* disable */
670 prog->float_substr = Nullsv; /* clear */
672 /* XXXX This is a remnant of the old implementation. It
673 looks wasteful, since now INTUIT can use many
675 prog->reganch &= ~RE_USE_INTUIT;
682 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
683 if (prog->regstclass) {
684 /* minlen == 0 is possible if regstclass is \b or \B,
685 and the fixed substr is ''$.
686 Since minlen is already taken into account, s+1 is before strend;
687 accidentally, minlen >= 1 guaranties no false positives at s + 1
688 even for \b or \B. But (minlen? 1 : 0) below assumes that
689 regstclass does not come from lookahead... */
690 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
691 This leaves EXACTF only, which is dealt with in find_byclass(). */
692 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
693 ? STR_LEN(prog->regstclass)
695 char *endpos = (prog->anchored_substr || ml_anch)
696 ? s + (prog->minlen? cl_l : 0)
697 : (prog->float_substr ? check_at - start_shift + cl_l
699 char *startpos = sv && SvPOK(sv) ? strend - SvCUR(sv) : s;
702 if (prog->reganch & ROPT_UTF8) {
703 PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
706 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
711 if (endpos == strend) {
712 DEBUG_r( PerlIO_printf(Perl_debug_log,
713 "Could not match STCLASS...\n") );
716 DEBUG_r( PerlIO_printf(Perl_debug_log,
717 "This position contradicts STCLASS...\n") );
718 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
720 /* Contradict one of substrings */
721 if (prog->anchored_substr) {
722 if (prog->anchored_substr == check) {
723 DEBUG_r( what = "anchored" );
725 PL_regeol = strend; /* Used in HOP() */
727 if (s + start_shift + end_shift > strend) {
728 /* XXXX Should be taken into account earlier? */
729 DEBUG_r( PerlIO_printf(Perl_debug_log,
730 "Could not match STCLASS...\n") );
733 DEBUG_r( PerlIO_printf(Perl_debug_log,
734 "Trying %s substr starting at offset %ld...\n",
735 what, (long)(s + start_shift - i_strpos)) );
738 /* Have both, check_string is floating */
739 if (t + start_shift >= check_at) /* Contradicts floating=check */
740 goto retry_floating_check;
741 /* Recheck anchored substring, but not floating... */
743 DEBUG_r( PerlIO_printf(Perl_debug_log,
744 "Trying anchored substr starting at offset %ld...\n",
745 (long)(other_last - i_strpos)) );
746 goto do_other_anchored;
748 /* Another way we could have checked stclass at the
749 current position only: */
752 DEBUG_r( PerlIO_printf(Perl_debug_log,
753 "Trying /^/m starting at offset %ld...\n",
754 (long)(t - i_strpos)) );
757 if (!prog->float_substr) /* Could have been deleted */
759 /* Check is floating subtring. */
760 retry_floating_check:
761 t = check_at - start_shift;
762 DEBUG_r( what = "floating" );
763 goto hop_and_restart;
766 PerlIO_printf(Perl_debug_log,
767 "By STCLASS: moving %ld --> %ld\n",
768 (long)(t - i_strpos), (long)(s - i_strpos));
770 PerlIO_printf(Perl_debug_log,
771 "Does not contradict STCLASS...\n") );
773 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
774 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
777 fail_finish: /* Substring not found */
778 if (prog->check_substr) /* could be removed already */
779 BmUSEFUL(prog->check_substr) += 5; /* hooray */
781 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
782 PL_colors[4],PL_colors[5]));
786 /* We know what class REx starts with. Try to find this position... */
788 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
790 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
796 register I32 tmp = 1; /* Scratch variable? */
798 /* We know what class it must start with. */
802 if (REGINCLASSUTF8(c, (U8*)s)) {
803 if (tmp && (norun || regtry(prog, s)))
815 if (REGINCLASS(c, *(U8*)s)) {
816 if (tmp && (norun || regtry(prog, s)))
836 c2 = PL_fold_locale[c1];
841 e = s; /* Due to minlen logic of intuit() */
842 /* Here it is NOT UTF! */
846 && (ln == 1 || !(OP(c) == EXACTF
848 : ibcmp_locale(s, m, ln)))
849 && (norun || regtry(prog, s)) )
855 if ( (*(U8*)s == c1 || *(U8*)s == c2)
856 && (ln == 1 || !(OP(c) == EXACTF
858 : ibcmp_locale(s, m, ln)))
859 && (norun || regtry(prog, s)) )
866 PL_reg_flags |= RF_tainted;
869 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
870 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
872 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
874 if ((norun || regtry(prog, s)))
879 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
883 PL_reg_flags |= RF_tainted;
886 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
887 tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
889 if (tmp == !(OP(c) == BOUNDUTF8 ?
890 swash_fetch(PL_utf8_alnum, (U8*)s) :
891 isALNUM_LC_utf8((U8*)s)))
894 if ((norun || regtry(prog, s)))
899 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
903 PL_reg_flags |= RF_tainted;
906 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
907 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
909 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
911 else if ((norun || regtry(prog, s)))
915 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
919 PL_reg_flags |= RF_tainted;
922 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
923 tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
925 if (tmp == !(OP(c) == NBOUNDUTF8 ?
926 swash_fetch(PL_utf8_alnum, (U8*)s) :
927 isALNUM_LC_utf8((U8*)s)))
929 else if ((norun || regtry(prog, s)))
933 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
939 if (tmp && (norun || regtry(prog, s)))
951 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
952 if (tmp && (norun || regtry(prog, s)))
963 PL_reg_flags |= RF_tainted;
965 if (isALNUM_LC(*s)) {
966 if (tmp && (norun || regtry(prog, s)))
977 PL_reg_flags |= RF_tainted;
979 if (isALNUM_LC_utf8((U8*)s)) {
980 if (tmp && (norun || regtry(prog, s)))
993 if (tmp && (norun || regtry(prog, s)))
1004 while (s < strend) {
1005 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
1006 if (tmp && (norun || regtry(prog, s)))
1017 PL_reg_flags |= RF_tainted;
1018 while (s < strend) {
1019 if (!isALNUM_LC(*s)) {
1020 if (tmp && (norun || regtry(prog, s)))
1031 PL_reg_flags |= RF_tainted;
1032 while (s < strend) {
1033 if (!isALNUM_LC_utf8((U8*)s)) {
1034 if (tmp && (norun || regtry(prog, s)))
1045 while (s < strend) {
1047 if (tmp && (norun || regtry(prog, s)))
1058 while (s < strend) {
1059 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1060 if (tmp && (norun || regtry(prog, s)))
1071 PL_reg_flags |= RF_tainted;
1072 while (s < strend) {
1073 if (isSPACE_LC(*s)) {
1074 if (tmp && (norun || regtry(prog, s)))
1085 PL_reg_flags |= RF_tainted;
1086 while (s < strend) {
1087 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1088 if (tmp && (norun || regtry(prog, s)))
1099 while (s < strend) {
1101 if (tmp && (norun || regtry(prog, s)))
1112 while (s < strend) {
1113 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1114 if (tmp && (norun || regtry(prog, s)))
1125 PL_reg_flags |= RF_tainted;
1126 while (s < strend) {
1127 if (!isSPACE_LC(*s)) {
1128 if (tmp && (norun || regtry(prog, s)))
1139 PL_reg_flags |= RF_tainted;
1140 while (s < strend) {
1141 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1142 if (tmp && (norun || regtry(prog, s)))
1153 while (s < strend) {
1155 if (tmp && (norun || regtry(prog, s)))
1166 while (s < strend) {
1167 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1168 if (tmp && (norun || regtry(prog, s)))
1179 PL_reg_flags |= RF_tainted;
1180 while (s < strend) {
1181 if (isDIGIT_LC(*s)) {
1182 if (tmp && (norun || regtry(prog, s)))
1193 PL_reg_flags |= RF_tainted;
1194 while (s < strend) {
1195 if (isDIGIT_LC_utf8((U8*)s)) {
1196 if (tmp && (norun || regtry(prog, s)))
1207 while (s < strend) {
1209 if (tmp && (norun || regtry(prog, s)))
1220 while (s < strend) {
1221 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1222 if (tmp && (norun || regtry(prog, s)))
1233 PL_reg_flags |= RF_tainted;
1234 while (s < strend) {
1235 if (!isDIGIT_LC(*s)) {
1236 if (tmp && (norun || regtry(prog, s)))
1247 PL_reg_flags |= RF_tainted;
1248 while (s < strend) {
1249 if (!isDIGIT_LC_utf8((U8*)s)) {
1250 if (tmp && (norun || regtry(prog, s)))
1261 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1270 - regexec_flags - match a regexp against a string
1273 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1274 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1275 /* strend: pointer to null at end of string */
1276 /* strbeg: real beginning of string */
1277 /* minend: end of match must be >=minend after stringarg. */
1278 /* data: May be used for some additional optimizations. */
1279 /* nosave: For optimizations. */
1283 register regnode *c;
1284 register char *startpos = stringarg;
1285 I32 minlen; /* must match at least this many chars */
1286 I32 dontbother = 0; /* how many characters not to try at end */
1287 /* I32 start_shift = 0; */ /* Offset of the start to find
1288 constant substr. */ /* CC */
1289 I32 end_shift = 0; /* Same for the end. */ /* CC */
1290 I32 scream_pos = -1; /* Internal iterator of scream. */
1292 SV* oreplsv = GvSV(PL_replgv);
1298 PL_regnarrate = PL_debug & 512;
1301 /* Be paranoid... */
1302 if (prog == NULL || startpos == NULL) {
1303 Perl_croak(aTHX_ "NULL regexp parameter");
1307 minlen = prog->minlen;
1308 if (strend - startpos < minlen) goto phooey;
1310 if (startpos == strbeg) /* is ^ valid at stringarg? */
1313 PL_regprev = (U32)stringarg[-1];
1314 if (!PL_multiline && PL_regprev == '\n')
1315 PL_regprev = '\0'; /* force ^ to NOT match */
1318 /* Check validity of program. */
1319 if (UCHARAT(prog->program) != REG_MAGIC) {
1320 Perl_croak(aTHX_ "corrupted regexp program");
1324 PL_reg_eval_set = 0;
1327 if (prog->reganch & ROPT_UTF8)
1328 PL_reg_flags |= RF_utf8;
1330 /* Mark beginning of line for ^ and lookbehind. */
1331 PL_regbol = startpos;
1335 /* Mark end of line for $ (and such) */
1338 /* see how far we have to get to not match where we matched before */
1339 PL_regtill = startpos+minend;
1341 /* We start without call_cc context. */
1344 /* If there is a "must appear" string, look for it. */
1347 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1350 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1351 PL_reg_ganch = startpos;
1352 else if (sv && SvTYPE(sv) >= SVt_PVMG
1354 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1355 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1356 if (prog->reganch & ROPT_ANCH_GPOS) {
1357 if (s > PL_reg_ganch)
1362 else /* pos() not defined */
1363 PL_reg_ganch = strbeg;
1366 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1367 re_scream_pos_data d;
1369 d.scream_olds = &scream_olds;
1370 d.scream_pos = &scream_pos;
1371 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1373 goto phooey; /* not present */
1376 DEBUG_r( if (!PL_colorset) reginitcolors() );
1377 DEBUG_r(PerlIO_printf(Perl_debug_log,
1378 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1379 PL_colors[4],PL_colors[5],PL_colors[0],
1382 (strlen(prog->precomp) > 60 ? "..." : ""),
1384 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1385 startpos, PL_colors[1],
1386 (strend - startpos > 60 ? "..." : ""))
1389 /* Simplest case: anchored match need be tried only once. */
1390 /* [unless only anchor is BOL and multiline is set] */
1391 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1392 if (s == startpos && regtry(prog, startpos))
1394 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1395 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1400 dontbother = minlen - 1;
1401 end = HOPc(strend, -dontbother) - 1;
1402 /* for multiline we only have to try after newlines */
1403 if (prog->check_substr) {
1407 if (regtry(prog, s))
1412 if (prog->reganch & RE_USE_INTUIT) {
1413 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1424 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1425 if (regtry(prog, s))
1432 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1433 if (regtry(prog, PL_reg_ganch))
1438 /* Messy cases: unanchored match. */
1439 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1440 /* we have /x+whatever/ */
1441 /* it must be a one character string (XXXX Except UTF?) */
1442 char ch = SvPVX(prog->anchored_substr)[0];
1448 while (s < strend) {
1450 DEBUG_r( did_match = 1 );
1451 if (regtry(prog, s)) goto got_it;
1453 while (s < strend && *s == ch)
1460 while (s < strend) {
1462 DEBUG_r( did_match = 1 );
1463 if (regtry(prog, s)) goto got_it;
1465 while (s < strend && *s == ch)
1471 DEBUG_r(did_match ||
1472 PerlIO_printf(Perl_debug_log,
1473 "Did not find anchored character...\n"));
1476 else if (prog->anchored_substr != Nullsv
1477 || (prog->float_substr != Nullsv
1478 && prog->float_max_offset < strend - s)) {
1479 SV *must = prog->anchored_substr
1480 ? prog->anchored_substr : prog->float_substr;
1482 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1484 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1485 char *last = HOPc(strend, /* Cannot start after this */
1486 -(I32)(CHR_SVLEN(must)
1487 - (SvTAIL(must) != 0) + back_min));
1488 char *last1; /* Last position checked before */
1494 last1 = HOPc(s, -1);
1496 last1 = s - 1; /* bogus */
1498 /* XXXX check_substr already used to find `s', can optimize if
1499 check_substr==must. */
1501 dontbother = end_shift;
1502 strend = HOPc(strend, -dontbother);
1503 while ( (s <= last) &&
1504 ((flags & REXEC_SCREAM)
1505 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1506 end_shift, &scream_pos, 0))
1507 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1508 (unsigned char*)strend, must,
1509 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1510 DEBUG_r( did_match = 1 );
1511 if (HOPc(s, -back_max) > last1) {
1512 last1 = HOPc(s, -back_min);
1513 s = HOPc(s, -back_max);
1516 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1518 last1 = HOPc(s, -back_min);
1522 while (s <= last1) {
1523 if (regtry(prog, s))
1529 while (s <= last1) {
1530 if (regtry(prog, s))
1536 DEBUG_r(did_match ||
1537 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1538 ((must == prog->anchored_substr)
1539 ? "anchored" : "floating"),
1541 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1543 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1546 else if ((c = prog->regstclass)) {
1547 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1548 /* don't bother with what can't match */
1549 strend = HOPc(strend, -(minlen - 1));
1550 if (find_byclass(prog, c, s, strend, startpos, 0))
1552 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1556 if (prog->float_substr != Nullsv) { /* Trim the end. */
1559 if (flags & REXEC_SCREAM) {
1560 last = screaminstr(sv, prog->float_substr, s - strbeg,
1561 end_shift, &scream_pos, 1); /* last one */
1563 last = scream_olds; /* Only one occurence. */
1567 char *little = SvPV(prog->float_substr, len);
1569 if (SvTAIL(prog->float_substr)) {
1570 if (memEQ(strend - len + 1, little, len - 1))
1571 last = strend - len + 1;
1572 else if (!PL_multiline)
1573 last = memEQ(strend - len, little, len)
1574 ? strend - len : Nullch;
1580 last = rninstr(s, strend, little, little + len);
1582 last = strend; /* matching `$' */
1586 DEBUG_r(PerlIO_printf(Perl_debug_log,
1587 "%sCan't trim the tail, match fails (should not happen)%s\n",
1588 PL_colors[4],PL_colors[5]));
1589 goto phooey; /* Should not happen! */
1591 dontbother = strend - last + prog->float_min_offset;
1593 if (minlen && (dontbother < minlen))
1594 dontbother = minlen - 1;
1595 strend -= dontbother; /* this one's always in bytes! */
1596 /* We don't know much -- general case. */
1599 if (regtry(prog, s))
1608 if (regtry(prog, s))
1610 } while (s++ < strend);
1618 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1620 if (PL_reg_eval_set) {
1621 /* Preserve the current value of $^R */
1622 if (oreplsv != GvSV(PL_replgv))
1623 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1624 restored, the value remains
1626 restore_pos(aTHXo_ 0);
1629 /* make sure $`, $&, $', and $digit will work later */
1630 if ( !(flags & REXEC_NOT_FIRST) ) {
1631 if (RX_MATCH_COPIED(prog)) {
1632 Safefree(prog->subbeg);
1633 RX_MATCH_COPIED_off(prog);
1635 if (flags & REXEC_COPY_STR) {
1636 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1638 s = savepvn(strbeg, i);
1641 RX_MATCH_COPIED_on(prog);
1644 prog->subbeg = strbeg;
1645 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1652 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1653 PL_colors[4],PL_colors[5]));
1654 if (PL_reg_eval_set)
1655 restore_pos(aTHXo_ 0);
1660 - regtry - try match at specific point
1662 STATIC I32 /* 0 failure, 1 success */
1663 S_regtry(pTHX_ regexp *prog, char *startpos)
1671 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1674 PL_reg_eval_set = RS_init;
1676 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1677 (IV)(PL_stack_sp - PL_stack_base));
1679 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1680 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1681 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1683 /* Apparently this is not needed, judging by wantarray. */
1684 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1685 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1688 /* Make $_ available to executed code. */
1689 if (PL_reg_sv != DEFSV) {
1690 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1695 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1696 && (mg = mg_find(PL_reg_sv, 'g')))) {
1697 /* prepare for quick setting of pos */
1698 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1699 mg = mg_find(PL_reg_sv, 'g');
1703 PL_reg_oldpos = mg->mg_len;
1704 SAVEDESTRUCTOR_X(restore_pos, 0);
1707 Newz(22,PL_reg_curpm, 1, PMOP);
1708 PL_reg_curpm->op_pmregexp = prog;
1709 PL_reg_oldcurpm = PL_curpm;
1710 PL_curpm = PL_reg_curpm;
1711 if (RX_MATCH_COPIED(prog)) {
1712 /* Here is a serious problem: we cannot rewrite subbeg,
1713 since it may be needed if this match fails. Thus
1714 $` inside (?{}) could fail... */
1715 PL_reg_oldsaved = prog->subbeg;
1716 PL_reg_oldsavedlen = prog->sublen;
1717 RX_MATCH_COPIED_off(prog);
1720 PL_reg_oldsaved = Nullch;
1721 prog->subbeg = PL_bostr;
1722 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1724 prog->startp[0] = startpos - PL_bostr;
1725 PL_reginput = startpos;
1726 PL_regstartp = prog->startp;
1727 PL_regendp = prog->endp;
1728 PL_reglastparen = &prog->lastparen;
1729 prog->lastparen = 0;
1731 DEBUG_r(PL_reg_starttry = startpos);
1732 if (PL_reg_start_tmpl <= prog->nparens) {
1733 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1734 if(PL_reg_start_tmp)
1735 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1737 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1740 /* XXXX What this code is doing here?!!! There should be no need
1741 to do this again and again, PL_reglastparen should take care of
1745 if (prog->nparens) {
1746 for (i = prog->nparens; i >= 1; i--) {
1752 if (regmatch(prog->program + 1)) {
1753 prog->endp[0] = PL_reginput - PL_bostr;
1761 - regmatch - main matching routine
1763 * Conceptually the strategy is simple: check to see whether the current
1764 * node matches, call self recursively to see whether the rest matches,
1765 * and then act accordingly. In practice we make some effort to avoid
1766 * recursion, in particular by going through "ordinary" nodes (that don't
1767 * need to know whether the rest of the match failed) by a loop instead of
1770 /* [lwall] I've hoisted the register declarations to the outer block in order to
1771 * maybe save a little bit of pushing and popping on the stack. It also takes
1772 * advantage of machines that use a register save mask on subroutine entry.
1774 STATIC I32 /* 0 failure, 1 success */
1775 S_regmatch(pTHX_ regnode *prog)
1778 register regnode *scan; /* Current node. */
1779 regnode *next; /* Next node. */
1780 regnode *inner; /* Next node in internal branch. */
1781 register I32 nextchr; /* renamed nextchr - nextchar colides with
1782 function of same name */
1783 register I32 n; /* no or next */
1784 register I32 ln; /* len or last */
1785 register char *s; /* operand or save */
1786 register char *locinput = PL_reginput;
1787 register I32 c1, c2, paren; /* case fold search, parenth */
1788 int minmod = 0, sw = 0, logical = 0;
1793 /* Note that nextchr is a byte even in UTF */
1794 nextchr = UCHARAT(locinput);
1796 while (scan != NULL) {
1797 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1799 # define sayYES goto yes
1800 # define sayNO goto no
1801 # define sayYES_FINAL goto yes_final
1802 # define sayYES_LOUD goto yes_loud
1803 # define sayNO_FINAL goto no_final
1804 # define sayNO_SILENT goto do_no
1805 # define saySAME(x) if (x) goto yes; else goto no
1806 # define REPORT_CODE_OFF 24
1808 # define sayYES return 1
1809 # define sayNO return 0
1810 # define sayYES_FINAL return 1
1811 # define sayYES_LOUD return 1
1812 # define sayNO_FINAL return 0
1813 # define sayNO_SILENT return 0
1814 # define saySAME(x) return x
1817 SV *prop = sv_newmortal();
1818 int docolor = *PL_colors[0];
1819 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1820 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1821 /* The part of the string before starttry has one color
1822 (pref0_len chars), between starttry and current
1823 position another one (pref_len - pref0_len chars),
1824 after the current position the third one.
1825 We assume that pref0_len <= pref_len, otherwise we
1826 decrease pref0_len. */
1827 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1828 ? (5 + taill) - l : locinput - PL_bostr);
1829 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1831 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1832 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1833 ? (5 + taill) - pref_len : PL_regeol - locinput);
1836 if (pref0_len > pref_len)
1837 pref0_len = pref_len;
1838 regprop(prop, scan);
1839 PerlIO_printf(Perl_debug_log,
1840 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1841 (IV)(locinput - PL_bostr),
1842 PL_colors[4], pref0_len,
1843 locinput - pref_len, PL_colors[5],
1844 PL_colors[2], pref_len - pref0_len,
1845 locinput - pref_len + pref0_len, PL_colors[3],
1846 (docolor ? "" : "> <"),
1847 PL_colors[0], l, locinput, PL_colors[1],
1848 15 - l - pref_len + 1,
1850 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1854 next = scan + NEXT_OFF(scan);
1860 if (locinput == PL_bostr
1861 ? PL_regprev == '\n'
1863 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1865 /* regtill = regbol; */
1870 if (locinput == PL_bostr
1871 ? PL_regprev == '\n'
1872 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1878 if (locinput == PL_regbol && PL_regprev == '\n')
1882 if (locinput == PL_reg_ganch)
1892 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1897 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1899 if (PL_regeol - locinput > 1)
1903 if (PL_regeol != locinput)
1907 if (nextchr & 0x80) {
1908 locinput += PL_utf8skip[nextchr];
1909 if (locinput > PL_regeol)
1911 nextchr = UCHARAT(locinput);
1914 if (!nextchr && locinput >= PL_regeol)
1916 nextchr = UCHARAT(++locinput);
1919 if (!nextchr && locinput >= PL_regeol)
1921 nextchr = UCHARAT(++locinput);
1924 if (nextchr & 0x80) {
1925 locinput += PL_utf8skip[nextchr];
1926 if (locinput > PL_regeol)
1928 nextchr = UCHARAT(locinput);
1931 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1933 nextchr = UCHARAT(++locinput);
1936 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
1938 nextchr = UCHARAT(++locinput);
1943 /* Inline the first character, for speed. */
1944 if (UCHARAT(s) != nextchr)
1946 if (PL_regeol - locinput < ln)
1948 if (ln > 1 && memNE(s, locinput, ln))
1951 nextchr = UCHARAT(locinput);
1954 PL_reg_flags |= RF_tainted;
1963 c1 = OP(scan) == EXACTF;
1967 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1968 toLOWER_utf8((U8*)l) :
1969 toLOWER_LC_utf8((U8*)l)))
1977 nextchr = UCHARAT(locinput);
1981 /* Inline the first character, for speed. */
1982 if (UCHARAT(s) != nextchr &&
1983 UCHARAT(s) != ((OP(scan) == EXACTF)
1984 ? PL_fold : PL_fold_locale)[nextchr])
1986 if (PL_regeol - locinput < ln)
1988 if (ln > 1 && (OP(scan) == EXACTF
1989 ? ibcmp(s, locinput, ln)
1990 : ibcmp_locale(s, locinput, ln)))
1993 nextchr = UCHARAT(locinput);
1996 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1998 if (locinput >= PL_regeol)
2000 locinput += PL_utf8skip[nextchr];
2001 nextchr = UCHARAT(locinput);
2005 nextchr = UCHARAT(locinput);
2006 if (!REGINCLASS(scan, nextchr))
2008 if (!nextchr && locinput >= PL_regeol)
2010 nextchr = UCHARAT(++locinput);
2013 PL_reg_flags |= RF_tainted;
2018 if (!(OP(scan) == ALNUM
2019 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2021 nextchr = UCHARAT(++locinput);
2024 PL_reg_flags |= RF_tainted;
2029 if (nextchr & 0x80) {
2030 if (!(OP(scan) == ALNUMUTF8
2031 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2032 : isALNUM_LC_utf8((U8*)locinput)))
2036 locinput += PL_utf8skip[nextchr];
2037 nextchr = UCHARAT(locinput);
2040 if (!(OP(scan) == ALNUMUTF8
2041 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2043 nextchr = UCHARAT(++locinput);
2046 PL_reg_flags |= RF_tainted;
2049 if (!nextchr && locinput >= PL_regeol)
2051 if (OP(scan) == NALNUM
2052 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2054 nextchr = UCHARAT(++locinput);
2057 PL_reg_flags |= RF_tainted;
2060 if (!nextchr && locinput >= PL_regeol)
2062 if (nextchr & 0x80) {
2063 if (OP(scan) == NALNUMUTF8
2064 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2065 : isALNUM_LC_utf8((U8*)locinput))
2069 locinput += PL_utf8skip[nextchr];
2070 nextchr = UCHARAT(locinput);
2073 if (OP(scan) == NALNUMUTF8
2074 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2076 nextchr = UCHARAT(++locinput);
2080 PL_reg_flags |= RF_tainted;
2084 /* was last char in word? */
2085 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2086 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2088 n = isALNUM(nextchr);
2091 ln = isALNUM_LC(ln);
2092 n = isALNUM_LC(nextchr);
2094 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2099 PL_reg_flags |= RF_tainted;
2103 /* was last char in word? */
2104 ln = (locinput != PL_regbol)
2105 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2106 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2107 ln = isALNUM_uni(ln);
2108 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2111 ln = isALNUM_LC_uni(ln);
2112 n = isALNUM_LC_utf8((U8*)locinput);
2114 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2118 PL_reg_flags |= RF_tainted;
2123 if (!(OP(scan) == SPACE
2124 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2126 nextchr = UCHARAT(++locinput);
2129 PL_reg_flags |= RF_tainted;
2134 if (nextchr & 0x80) {
2135 if (!(OP(scan) == SPACEUTF8
2136 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2137 : isSPACE_LC_utf8((U8*)locinput)))
2141 locinput += PL_utf8skip[nextchr];
2142 nextchr = UCHARAT(locinput);
2145 if (!(OP(scan) == SPACEUTF8
2146 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2148 nextchr = UCHARAT(++locinput);
2151 PL_reg_flags |= RF_tainted;
2154 if (!nextchr && locinput >= PL_regeol)
2156 if (OP(scan) == NSPACE
2157 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2159 nextchr = UCHARAT(++locinput);
2162 PL_reg_flags |= RF_tainted;
2165 if (!nextchr && locinput >= PL_regeol)
2167 if (nextchr & 0x80) {
2168 if (OP(scan) == NSPACEUTF8
2169 ? swash_fetch(PL_utf8_space, (U8*)locinput)
2170 : isSPACE_LC_utf8((U8*)locinput))
2174 locinput += PL_utf8skip[nextchr];
2175 nextchr = UCHARAT(locinput);
2178 if (OP(scan) == NSPACEUTF8
2179 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2181 nextchr = UCHARAT(++locinput);
2184 PL_reg_flags |= RF_tainted;
2189 if (!(OP(scan) == DIGIT
2190 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2192 nextchr = UCHARAT(++locinput);
2195 PL_reg_flags |= RF_tainted;
2200 if (nextchr & 0x80) {
2201 if (!(OP(scan) == DIGITUTF8
2202 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2203 : isDIGIT_LC_utf8((U8*)locinput)))
2207 locinput += PL_utf8skip[nextchr];
2208 nextchr = UCHARAT(locinput);
2211 if (!(OP(scan) == DIGITUTF8
2212 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2214 nextchr = UCHARAT(++locinput);
2217 PL_reg_flags |= RF_tainted;
2220 if (!nextchr && locinput >= PL_regeol)
2222 if (OP(scan) == NDIGIT
2223 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2225 nextchr = UCHARAT(++locinput);
2228 PL_reg_flags |= RF_tainted;
2231 if (!nextchr && locinput >= PL_regeol)
2233 if (nextchr & 0x80) {
2234 if (OP(scan) == NDIGITUTF8
2235 ? swash_fetch(PL_utf8_digit, (U8*)locinput)
2236 : isDIGIT_LC_utf8((U8*)locinput))
2240 locinput += PL_utf8skip[nextchr];
2241 nextchr = UCHARAT(locinput);
2244 if (OP(scan) == NDIGITUTF8
2245 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2247 nextchr = UCHARAT(++locinput);
2250 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2252 locinput += PL_utf8skip[nextchr];
2253 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2254 locinput += UTF8SKIP(locinput);
2255 if (locinput > PL_regeol)
2257 nextchr = UCHARAT(locinput);
2260 PL_reg_flags |= RF_tainted;
2264 n = ARG(scan); /* which paren pair */
2265 ln = PL_regstartp[n];
2266 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2267 if (*PL_reglastparen < n || ln == -1)
2268 sayNO; /* Do not match unless seen CLOSEn. */
2269 if (ln == PL_regendp[n])
2273 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2275 char *e = PL_bostr + PL_regendp[n];
2277 * Note that we can't do the "other character" lookup trick as
2278 * in the 8-bit case (no pun intended) because in Unicode we
2279 * have to map both upper and title case to lower case.
2281 if (OP(scan) == REFF) {
2285 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2295 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2302 nextchr = UCHARAT(locinput);
2306 /* Inline the first character, for speed. */
2307 if (UCHARAT(s) != nextchr &&
2309 (UCHARAT(s) != ((OP(scan) == REFF
2310 ? PL_fold : PL_fold_locale)[nextchr]))))
2312 ln = PL_regendp[n] - ln;
2313 if (locinput + ln > PL_regeol)
2315 if (ln > 1 && (OP(scan) == REF
2316 ? memNE(s, locinput, ln)
2318 ? ibcmp(s, locinput, ln)
2319 : ibcmp_locale(s, locinput, ln))))
2322 nextchr = UCHARAT(locinput);
2333 OP_4tree *oop = PL_op;
2334 COP *ocurcop = PL_curcop;
2335 SV **ocurpad = PL_curpad;
2339 PL_op = (OP_4tree*)PL_regdata->data[n];
2340 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2341 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2342 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2344 CALLRUNOPS(aTHX); /* Scalar context. */
2350 PL_curpad = ocurpad;
2351 PL_curcop = ocurcop;
2353 if (logical == 2) { /* Postponed subexpression. */
2355 MAGIC *mg = Null(MAGIC*);
2357 CHECKPOINT cp, lastcp;
2359 if(SvROK(ret) || SvRMAGICAL(ret)) {
2360 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2363 mg = mg_find(sv, 'r');
2366 re = (regexp *)mg->mg_obj;
2367 (void)ReREFCNT_inc(re);
2371 char *t = SvPV(ret, len);
2373 char *oprecomp = PL_regprecomp;
2374 I32 osize = PL_regsize;
2375 I32 onpar = PL_regnpar;
2378 pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
2379 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2381 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2382 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2383 PL_regprecomp = oprecomp;
2388 PerlIO_printf(Perl_debug_log,
2389 "Entering embedded `%s%.60s%s%s'\n",
2393 (strlen(re->precomp) > 60 ? "..." : ""))
2396 state.prev = PL_reg_call_cc;
2397 state.cc = PL_regcc;
2398 state.re = PL_reg_re;
2402 cp = regcppush(0); /* Save *all* the positions. */
2405 state.ss = PL_savestack_ix;
2406 *PL_reglastparen = 0;
2407 PL_reg_call_cc = &state;
2408 PL_reginput = locinput;
2410 /* XXXX This is too dramatic a measure... */
2413 if (regmatch(re->program + 1)) {
2414 /* Even though we succeeded, we need to restore
2415 global variables, since we may be wrapped inside
2416 SUSPEND, thus the match may be not finished yet. */
2418 /* XXXX Do this only if SUSPENDed? */
2419 PL_reg_call_cc = state.prev;
2420 PL_regcc = state.cc;
2421 PL_reg_re = state.re;
2422 cache_re(PL_reg_re);
2424 /* XXXX This is too dramatic a measure... */
2427 /* These are needed even if not SUSPEND. */
2435 PL_reg_call_cc = state.prev;
2436 PL_regcc = state.cc;
2437 PL_reg_re = state.re;
2438 cache_re(PL_reg_re);
2440 /* XXXX This is too dramatic a measure... */
2449 sv_setsv(save_scalar(PL_replgv), ret);
2453 n = ARG(scan); /* which paren pair */
2454 PL_reg_start_tmp[n] = locinput;
2459 n = ARG(scan); /* which paren pair */
2460 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2461 PL_regendp[n] = locinput - PL_bostr;
2462 if (n > *PL_reglastparen)
2463 *PL_reglastparen = n;
2466 n = ARG(scan); /* which paren pair */
2467 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2470 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2472 next = NEXTOPER(NEXTOPER(scan));
2474 next = scan + ARG(scan);
2475 if (OP(next) == IFTHEN) /* Fake one. */
2476 next = NEXTOPER(NEXTOPER(next));
2480 logical = scan->flags;
2482 /*******************************************************************
2483 PL_regcc contains infoblock about the innermost (...)* loop, and
2484 a pointer to the next outer infoblock.
2486 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2488 1) After matching X, regnode for CURLYX is processed;
2490 2) This regnode creates infoblock on the stack, and calls
2491 regmatch() recursively with the starting point at WHILEM node;
2493 3) Each hit of WHILEM node tries to match A and Z (in the order
2494 depending on the current iteration, min/max of {min,max} and
2495 greediness). The information about where are nodes for "A"
2496 and "Z" is read from the infoblock, as is info on how many times "A"
2497 was already matched, and greediness.
2499 4) After A matches, the same WHILEM node is hit again.
2501 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2502 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2503 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2504 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2505 of the external loop.
2507 Currently present infoblocks form a tree with a stem formed by PL_curcc
2508 and whatever it mentions via ->next, and additional attached trees
2509 corresponding to temporarily unset infoblocks as in "5" above.
2511 In the following picture infoblocks for outer loop of
2512 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2513 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2514 infoblocks are drawn below the "reset" infoblock.
2516 In fact in the picture below we do not show failed matches for Z and T
2517 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2518 more obvious *why* one needs to *temporary* unset infoblocks.]
2520 Matched REx position InfoBlocks Comment
2524 Y A)*?Z)*?T x <- O <- I
2525 YA )*?Z)*?T x <- O <- I
2526 YA A)*?Z)*?T x <- O <- I
2527 YAA )*?Z)*?T x <- O <- I
2528 YAA Z)*?T x <- O # Temporary unset I
2531 YAAZ Y(A)*?Z)*?T x <- O
2534 YAAZY (A)*?Z)*?T x <- O
2537 YAAZY A)*?Z)*?T x <- O <- I
2540 YAAZYA )*?Z)*?T x <- O <- I
2543 YAAZYA Z)*?T x <- O # Temporary unset I
2549 YAAZYAZ T x # Temporary unset O
2556 *******************************************************************/
2559 CHECKPOINT cp = PL_savestack_ix;
2561 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2563 cc.oldcc = PL_regcc;
2565 cc.parenfloor = *PL_reglastparen;
2567 cc.min = ARG1(scan);
2568 cc.max = ARG2(scan);
2569 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2573 PL_reginput = locinput;
2574 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2576 PL_regcc = cc.oldcc;
2582 * This is really hard to understand, because after we match
2583 * what we're trying to match, we must make sure the rest of
2584 * the REx is going to match for sure, and to do that we have
2585 * to go back UP the parse tree by recursing ever deeper. And
2586 * if it fails, we have to reset our parent's current state
2587 * that we can try again after backing off.
2590 CHECKPOINT cp, lastcp;
2591 CURCUR* cc = PL_regcc;
2592 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2594 n = cc->cur + 1; /* how many we know we matched */
2595 PL_reginput = locinput;
2598 PerlIO_printf(Perl_debug_log,
2599 "%*s %ld out of %ld..%ld cc=%lx\n",
2600 REPORT_CODE_OFF+PL_regindent*2, "",
2601 (long)n, (long)cc->min,
2602 (long)cc->max, (long)cc)
2605 /* If degenerate scan matches "", assume scan done. */
2607 if (locinput == cc->lastloc && n >= cc->min) {
2608 PL_regcc = cc->oldcc;
2612 PerlIO_printf(Perl_debug_log,
2613 "%*s empty match detected, try continuation...\n",
2614 REPORT_CODE_OFF+PL_regindent*2, "")
2616 if (regmatch(cc->next))
2624 /* First just match a string of min scans. */
2628 cc->lastloc = locinput;
2629 if (regmatch(cc->scan))
2632 cc->lastloc = lastloc;
2637 /* Check whether we already were at this position.
2638 Postpone detection until we know the match is not
2639 *that* much linear. */
2640 if (!PL_reg_maxiter) {
2641 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2642 PL_reg_leftiter = PL_reg_maxiter;
2644 if (PL_reg_leftiter-- == 0) {
2645 I32 size = (PL_reg_maxiter + 7)/8;
2646 if (PL_reg_poscache) {
2647 if (PL_reg_poscache_size < size) {
2648 Renew(PL_reg_poscache, size, char);
2649 PL_reg_poscache_size = size;
2651 Zero(PL_reg_poscache, size, char);
2654 PL_reg_poscache_size = size;
2655 Newz(29, PL_reg_poscache, size, char);
2658 PerlIO_printf(Perl_debug_log,
2659 "%sDetected a super-linear match, switching on caching%s...\n",
2660 PL_colors[4], PL_colors[5])
2663 if (PL_reg_leftiter < 0) {
2664 I32 o = locinput - PL_bostr, b;
2666 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2669 if (PL_reg_poscache[o] & (1<<b)) {
2671 PerlIO_printf(Perl_debug_log,
2672 "%*s already tried at this position...\n",
2673 REPORT_CODE_OFF+PL_regindent*2, "")
2677 PL_reg_poscache[o] |= (1<<b);
2681 /* Prefer next over scan for minimal matching. */
2684 PL_regcc = cc->oldcc;
2687 cp = regcppush(cc->parenfloor);
2689 if (regmatch(cc->next)) {
2691 sayYES; /* All done. */
2699 if (n >= cc->max) { /* Maximum greed exceeded? */
2700 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2701 && !(PL_reg_flags & RF_warned)) {
2702 PL_reg_flags |= RF_warned;
2703 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2704 "Complex regular subexpression recursion",
2711 PerlIO_printf(Perl_debug_log,
2712 "%*s trying longer...\n",
2713 REPORT_CODE_OFF+PL_regindent*2, "")
2715 /* Try scanning more and see if it helps. */
2716 PL_reginput = locinput;
2718 cc->lastloc = locinput;
2719 cp = regcppush(cc->parenfloor);
2721 if (regmatch(cc->scan)) {
2728 cc->lastloc = lastloc;
2732 /* Prefer scan over next for maximal matching. */
2734 if (n < cc->max) { /* More greed allowed? */
2735 cp = regcppush(cc->parenfloor);
2737 cc->lastloc = locinput;
2739 if (regmatch(cc->scan)) {
2744 regcppop(); /* Restore some previous $<digit>s? */
2745 PL_reginput = locinput;
2747 PerlIO_printf(Perl_debug_log,
2748 "%*s failed, try continuation...\n",
2749 REPORT_CODE_OFF+PL_regindent*2, "")
2752 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2753 && !(PL_reg_flags & RF_warned)) {
2754 PL_reg_flags |= RF_warned;
2755 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2756 "Complex regular subexpression recursion",
2760 /* Failed deeper matches of scan, so see if this one works. */
2761 PL_regcc = cc->oldcc;
2764 if (regmatch(cc->next))
2770 cc->lastloc = lastloc;
2775 next = scan + ARG(scan);
2778 inner = NEXTOPER(NEXTOPER(scan));
2781 inner = NEXTOPER(scan);
2786 if (OP(next) != c1) /* No choice. */
2787 next = inner; /* Avoid recursion. */
2789 int lastparen = *PL_reglastparen;
2793 PL_reginput = locinput;
2794 if (regmatch(inner))
2797 for (n = *PL_reglastparen; n > lastparen; n--)
2799 *PL_reglastparen = n;
2802 if ((n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))))
2806 inner = NEXTOPER(scan);
2807 if (c1 == BRANCHJ) {
2808 inner = NEXTOPER(inner);
2810 } while (scan != NULL && OP(scan) == c1);
2824 /* We suppose that the next guy does not need
2825 backtracking: in particular, it is of constant length,
2826 and has no parenths to influence future backrefs. */
2827 ln = ARG1(scan); /* min to match */
2828 n = ARG2(scan); /* max to match */
2829 paren = scan->flags;
2831 if (paren > PL_regsize)
2833 if (paren > *PL_reglastparen)
2834 *PL_reglastparen = paren;
2836 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2838 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2839 PL_reginput = locinput;
2842 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2844 if (ln && l == 0 && n >= ln
2845 /* In fact, this is tricky. If paren, then the
2846 fact that we did/didnot match may influence
2847 future execution. */
2848 && !(paren && ln == 0))
2850 locinput = PL_reginput;
2851 if (PL_regkind[(U8)OP(next)] == EXACT) {
2852 c1 = (U8)*STRING(next);
2853 if (OP(next) == EXACTF)
2855 else if (OP(next) == EXACTFL)
2856 c2 = PL_fold_locale[c1];
2863 /* This may be improved if l == 0. */
2864 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2865 /* If it could work, try it. */
2867 UCHARAT(PL_reginput) == c1 ||
2868 UCHARAT(PL_reginput) == c2)
2872 PL_regstartp[paren] =
2873 HOPc(PL_reginput, -l) - PL_bostr;
2874 PL_regendp[paren] = PL_reginput - PL_bostr;
2877 PL_regendp[paren] = -1;
2883 /* Couldn't or didn't -- move forward. */
2884 PL_reginput = locinput;
2885 if (regrepeat_hard(scan, 1, &l)) {
2887 locinput = PL_reginput;
2894 n = regrepeat_hard(scan, n, &l);
2895 if (n != 0 && l == 0
2896 /* In fact, this is tricky. If paren, then the
2897 fact that we did/didnot match may influence
2898 future execution. */
2899 && !(paren && ln == 0))
2901 locinput = PL_reginput;
2903 PerlIO_printf(Perl_debug_log,
2904 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2905 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2909 if (PL_regkind[(U8)OP(next)] == EXACT) {
2910 c1 = (U8)*STRING(next);
2911 if (OP(next) == EXACTF)
2913 else if (OP(next) == EXACTFL)
2914 c2 = PL_fold_locale[c1];
2923 /* If it could work, try it. */
2925 UCHARAT(PL_reginput) == c1 ||
2926 UCHARAT(PL_reginput) == c2)
2929 PerlIO_printf(Perl_debug_log,
2930 "%*s trying tail with n=%"IVdf"...\n",
2931 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2935 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2936 PL_regendp[paren] = PL_reginput - PL_bostr;
2939 PL_regendp[paren] = -1;
2945 /* Couldn't or didn't -- back up. */
2947 locinput = HOPc(locinput, -l);
2948 PL_reginput = locinput;
2955 paren = scan->flags; /* Which paren to set */
2956 if (paren > PL_regsize)
2958 if (paren > *PL_reglastparen)
2959 *PL_reglastparen = paren;
2960 ln = ARG1(scan); /* min to match */
2961 n = ARG2(scan); /* max to match */
2962 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2966 ln = ARG1(scan); /* min to match */
2967 n = ARG2(scan); /* max to match */
2968 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2973 scan = NEXTOPER(scan);
2979 scan = NEXTOPER(scan);
2983 * Lookahead to avoid useless match attempts
2984 * when we know what character comes next.
2986 if (PL_regkind[(U8)OP(next)] == EXACT) {
2987 c1 = (U8)*STRING(next);
2988 if (OP(next) == EXACTF)
2990 else if (OP(next) == EXACTFL)
2991 c2 = PL_fold_locale[c1];
2997 PL_reginput = locinput;
3001 if (ln && regrepeat(scan, ln) < ln)
3003 locinput = PL_reginput;
3006 char *e = locinput + n - ln; /* Should not check after this */
3007 char *old = locinput;
3009 if (e >= PL_regeol || (n == REG_INFTY))
3012 /* Find place 'next' could work */
3014 while (locinput <= e && *locinput != c1)
3017 while (locinput <= e
3024 /* PL_reginput == old now */
3025 if (locinput != old) {
3026 ln = 1; /* Did some */
3027 if (regrepeat(scan, locinput - old) <
3031 /* PL_reginput == locinput now */
3034 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
3035 PL_regendp[paren] = locinput - PL_bostr;
3038 PL_regendp[paren] = -1;
3042 PL_reginput = locinput; /* Could be reset... */
3044 /* Couldn't or didn't -- move forward. */
3049 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3050 /* If it could work, try it. */
3052 UCHARAT(PL_reginput) == c1 ||
3053 UCHARAT(PL_reginput) == c2)
3057 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3058 PL_regendp[paren] = PL_reginput - PL_bostr;
3061 PL_regendp[paren] = -1;
3067 /* Couldn't or didn't -- move forward. */
3068 PL_reginput = locinput;
3069 if (regrepeat(scan, 1)) {
3071 locinput = PL_reginput;
3079 n = regrepeat(scan, n);
3080 locinput = PL_reginput;
3081 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3082 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3083 ln = n; /* why back off? */
3084 /* ...because $ and \Z can match before *and* after
3085 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3086 We should back off by one in this case. */
3087 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3093 /* If it could work, try it. */
3095 UCHARAT(PL_reginput) == c1 ||
3096 UCHARAT(PL_reginput) == c2)
3100 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3101 PL_regendp[paren] = PL_reginput - PL_bostr;
3104 PL_regendp[paren] = -1;
3110 /* Couldn't or didn't -- back up. */
3112 PL_reginput = locinput = HOPc(locinput, -1);
3117 /* If it could work, try it. */
3119 UCHARAT(PL_reginput) == c1 ||
3120 UCHARAT(PL_reginput) == c2)
3126 /* Couldn't or didn't -- back up. */
3128 PL_reginput = locinput = HOPc(locinput, -1);
3135 if (PL_reg_call_cc) {
3136 re_cc_state *cur_call_cc = PL_reg_call_cc;
3137 CURCUR *cctmp = PL_regcc;
3138 regexp *re = PL_reg_re;
3139 CHECKPOINT cp, lastcp;
3141 cp = regcppush(0); /* Save *all* the positions. */
3143 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3145 PL_reginput = locinput; /* Make position available to
3147 cache_re(PL_reg_call_cc->re);
3148 PL_regcc = PL_reg_call_cc->cc;
3149 PL_reg_call_cc = PL_reg_call_cc->prev;
3150 if (regmatch(cur_call_cc->node)) {
3151 PL_reg_call_cc = cur_call_cc;
3157 PL_reg_call_cc = cur_call_cc;
3163 PerlIO_printf(Perl_debug_log,
3164 "%*s continuation failed...\n",
3165 REPORT_CODE_OFF+PL_regindent*2, "")
3169 if (locinput < PL_regtill) {
3170 DEBUG_r(PerlIO_printf(Perl_debug_log,
3171 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3173 (long)(locinput - PL_reg_starttry),
3174 (long)(PL_regtill - PL_reg_starttry),
3176 sayNO_FINAL; /* Cannot match: too short. */
3178 PL_reginput = locinput; /* put where regtry can find it */
3179 sayYES_FINAL; /* Success! */
3181 PL_reginput = locinput; /* put where regtry can find it */
3182 sayYES_LOUD; /* Success! */
3185 PL_reginput = locinput;
3190 if (UTF) { /* XXXX This is absolutely
3191 broken, we read before
3193 s = HOPMAYBEc(locinput, -scan->flags);
3199 if (locinput < PL_bostr + scan->flags)
3201 PL_reginput = locinput - scan->flags;
3206 PL_reginput = locinput;
3211 if (UTF) { /* XXXX This is absolutely
3212 broken, we read before
3214 s = HOPMAYBEc(locinput, -scan->flags);
3215 if (!s || s < PL_bostr)
3220 if (locinput < PL_bostr + scan->flags)
3222 PL_reginput = locinput - scan->flags;
3227 PL_reginput = locinput;
3230 inner = NEXTOPER(NEXTOPER(scan));
3231 if (regmatch(inner) != n) {
3246 if (OP(scan) == SUSPEND) {
3247 locinput = PL_reginput;
3248 nextchr = UCHARAT(locinput);
3253 next = scan + ARG(scan);
3258 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3259 PTR2UV(scan), OP(scan));
3260 Perl_croak(aTHX_ "regexp memory corruption");
3266 * We get here only if there's trouble -- normally "case END" is
3267 * the terminating point.
3269 Perl_croak(aTHX_ "corrupted regexp pointers");
3275 PerlIO_printf(Perl_debug_log,
3276 "%*s %scould match...%s\n",
3277 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3281 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3282 PL_colors[4],PL_colors[5]));
3291 PerlIO_printf(Perl_debug_log,
3292 "%*s %sfailed...%s\n",
3293 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3305 - regrepeat - repeatedly match something simple, report how many
3308 * [This routine now assumes that it will only match on things of length 1.
3309 * That was true before, but now we assume scan - reginput is the count,
3310 * rather than incrementing count on every character. [Er, except utf8.]]
3313 S_regrepeat(pTHX_ regnode *p, I32 max)
3316 register char *scan;
3318 register char *loceol = PL_regeol;
3319 register I32 hardcount = 0;
3322 if (max != REG_INFTY && max < loceol - scan)
3323 loceol = scan + max;
3326 while (scan < loceol && *scan != '\n')
3334 while (scan < loceol && *scan != '\n') {
3335 scan += UTF8SKIP(scan);
3341 while (scan < loceol) {
3342 scan += UTF8SKIP(scan);
3346 case EXACT: /* length of string is 1 */
3348 while (scan < loceol && UCHARAT(scan) == c)
3351 case EXACTF: /* length of string is 1 */
3353 while (scan < loceol &&
3354 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3357 case EXACTFL: /* length of string is 1 */
3358 PL_reg_flags |= RF_tainted;
3360 while (scan < loceol &&
3361 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3366 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3367 scan += UTF8SKIP(scan);
3372 while (scan < loceol && REGINCLASS(p, *scan))
3376 while (scan < loceol && isALNUM(*scan))
3381 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3382 scan += UTF8SKIP(scan);
3387 PL_reg_flags |= RF_tainted;
3388 while (scan < loceol && isALNUM_LC(*scan))
3392 PL_reg_flags |= RF_tainted;
3394 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3395 scan += UTF8SKIP(scan);
3401 while (scan < loceol && !isALNUM(*scan))
3406 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3407 scan += UTF8SKIP(scan);
3412 PL_reg_flags |= RF_tainted;
3413 while (scan < loceol && !isALNUM_LC(*scan))
3417 PL_reg_flags |= RF_tainted;
3419 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3420 scan += UTF8SKIP(scan);
3425 while (scan < loceol && isSPACE(*scan))
3430 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3431 scan += UTF8SKIP(scan);
3436 PL_reg_flags |= RF_tainted;
3437 while (scan < loceol && isSPACE_LC(*scan))
3441 PL_reg_flags |= RF_tainted;
3443 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3444 scan += UTF8SKIP(scan);
3449 while (scan < loceol && !isSPACE(*scan))
3454 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3455 scan += UTF8SKIP(scan);
3460 PL_reg_flags |= RF_tainted;
3461 while (scan < loceol && !isSPACE_LC(*scan))
3465 PL_reg_flags |= RF_tainted;
3467 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3468 scan += UTF8SKIP(scan);
3473 while (scan < loceol && isDIGIT(*scan))
3478 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3479 scan += UTF8SKIP(scan);
3485 while (scan < loceol && !isDIGIT(*scan))
3490 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3491 scan += UTF8SKIP(scan);
3495 default: /* Called on something of 0 width. */
3496 break; /* So match right here or not at all. */
3502 c = scan - PL_reginput;
3507 SV *prop = sv_newmortal();
3510 PerlIO_printf(Perl_debug_log,
3511 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3512 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3519 - regrepeat_hard - repeatedly match something, report total lenth and length
3521 * The repeater is supposed to have constant length.
3525 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3528 register char *scan;
3529 register char *start;
3530 register char *loceol = PL_regeol;
3532 I32 count = 0, res = 1;
3537 start = PL_reginput;
3539 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3542 while (start < PL_reginput) {
3544 start += UTF8SKIP(start);
3555 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3557 *lp = l = PL_reginput - start;
3558 if (max != REG_INFTY && l*max < loceol - scan)
3559 loceol = scan + l*max;
3572 - reginclass - determine if a character falls into a character class
3576 S_reginclass(pTHX_ register regnode *p, register I32 c)
3579 char flags = ANYOF_FLAGS(p);
3583 if (ANYOF_BITMAP_TEST(p, c))
3585 else if (flags & ANYOF_FOLD) {
3587 if (flags & ANYOF_LOCALE) {
3588 PL_reg_flags |= RF_tainted;
3589 cf = PL_fold_locale[c];
3593 if (ANYOF_BITMAP_TEST(p, cf))
3597 if (!match && (flags & ANYOF_CLASS)) {
3598 PL_reg_flags |= RF_tainted;
3600 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3601 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3602 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3603 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3604 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3605 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3606 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3607 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3608 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3609 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3610 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3611 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3612 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3613 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3614 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3615 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3616 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3617 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3618 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3619 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3620 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3621 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3622 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3623 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3624 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3625 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3626 ) /* How's that for a conditional? */
3632 return (flags & ANYOF_INVERT) ? !match : match;
3636 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3639 char flags = ARG1(f);
3641 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3643 if (swash_fetch(sv, p))
3645 else if (flags & ANYOF_FOLD) {
3646 U8 tmpbuf[UTF8_MAXLEN];
3647 if (flags & ANYOF_LOCALE) {
3648 PL_reg_flags |= RF_tainted;
3649 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3652 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3653 if (swash_fetch(sv, tmpbuf))
3657 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3659 return (flags & ANYOF_INVERT) ? !match : match;
3663 S_reghop(pTHX_ U8 *s, I32 off)
3667 while (off-- && s < (U8*)PL_regeol)
3672 if (s > (U8*)PL_bostr) {
3675 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3677 } /* XXX could check well-formedness here */
3685 S_reghopmaybe(pTHX_ U8* s, I32 off)
3689 while (off-- && s < (U8*)PL_regeol)
3696 if (s > (U8*)PL_bostr) {
3699 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3701 } /* XXX could check well-formedness here */
3717 restore_pos(pTHXo_ void *arg)
3720 if (PL_reg_eval_set) {
3721 if (PL_reg_oldsaved) {
3722 PL_reg_re->subbeg = PL_reg_oldsaved;
3723 PL_reg_re->sublen = PL_reg_oldsavedlen;
3724 RX_MATCH_COPIED_on(PL_reg_re);
3726 PL_reg_magic->mg_len = PL_reg_oldpos;
3727 PL_reg_eval_set = 0;
3728 PL_curpm = PL_reg_oldcurpm;