5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
339 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
340 || ( (prog->reganch & ROPT_ANCH_BOL)
341 && !PL_multiline ) ); /* Check after \n? */
343 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
344 /* Substring at constant offset from beg-of-str... */
347 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
348 && (sv && (strpos + SvCUR(sv) != strend)) ) {
349 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
352 PL_regeol = strend; /* Used in HOP() */
353 s = HOPc(strpos, prog->check_offset_min);
354 if (SvTAIL(prog->check_substr)) {
355 slen = SvCUR(prog->check_substr); /* >= 1 */
357 if ( strend - s > slen || strend - s < slen - 1
358 || (strend - s == slen && strend[-1] != '\n')) {
359 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
362 /* Now should match s[0..slen-2] */
364 if (slen && (*SvPVX(prog->check_substr) != *s
366 && memNE(SvPVX(prog->check_substr), s, slen)))) {
368 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
372 else if (*SvPVX(prog->check_substr) != *s
373 || ((slen = SvCUR(prog->check_substr)) > 1
374 && memNE(SvPVX(prog->check_substr), s, slen)))
376 goto success_at_start;
378 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
380 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
381 /* Should be nonnegative! */
382 end_shift = prog->minlen - start_shift -
383 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
385 I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr)
386 - (SvTAIL(prog->check_substr) != 0);
387 I32 eshift = strend - s - end;
389 if (end_shift < eshift)
393 else { /* Can match at random position */
396 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
397 /* Should be nonnegative! */
398 end_shift = prog->minlen - start_shift -
399 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
402 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
404 Perl_croak(aTHX_ "panic: end_shift");
407 check = prog->check_substr;
409 /* Find a possible match in the region s..strend by looking for
410 the "check" substring in the region corrected by start/end_shift. */
411 if (flags & REXEC_SCREAM) {
412 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
413 I32 p = -1; /* Internal iterator of scream. */
414 I32 *pp = data ? data->scream_pos : &p;
416 if (PL_screamfirst[BmRARE(check)] >= 0
417 || ( BmRARE(check) == '\n'
418 && (BmPREVIOUS(check) == SvCUR(check) - 1)
420 s = screaminstr(sv, check,
421 start_shift + (s - strbeg), end_shift, pp, 0);
425 *data->scream_olds = s;
428 s = fbm_instr((unsigned char*)s + start_shift,
429 (unsigned char*)strend - end_shift,
430 check, PL_multiline ? FBMrf_MULTILINE : 0);
432 /* Update the count-of-usability, remove useless subpatterns,
435 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
436 (s ? "Found" : "Did not find"),
437 ((check == prog->anchored_substr) ? "anchored" : "floating"),
439 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
441 PL_colors[1], (SvTAIL(check) ? "$" : ""),
442 (s ? " at offset " : "...\n") ) );
449 /* Finish the diagnostic message */
450 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
452 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
453 Start with the other substr.
454 XXXX no SCREAM optimization yet - and a very coarse implementation
455 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
456 *always* match. Probably should be marked during compile...
457 Probably it is right to do no SCREAM here...
460 if (prog->float_substr && prog->anchored_substr) {
461 /* Take into account the "other" substring. */
462 /* XXXX May be hopelessly wrong for UTF... */
465 if (check == prog->float_substr) {
468 char *last = s - start_shift, *last1, *last2;
472 t = s - prog->check_offset_max;
473 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
474 && (!(prog->reganch & ROPT_UTF8)
475 || (PL_bostr = strpos, /* Used in regcopmaybe() */
476 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
481 t += prog->anchored_offset;
482 if (t < other_last) /* These positions already checked */
485 last2 = last1 = strend - prog->minlen;
488 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
489 /* On end-of-str: see comment below. */
490 s = fbm_instr((unsigned char*)t,
491 (unsigned char*)last1 + prog->anchored_offset
492 + SvCUR(prog->anchored_substr)
493 - (SvTAIL(prog->anchored_substr)!=0),
494 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
495 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
496 (s ? "Found" : "Contradicts"),
498 (int)(SvCUR(prog->anchored_substr)
499 - (SvTAIL(prog->anchored_substr)!=0)),
500 SvPVX(prog->anchored_substr),
501 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
503 if (last1 >= last2) {
504 DEBUG_r(PerlIO_printf(Perl_debug_log,
505 ", giving up...\n"));
508 DEBUG_r(PerlIO_printf(Perl_debug_log,
509 ", trying floating at offset %ld...\n",
510 (long)(s1 + 1 - i_strpos)));
511 PL_regeol = strend; /* Used in HOP() */
512 other_last = last1 + prog->anchored_offset + 1;
517 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
518 (long)(s - i_strpos)));
519 t = s - prog->anchored_offset;
528 else { /* Take into account the floating substring. */
533 last1 = last = strend - prog->minlen + prog->float_min_offset;
534 if (last - t > prog->float_max_offset)
535 last = t + prog->float_max_offset;
536 s = t + prog->float_min_offset;
539 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
540 /* fbm_instr() takes into account exact value of end-of-str
541 if the check is SvTAIL(ed). Since false positives are OK,
542 and end-of-str is not later than strend we are OK. */
543 s = fbm_instr((unsigned char*)s,
544 (unsigned char*)last + SvCUR(prog->float_substr)
545 - (SvTAIL(prog->float_substr)!=0),
546 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
547 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
548 (s ? "Found" : "Contradicts"),
550 (int)(SvCUR(prog->float_substr)
551 - (SvTAIL(prog->float_substr)!=0)),
552 SvPVX(prog->float_substr),
553 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
556 DEBUG_r(PerlIO_printf(Perl_debug_log,
557 ", giving up...\n"));
560 DEBUG_r(PerlIO_printf(Perl_debug_log,
561 ", trying anchored starting at offset %ld...\n",
562 (long)(s1 + 1 - i_strpos)));
563 other_last = last + 1;
564 PL_regeol = strend; /* Used in HOP() */
569 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
570 (long)(s - i_strpos)));
580 t = s - prog->check_offset_max;
582 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
583 && (!(prog->reganch & ROPT_UTF8)
584 || (PL_bostr = strpos, /* Used in regcopmaybe() */
585 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
588 /* Fixed substring is found far enough so that the match
589 cannot start at strpos. */
591 if (ml_anch && t[-1] != '\n') {
592 /* Eventually fbm_*() should handle this, but often
593 anchored_offset is not 0, so this check will not be wasted. */
594 /* XXXX In the code below we prefer to look for "^" even in
595 presence of anchored substrings. And we search even
596 beyond the found float position. These pessimizations
597 are historical artefacts only. */
599 while (t < strend - prog->minlen) {
601 if (t < s - prog->check_offset_min) {
602 if (prog->anchored_substr) {
603 /* We definitely contradict the found anchored
604 substr. Due to the above check we do not
605 contradict "check" substr.
606 Thus we can arrive here only if check substr
607 is float. Redo checking for "other"=="fixed".
610 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
611 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
612 goto do_other_anchored;
615 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
616 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
619 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
620 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
626 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
627 PL_colors[0],PL_colors[1]));
632 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
636 /* The found string does not prohibit matching at beg-of-str
637 - no optimization of calling REx engine can be performed,
638 unless it was an MBOL and we are not after MBOL. */
640 /* Even in this situation we may use MBOL flag if strpos is offset
641 wrt the start of the string. */
643 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
647 DEBUG_r( if (ml_anch)
648 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
649 PL_colors[0],PL_colors[1]);
652 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
653 && --BmUSEFUL(prog->check_substr) < 0
654 && prog->check_substr == prog->float_substr) { /* boo */
655 /* If flags & SOMETHING - do not do it many times on the same match */
656 SvREFCNT_dec(prog->check_substr);
657 prog->check_substr = Nullsv; /* disable */
658 prog->float_substr = Nullsv; /* clear */
660 /* XXXX This is a remnant of the old implementation. It
661 looks wasteful, since now INTUIT can use many
663 prog->reganch &= ~RE_USE_INTUIT;
670 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
671 if (prog->regstclass) {
672 /* minlen == 0 is possible if regstclass is \b or \B,
673 and the fixed substr is ''$.
674 Since minlen is already taken into account, s+1 is before strend;
675 accidentally, minlen >= 1 guaranties no false positives at s + 1
676 even for \b or \B. But (minlen? 1 : 0) below assumes that
677 regstclass does not come from lookahead... */
678 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
679 This leaves EXACTF only, which is dealt with in find_byclass(). */
680 char *endpos = (prog->anchored_substr || ml_anch)
681 ? s + (prog->minlen? 1 : 0)
682 : (prog->float_substr ? check_at - start_shift + 1
684 char *startpos = sv ? strend - SvCUR(sv) : s;
687 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
692 if (endpos == strend) {
693 DEBUG_r( PerlIO_printf(Perl_debug_log,
694 "Could not match STCLASS...\n") );
697 /* Contradict one of substrings */
698 if (prog->anchored_substr) {
699 DEBUG_r( PerlIO_printf(Perl_debug_log,
700 "This position contradicts STCLASS...\n") );
701 if (prog->anchored_substr == check) {
702 DEBUG_r( what = "anchored" );
704 PL_regeol = strend; /* Used in HOP() */
706 DEBUG_r( PerlIO_printf(Perl_debug_log,
707 "trying %s substr starting at offset %ld...\n",
708 what, (long)(s + start_shift - i_strpos)) );
711 /* Have both, check is floating */
712 if (t + start_shift >= check_at) /* Contradicts floating=check */
713 goto retry_floating_check;
714 /* Recheck anchored substring, but not floating... */
716 DEBUG_r( PerlIO_printf(Perl_debug_log,
717 "trying anchored substr starting at offset %ld...\n",
718 (long)(other_last - i_strpos)) );
719 goto do_other_anchored;
721 /* Check is floating subtring. */
722 retry_floating_check:
723 t = check_at - start_shift;
724 DEBUG_r( what = "floating" );
725 goto hop_and_restart;
728 PerlIO_printf(Perl_debug_log,
729 "By STCLASS: moving %ld --> %ld\n",
730 (long)(t - i_strpos), (long)(s - i_strpos));
732 PerlIO_printf(Perl_debug_log,
733 "Does not contradict STCLASS...\n") );
735 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
736 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
739 fail_finish: /* Substring not found */
740 BmUSEFUL(prog->check_substr) += 5; /* hooray */
742 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
743 PL_colors[4],PL_colors[5]));
747 /* We know what class REx starts with. Try to find this position... */
749 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
751 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
757 register I32 tmp = 1; /* Scratch variable? */
759 /* We know what class it must start with. */
763 if (REGINCLASSUTF8(c, (U8*)s)) {
764 if (tmp && (norun || regtry(prog, s)))
776 if (REGINCLASS(c, *s)) {
777 if (tmp && (norun || regtry(prog, s)))
797 c2 = PL_fold_locale[c1];
802 e = s; /* Due to minlen logic of intuit() */
803 /* Here it is NOT UTF! */
807 && (ln == 1 || (OP(c) == EXACTF
809 : ibcmp_locale(s, m, ln)))
810 && (norun || regtry(prog, s)) )
816 if ( (*s == c1 || *s == c2)
817 && (ln == 1 || (OP(c) == EXACTF
819 : ibcmp_locale(s, m, ln)))
820 && (norun || regtry(prog, s)) )
827 PL_reg_flags |= RF_tainted;
830 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
831 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
833 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
835 if ((norun || regtry(prog, s)))
840 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
844 PL_reg_flags |= RF_tainted;
847 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
848 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
850 if (tmp == !(OP(c) == BOUND ?
851 swash_fetch(PL_utf8_alnum, (U8*)s) :
852 isALNUM_LC_utf8((U8*)s)))
855 if ((norun || regtry(prog, s)))
860 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
864 PL_reg_flags |= RF_tainted;
867 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
868 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
870 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
872 else if ((norun || regtry(prog, s)))
876 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
880 PL_reg_flags |= RF_tainted;
884 strend = reghop_c(strend, -1);
885 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
886 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
888 if (tmp == !(OP(c) == NBOUND ?
889 swash_fetch(PL_utf8_alnum, (U8*)s) :
890 isALNUM_LC_utf8((U8*)s)))
892 else if ((norun || regtry(prog, s)))
896 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
902 if (tmp && (norun || regtry(prog, s)))
914 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
915 if (tmp && (norun || regtry(prog, s)))
926 PL_reg_flags |= RF_tainted;
928 if (isALNUM_LC(*s)) {
929 if (tmp && (norun || regtry(prog, s)))
940 PL_reg_flags |= RF_tainted;
942 if (isALNUM_LC_utf8((U8*)s)) {
943 if (tmp && (norun || regtry(prog, s)))
956 if (tmp && (norun || regtry(prog, s)))
968 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
969 if (tmp && (norun || regtry(prog, s)))
980 PL_reg_flags |= RF_tainted;
982 if (!isALNUM_LC(*s)) {
983 if (tmp && (norun || regtry(prog, s)))
994 PL_reg_flags |= RF_tainted;
996 if (!isALNUM_LC_utf8((U8*)s)) {
997 if (tmp && (norun || regtry(prog, s)))
1008 while (s < strend) {
1010 if (tmp && (norun || regtry(prog, s)))
1021 while (s < strend) {
1022 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1023 if (tmp && (norun || regtry(prog, s)))
1034 PL_reg_flags |= RF_tainted;
1035 while (s < strend) {
1036 if (isSPACE_LC(*s)) {
1037 if (tmp && (norun || regtry(prog, s)))
1048 PL_reg_flags |= RF_tainted;
1049 while (s < strend) {
1050 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1051 if (tmp && (norun || regtry(prog, s)))
1062 while (s < strend) {
1064 if (tmp && (norun || regtry(prog, s)))
1075 while (s < strend) {
1076 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1077 if (tmp && (norun || regtry(prog, s)))
1088 PL_reg_flags |= RF_tainted;
1089 while (s < strend) {
1090 if (!isSPACE_LC(*s)) {
1091 if (tmp && (norun || regtry(prog, s)))
1102 PL_reg_flags |= RF_tainted;
1103 while (s < strend) {
1104 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1105 if (tmp && (norun || regtry(prog, s)))
1116 while (s < strend) {
1118 if (tmp && (norun || regtry(prog, s)))
1129 while (s < strend) {
1130 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1131 if (tmp && (norun || regtry(prog, s)))
1142 PL_reg_flags |= RF_tainted;
1143 while (s < strend) {
1144 if (isDIGIT_LC(*s)) {
1145 if (tmp && (norun || regtry(prog, s)))
1156 PL_reg_flags |= RF_tainted;
1157 while (s < strend) {
1158 if (isDIGIT_LC_utf8((U8*)s)) {
1159 if (tmp && (norun || regtry(prog, s)))
1170 while (s < strend) {
1172 if (tmp && (norun || regtry(prog, s)))
1183 while (s < strend) {
1184 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1185 if (tmp && (norun || regtry(prog, s)))
1196 PL_reg_flags |= RF_tainted;
1197 while (s < strend) {
1198 if (!isDIGIT_LC(*s)) {
1199 if (tmp && (norun || regtry(prog, s)))
1210 PL_reg_flags |= RF_tainted;
1211 while (s < strend) {
1212 if (!isDIGIT_LC_utf8((U8*)s)) {
1213 if (tmp && (norun || regtry(prog, s)))
1224 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1233 - regexec_flags - match a regexp against a string
1236 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1237 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1238 /* strend: pointer to null at end of string */
1239 /* strbeg: real beginning of string */
1240 /* minend: end of match must be >=minend after stringarg. */
1241 /* data: May be used for some additional optimizations. */
1242 /* nosave: For optimizations. */
1246 register regnode *c;
1247 register char *startpos = stringarg;
1249 I32 minlen; /* must match at least this many chars */
1250 I32 dontbother = 0; /* how many characters not to try at end */
1251 I32 start_shift = 0; /* Offset of the start to find
1252 constant substr. */ /* CC */
1253 I32 end_shift = 0; /* Same for the end. */ /* CC */
1254 I32 scream_pos = -1; /* Internal iterator of scream. */
1256 SV* oreplsv = GvSV(PL_replgv);
1262 PL_regnarrate = PL_debug & 512;
1265 /* Be paranoid... */
1266 if (prog == NULL || startpos == NULL) {
1267 Perl_croak(aTHX_ "NULL regexp parameter");
1271 minlen = prog->minlen;
1272 if (strend - startpos < minlen) goto phooey;
1274 if (startpos == strbeg) /* is ^ valid at stringarg? */
1277 PL_regprev = (U32)stringarg[-1];
1278 if (!PL_multiline && PL_regprev == '\n')
1279 PL_regprev = '\0'; /* force ^ to NOT match */
1282 /* Check validity of program. */
1283 if (UCHARAT(prog->program) != REG_MAGIC) {
1284 Perl_croak(aTHX_ "corrupted regexp program");
1288 PL_reg_eval_set = 0;
1291 if (prog->reganch & ROPT_UTF8)
1292 PL_reg_flags |= RF_utf8;
1294 /* Mark beginning of line for ^ and lookbehind. */
1295 PL_regbol = startpos;
1299 /* Mark end of line for $ (and such) */
1302 /* see how far we have to get to not match where we matched before */
1303 PL_regtill = startpos+minend;
1305 /* We start without call_cc context. */
1308 /* If there is a "must appear" string, look for it. */
1311 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1314 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1315 PL_reg_ganch = startpos;
1316 else if (sv && SvTYPE(sv) >= SVt_PVMG
1318 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1319 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1320 if (prog->reganch & ROPT_ANCH_GPOS) {
1321 if (s > PL_reg_ganch)
1326 else /* pos() not defined */
1327 PL_reg_ganch = strbeg;
1330 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1331 re_scream_pos_data d;
1333 d.scream_olds = &scream_olds;
1334 d.scream_pos = &scream_pos;
1335 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1337 goto phooey; /* not present */
1340 DEBUG_r( if (!PL_colorset) reginitcolors() );
1341 DEBUG_r(PerlIO_printf(Perl_debug_log,
1342 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1343 PL_colors[4],PL_colors[5],PL_colors[0],
1346 (strlen(prog->precomp) > 60 ? "..." : ""),
1348 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1349 startpos, PL_colors[1],
1350 (strend - startpos > 60 ? "..." : ""))
1353 /* Simplest case: anchored match need be tried only once. */
1354 /* [unless only anchor is BOL and multiline is set] */
1355 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1356 if (s == startpos && regtry(prog, startpos))
1358 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1359 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1364 dontbother = minlen - 1;
1365 end = HOPc(strend, -dontbother) - 1;
1366 /* for multiline we only have to try after newlines */
1367 if (prog->check_substr) {
1371 if (regtry(prog, s))
1376 if (prog->reganch & RE_USE_INTUIT) {
1377 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1388 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1389 if (regtry(prog, s))
1396 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1397 if (regtry(prog, PL_reg_ganch))
1402 /* Messy cases: unanchored match. */
1403 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1404 /* we have /x+whatever/ */
1405 /* it must be a one character string (XXXX Except UTF?) */
1406 char ch = SvPVX(prog->anchored_substr)[0];
1408 while (s < strend) {
1410 if (regtry(prog, s)) goto got_it;
1412 while (s < strend && *s == ch)
1419 while (s < strend) {
1421 if (regtry(prog, s)) goto got_it;
1423 while (s < strend && *s == ch)
1431 else if (prog->anchored_substr != Nullsv
1432 || (prog->float_substr != Nullsv
1433 && prog->float_max_offset < strend - s)) {
1434 SV *must = prog->anchored_substr
1435 ? prog->anchored_substr : prog->float_substr;
1437 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1439 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1440 I32 delta = back_max - back_min;
1441 char *last = HOPc(strend, /* Cannot start after this */
1442 -(I32)(CHR_SVLEN(must)
1443 - (SvTAIL(must) != 0) + back_min));
1444 char *last1; /* Last position checked before */
1447 last1 = HOPc(s, -1);
1449 last1 = s - 1; /* bogus */
1451 /* XXXX check_substr already used to find `s', can optimize if
1452 check_substr==must. */
1454 dontbother = end_shift;
1455 strend = HOPc(strend, -dontbother);
1456 while ( (s <= last) &&
1457 ((flags & REXEC_SCREAM)
1458 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1459 end_shift, &scream_pos, 0))
1460 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1461 (unsigned char*)strend, must,
1462 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1463 if (HOPc(s, -back_max) > last1) {
1464 last1 = HOPc(s, -back_min);
1465 s = HOPc(s, -back_max);
1468 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1470 last1 = HOPc(s, -back_min);
1474 while (s <= last1) {
1475 if (regtry(prog, s))
1481 while (s <= last1) {
1482 if (regtry(prog, s))
1490 else if (c = prog->regstclass) {
1491 if (minlen) /* don't bother with what can't match */
1492 strend = HOPc(strend, -(minlen - 1));
1493 if (find_byclass(prog, c, s, strend, startpos, 0))
1498 if (prog->float_substr != Nullsv) { /* Trim the end. */
1500 I32 oldpos = scream_pos;
1502 if (flags & REXEC_SCREAM) {
1503 last = screaminstr(sv, prog->float_substr, s - strbeg,
1504 end_shift, &scream_pos, 1); /* last one */
1506 last = scream_olds; /* Only one occurence. */
1510 char *little = SvPV(prog->float_substr, len);
1512 if (SvTAIL(prog->float_substr)) {
1513 if (memEQ(strend - len + 1, little, len - 1))
1514 last = strend - len + 1;
1515 else if (!PL_multiline)
1516 last = memEQ(strend - len, little, len)
1517 ? strend - len : Nullch;
1523 last = rninstr(s, strend, little, little + len);
1525 last = strend; /* matching `$' */
1528 if (last == NULL) goto phooey; /* Should not happen! */
1529 dontbother = strend - last + prog->float_min_offset;
1531 if (minlen && (dontbother < minlen))
1532 dontbother = minlen - 1;
1533 strend -= dontbother; /* this one's always in bytes! */
1534 /* We don't know much -- general case. */
1537 if (regtry(prog, s))
1546 if (regtry(prog, s))
1548 } while (s++ < strend);
1556 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1558 if (PL_reg_eval_set) {
1559 /* Preserve the current value of $^R */
1560 if (oreplsv != GvSV(PL_replgv))
1561 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1562 restored, the value remains
1564 restore_pos(aTHXo_ 0);
1567 /* make sure $`, $&, $', and $digit will work later */
1568 if ( !(flags & REXEC_NOT_FIRST) ) {
1569 if (RX_MATCH_COPIED(prog)) {
1570 Safefree(prog->subbeg);
1571 RX_MATCH_COPIED_off(prog);
1573 if (flags & REXEC_COPY_STR) {
1574 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1576 s = savepvn(strbeg, i);
1579 RX_MATCH_COPIED_on(prog);
1582 prog->subbeg = strbeg;
1583 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1590 if (PL_reg_eval_set)
1591 restore_pos(aTHXo_ 0);
1596 - regtry - try match at specific point
1598 STATIC I32 /* 0 failure, 1 success */
1599 S_regtry(pTHX_ regexp *prog, char *startpos)
1607 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1610 PL_reg_eval_set = RS_init;
1612 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1613 (IV)(PL_stack_sp - PL_stack_base));
1615 SAVEINT(cxstack[cxstack_ix].blk_oldsp);
1616 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1617 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1619 /* Apparently this is not needed, judging by wantarray. */
1620 /* SAVEINT(cxstack[cxstack_ix].blk_gimme);
1621 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1624 /* Make $_ available to executed code. */
1625 if (PL_reg_sv != DEFSV) {
1626 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1631 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1632 && (mg = mg_find(PL_reg_sv, 'g')))) {
1633 /* prepare for quick setting of pos */
1634 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1635 mg = mg_find(PL_reg_sv, 'g');
1639 PL_reg_oldpos = mg->mg_len;
1640 SAVEDESTRUCTOR_X(restore_pos, 0);
1643 New(22,PL_reg_curpm, 1, PMOP);
1644 PL_reg_curpm->op_pmregexp = prog;
1645 PL_reg_oldcurpm = PL_curpm;
1646 PL_curpm = PL_reg_curpm;
1647 if (RX_MATCH_COPIED(prog)) {
1648 /* Here is a serious problem: we cannot rewrite subbeg,
1649 since it may be needed if this match fails. Thus
1650 $` inside (?{}) could fail... */
1651 PL_reg_oldsaved = prog->subbeg;
1652 PL_reg_oldsavedlen = prog->sublen;
1653 RX_MATCH_COPIED_off(prog);
1656 PL_reg_oldsaved = Nullch;
1657 prog->subbeg = PL_bostr;
1658 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1660 prog->startp[0] = startpos - PL_bostr;
1661 PL_reginput = startpos;
1662 PL_regstartp = prog->startp;
1663 PL_regendp = prog->endp;
1664 PL_reglastparen = &prog->lastparen;
1665 prog->lastparen = 0;
1667 DEBUG_r(PL_reg_starttry = startpos);
1668 if (PL_reg_start_tmpl <= prog->nparens) {
1669 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1670 if(PL_reg_start_tmp)
1671 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1673 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1676 /* XXXX What this code is doing here?!!! There should be no need
1677 to do this again and again, PL_reglastparen should take care of
1681 if (prog->nparens) {
1682 for (i = prog->nparens; i >= 1; i--) {
1688 if (regmatch(prog->program + 1)) {
1689 prog->endp[0] = PL_reginput - PL_bostr;
1697 - regmatch - main matching routine
1699 * Conceptually the strategy is simple: check to see whether the current
1700 * node matches, call self recursively to see whether the rest matches,
1701 * and then act accordingly. In practice we make some effort to avoid
1702 * recursion, in particular by going through "ordinary" nodes (that don't
1703 * need to know whether the rest of the match failed) by a loop instead of
1706 /* [lwall] I've hoisted the register declarations to the outer block in order to
1707 * maybe save a little bit of pushing and popping on the stack. It also takes
1708 * advantage of machines that use a register save mask on subroutine entry.
1710 STATIC I32 /* 0 failure, 1 success */
1711 S_regmatch(pTHX_ regnode *prog)
1714 register regnode *scan; /* Current node. */
1715 regnode *next; /* Next node. */
1716 regnode *inner; /* Next node in internal branch. */
1717 register I32 nextchr; /* renamed nextchr - nextchar colides with
1718 function of same name */
1719 register I32 n; /* no or next */
1720 register I32 ln; /* len or last */
1721 register char *s; /* operand or save */
1722 register char *locinput = PL_reginput;
1723 register I32 c1, c2, paren; /* case fold search, parenth */
1724 int minmod = 0, sw = 0, logical = 0;
1729 /* Note that nextchr is a byte even in UTF */
1730 nextchr = UCHARAT(locinput);
1732 while (scan != NULL) {
1733 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1735 # define sayYES goto yes
1736 # define sayNO goto no
1737 # define sayYES_FINAL goto yes_final
1738 # define sayYES_LOUD goto yes_loud
1739 # define sayNO_FINAL goto no_final
1740 # define sayNO_SILENT goto do_no
1741 # define saySAME(x) if (x) goto yes; else goto no
1742 # define REPORT_CODE_OFF 24
1744 # define sayYES return 1
1745 # define sayNO return 0
1746 # define sayYES_FINAL return 1
1747 # define sayYES_LOUD return 1
1748 # define sayNO_FINAL return 0
1749 # define sayNO_SILENT return 0
1750 # define saySAME(x) return x
1753 SV *prop = sv_newmortal();
1754 int docolor = *PL_colors[0];
1755 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1756 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1757 /* The part of the string before starttry has one color
1758 (pref0_len chars), between starttry and current
1759 position another one (pref_len - pref0_len chars),
1760 after the current position the third one.
1761 We assume that pref0_len <= pref_len, otherwise we
1762 decrease pref0_len. */
1763 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1764 ? (5 + taill) - l : locinput - PL_bostr);
1765 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1767 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1768 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1769 ? (5 + taill) - pref_len : PL_regeol - locinput);
1772 if (pref0_len > pref_len)
1773 pref0_len = pref_len;
1774 regprop(prop, scan);
1775 PerlIO_printf(Perl_debug_log,
1776 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1777 (IV)(locinput - PL_bostr),
1778 PL_colors[4], pref0_len,
1779 locinput - pref_len, PL_colors[5],
1780 PL_colors[2], pref_len - pref0_len,
1781 locinput - pref_len + pref0_len, PL_colors[3],
1782 (docolor ? "" : "> <"),
1783 PL_colors[0], l, locinput, PL_colors[1],
1784 15 - l - pref_len + 1,
1786 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1790 next = scan + NEXT_OFF(scan);
1796 if (locinput == PL_bostr
1797 ? PL_regprev == '\n'
1799 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1801 /* regtill = regbol; */
1806 if (locinput == PL_bostr
1807 ? PL_regprev == '\n'
1808 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1814 if (locinput == PL_regbol && PL_regprev == '\n')
1818 if (locinput == PL_reg_ganch)
1828 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1833 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1835 if (PL_regeol - locinput > 1)
1839 if (PL_regeol != locinput)
1843 if (nextchr & 0x80) {
1844 locinput += PL_utf8skip[nextchr];
1845 if (locinput > PL_regeol)
1847 nextchr = UCHARAT(locinput);
1850 if (!nextchr && locinput >= PL_regeol)
1852 nextchr = UCHARAT(++locinput);
1855 if (!nextchr && locinput >= PL_regeol)
1857 nextchr = UCHARAT(++locinput);
1860 if (nextchr & 0x80) {
1861 locinput += PL_utf8skip[nextchr];
1862 if (locinput > PL_regeol)
1864 nextchr = UCHARAT(locinput);
1867 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1869 nextchr = UCHARAT(++locinput);
1872 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1874 nextchr = UCHARAT(++locinput);
1879 /* Inline the first character, for speed. */
1880 if (UCHARAT(s) != nextchr)
1882 if (PL_regeol - locinput < ln)
1884 if (ln > 1 && memNE(s, locinput, ln))
1887 nextchr = UCHARAT(locinput);
1890 PL_reg_flags |= RF_tainted;
1899 c1 = OP(scan) == EXACTF;
1903 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1904 toLOWER_utf8((U8*)l) :
1905 toLOWER_LC_utf8((U8*)l)))
1913 nextchr = UCHARAT(locinput);
1917 /* Inline the first character, for speed. */
1918 if (UCHARAT(s) != nextchr &&
1919 UCHARAT(s) != ((OP(scan) == EXACTF)
1920 ? PL_fold : PL_fold_locale)[nextchr])
1922 if (PL_regeol - locinput < ln)
1924 if (ln > 1 && (OP(scan) == EXACTF
1925 ? ibcmp(s, locinput, ln)
1926 : ibcmp_locale(s, locinput, ln)))
1929 nextchr = UCHARAT(locinput);
1932 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1934 if (locinput >= PL_regeol)
1936 locinput += PL_utf8skip[nextchr];
1937 nextchr = UCHARAT(locinput);
1941 nextchr = UCHARAT(locinput);
1942 if (!REGINCLASS(scan, nextchr))
1944 if (!nextchr && locinput >= PL_regeol)
1946 nextchr = UCHARAT(++locinput);
1949 PL_reg_flags |= RF_tainted;
1954 if (!(OP(scan) == ALNUM
1955 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1957 nextchr = UCHARAT(++locinput);
1960 PL_reg_flags |= RF_tainted;
1965 if (nextchr & 0x80) {
1966 if (!(OP(scan) == ALNUMUTF8
1967 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1968 : isALNUM_LC_utf8((U8*)locinput)))
1972 locinput += PL_utf8skip[nextchr];
1973 nextchr = UCHARAT(locinput);
1976 if (!(OP(scan) == ALNUMUTF8
1977 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1979 nextchr = UCHARAT(++locinput);
1982 PL_reg_flags |= RF_tainted;
1985 if (!nextchr && locinput >= PL_regeol)
1987 if (OP(scan) == NALNUM
1988 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1990 nextchr = UCHARAT(++locinput);
1993 PL_reg_flags |= RF_tainted;
1996 if (!nextchr && locinput >= PL_regeol)
1998 if (nextchr & 0x80) {
1999 if (OP(scan) == NALNUMUTF8
2000 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2001 : isALNUM_LC_utf8((U8*)locinput))
2005 locinput += PL_utf8skip[nextchr];
2006 nextchr = UCHARAT(locinput);
2009 if (OP(scan) == NALNUMUTF8
2010 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2012 nextchr = UCHARAT(++locinput);
2016 PL_reg_flags |= RF_tainted;
2020 /* was last char in word? */
2021 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2022 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2024 n = isALNUM(nextchr);
2027 ln = isALNUM_LC(ln);
2028 n = isALNUM_LC(nextchr);
2030 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2035 PL_reg_flags |= RF_tainted;
2039 /* was last char in word? */
2040 ln = (locinput != PL_regbol)
2041 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2042 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2043 ln = isALNUM_uni(ln);
2044 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2047 ln = isALNUM_LC_uni(ln);
2048 n = isALNUM_LC_utf8((U8*)locinput);
2050 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2054 PL_reg_flags |= RF_tainted;
2057 if (!nextchr && locinput >= PL_regeol)
2059 if (!(OP(scan) == SPACE
2060 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2062 nextchr = UCHARAT(++locinput);
2065 PL_reg_flags |= RF_tainted;
2068 if (!nextchr && locinput >= PL_regeol)
2070 if (nextchr & 0x80) {
2071 if (!(OP(scan) == SPACEUTF8
2072 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2073 : isSPACE_LC_utf8((U8*)locinput)))
2077 locinput += PL_utf8skip[nextchr];
2078 nextchr = UCHARAT(locinput);
2081 if (!(OP(scan) == SPACEUTF8
2082 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2084 nextchr = UCHARAT(++locinput);
2087 PL_reg_flags |= RF_tainted;
2092 if (OP(scan) == SPACE
2093 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2095 nextchr = UCHARAT(++locinput);
2098 PL_reg_flags |= RF_tainted;
2103 if (nextchr & 0x80) {
2104 if (OP(scan) == NSPACEUTF8
2105 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2106 : isSPACE_LC_utf8((U8*)locinput))
2110 locinput += PL_utf8skip[nextchr];
2111 nextchr = UCHARAT(locinput);
2114 if (OP(scan) == NSPACEUTF8
2115 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2117 nextchr = UCHARAT(++locinput);
2120 PL_reg_flags |= RF_tainted;
2123 if (!nextchr && locinput >= PL_regeol)
2125 if (!(OP(scan) == DIGIT
2126 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2128 nextchr = UCHARAT(++locinput);
2131 PL_reg_flags |= RF_tainted;
2136 if (nextchr & 0x80) {
2137 if (OP(scan) == NDIGITUTF8
2138 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2139 : isDIGIT_LC_utf8((U8*)locinput))
2143 locinput += PL_utf8skip[nextchr];
2144 nextchr = UCHARAT(locinput);
2147 if (!isDIGIT(nextchr))
2149 nextchr = UCHARAT(++locinput);
2152 PL_reg_flags |= RF_tainted;
2157 if (OP(scan) == DIGIT
2158 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2160 nextchr = UCHARAT(++locinput);
2163 PL_reg_flags |= RF_tainted;
2166 if (!nextchr && locinput >= PL_regeol)
2168 if (nextchr & 0x80) {
2169 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2171 locinput += PL_utf8skip[nextchr];
2172 nextchr = UCHARAT(locinput);
2175 if (isDIGIT(nextchr))
2177 nextchr = UCHARAT(++locinput);
2180 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2182 locinput += PL_utf8skip[nextchr];
2183 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2184 locinput += UTF8SKIP(locinput);
2185 if (locinput > PL_regeol)
2187 nextchr = UCHARAT(locinput);
2190 PL_reg_flags |= RF_tainted;
2194 n = ARG(scan); /* which paren pair */
2195 ln = PL_regstartp[n];
2196 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2197 if (*PL_reglastparen < n || ln == -1)
2198 sayNO; /* Do not match unless seen CLOSEn. */
2199 if (ln == PL_regendp[n])
2203 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2205 char *e = PL_bostr + PL_regendp[n];
2207 * Note that we can't do the "other character" lookup trick as
2208 * in the 8-bit case (no pun intended) because in Unicode we
2209 * have to map both upper and title case to lower case.
2211 if (OP(scan) == REFF) {
2215 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2225 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2232 nextchr = UCHARAT(locinput);
2236 /* Inline the first character, for speed. */
2237 if (UCHARAT(s) != nextchr &&
2239 (UCHARAT(s) != ((OP(scan) == REFF
2240 ? PL_fold : PL_fold_locale)[nextchr]))))
2242 ln = PL_regendp[n] - ln;
2243 if (locinput + ln > PL_regeol)
2245 if (ln > 1 && (OP(scan) == REF
2246 ? memNE(s, locinput, ln)
2248 ? ibcmp(s, locinput, ln)
2249 : ibcmp_locale(s, locinput, ln))))
2252 nextchr = UCHARAT(locinput);
2263 OP_4tree *oop = PL_op;
2264 COP *ocurcop = PL_curcop;
2265 SV **ocurpad = PL_curpad;
2269 PL_op = (OP_4tree*)PL_regdata->data[n];
2270 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2271 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2272 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2274 CALLRUNOPS(aTHX); /* Scalar context. */
2280 PL_curpad = ocurpad;
2281 PL_curcop = ocurcop;
2283 if (logical == 2) { /* Postponed subexpression. */
2285 MAGIC *mg = Null(MAGIC*);
2287 CHECKPOINT cp, lastcp;
2289 if(SvROK(ret) || SvRMAGICAL(ret)) {
2290 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2293 mg = mg_find(sv, 'r');
2296 re = (regexp *)mg->mg_obj;
2297 (void)ReREFCNT_inc(re);
2301 char *t = SvPV(ret, len);
2303 char *oprecomp = PL_regprecomp;
2304 I32 osize = PL_regsize;
2305 I32 onpar = PL_regnpar;
2308 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2310 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2311 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2312 PL_regprecomp = oprecomp;
2317 PerlIO_printf(Perl_debug_log,
2318 "Entering embedded `%s%.60s%s%s'\n",
2322 (strlen(re->precomp) > 60 ? "..." : ""))
2325 state.prev = PL_reg_call_cc;
2326 state.cc = PL_regcc;
2327 state.re = PL_reg_re;
2331 cp = regcppush(0); /* Save *all* the positions. */
2334 state.ss = PL_savestack_ix;
2335 *PL_reglastparen = 0;
2336 PL_reg_call_cc = &state;
2337 PL_reginput = locinput;
2339 /* XXXX This is too dramatic a measure... */
2342 if (regmatch(re->program + 1)) {
2343 /* Even though we succeeded, we need to restore
2344 global variables, since we may be wrapped inside
2345 SUSPEND, thus the match may be not finished yet. */
2347 /* XXXX Do this only if SUSPENDed? */
2348 PL_reg_call_cc = state.prev;
2349 PL_regcc = state.cc;
2350 PL_reg_re = state.re;
2351 cache_re(PL_reg_re);
2353 /* XXXX This is too dramatic a measure... */
2356 /* These are needed even if not SUSPEND. */
2364 PL_reg_call_cc = state.prev;
2365 PL_regcc = state.cc;
2366 PL_reg_re = state.re;
2367 cache_re(PL_reg_re);
2369 /* XXXX This is too dramatic a measure... */
2378 sv_setsv(save_scalar(PL_replgv), ret);
2382 n = ARG(scan); /* which paren pair */
2383 PL_reg_start_tmp[n] = locinput;
2388 n = ARG(scan); /* which paren pair */
2389 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2390 PL_regendp[n] = locinput - PL_bostr;
2391 if (n > *PL_reglastparen)
2392 *PL_reglastparen = n;
2395 n = ARG(scan); /* which paren pair */
2396 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2399 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2401 next = NEXTOPER(NEXTOPER(scan));
2403 next = scan + ARG(scan);
2404 if (OP(next) == IFTHEN) /* Fake one. */
2405 next = NEXTOPER(NEXTOPER(next));
2409 logical = scan->flags;
2411 /*******************************************************************
2412 PL_regcc contains infoblock about the innermost (...)* loop, and
2413 a pointer to the next outer infoblock.
2415 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2417 1) After matching X, regnode for CURLYX is processed;
2419 2) This regnode creates infoblock on the stack, and calls
2420 regmatch() recursively with the starting point at WHILEM node;
2422 3) Each hit of WHILEM node tries to match A and Z (in the order
2423 depending on the current iteration, min/max of {min,max} and
2424 greediness). The information about where are nodes for "A"
2425 and "Z" is read from the infoblock, as is info on how many times "A"
2426 was already matched, and greediness.
2428 4) After A matches, the same WHILEM node is hit again.
2430 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2431 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2432 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2433 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2434 of the external loop.
2436 Currently present infoblocks form a tree with a stem formed by PL_curcc
2437 and whatever it mentions via ->next, and additional attached trees
2438 corresponding to temporarily unset infoblocks as in "5" above.
2440 In the following picture infoblocks for outer loop of
2441 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2442 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2443 infoblocks are drawn below the "reset" infoblock.
2445 In fact in the picture below we do not show failed matches for Z and T
2446 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2447 more obvious *why* one needs to *temporary* unset infoblocks.]
2449 Matched REx position InfoBlocks Comment
2453 Y A)*?Z)*?T x <- O <- I
2454 YA )*?Z)*?T x <- O <- I
2455 YA A)*?Z)*?T x <- O <- I
2456 YAA )*?Z)*?T x <- O <- I
2457 YAA Z)*?T x <- O # Temporary unset I
2460 YAAZ Y(A)*?Z)*?T x <- O
2463 YAAZY (A)*?Z)*?T x <- O
2466 YAAZY A)*?Z)*?T x <- O <- I
2469 YAAZYA )*?Z)*?T x <- O <- I
2472 YAAZYA Z)*?T x <- O # Temporary unset I
2478 YAAZYAZ T x # Temporary unset O
2485 *******************************************************************/
2488 CHECKPOINT cp = PL_savestack_ix;
2490 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2492 cc.oldcc = PL_regcc;
2494 cc.parenfloor = *PL_reglastparen;
2496 cc.min = ARG1(scan);
2497 cc.max = ARG2(scan);
2498 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2502 PL_reginput = locinput;
2503 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2505 PL_regcc = cc.oldcc;
2511 * This is really hard to understand, because after we match
2512 * what we're trying to match, we must make sure the rest of
2513 * the REx is going to match for sure, and to do that we have
2514 * to go back UP the parse tree by recursing ever deeper. And
2515 * if it fails, we have to reset our parent's current state
2516 * that we can try again after backing off.
2519 CHECKPOINT cp, lastcp;
2520 CURCUR* cc = PL_regcc;
2521 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2523 n = cc->cur + 1; /* how many we know we matched */
2524 PL_reginput = locinput;
2527 PerlIO_printf(Perl_debug_log,
2528 "%*s %ld out of %ld..%ld cc=%lx\n",
2529 REPORT_CODE_OFF+PL_regindent*2, "",
2530 (long)n, (long)cc->min,
2531 (long)cc->max, (long)cc)
2534 /* If degenerate scan matches "", assume scan done. */
2536 if (locinput == cc->lastloc && n >= cc->min) {
2537 PL_regcc = cc->oldcc;
2541 PerlIO_printf(Perl_debug_log,
2542 "%*s empty match detected, try continuation...\n",
2543 REPORT_CODE_OFF+PL_regindent*2, "")
2545 if (regmatch(cc->next))
2553 /* First just match a string of min scans. */
2557 cc->lastloc = locinput;
2558 if (regmatch(cc->scan))
2561 cc->lastloc = lastloc;
2566 /* Check whether we already were at this position.
2567 Postpone detection until we know the match is not
2568 *that* much linear. */
2569 if (!PL_reg_maxiter) {
2570 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2571 PL_reg_leftiter = PL_reg_maxiter;
2573 if (PL_reg_leftiter-- == 0) {
2574 I32 size = (PL_reg_maxiter + 7)/8;
2575 if (PL_reg_poscache) {
2576 if (PL_reg_poscache_size < size) {
2577 Renew(PL_reg_poscache, size, char);
2578 PL_reg_poscache_size = size;
2580 Zero(PL_reg_poscache, size, char);
2583 PL_reg_poscache_size = size;
2584 Newz(29, PL_reg_poscache, size, char);
2587 PerlIO_printf(Perl_debug_log,
2588 "%sDetected a super-linear match, switching on caching%s...\n",
2589 PL_colors[4], PL_colors[5])
2592 if (PL_reg_leftiter < 0) {
2593 I32 o = locinput - PL_bostr, b;
2595 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2598 if (PL_reg_poscache[o] & (1<<b)) {
2600 PerlIO_printf(Perl_debug_log,
2601 "%*s already tried at this position...\n",
2602 REPORT_CODE_OFF+PL_regindent*2, "")
2606 PL_reg_poscache[o] |= (1<<b);
2610 /* Prefer next over scan for minimal matching. */
2613 PL_regcc = cc->oldcc;
2616 cp = regcppush(cc->parenfloor);
2618 if (regmatch(cc->next)) {
2620 sayYES; /* All done. */
2628 if (n >= cc->max) { /* Maximum greed exceeded? */
2629 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2630 && !(PL_reg_flags & RF_warned)) {
2631 PL_reg_flags |= RF_warned;
2632 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2633 "Complex regular subexpression recursion",
2640 PerlIO_printf(Perl_debug_log,
2641 "%*s trying longer...\n",
2642 REPORT_CODE_OFF+PL_regindent*2, "")
2644 /* Try scanning more and see if it helps. */
2645 PL_reginput = locinput;
2647 cc->lastloc = locinput;
2648 cp = regcppush(cc->parenfloor);
2650 if (regmatch(cc->scan)) {
2657 cc->lastloc = lastloc;
2661 /* Prefer scan over next for maximal matching. */
2663 if (n < cc->max) { /* More greed allowed? */
2664 cp = regcppush(cc->parenfloor);
2666 cc->lastloc = locinput;
2668 if (regmatch(cc->scan)) {
2673 regcppop(); /* Restore some previous $<digit>s? */
2674 PL_reginput = locinput;
2676 PerlIO_printf(Perl_debug_log,
2677 "%*s failed, try continuation...\n",
2678 REPORT_CODE_OFF+PL_regindent*2, "")
2681 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2682 && !(PL_reg_flags & RF_warned)) {
2683 PL_reg_flags |= RF_warned;
2684 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2685 "Complex regular subexpression recursion",
2689 /* Failed deeper matches of scan, so see if this one works. */
2690 PL_regcc = cc->oldcc;
2693 if (regmatch(cc->next))
2699 cc->lastloc = lastloc;
2704 next = scan + ARG(scan);
2707 inner = NEXTOPER(NEXTOPER(scan));
2710 inner = NEXTOPER(scan);
2715 if (OP(next) != c1) /* No choice. */
2716 next = inner; /* Avoid recursion. */
2718 int lastparen = *PL_reglastparen;
2722 PL_reginput = locinput;
2723 if (regmatch(inner))
2726 for (n = *PL_reglastparen; n > lastparen; n--)
2728 *PL_reglastparen = n;
2731 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2735 inner = NEXTOPER(scan);
2736 if (c1 == BRANCHJ) {
2737 inner = NEXTOPER(inner);
2739 } while (scan != NULL && OP(scan) == c1);
2753 /* We suppose that the next guy does not need
2754 backtracking: in particular, it is of constant length,
2755 and has no parenths to influence future backrefs. */
2756 ln = ARG1(scan); /* min to match */
2757 n = ARG2(scan); /* max to match */
2758 paren = scan->flags;
2760 if (paren > PL_regsize)
2762 if (paren > *PL_reglastparen)
2763 *PL_reglastparen = paren;
2765 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2767 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2768 PL_reginput = locinput;
2771 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2773 if (ln && l == 0 && n >= ln
2774 /* In fact, this is tricky. If paren, then the
2775 fact that we did/didnot match may influence
2776 future execution. */
2777 && !(paren && ln == 0))
2779 locinput = PL_reginput;
2780 if (PL_regkind[(U8)OP(next)] == EXACT) {
2781 c1 = (U8)*STRING(next);
2782 if (OP(next) == EXACTF)
2784 else if (OP(next) == EXACTFL)
2785 c2 = PL_fold_locale[c1];
2792 /* This may be improved if l == 0. */
2793 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2794 /* If it could work, try it. */
2796 UCHARAT(PL_reginput) == c1 ||
2797 UCHARAT(PL_reginput) == c2)
2801 PL_regstartp[paren] =
2802 HOPc(PL_reginput, -l) - PL_bostr;
2803 PL_regendp[paren] = PL_reginput - PL_bostr;
2806 PL_regendp[paren] = -1;
2812 /* Couldn't or didn't -- move forward. */
2813 PL_reginput = locinput;
2814 if (regrepeat_hard(scan, 1, &l)) {
2816 locinput = PL_reginput;
2823 n = regrepeat_hard(scan, n, &l);
2824 if (n != 0 && l == 0
2825 /* In fact, this is tricky. If paren, then the
2826 fact that we did/didnot match may influence
2827 future execution. */
2828 && !(paren && ln == 0))
2830 locinput = PL_reginput;
2832 PerlIO_printf(Perl_debug_log,
2833 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2834 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2838 if (PL_regkind[(U8)OP(next)] == EXACT) {
2839 c1 = (U8)*STRING(next);
2840 if (OP(next) == EXACTF)
2842 else if (OP(next) == EXACTFL)
2843 c2 = PL_fold_locale[c1];
2852 /* If it could work, try it. */
2854 UCHARAT(PL_reginput) == c1 ||
2855 UCHARAT(PL_reginput) == c2)
2858 PerlIO_printf(Perl_debug_log,
2859 "%*s trying tail with n=%"IVdf"...\n",
2860 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2864 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2865 PL_regendp[paren] = PL_reginput - PL_bostr;
2868 PL_regendp[paren] = -1;
2874 /* Couldn't or didn't -- back up. */
2876 locinput = HOPc(locinput, -l);
2877 PL_reginput = locinput;
2884 paren = scan->flags; /* Which paren to set */
2885 if (paren > PL_regsize)
2887 if (paren > *PL_reglastparen)
2888 *PL_reglastparen = paren;
2889 ln = ARG1(scan); /* min to match */
2890 n = ARG2(scan); /* max to match */
2891 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2895 ln = ARG1(scan); /* min to match */
2896 n = ARG2(scan); /* max to match */
2897 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2902 scan = NEXTOPER(scan);
2908 scan = NEXTOPER(scan);
2912 * Lookahead to avoid useless match attempts
2913 * when we know what character comes next.
2915 if (PL_regkind[(U8)OP(next)] == EXACT) {
2916 c1 = (U8)*STRING(next);
2917 if (OP(next) == EXACTF)
2919 else if (OP(next) == EXACTFL)
2920 c2 = PL_fold_locale[c1];
2926 PL_reginput = locinput;
2930 if (ln && regrepeat(scan, ln) < ln)
2932 locinput = PL_reginput;
2935 char *e = locinput + n - ln; /* Should not check after this */
2936 char *old = locinput;
2938 if (e >= PL_regeol || (n == REG_INFTY))
2941 /* Find place 'next' could work */
2943 while (locinput <= e && *locinput != c1)
2946 while (locinput <= e
2953 /* PL_reginput == old now */
2954 if (locinput != old) {
2955 ln = 1; /* Did some */
2956 if (regrepeat(scan, locinput - old) <
2960 /* PL_reginput == locinput now */
2963 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2964 PL_regendp[paren] = locinput - PL_bostr;
2967 PL_regendp[paren] = -1;
2971 PL_reginput = locinput; /* Could be reset... */
2973 /* Couldn't or didn't -- move forward. */
2978 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
2979 /* If it could work, try it. */
2981 UCHARAT(PL_reginput) == c1 ||
2982 UCHARAT(PL_reginput) == c2)
2986 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2987 PL_regendp[paren] = PL_reginput - PL_bostr;
2990 PL_regendp[paren] = -1;
2996 /* Couldn't or didn't -- move forward. */
2997 PL_reginput = locinput;
2998 if (regrepeat(scan, 1)) {
3000 locinput = PL_reginput;
3008 n = regrepeat(scan, n);
3009 locinput = PL_reginput;
3010 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3011 (!PL_multiline || OP(next) == SEOL))
3012 ln = n; /* why back off? */
3016 /* If it could work, try it. */
3018 UCHARAT(PL_reginput) == c1 ||
3019 UCHARAT(PL_reginput) == c2)
3023 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3024 PL_regendp[paren] = PL_reginput - PL_bostr;
3027 PL_regendp[paren] = -1;
3033 /* Couldn't or didn't -- back up. */
3035 PL_reginput = locinput = HOPc(locinput, -1);
3040 /* If it could work, try it. */
3042 UCHARAT(PL_reginput) == c1 ||
3043 UCHARAT(PL_reginput) == c2)
3049 /* Couldn't or didn't -- back up. */
3051 PL_reginput = locinput = HOPc(locinput, -1);
3058 if (PL_reg_call_cc) {
3059 re_cc_state *cur_call_cc = PL_reg_call_cc;
3060 CURCUR *cctmp = PL_regcc;
3061 regexp *re = PL_reg_re;
3062 CHECKPOINT cp, lastcp;
3064 cp = regcppush(0); /* Save *all* the positions. */
3066 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3068 PL_reginput = locinput; /* Make position available to
3070 cache_re(PL_reg_call_cc->re);
3071 PL_regcc = PL_reg_call_cc->cc;
3072 PL_reg_call_cc = PL_reg_call_cc->prev;
3073 if (regmatch(cur_call_cc->node)) {
3074 PL_reg_call_cc = cur_call_cc;
3080 PL_reg_call_cc = cur_call_cc;
3086 PerlIO_printf(Perl_debug_log,
3087 "%*s continuation failed...\n",
3088 REPORT_CODE_OFF+PL_regindent*2, "")
3092 if (locinput < PL_regtill) {
3093 DEBUG_r(PerlIO_printf(Perl_debug_log,
3094 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3096 (long)(locinput - PL_reg_starttry),
3097 (long)(PL_regtill - PL_reg_starttry),
3099 sayNO_FINAL; /* Cannot match: too short. */
3101 PL_reginput = locinput; /* put where regtry can find it */
3102 sayYES_FINAL; /* Success! */
3104 PL_reginput = locinput; /* put where regtry can find it */
3105 sayYES_LOUD; /* Success! */
3108 PL_reginput = locinput;
3113 if (UTF) { /* XXXX This is absolutely
3114 broken, we read before
3116 s = HOPMAYBEc(locinput, -scan->flags);
3122 if (locinput < PL_bostr + scan->flags)
3124 PL_reginput = locinput - scan->flags;
3129 PL_reginput = locinput;
3134 if (UTF) { /* XXXX This is absolutely
3135 broken, we read before
3137 s = HOPMAYBEc(locinput, -scan->flags);
3138 if (!s || s < PL_bostr)
3143 if (locinput < PL_bostr + scan->flags)
3145 PL_reginput = locinput - scan->flags;
3150 PL_reginput = locinput;
3153 inner = NEXTOPER(NEXTOPER(scan));
3154 if (regmatch(inner) != n) {
3169 if (OP(scan) == SUSPEND) {
3170 locinput = PL_reginput;
3171 nextchr = UCHARAT(locinput);
3176 next = scan + ARG(scan);
3181 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3182 PTR2UV(scan), OP(scan));
3183 Perl_croak(aTHX_ "regexp memory corruption");
3189 * We get here only if there's trouble -- normally "case END" is
3190 * the terminating point.
3192 Perl_croak(aTHX_ "corrupted regexp pointers");
3198 PerlIO_printf(Perl_debug_log,
3199 "%*s %scould match...%s\n",
3200 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3204 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3205 PL_colors[4],PL_colors[5]));
3214 PerlIO_printf(Perl_debug_log,
3215 "%*s %sfailed...%s\n",
3216 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3228 - regrepeat - repeatedly match something simple, report how many
3231 * [This routine now assumes that it will only match on things of length 1.
3232 * That was true before, but now we assume scan - reginput is the count,
3233 * rather than incrementing count on every character. [Er, except utf8.]]
3236 S_regrepeat(pTHX_ regnode *p, I32 max)
3239 register char *scan;
3241 register char *loceol = PL_regeol;
3242 register I32 hardcount = 0;
3245 if (max != REG_INFTY && max < loceol - scan)
3246 loceol = scan + max;
3249 while (scan < loceol && *scan != '\n')
3257 while (scan < loceol && *scan != '\n') {
3258 scan += UTF8SKIP(scan);
3264 while (scan < loceol) {
3265 scan += UTF8SKIP(scan);
3269 case EXACT: /* length of string is 1 */
3271 while (scan < loceol && UCHARAT(scan) == c)
3274 case EXACTF: /* length of string is 1 */
3276 while (scan < loceol &&
3277 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3280 case EXACTFL: /* length of string is 1 */
3281 PL_reg_flags |= RF_tainted;
3283 while (scan < loceol &&
3284 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3289 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3290 scan += UTF8SKIP(scan);
3295 while (scan < loceol && REGINCLASS(p, *scan))
3299 while (scan < loceol && isALNUM(*scan))
3304 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3305 scan += UTF8SKIP(scan);
3310 PL_reg_flags |= RF_tainted;
3311 while (scan < loceol && isALNUM_LC(*scan))
3315 PL_reg_flags |= RF_tainted;
3317 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3318 scan += UTF8SKIP(scan);
3324 while (scan < loceol && !isALNUM(*scan))
3329 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3330 scan += UTF8SKIP(scan);
3335 PL_reg_flags |= RF_tainted;
3336 while (scan < loceol && !isALNUM_LC(*scan))
3340 PL_reg_flags |= RF_tainted;
3342 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3343 scan += UTF8SKIP(scan);
3348 while (scan < loceol && isSPACE(*scan))
3353 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3354 scan += UTF8SKIP(scan);
3359 PL_reg_flags |= RF_tainted;
3360 while (scan < loceol && isSPACE_LC(*scan))
3364 PL_reg_flags |= RF_tainted;
3366 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3367 scan += UTF8SKIP(scan);
3372 while (scan < loceol && !isSPACE(*scan))
3377 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3378 scan += UTF8SKIP(scan);
3383 PL_reg_flags |= RF_tainted;
3384 while (scan < loceol && !isSPACE_LC(*scan))
3388 PL_reg_flags |= RF_tainted;
3390 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3391 scan += UTF8SKIP(scan);
3396 while (scan < loceol && isDIGIT(*scan))
3401 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3402 scan += UTF8SKIP(scan);
3408 while (scan < loceol && !isDIGIT(*scan))
3413 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3414 scan += UTF8SKIP(scan);
3418 default: /* Called on something of 0 width. */
3419 break; /* So match right here or not at all. */
3425 c = scan - PL_reginput;
3430 SV *prop = sv_newmortal();
3433 PerlIO_printf(Perl_debug_log,
3434 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3435 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3442 - regrepeat_hard - repeatedly match something, report total lenth and length
3444 * The repeater is supposed to have constant length.
3448 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3451 register char *scan;
3452 register char *start;
3453 register char *loceol = PL_regeol;
3455 I32 count = 0, res = 1;
3460 start = PL_reginput;
3462 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3465 while (start < PL_reginput) {
3467 start += UTF8SKIP(start);
3478 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3480 *lp = l = PL_reginput - start;
3481 if (max != REG_INFTY && l*max < loceol - scan)
3482 loceol = scan + l*max;
3495 - reginclass - determine if a character falls into a character class
3499 S_reginclass(pTHX_ register regnode *p, register I32 c)
3502 char flags = ANYOF_FLAGS(p);
3506 if (ANYOF_BITMAP_TEST(p, c))
3508 else if (flags & ANYOF_FOLD) {
3510 if (flags & ANYOF_LOCALE) {
3511 PL_reg_flags |= RF_tainted;
3512 cf = PL_fold_locale[c];
3516 if (ANYOF_BITMAP_TEST(p, cf))
3520 if (!match && (flags & ANYOF_CLASS)) {
3521 PL_reg_flags |= RF_tainted;
3523 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3524 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3525 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3526 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3527 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3528 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3529 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3530 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3531 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3532 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3533 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3534 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3535 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3536 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3537 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3538 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3539 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3540 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3541 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3542 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3543 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3544 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3545 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3546 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3547 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3548 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3549 ) /* How's that for a conditional? */
3555 return (flags & ANYOF_INVERT) ? !match : match;
3559 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3562 char flags = ARG1(f);
3564 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3566 if (swash_fetch(sv, p))
3568 else if (flags & ANYOF_FOLD) {
3571 if (flags & ANYOF_LOCALE) {
3572 PL_reg_flags |= RF_tainted;
3573 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3576 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3577 if (swash_fetch(sv, tmpbuf))
3581 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3583 return (flags & ANYOF_INVERT) ? !match : match;
3587 S_reghop(pTHX_ U8 *s, I32 off)
3591 while (off-- && s < (U8*)PL_regeol)
3596 if (s > (U8*)PL_bostr) {
3599 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3601 } /* XXX could check well-formedness here */
3609 S_reghopmaybe(pTHX_ U8* s, I32 off)
3613 while (off-- && s < (U8*)PL_regeol)
3620 if (s > (U8*)PL_bostr) {
3623 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3625 } /* XXX could check well-formedness here */
3641 restore_pos(pTHXo_ void *arg)
3644 if (PL_reg_eval_set) {
3645 if (PL_reg_oldsaved) {
3646 PL_reg_re->subbeg = PL_reg_oldsaved;
3647 PL_reg_re->sublen = PL_reg_oldsavedlen;
3648 RX_MATCH_COPIED_on(PL_reg_re);
3650 PL_reg_magic->mg_len = PL_reg_oldpos;
3651 PL_reg_eval_set = 0;
3652 PL_curpm = PL_reg_oldcurpm;