5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
43 # define PERL_NO_GET_CONTEXT
48 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 * Copyright (c) 1986 by University of Toronto.
51 * Written by Henry Spencer. Not derived from licensed software.
53 * Permission is granted to anyone to use this software for any
54 * purpose on any computer system, and to redistribute it freely,
55 * subject to the following restrictions:
57 * 1. The author is not responsible for the consequences of use of
58 * this software, no matter how awful, even if they arise
61 * 2. The origin of this software must not be misrepresented, either
62 * by explicit claim or by omission.
64 * 3. Altered versions must be plainly marked as such, and must not
65 * be misrepresented as being the original software.
67 **** Alterations to Henry's code are...
69 **** Copyright (c) 1991-1999, Larry Wall
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
83 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
90 #define RF_tainted 1 /* tainted information used? */
91 #define RF_warned 2 /* warned about big count? */
92 #define RF_evaled 4 /* Did an EVAL with setting? */
93 #define RF_utf8 8 /* String contains multibyte chars? */
95 #define UTF (PL_reg_flags & RF_utf8)
97 #define RS_init 1 /* eval environment created */
98 #define RS_set 2 /* replsv value is set */
101 #define STATIC static
108 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
109 #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
111 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
112 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
114 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
115 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
116 #define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
117 #define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
118 #define HOPc(pos,off) ((char*)HOP(pos,off))
119 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
121 static void restore_pos(pTHXo_ void *arg);
125 S_regcppush(pTHX_ I32 parenfloor)
128 int retval = PL_savestack_ix;
129 int i = (PL_regsize - parenfloor) * 4;
133 for (p = PL_regsize; p > parenfloor; p--) {
134 SSPUSHINT(PL_regendp[p]);
135 SSPUSHINT(PL_regstartp[p]);
136 SSPUSHPTR(PL_reg_start_tmp[p]);
139 SSPUSHINT(PL_regsize);
140 SSPUSHINT(*PL_reglastparen);
141 SSPUSHPTR(PL_reginput);
143 SSPUSHINT(SAVEt_REGCONTEXT);
147 /* These are needed since we do not localize EVAL nodes: */
148 # define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, \
149 " Setting an EVAL scope, savestack=%"IVdf"\n", \
150 (IV)PL_savestack_ix)); lastcp = PL_savestack_ix
152 # define REGCP_UNWIND DEBUG_r(lastcp != PL_savestack_ix ? \
153 PerlIO_printf(Perl_debug_log, \
154 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
155 (IV)lastcp, (IV)PL_savestack_ix) : 0); regcpblow(lastcp)
165 assert(i == SAVEt_REGCONTEXT);
167 input = (char *) SSPOPPTR;
168 *PL_reglastparen = SSPOPINT;
169 PL_regsize = SSPOPINT;
170 for (i -= 3; i > 0; i -= 4) {
171 paren = (U32)SSPOPINT;
172 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
173 PL_regstartp[paren] = SSPOPINT;
175 if (paren <= *PL_reglastparen)
176 PL_regendp[paren] = tmps;
178 PerlIO_printf(Perl_debug_log,
179 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
180 (UV)paren, (IV)PL_regstartp[paren],
181 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
182 (IV)PL_regendp[paren],
183 (paren > *PL_reglastparen ? "(no)" : ""));
187 if (*PL_reglastparen + 1 <= PL_regnpar) {
188 PerlIO_printf(Perl_debug_log,
189 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
190 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
193 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
194 if (paren > PL_regsize)
195 PL_regstartp[paren] = -1;
196 PL_regendp[paren] = -1;
202 S_regcp_set_to(pTHX_ I32 ss)
205 I32 tmp = PL_savestack_ix;
207 PL_savestack_ix = ss;
209 PL_savestack_ix = tmp;
213 typedef struct re_cc_state
217 struct re_cc_state *prev;
222 #define regcpblow(cp) LEAVE_SCOPE(cp)
225 * pregexec and friends
229 - pregexec - match a regexp against a string
232 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
233 char *strbeg, I32 minend, SV *screamer, U32 nosave)
234 /* strend: pointer to null at end of string */
235 /* strbeg: real beginning of string */
236 /* minend: end of match must be >=minend after stringarg. */
237 /* nosave: For optimizations. */
240 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
241 nosave ? 0 : REXEC_COPY_STR);
245 S_cache_re(pTHX_ regexp *prog)
248 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
250 PL_regprogram = prog->program;
252 PL_regnpar = prog->nparens;
253 PL_regdata = prog->data;
258 * Need to implement the following flags for reg_anch:
260 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
262 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
263 * INTUIT_AUTORITATIVE_ML
264 * INTUIT_ONCE_NOML - Intuit can match in one location only.
267 * Another flag for this function: SECOND_TIME (so that float substrs
268 * with giant delta may be not rechecked).
271 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
273 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
274 Otherwise, only SvCUR(sv) is used to get strbeg. */
276 /* XXXX We assume that strpos is strbeg unless sv. */
278 /* XXXX Some places assume that there is a fixed substring.
279 An update may be needed if optimizer marks as "INTUITable"
280 RExen without fixed substrings. Similarly, it is assumed that
281 lengths of all the strings are no more than minlen, thus they
282 cannot come from lookahead.
283 (Or minlen should take into account lookahead.) */
285 /* A failure to find a constant substring means that there is no need to make
286 an expensive call to REx engine, thus we celebrate a failure. Similarly,
287 finding a substring too deep into the string means that less calls to
288 regtry() should be needed.
290 REx compiler's optimizer found 4 possible hints:
291 a) Anchored substring;
293 c) Whether we are anchored (beginning-of-line or \G);
294 d) First node (of those at offset 0) which may distingush positions;
295 We use a)b)d) and multiline-part of c), and try to find a position in the
296 string which does not contradict any of them.
299 /* Most of decisions we do here should have been done at compile time.
300 The nodes of the REx which we used for the search should have been
301 deleted from the finite automaton. */
304 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
305 char *strend, U32 flags, re_scream_pos_data *data)
307 register I32 start_shift;
308 /* Should be nonnegative! */
309 register I32 end_shift;
315 register char *other_last = Nullch; /* other substr checked before this */
316 char *check_at; /* check substr found at this pos */
318 char *i_strpos = strpos;
321 DEBUG_r( if (!PL_colorset) reginitcolors() );
322 DEBUG_r(PerlIO_printf(Perl_debug_log,
323 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
324 PL_colors[4],PL_colors[5],PL_colors[0],
327 (strlen(prog->precomp) > 60 ? "..." : ""),
329 (int)(strend - strpos > 60 ? 60 : strend - strpos),
330 strpos, PL_colors[1],
331 (strend - strpos > 60 ? "..." : ""))
334 if (prog->minlen > strend - strpos) {
335 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
338 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
339 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
340 || ( (prog->reganch & ROPT_ANCH_BOL)
341 && !PL_multiline ) ); /* Check after \n? */
343 if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) {
344 /* Substring at constant offset from beg-of-str... */
347 if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */
348 && (sv && (strpos + SvCUR(sv) != strend)) ) {
349 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
352 PL_regeol = strend; /* Used in HOP() */
353 s = HOPc(strpos, prog->check_offset_min);
354 if (SvTAIL(prog->check_substr)) {
355 slen = SvCUR(prog->check_substr); /* >= 1 */
357 if ( strend - s > slen || strend - s < slen - 1
358 || (strend - s == slen && strend[-1] != '\n')) {
359 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
362 /* Now should match s[0..slen-2] */
364 if (slen && (*SvPVX(prog->check_substr) != *s
366 && memNE(SvPVX(prog->check_substr), s, slen)))) {
368 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
372 else if (*SvPVX(prog->check_substr) != *s
373 || ((slen = SvCUR(prog->check_substr)) > 1
374 && memNE(SvPVX(prog->check_substr), s, slen)))
376 goto success_at_start;
378 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
380 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
381 /* Should be nonnegative! */
382 end_shift = prog->minlen - start_shift -
383 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
385 I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr)
386 - (SvTAIL(prog->check_substr) != 0);
387 I32 eshift = strend - s - end;
389 if (end_shift < eshift)
393 else { /* Can match at random position */
396 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
397 /* Should be nonnegative! */
398 end_shift = prog->minlen - start_shift -
399 CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0);
402 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
404 Perl_croak(aTHX_ "panic: end_shift");
407 check = prog->check_substr;
409 /* Find a possible match in the region s..strend by looking for
410 the "check" substring in the region corrected by start/end_shift. */
411 if (flags & REXEC_SCREAM) {
412 char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */
413 I32 p = -1; /* Internal iterator of scream. */
414 I32 *pp = data ? data->scream_pos : &p;
416 if (PL_screamfirst[BmRARE(check)] >= 0
417 || ( BmRARE(check) == '\n'
418 && (BmPREVIOUS(check) == SvCUR(check) - 1)
420 s = screaminstr(sv, check,
421 start_shift + (s - strbeg), end_shift, pp, 0);
425 *data->scream_olds = s;
428 s = fbm_instr((unsigned char*)s + start_shift,
429 (unsigned char*)strend - end_shift,
430 check, PL_multiline ? FBMrf_MULTILINE : 0);
432 /* Update the count-of-usability, remove useless subpatterns,
435 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
436 (s ? "Found" : "Did not find"),
437 ((check == prog->anchored_substr) ? "anchored" : "floating"),
439 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
441 PL_colors[1], (SvTAIL(check) ? "$" : ""),
442 (s ? " at offset " : "...\n") ) );
449 /* Finish the diagnostic message */
450 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
452 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
453 Start with the other substr.
454 XXXX no SCREAM optimization yet - and a very coarse implementation
455 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
456 *always* match. Probably should be marked during compile...
457 Probably it is right to do no SCREAM here...
460 if (prog->float_substr && prog->anchored_substr) {
461 /* Take into account the "other" substring. */
462 /* XXXX May be hopelessly wrong for UTF... */
465 if (check == prog->float_substr) {
468 char *last = s - start_shift, *last1, *last2;
472 t = s - prog->check_offset_max;
473 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
474 && (!(prog->reganch & ROPT_UTF8)
475 || (PL_bostr = strpos, /* Used in regcopmaybe() */
476 (t = reghopmaybe_c(s, -(prog->check_offset_max)))
481 t += prog->anchored_offset;
482 if (t < other_last) /* These positions already checked */
485 last2 = last1 = strend - prog->minlen;
488 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
489 /* On end-of-str: see comment below. */
490 s = fbm_instr((unsigned char*)t,
491 (unsigned char*)last1 + prog->anchored_offset
492 + SvCUR(prog->anchored_substr)
493 - (SvTAIL(prog->anchored_substr)!=0),
494 prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
495 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
496 (s ? "Found" : "Contradicts"),
498 (int)(SvCUR(prog->anchored_substr)
499 - (SvTAIL(prog->anchored_substr)!=0)),
500 SvPVX(prog->anchored_substr),
501 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
503 if (last1 >= last2) {
504 DEBUG_r(PerlIO_printf(Perl_debug_log,
505 ", giving up...\n"));
508 DEBUG_r(PerlIO_printf(Perl_debug_log,
509 ", trying floating at offset %ld...\n",
510 (long)(s1 + 1 - i_strpos)));
511 PL_regeol = strend; /* Used in HOP() */
512 other_last = last1 + prog->anchored_offset + 1;
517 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
518 (long)(s - i_strpos)));
519 t = s - prog->anchored_offset;
528 else { /* Take into account the floating substring. */
533 last1 = last = strend - prog->minlen + prog->float_min_offset;
534 if (last - t > prog->float_max_offset)
535 last = t + prog->float_max_offset;
536 s = t + prog->float_min_offset;
539 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
540 /* fbm_instr() takes into account exact value of end-of-str
541 if the check is SvTAIL(ed). Since false positives are OK,
542 and end-of-str is not later than strend we are OK. */
543 s = fbm_instr((unsigned char*)s,
544 (unsigned char*)last + SvCUR(prog->float_substr)
545 - (SvTAIL(prog->float_substr)!=0),
546 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
547 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
548 (s ? "Found" : "Contradicts"),
550 (int)(SvCUR(prog->float_substr)
551 - (SvTAIL(prog->float_substr)!=0)),
552 SvPVX(prog->float_substr),
553 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
556 DEBUG_r(PerlIO_printf(Perl_debug_log,
557 ", giving up...\n"));
560 DEBUG_r(PerlIO_printf(Perl_debug_log,
561 ", trying anchored starting at offset %ld...\n",
562 (long)(s1 + 1 - i_strpos)));
563 other_last = last + 1;
564 PL_regeol = strend; /* Used in HOP() */
569 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
570 (long)(s - i_strpos)));
580 t = s - prog->check_offset_max;
582 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
583 && (!(prog->reganch & ROPT_UTF8)
584 || (PL_bostr = strpos, /* Used in regcopmaybe() */
585 ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
588 /* Fixed substring is found far enough so that the match
589 cannot start at strpos. */
591 if (ml_anch && t[-1] != '\n') {
592 /* Eventually fbm_*() should handle this, but often
593 anchored_offset is not 0, so this check will not be wasted. */
594 /* XXXX In the code below we prefer to look for "^" even in
595 presence of anchored substrings. And we search even
596 beyond the found float position. These pessimizations
597 are historical artefacts only. */
599 while (t < strend - prog->minlen) {
601 if (t < s - prog->check_offset_min) {
602 if (prog->anchored_substr) {
603 /* We definitely contradict the found anchored
604 substr. Due to the above check we do not
605 contradict "check" substr.
606 Thus we can arrive here only if check substr
607 is float. Redo checking for "other"=="fixed".
610 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
611 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
612 goto do_other_anchored;
615 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
616 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
619 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n",
620 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
626 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
627 PL_colors[0],PL_colors[1]));
632 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
636 /* The found string does not prohibit matching at beg-of-str
637 - no optimization of calling REx engine can be performed,
638 unless it was an MBOL and we are not after MBOL. */
640 /* Even in this situation we may use MBOL flag if strpos is offset
641 wrt the start of the string. */
643 && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') {
647 DEBUG_r( if (ml_anch)
648 PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n",
649 PL_colors[0],PL_colors[1]);
652 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
653 && --BmUSEFUL(prog->check_substr) < 0
654 && prog->check_substr == prog->float_substr) { /* boo */
655 /* If flags & SOMETHING - do not do it many times on the same match */
656 SvREFCNT_dec(prog->check_substr);
657 prog->check_substr = Nullsv; /* disable */
658 prog->float_substr = Nullsv; /* clear */
660 /* XXXX This is a remnant of the old implementation. It
661 looks wasteful, since now INTUIT can use many
663 prog->reganch &= ~RE_USE_INTUIT;
670 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
671 if (prog->regstclass) {
672 /* minlen == 0 is possible if regstclass is \b or \B,
673 and the fixed substr is ''$.
674 Since minlen is already taken into account, s+1 is before strend;
675 accidentally, minlen >= 1 guaranties no false positives at s + 1
676 even for \b or \B. But (minlen? 1 : 0) below assumes that
677 regstclass does not come from lookahead... */
678 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
679 This leaves EXACTF only, which is dealt with in find_byclass(). */
680 char *endpos = (prog->anchored_substr || ml_anch)
681 ? s + (prog->minlen? 1 : 0)
682 : (prog->float_substr ? check_at - start_shift + 1
684 char *startpos = sv ? strend - SvCUR(sv) : s;
687 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
692 if (endpos == strend) {
693 DEBUG_r( PerlIO_printf(Perl_debug_log,
694 "Could not match STCLASS...\n") );
697 /* Contradict one of substrings */
698 if (prog->anchored_substr) {
699 DEBUG_r( PerlIO_printf(Perl_debug_log,
700 "This position contradicts STCLASS...\n") );
701 if (prog->anchored_substr == check) {
702 DEBUG_r( what = "anchored" );
704 PL_regeol = strend; /* Used in HOP() */
706 DEBUG_r( PerlIO_printf(Perl_debug_log,
707 "trying %s substr starting at offset %ld...\n",
708 what, (long)(s + start_shift - i_strpos)) );
711 /* Have both, check is floating */
712 if (t + start_shift >= check_at) /* Contradicts floating=check */
713 goto retry_floating_check;
714 /* Recheck anchored substring, but not floating... */
716 DEBUG_r( PerlIO_printf(Perl_debug_log,
717 "trying anchored substr starting at offset %ld...\n",
718 (long)(other_last - i_strpos)) );
719 goto do_other_anchored;
721 /* Check is floating subtring. */
722 retry_floating_check:
723 t = check_at - start_shift;
724 DEBUG_r( what = "floating" );
725 goto hop_and_restart;
728 PerlIO_printf(Perl_debug_log,
729 "By STCLASS: moving %ld --> %ld\n",
730 (long)(t - i_strpos), (long)(s - i_strpos));
732 PerlIO_printf(Perl_debug_log,
733 "Does not contradict STCLASS...\n") );
735 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n",
736 PL_colors[4], PL_colors[5], (long)(s - i_strpos)) );
739 fail_finish: /* Substring not found */
740 BmUSEFUL(prog->check_substr) += 5; /* hooray */
742 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
743 PL_colors[4],PL_colors[5]));
747 /* We know what class REx starts with. Try to find this position... */
749 find_byclass(regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
751 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
757 register I32 tmp = 1; /* Scratch variable? */
759 /* We know what class it must start with. */
763 if (REGINCLASSUTF8(c, (U8*)s)) {
764 if (tmp && (norun || regtry(prog, s)))
776 if (REGINCLASS(c, *s)) {
777 if (tmp && (norun || regtry(prog, s)))
797 c2 = PL_fold_locale[c1];
802 e = s; /* Due to minlen logic of intuit() */
803 /* Here it is NOT UTF! */
807 && (ln == 1 || (OP(c) == EXACTF
809 : ibcmp_locale(s, m, ln)))
810 && (norun || regtry(prog, s)) )
816 if ( (*s == c1 || *s == c2)
817 && (ln == 1 || (OP(c) == EXACTF
819 : ibcmp_locale(s, m, ln)))
820 && (norun || regtry(prog, s)) )
827 PL_reg_flags |= RF_tainted;
830 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
831 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
833 if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
835 if ((norun || regtry(prog, s)))
840 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
844 PL_reg_flags |= RF_tainted;
847 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
848 tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
850 if (tmp == !(OP(c) == BOUND ?
851 swash_fetch(PL_utf8_alnum, (U8*)s) :
852 isALNUM_LC_utf8((U8*)s)))
855 if ((norun || regtry(prog, s)))
860 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
864 PL_reg_flags |= RF_tainted;
867 tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
868 tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
870 if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
872 else if ((norun || regtry(prog, s)))
876 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
880 PL_reg_flags |= RF_tainted;
884 strend = reghop_c(strend, -1);
885 tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
886 tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
888 if (tmp == !(OP(c) == NBOUND ?
889 swash_fetch(PL_utf8_alnum, (U8*)s) :
890 isALNUM_LC_utf8((U8*)s)))
892 else if ((norun || regtry(prog, s)))
896 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
902 if (tmp && (norun || regtry(prog, s)))
914 if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
915 if (tmp && (norun || regtry(prog, s)))
926 PL_reg_flags |= RF_tainted;
928 if (isALNUM_LC(*s)) {
929 if (tmp && (norun || regtry(prog, s)))
940 PL_reg_flags |= RF_tainted;
942 if (isALNUM_LC_utf8((U8*)s)) {
943 if (tmp && (norun || regtry(prog, s)))
956 if (tmp && (norun || regtry(prog, s)))
968 if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
969 if (tmp && (norun || regtry(prog, s)))
980 PL_reg_flags |= RF_tainted;
982 if (!isALNUM_LC(*s)) {
983 if (tmp && (norun || regtry(prog, s)))
994 PL_reg_flags |= RF_tainted;
996 if (!isALNUM_LC_utf8((U8*)s)) {
997 if (tmp && (norun || regtry(prog, s)))
1008 while (s < strend) {
1010 if (tmp && (norun || regtry(prog, s)))
1021 while (s < strend) {
1022 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
1023 if (tmp && (norun || regtry(prog, s)))
1034 PL_reg_flags |= RF_tainted;
1035 while (s < strend) {
1036 if (isSPACE_LC(*s)) {
1037 if (tmp && (norun || regtry(prog, s)))
1048 PL_reg_flags |= RF_tainted;
1049 while (s < strend) {
1050 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1051 if (tmp && (norun || regtry(prog, s)))
1062 while (s < strend) {
1064 if (tmp && (norun || regtry(prog, s)))
1075 while (s < strend) {
1076 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
1077 if (tmp && (norun || regtry(prog, s)))
1088 PL_reg_flags |= RF_tainted;
1089 while (s < strend) {
1090 if (!isSPACE_LC(*s)) {
1091 if (tmp && (norun || regtry(prog, s)))
1102 PL_reg_flags |= RF_tainted;
1103 while (s < strend) {
1104 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1105 if (tmp && (norun || regtry(prog, s)))
1116 while (s < strend) {
1118 if (tmp && (norun || regtry(prog, s)))
1129 while (s < strend) {
1130 if (swash_fetch(PL_utf8_digit,(U8*)s)) {
1131 if (tmp && (norun || regtry(prog, s)))
1142 PL_reg_flags |= RF_tainted;
1143 while (s < strend) {
1144 if (isDIGIT_LC(*s)) {
1145 if (tmp && (norun || regtry(prog, s)))
1156 PL_reg_flags |= RF_tainted;
1157 while (s < strend) {
1158 if (isDIGIT_LC_utf8((U8*)s)) {
1159 if (tmp && (norun || regtry(prog, s)))
1170 while (s < strend) {
1172 if (tmp && (norun || regtry(prog, s)))
1183 while (s < strend) {
1184 if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
1185 if (tmp && (norun || regtry(prog, s)))
1196 PL_reg_flags |= RF_tainted;
1197 while (s < strend) {
1198 if (!isDIGIT_LC(*s)) {
1199 if (tmp && (norun || regtry(prog, s)))
1210 PL_reg_flags |= RF_tainted;
1211 while (s < strend) {
1212 if (!isDIGIT_LC_utf8((U8*)s)) {
1213 if (tmp && (norun || regtry(prog, s)))
1226 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1236 - regexec_flags - match a regexp against a string
1239 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1240 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1241 /* strend: pointer to null at end of string */
1242 /* strbeg: real beginning of string */
1243 /* minend: end of match must be >=minend after stringarg. */
1244 /* data: May be used for some additional optimizations. */
1245 /* nosave: For optimizations. */
1249 register regnode *c;
1250 register char *startpos = stringarg;
1252 I32 minlen; /* must match at least this many chars */
1253 I32 dontbother = 0; /* how many characters not to try at end */
1254 I32 start_shift = 0; /* Offset of the start to find
1255 constant substr. */ /* CC */
1256 I32 end_shift = 0; /* Same for the end. */ /* CC */
1257 I32 scream_pos = -1; /* Internal iterator of scream. */
1259 SV* oreplsv = GvSV(PL_replgv);
1265 PL_regnarrate = PL_debug & 512;
1268 /* Be paranoid... */
1269 if (prog == NULL || startpos == NULL) {
1270 Perl_croak(aTHX_ "NULL regexp parameter");
1274 minlen = prog->minlen;
1275 if (strend - startpos < minlen) goto phooey;
1277 if (startpos == strbeg) /* is ^ valid at stringarg? */
1280 PL_regprev = (U32)stringarg[-1];
1281 if (!PL_multiline && PL_regprev == '\n')
1282 PL_regprev = '\0'; /* force ^ to NOT match */
1285 /* Check validity of program. */
1286 if (UCHARAT(prog->program) != REG_MAGIC) {
1287 Perl_croak(aTHX_ "corrupted regexp program");
1291 PL_reg_eval_set = 0;
1294 if (prog->reganch & ROPT_UTF8)
1295 PL_reg_flags |= RF_utf8;
1297 /* Mark beginning of line for ^ and lookbehind. */
1298 PL_regbol = startpos;
1302 /* Mark end of line for $ (and such) */
1305 /* see how far we have to get to not match where we matched before */
1306 PL_regtill = startpos+minend;
1308 /* We start without call_cc context. */
1311 /* If there is a "must appear" string, look for it. */
1314 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1317 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1318 PL_reg_ganch = startpos;
1319 else if (sv && SvTYPE(sv) >= SVt_PVMG
1321 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1322 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1323 if (prog->reganch & ROPT_ANCH_GPOS) {
1324 if (s > PL_reg_ganch)
1329 else /* pos() not defined */
1330 PL_reg_ganch = strbeg;
1333 if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1334 re_scream_pos_data d;
1336 d.scream_olds = &scream_olds;
1337 d.scream_pos = &scream_pos;
1338 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1340 goto phooey; /* not present */
1343 DEBUG_r( if (!PL_colorset) reginitcolors() );
1344 DEBUG_r(PerlIO_printf(Perl_debug_log,
1345 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1346 PL_colors[4],PL_colors[5],PL_colors[0],
1349 (strlen(prog->precomp) > 60 ? "..." : ""),
1351 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1352 startpos, PL_colors[1],
1353 (strend - startpos > 60 ? "..." : ""))
1356 /* Simplest case: anchored match need be tried only once. */
1357 /* [unless only anchor is BOL and multiline is set] */
1358 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1359 if (s == startpos && regtry(prog, startpos))
1361 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1362 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1367 dontbother = minlen - 1;
1368 end = HOPc(strend, -dontbother) - 1;
1369 /* for multiline we only have to try after newlines */
1370 if (prog->check_substr) {
1374 if (regtry(prog, s))
1379 if (prog->reganch & RE_USE_INTUIT) {
1380 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1391 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1392 if (regtry(prog, s))
1399 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1400 if (regtry(prog, PL_reg_ganch))
1405 /* Messy cases: unanchored match. */
1406 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1407 /* we have /x+whatever/ */
1408 /* it must be a one character string (XXXX Except UTF?) */
1409 char ch = SvPVX(prog->anchored_substr)[0];
1411 while (s < strend) {
1413 if (regtry(prog, s)) goto got_it;
1415 while (s < strend && *s == ch)
1422 while (s < strend) {
1424 if (regtry(prog, s)) goto got_it;
1426 while (s < strend && *s == ch)
1434 else if (prog->anchored_substr != Nullsv
1435 || (prog->float_substr != Nullsv
1436 && prog->float_max_offset < strend - s)) {
1437 SV *must = prog->anchored_substr
1438 ? prog->anchored_substr : prog->float_substr;
1440 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1442 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1443 I32 delta = back_max - back_min;
1444 char *last = HOPc(strend, /* Cannot start after this */
1445 -(I32)(CHR_SVLEN(must)
1446 - (SvTAIL(must) != 0) + back_min));
1447 char *last1; /* Last position checked before */
1450 last1 = HOPc(s, -1);
1452 last1 = s - 1; /* bogus */
1454 /* XXXX check_substr already used to find `s', can optimize if
1455 check_substr==must. */
1457 dontbother = end_shift;
1458 strend = HOPc(strend, -dontbother);
1459 while ( (s <= last) &&
1460 ((flags & REXEC_SCREAM)
1461 ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
1462 end_shift, &scream_pos, 0))
1463 : (s = fbm_instr((unsigned char*)HOP(s, back_min),
1464 (unsigned char*)strend, must,
1465 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1466 if (HOPc(s, -back_max) > last1) {
1467 last1 = HOPc(s, -back_min);
1468 s = HOPc(s, -back_max);
1471 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1473 last1 = HOPc(s, -back_min);
1477 while (s <= last1) {
1478 if (regtry(prog, s))
1484 while (s <= last1) {
1485 if (regtry(prog, s))
1493 else if (c = prog->regstclass) {
1494 if (minlen) /* don't bother with what can't match */
1495 strend = HOPc(strend, -(minlen - 1));
1496 if (find_byclass(prog, c, s, strend, startpos, 0))
1501 if (prog->float_substr != Nullsv) { /* Trim the end. */
1503 I32 oldpos = scream_pos;
1505 if (flags & REXEC_SCREAM) {
1506 last = screaminstr(sv, prog->float_substr, s - strbeg,
1507 end_shift, &scream_pos, 1); /* last one */
1509 last = scream_olds; /* Only one occurence. */
1513 char *little = SvPV(prog->float_substr, len);
1515 if (SvTAIL(prog->float_substr)) {
1516 if (memEQ(strend - len + 1, little, len - 1))
1517 last = strend - len + 1;
1518 else if (!PL_multiline)
1519 last = memEQ(strend - len, little, len)
1520 ? strend - len : Nullch;
1526 last = rninstr(s, strend, little, little + len);
1528 last = strend; /* matching `$' */
1531 if (last == NULL) goto phooey; /* Should not happen! */
1532 dontbother = strend - last + prog->float_min_offset;
1534 if (minlen && (dontbother < minlen))
1535 dontbother = minlen - 1;
1536 strend -= dontbother; /* this one's always in bytes! */
1537 /* We don't know much -- general case. */
1540 if (regtry(prog, s))
1549 if (regtry(prog, s))
1551 } while (s++ < strend);
1559 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1561 if (PL_reg_eval_set) {
1562 /* Preserve the current value of $^R */
1563 if (oreplsv != GvSV(PL_replgv))
1564 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1565 restored, the value remains
1567 restore_pos(aTHXo_ 0);
1570 /* make sure $`, $&, $', and $digit will work later */
1571 if ( !(flags & REXEC_NOT_FIRST) ) {
1572 if (RX_MATCH_COPIED(prog)) {
1573 Safefree(prog->subbeg);
1574 RX_MATCH_COPIED_off(prog);
1576 if (flags & REXEC_COPY_STR) {
1577 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1579 s = savepvn(strbeg, i);
1582 RX_MATCH_COPIED_on(prog);
1585 prog->subbeg = strbeg;
1586 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1593 if (PL_reg_eval_set)
1594 restore_pos(aTHXo_ 0);
1599 - regtry - try match at specific point
1601 STATIC I32 /* 0 failure, 1 success */
1602 S_regtry(pTHX_ regexp *prog, char *startpos)
1610 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1613 PL_reg_eval_set = RS_init;
1615 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1616 (IV)(PL_stack_sp - PL_stack_base));
1618 SAVEINT(cxstack[cxstack_ix].blk_oldsp);
1619 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1620 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1622 /* Apparently this is not needed, judging by wantarray. */
1623 /* SAVEINT(cxstack[cxstack_ix].blk_gimme);
1624 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1627 /* Make $_ available to executed code. */
1628 if (PL_reg_sv != DEFSV) {
1629 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1634 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1635 && (mg = mg_find(PL_reg_sv, 'g')))) {
1636 /* prepare for quick setting of pos */
1637 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1638 mg = mg_find(PL_reg_sv, 'g');
1642 PL_reg_oldpos = mg->mg_len;
1643 SAVEDESTRUCTOR_X(restore_pos, 0);
1646 New(22,PL_reg_curpm, 1, PMOP);
1647 PL_reg_curpm->op_pmregexp = prog;
1648 PL_reg_oldcurpm = PL_curpm;
1649 PL_curpm = PL_reg_curpm;
1650 if (RX_MATCH_COPIED(prog)) {
1651 /* Here is a serious problem: we cannot rewrite subbeg,
1652 since it may be needed if this match fails. Thus
1653 $` inside (?{}) could fail... */
1654 PL_reg_oldsaved = prog->subbeg;
1655 PL_reg_oldsavedlen = prog->sublen;
1656 RX_MATCH_COPIED_off(prog);
1659 PL_reg_oldsaved = Nullch;
1660 prog->subbeg = PL_bostr;
1661 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1663 prog->startp[0] = startpos - PL_bostr;
1664 PL_reginput = startpos;
1665 PL_regstartp = prog->startp;
1666 PL_regendp = prog->endp;
1667 PL_reglastparen = &prog->lastparen;
1668 prog->lastparen = 0;
1670 DEBUG_r(PL_reg_starttry = startpos);
1671 if (PL_reg_start_tmpl <= prog->nparens) {
1672 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1673 if(PL_reg_start_tmp)
1674 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1676 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1679 /* XXXX What this code is doing here?!!! There should be no need
1680 to do this again and again, PL_reglastparen should take care of
1684 if (prog->nparens) {
1685 for (i = prog->nparens; i >= 1; i--) {
1691 if (regmatch(prog->program + 1)) {
1692 prog->endp[0] = PL_reginput - PL_bostr;
1700 - regmatch - main matching routine
1702 * Conceptually the strategy is simple: check to see whether the current
1703 * node matches, call self recursively to see whether the rest matches,
1704 * and then act accordingly. In practice we make some effort to avoid
1705 * recursion, in particular by going through "ordinary" nodes (that don't
1706 * need to know whether the rest of the match failed) by a loop instead of
1709 /* [lwall] I've hoisted the register declarations to the outer block in order to
1710 * maybe save a little bit of pushing and popping on the stack. It also takes
1711 * advantage of machines that use a register save mask on subroutine entry.
1713 STATIC I32 /* 0 failure, 1 success */
1714 S_regmatch(pTHX_ regnode *prog)
1717 register regnode *scan; /* Current node. */
1718 regnode *next; /* Next node. */
1719 regnode *inner; /* Next node in internal branch. */
1720 register I32 nextchr; /* renamed nextchr - nextchar colides with
1721 function of same name */
1722 register I32 n; /* no or next */
1723 register I32 ln; /* len or last */
1724 register char *s; /* operand or save */
1725 register char *locinput = PL_reginput;
1726 register I32 c1, c2, paren; /* case fold search, parenth */
1727 int minmod = 0, sw = 0, logical = 0;
1732 /* Note that nextchr is a byte even in UTF */
1733 nextchr = UCHARAT(locinput);
1735 while (scan != NULL) {
1736 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1738 # define sayYES goto yes
1739 # define sayNO goto no
1740 # define sayYES_FINAL goto yes_final
1741 # define sayYES_LOUD goto yes_loud
1742 # define sayNO_FINAL goto no_final
1743 # define sayNO_SILENT goto do_no
1744 # define saySAME(x) if (x) goto yes; else goto no
1745 # define REPORT_CODE_OFF 24
1747 # define sayYES return 1
1748 # define sayNO return 0
1749 # define sayYES_FINAL return 1
1750 # define sayYES_LOUD return 1
1751 # define sayNO_FINAL return 0
1752 # define sayNO_SILENT return 0
1753 # define saySAME(x) return x
1756 SV *prop = sv_newmortal();
1757 int docolor = *PL_colors[0];
1758 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1759 int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
1760 /* The part of the string before starttry has one color
1761 (pref0_len chars), between starttry and current
1762 position another one (pref_len - pref0_len chars),
1763 after the current position the third one.
1764 We assume that pref0_len <= pref_len, otherwise we
1765 decrease pref0_len. */
1766 int pref_len = (locinput - PL_bostr > (5 + taill) - l
1767 ? (5 + taill) - l : locinput - PL_bostr);
1768 int pref0_len = pref_len - (locinput - PL_reg_starttry);
1770 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
1771 l = ( PL_regeol - locinput > (5 + taill) - pref_len
1772 ? (5 + taill) - pref_len : PL_regeol - locinput);
1775 if (pref0_len > pref_len)
1776 pref0_len = pref_len;
1777 regprop(prop, scan);
1778 PerlIO_printf(Perl_debug_log,
1779 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
1780 (IV)(locinput - PL_bostr),
1781 PL_colors[4], pref0_len,
1782 locinput - pref_len, PL_colors[5],
1783 PL_colors[2], pref_len - pref0_len,
1784 locinput - pref_len + pref0_len, PL_colors[3],
1785 (docolor ? "" : "> <"),
1786 PL_colors[0], l, locinput, PL_colors[1],
1787 15 - l - pref_len + 1,
1789 (IV)(scan - PL_regprogram), PL_regindent*2, "",
1793 next = scan + NEXT_OFF(scan);
1799 if (locinput == PL_bostr
1800 ? PL_regprev == '\n'
1802 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1804 /* regtill = regbol; */
1809 if (locinput == PL_bostr
1810 ? PL_regprev == '\n'
1811 : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
1817 if (locinput == PL_regbol && PL_regprev == '\n')
1821 if (locinput == PL_reg_ganch)
1831 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1836 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
1838 if (PL_regeol - locinput > 1)
1842 if (PL_regeol != locinput)
1846 if (nextchr & 0x80) {
1847 locinput += PL_utf8skip[nextchr];
1848 if (locinput > PL_regeol)
1850 nextchr = UCHARAT(locinput);
1853 if (!nextchr && locinput >= PL_regeol)
1855 nextchr = UCHARAT(++locinput);
1858 if (!nextchr && locinput >= PL_regeol)
1860 nextchr = UCHARAT(++locinput);
1863 if (nextchr & 0x80) {
1864 locinput += PL_utf8skip[nextchr];
1865 if (locinput > PL_regeol)
1867 nextchr = UCHARAT(locinput);
1870 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1872 nextchr = UCHARAT(++locinput);
1875 if (!nextchr && locinput >= PL_regeol || nextchr == '\n')
1877 nextchr = UCHARAT(++locinput);
1882 /* Inline the first character, for speed. */
1883 if (UCHARAT(s) != nextchr)
1885 if (PL_regeol - locinput < ln)
1887 if (ln > 1 && memNE(s, locinput, ln))
1890 nextchr = UCHARAT(locinput);
1893 PL_reg_flags |= RF_tainted;
1902 c1 = OP(scan) == EXACTF;
1906 if (utf8_to_uv((U8*)s, 0) != (c1 ?
1907 toLOWER_utf8((U8*)l) :
1908 toLOWER_LC_utf8((U8*)l)))
1916 nextchr = UCHARAT(locinput);
1920 /* Inline the first character, for speed. */
1921 if (UCHARAT(s) != nextchr &&
1922 UCHARAT(s) != ((OP(scan) == EXACTF)
1923 ? PL_fold : PL_fold_locale)[nextchr])
1925 if (PL_regeol - locinput < ln)
1927 if (ln > 1 && (OP(scan) == EXACTF
1928 ? ibcmp(s, locinput, ln)
1929 : ibcmp_locale(s, locinput, ln)))
1932 nextchr = UCHARAT(locinput);
1935 if (!REGINCLASSUTF8(scan, (U8*)locinput))
1937 if (locinput >= PL_regeol)
1939 locinput += PL_utf8skip[nextchr];
1940 nextchr = UCHARAT(locinput);
1944 nextchr = UCHARAT(locinput);
1945 if (!REGINCLASS(scan, nextchr))
1947 if (!nextchr && locinput >= PL_regeol)
1949 nextchr = UCHARAT(++locinput);
1952 PL_reg_flags |= RF_tainted;
1957 if (!(OP(scan) == ALNUM
1958 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1960 nextchr = UCHARAT(++locinput);
1963 PL_reg_flags |= RF_tainted;
1968 if (nextchr & 0x80) {
1969 if (!(OP(scan) == ALNUMUTF8
1970 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
1971 : isALNUM_LC_utf8((U8*)locinput)))
1975 locinput += PL_utf8skip[nextchr];
1976 nextchr = UCHARAT(locinput);
1979 if (!(OP(scan) == ALNUMUTF8
1980 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
1982 nextchr = UCHARAT(++locinput);
1985 PL_reg_flags |= RF_tainted;
1988 if (!nextchr && locinput >= PL_regeol)
1990 if (OP(scan) == NALNUM
1991 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
1993 nextchr = UCHARAT(++locinput);
1996 PL_reg_flags |= RF_tainted;
1999 if (!nextchr && locinput >= PL_regeol)
2001 if (nextchr & 0x80) {
2002 if (OP(scan) == NALNUMUTF8
2003 ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
2004 : isALNUM_LC_utf8((U8*)locinput))
2008 locinput += PL_utf8skip[nextchr];
2009 nextchr = UCHARAT(locinput);
2012 if (OP(scan) == NALNUMUTF8
2013 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2015 nextchr = UCHARAT(++locinput);
2019 PL_reg_flags |= RF_tainted;
2023 /* was last char in word? */
2024 ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
2025 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2027 n = isALNUM(nextchr);
2030 ln = isALNUM_LC(ln);
2031 n = isALNUM_LC(nextchr);
2033 if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
2038 PL_reg_flags |= RF_tainted;
2042 /* was last char in word? */
2043 ln = (locinput != PL_regbol)
2044 ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
2045 if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
2046 ln = isALNUM_uni(ln);
2047 n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
2050 ln = isALNUM_LC_uni(ln);
2051 n = isALNUM_LC_utf8((U8*)locinput);
2053 if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
2057 PL_reg_flags |= RF_tainted;
2060 if (!nextchr && locinput >= PL_regeol)
2062 if (!(OP(scan) == SPACE
2063 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2065 nextchr = UCHARAT(++locinput);
2068 PL_reg_flags |= RF_tainted;
2071 if (!nextchr && locinput >= PL_regeol)
2073 if (nextchr & 0x80) {
2074 if (!(OP(scan) == SPACEUTF8
2075 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2076 : isSPACE_LC_utf8((U8*)locinput)))
2080 locinput += PL_utf8skip[nextchr];
2081 nextchr = UCHARAT(locinput);
2084 if (!(OP(scan) == SPACEUTF8
2085 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2087 nextchr = UCHARAT(++locinput);
2090 PL_reg_flags |= RF_tainted;
2095 if (OP(scan) == SPACE
2096 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2098 nextchr = UCHARAT(++locinput);
2101 PL_reg_flags |= RF_tainted;
2106 if (nextchr & 0x80) {
2107 if (OP(scan) == NSPACEUTF8
2108 ? swash_fetch(PL_utf8_space,(U8*)locinput)
2109 : isSPACE_LC_utf8((U8*)locinput))
2113 locinput += PL_utf8skip[nextchr];
2114 nextchr = UCHARAT(locinput);
2117 if (OP(scan) == NSPACEUTF8
2118 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2120 nextchr = UCHARAT(++locinput);
2123 PL_reg_flags |= RF_tainted;
2126 if (!nextchr && locinput >= PL_regeol)
2128 if (!(OP(scan) == DIGIT
2129 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2131 nextchr = UCHARAT(++locinput);
2134 PL_reg_flags |= RF_tainted;
2139 if (nextchr & 0x80) {
2140 if (OP(scan) == NDIGITUTF8
2141 ? swash_fetch(PL_utf8_digit,(U8*)locinput)
2142 : isDIGIT_LC_utf8((U8*)locinput))
2146 locinput += PL_utf8skip[nextchr];
2147 nextchr = UCHARAT(locinput);
2150 if (!isDIGIT(nextchr))
2152 nextchr = UCHARAT(++locinput);
2155 PL_reg_flags |= RF_tainted;
2160 if (OP(scan) == DIGIT
2161 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2163 nextchr = UCHARAT(++locinput);
2166 PL_reg_flags |= RF_tainted;
2169 if (!nextchr && locinput >= PL_regeol)
2171 if (nextchr & 0x80) {
2172 if (swash_fetch(PL_utf8_digit,(U8*)locinput))
2174 locinput += PL_utf8skip[nextchr];
2175 nextchr = UCHARAT(locinput);
2178 if (isDIGIT(nextchr))
2180 nextchr = UCHARAT(++locinput);
2183 if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
2185 locinput += PL_utf8skip[nextchr];
2186 while (locinput < PL_regeol && swash_fetch(PL_utf8_mark,(U8*)locinput))
2187 locinput += UTF8SKIP(locinput);
2188 if (locinput > PL_regeol)
2190 nextchr = UCHARAT(locinput);
2193 PL_reg_flags |= RF_tainted;
2197 n = ARG(scan); /* which paren pair */
2198 ln = PL_regstartp[n];
2199 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2200 if (*PL_reglastparen < n || ln == -1)
2201 sayNO; /* Do not match unless seen CLOSEn. */
2202 if (ln == PL_regendp[n])
2206 if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
2208 char *e = PL_bostr + PL_regendp[n];
2210 * Note that we can't do the "other character" lookup trick as
2211 * in the 8-bit case (no pun intended) because in Unicode we
2212 * have to map both upper and title case to lower case.
2214 if (OP(scan) == REFF) {
2218 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2228 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2235 nextchr = UCHARAT(locinput);
2239 /* Inline the first character, for speed. */
2240 if (UCHARAT(s) != nextchr &&
2242 (UCHARAT(s) != ((OP(scan) == REFF
2243 ? PL_fold : PL_fold_locale)[nextchr]))))
2245 ln = PL_regendp[n] - ln;
2246 if (locinput + ln > PL_regeol)
2248 if (ln > 1 && (OP(scan) == REF
2249 ? memNE(s, locinput, ln)
2251 ? ibcmp(s, locinput, ln)
2252 : ibcmp_locale(s, locinput, ln))))
2255 nextchr = UCHARAT(locinput);
2266 OP_4tree *oop = PL_op;
2267 COP *ocurcop = PL_curcop;
2268 SV **ocurpad = PL_curpad;
2272 PL_op = (OP_4tree*)PL_regdata->data[n];
2273 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2274 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2275 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2277 CALLRUNOPS(aTHX); /* Scalar context. */
2283 PL_curpad = ocurpad;
2284 PL_curcop = ocurcop;
2286 if (logical == 2) { /* Postponed subexpression. */
2288 MAGIC *mg = Null(MAGIC*);
2290 CHECKPOINT cp, lastcp;
2292 if(SvROK(ret) || SvRMAGICAL(ret)) {
2293 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2296 mg = mg_find(sv, 'r');
2299 re = (regexp *)mg->mg_obj;
2300 (void)ReREFCNT_inc(re);
2304 char *t = SvPV(ret, len);
2306 char *oprecomp = PL_regprecomp;
2307 I32 osize = PL_regsize;
2308 I32 onpar = PL_regnpar;
2311 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2313 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2314 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2315 PL_regprecomp = oprecomp;
2320 PerlIO_printf(Perl_debug_log,
2321 "Entering embedded `%s%.60s%s%s'\n",
2325 (strlen(re->precomp) > 60 ? "..." : ""))
2328 state.prev = PL_reg_call_cc;
2329 state.cc = PL_regcc;
2330 state.re = PL_reg_re;
2334 cp = regcppush(0); /* Save *all* the positions. */
2337 state.ss = PL_savestack_ix;
2338 *PL_reglastparen = 0;
2339 PL_reg_call_cc = &state;
2340 PL_reginput = locinput;
2342 /* XXXX This is too dramatic a measure... */
2345 if (regmatch(re->program + 1)) {
2346 /* Even though we succeeded, we need to restore
2347 global variables, since we may be wrapped inside
2348 SUSPEND, thus the match may be not finished yet. */
2350 /* XXXX Do this only if SUSPENDed? */
2351 PL_reg_call_cc = state.prev;
2352 PL_regcc = state.cc;
2353 PL_reg_re = state.re;
2354 cache_re(PL_reg_re);
2356 /* XXXX This is too dramatic a measure... */
2359 /* These are needed even if not SUSPEND. */
2367 PL_reg_call_cc = state.prev;
2368 PL_regcc = state.cc;
2369 PL_reg_re = state.re;
2370 cache_re(PL_reg_re);
2372 /* XXXX This is too dramatic a measure... */
2381 sv_setsv(save_scalar(PL_replgv), ret);
2385 n = ARG(scan); /* which paren pair */
2386 PL_reg_start_tmp[n] = locinput;
2391 n = ARG(scan); /* which paren pair */
2392 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2393 PL_regendp[n] = locinput - PL_bostr;
2394 if (n > *PL_reglastparen)
2395 *PL_reglastparen = n;
2398 n = ARG(scan); /* which paren pair */
2399 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2402 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2404 next = NEXTOPER(NEXTOPER(scan));
2406 next = scan + ARG(scan);
2407 if (OP(next) == IFTHEN) /* Fake one. */
2408 next = NEXTOPER(NEXTOPER(next));
2412 logical = scan->flags;
2414 /*******************************************************************
2415 PL_regcc contains infoblock about the innermost (...)* loop, and
2416 a pointer to the next outer infoblock.
2418 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2420 1) After matching X, regnode for CURLYX is processed;
2422 2) This regnode creates infoblock on the stack, and calls
2423 regmatch() recursively with the starting point at WHILEM node;
2425 3) Each hit of WHILEM node tries to match A and Z (in the order
2426 depending on the current iteration, min/max of {min,max} and
2427 greediness). The information about where are nodes for "A"
2428 and "Z" is read from the infoblock, as is info on how many times "A"
2429 was already matched, and greediness.
2431 4) After A matches, the same WHILEM node is hit again.
2433 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2434 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2435 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2436 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2437 of the external loop.
2439 Currently present infoblocks form a tree with a stem formed by PL_curcc
2440 and whatever it mentions via ->next, and additional attached trees
2441 corresponding to temporarily unset infoblocks as in "5" above.
2443 In the following picture infoblocks for outer loop of
2444 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2445 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2446 infoblocks are drawn below the "reset" infoblock.
2448 In fact in the picture below we do not show failed matches for Z and T
2449 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2450 more obvious *why* one needs to *temporary* unset infoblocks.]
2452 Matched REx position InfoBlocks Comment
2456 Y A)*?Z)*?T x <- O <- I
2457 YA )*?Z)*?T x <- O <- I
2458 YA A)*?Z)*?T x <- O <- I
2459 YAA )*?Z)*?T x <- O <- I
2460 YAA Z)*?T x <- O # Temporary unset I
2463 YAAZ Y(A)*?Z)*?T x <- O
2466 YAAZY (A)*?Z)*?T x <- O
2469 YAAZY A)*?Z)*?T x <- O <- I
2472 YAAZYA )*?Z)*?T x <- O <- I
2475 YAAZYA Z)*?T x <- O # Temporary unset I
2481 YAAZYAZ T x # Temporary unset O
2488 *******************************************************************/
2491 CHECKPOINT cp = PL_savestack_ix;
2493 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2495 cc.oldcc = PL_regcc;
2497 cc.parenfloor = *PL_reglastparen;
2499 cc.min = ARG1(scan);
2500 cc.max = ARG2(scan);
2501 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2505 PL_reginput = locinput;
2506 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2508 PL_regcc = cc.oldcc;
2514 * This is really hard to understand, because after we match
2515 * what we're trying to match, we must make sure the rest of
2516 * the REx is going to match for sure, and to do that we have
2517 * to go back UP the parse tree by recursing ever deeper. And
2518 * if it fails, we have to reset our parent's current state
2519 * that we can try again after backing off.
2522 CHECKPOINT cp, lastcp;
2523 CURCUR* cc = PL_regcc;
2524 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2526 n = cc->cur + 1; /* how many we know we matched */
2527 PL_reginput = locinput;
2530 PerlIO_printf(Perl_debug_log,
2531 "%*s %ld out of %ld..%ld cc=%lx\n",
2532 REPORT_CODE_OFF+PL_regindent*2, "",
2533 (long)n, (long)cc->min,
2534 (long)cc->max, (long)cc)
2537 /* If degenerate scan matches "", assume scan done. */
2539 if (locinput == cc->lastloc && n >= cc->min) {
2540 PL_regcc = cc->oldcc;
2544 PerlIO_printf(Perl_debug_log,
2545 "%*s empty match detected, try continuation...\n",
2546 REPORT_CODE_OFF+PL_regindent*2, "")
2548 if (regmatch(cc->next))
2556 /* First just match a string of min scans. */
2560 cc->lastloc = locinput;
2561 if (regmatch(cc->scan))
2564 cc->lastloc = lastloc;
2569 /* Check whether we already were at this position.
2570 Postpone detection until we know the match is not
2571 *that* much linear. */
2572 if (!PL_reg_maxiter) {
2573 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2574 PL_reg_leftiter = PL_reg_maxiter;
2576 if (PL_reg_leftiter-- == 0) {
2577 I32 size = (PL_reg_maxiter + 7)/8;
2578 if (PL_reg_poscache) {
2579 if (PL_reg_poscache_size < size) {
2580 Renew(PL_reg_poscache, size, char);
2581 PL_reg_poscache_size = size;
2583 Zero(PL_reg_poscache, size, char);
2586 PL_reg_poscache_size = size;
2587 Newz(29, PL_reg_poscache, size, char);
2590 PerlIO_printf(Perl_debug_log,
2591 "%sDetected a super-linear match, switching on caching%s...\n",
2592 PL_colors[4], PL_colors[5])
2595 if (PL_reg_leftiter < 0) {
2596 I32 o = locinput - PL_bostr, b;
2598 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2601 if (PL_reg_poscache[o] & (1<<b)) {
2603 PerlIO_printf(Perl_debug_log,
2604 "%*s already tried at this position...\n",
2605 REPORT_CODE_OFF+PL_regindent*2, "")
2609 PL_reg_poscache[o] |= (1<<b);
2613 /* Prefer next over scan for minimal matching. */
2616 PL_regcc = cc->oldcc;
2619 cp = regcppush(cc->parenfloor);
2621 if (regmatch(cc->next)) {
2623 sayYES; /* All done. */
2631 if (n >= cc->max) { /* Maximum greed exceeded? */
2632 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2633 && !(PL_reg_flags & RF_warned)) {
2634 PL_reg_flags |= RF_warned;
2635 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2636 "Complex regular subexpression recursion",
2643 PerlIO_printf(Perl_debug_log,
2644 "%*s trying longer...\n",
2645 REPORT_CODE_OFF+PL_regindent*2, "")
2647 /* Try scanning more and see if it helps. */
2648 PL_reginput = locinput;
2650 cc->lastloc = locinput;
2651 cp = regcppush(cc->parenfloor);
2653 if (regmatch(cc->scan)) {
2660 cc->lastloc = lastloc;
2664 /* Prefer scan over next for maximal matching. */
2666 if (n < cc->max) { /* More greed allowed? */
2667 cp = regcppush(cc->parenfloor);
2669 cc->lastloc = locinput;
2671 if (regmatch(cc->scan)) {
2676 regcppop(); /* Restore some previous $<digit>s? */
2677 PL_reginput = locinput;
2679 PerlIO_printf(Perl_debug_log,
2680 "%*s failed, try continuation...\n",
2681 REPORT_CODE_OFF+PL_regindent*2, "")
2684 if (ckWARN(WARN_UNSAFE) && n >= REG_INFTY
2685 && !(PL_reg_flags & RF_warned)) {
2686 PL_reg_flags |= RF_warned;
2687 Perl_warner(aTHX_ WARN_UNSAFE, "%s limit (%d) exceeded",
2688 "Complex regular subexpression recursion",
2692 /* Failed deeper matches of scan, so see if this one works. */
2693 PL_regcc = cc->oldcc;
2696 if (regmatch(cc->next))
2702 cc->lastloc = lastloc;
2707 next = scan + ARG(scan);
2710 inner = NEXTOPER(NEXTOPER(scan));
2713 inner = NEXTOPER(scan);
2718 if (OP(next) != c1) /* No choice. */
2719 next = inner; /* Avoid recursion. */
2721 int lastparen = *PL_reglastparen;
2725 PL_reginput = locinput;
2726 if (regmatch(inner))
2729 for (n = *PL_reglastparen; n > lastparen; n--)
2731 *PL_reglastparen = n;
2734 if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next)))
2738 inner = NEXTOPER(scan);
2739 if (c1 == BRANCHJ) {
2740 inner = NEXTOPER(inner);
2742 } while (scan != NULL && OP(scan) == c1);
2756 /* We suppose that the next guy does not need
2757 backtracking: in particular, it is of constant length,
2758 and has no parenths to influence future backrefs. */
2759 ln = ARG1(scan); /* min to match */
2760 n = ARG2(scan); /* max to match */
2761 paren = scan->flags;
2763 if (paren > PL_regsize)
2765 if (paren > *PL_reglastparen)
2766 *PL_reglastparen = paren;
2768 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2770 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2771 PL_reginput = locinput;
2774 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2776 if (ln && l == 0 && n >= ln
2777 /* In fact, this is tricky. If paren, then the
2778 fact that we did/didnot match may influence
2779 future execution. */
2780 && !(paren && ln == 0))
2782 locinput = PL_reginput;
2783 if (PL_regkind[(U8)OP(next)] == EXACT) {
2784 c1 = (U8)*STRING(next);
2785 if (OP(next) == EXACTF)
2787 else if (OP(next) == EXACTFL)
2788 c2 = PL_fold_locale[c1];
2795 /* This may be improved if l == 0. */
2796 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
2797 /* If it could work, try it. */
2799 UCHARAT(PL_reginput) == c1 ||
2800 UCHARAT(PL_reginput) == c2)
2804 PL_regstartp[paren] =
2805 HOPc(PL_reginput, -l) - PL_bostr;
2806 PL_regendp[paren] = PL_reginput - PL_bostr;
2809 PL_regendp[paren] = -1;
2815 /* Couldn't or didn't -- move forward. */
2816 PL_reginput = locinput;
2817 if (regrepeat_hard(scan, 1, &l)) {
2819 locinput = PL_reginput;
2826 n = regrepeat_hard(scan, n, &l);
2827 if (n != 0 && l == 0
2828 /* In fact, this is tricky. If paren, then the
2829 fact that we did/didnot match may influence
2830 future execution. */
2831 && !(paren && ln == 0))
2833 locinput = PL_reginput;
2835 PerlIO_printf(Perl_debug_log,
2836 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
2837 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
2841 if (PL_regkind[(U8)OP(next)] == EXACT) {
2842 c1 = (U8)*STRING(next);
2843 if (OP(next) == EXACTF)
2845 else if (OP(next) == EXACTFL)
2846 c2 = PL_fold_locale[c1];
2855 /* If it could work, try it. */
2857 UCHARAT(PL_reginput) == c1 ||
2858 UCHARAT(PL_reginput) == c2)
2861 PerlIO_printf(Perl_debug_log,
2862 "%*s trying tail with n=%"IVdf"...\n",
2863 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
2867 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
2868 PL_regendp[paren] = PL_reginput - PL_bostr;
2871 PL_regendp[paren] = -1;
2877 /* Couldn't or didn't -- back up. */
2879 locinput = HOPc(locinput, -l);
2880 PL_reginput = locinput;
2887 paren = scan->flags; /* Which paren to set */
2888 if (paren > PL_regsize)
2890 if (paren > *PL_reglastparen)
2891 *PL_reglastparen = paren;
2892 ln = ARG1(scan); /* min to match */
2893 n = ARG2(scan); /* max to match */
2894 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
2898 ln = ARG1(scan); /* min to match */
2899 n = ARG2(scan); /* max to match */
2900 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2905 scan = NEXTOPER(scan);
2911 scan = NEXTOPER(scan);
2915 * Lookahead to avoid useless match attempts
2916 * when we know what character comes next.
2918 if (PL_regkind[(U8)OP(next)] == EXACT) {
2919 c1 = (U8)*STRING(next);
2920 if (OP(next) == EXACTF)
2922 else if (OP(next) == EXACTFL)
2923 c2 = PL_fold_locale[c1];
2929 PL_reginput = locinput;
2933 if (ln && regrepeat(scan, ln) < ln)
2935 locinput = PL_reginput;
2938 char *e = locinput + n - ln; /* Should not check after this */
2939 char *old = locinput;
2941 if (e >= PL_regeol || (n == REG_INFTY))
2944 /* Find place 'next' could work */
2946 while (locinput <= e && *locinput != c1)
2949 while (locinput <= e
2956 /* PL_reginput == old now */
2957 if (locinput != old) {
2958 ln = 1; /* Did some */
2959 if (regrepeat(scan, locinput - old) <
2963 /* PL_reginput == locinput now */
2966 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr;
2967 PL_regendp[paren] = locinput - PL_bostr;
2970 PL_regendp[paren] = -1;
2974 PL_reginput = locinput; /* Could be reset... */
2976 /* Couldn't or didn't -- move forward. */
2981 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
2982 /* If it could work, try it. */
2984 UCHARAT(PL_reginput) == c1 ||
2985 UCHARAT(PL_reginput) == c2)
2989 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
2990 PL_regendp[paren] = PL_reginput - PL_bostr;
2993 PL_regendp[paren] = -1;
2999 /* Couldn't or didn't -- move forward. */
3000 PL_reginput = locinput;
3001 if (regrepeat(scan, 1)) {
3003 locinput = PL_reginput;
3011 n = regrepeat(scan, n);
3012 locinput = PL_reginput;
3013 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3014 (!PL_multiline || OP(next) == SEOL))
3015 ln = n; /* why back off? */
3019 /* If it could work, try it. */
3021 UCHARAT(PL_reginput) == c1 ||
3022 UCHARAT(PL_reginput) == c2)
3026 PL_regstartp[paren] = HOPc(PL_reginput, -1) - PL_bostr;
3027 PL_regendp[paren] = PL_reginput - PL_bostr;
3030 PL_regendp[paren] = -1;
3036 /* Couldn't or didn't -- back up. */
3038 PL_reginput = locinput = HOPc(locinput, -1);
3043 /* If it could work, try it. */
3045 UCHARAT(PL_reginput) == c1 ||
3046 UCHARAT(PL_reginput) == c2)
3052 /* Couldn't or didn't -- back up. */
3054 PL_reginput = locinput = HOPc(locinput, -1);
3061 if (PL_reg_call_cc) {
3062 re_cc_state *cur_call_cc = PL_reg_call_cc;
3063 CURCUR *cctmp = PL_regcc;
3064 regexp *re = PL_reg_re;
3065 CHECKPOINT cp, lastcp;
3067 cp = regcppush(0); /* Save *all* the positions. */
3069 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3071 PL_reginput = locinput; /* Make position available to
3073 cache_re(PL_reg_call_cc->re);
3074 PL_regcc = PL_reg_call_cc->cc;
3075 PL_reg_call_cc = PL_reg_call_cc->prev;
3076 if (regmatch(cur_call_cc->node)) {
3077 PL_reg_call_cc = cur_call_cc;
3083 PL_reg_call_cc = cur_call_cc;
3089 PerlIO_printf(Perl_debug_log,
3090 "%*s continuation failed...\n",
3091 REPORT_CODE_OFF+PL_regindent*2, "")
3095 if (locinput < PL_regtill) {
3096 DEBUG_r(PerlIO_printf(Perl_debug_log,
3097 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3099 (long)(locinput - PL_reg_starttry),
3100 (long)(PL_regtill - PL_reg_starttry),
3102 sayNO_FINAL; /* Cannot match: too short. */
3104 PL_reginput = locinput; /* put where regtry can find it */
3105 sayYES_FINAL; /* Success! */
3107 PL_reginput = locinput; /* put where regtry can find it */
3108 sayYES_LOUD; /* Success! */
3111 PL_reginput = locinput;
3116 if (UTF) { /* XXXX This is absolutely
3117 broken, we read before
3119 s = HOPMAYBEc(locinput, -scan->flags);
3125 if (locinput < PL_bostr + scan->flags)
3127 PL_reginput = locinput - scan->flags;
3132 PL_reginput = locinput;
3137 if (UTF) { /* XXXX This is absolutely
3138 broken, we read before
3140 s = HOPMAYBEc(locinput, -scan->flags);
3141 if (!s || s < PL_bostr)
3146 if (locinput < PL_bostr + scan->flags)
3148 PL_reginput = locinput - scan->flags;
3153 PL_reginput = locinput;
3156 inner = NEXTOPER(NEXTOPER(scan));
3157 if (regmatch(inner) != n) {
3172 if (OP(scan) == SUSPEND) {
3173 locinput = PL_reginput;
3174 nextchr = UCHARAT(locinput);
3179 next = scan + ARG(scan);
3184 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3185 PTR2UV(scan), OP(scan));
3186 Perl_croak(aTHX_ "regexp memory corruption");
3192 * We get here only if there's trouble -- normally "case END" is
3193 * the terminating point.
3195 Perl_croak(aTHX_ "corrupted regexp pointers");
3201 PerlIO_printf(Perl_debug_log,
3202 "%*s %scould match...%s\n",
3203 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3207 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3208 PL_colors[4],PL_colors[5]));
3217 PerlIO_printf(Perl_debug_log,
3218 "%*s %sfailed...%s\n",
3219 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3231 - regrepeat - repeatedly match something simple, report how many
3234 * [This routine now assumes that it will only match on things of length 1.
3235 * That was true before, but now we assume scan - reginput is the count,
3236 * rather than incrementing count on every character. [Er, except utf8.]]
3239 S_regrepeat(pTHX_ regnode *p, I32 max)
3242 register char *scan;
3244 register char *loceol = PL_regeol;
3245 register I32 hardcount = 0;
3248 if (max != REG_INFTY && max < loceol - scan)
3249 loceol = scan + max;
3252 while (scan < loceol && *scan != '\n')
3260 while (scan < loceol && *scan != '\n') {
3261 scan += UTF8SKIP(scan);
3267 while (scan < loceol) {
3268 scan += UTF8SKIP(scan);
3272 case EXACT: /* length of string is 1 */
3274 while (scan < loceol && UCHARAT(scan) == c)
3277 case EXACTF: /* length of string is 1 */
3279 while (scan < loceol &&
3280 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3283 case EXACTFL: /* length of string is 1 */
3284 PL_reg_flags |= RF_tainted;
3286 while (scan < loceol &&
3287 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3292 while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
3293 scan += UTF8SKIP(scan);
3298 while (scan < loceol && REGINCLASS(p, *scan))
3302 while (scan < loceol && isALNUM(*scan))
3307 while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3308 scan += UTF8SKIP(scan);
3313 PL_reg_flags |= RF_tainted;
3314 while (scan < loceol && isALNUM_LC(*scan))
3318 PL_reg_flags |= RF_tainted;
3320 while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
3321 scan += UTF8SKIP(scan);
3327 while (scan < loceol && !isALNUM(*scan))
3332 while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
3333 scan += UTF8SKIP(scan);
3338 PL_reg_flags |= RF_tainted;
3339 while (scan < loceol && !isALNUM_LC(*scan))
3343 PL_reg_flags |= RF_tainted;
3345 while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
3346 scan += UTF8SKIP(scan);
3351 while (scan < loceol && isSPACE(*scan))
3356 while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3357 scan += UTF8SKIP(scan);
3362 PL_reg_flags |= RF_tainted;
3363 while (scan < loceol && isSPACE_LC(*scan))
3367 PL_reg_flags |= RF_tainted;
3369 while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3370 scan += UTF8SKIP(scan);
3375 while (scan < loceol && !isSPACE(*scan))
3380 while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
3381 scan += UTF8SKIP(scan);
3386 PL_reg_flags |= RF_tainted;
3387 while (scan < loceol && !isSPACE_LC(*scan))
3391 PL_reg_flags |= RF_tainted;
3393 while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3394 scan += UTF8SKIP(scan);
3399 while (scan < loceol && isDIGIT(*scan))
3404 while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
3405 scan += UTF8SKIP(scan);
3411 while (scan < loceol && !isDIGIT(*scan))
3416 while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
3417 scan += UTF8SKIP(scan);
3421 default: /* Called on something of 0 width. */
3422 break; /* So match right here or not at all. */
3428 c = scan - PL_reginput;
3433 SV *prop = sv_newmortal();
3436 PerlIO_printf(Perl_debug_log,
3437 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3438 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3445 - regrepeat_hard - repeatedly match something, report total lenth and length
3447 * The repeater is supposed to have constant length.
3451 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3454 register char *scan;
3455 register char *start;
3456 register char *loceol = PL_regeol;
3458 I32 count = 0, res = 1;
3463 start = PL_reginput;
3465 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3468 while (start < PL_reginput) {
3470 start += UTF8SKIP(start);
3481 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3483 *lp = l = PL_reginput - start;
3484 if (max != REG_INFTY && l*max < loceol - scan)
3485 loceol = scan + l*max;
3498 - reginclass - determine if a character falls into a character class
3502 S_reginclass(pTHX_ register regnode *p, register I32 c)
3505 char flags = ANYOF_FLAGS(p);
3509 if (ANYOF_BITMAP_TEST(p, c))
3511 else if (flags & ANYOF_FOLD) {
3513 if (flags & ANYOF_LOCALE) {
3514 PL_reg_flags |= RF_tainted;
3515 cf = PL_fold_locale[c];
3519 if (ANYOF_BITMAP_TEST(p, cf))
3523 if (!match && (flags & ANYOF_CLASS)) {
3524 PL_reg_flags |= RF_tainted;
3526 (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3527 (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3528 (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
3529 (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3530 (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3531 (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3532 (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3533 (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3534 (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3535 (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3536 (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
3537 (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
3538 (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3539 (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3540 (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3541 (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3542 (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
3543 (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3544 (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
3545 (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3546 (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3547 (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3548 (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
3549 (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3550 (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3551 (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))
3552 ) /* How's that for a conditional? */
3558 return (flags & ANYOF_INVERT) ? !match : match;
3562 S_reginclassutf8(pTHX_ regnode *f, U8 *p)
3565 char flags = ARG1(f);
3567 SV *sv = (SV*)PL_regdata->data[ARG2(f)];
3569 if (swash_fetch(sv, p))
3571 else if (flags & ANYOF_FOLD) {
3574 if (flags & ANYOF_LOCALE) {
3575 PL_reg_flags |= RF_tainted;
3576 uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3579 uv_to_utf8(tmpbuf, toLOWER_utf8(p));
3580 if (swash_fetch(sv, tmpbuf))
3584 /* UTF8 combined with ANYOF_CLASS is ill-defined. */
3586 return (flags & ANYOF_INVERT) ? !match : match;
3590 S_reghop(pTHX_ U8 *s, I32 off)
3594 while (off-- && s < (U8*)PL_regeol)
3599 if (s > (U8*)PL_bostr) {
3602 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3604 } /* XXX could check well-formedness here */
3612 S_reghopmaybe(pTHX_ U8* s, I32 off)
3616 while (off-- && s < (U8*)PL_regeol)
3623 if (s > (U8*)PL_bostr) {
3626 while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
3628 } /* XXX could check well-formedness here */
3644 restore_pos(pTHXo_ void *arg)
3647 if (PL_reg_eval_set) {
3648 if (PL_reg_oldsaved) {
3649 PL_reg_re->subbeg = PL_reg_oldsaved;
3650 PL_reg_re->sublen = PL_reg_oldsavedlen;
3651 RX_MATCH_COPIED_on(PL_reg_re);
3653 PL_reg_magic->mg_len = PL_reg_oldpos;
3654 PL_reg_eval_set = 0;
3655 PL_curpm = PL_reg_oldcurpm;