5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define HOPBACK(pos, off) ( \
120 (UTF && DO_UTF8(PL_reg_sv)) \
121 ? reghopmaybe((U8*)pos, -off) \
122 : (pos - off >= PL_bostr) \
126 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
128 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
129 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
130 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
131 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
133 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
135 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
137 static void restore_pos(pTHXo_ void *arg);
140 S_regcppush(pTHX_ I32 parenfloor)
142 int retval = PL_savestack_ix;
143 #define REGCP_PAREN_ELEMS 4
144 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
147 if (paren_elems_to_push < 0)
148 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
150 #define REGCP_OTHER_ELEMS 5
151 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
152 for (p = PL_regsize; p > parenfloor; p--) {
153 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
154 SSPUSHINT(PL_regendp[p]);
155 SSPUSHINT(PL_regstartp[p]);
156 SSPUSHPTR(PL_reg_start_tmp[p]);
159 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
160 SSPUSHINT(PL_regsize);
161 SSPUSHINT(*PL_reglastparen);
162 SSPUSHPTR(PL_reginput);
163 #define REGCP_FRAME_ELEMS 2
164 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
165 * are needed for the regexp context stack bookkeeping. */
166 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
167 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
172 /* These are needed since we do not localize EVAL nodes: */
173 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
174 " Setting an EVAL scope, savestack=%"IVdf"\n", \
175 (IV)PL_savestack_ix)); cp = PL_savestack_ix
177 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
178 PerlIO_printf(Perl_debug_log, \
179 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
180 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
190 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
192 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
193 i = SSPOPINT; /* Parentheses elements to pop. */
194 input = (char *) SSPOPPTR;
195 *PL_reglastparen = SSPOPINT;
196 PL_regsize = SSPOPINT;
198 /* Now restore the parentheses context. */
199 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
200 i > 0; i -= REGCP_PAREN_ELEMS) {
201 paren = (U32)SSPOPINT;
202 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
203 PL_regstartp[paren] = SSPOPINT;
205 if (paren <= *PL_reglastparen)
206 PL_regendp[paren] = tmps;
208 PerlIO_printf(Perl_debug_log,
209 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
210 (UV)paren, (IV)PL_regstartp[paren],
211 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
212 (IV)PL_regendp[paren],
213 (paren > *PL_reglastparen ? "(no)" : ""));
217 if (*PL_reglastparen + 1 <= PL_regnpar) {
218 PerlIO_printf(Perl_debug_log,
219 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
220 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
224 /* It would seem that the similar code in regtry()
225 * already takes care of this, and in fact it is in
226 * a better location to since this code can #if 0-ed out
227 * but the code in regtry() is needed or otherwise tests
228 * requiring null fields (pat.t#187 and split.t#{13,14}
229 * (as of patchlevel 7877) will fail. Then again,
230 * this code seems to be necessary or otherwise
231 * building DynaLoader will fail:
232 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
234 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
235 if (paren > PL_regsize)
236 PL_regstartp[paren] = -1;
237 PL_regendp[paren] = -1;
244 S_regcp_set_to(pTHX_ I32 ss)
246 I32 tmp = PL_savestack_ix;
248 PL_savestack_ix = ss;
250 PL_savestack_ix = tmp;
254 typedef struct re_cc_state
258 struct re_cc_state *prev;
263 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
265 #define TRYPAREN(paren, n, input) { \
268 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
269 PL_regendp[paren] = input - PL_bostr; \
272 PL_regendp[paren] = -1; \
274 if (regmatch(next)) \
277 PL_regendp[paren] = -1; \
282 * pregexec and friends
286 - pregexec - match a regexp against a string
289 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
290 char *strbeg, I32 minend, SV *screamer, U32 nosave)
291 /* strend: pointer to null at end of string */
292 /* strbeg: real beginning of string */
293 /* minend: end of match must be >=minend after stringarg. */
294 /* nosave: For optimizations. */
297 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
298 nosave ? 0 : REXEC_COPY_STR);
302 S_cache_re(pTHX_ regexp *prog)
304 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
306 PL_regprogram = prog->program;
308 PL_regnpar = prog->nparens;
309 PL_regdata = prog->data;
314 * Need to implement the following flags for reg_anch:
316 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
318 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
319 * INTUIT_AUTORITATIVE_ML
320 * INTUIT_ONCE_NOML - Intuit can match in one location only.
323 * Another flag for this function: SECOND_TIME (so that float substrs
324 * with giant delta may be not rechecked).
327 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
329 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
330 Otherwise, only SvCUR(sv) is used to get strbeg. */
332 /* XXXX We assume that strpos is strbeg unless sv. */
334 /* XXXX Some places assume that there is a fixed substring.
335 An update may be needed if optimizer marks as "INTUITable"
336 RExen without fixed substrings. Similarly, it is assumed that
337 lengths of all the strings are no more than minlen, thus they
338 cannot come from lookahead.
339 (Or minlen should take into account lookahead.) */
341 /* A failure to find a constant substring means that there is no need to make
342 an expensive call to REx engine, thus we celebrate a failure. Similarly,
343 finding a substring too deep into the string means that less calls to
344 regtry() should be needed.
346 REx compiler's optimizer found 4 possible hints:
347 a) Anchored substring;
349 c) Whether we are anchored (beginning-of-line or \G);
350 d) First node (of those at offset 0) which may distingush positions;
351 We use a)b)d) and multiline-part of c), and try to find a position in the
352 string which does not contradict any of them.
355 /* Most of decisions we do here should have been done at compile time.
356 The nodes of the REx which we used for the search should have been
357 deleted from the finite automaton. */
360 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
361 char *strend, U32 flags, re_scream_pos_data *data)
363 register I32 start_shift = 0;
364 /* Should be nonnegative! */
365 register I32 end_shift = 0;
371 register char *other_last = Nullch; /* other substr checked before this */
372 char *check_at = Nullch; /* check substr found at this pos */
374 char *i_strpos = strpos;
377 DEBUG_r( if (!PL_colorset) reginitcolors() );
378 DEBUG_r(PerlIO_printf(Perl_debug_log,
379 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
380 PL_colors[4],PL_colors[5],PL_colors[0],
383 (strlen(prog->precomp) > 60 ? "..." : ""),
385 (int)(strend - strpos > 60 ? 60 : strend - strpos),
386 strpos, PL_colors[1],
387 (strend - strpos > 60 ? "..." : ""))
390 if (prog->reganch & ROPT_UTF8)
391 PL_reg_flags |= RF_utf8;
393 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
394 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
397 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
399 check = prog->check_substr;
400 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
401 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
402 || ( (prog->reganch & ROPT_ANCH_BOL)
403 && !PL_multiline ) ); /* Check after \n? */
406 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
407 | ROPT_IMPLICIT)) /* not a real BOL */
408 /* SvCUR is not set on references: SvRV and SvPVX overlap */
410 && (strpos != strbeg)) {
411 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
414 if (prog->check_offset_min == prog->check_offset_max &&
415 !(prog->reganch & ROPT_SANY_SEEN)) {
416 /* Substring at constant offset from beg-of-str... */
419 s = HOP3c(strpos, prog->check_offset_min, strend);
421 slen = SvCUR(check); /* >= 1 */
423 if ( strend - s > slen || strend - s < slen - 1
424 || (strend - s == slen && strend[-1] != '\n')) {
425 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
428 /* Now should match s[0..slen-2] */
430 if (slen && (*SvPVX(check) != *s
432 && memNE(SvPVX(check), s, slen)))) {
434 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
438 else if (*SvPVX(check) != *s
439 || ((slen = SvCUR(check)) > 1
440 && memNE(SvPVX(check), s, slen)))
442 goto success_at_start;
445 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
447 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
448 end_shift = prog->minlen - start_shift -
449 CHR_SVLEN(check) + (SvTAIL(check) != 0);
451 I32 end = prog->check_offset_max + CHR_SVLEN(check)
452 - (SvTAIL(check) != 0);
453 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
455 if (end_shift < eshift)
459 else { /* Can match at random position */
462 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
463 /* Should be nonnegative! */
464 end_shift = prog->minlen - start_shift -
465 CHR_SVLEN(check) + (SvTAIL(check) != 0);
468 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
470 Perl_croak(aTHX_ "panic: end_shift");
474 /* Find a possible match in the region s..strend by looking for
475 the "check" substring in the region corrected by start/end_shift. */
476 if (flags & REXEC_SCREAM) {
477 I32 p = -1; /* Internal iterator of scream. */
478 I32 *pp = data ? data->scream_pos : &p;
480 if (PL_screamfirst[BmRARE(check)] >= 0
481 || ( BmRARE(check) == '\n'
482 && (BmPREVIOUS(check) == SvCUR(check) - 1)
484 s = screaminstr(sv, check,
485 start_shift + (s - strbeg), end_shift, pp, 0);
489 *data->scream_olds = s;
491 else if (prog->reganch & ROPT_SANY_SEEN)
492 s = fbm_instr((U8*)(s + start_shift),
493 (U8*)(strend - end_shift),
494 check, PL_multiline ? FBMrf_MULTILINE : 0);
496 s = fbm_instr(HOP3(s, start_shift, strend),
497 HOP3(strend, -end_shift, strbeg),
498 check, PL_multiline ? FBMrf_MULTILINE : 0);
500 /* Update the count-of-usability, remove useless subpatterns,
503 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
504 (s ? "Found" : "Did not find"),
505 ((check == prog->anchored_substr) ? "anchored" : "floating"),
507 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
509 PL_colors[1], (SvTAIL(check) ? "$" : ""),
510 (s ? " at offset " : "...\n") ) );
517 /* Finish the diagnostic message */
518 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
520 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
521 Start with the other substr.
522 XXXX no SCREAM optimization yet - and a very coarse implementation
523 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
524 *always* match. Probably should be marked during compile...
525 Probably it is right to do no SCREAM here...
528 if (prog->float_substr && prog->anchored_substr) {
529 /* Take into account the "other" substring. */
530 /* XXXX May be hopelessly wrong for UTF... */
533 if (check == prog->float_substr) {
536 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
539 t = s - prog->check_offset_max;
540 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
541 && (!(prog->reganch & ROPT_UTF8)
542 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
547 t = HOP3c(t, prog->anchored_offset, strend);
548 if (t < other_last) /* These positions already checked */
550 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
553 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
554 /* On end-of-str: see comment below. */
555 s = fbm_instr((unsigned char*)t,
556 HOP3(HOP3(last1, prog->anchored_offset, strend)
557 + SvCUR(prog->anchored_substr),
558 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
559 prog->anchored_substr,
560 PL_multiline ? FBMrf_MULTILINE : 0);
561 DEBUG_r(PerlIO_printf(Perl_debug_log,
562 "%s anchored substr `%s%.*s%s'%s",
563 (s ? "Found" : "Contradicts"),
565 (int)(SvCUR(prog->anchored_substr)
566 - (SvTAIL(prog->anchored_substr)!=0)),
567 SvPVX(prog->anchored_substr),
568 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
570 if (last1 >= last2) {
571 DEBUG_r(PerlIO_printf(Perl_debug_log,
572 ", giving up...\n"));
575 DEBUG_r(PerlIO_printf(Perl_debug_log,
576 ", trying floating at offset %ld...\n",
577 (long)(HOP3c(s1, 1, strend) - i_strpos)));
578 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
579 s = HOP3c(last, 1, strend);
583 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
584 (long)(s - i_strpos)));
585 t = HOP3c(s, -prog->anchored_offset, strbeg);
586 other_last = HOP3c(s, 1, strend);
594 else { /* Take into account the floating substring. */
598 t = HOP3c(s, -start_shift, strbeg);
600 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
601 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
602 last = HOP3c(t, prog->float_max_offset, strend);
603 s = HOP3c(t, prog->float_min_offset, strend);
606 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
607 /* fbm_instr() takes into account exact value of end-of-str
608 if the check is SvTAIL(ed). Since false positives are OK,
609 and end-of-str is not later than strend we are OK. */
610 s = fbm_instr((unsigned char*)s,
611 (unsigned char*)last + SvCUR(prog->float_substr)
612 - (SvTAIL(prog->float_substr)!=0),
613 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
614 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
615 (s ? "Found" : "Contradicts"),
617 (int)(SvCUR(prog->float_substr)
618 - (SvTAIL(prog->float_substr)!=0)),
619 SvPVX(prog->float_substr),
620 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
623 DEBUG_r(PerlIO_printf(Perl_debug_log,
624 ", giving up...\n"));
627 DEBUG_r(PerlIO_printf(Perl_debug_log,
628 ", trying anchored starting at offset %ld...\n",
629 (long)(s1 + 1 - i_strpos)));
631 s = HOP3c(t, 1, strend);
635 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
636 (long)(s - i_strpos)));
637 other_last = s; /* Fix this later. --Hugo */
646 t = s - prog->check_offset_max;
647 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
648 && (!(prog->reganch & ROPT_UTF8)
649 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
651 /* Fixed substring is found far enough so that the match
652 cannot start at strpos. */
654 if (ml_anch && t[-1] != '\n') {
655 /* Eventually fbm_*() should handle this, but often
656 anchored_offset is not 0, so this check will not be wasted. */
657 /* XXXX In the code below we prefer to look for "^" even in
658 presence of anchored substrings. And we search even
659 beyond the found float position. These pessimizations
660 are historical artefacts only. */
662 while (t < strend - prog->minlen) {
664 if (t < check_at - prog->check_offset_min) {
665 if (prog->anchored_substr) {
666 /* Since we moved from the found position,
667 we definitely contradict the found anchored
668 substr. Due to the above check we do not
669 contradict "check" substr.
670 Thus we can arrive here only if check substr
671 is float. Redo checking for "other"=="fixed".
674 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
675 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
676 goto do_other_anchored;
678 /* We don't contradict the found floating substring. */
679 /* XXXX Why not check for STCLASS? */
681 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
682 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
685 /* Position contradicts check-string */
686 /* XXXX probably better to look for check-string
687 than for "\n", so one should lower the limit for t? */
688 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
689 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
690 other_last = strpos = s = t + 1;
695 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
696 PL_colors[0],PL_colors[1]));
700 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
701 PL_colors[0],PL_colors[1]));
705 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
708 /* The found string does not prohibit matching at strpos,
709 - no optimization of calling REx engine can be performed,
710 unless it was an MBOL and we are not after MBOL,
711 or a future STCLASS check will fail this. */
713 /* Even in this situation we may use MBOL flag if strpos is offset
714 wrt the start of the string. */
715 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
716 && (strpos != strbeg) && strpos[-1] != '\n'
717 /* May be due to an implicit anchor of m{.*foo} */
718 && !(prog->reganch & ROPT_IMPLICIT))
723 DEBUG_r( if (ml_anch)
724 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
725 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
728 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
729 && prog->check_substr /* Could be deleted already */
730 && --BmUSEFUL(prog->check_substr) < 0
731 && prog->check_substr == prog->float_substr)
733 /* If flags & SOMETHING - do not do it many times on the same match */
734 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
735 SvREFCNT_dec(prog->check_substr);
736 prog->check_substr = Nullsv; /* disable */
737 prog->float_substr = Nullsv; /* clear */
738 check = Nullsv; /* abort */
740 /* XXXX This is a remnant of the old implementation. It
741 looks wasteful, since now INTUIT can use many
743 prog->reganch &= ~RE_USE_INTUIT;
750 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
751 if (prog->regstclass) {
752 /* minlen == 0 is possible if regstclass is \b or \B,
753 and the fixed substr is ''$.
754 Since minlen is already taken into account, s+1 is before strend;
755 accidentally, minlen >= 1 guaranties no false positives at s + 1
756 even for \b or \B. But (minlen? 1 : 0) below assumes that
757 regstclass does not come from lookahead... */
758 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
759 This leaves EXACTF only, which is dealt with in find_byclass(). */
760 U8* str = (U8*)STRING(prog->regstclass);
761 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
762 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
764 char *endpos = (prog->anchored_substr || ml_anch)
765 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
766 : (prog->float_substr
767 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
770 char *startpos = strbeg;
773 if (prog->reganch & ROPT_UTF8) {
774 PL_regdata = prog->data;
777 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
782 if (endpos == strend) {
783 DEBUG_r( PerlIO_printf(Perl_debug_log,
784 "Could not match STCLASS...\n") );
787 DEBUG_r( PerlIO_printf(Perl_debug_log,
788 "This position contradicts STCLASS...\n") );
789 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
791 /* Contradict one of substrings */
792 if (prog->anchored_substr) {
793 if (prog->anchored_substr == check) {
794 DEBUG_r( what = "anchored" );
796 s = HOP3c(t, 1, strend);
797 if (s + start_shift + end_shift > strend) {
798 /* XXXX Should be taken into account earlier? */
799 DEBUG_r( PerlIO_printf(Perl_debug_log,
800 "Could not match STCLASS...\n") );
805 DEBUG_r( PerlIO_printf(Perl_debug_log,
806 "Looking for %s substr starting at offset %ld...\n",
807 what, (long)(s + start_shift - i_strpos)) );
810 /* Have both, check_string is floating */
811 if (t + start_shift >= check_at) /* Contradicts floating=check */
812 goto retry_floating_check;
813 /* Recheck anchored substring, but not floating... */
817 DEBUG_r( PerlIO_printf(Perl_debug_log,
818 "Looking for anchored substr starting at offset %ld...\n",
819 (long)(other_last - i_strpos)) );
820 goto do_other_anchored;
822 /* Another way we could have checked stclass at the
823 current position only: */
828 DEBUG_r( PerlIO_printf(Perl_debug_log,
829 "Looking for /%s^%s/m starting at offset %ld...\n",
830 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
833 if (!prog->float_substr) /* Could have been deleted */
835 /* Check is floating subtring. */
836 retry_floating_check:
837 t = check_at - start_shift;
838 DEBUG_r( what = "floating" );
839 goto hop_and_restart;
842 DEBUG_r(PerlIO_printf(Perl_debug_log,
843 "By STCLASS: moving %ld --> %ld\n",
844 (long)(t - i_strpos), (long)(s - i_strpos))
848 DEBUG_r(PerlIO_printf(Perl_debug_log,
849 "Does not contradict STCLASS...\n");
854 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
855 PL_colors[4], (check ? "Guessed" : "Giving up"),
856 PL_colors[5], (long)(s - i_strpos)) );
859 fail_finish: /* Substring not found */
860 if (prog->check_substr) /* could be removed already */
861 BmUSEFUL(prog->check_substr) += 5; /* hooray */
863 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
864 PL_colors[4],PL_colors[5]));
868 /* We know what class REx starts with. Try to find this position... */
870 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
872 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
878 register I32 tmp = 1; /* Scratch variable? */
879 register bool do_utf8 = DO_UTF8(PL_reg_sv);
881 /* We know what class it must start with. */
885 if (reginclass(c, (U8*)s, do_utf8)) {
886 if (tmp && (norun || regtry(prog, s)))
893 s += do_utf8 ? UTF8SKIP(s) : 1;
900 c1 = to_utf8_lower((U8*)m);
901 c2 = to_utf8_upper((U8*)m);
912 c2 = PL_fold_locale[c1];
917 e = s; /* Due to minlen logic of intuit() */
923 if ( utf8_to_uvchr((U8*)s, &len) == c1
930 UV c = utf8_to_uvchr((U8*)s, &len);
931 if ( (c == c1 || c == c2) && regtry(prog, s) )
940 && (ln == 1 || !(OP(c) == EXACTF
942 : ibcmp_locale(s, m, ln)))
943 && (norun || regtry(prog, s)) )
949 if ( (*(U8*)s == c1 || *(U8*)s == c2)
950 && (ln == 1 || !(OP(c) == EXACTF
952 : ibcmp_locale(s, m, ln)))
953 && (norun || regtry(prog, s)) )
960 PL_reg_flags |= RF_tainted;
967 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
970 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
972 tmp = ((OP(c) == BOUND ?
973 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
974 LOAD_UTF8_CHARCLASS(alnum,"a");
976 if (tmp == !(OP(c) == BOUND ?
977 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
978 isALNUM_LC_utf8((U8*)s)))
981 if ((norun || regtry(prog, s)))
988 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
989 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
992 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
994 if ((norun || regtry(prog, s)))
1000 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1004 PL_reg_flags |= RF_tainted;
1011 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1014 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1016 tmp = ((OP(c) == NBOUND ?
1017 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1018 LOAD_UTF8_CHARCLASS(alnum,"a");
1019 while (s < strend) {
1020 if (tmp == !(OP(c) == NBOUND ?
1021 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1022 isALNUM_LC_utf8((U8*)s)))
1024 else if ((norun || regtry(prog, s)))
1030 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1031 tmp = ((OP(c) == NBOUND ?
1032 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1033 while (s < strend) {
1035 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1037 else if ((norun || regtry(prog, s)))
1042 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1047 LOAD_UTF8_CHARCLASS(alnum,"a");
1048 while (s < strend) {
1049 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1050 if (tmp && (norun || regtry(prog, s)))
1061 while (s < strend) {
1063 if (tmp && (norun || regtry(prog, s)))
1075 PL_reg_flags |= RF_tainted;
1077 while (s < strend) {
1078 if (isALNUM_LC_utf8((U8*)s)) {
1079 if (tmp && (norun || regtry(prog, s)))
1090 while (s < strend) {
1091 if (isALNUM_LC(*s)) {
1092 if (tmp && (norun || regtry(prog, s)))
1105 LOAD_UTF8_CHARCLASS(alnum,"a");
1106 while (s < strend) {
1107 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1108 if (tmp && (norun || regtry(prog, s)))
1119 while (s < strend) {
1121 if (tmp && (norun || regtry(prog, s)))
1133 PL_reg_flags |= RF_tainted;
1135 while (s < strend) {
1136 if (!isALNUM_LC_utf8((U8*)s)) {
1137 if (tmp && (norun || regtry(prog, s)))
1148 while (s < strend) {
1149 if (!isALNUM_LC(*s)) {
1150 if (tmp && (norun || regtry(prog, s)))
1163 LOAD_UTF8_CHARCLASS(space," ");
1164 while (s < strend) {
1165 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1166 if (tmp && (norun || regtry(prog, s)))
1177 while (s < strend) {
1179 if (tmp && (norun || regtry(prog, s)))
1191 PL_reg_flags |= RF_tainted;
1193 while (s < strend) {
1194 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1195 if (tmp && (norun || regtry(prog, s)))
1206 while (s < strend) {
1207 if (isSPACE_LC(*s)) {
1208 if (tmp && (norun || regtry(prog, s)))
1221 LOAD_UTF8_CHARCLASS(space," ");
1222 while (s < strend) {
1223 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1224 if (tmp && (norun || regtry(prog, s)))
1235 while (s < strend) {
1237 if (tmp && (norun || regtry(prog, s)))
1249 PL_reg_flags |= RF_tainted;
1251 while (s < strend) {
1252 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1253 if (tmp && (norun || regtry(prog, s)))
1264 while (s < strend) {
1265 if (!isSPACE_LC(*s)) {
1266 if (tmp && (norun || regtry(prog, s)))
1279 LOAD_UTF8_CHARCLASS(digit,"0");
1280 while (s < strend) {
1281 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1282 if (tmp && (norun || regtry(prog, s)))
1293 while (s < strend) {
1295 if (tmp && (norun || regtry(prog, s)))
1307 PL_reg_flags |= RF_tainted;
1309 while (s < strend) {
1310 if (isDIGIT_LC_utf8((U8*)s)) {
1311 if (tmp && (norun || regtry(prog, s)))
1322 while (s < strend) {
1323 if (isDIGIT_LC(*s)) {
1324 if (tmp && (norun || regtry(prog, s)))
1337 LOAD_UTF8_CHARCLASS(digit,"0");
1338 while (s < strend) {
1339 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1340 if (tmp && (norun || regtry(prog, s)))
1351 while (s < strend) {
1353 if (tmp && (norun || regtry(prog, s)))
1365 PL_reg_flags |= RF_tainted;
1367 while (s < strend) {
1368 if (!isDIGIT_LC_utf8((U8*)s)) {
1369 if (tmp && (norun || regtry(prog, s)))
1380 while (s < strend) {
1381 if (!isDIGIT_LC(*s)) {
1382 if (tmp && (norun || regtry(prog, s)))
1394 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1403 - regexec_flags - match a regexp against a string
1406 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1407 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1408 /* strend: pointer to null at end of string */
1409 /* strbeg: real beginning of string */
1410 /* minend: end of match must be >=minend after stringarg. */
1411 /* data: May be used for some additional optimizations. */
1412 /* nosave: For optimizations. */
1415 register regnode *c;
1416 register char *startpos = stringarg;
1417 I32 minlen; /* must match at least this many chars */
1418 I32 dontbother = 0; /* how many characters not to try at end */
1419 /* I32 start_shift = 0; */ /* Offset of the start to find
1420 constant substr. */ /* CC */
1421 I32 end_shift = 0; /* Same for the end. */ /* CC */
1422 I32 scream_pos = -1; /* Internal iterator of scream. */
1424 SV* oreplsv = GvSV(PL_replgv);
1425 bool do_utf8 = DO_UTF8(sv);
1431 PL_regnarrate = DEBUG_r_TEST;
1434 /* Be paranoid... */
1435 if (prog == NULL || startpos == NULL) {
1436 Perl_croak(aTHX_ "NULL regexp parameter");
1440 minlen = prog->minlen;
1442 if (!(prog->reganch & ROPT_SANY_SEEN))
1443 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1446 if (strend - startpos < minlen) goto phooey;
1449 /* Check validity of program. */
1450 if (UCHARAT(prog->program) != REG_MAGIC) {
1451 Perl_croak(aTHX_ "corrupted regexp program");
1455 PL_reg_eval_set = 0;
1458 if (prog->reganch & ROPT_UTF8)
1459 PL_reg_flags |= RF_utf8;
1461 /* Mark beginning of line for ^ and lookbehind. */
1462 PL_regbol = startpos;
1466 /* Mark end of line for $ (and such) */
1469 /* see how far we have to get to not match where we matched before */
1470 PL_regtill = startpos+minend;
1472 /* We start without call_cc context. */
1475 /* If there is a "must appear" string, look for it. */
1478 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1481 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1482 PL_reg_ganch = startpos;
1483 else if (sv && SvTYPE(sv) >= SVt_PVMG
1485 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1486 && mg->mg_len >= 0) {
1487 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1488 if (prog->reganch & ROPT_ANCH_GPOS) {
1489 if (s > PL_reg_ganch)
1494 else /* pos() not defined */
1495 PL_reg_ganch = strbeg;
1498 if (do_utf8 == (UTF!=0) &&
1499 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1500 re_scream_pos_data d;
1502 d.scream_olds = &scream_olds;
1503 d.scream_pos = &scream_pos;
1504 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1506 goto phooey; /* not present */
1509 DEBUG_r( if (!PL_colorset) reginitcolors() );
1510 DEBUG_r(PerlIO_printf(Perl_debug_log,
1511 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1512 PL_colors[4],PL_colors[5],PL_colors[0],
1515 (strlen(prog->precomp) > 60 ? "..." : ""),
1517 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1518 startpos, PL_colors[1],
1519 (strend - startpos > 60 ? "..." : ""))
1522 /* Simplest case: anchored match need be tried only once. */
1523 /* [unless only anchor is BOL and multiline is set] */
1524 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1525 if (s == startpos && regtry(prog, startpos))
1527 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1528 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1533 dontbother = minlen - 1;
1534 end = HOP3c(strend, -dontbother, strbeg) - 1;
1535 /* for multiline we only have to try after newlines */
1536 if (prog->check_substr) {
1540 if (regtry(prog, s))
1545 if (prog->reganch & RE_USE_INTUIT) {
1546 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1557 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1558 if (regtry(prog, s))
1565 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1566 if (regtry(prog, PL_reg_ganch))
1571 /* Messy cases: unanchored match. */
1572 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1573 /* we have /x+whatever/ */
1574 /* it must be a one character string (XXXX Except UTF?) */
1575 char ch = SvPVX(prog->anchored_substr)[0];
1581 while (s < strend) {
1583 DEBUG_r( did_match = 1 );
1584 if (regtry(prog, s)) goto got_it;
1586 while (s < strend && *s == ch)
1593 while (s < strend) {
1595 DEBUG_r( did_match = 1 );
1596 if (regtry(prog, s)) goto got_it;
1598 while (s < strend && *s == ch)
1604 DEBUG_r(if (!did_match)
1605 PerlIO_printf(Perl_debug_log,
1606 "Did not find anchored character...\n")
1610 else if (do_utf8 == (UTF!=0) &&
1611 (prog->anchored_substr != Nullsv
1612 || (prog->float_substr != Nullsv
1613 && prog->float_max_offset < strend - s))) {
1614 SV *must = prog->anchored_substr
1615 ? prog->anchored_substr : prog->float_substr;
1617 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1619 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1620 char *last = HOP3c(strend, /* Cannot start after this */
1621 -(I32)(CHR_SVLEN(must)
1622 - (SvTAIL(must) != 0) + back_min), strbeg);
1623 char *last1; /* Last position checked before */
1629 last1 = HOPc(s, -1);
1631 last1 = s - 1; /* bogus */
1633 /* XXXX check_substr already used to find `s', can optimize if
1634 check_substr==must. */
1636 dontbother = end_shift;
1637 strend = HOPc(strend, -dontbother);
1638 while ( (s <= last) &&
1639 ((flags & REXEC_SCREAM)
1640 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1641 end_shift, &scream_pos, 0))
1642 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1643 (unsigned char*)strend, must,
1644 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1645 DEBUG_r( did_match = 1 );
1646 if (HOPc(s, -back_max) > last1) {
1647 last1 = HOPc(s, -back_min);
1648 s = HOPc(s, -back_max);
1651 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1653 last1 = HOPc(s, -back_min);
1657 while (s <= last1) {
1658 if (regtry(prog, s))
1664 while (s <= last1) {
1665 if (regtry(prog, s))
1671 DEBUG_r(if (!did_match)
1672 PerlIO_printf(Perl_debug_log,
1673 "Did not find %s substr `%s%.*s%s'%s...\n",
1674 ((must == prog->anchored_substr)
1675 ? "anchored" : "floating"),
1677 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1679 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1683 else if ((c = prog->regstclass)) {
1684 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1685 /* don't bother with what can't match */
1686 strend = HOPc(strend, -(minlen - 1));
1688 SV *prop = sv_newmortal();
1690 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1692 if (find_byclass(prog, c, s, strend, startpos, 0))
1694 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1698 if (prog->float_substr != Nullsv) { /* Trim the end. */
1701 if (flags & REXEC_SCREAM) {
1702 last = screaminstr(sv, prog->float_substr, s - strbeg,
1703 end_shift, &scream_pos, 1); /* last one */
1705 last = scream_olds; /* Only one occurrence. */
1709 char *little = SvPV(prog->float_substr, len);
1711 if (SvTAIL(prog->float_substr)) {
1712 if (memEQ(strend - len + 1, little, len - 1))
1713 last = strend - len + 1;
1714 else if (!PL_multiline)
1715 last = memEQ(strend - len, little, len)
1716 ? strend - len : Nullch;
1722 last = rninstr(s, strend, little, little + len);
1724 last = strend; /* matching `$' */
1728 DEBUG_r(PerlIO_printf(Perl_debug_log,
1729 "%sCan't trim the tail, match fails (should not happen)%s\n",
1730 PL_colors[4],PL_colors[5]));
1731 goto phooey; /* Should not happen! */
1733 dontbother = strend - last + prog->float_min_offset;
1735 if (minlen && (dontbother < minlen))
1736 dontbother = minlen - 1;
1737 strend -= dontbother; /* this one's always in bytes! */
1738 /* We don't know much -- general case. */
1741 if (regtry(prog, s))
1750 if (regtry(prog, s))
1752 } while (s++ < strend);
1760 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1762 if (PL_reg_eval_set) {
1763 /* Preserve the current value of $^R */
1764 if (oreplsv != GvSV(PL_replgv))
1765 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1766 restored, the value remains
1768 restore_pos(aTHXo_ 0);
1771 /* make sure $`, $&, $', and $digit will work later */
1772 if ( !(flags & REXEC_NOT_FIRST) ) {
1773 if (RX_MATCH_COPIED(prog)) {
1774 Safefree(prog->subbeg);
1775 RX_MATCH_COPIED_off(prog);
1777 if (flags & REXEC_COPY_STR) {
1778 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1780 s = savepvn(strbeg, i);
1783 RX_MATCH_COPIED_on(prog);
1786 prog->subbeg = strbeg;
1787 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1794 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1795 PL_colors[4],PL_colors[5]));
1796 if (PL_reg_eval_set)
1797 restore_pos(aTHXo_ 0);
1802 - regtry - try match at specific point
1804 STATIC I32 /* 0 failure, 1 success */
1805 S_regtry(pTHX_ regexp *prog, char *startpos)
1813 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1815 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1818 PL_reg_eval_set = RS_init;
1820 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1821 (IV)(PL_stack_sp - PL_stack_base));
1823 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1824 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1825 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1827 /* Apparently this is not needed, judging by wantarray. */
1828 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1829 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1832 /* Make $_ available to executed code. */
1833 if (PL_reg_sv != DEFSV) {
1834 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1839 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1840 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1841 /* prepare for quick setting of pos */
1842 sv_magic(PL_reg_sv, (SV*)0,
1843 PERL_MAGIC_regex_global, Nullch, 0);
1844 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1848 PL_reg_oldpos = mg->mg_len;
1849 SAVEDESTRUCTOR_X(restore_pos, 0);
1852 Newz(22,PL_reg_curpm, 1, PMOP);
1853 PL_reg_curpm->op_pmregexp = prog;
1854 PL_reg_oldcurpm = PL_curpm;
1855 PL_curpm = PL_reg_curpm;
1856 if (RX_MATCH_COPIED(prog)) {
1857 /* Here is a serious problem: we cannot rewrite subbeg,
1858 since it may be needed if this match fails. Thus
1859 $` inside (?{}) could fail... */
1860 PL_reg_oldsaved = prog->subbeg;
1861 PL_reg_oldsavedlen = prog->sublen;
1862 RX_MATCH_COPIED_off(prog);
1865 PL_reg_oldsaved = Nullch;
1866 prog->subbeg = PL_bostr;
1867 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1869 prog->startp[0] = startpos - PL_bostr;
1870 PL_reginput = startpos;
1871 PL_regstartp = prog->startp;
1872 PL_regendp = prog->endp;
1873 PL_reglastparen = &prog->lastparen;
1874 prog->lastparen = 0;
1876 DEBUG_r(PL_reg_starttry = startpos);
1877 if (PL_reg_start_tmpl <= prog->nparens) {
1878 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1879 if(PL_reg_start_tmp)
1880 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1882 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1885 /* XXXX What this code is doing here?!!! There should be no need
1886 to do this again and again, PL_reglastparen should take care of
1889 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1890 * Actually, the code in regcppop() (which Ilya may be meaning by
1891 * PL_reglastparen), is not needed at all by the test suite
1892 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1893 * enough, for building DynaLoader, or otherwise this
1894 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1895 * will happen. Meanwhile, this code *is* needed for the
1896 * above-mentioned test suite tests to succeed. The common theme
1897 * on those tests seems to be returning null fields from matches.
1902 if (prog->nparens) {
1903 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1910 if (regmatch(prog->program + 1)) {
1911 prog->endp[0] = PL_reginput - PL_bostr;
1914 REGCP_UNWIND(lastcp);
1918 #define RE_UNWIND_BRANCH 1
1919 #define RE_UNWIND_BRANCHJ 2
1923 typedef struct { /* XX: makes sense to enlarge it... */
1927 } re_unwind_generic_t;
1940 } re_unwind_branch_t;
1942 typedef union re_unwind_t {
1944 re_unwind_generic_t generic;
1945 re_unwind_branch_t branch;
1949 - regmatch - main matching routine
1951 * Conceptually the strategy is simple: check to see whether the current
1952 * node matches, call self recursively to see whether the rest matches,
1953 * and then act accordingly. In practice we make some effort to avoid
1954 * recursion, in particular by going through "ordinary" nodes (that don't
1955 * need to know whether the rest of the match failed) by a loop instead of
1958 /* [lwall] I've hoisted the register declarations to the outer block in order to
1959 * maybe save a little bit of pushing and popping on the stack. It also takes
1960 * advantage of machines that use a register save mask on subroutine entry.
1962 STATIC I32 /* 0 failure, 1 success */
1963 S_regmatch(pTHX_ regnode *prog)
1965 register regnode *scan; /* Current node. */
1966 regnode *next; /* Next node. */
1967 regnode *inner; /* Next node in internal branch. */
1968 register I32 nextchr; /* renamed nextchr - nextchar colides with
1969 function of same name */
1970 register I32 n; /* no or next */
1971 register I32 ln = 0; /* len or last */
1972 register char *s = Nullch; /* operand or save */
1973 register char *locinput = PL_reginput;
1974 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
1975 int minmod = 0, sw = 0, logical = 0;
1978 I32 firstcp = PL_savestack_ix;
1980 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1986 /* Note that nextchr is a byte even in UTF */
1987 nextchr = UCHARAT(locinput);
1989 while (scan != NULL) {
1990 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1992 # define sayYES goto yes
1993 # define sayNO goto no
1994 # define sayYES_FINAL goto yes_final
1995 # define sayYES_LOUD goto yes_loud
1996 # define sayNO_FINAL goto no_final
1997 # define sayNO_SILENT goto do_no
1998 # define saySAME(x) if (x) goto yes; else goto no
1999 # define REPORT_CODE_OFF 24
2001 # define sayYES return 1
2002 # define sayNO return 0
2003 # define sayYES_FINAL return 1
2004 # define sayYES_LOUD return 1
2005 # define sayNO_FINAL return 0
2006 # define sayNO_SILENT return 0
2007 # define saySAME(x) return x
2010 SV *prop = sv_newmortal();
2011 int docolor = *PL_colors[0];
2012 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2013 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2014 /* The part of the string before starttry has one color
2015 (pref0_len chars), between starttry and current
2016 position another one (pref_len - pref0_len chars),
2017 after the current position the third one.
2018 We assume that pref0_len <= pref_len, otherwise we
2019 decrease pref0_len. */
2020 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2021 ? (5 + taill) - l : locinput - PL_bostr;
2024 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2026 pref0_len = pref_len - (locinput - PL_reg_starttry);
2027 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2028 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2029 ? (5 + taill) - pref_len : PL_regeol - locinput);
2030 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2034 if (pref0_len > pref_len)
2035 pref0_len = pref_len;
2036 regprop(prop, scan);
2037 PerlIO_printf(Perl_debug_log,
2038 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2039 (IV)(locinput - PL_bostr),
2040 PL_colors[4], pref0_len,
2041 locinput - pref_len, PL_colors[5],
2042 PL_colors[2], pref_len - pref0_len,
2043 locinput - pref_len + pref0_len, PL_colors[3],
2044 (docolor ? "" : "> <"),
2045 PL_colors[0], l, locinput, PL_colors[1],
2046 15 - l - pref_len + 1,
2048 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2052 next = scan + NEXT_OFF(scan);
2058 if (locinput == PL_bostr || (PL_multiline &&
2059 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2061 /* regtill = regbol; */
2066 if (locinput == PL_bostr ||
2067 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2073 if (locinput == PL_bostr)
2077 if (locinput == PL_reg_ganch)
2087 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2092 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2094 if (PL_regeol - locinput > 1)
2098 if (PL_regeol != locinput)
2102 if (!nextchr && locinput >= PL_regeol)
2104 nextchr = UCHARAT(++locinput);
2107 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2110 locinput += PL_utf8skip[nextchr];
2111 if (locinput > PL_regeol)
2113 nextchr = UCHARAT(locinput);
2116 nextchr = UCHARAT(++locinput);
2121 if (do_utf8 != (UTF!=0)) {
2129 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2138 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2144 nextchr = UCHARAT(locinput);
2147 /* Inline the first character, for speed. */
2148 if (UCHARAT(s) != nextchr)
2150 if (PL_regeol - locinput < ln)
2152 if (ln > 1 && memNE(s, locinput, ln))
2155 nextchr = UCHARAT(locinput);
2158 PL_reg_flags |= RF_tainted;
2168 c1 = OP(scan) == EXACTF;
2170 if (l >= PL_regeol) {
2173 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2174 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2176 s += UTF ? UTF8SKIP(s) : 1;
2180 nextchr = UCHARAT(locinput);
2184 /* Inline the first character, for speed. */
2185 if (UCHARAT(s) != nextchr &&
2186 UCHARAT(s) != ((OP(scan) == EXACTF)
2187 ? PL_fold : PL_fold_locale)[nextchr])
2189 if (PL_regeol - locinput < ln)
2191 if (ln > 1 && (OP(scan) == EXACTF
2192 ? ibcmp(s, locinput, ln)
2193 : ibcmp_locale(s, locinput, ln)))
2196 nextchr = UCHARAT(locinput);
2200 if (!reginclass(scan, (U8*)locinput, do_utf8))
2202 if (locinput >= PL_regeol)
2204 locinput += PL_utf8skip[nextchr];
2205 nextchr = UCHARAT(locinput);
2209 nextchr = UCHARAT(locinput);
2210 if (!reginclass(scan, (U8*)locinput, do_utf8))
2212 if (!nextchr && locinput >= PL_regeol)
2214 nextchr = UCHARAT(++locinput);
2218 PL_reg_flags |= RF_tainted;
2224 LOAD_UTF8_CHARCLASS(alnum,"a");
2225 if (!(OP(scan) == ALNUM
2226 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2227 : isALNUM_LC_utf8((U8*)locinput)))
2231 locinput += PL_utf8skip[nextchr];
2232 nextchr = UCHARAT(locinput);
2235 if (!(OP(scan) == ALNUM
2236 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2238 nextchr = UCHARAT(++locinput);
2241 PL_reg_flags |= RF_tainted;
2244 if (!nextchr && locinput >= PL_regeol)
2247 LOAD_UTF8_CHARCLASS(alnum,"a");
2248 if (OP(scan) == NALNUM
2249 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2250 : isALNUM_LC_utf8((U8*)locinput))
2254 locinput += PL_utf8skip[nextchr];
2255 nextchr = UCHARAT(locinput);
2258 if (OP(scan) == NALNUM
2259 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2261 nextchr = UCHARAT(++locinput);
2265 PL_reg_flags |= RF_tainted;
2269 /* was last char in word? */
2271 if (locinput == PL_bostr)
2274 U8 *r = reghop((U8*)locinput, -1);
2276 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2278 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2279 ln = isALNUM_uni(ln);
2280 LOAD_UTF8_CHARCLASS(alnum,"a");
2281 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2284 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2285 n = isALNUM_LC_utf8((U8*)locinput);
2289 ln = (locinput != PL_bostr) ?
2290 UCHARAT(locinput - 1) : '\n';
2291 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2293 n = isALNUM(nextchr);
2296 ln = isALNUM_LC(ln);
2297 n = isALNUM_LC(nextchr);
2300 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2301 OP(scan) == BOUNDL))
2305 PL_reg_flags |= RF_tainted;
2311 if (UTF8_IS_CONTINUED(nextchr)) {
2312 LOAD_UTF8_CHARCLASS(space," ");
2313 if (!(OP(scan) == SPACE
2314 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2315 : isSPACE_LC_utf8((U8*)locinput)))
2319 locinput += PL_utf8skip[nextchr];
2320 nextchr = UCHARAT(locinput);
2323 if (!(OP(scan) == SPACE
2324 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2326 nextchr = UCHARAT(++locinput);
2329 if (!(OP(scan) == SPACE
2330 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2332 nextchr = UCHARAT(++locinput);
2336 PL_reg_flags |= RF_tainted;
2339 if (!nextchr && locinput >= PL_regeol)
2342 LOAD_UTF8_CHARCLASS(space," ");
2343 if (OP(scan) == NSPACE
2344 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2345 : isSPACE_LC_utf8((U8*)locinput))
2349 locinput += PL_utf8skip[nextchr];
2350 nextchr = UCHARAT(locinput);
2353 if (OP(scan) == NSPACE
2354 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2356 nextchr = UCHARAT(++locinput);
2359 PL_reg_flags |= RF_tainted;
2365 LOAD_UTF8_CHARCLASS(digit,"0");
2366 if (!(OP(scan) == DIGIT
2367 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2368 : isDIGIT_LC_utf8((U8*)locinput)))
2372 locinput += PL_utf8skip[nextchr];
2373 nextchr = UCHARAT(locinput);
2376 if (!(OP(scan) == DIGIT
2377 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2379 nextchr = UCHARAT(++locinput);
2382 PL_reg_flags |= RF_tainted;
2385 if (!nextchr && locinput >= PL_regeol)
2388 LOAD_UTF8_CHARCLASS(digit,"0");
2389 if (OP(scan) == NDIGIT
2390 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2391 : isDIGIT_LC_utf8((U8*)locinput))
2395 locinput += PL_utf8skip[nextchr];
2396 nextchr = UCHARAT(locinput);
2399 if (OP(scan) == NDIGIT
2400 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2402 nextchr = UCHARAT(++locinput);
2405 LOAD_UTF8_CHARCLASS(mark,"~");
2406 if (locinput >= PL_regeol ||
2407 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2409 locinput += PL_utf8skip[nextchr];
2410 while (locinput < PL_regeol &&
2411 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2412 locinput += UTF8SKIP(locinput);
2413 if (locinput > PL_regeol)
2415 nextchr = UCHARAT(locinput);
2418 PL_reg_flags |= RF_tainted;
2422 n = ARG(scan); /* which paren pair */
2423 ln = PL_regstartp[n];
2424 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2425 if (*PL_reglastparen < n || ln == -1)
2426 sayNO; /* Do not match unless seen CLOSEn. */
2427 if (ln == PL_regendp[n])
2431 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2433 char *e = PL_bostr + PL_regendp[n];
2435 * Note that we can't do the "other character" lookup trick as
2436 * in the 8-bit case (no pun intended) because in Unicode we
2437 * have to map both upper and title case to lower case.
2439 if (OP(scan) == REFF) {
2443 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2453 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2460 nextchr = UCHARAT(locinput);
2464 /* Inline the first character, for speed. */
2465 if (UCHARAT(s) != nextchr &&
2467 (UCHARAT(s) != ((OP(scan) == REFF
2468 ? PL_fold : PL_fold_locale)[nextchr]))))
2470 ln = PL_regendp[n] - ln;
2471 if (locinput + ln > PL_regeol)
2473 if (ln > 1 && (OP(scan) == REF
2474 ? memNE(s, locinput, ln)
2476 ? ibcmp(s, locinput, ln)
2477 : ibcmp_locale(s, locinput, ln))))
2480 nextchr = UCHARAT(locinput);
2491 OP_4tree *oop = PL_op;
2492 COP *ocurcop = PL_curcop;
2493 SV **ocurpad = PL_curpad;
2497 PL_op = (OP_4tree*)PL_regdata->data[n];
2498 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2499 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2500 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2502 CALLRUNOPS(aTHX); /* Scalar context. */
2508 PL_curpad = ocurpad;
2509 PL_curcop = ocurcop;
2511 if (logical == 2) { /* Postponed subexpression. */
2513 MAGIC *mg = Null(MAGIC*);
2515 CHECKPOINT cp, lastcp;
2517 if(SvROK(ret) || SvRMAGICAL(ret)) {
2518 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2521 mg = mg_find(sv, PERL_MAGIC_qr);
2524 re = (regexp *)mg->mg_obj;
2525 (void)ReREFCNT_inc(re);
2529 char *t = SvPV(ret, len);
2531 char *oprecomp = PL_regprecomp;
2532 I32 osize = PL_regsize;
2533 I32 onpar = PL_regnpar;
2536 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2538 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2539 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2541 PL_regprecomp = oprecomp;
2546 PerlIO_printf(Perl_debug_log,
2547 "Entering embedded `%s%.60s%s%s'\n",
2551 (strlen(re->precomp) > 60 ? "..." : ""))
2554 state.prev = PL_reg_call_cc;
2555 state.cc = PL_regcc;
2556 state.re = PL_reg_re;
2560 cp = regcppush(0); /* Save *all* the positions. */
2563 state.ss = PL_savestack_ix;
2564 *PL_reglastparen = 0;
2565 PL_reg_call_cc = &state;
2566 PL_reginput = locinput;
2568 /* XXXX This is too dramatic a measure... */
2571 if (regmatch(re->program + 1)) {
2572 /* Even though we succeeded, we need to restore
2573 global variables, since we may be wrapped inside
2574 SUSPEND, thus the match may be not finished yet. */
2576 /* XXXX Do this only if SUSPENDed? */
2577 PL_reg_call_cc = state.prev;
2578 PL_regcc = state.cc;
2579 PL_reg_re = state.re;
2580 cache_re(PL_reg_re);
2582 /* XXXX This is too dramatic a measure... */
2585 /* These are needed even if not SUSPEND. */
2591 REGCP_UNWIND(lastcp);
2593 PL_reg_call_cc = state.prev;
2594 PL_regcc = state.cc;
2595 PL_reg_re = state.re;
2596 cache_re(PL_reg_re);
2598 /* XXXX This is too dramatic a measure... */
2607 sv_setsv(save_scalar(PL_replgv), ret);
2611 n = ARG(scan); /* which paren pair */
2612 PL_reg_start_tmp[n] = locinput;
2617 n = ARG(scan); /* which paren pair */
2618 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2619 PL_regendp[n] = locinput - PL_bostr;
2620 if (n > *PL_reglastparen)
2621 *PL_reglastparen = n;
2624 n = ARG(scan); /* which paren pair */
2625 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2628 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2630 next = NEXTOPER(NEXTOPER(scan));
2632 next = scan + ARG(scan);
2633 if (OP(next) == IFTHEN) /* Fake one. */
2634 next = NEXTOPER(NEXTOPER(next));
2638 logical = scan->flags;
2640 /*******************************************************************
2641 PL_regcc contains infoblock about the innermost (...)* loop, and
2642 a pointer to the next outer infoblock.
2644 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2646 1) After matching X, regnode for CURLYX is processed;
2648 2) This regnode creates infoblock on the stack, and calls
2649 regmatch() recursively with the starting point at WHILEM node;
2651 3) Each hit of WHILEM node tries to match A and Z (in the order
2652 depending on the current iteration, min/max of {min,max} and
2653 greediness). The information about where are nodes for "A"
2654 and "Z" is read from the infoblock, as is info on how many times "A"
2655 was already matched, and greediness.
2657 4) After A matches, the same WHILEM node is hit again.
2659 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2660 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2661 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2662 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2663 of the external loop.
2665 Currently present infoblocks form a tree with a stem formed by PL_curcc
2666 and whatever it mentions via ->next, and additional attached trees
2667 corresponding to temporarily unset infoblocks as in "5" above.
2669 In the following picture infoblocks for outer loop of
2670 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2671 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2672 infoblocks are drawn below the "reset" infoblock.
2674 In fact in the picture below we do not show failed matches for Z and T
2675 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2676 more obvious *why* one needs to *temporary* unset infoblocks.]
2678 Matched REx position InfoBlocks Comment
2682 Y A)*?Z)*?T x <- O <- I
2683 YA )*?Z)*?T x <- O <- I
2684 YA A)*?Z)*?T x <- O <- I
2685 YAA )*?Z)*?T x <- O <- I
2686 YAA Z)*?T x <- O # Temporary unset I
2689 YAAZ Y(A)*?Z)*?T x <- O
2692 YAAZY (A)*?Z)*?T x <- O
2695 YAAZY A)*?Z)*?T x <- O <- I
2698 YAAZYA )*?Z)*?T x <- O <- I
2701 YAAZYA Z)*?T x <- O # Temporary unset I
2707 YAAZYAZ T x # Temporary unset O
2714 *******************************************************************/
2717 CHECKPOINT cp = PL_savestack_ix;
2718 /* No need to save/restore up to this paren */
2719 I32 parenfloor = scan->flags;
2721 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2723 cc.oldcc = PL_regcc;
2725 /* XXXX Probably it is better to teach regpush to support
2726 parenfloor > PL_regsize... */
2727 if (parenfloor > *PL_reglastparen)
2728 parenfloor = *PL_reglastparen; /* Pessimization... */
2729 cc.parenfloor = parenfloor;
2731 cc.min = ARG1(scan);
2732 cc.max = ARG2(scan);
2733 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2737 PL_reginput = locinput;
2738 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2740 PL_regcc = cc.oldcc;
2746 * This is really hard to understand, because after we match
2747 * what we're trying to match, we must make sure the rest of
2748 * the REx is going to match for sure, and to do that we have
2749 * to go back UP the parse tree by recursing ever deeper. And
2750 * if it fails, we have to reset our parent's current state
2751 * that we can try again after backing off.
2754 CHECKPOINT cp, lastcp;
2755 CURCUR* cc = PL_regcc;
2756 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2758 n = cc->cur + 1; /* how many we know we matched */
2759 PL_reginput = locinput;
2762 PerlIO_printf(Perl_debug_log,
2763 "%*s %ld out of %ld..%ld cc=%lx\n",
2764 REPORT_CODE_OFF+PL_regindent*2, "",
2765 (long)n, (long)cc->min,
2766 (long)cc->max, (long)cc)
2769 /* If degenerate scan matches "", assume scan done. */
2771 if (locinput == cc->lastloc && n >= cc->min) {
2772 PL_regcc = cc->oldcc;
2776 PerlIO_printf(Perl_debug_log,
2777 "%*s empty match detected, try continuation...\n",
2778 REPORT_CODE_OFF+PL_regindent*2, "")
2780 if (regmatch(cc->next))
2788 /* First just match a string of min scans. */
2792 cc->lastloc = locinput;
2793 if (regmatch(cc->scan))
2796 cc->lastloc = lastloc;
2801 /* Check whether we already were at this position.
2802 Postpone detection until we know the match is not
2803 *that* much linear. */
2804 if (!PL_reg_maxiter) {
2805 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2806 PL_reg_leftiter = PL_reg_maxiter;
2808 if (PL_reg_leftiter-- == 0) {
2809 I32 size = (PL_reg_maxiter + 7)/8;
2810 if (PL_reg_poscache) {
2811 if (PL_reg_poscache_size < size) {
2812 Renew(PL_reg_poscache, size, char);
2813 PL_reg_poscache_size = size;
2815 Zero(PL_reg_poscache, size, char);
2818 PL_reg_poscache_size = size;
2819 Newz(29, PL_reg_poscache, size, char);
2822 PerlIO_printf(Perl_debug_log,
2823 "%sDetected a super-linear match, switching on caching%s...\n",
2824 PL_colors[4], PL_colors[5])
2827 if (PL_reg_leftiter < 0) {
2828 I32 o = locinput - PL_bostr, b;
2830 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2833 if (PL_reg_poscache[o] & (1<<b)) {
2835 PerlIO_printf(Perl_debug_log,
2836 "%*s already tried at this position...\n",
2837 REPORT_CODE_OFF+PL_regindent*2, "")
2841 PL_reg_poscache[o] |= (1<<b);
2845 /* Prefer next over scan for minimal matching. */
2848 PL_regcc = cc->oldcc;
2851 cp = regcppush(cc->parenfloor);
2853 if (regmatch(cc->next)) {
2855 sayYES; /* All done. */
2857 REGCP_UNWIND(lastcp);
2863 if (n >= cc->max) { /* Maximum greed exceeded? */
2864 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2865 && !(PL_reg_flags & RF_warned)) {
2866 PL_reg_flags |= RF_warned;
2867 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2868 "Complex regular subexpression recursion",
2875 PerlIO_printf(Perl_debug_log,
2876 "%*s trying longer...\n",
2877 REPORT_CODE_OFF+PL_regindent*2, "")
2879 /* Try scanning more and see if it helps. */
2880 PL_reginput = locinput;
2882 cc->lastloc = locinput;
2883 cp = regcppush(cc->parenfloor);
2885 if (regmatch(cc->scan)) {
2889 REGCP_UNWIND(lastcp);
2892 cc->lastloc = lastloc;
2896 /* Prefer scan over next for maximal matching. */
2898 if (n < cc->max) { /* More greed allowed? */
2899 cp = regcppush(cc->parenfloor);
2901 cc->lastloc = locinput;
2903 if (regmatch(cc->scan)) {
2907 REGCP_UNWIND(lastcp);
2908 regcppop(); /* Restore some previous $<digit>s? */
2909 PL_reginput = locinput;
2911 PerlIO_printf(Perl_debug_log,
2912 "%*s failed, try continuation...\n",
2913 REPORT_CODE_OFF+PL_regindent*2, "")
2916 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2917 && !(PL_reg_flags & RF_warned)) {
2918 PL_reg_flags |= RF_warned;
2919 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2920 "Complex regular subexpression recursion",
2924 /* Failed deeper matches of scan, so see if this one works. */
2925 PL_regcc = cc->oldcc;
2928 if (regmatch(cc->next))
2934 cc->lastloc = lastloc;
2939 next = scan + ARG(scan);
2942 inner = NEXTOPER(NEXTOPER(scan));
2945 inner = NEXTOPER(scan);
2949 if (OP(next) != c1) /* No choice. */
2950 next = inner; /* Avoid recursion. */
2952 I32 lastparen = *PL_reglastparen;
2954 re_unwind_branch_t *uw;
2956 /* Put unwinding data on stack */
2957 unwind1 = SSNEWt(1,re_unwind_branch_t);
2958 uw = SSPTRt(unwind1,re_unwind_branch_t);
2961 uw->type = ((c1 == BRANCH)
2963 : RE_UNWIND_BRANCHJ);
2964 uw->lastparen = lastparen;
2966 uw->locinput = locinput;
2967 uw->nextchr = nextchr;
2969 uw->regindent = ++PL_regindent;
2972 REGCP_SET(uw->lastcp);
2974 /* Now go into the first branch */
2987 /* We suppose that the next guy does not need
2988 backtracking: in particular, it is of constant length,
2989 and has no parenths to influence future backrefs. */
2990 ln = ARG1(scan); /* min to match */
2991 n = ARG2(scan); /* max to match */
2992 paren = scan->flags;
2994 if (paren > PL_regsize)
2996 if (paren > *PL_reglastparen)
2997 *PL_reglastparen = paren;
2999 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3001 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3002 PL_reginput = locinput;
3005 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3007 if (ln && l == 0 && n >= ln
3008 /* In fact, this is tricky. If paren, then the
3009 fact that we did/didnot match may influence
3010 future execution. */
3011 && !(paren && ln == 0))
3013 locinput = PL_reginput;
3014 if (PL_regkind[(U8)OP(next)] == EXACT) {
3015 c1 = (U8)*STRING(next);
3016 if (OP(next) == EXACTF)
3018 else if (OP(next) == EXACTFL)
3019 c2 = PL_fold_locale[c1];
3026 /* This may be improved if l == 0. */
3027 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3028 /* If it could work, try it. */
3030 UCHARAT(PL_reginput) == c1 ||
3031 UCHARAT(PL_reginput) == c2)
3035 PL_regstartp[paren] =
3036 HOPc(PL_reginput, -l) - PL_bostr;
3037 PL_regendp[paren] = PL_reginput - PL_bostr;
3040 PL_regendp[paren] = -1;
3044 REGCP_UNWIND(lastcp);
3046 /* Couldn't or didn't -- move forward. */
3047 PL_reginput = locinput;
3048 if (regrepeat_hard(scan, 1, &l)) {
3050 locinput = PL_reginput;
3057 n = regrepeat_hard(scan, n, &l);
3058 if (n != 0 && l == 0
3059 /* In fact, this is tricky. If paren, then the
3060 fact that we did/didnot match may influence
3061 future execution. */
3062 && !(paren && ln == 0))
3064 locinput = PL_reginput;
3066 PerlIO_printf(Perl_debug_log,
3067 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3068 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3072 if (PL_regkind[(U8)OP(next)] == EXACT) {
3073 c1 = (U8)*STRING(next);
3074 if (OP(next) == EXACTF)
3076 else if (OP(next) == EXACTFL)
3077 c2 = PL_fold_locale[c1];
3086 /* If it could work, try it. */
3088 UCHARAT(PL_reginput) == c1 ||
3089 UCHARAT(PL_reginput) == c2)
3092 PerlIO_printf(Perl_debug_log,
3093 "%*s trying tail with n=%"IVdf"...\n",
3094 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3098 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3099 PL_regendp[paren] = PL_reginput - PL_bostr;
3102 PL_regendp[paren] = -1;
3106 REGCP_UNWIND(lastcp);
3108 /* Couldn't or didn't -- back up. */
3110 locinput = HOPc(locinput, -l);
3111 PL_reginput = locinput;
3118 paren = scan->flags; /* Which paren to set */
3119 if (paren > PL_regsize)
3121 if (paren > *PL_reglastparen)
3122 *PL_reglastparen = paren;
3123 ln = ARG1(scan); /* min to match */
3124 n = ARG2(scan); /* max to match */
3125 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3129 ln = ARG1(scan); /* min to match */
3130 n = ARG2(scan); /* max to match */
3131 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3136 scan = NEXTOPER(scan);
3142 scan = NEXTOPER(scan);
3146 * Lookahead to avoid useless match attempts
3147 * when we know what character comes next.
3149 if (PL_regkind[(U8)OP(next)] == EXACT) {
3150 U8 *s = (U8*)STRING(next);
3153 if (OP(next) == EXACTF)
3155 else if (OP(next) == EXACTFL)
3156 c2 = PL_fold_locale[c1];
3159 if (OP(next) == EXACTF) {
3160 c1 = to_utf8_lower(s);
3161 c2 = to_utf8_upper(s);
3164 c2 = c1 = utf8_to_uvchr(s, NULL);
3170 PL_reginput = locinput;
3174 if (ln && regrepeat(scan, ln) < ln)
3176 locinput = PL_reginput;
3179 char *e; /* Should not check after this */
3180 char *old = locinput;
3182 if (n == REG_INFTY) {
3185 while (UTF8_IS_CONTINUATION(*(U8*)e))
3191 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3195 e = locinput + n - ln;
3201 /* Find place 'next' could work */
3204 while (locinput <= e && *locinput != c1)
3207 while (locinput <= e
3212 count = locinput - old;
3219 utf8_to_uvchr((U8*)locinput, &len) != c1;
3224 for (count = 0; locinput <= e; count++) {
3225 UV c = utf8_to_uvchr((U8*)locinput, &len);
3226 if (c == c1 || c == c2)
3234 /* PL_reginput == old now */
3235 if (locinput != old) {
3236 ln = 1; /* Did some */
3237 if (regrepeat(scan, count) < count)
3240 /* PL_reginput == locinput now */
3241 TRYPAREN(paren, ln, locinput);
3242 PL_reginput = locinput; /* Could be reset... */
3243 REGCP_UNWIND(lastcp);
3244 /* Couldn't or didn't -- move forward. */
3247 locinput += UTF8SKIP(locinput);
3253 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3257 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3259 c = UCHARAT(PL_reginput);
3260 /* If it could work, try it. */
3261 if (c == c1 || c == c2)
3263 TRYPAREN(paren, n, PL_reginput);
3264 REGCP_UNWIND(lastcp);
3267 /* If it could work, try it. */
3268 else if (c1 == -1000)
3270 TRYPAREN(paren, n, PL_reginput);
3271 REGCP_UNWIND(lastcp);
3273 /* Couldn't or didn't -- move forward. */
3274 PL_reginput = locinput;
3275 if (regrepeat(scan, 1)) {
3277 locinput = PL_reginput;
3285 n = regrepeat(scan, n);
3286 locinput = PL_reginput;
3287 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3288 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3289 ln = n; /* why back off? */
3290 /* ...because $ and \Z can match before *and* after
3291 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3292 We should back off by one in this case. */
3293 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3302 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3304 c = UCHARAT(PL_reginput);
3306 /* If it could work, try it. */
3307 if (c1 == -1000 || c == c1 || c == c2)
3309 TRYPAREN(paren, n, PL_reginput);
3310 REGCP_UNWIND(lastcp);
3312 /* Couldn't or didn't -- back up. */
3314 PL_reginput = locinput = HOPc(locinput, -1);
3322 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3324 c = UCHARAT(PL_reginput);
3326 /* If it could work, try it. */
3327 if (c1 == -1000 || c == c1 || c == c2)
3329 TRYPAREN(paren, n, PL_reginput);
3330 REGCP_UNWIND(lastcp);
3332 /* Couldn't or didn't -- back up. */
3334 PL_reginput = locinput = HOPc(locinput, -1);
3341 if (PL_reg_call_cc) {
3342 re_cc_state *cur_call_cc = PL_reg_call_cc;
3343 CURCUR *cctmp = PL_regcc;
3344 regexp *re = PL_reg_re;
3345 CHECKPOINT cp, lastcp;
3347 cp = regcppush(0); /* Save *all* the positions. */
3349 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3351 PL_reginput = locinput; /* Make position available to
3353 cache_re(PL_reg_call_cc->re);
3354 PL_regcc = PL_reg_call_cc->cc;
3355 PL_reg_call_cc = PL_reg_call_cc->prev;
3356 if (regmatch(cur_call_cc->node)) {
3357 PL_reg_call_cc = cur_call_cc;
3361 REGCP_UNWIND(lastcp);
3363 PL_reg_call_cc = cur_call_cc;
3369 PerlIO_printf(Perl_debug_log,
3370 "%*s continuation failed...\n",
3371 REPORT_CODE_OFF+PL_regindent*2, "")
3375 if (locinput < PL_regtill) {
3376 DEBUG_r(PerlIO_printf(Perl_debug_log,
3377 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3379 (long)(locinput - PL_reg_starttry),
3380 (long)(PL_regtill - PL_reg_starttry),
3382 sayNO_FINAL; /* Cannot match: too short. */
3384 PL_reginput = locinput; /* put where regtry can find it */
3385 sayYES_FINAL; /* Success! */
3387 PL_reginput = locinput; /* put where regtry can find it */
3388 sayYES_LOUD; /* Success! */
3391 PL_reginput = locinput;
3396 s = HOPBACKc(locinput, scan->flags);
3402 PL_reginput = locinput;
3407 s = HOPBACKc(locinput, scan->flags);
3413 PL_reginput = locinput;
3416 inner = NEXTOPER(NEXTOPER(scan));
3417 if (regmatch(inner) != n) {
3432 if (OP(scan) == SUSPEND) {
3433 locinput = PL_reginput;
3434 nextchr = UCHARAT(locinput);
3439 next = scan + ARG(scan);
3444 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3445 PTR2UV(scan), OP(scan));
3446 Perl_croak(aTHX_ "regexp memory corruption");
3453 * We get here only if there's trouble -- normally "case END" is
3454 * the terminating point.
3456 Perl_croak(aTHX_ "corrupted regexp pointers");
3462 PerlIO_printf(Perl_debug_log,
3463 "%*s %scould match...%s\n",
3464 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3468 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3469 PL_colors[4],PL_colors[5]));
3475 #if 0 /* Breaks $^R */
3483 PerlIO_printf(Perl_debug_log,
3484 "%*s %sfailed...%s\n",
3485 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3491 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3494 case RE_UNWIND_BRANCH:
3495 case RE_UNWIND_BRANCHJ:
3497 re_unwind_branch_t *uwb = &(uw->branch);
3498 I32 lastparen = uwb->lastparen;
3500 REGCP_UNWIND(uwb->lastcp);
3501 for (n = *PL_reglastparen; n > lastparen; n--)
3503 *PL_reglastparen = n;
3504 scan = next = uwb->next;
3506 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3507 ? BRANCH : BRANCHJ) ) { /* Failure */
3514 /* Have more choice yet. Reuse the same uwb. */
3516 if ((n = (uwb->type == RE_UNWIND_BRANCH
3517 ? NEXT_OFF(next) : ARG(next))))
3520 next = NULL; /* XXXX Needn't unwinding in this case... */
3522 next = NEXTOPER(scan);
3523 if (uwb->type == RE_UNWIND_BRANCHJ)
3524 next = NEXTOPER(next);
3525 locinput = uwb->locinput;
3526 nextchr = uwb->nextchr;
3528 PL_regindent = uwb->regindent;
3535 Perl_croak(aTHX_ "regexp unwind memory corruption");
3546 - regrepeat - repeatedly match something simple, report how many
3549 * [This routine now assumes that it will only match on things of length 1.
3550 * That was true before, but now we assume scan - reginput is the count,
3551 * rather than incrementing count on every character. [Er, except utf8.]]
3554 S_regrepeat(pTHX_ regnode *p, I32 max)
3556 register char *scan;
3558 register char *loceol = PL_regeol;
3559 register I32 hardcount = 0;
3560 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3563 if (max != REG_INFTY && max < loceol - scan)
3564 loceol = scan + max;
3569 while (scan < loceol && hardcount < max && *scan != '\n') {
3570 scan += UTF8SKIP(scan);
3574 while (scan < loceol && *scan != '\n')
3581 case EXACT: /* length of string is 1 */
3583 while (scan < loceol && UCHARAT(scan) == c)
3586 case EXACTF: /* length of string is 1 */
3588 while (scan < loceol &&
3589 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3592 case EXACTFL: /* length of string is 1 */
3593 PL_reg_flags |= RF_tainted;
3595 while (scan < loceol &&
3596 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3602 while (hardcount < max && scan < loceol &&
3603 reginclass(p, (U8*)scan, do_utf8)) {
3604 scan += UTF8SKIP(scan);
3608 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3615 LOAD_UTF8_CHARCLASS(alnum,"a");
3616 while (hardcount < max && scan < loceol &&
3617 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3618 scan += UTF8SKIP(scan);
3622 while (scan < loceol && isALNUM(*scan))
3627 PL_reg_flags |= RF_tainted;
3630 while (hardcount < max && scan < loceol &&
3631 isALNUM_LC_utf8((U8*)scan)) {
3632 scan += UTF8SKIP(scan);
3636 while (scan < loceol && isALNUM_LC(*scan))
3643 LOAD_UTF8_CHARCLASS(alnum,"a");
3644 while (hardcount < max && scan < loceol &&
3645 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3646 scan += UTF8SKIP(scan);
3650 while (scan < loceol && !isALNUM(*scan))
3655 PL_reg_flags |= RF_tainted;
3658 while (hardcount < max && scan < loceol &&
3659 !isALNUM_LC_utf8((U8*)scan)) {
3660 scan += UTF8SKIP(scan);
3664 while (scan < loceol && !isALNUM_LC(*scan))
3671 LOAD_UTF8_CHARCLASS(space," ");
3672 while (hardcount < max && scan < loceol &&
3674 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3675 scan += UTF8SKIP(scan);
3679 while (scan < loceol && isSPACE(*scan))
3684 PL_reg_flags |= RF_tainted;
3687 while (hardcount < max && scan < loceol &&
3688 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3689 scan += UTF8SKIP(scan);
3693 while (scan < loceol && isSPACE_LC(*scan))
3700 LOAD_UTF8_CHARCLASS(space," ");
3701 while (hardcount < max && scan < loceol &&
3703 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3704 scan += UTF8SKIP(scan);
3708 while (scan < loceol && !isSPACE(*scan))
3713 PL_reg_flags |= RF_tainted;
3716 while (hardcount < max && scan < loceol &&
3717 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3718 scan += UTF8SKIP(scan);
3722 while (scan < loceol && !isSPACE_LC(*scan))
3729 LOAD_UTF8_CHARCLASS(digit,"0");
3730 while (hardcount < max && scan < loceol &&
3731 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3732 scan += UTF8SKIP(scan);
3736 while (scan < loceol && isDIGIT(*scan))
3743 LOAD_UTF8_CHARCLASS(digit,"0");
3744 while (hardcount < max && scan < loceol &&
3745 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3746 scan += UTF8SKIP(scan);
3750 while (scan < loceol && !isDIGIT(*scan))
3754 default: /* Called on something of 0 width. */
3755 break; /* So match right here or not at all. */
3761 c = scan - PL_reginput;
3766 SV *prop = sv_newmortal();
3769 PerlIO_printf(Perl_debug_log,
3770 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3771 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3778 - regrepeat_hard - repeatedly match something, report total lenth and length
3780 * The repeater is supposed to have constant length.
3784 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3786 register char *scan = Nullch;
3787 register char *start;
3788 register char *loceol = PL_regeol;
3790 I32 count = 0, res = 1;
3795 start = PL_reginput;
3796 if (DO_UTF8(PL_reg_sv)) {
3797 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3800 while (start < PL_reginput) {
3802 start += UTF8SKIP(start);
3813 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3815 *lp = l = PL_reginput - start;
3816 if (max != REG_INFTY && l*max < loceol - scan)
3817 loceol = scan + l*max;
3830 - regclass_swash - prepare the utf8 swash
3834 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3839 if (PL_regdata && PL_regdata->count) {
3842 if (PL_regdata->what[n] == 's') {
3843 SV *rv = (SV*)PL_regdata->data[n];
3844 AV *av = (AV*)SvRV((SV*)rv);
3847 si = *av_fetch(av, 0, FALSE);
3848 a = av_fetch(av, 1, FALSE);
3852 else if (si && doinit) {
3853 sw = swash_init("utf8", "", si, 1, 0);
3854 (void)av_store(av, 1, sw);
3866 - reginclass - determine if a character falls into a character class
3870 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3872 char flags = ANYOF_FLAGS(n);
3877 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3879 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3880 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3881 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3884 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3887 SV *sw = regclass_swash(n, TRUE, 0);
3890 if (swash_fetch(sw, p, do_utf8))
3892 else if (flags & ANYOF_FOLD) {
3893 U8 tmpbuf[UTF8_MAXLEN+1];
3895 if (flags & ANYOF_LOCALE) {
3896 PL_reg_flags |= RF_tainted;
3897 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3900 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3901 if (swash_fetch(sw, tmpbuf, do_utf8))
3907 if (!match && c < 256) {
3908 if (ANYOF_BITMAP_TEST(n, c))
3910 else if (flags & ANYOF_FOLD) {
3913 if (flags & ANYOF_LOCALE) {
3914 PL_reg_flags |= RF_tainted;
3915 f = PL_fold_locale[c];
3919 if (f != c && ANYOF_BITMAP_TEST(n, f))
3923 if (!match && (flags & ANYOF_CLASS)) {
3924 PL_reg_flags |= RF_tainted;
3926 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3940 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3941 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3942 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3943 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3944 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3945 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3946 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3947 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3948 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3949 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3950 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3951 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3952 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3953 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3954 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3955 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3956 ) /* How's that for a conditional? */
3963 return (flags & ANYOF_INVERT) ? !match : match;
3967 S_reghop(pTHX_ U8 *s, I32 off)
3969 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3973 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3976 while (off-- && s < lim) {
3977 /* XXX could check well-formedness here */
3985 if (UTF8_IS_CONTINUED(*s)) {
3986 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3989 /* XXX could check well-formedness here */
3997 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3999 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4003 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4006 while (off-- && s < lim) {
4007 /* XXX could check well-formedness here */
4017 if (UTF8_IS_CONTINUED(*s)) {
4018 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4021 /* XXX could check well-formedness here */
4037 restore_pos(pTHXo_ void *arg)
4039 if (PL_reg_eval_set) {
4040 if (PL_reg_oldsaved) {
4041 PL_reg_re->subbeg = PL_reg_oldsaved;
4042 PL_reg_re->sublen = PL_reg_oldsavedlen;
4043 RX_MATCH_COPIED_on(PL_reg_re);
4045 PL_reg_magic->mg_len = PL_reg_oldpos;
4046 PL_reg_eval_set = 0;
4047 PL_curpm = PL_reg_oldcurpm;