5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define HOPBACK(pos, off) ( \
120 (UTF && DO_UTF8(PL_reg_sv)) \
121 ? reghopmaybe((U8*)pos, -off) \
122 : (pos - off >= PL_bostr) \
126 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
128 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
129 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
130 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
131 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
133 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
135 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
137 static void restore_pos(pTHXo_ void *arg);
140 S_regcppush(pTHX_ I32 parenfloor)
142 int retval = PL_savestack_ix;
143 #define REGCP_PAREN_ELEMS 4
144 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
147 if (paren_elems_to_push < 0)
148 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
150 #define REGCP_OTHER_ELEMS 5
151 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
152 for (p = PL_regsize; p > parenfloor; p--) {
153 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
154 SSPUSHINT(PL_regendp[p]);
155 SSPUSHINT(PL_regstartp[p]);
156 SSPUSHPTR(PL_reg_start_tmp[p]);
159 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
160 SSPUSHINT(PL_regsize);
161 SSPUSHINT(*PL_reglastparen);
162 SSPUSHPTR(PL_reginput);
163 #define REGCP_FRAME_ELEMS 2
164 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
165 * are needed for the regexp context stack bookkeeping. */
166 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
167 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
172 /* These are needed since we do not localize EVAL nodes: */
173 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
174 " Setting an EVAL scope, savestack=%"IVdf"\n", \
175 (IV)PL_savestack_ix)); cp = PL_savestack_ix
177 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
178 PerlIO_printf(Perl_debug_log, \
179 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
180 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
190 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
192 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
193 i = SSPOPINT; /* Parentheses elements to pop. */
194 input = (char *) SSPOPPTR;
195 *PL_reglastparen = SSPOPINT;
196 PL_regsize = SSPOPINT;
198 /* Now restore the parentheses context. */
199 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
200 i > 0; i -= REGCP_PAREN_ELEMS) {
201 paren = (U32)SSPOPINT;
202 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
203 PL_regstartp[paren] = SSPOPINT;
205 if (paren <= *PL_reglastparen)
206 PL_regendp[paren] = tmps;
208 PerlIO_printf(Perl_debug_log,
209 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
210 (UV)paren, (IV)PL_regstartp[paren],
211 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
212 (IV)PL_regendp[paren],
213 (paren > *PL_reglastparen ? "(no)" : ""));
217 if (*PL_reglastparen + 1 <= PL_regnpar) {
218 PerlIO_printf(Perl_debug_log,
219 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
220 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
224 /* It would seem that the similar code in regtry()
225 * already takes care of this, and in fact it is in
226 * a better location to since this code can #if 0-ed out
227 * but the code in regtry() is needed or otherwise tests
228 * requiring null fields (pat.t#187 and split.t#{13,14}
229 * (as of patchlevel 7877) will fail. Then again,
230 * this code seems to be necessary or otherwise
231 * building DynaLoader will fail:
232 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
234 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
235 if (paren > PL_regsize)
236 PL_regstartp[paren] = -1;
237 PL_regendp[paren] = -1;
244 S_regcp_set_to(pTHX_ I32 ss)
246 I32 tmp = PL_savestack_ix;
248 PL_savestack_ix = ss;
250 PL_savestack_ix = tmp;
254 typedef struct re_cc_state
258 struct re_cc_state *prev;
263 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
265 #define TRYPAREN(paren, n, input) { \
268 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
269 PL_regendp[paren] = input - PL_bostr; \
272 PL_regendp[paren] = -1; \
274 if (regmatch(next)) \
277 PL_regendp[paren] = -1; \
282 * pregexec and friends
286 - pregexec - match a regexp against a string
289 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
290 char *strbeg, I32 minend, SV *screamer, U32 nosave)
291 /* strend: pointer to null at end of string */
292 /* strbeg: real beginning of string */
293 /* minend: end of match must be >=minend after stringarg. */
294 /* nosave: For optimizations. */
297 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
298 nosave ? 0 : REXEC_COPY_STR);
302 S_cache_re(pTHX_ regexp *prog)
304 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
306 PL_regprogram = prog->program;
308 PL_regnpar = prog->nparens;
309 PL_regdata = prog->data;
314 * Need to implement the following flags for reg_anch:
316 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
318 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
319 * INTUIT_AUTORITATIVE_ML
320 * INTUIT_ONCE_NOML - Intuit can match in one location only.
323 * Another flag for this function: SECOND_TIME (so that float substrs
324 * with giant delta may be not rechecked).
327 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
329 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
330 Otherwise, only SvCUR(sv) is used to get strbeg. */
332 /* XXXX We assume that strpos is strbeg unless sv. */
334 /* XXXX Some places assume that there is a fixed substring.
335 An update may be needed if optimizer marks as "INTUITable"
336 RExen without fixed substrings. Similarly, it is assumed that
337 lengths of all the strings are no more than minlen, thus they
338 cannot come from lookahead.
339 (Or minlen should take into account lookahead.) */
341 /* A failure to find a constant substring means that there is no need to make
342 an expensive call to REx engine, thus we celebrate a failure. Similarly,
343 finding a substring too deep into the string means that less calls to
344 regtry() should be needed.
346 REx compiler's optimizer found 4 possible hints:
347 a) Anchored substring;
349 c) Whether we are anchored (beginning-of-line or \G);
350 d) First node (of those at offset 0) which may distingush positions;
351 We use a)b)d) and multiline-part of c), and try to find a position in the
352 string which does not contradict any of them.
355 /* Most of decisions we do here should have been done at compile time.
356 The nodes of the REx which we used for the search should have been
357 deleted from the finite automaton. */
360 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
361 char *strend, U32 flags, re_scream_pos_data *data)
363 register I32 start_shift = 0;
364 /* Should be nonnegative! */
365 register I32 end_shift = 0;
371 register char *other_last = Nullch; /* other substr checked before this */
372 char *check_at = Nullch; /* check substr found at this pos */
374 char *i_strpos = strpos;
377 DEBUG_r( if (!PL_colorset) reginitcolors() );
378 DEBUG_r(PerlIO_printf(Perl_debug_log,
379 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
380 PL_colors[4],PL_colors[5],PL_colors[0],
383 (strlen(prog->precomp) > 60 ? "..." : ""),
385 (int)(strend - strpos > 60 ? 60 : strend - strpos),
386 strpos, PL_colors[1],
387 (strend - strpos > 60 ? "..." : ""))
390 if (prog->reganch & ROPT_UTF8)
391 PL_reg_flags |= RF_utf8;
393 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
394 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
397 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
399 check = prog->check_substr;
400 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
401 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
402 || ( (prog->reganch & ROPT_ANCH_BOL)
403 && !PL_multiline ) ); /* Check after \n? */
406 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
407 | ROPT_IMPLICIT)) /* not a real BOL */
408 /* SvCUR is not set on references: SvRV and SvPVX overlap */
410 && (strpos != strbeg)) {
411 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
414 if (prog->check_offset_min == prog->check_offset_max &&
415 !(prog->reganch & ROPT_SANY_SEEN)) {
416 /* Substring at constant offset from beg-of-str... */
419 s = HOP3c(strpos, prog->check_offset_min, strend);
421 slen = SvCUR(check); /* >= 1 */
423 if ( strend - s > slen || strend - s < slen - 1
424 || (strend - s == slen && strend[-1] != '\n')) {
425 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
428 /* Now should match s[0..slen-2] */
430 if (slen && (*SvPVX(check) != *s
432 && memNE(SvPVX(check), s, slen)))) {
434 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
438 else if (*SvPVX(check) != *s
439 || ((slen = SvCUR(check)) > 1
440 && memNE(SvPVX(check), s, slen)))
442 goto success_at_start;
445 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
447 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
448 end_shift = prog->minlen - start_shift -
449 CHR_SVLEN(check) + (SvTAIL(check) != 0);
451 I32 end = prog->check_offset_max + CHR_SVLEN(check)
452 - (SvTAIL(check) != 0);
453 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
455 if (end_shift < eshift)
459 else { /* Can match at random position */
462 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
463 /* Should be nonnegative! */
464 end_shift = prog->minlen - start_shift -
465 CHR_SVLEN(check) + (SvTAIL(check) != 0);
468 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
470 Perl_croak(aTHX_ "panic: end_shift");
474 /* Find a possible match in the region s..strend by looking for
475 the "check" substring in the region corrected by start/end_shift. */
476 if (flags & REXEC_SCREAM) {
477 I32 p = -1; /* Internal iterator of scream. */
478 I32 *pp = data ? data->scream_pos : &p;
480 if (PL_screamfirst[BmRARE(check)] >= 0
481 || ( BmRARE(check) == '\n'
482 && (BmPREVIOUS(check) == SvCUR(check) - 1)
484 s = screaminstr(sv, check,
485 start_shift + (s - strbeg), end_shift, pp, 0);
489 *data->scream_olds = s;
491 else if (prog->reganch & ROPT_SANY_SEEN)
492 s = fbm_instr((U8*)(s + start_shift),
493 (U8*)(strend - end_shift),
494 check, PL_multiline ? FBMrf_MULTILINE : 0);
496 s = fbm_instr(HOP3(s, start_shift, strend),
497 HOP3(strend, -end_shift, strbeg),
498 check, PL_multiline ? FBMrf_MULTILINE : 0);
500 /* Update the count-of-usability, remove useless subpatterns,
503 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
504 (s ? "Found" : "Did not find"),
505 ((check == prog->anchored_substr) ? "anchored" : "floating"),
507 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
509 PL_colors[1], (SvTAIL(check) ? "$" : ""),
510 (s ? " at offset " : "...\n") ) );
517 /* Finish the diagnostic message */
518 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
520 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
521 Start with the other substr.
522 XXXX no SCREAM optimization yet - and a very coarse implementation
523 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
524 *always* match. Probably should be marked during compile...
525 Probably it is right to do no SCREAM here...
528 if (prog->float_substr && prog->anchored_substr) {
529 /* Take into account the "other" substring. */
530 /* XXXX May be hopelessly wrong for UTF... */
533 if (check == prog->float_substr) {
536 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
539 t = s - prog->check_offset_max;
540 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
541 && (!(prog->reganch & ROPT_UTF8)
542 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
547 t = HOP3c(t, prog->anchored_offset, strend);
548 if (t < other_last) /* These positions already checked */
550 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
553 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
554 /* On end-of-str: see comment below. */
555 s = fbm_instr((unsigned char*)t,
556 HOP3(HOP3(last1, prog->anchored_offset, strend)
557 + SvCUR(prog->anchored_substr),
558 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
559 prog->anchored_substr,
560 PL_multiline ? FBMrf_MULTILINE : 0);
561 DEBUG_r(PerlIO_printf(Perl_debug_log,
562 "%s anchored substr `%s%.*s%s'%s",
563 (s ? "Found" : "Contradicts"),
565 (int)(SvCUR(prog->anchored_substr)
566 - (SvTAIL(prog->anchored_substr)!=0)),
567 SvPVX(prog->anchored_substr),
568 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
570 if (last1 >= last2) {
571 DEBUG_r(PerlIO_printf(Perl_debug_log,
572 ", giving up...\n"));
575 DEBUG_r(PerlIO_printf(Perl_debug_log,
576 ", trying floating at offset %ld...\n",
577 (long)(HOP3c(s1, 1, strend) - i_strpos)));
578 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
579 s = HOP3c(last, 1, strend);
583 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
584 (long)(s - i_strpos)));
585 t = HOP3c(s, -prog->anchored_offset, strbeg);
586 other_last = HOP3c(s, 1, strend);
594 else { /* Take into account the floating substring. */
598 t = HOP3c(s, -start_shift, strbeg);
600 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
601 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
602 last = HOP3c(t, prog->float_max_offset, strend);
603 s = HOP3c(t, prog->float_min_offset, strend);
606 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
607 /* fbm_instr() takes into account exact value of end-of-str
608 if the check is SvTAIL(ed). Since false positives are OK,
609 and end-of-str is not later than strend we are OK. */
610 s = fbm_instr((unsigned char*)s,
611 (unsigned char*)last + SvCUR(prog->float_substr)
612 - (SvTAIL(prog->float_substr)!=0),
613 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
614 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
615 (s ? "Found" : "Contradicts"),
617 (int)(SvCUR(prog->float_substr)
618 - (SvTAIL(prog->float_substr)!=0)),
619 SvPVX(prog->float_substr),
620 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
623 DEBUG_r(PerlIO_printf(Perl_debug_log,
624 ", giving up...\n"));
627 DEBUG_r(PerlIO_printf(Perl_debug_log,
628 ", trying anchored starting at offset %ld...\n",
629 (long)(s1 + 1 - i_strpos)));
631 s = HOP3c(t, 1, strend);
635 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
636 (long)(s - i_strpos)));
637 other_last = s; /* Fix this later. --Hugo */
646 t = s - prog->check_offset_max;
647 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
648 && (!(prog->reganch & ROPT_UTF8)
649 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
651 /* Fixed substring is found far enough so that the match
652 cannot start at strpos. */
654 if (ml_anch && t[-1] != '\n') {
655 /* Eventually fbm_*() should handle this, but often
656 anchored_offset is not 0, so this check will not be wasted. */
657 /* XXXX In the code below we prefer to look for "^" even in
658 presence of anchored substrings. And we search even
659 beyond the found float position. These pessimizations
660 are historical artefacts only. */
662 while (t < strend - prog->minlen) {
664 if (t < check_at - prog->check_offset_min) {
665 if (prog->anchored_substr) {
666 /* Since we moved from the found position,
667 we definitely contradict the found anchored
668 substr. Due to the above check we do not
669 contradict "check" substr.
670 Thus we can arrive here only if check substr
671 is float. Redo checking for "other"=="fixed".
674 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
675 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
676 goto do_other_anchored;
678 /* We don't contradict the found floating substring. */
679 /* XXXX Why not check for STCLASS? */
681 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
682 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
685 /* Position contradicts check-string */
686 /* XXXX probably better to look for check-string
687 than for "\n", so one should lower the limit for t? */
688 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
689 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
690 other_last = strpos = s = t + 1;
695 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
696 PL_colors[0],PL_colors[1]));
700 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
701 PL_colors[0],PL_colors[1]));
705 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
708 /* The found string does not prohibit matching at strpos,
709 - no optimization of calling REx engine can be performed,
710 unless it was an MBOL and we are not after MBOL,
711 or a future STCLASS check will fail this. */
713 /* Even in this situation we may use MBOL flag if strpos is offset
714 wrt the start of the string. */
715 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
716 && (strpos != strbeg) && strpos[-1] != '\n'
717 /* May be due to an implicit anchor of m{.*foo} */
718 && !(prog->reganch & ROPT_IMPLICIT))
723 DEBUG_r( if (ml_anch)
724 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
725 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
728 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
729 && prog->check_substr /* Could be deleted already */
730 && --BmUSEFUL(prog->check_substr) < 0
731 && prog->check_substr == prog->float_substr)
733 /* If flags & SOMETHING - do not do it many times on the same match */
734 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
735 SvREFCNT_dec(prog->check_substr);
736 prog->check_substr = Nullsv; /* disable */
737 prog->float_substr = Nullsv; /* clear */
738 check = Nullsv; /* abort */
740 /* XXXX This is a remnant of the old implementation. It
741 looks wasteful, since now INTUIT can use many
743 prog->reganch &= ~RE_USE_INTUIT;
750 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
751 if (prog->regstclass) {
752 /* minlen == 0 is possible if regstclass is \b or \B,
753 and the fixed substr is ''$.
754 Since minlen is already taken into account, s+1 is before strend;
755 accidentally, minlen >= 1 guaranties no false positives at s + 1
756 even for \b or \B. But (minlen? 1 : 0) below assumes that
757 regstclass does not come from lookahead... */
758 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
759 This leaves EXACTF only, which is dealt with in find_byclass(). */
760 U8* str = (U8*)STRING(prog->regstclass);
761 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
762 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
764 char *endpos = (prog->anchored_substr || ml_anch)
765 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
766 : (prog->float_substr
767 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
770 char *startpos = strbeg;
773 if (prog->reganch & ROPT_UTF8) {
774 PL_regdata = prog->data;
777 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
782 if (endpos == strend) {
783 DEBUG_r( PerlIO_printf(Perl_debug_log,
784 "Could not match STCLASS...\n") );
787 DEBUG_r( PerlIO_printf(Perl_debug_log,
788 "This position contradicts STCLASS...\n") );
789 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
791 /* Contradict one of substrings */
792 if (prog->anchored_substr) {
793 if (prog->anchored_substr == check) {
794 DEBUG_r( what = "anchored" );
796 s = HOP3c(t, 1, strend);
797 if (s + start_shift + end_shift > strend) {
798 /* XXXX Should be taken into account earlier? */
799 DEBUG_r( PerlIO_printf(Perl_debug_log,
800 "Could not match STCLASS...\n") );
805 DEBUG_r( PerlIO_printf(Perl_debug_log,
806 "Looking for %s substr starting at offset %ld...\n",
807 what, (long)(s + start_shift - i_strpos)) );
810 /* Have both, check_string is floating */
811 if (t + start_shift >= check_at) /* Contradicts floating=check */
812 goto retry_floating_check;
813 /* Recheck anchored substring, but not floating... */
817 DEBUG_r( PerlIO_printf(Perl_debug_log,
818 "Looking for anchored substr starting at offset %ld...\n",
819 (long)(other_last - i_strpos)) );
820 goto do_other_anchored;
822 /* Another way we could have checked stclass at the
823 current position only: */
828 DEBUG_r( PerlIO_printf(Perl_debug_log,
829 "Looking for /%s^%s/m starting at offset %ld...\n",
830 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
833 if (!prog->float_substr) /* Could have been deleted */
835 /* Check is floating subtring. */
836 retry_floating_check:
837 t = check_at - start_shift;
838 DEBUG_r( what = "floating" );
839 goto hop_and_restart;
842 DEBUG_r(PerlIO_printf(Perl_debug_log,
843 "By STCLASS: moving %ld --> %ld\n",
844 (long)(t - i_strpos), (long)(s - i_strpos))
848 DEBUG_r(PerlIO_printf(Perl_debug_log,
849 "Does not contradict STCLASS...\n");
854 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
855 PL_colors[4], (check ? "Guessed" : "Giving up"),
856 PL_colors[5], (long)(s - i_strpos)) );
859 fail_finish: /* Substring not found */
860 if (prog->check_substr) /* could be removed already */
861 BmUSEFUL(prog->check_substr) += 5; /* hooray */
863 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
864 PL_colors[4],PL_colors[5]));
868 /* We know what class REx starts with. Try to find this position... */
870 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
872 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
878 register I32 tmp = 1; /* Scratch variable? */
879 register bool do_utf8 = DO_UTF8(PL_reg_sv);
881 /* We know what class it must start with. */
885 if (reginclass(c, (U8*)s, do_utf8)) {
886 if (tmp && (norun || regtry(prog, s)))
893 s += do_utf8 ? UTF8SKIP(s) : 1;
900 c1 = to_utf8_lower((U8*)m);
901 c2 = to_utf8_upper((U8*)m);
912 c2 = PL_fold_locale[c1];
917 e = s; /* Due to minlen logic of intuit() */
923 if ( utf8_to_uvchr((U8*)s, &len) == c1
930 UV c = utf8_to_uvchr((U8*)s, &len);
931 if ( (c == c1 || c == c2) && regtry(prog, s) )
940 && (ln == 1 || !(OP(c) == EXACTF
942 : ibcmp_locale(s, m, ln)))
943 && (norun || regtry(prog, s)) )
949 if ( (*(U8*)s == c1 || *(U8*)s == c2)
950 && (ln == 1 || !(OP(c) == EXACTF
952 : ibcmp_locale(s, m, ln)))
953 && (norun || regtry(prog, s)) )
960 PL_reg_flags |= RF_tainted;
967 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
969 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
971 tmp = ((OP(c) == BOUND ?
972 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
973 LOAD_UTF8_CHARCLASS(alnum,"a");
975 if (tmp == !(OP(c) == BOUND ?
976 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
977 isALNUM_LC_utf8((U8*)s)))
980 if ((norun || regtry(prog, s)))
987 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
988 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
991 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
993 if ((norun || regtry(prog, s)))
999 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1003 PL_reg_flags |= RF_tainted;
1010 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1012 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1014 tmp = ((OP(c) == NBOUND ?
1015 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1016 LOAD_UTF8_CHARCLASS(alnum,"a");
1017 while (s < strend) {
1018 if (tmp == !(OP(c) == NBOUND ?
1019 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1020 isALNUM_LC_utf8((U8*)s)))
1022 else if ((norun || regtry(prog, s)))
1028 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1029 tmp = ((OP(c) == NBOUND ?
1030 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1031 while (s < strend) {
1033 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1035 else if ((norun || regtry(prog, s)))
1040 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1045 LOAD_UTF8_CHARCLASS(alnum,"a");
1046 while (s < strend) {
1047 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1048 if (tmp && (norun || regtry(prog, s)))
1059 while (s < strend) {
1061 if (tmp && (norun || regtry(prog, s)))
1073 PL_reg_flags |= RF_tainted;
1075 while (s < strend) {
1076 if (isALNUM_LC_utf8((U8*)s)) {
1077 if (tmp && (norun || regtry(prog, s)))
1088 while (s < strend) {
1089 if (isALNUM_LC(*s)) {
1090 if (tmp && (norun || regtry(prog, s)))
1103 LOAD_UTF8_CHARCLASS(alnum,"a");
1104 while (s < strend) {
1105 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1106 if (tmp && (norun || regtry(prog, s)))
1117 while (s < strend) {
1119 if (tmp && (norun || regtry(prog, s)))
1131 PL_reg_flags |= RF_tainted;
1133 while (s < strend) {
1134 if (!isALNUM_LC_utf8((U8*)s)) {
1135 if (tmp && (norun || regtry(prog, s)))
1146 while (s < strend) {
1147 if (!isALNUM_LC(*s)) {
1148 if (tmp && (norun || regtry(prog, s)))
1161 LOAD_UTF8_CHARCLASS(space," ");
1162 while (s < strend) {
1163 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1164 if (tmp && (norun || regtry(prog, s)))
1175 while (s < strend) {
1177 if (tmp && (norun || regtry(prog, s)))
1189 PL_reg_flags |= RF_tainted;
1191 while (s < strend) {
1192 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1193 if (tmp && (norun || regtry(prog, s)))
1204 while (s < strend) {
1205 if (isSPACE_LC(*s)) {
1206 if (tmp && (norun || regtry(prog, s)))
1219 LOAD_UTF8_CHARCLASS(space," ");
1220 while (s < strend) {
1221 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1222 if (tmp && (norun || regtry(prog, s)))
1233 while (s < strend) {
1235 if (tmp && (norun || regtry(prog, s)))
1247 PL_reg_flags |= RF_tainted;
1249 while (s < strend) {
1250 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1251 if (tmp && (norun || regtry(prog, s)))
1262 while (s < strend) {
1263 if (!isSPACE_LC(*s)) {
1264 if (tmp && (norun || regtry(prog, s)))
1277 LOAD_UTF8_CHARCLASS(digit,"0");
1278 while (s < strend) {
1279 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1280 if (tmp && (norun || regtry(prog, s)))
1291 while (s < strend) {
1293 if (tmp && (norun || regtry(prog, s)))
1305 PL_reg_flags |= RF_tainted;
1307 while (s < strend) {
1308 if (isDIGIT_LC_utf8((U8*)s)) {
1309 if (tmp && (norun || regtry(prog, s)))
1320 while (s < strend) {
1321 if (isDIGIT_LC(*s)) {
1322 if (tmp && (norun || regtry(prog, s)))
1335 LOAD_UTF8_CHARCLASS(digit,"0");
1336 while (s < strend) {
1337 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1338 if (tmp && (norun || regtry(prog, s)))
1349 while (s < strend) {
1351 if (tmp && (norun || regtry(prog, s)))
1363 PL_reg_flags |= RF_tainted;
1365 while (s < strend) {
1366 if (!isDIGIT_LC_utf8((U8*)s)) {
1367 if (tmp && (norun || regtry(prog, s)))
1378 while (s < strend) {
1379 if (!isDIGIT_LC(*s)) {
1380 if (tmp && (norun || regtry(prog, s)))
1392 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1401 - regexec_flags - match a regexp against a string
1404 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1405 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1406 /* strend: pointer to null at end of string */
1407 /* strbeg: real beginning of string */
1408 /* minend: end of match must be >=minend after stringarg. */
1409 /* data: May be used for some additional optimizations. */
1410 /* nosave: For optimizations. */
1413 register regnode *c;
1414 register char *startpos = stringarg;
1415 I32 minlen; /* must match at least this many chars */
1416 I32 dontbother = 0; /* how many characters not to try at end */
1417 /* I32 start_shift = 0; */ /* Offset of the start to find
1418 constant substr. */ /* CC */
1419 I32 end_shift = 0; /* Same for the end. */ /* CC */
1420 I32 scream_pos = -1; /* Internal iterator of scream. */
1422 SV* oreplsv = GvSV(PL_replgv);
1423 bool do_utf8 = DO_UTF8(sv);
1429 PL_regnarrate = DEBUG_r_TEST;
1432 /* Be paranoid... */
1433 if (prog == NULL || startpos == NULL) {
1434 Perl_croak(aTHX_ "NULL regexp parameter");
1438 minlen = prog->minlen;
1440 if (!(prog->reganch & ROPT_SANY_SEEN))
1441 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1444 if (strend - startpos < minlen) goto phooey;
1447 /* Check validity of program. */
1448 if (UCHARAT(prog->program) != REG_MAGIC) {
1449 Perl_croak(aTHX_ "corrupted regexp program");
1453 PL_reg_eval_set = 0;
1456 if (prog->reganch & ROPT_UTF8)
1457 PL_reg_flags |= RF_utf8;
1459 /* Mark beginning of line for ^ and lookbehind. */
1460 PL_regbol = startpos;
1464 /* Mark end of line for $ (and such) */
1467 /* see how far we have to get to not match where we matched before */
1468 PL_regtill = startpos+minend;
1470 /* We start without call_cc context. */
1473 /* If there is a "must appear" string, look for it. */
1476 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1479 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1480 PL_reg_ganch = startpos;
1481 else if (sv && SvTYPE(sv) >= SVt_PVMG
1483 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1484 && mg->mg_len >= 0) {
1485 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1486 if (prog->reganch & ROPT_ANCH_GPOS) {
1487 if (s > PL_reg_ganch)
1492 else /* pos() not defined */
1493 PL_reg_ganch = strbeg;
1496 if (do_utf8 == (UTF!=0) &&
1497 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1498 re_scream_pos_data d;
1500 d.scream_olds = &scream_olds;
1501 d.scream_pos = &scream_pos;
1502 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1504 goto phooey; /* not present */
1507 DEBUG_r( if (!PL_colorset) reginitcolors() );
1508 DEBUG_r(PerlIO_printf(Perl_debug_log,
1509 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1510 PL_colors[4],PL_colors[5],PL_colors[0],
1513 (strlen(prog->precomp) > 60 ? "..." : ""),
1515 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1516 startpos, PL_colors[1],
1517 (strend - startpos > 60 ? "..." : ""))
1520 /* Simplest case: anchored match need be tried only once. */
1521 /* [unless only anchor is BOL and multiline is set] */
1522 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1523 if (s == startpos && regtry(prog, startpos))
1525 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1526 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1531 dontbother = minlen - 1;
1532 end = HOP3c(strend, -dontbother, strbeg) - 1;
1533 /* for multiline we only have to try after newlines */
1534 if (prog->check_substr) {
1538 if (regtry(prog, s))
1543 if (prog->reganch & RE_USE_INTUIT) {
1544 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1555 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1556 if (regtry(prog, s))
1563 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1564 if (regtry(prog, PL_reg_ganch))
1569 /* Messy cases: unanchored match. */
1570 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1571 /* we have /x+whatever/ */
1572 /* it must be a one character string (XXXX Except UTF?) */
1573 char ch = SvPVX(prog->anchored_substr)[0];
1579 while (s < strend) {
1581 DEBUG_r( did_match = 1 );
1582 if (regtry(prog, s)) goto got_it;
1584 while (s < strend && *s == ch)
1591 while (s < strend) {
1593 DEBUG_r( did_match = 1 );
1594 if (regtry(prog, s)) goto got_it;
1596 while (s < strend && *s == ch)
1602 DEBUG_r(if (!did_match)
1603 PerlIO_printf(Perl_debug_log,
1604 "Did not find anchored character...\n")
1608 else if (do_utf8 == (UTF!=0) &&
1609 (prog->anchored_substr != Nullsv
1610 || (prog->float_substr != Nullsv
1611 && prog->float_max_offset < strend - s))) {
1612 SV *must = prog->anchored_substr
1613 ? prog->anchored_substr : prog->float_substr;
1615 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1617 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1618 char *last = HOP3c(strend, /* Cannot start after this */
1619 -(I32)(CHR_SVLEN(must)
1620 - (SvTAIL(must) != 0) + back_min), strbeg);
1621 char *last1; /* Last position checked before */
1627 last1 = HOPc(s, -1);
1629 last1 = s - 1; /* bogus */
1631 /* XXXX check_substr already used to find `s', can optimize if
1632 check_substr==must. */
1634 dontbother = end_shift;
1635 strend = HOPc(strend, -dontbother);
1636 while ( (s <= last) &&
1637 ((flags & REXEC_SCREAM)
1638 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1639 end_shift, &scream_pos, 0))
1640 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1641 (unsigned char*)strend, must,
1642 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1643 DEBUG_r( did_match = 1 );
1644 if (HOPc(s, -back_max) > last1) {
1645 last1 = HOPc(s, -back_min);
1646 s = HOPc(s, -back_max);
1649 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1651 last1 = HOPc(s, -back_min);
1655 while (s <= last1) {
1656 if (regtry(prog, s))
1662 while (s <= last1) {
1663 if (regtry(prog, s))
1669 DEBUG_r(if (!did_match)
1670 PerlIO_printf(Perl_debug_log,
1671 "Did not find %s substr `%s%.*s%s'%s...\n",
1672 ((must == prog->anchored_substr)
1673 ? "anchored" : "floating"),
1675 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1677 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1681 else if ((c = prog->regstclass)) {
1682 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1683 /* don't bother with what can't match */
1684 strend = HOPc(strend, -(minlen - 1));
1686 SV *prop = sv_newmortal();
1688 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1690 if (find_byclass(prog, c, s, strend, startpos, 0))
1692 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1696 if (prog->float_substr != Nullsv) { /* Trim the end. */
1699 if (flags & REXEC_SCREAM) {
1700 last = screaminstr(sv, prog->float_substr, s - strbeg,
1701 end_shift, &scream_pos, 1); /* last one */
1703 last = scream_olds; /* Only one occurrence. */
1707 char *little = SvPV(prog->float_substr, len);
1709 if (SvTAIL(prog->float_substr)) {
1710 if (memEQ(strend - len + 1, little, len - 1))
1711 last = strend - len + 1;
1712 else if (!PL_multiline)
1713 last = memEQ(strend - len, little, len)
1714 ? strend - len : Nullch;
1720 last = rninstr(s, strend, little, little + len);
1722 last = strend; /* matching `$' */
1726 DEBUG_r(PerlIO_printf(Perl_debug_log,
1727 "%sCan't trim the tail, match fails (should not happen)%s\n",
1728 PL_colors[4],PL_colors[5]));
1729 goto phooey; /* Should not happen! */
1731 dontbother = strend - last + prog->float_min_offset;
1733 if (minlen && (dontbother < minlen))
1734 dontbother = minlen - 1;
1735 strend -= dontbother; /* this one's always in bytes! */
1736 /* We don't know much -- general case. */
1739 if (regtry(prog, s))
1748 if (regtry(prog, s))
1750 } while (s++ < strend);
1758 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1760 if (PL_reg_eval_set) {
1761 /* Preserve the current value of $^R */
1762 if (oreplsv != GvSV(PL_replgv))
1763 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1764 restored, the value remains
1766 restore_pos(aTHXo_ 0);
1769 /* make sure $`, $&, $', and $digit will work later */
1770 if ( !(flags & REXEC_NOT_FIRST) ) {
1771 if (RX_MATCH_COPIED(prog)) {
1772 Safefree(prog->subbeg);
1773 RX_MATCH_COPIED_off(prog);
1775 if (flags & REXEC_COPY_STR) {
1776 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1778 s = savepvn(strbeg, i);
1781 RX_MATCH_COPIED_on(prog);
1784 prog->subbeg = strbeg;
1785 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1792 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1793 PL_colors[4],PL_colors[5]));
1794 if (PL_reg_eval_set)
1795 restore_pos(aTHXo_ 0);
1800 - regtry - try match at specific point
1802 STATIC I32 /* 0 failure, 1 success */
1803 S_regtry(pTHX_ regexp *prog, char *startpos)
1811 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1813 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1816 PL_reg_eval_set = RS_init;
1818 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1819 (IV)(PL_stack_sp - PL_stack_base));
1821 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1822 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1823 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1825 /* Apparently this is not needed, judging by wantarray. */
1826 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1827 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1830 /* Make $_ available to executed code. */
1831 if (PL_reg_sv != DEFSV) {
1832 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1837 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1838 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1839 /* prepare for quick setting of pos */
1840 sv_magic(PL_reg_sv, (SV*)0,
1841 PERL_MAGIC_regex_global, Nullch, 0);
1842 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1846 PL_reg_oldpos = mg->mg_len;
1847 SAVEDESTRUCTOR_X(restore_pos, 0);
1850 Newz(22,PL_reg_curpm, 1, PMOP);
1851 PL_reg_curpm->op_pmregexp = prog;
1852 PL_reg_oldcurpm = PL_curpm;
1853 PL_curpm = PL_reg_curpm;
1854 if (RX_MATCH_COPIED(prog)) {
1855 /* Here is a serious problem: we cannot rewrite subbeg,
1856 since it may be needed if this match fails. Thus
1857 $` inside (?{}) could fail... */
1858 PL_reg_oldsaved = prog->subbeg;
1859 PL_reg_oldsavedlen = prog->sublen;
1860 RX_MATCH_COPIED_off(prog);
1863 PL_reg_oldsaved = Nullch;
1864 prog->subbeg = PL_bostr;
1865 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1867 prog->startp[0] = startpos - PL_bostr;
1868 PL_reginput = startpos;
1869 PL_regstartp = prog->startp;
1870 PL_regendp = prog->endp;
1871 PL_reglastparen = &prog->lastparen;
1872 prog->lastparen = 0;
1874 DEBUG_r(PL_reg_starttry = startpos);
1875 if (PL_reg_start_tmpl <= prog->nparens) {
1876 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1877 if(PL_reg_start_tmp)
1878 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1880 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1883 /* XXXX What this code is doing here?!!! There should be no need
1884 to do this again and again, PL_reglastparen should take care of
1887 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1888 * Actually, the code in regcppop() (which Ilya may be meaning by
1889 * PL_reglastparen), is not needed at all by the test suite
1890 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1891 * enough, for building DynaLoader, or otherwise this
1892 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1893 * will happen. Meanwhile, this code *is* needed for the
1894 * above-mentioned test suite tests to succeed. The common theme
1895 * on those tests seems to be returning null fields from matches.
1900 if (prog->nparens) {
1901 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1908 if (regmatch(prog->program + 1)) {
1909 prog->endp[0] = PL_reginput - PL_bostr;
1912 REGCP_UNWIND(lastcp);
1916 #define RE_UNWIND_BRANCH 1
1917 #define RE_UNWIND_BRANCHJ 2
1921 typedef struct { /* XX: makes sense to enlarge it... */
1925 } re_unwind_generic_t;
1938 } re_unwind_branch_t;
1940 typedef union re_unwind_t {
1942 re_unwind_generic_t generic;
1943 re_unwind_branch_t branch;
1947 - regmatch - main matching routine
1949 * Conceptually the strategy is simple: check to see whether the current
1950 * node matches, call self recursively to see whether the rest matches,
1951 * and then act accordingly. In practice we make some effort to avoid
1952 * recursion, in particular by going through "ordinary" nodes (that don't
1953 * need to know whether the rest of the match failed) by a loop instead of
1956 /* [lwall] I've hoisted the register declarations to the outer block in order to
1957 * maybe save a little bit of pushing and popping on the stack. It also takes
1958 * advantage of machines that use a register save mask on subroutine entry.
1960 STATIC I32 /* 0 failure, 1 success */
1961 S_regmatch(pTHX_ regnode *prog)
1963 register regnode *scan; /* Current node. */
1964 regnode *next; /* Next node. */
1965 regnode *inner; /* Next node in internal branch. */
1966 register I32 nextchr; /* renamed nextchr - nextchar colides with
1967 function of same name */
1968 register I32 n; /* no or next */
1969 register I32 ln = 0; /* len or last */
1970 register char *s = Nullch; /* operand or save */
1971 register char *locinput = PL_reginput;
1972 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
1973 int minmod = 0, sw = 0, logical = 0;
1976 I32 firstcp = PL_savestack_ix;
1978 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1984 /* Note that nextchr is a byte even in UTF */
1985 nextchr = UCHARAT(locinput);
1987 while (scan != NULL) {
1988 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1990 # define sayYES goto yes
1991 # define sayNO goto no
1992 # define sayYES_FINAL goto yes_final
1993 # define sayYES_LOUD goto yes_loud
1994 # define sayNO_FINAL goto no_final
1995 # define sayNO_SILENT goto do_no
1996 # define saySAME(x) if (x) goto yes; else goto no
1997 # define REPORT_CODE_OFF 24
1999 # define sayYES return 1
2000 # define sayNO return 0
2001 # define sayYES_FINAL return 1
2002 # define sayYES_LOUD return 1
2003 # define sayNO_FINAL return 0
2004 # define sayNO_SILENT return 0
2005 # define saySAME(x) return x
2008 SV *prop = sv_newmortal();
2009 int docolor = *PL_colors[0];
2010 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2011 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2012 /* The part of the string before starttry has one color
2013 (pref0_len chars), between starttry and current
2014 position another one (pref_len - pref0_len chars),
2015 after the current position the third one.
2016 We assume that pref0_len <= pref_len, otherwise we
2017 decrease pref0_len. */
2018 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2019 ? (5 + taill) - l : locinput - PL_bostr;
2022 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2024 pref0_len = pref_len - (locinput - PL_reg_starttry);
2025 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2026 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2027 ? (5 + taill) - pref_len : PL_regeol - locinput);
2028 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2032 if (pref0_len > pref_len)
2033 pref0_len = pref_len;
2034 regprop(prop, scan);
2035 PerlIO_printf(Perl_debug_log,
2036 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2037 (IV)(locinput - PL_bostr),
2038 PL_colors[4], pref0_len,
2039 locinput - pref_len, PL_colors[5],
2040 PL_colors[2], pref_len - pref0_len,
2041 locinput - pref_len + pref0_len, PL_colors[3],
2042 (docolor ? "" : "> <"),
2043 PL_colors[0], l, locinput, PL_colors[1],
2044 15 - l - pref_len + 1,
2046 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2050 next = scan + NEXT_OFF(scan);
2056 if (locinput == PL_bostr || (PL_multiline &&
2057 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2059 /* regtill = regbol; */
2064 if (locinput == PL_bostr ||
2065 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2071 if (locinput == PL_bostr)
2075 if (locinput == PL_reg_ganch)
2085 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2090 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2092 if (PL_regeol - locinput > 1)
2096 if (PL_regeol != locinput)
2100 if (!nextchr && locinput >= PL_regeol)
2102 nextchr = UCHARAT(++locinput);
2105 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2108 locinput += PL_utf8skip[nextchr];
2109 if (locinput > PL_regeol)
2111 nextchr = UCHARAT(locinput);
2114 nextchr = UCHARAT(++locinput);
2119 if (do_utf8 != (UTF!=0)) {
2127 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2136 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2142 nextchr = UCHARAT(locinput);
2145 /* Inline the first character, for speed. */
2146 if (UCHARAT(s) != nextchr)
2148 if (PL_regeol - locinput < ln)
2150 if (ln > 1 && memNE(s, locinput, ln))
2153 nextchr = UCHARAT(locinput);
2156 PL_reg_flags |= RF_tainted;
2166 c1 = OP(scan) == EXACTF;
2168 if (l >= PL_regeol) {
2171 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2172 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2174 s += UTF ? UTF8SKIP(s) : 1;
2178 nextchr = UCHARAT(locinput);
2182 /* Inline the first character, for speed. */
2183 if (UCHARAT(s) != nextchr &&
2184 UCHARAT(s) != ((OP(scan) == EXACTF)
2185 ? PL_fold : PL_fold_locale)[nextchr])
2187 if (PL_regeol - locinput < ln)
2189 if (ln > 1 && (OP(scan) == EXACTF
2190 ? ibcmp(s, locinput, ln)
2191 : ibcmp_locale(s, locinput, ln)))
2194 nextchr = UCHARAT(locinput);
2198 if (!reginclass(scan, (U8*)locinput, do_utf8))
2200 if (locinput >= PL_regeol)
2202 locinput += PL_utf8skip[nextchr];
2203 nextchr = UCHARAT(locinput);
2207 nextchr = UCHARAT(locinput);
2208 if (!reginclass(scan, (U8*)locinput, do_utf8))
2210 if (!nextchr && locinput >= PL_regeol)
2212 nextchr = UCHARAT(++locinput);
2216 PL_reg_flags |= RF_tainted;
2222 LOAD_UTF8_CHARCLASS(alnum,"a");
2223 if (!(OP(scan) == ALNUM
2224 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2225 : isALNUM_LC_utf8((U8*)locinput)))
2229 locinput += PL_utf8skip[nextchr];
2230 nextchr = UCHARAT(locinput);
2233 if (!(OP(scan) == ALNUM
2234 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2236 nextchr = UCHARAT(++locinput);
2239 PL_reg_flags |= RF_tainted;
2242 if (!nextchr && locinput >= PL_regeol)
2245 LOAD_UTF8_CHARCLASS(alnum,"a");
2246 if (OP(scan) == NALNUM
2247 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2248 : isALNUM_LC_utf8((U8*)locinput))
2252 locinput += PL_utf8skip[nextchr];
2253 nextchr = UCHARAT(locinput);
2256 if (OP(scan) == NALNUM
2257 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2259 nextchr = UCHARAT(++locinput);
2263 PL_reg_flags |= RF_tainted;
2267 /* was last char in word? */
2269 if (locinput == PL_bostr)
2272 U8 *r = reghop((U8*)locinput, -1);
2274 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2276 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2277 ln = isALNUM_uni(ln);
2278 LOAD_UTF8_CHARCLASS(alnum,"a");
2279 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2282 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2283 n = isALNUM_LC_utf8((U8*)locinput);
2287 ln = (locinput != PL_bostr) ?
2288 UCHARAT(locinput - 1) : '\n';
2289 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2291 n = isALNUM(nextchr);
2294 ln = isALNUM_LC(ln);
2295 n = isALNUM_LC(nextchr);
2298 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2299 OP(scan) == BOUNDL))
2303 PL_reg_flags |= RF_tainted;
2309 if (UTF8_IS_CONTINUED(nextchr)) {
2310 LOAD_UTF8_CHARCLASS(space," ");
2311 if (!(OP(scan) == SPACE
2312 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2313 : isSPACE_LC_utf8((U8*)locinput)))
2317 locinput += PL_utf8skip[nextchr];
2318 nextchr = UCHARAT(locinput);
2321 if (!(OP(scan) == SPACE
2322 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2324 nextchr = UCHARAT(++locinput);
2327 if (!(OP(scan) == SPACE
2328 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2330 nextchr = UCHARAT(++locinput);
2334 PL_reg_flags |= RF_tainted;
2337 if (!nextchr && locinput >= PL_regeol)
2340 LOAD_UTF8_CHARCLASS(space," ");
2341 if (OP(scan) == NSPACE
2342 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2343 : isSPACE_LC_utf8((U8*)locinput))
2347 locinput += PL_utf8skip[nextchr];
2348 nextchr = UCHARAT(locinput);
2351 if (OP(scan) == NSPACE
2352 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2354 nextchr = UCHARAT(++locinput);
2357 PL_reg_flags |= RF_tainted;
2363 LOAD_UTF8_CHARCLASS(digit,"0");
2364 if (!(OP(scan) == DIGIT
2365 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2366 : isDIGIT_LC_utf8((U8*)locinput)))
2370 locinput += PL_utf8skip[nextchr];
2371 nextchr = UCHARAT(locinput);
2374 if (!(OP(scan) == DIGIT
2375 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2377 nextchr = UCHARAT(++locinput);
2380 PL_reg_flags |= RF_tainted;
2383 if (!nextchr && locinput >= PL_regeol)
2386 LOAD_UTF8_CHARCLASS(digit,"0");
2387 if (OP(scan) == NDIGIT
2388 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2389 : isDIGIT_LC_utf8((U8*)locinput))
2393 locinput += PL_utf8skip[nextchr];
2394 nextchr = UCHARAT(locinput);
2397 if (OP(scan) == NDIGIT
2398 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2400 nextchr = UCHARAT(++locinput);
2403 LOAD_UTF8_CHARCLASS(mark,"~");
2404 if (locinput >= PL_regeol ||
2405 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2407 locinput += PL_utf8skip[nextchr];
2408 while (locinput < PL_regeol &&
2409 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2410 locinput += UTF8SKIP(locinput);
2411 if (locinput > PL_regeol)
2413 nextchr = UCHARAT(locinput);
2416 PL_reg_flags |= RF_tainted;
2420 n = ARG(scan); /* which paren pair */
2421 ln = PL_regstartp[n];
2422 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2423 if (*PL_reglastparen < n || ln == -1)
2424 sayNO; /* Do not match unless seen CLOSEn. */
2425 if (ln == PL_regendp[n])
2429 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2431 char *e = PL_bostr + PL_regendp[n];
2433 * Note that we can't do the "other character" lookup trick as
2434 * in the 8-bit case (no pun intended) because in Unicode we
2435 * have to map both upper and title case to lower case.
2437 if (OP(scan) == REFF) {
2441 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2451 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2458 nextchr = UCHARAT(locinput);
2462 /* Inline the first character, for speed. */
2463 if (UCHARAT(s) != nextchr &&
2465 (UCHARAT(s) != ((OP(scan) == REFF
2466 ? PL_fold : PL_fold_locale)[nextchr]))))
2468 ln = PL_regendp[n] - ln;
2469 if (locinput + ln > PL_regeol)
2471 if (ln > 1 && (OP(scan) == REF
2472 ? memNE(s, locinput, ln)
2474 ? ibcmp(s, locinput, ln)
2475 : ibcmp_locale(s, locinput, ln))))
2478 nextchr = UCHARAT(locinput);
2489 OP_4tree *oop = PL_op;
2490 COP *ocurcop = PL_curcop;
2491 SV **ocurpad = PL_curpad;
2495 PL_op = (OP_4tree*)PL_regdata->data[n];
2496 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2497 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2498 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2500 CALLRUNOPS(aTHX); /* Scalar context. */
2506 PL_curpad = ocurpad;
2507 PL_curcop = ocurcop;
2509 if (logical == 2) { /* Postponed subexpression. */
2511 MAGIC *mg = Null(MAGIC*);
2513 CHECKPOINT cp, lastcp;
2515 if(SvROK(ret) || SvRMAGICAL(ret)) {
2516 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2519 mg = mg_find(sv, PERL_MAGIC_qr);
2522 re = (regexp *)mg->mg_obj;
2523 (void)ReREFCNT_inc(re);
2527 char *t = SvPV(ret, len);
2529 char *oprecomp = PL_regprecomp;
2530 I32 osize = PL_regsize;
2531 I32 onpar = PL_regnpar;
2534 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2536 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2537 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2539 PL_regprecomp = oprecomp;
2544 PerlIO_printf(Perl_debug_log,
2545 "Entering embedded `%s%.60s%s%s'\n",
2549 (strlen(re->precomp) > 60 ? "..." : ""))
2552 state.prev = PL_reg_call_cc;
2553 state.cc = PL_regcc;
2554 state.re = PL_reg_re;
2558 cp = regcppush(0); /* Save *all* the positions. */
2561 state.ss = PL_savestack_ix;
2562 *PL_reglastparen = 0;
2563 PL_reg_call_cc = &state;
2564 PL_reginput = locinput;
2566 /* XXXX This is too dramatic a measure... */
2569 if (regmatch(re->program + 1)) {
2570 /* Even though we succeeded, we need to restore
2571 global variables, since we may be wrapped inside
2572 SUSPEND, thus the match may be not finished yet. */
2574 /* XXXX Do this only if SUSPENDed? */
2575 PL_reg_call_cc = state.prev;
2576 PL_regcc = state.cc;
2577 PL_reg_re = state.re;
2578 cache_re(PL_reg_re);
2580 /* XXXX This is too dramatic a measure... */
2583 /* These are needed even if not SUSPEND. */
2589 REGCP_UNWIND(lastcp);
2591 PL_reg_call_cc = state.prev;
2592 PL_regcc = state.cc;
2593 PL_reg_re = state.re;
2594 cache_re(PL_reg_re);
2596 /* XXXX This is too dramatic a measure... */
2605 sv_setsv(save_scalar(PL_replgv), ret);
2609 n = ARG(scan); /* which paren pair */
2610 PL_reg_start_tmp[n] = locinput;
2615 n = ARG(scan); /* which paren pair */
2616 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2617 PL_regendp[n] = locinput - PL_bostr;
2618 if (n > *PL_reglastparen)
2619 *PL_reglastparen = n;
2622 n = ARG(scan); /* which paren pair */
2623 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2626 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2628 next = NEXTOPER(NEXTOPER(scan));
2630 next = scan + ARG(scan);
2631 if (OP(next) == IFTHEN) /* Fake one. */
2632 next = NEXTOPER(NEXTOPER(next));
2636 logical = scan->flags;
2638 /*******************************************************************
2639 PL_regcc contains infoblock about the innermost (...)* loop, and
2640 a pointer to the next outer infoblock.
2642 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2644 1) After matching X, regnode for CURLYX is processed;
2646 2) This regnode creates infoblock on the stack, and calls
2647 regmatch() recursively with the starting point at WHILEM node;
2649 3) Each hit of WHILEM node tries to match A and Z (in the order
2650 depending on the current iteration, min/max of {min,max} and
2651 greediness). The information about where are nodes for "A"
2652 and "Z" is read from the infoblock, as is info on how many times "A"
2653 was already matched, and greediness.
2655 4) After A matches, the same WHILEM node is hit again.
2657 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2658 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2659 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2660 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2661 of the external loop.
2663 Currently present infoblocks form a tree with a stem formed by PL_curcc
2664 and whatever it mentions via ->next, and additional attached trees
2665 corresponding to temporarily unset infoblocks as in "5" above.
2667 In the following picture infoblocks for outer loop of
2668 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2669 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2670 infoblocks are drawn below the "reset" infoblock.
2672 In fact in the picture below we do not show failed matches for Z and T
2673 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2674 more obvious *why* one needs to *temporary* unset infoblocks.]
2676 Matched REx position InfoBlocks Comment
2680 Y A)*?Z)*?T x <- O <- I
2681 YA )*?Z)*?T x <- O <- I
2682 YA A)*?Z)*?T x <- O <- I
2683 YAA )*?Z)*?T x <- O <- I
2684 YAA Z)*?T x <- O # Temporary unset I
2687 YAAZ Y(A)*?Z)*?T x <- O
2690 YAAZY (A)*?Z)*?T x <- O
2693 YAAZY A)*?Z)*?T x <- O <- I
2696 YAAZYA )*?Z)*?T x <- O <- I
2699 YAAZYA Z)*?T x <- O # Temporary unset I
2705 YAAZYAZ T x # Temporary unset O
2712 *******************************************************************/
2715 CHECKPOINT cp = PL_savestack_ix;
2716 /* No need to save/restore up to this paren */
2717 I32 parenfloor = scan->flags;
2719 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2721 cc.oldcc = PL_regcc;
2723 /* XXXX Probably it is better to teach regpush to support
2724 parenfloor > PL_regsize... */
2725 if (parenfloor > *PL_reglastparen)
2726 parenfloor = *PL_reglastparen; /* Pessimization... */
2727 cc.parenfloor = parenfloor;
2729 cc.min = ARG1(scan);
2730 cc.max = ARG2(scan);
2731 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2735 PL_reginput = locinput;
2736 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2738 PL_regcc = cc.oldcc;
2744 * This is really hard to understand, because after we match
2745 * what we're trying to match, we must make sure the rest of
2746 * the REx is going to match for sure, and to do that we have
2747 * to go back UP the parse tree by recursing ever deeper. And
2748 * if it fails, we have to reset our parent's current state
2749 * that we can try again after backing off.
2752 CHECKPOINT cp, lastcp;
2753 CURCUR* cc = PL_regcc;
2754 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2756 n = cc->cur + 1; /* how many we know we matched */
2757 PL_reginput = locinput;
2760 PerlIO_printf(Perl_debug_log,
2761 "%*s %ld out of %ld..%ld cc=%lx\n",
2762 REPORT_CODE_OFF+PL_regindent*2, "",
2763 (long)n, (long)cc->min,
2764 (long)cc->max, (long)cc)
2767 /* If degenerate scan matches "", assume scan done. */
2769 if (locinput == cc->lastloc && n >= cc->min) {
2770 PL_regcc = cc->oldcc;
2774 PerlIO_printf(Perl_debug_log,
2775 "%*s empty match detected, try continuation...\n",
2776 REPORT_CODE_OFF+PL_regindent*2, "")
2778 if (regmatch(cc->next))
2786 /* First just match a string of min scans. */
2790 cc->lastloc = locinput;
2791 if (regmatch(cc->scan))
2794 cc->lastloc = lastloc;
2799 /* Check whether we already were at this position.
2800 Postpone detection until we know the match is not
2801 *that* much linear. */
2802 if (!PL_reg_maxiter) {
2803 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2804 PL_reg_leftiter = PL_reg_maxiter;
2806 if (PL_reg_leftiter-- == 0) {
2807 I32 size = (PL_reg_maxiter + 7)/8;
2808 if (PL_reg_poscache) {
2809 if (PL_reg_poscache_size < size) {
2810 Renew(PL_reg_poscache, size, char);
2811 PL_reg_poscache_size = size;
2813 Zero(PL_reg_poscache, size, char);
2816 PL_reg_poscache_size = size;
2817 Newz(29, PL_reg_poscache, size, char);
2820 PerlIO_printf(Perl_debug_log,
2821 "%sDetected a super-linear match, switching on caching%s...\n",
2822 PL_colors[4], PL_colors[5])
2825 if (PL_reg_leftiter < 0) {
2826 I32 o = locinput - PL_bostr, b;
2828 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2831 if (PL_reg_poscache[o] & (1<<b)) {
2833 PerlIO_printf(Perl_debug_log,
2834 "%*s already tried at this position...\n",
2835 REPORT_CODE_OFF+PL_regindent*2, "")
2839 PL_reg_poscache[o] |= (1<<b);
2843 /* Prefer next over scan for minimal matching. */
2846 PL_regcc = cc->oldcc;
2849 cp = regcppush(cc->parenfloor);
2851 if (regmatch(cc->next)) {
2853 sayYES; /* All done. */
2855 REGCP_UNWIND(lastcp);
2861 if (n >= cc->max) { /* Maximum greed exceeded? */
2862 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2863 && !(PL_reg_flags & RF_warned)) {
2864 PL_reg_flags |= RF_warned;
2865 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2866 "Complex regular subexpression recursion",
2873 PerlIO_printf(Perl_debug_log,
2874 "%*s trying longer...\n",
2875 REPORT_CODE_OFF+PL_regindent*2, "")
2877 /* Try scanning more and see if it helps. */
2878 PL_reginput = locinput;
2880 cc->lastloc = locinput;
2881 cp = regcppush(cc->parenfloor);
2883 if (regmatch(cc->scan)) {
2887 REGCP_UNWIND(lastcp);
2890 cc->lastloc = lastloc;
2894 /* Prefer scan over next for maximal matching. */
2896 if (n < cc->max) { /* More greed allowed? */
2897 cp = regcppush(cc->parenfloor);
2899 cc->lastloc = locinput;
2901 if (regmatch(cc->scan)) {
2905 REGCP_UNWIND(lastcp);
2906 regcppop(); /* Restore some previous $<digit>s? */
2907 PL_reginput = locinput;
2909 PerlIO_printf(Perl_debug_log,
2910 "%*s failed, try continuation...\n",
2911 REPORT_CODE_OFF+PL_regindent*2, "")
2914 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2915 && !(PL_reg_flags & RF_warned)) {
2916 PL_reg_flags |= RF_warned;
2917 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2918 "Complex regular subexpression recursion",
2922 /* Failed deeper matches of scan, so see if this one works. */
2923 PL_regcc = cc->oldcc;
2926 if (regmatch(cc->next))
2932 cc->lastloc = lastloc;
2937 next = scan + ARG(scan);
2940 inner = NEXTOPER(NEXTOPER(scan));
2943 inner = NEXTOPER(scan);
2947 if (OP(next) != c1) /* No choice. */
2948 next = inner; /* Avoid recursion. */
2950 I32 lastparen = *PL_reglastparen;
2952 re_unwind_branch_t *uw;
2954 /* Put unwinding data on stack */
2955 unwind1 = SSNEWt(1,re_unwind_branch_t);
2956 uw = SSPTRt(unwind1,re_unwind_branch_t);
2959 uw->type = ((c1 == BRANCH)
2961 : RE_UNWIND_BRANCHJ);
2962 uw->lastparen = lastparen;
2964 uw->locinput = locinput;
2965 uw->nextchr = nextchr;
2967 uw->regindent = ++PL_regindent;
2970 REGCP_SET(uw->lastcp);
2972 /* Now go into the first branch */
2985 /* We suppose that the next guy does not need
2986 backtracking: in particular, it is of constant length,
2987 and has no parenths to influence future backrefs. */
2988 ln = ARG1(scan); /* min to match */
2989 n = ARG2(scan); /* max to match */
2990 paren = scan->flags;
2992 if (paren > PL_regsize)
2994 if (paren > *PL_reglastparen)
2995 *PL_reglastparen = paren;
2997 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2999 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3000 PL_reginput = locinput;
3003 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3005 if (ln && l == 0 && n >= ln
3006 /* In fact, this is tricky. If paren, then the
3007 fact that we did/didnot match may influence
3008 future execution. */
3009 && !(paren && ln == 0))
3011 locinput = PL_reginput;
3012 if (PL_regkind[(U8)OP(next)] == EXACT) {
3013 c1 = (U8)*STRING(next);
3014 if (OP(next) == EXACTF)
3016 else if (OP(next) == EXACTFL)
3017 c2 = PL_fold_locale[c1];
3024 /* This may be improved if l == 0. */
3025 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3026 /* If it could work, try it. */
3028 UCHARAT(PL_reginput) == c1 ||
3029 UCHARAT(PL_reginput) == c2)
3033 PL_regstartp[paren] =
3034 HOPc(PL_reginput, -l) - PL_bostr;
3035 PL_regendp[paren] = PL_reginput - PL_bostr;
3038 PL_regendp[paren] = -1;
3042 REGCP_UNWIND(lastcp);
3044 /* Couldn't or didn't -- move forward. */
3045 PL_reginput = locinput;
3046 if (regrepeat_hard(scan, 1, &l)) {
3048 locinput = PL_reginput;
3055 n = regrepeat_hard(scan, n, &l);
3056 if (n != 0 && l == 0
3057 /* In fact, this is tricky. If paren, then the
3058 fact that we did/didnot match may influence
3059 future execution. */
3060 && !(paren && ln == 0))
3062 locinput = PL_reginput;
3064 PerlIO_printf(Perl_debug_log,
3065 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3066 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3070 if (PL_regkind[(U8)OP(next)] == EXACT) {
3071 c1 = (U8)*STRING(next);
3072 if (OP(next) == EXACTF)
3074 else if (OP(next) == EXACTFL)
3075 c2 = PL_fold_locale[c1];
3084 /* If it could work, try it. */
3086 UCHARAT(PL_reginput) == c1 ||
3087 UCHARAT(PL_reginput) == c2)
3090 PerlIO_printf(Perl_debug_log,
3091 "%*s trying tail with n=%"IVdf"...\n",
3092 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3096 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3097 PL_regendp[paren] = PL_reginput - PL_bostr;
3100 PL_regendp[paren] = -1;
3104 REGCP_UNWIND(lastcp);
3106 /* Couldn't or didn't -- back up. */
3108 locinput = HOPc(locinput, -l);
3109 PL_reginput = locinput;
3116 paren = scan->flags; /* Which paren to set */
3117 if (paren > PL_regsize)
3119 if (paren > *PL_reglastparen)
3120 *PL_reglastparen = paren;
3121 ln = ARG1(scan); /* min to match */
3122 n = ARG2(scan); /* max to match */
3123 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3127 ln = ARG1(scan); /* min to match */
3128 n = ARG2(scan); /* max to match */
3129 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3134 scan = NEXTOPER(scan);
3140 scan = NEXTOPER(scan);
3144 * Lookahead to avoid useless match attempts
3145 * when we know what character comes next.
3147 if (PL_regkind[(U8)OP(next)] == EXACT) {
3148 U8 *s = (U8*)STRING(next);
3151 if (OP(next) == EXACTF)
3153 else if (OP(next) == EXACTFL)
3154 c2 = PL_fold_locale[c1];
3157 if (OP(next) == EXACTF) {
3158 c1 = to_utf8_lower(s);
3159 c2 = to_utf8_upper(s);
3162 c2 = c1 = utf8_to_uvchr(s, NULL);
3168 PL_reginput = locinput;
3172 if (ln && regrepeat(scan, ln) < ln)
3174 locinput = PL_reginput;
3177 char *e; /* Should not check after this */
3178 char *old = locinput;
3180 if (n == REG_INFTY) {
3183 while (UTF8_IS_CONTINUATION(*(U8*)e))
3189 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3193 e = locinput + n - ln;
3199 /* Find place 'next' could work */
3202 while (locinput <= e && *locinput != c1)
3205 while (locinput <= e
3210 count = locinput - old;
3217 utf8_to_uvchr((U8*)locinput, &len) != c1;
3222 for (count = 0; locinput <= e; count++) {
3223 UV c = utf8_to_uvchr((U8*)locinput, &len);
3224 if (c == c1 || c == c2)
3232 /* PL_reginput == old now */
3233 if (locinput != old) {
3234 ln = 1; /* Did some */
3235 if (regrepeat(scan, count) < count)
3238 /* PL_reginput == locinput now */
3239 TRYPAREN(paren, ln, locinput);
3240 PL_reginput = locinput; /* Could be reset... */
3241 REGCP_UNWIND(lastcp);
3242 /* Couldn't or didn't -- move forward. */
3245 locinput += UTF8SKIP(locinput);
3251 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3255 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3257 c = UCHARAT(PL_reginput);
3258 /* If it could work, try it. */
3259 if (c == c1 || c == c2)
3261 TRYPAREN(paren, n, PL_reginput);
3262 REGCP_UNWIND(lastcp);
3265 /* If it could work, try it. */
3266 else if (c1 == -1000)
3268 TRYPAREN(paren, n, PL_reginput);
3269 REGCP_UNWIND(lastcp);
3271 /* Couldn't or didn't -- move forward. */
3272 PL_reginput = locinput;
3273 if (regrepeat(scan, 1)) {
3275 locinput = PL_reginput;
3283 n = regrepeat(scan, n);
3284 locinput = PL_reginput;
3285 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3286 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3287 ln = n; /* why back off? */
3288 /* ...because $ and \Z can match before *and* after
3289 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3290 We should back off by one in this case. */
3291 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3300 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3302 c = UCHARAT(PL_reginput);
3304 /* If it could work, try it. */
3305 if (c1 == -1000 || c == c1 || c == c2)
3307 TRYPAREN(paren, n, PL_reginput);
3308 REGCP_UNWIND(lastcp);
3310 /* Couldn't or didn't -- back up. */
3312 PL_reginput = locinput = HOPc(locinput, -1);
3320 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3322 c = UCHARAT(PL_reginput);
3324 /* If it could work, try it. */
3325 if (c1 == -1000 || c == c1 || c == c2)
3327 TRYPAREN(paren, n, PL_reginput);
3328 REGCP_UNWIND(lastcp);
3330 /* Couldn't or didn't -- back up. */
3332 PL_reginput = locinput = HOPc(locinput, -1);
3339 if (PL_reg_call_cc) {
3340 re_cc_state *cur_call_cc = PL_reg_call_cc;
3341 CURCUR *cctmp = PL_regcc;
3342 regexp *re = PL_reg_re;
3343 CHECKPOINT cp, lastcp;
3345 cp = regcppush(0); /* Save *all* the positions. */
3347 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3349 PL_reginput = locinput; /* Make position available to
3351 cache_re(PL_reg_call_cc->re);
3352 PL_regcc = PL_reg_call_cc->cc;
3353 PL_reg_call_cc = PL_reg_call_cc->prev;
3354 if (regmatch(cur_call_cc->node)) {
3355 PL_reg_call_cc = cur_call_cc;
3359 REGCP_UNWIND(lastcp);
3361 PL_reg_call_cc = cur_call_cc;
3367 PerlIO_printf(Perl_debug_log,
3368 "%*s continuation failed...\n",
3369 REPORT_CODE_OFF+PL_regindent*2, "")
3373 if (locinput < PL_regtill) {
3374 DEBUG_r(PerlIO_printf(Perl_debug_log,
3375 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3377 (long)(locinput - PL_reg_starttry),
3378 (long)(PL_regtill - PL_reg_starttry),
3380 sayNO_FINAL; /* Cannot match: too short. */
3382 PL_reginput = locinput; /* put where regtry can find it */
3383 sayYES_FINAL; /* Success! */
3385 PL_reginput = locinput; /* put where regtry can find it */
3386 sayYES_LOUD; /* Success! */
3389 PL_reginput = locinput;
3394 s = HOPBACKc(locinput, scan->flags);
3400 PL_reginput = locinput;
3405 s = HOPBACKc(locinput, scan->flags);
3411 PL_reginput = locinput;
3414 inner = NEXTOPER(NEXTOPER(scan));
3415 if (regmatch(inner) != n) {
3430 if (OP(scan) == SUSPEND) {
3431 locinput = PL_reginput;
3432 nextchr = UCHARAT(locinput);
3437 next = scan + ARG(scan);
3442 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3443 PTR2UV(scan), OP(scan));
3444 Perl_croak(aTHX_ "regexp memory corruption");
3451 * We get here only if there's trouble -- normally "case END" is
3452 * the terminating point.
3454 Perl_croak(aTHX_ "corrupted regexp pointers");
3460 PerlIO_printf(Perl_debug_log,
3461 "%*s %scould match...%s\n",
3462 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3466 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3467 PL_colors[4],PL_colors[5]));
3473 #if 0 /* Breaks $^R */
3481 PerlIO_printf(Perl_debug_log,
3482 "%*s %sfailed...%s\n",
3483 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3489 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3492 case RE_UNWIND_BRANCH:
3493 case RE_UNWIND_BRANCHJ:
3495 re_unwind_branch_t *uwb = &(uw->branch);
3496 I32 lastparen = uwb->lastparen;
3498 REGCP_UNWIND(uwb->lastcp);
3499 for (n = *PL_reglastparen; n > lastparen; n--)
3501 *PL_reglastparen = n;
3502 scan = next = uwb->next;
3504 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3505 ? BRANCH : BRANCHJ) ) { /* Failure */
3512 /* Have more choice yet. Reuse the same uwb. */
3514 if ((n = (uwb->type == RE_UNWIND_BRANCH
3515 ? NEXT_OFF(next) : ARG(next))))
3518 next = NULL; /* XXXX Needn't unwinding in this case... */
3520 next = NEXTOPER(scan);
3521 if (uwb->type == RE_UNWIND_BRANCHJ)
3522 next = NEXTOPER(next);
3523 locinput = uwb->locinput;
3524 nextchr = uwb->nextchr;
3526 PL_regindent = uwb->regindent;
3533 Perl_croak(aTHX_ "regexp unwind memory corruption");
3544 - regrepeat - repeatedly match something simple, report how many
3547 * [This routine now assumes that it will only match on things of length 1.
3548 * That was true before, but now we assume scan - reginput is the count,
3549 * rather than incrementing count on every character. [Er, except utf8.]]
3552 S_regrepeat(pTHX_ regnode *p, I32 max)
3554 register char *scan;
3556 register char *loceol = PL_regeol;
3557 register I32 hardcount = 0;
3558 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3561 if (max != REG_INFTY && max < loceol - scan)
3562 loceol = scan + max;
3567 while (scan < loceol && hardcount < max && *scan != '\n') {
3568 scan += UTF8SKIP(scan);
3572 while (scan < loceol && *scan != '\n')
3579 case EXACT: /* length of string is 1 */
3581 while (scan < loceol && UCHARAT(scan) == c)
3584 case EXACTF: /* length of string is 1 */
3586 while (scan < loceol &&
3587 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3590 case EXACTFL: /* length of string is 1 */
3591 PL_reg_flags |= RF_tainted;
3593 while (scan < loceol &&
3594 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3600 while (hardcount < max && scan < loceol &&
3601 reginclass(p, (U8*)scan, do_utf8)) {
3602 scan += UTF8SKIP(scan);
3606 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3613 LOAD_UTF8_CHARCLASS(alnum,"a");
3614 while (hardcount < max && scan < loceol &&
3615 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3616 scan += UTF8SKIP(scan);
3620 while (scan < loceol && isALNUM(*scan))
3625 PL_reg_flags |= RF_tainted;
3628 while (hardcount < max && scan < loceol &&
3629 isALNUM_LC_utf8((U8*)scan)) {
3630 scan += UTF8SKIP(scan);
3634 while (scan < loceol && isALNUM_LC(*scan))
3641 LOAD_UTF8_CHARCLASS(alnum,"a");
3642 while (hardcount < max && scan < loceol &&
3643 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3644 scan += UTF8SKIP(scan);
3648 while (scan < loceol && !isALNUM(*scan))
3653 PL_reg_flags |= RF_tainted;
3656 while (hardcount < max && scan < loceol &&
3657 !isALNUM_LC_utf8((U8*)scan)) {
3658 scan += UTF8SKIP(scan);
3662 while (scan < loceol && !isALNUM_LC(*scan))
3669 LOAD_UTF8_CHARCLASS(space," ");
3670 while (hardcount < max && scan < loceol &&
3672 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3673 scan += UTF8SKIP(scan);
3677 while (scan < loceol && isSPACE(*scan))
3682 PL_reg_flags |= RF_tainted;
3685 while (hardcount < max && scan < loceol &&
3686 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3687 scan += UTF8SKIP(scan);
3691 while (scan < loceol && isSPACE_LC(*scan))
3698 LOAD_UTF8_CHARCLASS(space," ");
3699 while (hardcount < max && scan < loceol &&
3701 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3702 scan += UTF8SKIP(scan);
3706 while (scan < loceol && !isSPACE(*scan))
3711 PL_reg_flags |= RF_tainted;
3714 while (hardcount < max && scan < loceol &&
3715 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3716 scan += UTF8SKIP(scan);
3720 while (scan < loceol && !isSPACE_LC(*scan))
3727 LOAD_UTF8_CHARCLASS(digit,"0");
3728 while (hardcount < max && scan < loceol &&
3729 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3730 scan += UTF8SKIP(scan);
3734 while (scan < loceol && isDIGIT(*scan))
3741 LOAD_UTF8_CHARCLASS(digit,"0");
3742 while (hardcount < max && scan < loceol &&
3743 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3744 scan += UTF8SKIP(scan);
3748 while (scan < loceol && !isDIGIT(*scan))
3752 default: /* Called on something of 0 width. */
3753 break; /* So match right here or not at all. */
3759 c = scan - PL_reginput;
3764 SV *prop = sv_newmortal();
3767 PerlIO_printf(Perl_debug_log,
3768 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3769 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3776 - regrepeat_hard - repeatedly match something, report total lenth and length
3778 * The repeater is supposed to have constant length.
3782 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3784 register char *scan = Nullch;
3785 register char *start;
3786 register char *loceol = PL_regeol;
3788 I32 count = 0, res = 1;
3793 start = PL_reginput;
3794 if (DO_UTF8(PL_reg_sv)) {
3795 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3798 while (start < PL_reginput) {
3800 start += UTF8SKIP(start);
3811 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3813 *lp = l = PL_reginput - start;
3814 if (max != REG_INFTY && l*max < loceol - scan)
3815 loceol = scan + l*max;
3828 - regclass_swash - prepare the utf8 swash
3832 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3837 if (PL_regdata && PL_regdata->count) {
3840 if (PL_regdata->what[n] == 's') {
3841 SV *rv = (SV*)PL_regdata->data[n];
3842 AV *av = (AV*)SvRV((SV*)rv);
3845 si = *av_fetch(av, 0, FALSE);
3846 a = av_fetch(av, 1, FALSE);
3850 else if (si && doinit) {
3851 sw = swash_init("utf8", "", si, 1, 0);
3852 (void)av_store(av, 1, sw);
3864 - reginclass - determine if a character falls into a character class
3868 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3870 char flags = ANYOF_FLAGS(n);
3875 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3877 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3878 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3879 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3882 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3885 SV *sw = regclass_swash(n, TRUE, 0);
3888 if (swash_fetch(sw, p, do_utf8))
3890 else if (flags & ANYOF_FOLD) {
3891 U8 tmpbuf[UTF8_MAXLEN+1];
3893 if (flags & ANYOF_LOCALE) {
3894 PL_reg_flags |= RF_tainted;
3895 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3898 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3899 if (swash_fetch(sw, tmpbuf, do_utf8))
3905 if (!match && c < 256) {
3906 if (ANYOF_BITMAP_TEST(n, c))
3908 else if (flags & ANYOF_FOLD) {
3911 if (flags & ANYOF_LOCALE) {
3912 PL_reg_flags |= RF_tainted;
3913 f = PL_fold_locale[c];
3917 if (f != c && ANYOF_BITMAP_TEST(n, f))
3921 if (!match && (flags & ANYOF_CLASS)) {
3922 PL_reg_flags |= RF_tainted;
3924 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3940 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3941 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3942 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3943 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3944 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3945 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3946 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3947 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3948 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3949 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3950 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3951 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3952 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3953 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3954 ) /* How's that for a conditional? */
3961 return (flags & ANYOF_INVERT) ? !match : match;
3965 S_reghop(pTHX_ U8 *s, I32 off)
3967 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3971 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3974 while (off-- && s < lim) {
3975 /* XXX could check well-formedness here */
3983 if (UTF8_IS_CONTINUED(*s)) {
3984 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3987 /* XXX could check well-formedness here */
3995 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3997 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4001 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4004 while (off-- && s < lim) {
4005 /* XXX could check well-formedness here */
4015 if (UTF8_IS_CONTINUED(*s)) {
4016 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4019 /* XXX could check well-formedness here */
4035 restore_pos(pTHXo_ void *arg)
4037 if (PL_reg_eval_set) {
4038 if (PL_reg_oldsaved) {
4039 PL_reg_re->subbeg = PL_reg_oldsaved;
4040 PL_reg_re->sublen = PL_reg_oldsavedlen;
4041 RX_MATCH_COPIED_on(PL_reg_re);
4043 PL_reg_magic->mg_len = PL_reg_oldpos;
4044 PL_reg_eval_set = 0;
4045 PL_curpm = PL_reg_oldcurpm;