5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define HOPBACK(pos, off) ( \
120 (UTF && DO_UTF8(PL_reg_sv)) \
121 ? reghopmaybe((U8*)pos, -off) \
122 : (pos - off >= PL_bostr) \
126 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
128 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
129 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
130 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
131 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
133 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
135 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
137 static void restore_pos(pTHXo_ void *arg);
140 S_regcppush(pTHX_ I32 parenfloor)
142 int retval = PL_savestack_ix;
143 #define REGCP_PAREN_ELEMS 4
144 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
147 #define REGCP_OTHER_ELEMS 5
148 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
149 for (p = PL_regsize; p > parenfloor; p--) {
150 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
151 SSPUSHINT(PL_regendp[p]);
152 SSPUSHINT(PL_regstartp[p]);
153 SSPUSHPTR(PL_reg_start_tmp[p]);
156 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
157 SSPUSHINT(PL_regsize);
158 SSPUSHINT(*PL_reglastparen);
159 SSPUSHPTR(PL_reginput);
160 #define REGCP_FRAME_ELEMS 2
161 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
162 * are needed for the regexp context stack bookkeeping. */
163 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
164 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
169 /* These are needed since we do not localize EVAL nodes: */
170 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
171 " Setting an EVAL scope, savestack=%"IVdf"\n", \
172 (IV)PL_savestack_ix)); cp = PL_savestack_ix
174 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
175 PerlIO_printf(Perl_debug_log, \
176 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
177 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
187 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
189 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
190 i = SSPOPINT; /* Parentheses elements to pop. */
191 input = (char *) SSPOPPTR;
192 *PL_reglastparen = SSPOPINT;
193 PL_regsize = SSPOPINT;
195 /* Now restore the parentheses context. */
196 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
197 i > 0; i -= REGCP_PAREN_ELEMS) {
198 paren = (U32)SSPOPINT;
199 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
200 PL_regstartp[paren] = SSPOPINT;
202 if (paren <= *PL_reglastparen)
203 PL_regendp[paren] = tmps;
205 PerlIO_printf(Perl_debug_log,
206 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
207 (UV)paren, (IV)PL_regstartp[paren],
208 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
209 (IV)PL_regendp[paren],
210 (paren > *PL_reglastparen ? "(no)" : ""));
214 if (*PL_reglastparen + 1 <= PL_regnpar) {
215 PerlIO_printf(Perl_debug_log,
216 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
217 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
221 /* It would seem that the similar code in regtry()
222 * already takes care of this, and in fact it is in
223 * a better location to since this code can #if 0-ed out
224 * but the code in regtry() is needed or otherwise tests
225 * requiring null fields (pat.t#187 and split.t#{13,14}
226 * (as of patchlevel 7877) will fail. Then again,
227 * this code seems to be necessary or otherwise
228 * building DynaLoader will fail:
229 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
231 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
232 if (paren > PL_regsize)
233 PL_regstartp[paren] = -1;
234 PL_regendp[paren] = -1;
241 S_regcp_set_to(pTHX_ I32 ss)
243 I32 tmp = PL_savestack_ix;
245 PL_savestack_ix = ss;
247 PL_savestack_ix = tmp;
251 typedef struct re_cc_state
255 struct re_cc_state *prev;
260 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
262 #define TRYPAREN(paren, n, input) { \
265 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
266 PL_regendp[paren] = input - PL_bostr; \
269 PL_regendp[paren] = -1; \
271 if (regmatch(next)) \
274 PL_regendp[paren] = -1; \
279 * pregexec and friends
283 - pregexec - match a regexp against a string
286 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
287 char *strbeg, I32 minend, SV *screamer, U32 nosave)
288 /* strend: pointer to null at end of string */
289 /* strbeg: real beginning of string */
290 /* minend: end of match must be >=minend after stringarg. */
291 /* nosave: For optimizations. */
294 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
295 nosave ? 0 : REXEC_COPY_STR);
299 S_cache_re(pTHX_ regexp *prog)
301 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
303 PL_regprogram = prog->program;
305 PL_regnpar = prog->nparens;
306 PL_regdata = prog->data;
311 * Need to implement the following flags for reg_anch:
313 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
315 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
316 * INTUIT_AUTORITATIVE_ML
317 * INTUIT_ONCE_NOML - Intuit can match in one location only.
320 * Another flag for this function: SECOND_TIME (so that float substrs
321 * with giant delta may be not rechecked).
324 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
326 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
327 Otherwise, only SvCUR(sv) is used to get strbeg. */
329 /* XXXX We assume that strpos is strbeg unless sv. */
331 /* XXXX Some places assume that there is a fixed substring.
332 An update may be needed if optimizer marks as "INTUITable"
333 RExen without fixed substrings. Similarly, it is assumed that
334 lengths of all the strings are no more than minlen, thus they
335 cannot come from lookahead.
336 (Or minlen should take into account lookahead.) */
338 /* A failure to find a constant substring means that there is no need to make
339 an expensive call to REx engine, thus we celebrate a failure. Similarly,
340 finding a substring too deep into the string means that less calls to
341 regtry() should be needed.
343 REx compiler's optimizer found 4 possible hints:
344 a) Anchored substring;
346 c) Whether we are anchored (beginning-of-line or \G);
347 d) First node (of those at offset 0) which may distingush positions;
348 We use a)b)d) and multiline-part of c), and try to find a position in the
349 string which does not contradict any of them.
352 /* Most of decisions we do here should have been done at compile time.
353 The nodes of the REx which we used for the search should have been
354 deleted from the finite automaton. */
357 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
358 char *strend, U32 flags, re_scream_pos_data *data)
360 register I32 start_shift;
361 /* Should be nonnegative! */
362 register I32 end_shift;
369 register char *other_last = Nullch; /* other substr checked before this */
370 char *check_at; /* check substr found at this pos */
372 char *i_strpos = strpos;
375 DEBUG_r( if (!PL_colorset) reginitcolors() );
376 DEBUG_r(PerlIO_printf(Perl_debug_log,
377 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
378 PL_colors[4],PL_colors[5],PL_colors[0],
381 (strlen(prog->precomp) > 60 ? "..." : ""),
383 (int)(strend - strpos > 60 ? 60 : strend - strpos),
384 strpos, PL_colors[1],
385 (strend - strpos > 60 ? "..." : ""))
388 if (prog->reganch & ROPT_UTF8)
389 PL_reg_flags |= RF_utf8;
391 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
392 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
395 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
397 check = prog->check_substr;
398 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
399 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
400 || ( (prog->reganch & ROPT_ANCH_BOL)
401 && !PL_multiline ) ); /* Check after \n? */
404 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
405 | ROPT_IMPLICIT)) /* not a real BOL */
406 /* SvCUR is not set on references: SvRV and SvPVX overlap */
408 && (strpos != strbeg)) {
409 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
412 if (prog->check_offset_min == prog->check_offset_max &&
413 !(prog->reganch & ROPT_SANY_SEEN)) {
414 /* Substring at constant offset from beg-of-str... */
417 s = HOP3c(strpos, prog->check_offset_min, strend);
419 slen = SvCUR(check); /* >= 1 */
421 if ( strend - s > slen || strend - s < slen - 1
422 || (strend - s == slen && strend[-1] != '\n')) {
423 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
426 /* Now should match s[0..slen-2] */
428 if (slen && (*SvPVX(check) != *s
430 && memNE(SvPVX(check), s, slen)))) {
432 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
436 else if (*SvPVX(check) != *s
437 || ((slen = SvCUR(check)) > 1
438 && memNE(SvPVX(check), s, slen)))
440 goto success_at_start;
443 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
445 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
446 end_shift = prog->minlen - start_shift -
447 CHR_SVLEN(check) + (SvTAIL(check) != 0);
449 I32 end = prog->check_offset_max + CHR_SVLEN(check)
450 - (SvTAIL(check) != 0);
451 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
453 if (end_shift < eshift)
457 else { /* Can match at random position */
460 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
461 /* Should be nonnegative! */
462 end_shift = prog->minlen - start_shift -
463 CHR_SVLEN(check) + (SvTAIL(check) != 0);
466 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
468 Perl_croak(aTHX_ "panic: end_shift");
472 /* Find a possible match in the region s..strend by looking for
473 the "check" substring in the region corrected by start/end_shift. */
474 if (flags & REXEC_SCREAM) {
475 I32 p = -1; /* Internal iterator of scream. */
476 I32 *pp = data ? data->scream_pos : &p;
478 if (PL_screamfirst[BmRARE(check)] >= 0
479 || ( BmRARE(check) == '\n'
480 && (BmPREVIOUS(check) == SvCUR(check) - 1)
482 s = screaminstr(sv, check,
483 start_shift + (s - strbeg), end_shift, pp, 0);
487 *data->scream_olds = s;
489 else if (prog->reganch & ROPT_SANY_SEEN)
490 s = fbm_instr((U8*)(s + start_shift),
491 (U8*)(strend - end_shift),
492 check, PL_multiline ? FBMrf_MULTILINE : 0);
494 s = fbm_instr(HOP3(s, start_shift, strend),
495 HOP3(strend, -end_shift, strbeg),
496 check, PL_multiline ? FBMrf_MULTILINE : 0);
498 /* Update the count-of-usability, remove useless subpatterns,
501 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
502 (s ? "Found" : "Did not find"),
503 ((check == prog->anchored_substr) ? "anchored" : "floating"),
505 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
507 PL_colors[1], (SvTAIL(check) ? "$" : ""),
508 (s ? " at offset " : "...\n") ) );
515 /* Finish the diagnostic message */
516 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
518 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
519 Start with the other substr.
520 XXXX no SCREAM optimization yet - and a very coarse implementation
521 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
522 *always* match. Probably should be marked during compile...
523 Probably it is right to do no SCREAM here...
526 if (prog->float_substr && prog->anchored_substr) {
527 /* Take into account the "other" substring. */
528 /* XXXX May be hopelessly wrong for UTF... */
531 if (check == prog->float_substr) {
534 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
537 t = s - prog->check_offset_max;
538 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
539 && (!(prog->reganch & ROPT_UTF8)
540 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
545 t = HOP3c(t, prog->anchored_offset, strend);
546 if (t < other_last) /* These positions already checked */
548 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
551 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
552 /* On end-of-str: see comment below. */
553 s = fbm_instr((unsigned char*)t,
554 HOP3(HOP3(last1, prog->anchored_offset, strend)
555 + SvCUR(prog->anchored_substr),
556 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
557 prog->anchored_substr,
558 PL_multiline ? FBMrf_MULTILINE : 0);
559 DEBUG_r(PerlIO_printf(Perl_debug_log,
560 "%s anchored substr `%s%.*s%s'%s",
561 (s ? "Found" : "Contradicts"),
563 (int)(SvCUR(prog->anchored_substr)
564 - (SvTAIL(prog->anchored_substr)!=0)),
565 SvPVX(prog->anchored_substr),
566 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
568 if (last1 >= last2) {
569 DEBUG_r(PerlIO_printf(Perl_debug_log,
570 ", giving up...\n"));
573 DEBUG_r(PerlIO_printf(Perl_debug_log,
574 ", trying floating at offset %ld...\n",
575 (long)(HOP3c(s1, 1, strend) - i_strpos)));
576 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
577 s = HOP3c(last, 1, strend);
581 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
582 (long)(s - i_strpos)));
583 t = HOP3c(s, -prog->anchored_offset, strbeg);
584 other_last = HOP3c(s, 1, strend);
592 else { /* Take into account the floating substring. */
596 t = HOP3c(s, -start_shift, strbeg);
598 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
599 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
600 last = HOP3c(t, prog->float_max_offset, strend);
601 s = HOP3c(t, prog->float_min_offset, strend);
604 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
605 /* fbm_instr() takes into account exact value of end-of-str
606 if the check is SvTAIL(ed). Since false positives are OK,
607 and end-of-str is not later than strend we are OK. */
608 s = fbm_instr((unsigned char*)s,
609 (unsigned char*)last + SvCUR(prog->float_substr)
610 - (SvTAIL(prog->float_substr)!=0),
611 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
612 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
613 (s ? "Found" : "Contradicts"),
615 (int)(SvCUR(prog->float_substr)
616 - (SvTAIL(prog->float_substr)!=0)),
617 SvPVX(prog->float_substr),
618 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
621 DEBUG_r(PerlIO_printf(Perl_debug_log,
622 ", giving up...\n"));
625 DEBUG_r(PerlIO_printf(Perl_debug_log,
626 ", trying anchored starting at offset %ld...\n",
627 (long)(s1 + 1 - i_strpos)));
629 s = HOP3c(t, 1, strend);
633 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
634 (long)(s - i_strpos)));
635 other_last = s; /* Fix this later. --Hugo */
644 t = s - prog->check_offset_max;
645 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
646 && (!(prog->reganch & ROPT_UTF8)
647 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
649 /* Fixed substring is found far enough so that the match
650 cannot start at strpos. */
652 if (ml_anch && t[-1] != '\n') {
653 /* Eventually fbm_*() should handle this, but often
654 anchored_offset is not 0, so this check will not be wasted. */
655 /* XXXX In the code below we prefer to look for "^" even in
656 presence of anchored substrings. And we search even
657 beyond the found float position. These pessimizations
658 are historical artefacts only. */
660 while (t < strend - prog->minlen) {
662 if (t < check_at - prog->check_offset_min) {
663 if (prog->anchored_substr) {
664 /* Since we moved from the found position,
665 we definitely contradict the found anchored
666 substr. Due to the above check we do not
667 contradict "check" substr.
668 Thus we can arrive here only if check substr
669 is float. Redo checking for "other"=="fixed".
672 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
673 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
674 goto do_other_anchored;
676 /* We don't contradict the found floating substring. */
677 /* XXXX Why not check for STCLASS? */
679 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
680 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
683 /* Position contradicts check-string */
684 /* XXXX probably better to look for check-string
685 than for "\n", so one should lower the limit for t? */
686 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
687 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
688 other_last = strpos = s = t + 1;
693 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
694 PL_colors[0],PL_colors[1]));
698 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
699 PL_colors[0],PL_colors[1]));
703 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
706 /* The found string does not prohibit matching at strpos,
707 - no optimization of calling REx engine can be performed,
708 unless it was an MBOL and we are not after MBOL,
709 or a future STCLASS check will fail this. */
711 /* Even in this situation we may use MBOL flag if strpos is offset
712 wrt the start of the string. */
713 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
714 && (strpos != strbeg) && strpos[-1] != '\n'
715 /* May be due to an implicit anchor of m{.*foo} */
716 && !(prog->reganch & ROPT_IMPLICIT))
721 DEBUG_r( if (ml_anch)
722 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
723 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
726 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
727 && prog->check_substr /* Could be deleted already */
728 && --BmUSEFUL(prog->check_substr) < 0
729 && prog->check_substr == prog->float_substr)
731 /* If flags & SOMETHING - do not do it many times on the same match */
732 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
733 SvREFCNT_dec(prog->check_substr);
734 prog->check_substr = Nullsv; /* disable */
735 prog->float_substr = Nullsv; /* clear */
736 check = Nullsv; /* abort */
738 /* XXXX This is a remnant of the old implementation. It
739 looks wasteful, since now INTUIT can use many
741 prog->reganch &= ~RE_USE_INTUIT;
748 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
749 if (prog->regstclass) {
750 /* minlen == 0 is possible if regstclass is \b or \B,
751 and the fixed substr is ''$.
752 Since minlen is already taken into account, s+1 is before strend;
753 accidentally, minlen >= 1 guaranties no false positives at s + 1
754 even for \b or \B. But (minlen? 1 : 0) below assumes that
755 regstclass does not come from lookahead... */
756 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
757 This leaves EXACTF only, which is dealt with in find_byclass(). */
758 U8* str = (U8*)STRING(prog->regstclass);
759 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
760 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
762 char *endpos = (prog->anchored_substr || ml_anch)
763 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
764 : (prog->float_substr
765 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
768 char *startpos = strbeg;
771 if (prog->reganch & ROPT_UTF8) {
772 PL_regdata = prog->data;
775 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
780 if (endpos == strend) {
781 DEBUG_r( PerlIO_printf(Perl_debug_log,
782 "Could not match STCLASS...\n") );
785 DEBUG_r( PerlIO_printf(Perl_debug_log,
786 "This position contradicts STCLASS...\n") );
787 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
789 /* Contradict one of substrings */
790 if (prog->anchored_substr) {
791 if (prog->anchored_substr == check) {
792 DEBUG_r( what = "anchored" );
794 s = HOP3c(t, 1, strend);
795 if (s + start_shift + end_shift > strend) {
796 /* XXXX Should be taken into account earlier? */
797 DEBUG_r( PerlIO_printf(Perl_debug_log,
798 "Could not match STCLASS...\n") );
803 DEBUG_r( PerlIO_printf(Perl_debug_log,
804 "Looking for %s substr starting at offset %ld...\n",
805 what, (long)(s + start_shift - i_strpos)) );
808 /* Have both, check_string is floating */
809 if (t + start_shift >= check_at) /* Contradicts floating=check */
810 goto retry_floating_check;
811 /* Recheck anchored substring, but not floating... */
815 DEBUG_r( PerlIO_printf(Perl_debug_log,
816 "Looking for anchored substr starting at offset %ld...\n",
817 (long)(other_last - i_strpos)) );
818 goto do_other_anchored;
820 /* Another way we could have checked stclass at the
821 current position only: */
826 DEBUG_r( PerlIO_printf(Perl_debug_log,
827 "Looking for /%s^%s/m starting at offset %ld...\n",
828 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
831 if (!prog->float_substr) /* Could have been deleted */
833 /* Check is floating subtring. */
834 retry_floating_check:
835 t = check_at - start_shift;
836 DEBUG_r( what = "floating" );
837 goto hop_and_restart;
840 PerlIO_printf(Perl_debug_log,
841 "By STCLASS: moving %ld --> %ld\n",
842 (long)(t - i_strpos), (long)(s - i_strpos));
844 PerlIO_printf(Perl_debug_log,
845 "Does not contradict STCLASS...\n") );
848 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
849 PL_colors[4], (check ? "Guessed" : "Giving up"),
850 PL_colors[5], (long)(s - i_strpos)) );
853 fail_finish: /* Substring not found */
854 if (prog->check_substr) /* could be removed already */
855 BmUSEFUL(prog->check_substr) += 5; /* hooray */
857 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
858 PL_colors[4],PL_colors[5]));
862 /* We know what class REx starts with. Try to find this position... */
864 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
866 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
872 register I32 tmp = 1; /* Scratch variable? */
873 register bool do_utf8 = DO_UTF8(PL_reg_sv);
875 /* We know what class it must start with. */
879 if (reginclass(c, (U8*)s, do_utf8)) {
880 if (tmp && (norun || regtry(prog, s)))
887 s += do_utf8 ? UTF8SKIP(s) : 1;
894 c1 = to_utf8_lower((U8*)m);
895 c2 = to_utf8_upper((U8*)m);
906 c2 = PL_fold_locale[c1];
911 e = s; /* Due to minlen logic of intuit() */
917 if ( utf8_to_uvchr((U8*)s, &len) == c1
924 UV c = utf8_to_uvchr((U8*)s, &len);
925 if ( (c == c1 || c == c2) && regtry(prog, s) )
934 && (ln == 1 || !(OP(c) == EXACTF
936 : ibcmp_locale(s, m, ln)))
937 && (norun || regtry(prog, s)) )
943 if ( (*(U8*)s == c1 || *(U8*)s == c2)
944 && (ln == 1 || !(OP(c) == EXACTF
946 : ibcmp_locale(s, m, ln)))
947 && (norun || regtry(prog, s)) )
954 PL_reg_flags |= RF_tainted;
961 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
963 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
965 tmp = ((OP(c) == BOUND ?
966 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
967 LOAD_UTF8_CHARCLASS(alnum,"a");
969 if (tmp == !(OP(c) == BOUND ?
970 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
971 isALNUM_LC_utf8((U8*)s)))
974 if ((norun || regtry(prog, s)))
981 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
982 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
985 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
987 if ((norun || regtry(prog, s)))
993 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
997 PL_reg_flags |= RF_tainted;
1004 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1006 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1008 tmp = ((OP(c) == NBOUND ?
1009 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1010 LOAD_UTF8_CHARCLASS(alnum,"a");
1011 while (s < strend) {
1012 if (tmp == !(OP(c) == NBOUND ?
1013 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1014 isALNUM_LC_utf8((U8*)s)))
1016 else if ((norun || regtry(prog, s)))
1022 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1023 tmp = ((OP(c) == NBOUND ?
1024 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1025 while (s < strend) {
1027 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1029 else if ((norun || regtry(prog, s)))
1034 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1039 LOAD_UTF8_CHARCLASS(alnum,"a");
1040 while (s < strend) {
1041 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1042 if (tmp && (norun || regtry(prog, s)))
1053 while (s < strend) {
1055 if (tmp && (norun || regtry(prog, s)))
1067 PL_reg_flags |= RF_tainted;
1069 while (s < strend) {
1070 if (isALNUM_LC_utf8((U8*)s)) {
1071 if (tmp && (norun || regtry(prog, s)))
1082 while (s < strend) {
1083 if (isALNUM_LC(*s)) {
1084 if (tmp && (norun || regtry(prog, s)))
1097 LOAD_UTF8_CHARCLASS(alnum,"a");
1098 while (s < strend) {
1099 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1100 if (tmp && (norun || regtry(prog, s)))
1111 while (s < strend) {
1113 if (tmp && (norun || regtry(prog, s)))
1125 PL_reg_flags |= RF_tainted;
1127 while (s < strend) {
1128 if (!isALNUM_LC_utf8((U8*)s)) {
1129 if (tmp && (norun || regtry(prog, s)))
1140 while (s < strend) {
1141 if (!isALNUM_LC(*s)) {
1142 if (tmp && (norun || regtry(prog, s)))
1155 LOAD_UTF8_CHARCLASS(space," ");
1156 while (s < strend) {
1157 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1158 if (tmp && (norun || regtry(prog, s)))
1169 while (s < strend) {
1171 if (tmp && (norun || regtry(prog, s)))
1183 PL_reg_flags |= RF_tainted;
1185 while (s < strend) {
1186 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1187 if (tmp && (norun || regtry(prog, s)))
1198 while (s < strend) {
1199 if (isSPACE_LC(*s)) {
1200 if (tmp && (norun || regtry(prog, s)))
1213 LOAD_UTF8_CHARCLASS(space," ");
1214 while (s < strend) {
1215 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1216 if (tmp && (norun || regtry(prog, s)))
1227 while (s < strend) {
1229 if (tmp && (norun || regtry(prog, s)))
1241 PL_reg_flags |= RF_tainted;
1243 while (s < strend) {
1244 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1245 if (tmp && (norun || regtry(prog, s)))
1256 while (s < strend) {
1257 if (!isSPACE_LC(*s)) {
1258 if (tmp && (norun || regtry(prog, s)))
1271 LOAD_UTF8_CHARCLASS(digit,"0");
1272 while (s < strend) {
1273 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1274 if (tmp && (norun || regtry(prog, s)))
1285 while (s < strend) {
1287 if (tmp && (norun || regtry(prog, s)))
1299 PL_reg_flags |= RF_tainted;
1301 while (s < strend) {
1302 if (isDIGIT_LC_utf8((U8*)s)) {
1303 if (tmp && (norun || regtry(prog, s)))
1314 while (s < strend) {
1315 if (isDIGIT_LC(*s)) {
1316 if (tmp && (norun || regtry(prog, s)))
1329 LOAD_UTF8_CHARCLASS(digit,"0");
1330 while (s < strend) {
1331 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1332 if (tmp && (norun || regtry(prog, s)))
1343 while (s < strend) {
1345 if (tmp && (norun || regtry(prog, s)))
1357 PL_reg_flags |= RF_tainted;
1359 while (s < strend) {
1360 if (!isDIGIT_LC_utf8((U8*)s)) {
1361 if (tmp && (norun || regtry(prog, s)))
1372 while (s < strend) {
1373 if (!isDIGIT_LC(*s)) {
1374 if (tmp && (norun || regtry(prog, s)))
1386 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1395 - regexec_flags - match a regexp against a string
1398 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1399 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1400 /* strend: pointer to null at end of string */
1401 /* strbeg: real beginning of string */
1402 /* minend: end of match must be >=minend after stringarg. */
1403 /* data: May be used for some additional optimizations. */
1404 /* nosave: For optimizations. */
1407 register regnode *c;
1408 register char *startpos = stringarg;
1409 I32 minlen; /* must match at least this many chars */
1410 I32 dontbother = 0; /* how many characters not to try at end */
1411 /* I32 start_shift = 0; */ /* Offset of the start to find
1412 constant substr. */ /* CC */
1413 I32 end_shift = 0; /* Same for the end. */ /* CC */
1414 I32 scream_pos = -1; /* Internal iterator of scream. */
1416 SV* oreplsv = GvSV(PL_replgv);
1417 bool do_utf8 = DO_UTF8(sv);
1423 PL_regnarrate = DEBUG_r_TEST;
1426 /* Be paranoid... */
1427 if (prog == NULL || startpos == NULL) {
1428 Perl_croak(aTHX_ "NULL regexp parameter");
1432 minlen = prog->minlen;
1434 if (!(prog->reganch & ROPT_SANY_SEEN))
1435 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1438 if (strend - startpos < minlen) goto phooey;
1441 /* Check validity of program. */
1442 if (UCHARAT(prog->program) != REG_MAGIC) {
1443 Perl_croak(aTHX_ "corrupted regexp program");
1447 PL_reg_eval_set = 0;
1450 if (prog->reganch & ROPT_UTF8)
1451 PL_reg_flags |= RF_utf8;
1453 /* Mark beginning of line for ^ and lookbehind. */
1454 PL_regbol = startpos;
1458 /* Mark end of line for $ (and such) */
1461 /* see how far we have to get to not match where we matched before */
1462 PL_regtill = startpos+minend;
1464 /* We start without call_cc context. */
1467 /* If there is a "must appear" string, look for it. */
1470 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1473 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1474 PL_reg_ganch = startpos;
1475 else if (sv && SvTYPE(sv) >= SVt_PVMG
1477 && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) {
1478 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1479 if (prog->reganch & ROPT_ANCH_GPOS) {
1480 if (s > PL_reg_ganch)
1485 else /* pos() not defined */
1486 PL_reg_ganch = strbeg;
1489 if (do_utf8 == (UTF!=0) &&
1490 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1491 re_scream_pos_data d;
1493 d.scream_olds = &scream_olds;
1494 d.scream_pos = &scream_pos;
1495 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1497 goto phooey; /* not present */
1500 DEBUG_r( if (!PL_colorset) reginitcolors() );
1501 DEBUG_r(PerlIO_printf(Perl_debug_log,
1502 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1503 PL_colors[4],PL_colors[5],PL_colors[0],
1506 (strlen(prog->precomp) > 60 ? "..." : ""),
1508 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1509 startpos, PL_colors[1],
1510 (strend - startpos > 60 ? "..." : ""))
1513 /* Simplest case: anchored match need be tried only once. */
1514 /* [unless only anchor is BOL and multiline is set] */
1515 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1516 if (s == startpos && regtry(prog, startpos))
1518 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1519 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1524 dontbother = minlen - 1;
1525 end = HOP3c(strend, -dontbother, strbeg) - 1;
1526 /* for multiline we only have to try after newlines */
1527 if (prog->check_substr) {
1531 if (regtry(prog, s))
1536 if (prog->reganch & RE_USE_INTUIT) {
1537 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1548 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1549 if (regtry(prog, s))
1556 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1557 if (regtry(prog, PL_reg_ganch))
1562 /* Messy cases: unanchored match. */
1563 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1564 /* we have /x+whatever/ */
1565 /* it must be a one character string (XXXX Except UTF?) */
1566 char ch = SvPVX(prog->anchored_substr)[0];
1572 while (s < strend) {
1574 DEBUG_r( did_match = 1 );
1575 if (regtry(prog, s)) goto got_it;
1577 while (s < strend && *s == ch)
1584 while (s < strend) {
1586 DEBUG_r( did_match = 1 );
1587 if (regtry(prog, s)) goto got_it;
1589 while (s < strend && *s == ch)
1595 DEBUG_r(did_match ||
1596 PerlIO_printf(Perl_debug_log,
1597 "Did not find anchored character...\n"));
1600 else if (do_utf8 == (UTF!=0) &&
1601 (prog->anchored_substr != Nullsv
1602 || (prog->float_substr != Nullsv
1603 && prog->float_max_offset < strend - s))) {
1604 SV *must = prog->anchored_substr
1605 ? prog->anchored_substr : prog->float_substr;
1607 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1609 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1610 char *last = HOP3c(strend, /* Cannot start after this */
1611 -(I32)(CHR_SVLEN(must)
1612 - (SvTAIL(must) != 0) + back_min), strbeg);
1613 char *last1; /* Last position checked before */
1619 last1 = HOPc(s, -1);
1621 last1 = s - 1; /* bogus */
1623 /* XXXX check_substr already used to find `s', can optimize if
1624 check_substr==must. */
1626 dontbother = end_shift;
1627 strend = HOPc(strend, -dontbother);
1628 while ( (s <= last) &&
1629 ((flags & REXEC_SCREAM)
1630 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1631 end_shift, &scream_pos, 0))
1632 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1633 (unsigned char*)strend, must,
1634 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1635 DEBUG_r( did_match = 1 );
1636 if (HOPc(s, -back_max) > last1) {
1637 last1 = HOPc(s, -back_min);
1638 s = HOPc(s, -back_max);
1641 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1643 last1 = HOPc(s, -back_min);
1647 while (s <= last1) {
1648 if (regtry(prog, s))
1654 while (s <= last1) {
1655 if (regtry(prog, s))
1661 DEBUG_r(did_match ||
1662 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1663 ((must == prog->anchored_substr)
1664 ? "anchored" : "floating"),
1666 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1668 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1671 else if ((c = prog->regstclass)) {
1672 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1673 /* don't bother with what can't match */
1674 strend = HOPc(strend, -(minlen - 1));
1676 SV *prop = sv_newmortal();
1678 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1680 if (find_byclass(prog, c, s, strend, startpos, 0))
1682 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1686 if (prog->float_substr != Nullsv) { /* Trim the end. */
1689 if (flags & REXEC_SCREAM) {
1690 last = screaminstr(sv, prog->float_substr, s - strbeg,
1691 end_shift, &scream_pos, 1); /* last one */
1693 last = scream_olds; /* Only one occurrence. */
1697 char *little = SvPV(prog->float_substr, len);
1699 if (SvTAIL(prog->float_substr)) {
1700 if (memEQ(strend - len + 1, little, len - 1))
1701 last = strend - len + 1;
1702 else if (!PL_multiline)
1703 last = memEQ(strend - len, little, len)
1704 ? strend - len : Nullch;
1710 last = rninstr(s, strend, little, little + len);
1712 last = strend; /* matching `$' */
1716 DEBUG_r(PerlIO_printf(Perl_debug_log,
1717 "%sCan't trim the tail, match fails (should not happen)%s\n",
1718 PL_colors[4],PL_colors[5]));
1719 goto phooey; /* Should not happen! */
1721 dontbother = strend - last + prog->float_min_offset;
1723 if (minlen && (dontbother < minlen))
1724 dontbother = minlen - 1;
1725 strend -= dontbother; /* this one's always in bytes! */
1726 /* We don't know much -- general case. */
1729 if (regtry(prog, s))
1738 if (regtry(prog, s))
1740 } while (s++ < strend);
1748 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1750 if (PL_reg_eval_set) {
1751 /* Preserve the current value of $^R */
1752 if (oreplsv != GvSV(PL_replgv))
1753 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1754 restored, the value remains
1756 restore_pos(aTHXo_ 0);
1759 /* make sure $`, $&, $', and $digit will work later */
1760 if ( !(flags & REXEC_NOT_FIRST) ) {
1761 if (RX_MATCH_COPIED(prog)) {
1762 Safefree(prog->subbeg);
1763 RX_MATCH_COPIED_off(prog);
1765 if (flags & REXEC_COPY_STR) {
1766 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1768 s = savepvn(strbeg, i);
1771 RX_MATCH_COPIED_on(prog);
1774 prog->subbeg = strbeg;
1775 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1782 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1783 PL_colors[4],PL_colors[5]));
1784 if (PL_reg_eval_set)
1785 restore_pos(aTHXo_ 0);
1790 - regtry - try match at specific point
1792 STATIC I32 /* 0 failure, 1 success */
1793 S_regtry(pTHX_ regexp *prog, char *startpos)
1801 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1803 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1806 PL_reg_eval_set = RS_init;
1808 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1809 (IV)(PL_stack_sp - PL_stack_base));
1811 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1812 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1813 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1815 /* Apparently this is not needed, judging by wantarray. */
1816 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1817 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1820 /* Make $_ available to executed code. */
1821 if (PL_reg_sv != DEFSV) {
1822 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1827 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1828 && (mg = mg_find(PL_reg_sv, 'g')))) {
1829 /* prepare for quick setting of pos */
1830 sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0);
1831 mg = mg_find(PL_reg_sv, 'g');
1835 PL_reg_oldpos = mg->mg_len;
1836 SAVEDESTRUCTOR_X(restore_pos, 0);
1839 Newz(22,PL_reg_curpm, 1, PMOP);
1840 PL_reg_curpm->op_pmregexp = prog;
1841 PL_reg_oldcurpm = PL_curpm;
1842 PL_curpm = PL_reg_curpm;
1843 if (RX_MATCH_COPIED(prog)) {
1844 /* Here is a serious problem: we cannot rewrite subbeg,
1845 since it may be needed if this match fails. Thus
1846 $` inside (?{}) could fail... */
1847 PL_reg_oldsaved = prog->subbeg;
1848 PL_reg_oldsavedlen = prog->sublen;
1849 RX_MATCH_COPIED_off(prog);
1852 PL_reg_oldsaved = Nullch;
1853 prog->subbeg = PL_bostr;
1854 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1856 prog->startp[0] = startpos - PL_bostr;
1857 PL_reginput = startpos;
1858 PL_regstartp = prog->startp;
1859 PL_regendp = prog->endp;
1860 PL_reglastparen = &prog->lastparen;
1861 prog->lastparen = 0;
1863 DEBUG_r(PL_reg_starttry = startpos);
1864 if (PL_reg_start_tmpl <= prog->nparens) {
1865 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1866 if(PL_reg_start_tmp)
1867 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1869 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1872 /* XXXX What this code is doing here?!!! There should be no need
1873 to do this again and again, PL_reglastparen should take care of
1876 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1877 * Actually, the code in regcppop() (which Ilya may be meaning by
1878 * PL_reglastparen), is not needed at all by the test suite
1879 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1880 * enough, for building DynaLoader, or otherwise this
1881 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1882 * will happen. Meanwhile, this code *is* needed for the
1883 * above-mentioned test suite tests to succeed. The common theme
1884 * on those tests seems to be returning null fields from matches.
1889 if (prog->nparens) {
1890 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1897 if (regmatch(prog->program + 1)) {
1898 prog->endp[0] = PL_reginput - PL_bostr;
1901 REGCP_UNWIND(lastcp);
1905 #define RE_UNWIND_BRANCH 1
1906 #define RE_UNWIND_BRANCHJ 2
1910 typedef struct { /* XX: makes sense to enlarge it... */
1914 } re_unwind_generic_t;
1927 } re_unwind_branch_t;
1929 typedef union re_unwind_t {
1931 re_unwind_generic_t generic;
1932 re_unwind_branch_t branch;
1936 - regmatch - main matching routine
1938 * Conceptually the strategy is simple: check to see whether the current
1939 * node matches, call self recursively to see whether the rest matches,
1940 * and then act accordingly. In practice we make some effort to avoid
1941 * recursion, in particular by going through "ordinary" nodes (that don't
1942 * need to know whether the rest of the match failed) by a loop instead of
1945 /* [lwall] I've hoisted the register declarations to the outer block in order to
1946 * maybe save a little bit of pushing and popping on the stack. It also takes
1947 * advantage of machines that use a register save mask on subroutine entry.
1949 STATIC I32 /* 0 failure, 1 success */
1950 S_regmatch(pTHX_ regnode *prog)
1952 register regnode *scan; /* Current node. */
1953 regnode *next; /* Next node. */
1954 regnode *inner; /* Next node in internal branch. */
1955 register I32 nextchr; /* renamed nextchr - nextchar colides with
1956 function of same name */
1957 register I32 n; /* no or next */
1958 register I32 ln; /* len or last */
1959 register char *s; /* operand or save */
1960 register char *locinput = PL_reginput;
1961 register I32 c1, c2, paren; /* case fold search, parenth */
1962 int minmod = 0, sw = 0, logical = 0;
1964 I32 firstcp = PL_savestack_ix;
1965 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1971 /* Note that nextchr is a byte even in UTF */
1972 nextchr = UCHARAT(locinput);
1974 while (scan != NULL) {
1975 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1977 # define sayYES goto yes
1978 # define sayNO goto no
1979 # define sayYES_FINAL goto yes_final
1980 # define sayYES_LOUD goto yes_loud
1981 # define sayNO_FINAL goto no_final
1982 # define sayNO_SILENT goto do_no
1983 # define saySAME(x) if (x) goto yes; else goto no
1984 # define REPORT_CODE_OFF 24
1986 # define sayYES return 1
1987 # define sayNO return 0
1988 # define sayYES_FINAL return 1
1989 # define sayYES_LOUD return 1
1990 # define sayNO_FINAL return 0
1991 # define sayNO_SILENT return 0
1992 # define saySAME(x) return x
1995 SV *prop = sv_newmortal();
1996 int docolor = *PL_colors[0];
1997 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1998 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
1999 /* The part of the string before starttry has one color
2000 (pref0_len chars), between starttry and current
2001 position another one (pref_len - pref0_len chars),
2002 after the current position the third one.
2003 We assume that pref0_len <= pref_len, otherwise we
2004 decrease pref0_len. */
2005 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2006 ? (5 + taill) - l : locinput - PL_bostr;
2009 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2011 pref0_len = pref_len - (locinput - PL_reg_starttry);
2012 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2013 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2014 ? (5 + taill) - pref_len : PL_regeol - locinput);
2015 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2019 if (pref0_len > pref_len)
2020 pref0_len = pref_len;
2021 regprop(prop, scan);
2022 PerlIO_printf(Perl_debug_log,
2023 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2024 (IV)(locinput - PL_bostr),
2025 PL_colors[4], pref0_len,
2026 locinput - pref_len, PL_colors[5],
2027 PL_colors[2], pref_len - pref0_len,
2028 locinput - pref_len + pref0_len, PL_colors[3],
2029 (docolor ? "" : "> <"),
2030 PL_colors[0], l, locinput, PL_colors[1],
2031 15 - l - pref_len + 1,
2033 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2037 next = scan + NEXT_OFF(scan);
2043 if (locinput == PL_bostr || (PL_multiline &&
2044 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2046 /* regtill = regbol; */
2051 if (locinput == PL_bostr ||
2052 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2058 if (locinput == PL_bostr)
2062 if (locinput == PL_reg_ganch)
2072 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2077 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2079 if (PL_regeol - locinput > 1)
2083 if (PL_regeol != locinput)
2087 if (!nextchr && locinput >= PL_regeol)
2089 nextchr = UCHARAT(++locinput);
2092 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2095 locinput += PL_utf8skip[nextchr];
2096 if (locinput > PL_regeol)
2098 nextchr = UCHARAT(locinput);
2101 nextchr = UCHARAT(++locinput);
2106 if (do_utf8 != (UTF!=0)) {
2114 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2123 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2129 nextchr = UCHARAT(locinput);
2132 /* Inline the first character, for speed. */
2133 if (UCHARAT(s) != nextchr)
2135 if (PL_regeol - locinput < ln)
2137 if (ln > 1 && memNE(s, locinput, ln))
2140 nextchr = UCHARAT(locinput);
2143 PL_reg_flags |= RF_tainted;
2153 c1 = OP(scan) == EXACTF;
2155 if (l >= PL_regeol) {
2158 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2159 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2161 s += UTF ? UTF8SKIP(s) : 1;
2165 nextchr = UCHARAT(locinput);
2169 /* Inline the first character, for speed. */
2170 if (UCHARAT(s) != nextchr &&
2171 UCHARAT(s) != ((OP(scan) == EXACTF)
2172 ? PL_fold : PL_fold_locale)[nextchr])
2174 if (PL_regeol - locinput < ln)
2176 if (ln > 1 && (OP(scan) == EXACTF
2177 ? ibcmp(s, locinput, ln)
2178 : ibcmp_locale(s, locinput, ln)))
2181 nextchr = UCHARAT(locinput);
2185 if (!reginclass(scan, (U8*)locinput, do_utf8))
2187 if (locinput >= PL_regeol)
2189 locinput += PL_utf8skip[nextchr];
2190 nextchr = UCHARAT(locinput);
2194 nextchr = UCHARAT(locinput);
2195 if (!reginclass(scan, (U8*)locinput, do_utf8))
2197 if (!nextchr && locinput >= PL_regeol)
2199 nextchr = UCHARAT(++locinput);
2203 PL_reg_flags |= RF_tainted;
2209 if (!(OP(scan) == ALNUM
2210 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2211 : isALNUM_LC_utf8((U8*)locinput)))
2215 locinput += PL_utf8skip[nextchr];
2216 nextchr = UCHARAT(locinput);
2219 if (!(OP(scan) == ALNUM
2220 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2222 nextchr = UCHARAT(++locinput);
2225 PL_reg_flags |= RF_tainted;
2228 if (!nextchr && locinput >= PL_regeol)
2231 LOAD_UTF8_CHARCLASS(alnum,"a");
2232 if (OP(scan) == NALNUM
2233 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2234 : isALNUM_LC_utf8((U8*)locinput))
2238 locinput += PL_utf8skip[nextchr];
2239 nextchr = UCHARAT(locinput);
2242 if (OP(scan) == NALNUM
2243 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2245 nextchr = UCHARAT(++locinput);
2249 PL_reg_flags |= RF_tainted;
2253 /* was last char in word? */
2255 if (locinput == PL_bostr)
2258 U8 *r = reghop((U8*)locinput, -1);
2260 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2262 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2263 ln = isALNUM_uni(ln);
2264 LOAD_UTF8_CHARCLASS(alnum,"a");
2265 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2268 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2269 n = isALNUM_LC_utf8((U8*)locinput);
2273 ln = (locinput != PL_bostr) ?
2274 UCHARAT(locinput - 1) : '\n';
2275 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2277 n = isALNUM(nextchr);
2280 ln = isALNUM_LC(ln);
2281 n = isALNUM_LC(nextchr);
2284 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2285 OP(scan) == BOUNDL))
2289 PL_reg_flags |= RF_tainted;
2295 if (UTF8_IS_CONTINUED(nextchr)) {
2296 LOAD_UTF8_CHARCLASS(space," ");
2297 if (!(OP(scan) == SPACE
2298 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2299 : isSPACE_LC_utf8((U8*)locinput)))
2303 locinput += PL_utf8skip[nextchr];
2304 nextchr = UCHARAT(locinput);
2307 if (!(OP(scan) == SPACE
2308 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2310 nextchr = UCHARAT(++locinput);
2313 if (!(OP(scan) == SPACE
2314 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2316 nextchr = UCHARAT(++locinput);
2320 PL_reg_flags |= RF_tainted;
2323 if (!nextchr && locinput >= PL_regeol)
2326 LOAD_UTF8_CHARCLASS(space," ");
2327 if (OP(scan) == NSPACE
2328 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2329 : isSPACE_LC_utf8((U8*)locinput))
2333 locinput += PL_utf8skip[nextchr];
2334 nextchr = UCHARAT(locinput);
2337 if (OP(scan) == NSPACE
2338 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2340 nextchr = UCHARAT(++locinput);
2343 PL_reg_flags |= RF_tainted;
2349 LOAD_UTF8_CHARCLASS(digit,"0");
2350 if (!(OP(scan) == DIGIT
2351 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2352 : isDIGIT_LC_utf8((U8*)locinput)))
2356 locinput += PL_utf8skip[nextchr];
2357 nextchr = UCHARAT(locinput);
2360 if (!(OP(scan) == DIGIT
2361 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2363 nextchr = UCHARAT(++locinput);
2366 PL_reg_flags |= RF_tainted;
2369 if (!nextchr && locinput >= PL_regeol)
2372 LOAD_UTF8_CHARCLASS(digit,"0");
2373 if (OP(scan) == NDIGIT
2374 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2375 : isDIGIT_LC_utf8((U8*)locinput))
2379 locinput += PL_utf8skip[nextchr];
2380 nextchr = UCHARAT(locinput);
2383 if (OP(scan) == NDIGIT
2384 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2386 nextchr = UCHARAT(++locinput);
2389 LOAD_UTF8_CHARCLASS(mark,"~");
2390 if (locinput >= PL_regeol ||
2391 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2393 locinput += PL_utf8skip[nextchr];
2394 while (locinput < PL_regeol &&
2395 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2396 locinput += UTF8SKIP(locinput);
2397 if (locinput > PL_regeol)
2399 nextchr = UCHARAT(locinput);
2402 PL_reg_flags |= RF_tainted;
2406 n = ARG(scan); /* which paren pair */
2407 ln = PL_regstartp[n];
2408 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2409 if (*PL_reglastparen < n || ln == -1)
2410 sayNO; /* Do not match unless seen CLOSEn. */
2411 if (ln == PL_regendp[n])
2415 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2417 char *e = PL_bostr + PL_regendp[n];
2419 * Note that we can't do the "other character" lookup trick as
2420 * in the 8-bit case (no pun intended) because in Unicode we
2421 * have to map both upper and title case to lower case.
2423 if (OP(scan) == REFF) {
2427 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2437 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2444 nextchr = UCHARAT(locinput);
2448 /* Inline the first character, for speed. */
2449 if (UCHARAT(s) != nextchr &&
2451 (UCHARAT(s) != ((OP(scan) == REFF
2452 ? PL_fold : PL_fold_locale)[nextchr]))))
2454 ln = PL_regendp[n] - ln;
2455 if (locinput + ln > PL_regeol)
2457 if (ln > 1 && (OP(scan) == REF
2458 ? memNE(s, locinput, ln)
2460 ? ibcmp(s, locinput, ln)
2461 : ibcmp_locale(s, locinput, ln))))
2464 nextchr = UCHARAT(locinput);
2475 OP_4tree *oop = PL_op;
2476 COP *ocurcop = PL_curcop;
2477 SV **ocurpad = PL_curpad;
2481 PL_op = (OP_4tree*)PL_regdata->data[n];
2482 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2483 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2484 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2486 CALLRUNOPS(aTHX); /* Scalar context. */
2492 PL_curpad = ocurpad;
2493 PL_curcop = ocurcop;
2495 if (logical == 2) { /* Postponed subexpression. */
2497 MAGIC *mg = Null(MAGIC*);
2499 CHECKPOINT cp, lastcp;
2501 if(SvROK(ret) || SvRMAGICAL(ret)) {
2502 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2505 mg = mg_find(sv, 'r');
2508 re = (regexp *)mg->mg_obj;
2509 (void)ReREFCNT_inc(re);
2513 char *t = SvPV(ret, len);
2515 char *oprecomp = PL_regprecomp;
2516 I32 osize = PL_regsize;
2517 I32 onpar = PL_regnpar;
2520 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2522 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2523 sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
2524 PL_regprecomp = oprecomp;
2529 PerlIO_printf(Perl_debug_log,
2530 "Entering embedded `%s%.60s%s%s'\n",
2534 (strlen(re->precomp) > 60 ? "..." : ""))
2537 state.prev = PL_reg_call_cc;
2538 state.cc = PL_regcc;
2539 state.re = PL_reg_re;
2543 cp = regcppush(0); /* Save *all* the positions. */
2546 state.ss = PL_savestack_ix;
2547 *PL_reglastparen = 0;
2548 PL_reg_call_cc = &state;
2549 PL_reginput = locinput;
2551 /* XXXX This is too dramatic a measure... */
2554 if (regmatch(re->program + 1)) {
2555 /* Even though we succeeded, we need to restore
2556 global variables, since we may be wrapped inside
2557 SUSPEND, thus the match may be not finished yet. */
2559 /* XXXX Do this only if SUSPENDed? */
2560 PL_reg_call_cc = state.prev;
2561 PL_regcc = state.cc;
2562 PL_reg_re = state.re;
2563 cache_re(PL_reg_re);
2565 /* XXXX This is too dramatic a measure... */
2568 /* These are needed even if not SUSPEND. */
2574 REGCP_UNWIND(lastcp);
2576 PL_reg_call_cc = state.prev;
2577 PL_regcc = state.cc;
2578 PL_reg_re = state.re;
2579 cache_re(PL_reg_re);
2581 /* XXXX This is too dramatic a measure... */
2590 sv_setsv(save_scalar(PL_replgv), ret);
2594 n = ARG(scan); /* which paren pair */
2595 PL_reg_start_tmp[n] = locinput;
2600 n = ARG(scan); /* which paren pair */
2601 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2602 PL_regendp[n] = locinput - PL_bostr;
2603 if (n > *PL_reglastparen)
2604 *PL_reglastparen = n;
2607 n = ARG(scan); /* which paren pair */
2608 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2611 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2613 next = NEXTOPER(NEXTOPER(scan));
2615 next = scan + ARG(scan);
2616 if (OP(next) == IFTHEN) /* Fake one. */
2617 next = NEXTOPER(NEXTOPER(next));
2621 logical = scan->flags;
2623 /*******************************************************************
2624 PL_regcc contains infoblock about the innermost (...)* loop, and
2625 a pointer to the next outer infoblock.
2627 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2629 1) After matching X, regnode for CURLYX is processed;
2631 2) This regnode creates infoblock on the stack, and calls
2632 regmatch() recursively with the starting point at WHILEM node;
2634 3) Each hit of WHILEM node tries to match A and Z (in the order
2635 depending on the current iteration, min/max of {min,max} and
2636 greediness). The information about where are nodes for "A"
2637 and "Z" is read from the infoblock, as is info on how many times "A"
2638 was already matched, and greediness.
2640 4) After A matches, the same WHILEM node is hit again.
2642 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2643 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2644 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2645 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2646 of the external loop.
2648 Currently present infoblocks form a tree with a stem formed by PL_curcc
2649 and whatever it mentions via ->next, and additional attached trees
2650 corresponding to temporarily unset infoblocks as in "5" above.
2652 In the following picture infoblocks for outer loop of
2653 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2654 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2655 infoblocks are drawn below the "reset" infoblock.
2657 In fact in the picture below we do not show failed matches for Z and T
2658 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2659 more obvious *why* one needs to *temporary* unset infoblocks.]
2661 Matched REx position InfoBlocks Comment
2665 Y A)*?Z)*?T x <- O <- I
2666 YA )*?Z)*?T x <- O <- I
2667 YA A)*?Z)*?T x <- O <- I
2668 YAA )*?Z)*?T x <- O <- I
2669 YAA Z)*?T x <- O # Temporary unset I
2672 YAAZ Y(A)*?Z)*?T x <- O
2675 YAAZY (A)*?Z)*?T x <- O
2678 YAAZY A)*?Z)*?T x <- O <- I
2681 YAAZYA )*?Z)*?T x <- O <- I
2684 YAAZYA Z)*?T x <- O # Temporary unset I
2690 YAAZYAZ T x # Temporary unset O
2697 *******************************************************************/
2700 CHECKPOINT cp = PL_savestack_ix;
2701 /* No need to save/restore up to this paren */
2702 I32 parenfloor = scan->flags;
2704 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2706 cc.oldcc = PL_regcc;
2708 /* XXXX Probably it is better to teach regpush to support
2709 parenfloor > PL_regsize... */
2710 if (parenfloor > *PL_reglastparen)
2711 parenfloor = *PL_reglastparen; /* Pessimization... */
2712 cc.parenfloor = parenfloor;
2714 cc.min = ARG1(scan);
2715 cc.max = ARG2(scan);
2716 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2720 PL_reginput = locinput;
2721 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2723 PL_regcc = cc.oldcc;
2729 * This is really hard to understand, because after we match
2730 * what we're trying to match, we must make sure the rest of
2731 * the REx is going to match for sure, and to do that we have
2732 * to go back UP the parse tree by recursing ever deeper. And
2733 * if it fails, we have to reset our parent's current state
2734 * that we can try again after backing off.
2737 CHECKPOINT cp, lastcp;
2738 CURCUR* cc = PL_regcc;
2739 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2741 n = cc->cur + 1; /* how many we know we matched */
2742 PL_reginput = locinput;
2745 PerlIO_printf(Perl_debug_log,
2746 "%*s %ld out of %ld..%ld cc=%lx\n",
2747 REPORT_CODE_OFF+PL_regindent*2, "",
2748 (long)n, (long)cc->min,
2749 (long)cc->max, (long)cc)
2752 /* If degenerate scan matches "", assume scan done. */
2754 if (locinput == cc->lastloc && n >= cc->min) {
2755 PL_regcc = cc->oldcc;
2759 PerlIO_printf(Perl_debug_log,
2760 "%*s empty match detected, try continuation...\n",
2761 REPORT_CODE_OFF+PL_regindent*2, "")
2763 if (regmatch(cc->next))
2771 /* First just match a string of min scans. */
2775 cc->lastloc = locinput;
2776 if (regmatch(cc->scan))
2779 cc->lastloc = lastloc;
2784 /* Check whether we already were at this position.
2785 Postpone detection until we know the match is not
2786 *that* much linear. */
2787 if (!PL_reg_maxiter) {
2788 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2789 PL_reg_leftiter = PL_reg_maxiter;
2791 if (PL_reg_leftiter-- == 0) {
2792 I32 size = (PL_reg_maxiter + 7)/8;
2793 if (PL_reg_poscache) {
2794 if (PL_reg_poscache_size < size) {
2795 Renew(PL_reg_poscache, size, char);
2796 PL_reg_poscache_size = size;
2798 Zero(PL_reg_poscache, size, char);
2801 PL_reg_poscache_size = size;
2802 Newz(29, PL_reg_poscache, size, char);
2805 PerlIO_printf(Perl_debug_log,
2806 "%sDetected a super-linear match, switching on caching%s...\n",
2807 PL_colors[4], PL_colors[5])
2810 if (PL_reg_leftiter < 0) {
2811 I32 o = locinput - PL_bostr, b;
2813 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2816 if (PL_reg_poscache[o] & (1<<b)) {
2818 PerlIO_printf(Perl_debug_log,
2819 "%*s already tried at this position...\n",
2820 REPORT_CODE_OFF+PL_regindent*2, "")
2824 PL_reg_poscache[o] |= (1<<b);
2828 /* Prefer next over scan for minimal matching. */
2831 PL_regcc = cc->oldcc;
2834 cp = regcppush(cc->parenfloor);
2836 if (regmatch(cc->next)) {
2838 sayYES; /* All done. */
2840 REGCP_UNWIND(lastcp);
2846 if (n >= cc->max) { /* Maximum greed exceeded? */
2847 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2848 && !(PL_reg_flags & RF_warned)) {
2849 PL_reg_flags |= RF_warned;
2850 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2851 "Complex regular subexpression recursion",
2858 PerlIO_printf(Perl_debug_log,
2859 "%*s trying longer...\n",
2860 REPORT_CODE_OFF+PL_regindent*2, "")
2862 /* Try scanning more and see if it helps. */
2863 PL_reginput = locinput;
2865 cc->lastloc = locinput;
2866 cp = regcppush(cc->parenfloor);
2868 if (regmatch(cc->scan)) {
2872 REGCP_UNWIND(lastcp);
2875 cc->lastloc = lastloc;
2879 /* Prefer scan over next for maximal matching. */
2881 if (n < cc->max) { /* More greed allowed? */
2882 cp = regcppush(cc->parenfloor);
2884 cc->lastloc = locinput;
2886 if (regmatch(cc->scan)) {
2890 REGCP_UNWIND(lastcp);
2891 regcppop(); /* Restore some previous $<digit>s? */
2892 PL_reginput = locinput;
2894 PerlIO_printf(Perl_debug_log,
2895 "%*s failed, try continuation...\n",
2896 REPORT_CODE_OFF+PL_regindent*2, "")
2899 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2900 && !(PL_reg_flags & RF_warned)) {
2901 PL_reg_flags |= RF_warned;
2902 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2903 "Complex regular subexpression recursion",
2907 /* Failed deeper matches of scan, so see if this one works. */
2908 PL_regcc = cc->oldcc;
2911 if (regmatch(cc->next))
2917 cc->lastloc = lastloc;
2922 next = scan + ARG(scan);
2925 inner = NEXTOPER(NEXTOPER(scan));
2928 inner = NEXTOPER(scan);
2933 if (OP(next) != c1) /* No choice. */
2934 next = inner; /* Avoid recursion. */
2936 I32 lastparen = *PL_reglastparen;
2938 re_unwind_branch_t *uw;
2940 /* Put unwinding data on stack */
2941 unwind1 = SSNEWt(1,re_unwind_branch_t);
2942 uw = SSPTRt(unwind1,re_unwind_branch_t);
2945 uw->type = ((c1 == BRANCH)
2947 : RE_UNWIND_BRANCHJ);
2948 uw->lastparen = lastparen;
2950 uw->locinput = locinput;
2951 uw->nextchr = nextchr;
2953 uw->regindent = ++PL_regindent;
2956 REGCP_SET(uw->lastcp);
2958 /* Now go into the first branch */
2971 /* We suppose that the next guy does not need
2972 backtracking: in particular, it is of constant length,
2973 and has no parenths to influence future backrefs. */
2974 ln = ARG1(scan); /* min to match */
2975 n = ARG2(scan); /* max to match */
2976 paren = scan->flags;
2978 if (paren > PL_regsize)
2980 if (paren > *PL_reglastparen)
2981 *PL_reglastparen = paren;
2983 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2985 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2986 PL_reginput = locinput;
2989 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2991 if (ln && l == 0 && n >= ln
2992 /* In fact, this is tricky. If paren, then the
2993 fact that we did/didnot match may influence
2994 future execution. */
2995 && !(paren && ln == 0))
2997 locinput = PL_reginput;
2998 if (PL_regkind[(U8)OP(next)] == EXACT) {
2999 c1 = (U8)*STRING(next);
3000 if (OP(next) == EXACTF)
3002 else if (OP(next) == EXACTFL)
3003 c2 = PL_fold_locale[c1];
3010 /* This may be improved if l == 0. */
3011 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3012 /* If it could work, try it. */
3014 UCHARAT(PL_reginput) == c1 ||
3015 UCHARAT(PL_reginput) == c2)
3019 PL_regstartp[paren] =
3020 HOPc(PL_reginput, -l) - PL_bostr;
3021 PL_regendp[paren] = PL_reginput - PL_bostr;
3024 PL_regendp[paren] = -1;
3028 REGCP_UNWIND(lastcp);
3030 /* Couldn't or didn't -- move forward. */
3031 PL_reginput = locinput;
3032 if (regrepeat_hard(scan, 1, &l)) {
3034 locinput = PL_reginput;
3041 n = regrepeat_hard(scan, n, &l);
3042 if (n != 0 && l == 0
3043 /* In fact, this is tricky. If paren, then the
3044 fact that we did/didnot match may influence
3045 future execution. */
3046 && !(paren && ln == 0))
3048 locinput = PL_reginput;
3050 PerlIO_printf(Perl_debug_log,
3051 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3052 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3056 if (PL_regkind[(U8)OP(next)] == EXACT) {
3057 c1 = (U8)*STRING(next);
3058 if (OP(next) == EXACTF)
3060 else if (OP(next) == EXACTFL)
3061 c2 = PL_fold_locale[c1];
3070 /* If it could work, try it. */
3072 UCHARAT(PL_reginput) == c1 ||
3073 UCHARAT(PL_reginput) == c2)
3076 PerlIO_printf(Perl_debug_log,
3077 "%*s trying tail with n=%"IVdf"...\n",
3078 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3082 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3083 PL_regendp[paren] = PL_reginput - PL_bostr;
3086 PL_regendp[paren] = -1;
3090 REGCP_UNWIND(lastcp);
3092 /* Couldn't or didn't -- back up. */
3094 locinput = HOPc(locinput, -l);
3095 PL_reginput = locinput;
3102 paren = scan->flags; /* Which paren to set */
3103 if (paren > PL_regsize)
3105 if (paren > *PL_reglastparen)
3106 *PL_reglastparen = paren;
3107 ln = ARG1(scan); /* min to match */
3108 n = ARG2(scan); /* max to match */
3109 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3113 ln = ARG1(scan); /* min to match */
3114 n = ARG2(scan); /* max to match */
3115 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3120 scan = NEXTOPER(scan);
3126 scan = NEXTOPER(scan);
3130 * Lookahead to avoid useless match attempts
3131 * when we know what character comes next.
3133 if (PL_regkind[(U8)OP(next)] == EXACT) {
3134 U8 *s = (U8*)STRING(next);
3137 if (OP(next) == EXACTF)
3139 else if (OP(next) == EXACTFL)
3140 c2 = PL_fold_locale[c1];
3143 if (OP(next) == EXACTF) {
3144 c1 = to_utf8_lower(s);
3145 c2 = to_utf8_upper(s);
3148 c2 = c1 = utf8_to_uvchr(s, NULL);
3154 PL_reginput = locinput;
3158 if (ln && regrepeat(scan, ln) < ln)
3160 locinput = PL_reginput;
3163 char *e; /* Should not check after this */
3164 char *old = locinput;
3166 if (n == REG_INFTY) {
3169 while (UTF8_IS_CONTINUATION(*(U8*)e))
3175 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3179 e = locinput + n - ln;
3185 /* Find place 'next' could work */
3188 while (locinput <= e && *locinput != c1)
3191 while (locinput <= e
3196 count = locinput - old;
3203 utf8_to_uvchr((U8*)locinput, &len) != c1;
3208 for (count = 0; locinput <= e; count++) {
3209 UV c = utf8_to_uvchr((U8*)locinput, &len);
3210 if (c == c1 || c == c2)
3218 /* PL_reginput == old now */
3219 if (locinput != old) {
3220 ln = 1; /* Did some */
3221 if (regrepeat(scan, count) < count)
3224 /* PL_reginput == locinput now */
3225 TRYPAREN(paren, ln, locinput);
3226 PL_reginput = locinput; /* Could be reset... */
3227 REGCP_UNWIND(lastcp);
3228 /* Couldn't or didn't -- move forward. */
3231 locinput += UTF8SKIP(locinput);
3237 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3241 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3243 c = UCHARAT(PL_reginput);
3244 /* If it could work, try it. */
3245 if (c == c1 || c == c2)
3247 TRYPAREN(paren, n, PL_reginput);
3248 REGCP_UNWIND(lastcp);
3251 /* If it could work, try it. */
3252 else if (c1 == -1000)
3254 TRYPAREN(paren, n, PL_reginput);
3255 REGCP_UNWIND(lastcp);
3257 /* Couldn't or didn't -- move forward. */
3258 PL_reginput = locinput;
3259 if (regrepeat(scan, 1)) {
3261 locinput = PL_reginput;
3269 n = regrepeat(scan, n);
3270 locinput = PL_reginput;
3271 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3272 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3273 ln = n; /* why back off? */
3274 /* ...because $ and \Z can match before *and* after
3275 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3276 We should back off by one in this case. */
3277 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3286 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3288 c = UCHARAT(PL_reginput);
3290 /* If it could work, try it. */
3291 if (c1 == -1000 || c == c1 || c == c2)
3293 TRYPAREN(paren, n, PL_reginput);
3294 REGCP_UNWIND(lastcp);
3296 /* Couldn't or didn't -- back up. */
3298 PL_reginput = locinput = HOPc(locinput, -1);
3306 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3308 c = UCHARAT(PL_reginput);
3310 /* If it could work, try it. */
3311 if (c1 == -1000 || c == c1 || c == c2)
3313 TRYPAREN(paren, n, PL_reginput);
3314 REGCP_UNWIND(lastcp);
3316 /* Couldn't or didn't -- back up. */
3318 PL_reginput = locinput = HOPc(locinput, -1);
3325 if (PL_reg_call_cc) {
3326 re_cc_state *cur_call_cc = PL_reg_call_cc;
3327 CURCUR *cctmp = PL_regcc;
3328 regexp *re = PL_reg_re;
3329 CHECKPOINT cp, lastcp;
3331 cp = regcppush(0); /* Save *all* the positions. */
3333 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3335 PL_reginput = locinput; /* Make position available to
3337 cache_re(PL_reg_call_cc->re);
3338 PL_regcc = PL_reg_call_cc->cc;
3339 PL_reg_call_cc = PL_reg_call_cc->prev;
3340 if (regmatch(cur_call_cc->node)) {
3341 PL_reg_call_cc = cur_call_cc;
3345 REGCP_UNWIND(lastcp);
3347 PL_reg_call_cc = cur_call_cc;
3353 PerlIO_printf(Perl_debug_log,
3354 "%*s continuation failed...\n",
3355 REPORT_CODE_OFF+PL_regindent*2, "")
3359 if (locinput < PL_regtill) {
3360 DEBUG_r(PerlIO_printf(Perl_debug_log,
3361 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3363 (long)(locinput - PL_reg_starttry),
3364 (long)(PL_regtill - PL_reg_starttry),
3366 sayNO_FINAL; /* Cannot match: too short. */
3368 PL_reginput = locinput; /* put where regtry can find it */
3369 sayYES_FINAL; /* Success! */
3371 PL_reginput = locinput; /* put where regtry can find it */
3372 sayYES_LOUD; /* Success! */
3375 PL_reginput = locinput;
3380 s = HOPBACKc(locinput, scan->flags);
3386 PL_reginput = locinput;
3391 s = HOPBACKc(locinput, scan->flags);
3397 PL_reginput = locinput;
3400 inner = NEXTOPER(NEXTOPER(scan));
3401 if (regmatch(inner) != n) {
3416 if (OP(scan) == SUSPEND) {
3417 locinput = PL_reginput;
3418 nextchr = UCHARAT(locinput);
3423 next = scan + ARG(scan);
3428 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3429 PTR2UV(scan), OP(scan));
3430 Perl_croak(aTHX_ "regexp memory corruption");
3437 * We get here only if there's trouble -- normally "case END" is
3438 * the terminating point.
3440 Perl_croak(aTHX_ "corrupted regexp pointers");
3446 PerlIO_printf(Perl_debug_log,
3447 "%*s %scould match...%s\n",
3448 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3452 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3453 PL_colors[4],PL_colors[5]));
3459 #if 0 /* Breaks $^R */
3467 PerlIO_printf(Perl_debug_log,
3468 "%*s %sfailed...%s\n",
3469 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3475 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3478 case RE_UNWIND_BRANCH:
3479 case RE_UNWIND_BRANCHJ:
3481 re_unwind_branch_t *uwb = &(uw->branch);
3482 I32 lastparen = uwb->lastparen;
3484 REGCP_UNWIND(uwb->lastcp);
3485 for (n = *PL_reglastparen; n > lastparen; n--)
3487 *PL_reglastparen = n;
3488 scan = next = uwb->next;
3490 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3491 ? BRANCH : BRANCHJ) ) { /* Failure */
3498 /* Have more choice yet. Reuse the same uwb. */
3500 if ((n = (uwb->type == RE_UNWIND_BRANCH
3501 ? NEXT_OFF(next) : ARG(next))))
3504 next = NULL; /* XXXX Needn't unwinding in this case... */
3506 next = NEXTOPER(scan);
3507 if (uwb->type == RE_UNWIND_BRANCHJ)
3508 next = NEXTOPER(next);
3509 locinput = uwb->locinput;
3510 nextchr = uwb->nextchr;
3512 PL_regindent = uwb->regindent;
3519 Perl_croak(aTHX_ "regexp unwind memory corruption");
3530 - regrepeat - repeatedly match something simple, report how many
3533 * [This routine now assumes that it will only match on things of length 1.
3534 * That was true before, but now we assume scan - reginput is the count,
3535 * rather than incrementing count on every character. [Er, except utf8.]]
3538 S_regrepeat(pTHX_ regnode *p, I32 max)
3540 register char *scan;
3542 register char *loceol = PL_regeol;
3543 register I32 hardcount = 0;
3544 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3547 if (max != REG_INFTY && max < loceol - scan)
3548 loceol = scan + max;
3553 while (scan < loceol && hardcount < max && *scan != '\n') {
3554 scan += UTF8SKIP(scan);
3558 while (scan < loceol && *scan != '\n')
3565 case EXACT: /* length of string is 1 */
3567 while (scan < loceol && UCHARAT(scan) == c)
3570 case EXACTF: /* length of string is 1 */
3572 while (scan < loceol &&
3573 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3576 case EXACTFL: /* length of string is 1 */
3577 PL_reg_flags |= RF_tainted;
3579 while (scan < loceol &&
3580 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3586 while (hardcount < max && scan < loceol &&
3587 reginclass(p, (U8*)scan, do_utf8)) {
3588 scan += UTF8SKIP(scan);
3592 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3599 LOAD_UTF8_CHARCLASS(alnum,"a");
3600 while (hardcount < max && scan < loceol &&
3601 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3602 scan += UTF8SKIP(scan);
3606 while (scan < loceol && isALNUM(*scan))
3611 PL_reg_flags |= RF_tainted;
3614 while (hardcount < max && scan < loceol &&
3615 isALNUM_LC_utf8((U8*)scan)) {
3616 scan += UTF8SKIP(scan);
3620 while (scan < loceol && isALNUM_LC(*scan))
3627 LOAD_UTF8_CHARCLASS(alnum,"a");
3628 while (hardcount < max && scan < loceol &&
3629 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3630 scan += UTF8SKIP(scan);
3634 while (scan < loceol && !isALNUM(*scan))
3639 PL_reg_flags |= RF_tainted;
3642 while (hardcount < max && scan < loceol &&
3643 !isALNUM_LC_utf8((U8*)scan)) {
3644 scan += UTF8SKIP(scan);
3648 while (scan < loceol && !isALNUM_LC(*scan))
3655 LOAD_UTF8_CHARCLASS(space," ");
3656 while (hardcount < max && scan < loceol &&
3658 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3659 scan += UTF8SKIP(scan);
3663 while (scan < loceol && isSPACE(*scan))
3668 PL_reg_flags |= RF_tainted;
3671 while (hardcount < max && scan < loceol &&
3672 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3673 scan += UTF8SKIP(scan);
3677 while (scan < loceol && isSPACE_LC(*scan))
3684 LOAD_UTF8_CHARCLASS(space," ");
3685 while (hardcount < max && scan < loceol &&
3687 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3688 scan += UTF8SKIP(scan);
3692 while (scan < loceol && !isSPACE(*scan))
3697 PL_reg_flags |= RF_tainted;
3700 while (hardcount < max && scan < loceol &&
3701 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3702 scan += UTF8SKIP(scan);
3706 while (scan < loceol && !isSPACE_LC(*scan))
3713 LOAD_UTF8_CHARCLASS(digit,"0");
3714 while (hardcount < max && scan < loceol &&
3715 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3716 scan += UTF8SKIP(scan);
3720 while (scan < loceol && isDIGIT(*scan))
3727 LOAD_UTF8_CHARCLASS(digit,"0");
3728 while (hardcount < max && scan < loceol &&
3729 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3730 scan += UTF8SKIP(scan);
3734 while (scan < loceol && !isDIGIT(*scan))
3738 default: /* Called on something of 0 width. */
3739 break; /* So match right here or not at all. */
3745 c = scan - PL_reginput;
3750 SV *prop = sv_newmortal();
3753 PerlIO_printf(Perl_debug_log,
3754 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3755 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3762 - regrepeat_hard - repeatedly match something, report total lenth and length
3764 * The repeater is supposed to have constant length.
3768 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3770 register char *scan;
3771 register char *start;
3772 register char *loceol = PL_regeol;
3774 I32 count = 0, res = 1;
3779 start = PL_reginput;
3780 if (DO_UTF8(PL_reg_sv)) {
3781 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3784 while (start < PL_reginput) {
3786 start += UTF8SKIP(start);
3797 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3799 *lp = l = PL_reginput - start;
3800 if (max != REG_INFTY && l*max < loceol - scan)
3801 loceol = scan + l*max;
3814 - regclass_swash - prepare the utf8 swash
3818 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3823 if (PL_regdata && PL_regdata->count) {
3826 if (PL_regdata->what[n] == 's') {
3827 SV *rv = (SV*)PL_regdata->data[n];
3828 AV *av = (AV*)SvRV((SV*)rv);
3831 si = *av_fetch(av, 0, FALSE);
3832 a = av_fetch(av, 1, FALSE);
3836 else if (si && doinit) {
3837 sw = swash_init("utf8", "", si, 1, 0);
3838 (void)av_store(av, 1, sw);
3850 - reginclass - determine if a character falls into a character class
3854 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3856 char flags = ANYOF_FLAGS(n);
3861 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3863 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3864 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3865 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3868 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3871 SV *sw = regclass_swash(n, TRUE, 0);
3874 if (swash_fetch(sw, p, do_utf8))
3876 else if (flags & ANYOF_FOLD) {
3877 U8 tmpbuf[UTF8_MAXLEN+1];
3879 if (flags & ANYOF_LOCALE) {
3880 PL_reg_flags |= RF_tainted;
3881 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3884 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3885 if (swash_fetch(sw, tmpbuf, do_utf8))
3891 if (!match && c < 256) {
3892 if (ANYOF_BITMAP_TEST(n, c))
3894 else if (flags & ANYOF_FOLD) {
3897 if (flags & ANYOF_LOCALE) {
3898 PL_reg_flags |= RF_tainted;
3899 f = PL_fold_locale[c];
3903 if (f != c && ANYOF_BITMAP_TEST(n, f))
3907 if (!match && (flags & ANYOF_CLASS)) {
3908 PL_reg_flags |= RF_tainted;
3910 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3911 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3912 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3913 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3914 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3915 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3916 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3917 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3918 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3919 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3920 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3921 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3922 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3923 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3924 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3940 ) /* How's that for a conditional? */
3947 return (flags & ANYOF_INVERT) ? !match : match;
3951 S_reghop(pTHX_ U8 *s, I32 off)
3953 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3957 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3960 while (off-- && s < lim) {
3961 /* XXX could check well-formedness here */
3969 if (UTF8_IS_CONTINUED(*s)) {
3970 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3973 /* XXX could check well-formedness here */
3981 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3983 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3987 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
3990 while (off-- && s < lim) {
3991 /* XXX could check well-formedness here */
4001 if (UTF8_IS_CONTINUED(*s)) {
4002 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4005 /* XXX could check well-formedness here */
4021 restore_pos(pTHXo_ void *arg)
4023 if (PL_reg_eval_set) {
4024 if (PL_reg_oldsaved) {
4025 PL_reg_re->subbeg = PL_reg_oldsaved;
4026 PL_reg_re->sublen = PL_reg_oldsavedlen;
4027 RX_MATCH_COPIED_on(PL_reg_re);
4029 PL_reg_magic->mg_len = PL_reg_oldpos;
4030 PL_reg_eval_set = 0;
4031 PL_curpm = PL_reg_oldcurpm;