5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
84 # if defined(PERL_CAPI) || defined(PERL_OBJECT)
91 #define RF_tainted 1 /* tainted information used? */
92 #define RF_warned 2 /* warned about big count? */
93 #define RF_evaled 4 /* Did an EVAL with setting? */
94 #define RF_utf8 8 /* String contains multibyte chars? */
96 #define UTF (PL_reg_flags & RF_utf8)
98 #define RS_init 1 /* eval environment created */
99 #define RS_set 2 /* replsv value is set */
102 #define STATIC static
109 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
110 #define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
112 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
113 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
114 #define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
115 #define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
116 #define HOPc(pos,off) ((char*)HOP(pos,off))
117 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
119 #define HOPBACK(pos, off) ( \
120 (UTF && DO_UTF8(PL_reg_sv)) \
121 ? reghopmaybe((U8*)pos, -off) \
122 : (pos - off >= PL_bostr) \
126 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
128 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
129 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
130 #define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
131 #define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
133 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
135 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
137 static void restore_pos(pTHXo_ void *arg);
140 S_regcppush(pTHX_ I32 parenfloor)
142 int retval = PL_savestack_ix;
143 #define REGCP_PAREN_ELEMS 4
144 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
147 #define REGCP_OTHER_ELEMS 5
148 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
149 for (p = PL_regsize; p > parenfloor; p--) {
150 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
151 SSPUSHINT(PL_regendp[p]);
152 SSPUSHINT(PL_regstartp[p]);
153 SSPUSHPTR(PL_reg_start_tmp[p]);
156 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
157 SSPUSHINT(PL_regsize);
158 SSPUSHINT(*PL_reglastparen);
159 SSPUSHPTR(PL_reginput);
160 #define REGCP_FRAME_ELEMS 2
161 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
162 * are needed for the regexp context stack bookkeeping. */
163 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
164 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
169 /* These are needed since we do not localize EVAL nodes: */
170 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
171 " Setting an EVAL scope, savestack=%"IVdf"\n", \
172 (IV)PL_savestack_ix)); cp = PL_savestack_ix
174 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
175 PerlIO_printf(Perl_debug_log, \
176 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
177 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
187 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
189 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
190 i = SSPOPINT; /* Parentheses elements to pop. */
191 input = (char *) SSPOPPTR;
192 *PL_reglastparen = SSPOPINT;
193 PL_regsize = SSPOPINT;
195 /* Now restore the parentheses context. */
196 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
197 i > 0; i -= REGCP_PAREN_ELEMS) {
198 paren = (U32)SSPOPINT;
199 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
200 PL_regstartp[paren] = SSPOPINT;
202 if (paren <= *PL_reglastparen)
203 PL_regendp[paren] = tmps;
205 PerlIO_printf(Perl_debug_log,
206 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
207 (UV)paren, (IV)PL_regstartp[paren],
208 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
209 (IV)PL_regendp[paren],
210 (paren > *PL_reglastparen ? "(no)" : ""));
214 if (*PL_reglastparen + 1 <= PL_regnpar) {
215 PerlIO_printf(Perl_debug_log,
216 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
217 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
221 /* It would seem that the similar code in regtry()
222 * already takes care of this, and in fact it is in
223 * a better location to since this code can #if 0-ed out
224 * but the code in regtry() is needed or otherwise tests
225 * requiring null fields (pat.t#187 and split.t#{13,14}
226 * (as of patchlevel 7877) will fail. Then again,
227 * this code seems to be necessary or otherwise
228 * building DynaLoader will fail:
229 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
231 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
232 if (paren > PL_regsize)
233 PL_regstartp[paren] = -1;
234 PL_regendp[paren] = -1;
241 S_regcp_set_to(pTHX_ I32 ss)
243 I32 tmp = PL_savestack_ix;
245 PL_savestack_ix = ss;
247 PL_savestack_ix = tmp;
251 typedef struct re_cc_state
255 struct re_cc_state *prev;
260 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
262 #define TRYPAREN(paren, n, input) { \
265 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
266 PL_regendp[paren] = input - PL_bostr; \
269 PL_regendp[paren] = -1; \
271 if (regmatch(next)) \
274 PL_regendp[paren] = -1; \
279 * pregexec and friends
283 - pregexec - match a regexp against a string
286 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
287 char *strbeg, I32 minend, SV *screamer, U32 nosave)
288 /* strend: pointer to null at end of string */
289 /* strbeg: real beginning of string */
290 /* minend: end of match must be >=minend after stringarg. */
291 /* nosave: For optimizations. */
294 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
295 nosave ? 0 : REXEC_COPY_STR);
299 S_cache_re(pTHX_ regexp *prog)
301 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
303 PL_regprogram = prog->program;
305 PL_regnpar = prog->nparens;
306 PL_regdata = prog->data;
311 * Need to implement the following flags for reg_anch:
313 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
315 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
316 * INTUIT_AUTORITATIVE_ML
317 * INTUIT_ONCE_NOML - Intuit can match in one location only.
320 * Another flag for this function: SECOND_TIME (so that float substrs
321 * with giant delta may be not rechecked).
324 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
326 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
327 Otherwise, only SvCUR(sv) is used to get strbeg. */
329 /* XXXX We assume that strpos is strbeg unless sv. */
331 /* XXXX Some places assume that there is a fixed substring.
332 An update may be needed if optimizer marks as "INTUITable"
333 RExen without fixed substrings. Similarly, it is assumed that
334 lengths of all the strings are no more than minlen, thus they
335 cannot come from lookahead.
336 (Or minlen should take into account lookahead.) */
338 /* A failure to find a constant substring means that there is no need to make
339 an expensive call to REx engine, thus we celebrate a failure. Similarly,
340 finding a substring too deep into the string means that less calls to
341 regtry() should be needed.
343 REx compiler's optimizer found 4 possible hints:
344 a) Anchored substring;
346 c) Whether we are anchored (beginning-of-line or \G);
347 d) First node (of those at offset 0) which may distingush positions;
348 We use a)b)d) and multiline-part of c), and try to find a position in the
349 string which does not contradict any of them.
352 /* Most of decisions we do here should have been done at compile time.
353 The nodes of the REx which we used for the search should have been
354 deleted from the finite automaton. */
357 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
358 char *strend, U32 flags, re_scream_pos_data *data)
360 register I32 start_shift;
361 /* Should be nonnegative! */
362 register I32 end_shift;
369 register char *other_last = Nullch; /* other substr checked before this */
370 char *check_at; /* check substr found at this pos */
372 char *i_strpos = strpos;
375 DEBUG_r( if (!PL_colorset) reginitcolors() );
376 DEBUG_r(PerlIO_printf(Perl_debug_log,
377 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
378 PL_colors[4],PL_colors[5],PL_colors[0],
381 (strlen(prog->precomp) > 60 ? "..." : ""),
383 (int)(strend - strpos > 60 ? 60 : strend - strpos),
384 strpos, PL_colors[1],
385 (strend - strpos > 60 ? "..." : ""))
388 if (prog->reganch & ROPT_UTF8)
389 PL_reg_flags |= RF_utf8;
391 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
392 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
395 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
397 check = prog->check_substr;
398 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
399 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
400 || ( (prog->reganch & ROPT_ANCH_BOL)
401 && !PL_multiline ) ); /* Check after \n? */
404 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
405 | ROPT_IMPLICIT)) /* not a real BOL */
406 /* SvCUR is not set on references: SvRV and SvPVX overlap */
408 && (strpos != strbeg)) {
409 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
412 if (prog->check_offset_min == prog->check_offset_max &&
413 !(prog->reganch & ROPT_SANY_SEEN)) {
414 /* Substring at constant offset from beg-of-str... */
417 s = HOP3c(strpos, prog->check_offset_min, strend);
419 slen = SvCUR(check); /* >= 1 */
421 if ( strend - s > slen || strend - s < slen - 1
422 || (strend - s == slen && strend[-1] != '\n')) {
423 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
426 /* Now should match s[0..slen-2] */
428 if (slen && (*SvPVX(check) != *s
430 && memNE(SvPVX(check), s, slen)))) {
432 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
436 else if (*SvPVX(check) != *s
437 || ((slen = SvCUR(check)) > 1
438 && memNE(SvPVX(check), s, slen)))
440 goto success_at_start;
443 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
445 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
446 end_shift = prog->minlen - start_shift -
447 CHR_SVLEN(check) + (SvTAIL(check) != 0);
449 I32 end = prog->check_offset_max + CHR_SVLEN(check)
450 - (SvTAIL(check) != 0);
451 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
453 if (end_shift < eshift)
457 else { /* Can match at random position */
460 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
461 /* Should be nonnegative! */
462 end_shift = prog->minlen - start_shift -
463 CHR_SVLEN(check) + (SvTAIL(check) != 0);
466 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
468 Perl_croak(aTHX_ "panic: end_shift");
472 /* Find a possible match in the region s..strend by looking for
473 the "check" substring in the region corrected by start/end_shift. */
474 if (flags & REXEC_SCREAM) {
475 I32 p = -1; /* Internal iterator of scream. */
476 I32 *pp = data ? data->scream_pos : &p;
478 if (PL_screamfirst[BmRARE(check)] >= 0
479 || ( BmRARE(check) == '\n'
480 && (BmPREVIOUS(check) == SvCUR(check) - 1)
482 s = screaminstr(sv, check,
483 start_shift + (s - strbeg), end_shift, pp, 0);
487 *data->scream_olds = s;
489 else if (prog->reganch & ROPT_SANY_SEEN)
490 s = fbm_instr((U8*)(s + start_shift),
491 (U8*)(strend - end_shift),
492 check, PL_multiline ? FBMrf_MULTILINE : 0);
494 s = fbm_instr(HOP3(s, start_shift, strend),
495 HOP3(strend, -end_shift, strbeg),
496 check, PL_multiline ? FBMrf_MULTILINE : 0);
498 /* Update the count-of-usability, remove useless subpatterns,
501 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
502 (s ? "Found" : "Did not find"),
503 ((check == prog->anchored_substr) ? "anchored" : "floating"),
505 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
507 PL_colors[1], (SvTAIL(check) ? "$" : ""),
508 (s ? " at offset " : "...\n") ) );
515 /* Finish the diagnostic message */
516 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
518 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
519 Start with the other substr.
520 XXXX no SCREAM optimization yet - and a very coarse implementation
521 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
522 *always* match. Probably should be marked during compile...
523 Probably it is right to do no SCREAM here...
526 if (prog->float_substr && prog->anchored_substr) {
527 /* Take into account the "other" substring. */
528 /* XXXX May be hopelessly wrong for UTF... */
531 if (check == prog->float_substr) {
534 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
537 t = s - prog->check_offset_max;
538 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
539 && (!(prog->reganch & ROPT_UTF8)
540 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
545 t = HOP3c(t, prog->anchored_offset, strend);
546 if (t < other_last) /* These positions already checked */
548 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
551 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
552 /* On end-of-str: see comment below. */
553 s = fbm_instr((unsigned char*)t,
554 HOP3(HOP3(last1, prog->anchored_offset, strend)
555 + SvCUR(prog->anchored_substr),
556 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
557 prog->anchored_substr,
558 PL_multiline ? FBMrf_MULTILINE : 0);
559 DEBUG_r(PerlIO_printf(Perl_debug_log,
560 "%s anchored substr `%s%.*s%s'%s",
561 (s ? "Found" : "Contradicts"),
563 (int)(SvCUR(prog->anchored_substr)
564 - (SvTAIL(prog->anchored_substr)!=0)),
565 SvPVX(prog->anchored_substr),
566 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
568 if (last1 >= last2) {
569 DEBUG_r(PerlIO_printf(Perl_debug_log,
570 ", giving up...\n"));
573 DEBUG_r(PerlIO_printf(Perl_debug_log,
574 ", trying floating at offset %ld...\n",
575 (long)(HOP3c(s1, 1, strend) - i_strpos)));
576 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
577 s = HOP3c(last, 1, strend);
581 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
582 (long)(s - i_strpos)));
583 t = HOP3c(s, -prog->anchored_offset, strbeg);
584 other_last = HOP3c(s, 1, strend);
592 else { /* Take into account the floating substring. */
596 t = HOP3c(s, -start_shift, strbeg);
598 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
599 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
600 last = HOP3c(t, prog->float_max_offset, strend);
601 s = HOP3c(t, prog->float_min_offset, strend);
604 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
605 /* fbm_instr() takes into account exact value of end-of-str
606 if the check is SvTAIL(ed). Since false positives are OK,
607 and end-of-str is not later than strend we are OK. */
608 s = fbm_instr((unsigned char*)s,
609 (unsigned char*)last + SvCUR(prog->float_substr)
610 - (SvTAIL(prog->float_substr)!=0),
611 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
612 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
613 (s ? "Found" : "Contradicts"),
615 (int)(SvCUR(prog->float_substr)
616 - (SvTAIL(prog->float_substr)!=0)),
617 SvPVX(prog->float_substr),
618 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
621 DEBUG_r(PerlIO_printf(Perl_debug_log,
622 ", giving up...\n"));
625 DEBUG_r(PerlIO_printf(Perl_debug_log,
626 ", trying anchored starting at offset %ld...\n",
627 (long)(s1 + 1 - i_strpos)));
629 s = HOP3c(t, 1, strend);
633 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
634 (long)(s - i_strpos)));
635 other_last = s; /* Fix this later. --Hugo */
644 t = s - prog->check_offset_max;
645 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
646 && (!(prog->reganch & ROPT_UTF8)
647 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
649 /* Fixed substring is found far enough so that the match
650 cannot start at strpos. */
652 if (ml_anch && t[-1] != '\n') {
653 /* Eventually fbm_*() should handle this, but often
654 anchored_offset is not 0, so this check will not be wasted. */
655 /* XXXX In the code below we prefer to look for "^" even in
656 presence of anchored substrings. And we search even
657 beyond the found float position. These pessimizations
658 are historical artefacts only. */
660 while (t < strend - prog->minlen) {
662 if (t < check_at - prog->check_offset_min) {
663 if (prog->anchored_substr) {
664 /* Since we moved from the found position,
665 we definitely contradict the found anchored
666 substr. Due to the above check we do not
667 contradict "check" substr.
668 Thus we can arrive here only if check substr
669 is float. Redo checking for "other"=="fixed".
672 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
673 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
674 goto do_other_anchored;
676 /* We don't contradict the found floating substring. */
677 /* XXXX Why not check for STCLASS? */
679 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
680 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
683 /* Position contradicts check-string */
684 /* XXXX probably better to look for check-string
685 than for "\n", so one should lower the limit for t? */
686 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
687 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
688 other_last = strpos = s = t + 1;
693 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
694 PL_colors[0],PL_colors[1]));
698 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
699 PL_colors[0],PL_colors[1]));
703 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
706 /* The found string does not prohibit matching at strpos,
707 - no optimization of calling REx engine can be performed,
708 unless it was an MBOL and we are not after MBOL,
709 or a future STCLASS check will fail this. */
711 /* Even in this situation we may use MBOL flag if strpos is offset
712 wrt the start of the string. */
713 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
714 && (strpos != strbeg) && strpos[-1] != '\n'
715 /* May be due to an implicit anchor of m{.*foo} */
716 && !(prog->reganch & ROPT_IMPLICIT))
721 DEBUG_r( if (ml_anch)
722 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
723 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
726 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
727 && prog->check_substr /* Could be deleted already */
728 && --BmUSEFUL(prog->check_substr) < 0
729 && prog->check_substr == prog->float_substr)
731 /* If flags & SOMETHING - do not do it many times on the same match */
732 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
733 SvREFCNT_dec(prog->check_substr);
734 prog->check_substr = Nullsv; /* disable */
735 prog->float_substr = Nullsv; /* clear */
736 check = Nullsv; /* abort */
738 /* XXXX This is a remnant of the old implementation. It
739 looks wasteful, since now INTUIT can use many
741 prog->reganch &= ~RE_USE_INTUIT;
748 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
749 if (prog->regstclass) {
750 /* minlen == 0 is possible if regstclass is \b or \B,
751 and the fixed substr is ''$.
752 Since minlen is already taken into account, s+1 is before strend;
753 accidentally, minlen >= 1 guaranties no false positives at s + 1
754 even for \b or \B. But (minlen? 1 : 0) below assumes that
755 regstclass does not come from lookahead... */
756 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
757 This leaves EXACTF only, which is dealt with in find_byclass(). */
758 U8* str = (U8*)STRING(prog->regstclass);
759 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
760 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
762 char *endpos = (prog->anchored_substr || ml_anch)
763 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
764 : (prog->float_substr
765 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
768 char *startpos = strbeg;
771 if (prog->reganch & ROPT_UTF8) {
772 PL_regdata = prog->data;
775 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
780 if (endpos == strend) {
781 DEBUG_r( PerlIO_printf(Perl_debug_log,
782 "Could not match STCLASS...\n") );
785 DEBUG_r( PerlIO_printf(Perl_debug_log,
786 "This position contradicts STCLASS...\n") );
787 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
789 /* Contradict one of substrings */
790 if (prog->anchored_substr) {
791 if (prog->anchored_substr == check) {
792 DEBUG_r( what = "anchored" );
794 s = HOP3c(t, 1, strend);
795 if (s + start_shift + end_shift > strend) {
796 /* XXXX Should be taken into account earlier? */
797 DEBUG_r( PerlIO_printf(Perl_debug_log,
798 "Could not match STCLASS...\n") );
803 DEBUG_r( PerlIO_printf(Perl_debug_log,
804 "Looking for %s substr starting at offset %ld...\n",
805 what, (long)(s + start_shift - i_strpos)) );
808 /* Have both, check_string is floating */
809 if (t + start_shift >= check_at) /* Contradicts floating=check */
810 goto retry_floating_check;
811 /* Recheck anchored substring, but not floating... */
815 DEBUG_r( PerlIO_printf(Perl_debug_log,
816 "Looking for anchored substr starting at offset %ld...\n",
817 (long)(other_last - i_strpos)) );
818 goto do_other_anchored;
820 /* Another way we could have checked stclass at the
821 current position only: */
826 DEBUG_r( PerlIO_printf(Perl_debug_log,
827 "Looking for /%s^%s/m starting at offset %ld...\n",
828 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
831 if (!prog->float_substr) /* Could have been deleted */
833 /* Check is floating subtring. */
834 retry_floating_check:
835 t = check_at - start_shift;
836 DEBUG_r( what = "floating" );
837 goto hop_and_restart;
840 PerlIO_printf(Perl_debug_log,
841 "By STCLASS: moving %ld --> %ld\n",
842 (long)(t - i_strpos), (long)(s - i_strpos));
844 PerlIO_printf(Perl_debug_log,
845 "Does not contradict STCLASS...\n") );
848 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
849 PL_colors[4], (check ? "Guessed" : "Giving up"),
850 PL_colors[5], (long)(s - i_strpos)) );
853 fail_finish: /* Substring not found */
854 if (prog->check_substr) /* could be removed already */
855 BmUSEFUL(prog->check_substr) += 5; /* hooray */
857 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
858 PL_colors[4],PL_colors[5]));
862 /* We know what class REx starts with. Try to find this position... */
864 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
866 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
872 register I32 tmp = 1; /* Scratch variable? */
873 register bool do_utf8 = DO_UTF8(PL_reg_sv);
875 /* We know what class it must start with. */
879 if (reginclass(c, (U8*)s, do_utf8)) {
880 if (tmp && (norun || regtry(prog, s)))
887 s += do_utf8 ? UTF8SKIP(s) : 1;
894 c1 = to_utf8_lower((U8*)m);
895 c2 = to_utf8_upper((U8*)m);
906 c2 = PL_fold_locale[c1];
911 e = s; /* Due to minlen logic of intuit() */
917 if ( utf8_to_uvchr((U8*)s, &len) == c1
924 UV c = utf8_to_uvchr((U8*)s, &len);
925 if ( (c == c1 || c == c2) && regtry(prog, s) )
934 && (ln == 1 || !(OP(c) == EXACTF
936 : ibcmp_locale(s, m, ln)))
937 && (norun || regtry(prog, s)) )
943 if ( (*(U8*)s == c1 || *(U8*)s == c2)
944 && (ln == 1 || !(OP(c) == EXACTF
946 : ibcmp_locale(s, m, ln)))
947 && (norun || regtry(prog, s)) )
954 PL_reg_flags |= RF_tainted;
961 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
963 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
965 tmp = ((OP(c) == BOUND ?
966 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
967 LOAD_UTF8_CHARCLASS(alnum,"a");
969 if (tmp == !(OP(c) == BOUND ?
970 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
971 isALNUM_LC_utf8((U8*)s)))
974 if ((norun || regtry(prog, s)))
981 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
982 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
985 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
987 if ((norun || regtry(prog, s)))
993 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
997 PL_reg_flags |= RF_tainted;
1004 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1006 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1008 tmp = ((OP(c) == NBOUND ?
1009 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1010 LOAD_UTF8_CHARCLASS(alnum,"a");
1011 while (s < strend) {
1012 if (tmp == !(OP(c) == NBOUND ?
1013 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1014 isALNUM_LC_utf8((U8*)s)))
1016 else if ((norun || regtry(prog, s)))
1022 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1023 tmp = ((OP(c) == NBOUND ?
1024 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1025 while (s < strend) {
1027 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1029 else if ((norun || regtry(prog, s)))
1034 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1039 LOAD_UTF8_CHARCLASS(alnum,"a");
1040 while (s < strend) {
1041 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1042 if (tmp && (norun || regtry(prog, s)))
1053 while (s < strend) {
1055 if (tmp && (norun || regtry(prog, s)))
1067 PL_reg_flags |= RF_tainted;
1069 while (s < strend) {
1070 if (isALNUM_LC_utf8((U8*)s)) {
1071 if (tmp && (norun || regtry(prog, s)))
1082 while (s < strend) {
1083 if (isALNUM_LC(*s)) {
1084 if (tmp && (norun || regtry(prog, s)))
1097 LOAD_UTF8_CHARCLASS(alnum,"a");
1098 while (s < strend) {
1099 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1100 if (tmp && (norun || regtry(prog, s)))
1111 while (s < strend) {
1113 if (tmp && (norun || regtry(prog, s)))
1125 PL_reg_flags |= RF_tainted;
1127 while (s < strend) {
1128 if (!isALNUM_LC_utf8((U8*)s)) {
1129 if (tmp && (norun || regtry(prog, s)))
1140 while (s < strend) {
1141 if (!isALNUM_LC(*s)) {
1142 if (tmp && (norun || regtry(prog, s)))
1155 LOAD_UTF8_CHARCLASS(space," ");
1156 while (s < strend) {
1157 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1158 if (tmp && (norun || regtry(prog, s)))
1169 while (s < strend) {
1171 if (tmp && (norun || regtry(prog, s)))
1183 PL_reg_flags |= RF_tainted;
1185 while (s < strend) {
1186 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1187 if (tmp && (norun || regtry(prog, s)))
1198 while (s < strend) {
1199 if (isSPACE_LC(*s)) {
1200 if (tmp && (norun || regtry(prog, s)))
1213 LOAD_UTF8_CHARCLASS(space," ");
1214 while (s < strend) {
1215 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1216 if (tmp && (norun || regtry(prog, s)))
1227 while (s < strend) {
1229 if (tmp && (norun || regtry(prog, s)))
1241 PL_reg_flags |= RF_tainted;
1243 while (s < strend) {
1244 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1245 if (tmp && (norun || regtry(prog, s)))
1256 while (s < strend) {
1257 if (!isSPACE_LC(*s)) {
1258 if (tmp && (norun || regtry(prog, s)))
1271 LOAD_UTF8_CHARCLASS(digit,"0");
1272 while (s < strend) {
1273 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1274 if (tmp && (norun || regtry(prog, s)))
1285 while (s < strend) {
1287 if (tmp && (norun || regtry(prog, s)))
1299 PL_reg_flags |= RF_tainted;
1301 while (s < strend) {
1302 if (isDIGIT_LC_utf8((U8*)s)) {
1303 if (tmp && (norun || regtry(prog, s)))
1314 while (s < strend) {
1315 if (isDIGIT_LC(*s)) {
1316 if (tmp && (norun || regtry(prog, s)))
1329 LOAD_UTF8_CHARCLASS(digit,"0");
1330 while (s < strend) {
1331 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1332 if (tmp && (norun || regtry(prog, s)))
1343 while (s < strend) {
1345 if (tmp && (norun || regtry(prog, s)))
1357 PL_reg_flags |= RF_tainted;
1359 while (s < strend) {
1360 if (!isDIGIT_LC_utf8((U8*)s)) {
1361 if (tmp && (norun || regtry(prog, s)))
1372 while (s < strend) {
1373 if (!isDIGIT_LC(*s)) {
1374 if (tmp && (norun || regtry(prog, s)))
1386 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1395 - regexec_flags - match a regexp against a string
1398 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1399 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1400 /* strend: pointer to null at end of string */
1401 /* strbeg: real beginning of string */
1402 /* minend: end of match must be >=minend after stringarg. */
1403 /* data: May be used for some additional optimizations. */
1404 /* nosave: For optimizations. */
1407 register regnode *c;
1408 register char *startpos = stringarg;
1409 I32 minlen; /* must match at least this many chars */
1410 I32 dontbother = 0; /* how many characters not to try at end */
1411 /* I32 start_shift = 0; */ /* Offset of the start to find
1412 constant substr. */ /* CC */
1413 I32 end_shift = 0; /* Same for the end. */ /* CC */
1414 I32 scream_pos = -1; /* Internal iterator of scream. */
1416 SV* oreplsv = GvSV(PL_replgv);
1417 bool do_utf8 = DO_UTF8(sv);
1423 PL_regnarrate = DEBUG_r_TEST;
1426 /* Be paranoid... */
1427 if (prog == NULL || startpos == NULL) {
1428 Perl_croak(aTHX_ "NULL regexp parameter");
1432 minlen = prog->minlen;
1434 if (!(prog->reganch & ROPT_SANY_SEEN))
1435 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1438 if (strend - startpos < minlen) goto phooey;
1441 /* Check validity of program. */
1442 if (UCHARAT(prog->program) != REG_MAGIC) {
1443 Perl_croak(aTHX_ "corrupted regexp program");
1447 PL_reg_eval_set = 0;
1450 if (prog->reganch & ROPT_UTF8)
1451 PL_reg_flags |= RF_utf8;
1453 /* Mark beginning of line for ^ and lookbehind. */
1454 PL_regbol = startpos;
1458 /* Mark end of line for $ (and such) */
1461 /* see how far we have to get to not match where we matched before */
1462 PL_regtill = startpos+minend;
1464 /* We start without call_cc context. */
1467 /* If there is a "must appear" string, look for it. */
1470 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1473 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1474 PL_reg_ganch = startpos;
1475 else if (sv && SvTYPE(sv) >= SVt_PVMG
1477 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1478 && mg->mg_len >= 0) {
1479 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1480 if (prog->reganch & ROPT_ANCH_GPOS) {
1481 if (s > PL_reg_ganch)
1486 else /* pos() not defined */
1487 PL_reg_ganch = strbeg;
1490 if (do_utf8 == (UTF!=0) &&
1491 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1492 re_scream_pos_data d;
1494 d.scream_olds = &scream_olds;
1495 d.scream_pos = &scream_pos;
1496 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1498 goto phooey; /* not present */
1501 DEBUG_r( if (!PL_colorset) reginitcolors() );
1502 DEBUG_r(PerlIO_printf(Perl_debug_log,
1503 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1504 PL_colors[4],PL_colors[5],PL_colors[0],
1507 (strlen(prog->precomp) > 60 ? "..." : ""),
1509 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1510 startpos, PL_colors[1],
1511 (strend - startpos > 60 ? "..." : ""))
1514 /* Simplest case: anchored match need be tried only once. */
1515 /* [unless only anchor is BOL and multiline is set] */
1516 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1517 if (s == startpos && regtry(prog, startpos))
1519 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1520 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1525 dontbother = minlen - 1;
1526 end = HOP3c(strend, -dontbother, strbeg) - 1;
1527 /* for multiline we only have to try after newlines */
1528 if (prog->check_substr) {
1532 if (regtry(prog, s))
1537 if (prog->reganch & RE_USE_INTUIT) {
1538 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1549 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1550 if (regtry(prog, s))
1557 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1558 if (regtry(prog, PL_reg_ganch))
1563 /* Messy cases: unanchored match. */
1564 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1565 /* we have /x+whatever/ */
1566 /* it must be a one character string (XXXX Except UTF?) */
1567 char ch = SvPVX(prog->anchored_substr)[0];
1573 while (s < strend) {
1575 DEBUG_r( did_match = 1 );
1576 if (regtry(prog, s)) goto got_it;
1578 while (s < strend && *s == ch)
1585 while (s < strend) {
1587 DEBUG_r( did_match = 1 );
1588 if (regtry(prog, s)) goto got_it;
1590 while (s < strend && *s == ch)
1596 DEBUG_r(did_match ||
1597 PerlIO_printf(Perl_debug_log,
1598 "Did not find anchored character...\n"));
1601 else if (do_utf8 == (UTF!=0) &&
1602 (prog->anchored_substr != Nullsv
1603 || (prog->float_substr != Nullsv
1604 && prog->float_max_offset < strend - s))) {
1605 SV *must = prog->anchored_substr
1606 ? prog->anchored_substr : prog->float_substr;
1608 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1610 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1611 char *last = HOP3c(strend, /* Cannot start after this */
1612 -(I32)(CHR_SVLEN(must)
1613 - (SvTAIL(must) != 0) + back_min), strbeg);
1614 char *last1; /* Last position checked before */
1620 last1 = HOPc(s, -1);
1622 last1 = s - 1; /* bogus */
1624 /* XXXX check_substr already used to find `s', can optimize if
1625 check_substr==must. */
1627 dontbother = end_shift;
1628 strend = HOPc(strend, -dontbother);
1629 while ( (s <= last) &&
1630 ((flags & REXEC_SCREAM)
1631 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1632 end_shift, &scream_pos, 0))
1633 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1634 (unsigned char*)strend, must,
1635 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1636 DEBUG_r( did_match = 1 );
1637 if (HOPc(s, -back_max) > last1) {
1638 last1 = HOPc(s, -back_min);
1639 s = HOPc(s, -back_max);
1642 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1644 last1 = HOPc(s, -back_min);
1648 while (s <= last1) {
1649 if (regtry(prog, s))
1655 while (s <= last1) {
1656 if (regtry(prog, s))
1662 DEBUG_r(did_match ||
1663 PerlIO_printf(Perl_debug_log, "Did not find %s substr `%s%.*s%s'%s...\n",
1664 ((must == prog->anchored_substr)
1665 ? "anchored" : "floating"),
1667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
1672 else if ((c = prog->regstclass)) {
1673 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1674 /* don't bother with what can't match */
1675 strend = HOPc(strend, -(minlen - 1));
1677 SV *prop = sv_newmortal();
1679 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1681 if (find_byclass(prog, c, s, strend, startpos, 0))
1683 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1687 if (prog->float_substr != Nullsv) { /* Trim the end. */
1690 if (flags & REXEC_SCREAM) {
1691 last = screaminstr(sv, prog->float_substr, s - strbeg,
1692 end_shift, &scream_pos, 1); /* last one */
1694 last = scream_olds; /* Only one occurrence. */
1698 char *little = SvPV(prog->float_substr, len);
1700 if (SvTAIL(prog->float_substr)) {
1701 if (memEQ(strend - len + 1, little, len - 1))
1702 last = strend - len + 1;
1703 else if (!PL_multiline)
1704 last = memEQ(strend - len, little, len)
1705 ? strend - len : Nullch;
1711 last = rninstr(s, strend, little, little + len);
1713 last = strend; /* matching `$' */
1717 DEBUG_r(PerlIO_printf(Perl_debug_log,
1718 "%sCan't trim the tail, match fails (should not happen)%s\n",
1719 PL_colors[4],PL_colors[5]));
1720 goto phooey; /* Should not happen! */
1722 dontbother = strend - last + prog->float_min_offset;
1724 if (minlen && (dontbother < minlen))
1725 dontbother = minlen - 1;
1726 strend -= dontbother; /* this one's always in bytes! */
1727 /* We don't know much -- general case. */
1730 if (regtry(prog, s))
1739 if (regtry(prog, s))
1741 } while (s++ < strend);
1749 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1751 if (PL_reg_eval_set) {
1752 /* Preserve the current value of $^R */
1753 if (oreplsv != GvSV(PL_replgv))
1754 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1755 restored, the value remains
1757 restore_pos(aTHXo_ 0);
1760 /* make sure $`, $&, $', and $digit will work later */
1761 if ( !(flags & REXEC_NOT_FIRST) ) {
1762 if (RX_MATCH_COPIED(prog)) {
1763 Safefree(prog->subbeg);
1764 RX_MATCH_COPIED_off(prog);
1766 if (flags & REXEC_COPY_STR) {
1767 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1769 s = savepvn(strbeg, i);
1772 RX_MATCH_COPIED_on(prog);
1775 prog->subbeg = strbeg;
1776 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1783 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1784 PL_colors[4],PL_colors[5]));
1785 if (PL_reg_eval_set)
1786 restore_pos(aTHXo_ 0);
1791 - regtry - try match at specific point
1793 STATIC I32 /* 0 failure, 1 success */
1794 S_regtry(pTHX_ regexp *prog, char *startpos)
1802 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1804 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1807 PL_reg_eval_set = RS_init;
1809 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1810 (IV)(PL_stack_sp - PL_stack_base));
1812 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1813 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1814 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1816 /* Apparently this is not needed, judging by wantarray. */
1817 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1818 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1821 /* Make $_ available to executed code. */
1822 if (PL_reg_sv != DEFSV) {
1823 /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
1828 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1829 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1830 /* prepare for quick setting of pos */
1831 sv_magic(PL_reg_sv, (SV*)0,
1832 PERL_MAGIC_regex_global, Nullch, 0);
1833 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1837 PL_reg_oldpos = mg->mg_len;
1838 SAVEDESTRUCTOR_X(restore_pos, 0);
1841 Newz(22,PL_reg_curpm, 1, PMOP);
1842 PL_reg_curpm->op_pmregexp = prog;
1843 PL_reg_oldcurpm = PL_curpm;
1844 PL_curpm = PL_reg_curpm;
1845 if (RX_MATCH_COPIED(prog)) {
1846 /* Here is a serious problem: we cannot rewrite subbeg,
1847 since it may be needed if this match fails. Thus
1848 $` inside (?{}) could fail... */
1849 PL_reg_oldsaved = prog->subbeg;
1850 PL_reg_oldsavedlen = prog->sublen;
1851 RX_MATCH_COPIED_off(prog);
1854 PL_reg_oldsaved = Nullch;
1855 prog->subbeg = PL_bostr;
1856 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1858 prog->startp[0] = startpos - PL_bostr;
1859 PL_reginput = startpos;
1860 PL_regstartp = prog->startp;
1861 PL_regendp = prog->endp;
1862 PL_reglastparen = &prog->lastparen;
1863 prog->lastparen = 0;
1865 DEBUG_r(PL_reg_starttry = startpos);
1866 if (PL_reg_start_tmpl <= prog->nparens) {
1867 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1868 if(PL_reg_start_tmp)
1869 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1871 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1874 /* XXXX What this code is doing here?!!! There should be no need
1875 to do this again and again, PL_reglastparen should take care of
1878 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1879 * Actually, the code in regcppop() (which Ilya may be meaning by
1880 * PL_reglastparen), is not needed at all by the test suite
1881 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1882 * enough, for building DynaLoader, or otherwise this
1883 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1884 * will happen. Meanwhile, this code *is* needed for the
1885 * above-mentioned test suite tests to succeed. The common theme
1886 * on those tests seems to be returning null fields from matches.
1891 if (prog->nparens) {
1892 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1899 if (regmatch(prog->program + 1)) {
1900 prog->endp[0] = PL_reginput - PL_bostr;
1903 REGCP_UNWIND(lastcp);
1907 #define RE_UNWIND_BRANCH 1
1908 #define RE_UNWIND_BRANCHJ 2
1912 typedef struct { /* XX: makes sense to enlarge it... */
1916 } re_unwind_generic_t;
1929 } re_unwind_branch_t;
1931 typedef union re_unwind_t {
1933 re_unwind_generic_t generic;
1934 re_unwind_branch_t branch;
1938 - regmatch - main matching routine
1940 * Conceptually the strategy is simple: check to see whether the current
1941 * node matches, call self recursively to see whether the rest matches,
1942 * and then act accordingly. In practice we make some effort to avoid
1943 * recursion, in particular by going through "ordinary" nodes (that don't
1944 * need to know whether the rest of the match failed) by a loop instead of
1947 /* [lwall] I've hoisted the register declarations to the outer block in order to
1948 * maybe save a little bit of pushing and popping on the stack. It also takes
1949 * advantage of machines that use a register save mask on subroutine entry.
1951 STATIC I32 /* 0 failure, 1 success */
1952 S_regmatch(pTHX_ regnode *prog)
1954 register regnode *scan; /* Current node. */
1955 regnode *next; /* Next node. */
1956 regnode *inner; /* Next node in internal branch. */
1957 register I32 nextchr; /* renamed nextchr - nextchar colides with
1958 function of same name */
1959 register I32 n; /* no or next */
1960 register I32 ln; /* len or last */
1961 register char *s; /* operand or save */
1962 register char *locinput = PL_reginput;
1963 register I32 c1, c2, paren; /* case fold search, parenth */
1964 int minmod = 0, sw = 0, logical = 0;
1966 I32 firstcp = PL_savestack_ix;
1967 register bool do_utf8 = DO_UTF8(PL_reg_sv);
1973 /* Note that nextchr is a byte even in UTF */
1974 nextchr = UCHARAT(locinput);
1976 while (scan != NULL) {
1977 #define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
1979 # define sayYES goto yes
1980 # define sayNO goto no
1981 # define sayYES_FINAL goto yes_final
1982 # define sayYES_LOUD goto yes_loud
1983 # define sayNO_FINAL goto no_final
1984 # define sayNO_SILENT goto do_no
1985 # define saySAME(x) if (x) goto yes; else goto no
1986 # define REPORT_CODE_OFF 24
1988 # define sayYES return 1
1989 # define sayNO return 0
1990 # define sayYES_FINAL return 1
1991 # define sayYES_LOUD return 1
1992 # define sayNO_FINAL return 0
1993 # define sayNO_SILENT return 0
1994 # define saySAME(x) return x
1997 SV *prop = sv_newmortal();
1998 int docolor = *PL_colors[0];
1999 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2000 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2001 /* The part of the string before starttry has one color
2002 (pref0_len chars), between starttry and current
2003 position another one (pref_len - pref0_len chars),
2004 after the current position the third one.
2005 We assume that pref0_len <= pref_len, otherwise we
2006 decrease pref0_len. */
2007 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2008 ? (5 + taill) - l : locinput - PL_bostr;
2011 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2013 pref0_len = pref_len - (locinput - PL_reg_starttry);
2014 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2015 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2016 ? (5 + taill) - pref_len : PL_regeol - locinput);
2017 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2021 if (pref0_len > pref_len)
2022 pref0_len = pref_len;
2023 regprop(prop, scan);
2024 PerlIO_printf(Perl_debug_log,
2025 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2026 (IV)(locinput - PL_bostr),
2027 PL_colors[4], pref0_len,
2028 locinput - pref_len, PL_colors[5],
2029 PL_colors[2], pref_len - pref0_len,
2030 locinput - pref_len + pref0_len, PL_colors[3],
2031 (docolor ? "" : "> <"),
2032 PL_colors[0], l, locinput, PL_colors[1],
2033 15 - l - pref_len + 1,
2035 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2039 next = scan + NEXT_OFF(scan);
2045 if (locinput == PL_bostr || (PL_multiline &&
2046 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2048 /* regtill = regbol; */
2053 if (locinput == PL_bostr ||
2054 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2060 if (locinput == PL_bostr)
2064 if (locinput == PL_reg_ganch)
2074 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2079 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2081 if (PL_regeol - locinput > 1)
2085 if (PL_regeol != locinput)
2089 if (!nextchr && locinput >= PL_regeol)
2091 nextchr = UCHARAT(++locinput);
2094 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2097 locinput += PL_utf8skip[nextchr];
2098 if (locinput > PL_regeol)
2100 nextchr = UCHARAT(locinput);
2103 nextchr = UCHARAT(++locinput);
2108 if (do_utf8 != (UTF!=0)) {
2116 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2125 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2131 nextchr = UCHARAT(locinput);
2134 /* Inline the first character, for speed. */
2135 if (UCHARAT(s) != nextchr)
2137 if (PL_regeol - locinput < ln)
2139 if (ln > 1 && memNE(s, locinput, ln))
2142 nextchr = UCHARAT(locinput);
2145 PL_reg_flags |= RF_tainted;
2155 c1 = OP(scan) == EXACTF;
2157 if (l >= PL_regeol) {
2160 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2161 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2163 s += UTF ? UTF8SKIP(s) : 1;
2167 nextchr = UCHARAT(locinput);
2171 /* Inline the first character, for speed. */
2172 if (UCHARAT(s) != nextchr &&
2173 UCHARAT(s) != ((OP(scan) == EXACTF)
2174 ? PL_fold : PL_fold_locale)[nextchr])
2176 if (PL_regeol - locinput < ln)
2178 if (ln > 1 && (OP(scan) == EXACTF
2179 ? ibcmp(s, locinput, ln)
2180 : ibcmp_locale(s, locinput, ln)))
2183 nextchr = UCHARAT(locinput);
2187 if (!reginclass(scan, (U8*)locinput, do_utf8))
2189 if (locinput >= PL_regeol)
2191 locinput += PL_utf8skip[nextchr];
2192 nextchr = UCHARAT(locinput);
2196 nextchr = UCHARAT(locinput);
2197 if (!reginclass(scan, (U8*)locinput, do_utf8))
2199 if (!nextchr && locinput >= PL_regeol)
2201 nextchr = UCHARAT(++locinput);
2205 PL_reg_flags |= RF_tainted;
2211 if (!(OP(scan) == ALNUM
2212 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2213 : isALNUM_LC_utf8((U8*)locinput)))
2217 locinput += PL_utf8skip[nextchr];
2218 nextchr = UCHARAT(locinput);
2221 if (!(OP(scan) == ALNUM
2222 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2224 nextchr = UCHARAT(++locinput);
2227 PL_reg_flags |= RF_tainted;
2230 if (!nextchr && locinput >= PL_regeol)
2233 LOAD_UTF8_CHARCLASS(alnum,"a");
2234 if (OP(scan) == NALNUM
2235 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2236 : isALNUM_LC_utf8((U8*)locinput))
2240 locinput += PL_utf8skip[nextchr];
2241 nextchr = UCHARAT(locinput);
2244 if (OP(scan) == NALNUM
2245 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2247 nextchr = UCHARAT(++locinput);
2251 PL_reg_flags |= RF_tainted;
2255 /* was last char in word? */
2257 if (locinput == PL_bostr)
2260 U8 *r = reghop((U8*)locinput, -1);
2262 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2264 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2265 ln = isALNUM_uni(ln);
2266 LOAD_UTF8_CHARCLASS(alnum,"a");
2267 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2270 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2271 n = isALNUM_LC_utf8((U8*)locinput);
2275 ln = (locinput != PL_bostr) ?
2276 UCHARAT(locinput - 1) : '\n';
2277 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2279 n = isALNUM(nextchr);
2282 ln = isALNUM_LC(ln);
2283 n = isALNUM_LC(nextchr);
2286 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2287 OP(scan) == BOUNDL))
2291 PL_reg_flags |= RF_tainted;
2297 if (UTF8_IS_CONTINUED(nextchr)) {
2298 LOAD_UTF8_CHARCLASS(space," ");
2299 if (!(OP(scan) == SPACE
2300 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2301 : isSPACE_LC_utf8((U8*)locinput)))
2305 locinput += PL_utf8skip[nextchr];
2306 nextchr = UCHARAT(locinput);
2309 if (!(OP(scan) == SPACE
2310 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2312 nextchr = UCHARAT(++locinput);
2315 if (!(OP(scan) == SPACE
2316 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2318 nextchr = UCHARAT(++locinput);
2322 PL_reg_flags |= RF_tainted;
2325 if (!nextchr && locinput >= PL_regeol)
2328 LOAD_UTF8_CHARCLASS(space," ");
2329 if (OP(scan) == NSPACE
2330 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2331 : isSPACE_LC_utf8((U8*)locinput))
2335 locinput += PL_utf8skip[nextchr];
2336 nextchr = UCHARAT(locinput);
2339 if (OP(scan) == NSPACE
2340 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2342 nextchr = UCHARAT(++locinput);
2345 PL_reg_flags |= RF_tainted;
2351 LOAD_UTF8_CHARCLASS(digit,"0");
2352 if (!(OP(scan) == DIGIT
2353 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2354 : isDIGIT_LC_utf8((U8*)locinput)))
2358 locinput += PL_utf8skip[nextchr];
2359 nextchr = UCHARAT(locinput);
2362 if (!(OP(scan) == DIGIT
2363 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2365 nextchr = UCHARAT(++locinput);
2368 PL_reg_flags |= RF_tainted;
2371 if (!nextchr && locinput >= PL_regeol)
2374 LOAD_UTF8_CHARCLASS(digit,"0");
2375 if (OP(scan) == NDIGIT
2376 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2377 : isDIGIT_LC_utf8((U8*)locinput))
2381 locinput += PL_utf8skip[nextchr];
2382 nextchr = UCHARAT(locinput);
2385 if (OP(scan) == NDIGIT
2386 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2388 nextchr = UCHARAT(++locinput);
2391 LOAD_UTF8_CHARCLASS(mark,"~");
2392 if (locinput >= PL_regeol ||
2393 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2395 locinput += PL_utf8skip[nextchr];
2396 while (locinput < PL_regeol &&
2397 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2398 locinput += UTF8SKIP(locinput);
2399 if (locinput > PL_regeol)
2401 nextchr = UCHARAT(locinput);
2404 PL_reg_flags |= RF_tainted;
2408 n = ARG(scan); /* which paren pair */
2409 ln = PL_regstartp[n];
2410 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2411 if (*PL_reglastparen < n || ln == -1)
2412 sayNO; /* Do not match unless seen CLOSEn. */
2413 if (ln == PL_regendp[n])
2417 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2419 char *e = PL_bostr + PL_regendp[n];
2421 * Note that we can't do the "other character" lookup trick as
2422 * in the 8-bit case (no pun intended) because in Unicode we
2423 * have to map both upper and title case to lower case.
2425 if (OP(scan) == REFF) {
2429 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2439 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2446 nextchr = UCHARAT(locinput);
2450 /* Inline the first character, for speed. */
2451 if (UCHARAT(s) != nextchr &&
2453 (UCHARAT(s) != ((OP(scan) == REFF
2454 ? PL_fold : PL_fold_locale)[nextchr]))))
2456 ln = PL_regendp[n] - ln;
2457 if (locinput + ln > PL_regeol)
2459 if (ln > 1 && (OP(scan) == REF
2460 ? memNE(s, locinput, ln)
2462 ? ibcmp(s, locinput, ln)
2463 : ibcmp_locale(s, locinput, ln))))
2466 nextchr = UCHARAT(locinput);
2477 OP_4tree *oop = PL_op;
2478 COP *ocurcop = PL_curcop;
2479 SV **ocurpad = PL_curpad;
2483 PL_op = (OP_4tree*)PL_regdata->data[n];
2484 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2485 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2486 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2488 CALLRUNOPS(aTHX); /* Scalar context. */
2494 PL_curpad = ocurpad;
2495 PL_curcop = ocurcop;
2497 if (logical == 2) { /* Postponed subexpression. */
2499 MAGIC *mg = Null(MAGIC*);
2501 CHECKPOINT cp, lastcp;
2503 if(SvROK(ret) || SvRMAGICAL(ret)) {
2504 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2507 mg = mg_find(sv, PERL_MAGIC_qr);
2510 re = (regexp *)mg->mg_obj;
2511 (void)ReREFCNT_inc(re);
2515 char *t = SvPV(ret, len);
2517 char *oprecomp = PL_regprecomp;
2518 I32 osize = PL_regsize;
2519 I32 onpar = PL_regnpar;
2522 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2524 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2525 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2527 PL_regprecomp = oprecomp;
2532 PerlIO_printf(Perl_debug_log,
2533 "Entering embedded `%s%.60s%s%s'\n",
2537 (strlen(re->precomp) > 60 ? "..." : ""))
2540 state.prev = PL_reg_call_cc;
2541 state.cc = PL_regcc;
2542 state.re = PL_reg_re;
2546 cp = regcppush(0); /* Save *all* the positions. */
2549 state.ss = PL_savestack_ix;
2550 *PL_reglastparen = 0;
2551 PL_reg_call_cc = &state;
2552 PL_reginput = locinput;
2554 /* XXXX This is too dramatic a measure... */
2557 if (regmatch(re->program + 1)) {
2558 /* Even though we succeeded, we need to restore
2559 global variables, since we may be wrapped inside
2560 SUSPEND, thus the match may be not finished yet. */
2562 /* XXXX Do this only if SUSPENDed? */
2563 PL_reg_call_cc = state.prev;
2564 PL_regcc = state.cc;
2565 PL_reg_re = state.re;
2566 cache_re(PL_reg_re);
2568 /* XXXX This is too dramatic a measure... */
2571 /* These are needed even if not SUSPEND. */
2577 REGCP_UNWIND(lastcp);
2579 PL_reg_call_cc = state.prev;
2580 PL_regcc = state.cc;
2581 PL_reg_re = state.re;
2582 cache_re(PL_reg_re);
2584 /* XXXX This is too dramatic a measure... */
2593 sv_setsv(save_scalar(PL_replgv), ret);
2597 n = ARG(scan); /* which paren pair */
2598 PL_reg_start_tmp[n] = locinput;
2603 n = ARG(scan); /* which paren pair */
2604 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2605 PL_regendp[n] = locinput - PL_bostr;
2606 if (n > *PL_reglastparen)
2607 *PL_reglastparen = n;
2610 n = ARG(scan); /* which paren pair */
2611 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2614 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2616 next = NEXTOPER(NEXTOPER(scan));
2618 next = scan + ARG(scan);
2619 if (OP(next) == IFTHEN) /* Fake one. */
2620 next = NEXTOPER(NEXTOPER(next));
2624 logical = scan->flags;
2626 /*******************************************************************
2627 PL_regcc contains infoblock about the innermost (...)* loop, and
2628 a pointer to the next outer infoblock.
2630 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2632 1) After matching X, regnode for CURLYX is processed;
2634 2) This regnode creates infoblock on the stack, and calls
2635 regmatch() recursively with the starting point at WHILEM node;
2637 3) Each hit of WHILEM node tries to match A and Z (in the order
2638 depending on the current iteration, min/max of {min,max} and
2639 greediness). The information about where are nodes for "A"
2640 and "Z" is read from the infoblock, as is info on how many times "A"
2641 was already matched, and greediness.
2643 4) After A matches, the same WHILEM node is hit again.
2645 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2646 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2647 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2648 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2649 of the external loop.
2651 Currently present infoblocks form a tree with a stem formed by PL_curcc
2652 and whatever it mentions via ->next, and additional attached trees
2653 corresponding to temporarily unset infoblocks as in "5" above.
2655 In the following picture infoblocks for outer loop of
2656 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2657 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2658 infoblocks are drawn below the "reset" infoblock.
2660 In fact in the picture below we do not show failed matches for Z and T
2661 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2662 more obvious *why* one needs to *temporary* unset infoblocks.]
2664 Matched REx position InfoBlocks Comment
2668 Y A)*?Z)*?T x <- O <- I
2669 YA )*?Z)*?T x <- O <- I
2670 YA A)*?Z)*?T x <- O <- I
2671 YAA )*?Z)*?T x <- O <- I
2672 YAA Z)*?T x <- O # Temporary unset I
2675 YAAZ Y(A)*?Z)*?T x <- O
2678 YAAZY (A)*?Z)*?T x <- O
2681 YAAZY A)*?Z)*?T x <- O <- I
2684 YAAZYA )*?Z)*?T x <- O <- I
2687 YAAZYA Z)*?T x <- O # Temporary unset I
2693 YAAZYAZ T x # Temporary unset O
2700 *******************************************************************/
2703 CHECKPOINT cp = PL_savestack_ix;
2704 /* No need to save/restore up to this paren */
2705 I32 parenfloor = scan->flags;
2707 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2709 cc.oldcc = PL_regcc;
2711 /* XXXX Probably it is better to teach regpush to support
2712 parenfloor > PL_regsize... */
2713 if (parenfloor > *PL_reglastparen)
2714 parenfloor = *PL_reglastparen; /* Pessimization... */
2715 cc.parenfloor = parenfloor;
2717 cc.min = ARG1(scan);
2718 cc.max = ARG2(scan);
2719 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2723 PL_reginput = locinput;
2724 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2726 PL_regcc = cc.oldcc;
2732 * This is really hard to understand, because after we match
2733 * what we're trying to match, we must make sure the rest of
2734 * the REx is going to match for sure, and to do that we have
2735 * to go back UP the parse tree by recursing ever deeper. And
2736 * if it fails, we have to reset our parent's current state
2737 * that we can try again after backing off.
2740 CHECKPOINT cp, lastcp;
2741 CURCUR* cc = PL_regcc;
2742 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2744 n = cc->cur + 1; /* how many we know we matched */
2745 PL_reginput = locinput;
2748 PerlIO_printf(Perl_debug_log,
2749 "%*s %ld out of %ld..%ld cc=%lx\n",
2750 REPORT_CODE_OFF+PL_regindent*2, "",
2751 (long)n, (long)cc->min,
2752 (long)cc->max, (long)cc)
2755 /* If degenerate scan matches "", assume scan done. */
2757 if (locinput == cc->lastloc && n >= cc->min) {
2758 PL_regcc = cc->oldcc;
2762 PerlIO_printf(Perl_debug_log,
2763 "%*s empty match detected, try continuation...\n",
2764 REPORT_CODE_OFF+PL_regindent*2, "")
2766 if (regmatch(cc->next))
2774 /* First just match a string of min scans. */
2778 cc->lastloc = locinput;
2779 if (regmatch(cc->scan))
2782 cc->lastloc = lastloc;
2787 /* Check whether we already were at this position.
2788 Postpone detection until we know the match is not
2789 *that* much linear. */
2790 if (!PL_reg_maxiter) {
2791 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2792 PL_reg_leftiter = PL_reg_maxiter;
2794 if (PL_reg_leftiter-- == 0) {
2795 I32 size = (PL_reg_maxiter + 7)/8;
2796 if (PL_reg_poscache) {
2797 if (PL_reg_poscache_size < size) {
2798 Renew(PL_reg_poscache, size, char);
2799 PL_reg_poscache_size = size;
2801 Zero(PL_reg_poscache, size, char);
2804 PL_reg_poscache_size = size;
2805 Newz(29, PL_reg_poscache, size, char);
2808 PerlIO_printf(Perl_debug_log,
2809 "%sDetected a super-linear match, switching on caching%s...\n",
2810 PL_colors[4], PL_colors[5])
2813 if (PL_reg_leftiter < 0) {
2814 I32 o = locinput - PL_bostr, b;
2816 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2819 if (PL_reg_poscache[o] & (1<<b)) {
2821 PerlIO_printf(Perl_debug_log,
2822 "%*s already tried at this position...\n",
2823 REPORT_CODE_OFF+PL_regindent*2, "")
2827 PL_reg_poscache[o] |= (1<<b);
2831 /* Prefer next over scan for minimal matching. */
2834 PL_regcc = cc->oldcc;
2837 cp = regcppush(cc->parenfloor);
2839 if (regmatch(cc->next)) {
2841 sayYES; /* All done. */
2843 REGCP_UNWIND(lastcp);
2849 if (n >= cc->max) { /* Maximum greed exceeded? */
2850 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2851 && !(PL_reg_flags & RF_warned)) {
2852 PL_reg_flags |= RF_warned;
2853 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2854 "Complex regular subexpression recursion",
2861 PerlIO_printf(Perl_debug_log,
2862 "%*s trying longer...\n",
2863 REPORT_CODE_OFF+PL_regindent*2, "")
2865 /* Try scanning more and see if it helps. */
2866 PL_reginput = locinput;
2868 cc->lastloc = locinput;
2869 cp = regcppush(cc->parenfloor);
2871 if (regmatch(cc->scan)) {
2875 REGCP_UNWIND(lastcp);
2878 cc->lastloc = lastloc;
2882 /* Prefer scan over next for maximal matching. */
2884 if (n < cc->max) { /* More greed allowed? */
2885 cp = regcppush(cc->parenfloor);
2887 cc->lastloc = locinput;
2889 if (regmatch(cc->scan)) {
2893 REGCP_UNWIND(lastcp);
2894 regcppop(); /* Restore some previous $<digit>s? */
2895 PL_reginput = locinput;
2897 PerlIO_printf(Perl_debug_log,
2898 "%*s failed, try continuation...\n",
2899 REPORT_CODE_OFF+PL_regindent*2, "")
2902 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2903 && !(PL_reg_flags & RF_warned)) {
2904 PL_reg_flags |= RF_warned;
2905 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2906 "Complex regular subexpression recursion",
2910 /* Failed deeper matches of scan, so see if this one works. */
2911 PL_regcc = cc->oldcc;
2914 if (regmatch(cc->next))
2920 cc->lastloc = lastloc;
2925 next = scan + ARG(scan);
2928 inner = NEXTOPER(NEXTOPER(scan));
2931 inner = NEXTOPER(scan);
2936 if (OP(next) != c1) /* No choice. */
2937 next = inner; /* Avoid recursion. */
2939 I32 lastparen = *PL_reglastparen;
2941 re_unwind_branch_t *uw;
2943 /* Put unwinding data on stack */
2944 unwind1 = SSNEWt(1,re_unwind_branch_t);
2945 uw = SSPTRt(unwind1,re_unwind_branch_t);
2948 uw->type = ((c1 == BRANCH)
2950 : RE_UNWIND_BRANCHJ);
2951 uw->lastparen = lastparen;
2953 uw->locinput = locinput;
2954 uw->nextchr = nextchr;
2956 uw->regindent = ++PL_regindent;
2959 REGCP_SET(uw->lastcp);
2961 /* Now go into the first branch */
2974 /* We suppose that the next guy does not need
2975 backtracking: in particular, it is of constant length,
2976 and has no parenths to influence future backrefs. */
2977 ln = ARG1(scan); /* min to match */
2978 n = ARG2(scan); /* max to match */
2979 paren = scan->flags;
2981 if (paren > PL_regsize)
2983 if (paren > *PL_reglastparen)
2984 *PL_reglastparen = paren;
2986 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
2988 scan += NEXT_OFF(scan); /* Skip former OPEN. */
2989 PL_reginput = locinput;
2992 if (ln && regrepeat_hard(scan, ln, &l) < ln)
2994 if (ln && l == 0 && n >= ln
2995 /* In fact, this is tricky. If paren, then the
2996 fact that we did/didnot match may influence
2997 future execution. */
2998 && !(paren && ln == 0))
3000 locinput = PL_reginput;
3001 if (PL_regkind[(U8)OP(next)] == EXACT) {
3002 c1 = (U8)*STRING(next);
3003 if (OP(next) == EXACTF)
3005 else if (OP(next) == EXACTFL)
3006 c2 = PL_fold_locale[c1];
3013 /* This may be improved if l == 0. */
3014 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3015 /* If it could work, try it. */
3017 UCHARAT(PL_reginput) == c1 ||
3018 UCHARAT(PL_reginput) == c2)
3022 PL_regstartp[paren] =
3023 HOPc(PL_reginput, -l) - PL_bostr;
3024 PL_regendp[paren] = PL_reginput - PL_bostr;
3027 PL_regendp[paren] = -1;
3031 REGCP_UNWIND(lastcp);
3033 /* Couldn't or didn't -- move forward. */
3034 PL_reginput = locinput;
3035 if (regrepeat_hard(scan, 1, &l)) {
3037 locinput = PL_reginput;
3044 n = regrepeat_hard(scan, n, &l);
3045 if (n != 0 && l == 0
3046 /* In fact, this is tricky. If paren, then the
3047 fact that we did/didnot match may influence
3048 future execution. */
3049 && !(paren && ln == 0))
3051 locinput = PL_reginput;
3053 PerlIO_printf(Perl_debug_log,
3054 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3055 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3059 if (PL_regkind[(U8)OP(next)] == EXACT) {
3060 c1 = (U8)*STRING(next);
3061 if (OP(next) == EXACTF)
3063 else if (OP(next) == EXACTFL)
3064 c2 = PL_fold_locale[c1];
3073 /* If it could work, try it. */
3075 UCHARAT(PL_reginput) == c1 ||
3076 UCHARAT(PL_reginput) == c2)
3079 PerlIO_printf(Perl_debug_log,
3080 "%*s trying tail with n=%"IVdf"...\n",
3081 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3085 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3086 PL_regendp[paren] = PL_reginput - PL_bostr;
3089 PL_regendp[paren] = -1;
3093 REGCP_UNWIND(lastcp);
3095 /* Couldn't or didn't -- back up. */
3097 locinput = HOPc(locinput, -l);
3098 PL_reginput = locinput;
3105 paren = scan->flags; /* Which paren to set */
3106 if (paren > PL_regsize)
3108 if (paren > *PL_reglastparen)
3109 *PL_reglastparen = paren;
3110 ln = ARG1(scan); /* min to match */
3111 n = ARG2(scan); /* max to match */
3112 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3116 ln = ARG1(scan); /* min to match */
3117 n = ARG2(scan); /* max to match */
3118 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3123 scan = NEXTOPER(scan);
3129 scan = NEXTOPER(scan);
3133 * Lookahead to avoid useless match attempts
3134 * when we know what character comes next.
3136 if (PL_regkind[(U8)OP(next)] == EXACT) {
3137 U8 *s = (U8*)STRING(next);
3140 if (OP(next) == EXACTF)
3142 else if (OP(next) == EXACTFL)
3143 c2 = PL_fold_locale[c1];
3146 if (OP(next) == EXACTF) {
3147 c1 = to_utf8_lower(s);
3148 c2 = to_utf8_upper(s);
3151 c2 = c1 = utf8_to_uvchr(s, NULL);
3157 PL_reginput = locinput;
3161 if (ln && regrepeat(scan, ln) < ln)
3163 locinput = PL_reginput;
3166 char *e; /* Should not check after this */
3167 char *old = locinput;
3169 if (n == REG_INFTY) {
3172 while (UTF8_IS_CONTINUATION(*(U8*)e))
3178 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3182 e = locinput + n - ln;
3188 /* Find place 'next' could work */
3191 while (locinput <= e && *locinput != c1)
3194 while (locinput <= e
3199 count = locinput - old;
3206 utf8_to_uvchr((U8*)locinput, &len) != c1;
3211 for (count = 0; locinput <= e; count++) {
3212 UV c = utf8_to_uvchr((U8*)locinput, &len);
3213 if (c == c1 || c == c2)
3221 /* PL_reginput == old now */
3222 if (locinput != old) {
3223 ln = 1; /* Did some */
3224 if (regrepeat(scan, count) < count)
3227 /* PL_reginput == locinput now */
3228 TRYPAREN(paren, ln, locinput);
3229 PL_reginput = locinput; /* Could be reset... */
3230 REGCP_UNWIND(lastcp);
3231 /* Couldn't or didn't -- move forward. */
3234 locinput += UTF8SKIP(locinput);
3240 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3244 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3246 c = UCHARAT(PL_reginput);
3247 /* If it could work, try it. */
3248 if (c == c1 || c == c2)
3250 TRYPAREN(paren, n, PL_reginput);
3251 REGCP_UNWIND(lastcp);
3254 /* If it could work, try it. */
3255 else if (c1 == -1000)
3257 TRYPAREN(paren, n, PL_reginput);
3258 REGCP_UNWIND(lastcp);
3260 /* Couldn't or didn't -- move forward. */
3261 PL_reginput = locinput;
3262 if (regrepeat(scan, 1)) {
3264 locinput = PL_reginput;
3272 n = regrepeat(scan, n);
3273 locinput = PL_reginput;
3274 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3275 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3276 ln = n; /* why back off? */
3277 /* ...because $ and \Z can match before *and* after
3278 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3279 We should back off by one in this case. */
3280 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3289 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3291 c = UCHARAT(PL_reginput);
3293 /* If it could work, try it. */
3294 if (c1 == -1000 || c == c1 || c == c2)
3296 TRYPAREN(paren, n, PL_reginput);
3297 REGCP_UNWIND(lastcp);
3299 /* Couldn't or didn't -- back up. */
3301 PL_reginput = locinput = HOPc(locinput, -1);
3309 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3311 c = UCHARAT(PL_reginput);
3313 /* If it could work, try it. */
3314 if (c1 == -1000 || c == c1 || c == c2)
3316 TRYPAREN(paren, n, PL_reginput);
3317 REGCP_UNWIND(lastcp);
3319 /* Couldn't or didn't -- back up. */
3321 PL_reginput = locinput = HOPc(locinput, -1);
3328 if (PL_reg_call_cc) {
3329 re_cc_state *cur_call_cc = PL_reg_call_cc;
3330 CURCUR *cctmp = PL_regcc;
3331 regexp *re = PL_reg_re;
3332 CHECKPOINT cp, lastcp;
3334 cp = regcppush(0); /* Save *all* the positions. */
3336 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3338 PL_reginput = locinput; /* Make position available to
3340 cache_re(PL_reg_call_cc->re);
3341 PL_regcc = PL_reg_call_cc->cc;
3342 PL_reg_call_cc = PL_reg_call_cc->prev;
3343 if (regmatch(cur_call_cc->node)) {
3344 PL_reg_call_cc = cur_call_cc;
3348 REGCP_UNWIND(lastcp);
3350 PL_reg_call_cc = cur_call_cc;
3356 PerlIO_printf(Perl_debug_log,
3357 "%*s continuation failed...\n",
3358 REPORT_CODE_OFF+PL_regindent*2, "")
3362 if (locinput < PL_regtill) {
3363 DEBUG_r(PerlIO_printf(Perl_debug_log,
3364 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3366 (long)(locinput - PL_reg_starttry),
3367 (long)(PL_regtill - PL_reg_starttry),
3369 sayNO_FINAL; /* Cannot match: too short. */
3371 PL_reginput = locinput; /* put where regtry can find it */
3372 sayYES_FINAL; /* Success! */
3374 PL_reginput = locinput; /* put where regtry can find it */
3375 sayYES_LOUD; /* Success! */
3378 PL_reginput = locinput;
3383 s = HOPBACKc(locinput, scan->flags);
3389 PL_reginput = locinput;
3394 s = HOPBACKc(locinput, scan->flags);
3400 PL_reginput = locinput;
3403 inner = NEXTOPER(NEXTOPER(scan));
3404 if (regmatch(inner) != n) {
3419 if (OP(scan) == SUSPEND) {
3420 locinput = PL_reginput;
3421 nextchr = UCHARAT(locinput);
3426 next = scan + ARG(scan);
3431 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3432 PTR2UV(scan), OP(scan));
3433 Perl_croak(aTHX_ "regexp memory corruption");
3440 * We get here only if there's trouble -- normally "case END" is
3441 * the terminating point.
3443 Perl_croak(aTHX_ "corrupted regexp pointers");
3449 PerlIO_printf(Perl_debug_log,
3450 "%*s %scould match...%s\n",
3451 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3455 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3456 PL_colors[4],PL_colors[5]));
3462 #if 0 /* Breaks $^R */
3470 PerlIO_printf(Perl_debug_log,
3471 "%*s %sfailed...%s\n",
3472 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3478 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3481 case RE_UNWIND_BRANCH:
3482 case RE_UNWIND_BRANCHJ:
3484 re_unwind_branch_t *uwb = &(uw->branch);
3485 I32 lastparen = uwb->lastparen;
3487 REGCP_UNWIND(uwb->lastcp);
3488 for (n = *PL_reglastparen; n > lastparen; n--)
3490 *PL_reglastparen = n;
3491 scan = next = uwb->next;
3493 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3494 ? BRANCH : BRANCHJ) ) { /* Failure */
3501 /* Have more choice yet. Reuse the same uwb. */
3503 if ((n = (uwb->type == RE_UNWIND_BRANCH
3504 ? NEXT_OFF(next) : ARG(next))))
3507 next = NULL; /* XXXX Needn't unwinding in this case... */
3509 next = NEXTOPER(scan);
3510 if (uwb->type == RE_UNWIND_BRANCHJ)
3511 next = NEXTOPER(next);
3512 locinput = uwb->locinput;
3513 nextchr = uwb->nextchr;
3515 PL_regindent = uwb->regindent;
3522 Perl_croak(aTHX_ "regexp unwind memory corruption");
3533 - regrepeat - repeatedly match something simple, report how many
3536 * [This routine now assumes that it will only match on things of length 1.
3537 * That was true before, but now we assume scan - reginput is the count,
3538 * rather than incrementing count on every character. [Er, except utf8.]]
3541 S_regrepeat(pTHX_ regnode *p, I32 max)
3543 register char *scan;
3545 register char *loceol = PL_regeol;
3546 register I32 hardcount = 0;
3547 register bool do_utf8 = DO_UTF8(PL_reg_sv);
3550 if (max != REG_INFTY && max < loceol - scan)
3551 loceol = scan + max;
3556 while (scan < loceol && hardcount < max && *scan != '\n') {
3557 scan += UTF8SKIP(scan);
3561 while (scan < loceol && *scan != '\n')
3568 case EXACT: /* length of string is 1 */
3570 while (scan < loceol && UCHARAT(scan) == c)
3573 case EXACTF: /* length of string is 1 */
3575 while (scan < loceol &&
3576 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3579 case EXACTFL: /* length of string is 1 */
3580 PL_reg_flags |= RF_tainted;
3582 while (scan < loceol &&
3583 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3589 while (hardcount < max && scan < loceol &&
3590 reginclass(p, (U8*)scan, do_utf8)) {
3591 scan += UTF8SKIP(scan);
3595 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3602 LOAD_UTF8_CHARCLASS(alnum,"a");
3603 while (hardcount < max && scan < loceol &&
3604 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3605 scan += UTF8SKIP(scan);
3609 while (scan < loceol && isALNUM(*scan))
3614 PL_reg_flags |= RF_tainted;
3617 while (hardcount < max && scan < loceol &&
3618 isALNUM_LC_utf8((U8*)scan)) {
3619 scan += UTF8SKIP(scan);
3623 while (scan < loceol && isALNUM_LC(*scan))
3630 LOAD_UTF8_CHARCLASS(alnum,"a");
3631 while (hardcount < max && scan < loceol &&
3632 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3633 scan += UTF8SKIP(scan);
3637 while (scan < loceol && !isALNUM(*scan))
3642 PL_reg_flags |= RF_tainted;
3645 while (hardcount < max && scan < loceol &&
3646 !isALNUM_LC_utf8((U8*)scan)) {
3647 scan += UTF8SKIP(scan);
3651 while (scan < loceol && !isALNUM_LC(*scan))
3658 LOAD_UTF8_CHARCLASS(space," ");
3659 while (hardcount < max && scan < loceol &&
3661 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3662 scan += UTF8SKIP(scan);
3666 while (scan < loceol && isSPACE(*scan))
3671 PL_reg_flags |= RF_tainted;
3674 while (hardcount < max && scan < loceol &&
3675 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3676 scan += UTF8SKIP(scan);
3680 while (scan < loceol && isSPACE_LC(*scan))
3687 LOAD_UTF8_CHARCLASS(space," ");
3688 while (hardcount < max && scan < loceol &&
3690 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3691 scan += UTF8SKIP(scan);
3695 while (scan < loceol && !isSPACE(*scan))
3700 PL_reg_flags |= RF_tainted;
3703 while (hardcount < max && scan < loceol &&
3704 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3705 scan += UTF8SKIP(scan);
3709 while (scan < loceol && !isSPACE_LC(*scan))
3716 LOAD_UTF8_CHARCLASS(digit,"0");
3717 while (hardcount < max && scan < loceol &&
3718 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3719 scan += UTF8SKIP(scan);
3723 while (scan < loceol && isDIGIT(*scan))
3730 LOAD_UTF8_CHARCLASS(digit,"0");
3731 while (hardcount < max && scan < loceol &&
3732 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3733 scan += UTF8SKIP(scan);
3737 while (scan < loceol && !isDIGIT(*scan))
3741 default: /* Called on something of 0 width. */
3742 break; /* So match right here or not at all. */
3748 c = scan - PL_reginput;
3753 SV *prop = sv_newmortal();
3756 PerlIO_printf(Perl_debug_log,
3757 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3758 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3765 - regrepeat_hard - repeatedly match something, report total lenth and length
3767 * The repeater is supposed to have constant length.
3771 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3773 register char *scan;
3774 register char *start;
3775 register char *loceol = PL_regeol;
3777 I32 count = 0, res = 1;
3782 start = PL_reginput;
3783 if (DO_UTF8(PL_reg_sv)) {
3784 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3787 while (start < PL_reginput) {
3789 start += UTF8SKIP(start);
3800 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3802 *lp = l = PL_reginput - start;
3803 if (max != REG_INFTY && l*max < loceol - scan)
3804 loceol = scan + l*max;
3817 - regclass_swash - prepare the utf8 swash
3821 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3826 if (PL_regdata && PL_regdata->count) {
3829 if (PL_regdata->what[n] == 's') {
3830 SV *rv = (SV*)PL_regdata->data[n];
3831 AV *av = (AV*)SvRV((SV*)rv);
3834 si = *av_fetch(av, 0, FALSE);
3835 a = av_fetch(av, 1, FALSE);
3839 else if (si && doinit) {
3840 sw = swash_init("utf8", "", si, 1, 0);
3841 (void)av_store(av, 1, sw);
3853 - reginclass - determine if a character falls into a character class
3857 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3859 char flags = ANYOF_FLAGS(n);
3864 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3866 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3867 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3868 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3871 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3874 SV *sw = regclass_swash(n, TRUE, 0);
3877 if (swash_fetch(sw, p, do_utf8))
3879 else if (flags & ANYOF_FOLD) {
3880 U8 tmpbuf[UTF8_MAXLEN+1];
3882 if (flags & ANYOF_LOCALE) {
3883 PL_reg_flags |= RF_tainted;
3884 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3887 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3888 if (swash_fetch(sw, tmpbuf, do_utf8))
3894 if (!match && c < 256) {
3895 if (ANYOF_BITMAP_TEST(n, c))
3897 else if (flags & ANYOF_FOLD) {
3900 if (flags & ANYOF_LOCALE) {
3901 PL_reg_flags |= RF_tainted;
3902 f = PL_fold_locale[c];
3906 if (f != c && ANYOF_BITMAP_TEST(n, f))
3910 if (!match && (flags & ANYOF_CLASS)) {
3911 PL_reg_flags |= RF_tainted;
3913 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
3914 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
3915 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
3916 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
3917 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
3918 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
3919 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
3920 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
3921 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
3922 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
3923 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
3924 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
3925 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
3926 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
3927 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
3928 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
3929 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
3930 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
3931 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
3932 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
3933 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
3934 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
3935 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
3936 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
3937 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
3938 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
3939 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
3940 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
3941 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
3942 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
3943 ) /* How's that for a conditional? */
3950 return (flags & ANYOF_INVERT) ? !match : match;
3954 S_reghop(pTHX_ U8 *s, I32 off)
3956 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3960 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
3963 while (off-- && s < lim) {
3964 /* XXX could check well-formedness here */
3972 if (UTF8_IS_CONTINUED(*s)) {
3973 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
3976 /* XXX could check well-formedness here */
3984 S_reghopmaybe(pTHX_ U8 *s, I32 off)
3986 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
3990 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
3993 while (off-- && s < lim) {
3994 /* XXX could check well-formedness here */
4004 if (UTF8_IS_CONTINUED(*s)) {
4005 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4008 /* XXX could check well-formedness here */
4024 restore_pos(pTHXo_ void *arg)
4026 if (PL_reg_eval_set) {
4027 if (PL_reg_oldsaved) {
4028 PL_reg_re->subbeg = PL_reg_oldsaved;
4029 PL_reg_re->sublen = PL_reg_oldsavedlen;
4030 RX_MATCH_COPIED_on(PL_reg_re);
4032 PL_reg_magic->mg_len = PL_reg_oldpos;
4033 PL_reg_eval_set = 0;
4034 PL_curpm = PL_reg_oldcurpm;