5 * "One Ring to rule them all, One Ring to find them..."
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
44 # define PERL_NO_GET_CONTEXT
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
68 **** Alterations to Henry's code are...
70 **** Copyright (c) 1991-2001, Larry Wall
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
80 #define PERL_IN_REGEXEC_C
85 #define RF_tainted 1 /* tainted information used? */
86 #define RF_warned 2 /* warned about big count? */
87 #define RF_evaled 4 /* Did an EVAL with setting? */
88 #define RF_utf8 8 /* String contains multibyte chars? */
90 #define UTF (PL_reg_flags & RF_utf8)
92 #define RS_init 1 /* eval environment created */
93 #define RS_set 2 /* replsv value is set */
103 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
106 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
107 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
108 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
109 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
110 #define HOPc(pos,off) ((char*)HOP(pos,off))
111 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
113 #define HOPBACK(pos, off) ( \
114 (UTF && PL_reg_match_utf8) \
115 ? reghopmaybe((U8*)pos, -off) \
116 : (pos - off >= PL_bostr) \
120 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
122 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
123 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
124 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
125 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
126 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
127 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
129 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
131 /* for use after a quantifier and before an EXACT-like node -- japhy */
132 #define NEXT_IMPT(from_rn,to_rn) STMT_START { \
134 while (PL_regkind[(U8)OP(to_rn)] == OPEN || OP(to_rn) == EVAL) \
135 to_rn += NEXT_OFF(to_rn); \
138 static void restore_pos(pTHX_ void *arg);
141 S_regcppush(pTHX_ I32 parenfloor)
143 int retval = PL_savestack_ix;
144 #define REGCP_PAREN_ELEMS 4
145 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
148 if (paren_elems_to_push < 0)
149 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
151 #define REGCP_OTHER_ELEMS 6
152 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
153 for (p = PL_regsize; p > parenfloor; p--) {
154 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
155 SSPUSHINT(PL_regendp[p]);
156 SSPUSHINT(PL_regstartp[p]);
157 SSPUSHPTR(PL_reg_start_tmp[p]);
160 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
161 SSPUSHINT(PL_regsize);
162 SSPUSHINT(*PL_reglastparen);
163 SSPUSHINT(*PL_reglastcloseparen);
164 SSPUSHPTR(PL_reginput);
165 #define REGCP_FRAME_ELEMS 2
166 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
167 * are needed for the regexp context stack bookkeeping. */
168 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
169 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
174 /* These are needed since we do not localize EVAL nodes: */
175 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
176 " Setting an EVAL scope, savestack=%"IVdf"\n", \
177 (IV)PL_savestack_ix)); cp = PL_savestack_ix
179 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
180 PerlIO_printf(Perl_debug_log, \
181 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
182 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
192 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
194 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
195 i = SSPOPINT; /* Parentheses elements to pop. */
196 input = (char *) SSPOPPTR;
197 *PL_reglastcloseparen = SSPOPINT;
198 *PL_reglastparen = SSPOPINT;
199 PL_regsize = SSPOPINT;
201 /* Now restore the parentheses context. */
202 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
203 i > 0; i -= REGCP_PAREN_ELEMS) {
204 paren = (U32)SSPOPINT;
205 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
206 PL_regstartp[paren] = SSPOPINT;
208 if (paren <= *PL_reglastparen)
209 PL_regendp[paren] = tmps;
211 PerlIO_printf(Perl_debug_log,
212 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
213 (UV)paren, (IV)PL_regstartp[paren],
214 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
215 (IV)PL_regendp[paren],
216 (paren > *PL_reglastparen ? "(no)" : ""));
220 if (*PL_reglastparen + 1 <= PL_regnpar) {
221 PerlIO_printf(Perl_debug_log,
222 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
223 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
227 /* It would seem that the similar code in regtry()
228 * already takes care of this, and in fact it is in
229 * a better location to since this code can #if 0-ed out
230 * but the code in regtry() is needed or otherwise tests
231 * requiring null fields (pat.t#187 and split.t#{13,14}
232 * (as of patchlevel 7877) will fail. Then again,
233 * this code seems to be necessary or otherwise
234 * building DynaLoader will fail:
235 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
237 for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) {
238 if (paren > PL_regsize)
239 PL_regstartp[paren] = -1;
240 PL_regendp[paren] = -1;
247 S_regcp_set_to(pTHX_ I32 ss)
249 I32 tmp = PL_savestack_ix;
251 PL_savestack_ix = ss;
253 PL_savestack_ix = tmp;
257 typedef struct re_cc_state
261 struct re_cc_state *prev;
266 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
268 #define TRYPAREN(paren, n, input) { \
271 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
272 PL_regendp[paren] = input - PL_bostr; \
275 PL_regendp[paren] = -1; \
277 if (regmatch(next)) \
280 PL_regendp[paren] = -1; \
285 * pregexec and friends
289 - pregexec - match a regexp against a string
292 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
293 char *strbeg, I32 minend, SV *screamer, U32 nosave)
294 /* strend: pointer to null at end of string */
295 /* strbeg: real beginning of string */
296 /* minend: end of match must be >=minend after stringarg. */
297 /* nosave: For optimizations. */
300 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
301 nosave ? 0 : REXEC_COPY_STR);
305 S_cache_re(pTHX_ regexp *prog)
307 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
309 PL_regprogram = prog->program;
311 PL_regnpar = prog->nparens;
312 PL_regdata = prog->data;
317 * Need to implement the following flags for reg_anch:
319 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
321 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
322 * INTUIT_AUTORITATIVE_ML
323 * INTUIT_ONCE_NOML - Intuit can match in one location only.
326 * Another flag for this function: SECOND_TIME (so that float substrs
327 * with giant delta may be not rechecked).
330 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
332 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
333 Otherwise, only SvCUR(sv) is used to get strbeg. */
335 /* XXXX We assume that strpos is strbeg unless sv. */
337 /* XXXX Some places assume that there is a fixed substring.
338 An update may be needed if optimizer marks as "INTUITable"
339 RExen without fixed substrings. Similarly, it is assumed that
340 lengths of all the strings are no more than minlen, thus they
341 cannot come from lookahead.
342 (Or minlen should take into account lookahead.) */
344 /* A failure to find a constant substring means that there is no need to make
345 an expensive call to REx engine, thus we celebrate a failure. Similarly,
346 finding a substring too deep into the string means that less calls to
347 regtry() should be needed.
349 REx compiler's optimizer found 4 possible hints:
350 a) Anchored substring;
352 c) Whether we are anchored (beginning-of-line or \G);
353 d) First node (of those at offset 0) which may distingush positions;
354 We use a)b)d) and multiline-part of c), and try to find a position in the
355 string which does not contradict any of them.
358 /* Most of decisions we do here should have been done at compile time.
359 The nodes of the REx which we used for the search should have been
360 deleted from the finite automaton. */
363 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
364 char *strend, U32 flags, re_scream_pos_data *data)
366 register I32 start_shift = 0;
367 /* Should be nonnegative! */
368 register I32 end_shift = 0;
374 register char *other_last = Nullch; /* other substr checked before this */
375 char *check_at = Nullch; /* check substr found at this pos */
377 char *i_strpos = strpos;
380 DEBUG_r( if (!PL_colorset) reginitcolors() );
381 DEBUG_r(PerlIO_printf(Perl_debug_log,
382 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
383 PL_colors[4],PL_colors[5],PL_colors[0],
386 (strlen(prog->precomp) > 60 ? "..." : ""),
388 (int)(strend - strpos > 60 ? 60 : strend - strpos),
389 strpos, PL_colors[1],
390 (strend - strpos > 60 ? "..." : ""))
393 if (prog->reganch & ROPT_UTF8)
394 PL_reg_flags |= RF_utf8;
396 if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
397 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
400 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
402 check = prog->check_substr;
403 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
404 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
405 || ( (prog->reganch & ROPT_ANCH_BOL)
406 && !PL_multiline ) ); /* Check after \n? */
409 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
410 | ROPT_IMPLICIT)) /* not a real BOL */
411 /* SvCUR is not set on references: SvRV and SvPVX overlap */
413 && (strpos != strbeg)) {
414 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
417 if (prog->check_offset_min == prog->check_offset_max &&
418 !(prog->reganch & ROPT_CANY_SEEN)) {
419 /* Substring at constant offset from beg-of-str... */
422 s = HOP3c(strpos, prog->check_offset_min, strend);
424 slen = SvCUR(check); /* >= 1 */
426 if ( strend - s > slen || strend - s < slen - 1
427 || (strend - s == slen && strend[-1] != '\n')) {
428 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
431 /* Now should match s[0..slen-2] */
433 if (slen && (*SvPVX(check) != *s
435 && memNE(SvPVX(check), s, slen)))) {
437 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
441 else if (*SvPVX(check) != *s
442 || ((slen = SvCUR(check)) > 1
443 && memNE(SvPVX(check), s, slen)))
445 goto success_at_start;
448 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
450 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
451 end_shift = prog->minlen - start_shift -
452 CHR_SVLEN(check) + (SvTAIL(check) != 0);
454 I32 end = prog->check_offset_max + CHR_SVLEN(check)
455 - (SvTAIL(check) != 0);
456 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
458 if (end_shift < eshift)
462 else { /* Can match at random position */
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 /* Should be nonnegative! */
467 end_shift = prog->minlen - start_shift -
468 CHR_SVLEN(check) + (SvTAIL(check) != 0);
471 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
473 Perl_croak(aTHX_ "panic: end_shift");
477 /* Find a possible match in the region s..strend by looking for
478 the "check" substring in the region corrected by start/end_shift. */
479 if (flags & REXEC_SCREAM) {
480 I32 p = -1; /* Internal iterator of scream. */
481 I32 *pp = data ? data->scream_pos : &p;
483 if (PL_screamfirst[BmRARE(check)] >= 0
484 || ( BmRARE(check) == '\n'
485 && (BmPREVIOUS(check) == SvCUR(check) - 1)
487 s = screaminstr(sv, check,
488 start_shift + (s - strbeg), end_shift, pp, 0);
492 *data->scream_olds = s;
494 else if (prog->reganch & ROPT_CANY_SEEN)
495 s = fbm_instr((U8*)(s + start_shift),
496 (U8*)(strend - end_shift),
497 check, PL_multiline ? FBMrf_MULTILINE : 0);
499 s = fbm_instr(HOP3(s, start_shift, strend),
500 HOP3(strend, -end_shift, strbeg),
501 check, PL_multiline ? FBMrf_MULTILINE : 0);
503 /* Update the count-of-usability, remove useless subpatterns,
506 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
507 (s ? "Found" : "Did not find"),
508 ((check == prog->anchored_substr) ? "anchored" : "floating"),
510 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
512 PL_colors[1], (SvTAIL(check) ? "$" : ""),
513 (s ? " at offset " : "...\n") ) );
520 /* Finish the diagnostic message */
521 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
523 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
524 Start with the other substr.
525 XXXX no SCREAM optimization yet - and a very coarse implementation
526 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
527 *always* match. Probably should be marked during compile...
528 Probably it is right to do no SCREAM here...
531 if (prog->float_substr && prog->anchored_substr) {
532 /* Take into account the "other" substring. */
533 /* XXXX May be hopelessly wrong for UTF... */
536 if (check == prog->float_substr) {
539 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
542 t = s - prog->check_offset_max;
543 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
544 && (!(prog->reganch & ROPT_UTF8)
545 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
550 t = HOP3c(t, prog->anchored_offset, strend);
551 if (t < other_last) /* These positions already checked */
553 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
556 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
557 /* On end-of-str: see comment below. */
558 s = fbm_instr((unsigned char*)t,
559 HOP3(HOP3(last1, prog->anchored_offset, strend)
560 + SvCUR(prog->anchored_substr),
561 -(SvTAIL(prog->anchored_substr)!=0), strbeg),
562 prog->anchored_substr,
563 PL_multiline ? FBMrf_MULTILINE : 0);
564 DEBUG_r(PerlIO_printf(Perl_debug_log,
565 "%s anchored substr `%s%.*s%s'%s",
566 (s ? "Found" : "Contradicts"),
568 (int)(SvCUR(prog->anchored_substr)
569 - (SvTAIL(prog->anchored_substr)!=0)),
570 SvPVX(prog->anchored_substr),
571 PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
573 if (last1 >= last2) {
574 DEBUG_r(PerlIO_printf(Perl_debug_log,
575 ", giving up...\n"));
578 DEBUG_r(PerlIO_printf(Perl_debug_log,
579 ", trying floating at offset %ld...\n",
580 (long)(HOP3c(s1, 1, strend) - i_strpos)));
581 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
582 s = HOP3c(last, 1, strend);
586 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
587 (long)(s - i_strpos)));
588 t = HOP3c(s, -prog->anchored_offset, strbeg);
589 other_last = HOP3c(s, 1, strend);
597 else { /* Take into account the floating substring. */
601 t = HOP3c(s, -start_shift, strbeg);
603 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
604 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
605 last = HOP3c(t, prog->float_max_offset, strend);
606 s = HOP3c(t, prog->float_min_offset, strend);
609 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
610 /* fbm_instr() takes into account exact value of end-of-str
611 if the check is SvTAIL(ed). Since false positives are OK,
612 and end-of-str is not later than strend we are OK. */
613 s = fbm_instr((unsigned char*)s,
614 (unsigned char*)last + SvCUR(prog->float_substr)
615 - (SvTAIL(prog->float_substr)!=0),
616 prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
617 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
618 (s ? "Found" : "Contradicts"),
620 (int)(SvCUR(prog->float_substr)
621 - (SvTAIL(prog->float_substr)!=0)),
622 SvPVX(prog->float_substr),
623 PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
626 DEBUG_r(PerlIO_printf(Perl_debug_log,
627 ", giving up...\n"));
630 DEBUG_r(PerlIO_printf(Perl_debug_log,
631 ", trying anchored starting at offset %ld...\n",
632 (long)(s1 + 1 - i_strpos)));
634 s = HOP3c(t, 1, strend);
638 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
639 (long)(s - i_strpos)));
640 other_last = s; /* Fix this later. --Hugo */
649 t = s - prog->check_offset_max;
650 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
651 && (!(prog->reganch & ROPT_UTF8)
652 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
654 /* Fixed substring is found far enough so that the match
655 cannot start at strpos. */
657 if (ml_anch && t[-1] != '\n') {
658 /* Eventually fbm_*() should handle this, but often
659 anchored_offset is not 0, so this check will not be wasted. */
660 /* XXXX In the code below we prefer to look for "^" even in
661 presence of anchored substrings. And we search even
662 beyond the found float position. These pessimizations
663 are historical artefacts only. */
665 while (t < strend - prog->minlen) {
667 if (t < check_at - prog->check_offset_min) {
668 if (prog->anchored_substr) {
669 /* Since we moved from the found position,
670 we definitely contradict the found anchored
671 substr. Due to the above check we do not
672 contradict "check" substr.
673 Thus we can arrive here only if check substr
674 is float. Redo checking for "other"=="fixed".
677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
678 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
679 goto do_other_anchored;
681 /* We don't contradict the found floating substring. */
682 /* XXXX Why not check for STCLASS? */
684 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
685 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
688 /* Position contradicts check-string */
689 /* XXXX probably better to look for check-string
690 than for "\n", so one should lower the limit for t? */
691 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
692 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
693 other_last = strpos = s = t + 1;
698 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
699 PL_colors[0],PL_colors[1]));
703 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
704 PL_colors[0],PL_colors[1]));
708 ++BmUSEFUL(prog->check_substr); /* hooray/5 */
711 /* The found string does not prohibit matching at strpos,
712 - no optimization of calling REx engine can be performed,
713 unless it was an MBOL and we are not after MBOL,
714 or a future STCLASS check will fail this. */
716 /* Even in this situation we may use MBOL flag if strpos is offset
717 wrt the start of the string. */
718 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
719 && (strpos != strbeg) && strpos[-1] != '\n'
720 /* May be due to an implicit anchor of m{.*foo} */
721 && !(prog->reganch & ROPT_IMPLICIT))
726 DEBUG_r( if (ml_anch)
727 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
728 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
731 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
732 && prog->check_substr /* Could be deleted already */
733 && --BmUSEFUL(prog->check_substr) < 0
734 && prog->check_substr == prog->float_substr)
736 /* If flags & SOMETHING - do not do it many times on the same match */
737 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
738 SvREFCNT_dec(prog->check_substr);
739 prog->check_substr = Nullsv; /* disable */
740 prog->float_substr = Nullsv; /* clear */
741 check = Nullsv; /* abort */
743 /* XXXX This is a remnant of the old implementation. It
744 looks wasteful, since now INTUIT can use many
746 prog->reganch &= ~RE_USE_INTUIT;
753 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
754 if (prog->regstclass) {
755 /* minlen == 0 is possible if regstclass is \b or \B,
756 and the fixed substr is ''$.
757 Since minlen is already taken into account, s+1 is before strend;
758 accidentally, minlen >= 1 guaranties no false positives at s + 1
759 even for \b or \B. But (minlen? 1 : 0) below assumes that
760 regstclass does not come from lookahead... */
761 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
762 This leaves EXACTF only, which is dealt with in find_byclass(). */
763 U8* str = (U8*)STRING(prog->regstclass);
764 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
765 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
767 char *endpos = (prog->anchored_substr || ml_anch)
768 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
769 : (prog->float_substr
770 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
773 char *startpos = strbeg;
776 if (prog->reganch & ROPT_UTF8) {
777 PL_regdata = prog->data;
780 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
785 if (endpos == strend) {
786 DEBUG_r( PerlIO_printf(Perl_debug_log,
787 "Could not match STCLASS...\n") );
790 DEBUG_r( PerlIO_printf(Perl_debug_log,
791 "This position contradicts STCLASS...\n") );
792 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
794 /* Contradict one of substrings */
795 if (prog->anchored_substr) {
796 if (prog->anchored_substr == check) {
797 DEBUG_r( what = "anchored" );
799 s = HOP3c(t, 1, strend);
800 if (s + start_shift + end_shift > strend) {
801 /* XXXX Should be taken into account earlier? */
802 DEBUG_r( PerlIO_printf(Perl_debug_log,
803 "Could not match STCLASS...\n") );
808 DEBUG_r( PerlIO_printf(Perl_debug_log,
809 "Looking for %s substr starting at offset %ld...\n",
810 what, (long)(s + start_shift - i_strpos)) );
813 /* Have both, check_string is floating */
814 if (t + start_shift >= check_at) /* Contradicts floating=check */
815 goto retry_floating_check;
816 /* Recheck anchored substring, but not floating... */
820 DEBUG_r( PerlIO_printf(Perl_debug_log,
821 "Looking for anchored substr starting at offset %ld...\n",
822 (long)(other_last - i_strpos)) );
823 goto do_other_anchored;
825 /* Another way we could have checked stclass at the
826 current position only: */
831 DEBUG_r( PerlIO_printf(Perl_debug_log,
832 "Looking for /%s^%s/m starting at offset %ld...\n",
833 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
836 if (!prog->float_substr) /* Could have been deleted */
838 /* Check is floating subtring. */
839 retry_floating_check:
840 t = check_at - start_shift;
841 DEBUG_r( what = "floating" );
842 goto hop_and_restart;
845 DEBUG_r(PerlIO_printf(Perl_debug_log,
846 "By STCLASS: moving %ld --> %ld\n",
847 (long)(t - i_strpos), (long)(s - i_strpos))
851 DEBUG_r(PerlIO_printf(Perl_debug_log,
852 "Does not contradict STCLASS...\n");
857 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
858 PL_colors[4], (check ? "Guessed" : "Giving up"),
859 PL_colors[5], (long)(s - i_strpos)) );
862 fail_finish: /* Substring not found */
863 if (prog->check_substr) /* could be removed already */
864 BmUSEFUL(prog->check_substr) += 5; /* hooray */
866 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
867 PL_colors[4],PL_colors[5]));
871 /* We know what class REx starts with. Try to find this position... */
873 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
875 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
881 register I32 tmp = 1; /* Scratch variable? */
882 register bool do_utf8 = PL_reg_match_utf8;
884 /* We know what class it must start with. */
888 if (reginclass(c, (U8*)s, do_utf8)) {
889 if (tmp && (norun || regtry(prog, s)))
896 s += do_utf8 ? UTF8SKIP(s) : 1;
901 if (tmp && (norun || regtry(prog, s)))
912 c1 = to_utf8_lower((U8*)m);
913 c2 = to_utf8_upper((U8*)m);
924 c2 = PL_fold_locale[c1];
929 e = s; /* Due to minlen logic of intuit() */
935 if ( utf8_to_uvchr((U8*)s, &len) == c1
942 UV c = utf8_to_uvchr((U8*)s, &len);
943 if ( (c == c1 || c == c2) && regtry(prog, s) )
952 && (ln == 1 || !(OP(c) == EXACTF
954 : ibcmp_locale(s, m, ln)))
955 && (norun || regtry(prog, s)) )
961 if ( (*(U8*)s == c1 || *(U8*)s == c2)
962 && (ln == 1 || !(OP(c) == EXACTF
964 : ibcmp_locale(s, m, ln)))
965 && (norun || regtry(prog, s)) )
972 PL_reg_flags |= RF_tainted;
979 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
982 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
984 tmp = ((OP(c) == BOUND ?
985 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
986 LOAD_UTF8_CHARCLASS(alnum,"a");
988 if (tmp == !(OP(c) == BOUND ?
989 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
990 isALNUM_LC_utf8((U8*)s)))
993 if ((norun || regtry(prog, s)))
1000 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1001 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1002 while (s < strend) {
1004 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1006 if ((norun || regtry(prog, s)))
1012 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1016 PL_reg_flags |= RF_tainted;
1023 U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
1026 tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
1028 tmp = ((OP(c) == NBOUND ?
1029 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1030 LOAD_UTF8_CHARCLASS(alnum,"a");
1031 while (s < strend) {
1032 if (tmp == !(OP(c) == NBOUND ?
1033 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1034 isALNUM_LC_utf8((U8*)s)))
1036 else if ((norun || regtry(prog, s)))
1042 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1043 tmp = ((OP(c) == NBOUND ?
1044 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1045 while (s < strend) {
1047 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1049 else if ((norun || regtry(prog, s)))
1054 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1059 LOAD_UTF8_CHARCLASS(alnum,"a");
1060 while (s < strend) {
1061 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1062 if (tmp && (norun || regtry(prog, s)))
1073 while (s < strend) {
1075 if (tmp && (norun || regtry(prog, s)))
1087 PL_reg_flags |= RF_tainted;
1089 while (s < strend) {
1090 if (isALNUM_LC_utf8((U8*)s)) {
1091 if (tmp && (norun || regtry(prog, s)))
1102 while (s < strend) {
1103 if (isALNUM_LC(*s)) {
1104 if (tmp && (norun || regtry(prog, s)))
1117 LOAD_UTF8_CHARCLASS(alnum,"a");
1118 while (s < strend) {
1119 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1120 if (tmp && (norun || regtry(prog, s)))
1131 while (s < strend) {
1133 if (tmp && (norun || regtry(prog, s)))
1145 PL_reg_flags |= RF_tainted;
1147 while (s < strend) {
1148 if (!isALNUM_LC_utf8((U8*)s)) {
1149 if (tmp && (norun || regtry(prog, s)))
1160 while (s < strend) {
1161 if (!isALNUM_LC(*s)) {
1162 if (tmp && (norun || regtry(prog, s)))
1175 LOAD_UTF8_CHARCLASS(space," ");
1176 while (s < strend) {
1177 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1178 if (tmp && (norun || regtry(prog, s)))
1189 while (s < strend) {
1191 if (tmp && (norun || regtry(prog, s)))
1203 PL_reg_flags |= RF_tainted;
1205 while (s < strend) {
1206 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1207 if (tmp && (norun || regtry(prog, s)))
1218 while (s < strend) {
1219 if (isSPACE_LC(*s)) {
1220 if (tmp && (norun || regtry(prog, s)))
1233 LOAD_UTF8_CHARCLASS(space," ");
1234 while (s < strend) {
1235 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1236 if (tmp && (norun || regtry(prog, s)))
1247 while (s < strend) {
1249 if (tmp && (norun || regtry(prog, s)))
1261 PL_reg_flags |= RF_tainted;
1263 while (s < strend) {
1264 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1265 if (tmp && (norun || regtry(prog, s)))
1276 while (s < strend) {
1277 if (!isSPACE_LC(*s)) {
1278 if (tmp && (norun || regtry(prog, s)))
1291 LOAD_UTF8_CHARCLASS(digit,"0");
1292 while (s < strend) {
1293 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1294 if (tmp && (norun || regtry(prog, s)))
1305 while (s < strend) {
1307 if (tmp && (norun || regtry(prog, s)))
1319 PL_reg_flags |= RF_tainted;
1321 while (s < strend) {
1322 if (isDIGIT_LC_utf8((U8*)s)) {
1323 if (tmp && (norun || regtry(prog, s)))
1334 while (s < strend) {
1335 if (isDIGIT_LC(*s)) {
1336 if (tmp && (norun || regtry(prog, s)))
1349 LOAD_UTF8_CHARCLASS(digit,"0");
1350 while (s < strend) {
1351 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1352 if (tmp && (norun || regtry(prog, s)))
1363 while (s < strend) {
1365 if (tmp && (norun || regtry(prog, s)))
1377 PL_reg_flags |= RF_tainted;
1379 while (s < strend) {
1380 if (!isDIGIT_LC_utf8((U8*)s)) {
1381 if (tmp && (norun || regtry(prog, s)))
1392 while (s < strend) {
1393 if (!isDIGIT_LC(*s)) {
1394 if (tmp && (norun || regtry(prog, s)))
1406 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1415 - regexec_flags - match a regexp against a string
1418 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1419 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1420 /* strend: pointer to null at end of string */
1421 /* strbeg: real beginning of string */
1422 /* minend: end of match must be >=minend after stringarg. */
1423 /* data: May be used for some additional optimizations. */
1424 /* nosave: For optimizations. */
1427 register regnode *c;
1428 register char *startpos = stringarg;
1429 I32 minlen; /* must match at least this many chars */
1430 I32 dontbother = 0; /* how many characters not to try at end */
1431 /* I32 start_shift = 0; */ /* Offset of the start to find
1432 constant substr. */ /* CC */
1433 I32 end_shift = 0; /* Same for the end. */ /* CC */
1434 I32 scream_pos = -1; /* Internal iterator of scream. */
1436 SV* oreplsv = GvSV(PL_replgv);
1437 bool do_utf8 = DO_UTF8(sv);
1443 PL_regnarrate = DEBUG_r_TEST;
1446 /* Be paranoid... */
1447 if (prog == NULL || startpos == NULL) {
1448 Perl_croak(aTHX_ "NULL regexp parameter");
1452 minlen = prog->minlen;
1453 if (do_utf8 && !(prog->reganch & ROPT_CANY_SEEN)) {
1454 if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
1457 if (strend - startpos < minlen) goto phooey;
1460 /* Check validity of program. */
1461 if (UCHARAT(prog->program) != REG_MAGIC) {
1462 Perl_croak(aTHX_ "corrupted regexp program");
1466 PL_reg_eval_set = 0;
1469 if (prog->reganch & ROPT_UTF8)
1470 PL_reg_flags |= RF_utf8;
1472 /* Mark beginning of line for ^ and lookbehind. */
1473 PL_regbol = startpos;
1477 /* Mark end of line for $ (and such) */
1480 /* see how far we have to get to not match where we matched before */
1481 PL_regtill = startpos+minend;
1483 /* We start without call_cc context. */
1486 /* If there is a "must appear" string, look for it. */
1489 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1492 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1493 PL_reg_ganch = startpos;
1494 else if (sv && SvTYPE(sv) >= SVt_PVMG
1496 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1497 && mg->mg_len >= 0) {
1498 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1499 if (prog->reganch & ROPT_ANCH_GPOS) {
1500 if (s > PL_reg_ganch)
1505 else /* pos() not defined */
1506 PL_reg_ganch = strbeg;
1509 if (do_utf8 == (UTF!=0) &&
1510 !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
1511 re_scream_pos_data d;
1513 d.scream_olds = &scream_olds;
1514 d.scream_pos = &scream_pos;
1515 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1517 goto phooey; /* not present */
1520 DEBUG_r( if (!PL_colorset) reginitcolors() );
1521 DEBUG_r(PerlIO_printf(Perl_debug_log,
1522 "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n",
1523 PL_colors[4],PL_colors[5],PL_colors[0],
1526 (strlen(prog->precomp) > 60 ? "..." : ""),
1528 (int)(strend - startpos > 60 ? 60 : strend - startpos),
1529 startpos, PL_colors[1],
1530 (strend - startpos > 60 ? "..." : ""))
1533 /* Simplest case: anchored match need be tried only once. */
1534 /* [unless only anchor is BOL and multiline is set] */
1535 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1536 if (s == startpos && regtry(prog, startpos))
1538 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1539 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1544 dontbother = minlen - 1;
1545 end = HOP3c(strend, -dontbother, strbeg) - 1;
1546 /* for multiline we only have to try after newlines */
1547 if (prog->check_substr) {
1551 if (regtry(prog, s))
1556 if (prog->reganch & RE_USE_INTUIT) {
1557 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1568 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1569 if (regtry(prog, s))
1576 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1577 if (regtry(prog, PL_reg_ganch))
1582 /* Messy cases: unanchored match. */
1583 if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
1584 /* we have /x+whatever/ */
1585 /* it must be a one character string (XXXX Except UTF?) */
1586 char ch = SvPVX(prog->anchored_substr)[0];
1592 while (s < strend) {
1594 DEBUG_r( did_match = 1 );
1595 if (regtry(prog, s)) goto got_it;
1597 while (s < strend && *s == ch)
1604 while (s < strend) {
1606 DEBUG_r( did_match = 1 );
1607 if (regtry(prog, s)) goto got_it;
1609 while (s < strend && *s == ch)
1615 DEBUG_r(if (!did_match)
1616 PerlIO_printf(Perl_debug_log,
1617 "Did not find anchored character...\n")
1621 else if (do_utf8 == (UTF!=0) &&
1622 (prog->anchored_substr != Nullsv
1623 || (prog->float_substr != Nullsv
1624 && prog->float_max_offset < strend - s))) {
1625 SV *must = prog->anchored_substr
1626 ? prog->anchored_substr : prog->float_substr;
1628 prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
1630 prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
1631 char *last = HOP3c(strend, /* Cannot start after this */
1632 -(I32)(CHR_SVLEN(must)
1633 - (SvTAIL(must) != 0) + back_min), strbeg);
1634 char *last1; /* Last position checked before */
1640 last1 = HOPc(s, -1);
1642 last1 = s - 1; /* bogus */
1644 /* XXXX check_substr already used to find `s', can optimize if
1645 check_substr==must. */
1647 dontbother = end_shift;
1648 strend = HOPc(strend, -dontbother);
1649 while ( (s <= last) &&
1650 ((flags & REXEC_SCREAM)
1651 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1652 end_shift, &scream_pos, 0))
1653 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1654 (unsigned char*)strend, must,
1655 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1656 DEBUG_r( did_match = 1 );
1657 if (HOPc(s, -back_max) > last1) {
1658 last1 = HOPc(s, -back_min);
1659 s = HOPc(s, -back_max);
1662 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1664 last1 = HOPc(s, -back_min);
1668 while (s <= last1) {
1669 if (regtry(prog, s))
1675 while (s <= last1) {
1676 if (regtry(prog, s))
1682 DEBUG_r(if (!did_match)
1683 PerlIO_printf(Perl_debug_log,
1684 "Did not find %s substr `%s%.*s%s'%s...\n",
1685 ((must == prog->anchored_substr)
1686 ? "anchored" : "floating"),
1688 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1690 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1694 else if ((c = prog->regstclass)) {
1695 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1696 /* don't bother with what can't match */
1697 strend = HOPc(strend, -(minlen - 1));
1699 SV *prop = sv_newmortal();
1701 PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
1703 if (find_byclass(prog, c, s, strend, startpos, 0))
1705 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1709 if (prog->float_substr != Nullsv) { /* Trim the end. */
1712 if (flags & REXEC_SCREAM) {
1713 last = screaminstr(sv, prog->float_substr, s - strbeg,
1714 end_shift, &scream_pos, 1); /* last one */
1716 last = scream_olds; /* Only one occurrence. */
1720 char *little = SvPV(prog->float_substr, len);
1722 if (SvTAIL(prog->float_substr)) {
1723 if (memEQ(strend - len + 1, little, len - 1))
1724 last = strend - len + 1;
1725 else if (!PL_multiline)
1726 last = memEQ(strend - len, little, len)
1727 ? strend - len : Nullch;
1733 last = rninstr(s, strend, little, little + len);
1735 last = strend; /* matching `$' */
1739 DEBUG_r(PerlIO_printf(Perl_debug_log,
1740 "%sCan't trim the tail, match fails (should not happen)%s\n",
1741 PL_colors[4],PL_colors[5]));
1742 goto phooey; /* Should not happen! */
1744 dontbother = strend - last + prog->float_min_offset;
1746 if (minlen && (dontbother < minlen))
1747 dontbother = minlen - 1;
1748 strend -= dontbother; /* this one's always in bytes! */
1749 /* We don't know much -- general case. */
1752 if (regtry(prog, s))
1761 if (regtry(prog, s))
1763 } while (s++ < strend);
1771 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1773 if (PL_reg_eval_set) {
1774 /* Preserve the current value of $^R */
1775 if (oreplsv != GvSV(PL_replgv))
1776 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1777 restored, the value remains
1779 restore_pos(aTHX_ 0);
1782 /* make sure $`, $&, $', and $digit will work later */
1783 if ( !(flags & REXEC_NOT_FIRST) ) {
1784 if (RX_MATCH_COPIED(prog)) {
1785 Safefree(prog->subbeg);
1786 RX_MATCH_COPIED_off(prog);
1788 if (flags & REXEC_COPY_STR) {
1789 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1791 s = savepvn(strbeg, i);
1794 RX_MATCH_COPIED_on(prog);
1797 prog->subbeg = strbeg;
1798 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1805 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1806 PL_colors[4],PL_colors[5]));
1807 if (PL_reg_eval_set)
1808 restore_pos(aTHX_ 0);
1813 - regtry - try match at specific point
1815 STATIC I32 /* 0 failure, 1 success */
1816 S_regtry(pTHX_ regexp *prog, char *startpos)
1824 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1826 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1829 PL_reg_eval_set = RS_init;
1831 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1832 (IV)(PL_stack_sp - PL_stack_base));
1834 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1835 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1836 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1838 /* Apparently this is not needed, judging by wantarray. */
1839 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1840 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1843 /* Make $_ available to executed code. */
1844 if (PL_reg_sv != DEFSV) {
1845 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
1850 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1851 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1852 /* prepare for quick setting of pos */
1853 sv_magic(PL_reg_sv, (SV*)0,
1854 PERL_MAGIC_regex_global, Nullch, 0);
1855 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
1859 PL_reg_oldpos = mg->mg_len;
1860 SAVEDESTRUCTOR_X(restore_pos, 0);
1862 if (!PL_reg_curpm) {
1863 Newz(22,PL_reg_curpm, 1, PMOP);
1866 SV* repointer = newSViv(0);
1867 /* so we know which PL_regex_padav element is PL_reg_curpm */
1868 SvFLAGS(repointer) |= SVf_BREAK;
1869 av_push(PL_regex_padav,repointer);
1870 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1871 PL_regex_pad = AvARRAY(PL_regex_padav);
1875 PM_SETRE(PL_reg_curpm, prog);
1876 PL_reg_oldcurpm = PL_curpm;
1877 PL_curpm = PL_reg_curpm;
1878 if (RX_MATCH_COPIED(prog)) {
1879 /* Here is a serious problem: we cannot rewrite subbeg,
1880 since it may be needed if this match fails. Thus
1881 $` inside (?{}) could fail... */
1882 PL_reg_oldsaved = prog->subbeg;
1883 PL_reg_oldsavedlen = prog->sublen;
1884 RX_MATCH_COPIED_off(prog);
1887 PL_reg_oldsaved = Nullch;
1888 prog->subbeg = PL_bostr;
1889 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1891 prog->startp[0] = startpos - PL_bostr;
1892 PL_reginput = startpos;
1893 PL_regstartp = prog->startp;
1894 PL_regendp = prog->endp;
1895 PL_reglastparen = &prog->lastparen;
1896 PL_reglastcloseparen = &prog->lastcloseparen;
1897 prog->lastparen = 0;
1899 DEBUG_r(PL_reg_starttry = startpos);
1900 if (PL_reg_start_tmpl <= prog->nparens) {
1901 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1902 if(PL_reg_start_tmp)
1903 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1905 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1908 /* XXXX What this code is doing here?!!! There should be no need
1909 to do this again and again, PL_reglastparen should take care of
1912 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1913 * Actually, the code in regcppop() (which Ilya may be meaning by
1914 * PL_reglastparen), is not needed at all by the test suite
1915 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1916 * enough, for building DynaLoader, or otherwise this
1917 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1918 * will happen. Meanwhile, this code *is* needed for the
1919 * above-mentioned test suite tests to succeed. The common theme
1920 * on those tests seems to be returning null fields from matches.
1925 if (prog->nparens) {
1926 for (i = prog->nparens; i > *PL_reglastparen; i--) {
1933 if (regmatch(prog->program + 1)) {
1934 prog->endp[0] = PL_reginput - PL_bostr;
1937 REGCP_UNWIND(lastcp);
1941 #define RE_UNWIND_BRANCH 1
1942 #define RE_UNWIND_BRANCHJ 2
1946 typedef struct { /* XX: makes sense to enlarge it... */
1950 } re_unwind_generic_t;
1963 } re_unwind_branch_t;
1965 typedef union re_unwind_t {
1967 re_unwind_generic_t generic;
1968 re_unwind_branch_t branch;
1971 #define sayYES goto yes
1972 #define sayNO goto no
1973 #define sayYES_FINAL goto yes_final
1974 #define sayYES_LOUD goto yes_loud
1975 #define sayNO_FINAL goto no_final
1976 #define sayNO_SILENT goto do_no
1977 #define saySAME(x) if (x) goto yes; else goto no
1979 #define REPORT_CODE_OFF 24
1982 - regmatch - main matching routine
1984 * Conceptually the strategy is simple: check to see whether the current
1985 * node matches, call self recursively to see whether the rest matches,
1986 * and then act accordingly. In practice we make some effort to avoid
1987 * recursion, in particular by going through "ordinary" nodes (that don't
1988 * need to know whether the rest of the match failed) by a loop instead of
1991 /* [lwall] I've hoisted the register declarations to the outer block in order to
1992 * maybe save a little bit of pushing and popping on the stack. It also takes
1993 * advantage of machines that use a register save mask on subroutine entry.
1995 STATIC I32 /* 0 failure, 1 success */
1996 S_regmatch(pTHX_ regnode *prog)
1998 register regnode *scan; /* Current node. */
1999 regnode *next; /* Next node. */
2000 regnode *inner; /* Next node in internal branch. */
2001 register I32 nextchr; /* renamed nextchr - nextchar colides with
2002 function of same name */
2003 register I32 n; /* no or next */
2004 register I32 ln = 0; /* len or last */
2005 register char *s = Nullch; /* operand or save */
2006 register char *locinput = PL_reginput;
2007 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2008 int minmod = 0, sw = 0, logical = 0;
2011 I32 firstcp = PL_savestack_ix;
2013 register bool do_utf8 = PL_reg_match_utf8;
2019 /* Note that nextchr is a byte even in UTF */
2020 nextchr = UCHARAT(locinput);
2022 while (scan != NULL) {
2025 SV *prop = sv_newmortal();
2026 int docolor = *PL_colors[0];
2027 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2028 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2029 /* The part of the string before starttry has one color
2030 (pref0_len chars), between starttry and current
2031 position another one (pref_len - pref0_len chars),
2032 after the current position the third one.
2033 We assume that pref0_len <= pref_len, otherwise we
2034 decrease pref0_len. */
2035 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2036 ? (5 + taill) - l : locinput - PL_bostr;
2039 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2041 pref0_len = pref_len - (locinput - PL_reg_starttry);
2042 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2043 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2044 ? (5 + taill) - pref_len : PL_regeol - locinput);
2045 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2049 if (pref0_len > pref_len)
2050 pref0_len = pref_len;
2051 regprop(prop, scan);
2052 PerlIO_printf(Perl_debug_log,
2053 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2054 (IV)(locinput - PL_bostr),
2055 PL_colors[4], pref0_len,
2056 locinput - pref_len, PL_colors[5],
2057 PL_colors[2], pref_len - pref0_len,
2058 locinput - pref_len + pref0_len, PL_colors[3],
2059 (docolor ? "" : "> <"),
2060 PL_colors[0], l, locinput, PL_colors[1],
2061 15 - l - pref_len + 1,
2063 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2067 next = scan + NEXT_OFF(scan);
2073 if (locinput == PL_bostr || (PL_multiline &&
2074 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2076 /* regtill = regbol; */
2081 if (locinput == PL_bostr ||
2082 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2088 if (locinput == PL_bostr)
2092 if (locinput == PL_reg_ganch)
2102 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2107 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2109 if (PL_regeol - locinput > 1)
2113 if (PL_regeol != locinput)
2117 if (!nextchr && locinput >= PL_regeol)
2120 locinput += PL_utf8skip[nextchr];
2121 if (locinput > PL_regeol)
2123 nextchr = UCHARAT(locinput);
2126 nextchr = UCHARAT(++locinput);
2129 if (!nextchr && locinput >= PL_regeol)
2131 nextchr = UCHARAT(++locinput);
2134 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2137 locinput += PL_utf8skip[nextchr];
2138 if (locinput > PL_regeol)
2140 nextchr = UCHARAT(locinput);
2143 nextchr = UCHARAT(++locinput);
2148 if (do_utf8 != (UTF!=0)) {
2156 if (*((U8*)s) != utf8_to_uvchr((U8*)l, &len))
2165 if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
2171 nextchr = UCHARAT(locinput);
2174 /* Inline the first character, for speed. */
2175 if (UCHARAT(s) != nextchr)
2177 if (PL_regeol - locinput < ln)
2179 if (ln > 1 && memNE(s, locinput, ln))
2182 nextchr = UCHARAT(locinput);
2185 PL_reg_flags |= RF_tainted;
2195 c1 = OP(scan) == EXACTF;
2197 if (l >= PL_regeol) {
2200 if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
2201 (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
2203 s += UTF ? UTF8SKIP(s) : 1;
2207 nextchr = UCHARAT(locinput);
2211 /* Inline the first character, for speed. */
2212 if (UCHARAT(s) != nextchr &&
2213 UCHARAT(s) != ((OP(scan) == EXACTF)
2214 ? PL_fold : PL_fold_locale)[nextchr])
2216 if (PL_regeol - locinput < ln)
2218 if (ln > 1 && (OP(scan) == EXACTF
2219 ? ibcmp(s, locinput, ln)
2220 : ibcmp_locale(s, locinput, ln)))
2223 nextchr = UCHARAT(locinput);
2227 if (!reginclass(scan, (U8*)locinput, do_utf8))
2229 if (locinput >= PL_regeol)
2231 locinput += PL_utf8skip[nextchr];
2232 nextchr = UCHARAT(locinput);
2236 nextchr = UCHARAT(locinput);
2237 if (!reginclass(scan, (U8*)locinput, do_utf8))
2239 if (!nextchr && locinput >= PL_regeol)
2241 nextchr = UCHARAT(++locinput);
2245 PL_reg_flags |= RF_tainted;
2251 LOAD_UTF8_CHARCLASS(alnum,"a");
2252 if (!(OP(scan) == ALNUM
2253 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2254 : isALNUM_LC_utf8((U8*)locinput)))
2258 locinput += PL_utf8skip[nextchr];
2259 nextchr = UCHARAT(locinput);
2262 if (!(OP(scan) == ALNUM
2263 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2265 nextchr = UCHARAT(++locinput);
2268 PL_reg_flags |= RF_tainted;
2271 if (!nextchr && locinput >= PL_regeol)
2274 LOAD_UTF8_CHARCLASS(alnum,"a");
2275 if (OP(scan) == NALNUM
2276 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2277 : isALNUM_LC_utf8((U8*)locinput))
2281 locinput += PL_utf8skip[nextchr];
2282 nextchr = UCHARAT(locinput);
2285 if (OP(scan) == NALNUM
2286 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2288 nextchr = UCHARAT(++locinput);
2292 PL_reg_flags |= RF_tainted;
2296 /* was last char in word? */
2298 if (locinput == PL_bostr)
2301 U8 *r = reghop((U8*)locinput, -1);
2303 ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
2305 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2306 ln = isALNUM_uni(ln);
2307 LOAD_UTF8_CHARCLASS(alnum,"a");
2308 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2311 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2312 n = isALNUM_LC_utf8((U8*)locinput);
2316 ln = (locinput != PL_bostr) ?
2317 UCHARAT(locinput - 1) : '\n';
2318 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2320 n = isALNUM(nextchr);
2323 ln = isALNUM_LC(ln);
2324 n = isALNUM_LC(nextchr);
2327 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2328 OP(scan) == BOUNDL))
2332 PL_reg_flags |= RF_tainted;
2338 if (UTF8_IS_CONTINUED(nextchr)) {
2339 LOAD_UTF8_CHARCLASS(space," ");
2340 if (!(OP(scan) == SPACE
2341 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2342 : isSPACE_LC_utf8((U8*)locinput)))
2346 locinput += PL_utf8skip[nextchr];
2347 nextchr = UCHARAT(locinput);
2350 if (!(OP(scan) == SPACE
2351 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2353 nextchr = UCHARAT(++locinput);
2356 if (!(OP(scan) == SPACE
2357 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2359 nextchr = UCHARAT(++locinput);
2363 PL_reg_flags |= RF_tainted;
2366 if (!nextchr && locinput >= PL_regeol)
2369 LOAD_UTF8_CHARCLASS(space," ");
2370 if (OP(scan) == NSPACE
2371 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2372 : isSPACE_LC_utf8((U8*)locinput))
2376 locinput += PL_utf8skip[nextchr];
2377 nextchr = UCHARAT(locinput);
2380 if (OP(scan) == NSPACE
2381 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2383 nextchr = UCHARAT(++locinput);
2386 PL_reg_flags |= RF_tainted;
2392 LOAD_UTF8_CHARCLASS(digit,"0");
2393 if (!(OP(scan) == DIGIT
2394 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2395 : isDIGIT_LC_utf8((U8*)locinput)))
2399 locinput += PL_utf8skip[nextchr];
2400 nextchr = UCHARAT(locinput);
2403 if (!(OP(scan) == DIGIT
2404 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2406 nextchr = UCHARAT(++locinput);
2409 PL_reg_flags |= RF_tainted;
2412 if (!nextchr && locinput >= PL_regeol)
2415 LOAD_UTF8_CHARCLASS(digit,"0");
2416 if (OP(scan) == NDIGIT
2417 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2418 : isDIGIT_LC_utf8((U8*)locinput))
2422 locinput += PL_utf8skip[nextchr];
2423 nextchr = UCHARAT(locinput);
2426 if (OP(scan) == NDIGIT
2427 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2429 nextchr = UCHARAT(++locinput);
2432 LOAD_UTF8_CHARCLASS(mark,"~");
2433 if (locinput >= PL_regeol ||
2434 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2436 locinput += PL_utf8skip[nextchr];
2437 while (locinput < PL_regeol &&
2438 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2439 locinput += UTF8SKIP(locinput);
2440 if (locinput > PL_regeol)
2442 nextchr = UCHARAT(locinput);
2445 PL_reg_flags |= RF_tainted;
2449 n = ARG(scan); /* which paren pair */
2450 ln = PL_regstartp[n];
2451 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2452 if (*PL_reglastparen < n || ln == -1)
2453 sayNO; /* Do not match unless seen CLOSEn. */
2454 if (ln == PL_regendp[n])
2458 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2460 char *e = PL_bostr + PL_regendp[n];
2462 * Note that we can't do the "other character" lookup trick as
2463 * in the 8-bit case (no pun intended) because in Unicode we
2464 * have to map both upper and title case to lower case.
2466 if (OP(scan) == REFF) {
2470 if (toLOWER_utf8((U8*)s) != toLOWER_utf8((U8*)l))
2480 if (toLOWER_LC_utf8((U8*)s) != toLOWER_LC_utf8((U8*)l))
2487 nextchr = UCHARAT(locinput);
2491 /* Inline the first character, for speed. */
2492 if (UCHARAT(s) != nextchr &&
2494 (UCHARAT(s) != ((OP(scan) == REFF
2495 ? PL_fold : PL_fold_locale)[nextchr]))))
2497 ln = PL_regendp[n] - ln;
2498 if (locinput + ln > PL_regeol)
2500 if (ln > 1 && (OP(scan) == REF
2501 ? memNE(s, locinput, ln)
2503 ? ibcmp(s, locinput, ln)
2504 : ibcmp_locale(s, locinput, ln))))
2507 nextchr = UCHARAT(locinput);
2518 OP_4tree *oop = PL_op;
2519 COP *ocurcop = PL_curcop;
2520 SV **ocurpad = PL_curpad;
2524 PL_op = (OP_4tree*)PL_regdata->data[n];
2525 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2526 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
2527 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2529 CALLRUNOPS(aTHX); /* Scalar context. */
2535 PL_curpad = ocurpad;
2536 PL_curcop = ocurcop;
2538 if (logical == 2) { /* Postponed subexpression. */
2540 MAGIC *mg = Null(MAGIC*);
2542 CHECKPOINT cp, lastcp;
2544 if(SvROK(ret) || SvRMAGICAL(ret)) {
2545 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2548 mg = mg_find(sv, PERL_MAGIC_qr);
2551 re = (regexp *)mg->mg_obj;
2552 (void)ReREFCNT_inc(re);
2556 char *t = SvPV(ret, len);
2558 char *oprecomp = PL_regprecomp;
2559 I32 osize = PL_regsize;
2560 I32 onpar = PL_regnpar;
2563 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2565 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
2566 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2568 PL_regprecomp = oprecomp;
2573 PerlIO_printf(Perl_debug_log,
2574 "Entering embedded `%s%.60s%s%s'\n",
2578 (strlen(re->precomp) > 60 ? "..." : ""))
2581 state.prev = PL_reg_call_cc;
2582 state.cc = PL_regcc;
2583 state.re = PL_reg_re;
2587 cp = regcppush(0); /* Save *all* the positions. */
2590 state.ss = PL_savestack_ix;
2591 *PL_reglastparen = 0;
2592 *PL_reglastcloseparen = 0;
2593 PL_reg_call_cc = &state;
2594 PL_reginput = locinput;
2596 /* XXXX This is too dramatic a measure... */
2599 if (regmatch(re->program + 1)) {
2600 /* Even though we succeeded, we need to restore
2601 global variables, since we may be wrapped inside
2602 SUSPEND, thus the match may be not finished yet. */
2604 /* XXXX Do this only if SUSPENDed? */
2605 PL_reg_call_cc = state.prev;
2606 PL_regcc = state.cc;
2607 PL_reg_re = state.re;
2608 cache_re(PL_reg_re);
2610 /* XXXX This is too dramatic a measure... */
2613 /* These are needed even if not SUSPEND. */
2619 REGCP_UNWIND(lastcp);
2621 PL_reg_call_cc = state.prev;
2622 PL_regcc = state.cc;
2623 PL_reg_re = state.re;
2624 cache_re(PL_reg_re);
2626 /* XXXX This is too dramatic a measure... */
2636 sv_setsv(save_scalar(PL_replgv), ret);
2640 n = ARG(scan); /* which paren pair */
2641 PL_reg_start_tmp[n] = locinput;
2646 n = ARG(scan); /* which paren pair */
2647 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2648 PL_regendp[n] = locinput - PL_bostr;
2649 if (n > *PL_reglastparen)
2650 *PL_reglastparen = n;
2651 *PL_reglastcloseparen = n;
2654 n = ARG(scan); /* which paren pair */
2655 sw = (*PL_reglastparen >= n && PL_regendp[n] != -1);
2658 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2660 next = NEXTOPER(NEXTOPER(scan));
2662 next = scan + ARG(scan);
2663 if (OP(next) == IFTHEN) /* Fake one. */
2664 next = NEXTOPER(NEXTOPER(next));
2668 logical = scan->flags;
2670 /*******************************************************************
2671 PL_regcc contains infoblock about the innermost (...)* loop, and
2672 a pointer to the next outer infoblock.
2674 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2676 1) After matching X, regnode for CURLYX is processed;
2678 2) This regnode creates infoblock on the stack, and calls
2679 regmatch() recursively with the starting point at WHILEM node;
2681 3) Each hit of WHILEM node tries to match A and Z (in the order
2682 depending on the current iteration, min/max of {min,max} and
2683 greediness). The information about where are nodes for "A"
2684 and "Z" is read from the infoblock, as is info on how many times "A"
2685 was already matched, and greediness.
2687 4) After A matches, the same WHILEM node is hit again.
2689 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2690 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2691 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2692 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2693 of the external loop.
2695 Currently present infoblocks form a tree with a stem formed by PL_curcc
2696 and whatever it mentions via ->next, and additional attached trees
2697 corresponding to temporarily unset infoblocks as in "5" above.
2699 In the following picture infoblocks for outer loop of
2700 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2701 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2702 infoblocks are drawn below the "reset" infoblock.
2704 In fact in the picture below we do not show failed matches for Z and T
2705 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2706 more obvious *why* one needs to *temporary* unset infoblocks.]
2708 Matched REx position InfoBlocks Comment
2712 Y A)*?Z)*?T x <- O <- I
2713 YA )*?Z)*?T x <- O <- I
2714 YA A)*?Z)*?T x <- O <- I
2715 YAA )*?Z)*?T x <- O <- I
2716 YAA Z)*?T x <- O # Temporary unset I
2719 YAAZ Y(A)*?Z)*?T x <- O
2722 YAAZY (A)*?Z)*?T x <- O
2725 YAAZY A)*?Z)*?T x <- O <- I
2728 YAAZYA )*?Z)*?T x <- O <- I
2731 YAAZYA Z)*?T x <- O # Temporary unset I
2737 YAAZYAZ T x # Temporary unset O
2744 *******************************************************************/
2747 CHECKPOINT cp = PL_savestack_ix;
2748 /* No need to save/restore up to this paren */
2749 I32 parenfloor = scan->flags;
2751 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
2753 cc.oldcc = PL_regcc;
2755 /* XXXX Probably it is better to teach regpush to support
2756 parenfloor > PL_regsize... */
2757 if (parenfloor > *PL_reglastparen)
2758 parenfloor = *PL_reglastparen; /* Pessimization... */
2759 cc.parenfloor = parenfloor;
2761 cc.min = ARG1(scan);
2762 cc.max = ARG2(scan);
2763 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2767 PL_reginput = locinput;
2768 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2770 PL_regcc = cc.oldcc;
2776 * This is really hard to understand, because after we match
2777 * what we're trying to match, we must make sure the rest of
2778 * the REx is going to match for sure, and to do that we have
2779 * to go back UP the parse tree by recursing ever deeper. And
2780 * if it fails, we have to reset our parent's current state
2781 * that we can try again after backing off.
2784 CHECKPOINT cp, lastcp;
2785 CURCUR* cc = PL_regcc;
2786 char *lastloc = cc->lastloc; /* Detection of 0-len. */
2788 n = cc->cur + 1; /* how many we know we matched */
2789 PL_reginput = locinput;
2792 PerlIO_printf(Perl_debug_log,
2793 "%*s %ld out of %ld..%ld cc=%lx\n",
2794 REPORT_CODE_OFF+PL_regindent*2, "",
2795 (long)n, (long)cc->min,
2796 (long)cc->max, (long)cc)
2799 /* If degenerate scan matches "", assume scan done. */
2801 if (locinput == cc->lastloc && n >= cc->min) {
2802 PL_regcc = cc->oldcc;
2806 PerlIO_printf(Perl_debug_log,
2807 "%*s empty match detected, try continuation...\n",
2808 REPORT_CODE_OFF+PL_regindent*2, "")
2810 if (regmatch(cc->next))
2818 /* First just match a string of min scans. */
2822 cc->lastloc = locinput;
2823 if (regmatch(cc->scan))
2826 cc->lastloc = lastloc;
2831 /* Check whether we already were at this position.
2832 Postpone detection until we know the match is not
2833 *that* much linear. */
2834 if (!PL_reg_maxiter) {
2835 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2836 PL_reg_leftiter = PL_reg_maxiter;
2838 if (PL_reg_leftiter-- == 0) {
2839 I32 size = (PL_reg_maxiter + 7)/8;
2840 if (PL_reg_poscache) {
2841 if (PL_reg_poscache_size < size) {
2842 Renew(PL_reg_poscache, size, char);
2843 PL_reg_poscache_size = size;
2845 Zero(PL_reg_poscache, size, char);
2848 PL_reg_poscache_size = size;
2849 Newz(29, PL_reg_poscache, size, char);
2852 PerlIO_printf(Perl_debug_log,
2853 "%sDetected a super-linear match, switching on caching%s...\n",
2854 PL_colors[4], PL_colors[5])
2857 if (PL_reg_leftiter < 0) {
2858 I32 o = locinput - PL_bostr, b;
2860 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2863 if (PL_reg_poscache[o] & (1<<b)) {
2865 PerlIO_printf(Perl_debug_log,
2866 "%*s already tried at this position...\n",
2867 REPORT_CODE_OFF+PL_regindent*2, "")
2871 PL_reg_poscache[o] |= (1<<b);
2875 /* Prefer next over scan for minimal matching. */
2878 PL_regcc = cc->oldcc;
2881 cp = regcppush(cc->parenfloor);
2883 if (regmatch(cc->next)) {
2885 sayYES; /* All done. */
2887 REGCP_UNWIND(lastcp);
2893 if (n >= cc->max) { /* Maximum greed exceeded? */
2894 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2895 && !(PL_reg_flags & RF_warned)) {
2896 PL_reg_flags |= RF_warned;
2897 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2898 "Complex regular subexpression recursion",
2905 PerlIO_printf(Perl_debug_log,
2906 "%*s trying longer...\n",
2907 REPORT_CODE_OFF+PL_regindent*2, "")
2909 /* Try scanning more and see if it helps. */
2910 PL_reginput = locinput;
2912 cc->lastloc = locinput;
2913 cp = regcppush(cc->parenfloor);
2915 if (regmatch(cc->scan)) {
2919 REGCP_UNWIND(lastcp);
2922 cc->lastloc = lastloc;
2926 /* Prefer scan over next for maximal matching. */
2928 if (n < cc->max) { /* More greed allowed? */
2929 cp = regcppush(cc->parenfloor);
2931 cc->lastloc = locinput;
2933 if (regmatch(cc->scan)) {
2937 REGCP_UNWIND(lastcp);
2938 regcppop(); /* Restore some previous $<digit>s? */
2939 PL_reginput = locinput;
2941 PerlIO_printf(Perl_debug_log,
2942 "%*s failed, try continuation...\n",
2943 REPORT_CODE_OFF+PL_regindent*2, "")
2946 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
2947 && !(PL_reg_flags & RF_warned)) {
2948 PL_reg_flags |= RF_warned;
2949 Perl_warner(aTHX_ WARN_REGEXP, "%s limit (%d) exceeded",
2950 "Complex regular subexpression recursion",
2954 /* Failed deeper matches of scan, so see if this one works. */
2955 PL_regcc = cc->oldcc;
2958 if (regmatch(cc->next))
2964 cc->lastloc = lastloc;
2969 next = scan + ARG(scan);
2972 inner = NEXTOPER(NEXTOPER(scan));
2975 inner = NEXTOPER(scan);
2979 if (OP(next) != c1) /* No choice. */
2980 next = inner; /* Avoid recursion. */
2982 I32 lastparen = *PL_reglastparen;
2984 re_unwind_branch_t *uw;
2986 /* Put unwinding data on stack */
2987 unwind1 = SSNEWt(1,re_unwind_branch_t);
2988 uw = SSPTRt(unwind1,re_unwind_branch_t);
2991 uw->type = ((c1 == BRANCH)
2993 : RE_UNWIND_BRANCHJ);
2994 uw->lastparen = lastparen;
2996 uw->locinput = locinput;
2997 uw->nextchr = nextchr;
2999 uw->regindent = ++PL_regindent;
3002 REGCP_SET(uw->lastcp);
3004 /* Now go into the first branch */
3017 /* We suppose that the next guy does not need
3018 backtracking: in particular, it is of constant length,
3019 and has no parenths to influence future backrefs. */
3020 ln = ARG1(scan); /* min to match */
3021 n = ARG2(scan); /* max to match */
3022 paren = scan->flags;
3024 if (paren > PL_regsize)
3026 if (paren > *PL_reglastparen)
3027 *PL_reglastparen = paren;
3029 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3031 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3032 PL_reginput = locinput;
3035 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3037 /* if we matched something zero-length we don't need to
3038 backtrack - capturing parens are already defined, so
3039 the caveat in the maximal case doesn't apply
3041 XXXX if ln == 0, we can redo this check first time
3042 through the following loop
3045 n = ln; /* don't backtrack */
3046 locinput = PL_reginput;
3048 PL_regkind[(U8)OP(next)] == EXACT ||
3049 PL_regkind[(U8)OP(next)] == OPEN ||
3052 regnode *text_node = next;
3054 if (PL_regkind[(U8)OP(next)] != EXACT)
3055 NEXT_IMPT(next, text_node);
3057 if (PL_regkind[(U8)OP(text_node)] != EXACT) {
3061 c1 = (U8)*STRING(text_node);
3062 if (OP(next) == EXACTF)
3064 else if (OP(text_node) == EXACTFL)
3065 c2 = PL_fold_locale[c1];
3073 /* This may be improved if l == 0. */
3074 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3075 /* If it could work, try it. */
3077 UCHARAT(PL_reginput) == c1 ||
3078 UCHARAT(PL_reginput) == c2)
3082 PL_regstartp[paren] =
3083 HOPc(PL_reginput, -l) - PL_bostr;
3084 PL_regendp[paren] = PL_reginput - PL_bostr;
3087 PL_regendp[paren] = -1;
3091 REGCP_UNWIND(lastcp);
3093 /* Couldn't or didn't -- move forward. */
3094 PL_reginput = locinput;
3095 if (regrepeat_hard(scan, 1, &l)) {
3097 locinput = PL_reginput;
3104 n = regrepeat_hard(scan, n, &l);
3105 /* if we matched something zero-length we don't need to
3106 backtrack, unless the minimum count is zero and we
3107 are capturing the result - in that case the capture
3108 being defined or not may affect later execution
3110 if (n != 0 && l == 0 && !(paren && ln == 0))
3111 ln = n; /* don't backtrack */
3112 locinput = PL_reginput;
3114 PerlIO_printf(Perl_debug_log,
3115 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3116 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3121 PL_regkind[(U8)OP(next)] == EXACT ||
3122 PL_regkind[(U8)OP(next)] == OPEN ||
3125 regnode *text_node = next;
3127 if (PL_regkind[(U8)OP(next)] != EXACT)
3128 NEXT_IMPT(next, text_node);
3130 if (PL_regkind[(U8)OP(text_node)] != EXACT) {
3134 c1 = (U8)*STRING(text_node);
3135 if (OP(text_node) == EXACTF)
3137 else if (OP(text_node) == EXACTFL)
3138 c2 = PL_fold_locale[c1];
3148 /* If it could work, try it. */
3150 UCHARAT(PL_reginput) == c1 ||
3151 UCHARAT(PL_reginput) == c2)
3154 PerlIO_printf(Perl_debug_log,
3155 "%*s trying tail with n=%"IVdf"...\n",
3156 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3160 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3161 PL_regendp[paren] = PL_reginput - PL_bostr;
3164 PL_regendp[paren] = -1;
3168 REGCP_UNWIND(lastcp);
3170 /* Couldn't or didn't -- back up. */
3172 locinput = HOPc(locinput, -l);
3173 PL_reginput = locinput;
3180 paren = scan->flags; /* Which paren to set */
3181 if (paren > PL_regsize)
3183 if (paren > *PL_reglastparen)
3184 *PL_reglastparen = paren;
3185 ln = ARG1(scan); /* min to match */
3186 n = ARG2(scan); /* max to match */
3187 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3191 ln = ARG1(scan); /* min to match */
3192 n = ARG2(scan); /* max to match */
3193 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3198 scan = NEXTOPER(scan);
3204 scan = NEXTOPER(scan);
3208 * Lookahead to avoid useless match attempts
3209 * when we know what character comes next.
3213 * Used to only do .*x and .*?x, but now it allows
3214 * for )'s, ('s and (?{ ... })'s to be in the way
3215 * of the quantifier and the EXACT-like node. -- japhy
3219 PL_regkind[(U8)OP(next)] == EXACT ||
3220 PL_regkind[(U8)OP(next)] == OPEN ||
3224 regnode *text_node = next;
3226 if (PL_regkind[(U8)OP(next)] != EXACT)
3227 NEXT_IMPT(next, text_node);
3229 if (PL_regkind[(U8)OP(text_node)] != EXACT) {
3233 s = (U8*)STRING(text_node);
3237 if (OP(text_node) == EXACTF)
3239 else if (OP(text_node) == EXACTFL)
3240 c2 = PL_fold_locale[c1];
3243 if (OP(text_node) == EXACTF) {
3244 c1 = to_utf8_lower(s);
3245 c2 = to_utf8_upper(s);
3248 c2 = c1 = utf8_to_uvchr(s, NULL);
3255 PL_reginput = locinput;
3259 if (ln && regrepeat(scan, ln) < ln)
3261 locinput = PL_reginput;
3264 char *e; /* Should not check after this */
3265 char *old = locinput;
3267 if (n == REG_INFTY) {
3270 while (UTF8_IS_CONTINUATION(*(U8*)e))
3276 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3280 e = locinput + n - ln;
3286 /* Find place 'next' could work */
3289 while (locinput <= e && *locinput != c1)
3292 while (locinput <= e
3297 count = locinput - old;
3304 utf8_to_uvchr((U8*)locinput, &len) != c1;
3309 for (count = 0; locinput <= e; count++) {
3310 UV c = utf8_to_uvchr((U8*)locinput, &len);
3311 if (c == c1 || c == c2)
3319 /* PL_reginput == old now */
3320 if (locinput != old) {
3321 ln = 1; /* Did some */
3322 if (regrepeat(scan, count) < count)
3325 /* PL_reginput == locinput now */
3326 TRYPAREN(paren, ln, locinput);
3327 PL_reginput = locinput; /* Could be reset... */
3328 REGCP_UNWIND(lastcp);
3329 /* Couldn't or didn't -- move forward. */
3332 locinput += UTF8SKIP(locinput);
3338 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3342 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3344 c = UCHARAT(PL_reginput);
3345 /* If it could work, try it. */
3346 if (c == c1 || c == c2)
3348 TRYPAREN(paren, n, PL_reginput);
3349 REGCP_UNWIND(lastcp);
3352 /* If it could work, try it. */
3353 else if (c1 == -1000)
3355 TRYPAREN(paren, n, PL_reginput);
3356 REGCP_UNWIND(lastcp);
3358 /* Couldn't or didn't -- move forward. */
3359 PL_reginput = locinput;
3360 if (regrepeat(scan, 1)) {
3362 locinput = PL_reginput;
3370 n = regrepeat(scan, n);
3371 locinput = PL_reginput;
3372 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3373 (!PL_multiline || OP(next) == SEOL || OP(next) == EOS)) {
3374 ln = n; /* why back off? */
3375 /* ...because $ and \Z can match before *and* after
3376 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3377 We should back off by one in this case. */
3378 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3387 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3389 c = UCHARAT(PL_reginput);
3391 /* If it could work, try it. */
3392 if (c1 == -1000 || c == c1 || c == c2)
3394 TRYPAREN(paren, n, PL_reginput);
3395 REGCP_UNWIND(lastcp);
3397 /* Couldn't or didn't -- back up. */
3399 PL_reginput = locinput = HOPc(locinput, -1);
3407 c = utf8_to_uvchr((U8*)PL_reginput, NULL);
3409 c = UCHARAT(PL_reginput);
3411 /* If it could work, try it. */
3412 if (c1 == -1000 || c == c1 || c == c2)
3414 TRYPAREN(paren, n, PL_reginput);
3415 REGCP_UNWIND(lastcp);
3417 /* Couldn't or didn't -- back up. */
3419 PL_reginput = locinput = HOPc(locinput, -1);
3426 if (PL_reg_call_cc) {
3427 re_cc_state *cur_call_cc = PL_reg_call_cc;
3428 CURCUR *cctmp = PL_regcc;
3429 regexp *re = PL_reg_re;
3430 CHECKPOINT cp, lastcp;
3432 cp = regcppush(0); /* Save *all* the positions. */
3434 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3436 PL_reginput = locinput; /* Make position available to
3438 cache_re(PL_reg_call_cc->re);
3439 PL_regcc = PL_reg_call_cc->cc;
3440 PL_reg_call_cc = PL_reg_call_cc->prev;
3441 if (regmatch(cur_call_cc->node)) {
3442 PL_reg_call_cc = cur_call_cc;
3446 REGCP_UNWIND(lastcp);
3448 PL_reg_call_cc = cur_call_cc;
3454 PerlIO_printf(Perl_debug_log,
3455 "%*s continuation failed...\n",
3456 REPORT_CODE_OFF+PL_regindent*2, "")
3460 if (locinput < PL_regtill) {
3461 DEBUG_r(PerlIO_printf(Perl_debug_log,
3462 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3464 (long)(locinput - PL_reg_starttry),
3465 (long)(PL_regtill - PL_reg_starttry),
3467 sayNO_FINAL; /* Cannot match: too short. */
3469 PL_reginput = locinput; /* put where regtry can find it */
3470 sayYES_FINAL; /* Success! */
3472 PL_reginput = locinput; /* put where regtry can find it */
3473 sayYES_LOUD; /* Success! */
3476 PL_reginput = locinput;
3481 s = HOPBACKc(locinput, scan->flags);
3487 PL_reginput = locinput;
3492 s = HOPBACKc(locinput, scan->flags);
3498 PL_reginput = locinput;
3501 inner = NEXTOPER(NEXTOPER(scan));
3502 if (regmatch(inner) != n) {
3517 if (OP(scan) == SUSPEND) {
3518 locinput = PL_reginput;
3519 nextchr = UCHARAT(locinput);
3524 next = scan + ARG(scan);
3529 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3530 PTR2UV(scan), OP(scan));
3531 Perl_croak(aTHX_ "regexp memory corruption");
3538 * We get here only if there's trouble -- normally "case END" is
3539 * the terminating point.
3541 Perl_croak(aTHX_ "corrupted regexp pointers");
3547 PerlIO_printf(Perl_debug_log,
3548 "%*s %scould match...%s\n",
3549 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3553 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3554 PL_colors[4],PL_colors[5]));
3560 #if 0 /* Breaks $^R */
3568 PerlIO_printf(Perl_debug_log,
3569 "%*s %sfailed...%s\n",
3570 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3576 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3579 case RE_UNWIND_BRANCH:
3580 case RE_UNWIND_BRANCHJ:
3582 re_unwind_branch_t *uwb = &(uw->branch);
3583 I32 lastparen = uwb->lastparen;
3585 REGCP_UNWIND(uwb->lastcp);
3586 for (n = *PL_reglastparen; n > lastparen; n--)
3588 *PL_reglastparen = n;
3589 scan = next = uwb->next;
3591 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3592 ? BRANCH : BRANCHJ) ) { /* Failure */
3599 /* Have more choice yet. Reuse the same uwb. */
3601 if ((n = (uwb->type == RE_UNWIND_BRANCH
3602 ? NEXT_OFF(next) : ARG(next))))
3605 next = NULL; /* XXXX Needn't unwinding in this case... */
3607 next = NEXTOPER(scan);
3608 if (uwb->type == RE_UNWIND_BRANCHJ)
3609 next = NEXTOPER(next);
3610 locinput = uwb->locinput;
3611 nextchr = uwb->nextchr;
3613 PL_regindent = uwb->regindent;
3620 Perl_croak(aTHX_ "regexp unwind memory corruption");
3631 - regrepeat - repeatedly match something simple, report how many
3634 * [This routine now assumes that it will only match on things of length 1.
3635 * That was true before, but now we assume scan - reginput is the count,
3636 * rather than incrementing count on every character. [Er, except utf8.]]
3639 S_regrepeat(pTHX_ regnode *p, I32 max)
3641 register char *scan;
3643 register char *loceol = PL_regeol;
3644 register I32 hardcount = 0;
3645 register bool do_utf8 = PL_reg_match_utf8;
3648 if (max != REG_INFTY && max < loceol - scan)
3649 loceol = scan + max;
3654 while (scan < loceol && hardcount < max && *scan != '\n') {
3655 scan += UTF8SKIP(scan);
3659 while (scan < loceol && *scan != '\n')
3669 case EXACT: /* length of string is 1 */
3671 while (scan < loceol && UCHARAT(scan) == c)
3674 case EXACTF: /* length of string is 1 */
3676 while (scan < loceol &&
3677 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3680 case EXACTFL: /* length of string is 1 */
3681 PL_reg_flags |= RF_tainted;
3683 while (scan < loceol &&
3684 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3690 while (hardcount < max && scan < loceol &&
3691 reginclass(p, (U8*)scan, do_utf8)) {
3692 scan += UTF8SKIP(scan);
3696 while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
3703 LOAD_UTF8_CHARCLASS(alnum,"a");
3704 while (hardcount < max && scan < loceol &&
3705 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3706 scan += UTF8SKIP(scan);
3710 while (scan < loceol && isALNUM(*scan))
3715 PL_reg_flags |= RF_tainted;
3718 while (hardcount < max && scan < loceol &&
3719 isALNUM_LC_utf8((U8*)scan)) {
3720 scan += UTF8SKIP(scan);
3724 while (scan < loceol && isALNUM_LC(*scan))
3731 LOAD_UTF8_CHARCLASS(alnum,"a");
3732 while (hardcount < max && scan < loceol &&
3733 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3734 scan += UTF8SKIP(scan);
3738 while (scan < loceol && !isALNUM(*scan))
3743 PL_reg_flags |= RF_tainted;
3746 while (hardcount < max && scan < loceol &&
3747 !isALNUM_LC_utf8((U8*)scan)) {
3748 scan += UTF8SKIP(scan);
3752 while (scan < loceol && !isALNUM_LC(*scan))
3759 LOAD_UTF8_CHARCLASS(space," ");
3760 while (hardcount < max && scan < loceol &&
3762 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3763 scan += UTF8SKIP(scan);
3767 while (scan < loceol && isSPACE(*scan))
3772 PL_reg_flags |= RF_tainted;
3775 while (hardcount < max && scan < loceol &&
3776 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3777 scan += UTF8SKIP(scan);
3781 while (scan < loceol && isSPACE_LC(*scan))
3788 LOAD_UTF8_CHARCLASS(space," ");
3789 while (hardcount < max && scan < loceol &&
3791 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3792 scan += UTF8SKIP(scan);
3796 while (scan < loceol && !isSPACE(*scan))
3801 PL_reg_flags |= RF_tainted;
3804 while (hardcount < max && scan < loceol &&
3805 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3806 scan += UTF8SKIP(scan);
3810 while (scan < loceol && !isSPACE_LC(*scan))
3817 LOAD_UTF8_CHARCLASS(digit,"0");
3818 while (hardcount < max && scan < loceol &&
3819 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3820 scan += UTF8SKIP(scan);
3824 while (scan < loceol && isDIGIT(*scan))
3831 LOAD_UTF8_CHARCLASS(digit,"0");
3832 while (hardcount < max && scan < loceol &&
3833 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
3834 scan += UTF8SKIP(scan);
3838 while (scan < loceol && !isDIGIT(*scan))
3842 default: /* Called on something of 0 width. */
3843 break; /* So match right here or not at all. */
3849 c = scan - PL_reginput;
3854 SV *prop = sv_newmortal();
3857 PerlIO_printf(Perl_debug_log,
3858 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3859 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
3866 - regrepeat_hard - repeatedly match something, report total lenth and length
3868 * The repeater is supposed to have constant length.
3872 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
3874 register char *scan = Nullch;
3875 register char *start;
3876 register char *loceol = PL_regeol;
3878 I32 count = 0, res = 1;
3883 start = PL_reginput;
3884 if (PL_reg_match_utf8) {
3885 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3888 while (start < PL_reginput) {
3890 start += UTF8SKIP(start);
3901 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
3903 *lp = l = PL_reginput - start;
3904 if (max != REG_INFTY && l*max < loceol - scan)
3905 loceol = scan + l*max;
3918 - regclass_swash - prepare the utf8 swash
3922 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
3927 if (PL_regdata && PL_regdata->count) {
3930 if (PL_regdata->what[n] == 's') {
3931 SV *rv = (SV*)PL_regdata->data[n];
3932 AV *av = (AV*)SvRV((SV*)rv);
3935 si = *av_fetch(av, 0, FALSE);
3936 a = av_fetch(av, 1, FALSE);
3940 else if (si && doinit) {
3941 sw = swash_init("utf8", "", si, 1, 0);
3942 (void)av_store(av, 1, sw);
3954 - reginclass - determine if a character falls into a character class
3958 S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
3960 char flags = ANYOF_FLAGS(n);
3965 c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
3967 if (do_utf8 || (flags & ANYOF_UNICODE)) {
3968 if (do_utf8 && !ANYOF_RUNTIME(n)) {
3969 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
3972 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
3975 SV *sw = regclass_swash(n, TRUE, 0);
3978 if (swash_fetch(sw, p, do_utf8))
3980 else if (flags & ANYOF_FOLD) {
3981 U8 tmpbuf[UTF8_MAXLEN+1];
3983 if (flags & ANYOF_LOCALE) {
3984 PL_reg_flags |= RF_tainted;
3985 uvchr_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
3988 uvchr_to_utf8(tmpbuf, toLOWER_utf8(p));
3989 if (swash_fetch(sw, tmpbuf, do_utf8))
3995 if (!match && c < 256) {
3996 if (ANYOF_BITMAP_TEST(n, c))
3998 else if (flags & ANYOF_FOLD) {
4001 if (flags & ANYOF_LOCALE) {
4002 PL_reg_flags |= RF_tainted;
4003 f = PL_fold_locale[c];
4007 if (f != c && ANYOF_BITMAP_TEST(n, f))
4011 if (!match && (flags & ANYOF_CLASS)) {
4012 PL_reg_flags |= RF_tainted;
4014 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4015 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4016 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4017 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4018 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4019 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4020 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4021 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4022 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4023 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4024 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4025 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4026 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4027 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4028 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4029 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4030 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4031 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4032 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4033 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4034 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4035 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4036 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4037 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4038 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4039 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4040 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4041 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4042 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4043 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4044 ) /* How's that for a conditional? */
4051 return (flags & ANYOF_INVERT) ? !match : match;
4055 S_reghop(pTHX_ U8 *s, I32 off)
4057 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4061 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4064 while (off-- && s < lim) {
4065 /* XXX could check well-formedness here */
4073 if (UTF8_IS_CONTINUED(*s)) {
4074 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4077 /* XXX could check well-formedness here */
4085 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4087 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4091 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4094 while (off-- && s < lim) {
4095 /* XXX could check well-formedness here */
4105 if (UTF8_IS_CONTINUED(*s)) {
4106 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4109 /* XXX could check well-formedness here */
4121 restore_pos(pTHX_ void *arg)
4123 if (PL_reg_eval_set) {
4124 if (PL_reg_oldsaved) {
4125 PL_reg_re->subbeg = PL_reg_oldsaved;
4126 PL_reg_re->sublen = PL_reg_oldsavedlen;
4127 RX_MATCH_COPIED_on(PL_reg_re);
4129 PL_reg_magic->mg_len = PL_reg_oldpos;
4130 PL_reg_eval_set = 0;
4131 PL_curpm = PL_reg_oldcurpm;