5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 8
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
172 for (p = PL_regsize; p > parenfloor; p--) {
173 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
174 SSPUSHINT(PL_regendp[p]);
175 SSPUSHINT(PL_regstartp[p]);
176 SSPUSHPTR(PL_reg_start_tmp[p]);
178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
179 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
180 (UV)p, (IV)PL_regstartp[p],
181 (IV)(PL_reg_start_tmp[p] - PL_bostr),
185 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
186 SSPUSHPTR(PL_regstartp);
187 SSPUSHPTR(PL_regendp);
188 SSPUSHINT(PL_regsize);
189 SSPUSHINT(*PL_reglastparen);
190 SSPUSHINT(*PL_reglastcloseparen);
191 SSPUSHPTR(PL_reginput);
192 #define REGCP_FRAME_ELEMS 2
193 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
194 * are needed for the regexp context stack bookkeeping. */
195 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
196 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
201 /* These are needed since we do not localize EVAL nodes: */
202 #define REGCP_SET(cp) \
204 PerlIO_printf(Perl_debug_log, \
205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
206 (IV)PL_savestack_ix)); \
209 #define REGCP_UNWIND(cp) \
211 if (cp != PL_savestack_ix) \
212 PerlIO_printf(Perl_debug_log, \
213 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
214 (IV)(cp), (IV)PL_savestack_ix)); \
218 S_regcppop(pTHX_ const regexp *rex)
224 GET_RE_DEBUG_FLAGS_DECL;
226 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
228 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
229 i = SSPOPINT; /* Parentheses elements to pop. */
230 input = (char *) SSPOPPTR;
231 *PL_reglastcloseparen = SSPOPINT;
232 *PL_reglastparen = SSPOPINT;
233 PL_regsize = SSPOPINT;
234 PL_regendp=(I32 *) SSPOPPTR;
235 PL_regstartp=(I32 *) SSPOPPTR;
238 /* Now restore the parentheses context. */
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
242 U32 paren = (U32)SSPOPINT;
243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
244 PL_regstartp[paren] = SSPOPINT;
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
249 PerlIO_printf(Perl_debug_log,
250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
251 (UV)paren, (IV)PL_regstartp[paren],
252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
253 (IV)PL_regendp[paren],
254 (paren > *PL_reglastparen ? "(no)" : ""));
258 if (*PL_reglastparen + 1 <= rex->nparens) {
259 PerlIO_printf(Perl_debug_log,
260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
261 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
275 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
277 PL_regstartp[i] = -1;
284 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
287 * pregexec and friends
290 #ifndef PERL_IN_XSUB_RE
292 - pregexec - match a regexp against a string
295 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
296 char *strbeg, I32 minend, SV *screamer, U32 nosave)
297 /* strend: pointer to null at end of string */
298 /* strbeg: real beginning of string */
299 /* minend: end of match must be >=minend after stringarg. */
300 /* nosave: For optimizations. */
303 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
304 nosave ? 0 : REXEC_COPY_STR);
309 * Need to implement the following flags for reg_anch:
311 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
313 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
314 * INTUIT_AUTORITATIVE_ML
315 * INTUIT_ONCE_NOML - Intuit can match in one location only.
318 * Another flag for this function: SECOND_TIME (so that float substrs
319 * with giant delta may be not rechecked).
322 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
324 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
325 Otherwise, only SvCUR(sv) is used to get strbeg. */
327 /* XXXX We assume that strpos is strbeg unless sv. */
329 /* XXXX Some places assume that there is a fixed substring.
330 An update may be needed if optimizer marks as "INTUITable"
331 RExen without fixed substrings. Similarly, it is assumed that
332 lengths of all the strings are no more than minlen, thus they
333 cannot come from lookahead.
334 (Or minlen should take into account lookahead.) */
336 /* A failure to find a constant substring means that there is no need to make
337 an expensive call to REx engine, thus we celebrate a failure. Similarly,
338 finding a substring too deep into the string means that less calls to
339 regtry() should be needed.
341 REx compiler's optimizer found 4 possible hints:
342 a) Anchored substring;
344 c) Whether we are anchored (beginning-of-line or \G);
345 d) First node (of those at offset 0) which may distingush positions;
346 We use a)b)d) and multiline-part of c), and try to find a position in the
347 string which does not contradict any of them.
350 /* Most of decisions we do here should have been done at compile time.
351 The nodes of the REx which we used for the search should have been
352 deleted from the finite automaton. */
355 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
356 char *strend, U32 flags, re_scream_pos_data *data)
359 register I32 start_shift = 0;
360 /* Should be nonnegative! */
361 register I32 end_shift = 0;
366 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
368 register char *other_last = NULL; /* other substr checked before this */
369 char *check_at = NULL; /* check substr found at this pos */
370 const I32 multiline = prog->reganch & PMf_MULTILINE;
372 const char * const i_strpos = strpos;
375 GET_RE_DEBUG_FLAGS_DECL;
377 RX_MATCH_UTF8_set(prog,do_utf8);
379 if (prog->reganch & ROPT_UTF8) {
380 PL_reg_flags |= RF_utf8;
383 debug_start_match(prog, do_utf8, strpos, strend,
384 sv ? "Guessing start of match in sv for"
385 : "Guessing start of match in string for");
388 /* CHR_DIST() would be more correct here but it makes things slow. */
389 if (prog->minlen > strend - strpos) {
390 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
391 "String too short... [re_intuit_start]\n"));
395 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
398 if (!prog->check_utf8 && prog->check_substr)
399 to_utf8_substr(prog);
400 check = prog->check_utf8;
402 if (!prog->check_substr && prog->check_utf8)
403 to_byte_substr(prog);
404 check = prog->check_substr;
406 if (check == &PL_sv_undef) {
407 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
408 "Non-utf8 string cannot match utf8 check string\n"));
411 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
412 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
413 || ( (prog->reganch & ROPT_ANCH_BOL)
414 && !multiline ) ); /* Check after \n? */
417 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
418 | ROPT_IMPLICIT)) /* not a real BOL */
419 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
421 && (strpos != strbeg)) {
422 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
425 if (prog->check_offset_min == prog->check_offset_max &&
426 !(prog->reganch & ROPT_CANY_SEEN)) {
427 /* Substring at constant offset from beg-of-str... */
430 s = HOP3c(strpos, prog->check_offset_min, strend);
433 slen = SvCUR(check); /* >= 1 */
435 if ( strend - s > slen || strend - s < slen - 1
436 || (strend - s == slen && strend[-1] != '\n')) {
437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
440 /* Now should match s[0..slen-2] */
442 if (slen && (*SvPVX_const(check) != *s
444 && memNE(SvPVX_const(check), s, slen)))) {
446 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
450 else if (*SvPVX_const(check) != *s
451 || ((slen = SvCUR(check)) > 1
452 && memNE(SvPVX_const(check), s, slen)))
455 goto success_at_start;
458 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
460 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
461 end_shift = prog->check_end_shift;
464 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
465 - (SvTAIL(check) != 0);
466 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
468 if (end_shift < eshift)
472 else { /* Can match at random position */
475 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
476 end_shift = prog->check_end_shift;
478 /* end shift should be non negative here */
481 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
483 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
484 (IV)end_shift, prog->precomp);
488 /* Find a possible match in the region s..strend by looking for
489 the "check" substring in the region corrected by start/end_shift. */
492 I32 srch_start_shift = start_shift;
493 I32 srch_end_shift = end_shift;
494 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
495 srch_end_shift -= ((strbeg - s) - srch_start_shift);
496 srch_start_shift = strbeg - s;
498 DEBUG_OPTIMISE_MORE_r({
499 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
500 (IV)prog->check_offset_min,
501 (IV)srch_start_shift,
503 (IV)prog->check_end_shift);
506 if (flags & REXEC_SCREAM) {
507 I32 p = -1; /* Internal iterator of scream. */
508 I32 * const pp = data ? data->scream_pos : &p;
510 if (PL_screamfirst[BmRARE(check)] >= 0
511 || ( BmRARE(check) == '\n'
512 && (BmPREVIOUS(check) == SvCUR(check) - 1)
514 s = screaminstr(sv, check,
515 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
518 /* we may be pointing at the wrong string */
519 if (s && RX_MATCH_COPIED(prog))
520 s = strbeg + (s - SvPVX_const(sv));
522 *data->scream_olds = s;
527 if (prog->reganch & ROPT_CANY_SEEN) {
528 start_point= (U8*)(s + srch_start_shift);
529 end_point= (U8*)(strend - srch_end_shift);
531 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
532 end_point= HOP3(strend, -srch_end_shift, strbeg);
534 DEBUG_OPTIMISE_MORE_r({
535 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
536 (int)(end_point - start_point),
537 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
541 s = fbm_instr( start_point, end_point,
542 check, multiline ? FBMrf_MULTILINE : 0);
545 /* Update the count-of-usability, remove useless subpatterns,
549 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
550 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
551 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
552 (s ? "Found" : "Did not find"),
553 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
554 ? "anchored" : "floating"),
557 (s ? " at offset " : "...\n") );
562 /* Finish the diagnostic message */
563 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
565 /* XXX dmq: first branch is for positive lookbehind...
566 Our check string is offset from the beginning of the pattern.
567 So we need to do any stclass tests offset forward from that
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
579 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
585 : (prog->float_substr && prog->anchored_substr))
587 /* Take into account the "other" substring. */
588 /* XXXX May be hopelessly wrong for UTF... */
591 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
594 char * const last = HOP3c(s, -start_shift, strbeg);
596 char * const saved_s = s;
599 t = s - prog->check_offset_max;
600 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
602 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
607 t = HOP3c(t, prog->anchored_offset, strend);
608 if (t < other_last) /* These positions already checked */
610 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
613 /* XXXX It is not documented what units *_offsets are in.
614 We assume bytes, but this is clearly wrong.
615 Meaning this code needs to be carefully reviewed for errors.
619 /* On end-of-str: see comment below. */
620 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
621 if (must == &PL_sv_undef) {
623 DEBUG_r(must = prog->anchored_utf8); /* for debug */
628 HOP3(HOP3(last1, prog->anchored_offset, strend)
629 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
631 multiline ? FBMrf_MULTILINE : 0
634 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
635 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
636 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
637 (s ? "Found" : "Contradicts"),
638 quoted, RE_SV_TAIL(must));
643 if (last1 >= last2) {
644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
645 ", giving up...\n"));
648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
649 ", trying floating at offset %ld...\n",
650 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
651 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
652 s = HOP3c(last, 1, strend);
656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
657 (long)(s - i_strpos)));
658 t = HOP3c(s, -prog->anchored_offset, strbeg);
659 other_last = HOP3c(s, 1, strend);
667 else { /* Take into account the floating substring. */
669 char * const saved_s = s;
672 t = HOP3c(s, -start_shift, strbeg);
674 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
675 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
676 last = HOP3c(t, prog->float_max_offset, strend);
677 s = HOP3c(t, prog->float_min_offset, strend);
680 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
681 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
682 /* fbm_instr() takes into account exact value of end-of-str
683 if the check is SvTAIL(ed). Since false positives are OK,
684 and end-of-str is not later than strend we are OK. */
685 if (must == &PL_sv_undef) {
687 DEBUG_r(must = prog->float_utf8); /* for debug message */
690 s = fbm_instr((unsigned char*)s,
691 (unsigned char*)last + SvCUR(must)
693 must, multiline ? FBMrf_MULTILINE : 0);
695 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
696 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
697 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
698 (s ? "Found" : "Contradicts"),
699 quoted, RE_SV_TAIL(must));
703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
704 ", giving up...\n"));
707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
708 ", trying anchored starting at offset %ld...\n",
709 (long)(saved_s + 1 - i_strpos)));
711 s = HOP3c(t, 1, strend);
715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
716 (long)(s - i_strpos)));
717 other_last = s; /* Fix this later. --Hugo */
727 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
729 DEBUG_OPTIMISE_MORE_r(
730 PerlIO_printf(Perl_debug_log,
731 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
732 (IV)prog->check_offset_min,
733 (IV)prog->check_offset_max,
741 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
743 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
746 /* Fixed substring is found far enough so that the match
747 cannot start at strpos. */
749 if (ml_anch && t[-1] != '\n') {
750 /* Eventually fbm_*() should handle this, but often
751 anchored_offset is not 0, so this check will not be wasted. */
752 /* XXXX In the code below we prefer to look for "^" even in
753 presence of anchored substrings. And we search even
754 beyond the found float position. These pessimizations
755 are historical artefacts only. */
757 while (t < strend - prog->minlen) {
759 if (t < check_at - prog->check_offset_min) {
760 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
761 /* Since we moved from the found position,
762 we definitely contradict the found anchored
763 substr. Due to the above check we do not
764 contradict "check" substr.
765 Thus we can arrive here only if check substr
766 is float. Redo checking for "other"=="fixed".
769 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
770 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
771 goto do_other_anchored;
773 /* We don't contradict the found floating substring. */
774 /* XXXX Why not check for STCLASS? */
776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
777 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
780 /* Position contradicts check-string */
781 /* XXXX probably better to look for check-string
782 than for "\n", so one should lower the limit for t? */
783 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
784 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
785 other_last = strpos = s = t + 1;
790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
791 PL_colors[0], PL_colors[1]));
795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
796 PL_colors[0], PL_colors[1]));
800 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
803 /* The found string does not prohibit matching at strpos,
804 - no optimization of calling REx engine can be performed,
805 unless it was an MBOL and we are not after MBOL,
806 or a future STCLASS check will fail this. */
808 /* Even in this situation we may use MBOL flag if strpos is offset
809 wrt the start of the string. */
810 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
811 && (strpos != strbeg) && strpos[-1] != '\n'
812 /* May be due to an implicit anchor of m{.*foo} */
813 && !(prog->reganch & ROPT_IMPLICIT))
818 DEBUG_EXECUTE_r( if (ml_anch)
819 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
820 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
823 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
825 prog->check_utf8 /* Could be deleted already */
826 && --BmUSEFUL(prog->check_utf8) < 0
827 && (prog->check_utf8 == prog->float_utf8)
829 prog->check_substr /* Could be deleted already */
830 && --BmUSEFUL(prog->check_substr) < 0
831 && (prog->check_substr == prog->float_substr)
834 /* If flags & SOMETHING - do not do it many times on the same match */
835 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
836 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
837 if (do_utf8 ? prog->check_substr : prog->check_utf8)
838 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
839 prog->check_substr = prog->check_utf8 = NULL; /* disable */
840 prog->float_substr = prog->float_utf8 = NULL; /* clear */
841 check = NULL; /* abort */
843 /* XXXX This is a remnant of the old implementation. It
844 looks wasteful, since now INTUIT can use many
846 prog->reganch &= ~RE_USE_INTUIT;
853 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
854 /* trie stclasses are too expensive to use here, we are better off to
855 leave it to regmatch itself */
856 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
857 /* minlen == 0 is possible if regstclass is \b or \B,
858 and the fixed substr is ''$.
859 Since minlen is already taken into account, s+1 is before strend;
860 accidentally, minlen >= 1 guaranties no false positives at s + 1
861 even for \b or \B. But (minlen? 1 : 0) below assumes that
862 regstclass does not come from lookahead... */
863 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
864 This leaves EXACTF only, which is dealt with in find_byclass(). */
865 const U8* const str = (U8*)STRING(prog->regstclass);
866 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
867 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
870 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
871 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
872 else if (prog->float_substr || prog->float_utf8)
873 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
877 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
878 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
881 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
884 const char *what = NULL;
886 if (endpos == strend) {
887 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
888 "Could not match STCLASS...\n") );
891 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
892 "This position contradicts STCLASS...\n") );
893 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
895 /* Contradict one of substrings */
896 if (prog->anchored_substr || prog->anchored_utf8) {
897 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
898 DEBUG_EXECUTE_r( what = "anchored" );
900 s = HOP3c(t, 1, strend);
901 if (s + start_shift + end_shift > strend) {
902 /* XXXX Should be taken into account earlier? */
903 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
904 "Could not match STCLASS...\n") );
909 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
910 "Looking for %s substr starting at offset %ld...\n",
911 what, (long)(s + start_shift - i_strpos)) );
914 /* Have both, check_string is floating */
915 if (t + start_shift >= check_at) /* Contradicts floating=check */
916 goto retry_floating_check;
917 /* Recheck anchored substring, but not floating... */
921 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
922 "Looking for anchored substr starting at offset %ld...\n",
923 (long)(other_last - i_strpos)) );
924 goto do_other_anchored;
926 /* Another way we could have checked stclass at the
927 current position only: */
932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
933 "Looking for /%s^%s/m starting at offset %ld...\n",
934 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
937 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
939 /* Check is floating subtring. */
940 retry_floating_check:
941 t = check_at - start_shift;
942 DEBUG_EXECUTE_r( what = "floating" );
943 goto hop_and_restart;
946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
947 "By STCLASS: moving %ld --> %ld\n",
948 (long)(t - i_strpos), (long)(s - i_strpos))
952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
953 "Does not contradict STCLASS...\n");
958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
959 PL_colors[4], (check ? "Guessed" : "Giving up"),
960 PL_colors[5], (long)(s - i_strpos)) );
963 fail_finish: /* Substring not found */
964 if (prog->check_substr || prog->check_utf8) /* could be removed already */
965 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
968 PL_colors[4], PL_colors[5]));
974 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
975 foldlen, foldbuf, uniflags) STMT_START { \
976 switch (trie_type) { \
977 case trie_utf8_fold: \
979 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
985 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
986 foldlen -= UNISKIP( uvc ); \
987 uscan = foldbuf + UNISKIP( uvc ); \
991 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
999 charid = trie->charmap[ uvc ]; \
1003 if (trie->widecharmap) { \
1004 SV** const svpp = hv_fetch(trie->widecharmap, \
1005 (char*)&uvc, sizeof(UV), 0); \
1007 charid = (U16)SvIV(*svpp); \
1012 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1015 ibcmp_utf8(s, NULL, 0, do_utf8, \
1016 m, NULL, ln, (bool)UTF)) \
1017 && (!reginfo || regtry(reginfo, s)) ) \
1020 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1021 uvchr_to_utf8(tmpbuf, c); \
1022 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1024 && (f == c1 || f == c2) \
1025 && (ln == foldlen || \
1026 !ibcmp_utf8((char *) foldbuf, \
1027 NULL, foldlen, do_utf8, \
1029 NULL, ln, (bool)UTF)) \
1030 && (!reginfo || regtry(reginfo, s)) ) \
1035 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1039 && (ln == 1 || !(OP(c) == EXACTF \
1041 : ibcmp_locale(s, m, ln))) \
1042 && (!reginfo || regtry(reginfo, s)) ) \
1048 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1050 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1056 #define REXEC_FBC_SCAN(CoDe) \
1058 while (s < strend) { \
1064 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1065 REXEC_FBC_UTF8_SCAN( \
1067 if (tmp && (!reginfo || regtry(reginfo, s))) \
1076 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1079 if (tmp && (!reginfo || regtry(reginfo, s))) \
1088 #define REXEC_FBC_TRYIT \
1089 if ((!reginfo || regtry(reginfo, s))) \
1092 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1095 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1098 REXEC_FBC_CLASS_SCAN(CoNd); \
1102 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1103 PL_reg_flags |= RF_tainted; \
1105 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1108 REXEC_FBC_CLASS_SCAN(CoNd); \
1112 #define DUMP_EXEC_POS(li,s,doutf8) \
1113 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1115 /* We know what class REx starts with. Try to find this position... */
1116 /* if reginfo is NULL, its a dryrun */
1117 /* annoyingly all the vars in this routine have different names from their counterparts
1118 in regmatch. /grrr */
1121 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1122 const char *strend, const regmatch_info *reginfo)
1125 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1129 register STRLEN uskip;
1133 register I32 tmp = 1; /* Scratch variable? */
1134 register const bool do_utf8 = PL_reg_match_utf8;
1136 /* We know what class it must start with. */
1140 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1141 !UTF8_IS_INVARIANT((U8)s[0]) ?
1142 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1143 REGINCLASS(prog, c, (U8*)s));
1146 while (s < strend) {
1149 if (REGINCLASS(prog, c, (U8*)s) ||
1150 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1151 /* The assignment of 2 is intentional:
1152 * for the folded sharp s, the skip is 2. */
1153 (skip = SHARP_S_SKIP))) {
1154 if (tmp && (!reginfo || regtry(reginfo, s)))
1167 if (tmp && (!reginfo || regtry(reginfo, s)))
1175 ln = STR_LEN(c); /* length to match in octets/bytes */
1176 lnc = (I32) ln; /* length to match in characters */
1178 STRLEN ulen1, ulen2;
1180 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1181 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1182 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1184 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1185 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1187 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1189 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1192 while (sm < ((U8 *) m + ln)) {
1207 c2 = PL_fold_locale[c1];
1209 e = HOP3c(strend, -((I32)lnc), s);
1211 if (!reginfo && e < s)
1212 e = s; /* Due to minlen logic of intuit() */
1214 /* The idea in the EXACTF* cases is to first find the
1215 * first character of the EXACTF* node and then, if
1216 * necessary, case-insensitively compare the full
1217 * text of the node. The c1 and c2 are the first
1218 * characters (though in Unicode it gets a bit
1219 * more complicated because there are more cases
1220 * than just upper and lower: one needs to use
1221 * the so-called folding case for case-insensitive
1222 * matching (called "loose matching" in Unicode).
1223 * ibcmp_utf8() will do just that. */
1227 U8 tmpbuf [UTF8_MAXBYTES+1];
1228 STRLEN len, foldlen;
1229 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1231 /* Upper and lower of 1st char are equal -
1232 * probably not a "letter". */
1234 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1236 REXEC_FBC_EXACTISH_CHECK(c == c1);
1241 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1244 /* Handle some of the three Greek sigmas cases.
1245 * Note that not all the possible combinations
1246 * are handled here: some of them are handled
1247 * by the standard folding rules, and some of
1248 * them (the character class or ANYOF cases)
1249 * are handled during compiletime in
1250 * regexec.c:S_regclass(). */
1251 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1252 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1253 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1255 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1261 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1263 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1267 PL_reg_flags |= RF_tainted;
1274 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1275 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1277 tmp = ((OP(c) == BOUND ?
1278 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1279 LOAD_UTF8_CHARCLASS_ALNUM();
1280 REXEC_FBC_UTF8_SCAN(
1281 if (tmp == !(OP(c) == BOUND ?
1282 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1283 isALNUM_LC_utf8((U8*)s)))
1291 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1292 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1295 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1301 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1305 PL_reg_flags |= RF_tainted;
1312 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1313 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1315 tmp = ((OP(c) == NBOUND ?
1316 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1317 LOAD_UTF8_CHARCLASS_ALNUM();
1318 REXEC_FBC_UTF8_SCAN(
1319 if (tmp == !(OP(c) == NBOUND ?
1320 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1321 isALNUM_LC_utf8((U8*)s)))
1323 else REXEC_FBC_TRYIT;
1327 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1328 tmp = ((OP(c) == NBOUND ?
1329 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1332 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1334 else REXEC_FBC_TRYIT;
1337 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1341 REXEC_FBC_CSCAN_PRELOAD(
1342 LOAD_UTF8_CHARCLASS_ALNUM(),
1343 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1347 REXEC_FBC_CSCAN_TAINT(
1348 isALNUM_LC_utf8((U8*)s),
1352 REXEC_FBC_CSCAN_PRELOAD(
1353 LOAD_UTF8_CHARCLASS_ALNUM(),
1354 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1358 REXEC_FBC_CSCAN_TAINT(
1359 !isALNUM_LC_utf8((U8*)s),
1363 REXEC_FBC_CSCAN_PRELOAD(
1364 LOAD_UTF8_CHARCLASS_SPACE(),
1365 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1369 REXEC_FBC_CSCAN_TAINT(
1370 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1374 REXEC_FBC_CSCAN_PRELOAD(
1375 LOAD_UTF8_CHARCLASS_SPACE(),
1376 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1380 REXEC_FBC_CSCAN_TAINT(
1381 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1385 REXEC_FBC_CSCAN_PRELOAD(
1386 LOAD_UTF8_CHARCLASS_DIGIT(),
1387 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1391 REXEC_FBC_CSCAN_TAINT(
1392 isDIGIT_LC_utf8((U8*)s),
1396 REXEC_FBC_CSCAN_PRELOAD(
1397 LOAD_UTF8_CHARCLASS_DIGIT(),
1398 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1402 REXEC_FBC_CSCAN_TAINT(
1403 !isDIGIT_LC_utf8((U8*)s),
1409 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1410 trie_type = do_utf8 ?
1411 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1413 /* what trie are we using right now */
1415 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1416 reg_trie_data *trie=aho->trie;
1418 const char *last_start = strend - trie->minlen;
1420 const char *real_start = s;
1422 STRLEN maxlen = trie->maxlen;
1424 U8 **points; /* map of where we were in the input string
1425 when reading a given char. For ASCII this
1426 is unnecessary overhead as the relationship
1427 is always 1:1, but for unicode, especially
1428 case folded unicode this is not true. */
1429 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1433 GET_RE_DEBUG_FLAGS_DECL;
1435 /* We can't just allocate points here. We need to wrap it in
1436 * an SV so it gets freed properly if there is a croak while
1437 * running the match */
1440 sv_points=newSV(maxlen * sizeof(U8 *));
1441 SvCUR_set(sv_points,
1442 maxlen * sizeof(U8 *));
1443 SvPOK_on(sv_points);
1444 sv_2mortal(sv_points);
1445 points=(U8**)SvPV_nolen(sv_points );
1446 if ( trie_type != trie_utf8_fold
1447 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1450 bitmap=(U8*)trie->bitmap;
1452 bitmap=(U8*)ANYOF_BITMAP(c);
1454 /* this is the Aho-Corasick algorithm modified a touch
1455 to include special handling for long "unknown char"
1456 sequences. The basic idea being that we use AC as long
1457 as we are dealing with a possible matching char, when
1458 we encounter an unknown char (and we have not encountered
1459 an accepting state) we scan forward until we find a legal
1461 AC matching is basically that of trie matching, except
1462 that when we encounter a failing transition, we fall back
1463 to the current states "fail state", and try the current char
1464 again, a process we repeat until we reach the root state,
1465 state 1, or a legal transition. If we fail on the root state
1466 then we can either terminate if we have reached an accepting
1467 state previously, or restart the entire process from the beginning
1471 while (s <= last_start) {
1472 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1480 U8 *uscan = (U8*)NULL;
1481 U8 *leftmost = NULL;
1483 U32 accepted_word= 0;
1487 while ( state && uc <= (U8*)strend ) {
1489 U32 word = aho->states[ state ].wordnum;
1493 DEBUG_TRIE_EXECUTE_r(
1494 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1495 dump_exec_pos( (char *)uc, c, strend, real_start,
1496 (char *)uc, do_utf8 );
1497 PerlIO_printf( Perl_debug_log,
1498 " Scanning for legal start char...\n");
1501 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1506 if (uc >(U8*)last_start) break;
1510 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1511 if (!leftmost || lpos < leftmost) {
1512 DEBUG_r(accepted_word=word);
1518 points[pointpos++ % maxlen]= uc;
1519 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1520 uvc, charid, foldlen, foldbuf, uniflags);
1521 DEBUG_TRIE_EXECUTE_r({
1522 dump_exec_pos( (char *)uc, c, strend, real_start,
1524 PerlIO_printf(Perl_debug_log,
1525 " Charid:%3u CP:%4"UVxf" ",
1531 word = aho->states[ state ].wordnum;
1533 base = aho->states[ state ].trans.base;
1535 DEBUG_TRIE_EXECUTE_r({
1537 dump_exec_pos( (char *)uc, c, strend, real_start,
1539 PerlIO_printf( Perl_debug_log,
1540 "%sState: %4"UVxf", word=%"UVxf,
1541 failed ? " Fail transition to " : "",
1542 (UV)state, (UV)word);
1547 (base + charid > trie->uniquecharcount )
1548 && (base + charid - 1 - trie->uniquecharcount
1550 && trie->trans[base + charid - 1 -
1551 trie->uniquecharcount].check == state
1552 && (tmp=trie->trans[base + charid - 1 -
1553 trie->uniquecharcount ].next))
1555 DEBUG_TRIE_EXECUTE_r(
1556 PerlIO_printf( Perl_debug_log," - legal\n"));
1561 DEBUG_TRIE_EXECUTE_r(
1562 PerlIO_printf( Perl_debug_log," - fail\n"));
1564 state = aho->fail[state];
1568 /* we must be accepting here */
1569 DEBUG_TRIE_EXECUTE_r(
1570 PerlIO_printf( Perl_debug_log," - accepting\n"));
1579 if (!state) state = 1;
1582 if ( aho->states[ state ].wordnum ) {
1583 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1584 if (!leftmost || lpos < leftmost) {
1585 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1590 s = (char*)leftmost;
1591 DEBUG_TRIE_EXECUTE_r({
1593 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1594 (UV)accepted_word, s - real_start
1597 if (!reginfo || regtry(reginfo, s)) {
1603 DEBUG_TRIE_EXECUTE_r({
1604 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1607 DEBUG_TRIE_EXECUTE_r(
1608 PerlIO_printf( Perl_debug_log,"No match.\n"));
1617 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1626 - regexec_flags - match a regexp against a string
1629 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1630 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1631 /* strend: pointer to null at end of string */
1632 /* strbeg: real beginning of string */
1633 /* minend: end of match must be >=minend after stringarg. */
1634 /* data: May be used for some additional optimizations. */
1635 /* nosave: For optimizations. */
1639 register regnode *c;
1640 register char *startpos = stringarg;
1641 I32 minlen; /* must match at least this many chars */
1642 I32 dontbother = 0; /* how many characters not to try at end */
1643 I32 end_shift = 0; /* Same for the end. */ /* CC */
1644 I32 scream_pos = -1; /* Internal iterator of scream. */
1645 char *scream_olds = NULL;
1646 SV* const oreplsv = GvSV(PL_replgv);
1647 const bool do_utf8 = (bool)DO_UTF8(sv);
1650 regmatch_info reginfo; /* create some info to pass to regtry etc */
1652 GET_RE_DEBUG_FLAGS_DECL;
1654 PERL_UNUSED_ARG(data);
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1662 multiline = prog->reganch & PMf_MULTILINE;
1663 reginfo.prog = prog;
1665 RX_MATCH_UTF8_set(prog, do_utf8);
1667 debug_start_match(prog, do_utf8, startpos, strend,
1671 minlen = prog->minlen;
1673 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1675 "String too short [regexec_flags]...\n"));
1680 /* Check validity of program. */
1681 if (UCHARAT(prog->program) != REG_MAGIC) {
1682 Perl_croak(aTHX_ "corrupted regexp program");
1686 PL_reg_eval_set = 0;
1689 if (prog->reganch & ROPT_UTF8)
1690 PL_reg_flags |= RF_utf8;
1692 /* Mark beginning of line for ^ and lookbehind. */
1693 reginfo.bol = startpos; /* XXX not used ??? */
1697 /* Mark end of line for $ (and such) */
1700 /* see how far we have to get to not match where we matched before */
1701 reginfo.till = startpos+minend;
1703 /* If there is a "must appear" string, look for it. */
1706 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1709 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1710 reginfo.ganch = startpos;
1711 else if (sv && SvTYPE(sv) >= SVt_PVMG
1713 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1714 && mg->mg_len >= 0) {
1715 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1716 if (prog->reganch & ROPT_ANCH_GPOS) {
1717 if (s > reginfo.ganch)
1722 else /* pos() not defined */
1723 reginfo.ganch = strbeg;
1726 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1727 re_scream_pos_data d;
1729 d.scream_olds = &scream_olds;
1730 d.scream_pos = &scream_pos;
1731 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1734 goto phooey; /* not present */
1740 /* Simplest case: anchored match need be tried only once. */
1741 /* [unless only anchor is BOL and multiline is set] */
1742 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1743 if (s == startpos && regtry(®info, startpos))
1745 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1746 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1751 dontbother = minlen - 1;
1752 end = HOP3c(strend, -dontbother, strbeg) - 1;
1753 /* for multiline we only have to try after newlines */
1754 if (prog->check_substr || prog->check_utf8) {
1758 if (regtry(®info, s))
1763 if (prog->reganch & RE_USE_INTUIT) {
1764 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1775 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1776 if (regtry(®info, s))
1783 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1785 /* the warning about reginfo.ganch being used without intialization
1786 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1787 and we only enter this block when the same bit is set. */
1788 if (regtry(®info, reginfo.ganch))
1793 /* Messy cases: unanchored match. */
1794 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1795 /* we have /x+whatever/ */
1796 /* it must be a one character string (XXXX Except UTF?) */
1801 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1802 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1803 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1808 DEBUG_EXECUTE_r( did_match = 1 );
1809 if (regtry(®info, s)) goto got_it;
1811 while (s < strend && *s == ch)
1819 DEBUG_EXECUTE_r( did_match = 1 );
1820 if (regtry(®info, s)) goto got_it;
1822 while (s < strend && *s == ch)
1827 DEBUG_EXECUTE_r(if (!did_match)
1828 PerlIO_printf(Perl_debug_log,
1829 "Did not find anchored character...\n")
1832 else if (prog->anchored_substr != NULL
1833 || prog->anchored_utf8 != NULL
1834 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1835 && prog->float_max_offset < strend - s)) {
1840 char *last1; /* Last position checked before */
1844 if (prog->anchored_substr || prog->anchored_utf8) {
1845 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1846 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1847 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1848 back_max = back_min = prog->anchored_offset;
1850 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1851 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1852 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1853 back_max = prog->float_max_offset;
1854 back_min = prog->float_min_offset;
1858 if (must == &PL_sv_undef)
1859 /* could not downgrade utf8 check substring, so must fail */
1865 last = HOP3c(strend, /* Cannot start after this */
1866 -(I32)(CHR_SVLEN(must)
1867 - (SvTAIL(must) != 0) + back_min), strbeg);
1870 last1 = HOPc(s, -1);
1872 last1 = s - 1; /* bogus */
1874 /* XXXX check_substr already used to find "s", can optimize if
1875 check_substr==must. */
1877 dontbother = end_shift;
1878 strend = HOPc(strend, -dontbother);
1879 while ( (s <= last) &&
1880 ((flags & REXEC_SCREAM)
1881 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1882 end_shift, &scream_pos, 0))
1883 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1884 (unsigned char*)strend, must,
1885 multiline ? FBMrf_MULTILINE : 0))) ) {
1886 /* we may be pointing at the wrong string */
1887 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1888 s = strbeg + (s - SvPVX_const(sv));
1889 DEBUG_EXECUTE_r( did_match = 1 );
1890 if (HOPc(s, -back_max) > last1) {
1891 last1 = HOPc(s, -back_min);
1892 s = HOPc(s, -back_max);
1895 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1897 last1 = HOPc(s, -back_min);
1901 while (s <= last1) {
1902 if (regtry(®info, s))
1908 while (s <= last1) {
1909 if (regtry(®info, s))
1915 DEBUG_EXECUTE_r(if (!did_match) {
1916 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1917 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1918 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1919 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1920 ? "anchored" : "floating"),
1921 quoted, RE_SV_TAIL(must));
1925 else if ( (c = prog->regstclass) ) {
1927 const OPCODE op = OP(prog->regstclass);
1928 /* don't bother with what can't match */
1929 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1930 strend = HOPc(strend, -(minlen - 1));
1933 SV * const prop = sv_newmortal();
1934 regprop(prog, prop, c);
1936 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1938 PerlIO_printf(Perl_debug_log,
1939 "Matching stclass %.*s against %s (%d chars)\n",
1940 (int)SvCUR(prop), SvPVX_const(prop),
1941 quoted, (int)(strend - s));
1944 if (find_byclass(prog, c, s, strend, ®info))
1946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1950 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1955 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1956 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1957 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1959 if (flags & REXEC_SCREAM) {
1960 last = screaminstr(sv, float_real, s - strbeg,
1961 end_shift, &scream_pos, 1); /* last one */
1963 last = scream_olds; /* Only one occurrence. */
1964 /* we may be pointing at the wrong string */
1965 else if (RX_MATCH_COPIED(prog))
1966 s = strbeg + (s - SvPVX_const(sv));
1970 const char * const little = SvPV_const(float_real, len);
1972 if (SvTAIL(float_real)) {
1973 if (memEQ(strend - len + 1, little, len - 1))
1974 last = strend - len + 1;
1975 else if (!multiline)
1976 last = memEQ(strend - len, little, len)
1977 ? strend - len : NULL;
1983 last = rninstr(s, strend, little, little + len);
1985 last = strend; /* matching "$" */
1990 PerlIO_printf(Perl_debug_log,
1991 "%sCan't trim the tail, match fails (should not happen)%s\n",
1992 PL_colors[4], PL_colors[5]));
1993 goto phooey; /* Should not happen! */
1995 dontbother = strend - last + prog->float_min_offset;
1997 if (minlen && (dontbother < minlen))
1998 dontbother = minlen - 1;
1999 strend -= dontbother; /* this one's always in bytes! */
2000 /* We don't know much -- general case. */
2003 if (regtry(®info, s))
2012 if (regtry(®info, s))
2014 } while (s++ < strend);
2022 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2024 if (PL_reg_eval_set) {
2025 /* Preserve the current value of $^R */
2026 if (oreplsv != GvSV(PL_replgv))
2027 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2028 restored, the value remains
2030 restore_pos(aTHX_ prog);
2032 if (prog->paren_names)
2033 (void)hv_iterinit(prog->paren_names);
2035 /* make sure $`, $&, $', and $digit will work later */
2036 if ( !(flags & REXEC_NOT_FIRST) ) {
2037 RX_MATCH_COPY_FREE(prog);
2038 if (flags & REXEC_COPY_STR) {
2039 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2040 #ifdef PERL_OLD_COPY_ON_WRITE
2042 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2044 PerlIO_printf(Perl_debug_log,
2045 "Copy on write: regexp capture, type %d\n",
2048 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2049 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2050 assert (SvPOKp(prog->saved_copy));
2054 RX_MATCH_COPIED_on(prog);
2055 s = savepvn(strbeg, i);
2061 prog->subbeg = strbeg;
2062 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2069 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2070 PL_colors[4], PL_colors[5]));
2071 if (PL_reg_eval_set)
2072 restore_pos(aTHX_ prog);
2078 - regtry - try match at specific point
2080 STATIC I32 /* 0 failure, 1 success */
2081 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2087 regexp *prog = reginfo->prog;
2088 GET_RE_DEBUG_FLAGS_DECL;
2090 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2093 PL_reg_eval_set = RS_init;
2094 DEBUG_EXECUTE_r(DEBUG_s(
2095 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2096 (IV)(PL_stack_sp - PL_stack_base));
2099 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2100 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2102 /* Apparently this is not needed, judging by wantarray. */
2103 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2104 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2107 /* Make $_ available to executed code. */
2108 if (reginfo->sv != DEFSV) {
2110 DEFSV = reginfo->sv;
2113 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2114 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2115 /* prepare for quick setting of pos */
2116 #ifdef PERL_OLD_COPY_ON_WRITE
2118 sv_force_normal_flags(sv, 0);
2120 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2121 &PL_vtbl_mglob, NULL, 0);
2125 PL_reg_oldpos = mg->mg_len;
2126 SAVEDESTRUCTOR_X(restore_pos, prog);
2128 if (!PL_reg_curpm) {
2129 Newxz(PL_reg_curpm, 1, PMOP);
2132 SV* const repointer = newSViv(0);
2133 /* so we know which PL_regex_padav element is PL_reg_curpm */
2134 SvFLAGS(repointer) |= SVf_BREAK;
2135 av_push(PL_regex_padav,repointer);
2136 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2137 PL_regex_pad = AvARRAY(PL_regex_padav);
2141 PM_SETRE(PL_reg_curpm, prog);
2142 PL_reg_oldcurpm = PL_curpm;
2143 PL_curpm = PL_reg_curpm;
2144 if (RX_MATCH_COPIED(prog)) {
2145 /* Here is a serious problem: we cannot rewrite subbeg,
2146 since it may be needed if this match fails. Thus
2147 $` inside (?{}) could fail... */
2148 PL_reg_oldsaved = prog->subbeg;
2149 PL_reg_oldsavedlen = prog->sublen;
2150 #ifdef PERL_OLD_COPY_ON_WRITE
2151 PL_nrs = prog->saved_copy;
2153 RX_MATCH_COPIED_off(prog);
2156 PL_reg_oldsaved = NULL;
2157 prog->subbeg = PL_bostr;
2158 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2160 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2161 prog->startp[0] = startpos - PL_bostr;
2162 PL_reginput = startpos;
2163 PL_reglastparen = &prog->lastparen;
2164 PL_reglastcloseparen = &prog->lastcloseparen;
2165 prog->lastparen = 0;
2166 prog->lastcloseparen = 0;
2168 PL_regstartp = prog->startp;
2169 PL_regendp = prog->endp;
2170 if (PL_reg_start_tmpl <= prog->nparens) {
2171 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2172 if(PL_reg_start_tmp)
2173 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2175 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2178 /* XXXX What this code is doing here?!!! There should be no need
2179 to do this again and again, PL_reglastparen should take care of
2182 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2183 * Actually, the code in regcppop() (which Ilya may be meaning by
2184 * PL_reglastparen), is not needed at all by the test suite
2185 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2186 * enough, for building DynaLoader, or otherwise this
2187 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2188 * will happen. Meanwhile, this code *is* needed for the
2189 * above-mentioned test suite tests to succeed. The common theme
2190 * on those tests seems to be returning null fields from matches.
2195 if (prog->nparens) {
2197 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2204 if (regmatch(reginfo, prog->program + 1)) {
2205 prog->endp[0] = PL_reginput - PL_bostr;
2208 REGCP_UNWIND(lastcp);
2213 #define sayYES goto yes
2214 #define sayNO goto no
2215 #define sayNO_SILENT goto no_silent
2217 /* we dont use STMT_START/END here because it leads to
2218 "unreachable code" warnings, which are bogus, but distracting. */
2219 #define CACHEsayNO \
2220 if (ST.cache_mask) \
2221 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2224 /* this is used to determine how far from the left messages like
2225 'failed...' are printed. It should be set such that messages
2226 are inline with the regop output that created them.
2228 #define REPORT_CODE_OFF 32
2231 /* Make sure there is a test for this +1 options in re_tests */
2232 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2234 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2235 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2237 #define SLAB_FIRST(s) (&(s)->states[0])
2238 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2240 /* grab a new slab and return the first slot in it */
2242 STATIC regmatch_state *
2245 #if PERL_VERSION < 9
2248 regmatch_slab *s = PL_regmatch_slab->next;
2250 Newx(s, 1, regmatch_slab);
2251 s->prev = PL_regmatch_slab;
2253 PL_regmatch_slab->next = s;
2255 PL_regmatch_slab = s;
2256 return SLAB_FIRST(s);
2260 /* push a new state then goto it */
2262 #define PUSH_STATE_GOTO(state, node) \
2264 st->resume_state = state; \
2267 /* push a new state with success backtracking, then goto it */
2269 #define PUSH_YES_STATE_GOTO(state, node) \
2271 st->resume_state = state; \
2272 goto push_yes_state;
2278 regmatch() - main matching routine
2280 This is basically one big switch statement in a loop. We execute an op,
2281 set 'next' to point the next op, and continue. If we come to a point which
2282 we may need to backtrack to on failure such as (A|B|C), we push a
2283 backtrack state onto the backtrack stack. On failure, we pop the top
2284 state, and re-enter the loop at the state indicated. If there are no more
2285 states to pop, we return failure.
2287 Sometimes we also need to backtrack on success; for example /A+/, where
2288 after successfully matching one A, we need to go back and try to
2289 match another one; similarly for lookahead assertions: if the assertion
2290 completes successfully, we backtrack to the state just before the assertion
2291 and then carry on. In these cases, the pushed state is marked as
2292 'backtrack on success too'. This marking is in fact done by a chain of
2293 pointers, each pointing to the previous 'yes' state. On success, we pop to
2294 the nearest yes state, discarding any intermediate failure-only states.
2295 Sometimes a yes state is pushed just to force some cleanup code to be
2296 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2297 it to free the inner regex.
2299 Note that failure backtracking rewinds the cursor position, while
2300 success backtracking leaves it alone.
2302 A pattern is complete when the END op is executed, while a subpattern
2303 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2304 ops trigger the "pop to last yes state if any, otherwise return true"
2307 A common convention in this function is to use A and B to refer to the two
2308 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2309 the subpattern to be matched possibly multiple times, while B is the entire
2310 rest of the pattern. Variable and state names reflect this convention.
2312 The states in the main switch are the union of ops and failure/success of
2313 substates associated with with that op. For example, IFMATCH is the op
2314 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2315 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2316 successfully matched A and IFMATCH_A_fail is a state saying that we have
2317 just failed to match A. Resume states always come in pairs. The backtrack
2318 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2319 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2320 on success or failure.
2322 The struct that holds a backtracking state is actually a big union, with
2323 one variant for each major type of op. The variable st points to the
2324 top-most backtrack struct. To make the code clearer, within each
2325 block of code we #define ST to alias the relevant union.
2327 Here's a concrete example of a (vastly oversimplified) IFMATCH
2333 #define ST st->u.ifmatch
2335 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2336 ST.foo = ...; // some state we wish to save
2338 // push a yes backtrack state with a resume value of
2339 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2341 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2344 case IFMATCH_A: // we have successfully executed A; now continue with B
2346 bar = ST.foo; // do something with the preserved value
2349 case IFMATCH_A_fail: // A failed, so the assertion failed
2350 ...; // do some housekeeping, then ...
2351 sayNO; // propagate the failure
2358 For any old-timers reading this who are familiar with the old recursive
2359 approach, the code above is equivalent to:
2361 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2370 ...; // do some housekeeping, then ...
2371 sayNO; // propagate the failure
2374 The topmost backtrack state, pointed to by st, is usually free. If you
2375 want to claim it, populate any ST.foo fields in it with values you wish to
2376 save, then do one of
2378 PUSH_STATE_GOTO(resume_state, node);
2379 PUSH_YES_STATE_GOTO(resume_state, node);
2381 which sets that backtrack state's resume value to 'resume_state', pushes a
2382 new free entry to the top of the backtrack stack, then goes to 'node'.
2383 On backtracking, the free slot is popped, and the saved state becomes the
2384 new free state. An ST.foo field in this new top state can be temporarily
2385 accessed to retrieve values, but once the main loop is re-entered, it
2386 becomes available for reuse.
2388 Note that the depth of the backtrack stack constantly increases during the
2389 left-to-right execution of the pattern, rather than going up and down with
2390 the pattern nesting. For example the stack is at its maximum at Z at the
2391 end of the pattern, rather than at X in the following:
2393 /(((X)+)+)+....(Y)+....Z/
2395 The only exceptions to this are lookahead/behind assertions and the cut,
2396 (?>A), which pop all the backtrack states associated with A before
2399 Bascktrack state structs are allocated in slabs of about 4K in size.
2400 PL_regmatch_state and st always point to the currently active state,
2401 and PL_regmatch_slab points to the slab currently containing
2402 PL_regmatch_state. The first time regmatch() is called, the first slab is
2403 allocated, and is never freed until interpreter destruction. When the slab
2404 is full, a new one is allocated and chained to the end. At exit from
2405 regmatch(), slabs allocated since entry are freed.
2410 #define DEBUG_STATE_pp(pp) \
2412 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2413 PerlIO_printf(Perl_debug_log, \
2416 reg_name[st->resume_state] ); \
2420 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2425 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2426 const char *start, const char *end, const char *blurb)
2428 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2432 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2433 prog->precomp, prog->prelen, 60);
2435 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2436 start, end - start, 60);
2438 PerlIO_printf(Perl_debug_log,
2439 "%s%s REx%s %s against %s\n",
2440 PL_colors[4], blurb, PL_colors[5], s0, s1);
2442 if (do_utf8||utf8_pat)
2443 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2444 utf8_pat ? "pattern" : "",
2445 utf8_pat && do_utf8 ? " and " : "",
2446 do_utf8 ? "string" : ""
2452 S_dump_exec_pos(pTHX_ const char *locinput,
2453 const regnode *scan,
2454 const char *loc_regeol,
2455 const char *loc_bostr,
2456 const char *loc_reg_starttry,
2459 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2460 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2461 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2462 /* The part of the string before starttry has one color
2463 (pref0_len chars), between starttry and current
2464 position another one (pref_len - pref0_len chars),
2465 after the current position the third one.
2466 We assume that pref0_len <= pref_len, otherwise we
2467 decrease pref0_len. */
2468 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2469 ? (5 + taill) - l : locinput - loc_bostr;
2472 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2474 pref0_len = pref_len - (locinput - loc_reg_starttry);
2475 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2476 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2477 ? (5 + taill) - pref_len : loc_regeol - locinput);
2478 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2482 if (pref0_len > pref_len)
2483 pref0_len = pref_len;
2485 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2487 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2488 (locinput - pref_len),pref0_len, 60, 4, 5);
2490 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2491 (locinput - pref_len + pref0_len),
2492 pref_len - pref0_len, 60, 2, 3);
2494 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2495 locinput, loc_regeol - locinput, 10, 0, 1);
2497 const STRLEN tlen=len0+len1+len2;
2498 PerlIO_printf(Perl_debug_log,
2499 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2500 (IV)(locinput - loc_bostr),
2503 (docolor ? "" : "> <"),
2505 (int)(tlen > 19 ? 0 : 19 - tlen),
2512 /* reg_check_named_buff_matched()
2513 * Checks to see if a named buffer has matched. The data array of
2514 * buffer numbers corresponding to the buffer is expected to reside
2515 * in the regexp->data->data array in the slot stored in the ARG() of
2516 * node involved. Note that this routine doesn't actually care about the
2517 * name, that information is not preserved from compilation to execution.
2518 * Returns the index of the leftmost defined buffer with the given name
2519 * or 0 if non of the buffers matched.
2522 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2524 SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];
2525 I32 *nums=(I32*)SvPVX(sv_dat);
2526 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2527 if ((I32)*PL_reglastparen >= nums[n] &&
2528 PL_regendp[nums[n]] != -1)
2536 STATIC I32 /* 0 failure, 1 success */
2537 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2539 #if PERL_VERSION < 9
2543 register const bool do_utf8 = PL_reg_match_utf8;
2544 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2546 regexp *rex = reginfo->prog;
2548 regmatch_slab *orig_slab;
2549 regmatch_state *orig_state;
2551 /* the current state. This is a cached copy of PL_regmatch_state */
2552 register regmatch_state *st;
2554 /* cache heavy used fields of st in registers */
2555 register regnode *scan;
2556 register regnode *next;
2557 register I32 n = 0; /* general value; init to avoid compiler warning */
2558 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2559 register char *locinput = PL_reginput;
2560 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2562 bool result = 0; /* return value of S_regmatch */
2563 int depth = 0; /* depth of backtrack stack */
2564 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
2565 regmatch_state *yes_state = NULL; /* state to pop to on success of
2567 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2568 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2571 /* these three flags are set by various ops to signal information to
2572 * the very next op. They have a useful lifetime of exactly one loop
2573 * iteration, and are not preserved or restored by state pushes/pops
2575 bool sw = 0; /* the condition value in (?(cond)a|b) */
2576 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2577 int logical = 0; /* the following EVAL is:
2581 or the following IFMATCH/UNLESSM is:
2582 false: plain (?=foo)
2583 true: used as a condition: (?(?=foo))
2587 GET_RE_DEBUG_FLAGS_DECL;
2590 /* on first ever call to regmatch, allocate first slab */
2591 if (!PL_regmatch_slab) {
2592 Newx(PL_regmatch_slab, 1, regmatch_slab);
2593 PL_regmatch_slab->prev = NULL;
2594 PL_regmatch_slab->next = NULL;
2595 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2598 /* remember current high-water mark for exit */
2599 /* XXX this should be done with SAVE* instead */
2600 orig_slab = PL_regmatch_slab;
2601 orig_state = PL_regmatch_state;
2603 /* grab next free state slot */
2604 st = ++PL_regmatch_state;
2605 if (st > SLAB_LAST(PL_regmatch_slab))
2606 st = PL_regmatch_state = S_push_slab(aTHX);
2608 /* Note that nextchr is a byte even in UTF */
2609 nextchr = UCHARAT(locinput);
2611 while (scan != NULL) {
2614 SV * const prop = sv_newmortal();
2615 regnode *rnext=regnext(scan);
2616 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2617 regprop(rex, prop, scan);
2619 PerlIO_printf(Perl_debug_log,
2620 "%3"IVdf":%*s%s(%"IVdf")\n",
2621 (IV)(scan - rex->program), depth*2, "",
2623 (PL_regkind[OP(scan)] == END || !rnext) ?
2624 0 : (IV)(rnext - rex->program));
2627 next = scan + NEXT_OFF(scan);
2630 state_num = OP(scan);
2633 switch (state_num) {
2635 if (locinput == PL_bostr)
2637 /* reginfo->till = reginfo->bol; */
2642 if (locinput == PL_bostr ||
2643 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2649 if (locinput == PL_bostr)
2653 if (locinput == reginfo->ganch)
2659 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2664 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2666 if (PL_regeol - locinput > 1)
2670 if (PL_regeol != locinput)
2674 if (!nextchr && locinput >= PL_regeol)
2677 locinput += PL_utf8skip[nextchr];
2678 if (locinput > PL_regeol)
2680 nextchr = UCHARAT(locinput);
2683 nextchr = UCHARAT(++locinput);
2686 if (!nextchr && locinput >= PL_regeol)
2688 nextchr = UCHARAT(++locinput);
2691 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2694 locinput += PL_utf8skip[nextchr];
2695 if (locinput > PL_regeol)
2697 nextchr = UCHARAT(locinput);
2700 nextchr = UCHARAT(++locinput);
2704 #define ST st->u.trie
2706 /* In this case the charclass data is available inline so
2707 we can fail fast without a lot of extra overhead.
2709 if (scan->flags == EXACT || !do_utf8) {
2710 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2712 PerlIO_printf(Perl_debug_log,
2713 "%*s %sfailed to match trie start class...%s\n",
2714 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2723 /* what type of TRIE am I? (utf8 makes this contextual) */
2724 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2725 trie_type = do_utf8 ?
2726 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2729 /* what trie are we using right now */
2730 reg_trie_data * const trie
2731 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2732 U32 state = trie->startstate;
2734 if (trie->bitmap && trie_type != trie_utf8_fold &&
2735 !TRIE_BITMAP_TEST(trie,*locinput)
2737 if (trie->states[ state ].wordnum) {
2739 PerlIO_printf(Perl_debug_log,
2740 "%*s %smatched empty string...%s\n",
2741 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2746 PerlIO_printf(Perl_debug_log,
2747 "%*s %sfailed to match trie start class...%s\n",
2748 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2755 U8 *uc = ( U8* )locinput;
2759 U8 *uscan = (U8*)NULL;
2761 SV *sv_accept_buff = NULL;
2762 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2764 ST.accepted = 0; /* how many accepting states we have seen */
2766 ST.jump = trie->jump;
2770 traverse the TRIE keeping track of all accepting states
2771 we transition through until we get to a failing node.
2774 while ( state && uc <= (U8*)PL_regeol ) {
2775 U32 base = trie->states[ state ].trans.base;
2778 /* We use charid to hold the wordnum as we don't use it
2779 for charid until after we have done the wordnum logic.
2780 We define an alias just so that the wordnum logic reads
2783 #define got_wordnum charid
2784 got_wordnum = trie->states[ state ].wordnum;
2786 if ( got_wordnum ) {
2787 if ( ! ST.accepted ) {
2790 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2791 sv_accept_buff=newSV(bufflen *
2792 sizeof(reg_trie_accepted) - 1);
2793 SvCUR_set(sv_accept_buff, 0);
2794 SvPOK_on(sv_accept_buff);
2795 sv_2mortal(sv_accept_buff);
2798 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2801 if (ST.accepted >= bufflen) {
2803 ST.accept_buff =(reg_trie_accepted*)
2804 SvGROW(sv_accept_buff,
2805 bufflen * sizeof(reg_trie_accepted));
2807 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2808 + sizeof(reg_trie_accepted));
2811 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2812 ST.accept_buff[ST.accepted].endpos = uc;
2814 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2818 DEBUG_TRIE_EXECUTE_r({
2819 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2820 PerlIO_printf( Perl_debug_log,
2821 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2822 2+depth * 2, "", PL_colors[4],
2823 (UV)state, (UV)ST.accepted );
2827 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2828 uvc, charid, foldlen, foldbuf, uniflags);
2831 (base + charid > trie->uniquecharcount )
2832 && (base + charid - 1 - trie->uniquecharcount
2834 && trie->trans[base + charid - 1 -
2835 trie->uniquecharcount].check == state)
2837 state = trie->trans[base + charid - 1 -
2838 trie->uniquecharcount ].next;
2849 DEBUG_TRIE_EXECUTE_r(
2850 PerlIO_printf( Perl_debug_log,
2851 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2852 charid, uvc, (UV)state, PL_colors[5] );
2859 PerlIO_printf( Perl_debug_log,
2860 "%*s %sgot %"IVdf" possible matches%s\n",
2861 REPORT_CODE_OFF + depth * 2, "",
2862 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2868 case TRIE_next_fail: /* we failed - try next alterative */
2870 if ( ST.accepted == 1 ) {
2871 /* only one choice left - just continue */
2873 reg_trie_data * const trie
2874 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2875 SV ** const tmp = RX_DEBUG(reginfo->prog)
2876 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2878 PerlIO_printf( Perl_debug_log,
2879 "%*s %sonly one match left: #%d <%s>%s\n",
2880 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2881 ST.accept_buff[ 0 ].wordnum,
2882 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2885 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2886 /* in this case we free tmps/leave before we call regmatch
2887 as we wont be using accept_buff again. */
2890 locinput = PL_reginput;
2891 nextchr = UCHARAT(locinput);
2893 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
2896 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
2898 continue; /* execute rest of RE */
2901 if (!ST.accepted-- ) {
2908 There are at least two accepting states left. Presumably
2909 the number of accepting states is going to be low,
2910 typically two. So we simply scan through to find the one
2911 with lowest wordnum. Once we find it, we swap the last
2912 state into its place and decrement the size. We then try to
2913 match the rest of the pattern at the point where the word
2914 ends. If we succeed, control just continues along the
2915 regex; if we fail we return here to try the next accepting
2922 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2923 DEBUG_TRIE_EXECUTE_r(
2924 PerlIO_printf( Perl_debug_log,
2925 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2926 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
2927 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2928 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2931 if (ST.accept_buff[cur].wordnum <
2932 ST.accept_buff[best].wordnum)
2937 reg_trie_data * const trie
2938 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2939 SV ** const tmp = RX_DEBUG(reginfo->prog)
2940 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2942 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
2944 ST.me + ST.jump[ST.accept_buff[best].wordnum];
2945 PerlIO_printf( Perl_debug_log,
2946 "%*s %strying alternation #%d <%s> at node #%d %s\n",
2947 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2948 ST.accept_buff[best].wordnum,
2949 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2950 REG_NODE_NUM(nextop),
2954 if ( best<ST.accepted ) {
2955 reg_trie_accepted tmp = ST.accept_buff[ best ];
2956 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2957 ST.accept_buff[ ST.accepted ] = tmp;
2960 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2961 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
2962 PUSH_STATE_GOTO(TRIE_next, ST.B);
2965 PUSH_STATE_GOTO(TRIE_next, ST.me + ST.jump[ST.accept_buff[best].wordnum]);
2975 char *s = STRING(scan);
2977 if (do_utf8 != UTF) {
2978 /* The target and the pattern have differing utf8ness. */
2980 const char * const e = s + ln;
2983 /* The target is utf8, the pattern is not utf8. */
2988 if (NATIVE_TO_UNI(*(U8*)s) !=
2989 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2997 /* The target is not utf8, the pattern is utf8. */
3002 if (NATIVE_TO_UNI(*((U8*)l)) !=
3003 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3011 nextchr = UCHARAT(locinput);
3014 /* The target and the pattern have the same utf8ness. */
3015 /* Inline the first character, for speed. */
3016 if (UCHARAT(s) != nextchr)
3018 if (PL_regeol - locinput < ln)
3020 if (ln > 1 && memNE(s, locinput, ln))
3023 nextchr = UCHARAT(locinput);
3027 PL_reg_flags |= RF_tainted;
3030 char * const s = STRING(scan);
3033 if (do_utf8 || UTF) {
3034 /* Either target or the pattern are utf8. */
3035 const char * const l = locinput;
3036 char *e = PL_regeol;
3038 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3039 l, &e, 0, do_utf8)) {
3040 /* One more case for the sharp s:
3041 * pack("U0U*", 0xDF) =~ /ss/i,
3042 * the 0xC3 0x9F are the UTF-8
3043 * byte sequence for the U+00DF. */
3045 toLOWER(s[0]) == 's' &&
3047 toLOWER(s[1]) == 's' &&
3054 nextchr = UCHARAT(locinput);
3058 /* Neither the target and the pattern are utf8. */
3060 /* Inline the first character, for speed. */
3061 if (UCHARAT(s) != nextchr &&
3062 UCHARAT(s) != ((OP(scan) == EXACTF)
3063 ? PL_fold : PL_fold_locale)[nextchr])
3065 if (PL_regeol - locinput < ln)
3067 if (ln > 1 && (OP(scan) == EXACTF
3068 ? ibcmp(s, locinput, ln)
3069 : ibcmp_locale(s, locinput, ln)))
3072 nextchr = UCHARAT(locinput);
3077 STRLEN inclasslen = PL_regeol - locinput;
3079 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3081 if (locinput >= PL_regeol)
3083 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3084 nextchr = UCHARAT(locinput);
3089 nextchr = UCHARAT(locinput);
3090 if (!REGINCLASS(rex, scan, (U8*)locinput))
3092 if (!nextchr && locinput >= PL_regeol)
3094 nextchr = UCHARAT(++locinput);
3098 /* If we might have the case of the German sharp s
3099 * in a casefolding Unicode character class. */
3101 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3102 locinput += SHARP_S_SKIP;
3103 nextchr = UCHARAT(locinput);
3109 PL_reg_flags |= RF_tainted;
3115 LOAD_UTF8_CHARCLASS_ALNUM();
3116 if (!(OP(scan) == ALNUM
3117 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3118 : isALNUM_LC_utf8((U8*)locinput)))
3122 locinput += PL_utf8skip[nextchr];
3123 nextchr = UCHARAT(locinput);
3126 if (!(OP(scan) == ALNUM
3127 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3129 nextchr = UCHARAT(++locinput);
3132 PL_reg_flags |= RF_tainted;
3135 if (!nextchr && locinput >= PL_regeol)
3138 LOAD_UTF8_CHARCLASS_ALNUM();
3139 if (OP(scan) == NALNUM
3140 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3141 : isALNUM_LC_utf8((U8*)locinput))
3145 locinput += PL_utf8skip[nextchr];
3146 nextchr = UCHARAT(locinput);
3149 if (OP(scan) == NALNUM
3150 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3152 nextchr = UCHARAT(++locinput);
3156 PL_reg_flags |= RF_tainted;
3160 /* was last char in word? */
3162 if (locinput == PL_bostr)
3165 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3167 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3169 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3170 ln = isALNUM_uni(ln);
3171 LOAD_UTF8_CHARCLASS_ALNUM();
3172 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3175 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3176 n = isALNUM_LC_utf8((U8*)locinput);
3180 ln = (locinput != PL_bostr) ?
3181 UCHARAT(locinput - 1) : '\n';
3182 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3184 n = isALNUM(nextchr);
3187 ln = isALNUM_LC(ln);
3188 n = isALNUM_LC(nextchr);
3191 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3192 OP(scan) == BOUNDL))
3196 PL_reg_flags |= RF_tainted;
3202 if (UTF8_IS_CONTINUED(nextchr)) {
3203 LOAD_UTF8_CHARCLASS_SPACE();
3204 if (!(OP(scan) == SPACE
3205 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3206 : isSPACE_LC_utf8((U8*)locinput)))
3210 locinput += PL_utf8skip[nextchr];
3211 nextchr = UCHARAT(locinput);
3214 if (!(OP(scan) == SPACE
3215 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3217 nextchr = UCHARAT(++locinput);
3220 if (!(OP(scan) == SPACE
3221 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3223 nextchr = UCHARAT(++locinput);
3227 PL_reg_flags |= RF_tainted;
3230 if (!nextchr && locinput >= PL_regeol)
3233 LOAD_UTF8_CHARCLASS_SPACE();
3234 if (OP(scan) == NSPACE
3235 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3236 : isSPACE_LC_utf8((U8*)locinput))
3240 locinput += PL_utf8skip[nextchr];
3241 nextchr = UCHARAT(locinput);
3244 if (OP(scan) == NSPACE
3245 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3247 nextchr = UCHARAT(++locinput);
3250 PL_reg_flags |= RF_tainted;
3256 LOAD_UTF8_CHARCLASS_DIGIT();
3257 if (!(OP(scan) == DIGIT
3258 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3259 : isDIGIT_LC_utf8((U8*)locinput)))
3263 locinput += PL_utf8skip[nextchr];
3264 nextchr = UCHARAT(locinput);
3267 if (!(OP(scan) == DIGIT
3268 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3270 nextchr = UCHARAT(++locinput);
3273 PL_reg_flags |= RF_tainted;
3276 if (!nextchr && locinput >= PL_regeol)
3279 LOAD_UTF8_CHARCLASS_DIGIT();
3280 if (OP(scan) == NDIGIT
3281 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3282 : isDIGIT_LC_utf8((U8*)locinput))
3286 locinput += PL_utf8skip[nextchr];
3287 nextchr = UCHARAT(locinput);
3290 if (OP(scan) == NDIGIT
3291 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3293 nextchr = UCHARAT(++locinput);
3296 if (locinput >= PL_regeol)
3299 LOAD_UTF8_CHARCLASS_MARK();
3300 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3302 locinput += PL_utf8skip[nextchr];
3303 while (locinput < PL_regeol &&
3304 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3305 locinput += UTF8SKIP(locinput);
3306 if (locinput > PL_regeol)
3311 nextchr = UCHARAT(locinput);
3318 PL_reg_flags |= RF_tainted;
3323 n = reg_check_named_buff_matched(rex,scan);
3326 type = REF + ( type - NREF );
3333 PL_reg_flags |= RF_tainted;
3337 n = ARG(scan); /* which paren pair */
3340 ln = PL_regstartp[n];
3341 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3342 if ((I32)*PL_reglastparen < n || ln == -1)
3343 sayNO; /* Do not match unless seen CLOSEn. */
3344 if (ln == PL_regendp[n])
3348 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3350 const char *e = PL_bostr + PL_regendp[n];
3352 * Note that we can't do the "other character" lookup trick as
3353 * in the 8-bit case (no pun intended) because in Unicode we
3354 * have to map both upper and title case to lower case.
3358 STRLEN ulen1, ulen2;
3359 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3360 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3364 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3365 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3366 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3373 nextchr = UCHARAT(locinput);
3377 /* Inline the first character, for speed. */
3378 if (UCHARAT(s) != nextchr &&
3380 (UCHARAT(s) != (type == REFF
3381 ? PL_fold : PL_fold_locale)[nextchr])))
3383 ln = PL_regendp[n] - ln;
3384 if (locinput + ln > PL_regeol)
3386 if (ln > 1 && (type == REF
3387 ? memNE(s, locinput, ln)
3389 ? ibcmp(s, locinput, ln)
3390 : ibcmp_locale(s, locinput, ln))))
3393 nextchr = UCHARAT(locinput);
3403 #define ST st->u.eval
3407 regnode *startpoint;
3410 case RECURSE: /* /(...(?1))/ */
3411 if (cur_eval && cur_eval->locinput==locinput) {
3412 if (cur_eval->u.eval.close_paren == ARG(scan))
3413 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3414 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3415 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3420 (void)ReREFCNT_inc(rex);
3421 if (OP(scan)==RECURSE) {
3422 startpoint = scan + ARG2L(scan);
3423 ST.close_paren = ARG(scan);
3425 startpoint = re->program+1;
3428 goto eval_recurse_doit;
3430 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3431 if (cur_eval && cur_eval->locinput==locinput) {
3432 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3433 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3438 /* execute the code in the {...} */
3440 SV ** const before = SP;
3441 OP_4tree * const oop = PL_op;
3442 COP * const ocurcop = PL_curcop;
3446 PL_op = (OP_4tree*)rex->data->data[n];
3447 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3448 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3449 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3451 CALLRUNOPS(aTHX); /* Scalar context. */
3454 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3461 PAD_RESTORE_LOCAL(old_comppad);
3462 PL_curcop = ocurcop;
3465 sv_setsv(save_scalar(PL_replgv), ret);
3469 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3472 /* extract RE object from returned value; compiling if
3477 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3478 mg = mg_find(sv, PERL_MAGIC_qr);
3479 else if (SvSMAGICAL(ret)) {
3480 if (SvGMAGICAL(ret))
3481 sv_unmagic(ret, PERL_MAGIC_qr);
3483 mg = mg_find(ret, PERL_MAGIC_qr);
3487 re = (regexp *)mg->mg_obj;
3488 (void)ReREFCNT_inc(re);
3492 const char * const t = SvPV_const(ret, len);
3494 const I32 osize = PL_regsize;
3497 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3498 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3500 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3502 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3508 debug_start_match(re, do_utf8, locinput, PL_regeol,
3509 "Matching embedded");
3511 startpoint = re->program + 1;
3512 ST.close_paren = 0; /* only used for RECURSE */
3513 /* borrowed from regtry */
3514 if (PL_reg_start_tmpl <= re->nparens) {
3515 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3516 if(PL_reg_start_tmp)
3517 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3519 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3522 eval_recurse_doit: /* Share code with RECURSE below this line */
3523 /* run the pattern returned from (??{...}) */
3524 ST.cp = regcppush(0); /* Save *all* the positions. */
3525 REGCP_SET(ST.lastcp);
3527 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3528 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3530 *PL_reglastparen = 0;
3531 *PL_reglastcloseparen = 0;
3532 PL_reginput = locinput;
3535 /* XXXX This is too dramatic a measure... */
3538 ST.toggle_reg_flags = PL_reg_flags;
3539 if (re->reganch & ROPT_UTF8)
3540 PL_reg_flags |= RF_utf8;
3542 PL_reg_flags &= ~RF_utf8;
3543 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3546 ST.prev_curlyx = cur_curlyx;
3550 ST.prev_eval = cur_eval;
3552 /* now continue from first node in postoned RE */
3553 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3556 /* logical is 1, /(?(?{...})X|Y)/ */
3557 sw = (bool)SvTRUE(ret);
3562 case EVAL_AB: /* cleanup after a successful (??{A})B */
3563 /* note: this is called twice; first after popping B, then A */
3564 PL_reg_flags ^= ST.toggle_reg_flags;
3568 cur_eval = ST.prev_eval;
3569 cur_curlyx = ST.prev_curlyx;
3570 /* XXXX This is too dramatic a measure... */
3575 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3576 /* note: this is called twice; first after popping B, then A */
3577 PL_reg_flags ^= ST.toggle_reg_flags;
3580 PL_reginput = locinput;
3581 REGCP_UNWIND(ST.lastcp);
3583 cur_eval = ST.prev_eval;
3584 cur_curlyx = ST.prev_curlyx;
3585 /* XXXX This is too dramatic a measure... */
3591 n = ARG(scan); /* which paren pair */
3592 PL_reg_start_tmp[n] = locinput;
3597 n = ARG(scan); /* which paren pair */
3598 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3599 PL_regendp[n] = locinput - PL_bostr;
3600 /*if (n > PL_regsize)
3602 if (n > (I32)*PL_reglastparen)
3603 *PL_reglastparen = n;
3604 *PL_reglastcloseparen = n;
3605 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3610 n = ARG(scan); /* which paren pair */
3611 sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3614 /* reg_check_named_buff_matched returns 0 for no match */
3615 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3619 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3625 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3627 next = NEXTOPER(NEXTOPER(scan));
3629 next = scan + ARG(scan);
3630 if (OP(next) == IFTHEN) /* Fake one. */
3631 next = NEXTOPER(NEXTOPER(next));
3635 logical = scan->flags;
3638 /*******************************************************************
3640 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3641 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3642 STAR/PLUS/CURLY/CURLYN are used instead.)
3644 A*B is compiled as <CURLYX><A><WHILEM><B>
3646 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3647 state, which contains the current count, initialised to -1. It also sets
3648 cur_curlyx to point to this state, with any previous value saved in the
3651 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3652 since the pattern may possibly match zero times (i.e. it's a while {} loop
3653 rather than a do {} while loop).
3655 Each entry to WHILEM represents a successful match of A. The count in the
3656 CURLYX block is incremented, another WHILEM state is pushed, and execution
3657 passes to A or B depending on greediness and the current count.
3659 For example, if matching against the string a1a2a3b (where the aN are
3660 substrings that match /A/), then the match progresses as follows: (the
3661 pushed states are interspersed with the bits of strings matched so far):
3664 <CURLYX cnt=0><WHILEM>
3665 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3666 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3667 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3668 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3670 (Contrast this with something like CURLYM, which maintains only a single
3674 a1 <CURLYM cnt=1> a2
3675 a1 a2 <CURLYM cnt=2> a3
3676 a1 a2 a3 <CURLYM cnt=3> b
3679 Each WHILEM state block marks a point to backtrack to upon partial failure
3680 of A or B, and also contains some minor state data related to that
3681 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3682 overall state, such as the count, and pointers to the A and B ops.
3684 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3685 must always point to the *current* CURLYX block, the rules are:
3687 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3688 and set cur_curlyx to point the new block.
3690 When popping the CURLYX block after a successful or unsuccessful match,
3691 restore the previous cur_curlyx.
3693 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3694 to the outer one saved in the CURLYX block.
3696 When popping the WHILEM block after a successful or unsuccessful B match,
3697 restore the previous cur_curlyx.
3699 Here's an example for the pattern (AI* BI)*BO
3700 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3703 curlyx backtrack stack
3704 ------ ---------------
3706 CO <CO prev=NULL> <WO>
3707 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3708 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3709 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3711 At this point the pattern succeeds, and we work back down the stack to
3712 clean up, restoring as we go:
3714 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3715 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3716 CO <CO prev=NULL> <WO>
3719 *******************************************************************/
3721 #define ST st->u.curlyx
3723 case CURLYX: /* start of /A*B/ (for complex A) */
3725 /* No need to save/restore up to this paren */
3726 I32 parenfloor = scan->flags;
3728 assert(next); /* keep Coverity happy */
3729 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3732 /* XXXX Probably it is better to teach regpush to support
3733 parenfloor > PL_regsize... */
3734 if (parenfloor > (I32)*PL_reglastparen)
3735 parenfloor = *PL_reglastparen; /* Pessimization... */
3737 ST.prev_curlyx= cur_curlyx;
3739 ST.cp = PL_savestack_ix;
3741 /* these fields contain the state of the current curly.
3742 * they are accessed by subsequent WHILEMs */
3743 ST.parenfloor = parenfloor;
3744 ST.min = ARG1(scan);
3745 ST.max = ARG2(scan);
3746 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3750 ST.count = -1; /* this will be updated by WHILEM */
3751 ST.lastloc = NULL; /* this will be updated by WHILEM */
3753 PL_reginput = locinput;
3754 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3758 case CURLYX_end: /* just finished matching all of A*B */
3760 cur_curlyx = ST.prev_curlyx;
3764 case CURLYX_end_fail: /* just failed to match all of A*B */
3766 cur_curlyx = ST.prev_curlyx;
3772 #define ST st->u.whilem
3774 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3776 /* see the discussion above about CURLYX/WHILEM */
3778 assert(cur_curlyx); /* keep Coverity happy */
3779 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3780 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3781 ST.cache_offset = 0;
3784 PL_reginput = locinput;
3786 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3787 "%*s whilem: matched %ld out of %ld..%ld\n",
3788 REPORT_CODE_OFF+depth*2, "", (long)n,
3789 (long)cur_curlyx->u.curlyx.min,
3790 (long)cur_curlyx->u.curlyx.max)
3793 /* First just match a string of min A's. */
3795 if (n < cur_curlyx->u.curlyx.min) {
3796 cur_curlyx->u.curlyx.lastloc = locinput;
3797 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3801 /* If degenerate A matches "", assume A done. */
3803 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3804 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3805 "%*s whilem: empty match detected, trying continuation...\n",
3806 REPORT_CODE_OFF+depth*2, "")
3808 goto do_whilem_B_max;
3811 /* super-linear cache processing */
3815 if (!PL_reg_maxiter) {
3816 /* start the countdown: Postpone detection until we
3817 * know the match is not *that* much linear. */
3818 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3819 /* possible overflow for long strings and many CURLYX's */
3820 if (PL_reg_maxiter < 0)
3821 PL_reg_maxiter = I32_MAX;
3822 PL_reg_leftiter = PL_reg_maxiter;
3825 if (PL_reg_leftiter-- == 0) {
3826 /* initialise cache */
3827 const I32 size = (PL_reg_maxiter + 7)/8;
3828 if (PL_reg_poscache) {
3829 if ((I32)PL_reg_poscache_size < size) {
3830 Renew(PL_reg_poscache, size, char);
3831 PL_reg_poscache_size = size;
3833 Zero(PL_reg_poscache, size, char);
3836 PL_reg_poscache_size = size;
3837 Newxz(PL_reg_poscache, size, char);
3839 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3840 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3841 PL_colors[4], PL_colors[5])
3845 if (PL_reg_leftiter < 0) {
3846 /* have we already failed at this position? */
3848 offset = (scan->flags & 0xf) - 1
3849 + (locinput - PL_bostr) * (scan->flags>>4);
3850 mask = 1 << (offset % 8);
3852 if (PL_reg_poscache[offset] & mask) {
3853 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3854 "%*s whilem: (cache) already tried at this position...\n",
3855 REPORT_CODE_OFF+depth*2, "")
3857 sayNO; /* cache records failure */
3859 ST.cache_offset = offset;
3860 ST.cache_mask = mask;
3864 /* Prefer B over A for minimal matching. */
3866 if (cur_curlyx->u.curlyx.minmod) {
3867 ST.save_curlyx = cur_curlyx;
3868 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3869 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3870 REGCP_SET(ST.lastcp);
3871 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3875 /* Prefer A over B for maximal matching. */
3877 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3878 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3879 cur_curlyx->u.curlyx.lastloc = locinput;
3880 REGCP_SET(ST.lastcp);
3881 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3884 goto do_whilem_B_max;
3888 case WHILEM_B_min: /* just matched B in a minimal match */
3889 case WHILEM_B_max: /* just matched B in a maximal match */
3890 cur_curlyx = ST.save_curlyx;
3894 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3895 cur_curlyx = ST.save_curlyx;
3896 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3897 cur_curlyx->u.curlyx.count--;
3901 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3902 REGCP_UNWIND(ST.lastcp);
3905 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3906 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3907 cur_curlyx->u.curlyx.count--;
3911 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3912 REGCP_UNWIND(ST.lastcp);
3913 regcppop(rex); /* Restore some previous $<digit>s? */
3914 PL_reginput = locinput;
3915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3916 "%*s whilem: failed, trying continuation...\n",
3917 REPORT_CODE_OFF+depth*2, "")
3920 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3921 && ckWARN(WARN_REGEXP)
3922 && !(PL_reg_flags & RF_warned))
3924 PL_reg_flags |= RF_warned;
3925 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3926 "Complex regular subexpression recursion",
3931 ST.save_curlyx = cur_curlyx;
3932 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3933 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3936 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3937 cur_curlyx = ST.save_curlyx;
3938 REGCP_UNWIND(ST.lastcp);
3941 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3942 /* Maximum greed exceeded */
3943 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3944 && ckWARN(WARN_REGEXP)
3945 && !(PL_reg_flags & RF_warned))
3947 PL_reg_flags |= RF_warned;
3948 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3949 "%s limit (%d) exceeded",
3950 "Complex regular subexpression recursion",
3953 cur_curlyx->u.curlyx.count--;
3957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3958 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3960 /* Try grabbing another A and see if it helps. */
3961 PL_reginput = locinput;
3962 cur_curlyx->u.curlyx.lastloc = locinput;
3963 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3964 REGCP_SET(ST.lastcp);
3965 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
3969 #define ST st->u.branch
3971 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3972 next = scan + ARG(scan);
3975 scan = NEXTOPER(scan);
3978 case BRANCH: /* /(...|A|...)/ */
3979 scan = NEXTOPER(scan); /* scan now points to inner node */
3980 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3981 /* last branch; skip state push and jump direct to node */
3983 ST.lastparen = *PL_reglastparen;
3984 ST.next_branch = next;
3986 PL_reginput = locinput;
3988 /* Now go into the branch */
3989 PUSH_STATE_GOTO(BRANCH_next, scan);
3992 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3993 REGCP_UNWIND(ST.cp);
3994 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3996 *PL_reglastparen = n;
3997 /*dmq: *PL_reglastcloseparen = n; */
3998 scan = ST.next_branch;
3999 /* no more branches? */
4000 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4002 continue; /* execute next BRANCH[J] op */
4010 #define ST st->u.curlym
4012 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4014 /* This is an optimisation of CURLYX that enables us to push
4015 * only a single backtracking state, no matter now many matches
4016 * there are in {m,n}. It relies on the pattern being constant
4017 * length, with no parens to influence future backrefs
4021 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4023 /* if paren positive, emulate an OPEN/CLOSE around A */
4025 I32 paren = ST.me->flags;
4026 if (paren > PL_regsize)
4028 if (paren > (I32)*PL_reglastparen)
4029 *PL_reglastparen = paren;
4030 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4038 ST.c1 = CHRTEST_UNINIT;
4041 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4044 curlym_do_A: /* execute the A in /A{m,n}B/ */
4045 PL_reginput = locinput;
4046 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4049 case CURLYM_A: /* we've just matched an A */
4050 locinput = st->locinput;
4051 nextchr = UCHARAT(locinput);
4054 /* after first match, determine A's length: u.curlym.alen */
4055 if (ST.count == 1) {
4056 if (PL_reg_match_utf8) {
4058 while (s < PL_reginput) {
4064 ST.alen = PL_reginput - locinput;
4067 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4070 PerlIO_printf(Perl_debug_log,
4071 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4072 (int)(REPORT_CODE_OFF+(depth*2)), "",
4073 (IV) ST.count, (IV)ST.alen)
4076 locinput = PL_reginput;
4078 if (cur_eval && cur_eval->u.eval.close_paren &&
4079 cur_eval->u.eval.close_paren == ST.me->flags)
4082 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4083 goto curlym_do_A; /* try to match another A */
4084 goto curlym_do_B; /* try to match B */
4086 case CURLYM_A_fail: /* just failed to match an A */
4087 REGCP_UNWIND(ST.cp);
4089 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4090 || (cur_eval && cur_eval->u.eval.close_paren &&
4091 cur_eval->u.eval.close_paren == ST.me->flags))
4094 curlym_do_B: /* execute the B in /A{m,n}B/ */
4095 PL_reginput = locinput;
4096 if (ST.c1 == CHRTEST_UNINIT) {
4097 /* calculate c1 and c2 for possible match of 1st char
4098 * following curly */
4099 ST.c1 = ST.c2 = CHRTEST_VOID;
4100 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4101 regnode *text_node = ST.B;
4102 if (! HAS_TEXT(text_node))
4103 FIND_NEXT_IMPT(text_node);
4104 if (HAS_TEXT(text_node)
4105 && PL_regkind[OP(text_node)] != REF)
4107 ST.c1 = (U8)*STRING(text_node);
4109 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4111 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4112 ? PL_fold_locale[ST.c1]
4119 PerlIO_printf(Perl_debug_log,
4120 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4121 (int)(REPORT_CODE_OFF+(depth*2)),
4124 if (ST.c1 != CHRTEST_VOID
4125 && UCHARAT(PL_reginput) != ST.c1
4126 && UCHARAT(PL_reginput) != ST.c2)
4128 /* simulate B failing */
4129 state_num = CURLYM_B_fail;
4130 goto reenter_switch;
4134 /* mark current A as captured */
4135 I32 paren = ST.me->flags;
4138 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4139 PL_regendp[paren] = PL_reginput - PL_bostr;
4140 /*dmq: *PL_reglastcloseparen = paren; */
4143 PL_regendp[paren] = -1;
4144 if (cur_eval && cur_eval->u.eval.close_paren &&
4145 cur_eval->u.eval.close_paren == ST.me->flags)
4154 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4157 case CURLYM_B_fail: /* just failed to match a B */
4158 REGCP_UNWIND(ST.cp);
4160 if (ST.count == ARG2(ST.me) /* max */)
4162 goto curlym_do_A; /* try to match a further A */
4164 /* backtrack one A */
4165 if (ST.count == ARG1(ST.me) /* min */)
4168 locinput = HOPc(locinput, -ST.alen);
4169 goto curlym_do_B; /* try to match B */
4172 #define ST st->u.curly
4174 #define CURLY_SETPAREN(paren, success) \
4177 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4178 PL_regendp[paren] = locinput - PL_bostr; \
4179 *PL_reglastcloseparen = paren; \
4182 PL_regendp[paren] = -1; \
4185 case STAR: /* /A*B/ where A is width 1 */
4189 scan = NEXTOPER(scan);
4191 case PLUS: /* /A+B/ where A is width 1 */
4195 scan = NEXTOPER(scan);
4197 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4198 ST.paren = scan->flags; /* Which paren to set */
4199 if (ST.paren > PL_regsize)
4200 PL_regsize = ST.paren;
4201 if (ST.paren > (I32)*PL_reglastparen)
4202 *PL_reglastparen = ST.paren;
4203 ST.min = ARG1(scan); /* min to match */
4204 ST.max = ARG2(scan); /* max to match */
4205 if (cur_eval && cur_eval->u.eval.close_paren &&
4206 cur_eval->u.eval.close_paren == ST.paren) {
4210 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4212 case CURLY: /* /A{m,n}B/ where A is width 1 */
4214 ST.min = ARG1(scan); /* min to match */
4215 ST.max = ARG2(scan); /* max to match */
4216 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4219 * Lookahead to avoid useless match attempts
4220 * when we know what character comes next.
4222 * Used to only do .*x and .*?x, but now it allows
4223 * for )'s, ('s and (?{ ... })'s to be in the way
4224 * of the quantifier and the EXACT-like node. -- japhy
4227 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4229 if (HAS_TEXT(next) || JUMPABLE(next)) {
4231 regnode *text_node = next;
4233 if (! HAS_TEXT(text_node))
4234 FIND_NEXT_IMPT(text_node);
4236 if (! HAS_TEXT(text_node))
4237 ST.c1 = ST.c2 = CHRTEST_VOID;
4239 if (PL_regkind[OP(text_node)] == REF) {
4240 ST.c1 = ST.c2 = CHRTEST_VOID;
4241 goto assume_ok_easy;
4244 s = (U8*)STRING(text_node);
4248 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4249 ST.c2 = PL_fold[ST.c1];
4250 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4251 ST.c2 = PL_fold_locale[ST.c1];
4254 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4255 STRLEN ulen1, ulen2;
4256 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4257 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4259 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4260 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4262 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4264 0 : UTF8_ALLOW_ANY);
4265 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4267 0 : UTF8_ALLOW_ANY);
4269 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4271 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4276 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4283 ST.c1 = ST.c2 = CHRTEST_VOID;
4288 PL_reginput = locinput;
4291 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4294 locinput = PL_reginput;
4296 if (ST.c1 == CHRTEST_VOID)
4297 goto curly_try_B_min;
4299 ST.oldloc = locinput;
4301 /* set ST.maxpos to the furthest point along the
4302 * string that could possibly match */
4303 if (ST.max == REG_INFTY) {
4304 ST.maxpos = PL_regeol - 1;
4306 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4310 int m = ST.max - ST.min;
4311 for (ST.maxpos = locinput;
4312 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4313 ST.maxpos += UTF8SKIP(ST.maxpos);
4316 ST.maxpos = locinput + ST.max - ST.min;
4317 if (ST.maxpos >= PL_regeol)
4318 ST.maxpos = PL_regeol - 1;
4320 goto curly_try_B_min_known;
4324 ST.count = regrepeat(rex, ST.A, ST.max);
4325 locinput = PL_reginput;
4326 if (ST.count < ST.min)
4328 if ((ST.count > ST.min)
4329 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4331 /* A{m,n} must come at the end of the string, there's
4332 * no point in backing off ... */
4334 /* ...except that $ and \Z can match before *and* after
4335 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4336 We may back off by one in this case. */
4337 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4341 goto curly_try_B_max;
4346 case CURLY_B_min_known_fail:
4347 /* failed to find B in a non-greedy match where c1,c2 valid */
4348 if (ST.paren && ST.count)
4349 PL_regendp[ST.paren] = -1;
4351 PL_reginput = locinput; /* Could be reset... */
4352 REGCP_UNWIND(ST.cp);
4353 /* Couldn't or didn't -- move forward. */
4354 ST.oldloc = locinput;
4356 locinput += UTF8SKIP(locinput);
4360 curly_try_B_min_known:
4361 /* find the next place where 'B' could work, then call B */
4365 n = (ST.oldloc == locinput) ? 0 : 1;
4366 if (ST.c1 == ST.c2) {
4368 /* set n to utf8_distance(oldloc, locinput) */
4369 while (locinput <= ST.maxpos &&
4370 utf8n_to_uvchr((U8*)locinput,
4371 UTF8_MAXBYTES, &len,
4372 uniflags) != (UV)ST.c1) {
4378 /* set n to utf8_distance(oldloc, locinput) */
4379 while (locinput <= ST.maxpos) {
4381 const UV c = utf8n_to_uvchr((U8*)locinput,
4382 UTF8_MAXBYTES, &len,
4384 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4392 if (ST.c1 == ST.c2) {
4393 while (locinput <= ST.maxpos &&
4394 UCHARAT(locinput) != ST.c1)
4398 while (locinput <= ST.maxpos
4399 && UCHARAT(locinput) != ST.c1
4400 && UCHARAT(locinput) != ST.c2)
4403 n = locinput - ST.oldloc;
4405 if (locinput > ST.maxpos)
4407 /* PL_reginput == oldloc now */
4410 if (regrepeat(rex, ST.A, n) < n)
4413 PL_reginput = locinput;
4414 CURLY_SETPAREN(ST.paren, ST.count);
4415 if (cur_eval && cur_eval->u.eval.close_paren &&
4416 cur_eval->u.eval.close_paren == ST.paren) {
4419 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4424 case CURLY_B_min_fail:
4425 /* failed to find B in a non-greedy match where c1,c2 invalid */
4426 if (ST.paren && ST.count)
4427 PL_regendp[ST.paren] = -1;
4429 REGCP_UNWIND(ST.cp);
4430 /* failed -- move forward one */
4431 PL_reginput = locinput;
4432 if (regrepeat(rex, ST.A, 1)) {
4434 locinput = PL_reginput;
4435 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4436 ST.count > 0)) /* count overflow ? */
4439 CURLY_SETPAREN(ST.paren, ST.count);
4440 if (cur_eval && cur_eval->u.eval.close_paren &&
4441 cur_eval->u.eval.close_paren == ST.paren) {
4444 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4452 /* a successful greedy match: now try to match B */
4455 if (ST.c1 != CHRTEST_VOID)
4456 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4457 UTF8_MAXBYTES, 0, uniflags)
4458 : (UV) UCHARAT(PL_reginput);
4459 /* If it could work, try it. */
4460 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4461 CURLY_SETPAREN(ST.paren, ST.count);
4462 if (cur_eval && cur_eval->u.eval.close_paren &&
4463 cur_eval->u.eval.close_paren == ST.paren) {
4466 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4471 case CURLY_B_max_fail:
4472 /* failed to find B in a greedy match */
4473 if (ST.paren && ST.count)
4474 PL_regendp[ST.paren] = -1;
4476 REGCP_UNWIND(ST.cp);
4478 if (--ST.count < ST.min)
4480 PL_reginput = locinput = HOPc(locinput, -1);
4481 goto curly_try_B_max;
4488 /* we've just finished A in /(??{A})B/; now continue with B */
4492 st->u.eval.toggle_reg_flags
4493 = cur_eval->u.eval.toggle_reg_flags;
4494 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4496 st->u.eval.prev_rex = rex; /* inner */
4497 rex = cur_eval->u.eval.prev_rex; /* outer */
4498 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4500 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4501 REGCP_SET(st->u.eval.lastcp);
4502 PL_reginput = locinput;
4504 /* Restore parens of the outer rex without popping the
4506 tmpix = PL_savestack_ix;
4507 PL_savestack_ix = cur_eval->u.eval.lastcp;
4509 PL_savestack_ix = tmpix;
4511 st->u.eval.prev_eval = cur_eval;
4512 cur_eval = cur_eval->u.eval.prev_eval;
4514 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4515 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
4516 PUSH_YES_STATE_GOTO(EVAL_AB,
4517 st->u.eval.prev_eval->u.eval.B); /* match B */
4520 if (locinput < reginfo->till) {
4521 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4522 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4524 (long)(locinput - PL_reg_starttry),
4525 (long)(reginfo->till - PL_reg_starttry),
4527 sayNO_SILENT; /* Cannot match: too short. */
4529 PL_reginput = locinput; /* put where regtry can find it */
4530 sayYES; /* Success! */
4532 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4534 PerlIO_printf(Perl_debug_log,
4535 "%*s %ssubpattern success...%s\n",
4536 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4537 PL_reginput = locinput; /* put where regtry can find it */
4538 sayYES; /* Success! */
4541 #define ST st->u.ifmatch
4543 case SUSPEND: /* (?>A) */
4545 PL_reginput = locinput;
4548 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4550 goto ifmatch_trivial_fail_test;
4552 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4554 ifmatch_trivial_fail_test:
4556 char * const s = HOPBACKc(locinput, scan->flags);
4561 sw = 1 - (bool)ST.wanted;
4565 next = scan + ARG(scan);
4573 PL_reginput = locinput;
4577 ST.logical = logical;
4578 /* execute body of (?...A) */
4579 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4582 case IFMATCH_A_fail: /* body of (?...A) failed */
4583 ST.wanted = !ST.wanted;
4586 case IFMATCH_A: /* body of (?...A) succeeded */
4588 sw = (bool)ST.wanted;
4590 else if (!ST.wanted)
4593 if (OP(ST.me) == SUSPEND)
4594 locinput = PL_reginput;
4596 locinput = PL_reginput = st->locinput;
4597 nextchr = UCHARAT(locinput);
4599 scan = ST.me + ARG(ST.me);
4602 continue; /* execute B */
4607 next = scan + ARG(scan);
4614 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4615 PTR2UV(scan), OP(scan));
4616 Perl_croak(aTHX_ "regexp memory corruption");
4624 /* push a state that backtracks on success */
4625 st->u.yes.prev_yes_state = yes_state;
4629 /* push a new regex state, then continue at scan */
4631 regmatch_state *newst;
4633 DEBUG_STATE_pp("push");
4635 st->locinput = locinput;
4637 if (newst > SLAB_LAST(PL_regmatch_slab))
4638 newst = S_push_slab(aTHX);
4639 PL_regmatch_state = newst;
4641 locinput = PL_reginput;
4642 nextchr = UCHARAT(locinput);
4650 * We get here only if there's trouble -- normally "case END" is
4651 * the terminating point.
4653 Perl_croak(aTHX_ "corrupted regexp pointers");
4659 /* we have successfully completed a subexpression, but we must now
4660 * pop to the state marked by yes_state and continue from there */
4661 assert(st != yes_state);
4663 while (st != yes_state) {
4665 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4666 PL_regmatch_slab = PL_regmatch_slab->prev;
4667 st = SLAB_LAST(PL_regmatch_slab);
4669 DEBUG_STATE_pp("pop (yes)");
4673 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4674 || yes_state > SLAB_LAST(PL_regmatch_slab))
4676 /* not in this slab, pop slab */
4677 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4678 PL_regmatch_slab = PL_regmatch_slab->prev;
4679 st = SLAB_LAST(PL_regmatch_slab);
4681 depth -= (st - yes_state);
4684 yes_state = st->u.yes.prev_yes_state;
4685 PL_regmatch_state = st;
4687 state_num = st->resume_state;
4688 goto reenter_switch;
4691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4692 PL_colors[4], PL_colors[5]));
4699 PerlIO_printf(Perl_debug_log,
4700 "%*s %sfailed...%s\n",
4701 REPORT_CODE_OFF+depth*2, "",
4702 PL_colors[4], PL_colors[5])
4707 /* there's a previous state to backtrack to */
4709 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4710 PL_regmatch_slab = PL_regmatch_slab->prev;
4711 st = SLAB_LAST(PL_regmatch_slab);
4713 PL_regmatch_state = st;
4714 locinput= st->locinput;
4715 nextchr = UCHARAT(locinput);
4717 DEBUG_STATE_pp("pop");
4719 if (yes_state == st)
4720 yes_state = st->u.yes.prev_yes_state;
4722 state_num = st->resume_state + 1; /* failure = success + 1 */
4723 goto reenter_switch;
4729 /* restore original high-water mark */
4730 PL_regmatch_slab = orig_slab;
4731 PL_regmatch_state = orig_state;
4733 /* free all slabs above current one */
4734 if (orig_slab->next) {
4735 regmatch_slab *sl = orig_slab->next;
4736 orig_slab->next = NULL;
4738 regmatch_slab * const osl = sl;
4748 - regrepeat - repeatedly match something simple, report how many
4751 * [This routine now assumes that it will only match on things of length 1.
4752 * That was true before, but now we assume scan - reginput is the count,
4753 * rather than incrementing count on every character. [Er, except utf8.]]
4756 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4759 register char *scan;
4761 register char *loceol = PL_regeol;
4762 register I32 hardcount = 0;
4763 register bool do_utf8 = PL_reg_match_utf8;
4766 if (max == REG_INFTY)
4768 else if (max < loceol - scan)
4769 loceol = scan + max;
4774 while (scan < loceol && hardcount < max && *scan != '\n') {
4775 scan += UTF8SKIP(scan);
4779 while (scan < loceol && *scan != '\n')
4786 while (scan < loceol && hardcount < max) {
4787 scan += UTF8SKIP(scan);
4797 case EXACT: /* length of string is 1 */
4799 while (scan < loceol && UCHARAT(scan) == c)
4802 case EXACTF: /* length of string is 1 */
4804 while (scan < loceol &&
4805 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4808 case EXACTFL: /* length of string is 1 */
4809 PL_reg_flags |= RF_tainted;
4811 while (scan < loceol &&
4812 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4818 while (hardcount < max && scan < loceol &&
4819 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4820 scan += UTF8SKIP(scan);
4824 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4831 LOAD_UTF8_CHARCLASS_ALNUM();
4832 while (hardcount < max && scan < loceol &&
4833 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4834 scan += UTF8SKIP(scan);
4838 while (scan < loceol && isALNUM(*scan))
4843 PL_reg_flags |= RF_tainted;
4846 while (hardcount < max && scan < loceol &&
4847 isALNUM_LC_utf8((U8*)scan)) {
4848 scan += UTF8SKIP(scan);
4852 while (scan < loceol && isALNUM_LC(*scan))
4859 LOAD_UTF8_CHARCLASS_ALNUM();
4860 while (hardcount < max && scan < loceol &&
4861 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4862 scan += UTF8SKIP(scan);
4866 while (scan < loceol && !isALNUM(*scan))
4871 PL_reg_flags |= RF_tainted;
4874 while (hardcount < max && scan < loceol &&
4875 !isALNUM_LC_utf8((U8*)scan)) {
4876 scan += UTF8SKIP(scan);
4880 while (scan < loceol && !isALNUM_LC(*scan))
4887 LOAD_UTF8_CHARCLASS_SPACE();
4888 while (hardcount < max && scan < loceol &&
4890 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4891 scan += UTF8SKIP(scan);
4895 while (scan < loceol && isSPACE(*scan))
4900 PL_reg_flags |= RF_tainted;
4903 while (hardcount < max && scan < loceol &&
4904 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4905 scan += UTF8SKIP(scan);
4909 while (scan < loceol && isSPACE_LC(*scan))
4916 LOAD_UTF8_CHARCLASS_SPACE();
4917 while (hardcount < max && scan < loceol &&
4919 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4920 scan += UTF8SKIP(scan);
4924 while (scan < loceol && !isSPACE(*scan))
4929 PL_reg_flags |= RF_tainted;
4932 while (hardcount < max && scan < loceol &&
4933 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4934 scan += UTF8SKIP(scan);
4938 while (scan < loceol && !isSPACE_LC(*scan))
4945 LOAD_UTF8_CHARCLASS_DIGIT();
4946 while (hardcount < max && scan < loceol &&
4947 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4948 scan += UTF8SKIP(scan);
4952 while (scan < loceol && isDIGIT(*scan))
4959 LOAD_UTF8_CHARCLASS_DIGIT();
4960 while (hardcount < max && scan < loceol &&
4961 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4962 scan += UTF8SKIP(scan);
4966 while (scan < loceol && !isDIGIT(*scan))
4970 default: /* Called on something of 0 width. */
4971 break; /* So match right here or not at all. */
4977 c = scan - PL_reginput;
4981 GET_RE_DEBUG_FLAGS_DECL;
4983 SV * const prop = sv_newmortal();
4984 regprop(prog, prop, p);
4985 PerlIO_printf(Perl_debug_log,
4986 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4987 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4995 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4997 - regclass_swash - prepare the utf8 swash
5001 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5007 const struct reg_data * const data = prog ? prog->data : NULL;
5009 if (data && data->count) {
5010 const U32 n = ARG(node);
5012 if (data->what[n] == 's') {
5013 SV * const rv = (SV*)data->data[n];
5014 AV * const av = (AV*)SvRV((SV*)rv);
5015 SV **const ary = AvARRAY(av);
5018 /* See the end of regcomp.c:S_regclass() for
5019 * documentation of these array elements. */
5022 a = SvROK(ary[1]) ? &ary[1] : 0;
5023 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5027 else if (si && doinit) {
5028 sw = swash_init("utf8", "", si, 1, 0);
5029 (void)av_store(av, 1, sw);
5046 - reginclass - determine if a character falls into a character class
5048 The n is the ANYOF regnode, the p is the target string, lenp
5049 is pointer to the maximum length of how far to go in the p
5050 (if the lenp is zero, UTF8SKIP(p) is used),
5051 do_utf8 tells whether the target string is in UTF-8.
5056 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5059 const char flags = ANYOF_FLAGS(n);
5065 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5066 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5067 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5068 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5069 if (len == (STRLEN)-1)
5070 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5073 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5074 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5077 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5078 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5081 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5085 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5088 if (swash_fetch(sw, p, do_utf8))
5090 else if (flags & ANYOF_FOLD) {
5091 if (!match && lenp && av) {
5093 for (i = 0; i <= av_len(av); i++) {
5094 SV* const sv = *av_fetch(av, i, FALSE);
5096 const char * const s = SvPV_const(sv, len);
5098 if (len <= plen && memEQ(s, (char*)p, len)) {
5106 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5109 to_utf8_fold(p, tmpbuf, &tmplen);
5110 if (swash_fetch(sw, tmpbuf, do_utf8))
5116 if (match && lenp && *lenp == 0)
5117 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5119 if (!match && c < 256) {
5120 if (ANYOF_BITMAP_TEST(n, c))
5122 else if (flags & ANYOF_FOLD) {
5125 if (flags & ANYOF_LOCALE) {
5126 PL_reg_flags |= RF_tainted;
5127 f = PL_fold_locale[c];
5131 if (f != c && ANYOF_BITMAP_TEST(n, f))
5135 if (!match && (flags & ANYOF_CLASS)) {
5136 PL_reg_flags |= RF_tainted;
5138 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5139 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5140 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5141 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5142 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5143 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5144 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5145 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5146 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5147 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5148 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5149 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5150 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5151 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5152 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5153 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5154 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5155 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5156 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5157 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5158 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5159 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5160 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5161 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5162 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5163 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5164 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5165 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5166 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5167 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5168 ) /* How's that for a conditional? */
5175 return (flags & ANYOF_INVERT) ? !match : match;
5179 S_reghop3(U8 *s, I32 off, const U8* lim)
5183 while (off-- && s < lim) {
5184 /* XXX could check well-formedness here */
5189 while (off++ && s > lim) {
5191 if (UTF8_IS_CONTINUED(*s)) {
5192 while (s > lim && UTF8_IS_CONTINUATION(*s))
5195 /* XXX could check well-formedness here */
5202 /* there are a bunch of places where we use two reghop3's that should
5203 be replaced with this routine. but since thats not done yet
5204 we ifdef it out - dmq
5207 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5211 while (off-- && s < rlim) {
5212 /* XXX could check well-formedness here */
5217 while (off++ && s > llim) {
5219 if (UTF8_IS_CONTINUED(*s)) {
5220 while (s > llim && UTF8_IS_CONTINUATION(*s))
5223 /* XXX could check well-formedness here */
5231 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5235 while (off-- && s < lim) {
5236 /* XXX could check well-formedness here */
5243 while (off++ && s > lim) {
5245 if (UTF8_IS_CONTINUED(*s)) {
5246 while (s > lim && UTF8_IS_CONTINUATION(*s))
5249 /* XXX could check well-formedness here */
5258 restore_pos(pTHX_ void *arg)
5261 regexp * const rex = (regexp *)arg;
5262 if (PL_reg_eval_set) {
5263 if (PL_reg_oldsaved) {
5264 rex->subbeg = PL_reg_oldsaved;
5265 rex->sublen = PL_reg_oldsavedlen;
5266 #ifdef PERL_OLD_COPY_ON_WRITE
5267 rex->saved_copy = PL_nrs;
5269 RX_MATCH_COPIED_on(rex);
5271 PL_reg_magic->mg_len = PL_reg_oldpos;
5272 PL_reg_eval_set = 0;
5273 PL_curpm = PL_reg_oldcurpm;
5278 S_to_utf8_substr(pTHX_ register regexp *prog)
5280 if (prog->float_substr && !prog->float_utf8) {
5281 SV* const sv = newSVsv(prog->float_substr);
5282 prog->float_utf8 = sv;
5283 sv_utf8_upgrade(sv);
5284 if (SvTAIL(prog->float_substr))
5286 if (prog->float_substr == prog->check_substr)
5287 prog->check_utf8 = sv;
5289 if (prog->anchored_substr && !prog->anchored_utf8) {
5290 SV* const sv = newSVsv(prog->anchored_substr);
5291 prog->anchored_utf8 = sv;
5292 sv_utf8_upgrade(sv);
5293 if (SvTAIL(prog->anchored_substr))
5295 if (prog->anchored_substr == prog->check_substr)
5296 prog->check_utf8 = sv;
5301 S_to_byte_substr(pTHX_ register regexp *prog)
5304 if (prog->float_utf8 && !prog->float_substr) {
5305 SV* sv = newSVsv(prog->float_utf8);
5306 prog->float_substr = sv;
5307 if (sv_utf8_downgrade(sv, TRUE)) {
5308 if (SvTAIL(prog->float_utf8))
5312 prog->float_substr = sv = &PL_sv_undef;
5314 if (prog->float_utf8 == prog->check_utf8)
5315 prog->check_substr = sv;
5317 if (prog->anchored_utf8 && !prog->anchored_substr) {
5318 SV* sv = newSVsv(prog->anchored_utf8);
5319 prog->anchored_substr = sv;
5320 if (sv_utf8_downgrade(sv, TRUE)) {
5321 if (SvTAIL(prog->anchored_utf8))
5325 prog->anchored_substr = sv = &PL_sv_undef;
5327 if (prog->anchored_utf8 == prog->check_utf8)
5328 prog->check_substr = sv;
5334 * c-indentation-style: bsd
5336 * indent-tabs-mode: t
5339 * ex: set ts=8 sts=4 sw=4 noet: