5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 8
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHPTR(PL_regstartp);
186 SSPUSHPTR(PL_regendp);
187 SSPUSHINT(PL_regsize);
188 SSPUSHINT(*PL_reglastparen);
189 SSPUSHINT(*PL_reglastcloseparen);
190 SSPUSHPTR(PL_reginput);
191 #define REGCP_FRAME_ELEMS 2
192 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
193 * are needed for the regexp context stack bookkeeping. */
194 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
195 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
200 /* These are needed since we do not localize EVAL nodes: */
201 #define REGCP_SET(cp) \
203 PerlIO_printf(Perl_debug_log, \
204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
205 (IV)PL_savestack_ix)); \
208 #define REGCP_UNWIND(cp) \
210 if (cp != PL_savestack_ix) \
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
213 (IV)(cp), (IV)PL_savestack_ix)); \
217 S_regcppop(pTHX_ const regexp *rex)
223 GET_RE_DEBUG_FLAGS_DECL;
225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
229 input = (char *) SSPOPPTR;
230 *PL_reglastcloseparen = SSPOPINT;
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
233 PL_regendp=(I32 *) SSPOPPTR;
234 PL_regstartp=(I32 *) SSPOPPTR;
237 /* Now restore the parentheses context. */
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
241 U32 paren = (U32)SSPOPINT;
242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
243 PL_regstartp[paren] = SSPOPINT;
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
248 PerlIO_printf(Perl_debug_log,
249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
250 (UV)paren, (IV)PL_regstartp[paren],
251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
252 (IV)PL_regendp[paren],
253 (paren > *PL_reglastparen ? "(no)" : ""));
257 if (*PL_reglastparen + 1 <= rex->nparens) {
258 PerlIO_printf(Perl_debug_log,
259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
260 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
276 PL_regstartp[i] = -1;
283 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
286 * pregexec and friends
289 #ifndef PERL_IN_XSUB_RE
291 - pregexec - match a regexp against a string
294 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
295 char *strbeg, I32 minend, SV *screamer, U32 nosave)
296 /* strend: pointer to null at end of string */
297 /* strbeg: real beginning of string */
298 /* minend: end of match must be >=minend after stringarg. */
299 /* nosave: For optimizations. */
302 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
303 nosave ? 0 : REXEC_COPY_STR);
308 * Need to implement the following flags for reg_anch:
310 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
313 * INTUIT_AUTORITATIVE_ML
314 * INTUIT_ONCE_NOML - Intuit can match in one location only.
317 * Another flag for this function: SECOND_TIME (so that float substrs
318 * with giant delta may be not rechecked).
321 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
324 Otherwise, only SvCUR(sv) is used to get strbeg. */
326 /* XXXX We assume that strpos is strbeg unless sv. */
328 /* XXXX Some places assume that there is a fixed substring.
329 An update may be needed if optimizer marks as "INTUITable"
330 RExen without fixed substrings. Similarly, it is assumed that
331 lengths of all the strings are no more than minlen, thus they
332 cannot come from lookahead.
333 (Or minlen should take into account lookahead.) */
335 /* A failure to find a constant substring means that there is no need to make
336 an expensive call to REx engine, thus we celebrate a failure. Similarly,
337 finding a substring too deep into the string means that less calls to
338 regtry() should be needed.
340 REx compiler's optimizer found 4 possible hints:
341 a) Anchored substring;
343 c) Whether we are anchored (beginning-of-line or \G);
344 d) First node (of those at offset 0) which may distingush positions;
345 We use a)b)d) and multiline-part of c), and try to find a position in the
346 string which does not contradict any of them.
349 /* Most of decisions we do here should have been done at compile time.
350 The nodes of the REx which we used for the search should have been
351 deleted from the finite automaton. */
354 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
355 char *strend, U32 flags, re_scream_pos_data *data)
358 register I32 start_shift = 0;
359 /* Should be nonnegative! */
360 register I32 end_shift = 0;
365 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
367 register char *other_last = NULL; /* other substr checked before this */
368 char *check_at = NULL; /* check substr found at this pos */
369 const I32 multiline = prog->reganch & PMf_MULTILINE;
371 const char * const i_strpos = strpos;
374 GET_RE_DEBUG_FLAGS_DECL;
376 RX_MATCH_UTF8_set(prog,do_utf8);
378 if (prog->reganch & ROPT_UTF8) {
379 PL_reg_flags |= RF_utf8;
382 debug_start_match(prog, do_utf8, strpos, strend,
383 sv ? "Guessing start of match in sv for"
384 : "Guessing start of match in string for");
387 /* CHR_DIST() would be more correct here but it makes things slow. */
388 if (prog->minlen > strend - strpos) {
389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
390 "String too short... [re_intuit_start]\n"));
394 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
397 if (!prog->check_utf8 && prog->check_substr)
398 to_utf8_substr(prog);
399 check = prog->check_utf8;
401 if (!prog->check_substr && prog->check_utf8)
402 to_byte_substr(prog);
403 check = prog->check_substr;
405 if (check == &PL_sv_undef) {
406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
407 "Non-utf8 string cannot match utf8 check string\n"));
410 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
411 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
412 || ( (prog->reganch & ROPT_ANCH_BOL)
413 && !multiline ) ); /* Check after \n? */
416 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
417 | ROPT_IMPLICIT)) /* not a real BOL */
418 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
420 && (strpos != strbeg)) {
421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
424 if (prog->check_offset_min == prog->check_offset_max &&
425 !(prog->reganch & ROPT_CANY_SEEN)) {
426 /* Substring at constant offset from beg-of-str... */
429 s = HOP3c(strpos, prog->check_offset_min, strend);
432 slen = SvCUR(check); /* >= 1 */
434 if ( strend - s > slen || strend - s < slen - 1
435 || (strend - s == slen && strend[-1] != '\n')) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
439 /* Now should match s[0..slen-2] */
441 if (slen && (*SvPVX_const(check) != *s
443 && memNE(SvPVX_const(check), s, slen)))) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
449 else if (*SvPVX_const(check) != *s
450 || ((slen = SvCUR(check)) > 1
451 && memNE(SvPVX_const(check), s, slen)))
454 goto success_at_start;
457 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
459 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
460 end_shift = prog->check_end_shift;
463 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
464 - (SvTAIL(check) != 0);
465 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
467 if (end_shift < eshift)
471 else { /* Can match at random position */
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->check_end_shift;
477 /* end shift should be non negative here */
480 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
482 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
483 (IV)end_shift, prog->precomp);
487 /* Find a possible match in the region s..strend by looking for
488 the "check" substring in the region corrected by start/end_shift. */
491 I32 srch_start_shift = start_shift;
492 I32 srch_end_shift = end_shift;
493 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
494 srch_end_shift -= ((strbeg - s) - srch_start_shift);
495 srch_start_shift = strbeg - s;
497 DEBUG_OPTIMISE_MORE_r({
498 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
499 (IV)prog->check_offset_min,
500 (IV)srch_start_shift,
502 (IV)prog->check_end_shift);
505 if (flags & REXEC_SCREAM) {
506 I32 p = -1; /* Internal iterator of scream. */
507 I32 * const pp = data ? data->scream_pos : &p;
509 if (PL_screamfirst[BmRARE(check)] >= 0
510 || ( BmRARE(check) == '\n'
511 && (BmPREVIOUS(check) == SvCUR(check) - 1)
513 s = screaminstr(sv, check,
514 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
517 /* we may be pointing at the wrong string */
518 if (s && RX_MATCH_COPIED(prog))
519 s = strbeg + (s - SvPVX_const(sv));
521 *data->scream_olds = s;
526 if (prog->reganch & ROPT_CANY_SEEN) {
527 start_point= (U8*)(s + srch_start_shift);
528 end_point= (U8*)(strend - srch_end_shift);
530 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
531 end_point= HOP3(strend, -srch_end_shift, strbeg);
533 DEBUG_OPTIMISE_MORE_r({
534 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
535 (int)(end_point - start_point),
536 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
540 s = fbm_instr( start_point, end_point,
541 check, multiline ? FBMrf_MULTILINE : 0);
544 /* Update the count-of-usability, remove useless subpatterns,
548 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
549 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
550 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
551 (s ? "Found" : "Did not find"),
552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
553 ? "anchored" : "floating"),
556 (s ? " at offset " : "...\n") );
561 /* Finish the diagnostic message */
562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
564 /* XXX dmq: first branch is for positive lookbehind...
565 Our check string is offset from the beginning of the pattern.
566 So we need to do any stclass tests offset forward from that
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
578 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
584 : (prog->float_substr && prog->anchored_substr))
586 /* Take into account the "other" substring. */
587 /* XXXX May be hopelessly wrong for UTF... */
590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
593 char * const last = HOP3c(s, -start_shift, strbeg);
595 char * const saved_s = s;
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
606 t = HOP3c(t, prog->anchored_offset, strend);
607 if (t < other_last) /* These positions already checked */
609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
612 /* XXXX It is not documented what units *_offsets are in.
613 We assume bytes, but this is clearly wrong.
614 Meaning this code needs to be carefully reviewed for errors.
618 /* On end-of-str: see comment below. */
619 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
620 if (must == &PL_sv_undef) {
622 DEBUG_r(must = prog->anchored_utf8); /* for debug */
627 HOP3(HOP3(last1, prog->anchored_offset, strend)
628 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
630 multiline ? FBMrf_MULTILINE : 0
633 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
634 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
635 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
636 (s ? "Found" : "Contradicts"),
637 quoted, RE_SV_TAIL(must));
642 if (last1 >= last2) {
643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
644 ", giving up...\n"));
647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
648 ", trying floating at offset %ld...\n",
649 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
650 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
651 s = HOP3c(last, 1, strend);
655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
656 (long)(s - i_strpos)));
657 t = HOP3c(s, -prog->anchored_offset, strbeg);
658 other_last = HOP3c(s, 1, strend);
666 else { /* Take into account the floating substring. */
668 char * const saved_s = s;
671 t = HOP3c(s, -start_shift, strbeg);
673 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
674 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
675 last = HOP3c(t, prog->float_max_offset, strend);
676 s = HOP3c(t, prog->float_min_offset, strend);
679 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
680 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
681 /* fbm_instr() takes into account exact value of end-of-str
682 if the check is SvTAIL(ed). Since false positives are OK,
683 and end-of-str is not later than strend we are OK. */
684 if (must == &PL_sv_undef) {
686 DEBUG_r(must = prog->float_utf8); /* for debug message */
689 s = fbm_instr((unsigned char*)s,
690 (unsigned char*)last + SvCUR(must)
692 must, multiline ? FBMrf_MULTILINE : 0);
694 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
695 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
696 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
697 (s ? "Found" : "Contradicts"),
698 quoted, RE_SV_TAIL(must));
702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
703 ", giving up...\n"));
706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
707 ", trying anchored starting at offset %ld...\n",
708 (long)(saved_s + 1 - i_strpos)));
710 s = HOP3c(t, 1, strend);
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
715 (long)(s - i_strpos)));
716 other_last = s; /* Fix this later. --Hugo */
726 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
728 DEBUG_OPTIMISE_MORE_r(
729 PerlIO_printf(Perl_debug_log,
730 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
731 (IV)prog->check_offset_min,
732 (IV)prog->check_offset_max,
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
748 if (ml_anch && t[-1] != '\n') {
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
756 while (t < strend - prog->minlen) {
758 if (t < check_at - prog->check_offset_min) {
759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
769 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
770 goto do_other_anchored;
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
776 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
783 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
784 other_last = strpos = s = t + 1;
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
790 PL_colors[0], PL_colors[1]));
794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
795 PL_colors[0], PL_colors[1]));
799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
802 /* The found string does not prohibit matching at strpos,
803 - no optimization of calling REx engine can be performed,
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
810 && (strpos != strbeg) && strpos[-1] != '\n'
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
817 DEBUG_EXECUTE_r( if (ml_anch)
818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
819 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
833 /* If flags & SOMETHING - do not do it many times on the same match */
834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
838 prog->check_substr = prog->check_utf8 = NULL; /* disable */
839 prog->float_substr = prog->float_utf8 = NULL; /* clear */
840 check = NULL; /* abort */
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
845 prog->reganch &= ~RE_USE_INTUIT;
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
853 /* trie stclasses are too expensive to use here, we are better off to
854 leave it to regmatch itself */
855 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
864 const U8* const str = (U8*)STRING(prog->regstclass);
865 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
869 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
871 else if (prog->float_substr || prog->float_utf8)
872 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
877 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
880 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
883 const char *what = NULL;
885 if (endpos == strend) {
886 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
887 "Could not match STCLASS...\n") );
890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
891 "This position contradicts STCLASS...\n") );
892 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
894 /* Contradict one of substrings */
895 if (prog->anchored_substr || prog->anchored_utf8) {
896 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
897 DEBUG_EXECUTE_r( what = "anchored" );
899 s = HOP3c(t, 1, strend);
900 if (s + start_shift + end_shift > strend) {
901 /* XXXX Should be taken into account earlier? */
902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
903 "Could not match STCLASS...\n") );
908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
909 "Looking for %s substr starting at offset %ld...\n",
910 what, (long)(s + start_shift - i_strpos)) );
913 /* Have both, check_string is floating */
914 if (t + start_shift >= check_at) /* Contradicts floating=check */
915 goto retry_floating_check;
916 /* Recheck anchored substring, but not floating... */
920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
921 "Looking for anchored substr starting at offset %ld...\n",
922 (long)(other_last - i_strpos)) );
923 goto do_other_anchored;
925 /* Another way we could have checked stclass at the
926 current position only: */
931 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
932 "Looking for /%s^%s/m starting at offset %ld...\n",
933 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
936 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
938 /* Check is floating subtring. */
939 retry_floating_check:
940 t = check_at - start_shift;
941 DEBUG_EXECUTE_r( what = "floating" );
942 goto hop_and_restart;
945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946 "By STCLASS: moving %ld --> %ld\n",
947 (long)(t - i_strpos), (long)(s - i_strpos))
951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
952 "Does not contradict STCLASS...\n");
957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
958 PL_colors[4], (check ? "Guessed" : "Giving up"),
959 PL_colors[5], (long)(s - i_strpos)) );
962 fail_finish: /* Substring not found */
963 if (prog->check_substr || prog->check_utf8) /* could be removed already */
964 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
967 PL_colors[4], PL_colors[5]));
973 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
974 foldlen, foldbuf, uniflags) STMT_START { \
975 switch (trie_type) { \
976 case trie_utf8_fold: \
978 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
983 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
985 foldlen -= UNISKIP( uvc ); \
986 uscan = foldbuf + UNISKIP( uvc ); \
990 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
998 charid = trie->charmap[ uvc ]; \
1002 if (trie->widecharmap) { \
1003 SV** const svpp = hv_fetch(trie->widecharmap, \
1004 (char*)&uvc, sizeof(UV), 0); \
1006 charid = (U16)SvIV(*svpp); \
1011 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1014 ibcmp_utf8(s, NULL, 0, do_utf8, \
1015 m, NULL, ln, (bool)UTF)) \
1016 && (!reginfo || regtry(reginfo, s)) ) \
1019 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1020 uvchr_to_utf8(tmpbuf, c); \
1021 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1023 && (f == c1 || f == c2) \
1024 && (ln == foldlen || \
1025 !ibcmp_utf8((char *) foldbuf, \
1026 NULL, foldlen, do_utf8, \
1028 NULL, ln, (bool)UTF)) \
1029 && (!reginfo || regtry(reginfo, s)) ) \
1034 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1038 && (ln == 1 || !(OP(c) == EXACTF \
1040 : ibcmp_locale(s, m, ln))) \
1041 && (!reginfo || regtry(reginfo, s)) ) \
1047 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1049 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1055 #define REXEC_FBC_SCAN(CoDe) \
1057 while (s < strend) { \
1063 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1064 REXEC_FBC_UTF8_SCAN( \
1066 if (tmp && (!reginfo || regtry(reginfo, s))) \
1075 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1078 if (tmp && (!reginfo || regtry(reginfo, s))) \
1087 #define REXEC_FBC_TRYIT \
1088 if ((!reginfo || regtry(reginfo, s))) \
1091 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1094 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1097 REXEC_FBC_CLASS_SCAN(CoNd); \
1101 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1102 PL_reg_flags |= RF_tainted; \
1104 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1107 REXEC_FBC_CLASS_SCAN(CoNd); \
1111 #define DUMP_EXEC_POS(li,s,doutf8) \
1112 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1114 /* We know what class REx starts with. Try to find this position... */
1115 /* if reginfo is NULL, its a dryrun */
1116 /* annoyingly all the vars in this routine have different names from their counterparts
1117 in regmatch. /grrr */
1120 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1121 const char *strend, const regmatch_info *reginfo)
1124 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1128 register STRLEN uskip;
1132 register I32 tmp = 1; /* Scratch variable? */
1133 register const bool do_utf8 = PL_reg_match_utf8;
1135 /* We know what class it must start with. */
1139 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1140 !UTF8_IS_INVARIANT((U8)s[0]) ?
1141 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1142 REGINCLASS(prog, c, (U8*)s));
1145 while (s < strend) {
1148 if (REGINCLASS(prog, c, (U8*)s) ||
1149 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1150 /* The assignment of 2 is intentional:
1151 * for the folded sharp s, the skip is 2. */
1152 (skip = SHARP_S_SKIP))) {
1153 if (tmp && (!reginfo || regtry(reginfo, s)))
1166 if (tmp && (!reginfo || regtry(reginfo, s)))
1174 ln = STR_LEN(c); /* length to match in octets/bytes */
1175 lnc = (I32) ln; /* length to match in characters */
1177 STRLEN ulen1, ulen2;
1179 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1180 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1181 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1183 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1184 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1186 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1188 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1191 while (sm < ((U8 *) m + ln)) {
1206 c2 = PL_fold_locale[c1];
1208 e = HOP3c(strend, -((I32)lnc), s);
1210 if (!reginfo && e < s)
1211 e = s; /* Due to minlen logic of intuit() */
1213 /* The idea in the EXACTF* cases is to first find the
1214 * first character of the EXACTF* node and then, if
1215 * necessary, case-insensitively compare the full
1216 * text of the node. The c1 and c2 are the first
1217 * characters (though in Unicode it gets a bit
1218 * more complicated because there are more cases
1219 * than just upper and lower: one needs to use
1220 * the so-called folding case for case-insensitive
1221 * matching (called "loose matching" in Unicode).
1222 * ibcmp_utf8() will do just that. */
1226 U8 tmpbuf [UTF8_MAXBYTES+1];
1227 STRLEN len, foldlen;
1228 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1230 /* Upper and lower of 1st char are equal -
1231 * probably not a "letter". */
1233 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1235 REXEC_FBC_EXACTISH_CHECK(c == c1);
1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1243 /* Handle some of the three Greek sigmas cases.
1244 * Note that not all the possible combinations
1245 * are handled here: some of them are handled
1246 * by the standard folding rules, and some of
1247 * them (the character class or ANYOF cases)
1248 * are handled during compiletime in
1249 * regexec.c:S_regclass(). */
1250 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1251 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1252 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1254 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1260 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1262 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1266 PL_reg_flags |= RF_tainted;
1273 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1274 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1276 tmp = ((OP(c) == BOUND ?
1277 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1278 LOAD_UTF8_CHARCLASS_ALNUM();
1279 REXEC_FBC_UTF8_SCAN(
1280 if (tmp == !(OP(c) == BOUND ?
1281 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1282 isALNUM_LC_utf8((U8*)s)))
1290 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1291 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1294 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1300 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1304 PL_reg_flags |= RF_tainted;
1311 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1312 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1314 tmp = ((OP(c) == NBOUND ?
1315 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1316 LOAD_UTF8_CHARCLASS_ALNUM();
1317 REXEC_FBC_UTF8_SCAN(
1318 if (tmp == !(OP(c) == NBOUND ?
1319 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1320 isALNUM_LC_utf8((U8*)s)))
1322 else REXEC_FBC_TRYIT;
1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1327 tmp = ((OP(c) == NBOUND ?
1328 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1331 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1333 else REXEC_FBC_TRYIT;
1336 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1340 REXEC_FBC_CSCAN_PRELOAD(
1341 LOAD_UTF8_CHARCLASS_ALNUM(),
1342 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1346 REXEC_FBC_CSCAN_TAINT(
1347 isALNUM_LC_utf8((U8*)s),
1351 REXEC_FBC_CSCAN_PRELOAD(
1352 LOAD_UTF8_CHARCLASS_ALNUM(),
1353 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1357 REXEC_FBC_CSCAN_TAINT(
1358 !isALNUM_LC_utf8((U8*)s),
1362 REXEC_FBC_CSCAN_PRELOAD(
1363 LOAD_UTF8_CHARCLASS_SPACE(),
1364 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1368 REXEC_FBC_CSCAN_TAINT(
1369 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1373 REXEC_FBC_CSCAN_PRELOAD(
1374 LOAD_UTF8_CHARCLASS_SPACE(),
1375 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1379 REXEC_FBC_CSCAN_TAINT(
1380 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1384 REXEC_FBC_CSCAN_PRELOAD(
1385 LOAD_UTF8_CHARCLASS_DIGIT(),
1386 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1390 REXEC_FBC_CSCAN_TAINT(
1391 isDIGIT_LC_utf8((U8*)s),
1395 REXEC_FBC_CSCAN_PRELOAD(
1396 LOAD_UTF8_CHARCLASS_DIGIT(),
1397 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1401 REXEC_FBC_CSCAN_TAINT(
1402 !isDIGIT_LC_utf8((U8*)s),
1408 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1409 trie_type = do_utf8 ?
1410 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1412 /* what trie are we using right now */
1414 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1415 reg_trie_data *trie=aho->trie;
1417 const char *last_start = strend - trie->minlen;
1419 const char *real_start = s;
1421 STRLEN maxlen = trie->maxlen;
1423 U8 **points; /* map of where we were in the input string
1424 when reading a given char. For ASCII this
1425 is unnecessary overhead as the relationship
1426 is always 1:1, but for unicode, especially
1427 case folded unicode this is not true. */
1428 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1432 GET_RE_DEBUG_FLAGS_DECL;
1434 /* We can't just allocate points here. We need to wrap it in
1435 * an SV so it gets freed properly if there is a croak while
1436 * running the match */
1439 sv_points=newSV(maxlen * sizeof(U8 *));
1440 SvCUR_set(sv_points,
1441 maxlen * sizeof(U8 *));
1442 SvPOK_on(sv_points);
1443 sv_2mortal(sv_points);
1444 points=(U8**)SvPV_nolen(sv_points );
1445 if ( trie_type != trie_utf8_fold
1446 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1449 bitmap=(U8*)trie->bitmap;
1451 bitmap=(U8*)ANYOF_BITMAP(c);
1453 /* this is the Aho-Corasick algorithm modified a touch
1454 to include special handling for long "unknown char"
1455 sequences. The basic idea being that we use AC as long
1456 as we are dealing with a possible matching char, when
1457 we encounter an unknown char (and we have not encountered
1458 an accepting state) we scan forward until we find a legal
1460 AC matching is basically that of trie matching, except
1461 that when we encounter a failing transition, we fall back
1462 to the current states "fail state", and try the current char
1463 again, a process we repeat until we reach the root state,
1464 state 1, or a legal transition. If we fail on the root state
1465 then we can either terminate if we have reached an accepting
1466 state previously, or restart the entire process from the beginning
1470 while (s <= last_start) {
1471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1479 U8 *uscan = (U8*)NULL;
1480 U8 *leftmost = NULL;
1482 U32 accepted_word= 0;
1486 while ( state && uc <= (U8*)strend ) {
1488 U32 word = aho->states[ state ].wordnum;
1492 DEBUG_TRIE_EXECUTE_r(
1493 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1494 dump_exec_pos( (char *)uc, c, strend, real_start,
1495 (char *)uc, do_utf8 );
1496 PerlIO_printf( Perl_debug_log,
1497 " Scanning for legal start char...\n");
1500 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1505 if (uc >(U8*)last_start) break;
1509 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1510 if (!leftmost || lpos < leftmost) {
1511 DEBUG_r(accepted_word=word);
1517 points[pointpos++ % maxlen]= uc;
1518 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1519 uvc, charid, foldlen, foldbuf, uniflags);
1520 DEBUG_TRIE_EXECUTE_r({
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1523 PerlIO_printf(Perl_debug_log,
1524 " Charid:%3u CP:%4"UVxf" ",
1530 word = aho->states[ state ].wordnum;
1532 base = aho->states[ state ].trans.base;
1534 DEBUG_TRIE_EXECUTE_r({
1536 dump_exec_pos( (char *)uc, c, strend, real_start,
1538 PerlIO_printf( Perl_debug_log,
1539 "%sState: %4"UVxf", word=%"UVxf,
1540 failed ? " Fail transition to " : "",
1541 (UV)state, (UV)word);
1546 (base + charid > trie->uniquecharcount )
1547 && (base + charid - 1 - trie->uniquecharcount
1549 && trie->trans[base + charid - 1 -
1550 trie->uniquecharcount].check == state
1551 && (tmp=trie->trans[base + charid - 1 -
1552 trie->uniquecharcount ].next))
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - legal\n"));
1560 DEBUG_TRIE_EXECUTE_r(
1561 PerlIO_printf( Perl_debug_log," - fail\n"));
1563 state = aho->fail[state];
1567 /* we must be accepting here */
1568 DEBUG_TRIE_EXECUTE_r(
1569 PerlIO_printf( Perl_debug_log," - accepting\n"));
1578 if (!state) state = 1;
1581 if ( aho->states[ state ].wordnum ) {
1582 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1583 if (!leftmost || lpos < leftmost) {
1584 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1589 s = (char*)leftmost;
1590 DEBUG_TRIE_EXECUTE_r({
1592 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1593 (UV)accepted_word, s - real_start
1596 if (!reginfo || regtry(reginfo, s)) {
1602 DEBUG_TRIE_EXECUTE_r({
1603 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1606 DEBUG_TRIE_EXECUTE_r(
1607 PerlIO_printf( Perl_debug_log,"No match.\n"));
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1625 - regexec_flags - match a regexp against a string
1628 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630 /* strend: pointer to null at end of string */
1631 /* strbeg: real beginning of string */
1632 /* minend: end of match must be >=minend after stringarg. */
1633 /* data: May be used for some additional optimizations. */
1634 /* nosave: For optimizations. */
1638 register regnode *c;
1639 register char *startpos = stringarg;
1640 I32 minlen; /* must match at least this many chars */
1641 I32 dontbother = 0; /* how many characters not to try at end */
1642 I32 end_shift = 0; /* Same for the end. */ /* CC */
1643 I32 scream_pos = -1; /* Internal iterator of scream. */
1644 char *scream_olds = NULL;
1645 SV* const oreplsv = GvSV(PL_replgv);
1646 const bool do_utf8 = (bool)DO_UTF8(sv);
1649 regmatch_info reginfo; /* create some info to pass to regtry etc */
1651 GET_RE_DEBUG_FLAGS_DECL;
1653 PERL_UNUSED_ARG(data);
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1661 multiline = prog->reganch & PMf_MULTILINE;
1662 reginfo.prog = prog;
1664 RX_MATCH_UTF8_set(prog, do_utf8);
1666 debug_start_match(prog, do_utf8, startpos, strend,
1670 minlen = prog->minlen;
1672 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1674 "String too short [regexec_flags]...\n"));
1679 /* Check validity of program. */
1680 if (UCHARAT(prog->program) != REG_MAGIC) {
1681 Perl_croak(aTHX_ "corrupted regexp program");
1685 PL_reg_eval_set = 0;
1688 if (prog->reganch & ROPT_UTF8)
1689 PL_reg_flags |= RF_utf8;
1691 /* Mark beginning of line for ^ and lookbehind. */
1692 reginfo.bol = startpos; /* XXX not used ??? */
1696 /* Mark end of line for $ (and such) */
1699 /* see how far we have to get to not match where we matched before */
1700 reginfo.till = startpos+minend;
1702 /* If there is a "must appear" string, look for it. */
1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1709 reginfo.ganch = startpos;
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
1714 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1715 if (prog->reganch & ROPT_ANCH_GPOS) {
1716 if (s > reginfo.ganch)
1721 else /* pos() not defined */
1722 reginfo.ganch = strbeg;
1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1726 re_scream_pos_data d;
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1733 goto phooey; /* not present */
1739 /* Simplest case: anchored match need be tried only once. */
1740 /* [unless only anchor is BOL and multiline is set] */
1741 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1742 if (s == startpos && regtry(®info, startpos))
1744 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1745 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1750 dontbother = minlen - 1;
1751 end = HOP3c(strend, -dontbother, strbeg) - 1;
1752 /* for multiline we only have to try after newlines */
1753 if (prog->check_substr || prog->check_utf8) {
1757 if (regtry(®info, s))
1762 if (prog->reganch & RE_USE_INTUIT) {
1763 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1774 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1775 if (regtry(®info, s))
1782 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1784 /* the warning about reginfo.ganch being used without intialization
1785 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1786 and we only enter this block when the same bit is set. */
1787 if (regtry(®info, reginfo.ganch))
1792 /* Messy cases: unanchored match. */
1793 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1794 /* we have /x+whatever/ */
1795 /* it must be a one character string (XXXX Except UTF?) */
1800 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1801 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1802 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1807 DEBUG_EXECUTE_r( did_match = 1 );
1808 if (regtry(®info, s)) goto got_it;
1810 while (s < strend && *s == ch)
1818 DEBUG_EXECUTE_r( did_match = 1 );
1819 if (regtry(®info, s)) goto got_it;
1821 while (s < strend && *s == ch)
1826 DEBUG_EXECUTE_r(if (!did_match)
1827 PerlIO_printf(Perl_debug_log,
1828 "Did not find anchored character...\n")
1831 else if (prog->anchored_substr != NULL
1832 || prog->anchored_utf8 != NULL
1833 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1834 && prog->float_max_offset < strend - s)) {
1839 char *last1; /* Last position checked before */
1843 if (prog->anchored_substr || prog->anchored_utf8) {
1844 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1847 back_max = back_min = prog->anchored_offset;
1849 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1850 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1851 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1852 back_max = prog->float_max_offset;
1853 back_min = prog->float_min_offset;
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1864 last = HOP3c(strend, /* Cannot start after this */
1865 -(I32)(CHR_SVLEN(must)
1866 - (SvTAIL(must) != 0) + back_min), strbeg);
1869 last1 = HOPc(s, -1);
1871 last1 = s - 1; /* bogus */
1873 /* XXXX check_substr already used to find "s", can optimize if
1874 check_substr==must. */
1876 dontbother = end_shift;
1877 strend = HOPc(strend, -dontbother);
1878 while ( (s <= last) &&
1879 ((flags & REXEC_SCREAM)
1880 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1881 end_shift, &scream_pos, 0))
1882 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1883 (unsigned char*)strend, must,
1884 multiline ? FBMrf_MULTILINE : 0))) ) {
1885 /* we may be pointing at the wrong string */
1886 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1887 s = strbeg + (s - SvPVX_const(sv));
1888 DEBUG_EXECUTE_r( did_match = 1 );
1889 if (HOPc(s, -back_max) > last1) {
1890 last1 = HOPc(s, -back_min);
1891 s = HOPc(s, -back_max);
1894 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1896 last1 = HOPc(s, -back_min);
1900 while (s <= last1) {
1901 if (regtry(®info, s))
1907 while (s <= last1) {
1908 if (regtry(®info, s))
1914 DEBUG_EXECUTE_r(if (!did_match) {
1915 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1916 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1917 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1918 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1919 ? "anchored" : "floating"),
1920 quoted, RE_SV_TAIL(must));
1924 else if ( (c = prog->regstclass) ) {
1926 const OPCODE op = OP(prog->regstclass);
1927 /* don't bother with what can't match */
1928 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1929 strend = HOPc(strend, -(minlen - 1));
1932 SV * const prop = sv_newmortal();
1933 regprop(prog, prop, c);
1935 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1937 PerlIO_printf(Perl_debug_log,
1938 "Matching stclass %.*s against %s (%d chars)\n",
1939 (int)SvCUR(prop), SvPVX_const(prop),
1940 quoted, (int)(strend - s));
1943 if (find_byclass(prog, c, s, strend, ®info))
1945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1949 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1954 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1955 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1956 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1958 if (flags & REXEC_SCREAM) {
1959 last = screaminstr(sv, float_real, s - strbeg,
1960 end_shift, &scream_pos, 1); /* last one */
1962 last = scream_olds; /* Only one occurrence. */
1963 /* we may be pointing at the wrong string */
1964 else if (RX_MATCH_COPIED(prog))
1965 s = strbeg + (s - SvPVX_const(sv));
1969 const char * const little = SvPV_const(float_real, len);
1971 if (SvTAIL(float_real)) {
1972 if (memEQ(strend - len + 1, little, len - 1))
1973 last = strend - len + 1;
1974 else if (!multiline)
1975 last = memEQ(strend - len, little, len)
1976 ? strend - len : NULL;
1982 last = rninstr(s, strend, little, little + len);
1984 last = strend; /* matching "$" */
1989 PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
1992 goto phooey; /* Should not happen! */
1994 dontbother = strend - last + prog->float_min_offset;
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
2002 if (regtry(®info, s))
2011 if (regtry(®info, s))
2013 } while (s++ < strend);
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2029 restore_pos(aTHX_ prog);
2031 if (prog->paren_names)
2032 (void)hv_iterinit(prog->paren_names);
2034 /* make sure $`, $&, $', and $digit will work later */
2035 if ( !(flags & REXEC_NOT_FIRST) ) {
2036 RX_MATCH_COPY_FREE(prog);
2037 if (flags & REXEC_COPY_STR) {
2038 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2039 #ifdef PERL_OLD_COPY_ON_WRITE
2041 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2043 PerlIO_printf(Perl_debug_log,
2044 "Copy on write: regexp capture, type %d\n",
2047 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2048 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2049 assert (SvPOKp(prog->saved_copy));
2053 RX_MATCH_COPIED_on(prog);
2054 s = savepvn(strbeg, i);
2060 prog->subbeg = strbeg;
2061 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2068 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2069 PL_colors[4], PL_colors[5]));
2070 if (PL_reg_eval_set)
2071 restore_pos(aTHX_ prog);
2077 - regtry - try match at specific point
2079 STATIC I32 /* 0 failure, 1 success */
2080 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2086 regexp *prog = reginfo->prog;
2087 GET_RE_DEBUG_FLAGS_DECL;
2089 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2092 PL_reg_eval_set = RS_init;
2093 DEBUG_EXECUTE_r(DEBUG_s(
2094 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2095 (IV)(PL_stack_sp - PL_stack_base));
2097 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2098 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2099 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2101 /* Apparently this is not needed, judging by wantarray. */
2102 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2103 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2106 /* Make $_ available to executed code. */
2107 if (reginfo->sv != DEFSV) {
2109 DEFSV = reginfo->sv;
2112 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2113 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2114 /* prepare for quick setting of pos */
2115 #ifdef PERL_OLD_COPY_ON_WRITE
2117 sv_force_normal_flags(sv, 0);
2119 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2120 &PL_vtbl_mglob, NULL, 0);
2124 PL_reg_oldpos = mg->mg_len;
2125 SAVEDESTRUCTOR_X(restore_pos, prog);
2127 if (!PL_reg_curpm) {
2128 Newxz(PL_reg_curpm, 1, PMOP);
2131 SV* const repointer = newSViv(0);
2132 /* so we know which PL_regex_padav element is PL_reg_curpm */
2133 SvFLAGS(repointer) |= SVf_BREAK;
2134 av_push(PL_regex_padav,repointer);
2135 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2136 PL_regex_pad = AvARRAY(PL_regex_padav);
2140 PM_SETRE(PL_reg_curpm, prog);
2141 PL_reg_oldcurpm = PL_curpm;
2142 PL_curpm = PL_reg_curpm;
2143 if (RX_MATCH_COPIED(prog)) {
2144 /* Here is a serious problem: we cannot rewrite subbeg,
2145 since it may be needed if this match fails. Thus
2146 $` inside (?{}) could fail... */
2147 PL_reg_oldsaved = prog->subbeg;
2148 PL_reg_oldsavedlen = prog->sublen;
2149 #ifdef PERL_OLD_COPY_ON_WRITE
2150 PL_nrs = prog->saved_copy;
2152 RX_MATCH_COPIED_off(prog);
2155 PL_reg_oldsaved = NULL;
2156 prog->subbeg = PL_bostr;
2157 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2159 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2160 prog->startp[0] = startpos - PL_bostr;
2161 PL_reginput = startpos;
2162 PL_reglastparen = &prog->lastparen;
2163 PL_reglastcloseparen = &prog->lastcloseparen;
2164 prog->lastparen = 0;
2165 prog->lastcloseparen = 0;
2167 PL_regstartp = prog->startp;
2168 PL_regendp = prog->endp;
2169 if (PL_reg_start_tmpl <= prog->nparens) {
2170 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2171 if(PL_reg_start_tmp)
2172 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2177 /* XXXX What this code is doing here?!!! There should be no need
2178 to do this again and again, PL_reglastparen should take care of
2181 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2182 * Actually, the code in regcppop() (which Ilya may be meaning by
2183 * PL_reglastparen), is not needed at all by the test suite
2184 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2185 * enough, for building DynaLoader, or otherwise this
2186 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2187 * will happen. Meanwhile, this code *is* needed for the
2188 * above-mentioned test suite tests to succeed. The common theme
2189 * on those tests seems to be returning null fields from matches.
2194 if (prog->nparens) {
2196 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2203 if (regmatch(reginfo, prog->program + 1)) {
2204 prog->endp[0] = PL_reginput - PL_bostr;
2207 REGCP_UNWIND(lastcp);
2212 #define sayYES goto yes
2213 #define sayNO goto no
2214 #define sayNO_SILENT goto no_silent
2216 /* we dont use STMT_START/END here because it leads to
2217 "unreachable code" warnings, which are bogus, but distracting. */
2218 #define CACHEsayNO \
2219 if (ST.cache_mask) \
2220 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2223 /* this is used to determine how far from the left messages like
2224 'failed...' are printed. It should be set such that messages
2225 are inline with the regop output that created them.
2227 #define REPORT_CODE_OFF 32
2230 /* Make sure there is a test for this +1 options in re_tests */
2231 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2233 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2234 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2236 #define SLAB_FIRST(s) (&(s)->states[0])
2237 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2239 /* grab a new slab and return the first slot in it */
2241 STATIC regmatch_state *
2244 #if PERL_VERSION < 9
2247 regmatch_slab *s = PL_regmatch_slab->next;
2249 Newx(s, 1, regmatch_slab);
2250 s->prev = PL_regmatch_slab;
2252 PL_regmatch_slab->next = s;
2254 PL_regmatch_slab = s;
2255 return SLAB_FIRST(s);
2259 /* push a new state then goto it */
2261 #define PUSH_STATE_GOTO(state, node) \
2263 st->resume_state = state; \
2266 /* push a new state with success backtracking, then goto it */
2268 #define PUSH_YES_STATE_GOTO(state, node) \
2270 st->resume_state = state; \
2271 goto push_yes_state;
2277 regmatch() - main matching routine
2279 This is basically one big switch statement in a loop. We execute an op,
2280 set 'next' to point the next op, and continue. If we come to a point which
2281 we may need to backtrack to on failure such as (A|B|C), we push a
2282 backtrack state onto the backtrack stack. On failure, we pop the top
2283 state, and re-enter the loop at the state indicated. If there are no more
2284 states to pop, we return failure.
2286 Sometimes we also need to backtrack on success; for example /A+/, where
2287 after successfully matching one A, we need to go back and try to
2288 match another one; similarly for lookahead assertions: if the assertion
2289 completes successfully, we backtrack to the state just before the assertion
2290 and then carry on. In these cases, the pushed state is marked as
2291 'backtrack on success too'. This marking is in fact done by a chain of
2292 pointers, each pointing to the previous 'yes' state. On success, we pop to
2293 the nearest yes state, discarding any intermediate failure-only states.
2294 Sometimes a yes state is pushed just to force some cleanup code to be
2295 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2296 it to free the inner regex.
2298 Note that failure backtracking rewinds the cursor position, while
2299 success backtracking leaves it alone.
2301 A pattern is complete when the END op is executed, while a subpattern
2302 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2303 ops trigger the "pop to last yes state if any, otherwise return true"
2306 A common convention in this function is to use A and B to refer to the two
2307 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2308 the subpattern to be matched possibly multiple times, while B is the entire
2309 rest of the pattern. Variable and state names reflect this convention.
2311 The states in the main switch are the union of ops and failure/success of
2312 substates associated with with that op. For example, IFMATCH is the op
2313 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2314 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2315 successfully matched A and IFMATCH_A_fail is a state saying that we have
2316 just failed to match A. Resume states always come in pairs. The backtrack
2317 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2318 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2319 on success or failure.
2321 The struct that holds a backtracking state is actually a big union, with
2322 one variant for each major type of op. The variable st points to the
2323 top-most backtrack struct. To make the code clearer, within each
2324 block of code we #define ST to alias the relevant union.
2326 Here's a concrete example of a (vastly oversimplified) IFMATCH
2332 #define ST st->u.ifmatch
2334 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2335 ST.foo = ...; // some state we wish to save
2337 // push a yes backtrack state with a resume value of
2338 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2340 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2343 case IFMATCH_A: // we have successfully executed A; now continue with B
2345 bar = ST.foo; // do something with the preserved value
2348 case IFMATCH_A_fail: // A failed, so the assertion failed
2349 ...; // do some housekeeping, then ...
2350 sayNO; // propagate the failure
2357 For any old-timers reading this who are familiar with the old recursive
2358 approach, the code above is equivalent to:
2360 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2369 ...; // do some housekeeping, then ...
2370 sayNO; // propagate the failure
2373 The topmost backtrack state, pointed to by st, is usually free. If you
2374 want to claim it, populate any ST.foo fields in it with values you wish to
2375 save, then do one of
2377 PUSH_STATE_GOTO(resume_state, node);
2378 PUSH_YES_STATE_GOTO(resume_state, node);
2380 which sets that backtrack state's resume value to 'resume_state', pushes a
2381 new free entry to the top of the backtrack stack, then goes to 'node'.
2382 On backtracking, the free slot is popped, and the saved state becomes the
2383 new free state. An ST.foo field in this new top state can be temporarily
2384 accessed to retrieve values, but once the main loop is re-entered, it
2385 becomes available for reuse.
2387 Note that the depth of the backtrack stack constantly increases during the
2388 left-to-right execution of the pattern, rather than going up and down with
2389 the pattern nesting. For example the stack is at its maximum at Z at the
2390 end of the pattern, rather than at X in the following:
2392 /(((X)+)+)+....(Y)+....Z/
2394 The only exceptions to this are lookahead/behind assertions and the cut,
2395 (?>A), which pop all the backtrack states associated with A before
2398 Bascktrack state structs are allocated in slabs of about 4K in size.
2399 PL_regmatch_state and st always point to the currently active state,
2400 and PL_regmatch_slab points to the slab currently containing
2401 PL_regmatch_state. The first time regmatch() is called, the first slab is
2402 allocated, and is never freed until interpreter destruction. When the slab
2403 is full, a new one is allocated and chained to the end. At exit from
2404 regmatch(), slabs allocated since entry are freed.
2409 #define DEBUG_STATE_pp(pp) \
2411 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2412 PerlIO_printf(Perl_debug_log, \
2415 reg_name[st->resume_state] ); \
2419 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2424 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2425 const char *start, const char *end, const char *blurb)
2427 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2431 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2432 prog->precomp, prog->prelen, 60);
2434 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2435 start, end - start, 60);
2437 PerlIO_printf(Perl_debug_log,
2438 "%s%s REx%s %s against %s\n",
2439 PL_colors[4], blurb, PL_colors[5], s0, s1);
2441 if (do_utf8||utf8_pat)
2442 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2443 utf8_pat ? "pattern" : "",
2444 utf8_pat && do_utf8 ? " and " : "",
2445 do_utf8 ? "string" : ""
2451 S_dump_exec_pos(pTHX_ const char *locinput,
2452 const regnode *scan,
2453 const char *loc_regeol,
2454 const char *loc_bostr,
2455 const char *loc_reg_starttry,
2458 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2459 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2460 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2461 /* The part of the string before starttry has one color
2462 (pref0_len chars), between starttry and current
2463 position another one (pref_len - pref0_len chars),
2464 after the current position the third one.
2465 We assume that pref0_len <= pref_len, otherwise we
2466 decrease pref0_len. */
2467 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2468 ? (5 + taill) - l : locinput - loc_bostr;
2471 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2473 pref0_len = pref_len - (locinput - loc_reg_starttry);
2474 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2475 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2476 ? (5 + taill) - pref_len : loc_regeol - locinput);
2477 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2481 if (pref0_len > pref_len)
2482 pref0_len = pref_len;
2484 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2486 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2487 (locinput - pref_len),pref0_len, 60, 4, 5);
2489 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2490 (locinput - pref_len + pref0_len),
2491 pref_len - pref0_len, 60, 2, 3);
2493 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2494 locinput, loc_regeol - locinput, 10, 0, 1);
2496 const STRLEN tlen=len0+len1+len2;
2497 PerlIO_printf(Perl_debug_log,
2498 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2499 (IV)(locinput - loc_bostr),
2502 (docolor ? "" : "> <"),
2504 (int)(tlen > 19 ? 0 : 19 - tlen),
2511 STATIC I32 /* 0 failure, 1 success */
2512 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2514 #if PERL_VERSION < 9
2518 register const bool do_utf8 = PL_reg_match_utf8;
2519 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2521 regexp *rex = reginfo->prog;
2523 regmatch_slab *orig_slab;
2524 regmatch_state *orig_state;
2526 /* the current state. This is a cached copy of PL_regmatch_state */
2527 register regmatch_state *st;
2529 /* cache heavy used fields of st in registers */
2530 register regnode *scan;
2531 register regnode *next;
2532 register I32 n = 0; /* general value; init to avoid compiler warning */
2533 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2534 register char *locinput = PL_reginput;
2535 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2537 bool result = 0; /* return value of S_regmatch */
2538 int depth = 0; /* depth of backtrack stack */
2539 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
2540 regmatch_state *yes_state = NULL; /* state to pop to on success of
2542 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2543 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2546 /* these three flags are set by various ops to signal information to
2547 * the very next op. They have a useful lifetime of exactly one loop
2548 * iteration, and are not preserved or restored by state pushes/pops
2550 bool sw = 0; /* the condition value in (?(cond)a|b) */
2551 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2552 int logical = 0; /* the following EVAL is:
2556 or the following IFMATCH/UNLESSM is:
2557 false: plain (?=foo)
2558 true: used as a condition: (?(?=foo))
2562 GET_RE_DEBUG_FLAGS_DECL;
2565 /* on first ever call to regmatch, allocate first slab */
2566 if (!PL_regmatch_slab) {
2567 Newx(PL_regmatch_slab, 1, regmatch_slab);
2568 PL_regmatch_slab->prev = NULL;
2569 PL_regmatch_slab->next = NULL;
2570 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2573 /* remember current high-water mark for exit */
2574 /* XXX this should be done with SAVE* instead */
2575 orig_slab = PL_regmatch_slab;
2576 orig_state = PL_regmatch_state;
2578 /* grab next free state slot */
2579 st = ++PL_regmatch_state;
2580 if (st > SLAB_LAST(PL_regmatch_slab))
2581 st = PL_regmatch_state = S_push_slab(aTHX);
2583 /* Note that nextchr is a byte even in UTF */
2584 nextchr = UCHARAT(locinput);
2586 while (scan != NULL) {
2589 SV * const prop = sv_newmortal();
2590 regnode *rnext=regnext(scan);
2591 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2592 regprop(rex, prop, scan);
2594 PerlIO_printf(Perl_debug_log,
2595 "%3"IVdf":%*s%s(%"IVdf")\n",
2596 (IV)(scan - rex->program), depth*2, "",
2598 (PL_regkind[OP(scan)] == END || !rnext) ?
2599 0 : (IV)(rnext - rex->program));
2602 next = scan + NEXT_OFF(scan);
2605 state_num = OP(scan);
2608 switch (state_num) {
2610 if (locinput == PL_bostr)
2612 /* reginfo->till = reginfo->bol; */
2617 if (locinput == PL_bostr ||
2618 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2624 if (locinput == PL_bostr)
2628 if (locinput == reginfo->ganch)
2634 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2639 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2641 if (PL_regeol - locinput > 1)
2645 if (PL_regeol != locinput)
2649 if (!nextchr && locinput >= PL_regeol)
2652 locinput += PL_utf8skip[nextchr];
2653 if (locinput > PL_regeol)
2655 nextchr = UCHARAT(locinput);
2658 nextchr = UCHARAT(++locinput);
2661 if (!nextchr && locinput >= PL_regeol)
2663 nextchr = UCHARAT(++locinput);
2666 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2669 locinput += PL_utf8skip[nextchr];
2670 if (locinput > PL_regeol)
2672 nextchr = UCHARAT(locinput);
2675 nextchr = UCHARAT(++locinput);
2679 #define ST st->u.trie
2681 /* In this case the charclass data is available inline so
2682 we can fail fast without a lot of extra overhead.
2684 if (scan->flags == EXACT || !do_utf8) {
2685 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2687 PerlIO_printf(Perl_debug_log,
2688 "%*s %sfailed to match trie start class...%s\n",
2689 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2698 /* what type of TRIE am I? (utf8 makes this contextual) */
2699 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2700 trie_type = do_utf8 ?
2701 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2704 /* what trie are we using right now */
2705 reg_trie_data * const trie
2706 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2707 U32 state = trie->startstate;
2709 if (trie->bitmap && trie_type != trie_utf8_fold &&
2710 !TRIE_BITMAP_TEST(trie,*locinput)
2712 if (trie->states[ state ].wordnum) {
2714 PerlIO_printf(Perl_debug_log,
2715 "%*s %smatched empty string...%s\n",
2716 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2721 PerlIO_printf(Perl_debug_log,
2722 "%*s %sfailed to match trie start class...%s\n",
2723 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2730 U8 *uc = ( U8* )locinput;
2734 U8 *uscan = (U8*)NULL;
2736 SV *sv_accept_buff = NULL;
2737 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2739 ST.accepted = 0; /* how many accepting states we have seen */
2741 ST.jump = trie->jump;
2750 traverse the TRIE keeping track of all accepting states
2751 we transition through until we get to a failing node.
2754 while ( state && uc <= (U8*)PL_regeol ) {
2755 U32 base = trie->states[ state ].trans.base;
2758 /* We use charid to hold the wordnum as we don't use it
2759 for charid until after we have done the wordnum logic.
2760 We define an alias just so that the wordnum logic reads
2763 #define got_wordnum charid
2764 got_wordnum = trie->states[ state ].wordnum;
2766 if ( got_wordnum ) {
2767 if ( ! ST.accepted ) {
2770 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2771 sv_accept_buff=newSV(bufflen *
2772 sizeof(reg_trie_accepted) - 1);
2773 SvCUR_set(sv_accept_buff, 0);
2774 SvPOK_on(sv_accept_buff);
2775 sv_2mortal(sv_accept_buff);
2778 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2781 if (ST.accepted >= bufflen) {
2783 ST.accept_buff =(reg_trie_accepted*)
2784 SvGROW(sv_accept_buff,
2785 bufflen * sizeof(reg_trie_accepted));
2787 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2788 + sizeof(reg_trie_accepted));
2791 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2792 ST.accept_buff[ST.accepted].endpos = uc;
2794 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2798 DEBUG_TRIE_EXECUTE_r({
2799 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2800 PerlIO_printf( Perl_debug_log,
2801 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2802 2+depth * 2, "", PL_colors[4],
2803 (UV)state, (UV)ST.accepted );
2807 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2808 uvc, charid, foldlen, foldbuf, uniflags);
2811 (base + charid > trie->uniquecharcount )
2812 && (base + charid - 1 - trie->uniquecharcount
2814 && trie->trans[base + charid - 1 -
2815 trie->uniquecharcount].check == state)
2817 state = trie->trans[base + charid - 1 -
2818 trie->uniquecharcount ].next;
2829 DEBUG_TRIE_EXECUTE_r(
2830 PerlIO_printf( Perl_debug_log,
2831 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2832 charid, uvc, (UV)state, PL_colors[5] );
2839 PerlIO_printf( Perl_debug_log,
2840 "%*s %sgot %"IVdf" possible matches%s\n",
2841 REPORT_CODE_OFF + depth * 2, "",
2842 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2848 case TRIE_next_fail: /* we failed - try next alterative */
2850 if ( ST.accepted == 1 ) {
2851 /* only one choice left - just continue */
2853 reg_trie_data * const trie
2854 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2855 SV ** const tmp = RX_DEBUG(reginfo->prog)
2856 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2858 PerlIO_printf( Perl_debug_log,
2859 "%*s %sonly one match left: #%d <%s>%s\n",
2860 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2861 ST.accept_buff[ 0 ].wordnum,
2862 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2865 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2866 /* in this case we free tmps/leave before we call regmatch
2867 as we wont be using accept_buff again. */
2870 locinput = PL_reginput;
2871 nextchr = UCHARAT(locinput);
2876 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2878 continue; /* execute rest of RE */
2881 if (!ST.accepted-- ) {
2888 There are at least two accepting states left. Presumably
2889 the number of accepting states is going to be low,
2890 typically two. So we simply scan through to find the one
2891 with lowest wordnum. Once we find it, we swap the last
2892 state into its place and decrement the size. We then try to
2893 match the rest of the pattern at the point where the word
2894 ends. If we succeed, control just continues along the
2895 regex; if we fail we return here to try the next accepting
2902 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2903 DEBUG_TRIE_EXECUTE_r(
2904 PerlIO_printf( Perl_debug_log,
2905 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2906 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
2907 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2908 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2911 if (ST.accept_buff[cur].wordnum <
2912 ST.accept_buff[best].wordnum)
2917 reg_trie_data * const trie
2918 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2919 SV ** const tmp = RX_DEBUG(reginfo->prog)
2920 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2922 regnode *nextop=!ST.jump ?
2924 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2925 PerlIO_printf( Perl_debug_log,
2926 "%*s %strying alternation #%d <%s> at node #%d %s\n",
2927 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2928 ST.accept_buff[best].wordnum,
2929 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2930 REG_NODE_NUM(nextop),
2934 if ( best<ST.accepted ) {
2935 reg_trie_accepted tmp = ST.accept_buff[ best ];
2936 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2937 ST.accept_buff[ ST.accepted ] = tmp;
2940 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2942 PUSH_STATE_GOTO(TRIE_next, ST.B);
2945 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2955 char *s = STRING(scan);
2957 if (do_utf8 != UTF) {
2958 /* The target and the pattern have differing utf8ness. */
2960 const char * const e = s + ln;
2963 /* The target is utf8, the pattern is not utf8. */
2968 if (NATIVE_TO_UNI(*(U8*)s) !=
2969 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2977 /* The target is not utf8, the pattern is utf8. */
2982 if (NATIVE_TO_UNI(*((U8*)l)) !=
2983 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2991 nextchr = UCHARAT(locinput);
2994 /* The target and the pattern have the same utf8ness. */
2995 /* Inline the first character, for speed. */
2996 if (UCHARAT(s) != nextchr)
2998 if (PL_regeol - locinput < ln)
3000 if (ln > 1 && memNE(s, locinput, ln))
3003 nextchr = UCHARAT(locinput);
3007 PL_reg_flags |= RF_tainted;
3010 char * const s = STRING(scan);
3013 if (do_utf8 || UTF) {
3014 /* Either target or the pattern are utf8. */
3015 const char * const l = locinput;
3016 char *e = PL_regeol;
3018 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3019 l, &e, 0, do_utf8)) {
3020 /* One more case for the sharp s:
3021 * pack("U0U*", 0xDF) =~ /ss/i,
3022 * the 0xC3 0x9F are the UTF-8
3023 * byte sequence for the U+00DF. */
3025 toLOWER(s[0]) == 's' &&
3027 toLOWER(s[1]) == 's' &&
3034 nextchr = UCHARAT(locinput);
3038 /* Neither the target and the pattern are utf8. */
3040 /* Inline the first character, for speed. */
3041 if (UCHARAT(s) != nextchr &&
3042 UCHARAT(s) != ((OP(scan) == EXACTF)
3043 ? PL_fold : PL_fold_locale)[nextchr])
3045 if (PL_regeol - locinput < ln)
3047 if (ln > 1 && (OP(scan) == EXACTF
3048 ? ibcmp(s, locinput, ln)
3049 : ibcmp_locale(s, locinput, ln)))
3052 nextchr = UCHARAT(locinput);
3057 STRLEN inclasslen = PL_regeol - locinput;
3059 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3061 if (locinput >= PL_regeol)
3063 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3064 nextchr = UCHARAT(locinput);
3069 nextchr = UCHARAT(locinput);
3070 if (!REGINCLASS(rex, scan, (U8*)locinput))
3072 if (!nextchr && locinput >= PL_regeol)
3074 nextchr = UCHARAT(++locinput);
3078 /* If we might have the case of the German sharp s
3079 * in a casefolding Unicode character class. */
3081 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3082 locinput += SHARP_S_SKIP;
3083 nextchr = UCHARAT(locinput);
3089 PL_reg_flags |= RF_tainted;
3095 LOAD_UTF8_CHARCLASS_ALNUM();
3096 if (!(OP(scan) == ALNUM
3097 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3098 : isALNUM_LC_utf8((U8*)locinput)))
3102 locinput += PL_utf8skip[nextchr];
3103 nextchr = UCHARAT(locinput);
3106 if (!(OP(scan) == ALNUM
3107 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3109 nextchr = UCHARAT(++locinput);
3112 PL_reg_flags |= RF_tainted;
3115 if (!nextchr && locinput >= PL_regeol)
3118 LOAD_UTF8_CHARCLASS_ALNUM();
3119 if (OP(scan) == NALNUM
3120 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3121 : isALNUM_LC_utf8((U8*)locinput))
3125 locinput += PL_utf8skip[nextchr];
3126 nextchr = UCHARAT(locinput);
3129 if (OP(scan) == NALNUM
3130 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3132 nextchr = UCHARAT(++locinput);
3136 PL_reg_flags |= RF_tainted;
3140 /* was last char in word? */
3142 if (locinput == PL_bostr)
3145 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3147 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3149 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3150 ln = isALNUM_uni(ln);
3151 LOAD_UTF8_CHARCLASS_ALNUM();
3152 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3155 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3156 n = isALNUM_LC_utf8((U8*)locinput);
3160 ln = (locinput != PL_bostr) ?
3161 UCHARAT(locinput - 1) : '\n';
3162 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3164 n = isALNUM(nextchr);
3167 ln = isALNUM_LC(ln);
3168 n = isALNUM_LC(nextchr);
3171 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3172 OP(scan) == BOUNDL))
3176 PL_reg_flags |= RF_tainted;
3182 if (UTF8_IS_CONTINUED(nextchr)) {
3183 LOAD_UTF8_CHARCLASS_SPACE();
3184 if (!(OP(scan) == SPACE
3185 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3186 : isSPACE_LC_utf8((U8*)locinput)))
3190 locinput += PL_utf8skip[nextchr];
3191 nextchr = UCHARAT(locinput);
3194 if (!(OP(scan) == SPACE
3195 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3197 nextchr = UCHARAT(++locinput);
3200 if (!(OP(scan) == SPACE
3201 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3203 nextchr = UCHARAT(++locinput);
3207 PL_reg_flags |= RF_tainted;
3210 if (!nextchr && locinput >= PL_regeol)
3213 LOAD_UTF8_CHARCLASS_SPACE();
3214 if (OP(scan) == NSPACE
3215 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3216 : isSPACE_LC_utf8((U8*)locinput))
3220 locinput += PL_utf8skip[nextchr];
3221 nextchr = UCHARAT(locinput);
3224 if (OP(scan) == NSPACE
3225 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3227 nextchr = UCHARAT(++locinput);
3230 PL_reg_flags |= RF_tainted;
3236 LOAD_UTF8_CHARCLASS_DIGIT();
3237 if (!(OP(scan) == DIGIT
3238 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3239 : isDIGIT_LC_utf8((U8*)locinput)))
3243 locinput += PL_utf8skip[nextchr];
3244 nextchr = UCHARAT(locinput);
3247 if (!(OP(scan) == DIGIT
3248 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3250 nextchr = UCHARAT(++locinput);
3253 PL_reg_flags |= RF_tainted;
3256 if (!nextchr && locinput >= PL_regeol)
3259 LOAD_UTF8_CHARCLASS_DIGIT();
3260 if (OP(scan) == NDIGIT
3261 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3262 : isDIGIT_LC_utf8((U8*)locinput))
3266 locinput += PL_utf8skip[nextchr];
3267 nextchr = UCHARAT(locinput);
3270 if (OP(scan) == NDIGIT
3271 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3273 nextchr = UCHARAT(++locinput);
3276 if (locinput >= PL_regeol)
3279 LOAD_UTF8_CHARCLASS_MARK();
3280 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3282 locinput += PL_utf8skip[nextchr];
3283 while (locinput < PL_regeol &&
3284 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3285 locinput += UTF8SKIP(locinput);
3286 if (locinput > PL_regeol)
3291 nextchr = UCHARAT(locinput);
3298 PL_reg_flags |= RF_tainted;
3304 SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];
3305 I32 *nums=(I32*)SvPVX(sv_dat);
3306 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3307 if ((I32)*PL_reglastparen >= nums[n] &&
3308 PL_regstartp[nums[n]] != -1 &&
3309 PL_regendp[nums[n]] != -1)
3312 type = REF + ( type - NREF );
3320 PL_reg_flags |= RF_tainted;
3324 n = ARG(scan); /* which paren pair */
3327 ln = PL_regstartp[n];
3328 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3329 if ((I32)*PL_reglastparen < n || ln == -1)
3330 sayNO; /* Do not match unless seen CLOSEn. */
3331 if (ln == PL_regendp[n])
3335 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3337 const char *e = PL_bostr + PL_regendp[n];
3339 * Note that we can't do the "other character" lookup trick as
3340 * in the 8-bit case (no pun intended) because in Unicode we
3341 * have to map both upper and title case to lower case.
3345 STRLEN ulen1, ulen2;
3346 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3347 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3351 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3352 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3353 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3360 nextchr = UCHARAT(locinput);
3364 /* Inline the first character, for speed. */
3365 if (UCHARAT(s) != nextchr &&
3367 (UCHARAT(s) != (type == REFF
3368 ? PL_fold : PL_fold_locale)[nextchr])))
3370 ln = PL_regendp[n] - ln;
3371 if (locinput + ln > PL_regeol)
3373 if (ln > 1 && (type == REF
3374 ? memNE(s, locinput, ln)
3376 ? ibcmp(s, locinput, ln)
3377 : ibcmp_locale(s, locinput, ln))))
3380 nextchr = UCHARAT(locinput);
3390 #define ST st->u.eval
3394 regnode *startpoint;
3397 case RECURSE: /* /(...(?1))/ */
3398 if (cur_eval && cur_eval->locinput==locinput) {
3399 if (cur_eval->u.eval.close_paren == ARG(scan))
3400 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3401 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3402 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3407 (void)ReREFCNT_inc(rex);
3408 if (OP(scan)==RECURSE) {
3409 startpoint = scan + ARG2L(scan);
3410 ST.close_paren = ARG(scan);
3412 startpoint = re->program+1;
3415 goto eval_recurse_doit;
3417 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3418 if (cur_eval && cur_eval->locinput==locinput) {
3419 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3420 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3425 /* execute the code in the {...} */
3427 SV ** const before = SP;
3428 OP_4tree * const oop = PL_op;
3429 COP * const ocurcop = PL_curcop;
3433 PL_op = (OP_4tree*)rex->data->data[n];
3434 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3435 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3436 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3438 CALLRUNOPS(aTHX); /* Scalar context. */
3441 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3448 PAD_RESTORE_LOCAL(old_comppad);
3449 PL_curcop = ocurcop;
3452 sv_setsv(save_scalar(PL_replgv), ret);
3456 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3459 /* extract RE object from returned value; compiling if
3464 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3465 mg = mg_find(sv, PERL_MAGIC_qr);
3466 else if (SvSMAGICAL(ret)) {
3467 if (SvGMAGICAL(ret))
3468 sv_unmagic(ret, PERL_MAGIC_qr);
3470 mg = mg_find(ret, PERL_MAGIC_qr);
3474 re = (regexp *)mg->mg_obj;
3475 (void)ReREFCNT_inc(re);
3479 const char * const t = SvPV_const(ret, len);
3481 const I32 osize = PL_regsize;
3484 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3485 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3487 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3489 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3495 debug_start_match(re, do_utf8, locinput, PL_regeol,
3496 "Matching embedded");
3498 startpoint = re->program + 1;
3499 ST.close_paren = 0; /* only used for RECURSE */
3500 /* borrowed from regtry */
3501 if (PL_reg_start_tmpl <= re->nparens) {
3502 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3503 if(PL_reg_start_tmp)
3504 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3506 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3509 eval_recurse_doit: /* Share code with RECURSE below this line */
3510 /* run the pattern returned from (??{...}) */
3511 ST.cp = regcppush(0); /* Save *all* the positions. */
3512 REGCP_SET(ST.lastcp);
3514 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3515 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3517 *PL_reglastparen = 0;
3518 *PL_reglastcloseparen = 0;
3519 PL_reginput = locinput;
3521 /* XXXX This is too dramatic a measure... */
3524 ST.toggle_reg_flags = PL_reg_flags;
3525 if (re->reganch & ROPT_UTF8)
3526 PL_reg_flags |= RF_utf8;
3528 PL_reg_flags &= ~RF_utf8;
3529 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3532 ST.prev_curlyx = cur_curlyx;
3536 ST.prev_eval = cur_eval;
3538 /* now continue from first node in postoned RE */
3539 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3542 /* logical is 1, /(?(?{...})X|Y)/ */
3543 sw = (bool)SvTRUE(ret);
3548 case EVAL_AB: /* cleanup after a successful (??{A})B */
3549 /* note: this is called twice; first after popping B, then A */
3550 PL_reg_flags ^= ST.toggle_reg_flags;
3554 cur_eval = ST.prev_eval;
3555 cur_curlyx = ST.prev_curlyx;
3556 /* XXXX This is too dramatic a measure... */
3561 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3562 /* note: this is called twice; first after popping B, then A */
3563 PL_reg_flags ^= ST.toggle_reg_flags;
3566 PL_reginput = locinput;
3567 REGCP_UNWIND(ST.lastcp);
3569 cur_eval = ST.prev_eval;
3570 cur_curlyx = ST.prev_curlyx;
3571 /* XXXX This is too dramatic a measure... */
3577 n = ARG(scan); /* which paren pair */
3578 PL_reg_start_tmp[n] = locinput;
3583 n = ARG(scan); /* which paren pair */
3584 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3585 PL_regendp[n] = locinput - PL_bostr;
3586 if (n > (I32)*PL_reglastparen)
3587 *PL_reglastparen = n;
3588 *PL_reglastcloseparen = n;
3589 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3594 n = ARG(scan); /* which paren pair */
3595 sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3598 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3600 next = NEXTOPER(NEXTOPER(scan));
3602 next = scan + ARG(scan);
3603 if (OP(next) == IFTHEN) /* Fake one. */
3604 next = NEXTOPER(NEXTOPER(next));
3608 logical = scan->flags;
3611 /*******************************************************************
3613 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3614 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3615 STAR/PLUS/CURLY/CURLYN are used instead.)
3617 A*B is compiled as <CURLYX><A><WHILEM><B>
3619 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3620 state, which contains the current count, initialised to -1. It also sets
3621 cur_curlyx to point to this state, with any previous value saved in the
3624 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3625 since the pattern may possibly match zero times (i.e. it's a while {} loop
3626 rather than a do {} while loop).
3628 Each entry to WHILEM represents a successful match of A. The count in the
3629 CURLYX block is incremented, another WHILEM state is pushed, and execution
3630 passes to A or B depending on greediness and the current count.
3632 For example, if matching against the string a1a2a3b (where the aN are
3633 substrings that match /A/), then the match progresses as follows: (the
3634 pushed states are interspersed with the bits of strings matched so far):
3637 <CURLYX cnt=0><WHILEM>
3638 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3639 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3640 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3641 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3643 (Contrast this with something like CURLYM, which maintains only a single
3647 a1 <CURLYM cnt=1> a2
3648 a1 a2 <CURLYM cnt=2> a3
3649 a1 a2 a3 <CURLYM cnt=3> b
3652 Each WHILEM state block marks a point to backtrack to upon partial failure
3653 of A or B, and also contains some minor state data related to that
3654 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3655 overall state, such as the count, and pointers to the A and B ops.
3657 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3658 must always point to the *current* CURLYX block, the rules are:
3660 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3661 and set cur_curlyx to point the new block.
3663 When popping the CURLYX block after a successful or unsuccessful match,
3664 restore the previous cur_curlyx.
3666 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3667 to the outer one saved in the CURLYX block.
3669 When popping the WHILEM block after a successful or unsuccessful B match,
3670 restore the previous cur_curlyx.
3672 Here's an example for the pattern (AI* BI)*BO
3673 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3676 curlyx backtrack stack
3677 ------ ---------------
3679 CO <CO prev=NULL> <WO>
3680 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3681 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3682 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3684 At this point the pattern succeeds, and we work back down the stack to
3685 clean up, restoring as we go:
3687 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3688 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3689 CO <CO prev=NULL> <WO>
3692 *******************************************************************/
3694 #define ST st->u.curlyx
3696 case CURLYX: /* start of /A*B/ (for complex A) */
3698 /* No need to save/restore up to this paren */
3699 I32 parenfloor = scan->flags;
3701 assert(next); /* keep Coverity happy */
3702 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3705 /* XXXX Probably it is better to teach regpush to support
3706 parenfloor > PL_regsize... */
3707 if (parenfloor > (I32)*PL_reglastparen)
3708 parenfloor = *PL_reglastparen; /* Pessimization... */
3710 ST.prev_curlyx= cur_curlyx;
3712 ST.cp = PL_savestack_ix;
3714 /* these fields contain the state of the current curly.
3715 * they are accessed by subsequent WHILEMs */
3716 ST.parenfloor = parenfloor;
3717 ST.min = ARG1(scan);
3718 ST.max = ARG2(scan);
3719 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3723 ST.count = -1; /* this will be updated by WHILEM */
3724 ST.lastloc = NULL; /* this will be updated by WHILEM */
3726 PL_reginput = locinput;
3727 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3731 case CURLYX_end: /* just finished matching all of A*B */
3733 cur_curlyx = ST.prev_curlyx;
3737 case CURLYX_end_fail: /* just failed to match all of A*B */
3739 cur_curlyx = ST.prev_curlyx;
3745 #define ST st->u.whilem
3747 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3749 /* see the discussion above about CURLYX/WHILEM */
3752 assert(cur_curlyx); /* keep Coverity happy */
3753 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3754 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3755 ST.cache_offset = 0;
3758 PL_reginput = locinput;
3760 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3761 "%*s whilem: matched %ld out of %ld..%ld\n",
3762 REPORT_CODE_OFF+depth*2, "", (long)n,
3763 (long)cur_curlyx->u.curlyx.min,
3764 (long)cur_curlyx->u.curlyx.max)
3767 /* First just match a string of min A's. */
3769 if (n < cur_curlyx->u.curlyx.min) {
3770 cur_curlyx->u.curlyx.lastloc = locinput;
3771 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3775 /* If degenerate A matches "", assume A done. */
3777 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3778 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3779 "%*s whilem: empty match detected, trying continuation...\n",
3780 REPORT_CODE_OFF+depth*2, "")
3782 goto do_whilem_B_max;
3785 /* super-linear cache processing */
3789 if (!PL_reg_maxiter) {
3790 /* start the countdown: Postpone detection until we
3791 * know the match is not *that* much linear. */
3792 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3793 /* possible overflow for long strings and many CURLYX's */
3794 if (PL_reg_maxiter < 0)
3795 PL_reg_maxiter = I32_MAX;
3796 PL_reg_leftiter = PL_reg_maxiter;
3799 if (PL_reg_leftiter-- == 0) {
3800 /* initialise cache */
3801 const I32 size = (PL_reg_maxiter + 7)/8;
3802 if (PL_reg_poscache) {
3803 if ((I32)PL_reg_poscache_size < size) {
3804 Renew(PL_reg_poscache, size, char);
3805 PL_reg_poscache_size = size;
3807 Zero(PL_reg_poscache, size, char);
3810 PL_reg_poscache_size = size;
3811 Newxz(PL_reg_poscache, size, char);
3813 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3814 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3815 PL_colors[4], PL_colors[5])
3819 if (PL_reg_leftiter < 0) {
3820 /* have we already failed at this position? */
3822 offset = (scan->flags & 0xf) - 1
3823 + (locinput - PL_bostr) * (scan->flags>>4);
3824 mask = 1 << (offset % 8);
3826 if (PL_reg_poscache[offset] & mask) {
3827 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3828 "%*s whilem: (cache) already tried at this position...\n",
3829 REPORT_CODE_OFF+depth*2, "")
3831 sayNO; /* cache records failure */
3833 ST.cache_offset = offset;
3834 ST.cache_mask = mask;
3838 /* Prefer B over A for minimal matching. */
3840 if (cur_curlyx->u.curlyx.minmod) {
3841 ST.save_curlyx = cur_curlyx;
3842 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3843 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3844 REGCP_SET(ST.lastcp);
3845 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3849 /* Prefer A over B for maximal matching. */
3851 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3852 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3853 cur_curlyx->u.curlyx.lastloc = locinput;
3854 REGCP_SET(ST.lastcp);
3855 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3858 goto do_whilem_B_max;
3862 case WHILEM_B_min: /* just matched B in a minimal match */
3863 case WHILEM_B_max: /* just matched B in a maximal match */
3864 cur_curlyx = ST.save_curlyx;
3868 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3869 cur_curlyx = ST.save_curlyx;
3870 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3871 cur_curlyx->u.curlyx.count--;
3875 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3876 REGCP_UNWIND(ST.lastcp);
3879 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3880 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3881 cur_curlyx->u.curlyx.count--;
3885 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3886 REGCP_UNWIND(ST.lastcp);
3887 regcppop(rex); /* Restore some previous $<digit>s? */
3888 PL_reginput = locinput;
3889 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3890 "%*s whilem: failed, trying continuation...\n",
3891 REPORT_CODE_OFF+depth*2, "")
3894 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3895 && ckWARN(WARN_REGEXP)
3896 && !(PL_reg_flags & RF_warned))
3898 PL_reg_flags |= RF_warned;
3899 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3900 "Complex regular subexpression recursion",
3905 ST.save_curlyx = cur_curlyx;
3906 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3907 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3910 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3911 cur_curlyx = ST.save_curlyx;
3912 REGCP_UNWIND(ST.lastcp);
3915 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3916 /* Maximum greed exceeded */
3917 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3918 && ckWARN(WARN_REGEXP)
3919 && !(PL_reg_flags & RF_warned))
3921 PL_reg_flags |= RF_warned;
3922 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3923 "%s limit (%d) exceeded",
3924 "Complex regular subexpression recursion",
3927 cur_curlyx->u.curlyx.count--;
3931 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3932 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3934 /* Try grabbing another A and see if it helps. */
3935 PL_reginput = locinput;
3936 cur_curlyx->u.curlyx.lastloc = locinput;
3937 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3938 REGCP_SET(ST.lastcp);
3939 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
3943 #define ST st->u.branch
3945 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3946 next = scan + ARG(scan);
3949 scan = NEXTOPER(scan);
3952 case BRANCH: /* /(...|A|...)/ */
3953 scan = NEXTOPER(scan); /* scan now points to inner node */
3954 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3955 /* last branch; skip state push and jump direct to node */
3957 ST.lastparen = *PL_reglastparen;
3958 ST.next_branch = next;
3960 PL_reginput = locinput;
3962 /* Now go into the branch */
3963 PUSH_STATE_GOTO(BRANCH_next, scan);
3966 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3967 REGCP_UNWIND(ST.cp);
3968 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3970 *PL_reglastparen = n;
3971 scan = ST.next_branch;
3972 /* no more branches? */
3973 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3975 continue; /* execute next BRANCH[J] op */
3983 #define ST st->u.curlym
3985 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3987 /* This is an optimisation of CURLYX that enables us to push
3988 * only a single backtracking state, no matter now many matches
3989 * there are in {m,n}. It relies on the pattern being constant
3990 * length, with no parens to influence future backrefs
3994 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3996 /* if paren positive, emulate an OPEN/CLOSE around A */
3998 I32 paren = ST.me->flags;
3999 if (paren > PL_regsize)
4001 if (paren > (I32)*PL_reglastparen)
4002 *PL_reglastparen = paren;
4003 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4011 ST.c1 = CHRTEST_UNINIT;
4014 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4017 curlym_do_A: /* execute the A in /A{m,n}B/ */
4018 PL_reginput = locinput;
4019 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4022 case CURLYM_A: /* we've just matched an A */
4023 locinput = st->locinput;
4024 nextchr = UCHARAT(locinput);
4027 /* after first match, determine A's length: u.curlym.alen */
4028 if (ST.count == 1) {
4029 if (PL_reg_match_utf8) {
4031 while (s < PL_reginput) {
4037 ST.alen = PL_reginput - locinput;
4040 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4043 PerlIO_printf(Perl_debug_log,
4044 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4045 (int)(REPORT_CODE_OFF+(depth*2)), "",
4046 (IV) ST.count, (IV)ST.alen)
4049 locinput = PL_reginput;
4050 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4051 goto curlym_do_A; /* try to match another A */
4052 goto curlym_do_B; /* try to match B */
4054 case CURLYM_A_fail: /* just failed to match an A */
4055 REGCP_UNWIND(ST.cp);
4056 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4059 curlym_do_B: /* execute the B in /A{m,n}B/ */
4060 PL_reginput = locinput;
4061 if (ST.c1 == CHRTEST_UNINIT) {
4062 /* calculate c1 and c2 for possible match of 1st char
4063 * following curly */
4064 ST.c1 = ST.c2 = CHRTEST_VOID;
4065 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4066 regnode *text_node = ST.B;
4067 if (! HAS_TEXT(text_node))
4068 FIND_NEXT_IMPT(text_node);
4069 if (HAS_TEXT(text_node)
4070 && PL_regkind[OP(text_node)] != REF)
4072 ST.c1 = (U8)*STRING(text_node);
4074 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4076 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4077 ? PL_fold_locale[ST.c1]
4084 PerlIO_printf(Perl_debug_log,
4085 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4086 (int)(REPORT_CODE_OFF+(depth*2)),
4089 if (ST.c1 != CHRTEST_VOID
4090 && UCHARAT(PL_reginput) != ST.c1
4091 && UCHARAT(PL_reginput) != ST.c2)
4093 /* simulate B failing */
4094 state_num = CURLYM_B_fail;
4095 goto reenter_switch;
4099 /* mark current A as captured */
4100 I32 paren = ST.me->flags;
4103 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4104 PL_regendp[paren] = PL_reginput - PL_bostr;
4107 PL_regendp[paren] = -1;
4109 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4112 case CURLYM_B_fail: /* just failed to match a B */
4113 REGCP_UNWIND(ST.cp);
4115 if (ST.count == ARG2(ST.me) /* max */)
4117 goto curlym_do_A; /* try to match a further A */
4119 /* backtrack one A */
4120 if (ST.count == ARG1(ST.me) /* min */)
4123 locinput = HOPc(locinput, -ST.alen);
4124 goto curlym_do_B; /* try to match B */
4127 #define ST st->u.curly
4129 #define CURLY_SETPAREN(paren, success) \
4132 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4133 PL_regendp[paren] = locinput - PL_bostr; \
4136 PL_regendp[paren] = -1; \
4139 case STAR: /* /A*B/ where A is width 1 */
4143 scan = NEXTOPER(scan);
4145 case PLUS: /* /A+B/ where A is width 1 */
4149 scan = NEXTOPER(scan);
4151 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4152 ST.paren = scan->flags; /* Which paren to set */
4153 if (ST.paren > PL_regsize)
4154 PL_regsize = ST.paren;
4155 if (ST.paren > (I32)*PL_reglastparen)
4156 *PL_reglastparen = ST.paren;
4157 ST.min = ARG1(scan); /* min to match */
4158 ST.max = ARG2(scan); /* max to match */
4159 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4161 case CURLY: /* /A{m,n}B/ where A is width 1 */
4163 ST.min = ARG1(scan); /* min to match */
4164 ST.max = ARG2(scan); /* max to match */
4165 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4168 * Lookahead to avoid useless match attempts
4169 * when we know what character comes next.
4171 * Used to only do .*x and .*?x, but now it allows
4172 * for )'s, ('s and (?{ ... })'s to be in the way
4173 * of the quantifier and the EXACT-like node. -- japhy
4176 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4178 if (HAS_TEXT(next) || JUMPABLE(next)) {
4180 regnode *text_node = next;
4182 if (! HAS_TEXT(text_node))
4183 FIND_NEXT_IMPT(text_node);
4185 if (! HAS_TEXT(text_node))
4186 ST.c1 = ST.c2 = CHRTEST_VOID;
4188 if (PL_regkind[OP(text_node)] == REF) {
4189 ST.c1 = ST.c2 = CHRTEST_VOID;
4190 goto assume_ok_easy;
4193 s = (U8*)STRING(text_node);
4197 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4198 ST.c2 = PL_fold[ST.c1];
4199 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4200 ST.c2 = PL_fold_locale[ST.c1];
4203 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4204 STRLEN ulen1, ulen2;
4205 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4206 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4208 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4209 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4211 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4213 0 : UTF8_ALLOW_ANY);
4214 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4216 0 : UTF8_ALLOW_ANY);
4218 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4220 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4225 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4232 ST.c1 = ST.c2 = CHRTEST_VOID;
4237 PL_reginput = locinput;
4240 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4243 locinput = PL_reginput;
4245 if (ST.c1 == CHRTEST_VOID)
4246 goto curly_try_B_min;
4248 ST.oldloc = locinput;
4250 /* set ST.maxpos to the furthest point along the
4251 * string that could possibly match */
4252 if (ST.max == REG_INFTY) {
4253 ST.maxpos = PL_regeol - 1;
4255 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4259 int m = ST.max - ST.min;
4260 for (ST.maxpos = locinput;
4261 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4262 ST.maxpos += UTF8SKIP(ST.maxpos);
4265 ST.maxpos = locinput + ST.max - ST.min;
4266 if (ST.maxpos >= PL_regeol)
4267 ST.maxpos = PL_regeol - 1;
4269 goto curly_try_B_min_known;
4273 ST.count = regrepeat(rex, ST.A, ST.max);
4274 locinput = PL_reginput;
4275 if (ST.count < ST.min)
4277 if ((ST.count > ST.min)
4278 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4280 /* A{m,n} must come at the end of the string, there's
4281 * no point in backing off ... */
4283 /* ...except that $ and \Z can match before *and* after
4284 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4285 We may back off by one in this case. */
4286 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4290 goto curly_try_B_max;
4295 case CURLY_B_min_known_fail:
4296 /* failed to find B in a non-greedy match where c1,c2 valid */
4297 if (ST.paren && ST.count)
4298 PL_regendp[ST.paren] = -1;
4300 PL_reginput = locinput; /* Could be reset... */
4301 REGCP_UNWIND(ST.cp);
4302 /* Couldn't or didn't -- move forward. */
4303 ST.oldloc = locinput;
4305 locinput += UTF8SKIP(locinput);
4309 curly_try_B_min_known:
4310 /* find the next place where 'B' could work, then call B */
4314 n = (ST.oldloc == locinput) ? 0 : 1;
4315 if (ST.c1 == ST.c2) {
4317 /* set n to utf8_distance(oldloc, locinput) */
4318 while (locinput <= ST.maxpos &&
4319 utf8n_to_uvchr((U8*)locinput,
4320 UTF8_MAXBYTES, &len,
4321 uniflags) != (UV)ST.c1) {
4327 /* set n to utf8_distance(oldloc, locinput) */
4328 while (locinput <= ST.maxpos) {
4330 const UV c = utf8n_to_uvchr((U8*)locinput,
4331 UTF8_MAXBYTES, &len,
4333 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4341 if (ST.c1 == ST.c2) {
4342 while (locinput <= ST.maxpos &&
4343 UCHARAT(locinput) != ST.c1)
4347 while (locinput <= ST.maxpos
4348 && UCHARAT(locinput) != ST.c1
4349 && UCHARAT(locinput) != ST.c2)
4352 n = locinput - ST.oldloc;
4354 if (locinput > ST.maxpos)
4356 /* PL_reginput == oldloc now */
4359 if (regrepeat(rex, ST.A, n) < n)
4362 PL_reginput = locinput;
4363 CURLY_SETPAREN(ST.paren, ST.count);
4364 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4369 case CURLY_B_min_fail:
4370 /* failed to find B in a non-greedy match where c1,c2 invalid */
4371 if (ST.paren && ST.count)
4372 PL_regendp[ST.paren] = -1;
4374 REGCP_UNWIND(ST.cp);
4375 /* failed -- move forward one */
4376 PL_reginput = locinput;
4377 if (regrepeat(rex, ST.A, 1)) {
4379 locinput = PL_reginput;
4380 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4381 ST.count > 0)) /* count overflow ? */
4384 CURLY_SETPAREN(ST.paren, ST.count);
4385 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4393 /* a successful greedy match: now try to match B */
4396 if (ST.c1 != CHRTEST_VOID)
4397 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4398 UTF8_MAXBYTES, 0, uniflags)
4399 : (UV) UCHARAT(PL_reginput);
4400 /* If it could work, try it. */
4401 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4402 CURLY_SETPAREN(ST.paren, ST.count);
4403 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4408 case CURLY_B_max_fail:
4409 /* failed to find B in a greedy match */
4410 if (ST.paren && ST.count)
4411 PL_regendp[ST.paren] = -1;
4413 REGCP_UNWIND(ST.cp);
4415 if (--ST.count < ST.min)
4417 PL_reginput = locinput = HOPc(locinput, -1);
4418 goto curly_try_B_max;
4426 /* we've just finished A in /(??{A})B/; now continue with B */
4430 st->u.eval.toggle_reg_flags
4431 = cur_eval->u.eval.toggle_reg_flags;
4432 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4434 st->u.eval.prev_rex = rex; /* inner */
4435 rex = cur_eval->u.eval.prev_rex; /* outer */
4436 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4438 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4439 REGCP_SET(st->u.eval.lastcp);
4440 PL_reginput = locinput;
4442 /* Restore parens of the outer rex without popping the
4444 tmpix = PL_savestack_ix;
4445 PL_savestack_ix = cur_eval->u.eval.lastcp;
4447 PL_savestack_ix = tmpix;
4449 st->u.eval.prev_eval = cur_eval;
4450 cur_eval = cur_eval->u.eval.prev_eval;
4452 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4453 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
4454 PUSH_YES_STATE_GOTO(EVAL_AB,
4455 st->u.eval.prev_eval->u.eval.B); /* match B */
4458 if (locinput < reginfo->till) {
4459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4460 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4462 (long)(locinput - PL_reg_starttry),
4463 (long)(reginfo->till - PL_reg_starttry),
4465 sayNO_SILENT; /* Cannot match: too short. */
4467 PL_reginput = locinput; /* put where regtry can find it */
4468 sayYES; /* Success! */
4470 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4472 PerlIO_printf(Perl_debug_log,
4473 "%*s %ssubpattern success...%s\n",
4474 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4475 PL_reginput = locinput; /* put where regtry can find it */
4476 sayYES; /* Success! */
4479 #define ST st->u.ifmatch
4481 case SUSPEND: /* (?>A) */
4483 PL_reginput = locinput;
4486 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4488 goto ifmatch_trivial_fail_test;
4490 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4492 ifmatch_trivial_fail_test:
4494 char * const s = HOPBACKc(locinput, scan->flags);
4499 sw = 1 - (bool)ST.wanted;
4503 next = scan + ARG(scan);
4511 PL_reginput = locinput;
4515 ST.logical = logical;
4516 /* execute body of (?...A) */
4517 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4520 case IFMATCH_A_fail: /* body of (?...A) failed */
4521 ST.wanted = !ST.wanted;
4524 case IFMATCH_A: /* body of (?...A) succeeded */
4526 sw = (bool)ST.wanted;
4528 else if (!ST.wanted)
4531 if (OP(ST.me) == SUSPEND)
4532 locinput = PL_reginput;
4534 locinput = PL_reginput = st->locinput;
4535 nextchr = UCHARAT(locinput);
4537 scan = ST.me + ARG(ST.me);
4540 continue; /* execute B */
4545 next = scan + ARG(scan);
4550 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4551 PTR2UV(scan), OP(scan));
4552 Perl_croak(aTHX_ "regexp memory corruption");
4560 /* push a state that backtracks on success */
4561 st->u.yes.prev_yes_state = yes_state;
4565 /* push a new regex state, then continue at scan */
4567 regmatch_state *newst;
4569 DEBUG_STATE_pp("push");
4571 st->locinput = locinput;
4573 if (newst > SLAB_LAST(PL_regmatch_slab))
4574 newst = S_push_slab(aTHX);
4575 PL_regmatch_state = newst;
4577 locinput = PL_reginput;
4578 nextchr = UCHARAT(locinput);
4586 * We get here only if there's trouble -- normally "case END" is
4587 * the terminating point.
4589 Perl_croak(aTHX_ "corrupted regexp pointers");
4595 /* we have successfully completed a subexpression, but we must now
4596 * pop to the state marked by yes_state and continue from there */
4597 assert(st != yes_state);
4599 while (st != yes_state) {
4601 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4602 PL_regmatch_slab = PL_regmatch_slab->prev;
4603 st = SLAB_LAST(PL_regmatch_slab);
4605 DEBUG_STATE_pp("pop (yes)");
4609 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4610 || yes_state > SLAB_LAST(PL_regmatch_slab))
4612 /* not in this slab, pop slab */
4613 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4614 PL_regmatch_slab = PL_regmatch_slab->prev;
4615 st = SLAB_LAST(PL_regmatch_slab);
4617 depth -= (st - yes_state);
4620 yes_state = st->u.yes.prev_yes_state;
4621 PL_regmatch_state = st;
4623 state_num = st->resume_state;
4624 goto reenter_switch;
4627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4628 PL_colors[4], PL_colors[5]));
4635 PerlIO_printf(Perl_debug_log,
4636 "%*s %sfailed...%s\n",
4637 REPORT_CODE_OFF+depth*2, "",
4638 PL_colors[4], PL_colors[5])
4643 /* there's a previous state to backtrack to */
4645 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4646 PL_regmatch_slab = PL_regmatch_slab->prev;
4647 st = SLAB_LAST(PL_regmatch_slab);
4649 PL_regmatch_state = st;
4650 locinput= st->locinput;
4651 nextchr = UCHARAT(locinput);
4653 DEBUG_STATE_pp("pop");
4655 if (yes_state == st)
4656 yes_state = st->u.yes.prev_yes_state;
4658 state_num = st->resume_state + 1; /* failure = success + 1 */
4659 goto reenter_switch;
4665 /* restore original high-water mark */
4666 PL_regmatch_slab = orig_slab;
4667 PL_regmatch_state = orig_state;
4669 /* free all slabs above current one */
4670 if (orig_slab->next) {
4671 regmatch_slab *sl = orig_slab->next;
4672 orig_slab->next = NULL;
4674 regmatch_slab * const osl = sl;
4684 - regrepeat - repeatedly match something simple, report how many
4687 * [This routine now assumes that it will only match on things of length 1.
4688 * That was true before, but now we assume scan - reginput is the count,
4689 * rather than incrementing count on every character. [Er, except utf8.]]
4692 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4695 register char *scan;
4697 register char *loceol = PL_regeol;
4698 register I32 hardcount = 0;
4699 register bool do_utf8 = PL_reg_match_utf8;
4702 if (max == REG_INFTY)
4704 else if (max < loceol - scan)
4705 loceol = scan + max;
4710 while (scan < loceol && hardcount < max && *scan != '\n') {
4711 scan += UTF8SKIP(scan);
4715 while (scan < loceol && *scan != '\n')
4722 while (scan < loceol && hardcount < max) {
4723 scan += UTF8SKIP(scan);
4733 case EXACT: /* length of string is 1 */
4735 while (scan < loceol && UCHARAT(scan) == c)
4738 case EXACTF: /* length of string is 1 */
4740 while (scan < loceol &&
4741 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4744 case EXACTFL: /* length of string is 1 */
4745 PL_reg_flags |= RF_tainted;
4747 while (scan < loceol &&
4748 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4754 while (hardcount < max && scan < loceol &&
4755 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4756 scan += UTF8SKIP(scan);
4760 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4767 LOAD_UTF8_CHARCLASS_ALNUM();
4768 while (hardcount < max && scan < loceol &&
4769 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4770 scan += UTF8SKIP(scan);
4774 while (scan < loceol && isALNUM(*scan))
4779 PL_reg_flags |= RF_tainted;
4782 while (hardcount < max && scan < loceol &&
4783 isALNUM_LC_utf8((U8*)scan)) {
4784 scan += UTF8SKIP(scan);
4788 while (scan < loceol && isALNUM_LC(*scan))
4795 LOAD_UTF8_CHARCLASS_ALNUM();
4796 while (hardcount < max && scan < loceol &&
4797 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4798 scan += UTF8SKIP(scan);
4802 while (scan < loceol && !isALNUM(*scan))
4807 PL_reg_flags |= RF_tainted;
4810 while (hardcount < max && scan < loceol &&
4811 !isALNUM_LC_utf8((U8*)scan)) {
4812 scan += UTF8SKIP(scan);
4816 while (scan < loceol && !isALNUM_LC(*scan))
4823 LOAD_UTF8_CHARCLASS_SPACE();
4824 while (hardcount < max && scan < loceol &&
4826 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4827 scan += UTF8SKIP(scan);
4831 while (scan < loceol && isSPACE(*scan))
4836 PL_reg_flags |= RF_tainted;
4839 while (hardcount < max && scan < loceol &&
4840 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4841 scan += UTF8SKIP(scan);
4845 while (scan < loceol && isSPACE_LC(*scan))
4852 LOAD_UTF8_CHARCLASS_SPACE();
4853 while (hardcount < max && scan < loceol &&
4855 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4856 scan += UTF8SKIP(scan);
4860 while (scan < loceol && !isSPACE(*scan))
4865 PL_reg_flags |= RF_tainted;
4868 while (hardcount < max && scan < loceol &&
4869 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4870 scan += UTF8SKIP(scan);
4874 while (scan < loceol && !isSPACE_LC(*scan))
4881 LOAD_UTF8_CHARCLASS_DIGIT();
4882 while (hardcount < max && scan < loceol &&
4883 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4884 scan += UTF8SKIP(scan);
4888 while (scan < loceol && isDIGIT(*scan))
4895 LOAD_UTF8_CHARCLASS_DIGIT();
4896 while (hardcount < max && scan < loceol &&
4897 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4898 scan += UTF8SKIP(scan);
4902 while (scan < loceol && !isDIGIT(*scan))
4906 default: /* Called on something of 0 width. */
4907 break; /* So match right here or not at all. */
4913 c = scan - PL_reginput;
4917 GET_RE_DEBUG_FLAGS_DECL;
4919 SV * const prop = sv_newmortal();
4920 regprop(prog, prop, p);
4921 PerlIO_printf(Perl_debug_log,
4922 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4923 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4931 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4933 - regclass_swash - prepare the utf8 swash
4937 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4943 const struct reg_data * const data = prog ? prog->data : NULL;
4945 if (data && data->count) {
4946 const U32 n = ARG(node);
4948 if (data->what[n] == 's') {
4949 SV * const rv = (SV*)data->data[n];
4950 AV * const av = (AV*)SvRV((SV*)rv);
4951 SV **const ary = AvARRAY(av);
4954 /* See the end of regcomp.c:S_regclass() for
4955 * documentation of these array elements. */
4958 a = SvROK(ary[1]) ? &ary[1] : 0;
4959 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4963 else if (si && doinit) {
4964 sw = swash_init("utf8", "", si, 1, 0);
4965 (void)av_store(av, 1, sw);
4982 - reginclass - determine if a character falls into a character class
4984 The n is the ANYOF regnode, the p is the target string, lenp
4985 is pointer to the maximum length of how far to go in the p
4986 (if the lenp is zero, UTF8SKIP(p) is used),
4987 do_utf8 tells whether the target string is in UTF-8.
4992 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4995 const char flags = ANYOF_FLAGS(n);
5001 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5002 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5003 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5004 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5005 if (len == (STRLEN)-1)
5006 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5009 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5010 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5013 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5014 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5017 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5021 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5024 if (swash_fetch(sw, p, do_utf8))
5026 else if (flags & ANYOF_FOLD) {
5027 if (!match && lenp && av) {
5029 for (i = 0; i <= av_len(av); i++) {
5030 SV* const sv = *av_fetch(av, i, FALSE);
5032 const char * const s = SvPV_const(sv, len);
5034 if (len <= plen && memEQ(s, (char*)p, len)) {
5042 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5045 to_utf8_fold(p, tmpbuf, &tmplen);
5046 if (swash_fetch(sw, tmpbuf, do_utf8))
5052 if (match && lenp && *lenp == 0)
5053 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5055 if (!match && c < 256) {
5056 if (ANYOF_BITMAP_TEST(n, c))
5058 else if (flags & ANYOF_FOLD) {
5061 if (flags & ANYOF_LOCALE) {
5062 PL_reg_flags |= RF_tainted;
5063 f = PL_fold_locale[c];
5067 if (f != c && ANYOF_BITMAP_TEST(n, f))
5071 if (!match && (flags & ANYOF_CLASS)) {
5072 PL_reg_flags |= RF_tainted;
5074 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5075 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5076 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5077 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5078 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5079 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5080 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5081 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5082 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5083 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5084 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5085 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5086 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5087 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5088 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5089 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5090 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5091 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5092 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5093 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5094 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5095 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5096 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5097 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5098 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5099 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5100 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5101 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5102 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5103 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5104 ) /* How's that for a conditional? */
5111 return (flags & ANYOF_INVERT) ? !match : match;
5115 S_reghop3(U8 *s, I32 off, const U8* lim)
5119 while (off-- && s < lim) {
5120 /* XXX could check well-formedness here */
5125 while (off++ && s > lim) {
5127 if (UTF8_IS_CONTINUED(*s)) {
5128 while (s > lim && UTF8_IS_CONTINUATION(*s))
5131 /* XXX could check well-formedness here */
5138 /* there are a bunch of places where we use two reghop3's that should
5139 be replaced with this routine. but since thats not done yet
5140 we ifdef it out - dmq
5143 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5147 while (off-- && s < rlim) {
5148 /* XXX could check well-formedness here */
5153 while (off++ && s > llim) {
5155 if (UTF8_IS_CONTINUED(*s)) {
5156 while (s > llim && UTF8_IS_CONTINUATION(*s))
5159 /* XXX could check well-formedness here */
5167 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5171 while (off-- && s < lim) {
5172 /* XXX could check well-formedness here */
5179 while (off++ && s > lim) {
5181 if (UTF8_IS_CONTINUED(*s)) {
5182 while (s > lim && UTF8_IS_CONTINUATION(*s))
5185 /* XXX could check well-formedness here */
5194 restore_pos(pTHX_ void *arg)
5197 regexp * const rex = (regexp *)arg;
5198 if (PL_reg_eval_set) {
5199 if (PL_reg_oldsaved) {
5200 rex->subbeg = PL_reg_oldsaved;
5201 rex->sublen = PL_reg_oldsavedlen;
5202 #ifdef PERL_OLD_COPY_ON_WRITE
5203 rex->saved_copy = PL_nrs;
5205 RX_MATCH_COPIED_on(rex);
5207 PL_reg_magic->mg_len = PL_reg_oldpos;
5208 PL_reg_eval_set = 0;
5209 PL_curpm = PL_reg_oldcurpm;
5214 S_to_utf8_substr(pTHX_ register regexp *prog)
5216 if (prog->float_substr && !prog->float_utf8) {
5217 SV* const sv = newSVsv(prog->float_substr);
5218 prog->float_utf8 = sv;
5219 sv_utf8_upgrade(sv);
5220 if (SvTAIL(prog->float_substr))
5222 if (prog->float_substr == prog->check_substr)
5223 prog->check_utf8 = sv;
5225 if (prog->anchored_substr && !prog->anchored_utf8) {
5226 SV* const sv = newSVsv(prog->anchored_substr);
5227 prog->anchored_utf8 = sv;
5228 sv_utf8_upgrade(sv);
5229 if (SvTAIL(prog->anchored_substr))
5231 if (prog->anchored_substr == prog->check_substr)
5232 prog->check_utf8 = sv;
5237 S_to_byte_substr(pTHX_ register regexp *prog)
5240 if (prog->float_utf8 && !prog->float_substr) {
5241 SV* sv = newSVsv(prog->float_utf8);
5242 prog->float_substr = sv;
5243 if (sv_utf8_downgrade(sv, TRUE)) {
5244 if (SvTAIL(prog->float_utf8))
5248 prog->float_substr = sv = &PL_sv_undef;
5250 if (prog->float_utf8 == prog->check_utf8)
5251 prog->check_substr = sv;
5253 if (prog->anchored_utf8 && !prog->anchored_substr) {
5254 SV* sv = newSVsv(prog->anchored_utf8);
5255 prog->anchored_substr = sv;
5256 if (sv_utf8_downgrade(sv, TRUE)) {
5257 if (SvTAIL(prog->anchored_utf8))
5261 prog->anchored_substr = sv = &PL_sv_undef;
5263 if (prog->anchored_utf8 == prog->check_utf8)
5264 prog->check_substr = sv;
5270 * c-indentation-style: bsd
5272 * indent-tabs-mode: t
5275 * ex: set ts=8 sts=4 sw=4 noet: