5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 8
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHPTR(PL_regstartp);
186 SSPUSHPTR(PL_regendp);
187 SSPUSHINT(PL_regsize);
188 SSPUSHINT(*PL_reglastparen);
189 SSPUSHINT(*PL_reglastcloseparen);
190 SSPUSHPTR(PL_reginput);
191 #define REGCP_FRAME_ELEMS 2
192 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
193 * are needed for the regexp context stack bookkeeping. */
194 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
195 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
200 /* These are needed since we do not localize EVAL nodes: */
201 #define REGCP_SET(cp) \
203 PerlIO_printf(Perl_debug_log, \
204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
205 (IV)PL_savestack_ix)); \
208 #define REGCP_UNWIND(cp) \
210 if (cp != PL_savestack_ix) \
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
213 (IV)(cp), (IV)PL_savestack_ix)); \
217 S_regcppop(pTHX_ const regexp *rex)
223 GET_RE_DEBUG_FLAGS_DECL;
225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
229 input = (char *) SSPOPPTR;
230 *PL_reglastcloseparen = SSPOPINT;
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
233 PL_regendp=(I32 *) SSPOPPTR;
234 PL_regstartp=(I32 *) SSPOPPTR;
237 /* Now restore the parentheses context. */
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
241 U32 paren = (U32)SSPOPINT;
242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
243 PL_regstartp[paren] = SSPOPINT;
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
248 PerlIO_printf(Perl_debug_log,
249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
250 (UV)paren, (IV)PL_regstartp[paren],
251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
252 (IV)PL_regendp[paren],
253 (paren > *PL_reglastparen ? "(no)" : ""));
257 if (*PL_reglastparen + 1 <= rex->nparens) {
258 PerlIO_printf(Perl_debug_log,
259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
260 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
276 PL_regstartp[i] = -1;
283 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
286 * pregexec and friends
289 #ifndef PERL_IN_XSUB_RE
291 - pregexec - match a regexp against a string
294 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
295 char *strbeg, I32 minend, SV *screamer, U32 nosave)
296 /* strend: pointer to null at end of string */
297 /* strbeg: real beginning of string */
298 /* minend: end of match must be >=minend after stringarg. */
299 /* nosave: For optimizations. */
302 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
303 nosave ? 0 : REXEC_COPY_STR);
308 * Need to implement the following flags for reg_anch:
310 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
313 * INTUIT_AUTORITATIVE_ML
314 * INTUIT_ONCE_NOML - Intuit can match in one location only.
317 * Another flag for this function: SECOND_TIME (so that float substrs
318 * with giant delta may be not rechecked).
321 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
324 Otherwise, only SvCUR(sv) is used to get strbeg. */
326 /* XXXX We assume that strpos is strbeg unless sv. */
328 /* XXXX Some places assume that there is a fixed substring.
329 An update may be needed if optimizer marks as "INTUITable"
330 RExen without fixed substrings. Similarly, it is assumed that
331 lengths of all the strings are no more than minlen, thus they
332 cannot come from lookahead.
333 (Or minlen should take into account lookahead.) */
335 /* A failure to find a constant substring means that there is no need to make
336 an expensive call to REx engine, thus we celebrate a failure. Similarly,
337 finding a substring too deep into the string means that less calls to
338 regtry() should be needed.
340 REx compiler's optimizer found 4 possible hints:
341 a) Anchored substring;
343 c) Whether we are anchored (beginning-of-line or \G);
344 d) First node (of those at offset 0) which may distingush positions;
345 We use a)b)d) and multiline-part of c), and try to find a position in the
346 string which does not contradict any of them.
349 /* Most of decisions we do here should have been done at compile time.
350 The nodes of the REx which we used for the search should have been
351 deleted from the finite automaton. */
354 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
355 char *strend, U32 flags, re_scream_pos_data *data)
358 register I32 start_shift = 0;
359 /* Should be nonnegative! */
360 register I32 end_shift = 0;
365 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
367 register char *other_last = NULL; /* other substr checked before this */
368 char *check_at = NULL; /* check substr found at this pos */
369 const I32 multiline = prog->reganch & PMf_MULTILINE;
371 const char * const i_strpos = strpos;
374 GET_RE_DEBUG_FLAGS_DECL;
376 RX_MATCH_UTF8_set(prog,do_utf8);
378 if (prog->reganch & ROPT_UTF8) {
379 PL_reg_flags |= RF_utf8;
382 debug_start_match(prog, do_utf8, strpos, strend,
383 sv ? "Guessing start of match in sv for"
384 : "Guessing start of match in string for");
387 /* CHR_DIST() would be more correct here but it makes things slow. */
388 if (prog->minlen > strend - strpos) {
389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
390 "String too short... [re_intuit_start]\n"));
394 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
397 if (!prog->check_utf8 && prog->check_substr)
398 to_utf8_substr(prog);
399 check = prog->check_utf8;
401 if (!prog->check_substr && prog->check_utf8)
402 to_byte_substr(prog);
403 check = prog->check_substr;
405 if (check == &PL_sv_undef) {
406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
407 "Non-utf8 string cannot match utf8 check string\n"));
410 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
411 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
412 || ( (prog->reganch & ROPT_ANCH_BOL)
413 && !multiline ) ); /* Check after \n? */
416 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
417 | ROPT_IMPLICIT)) /* not a real BOL */
418 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
420 && (strpos != strbeg)) {
421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
424 if (prog->check_offset_min == prog->check_offset_max &&
425 !(prog->reganch & ROPT_CANY_SEEN)) {
426 /* Substring at constant offset from beg-of-str... */
429 s = HOP3c(strpos, prog->check_offset_min, strend);
432 slen = SvCUR(check); /* >= 1 */
434 if ( strend - s > slen || strend - s < slen - 1
435 || (strend - s == slen && strend[-1] != '\n')) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
439 /* Now should match s[0..slen-2] */
441 if (slen && (*SvPVX_const(check) != *s
443 && memNE(SvPVX_const(check), s, slen)))) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
449 else if (*SvPVX_const(check) != *s
450 || ((slen = SvCUR(check)) > 1
451 && memNE(SvPVX_const(check), s, slen)))
454 goto success_at_start;
457 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
459 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
460 end_shift = prog->check_end_shift;
463 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
464 - (SvTAIL(check) != 0);
465 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
467 if (end_shift < eshift)
471 else { /* Can match at random position */
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->check_end_shift;
477 /* end shift should be non negative here */
480 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
482 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
483 (IV)end_shift, prog->precomp);
487 /* Find a possible match in the region s..strend by looking for
488 the "check" substring in the region corrected by start/end_shift. */
491 I32 srch_start_shift = start_shift;
492 I32 srch_end_shift = end_shift;
493 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
494 srch_end_shift -= ((strbeg - s) - srch_start_shift);
495 srch_start_shift = strbeg - s;
497 DEBUG_OPTIMISE_MORE_r({
498 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
499 (IV)prog->check_offset_min,
500 (IV)srch_start_shift,
502 (IV)prog->check_end_shift);
505 if (flags & REXEC_SCREAM) {
506 I32 p = -1; /* Internal iterator of scream. */
507 I32 * const pp = data ? data->scream_pos : &p;
509 if (PL_screamfirst[BmRARE(check)] >= 0
510 || ( BmRARE(check) == '\n'
511 && (BmPREVIOUS(check) == SvCUR(check) - 1)
513 s = screaminstr(sv, check,
514 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
517 /* we may be pointing at the wrong string */
518 if (s && RX_MATCH_COPIED(prog))
519 s = strbeg + (s - SvPVX_const(sv));
521 *data->scream_olds = s;
526 if (prog->reganch & ROPT_CANY_SEEN) {
527 start_point= (U8*)(s + srch_start_shift);
528 end_point= (U8*)(strend - srch_end_shift);
530 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
531 end_point= HOP3(strend, -srch_end_shift, strbeg);
533 DEBUG_OPTIMISE_MORE_r({
534 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
535 (int)(end_point - start_point),
536 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
540 s = fbm_instr( start_point, end_point,
541 check, multiline ? FBMrf_MULTILINE : 0);
544 /* Update the count-of-usability, remove useless subpatterns,
548 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
549 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
550 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
551 (s ? "Found" : "Did not find"),
552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
553 ? "anchored" : "floating"),
556 (s ? " at offset " : "...\n") );
561 /* Finish the diagnostic message */
562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
564 /* XXX dmq: first branch is for positive lookbehind...
565 Our check string is offset from the beginning of the pattern.
566 So we need to do any stclass tests offset forward from that
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
578 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
584 : (prog->float_substr && prog->anchored_substr))
586 /* Take into account the "other" substring. */
587 /* XXXX May be hopelessly wrong for UTF... */
590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
593 char * const last = HOP3c(s, -start_shift, strbeg);
595 char * const saved_s = s;
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
606 t = HOP3c(t, prog->anchored_offset, strend);
607 if (t < other_last) /* These positions already checked */
609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
612 /* XXXX It is not documented what units *_offsets are in.
613 We assume bytes, but this is clearly wrong.
614 Meaning this code needs to be carefully reviewed for errors.
618 /* On end-of-str: see comment below. */
619 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
620 if (must == &PL_sv_undef) {
622 DEBUG_r(must = prog->anchored_utf8); /* for debug */
627 HOP3(HOP3(last1, prog->anchored_offset, strend)
628 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
630 multiline ? FBMrf_MULTILINE : 0
633 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
634 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
635 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
636 (s ? "Found" : "Contradicts"),
637 quoted, RE_SV_TAIL(must));
642 if (last1 >= last2) {
643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
644 ", giving up...\n"));
647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
648 ", trying floating at offset %ld...\n",
649 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
650 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
651 s = HOP3c(last, 1, strend);
655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
656 (long)(s - i_strpos)));
657 t = HOP3c(s, -prog->anchored_offset, strbeg);
658 other_last = HOP3c(s, 1, strend);
666 else { /* Take into account the floating substring. */
668 char * const saved_s = s;
671 t = HOP3c(s, -start_shift, strbeg);
673 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
674 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
675 last = HOP3c(t, prog->float_max_offset, strend);
676 s = HOP3c(t, prog->float_min_offset, strend);
679 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
680 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
681 /* fbm_instr() takes into account exact value of end-of-str
682 if the check is SvTAIL(ed). Since false positives are OK,
683 and end-of-str is not later than strend we are OK. */
684 if (must == &PL_sv_undef) {
686 DEBUG_r(must = prog->float_utf8); /* for debug message */
689 s = fbm_instr((unsigned char*)s,
690 (unsigned char*)last + SvCUR(must)
692 must, multiline ? FBMrf_MULTILINE : 0);
694 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
695 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
696 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
697 (s ? "Found" : "Contradicts"),
698 quoted, RE_SV_TAIL(must));
702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
703 ", giving up...\n"));
706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
707 ", trying anchored starting at offset %ld...\n",
708 (long)(saved_s + 1 - i_strpos)));
710 s = HOP3c(t, 1, strend);
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
715 (long)(s - i_strpos)));
716 other_last = s; /* Fix this later. --Hugo */
726 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
728 DEBUG_OPTIMISE_MORE_r(
729 PerlIO_printf(Perl_debug_log,
730 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
731 (IV)prog->check_offset_min,
732 (IV)prog->check_offset_max,
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
748 if (ml_anch && t[-1] != '\n') {
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
756 while (t < strend - prog->minlen) {
758 if (t < check_at - prog->check_offset_min) {
759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
769 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
770 goto do_other_anchored;
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
776 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
783 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
784 other_last = strpos = s = t + 1;
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
790 PL_colors[0], PL_colors[1]));
794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
795 PL_colors[0], PL_colors[1]));
799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
802 /* The found string does not prohibit matching at strpos,
803 - no optimization of calling REx engine can be performed,
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
810 && (strpos != strbeg) && strpos[-1] != '\n'
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
817 DEBUG_EXECUTE_r( if (ml_anch)
818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
819 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
833 /* If flags & SOMETHING - do not do it many times on the same match */
834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
838 prog->check_substr = prog->check_utf8 = NULL; /* disable */
839 prog->float_substr = prog->float_utf8 = NULL; /* clear */
840 check = NULL; /* abort */
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
845 prog->reganch &= ~RE_USE_INTUIT;
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
853 /* trie stclasses are too expensive to use here, we are better off to
854 leave it to regmatch itself */
855 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
864 const U8* const str = (U8*)STRING(prog->regstclass);
865 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
869 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
871 else if (prog->float_substr || prog->float_utf8)
872 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
877 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
880 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
883 const char *what = NULL;
885 if (endpos == strend) {
886 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
887 "Could not match STCLASS...\n") );
890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
891 "This position contradicts STCLASS...\n") );
892 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
894 /* Contradict one of substrings */
895 if (prog->anchored_substr || prog->anchored_utf8) {
896 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
897 DEBUG_EXECUTE_r( what = "anchored" );
899 s = HOP3c(t, 1, strend);
900 if (s + start_shift + end_shift > strend) {
901 /* XXXX Should be taken into account earlier? */
902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
903 "Could not match STCLASS...\n") );
908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
909 "Looking for %s substr starting at offset %ld...\n",
910 what, (long)(s + start_shift - i_strpos)) );
913 /* Have both, check_string is floating */
914 if (t + start_shift >= check_at) /* Contradicts floating=check */
915 goto retry_floating_check;
916 /* Recheck anchored substring, but not floating... */
920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
921 "Looking for anchored substr starting at offset %ld...\n",
922 (long)(other_last - i_strpos)) );
923 goto do_other_anchored;
925 /* Another way we could have checked stclass at the
926 current position only: */
931 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
932 "Looking for /%s^%s/m starting at offset %ld...\n",
933 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
936 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
938 /* Check is floating subtring. */
939 retry_floating_check:
940 t = check_at - start_shift;
941 DEBUG_EXECUTE_r( what = "floating" );
942 goto hop_and_restart;
945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946 "By STCLASS: moving %ld --> %ld\n",
947 (long)(t - i_strpos), (long)(s - i_strpos))
951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
952 "Does not contradict STCLASS...\n");
957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
958 PL_colors[4], (check ? "Guessed" : "Giving up"),
959 PL_colors[5], (long)(s - i_strpos)) );
962 fail_finish: /* Substring not found */
963 if (prog->check_substr || prog->check_utf8) /* could be removed already */
964 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
967 PL_colors[4], PL_colors[5]));
973 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
974 foldlen, foldbuf, uniflags) STMT_START { \
975 switch (trie_type) { \
976 case trie_utf8_fold: \
978 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
983 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
985 foldlen -= UNISKIP( uvc ); \
986 uscan = foldbuf + UNISKIP( uvc ); \
990 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
998 charid = trie->charmap[ uvc ]; \
1002 if (trie->widecharmap) { \
1003 SV** const svpp = hv_fetch(trie->widecharmap, \
1004 (char*)&uvc, sizeof(UV), 0); \
1006 charid = (U16)SvIV(*svpp); \
1011 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1014 ibcmp_utf8(s, NULL, 0, do_utf8, \
1015 m, NULL, ln, (bool)UTF)) \
1016 && (!reginfo || regtry(reginfo, s)) ) \
1019 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1020 uvchr_to_utf8(tmpbuf, c); \
1021 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1023 && (f == c1 || f == c2) \
1024 && (ln == foldlen || \
1025 !ibcmp_utf8((char *) foldbuf, \
1026 NULL, foldlen, do_utf8, \
1028 NULL, ln, (bool)UTF)) \
1029 && (!reginfo || regtry(reginfo, s)) ) \
1034 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1038 && (ln == 1 || !(OP(c) == EXACTF \
1040 : ibcmp_locale(s, m, ln))) \
1041 && (!reginfo || regtry(reginfo, s)) ) \
1047 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1049 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1055 #define REXEC_FBC_SCAN(CoDe) \
1057 while (s < strend) { \
1063 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1064 REXEC_FBC_UTF8_SCAN( \
1066 if (tmp && (!reginfo || regtry(reginfo, s))) \
1075 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1078 if (tmp && (!reginfo || regtry(reginfo, s))) \
1087 #define REXEC_FBC_TRYIT \
1088 if ((!reginfo || regtry(reginfo, s))) \
1091 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1094 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1097 REXEC_FBC_CLASS_SCAN(CoNd); \
1101 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1102 PL_reg_flags |= RF_tainted; \
1104 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1107 REXEC_FBC_CLASS_SCAN(CoNd); \
1111 #define DUMP_EXEC_POS(li,s,doutf8) \
1112 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1114 /* We know what class REx starts with. Try to find this position... */
1115 /* if reginfo is NULL, its a dryrun */
1116 /* annoyingly all the vars in this routine have different names from their counterparts
1117 in regmatch. /grrr */
1120 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1121 const char *strend, const regmatch_info *reginfo)
1124 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1128 register STRLEN uskip;
1132 register I32 tmp = 1; /* Scratch variable? */
1133 register const bool do_utf8 = PL_reg_match_utf8;
1135 /* We know what class it must start with. */
1139 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1140 !UTF8_IS_INVARIANT((U8)s[0]) ?
1141 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1142 REGINCLASS(prog, c, (U8*)s));
1145 while (s < strend) {
1148 if (REGINCLASS(prog, c, (U8*)s) ||
1149 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1150 /* The assignment of 2 is intentional:
1151 * for the folded sharp s, the skip is 2. */
1152 (skip = SHARP_S_SKIP))) {
1153 if (tmp && (!reginfo || regtry(reginfo, s)))
1166 if (tmp && (!reginfo || regtry(reginfo, s)))
1174 ln = STR_LEN(c); /* length to match in octets/bytes */
1175 lnc = (I32) ln; /* length to match in characters */
1177 STRLEN ulen1, ulen2;
1179 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1180 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1181 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1183 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1184 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1186 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1188 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1191 while (sm < ((U8 *) m + ln)) {
1206 c2 = PL_fold_locale[c1];
1208 e = HOP3c(strend, -((I32)lnc), s);
1210 if (!reginfo && e < s)
1211 e = s; /* Due to minlen logic of intuit() */
1213 /* The idea in the EXACTF* cases is to first find the
1214 * first character of the EXACTF* node and then, if
1215 * necessary, case-insensitively compare the full
1216 * text of the node. The c1 and c2 are the first
1217 * characters (though in Unicode it gets a bit
1218 * more complicated because there are more cases
1219 * than just upper and lower: one needs to use
1220 * the so-called folding case for case-insensitive
1221 * matching (called "loose matching" in Unicode).
1222 * ibcmp_utf8() will do just that. */
1226 U8 tmpbuf [UTF8_MAXBYTES+1];
1227 STRLEN len, foldlen;
1228 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1230 /* Upper and lower of 1st char are equal -
1231 * probably not a "letter". */
1233 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1235 REXEC_FBC_EXACTISH_CHECK(c == c1);
1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1243 /* Handle some of the three Greek sigmas cases.
1244 * Note that not all the possible combinations
1245 * are handled here: some of them are handled
1246 * by the standard folding rules, and some of
1247 * them (the character class or ANYOF cases)
1248 * are handled during compiletime in
1249 * regexec.c:S_regclass(). */
1250 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1251 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1252 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1254 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1260 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1262 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1266 PL_reg_flags |= RF_tainted;
1273 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1274 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1276 tmp = ((OP(c) == BOUND ?
1277 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1278 LOAD_UTF8_CHARCLASS_ALNUM();
1279 REXEC_FBC_UTF8_SCAN(
1280 if (tmp == !(OP(c) == BOUND ?
1281 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1282 isALNUM_LC_utf8((U8*)s)))
1290 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1291 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1294 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1300 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1304 PL_reg_flags |= RF_tainted;
1311 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1312 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1314 tmp = ((OP(c) == NBOUND ?
1315 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1316 LOAD_UTF8_CHARCLASS_ALNUM();
1317 REXEC_FBC_UTF8_SCAN(
1318 if (tmp == !(OP(c) == NBOUND ?
1319 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1320 isALNUM_LC_utf8((U8*)s)))
1322 else REXEC_FBC_TRYIT;
1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1327 tmp = ((OP(c) == NBOUND ?
1328 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1331 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1333 else REXEC_FBC_TRYIT;
1336 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1340 REXEC_FBC_CSCAN_PRELOAD(
1341 LOAD_UTF8_CHARCLASS_ALNUM(),
1342 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1346 REXEC_FBC_CSCAN_TAINT(
1347 isALNUM_LC_utf8((U8*)s),
1351 REXEC_FBC_CSCAN_PRELOAD(
1352 LOAD_UTF8_CHARCLASS_ALNUM(),
1353 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1357 REXEC_FBC_CSCAN_TAINT(
1358 !isALNUM_LC_utf8((U8*)s),
1362 REXEC_FBC_CSCAN_PRELOAD(
1363 LOAD_UTF8_CHARCLASS_SPACE(),
1364 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1368 REXEC_FBC_CSCAN_TAINT(
1369 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1373 REXEC_FBC_CSCAN_PRELOAD(
1374 LOAD_UTF8_CHARCLASS_SPACE(),
1375 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1379 REXEC_FBC_CSCAN_TAINT(
1380 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1384 REXEC_FBC_CSCAN_PRELOAD(
1385 LOAD_UTF8_CHARCLASS_DIGIT(),
1386 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1390 REXEC_FBC_CSCAN_TAINT(
1391 isDIGIT_LC_utf8((U8*)s),
1395 REXEC_FBC_CSCAN_PRELOAD(
1396 LOAD_UTF8_CHARCLASS_DIGIT(),
1397 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1401 REXEC_FBC_CSCAN_TAINT(
1402 !isDIGIT_LC_utf8((U8*)s),
1408 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1409 trie_type = do_utf8 ?
1410 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1412 /* what trie are we using right now */
1414 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1415 reg_trie_data *trie=aho->trie;
1417 const char *last_start = strend - trie->minlen;
1419 const char *real_start = s;
1421 STRLEN maxlen = trie->maxlen;
1423 U8 **points; /* map of where we were in the input string
1424 when reading a given char. For ASCII this
1425 is unnecessary overhead as the relationship
1426 is always 1:1, but for unicode, especially
1427 case folded unicode this is not true. */
1428 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1432 GET_RE_DEBUG_FLAGS_DECL;
1434 /* We can't just allocate points here. We need to wrap it in
1435 * an SV so it gets freed properly if there is a croak while
1436 * running the match */
1439 sv_points=newSV(maxlen * sizeof(U8 *));
1440 SvCUR_set(sv_points,
1441 maxlen * sizeof(U8 *));
1442 SvPOK_on(sv_points);
1443 sv_2mortal(sv_points);
1444 points=(U8**)SvPV_nolen(sv_points );
1445 if ( trie_type != trie_utf8_fold
1446 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1449 bitmap=(U8*)trie->bitmap;
1451 bitmap=(U8*)ANYOF_BITMAP(c);
1453 /* this is the Aho-Corasick algorithm modified a touch
1454 to include special handling for long "unknown char"
1455 sequences. The basic idea being that we use AC as long
1456 as we are dealing with a possible matching char, when
1457 we encounter an unknown char (and we have not encountered
1458 an accepting state) we scan forward until we find a legal
1460 AC matching is basically that of trie matching, except
1461 that when we encounter a failing transition, we fall back
1462 to the current states "fail state", and try the current char
1463 again, a process we repeat until we reach the root state,
1464 state 1, or a legal transition. If we fail on the root state
1465 then we can either terminate if we have reached an accepting
1466 state previously, or restart the entire process from the beginning
1470 while (s <= last_start) {
1471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1479 U8 *uscan = (U8*)NULL;
1480 U8 *leftmost = NULL;
1482 U32 accepted_word= 0;
1486 while ( state && uc <= (U8*)strend ) {
1488 U32 word = aho->states[ state ].wordnum;
1492 DEBUG_TRIE_EXECUTE_r(
1493 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1494 dump_exec_pos( (char *)uc, c, strend, real_start,
1495 (char *)uc, do_utf8 );
1496 PerlIO_printf( Perl_debug_log,
1497 " Scanning for legal start char...\n");
1500 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1505 if (uc >(U8*)last_start) break;
1509 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1510 if (!leftmost || lpos < leftmost) {
1511 DEBUG_r(accepted_word=word);
1517 points[pointpos++ % maxlen]= uc;
1518 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1519 uvc, charid, foldlen, foldbuf, uniflags);
1520 DEBUG_TRIE_EXECUTE_r({
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1523 PerlIO_printf(Perl_debug_log,
1524 " Charid:%3u CP:%4"UVxf" ",
1530 word = aho->states[ state ].wordnum;
1532 base = aho->states[ state ].trans.base;
1534 DEBUG_TRIE_EXECUTE_r({
1536 dump_exec_pos( (char *)uc, c, strend, real_start,
1538 PerlIO_printf( Perl_debug_log,
1539 "%sState: %4"UVxf", word=%"UVxf,
1540 failed ? " Fail transition to " : "",
1541 (UV)state, (UV)word);
1546 (base + charid > trie->uniquecharcount )
1547 && (base + charid - 1 - trie->uniquecharcount
1549 && trie->trans[base + charid - 1 -
1550 trie->uniquecharcount].check == state
1551 && (tmp=trie->trans[base + charid - 1 -
1552 trie->uniquecharcount ].next))
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - legal\n"));
1560 DEBUG_TRIE_EXECUTE_r(
1561 PerlIO_printf( Perl_debug_log," - fail\n"));
1563 state = aho->fail[state];
1567 /* we must be accepting here */
1568 DEBUG_TRIE_EXECUTE_r(
1569 PerlIO_printf( Perl_debug_log," - accepting\n"));
1578 if (!state) state = 1;
1581 if ( aho->states[ state ].wordnum ) {
1582 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1583 if (!leftmost || lpos < leftmost) {
1584 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1589 s = (char*)leftmost;
1590 DEBUG_TRIE_EXECUTE_r({
1592 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1593 (UV)accepted_word, s - real_start
1596 if (!reginfo || regtry(reginfo, s)) {
1602 DEBUG_TRIE_EXECUTE_r({
1603 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1606 DEBUG_TRIE_EXECUTE_r(
1607 PerlIO_printf( Perl_debug_log,"No match.\n"));
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1625 - regexec_flags - match a regexp against a string
1628 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630 /* strend: pointer to null at end of string */
1631 /* strbeg: real beginning of string */
1632 /* minend: end of match must be >=minend after stringarg. */
1633 /* data: May be used for some additional optimizations. */
1634 /* nosave: For optimizations. */
1638 register regnode *c;
1639 register char *startpos = stringarg;
1640 I32 minlen; /* must match at least this many chars */
1641 I32 dontbother = 0; /* how many characters not to try at end */
1642 I32 end_shift = 0; /* Same for the end. */ /* CC */
1643 I32 scream_pos = -1; /* Internal iterator of scream. */
1644 char *scream_olds = NULL;
1645 SV* const oreplsv = GvSV(PL_replgv);
1646 const bool do_utf8 = (bool)DO_UTF8(sv);
1649 regmatch_info reginfo; /* create some info to pass to regtry etc */
1651 GET_RE_DEBUG_FLAGS_DECL;
1653 PERL_UNUSED_ARG(data);
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1661 multiline = prog->reganch & PMf_MULTILINE;
1662 reginfo.prog = prog;
1664 RX_MATCH_UTF8_set(prog, do_utf8);
1666 debug_start_match(prog, do_utf8, startpos, strend,
1670 minlen = prog->minlen;
1672 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1674 "String too short [regexec_flags]...\n"));
1679 /* Check validity of program. */
1680 if (UCHARAT(prog->program) != REG_MAGIC) {
1681 Perl_croak(aTHX_ "corrupted regexp program");
1685 PL_reg_eval_set = 0;
1688 if (prog->reganch & ROPT_UTF8)
1689 PL_reg_flags |= RF_utf8;
1691 /* Mark beginning of line for ^ and lookbehind. */
1692 reginfo.bol = startpos; /* XXX not used ??? */
1696 /* Mark end of line for $ (and such) */
1699 /* see how far we have to get to not match where we matched before */
1700 reginfo.till = startpos+minend;
1702 /* If there is a "must appear" string, look for it. */
1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1709 reginfo.ganch = startpos;
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
1714 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1715 if (prog->reganch & ROPT_ANCH_GPOS) {
1716 if (s > reginfo.ganch)
1721 else /* pos() not defined */
1722 reginfo.ganch = strbeg;
1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1726 re_scream_pos_data d;
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1733 goto phooey; /* not present */
1739 /* Simplest case: anchored match need be tried only once. */
1740 /* [unless only anchor is BOL and multiline is set] */
1741 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1742 if (s == startpos && regtry(®info, startpos))
1744 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1745 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1750 dontbother = minlen - 1;
1751 end = HOP3c(strend, -dontbother, strbeg) - 1;
1752 /* for multiline we only have to try after newlines */
1753 if (prog->check_substr || prog->check_utf8) {
1757 if (regtry(®info, s))
1762 if (prog->reganch & RE_USE_INTUIT) {
1763 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1774 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1775 if (regtry(®info, s))
1782 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1784 /* the warning about reginfo.ganch being used without intialization
1785 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1786 and we only enter this block when the same bit is set. */
1787 if (regtry(®info, reginfo.ganch))
1792 /* Messy cases: unanchored match. */
1793 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1794 /* we have /x+whatever/ */
1795 /* it must be a one character string (XXXX Except UTF?) */
1800 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1801 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1802 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1807 DEBUG_EXECUTE_r( did_match = 1 );
1808 if (regtry(®info, s)) goto got_it;
1810 while (s < strend && *s == ch)
1818 DEBUG_EXECUTE_r( did_match = 1 );
1819 if (regtry(®info, s)) goto got_it;
1821 while (s < strend && *s == ch)
1826 DEBUG_EXECUTE_r(if (!did_match)
1827 PerlIO_printf(Perl_debug_log,
1828 "Did not find anchored character...\n")
1831 else if (prog->anchored_substr != NULL
1832 || prog->anchored_utf8 != NULL
1833 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1834 && prog->float_max_offset < strend - s)) {
1839 char *last1; /* Last position checked before */
1843 if (prog->anchored_substr || prog->anchored_utf8) {
1844 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1847 back_max = back_min = prog->anchored_offset;
1849 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1850 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1851 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1852 back_max = prog->float_max_offset;
1853 back_min = prog->float_min_offset;
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1864 last = HOP3c(strend, /* Cannot start after this */
1865 -(I32)(CHR_SVLEN(must)
1866 - (SvTAIL(must) != 0) + back_min), strbeg);
1869 last1 = HOPc(s, -1);
1871 last1 = s - 1; /* bogus */
1873 /* XXXX check_substr already used to find "s", can optimize if
1874 check_substr==must. */
1876 dontbother = end_shift;
1877 strend = HOPc(strend, -dontbother);
1878 while ( (s <= last) &&
1879 ((flags & REXEC_SCREAM)
1880 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1881 end_shift, &scream_pos, 0))
1882 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1883 (unsigned char*)strend, must,
1884 multiline ? FBMrf_MULTILINE : 0))) ) {
1885 /* we may be pointing at the wrong string */
1886 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1887 s = strbeg + (s - SvPVX_const(sv));
1888 DEBUG_EXECUTE_r( did_match = 1 );
1889 if (HOPc(s, -back_max) > last1) {
1890 last1 = HOPc(s, -back_min);
1891 s = HOPc(s, -back_max);
1894 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1896 last1 = HOPc(s, -back_min);
1900 while (s <= last1) {
1901 if (regtry(®info, s))
1907 while (s <= last1) {
1908 if (regtry(®info, s))
1914 DEBUG_EXECUTE_r(if (!did_match) {
1915 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1916 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1917 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1918 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1919 ? "anchored" : "floating"),
1920 quoted, RE_SV_TAIL(must));
1924 else if ( (c = prog->regstclass) ) {
1926 const OPCODE op = OP(prog->regstclass);
1927 /* don't bother with what can't match */
1928 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1929 strend = HOPc(strend, -(minlen - 1));
1932 SV * const prop = sv_newmortal();
1933 regprop(prog, prop, c);
1935 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1937 PerlIO_printf(Perl_debug_log,
1938 "Matching stclass %.*s against %s (%d chars)\n",
1939 (int)SvCUR(prop), SvPVX_const(prop),
1940 quoted, (int)(strend - s));
1943 if (find_byclass(prog, c, s, strend, ®info))
1945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1949 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1954 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1955 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1956 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1958 if (flags & REXEC_SCREAM) {
1959 last = screaminstr(sv, float_real, s - strbeg,
1960 end_shift, &scream_pos, 1); /* last one */
1962 last = scream_olds; /* Only one occurrence. */
1963 /* we may be pointing at the wrong string */
1964 else if (RX_MATCH_COPIED(prog))
1965 s = strbeg + (s - SvPVX_const(sv));
1969 const char * const little = SvPV_const(float_real, len);
1971 if (SvTAIL(float_real)) {
1972 if (memEQ(strend - len + 1, little, len - 1))
1973 last = strend - len + 1;
1974 else if (!multiline)
1975 last = memEQ(strend - len, little, len)
1976 ? strend - len : NULL;
1982 last = rninstr(s, strend, little, little + len);
1984 last = strend; /* matching "$" */
1989 PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
1992 goto phooey; /* Should not happen! */
1994 dontbother = strend - last + prog->float_min_offset;
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
2002 if (regtry(®info, s))
2011 if (regtry(®info, s))
2013 } while (s++ < strend);
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2029 restore_pos(aTHX_ prog);
2032 /* make sure $`, $&, $', and $digit will work later */
2033 if ( !(flags & REXEC_NOT_FIRST) ) {
2034 RX_MATCH_COPY_FREE(prog);
2035 if (flags & REXEC_COPY_STR) {
2036 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2037 #ifdef PERL_OLD_COPY_ON_WRITE
2039 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2041 PerlIO_printf(Perl_debug_log,
2042 "Copy on write: regexp capture, type %d\n",
2045 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2046 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2047 assert (SvPOKp(prog->saved_copy));
2051 RX_MATCH_COPIED_on(prog);
2052 s = savepvn(strbeg, i);
2058 prog->subbeg = strbeg;
2059 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2067 PL_colors[4], PL_colors[5]));
2068 if (PL_reg_eval_set)
2069 restore_pos(aTHX_ prog);
2075 - regtry - try match at specific point
2077 STATIC I32 /* 0 failure, 1 success */
2078 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2084 regexp *prog = reginfo->prog;
2085 GET_RE_DEBUG_FLAGS_DECL;
2087 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2090 PL_reg_eval_set = RS_init;
2091 DEBUG_EXECUTE_r(DEBUG_s(
2092 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2093 (IV)(PL_stack_sp - PL_stack_base));
2095 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2096 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2097 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2099 /* Apparently this is not needed, judging by wantarray. */
2100 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2101 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2104 /* Make $_ available to executed code. */
2105 if (reginfo->sv != DEFSV) {
2107 DEFSV = reginfo->sv;
2110 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2111 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2112 /* prepare for quick setting of pos */
2113 #ifdef PERL_OLD_COPY_ON_WRITE
2115 sv_force_normal_flags(sv, 0);
2117 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2118 &PL_vtbl_mglob, NULL, 0);
2122 PL_reg_oldpos = mg->mg_len;
2123 SAVEDESTRUCTOR_X(restore_pos, prog);
2125 if (!PL_reg_curpm) {
2126 Newxz(PL_reg_curpm, 1, PMOP);
2129 SV* const repointer = newSViv(0);
2130 /* so we know which PL_regex_padav element is PL_reg_curpm */
2131 SvFLAGS(repointer) |= SVf_BREAK;
2132 av_push(PL_regex_padav,repointer);
2133 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2134 PL_regex_pad = AvARRAY(PL_regex_padav);
2138 PM_SETRE(PL_reg_curpm, prog);
2139 PL_reg_oldcurpm = PL_curpm;
2140 PL_curpm = PL_reg_curpm;
2141 if (RX_MATCH_COPIED(prog)) {
2142 /* Here is a serious problem: we cannot rewrite subbeg,
2143 since it may be needed if this match fails. Thus
2144 $` inside (?{}) could fail... */
2145 PL_reg_oldsaved = prog->subbeg;
2146 PL_reg_oldsavedlen = prog->sublen;
2147 #ifdef PERL_OLD_COPY_ON_WRITE
2148 PL_nrs = prog->saved_copy;
2150 RX_MATCH_COPIED_off(prog);
2153 PL_reg_oldsaved = NULL;
2154 prog->subbeg = PL_bostr;
2155 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2157 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2158 prog->startp[0] = startpos - PL_bostr;
2159 PL_reginput = startpos;
2160 PL_reglastparen = &prog->lastparen;
2161 PL_reglastcloseparen = &prog->lastcloseparen;
2162 prog->lastparen = 0;
2163 prog->lastcloseparen = 0;
2165 PL_regstartp = prog->startp;
2166 PL_regendp = prog->endp;
2167 if (PL_reg_start_tmpl <= prog->nparens) {
2168 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2169 if(PL_reg_start_tmp)
2170 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2172 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2175 /* XXXX What this code is doing here?!!! There should be no need
2176 to do this again and again, PL_reglastparen should take care of
2179 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2180 * Actually, the code in regcppop() (which Ilya may be meaning by
2181 * PL_reglastparen), is not needed at all by the test suite
2182 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2183 * enough, for building DynaLoader, or otherwise this
2184 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2185 * will happen. Meanwhile, this code *is* needed for the
2186 * above-mentioned test suite tests to succeed. The common theme
2187 * on those tests seems to be returning null fields from matches.
2192 if (prog->nparens) {
2194 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2201 if (regmatch(reginfo, prog->program + 1)) {
2202 prog->endp[0] = PL_reginput - PL_bostr;
2205 REGCP_UNWIND(lastcp);
2210 #define sayYES goto yes
2211 #define sayNO goto no
2212 #define sayNO_SILENT goto no_silent
2214 /* we dont use STMT_START/END here because it leads to
2215 "unreachable code" warnings, which are bogus, but distracting. */
2216 #define CACHEsayNO \
2217 if (ST.cache_mask) \
2218 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2221 /* this is used to determine how far from the left messages like
2222 'failed...' are printed. It should be set such that messages
2223 are inline with the regop output that created them.
2225 #define REPORT_CODE_OFF 32
2228 /* Make sure there is a test for this +1 options in re_tests */
2229 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2231 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2232 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2234 #define SLAB_FIRST(s) (&(s)->states[0])
2235 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2237 /* grab a new slab and return the first slot in it */
2239 STATIC regmatch_state *
2242 #if PERL_VERSION < 9
2245 regmatch_slab *s = PL_regmatch_slab->next;
2247 Newx(s, 1, regmatch_slab);
2248 s->prev = PL_regmatch_slab;
2250 PL_regmatch_slab->next = s;
2252 PL_regmatch_slab = s;
2253 return SLAB_FIRST(s);
2257 /* push a new state then goto it */
2259 #define PUSH_STATE_GOTO(state, node) \
2261 st->resume_state = state; \
2264 /* push a new state with success backtracking, then goto it */
2266 #define PUSH_YES_STATE_GOTO(state, node) \
2268 st->resume_state = state; \
2269 goto push_yes_state;
2275 regmatch() - main matching routine
2277 This is basically one big switch statement in a loop. We execute an op,
2278 set 'next' to point the next op, and continue. If we come to a point which
2279 we may need to backtrack to on failure such as (A|B|C), we push a
2280 backtrack state onto the backtrack stack. On failure, we pop the top
2281 state, and re-enter the loop at the state indicated. If there are no more
2282 states to pop, we return failure.
2284 Sometimes we also need to backtrack on success; for example /A+/, where
2285 after successfully matching one A, we need to go back and try to
2286 match another one; similarly for lookahead assertions: if the assertion
2287 completes successfully, we backtrack to the state just before the assertion
2288 and then carry on. In these cases, the pushed state is marked as
2289 'backtrack on success too'. This marking is in fact done by a chain of
2290 pointers, each pointing to the previous 'yes' state. On success, we pop to
2291 the nearest yes state, discarding any intermediate failure-only states.
2292 Sometimes a yes state is pushed just to force some cleanup code to be
2293 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2294 it to free the inner regex.
2296 Note that failure backtracking rewinds the cursor position, while
2297 success backtracking leaves it alone.
2299 A pattern is complete when the END op is executed, while a subpattern
2300 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2301 ops trigger the "pop to last yes state if any, otherwise return true"
2304 A common convention in this function is to use A and B to refer to the two
2305 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2306 the subpattern to be matched possibly multiple times, while B is the entire
2307 rest of the pattern. Variable and state names reflect this convention.
2309 The states in the main switch are the union of ops and failure/success of
2310 substates associated with with that op. For example, IFMATCH is the op
2311 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2312 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2313 successfully matched A and IFMATCH_A_fail is a state saying that we have
2314 just failed to match A. Resume states always come in pairs. The backtrack
2315 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2316 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2317 on success or failure.
2319 The struct that holds a backtracking state is actually a big union, with
2320 one variant for each major type of op. The variable st points to the
2321 top-most backtrack struct. To make the code clearer, within each
2322 block of code we #define ST to alias the relevant union.
2324 Here's a concrete example of a (vastly oversimplified) IFMATCH
2330 #define ST st->u.ifmatch
2332 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2333 ST.foo = ...; // some state we wish to save
2335 // push a yes backtrack state with a resume value of
2336 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2338 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2341 case IFMATCH_A: // we have successfully executed A; now continue with B
2343 bar = ST.foo; // do something with the preserved value
2346 case IFMATCH_A_fail: // A failed, so the assertion failed
2347 ...; // do some housekeeping, then ...
2348 sayNO; // propagate the failure
2355 For any old-timers reading this who are familiar with the old recursive
2356 approach, the code above is equivalent to:
2358 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2367 ...; // do some housekeeping, then ...
2368 sayNO; // propagate the failure
2371 The topmost backtrack state, pointed to by st, is usually free. If you
2372 want to claim it, populate any ST.foo fields in it with values you wish to
2373 save, then do one of
2375 PUSH_STATE_GOTO(resume_state, node);
2376 PUSH_YES_STATE_GOTO(resume_state, node);
2378 which sets that backtrack state's resume value to 'resume_state', pushes a
2379 new free entry to the top of the backtrack stack, then goes to 'node'.
2380 On backtracking, the free slot is popped, and the saved state becomes the
2381 new free state. An ST.foo field in this new top state can be temporarily
2382 accessed to retrieve values, but once the main loop is re-entered, it
2383 becomes available for reuse.
2385 Note that the depth of the backtrack stack constantly increases during the
2386 left-to-right execution of the pattern, rather than going up and down with
2387 the pattern nesting. For example the stack is at its maximum at Z at the
2388 end of the pattern, rather than at X in the following:
2390 /(((X)+)+)+....(Y)+....Z/
2392 The only exceptions to this are lookahead/behind assertions and the cut,
2393 (?>A), which pop all the backtrack states associated with A before
2396 Bascktrack state structs are allocated in slabs of about 4K in size.
2397 PL_regmatch_state and st always point to the currently active state,
2398 and PL_regmatch_slab points to the slab currently containing
2399 PL_regmatch_state. The first time regmatch() is called, the first slab is
2400 allocated, and is never freed until interpreter destruction. When the slab
2401 is full, a new one is allocated and chained to the end. At exit from
2402 regmatch(), slabs allocated since entry are freed.
2407 #define DEBUG_STATE_pp(pp) \
2409 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2410 PerlIO_printf(Perl_debug_log, \
2413 reg_name[st->resume_state] ); \
2417 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2422 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2423 const char *start, const char *end, const char *blurb)
2425 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2429 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2430 prog->precomp, prog->prelen, 60);
2432 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2433 start, end - start, 60);
2435 PerlIO_printf(Perl_debug_log,
2436 "%s%s REx%s %s against %s\n",
2437 PL_colors[4], blurb, PL_colors[5], s0, s1);
2439 if (do_utf8||utf8_pat)
2440 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2441 utf8_pat ? "pattern" : "",
2442 utf8_pat && do_utf8 ? " and " : "",
2443 do_utf8 ? "string" : ""
2449 S_dump_exec_pos(pTHX_ const char *locinput,
2450 const regnode *scan,
2451 const char *loc_regeol,
2452 const char *loc_bostr,
2453 const char *loc_reg_starttry,
2456 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2457 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2458 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2459 /* The part of the string before starttry has one color
2460 (pref0_len chars), between starttry and current
2461 position another one (pref_len - pref0_len chars),
2462 after the current position the third one.
2463 We assume that pref0_len <= pref_len, otherwise we
2464 decrease pref0_len. */
2465 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2466 ? (5 + taill) - l : locinput - loc_bostr;
2469 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2471 pref0_len = pref_len - (locinput - loc_reg_starttry);
2472 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2473 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2474 ? (5 + taill) - pref_len : loc_regeol - locinput);
2475 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2479 if (pref0_len > pref_len)
2480 pref0_len = pref_len;
2482 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2484 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2485 (locinput - pref_len),pref0_len, 60, 4, 5);
2487 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2488 (locinput - pref_len + pref0_len),
2489 pref_len - pref0_len, 60, 2, 3);
2491 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2492 locinput, loc_regeol - locinput, 10, 0, 1);
2494 const STRLEN tlen=len0+len1+len2;
2495 PerlIO_printf(Perl_debug_log,
2496 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2497 (IV)(locinput - loc_bostr),
2500 (docolor ? "" : "> <"),
2502 (int)(tlen > 19 ? 0 : 19 - tlen),
2509 STATIC I32 /* 0 failure, 1 success */
2510 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2512 #if PERL_VERSION < 9
2516 register const bool do_utf8 = PL_reg_match_utf8;
2517 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2519 regexp *rex = reginfo->prog;
2521 regmatch_slab *orig_slab;
2522 regmatch_state *orig_state;
2524 /* the current state. This is a cached copy of PL_regmatch_state */
2525 register regmatch_state *st;
2527 /* cache heavy used fields of st in registers */
2528 register regnode *scan;
2529 register regnode *next;
2530 register I32 n = 0; /* general value; init to avoid compiler warning */
2531 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2532 register char *locinput = PL_reginput;
2533 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2535 bool result = 0; /* return value of S_regmatch */
2536 int depth = 0; /* depth of backtrack stack */
2537 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
2538 regmatch_state *yes_state = NULL; /* state to pop to on success of
2540 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2541 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2544 /* these three flags are set by various ops to signal information to
2545 * the very next op. They have a useful lifetime of exactly one loop
2546 * iteration, and are not preserved or restored by state pushes/pops
2548 bool sw = 0; /* the condition value in (?(cond)a|b) */
2549 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2550 int logical = 0; /* the following EVAL is:
2554 or the following IFMATCH/UNLESSM is:
2555 false: plain (?=foo)
2556 true: used as a condition: (?(?=foo))
2560 GET_RE_DEBUG_FLAGS_DECL;
2563 /* on first ever call to regmatch, allocate first slab */
2564 if (!PL_regmatch_slab) {
2565 Newx(PL_regmatch_slab, 1, regmatch_slab);
2566 PL_regmatch_slab->prev = NULL;
2567 PL_regmatch_slab->next = NULL;
2568 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2571 /* remember current high-water mark for exit */
2572 /* XXX this should be done with SAVE* instead */
2573 orig_slab = PL_regmatch_slab;
2574 orig_state = PL_regmatch_state;
2576 /* grab next free state slot */
2577 st = ++PL_regmatch_state;
2578 if (st > SLAB_LAST(PL_regmatch_slab))
2579 st = PL_regmatch_state = S_push_slab(aTHX);
2581 /* Note that nextchr is a byte even in UTF */
2582 nextchr = UCHARAT(locinput);
2584 while (scan != NULL) {
2587 SV * const prop = sv_newmortal();
2588 regnode *rnext=regnext(scan);
2589 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2590 regprop(rex, prop, scan);
2592 PerlIO_printf(Perl_debug_log,
2593 "%3"IVdf":%*s%s(%"IVdf")\n",
2594 (IV)(scan - rex->program), depth*2, "",
2596 (PL_regkind[OP(scan)] == END || !rnext) ?
2597 0 : (IV)(rnext - rex->program));
2600 next = scan + NEXT_OFF(scan);
2603 state_num = OP(scan);
2606 switch (state_num) {
2608 if (locinput == PL_bostr)
2610 /* reginfo->till = reginfo->bol; */
2615 if (locinput == PL_bostr ||
2616 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2622 if (locinput == PL_bostr)
2626 if (locinput == reginfo->ganch)
2632 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2637 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2639 if (PL_regeol - locinput > 1)
2643 if (PL_regeol != locinput)
2647 if (!nextchr && locinput >= PL_regeol)
2650 locinput += PL_utf8skip[nextchr];
2651 if (locinput > PL_regeol)
2653 nextchr = UCHARAT(locinput);
2656 nextchr = UCHARAT(++locinput);
2659 if (!nextchr && locinput >= PL_regeol)
2661 nextchr = UCHARAT(++locinput);
2664 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2667 locinput += PL_utf8skip[nextchr];
2668 if (locinput > PL_regeol)
2670 nextchr = UCHARAT(locinput);
2673 nextchr = UCHARAT(++locinput);
2677 #define ST st->u.trie
2679 /* In this case the charclass data is available inline so
2680 we can fail fast without a lot of extra overhead.
2682 if (scan->flags == EXACT || !do_utf8) {
2683 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2685 PerlIO_printf(Perl_debug_log,
2686 "%*s %sfailed to match trie start class...%s\n",
2687 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2696 /* what type of TRIE am I? (utf8 makes this contextual) */
2697 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2698 trie_type = do_utf8 ?
2699 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2702 /* what trie are we using right now */
2703 reg_trie_data * const trie
2704 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2705 U32 state = trie->startstate;
2707 if (trie->bitmap && trie_type != trie_utf8_fold &&
2708 !TRIE_BITMAP_TEST(trie,*locinput)
2710 if (trie->states[ state ].wordnum) {
2712 PerlIO_printf(Perl_debug_log,
2713 "%*s %smatched empty string...%s\n",
2714 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2719 PerlIO_printf(Perl_debug_log,
2720 "%*s %sfailed to match trie start class...%s\n",
2721 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2728 U8 *uc = ( U8* )locinput;
2732 U8 *uscan = (U8*)NULL;
2734 SV *sv_accept_buff = NULL;
2735 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2737 ST.accepted = 0; /* how many accepting states we have seen */
2739 ST.jump = trie->jump;
2748 traverse the TRIE keeping track of all accepting states
2749 we transition through until we get to a failing node.
2752 while ( state && uc <= (U8*)PL_regeol ) {
2753 U32 base = trie->states[ state ].trans.base;
2756 /* We use charid to hold the wordnum as we don't use it
2757 for charid until after we have done the wordnum logic.
2758 We define an alias just so that the wordnum logic reads
2761 #define got_wordnum charid
2762 got_wordnum = trie->states[ state ].wordnum;
2764 if ( got_wordnum ) {
2765 if ( ! ST.accepted ) {
2768 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2769 sv_accept_buff=newSV(bufflen *
2770 sizeof(reg_trie_accepted) - 1);
2771 SvCUR_set(sv_accept_buff, 0);
2772 SvPOK_on(sv_accept_buff);
2773 sv_2mortal(sv_accept_buff);
2776 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2779 if (ST.accepted >= bufflen) {
2781 ST.accept_buff =(reg_trie_accepted*)
2782 SvGROW(sv_accept_buff,
2783 bufflen * sizeof(reg_trie_accepted));
2785 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2786 + sizeof(reg_trie_accepted));
2789 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2790 ST.accept_buff[ST.accepted].endpos = uc;
2792 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2796 DEBUG_TRIE_EXECUTE_r({
2797 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2798 PerlIO_printf( Perl_debug_log,
2799 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2800 2+depth * 2, "", PL_colors[4],
2801 (UV)state, (UV)ST.accepted );
2805 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2806 uvc, charid, foldlen, foldbuf, uniflags);
2809 (base + charid > trie->uniquecharcount )
2810 && (base + charid - 1 - trie->uniquecharcount
2812 && trie->trans[base + charid - 1 -
2813 trie->uniquecharcount].check == state)
2815 state = trie->trans[base + charid - 1 -
2816 trie->uniquecharcount ].next;
2827 DEBUG_TRIE_EXECUTE_r(
2828 PerlIO_printf( Perl_debug_log,
2829 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2830 charid, uvc, (UV)state, PL_colors[5] );
2837 PerlIO_printf( Perl_debug_log,
2838 "%*s %sgot %"IVdf" possible matches%s\n",
2839 REPORT_CODE_OFF + depth * 2, "",
2840 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2846 case TRIE_next_fail: /* we failed - try next alterative */
2848 if ( ST.accepted == 1 ) {
2849 /* only one choice left - just continue */
2851 reg_trie_data * const trie
2852 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2853 SV ** const tmp = RX_DEBUG(reginfo->prog)
2854 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2856 PerlIO_printf( Perl_debug_log,
2857 "%*s %sonly one match left: #%d <%s>%s\n",
2858 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2859 ST.accept_buff[ 0 ].wordnum,
2860 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2863 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2864 /* in this case we free tmps/leave before we call regmatch
2865 as we wont be using accept_buff again. */
2868 locinput = PL_reginput;
2869 nextchr = UCHARAT(locinput);
2874 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2876 continue; /* execute rest of RE */
2879 if (!ST.accepted-- ) {
2886 There are at least two accepting states left. Presumably
2887 the number of accepting states is going to be low,
2888 typically two. So we simply scan through to find the one
2889 with lowest wordnum. Once we find it, we swap the last
2890 state into its place and decrement the size. We then try to
2891 match the rest of the pattern at the point where the word
2892 ends. If we succeed, control just continues along the
2893 regex; if we fail we return here to try the next accepting
2900 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2901 DEBUG_TRIE_EXECUTE_r(
2902 PerlIO_printf( Perl_debug_log,
2903 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2904 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
2905 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2906 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2909 if (ST.accept_buff[cur].wordnum <
2910 ST.accept_buff[best].wordnum)
2915 reg_trie_data * const trie
2916 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2917 SV ** const tmp = RX_DEBUG(reginfo->prog)
2918 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2920 regnode *nextop=!ST.jump ?
2922 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2923 PerlIO_printf( Perl_debug_log,
2924 "%*s %strying alternation #%d <%s> at node #%d %s\n",
2925 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2926 ST.accept_buff[best].wordnum,
2927 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2928 REG_NODE_NUM(nextop),
2932 if ( best<ST.accepted ) {
2933 reg_trie_accepted tmp = ST.accept_buff[ best ];
2934 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2935 ST.accept_buff[ ST.accepted ] = tmp;
2938 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2940 PUSH_STATE_GOTO(TRIE_next, ST.B);
2943 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2953 char *s = STRING(scan);
2955 if (do_utf8 != UTF) {
2956 /* The target and the pattern have differing utf8ness. */
2958 const char * const e = s + ln;
2961 /* The target is utf8, the pattern is not utf8. */
2966 if (NATIVE_TO_UNI(*(U8*)s) !=
2967 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2975 /* The target is not utf8, the pattern is utf8. */
2980 if (NATIVE_TO_UNI(*((U8*)l)) !=
2981 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2989 nextchr = UCHARAT(locinput);
2992 /* The target and the pattern have the same utf8ness. */
2993 /* Inline the first character, for speed. */
2994 if (UCHARAT(s) != nextchr)
2996 if (PL_regeol - locinput < ln)
2998 if (ln > 1 && memNE(s, locinput, ln))
3001 nextchr = UCHARAT(locinput);
3005 PL_reg_flags |= RF_tainted;
3008 char * const s = STRING(scan);
3011 if (do_utf8 || UTF) {
3012 /* Either target or the pattern are utf8. */
3013 const char * const l = locinput;
3014 char *e = PL_regeol;
3016 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3017 l, &e, 0, do_utf8)) {
3018 /* One more case for the sharp s:
3019 * pack("U0U*", 0xDF) =~ /ss/i,
3020 * the 0xC3 0x9F are the UTF-8
3021 * byte sequence for the U+00DF. */
3023 toLOWER(s[0]) == 's' &&
3025 toLOWER(s[1]) == 's' &&
3032 nextchr = UCHARAT(locinput);
3036 /* Neither the target and the pattern are utf8. */
3038 /* Inline the first character, for speed. */
3039 if (UCHARAT(s) != nextchr &&
3040 UCHARAT(s) != ((OP(scan) == EXACTF)
3041 ? PL_fold : PL_fold_locale)[nextchr])
3043 if (PL_regeol - locinput < ln)
3045 if (ln > 1 && (OP(scan) == EXACTF
3046 ? ibcmp(s, locinput, ln)
3047 : ibcmp_locale(s, locinput, ln)))
3050 nextchr = UCHARAT(locinput);
3055 STRLEN inclasslen = PL_regeol - locinput;
3057 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3059 if (locinput >= PL_regeol)
3061 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3062 nextchr = UCHARAT(locinput);
3067 nextchr = UCHARAT(locinput);
3068 if (!REGINCLASS(rex, scan, (U8*)locinput))
3070 if (!nextchr && locinput >= PL_regeol)
3072 nextchr = UCHARAT(++locinput);
3076 /* If we might have the case of the German sharp s
3077 * in a casefolding Unicode character class. */
3079 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3080 locinput += SHARP_S_SKIP;
3081 nextchr = UCHARAT(locinput);
3087 PL_reg_flags |= RF_tainted;
3093 LOAD_UTF8_CHARCLASS_ALNUM();
3094 if (!(OP(scan) == ALNUM
3095 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3096 : isALNUM_LC_utf8((U8*)locinput)))
3100 locinput += PL_utf8skip[nextchr];
3101 nextchr = UCHARAT(locinput);
3104 if (!(OP(scan) == ALNUM
3105 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3107 nextchr = UCHARAT(++locinput);
3110 PL_reg_flags |= RF_tainted;
3113 if (!nextchr && locinput >= PL_regeol)
3116 LOAD_UTF8_CHARCLASS_ALNUM();
3117 if (OP(scan) == NALNUM
3118 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3119 : isALNUM_LC_utf8((U8*)locinput))
3123 locinput += PL_utf8skip[nextchr];
3124 nextchr = UCHARAT(locinput);
3127 if (OP(scan) == NALNUM
3128 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3130 nextchr = UCHARAT(++locinput);
3134 PL_reg_flags |= RF_tainted;
3138 /* was last char in word? */
3140 if (locinput == PL_bostr)
3143 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3145 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3147 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3148 ln = isALNUM_uni(ln);
3149 LOAD_UTF8_CHARCLASS_ALNUM();
3150 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3153 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3154 n = isALNUM_LC_utf8((U8*)locinput);
3158 ln = (locinput != PL_bostr) ?
3159 UCHARAT(locinput - 1) : '\n';
3160 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3162 n = isALNUM(nextchr);
3165 ln = isALNUM_LC(ln);
3166 n = isALNUM_LC(nextchr);
3169 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3170 OP(scan) == BOUNDL))
3174 PL_reg_flags |= RF_tainted;
3180 if (UTF8_IS_CONTINUED(nextchr)) {
3181 LOAD_UTF8_CHARCLASS_SPACE();
3182 if (!(OP(scan) == SPACE
3183 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3184 : isSPACE_LC_utf8((U8*)locinput)))
3188 locinput += PL_utf8skip[nextchr];
3189 nextchr = UCHARAT(locinput);
3192 if (!(OP(scan) == SPACE
3193 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3195 nextchr = UCHARAT(++locinput);
3198 if (!(OP(scan) == SPACE
3199 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3201 nextchr = UCHARAT(++locinput);
3205 PL_reg_flags |= RF_tainted;
3208 if (!nextchr && locinput >= PL_regeol)
3211 LOAD_UTF8_CHARCLASS_SPACE();
3212 if (OP(scan) == NSPACE
3213 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3214 : isSPACE_LC_utf8((U8*)locinput))
3218 locinput += PL_utf8skip[nextchr];
3219 nextchr = UCHARAT(locinput);
3222 if (OP(scan) == NSPACE
3223 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3225 nextchr = UCHARAT(++locinput);
3228 PL_reg_flags |= RF_tainted;
3234 LOAD_UTF8_CHARCLASS_DIGIT();
3235 if (!(OP(scan) == DIGIT
3236 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3237 : isDIGIT_LC_utf8((U8*)locinput)))
3241 locinput += PL_utf8skip[nextchr];
3242 nextchr = UCHARAT(locinput);
3245 if (!(OP(scan) == DIGIT
3246 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3248 nextchr = UCHARAT(++locinput);
3251 PL_reg_flags |= RF_tainted;
3254 if (!nextchr && locinput >= PL_regeol)
3257 LOAD_UTF8_CHARCLASS_DIGIT();
3258 if (OP(scan) == NDIGIT
3259 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3260 : isDIGIT_LC_utf8((U8*)locinput))
3264 locinput += PL_utf8skip[nextchr];
3265 nextchr = UCHARAT(locinput);
3268 if (OP(scan) == NDIGIT
3269 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3271 nextchr = UCHARAT(++locinput);
3274 if (locinput >= PL_regeol)
3277 LOAD_UTF8_CHARCLASS_MARK();
3278 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3280 locinput += PL_utf8skip[nextchr];
3281 while (locinput < PL_regeol &&
3282 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3283 locinput += UTF8SKIP(locinput);
3284 if (locinput > PL_regeol)
3289 nextchr = UCHARAT(locinput);
3292 PL_reg_flags |= RF_tainted;
3297 n = ARG(scan); /* which paren pair */
3298 ln = PL_regstartp[n];
3299 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3300 if ((I32)*PL_reglastparen < n || ln == -1)
3301 sayNO; /* Do not match unless seen CLOSEn. */
3302 if (ln == PL_regendp[n])
3306 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3308 const char *e = PL_bostr + PL_regendp[n];
3310 * Note that we can't do the "other character" lookup trick as
3311 * in the 8-bit case (no pun intended) because in Unicode we
3312 * have to map both upper and title case to lower case.
3314 if (OP(scan) == REFF) {
3316 STRLEN ulen1, ulen2;
3317 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3318 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3322 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3323 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3324 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3331 nextchr = UCHARAT(locinput);
3335 /* Inline the first character, for speed. */
3336 if (UCHARAT(s) != nextchr &&
3338 (UCHARAT(s) != ((OP(scan) == REFF
3339 ? PL_fold : PL_fold_locale)[nextchr]))))
3341 ln = PL_regendp[n] - ln;
3342 if (locinput + ln > PL_regeol)
3344 if (ln > 1 && (OP(scan) == REF
3345 ? memNE(s, locinput, ln)
3347 ? ibcmp(s, locinput, ln)
3348 : ibcmp_locale(s, locinput, ln))))
3351 nextchr = UCHARAT(locinput);
3362 #define ST st->u.eval
3366 regnode *startpoint;
3369 case RECURSE: /* /(...(?1))/ */
3370 if (cur_eval && cur_eval->locinput==locinput) {
3371 if (cur_eval->u.eval.close_paren == ARG(scan))
3372 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3373 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3374 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3379 (void)ReREFCNT_inc(rex);
3380 if (OP(scan)==RECURSE) {
3381 startpoint = scan + ARG2L(scan);
3382 ST.close_paren = ARG(scan);
3384 startpoint = re->program+1;
3387 goto eval_recurse_doit;
3389 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3390 if (cur_eval && cur_eval->locinput==locinput) {
3391 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3392 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3397 /* execute the code in the {...} */
3399 SV ** const before = SP;
3400 OP_4tree * const oop = PL_op;
3401 COP * const ocurcop = PL_curcop;
3405 PL_op = (OP_4tree*)rex->data->data[n];
3406 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3407 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3408 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3410 CALLRUNOPS(aTHX); /* Scalar context. */
3413 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3420 PAD_RESTORE_LOCAL(old_comppad);
3421 PL_curcop = ocurcop;
3424 sv_setsv(save_scalar(PL_replgv), ret);
3428 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3431 /* extract RE object from returned value; compiling if
3436 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3437 mg = mg_find(sv, PERL_MAGIC_qr);
3438 else if (SvSMAGICAL(ret)) {
3439 if (SvGMAGICAL(ret))
3440 sv_unmagic(ret, PERL_MAGIC_qr);
3442 mg = mg_find(ret, PERL_MAGIC_qr);
3446 re = (regexp *)mg->mg_obj;
3447 (void)ReREFCNT_inc(re);
3451 const char * const t = SvPV_const(ret, len);
3453 const I32 osize = PL_regsize;
3456 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3457 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3459 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3461 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3467 debug_start_match(re, do_utf8, locinput, PL_regeol,
3468 "Matching embedded");
3470 startpoint = re->program + 1;
3471 ST.close_paren = 0; /* only used for RECURSE */
3472 /* borrowed from regtry */
3473 if (PL_reg_start_tmpl <= re->nparens) {
3474 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3475 if(PL_reg_start_tmp)
3476 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3478 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3481 eval_recurse_doit: /* Share code with RECURSE below this line */
3482 /* run the pattern returned from (??{...}) */
3483 ST.cp = regcppush(0); /* Save *all* the positions. */
3484 REGCP_SET(ST.lastcp);
3486 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3487 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3489 *PL_reglastparen = 0;
3490 *PL_reglastcloseparen = 0;
3491 PL_reginput = locinput;
3493 /* XXXX This is too dramatic a measure... */
3496 ST.toggle_reg_flags = PL_reg_flags;
3497 if (re->reganch & ROPT_UTF8)
3498 PL_reg_flags |= RF_utf8;
3500 PL_reg_flags &= ~RF_utf8;
3501 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3504 ST.prev_curlyx = cur_curlyx;
3508 ST.prev_eval = cur_eval;
3510 /* now continue from first node in postoned RE */
3511 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3514 /* logical is 1, /(?(?{...})X|Y)/ */
3515 sw = (bool)SvTRUE(ret);
3520 case EVAL_AB: /* cleanup after a successful (??{A})B */
3521 /* note: this is called twice; first after popping B, then A */
3522 PL_reg_flags ^= ST.toggle_reg_flags;
3526 cur_eval = ST.prev_eval;
3527 cur_curlyx = ST.prev_curlyx;
3528 /* XXXX This is too dramatic a measure... */
3533 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3534 /* note: this is called twice; first after popping B, then A */
3535 PL_reg_flags ^= ST.toggle_reg_flags;
3538 PL_reginput = locinput;
3539 REGCP_UNWIND(ST.lastcp);
3541 cur_eval = ST.prev_eval;
3542 cur_curlyx = ST.prev_curlyx;
3543 /* XXXX This is too dramatic a measure... */
3549 n = ARG(scan); /* which paren pair */
3550 PL_reg_start_tmp[n] = locinput;
3555 n = ARG(scan); /* which paren pair */
3556 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3557 PL_regendp[n] = locinput - PL_bostr;
3558 if (n > (I32)*PL_reglastparen)
3559 *PL_reglastparen = n;
3560 *PL_reglastcloseparen = n;
3561 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3566 n = ARG(scan); /* which paren pair */
3567 sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3570 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3572 next = NEXTOPER(NEXTOPER(scan));
3574 next = scan + ARG(scan);
3575 if (OP(next) == IFTHEN) /* Fake one. */
3576 next = NEXTOPER(NEXTOPER(next));
3580 logical = scan->flags;
3583 /*******************************************************************
3585 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3586 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3587 STAR/PLUS/CURLY/CURLYN are used instead.)
3589 A*B is compiled as <CURLYX><A><WHILEM><B>
3591 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3592 state, which contains the current count, initialised to -1. It also sets
3593 cur_curlyx to point to this state, with any previous value saved in the
3596 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3597 since the pattern may possibly match zero times (i.e. it's a while {} loop
3598 rather than a do {} while loop).
3600 Each entry to WHILEM represents a successful match of A. The count in the
3601 CURLYX block is incremented, another WHILEM state is pushed, and execution
3602 passes to A or B depending on greediness and the current count.
3604 For example, if matching against the string a1a2a3b (where the aN are
3605 substrings that match /A/), then the match progresses as follows: (the
3606 pushed states are interspersed with the bits of strings matched so far):
3609 <CURLYX cnt=0><WHILEM>
3610 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3611 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3612 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3613 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3615 (Contrast this with something like CURLYM, which maintains only a single
3619 a1 <CURLYM cnt=1> a2
3620 a1 a2 <CURLYM cnt=2> a3
3621 a1 a2 a3 <CURLYM cnt=3> b
3624 Each WHILEM state block marks a point to backtrack to upon partial failure
3625 of A or B, and also contains some minor state data related to that
3626 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3627 overall state, such as the count, and pointers to the A and B ops.
3629 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3630 must always point to the *current* CURLYX block, the rules are:
3632 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3633 and set cur_curlyx to point the new block.
3635 When popping the CURLYX block after a successful or unsuccessful match,
3636 restore the previous cur_curlyx.
3638 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3639 to the outer one saved in the CURLYX block.
3641 When popping the WHILEM block after a successful or unsuccessful B match,
3642 restore the previous cur_curlyx.
3644 Here's an example for the pattern (AI* BI)*BO
3645 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3648 curlyx backtrack stack
3649 ------ ---------------
3651 CO <CO prev=NULL> <WO>
3652 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3653 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3654 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3656 At this point the pattern succeeds, and we work back down the stack to
3657 clean up, restoring as we go:
3659 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3660 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3661 CO <CO prev=NULL> <WO>
3664 *******************************************************************/
3666 #define ST st->u.curlyx
3668 case CURLYX: /* start of /A*B/ (for complex A) */
3670 /* No need to save/restore up to this paren */
3671 I32 parenfloor = scan->flags;
3673 assert(next); /* keep Coverity happy */
3674 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3677 /* XXXX Probably it is better to teach regpush to support
3678 parenfloor > PL_regsize... */
3679 if (parenfloor > (I32)*PL_reglastparen)
3680 parenfloor = *PL_reglastparen; /* Pessimization... */
3682 ST.prev_curlyx= cur_curlyx;
3684 ST.cp = PL_savestack_ix;
3686 /* these fields contain the state of the current curly.
3687 * they are accessed by subsequent WHILEMs */
3688 ST.parenfloor = parenfloor;
3689 ST.min = ARG1(scan);
3690 ST.max = ARG2(scan);
3691 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3695 ST.count = -1; /* this will be updated by WHILEM */
3696 ST.lastloc = NULL; /* this will be updated by WHILEM */
3698 PL_reginput = locinput;
3699 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3703 case CURLYX_end: /* just finished matching all of A*B */
3705 cur_curlyx = ST.prev_curlyx;
3709 case CURLYX_end_fail: /* just failed to match all of A*B */
3711 cur_curlyx = ST.prev_curlyx;
3717 #define ST st->u.whilem
3719 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3721 /* see the discussion above about CURLYX/WHILEM */
3724 assert(cur_curlyx); /* keep Coverity happy */
3725 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3726 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3727 ST.cache_offset = 0;
3730 PL_reginput = locinput;
3732 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3733 "%*s whilem: matched %ld out of %ld..%ld\n",
3734 REPORT_CODE_OFF+depth*2, "", (long)n,
3735 (long)cur_curlyx->u.curlyx.min,
3736 (long)cur_curlyx->u.curlyx.max)
3739 /* First just match a string of min A's. */
3741 if (n < cur_curlyx->u.curlyx.min) {
3742 cur_curlyx->u.curlyx.lastloc = locinput;
3743 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3747 /* If degenerate A matches "", assume A done. */
3749 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3750 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3751 "%*s whilem: empty match detected, trying continuation...\n",
3752 REPORT_CODE_OFF+depth*2, "")
3754 goto do_whilem_B_max;
3757 /* super-linear cache processing */
3761 if (!PL_reg_maxiter) {
3762 /* start the countdown: Postpone detection until we
3763 * know the match is not *that* much linear. */
3764 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3765 /* possible overflow for long strings and many CURLYX's */
3766 if (PL_reg_maxiter < 0)
3767 PL_reg_maxiter = I32_MAX;
3768 PL_reg_leftiter = PL_reg_maxiter;
3771 if (PL_reg_leftiter-- == 0) {
3772 /* initialise cache */
3773 const I32 size = (PL_reg_maxiter + 7)/8;
3774 if (PL_reg_poscache) {
3775 if ((I32)PL_reg_poscache_size < size) {
3776 Renew(PL_reg_poscache, size, char);
3777 PL_reg_poscache_size = size;
3779 Zero(PL_reg_poscache, size, char);
3782 PL_reg_poscache_size = size;
3783 Newxz(PL_reg_poscache, size, char);
3785 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3786 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3787 PL_colors[4], PL_colors[5])
3791 if (PL_reg_leftiter < 0) {
3792 /* have we already failed at this position? */
3794 offset = (scan->flags & 0xf) - 1
3795 + (locinput - PL_bostr) * (scan->flags>>4);
3796 mask = 1 << (offset % 8);
3798 if (PL_reg_poscache[offset] & mask) {
3799 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3800 "%*s whilem: (cache) already tried at this position...\n",
3801 REPORT_CODE_OFF+depth*2, "")
3803 sayNO; /* cache records failure */
3805 ST.cache_offset = offset;
3806 ST.cache_mask = mask;
3810 /* Prefer B over A for minimal matching. */
3812 if (cur_curlyx->u.curlyx.minmod) {
3813 ST.save_curlyx = cur_curlyx;
3814 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3815 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3816 REGCP_SET(ST.lastcp);
3817 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3821 /* Prefer A over B for maximal matching. */
3823 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3824 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3825 cur_curlyx->u.curlyx.lastloc = locinput;
3826 REGCP_SET(ST.lastcp);
3827 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3830 goto do_whilem_B_max;
3834 case WHILEM_B_min: /* just matched B in a minimal match */
3835 case WHILEM_B_max: /* just matched B in a maximal match */
3836 cur_curlyx = ST.save_curlyx;
3840 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3841 cur_curlyx = ST.save_curlyx;
3842 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3843 cur_curlyx->u.curlyx.count--;
3847 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3848 REGCP_UNWIND(ST.lastcp);
3851 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3852 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3853 cur_curlyx->u.curlyx.count--;
3857 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3858 REGCP_UNWIND(ST.lastcp);
3859 regcppop(rex); /* Restore some previous $<digit>s? */
3860 PL_reginput = locinput;
3861 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3862 "%*s whilem: failed, trying continuation...\n",
3863 REPORT_CODE_OFF+depth*2, "")
3866 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3867 && ckWARN(WARN_REGEXP)
3868 && !(PL_reg_flags & RF_warned))
3870 PL_reg_flags |= RF_warned;
3871 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3872 "Complex regular subexpression recursion",
3877 ST.save_curlyx = cur_curlyx;
3878 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3879 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3882 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3883 cur_curlyx = ST.save_curlyx;
3884 REGCP_UNWIND(ST.lastcp);
3887 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3888 /* Maximum greed exceeded */
3889 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3890 && ckWARN(WARN_REGEXP)
3891 && !(PL_reg_flags & RF_warned))
3893 PL_reg_flags |= RF_warned;
3894 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3895 "%s limit (%d) exceeded",
3896 "Complex regular subexpression recursion",
3899 cur_curlyx->u.curlyx.count--;
3903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3904 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3906 /* Try grabbing another A and see if it helps. */
3907 PL_reginput = locinput;
3908 cur_curlyx->u.curlyx.lastloc = locinput;
3909 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3910 REGCP_SET(ST.lastcp);
3911 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
3915 #define ST st->u.branch
3917 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3918 next = scan + ARG(scan);
3921 scan = NEXTOPER(scan);
3924 case BRANCH: /* /(...|A|...)/ */
3925 scan = NEXTOPER(scan); /* scan now points to inner node */
3926 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3927 /* last branch; skip state push and jump direct to node */
3929 ST.lastparen = *PL_reglastparen;
3930 ST.next_branch = next;
3932 PL_reginput = locinput;
3934 /* Now go into the branch */
3935 PUSH_STATE_GOTO(BRANCH_next, scan);
3938 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3939 REGCP_UNWIND(ST.cp);
3940 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3942 *PL_reglastparen = n;
3943 scan = ST.next_branch;
3944 /* no more branches? */
3945 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3947 continue; /* execute next BRANCH[J] op */
3955 #define ST st->u.curlym
3957 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3959 /* This is an optimisation of CURLYX that enables us to push
3960 * only a single backtracking state, no matter now many matches
3961 * there are in {m,n}. It relies on the pattern being constant
3962 * length, with no parens to influence future backrefs
3966 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3968 /* if paren positive, emulate an OPEN/CLOSE around A */
3970 I32 paren = ST.me->flags;
3971 if (paren > PL_regsize)
3973 if (paren > (I32)*PL_reglastparen)
3974 *PL_reglastparen = paren;
3975 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3983 ST.c1 = CHRTEST_UNINIT;
3986 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3989 curlym_do_A: /* execute the A in /A{m,n}B/ */
3990 PL_reginput = locinput;
3991 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3994 case CURLYM_A: /* we've just matched an A */
3995 locinput = st->locinput;
3996 nextchr = UCHARAT(locinput);
3999 /* after first match, determine A's length: u.curlym.alen */
4000 if (ST.count == 1) {
4001 if (PL_reg_match_utf8) {
4003 while (s < PL_reginput) {
4009 ST.alen = PL_reginput - locinput;
4012 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4015 PerlIO_printf(Perl_debug_log,
4016 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4017 (int)(REPORT_CODE_OFF+(depth*2)), "",
4018 (IV) ST.count, (IV)ST.alen)
4021 locinput = PL_reginput;
4022 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4023 goto curlym_do_A; /* try to match another A */
4024 goto curlym_do_B; /* try to match B */
4026 case CURLYM_A_fail: /* just failed to match an A */
4027 REGCP_UNWIND(ST.cp);
4028 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4031 curlym_do_B: /* execute the B in /A{m,n}B/ */
4032 PL_reginput = locinput;
4033 if (ST.c1 == CHRTEST_UNINIT) {
4034 /* calculate c1 and c2 for possible match of 1st char
4035 * following curly */
4036 ST.c1 = ST.c2 = CHRTEST_VOID;
4037 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4038 regnode *text_node = ST.B;
4039 if (! HAS_TEXT(text_node))
4040 FIND_NEXT_IMPT(text_node);
4041 if (HAS_TEXT(text_node)
4042 && PL_regkind[OP(text_node)] != REF)
4044 ST.c1 = (U8)*STRING(text_node);
4046 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4048 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4049 ? PL_fold_locale[ST.c1]
4056 PerlIO_printf(Perl_debug_log,
4057 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4058 (int)(REPORT_CODE_OFF+(depth*2)),
4061 if (ST.c1 != CHRTEST_VOID
4062 && UCHARAT(PL_reginput) != ST.c1
4063 && UCHARAT(PL_reginput) != ST.c2)
4065 /* simulate B failing */
4066 state_num = CURLYM_B_fail;
4067 goto reenter_switch;
4071 /* mark current A as captured */
4072 I32 paren = ST.me->flags;
4075 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4076 PL_regendp[paren] = PL_reginput - PL_bostr;
4079 PL_regendp[paren] = -1;
4081 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4084 case CURLYM_B_fail: /* just failed to match a B */
4085 REGCP_UNWIND(ST.cp);
4087 if (ST.count == ARG2(ST.me) /* max */)
4089 goto curlym_do_A; /* try to match a further A */
4091 /* backtrack one A */
4092 if (ST.count == ARG1(ST.me) /* min */)
4095 locinput = HOPc(locinput, -ST.alen);
4096 goto curlym_do_B; /* try to match B */
4099 #define ST st->u.curly
4101 #define CURLY_SETPAREN(paren, success) \
4104 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4105 PL_regendp[paren] = locinput - PL_bostr; \
4108 PL_regendp[paren] = -1; \
4111 case STAR: /* /A*B/ where A is width 1 */
4115 scan = NEXTOPER(scan);
4117 case PLUS: /* /A+B/ where A is width 1 */
4121 scan = NEXTOPER(scan);
4123 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4124 ST.paren = scan->flags; /* Which paren to set */
4125 if (ST.paren > PL_regsize)
4126 PL_regsize = ST.paren;
4127 if (ST.paren > (I32)*PL_reglastparen)
4128 *PL_reglastparen = ST.paren;
4129 ST.min = ARG1(scan); /* min to match */
4130 ST.max = ARG2(scan); /* max to match */
4131 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4133 case CURLY: /* /A{m,n}B/ where A is width 1 */
4135 ST.min = ARG1(scan); /* min to match */
4136 ST.max = ARG2(scan); /* max to match */
4137 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4140 * Lookahead to avoid useless match attempts
4141 * when we know what character comes next.
4143 * Used to only do .*x and .*?x, but now it allows
4144 * for )'s, ('s and (?{ ... })'s to be in the way
4145 * of the quantifier and the EXACT-like node. -- japhy
4148 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4150 if (HAS_TEXT(next) || JUMPABLE(next)) {
4152 regnode *text_node = next;
4154 if (! HAS_TEXT(text_node))
4155 FIND_NEXT_IMPT(text_node);
4157 if (! HAS_TEXT(text_node))
4158 ST.c1 = ST.c2 = CHRTEST_VOID;
4160 if (PL_regkind[OP(text_node)] == REF) {
4161 ST.c1 = ST.c2 = CHRTEST_VOID;
4162 goto assume_ok_easy;
4165 s = (U8*)STRING(text_node);
4169 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4170 ST.c2 = PL_fold[ST.c1];
4171 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4172 ST.c2 = PL_fold_locale[ST.c1];
4175 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4176 STRLEN ulen1, ulen2;
4177 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4178 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4180 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4181 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4183 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4185 0 : UTF8_ALLOW_ANY);
4186 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4188 0 : UTF8_ALLOW_ANY);
4190 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4192 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4197 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4204 ST.c1 = ST.c2 = CHRTEST_VOID;
4209 PL_reginput = locinput;
4212 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4215 locinput = PL_reginput;
4217 if (ST.c1 == CHRTEST_VOID)
4218 goto curly_try_B_min;
4220 ST.oldloc = locinput;
4222 /* set ST.maxpos to the furthest point along the
4223 * string that could possibly match */
4224 if (ST.max == REG_INFTY) {
4225 ST.maxpos = PL_regeol - 1;
4227 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4231 int m = ST.max - ST.min;
4232 for (ST.maxpos = locinput;
4233 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4234 ST.maxpos += UTF8SKIP(ST.maxpos);
4237 ST.maxpos = locinput + ST.max - ST.min;
4238 if (ST.maxpos >= PL_regeol)
4239 ST.maxpos = PL_regeol - 1;
4241 goto curly_try_B_min_known;
4245 ST.count = regrepeat(rex, ST.A, ST.max);
4246 locinput = PL_reginput;
4247 if (ST.count < ST.min)
4249 if ((ST.count > ST.min)
4250 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4252 /* A{m,n} must come at the end of the string, there's
4253 * no point in backing off ... */
4255 /* ...except that $ and \Z can match before *and* after
4256 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4257 We may back off by one in this case. */
4258 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4262 goto curly_try_B_max;
4267 case CURLY_B_min_known_fail:
4268 /* failed to find B in a non-greedy match where c1,c2 valid */
4269 if (ST.paren && ST.count)
4270 PL_regendp[ST.paren] = -1;
4272 PL_reginput = locinput; /* Could be reset... */
4273 REGCP_UNWIND(ST.cp);
4274 /* Couldn't or didn't -- move forward. */
4275 ST.oldloc = locinput;
4277 locinput += UTF8SKIP(locinput);
4281 curly_try_B_min_known:
4282 /* find the next place where 'B' could work, then call B */
4286 n = (ST.oldloc == locinput) ? 0 : 1;
4287 if (ST.c1 == ST.c2) {
4289 /* set n to utf8_distance(oldloc, locinput) */
4290 while (locinput <= ST.maxpos &&
4291 utf8n_to_uvchr((U8*)locinput,
4292 UTF8_MAXBYTES, &len,
4293 uniflags) != (UV)ST.c1) {
4299 /* set n to utf8_distance(oldloc, locinput) */
4300 while (locinput <= ST.maxpos) {
4302 const UV c = utf8n_to_uvchr((U8*)locinput,
4303 UTF8_MAXBYTES, &len,
4305 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4313 if (ST.c1 == ST.c2) {
4314 while (locinput <= ST.maxpos &&
4315 UCHARAT(locinput) != ST.c1)
4319 while (locinput <= ST.maxpos
4320 && UCHARAT(locinput) != ST.c1
4321 && UCHARAT(locinput) != ST.c2)
4324 n = locinput - ST.oldloc;
4326 if (locinput > ST.maxpos)
4328 /* PL_reginput == oldloc now */
4331 if (regrepeat(rex, ST.A, n) < n)
4334 PL_reginput = locinput;
4335 CURLY_SETPAREN(ST.paren, ST.count);
4336 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4341 case CURLY_B_min_fail:
4342 /* failed to find B in a non-greedy match where c1,c2 invalid */
4343 if (ST.paren && ST.count)
4344 PL_regendp[ST.paren] = -1;
4346 REGCP_UNWIND(ST.cp);
4347 /* failed -- move forward one */
4348 PL_reginput = locinput;
4349 if (regrepeat(rex, ST.A, 1)) {
4351 locinput = PL_reginput;
4352 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4353 ST.count > 0)) /* count overflow ? */
4356 CURLY_SETPAREN(ST.paren, ST.count);
4357 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4365 /* a successful greedy match: now try to match B */
4368 if (ST.c1 != CHRTEST_VOID)
4369 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4370 UTF8_MAXBYTES, 0, uniflags)
4371 : (UV) UCHARAT(PL_reginput);
4372 /* If it could work, try it. */
4373 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4374 CURLY_SETPAREN(ST.paren, ST.count);
4375 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4380 case CURLY_B_max_fail:
4381 /* failed to find B in a greedy match */
4382 if (ST.paren && ST.count)
4383 PL_regendp[ST.paren] = -1;
4385 REGCP_UNWIND(ST.cp);
4387 if (--ST.count < ST.min)
4389 PL_reginput = locinput = HOPc(locinput, -1);
4390 goto curly_try_B_max;
4398 /* we've just finished A in /(??{A})B/; now continue with B */
4402 st->u.eval.toggle_reg_flags
4403 = cur_eval->u.eval.toggle_reg_flags;
4404 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4406 st->u.eval.prev_rex = rex; /* inner */
4407 rex = cur_eval->u.eval.prev_rex; /* outer */
4408 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4410 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4411 REGCP_SET(st->u.eval.lastcp);
4412 PL_reginput = locinput;
4414 /* Restore parens of the outer rex without popping the
4416 tmpix = PL_savestack_ix;
4417 PL_savestack_ix = cur_eval->u.eval.lastcp;
4419 PL_savestack_ix = tmpix;
4421 st->u.eval.prev_eval = cur_eval;
4422 cur_eval = cur_eval->u.eval.prev_eval;
4424 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4425 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
4426 PUSH_YES_STATE_GOTO(EVAL_AB,
4427 st->u.eval.prev_eval->u.eval.B); /* match B */
4430 if (locinput < reginfo->till) {
4431 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4432 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4434 (long)(locinput - PL_reg_starttry),
4435 (long)(reginfo->till - PL_reg_starttry),
4437 sayNO_SILENT; /* Cannot match: too short. */
4439 PL_reginput = locinput; /* put where regtry can find it */
4440 sayYES; /* Success! */
4442 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4444 PerlIO_printf(Perl_debug_log,
4445 "%*s %ssubpattern success...%s\n",
4446 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4447 PL_reginput = locinput; /* put where regtry can find it */
4448 sayYES; /* Success! */
4451 #define ST st->u.ifmatch
4453 case SUSPEND: /* (?>A) */
4455 PL_reginput = locinput;
4458 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4460 goto ifmatch_trivial_fail_test;
4462 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4464 ifmatch_trivial_fail_test:
4466 char * const s = HOPBACKc(locinput, scan->flags);
4471 sw = 1 - (bool)ST.wanted;
4475 next = scan + ARG(scan);
4483 PL_reginput = locinput;
4487 ST.logical = logical;
4488 /* execute body of (?...A) */
4489 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4492 case IFMATCH_A_fail: /* body of (?...A) failed */
4493 ST.wanted = !ST.wanted;
4496 case IFMATCH_A: /* body of (?...A) succeeded */
4498 sw = (bool)ST.wanted;
4500 else if (!ST.wanted)
4503 if (OP(ST.me) == SUSPEND)
4504 locinput = PL_reginput;
4506 locinput = PL_reginput = st->locinput;
4507 nextchr = UCHARAT(locinput);
4509 scan = ST.me + ARG(ST.me);
4512 continue; /* execute B */
4517 next = scan + ARG(scan);
4522 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4523 PTR2UV(scan), OP(scan));
4524 Perl_croak(aTHX_ "regexp memory corruption");
4532 /* push a state that backtracks on success */
4533 st->u.yes.prev_yes_state = yes_state;
4537 /* push a new regex state, then continue at scan */
4539 regmatch_state *newst;
4541 DEBUG_STATE_pp("push");
4543 st->locinput = locinput;
4545 if (newst > SLAB_LAST(PL_regmatch_slab))
4546 newst = S_push_slab(aTHX);
4547 PL_regmatch_state = newst;
4549 locinput = PL_reginput;
4550 nextchr = UCHARAT(locinput);
4558 * We get here only if there's trouble -- normally "case END" is
4559 * the terminating point.
4561 Perl_croak(aTHX_ "corrupted regexp pointers");
4567 /* we have successfully completed a subexpression, but we must now
4568 * pop to the state marked by yes_state and continue from there */
4569 assert(st != yes_state);
4571 while (st != yes_state) {
4573 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4574 PL_regmatch_slab = PL_regmatch_slab->prev;
4575 st = SLAB_LAST(PL_regmatch_slab);
4577 DEBUG_STATE_pp("pop (yes)");
4581 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4582 || yes_state > SLAB_LAST(PL_regmatch_slab))
4584 /* not in this slab, pop slab */
4585 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4586 PL_regmatch_slab = PL_regmatch_slab->prev;
4587 st = SLAB_LAST(PL_regmatch_slab);
4589 depth -= (st - yes_state);
4592 yes_state = st->u.yes.prev_yes_state;
4593 PL_regmatch_state = st;
4595 state_num = st->resume_state;
4596 goto reenter_switch;
4599 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4600 PL_colors[4], PL_colors[5]));
4607 PerlIO_printf(Perl_debug_log,
4608 "%*s %sfailed...%s\n",
4609 REPORT_CODE_OFF+depth*2, "",
4610 PL_colors[4], PL_colors[5])
4615 /* there's a previous state to backtrack to */
4617 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4618 PL_regmatch_slab = PL_regmatch_slab->prev;
4619 st = SLAB_LAST(PL_regmatch_slab);
4621 PL_regmatch_state = st;
4622 locinput= st->locinput;
4623 nextchr = UCHARAT(locinput);
4625 DEBUG_STATE_pp("pop");
4627 if (yes_state == st)
4628 yes_state = st->u.yes.prev_yes_state;
4630 state_num = st->resume_state + 1; /* failure = success + 1 */
4631 goto reenter_switch;
4637 /* restore original high-water mark */
4638 PL_regmatch_slab = orig_slab;
4639 PL_regmatch_state = orig_state;
4641 /* free all slabs above current one */
4642 if (orig_slab->next) {
4643 regmatch_slab *sl = orig_slab->next;
4644 orig_slab->next = NULL;
4646 regmatch_slab * const osl = sl;
4656 - regrepeat - repeatedly match something simple, report how many
4659 * [This routine now assumes that it will only match on things of length 1.
4660 * That was true before, but now we assume scan - reginput is the count,
4661 * rather than incrementing count on every character. [Er, except utf8.]]
4664 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4667 register char *scan;
4669 register char *loceol = PL_regeol;
4670 register I32 hardcount = 0;
4671 register bool do_utf8 = PL_reg_match_utf8;
4674 if (max == REG_INFTY)
4676 else if (max < loceol - scan)
4677 loceol = scan + max;
4682 while (scan < loceol && hardcount < max && *scan != '\n') {
4683 scan += UTF8SKIP(scan);
4687 while (scan < loceol && *scan != '\n')
4694 while (scan < loceol && hardcount < max) {
4695 scan += UTF8SKIP(scan);
4705 case EXACT: /* length of string is 1 */
4707 while (scan < loceol && UCHARAT(scan) == c)
4710 case EXACTF: /* length of string is 1 */
4712 while (scan < loceol &&
4713 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4716 case EXACTFL: /* length of string is 1 */
4717 PL_reg_flags |= RF_tainted;
4719 while (scan < loceol &&
4720 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4726 while (hardcount < max && scan < loceol &&
4727 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4728 scan += UTF8SKIP(scan);
4732 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4739 LOAD_UTF8_CHARCLASS_ALNUM();
4740 while (hardcount < max && scan < loceol &&
4741 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4742 scan += UTF8SKIP(scan);
4746 while (scan < loceol && isALNUM(*scan))
4751 PL_reg_flags |= RF_tainted;
4754 while (hardcount < max && scan < loceol &&
4755 isALNUM_LC_utf8((U8*)scan)) {
4756 scan += UTF8SKIP(scan);
4760 while (scan < loceol && isALNUM_LC(*scan))
4767 LOAD_UTF8_CHARCLASS_ALNUM();
4768 while (hardcount < max && scan < loceol &&
4769 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4770 scan += UTF8SKIP(scan);
4774 while (scan < loceol && !isALNUM(*scan))
4779 PL_reg_flags |= RF_tainted;
4782 while (hardcount < max && scan < loceol &&
4783 !isALNUM_LC_utf8((U8*)scan)) {
4784 scan += UTF8SKIP(scan);
4788 while (scan < loceol && !isALNUM_LC(*scan))
4795 LOAD_UTF8_CHARCLASS_SPACE();
4796 while (hardcount < max && scan < loceol &&
4798 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4799 scan += UTF8SKIP(scan);
4803 while (scan < loceol && isSPACE(*scan))
4808 PL_reg_flags |= RF_tainted;
4811 while (hardcount < max && scan < loceol &&
4812 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4813 scan += UTF8SKIP(scan);
4817 while (scan < loceol && isSPACE_LC(*scan))
4824 LOAD_UTF8_CHARCLASS_SPACE();
4825 while (hardcount < max && scan < loceol &&
4827 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4828 scan += UTF8SKIP(scan);
4832 while (scan < loceol && !isSPACE(*scan))
4837 PL_reg_flags |= RF_tainted;
4840 while (hardcount < max && scan < loceol &&
4841 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4842 scan += UTF8SKIP(scan);
4846 while (scan < loceol && !isSPACE_LC(*scan))
4853 LOAD_UTF8_CHARCLASS_DIGIT();
4854 while (hardcount < max && scan < loceol &&
4855 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4856 scan += UTF8SKIP(scan);
4860 while (scan < loceol && isDIGIT(*scan))
4867 LOAD_UTF8_CHARCLASS_DIGIT();
4868 while (hardcount < max && scan < loceol &&
4869 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4870 scan += UTF8SKIP(scan);
4874 while (scan < loceol && !isDIGIT(*scan))
4878 default: /* Called on something of 0 width. */
4879 break; /* So match right here or not at all. */
4885 c = scan - PL_reginput;
4889 GET_RE_DEBUG_FLAGS_DECL;
4891 SV * const prop = sv_newmortal();
4892 regprop(prog, prop, p);
4893 PerlIO_printf(Perl_debug_log,
4894 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4895 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4903 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4905 - regclass_swash - prepare the utf8 swash
4909 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4915 const struct reg_data * const data = prog ? prog->data : NULL;
4917 if (data && data->count) {
4918 const U32 n = ARG(node);
4920 if (data->what[n] == 's') {
4921 SV * const rv = (SV*)data->data[n];
4922 AV * const av = (AV*)SvRV((SV*)rv);
4923 SV **const ary = AvARRAY(av);
4926 /* See the end of regcomp.c:S_regclass() for
4927 * documentation of these array elements. */
4930 a = SvROK(ary[1]) ? &ary[1] : 0;
4931 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4935 else if (si && doinit) {
4936 sw = swash_init("utf8", "", si, 1, 0);
4937 (void)av_store(av, 1, sw);
4954 - reginclass - determine if a character falls into a character class
4956 The n is the ANYOF regnode, the p is the target string, lenp
4957 is pointer to the maximum length of how far to go in the p
4958 (if the lenp is zero, UTF8SKIP(p) is used),
4959 do_utf8 tells whether the target string is in UTF-8.
4964 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4967 const char flags = ANYOF_FLAGS(n);
4973 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4974 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4975 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4976 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
4977 if (len == (STRLEN)-1)
4978 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4981 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4982 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4985 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4986 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4989 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4993 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
4996 if (swash_fetch(sw, p, do_utf8))
4998 else if (flags & ANYOF_FOLD) {
4999 if (!match && lenp && av) {
5001 for (i = 0; i <= av_len(av); i++) {
5002 SV* const sv = *av_fetch(av, i, FALSE);
5004 const char * const s = SvPV_const(sv, len);
5006 if (len <= plen && memEQ(s, (char*)p, len)) {
5014 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5017 to_utf8_fold(p, tmpbuf, &tmplen);
5018 if (swash_fetch(sw, tmpbuf, do_utf8))
5024 if (match && lenp && *lenp == 0)
5025 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5027 if (!match && c < 256) {
5028 if (ANYOF_BITMAP_TEST(n, c))
5030 else if (flags & ANYOF_FOLD) {
5033 if (flags & ANYOF_LOCALE) {
5034 PL_reg_flags |= RF_tainted;
5035 f = PL_fold_locale[c];
5039 if (f != c && ANYOF_BITMAP_TEST(n, f))
5043 if (!match && (flags & ANYOF_CLASS)) {
5044 PL_reg_flags |= RF_tainted;
5046 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5047 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5048 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5049 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5050 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5051 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5052 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5053 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5054 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5055 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5056 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5057 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5058 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5059 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5060 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5061 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5062 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5063 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5064 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5065 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5066 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5067 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5068 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5069 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5070 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5071 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5072 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5073 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5074 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5075 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5076 ) /* How's that for a conditional? */
5083 return (flags & ANYOF_INVERT) ? !match : match;
5087 S_reghop3(U8 *s, I32 off, const U8* lim)
5091 while (off-- && s < lim) {
5092 /* XXX could check well-formedness here */
5097 while (off++ && s > lim) {
5099 if (UTF8_IS_CONTINUED(*s)) {
5100 while (s > lim && UTF8_IS_CONTINUATION(*s))
5103 /* XXX could check well-formedness here */
5110 /* there are a bunch of places where we use two reghop3's that should
5111 be replaced with this routine. but since thats not done yet
5112 we ifdef it out - dmq
5115 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5119 while (off-- && s < rlim) {
5120 /* XXX could check well-formedness here */
5125 while (off++ && s > llim) {
5127 if (UTF8_IS_CONTINUED(*s)) {
5128 while (s > llim && UTF8_IS_CONTINUATION(*s))
5131 /* XXX could check well-formedness here */
5139 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5143 while (off-- && s < lim) {
5144 /* XXX could check well-formedness here */
5151 while (off++ && s > lim) {
5153 if (UTF8_IS_CONTINUED(*s)) {
5154 while (s > lim && UTF8_IS_CONTINUATION(*s))
5157 /* XXX could check well-formedness here */
5166 restore_pos(pTHX_ void *arg)
5169 regexp * const rex = (regexp *)arg;
5170 if (PL_reg_eval_set) {
5171 if (PL_reg_oldsaved) {
5172 rex->subbeg = PL_reg_oldsaved;
5173 rex->sublen = PL_reg_oldsavedlen;
5174 #ifdef PERL_OLD_COPY_ON_WRITE
5175 rex->saved_copy = PL_nrs;
5177 RX_MATCH_COPIED_on(rex);
5179 PL_reg_magic->mg_len = PL_reg_oldpos;
5180 PL_reg_eval_set = 0;
5181 PL_curpm = PL_reg_oldcurpm;
5186 S_to_utf8_substr(pTHX_ register regexp *prog)
5188 if (prog->float_substr && !prog->float_utf8) {
5189 SV* const sv = newSVsv(prog->float_substr);
5190 prog->float_utf8 = sv;
5191 sv_utf8_upgrade(sv);
5192 if (SvTAIL(prog->float_substr))
5194 if (prog->float_substr == prog->check_substr)
5195 prog->check_utf8 = sv;
5197 if (prog->anchored_substr && !prog->anchored_utf8) {
5198 SV* const sv = newSVsv(prog->anchored_substr);
5199 prog->anchored_utf8 = sv;
5200 sv_utf8_upgrade(sv);
5201 if (SvTAIL(prog->anchored_substr))
5203 if (prog->anchored_substr == prog->check_substr)
5204 prog->check_utf8 = sv;
5209 S_to_byte_substr(pTHX_ register regexp *prog)
5212 if (prog->float_utf8 && !prog->float_substr) {
5213 SV* sv = newSVsv(prog->float_utf8);
5214 prog->float_substr = sv;
5215 if (sv_utf8_downgrade(sv, TRUE)) {
5216 if (SvTAIL(prog->float_utf8))
5220 prog->float_substr = sv = &PL_sv_undef;
5222 if (prog->float_utf8 == prog->check_utf8)
5223 prog->check_substr = sv;
5225 if (prog->anchored_utf8 && !prog->anchored_substr) {
5226 SV* sv = newSVsv(prog->anchored_utf8);
5227 prog->anchored_substr = sv;
5228 if (sv_utf8_downgrade(sv, TRUE)) {
5229 if (SvTAIL(prog->anchored_utf8))
5233 prog->anchored_substr = sv = &PL_sv_undef;
5235 if (prog->anchored_utf8 == prog->check_utf8)
5236 prog->check_substr = sv;
5242 * c-indentation-style: bsd
5244 * indent-tabs-mode: t
5247 * ex: set ts=8 sts=4 sw=4 noet: