5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
159 const int retval = PL_savestack_ix;
160 #define REGCP_PAREN_ELEMS 4
161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 GET_RE_DEBUG_FLAGS_DECL;
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168 #define REGCP_OTHER_ELEMS 8
169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
170 for (p = PL_regsize; p > parenfloor; p--) {
171 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
174 SSPUSHPTR(PL_reg_start_tmp[p]);
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
183 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
184 SSPUSHPTR(PL_regstartp);
185 SSPUSHPTR(PL_regendp);
186 SSPUSHINT(PL_regsize);
187 SSPUSHINT(*PL_reglastparen);
188 SSPUSHINT(*PL_reglastcloseparen);
189 SSPUSHPTR(PL_reginput);
190 #define REGCP_FRAME_ELEMS 2
191 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
192 * are needed for the regexp context stack bookkeeping. */
193 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
194 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
199 /* These are needed since we do not localize EVAL nodes: */
200 #define REGCP_SET(cp) \
202 PerlIO_printf(Perl_debug_log, \
203 " Setting an EVAL scope, savestack=%"IVdf"\n", \
204 (IV)PL_savestack_ix)); \
207 #define REGCP_UNWIND(cp) \
209 if (cp != PL_savestack_ix) \
210 PerlIO_printf(Perl_debug_log, \
211 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
212 (IV)(cp), (IV)PL_savestack_ix)); \
216 S_regcppop(pTHX_ const regexp *rex)
222 GET_RE_DEBUG_FLAGS_DECL;
224 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
226 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
227 i = SSPOPINT; /* Parentheses elements to pop. */
228 input = (char *) SSPOPPTR;
229 *PL_reglastcloseparen = SSPOPINT;
230 *PL_reglastparen = SSPOPINT;
231 PL_regsize = SSPOPINT;
232 PL_regendp=(I32 *) SSPOPPTR;
233 PL_regstartp=(I32 *) SSPOPPTR;
236 /* Now restore the parentheses context. */
237 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
238 i > 0; i -= REGCP_PAREN_ELEMS) {
240 U32 paren = (U32)SSPOPINT;
241 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
242 PL_regstartp[paren] = SSPOPINT;
244 if (paren <= *PL_reglastparen)
245 PL_regendp[paren] = tmps;
247 PerlIO_printf(Perl_debug_log,
248 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
249 (UV)paren, (IV)PL_regstartp[paren],
250 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
251 (IV)PL_regendp[paren],
252 (paren > *PL_reglastparen ? "(no)" : ""));
256 if (*PL_reglastparen + 1 <= rex->nparens) {
257 PerlIO_printf(Perl_debug_log,
258 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
259 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
263 /* It would seem that the similar code in regtry()
264 * already takes care of this, and in fact it is in
265 * a better location to since this code can #if 0-ed out
266 * but the code in regtry() is needed or otherwise tests
267 * requiring null fields (pat.t#187 and split.t#{13,14}
268 * (as of patchlevel 7877) will fail. Then again,
269 * this code seems to be necessary or otherwise
270 * building DynaLoader will fail:
271 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
273 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
275 PL_regstartp[i] = -1;
282 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
285 * pregexec and friends
288 #ifndef PERL_IN_XSUB_RE
290 - pregexec - match a regexp against a string
293 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
294 char *strbeg, I32 minend, SV *screamer, U32 nosave)
295 /* strend: pointer to null at end of string */
296 /* strbeg: real beginning of string */
297 /* minend: end of match must be >=minend after stringarg. */
298 /* nosave: For optimizations. */
301 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
302 nosave ? 0 : REXEC_COPY_STR);
307 * Need to implement the following flags for reg_anch:
309 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
311 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
312 * INTUIT_AUTORITATIVE_ML
313 * INTUIT_ONCE_NOML - Intuit can match in one location only.
316 * Another flag for this function: SECOND_TIME (so that float substrs
317 * with giant delta may be not rechecked).
320 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
322 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
323 Otherwise, only SvCUR(sv) is used to get strbeg. */
325 /* XXXX We assume that strpos is strbeg unless sv. */
327 /* XXXX Some places assume that there is a fixed substring.
328 An update may be needed if optimizer marks as "INTUITable"
329 RExen without fixed substrings. Similarly, it is assumed that
330 lengths of all the strings are no more than minlen, thus they
331 cannot come from lookahead.
332 (Or minlen should take into account lookahead.) */
334 /* A failure to find a constant substring means that there is no need to make
335 an expensive call to REx engine, thus we celebrate a failure. Similarly,
336 finding a substring too deep into the string means that less calls to
337 regtry() should be needed.
339 REx compiler's optimizer found 4 possible hints:
340 a) Anchored substring;
342 c) Whether we are anchored (beginning-of-line or \G);
343 d) First node (of those at offset 0) which may distingush positions;
344 We use a)b)d) and multiline-part of c), and try to find a position in the
345 string which does not contradict any of them.
348 /* Most of decisions we do here should have been done at compile time.
349 The nodes of the REx which we used for the search should have been
350 deleted from the finite automaton. */
353 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
354 char *strend, U32 flags, re_scream_pos_data *data)
357 register I32 start_shift = 0;
358 /* Should be nonnegative! */
359 register I32 end_shift = 0;
364 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
366 register char *other_last = NULL; /* other substr checked before this */
367 char *check_at = NULL; /* check substr found at this pos */
368 const I32 multiline = prog->reganch & PMf_MULTILINE;
370 const char * const i_strpos = strpos;
373 GET_RE_DEBUG_FLAGS_DECL;
375 RX_MATCH_UTF8_set(prog,do_utf8);
377 if (prog->reganch & ROPT_UTF8) {
378 PL_reg_flags |= RF_utf8;
381 debug_start_match(prog, do_utf8, strpos, strend,
382 sv ? "Guessing start of match in sv for"
383 : "Guessing start of match in string for");
386 /* CHR_DIST() would be more correct here but it makes things slow. */
387 if (prog->minlen > strend - strpos) {
388 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
389 "String too short... [re_intuit_start]\n"));
393 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
396 if (!prog->check_utf8 && prog->check_substr)
397 to_utf8_substr(prog);
398 check = prog->check_utf8;
400 if (!prog->check_substr && prog->check_utf8)
401 to_byte_substr(prog);
402 check = prog->check_substr;
404 if (check == &PL_sv_undef) {
405 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
406 "Non-utf8 string cannot match utf8 check string\n"));
409 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
410 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
411 || ( (prog->reganch & ROPT_ANCH_BOL)
412 && !multiline ) ); /* Check after \n? */
415 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
416 | ROPT_IMPLICIT)) /* not a real BOL */
417 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
419 && (strpos != strbeg)) {
420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
423 if (prog->check_offset_min == prog->check_offset_max &&
424 !(prog->reganch & ROPT_CANY_SEEN)) {
425 /* Substring at constant offset from beg-of-str... */
428 s = HOP3c(strpos, prog->check_offset_min, strend);
431 slen = SvCUR(check); /* >= 1 */
433 if ( strend - s > slen || strend - s < slen - 1
434 || (strend - s == slen && strend[-1] != '\n')) {
435 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
438 /* Now should match s[0..slen-2] */
440 if (slen && (*SvPVX_const(check) != *s
442 && memNE(SvPVX_const(check), s, slen)))) {
444 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
448 else if (*SvPVX_const(check) != *s
449 || ((slen = SvCUR(check)) > 1
450 && memNE(SvPVX_const(check), s, slen)))
453 goto success_at_start;
456 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
458 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
459 end_shift = prog->check_end_shift;
462 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
463 - (SvTAIL(check) != 0);
464 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
466 if (end_shift < eshift)
470 else { /* Can match at random position */
473 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
474 end_shift = prog->check_end_shift;
476 /* end shift should be non negative here */
479 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
481 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
482 (IV)end_shift, prog->precomp);
486 /* Find a possible match in the region s..strend by looking for
487 the "check" substring in the region corrected by start/end_shift. */
490 I32 srch_start_shift = start_shift;
491 I32 srch_end_shift = end_shift;
492 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
493 srch_end_shift -= ((strbeg - s) - srch_start_shift);
494 srch_start_shift = strbeg - s;
496 DEBUG_OPTIMISE_MORE_r({
497 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
498 (IV)prog->check_offset_min,
499 (IV)srch_start_shift,
501 (IV)prog->check_end_shift);
504 if (flags & REXEC_SCREAM) {
505 I32 p = -1; /* Internal iterator of scream. */
506 I32 * const pp = data ? data->scream_pos : &p;
508 if (PL_screamfirst[BmRARE(check)] >= 0
509 || ( BmRARE(check) == '\n'
510 && (BmPREVIOUS(check) == SvCUR(check) - 1)
512 s = screaminstr(sv, check,
513 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
516 /* we may be pointing at the wrong string */
517 if (s && RX_MATCH_COPIED(prog))
518 s = strbeg + (s - SvPVX_const(sv));
520 *data->scream_olds = s;
525 if (prog->reganch & ROPT_CANY_SEEN) {
526 start_point= (U8*)(s + srch_start_shift);
527 end_point= (U8*)(strend - srch_end_shift);
529 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
530 end_point= HOP3(strend, -srch_end_shift, strbeg);
532 DEBUG_OPTIMISE_MORE_r({
533 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
534 (int)(end_point - start_point),
535 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
539 s = fbm_instr( start_point, end_point,
540 check, multiline ? FBMrf_MULTILINE : 0);
543 /* Update the count-of-usability, remove useless subpatterns,
547 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
548 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
549 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
550 (s ? "Found" : "Did not find"),
551 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
552 ? "anchored" : "floating"),
555 (s ? " at offset " : "...\n") );
560 /* Finish the diagnostic message */
561 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
563 /* XXX dmq: first branch is for positive lookbehind...
564 Our check string is offset from the beginning of the pattern.
565 So we need to do any stclass tests offset forward from that
574 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
575 Start with the other substr.
576 XXXX no SCREAM optimization yet - and a very coarse implementation
577 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
578 *always* match. Probably should be marked during compile...
579 Probably it is right to do no SCREAM here...
582 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
583 : (prog->float_substr && prog->anchored_substr))
585 /* Take into account the "other" substring. */
586 /* XXXX May be hopelessly wrong for UTF... */
589 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
592 char * const last = HOP3c(s, -start_shift, strbeg);
594 char * const saved_s = s;
597 t = s - prog->check_offset_max;
598 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
600 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
605 t = HOP3c(t, prog->anchored_offset, strend);
606 if (t < other_last) /* These positions already checked */
608 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
611 /* XXXX It is not documented what units *_offsets are in.
612 We assume bytes, but this is clearly wrong.
613 Meaning this code needs to be carefully reviewed for errors.
617 /* On end-of-str: see comment below. */
618 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
619 if (must == &PL_sv_undef) {
621 DEBUG_r(must = prog->anchored_utf8); /* for debug */
626 HOP3(HOP3(last1, prog->anchored_offset, strend)
627 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
629 multiline ? FBMrf_MULTILINE : 0
632 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
633 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
634 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
635 (s ? "Found" : "Contradicts"),
636 quoted, RE_SV_TAIL(must));
641 if (last1 >= last2) {
642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
643 ", giving up...\n"));
646 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
647 ", trying floating at offset %ld...\n",
648 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
649 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
650 s = HOP3c(last, 1, strend);
654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
655 (long)(s - i_strpos)));
656 t = HOP3c(s, -prog->anchored_offset, strbeg);
657 other_last = HOP3c(s, 1, strend);
665 else { /* Take into account the floating substring. */
667 char * const saved_s = s;
670 t = HOP3c(s, -start_shift, strbeg);
672 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
673 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
674 last = HOP3c(t, prog->float_max_offset, strend);
675 s = HOP3c(t, prog->float_min_offset, strend);
678 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
679 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
680 /* fbm_instr() takes into account exact value of end-of-str
681 if the check is SvTAIL(ed). Since false positives are OK,
682 and end-of-str is not later than strend we are OK. */
683 if (must == &PL_sv_undef) {
685 DEBUG_r(must = prog->float_utf8); /* for debug message */
688 s = fbm_instr((unsigned char*)s,
689 (unsigned char*)last + SvCUR(must)
691 must, multiline ? FBMrf_MULTILINE : 0);
693 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
694 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
695 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
696 (s ? "Found" : "Contradicts"),
697 quoted, RE_SV_TAIL(must));
701 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
702 ", giving up...\n"));
705 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
706 ", trying anchored starting at offset %ld...\n",
707 (long)(saved_s + 1 - i_strpos)));
709 s = HOP3c(t, 1, strend);
713 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
714 (long)(s - i_strpos)));
715 other_last = s; /* Fix this later. --Hugo */
725 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
727 DEBUG_OPTIMISE_MORE_r(
728 PerlIO_printf(Perl_debug_log,
729 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
730 (IV)prog->check_offset_min,
731 (IV)prog->check_offset_max,
739 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
741 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
744 /* Fixed substring is found far enough so that the match
745 cannot start at strpos. */
747 if (ml_anch && t[-1] != '\n') {
748 /* Eventually fbm_*() should handle this, but often
749 anchored_offset is not 0, so this check will not be wasted. */
750 /* XXXX In the code below we prefer to look for "^" even in
751 presence of anchored substrings. And we search even
752 beyond the found float position. These pessimizations
753 are historical artefacts only. */
755 while (t < strend - prog->minlen) {
757 if (t < check_at - prog->check_offset_min) {
758 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
759 /* Since we moved from the found position,
760 we definitely contradict the found anchored
761 substr. Due to the above check we do not
762 contradict "check" substr.
763 Thus we can arrive here only if check substr
764 is float. Redo checking for "other"=="fixed".
767 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
768 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
769 goto do_other_anchored;
771 /* We don't contradict the found floating substring. */
772 /* XXXX Why not check for STCLASS? */
774 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
775 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
778 /* Position contradicts check-string */
779 /* XXXX probably better to look for check-string
780 than for "\n", so one should lower the limit for t? */
781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
782 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
783 other_last = strpos = s = t + 1;
788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
789 PL_colors[0], PL_colors[1]));
793 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
794 PL_colors[0], PL_colors[1]));
798 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
801 /* The found string does not prohibit matching at strpos,
802 - no optimization of calling REx engine can be performed,
803 unless it was an MBOL and we are not after MBOL,
804 or a future STCLASS check will fail this. */
806 /* Even in this situation we may use MBOL flag if strpos is offset
807 wrt the start of the string. */
808 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
809 && (strpos != strbeg) && strpos[-1] != '\n'
810 /* May be due to an implicit anchor of m{.*foo} */
811 && !(prog->reganch & ROPT_IMPLICIT))
816 DEBUG_EXECUTE_r( if (ml_anch)
817 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
818 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
821 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
823 prog->check_utf8 /* Could be deleted already */
824 && --BmUSEFUL(prog->check_utf8) < 0
825 && (prog->check_utf8 == prog->float_utf8)
827 prog->check_substr /* Could be deleted already */
828 && --BmUSEFUL(prog->check_substr) < 0
829 && (prog->check_substr == prog->float_substr)
832 /* If flags & SOMETHING - do not do it many times on the same match */
833 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
834 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
835 if (do_utf8 ? prog->check_substr : prog->check_utf8)
836 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
837 prog->check_substr = prog->check_utf8 = NULL; /* disable */
838 prog->float_substr = prog->float_utf8 = NULL; /* clear */
839 check = NULL; /* abort */
841 /* XXXX This is a remnant of the old implementation. It
842 looks wasteful, since now INTUIT can use many
844 prog->reganch &= ~RE_USE_INTUIT;
851 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
852 /* trie stclasses are too expensive to use here, we are better off to
853 leave it to regmatch itself */
854 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
855 /* minlen == 0 is possible if regstclass is \b or \B,
856 and the fixed substr is ''$.
857 Since minlen is already taken into account, s+1 is before strend;
858 accidentally, minlen >= 1 guaranties no false positives at s + 1
859 even for \b or \B. But (minlen? 1 : 0) below assumes that
860 regstclass does not come from lookahead... */
861 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
862 This leaves EXACTF only, which is dealt with in find_byclass(). */
863 const U8* const str = (U8*)STRING(prog->regstclass);
864 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
865 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
868 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
869 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
870 else if (prog->float_substr || prog->float_utf8)
871 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
875 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
876 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
879 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
882 const char *what = NULL;
884 if (endpos == strend) {
885 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
886 "Could not match STCLASS...\n") );
889 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
890 "This position contradicts STCLASS...\n") );
891 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
893 /* Contradict one of substrings */
894 if (prog->anchored_substr || prog->anchored_utf8) {
895 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
896 DEBUG_EXECUTE_r( what = "anchored" );
898 s = HOP3c(t, 1, strend);
899 if (s + start_shift + end_shift > strend) {
900 /* XXXX Should be taken into account earlier? */
901 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
902 "Could not match STCLASS...\n") );
907 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
908 "Looking for %s substr starting at offset %ld...\n",
909 what, (long)(s + start_shift - i_strpos)) );
912 /* Have both, check_string is floating */
913 if (t + start_shift >= check_at) /* Contradicts floating=check */
914 goto retry_floating_check;
915 /* Recheck anchored substring, but not floating... */
919 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
920 "Looking for anchored substr starting at offset %ld...\n",
921 (long)(other_last - i_strpos)) );
922 goto do_other_anchored;
924 /* Another way we could have checked stclass at the
925 current position only: */
930 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
931 "Looking for /%s^%s/m starting at offset %ld...\n",
932 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
935 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
937 /* Check is floating subtring. */
938 retry_floating_check:
939 t = check_at - start_shift;
940 DEBUG_EXECUTE_r( what = "floating" );
941 goto hop_and_restart;
944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
945 "By STCLASS: moving %ld --> %ld\n",
946 (long)(t - i_strpos), (long)(s - i_strpos))
950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
951 "Does not contradict STCLASS...\n");
956 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
957 PL_colors[4], (check ? "Guessed" : "Giving up"),
958 PL_colors[5], (long)(s - i_strpos)) );
961 fail_finish: /* Substring not found */
962 if (prog->check_substr || prog->check_utf8) /* could be removed already */
963 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
965 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
966 PL_colors[4], PL_colors[5]));
972 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
973 foldlen, foldbuf, uniflags) STMT_START { \
974 switch (trie_type) { \
975 case trie_utf8_fold: \
977 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
982 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
983 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
984 foldlen -= UNISKIP( uvc ); \
985 uscan = foldbuf + UNISKIP( uvc ); \
989 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
997 charid = trie->charmap[ uvc ]; \
1001 if (trie->widecharmap) { \
1002 SV** const svpp = hv_fetch(trie->widecharmap, \
1003 (char*)&uvc, sizeof(UV), 0); \
1005 charid = (U16)SvIV(*svpp); \
1010 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1013 ibcmp_utf8(s, NULL, 0, do_utf8, \
1014 m, NULL, ln, (bool)UTF)) \
1015 && (!reginfo || regtry(reginfo, s)) ) \
1018 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1019 uvchr_to_utf8(tmpbuf, c); \
1020 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1022 && (f == c1 || f == c2) \
1023 && (ln == foldlen || \
1024 !ibcmp_utf8((char *) foldbuf, \
1025 NULL, foldlen, do_utf8, \
1027 NULL, ln, (bool)UTF)) \
1028 && (!reginfo || regtry(reginfo, s)) ) \
1033 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1037 && (ln == 1 || !(OP(c) == EXACTF \
1039 : ibcmp_locale(s, m, ln))) \
1040 && (!reginfo || regtry(reginfo, s)) ) \
1046 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1048 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1054 #define REXEC_FBC_SCAN(CoDe) \
1056 while (s < strend) { \
1062 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1063 REXEC_FBC_UTF8_SCAN( \
1065 if (tmp && (!reginfo || regtry(reginfo, s))) \
1074 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1077 if (tmp && (!reginfo || regtry(reginfo, s))) \
1086 #define REXEC_FBC_TRYIT \
1087 if ((!reginfo || regtry(reginfo, s))) \
1090 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1093 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1096 REXEC_FBC_CLASS_SCAN(CoNd); \
1100 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1101 PL_reg_flags |= RF_tainted; \
1103 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1106 REXEC_FBC_CLASS_SCAN(CoNd); \
1110 #define DUMP_EXEC_POS(li,s,doutf8) \
1111 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1113 /* We know what class REx starts with. Try to find this position... */
1114 /* if reginfo is NULL, its a dryrun */
1115 /* annoyingly all the vars in this routine have different names from their counterparts
1116 in regmatch. /grrr */
1119 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1120 const char *strend, const regmatch_info *reginfo)
1123 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1127 register STRLEN uskip;
1131 register I32 tmp = 1; /* Scratch variable? */
1132 register const bool do_utf8 = PL_reg_match_utf8;
1134 /* We know what class it must start with. */
1138 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1139 !UTF8_IS_INVARIANT((U8)s[0]) ?
1140 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1141 REGINCLASS(prog, c, (U8*)s));
1144 while (s < strend) {
1147 if (REGINCLASS(prog, c, (U8*)s) ||
1148 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1149 /* The assignment of 2 is intentional:
1150 * for the folded sharp s, the skip is 2. */
1151 (skip = SHARP_S_SKIP))) {
1152 if (tmp && (!reginfo || regtry(reginfo, s)))
1165 if (tmp && (!reginfo || regtry(reginfo, s)))
1173 ln = STR_LEN(c); /* length to match in octets/bytes */
1174 lnc = (I32) ln; /* length to match in characters */
1176 STRLEN ulen1, ulen2;
1178 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1179 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1180 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1182 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1183 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1185 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1187 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1190 while (sm < ((U8 *) m + ln)) {
1205 c2 = PL_fold_locale[c1];
1207 e = HOP3c(strend, -((I32)lnc), s);
1209 if (!reginfo && e < s)
1210 e = s; /* Due to minlen logic of intuit() */
1212 /* The idea in the EXACTF* cases is to first find the
1213 * first character of the EXACTF* node and then, if
1214 * necessary, case-insensitively compare the full
1215 * text of the node. The c1 and c2 are the first
1216 * characters (though in Unicode it gets a bit
1217 * more complicated because there are more cases
1218 * than just upper and lower: one needs to use
1219 * the so-called folding case for case-insensitive
1220 * matching (called "loose matching" in Unicode).
1221 * ibcmp_utf8() will do just that. */
1225 U8 tmpbuf [UTF8_MAXBYTES+1];
1226 STRLEN len, foldlen;
1227 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1229 /* Upper and lower of 1st char are equal -
1230 * probably not a "letter". */
1232 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1234 REXEC_FBC_EXACTISH_CHECK(c == c1);
1239 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1242 /* Handle some of the three Greek sigmas cases.
1243 * Note that not all the possible combinations
1244 * are handled here: some of them are handled
1245 * by the standard folding rules, and some of
1246 * them (the character class or ANYOF cases)
1247 * are handled during compiletime in
1248 * regexec.c:S_regclass(). */
1249 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1250 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1251 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1253 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1259 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1261 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1265 PL_reg_flags |= RF_tainted;
1272 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1273 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1275 tmp = ((OP(c) == BOUND ?
1276 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1277 LOAD_UTF8_CHARCLASS_ALNUM();
1278 REXEC_FBC_UTF8_SCAN(
1279 if (tmp == !(OP(c) == BOUND ?
1280 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1281 isALNUM_LC_utf8((U8*)s)))
1289 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1290 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1293 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1299 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1303 PL_reg_flags |= RF_tainted;
1310 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1311 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1313 tmp = ((OP(c) == NBOUND ?
1314 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1315 LOAD_UTF8_CHARCLASS_ALNUM();
1316 REXEC_FBC_UTF8_SCAN(
1317 if (tmp == !(OP(c) == NBOUND ?
1318 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1319 isALNUM_LC_utf8((U8*)s)))
1321 else REXEC_FBC_TRYIT;
1325 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1326 tmp = ((OP(c) == NBOUND ?
1327 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1330 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1332 else REXEC_FBC_TRYIT;
1335 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1339 REXEC_FBC_CSCAN_PRELOAD(
1340 LOAD_UTF8_CHARCLASS_ALNUM(),
1341 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1345 REXEC_FBC_CSCAN_TAINT(
1346 isALNUM_LC_utf8((U8*)s),
1350 REXEC_FBC_CSCAN_PRELOAD(
1351 LOAD_UTF8_CHARCLASS_ALNUM(),
1352 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1356 REXEC_FBC_CSCAN_TAINT(
1357 !isALNUM_LC_utf8((U8*)s),
1361 REXEC_FBC_CSCAN_PRELOAD(
1362 LOAD_UTF8_CHARCLASS_SPACE(),
1363 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1367 REXEC_FBC_CSCAN_TAINT(
1368 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1372 REXEC_FBC_CSCAN_PRELOAD(
1373 LOAD_UTF8_CHARCLASS_SPACE(),
1374 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1378 REXEC_FBC_CSCAN_TAINT(
1379 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1383 REXEC_FBC_CSCAN_PRELOAD(
1384 LOAD_UTF8_CHARCLASS_DIGIT(),
1385 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1389 REXEC_FBC_CSCAN_TAINT(
1390 isDIGIT_LC_utf8((U8*)s),
1394 REXEC_FBC_CSCAN_PRELOAD(
1395 LOAD_UTF8_CHARCLASS_DIGIT(),
1396 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1400 REXEC_FBC_CSCAN_TAINT(
1401 !isDIGIT_LC_utf8((U8*)s),
1407 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1408 trie_type = do_utf8 ?
1409 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1411 /* what trie are we using right now */
1413 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1414 reg_trie_data *trie=aho->trie;
1416 const char *last_start = strend - trie->minlen;
1418 const char *real_start = s;
1420 STRLEN maxlen = trie->maxlen;
1422 U8 **points; /* map of where we were in the input string
1423 when reading a given char. For ASCII this
1424 is unnecessary overhead as the relationship
1425 is always 1:1, but for unicode, especially
1426 case folded unicode this is not true. */
1427 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1431 GET_RE_DEBUG_FLAGS_DECL;
1433 /* We can't just allocate points here. We need to wrap it in
1434 * an SV so it gets freed properly if there is a croak while
1435 * running the match */
1438 sv_points=newSV(maxlen * sizeof(U8 *));
1439 SvCUR_set(sv_points,
1440 maxlen * sizeof(U8 *));
1441 SvPOK_on(sv_points);
1442 sv_2mortal(sv_points);
1443 points=(U8**)SvPV_nolen(sv_points );
1444 if ( trie_type != trie_utf8_fold
1445 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1448 bitmap=(U8*)trie->bitmap;
1450 bitmap=(U8*)ANYOF_BITMAP(c);
1452 /* this is the Aho-Corasick algorithm modified a touch
1453 to include special handling for long "unknown char"
1454 sequences. The basic idea being that we use AC as long
1455 as we are dealing with a possible matching char, when
1456 we encounter an unknown char (and we have not encountered
1457 an accepting state) we scan forward until we find a legal
1459 AC matching is basically that of trie matching, except
1460 that when we encounter a failing transition, we fall back
1461 to the current states "fail state", and try the current char
1462 again, a process we repeat until we reach the root state,
1463 state 1, or a legal transition. If we fail on the root state
1464 then we can either terminate if we have reached an accepting
1465 state previously, or restart the entire process from the beginning
1469 while (s <= last_start) {
1470 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1478 U8 *uscan = (U8*)NULL;
1479 U8 *leftmost = NULL;
1481 U32 accepted_word= 0;
1485 while ( state && uc <= (U8*)strend ) {
1487 U32 word = aho->states[ state ].wordnum;
1491 DEBUG_TRIE_EXECUTE_r(
1492 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1493 dump_exec_pos( (char *)uc, c, strend, real_start,
1494 (char *)uc, do_utf8 );
1495 PerlIO_printf( Perl_debug_log,
1496 " Scanning for legal start char...\n");
1499 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1504 if (uc >(U8*)last_start) break;
1508 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1509 if (!leftmost || lpos < leftmost) {
1510 DEBUG_r(accepted_word=word);
1516 points[pointpos++ % maxlen]= uc;
1517 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1518 uvc, charid, foldlen, foldbuf, uniflags);
1519 DEBUG_TRIE_EXECUTE_r({
1520 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 PerlIO_printf(Perl_debug_log,
1523 " Charid:%3u CP:%4"UVxf" ",
1529 word = aho->states[ state ].wordnum;
1531 base = aho->states[ state ].trans.base;
1533 DEBUG_TRIE_EXECUTE_r({
1535 dump_exec_pos( (char *)uc, c, strend, real_start,
1537 PerlIO_printf( Perl_debug_log,
1538 "%sState: %4"UVxf", word=%"UVxf,
1539 failed ? " Fail transition to " : "",
1540 (UV)state, (UV)word);
1545 (base + charid > trie->uniquecharcount )
1546 && (base + charid - 1 - trie->uniquecharcount
1548 && trie->trans[base + charid - 1 -
1549 trie->uniquecharcount].check == state
1550 && (tmp=trie->trans[base + charid - 1 -
1551 trie->uniquecharcount ].next))
1553 DEBUG_TRIE_EXECUTE_r(
1554 PerlIO_printf( Perl_debug_log," - legal\n"));
1559 DEBUG_TRIE_EXECUTE_r(
1560 PerlIO_printf( Perl_debug_log," - fail\n"));
1562 state = aho->fail[state];
1566 /* we must be accepting here */
1567 DEBUG_TRIE_EXECUTE_r(
1568 PerlIO_printf( Perl_debug_log," - accepting\n"));
1577 if (!state) state = 1;
1580 if ( aho->states[ state ].wordnum ) {
1581 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1582 if (!leftmost || lpos < leftmost) {
1583 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1588 s = (char*)leftmost;
1589 DEBUG_TRIE_EXECUTE_r({
1591 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1592 (UV)accepted_word, s - real_start
1595 if (!reginfo || regtry(reginfo, s)) {
1601 DEBUG_TRIE_EXECUTE_r({
1602 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1605 DEBUG_TRIE_EXECUTE_r(
1606 PerlIO_printf( Perl_debug_log,"No match.\n"));
1615 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1624 - regexec_flags - match a regexp against a string
1627 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1628 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1629 /* strend: pointer to null at end of string */
1630 /* strbeg: real beginning of string */
1631 /* minend: end of match must be >=minend after stringarg. */
1632 /* data: May be used for some additional optimizations. */
1633 /* nosave: For optimizations. */
1637 register regnode *c;
1638 register char *startpos = stringarg;
1639 I32 minlen; /* must match at least this many chars */
1640 I32 dontbother = 0; /* how many characters not to try at end */
1641 I32 end_shift = 0; /* Same for the end. */ /* CC */
1642 I32 scream_pos = -1; /* Internal iterator of scream. */
1643 char *scream_olds = NULL;
1644 SV* const oreplsv = GvSV(PL_replgv);
1645 const bool do_utf8 = (bool)DO_UTF8(sv);
1648 regmatch_info reginfo; /* create some info to pass to regtry etc */
1650 GET_RE_DEBUG_FLAGS_DECL;
1652 PERL_UNUSED_ARG(data);
1654 /* Be paranoid... */
1655 if (prog == NULL || startpos == NULL) {
1656 Perl_croak(aTHX_ "NULL regexp parameter");
1660 multiline = prog->reganch & PMf_MULTILINE;
1661 reginfo.prog = prog;
1663 RX_MATCH_UTF8_set(prog, do_utf8);
1665 debug_start_match(prog, do_utf8, startpos, strend,
1669 minlen = prog->minlen;
1671 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1673 "String too short [regexec_flags]...\n"));
1678 /* Check validity of program. */
1679 if (UCHARAT(prog->program) != REG_MAGIC) {
1680 Perl_croak(aTHX_ "corrupted regexp program");
1684 PL_reg_eval_set = 0;
1687 if (prog->reganch & ROPT_UTF8)
1688 PL_reg_flags |= RF_utf8;
1690 /* Mark beginning of line for ^ and lookbehind. */
1691 reginfo.bol = startpos; /* XXX not used ??? */
1695 /* Mark end of line for $ (and such) */
1698 /* see how far we have to get to not match where we matched before */
1699 reginfo.till = startpos+minend;
1701 /* If there is a "must appear" string, look for it. */
1704 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1707 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1708 reginfo.ganch = startpos;
1709 else if (sv && SvTYPE(sv) >= SVt_PVMG
1711 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1712 && mg->mg_len >= 0) {
1713 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1714 if (prog->reganch & ROPT_ANCH_GPOS) {
1715 if (s > reginfo.ganch)
1720 else /* pos() not defined */
1721 reginfo.ganch = strbeg;
1724 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1725 re_scream_pos_data d;
1727 d.scream_olds = &scream_olds;
1728 d.scream_pos = &scream_pos;
1729 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1731 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1732 goto phooey; /* not present */
1738 /* Simplest case: anchored match need be tried only once. */
1739 /* [unless only anchor is BOL and multiline is set] */
1740 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1741 if (s == startpos && regtry(®info, startpos))
1743 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1744 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1749 dontbother = minlen - 1;
1750 end = HOP3c(strend, -dontbother, strbeg) - 1;
1751 /* for multiline we only have to try after newlines */
1752 if (prog->check_substr || prog->check_utf8) {
1756 if (regtry(®info, s))
1761 if (prog->reganch & RE_USE_INTUIT) {
1762 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1773 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1774 if (regtry(®info, s))
1781 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1783 /* the warning about reginfo.ganch being used without intialization
1784 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1785 and we only enter this block when the same bit is set. */
1786 if (regtry(®info, reginfo.ganch))
1791 /* Messy cases: unanchored match. */
1792 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1793 /* we have /x+whatever/ */
1794 /* it must be a one character string (XXXX Except UTF?) */
1799 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1800 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1801 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1806 DEBUG_EXECUTE_r( did_match = 1 );
1807 if (regtry(®info, s)) goto got_it;
1809 while (s < strend && *s == ch)
1817 DEBUG_EXECUTE_r( did_match = 1 );
1818 if (regtry(®info, s)) goto got_it;
1820 while (s < strend && *s == ch)
1825 DEBUG_EXECUTE_r(if (!did_match)
1826 PerlIO_printf(Perl_debug_log,
1827 "Did not find anchored character...\n")
1830 else if (prog->anchored_substr != NULL
1831 || prog->anchored_utf8 != NULL
1832 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1833 && prog->float_max_offset < strend - s)) {
1838 char *last1; /* Last position checked before */
1842 if (prog->anchored_substr || prog->anchored_utf8) {
1843 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1844 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1845 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1846 back_max = back_min = prog->anchored_offset;
1848 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1849 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1850 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1851 back_max = prog->float_max_offset;
1852 back_min = prog->float_min_offset;
1856 if (must == &PL_sv_undef)
1857 /* could not downgrade utf8 check substring, so must fail */
1863 last = HOP3c(strend, /* Cannot start after this */
1864 -(I32)(CHR_SVLEN(must)
1865 - (SvTAIL(must) != 0) + back_min), strbeg);
1868 last1 = HOPc(s, -1);
1870 last1 = s - 1; /* bogus */
1872 /* XXXX check_substr already used to find "s", can optimize if
1873 check_substr==must. */
1875 dontbother = end_shift;
1876 strend = HOPc(strend, -dontbother);
1877 while ( (s <= last) &&
1878 ((flags & REXEC_SCREAM)
1879 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1880 end_shift, &scream_pos, 0))
1881 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1882 (unsigned char*)strend, must,
1883 multiline ? FBMrf_MULTILINE : 0))) ) {
1884 /* we may be pointing at the wrong string */
1885 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1886 s = strbeg + (s - SvPVX_const(sv));
1887 DEBUG_EXECUTE_r( did_match = 1 );
1888 if (HOPc(s, -back_max) > last1) {
1889 last1 = HOPc(s, -back_min);
1890 s = HOPc(s, -back_max);
1893 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1895 last1 = HOPc(s, -back_min);
1899 while (s <= last1) {
1900 if (regtry(®info, s))
1906 while (s <= last1) {
1907 if (regtry(®info, s))
1913 DEBUG_EXECUTE_r(if (!did_match) {
1914 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1915 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1916 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1917 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1918 ? "anchored" : "floating"),
1919 quoted, RE_SV_TAIL(must));
1923 else if ( (c = prog->regstclass) ) {
1925 const OPCODE op = OP(prog->regstclass);
1926 /* don't bother with what can't match */
1927 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1928 strend = HOPc(strend, -(minlen - 1));
1931 SV * const prop = sv_newmortal();
1932 regprop(prog, prop, c);
1934 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1936 PerlIO_printf(Perl_debug_log,
1937 "Matching stclass %.*s against %s (%d chars)\n",
1938 (int)SvCUR(prop), SvPVX_const(prop),
1939 quoted, (int)(strend - s));
1942 if (find_byclass(prog, c, s, strend, ®info))
1944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1948 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1953 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1954 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1955 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1957 if (flags & REXEC_SCREAM) {
1958 last = screaminstr(sv, float_real, s - strbeg,
1959 end_shift, &scream_pos, 1); /* last one */
1961 last = scream_olds; /* Only one occurrence. */
1962 /* we may be pointing at the wrong string */
1963 else if (RX_MATCH_COPIED(prog))
1964 s = strbeg + (s - SvPVX_const(sv));
1968 const char * const little = SvPV_const(float_real, len);
1970 if (SvTAIL(float_real)) {
1971 if (memEQ(strend - len + 1, little, len - 1))
1972 last = strend - len + 1;
1973 else if (!multiline)
1974 last = memEQ(strend - len, little, len)
1975 ? strend - len : NULL;
1981 last = rninstr(s, strend, little, little + len);
1983 last = strend; /* matching "$" */
1988 PerlIO_printf(Perl_debug_log,
1989 "%sCan't trim the tail, match fails (should not happen)%s\n",
1990 PL_colors[4], PL_colors[5]));
1991 goto phooey; /* Should not happen! */
1993 dontbother = strend - last + prog->float_min_offset;
1995 if (minlen && (dontbother < minlen))
1996 dontbother = minlen - 1;
1997 strend -= dontbother; /* this one's always in bytes! */
1998 /* We don't know much -- general case. */
2001 if (regtry(®info, s))
2010 if (regtry(®info, s))
2012 } while (s++ < strend);
2020 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2022 if (PL_reg_eval_set) {
2023 /* Preserve the current value of $^R */
2024 if (oreplsv != GvSV(PL_replgv))
2025 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2026 restored, the value remains
2028 restore_pos(aTHX_ prog);
2031 /* make sure $`, $&, $', and $digit will work later */
2032 if ( !(flags & REXEC_NOT_FIRST) ) {
2033 RX_MATCH_COPY_FREE(prog);
2034 if (flags & REXEC_COPY_STR) {
2035 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2036 #ifdef PERL_OLD_COPY_ON_WRITE
2038 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2040 PerlIO_printf(Perl_debug_log,
2041 "Copy on write: regexp capture, type %d\n",
2044 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2045 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2046 assert (SvPOKp(prog->saved_copy));
2050 RX_MATCH_COPIED_on(prog);
2051 s = savepvn(strbeg, i);
2057 prog->subbeg = strbeg;
2058 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2065 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2066 PL_colors[4], PL_colors[5]));
2067 if (PL_reg_eval_set)
2068 restore_pos(aTHX_ prog);
2074 - regtry - try match at specific point
2076 STATIC I32 /* 0 failure, 1 success */
2077 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2083 regexp *prog = reginfo->prog;
2084 GET_RE_DEBUG_FLAGS_DECL;
2086 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2089 PL_reg_eval_set = RS_init;
2090 DEBUG_EXECUTE_r(DEBUG_s(
2091 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2092 (IV)(PL_stack_sp - PL_stack_base));
2094 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2095 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2096 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2098 /* Apparently this is not needed, judging by wantarray. */
2099 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2100 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2103 /* Make $_ available to executed code. */
2104 if (reginfo->sv != DEFSV) {
2106 DEFSV = reginfo->sv;
2109 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2110 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2111 /* prepare for quick setting of pos */
2112 #ifdef PERL_OLD_COPY_ON_WRITE
2114 sv_force_normal_flags(sv, 0);
2116 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2117 &PL_vtbl_mglob, NULL, 0);
2121 PL_reg_oldpos = mg->mg_len;
2122 SAVEDESTRUCTOR_X(restore_pos, prog);
2124 if (!PL_reg_curpm) {
2125 Newxz(PL_reg_curpm, 1, PMOP);
2128 SV* const repointer = newSViv(0);
2129 /* so we know which PL_regex_padav element is PL_reg_curpm */
2130 SvFLAGS(repointer) |= SVf_BREAK;
2131 av_push(PL_regex_padav,repointer);
2132 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2133 PL_regex_pad = AvARRAY(PL_regex_padav);
2137 PM_SETRE(PL_reg_curpm, prog);
2138 PL_reg_oldcurpm = PL_curpm;
2139 PL_curpm = PL_reg_curpm;
2140 if (RX_MATCH_COPIED(prog)) {
2141 /* Here is a serious problem: we cannot rewrite subbeg,
2142 since it may be needed if this match fails. Thus
2143 $` inside (?{}) could fail... */
2144 PL_reg_oldsaved = prog->subbeg;
2145 PL_reg_oldsavedlen = prog->sublen;
2146 #ifdef PERL_OLD_COPY_ON_WRITE
2147 PL_nrs = prog->saved_copy;
2149 RX_MATCH_COPIED_off(prog);
2152 PL_reg_oldsaved = NULL;
2153 prog->subbeg = PL_bostr;
2154 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2156 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2157 prog->startp[0] = startpos - PL_bostr;
2158 PL_reginput = startpos;
2159 PL_reglastparen = &prog->lastparen;
2160 PL_reglastcloseparen = &prog->lastcloseparen;
2161 prog->lastparen = 0;
2162 prog->lastcloseparen = 0;
2164 PL_regstartp = prog->startp;
2165 PL_regendp = prog->endp;
2166 if (PL_reg_start_tmpl <= prog->nparens) {
2167 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2168 if(PL_reg_start_tmp)
2169 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2171 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 /* XXXX What this code is doing here?!!! There should be no need
2175 to do this again and again, PL_reglastparen should take care of
2178 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2179 * Actually, the code in regcppop() (which Ilya may be meaning by
2180 * PL_reglastparen), is not needed at all by the test suite
2181 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2182 * enough, for building DynaLoader, or otherwise this
2183 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2184 * will happen. Meanwhile, this code *is* needed for the
2185 * above-mentioned test suite tests to succeed. The common theme
2186 * on those tests seems to be returning null fields from matches.
2191 if (prog->nparens) {
2193 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2200 if (regmatch(reginfo, prog->program + 1)) {
2201 prog->endp[0] = PL_reginput - PL_bostr;
2204 REGCP_UNWIND(lastcp);
2209 #define sayYES goto yes
2210 #define sayNO goto no
2211 #define sayNO_SILENT goto no_silent
2212 #define saySAME(x) if (x) goto yes; else goto no
2214 /* we dont use STMT_START/END here because it leads to
2215 "unreachable code" warnings, which are bogus, but distracting. */
2216 #define CACHEsayNO \
2217 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2218 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2219 (1<<st->u.whilem.cache_bit); \
2222 /* this is used to determine how far from the left messages like
2223 'failed...' are printed. It should be set such that messages
2224 are inline with the regop output that created them.
2226 #define REPORT_CODE_OFF 32
2229 /* Make sure there is a test for this +1 options in re_tests */
2230 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2232 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2233 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2235 #define SLAB_FIRST(s) (&(s)->states[0])
2236 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2238 /* grab a new slab and return the first slot in it */
2240 STATIC regmatch_state *
2243 #if PERL_VERSION < 9
2246 regmatch_slab *s = PL_regmatch_slab->next;
2248 Newx(s, 1, regmatch_slab);
2249 s->prev = PL_regmatch_slab;
2251 PL_regmatch_slab->next = s;
2253 PL_regmatch_slab = s;
2254 return SLAB_FIRST(s);
2257 /* simulate a recursive call to regmatch */
2259 #define REGMATCH(ns, where) \
2262 st->resume_state = resume_##where; \
2263 goto start_recurse; \
2264 resume_point_##where:
2266 /* push a new state then goto it */
2268 #define PUSH_STATE_GOTO(state, node) \
2270 st->resume_state = state; \
2273 /* push a new state with success backtracking, then goto it */
2275 #define PUSH_YES_STATE_GOTO(state, node) \
2277 st->resume_state = state; \
2278 goto push_yes_state;
2283 - regmatch - main matching routine
2285 * Conceptually the strategy is simple: check to see whether the current
2286 * node matches, call self recursively to see whether the rest matches,
2287 * and then act accordingly. In practice we make some effort to avoid
2288 * recursion, in particular by going through "ordinary" nodes (that don't
2289 * need to know whether the rest of the match failed) by a loop instead of
2292 /* [lwall] I've hoisted the register declarations to the outer block in order to
2293 * maybe save a little bit of pushing and popping on the stack. It also takes
2294 * advantage of machines that use a register save mask on subroutine entry.
2296 * This function used to be heavily recursive, but since this had the
2297 * effect of blowing the CPU stack on complex regexes, it has been
2298 * restructured to be iterative, and to save state onto the heap rather
2299 * than the stack. Essentially whereever regmatch() used to be called, it
2300 * pushes the current state, notes where to return, then jumps back into
2303 * Originally the structure of this function used to look something like
2308 while (scan != NULL) {
2309 a++; // do stuff with a and b
2315 if (regmatch(...)) // recurse
2325 * Now it looks something like this:
2333 regmatch_state *st = new();
2335 st->a++; // do stuff with a and b
2337 while (scan != NULL) {
2345 st->resume_state = resume_FOO;
2346 goto start_recurse; // recurse
2355 st = new(); push a new state
2356 st->a = 1; st->b = 2;
2363 switch (resume_state) {
2365 goto resume_point_FOO;
2372 * WARNING: this means that any line in this function that contains a
2373 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2374 * regmatch() using gotos instead. Thus the values of any local variables
2375 * not saved in the regmatch_state structure will have been lost when
2376 * execution resumes on the next line .
2378 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2379 * PL_regmatch_state always points to the currently active state, and
2380 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2381 * The first time regmatch is called, the first slab is allocated, and is
2382 * never freed until interpreter desctruction. When the slab is full,
2383 * a new one is allocated chained to the end. At exit from regmatch, slabs
2384 * allocated since entry are freed.
2388 #define DEBUG_STATE_pp(pp) \
2390 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2391 PerlIO_printf(Perl_debug_log, \
2394 reg_name[st->resume_state] ); \
2398 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2403 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2404 const char *start, const char *end, const char *blurb)
2406 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2410 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2411 prog->precomp, prog->prelen, 60);
2413 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2414 start, end - start, 60);
2416 PerlIO_printf(Perl_debug_log,
2417 "%s%s REx%s %s against %s\n",
2418 PL_colors[4], blurb, PL_colors[5], s0, s1);
2420 if (do_utf8||utf8_pat)
2421 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2422 utf8_pat ? "pattern" : "",
2423 utf8_pat && do_utf8 ? " and " : "",
2424 do_utf8 ? "string" : ""
2430 S_dump_exec_pos(pTHX_ const char *locinput,
2431 const regnode *scan,
2432 const char *loc_regeol,
2433 const char *loc_bostr,
2434 const char *loc_reg_starttry,
2437 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2438 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2439 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2440 /* The part of the string before starttry has one color
2441 (pref0_len chars), between starttry and current
2442 position another one (pref_len - pref0_len chars),
2443 after the current position the third one.
2444 We assume that pref0_len <= pref_len, otherwise we
2445 decrease pref0_len. */
2446 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2447 ? (5 + taill) - l : locinput - loc_bostr;
2450 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2452 pref0_len = pref_len - (locinput - loc_reg_starttry);
2453 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2454 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2455 ? (5 + taill) - pref_len : loc_regeol - locinput);
2456 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2460 if (pref0_len > pref_len)
2461 pref0_len = pref_len;
2463 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2465 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2466 (locinput - pref_len),pref0_len, 60, 4, 5);
2468 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2469 (locinput - pref_len + pref0_len),
2470 pref_len - pref0_len, 60, 2, 3);
2472 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2473 locinput, loc_regeol - locinput, 10, 0, 1);
2475 const STRLEN tlen=len0+len1+len2;
2476 PerlIO_printf(Perl_debug_log,
2477 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2478 (IV)(locinput - loc_bostr),
2481 (docolor ? "" : "> <"),
2483 (int)(tlen > 19 ? 0 : 19 - tlen),
2490 STATIC I32 /* 0 failure, 1 success */
2491 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2493 #if PERL_VERSION < 9
2497 register const bool do_utf8 = PL_reg_match_utf8;
2498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2500 regexp *rex = reginfo->prog;
2502 regmatch_slab *orig_slab;
2503 regmatch_state *orig_state;
2505 /* the current state. This is a cached copy of PL_regmatch_state */
2506 register regmatch_state *st;
2508 /* cache heavy used fields of st in registers */
2509 register regnode *scan;
2510 register regnode *next;
2511 register I32 n = 0; /* initialize to shut up compiler warning */
2512 register char *locinput = PL_reginput;
2514 /* these variables are NOT saved during a recusive RFEGMATCH: */
2515 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2516 bool result = 0; /* return value of S_regmatch */
2517 int depth = 0; /* depth of recursion */
2518 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
2519 regmatch_state *yes_state = NULL; /* state to pop to on success of
2521 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2522 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2528 GET_RE_DEBUG_FLAGS_DECL;
2531 /* on first ever call to regmatch, allocate first slab */
2532 if (!PL_regmatch_slab) {
2533 Newx(PL_regmatch_slab, 1, regmatch_slab);
2534 PL_regmatch_slab->prev = NULL;
2535 PL_regmatch_slab->next = NULL;
2536 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2539 /* remember current high-water mark for exit */
2540 /* XXX this should be done with SAVE* instead */
2541 orig_slab = PL_regmatch_slab;
2542 orig_state = PL_regmatch_state;
2544 /* grab next free state slot */
2545 st = ++PL_regmatch_state;
2546 if (st > SLAB_LAST(PL_regmatch_slab))
2547 st = PL_regmatch_state = S_push_slab(aTHX);
2554 /* Note that nextchr is a byte even in UTF */
2555 nextchr = UCHARAT(locinput);
2557 while (scan != NULL) {
2560 SV * const prop = sv_newmortal();
2561 regnode *rnext=regnext(scan);
2562 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2563 regprop(rex, prop, scan);
2565 PerlIO_printf(Perl_debug_log,
2566 "%3"IVdf":%*s%s(%"IVdf")\n",
2567 (IV)(scan - rex->program), depth*2, "",
2569 (PL_regkind[OP(scan)] == END || !rnext) ?
2570 0 : (IV)(rnext - rex->program));
2573 next = scan + NEXT_OFF(scan);
2576 state_num = OP(scan);
2579 switch (state_num) {
2581 if (locinput == PL_bostr)
2583 /* reginfo->till = reginfo->bol; */
2588 if (locinput == PL_bostr ||
2589 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2595 if (locinput == PL_bostr)
2599 if (locinput == reginfo->ganch)
2605 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2610 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2612 if (PL_regeol - locinput > 1)
2616 if (PL_regeol != locinput)
2620 if (!nextchr && locinput >= PL_regeol)
2623 locinput += PL_utf8skip[nextchr];
2624 if (locinput > PL_regeol)
2626 nextchr = UCHARAT(locinput);
2629 nextchr = UCHARAT(++locinput);
2632 if (!nextchr && locinput >= PL_regeol)
2634 nextchr = UCHARAT(++locinput);
2637 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2640 locinput += PL_utf8skip[nextchr];
2641 if (locinput > PL_regeol)
2643 nextchr = UCHARAT(locinput);
2646 nextchr = UCHARAT(++locinput);
2650 #define ST st->u.trie
2652 /* In this case the charclass data is available inline so
2653 we can fail fast without a lot of extra overhead.
2655 if (scan->flags == EXACT || !do_utf8) {
2656 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2658 PerlIO_printf(Perl_debug_log,
2659 "%*s %sfailed to match trie start class...%s\n",
2660 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2669 /* what type of TRIE am I? (utf8 makes this contextual) */
2670 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2671 trie_type = do_utf8 ?
2672 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2675 /* what trie are we using right now */
2676 reg_trie_data * const trie
2677 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2678 U32 state = trie->startstate;
2680 if (trie->bitmap && trie_type != trie_utf8_fold &&
2681 !TRIE_BITMAP_TEST(trie,*locinput)
2683 if (trie->states[ state ].wordnum) {
2685 PerlIO_printf(Perl_debug_log,
2686 "%*s %smatched empty string...%s\n",
2687 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2692 PerlIO_printf(Perl_debug_log,
2693 "%*s %sfailed to match trie start class...%s\n",
2694 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2701 U8 *uc = ( U8* )locinput;
2705 U8 *uscan = (U8*)NULL;
2707 SV *sv_accept_buff = NULL;
2708 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2710 ST.accepted = 0; /* how many accepting states we have seen */
2712 ST.jump = trie->jump;
2721 traverse the TRIE keeping track of all accepting states
2722 we transition through until we get to a failing node.
2725 while ( state && uc <= (U8*)PL_regeol ) {
2726 U32 base = trie->states[ state ].trans.base;
2729 /* We use charid to hold the wordnum as we don't use it
2730 for charid until after we have done the wordnum logic.
2731 We define an alias just so that the wordnum logic reads
2734 #define got_wordnum charid
2735 got_wordnum = trie->states[ state ].wordnum;
2737 if ( got_wordnum ) {
2738 if ( ! ST.accepted ) {
2741 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2742 sv_accept_buff=newSV(bufflen *
2743 sizeof(reg_trie_accepted) - 1);
2744 SvCUR_set(sv_accept_buff, 0);
2745 SvPOK_on(sv_accept_buff);
2746 sv_2mortal(sv_accept_buff);
2749 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2752 if (ST.accepted >= bufflen) {
2754 ST.accept_buff =(reg_trie_accepted*)
2755 SvGROW(sv_accept_buff,
2756 bufflen * sizeof(reg_trie_accepted));
2758 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2759 + sizeof(reg_trie_accepted));
2762 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2763 ST.accept_buff[ST.accepted].endpos = uc;
2765 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2769 DEBUG_TRIE_EXECUTE_r({
2770 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2771 PerlIO_printf( Perl_debug_log,
2772 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2773 2+depth * 2, "", PL_colors[4],
2774 (UV)state, (UV)ST.accepted );
2778 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2779 uvc, charid, foldlen, foldbuf, uniflags);
2782 (base + charid > trie->uniquecharcount )
2783 && (base + charid - 1 - trie->uniquecharcount
2785 && trie->trans[base + charid - 1 -
2786 trie->uniquecharcount].check == state)
2788 state = trie->trans[base + charid - 1 -
2789 trie->uniquecharcount ].next;
2800 DEBUG_TRIE_EXECUTE_r(
2801 PerlIO_printf( Perl_debug_log,
2802 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2803 charid, uvc, (UV)state, PL_colors[5] );
2810 PerlIO_printf( Perl_debug_log,
2811 "%*s %sgot %"IVdf" possible matches%s\n",
2812 REPORT_CODE_OFF + depth * 2, "",
2813 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2819 case TRIE_next_fail: /* we failed - try next alterative */
2821 if ( ST.accepted == 1 ) {
2822 /* only one choice left - just continue */
2824 reg_trie_data * const trie
2825 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2826 SV ** const tmp = RX_DEBUG(reginfo->prog)
2827 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2829 PerlIO_printf( Perl_debug_log,
2830 "%*s %sonly one match left: #%d <%s>%s\n",
2831 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2832 ST.accept_buff[ 0 ].wordnum,
2833 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2836 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2837 /* in this case we free tmps/leave before we call regmatch
2838 as we wont be using accept_buff again. */
2841 locinput = PL_reginput;
2842 nextchr = UCHARAT(locinput);
2847 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2849 continue; /* execute rest of RE */
2852 if (!ST.accepted-- ) {
2859 There are at least two accepting states left. Presumably
2860 the number of accepting states is going to be low,
2861 typically two. So we simply scan through to find the one
2862 with lowest wordnum. Once we find it, we swap the last
2863 state into its place and decrement the size. We then try to
2864 match the rest of the pattern at the point where the word
2865 ends. If we succeed, control just continues along the
2866 regex; if we fail we return here to try the next accepting
2873 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2874 DEBUG_TRIE_EXECUTE_r(
2875 PerlIO_printf( Perl_debug_log,
2876 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2877 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
2878 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2879 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2882 if (ST.accept_buff[cur].wordnum <
2883 ST.accept_buff[best].wordnum)
2888 reg_trie_data * const trie
2889 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2890 SV ** const tmp = RX_DEBUG(reginfo->prog)
2891 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2893 regnode *nextop=!ST.jump ?
2895 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2896 PerlIO_printf( Perl_debug_log,
2897 "%*s %strying alternation #%d <%s> at node #%d %s\n",
2898 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2899 ST.accept_buff[best].wordnum,
2900 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2901 REG_NODE_NUM(nextop),
2905 if ( best<ST.accepted ) {
2906 reg_trie_accepted tmp = ST.accept_buff[ best ];
2907 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2908 ST.accept_buff[ ST.accepted ] = tmp;
2911 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2913 PUSH_STATE_GOTO(TRIE_next, ST.B);
2916 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2926 char *s = STRING(scan);
2927 st->ln = STR_LEN(scan);
2928 if (do_utf8 != UTF) {
2929 /* The target and the pattern have differing utf8ness. */
2931 const char * const e = s + st->ln;
2934 /* The target is utf8, the pattern is not utf8. */
2939 if (NATIVE_TO_UNI(*(U8*)s) !=
2940 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2948 /* The target is not utf8, the pattern is utf8. */
2953 if (NATIVE_TO_UNI(*((U8*)l)) !=
2954 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2962 nextchr = UCHARAT(locinput);
2965 /* The target and the pattern have the same utf8ness. */
2966 /* Inline the first character, for speed. */
2967 if (UCHARAT(s) != nextchr)
2969 if (PL_regeol - locinput < st->ln)
2971 if (st->ln > 1 && memNE(s, locinput, st->ln))
2974 nextchr = UCHARAT(locinput);
2978 PL_reg_flags |= RF_tainted;
2981 char * const s = STRING(scan);
2982 st->ln = STR_LEN(scan);
2984 if (do_utf8 || UTF) {
2985 /* Either target or the pattern are utf8. */
2986 const char * const l = locinput;
2987 char *e = PL_regeol;
2989 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2990 l, &e, 0, do_utf8)) {
2991 /* One more case for the sharp s:
2992 * pack("U0U*", 0xDF) =~ /ss/i,
2993 * the 0xC3 0x9F are the UTF-8
2994 * byte sequence for the U+00DF. */
2996 toLOWER(s[0]) == 's' &&
2998 toLOWER(s[1]) == 's' &&
3005 nextchr = UCHARAT(locinput);
3009 /* Neither the target and the pattern are utf8. */
3011 /* Inline the first character, for speed. */
3012 if (UCHARAT(s) != nextchr &&
3013 UCHARAT(s) != ((OP(scan) == EXACTF)
3014 ? PL_fold : PL_fold_locale)[nextchr])
3016 if (PL_regeol - locinput < st->ln)
3018 if (st->ln > 1 && (OP(scan) == EXACTF
3019 ? ibcmp(s, locinput, st->ln)
3020 : ibcmp_locale(s, locinput, st->ln)))
3023 nextchr = UCHARAT(locinput);
3028 STRLEN inclasslen = PL_regeol - locinput;
3030 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3032 if (locinput >= PL_regeol)
3034 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3035 nextchr = UCHARAT(locinput);
3040 nextchr = UCHARAT(locinput);
3041 if (!REGINCLASS(rex, scan, (U8*)locinput))
3043 if (!nextchr && locinput >= PL_regeol)
3045 nextchr = UCHARAT(++locinput);
3049 /* If we might have the case of the German sharp s
3050 * in a casefolding Unicode character class. */
3052 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3053 locinput += SHARP_S_SKIP;
3054 nextchr = UCHARAT(locinput);
3060 PL_reg_flags |= RF_tainted;
3066 LOAD_UTF8_CHARCLASS_ALNUM();
3067 if (!(OP(scan) == ALNUM
3068 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3069 : isALNUM_LC_utf8((U8*)locinput)))
3073 locinput += PL_utf8skip[nextchr];
3074 nextchr = UCHARAT(locinput);
3077 if (!(OP(scan) == ALNUM
3078 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3080 nextchr = UCHARAT(++locinput);
3083 PL_reg_flags |= RF_tainted;
3086 if (!nextchr && locinput >= PL_regeol)
3089 LOAD_UTF8_CHARCLASS_ALNUM();
3090 if (OP(scan) == NALNUM
3091 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3092 : isALNUM_LC_utf8((U8*)locinput))
3096 locinput += PL_utf8skip[nextchr];
3097 nextchr = UCHARAT(locinput);
3100 if (OP(scan) == NALNUM
3101 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3103 nextchr = UCHARAT(++locinput);
3107 PL_reg_flags |= RF_tainted;
3111 /* was last char in word? */
3113 if (locinput == PL_bostr)
3116 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3118 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3120 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3121 st->ln = isALNUM_uni(st->ln);
3122 LOAD_UTF8_CHARCLASS_ALNUM();
3123 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3126 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3127 n = isALNUM_LC_utf8((U8*)locinput);
3131 st->ln = (locinput != PL_bostr) ?
3132 UCHARAT(locinput - 1) : '\n';
3133 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3134 st->ln = isALNUM(st->ln);
3135 n = isALNUM(nextchr);
3138 st->ln = isALNUM_LC(st->ln);
3139 n = isALNUM_LC(nextchr);
3142 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3143 OP(scan) == BOUNDL))
3147 PL_reg_flags |= RF_tainted;
3153 if (UTF8_IS_CONTINUED(nextchr)) {
3154 LOAD_UTF8_CHARCLASS_SPACE();
3155 if (!(OP(scan) == SPACE
3156 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3157 : isSPACE_LC_utf8((U8*)locinput)))
3161 locinput += PL_utf8skip[nextchr];
3162 nextchr = UCHARAT(locinput);
3165 if (!(OP(scan) == SPACE
3166 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3168 nextchr = UCHARAT(++locinput);
3171 if (!(OP(scan) == SPACE
3172 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3174 nextchr = UCHARAT(++locinput);
3178 PL_reg_flags |= RF_tainted;
3181 if (!nextchr && locinput >= PL_regeol)
3184 LOAD_UTF8_CHARCLASS_SPACE();
3185 if (OP(scan) == NSPACE
3186 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3187 : isSPACE_LC_utf8((U8*)locinput))
3191 locinput += PL_utf8skip[nextchr];
3192 nextchr = UCHARAT(locinput);
3195 if (OP(scan) == NSPACE
3196 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3198 nextchr = UCHARAT(++locinput);
3201 PL_reg_flags |= RF_tainted;
3207 LOAD_UTF8_CHARCLASS_DIGIT();
3208 if (!(OP(scan) == DIGIT
3209 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3210 : isDIGIT_LC_utf8((U8*)locinput)))
3214 locinput += PL_utf8skip[nextchr];
3215 nextchr = UCHARAT(locinput);
3218 if (!(OP(scan) == DIGIT
3219 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3221 nextchr = UCHARAT(++locinput);
3224 PL_reg_flags |= RF_tainted;
3227 if (!nextchr && locinput >= PL_regeol)
3230 LOAD_UTF8_CHARCLASS_DIGIT();
3231 if (OP(scan) == NDIGIT
3232 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3233 : isDIGIT_LC_utf8((U8*)locinput))
3237 locinput += PL_utf8skip[nextchr];
3238 nextchr = UCHARAT(locinput);
3241 if (OP(scan) == NDIGIT
3242 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3244 nextchr = UCHARAT(++locinput);
3247 if (locinput >= PL_regeol)
3250 LOAD_UTF8_CHARCLASS_MARK();
3251 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3253 locinput += PL_utf8skip[nextchr];
3254 while (locinput < PL_regeol &&
3255 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3256 locinput += UTF8SKIP(locinput);
3257 if (locinput > PL_regeol)
3262 nextchr = UCHARAT(locinput);
3265 PL_reg_flags |= RF_tainted;
3270 n = ARG(scan); /* which paren pair */
3271 st->ln = PL_regstartp[n];
3272 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3273 if ((I32)*PL_reglastparen < n || st->ln == -1)
3274 sayNO; /* Do not match unless seen CLOSEn. */
3275 if (st->ln == PL_regendp[n])
3278 s = PL_bostr + st->ln;
3279 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3281 const char *e = PL_bostr + PL_regendp[n];
3283 * Note that we can't do the "other character" lookup trick as
3284 * in the 8-bit case (no pun intended) because in Unicode we
3285 * have to map both upper and title case to lower case.
3287 if (OP(scan) == REFF) {
3289 STRLEN ulen1, ulen2;
3290 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3291 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3295 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3296 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3297 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3304 nextchr = UCHARAT(locinput);
3308 /* Inline the first character, for speed. */
3309 if (UCHARAT(s) != nextchr &&
3311 (UCHARAT(s) != ((OP(scan) == REFF
3312 ? PL_fold : PL_fold_locale)[nextchr]))))
3314 st->ln = PL_regendp[n] - st->ln;
3315 if (locinput + st->ln > PL_regeol)
3317 if (st->ln > 1 && (OP(scan) == REF
3318 ? memNE(s, locinput, st->ln)
3320 ? ibcmp(s, locinput, st->ln)
3321 : ibcmp_locale(s, locinput, st->ln))))
3324 nextchr = UCHARAT(locinput);
3335 #define ST st->u.eval
3339 regnode *startpoint;
3342 case RECURSE: /* /(...(?1))/ */
3343 if (cur_eval && cur_eval->locinput==locinput) {
3344 if (cur_eval->u.eval.close_paren == ARG(scan))
3345 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3346 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3347 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3352 (void)ReREFCNT_inc(rex);
3353 if (OP(scan)==RECURSE) {
3354 startpoint = scan + ARG2L(scan);
3355 ST.close_paren = ARG(scan);
3357 startpoint = re->program+1;
3360 goto eval_recurse_doit;
3362 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3363 if (cur_eval && cur_eval->locinput==locinput) {
3364 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3365 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3370 /* execute the code in the {...} */
3372 SV ** const before = SP;
3373 OP_4tree * const oop = PL_op;
3374 COP * const ocurcop = PL_curcop;
3378 PL_op = (OP_4tree*)rex->data->data[n];
3379 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3380 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3381 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3383 CALLRUNOPS(aTHX); /* Scalar context. */
3386 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3393 PAD_RESTORE_LOCAL(old_comppad);
3394 PL_curcop = ocurcop;
3397 sv_setsv(save_scalar(PL_replgv), ret);
3401 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3404 /* extract RE object from returned value; compiling if
3409 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3410 mg = mg_find(sv, PERL_MAGIC_qr);
3411 else if (SvSMAGICAL(ret)) {
3412 if (SvGMAGICAL(ret))
3413 sv_unmagic(ret, PERL_MAGIC_qr);
3415 mg = mg_find(ret, PERL_MAGIC_qr);
3419 re = (regexp *)mg->mg_obj;
3420 (void)ReREFCNT_inc(re);
3424 const char * const t = SvPV_const(ret, len);
3426 const I32 osize = PL_regsize;
3429 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3430 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3432 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3434 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3440 debug_start_match(re, do_utf8, locinput, PL_regeol,
3441 "Matching embedded");
3443 startpoint = re->program + 1;
3444 ST.close_paren = 0; /* only used for RECURSE */
3445 /* borrowed from regtry */
3446 if (PL_reg_start_tmpl <= re->nparens) {
3447 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3448 if(PL_reg_start_tmp)
3449 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3451 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3454 eval_recurse_doit: /* Share code with RECURSE below this line */
3455 /* run the pattern returned from (??{...}) */
3456 ST.cp = regcppush(0); /* Save *all* the positions. */
3457 REGCP_SET(ST.lastcp);
3459 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3460 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3462 *PL_reglastparen = 0;
3463 *PL_reglastcloseparen = 0;
3464 PL_reginput = locinput;
3466 /* XXXX This is too dramatic a measure... */
3470 ST.toggle_reg_flags = PL_reg_flags;
3471 if (re->reganch & ROPT_UTF8)
3472 PL_reg_flags |= RF_utf8;
3474 PL_reg_flags &= ~RF_utf8;
3475 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3478 ST.prev_curlyx = cur_curlyx;
3482 ST.prev_eval = cur_eval;
3484 /* now continue from first node in postoned RE */
3485 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3488 /* /(?(?{...})X|Y)/ */
3489 st->sw = (bool)SvTRUE(ret);
3494 case EVAL_AB: /* cleanup after a successful (??{A})B */
3495 /* note: this is called twice; first after popping B, then A */
3496 PL_reg_flags ^= ST.toggle_reg_flags;
3500 cur_eval = ST.prev_eval;
3501 cur_curlyx = ST.prev_curlyx;
3502 /* XXXX This is too dramatic a measure... */
3507 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3508 /* note: this is called twice; first after popping B, then A */
3509 PL_reg_flags ^= ST.toggle_reg_flags;
3512 PL_reginput = locinput;
3513 REGCP_UNWIND(ST.lastcp);
3515 cur_eval = ST.prev_eval;
3516 cur_curlyx = ST.prev_curlyx;
3517 /* XXXX This is too dramatic a measure... */
3523 n = ARG(scan); /* which paren pair */
3524 PL_reg_start_tmp[n] = locinput;
3529 n = ARG(scan); /* which paren pair */
3530 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3531 PL_regendp[n] = locinput - PL_bostr;
3532 if (n > (I32)*PL_reglastparen)
3533 *PL_reglastparen = n;
3534 *PL_reglastcloseparen = n;
3535 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3540 n = ARG(scan); /* which paren pair */
3541 st->sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3544 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3546 next = NEXTOPER(NEXTOPER(scan));
3548 next = scan + ARG(scan);
3549 if (OP(next) == IFTHEN) /* Fake one. */
3550 next = NEXTOPER(NEXTOPER(next));
3554 st->logical = scan->flags;
3556 /*******************************************************************
3557 cur_curlyx points to the regmatch_state associated with the most recent CURLYX.
3558 This struct contains info about the innermost (...)* loop (an
3559 "infoblock"), and a pointer to the next outer cur_curlyx.
3561 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3563 1) After matching Y, regnode for CURLYX is processed;
3565 2) This regnode populates cur_curlyx, and calls regmatch() recursively
3566 with the starting point at WHILEM node;
3568 3) Each hit of WHILEM node tries to match A and Z (in the order
3569 depending on the current iteration, min/max of {min,max} and
3570 greediness). The information about where are nodes for "A"
3571 and "Z" is read from cur_curlyx, as is info on how many times "A"
3572 was already matched, and greediness.
3574 4) After A matches, the same WHILEM node is hit again.
3576 5) Each time WHILEM is hit, cur_curlyx is the infoblock created by CURLYX
3577 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3578 resets cur_curlyx, since this Y(A)*Z can be a part of some other loop:
3579 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3580 of the external loop.
3582 Currently present infoblocks form a tree with a stem formed by cur_curlyx
3583 and whatever it mentions via ->next, and additional attached trees
3584 corresponding to temporarily unset infoblocks as in "5" above.
3586 In the following picture, infoblocks for outer loop of
3587 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3588 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3589 infoblocks are drawn below the "reset" infoblock.
3591 In fact in the picture below we do not show failed matches for Z and T
3592 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3593 more obvious *why* one needs to *temporary* unset infoblocks.]
3595 Matched REx position InfoBlocks Comment
3599 Y A)*?Z)*?T x <- O <- I
3600 YA )*?Z)*?T x <- O <- I
3601 YA A)*?Z)*?T x <- O <- I
3602 YAA )*?Z)*?T x <- O <- I
3603 YAA Z)*?T x <- O # Temporary unset I
3606 YAAZ Y(A)*?Z)*?T x <- O
3609 YAAZY (A)*?Z)*?T x <- O
3612 YAAZY A)*?Z)*?T x <- O <- I
3615 YAAZYA )*?Z)*?T x <- O <- I
3618 YAAZYA Z)*?T x <- O # Temporary unset I
3624 YAAZYAZ T x # Temporary unset O
3631 *******************************************************************/
3634 /* No need to save/restore up to this paren */
3635 parenfloor = scan->flags;
3639 CURLYX and WHILEM are always paired: they're the moral
3640 equivalent of pp_enteriter anbd pp_iter.
3642 The only time next could be null is if the node tree is
3643 corrupt. This was mentioned on p5p a few days ago.
3645 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3646 So we'll assert that this is true:
3649 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3651 /* XXXX Probably it is better to teach regpush to support
3652 parenfloor > PL_regsize... */
3653 if (parenfloor > (I32)*PL_reglastparen)
3654 parenfloor = *PL_reglastparen; /* Pessimization... */
3656 st->u.curlyx.cp = PL_savestack_ix;
3657 st->u.curlyx.outercc = cur_curlyx;
3659 /* these fields contain the state of the current curly.
3660 * they are accessed by subsequent WHILEMs;
3661 * cur and lastloc are also updated by WHILEM */
3662 st->u.curlyx.parenfloor = parenfloor;
3663 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3664 st->u.curlyx.min = ARG1(scan);
3665 st->u.curlyx.max = ARG2(scan);
3666 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3667 st->u.curlyx.lastloc = 0;
3668 /* st->next and st->minmod are also read by WHILEM */
3670 PL_reginput = locinput;
3671 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3672 /*** all unsaved local vars undefined at this point */
3673 regcpblow(st->u.curlyx.cp);
3674 cur_curlyx = st->u.curlyx.outercc;
3680 * This is really hard to understand, because after we match
3681 * what we're trying to match, we must make sure the rest of
3682 * the REx is going to match for sure, and to do that we have
3683 * to go back UP the parse tree by recursing ever deeper. And
3684 * if it fails, we have to reset our parent's current state
3685 * that we can try again after backing off.
3690 cur_curlyx gets initialised by CURLYX ready for use by WHILEM.
3691 So again, unless somethings been corrupted, cur_curlyx cannot
3692 be null at that point in WHILEM.
3694 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3695 So we'll assert that this is true:
3698 st->u.whilem.lastloc = cur_curlyx->u.curlyx.lastloc; /* Detection of 0-len. */
3699 st->u.whilem.cache_offset = 0;
3700 st->u.whilem.cache_bit = 0;
3702 n = cur_curlyx->u.curlyx.cur + 1; /* how many we know we matched */
3703 PL_reginput = locinput;
3706 PerlIO_printf(Perl_debug_log,
3707 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3708 REPORT_CODE_OFF+depth*2, "",
3709 (long)n, (long)cur_curlyx->u.curlyx.min,
3710 (long)cur_curlyx->u.curlyx.max,
3714 /* If degenerate scan matches "", assume scan done. */
3716 if (locinput == cur_curlyx->u.curlyx.lastloc && n >=
3717 cur_curlyx->u.curlyx.min)
3719 st->u.whilem.savecc = cur_curlyx;
3720 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3722 PerlIO_printf(Perl_debug_log,
3723 "%*s empty match detected, try continuation...\n",
3724 REPORT_CODE_OFF+depth*2, "")
3726 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3727 /*** all unsaved local vars undefined at this point */
3728 cur_curlyx = st->u.whilem.savecc;
3734 /* First just match a string of min scans. */
3736 if (n < cur_curlyx->u.curlyx.min) {
3737 cur_curlyx->u.curlyx.cur = n;
3738 cur_curlyx->u.curlyx.lastloc = locinput;
3739 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM2);
3740 /*** all unsaved local vars undefined at this point */
3743 cur_curlyx->u.curlyx.cur = n - 1;
3744 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
3749 /* Check whether we already were at this position.
3750 Postpone detection until we know the match is not
3751 *that* much linear. */
3752 if (!PL_reg_maxiter) {
3753 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3754 /* possible overflow for long strings and many CURLYX's */
3755 if (PL_reg_maxiter < 0)
3756 PL_reg_maxiter = I32_MAX;
3757 PL_reg_leftiter = PL_reg_maxiter;
3759 if (PL_reg_leftiter-- == 0) {
3760 const I32 size = (PL_reg_maxiter + 7)/8;
3761 if (PL_reg_poscache) {
3762 if ((I32)PL_reg_poscache_size < size) {
3763 Renew(PL_reg_poscache, size, char);
3764 PL_reg_poscache_size = size;
3766 Zero(PL_reg_poscache, size, char);
3769 PL_reg_poscache_size = size;
3770 Newxz(PL_reg_poscache, size, char);
3773 PerlIO_printf(Perl_debug_log,
3774 "%sDetected a super-linear match, switching on caching%s...\n",
3775 PL_colors[4], PL_colors[5])
3778 if (PL_reg_leftiter < 0) {
3779 st->u.whilem.cache_offset = locinput - PL_bostr;
3781 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
3782 + st->u.whilem.cache_offset * (scan->flags>>4);
3783 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3784 st->u.whilem.cache_offset /= 8;
3785 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3787 PerlIO_printf(Perl_debug_log,
3788 "%*s already tried at this position...\n",
3789 REPORT_CODE_OFF+depth*2, "")
3791 sayNO; /* cache records failure */
3796 /* Prefer next over scan for minimal matching. */
3798 if (cur_curlyx->minmod) {
3799 st->u.whilem.savecc = cur_curlyx;
3800 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3801 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3802 REGCP_SET(st->u.whilem.lastcp);
3803 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3804 /*** all unsaved local vars undefined at this point */
3805 cur_curlyx = st->u.whilem.savecc;
3807 regcpblow(st->u.whilem.cp);
3808 sayYES; /* All done. */
3810 REGCP_UNWIND(st->u.whilem.lastcp);
3813 if (n >= cur_curlyx->u.curlyx.max) { /* Maximum greed exceeded? */
3814 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3815 && !(PL_reg_flags & RF_warned)) {
3816 PL_reg_flags |= RF_warned;
3817 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3818 "Complex regular subexpression recursion",
3825 PerlIO_printf(Perl_debug_log,
3826 "%*s trying longer...\n",
3827 REPORT_CODE_OFF+depth*2, "")
3829 /* Try scanning more and see if it helps. */
3830 PL_reginput = locinput;
3831 cur_curlyx->u.curlyx.cur = n;
3832 cur_curlyx->u.curlyx.lastloc = locinput;
3833 st->u.whilem.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3834 REGCP_SET(st->u.whilem.lastcp);
3835 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM4);
3836 /*** all unsaved local vars undefined at this point */
3838 regcpblow(st->u.whilem.cp);
3841 REGCP_UNWIND(st->u.whilem.lastcp);
3843 cur_curlyx->u.curlyx.cur = n - 1;
3844 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
3848 /* Prefer scan over next for maximal matching. */
3850 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3851 st->u.whilem.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3852 cur_curlyx->u.curlyx.cur = n;
3853 cur_curlyx->u.curlyx.lastloc = locinput;
3854 REGCP_SET(st->u.whilem.lastcp);
3855 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM5);
3856 /*** all unsaved local vars undefined at this point */
3858 regcpblow(st->u.whilem.cp);
3861 REGCP_UNWIND(st->u.whilem.lastcp);
3862 regcppop(rex); /* Restore some previous $<digit>s? */
3863 PL_reginput = locinput;
3865 PerlIO_printf(Perl_debug_log,
3866 "%*s failed, try continuation...\n",
3867 REPORT_CODE_OFF+depth*2, "")
3870 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3871 && !(PL_reg_flags & RF_warned)) {
3872 PL_reg_flags |= RF_warned;
3873 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3874 "Complex regular subexpression recursion",
3878 /* Failed deeper matches of scan, so see if this one works. */
3879 st->u.whilem.savecc = cur_curlyx;
3880 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3881 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3882 /*** all unsaved local vars undefined at this point */
3883 cur_curlyx = st->u.whilem.savecc;
3886 cur_curlyx->u.curlyx.cur = n - 1;
3887 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
3893 #define ST st->u.branch
3895 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3896 next = scan + ARG(scan);
3899 scan = NEXTOPER(scan);
3902 case BRANCH: /* /(...|A|...)/ */
3903 scan = NEXTOPER(scan); /* scan now points to inner node */
3904 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3905 /* last branch; skip state push and jump direct to node */
3907 ST.lastparen = *PL_reglastparen;
3908 ST.next_branch = next;
3910 PL_reginput = locinput;
3912 /* Now go into the branch */
3913 PUSH_STATE_GOTO(BRANCH_next, scan);
3916 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3917 REGCP_UNWIND(ST.cp);
3918 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3920 *PL_reglastparen = n;
3921 scan = ST.next_branch;
3922 /* no more branches? */
3923 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3925 continue; /* execute next BRANCH[J] op */
3933 #define ST st->u.curlym
3935 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3937 /* This is an optimisation of CURLYX that enables us to push
3938 * only a single backtracking state, no matter now many matches
3939 * there are in {m,n}. It relies on the pattern being constant
3940 * length, with no parens to influence future backrefs
3944 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3946 /* if paren positive, emulate an OPEN/CLOSE around A */
3948 I32 paren = ST.me->flags;
3949 if (paren > PL_regsize)
3951 if (paren > (I32)*PL_reglastparen)
3952 *PL_reglastparen = paren;
3953 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3959 ST.minmod = st->minmod;
3961 ST.c1 = CHRTEST_UNINIT;
3964 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3967 curlym_do_A: /* execute the A in /A{m,n}B/ */
3968 PL_reginput = locinput;
3969 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3972 case CURLYM_A: /* we've just matched an A */
3973 locinput = st->locinput;
3974 nextchr = UCHARAT(locinput);
3977 /* after first match, determine A's length: u.curlym.alen */
3978 if (ST.count == 1) {
3979 if (PL_reg_match_utf8) {
3981 while (s < PL_reginput) {
3987 ST.alen = PL_reginput - locinput;
3990 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3993 PerlIO_printf(Perl_debug_log,
3994 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3995 (int)(REPORT_CODE_OFF+(depth*2)), "",
3996 (IV) ST.count, (IV)ST.alen)
3999 locinput = PL_reginput;
4000 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4001 goto curlym_do_A; /* try to match another A */
4002 goto curlym_do_B; /* try to match B */
4004 case CURLYM_A_fail: /* just failed to match an A */
4005 REGCP_UNWIND(ST.cp);
4006 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4009 curlym_do_B: /* execute the B in /A{m,n}B/ */
4010 PL_reginput = locinput;
4011 if (ST.c1 == CHRTEST_UNINIT) {
4012 /* calculate c1 and c2 for possible match of 1st char
4013 * following curly */
4014 ST.c1 = ST.c2 = CHRTEST_VOID;
4015 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4016 regnode *text_node = ST.B;
4017 if (! HAS_TEXT(text_node))
4018 FIND_NEXT_IMPT(text_node);
4019 if (HAS_TEXT(text_node)
4020 && PL_regkind[OP(text_node)] != REF)
4022 ST.c1 = (U8)*STRING(text_node);
4024 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4026 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4027 ? PL_fold_locale[ST.c1]
4034 PerlIO_printf(Perl_debug_log,
4035 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4036 (int)(REPORT_CODE_OFF+(depth*2)),
4039 if (ST.c1 != CHRTEST_VOID
4040 && UCHARAT(PL_reginput) != ST.c1
4041 && UCHARAT(PL_reginput) != ST.c2)
4043 /* simulate B failing */
4044 state_num = CURLYM_B_fail;
4045 goto reenter_switch;
4049 /* mark current A as captured */
4050 I32 paren = ST.me->flags;
4053 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4054 PL_regendp[paren] = PL_reginput - PL_bostr;
4057 PL_regendp[paren] = -1;
4059 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4062 case CURLYM_B_fail: /* just failed to match a B */
4063 REGCP_UNWIND(ST.cp);
4065 if (ST.count == ARG2(ST.me) /* max */)
4067 goto curlym_do_A; /* try to match a further A */
4069 /* backtrack one A */
4070 if (ST.count == ARG1(ST.me) /* min */)
4073 locinput = HOPc(locinput, -ST.alen);
4074 goto curlym_do_B; /* try to match B */
4077 #define ST st->u.curly
4079 #define CURLY_SETPAREN(paren, success) \
4082 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4083 PL_regendp[paren] = locinput - PL_bostr; \
4086 PL_regendp[paren] = -1; \
4089 case STAR: /* /A*B/ where A is width 1 */
4093 scan = NEXTOPER(scan);
4095 case PLUS: /* /A+B/ where A is width 1 */
4099 scan = NEXTOPER(scan);
4101 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4102 ST.paren = scan->flags; /* Which paren to set */
4103 if (ST.paren > PL_regsize)
4104 PL_regsize = ST.paren;
4105 if (ST.paren > (I32)*PL_reglastparen)
4106 *PL_reglastparen = ST.paren;
4107 ST.min = ARG1(scan); /* min to match */
4108 ST.max = ARG2(scan); /* max to match */
4109 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4111 case CURLY: /* /A{m,n}B/ where A is width 1 */
4113 ST.min = ARG1(scan); /* min to match */
4114 ST.max = ARG2(scan); /* max to match */
4115 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4118 * Lookahead to avoid useless match attempts
4119 * when we know what character comes next.
4121 * Used to only do .*x and .*?x, but now it allows
4122 * for )'s, ('s and (?{ ... })'s to be in the way
4123 * of the quantifier and the EXACT-like node. -- japhy
4126 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4128 if (HAS_TEXT(next) || JUMPABLE(next)) {
4130 regnode *text_node = next;
4132 if (! HAS_TEXT(text_node))
4133 FIND_NEXT_IMPT(text_node);
4135 if (! HAS_TEXT(text_node))
4136 ST.c1 = ST.c2 = CHRTEST_VOID;
4138 if (PL_regkind[OP(text_node)] == REF) {
4139 ST.c1 = ST.c2 = CHRTEST_VOID;
4140 goto assume_ok_easy;
4143 s = (U8*)STRING(text_node);
4147 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4148 ST.c2 = PL_fold[ST.c1];
4149 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4150 ST.c2 = PL_fold_locale[ST.c1];
4153 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4154 STRLEN ulen1, ulen2;
4155 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4156 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4158 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4159 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4161 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4163 0 : UTF8_ALLOW_ANY);
4164 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4166 0 : UTF8_ALLOW_ANY);
4168 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4170 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4175 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4182 ST.c1 = ST.c2 = CHRTEST_VOID;
4187 PL_reginput = locinput;
4190 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4193 locinput = PL_reginput;
4195 if (ST.c1 == CHRTEST_VOID)
4196 goto curly_try_B_min;
4198 ST.oldloc = locinput;
4200 /* set ST.maxpos to the furthest point along the
4201 * string that could possibly match */
4202 if (ST.max == REG_INFTY) {
4203 ST.maxpos = PL_regeol - 1;
4205 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4209 int m = ST.max - ST.min;
4210 for (ST.maxpos = locinput;
4211 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4212 ST.maxpos += UTF8SKIP(ST.maxpos);
4215 ST.maxpos = locinput + ST.max - ST.min;
4216 if (ST.maxpos >= PL_regeol)
4217 ST.maxpos = PL_regeol - 1;
4219 goto curly_try_B_min_known;
4223 ST.count = regrepeat(rex, ST.A, ST.max);
4224 locinput = PL_reginput;
4225 if (ST.count < ST.min)
4227 if ((ST.count > ST.min)
4228 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4230 /* A{m,n} must come at the end of the string, there's
4231 * no point in backing off ... */
4233 /* ...except that $ and \Z can match before *and* after
4234 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4235 We may back off by one in this case. */
4236 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4240 goto curly_try_B_max;
4245 case CURLY_B_min_known_fail:
4246 /* failed to find B in a non-greedy match where c1,c2 valid */
4247 if (ST.paren && ST.count)
4248 PL_regendp[ST.paren] = -1;
4250 PL_reginput = locinput; /* Could be reset... */
4251 REGCP_UNWIND(ST.cp);
4252 /* Couldn't or didn't -- move forward. */
4253 ST.oldloc = locinput;
4255 locinput += UTF8SKIP(locinput);
4259 curly_try_B_min_known:
4260 /* find the next place where 'B' could work, then call B */
4264 n = (ST.oldloc == locinput) ? 0 : 1;
4265 if (ST.c1 == ST.c2) {
4267 /* set n to utf8_distance(oldloc, locinput) */
4268 while (locinput <= ST.maxpos &&
4269 utf8n_to_uvchr((U8*)locinput,
4270 UTF8_MAXBYTES, &len,
4271 uniflags) != (UV)ST.c1) {
4277 /* set n to utf8_distance(oldloc, locinput) */
4278 while (locinput <= ST.maxpos) {
4280 const UV c = utf8n_to_uvchr((U8*)locinput,
4281 UTF8_MAXBYTES, &len,
4283 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4291 if (ST.c1 == ST.c2) {
4292 while (locinput <= ST.maxpos &&
4293 UCHARAT(locinput) != ST.c1)
4297 while (locinput <= ST.maxpos
4298 && UCHARAT(locinput) != ST.c1
4299 && UCHARAT(locinput) != ST.c2)
4302 n = locinput - ST.oldloc;
4304 if (locinput > ST.maxpos)
4306 /* PL_reginput == oldloc now */
4309 if (regrepeat(rex, ST.A, n) < n)
4312 PL_reginput = locinput;
4313 CURLY_SETPAREN(ST.paren, ST.count);
4314 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4319 case CURLY_B_min_fail:
4320 /* failed to find B in a non-greedy match where c1,c2 invalid */
4321 if (ST.paren && ST.count)
4322 PL_regendp[ST.paren] = -1;
4324 REGCP_UNWIND(ST.cp);
4325 /* failed -- move forward one */
4326 PL_reginput = locinput;
4327 if (regrepeat(rex, ST.A, 1)) {
4329 locinput = PL_reginput;
4330 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4331 ST.count > 0)) /* count overflow ? */
4334 CURLY_SETPAREN(ST.paren, ST.count);
4335 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4343 /* a successful greedy match: now try to match B */
4346 if (ST.c1 != CHRTEST_VOID)
4347 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4348 UTF8_MAXBYTES, 0, uniflags)
4349 : (UV) UCHARAT(PL_reginput);
4350 /* If it could work, try it. */
4351 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4352 CURLY_SETPAREN(ST.paren, ST.count);
4353 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4358 case CURLY_B_max_fail:
4359 /* failed to find B in a greedy match */
4360 if (ST.paren && ST.count)
4361 PL_regendp[ST.paren] = -1;
4363 REGCP_UNWIND(ST.cp);
4365 if (--ST.count < ST.min)
4367 PL_reginput = locinput = HOPc(locinput, -1);
4368 goto curly_try_B_max;
4376 /* we've just finished A in /(??{A})B/; now continue with B */
4380 st->u.eval.toggle_reg_flags
4381 = cur_eval->u.eval.toggle_reg_flags;
4382 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4384 st->u.eval.prev_rex = rex; /* inner */
4385 rex = cur_eval->u.eval.prev_rex; /* outer */
4386 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4388 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4389 REGCP_SET(st->u.eval.lastcp);
4390 PL_reginput = locinput;
4392 /* Restore parens of the outer rex without popping the
4394 tmpix = PL_savestack_ix;
4395 PL_savestack_ix = cur_eval->u.eval.lastcp;
4397 PL_savestack_ix = tmpix;
4399 st->u.eval.prev_eval = cur_eval;
4400 cur_eval = cur_eval->u.eval.prev_eval;
4402 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4403 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
4404 PUSH_YES_STATE_GOTO(EVAL_AB,
4405 st->u.eval.prev_eval->u.eval.B); /* match B */
4408 if (locinput < reginfo->till) {
4409 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4410 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4412 (long)(locinput - PL_reg_starttry),
4413 (long)(reginfo->till - PL_reg_starttry),
4415 sayNO_SILENT; /* Cannot match: too short. */
4417 PL_reginput = locinput; /* put where regtry can find it */
4418 sayYES; /* Success! */
4420 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4422 PerlIO_printf(Perl_debug_log,
4423 "%*s %ssubpattern success...%s\n",
4424 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4425 PL_reginput = locinput; /* put where regtry can find it */
4426 sayYES; /* Success! */
4429 #define ST st->u.ifmatch
4431 case SUSPEND: /* (?>A) */
4433 PL_reginput = locinput;
4436 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4438 goto ifmatch_trivial_fail_test;
4440 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4442 ifmatch_trivial_fail_test:
4444 char * const s = HOPBACKc(locinput, scan->flags);
4449 st->sw = 1 - (bool)ST.wanted;
4453 next = scan + ARG(scan);
4461 PL_reginput = locinput;
4465 /* execute body of (?...A) */
4466 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4469 case IFMATCH_A_fail: /* body of (?...A) failed */
4470 ST.wanted = !ST.wanted;
4473 case IFMATCH_A: /* body of (?...A) succeeded */
4476 st->sw = (bool)ST.wanted;
4478 else if (!ST.wanted)
4481 if (OP(ST.me) == SUSPEND)
4482 locinput = PL_reginput;
4484 locinput = PL_reginput = st->locinput;
4485 nextchr = UCHARAT(locinput);
4487 scan = ST.me + ARG(ST.me);
4490 continue; /* execute B */
4495 next = scan + ARG(scan);
4500 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4501 PTR2UV(scan), OP(scan));
4502 Perl_croak(aTHX_ "regexp memory corruption");
4510 /* push a state that backtracks on success */
4511 st->u.yes.prev_yes_state = yes_state;
4515 /* push a new regex state, then continue at scan */
4517 regmatch_state *newst;
4519 DEBUG_STATE_pp("push");
4521 st->locinput = locinput;
4523 if (newst > SLAB_LAST(PL_regmatch_slab))
4524 newst = S_push_slab(aTHX);
4525 PL_regmatch_state = newst;
4526 /* XXX probably don't need to initialise these */
4532 locinput = PL_reginput;
4533 nextchr = UCHARAT(locinput);
4539 /* simulate recursively calling regmatch(), but without actually
4540 * recursing - ie save the current state on the heap rather than on
4541 * the stack, then re-enter the loop. This avoids complex regexes
4542 * blowing the processor stack */
4546 /* push new state */
4547 regmatch_state *oldst = st;
4549 DEBUG_STATE_pp("push");
4551 st->u.yes.prev_yes_state = yes_state;
4554 /* grab the next free state slot */
4556 if (st > SLAB_LAST(PL_regmatch_slab))
4557 st = S_push_slab(aTHX);
4558 PL_regmatch_state = st;
4562 oldst->locinput = locinput;
4564 locinput = PL_reginput;
4565 nextchr = UCHARAT(locinput);
4574 * We get here only if there's trouble -- normally "case END" is
4575 * the terminating point.
4577 Perl_croak(aTHX_ "corrupted regexp pointers");
4583 /* we have successfully completed a subexpression, but we must now
4584 * pop to the state marked by yes_state and continue from there */
4585 assert(st != yes_state);
4587 while (st != yes_state) {
4589 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4590 PL_regmatch_slab = PL_regmatch_slab->prev;
4591 st = SLAB_LAST(PL_regmatch_slab);
4593 DEBUG_STATE_pp("pop (yes)");
4597 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4598 || yes_state > SLAB_LAST(PL_regmatch_slab))
4600 /* not in this slab, pop slab */
4601 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4602 PL_regmatch_slab = PL_regmatch_slab->prev;
4603 st = SLAB_LAST(PL_regmatch_slab);
4605 depth -= (st - yes_state);
4608 yes_state = st->u.yes.prev_yes_state;
4609 PL_regmatch_state = st;
4611 switch (st->resume_state) {
4613 case resume_WHILEM1:
4614 case resume_WHILEM2:
4615 case resume_WHILEM3:
4616 case resume_WHILEM4:
4617 case resume_WHILEM5:
4618 case resume_WHILEM6:
4620 /* restore previous state and re-enter */
4624 locinput= st->locinput;
4625 nextchr = UCHARAT(locinput);
4626 switch (st->resume_state) {
4628 goto resume_point_CURLYX;
4629 case resume_WHILEM1:
4630 goto resume_point_WHILEM1;
4631 case resume_WHILEM2:
4632 goto resume_point_WHILEM2;
4633 case resume_WHILEM3:
4634 goto resume_point_WHILEM3;
4635 case resume_WHILEM4:
4636 goto resume_point_WHILEM4;
4637 case resume_WHILEM5:
4638 goto resume_point_WHILEM5;
4639 case resume_WHILEM6:
4640 goto resume_point_WHILEM6;
4642 Perl_croak(aTHX_ "unexpected whilem resume state");
4647 state_num = st->resume_state;
4648 goto reenter_switch;
4655 Perl_croak(aTHX_ "unexpected yes resume state");
4659 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4660 PL_colors[4], PL_colors[5]));
4667 PerlIO_printf(Perl_debug_log,
4668 "%*s %sfailed...%s\n",
4669 REPORT_CODE_OFF+depth*2, "",
4670 PL_colors[4], PL_colors[5])
4677 /* there's a previous state to backtrack to */
4679 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4680 PL_regmatch_slab = PL_regmatch_slab->prev;
4681 st = SLAB_LAST(PL_regmatch_slab);
4683 PL_regmatch_state = st;
4687 locinput= st->locinput;
4688 nextchr = UCHARAT(locinput);
4690 DEBUG_STATE_pp("pop");
4692 if (yes_state == st)
4693 yes_state = st->u.yes.prev_yes_state;
4695 switch (st->resume_state) {
4697 goto resume_point_CURLYX;
4698 case resume_WHILEM1:
4699 goto resume_point_WHILEM1;
4700 case resume_WHILEM2:
4701 goto resume_point_WHILEM2;
4702 case resume_WHILEM3:
4703 goto resume_point_WHILEM3;
4704 case resume_WHILEM4:
4705 goto resume_point_WHILEM4;
4706 case resume_WHILEM5:
4707 goto resume_point_WHILEM5;
4708 case resume_WHILEM6:
4709 goto resume_point_WHILEM6;
4719 case CURLY_B_min_known:
4720 state_num = st->resume_state + 1; /* failure = success + 1 */
4721 goto reenter_switch;
4724 Perl_croak(aTHX_ "regexp resume memory corruption");
4730 /* restore original high-water mark */
4731 PL_regmatch_slab = orig_slab;
4732 PL_regmatch_state = orig_state;
4734 /* free all slabs above current one */
4735 if (orig_slab->next) {
4736 regmatch_slab *sl = orig_slab->next;
4737 orig_slab->next = NULL;
4739 regmatch_slab * const osl = sl;
4749 - regrepeat - repeatedly match something simple, report how many
4752 * [This routine now assumes that it will only match on things of length 1.
4753 * That was true before, but now we assume scan - reginput is the count,
4754 * rather than incrementing count on every character. [Er, except utf8.]]
4757 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4760 register char *scan;
4762 register char *loceol = PL_regeol;
4763 register I32 hardcount = 0;
4764 register bool do_utf8 = PL_reg_match_utf8;
4767 if (max == REG_INFTY)
4769 else if (max < loceol - scan)
4770 loceol = scan + max;
4775 while (scan < loceol && hardcount < max && *scan != '\n') {
4776 scan += UTF8SKIP(scan);
4780 while (scan < loceol && *scan != '\n')
4787 while (scan < loceol && hardcount < max) {
4788 scan += UTF8SKIP(scan);
4798 case EXACT: /* length of string is 1 */
4800 while (scan < loceol && UCHARAT(scan) == c)
4803 case EXACTF: /* length of string is 1 */
4805 while (scan < loceol &&
4806 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4809 case EXACTFL: /* length of string is 1 */
4810 PL_reg_flags |= RF_tainted;
4812 while (scan < loceol &&
4813 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4819 while (hardcount < max && scan < loceol &&
4820 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4821 scan += UTF8SKIP(scan);
4825 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4832 LOAD_UTF8_CHARCLASS_ALNUM();
4833 while (hardcount < max && scan < loceol &&
4834 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4835 scan += UTF8SKIP(scan);
4839 while (scan < loceol && isALNUM(*scan))
4844 PL_reg_flags |= RF_tainted;
4847 while (hardcount < max && scan < loceol &&
4848 isALNUM_LC_utf8((U8*)scan)) {
4849 scan += UTF8SKIP(scan);
4853 while (scan < loceol && isALNUM_LC(*scan))
4860 LOAD_UTF8_CHARCLASS_ALNUM();
4861 while (hardcount < max && scan < loceol &&
4862 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4863 scan += UTF8SKIP(scan);
4867 while (scan < loceol && !isALNUM(*scan))
4872 PL_reg_flags |= RF_tainted;
4875 while (hardcount < max && scan < loceol &&
4876 !isALNUM_LC_utf8((U8*)scan)) {
4877 scan += UTF8SKIP(scan);
4881 while (scan < loceol && !isALNUM_LC(*scan))
4888 LOAD_UTF8_CHARCLASS_SPACE();
4889 while (hardcount < max && scan < loceol &&
4891 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4892 scan += UTF8SKIP(scan);
4896 while (scan < loceol && isSPACE(*scan))
4901 PL_reg_flags |= RF_tainted;
4904 while (hardcount < max && scan < loceol &&
4905 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4906 scan += UTF8SKIP(scan);
4910 while (scan < loceol && isSPACE_LC(*scan))
4917 LOAD_UTF8_CHARCLASS_SPACE();
4918 while (hardcount < max && scan < loceol &&
4920 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4921 scan += UTF8SKIP(scan);
4925 while (scan < loceol && !isSPACE(*scan))
4930 PL_reg_flags |= RF_tainted;
4933 while (hardcount < max && scan < loceol &&
4934 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4935 scan += UTF8SKIP(scan);
4939 while (scan < loceol && !isSPACE_LC(*scan))
4946 LOAD_UTF8_CHARCLASS_DIGIT();
4947 while (hardcount < max && scan < loceol &&
4948 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4949 scan += UTF8SKIP(scan);
4953 while (scan < loceol && isDIGIT(*scan))
4960 LOAD_UTF8_CHARCLASS_DIGIT();
4961 while (hardcount < max && scan < loceol &&
4962 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4963 scan += UTF8SKIP(scan);
4967 while (scan < loceol && !isDIGIT(*scan))
4971 default: /* Called on something of 0 width. */
4972 break; /* So match right here or not at all. */
4978 c = scan - PL_reginput;
4982 GET_RE_DEBUG_FLAGS_DECL;
4984 SV * const prop = sv_newmortal();
4985 regprop(prog, prop, p);
4986 PerlIO_printf(Perl_debug_log,
4987 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4988 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4996 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4998 - regclass_swash - prepare the utf8 swash
5002 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5008 const struct reg_data * const data = prog ? prog->data : NULL;
5010 if (data && data->count) {
5011 const U32 n = ARG(node);
5013 if (data->what[n] == 's') {
5014 SV * const rv = (SV*)data->data[n];
5015 AV * const av = (AV*)SvRV((SV*)rv);
5016 SV **const ary = AvARRAY(av);
5019 /* See the end of regcomp.c:S_regclass() for
5020 * documentation of these array elements. */
5023 a = SvROK(ary[1]) ? &ary[1] : 0;
5024 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5028 else if (si && doinit) {
5029 sw = swash_init("utf8", "", si, 1, 0);
5030 (void)av_store(av, 1, sw);
5047 - reginclass - determine if a character falls into a character class
5049 The n is the ANYOF regnode, the p is the target string, lenp
5050 is pointer to the maximum length of how far to go in the p
5051 (if the lenp is zero, UTF8SKIP(p) is used),
5052 do_utf8 tells whether the target string is in UTF-8.
5057 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5060 const char flags = ANYOF_FLAGS(n);
5066 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5067 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5068 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5069 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5070 if (len == (STRLEN)-1)
5071 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5074 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5075 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5078 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5079 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5082 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5086 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5089 if (swash_fetch(sw, p, do_utf8))
5091 else if (flags & ANYOF_FOLD) {
5092 if (!match && lenp && av) {
5094 for (i = 0; i <= av_len(av); i++) {
5095 SV* const sv = *av_fetch(av, i, FALSE);
5097 const char * const s = SvPV_const(sv, len);
5099 if (len <= plen && memEQ(s, (char*)p, len)) {
5107 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5110 to_utf8_fold(p, tmpbuf, &tmplen);
5111 if (swash_fetch(sw, tmpbuf, do_utf8))
5117 if (match && lenp && *lenp == 0)
5118 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5120 if (!match && c < 256) {
5121 if (ANYOF_BITMAP_TEST(n, c))
5123 else if (flags & ANYOF_FOLD) {
5126 if (flags & ANYOF_LOCALE) {
5127 PL_reg_flags |= RF_tainted;
5128 f = PL_fold_locale[c];
5132 if (f != c && ANYOF_BITMAP_TEST(n, f))
5136 if (!match && (flags & ANYOF_CLASS)) {
5137 PL_reg_flags |= RF_tainted;
5139 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5140 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5141 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5142 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5143 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5144 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5145 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5146 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5147 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5148 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5149 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5150 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5151 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5152 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5153 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5154 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5155 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5156 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5157 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5158 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5159 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5160 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5161 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5162 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5163 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5164 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5165 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5166 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5167 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5168 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5169 ) /* How's that for a conditional? */
5176 return (flags & ANYOF_INVERT) ? !match : match;
5180 S_reghop3(U8 *s, I32 off, const U8* lim)
5184 while (off-- && s < lim) {
5185 /* XXX could check well-formedness here */
5190 while (off++ && s > lim) {
5192 if (UTF8_IS_CONTINUED(*s)) {
5193 while (s > lim && UTF8_IS_CONTINUATION(*s))
5196 /* XXX could check well-formedness here */
5203 /* there are a bunch of places where we use two reghop3's that should
5204 be replaced with this routine. but since thats not done yet
5205 we ifdef it out - dmq
5208 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5212 while (off-- && s < rlim) {
5213 /* XXX could check well-formedness here */
5218 while (off++ && s > llim) {
5220 if (UTF8_IS_CONTINUED(*s)) {
5221 while (s > llim && UTF8_IS_CONTINUATION(*s))
5224 /* XXX could check well-formedness here */
5232 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5236 while (off-- && s < lim) {
5237 /* XXX could check well-formedness here */
5244 while (off++ && s > lim) {
5246 if (UTF8_IS_CONTINUED(*s)) {
5247 while (s > lim && UTF8_IS_CONTINUATION(*s))
5250 /* XXX could check well-formedness here */
5259 restore_pos(pTHX_ void *arg)
5262 regexp * const rex = (regexp *)arg;
5263 if (PL_reg_eval_set) {
5264 if (PL_reg_oldsaved) {
5265 rex->subbeg = PL_reg_oldsaved;
5266 rex->sublen = PL_reg_oldsavedlen;
5267 #ifdef PERL_OLD_COPY_ON_WRITE
5268 rex->saved_copy = PL_nrs;
5270 RX_MATCH_COPIED_on(rex);
5272 PL_reg_magic->mg_len = PL_reg_oldpos;
5273 PL_reg_eval_set = 0;
5274 PL_curpm = PL_reg_oldcurpm;
5279 S_to_utf8_substr(pTHX_ register regexp *prog)
5281 if (prog->float_substr && !prog->float_utf8) {
5282 SV* const sv = newSVsv(prog->float_substr);
5283 prog->float_utf8 = sv;
5284 sv_utf8_upgrade(sv);
5285 if (SvTAIL(prog->float_substr))
5287 if (prog->float_substr == prog->check_substr)
5288 prog->check_utf8 = sv;
5290 if (prog->anchored_substr && !prog->anchored_utf8) {
5291 SV* const sv = newSVsv(prog->anchored_substr);
5292 prog->anchored_utf8 = sv;
5293 sv_utf8_upgrade(sv);
5294 if (SvTAIL(prog->anchored_substr))
5296 if (prog->anchored_substr == prog->check_substr)
5297 prog->check_utf8 = sv;
5302 S_to_byte_substr(pTHX_ register regexp *prog)
5305 if (prog->float_utf8 && !prog->float_substr) {
5306 SV* sv = newSVsv(prog->float_utf8);
5307 prog->float_substr = sv;
5308 if (sv_utf8_downgrade(sv, TRUE)) {
5309 if (SvTAIL(prog->float_utf8))
5313 prog->float_substr = sv = &PL_sv_undef;
5315 if (prog->float_utf8 == prog->check_utf8)
5316 prog->check_substr = sv;
5318 if (prog->anchored_utf8 && !prog->anchored_substr) {
5319 SV* sv = newSVsv(prog->anchored_utf8);
5320 prog->anchored_substr = sv;
5321 if (sv_utf8_downgrade(sv, TRUE)) {
5322 if (SvTAIL(prog->anchored_utf8))
5326 prog->anchored_substr = sv = &PL_sv_undef;
5328 if (prog->anchored_utf8 == prog->check_utf8)
5329 prog->check_substr = sv;
5335 * c-indentation-style: bsd
5337 * indent-tabs-mode: t
5340 * ex: set ts=8 sts=4 sw=4 noet: