5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 8
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regendp[p]);
193 SSPUSHINT(PL_regstartp[p]);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regstartp[p],
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regstartp);
205 SSPUSHPTR(PL_regendp);
206 SSPUSHINT(PL_regsize);
207 SSPUSHINT(*PL_reglastparen);
208 SSPUSHINT(*PL_reglastcloseparen);
209 SSPUSHPTR(PL_reginput);
210 #define REGCP_FRAME_ELEMS 2
211 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
212 * are needed for the regexp context stack bookkeeping. */
213 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
214 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
219 /* These are needed since we do not localize EVAL nodes: */
220 #define REGCP_SET(cp) \
222 PerlIO_printf(Perl_debug_log, \
223 " Setting an EVAL scope, savestack=%"IVdf"\n", \
224 (IV)PL_savestack_ix)); \
227 #define REGCP_UNWIND(cp) \
229 if (cp != PL_savestack_ix) \
230 PerlIO_printf(Perl_debug_log, \
231 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
232 (IV)(cp), (IV)PL_savestack_ix)); \
236 S_regcppop(pTHX_ const regexp *rex)
242 GET_RE_DEBUG_FLAGS_DECL;
244 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
246 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
247 i = SSPOPINT; /* Parentheses elements to pop. */
248 input = (char *) SSPOPPTR;
249 *PL_reglastcloseparen = SSPOPINT;
250 *PL_reglastparen = SSPOPINT;
251 PL_regsize = SSPOPINT;
252 PL_regendp=(I32 *) SSPOPPTR;
253 PL_regstartp=(I32 *) SSPOPPTR;
256 /* Now restore the parentheses context. */
257 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
258 i > 0; i -= REGCP_PAREN_ELEMS) {
260 U32 paren = (U32)SSPOPINT;
261 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
262 PL_regstartp[paren] = SSPOPINT;
264 if (paren <= *PL_reglastparen)
265 PL_regendp[paren] = tmps;
267 PerlIO_printf(Perl_debug_log,
268 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
269 (UV)paren, (IV)PL_regstartp[paren],
270 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
271 (IV)PL_regendp[paren],
272 (paren > *PL_reglastparen ? "(no)" : ""));
276 if (*PL_reglastparen + 1 <= rex->nparens) {
277 PerlIO_printf(Perl_debug_log,
278 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
279 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
283 /* It would seem that the similar code in regtry()
284 * already takes care of this, and in fact it is in
285 * a better location to since this code can #if 0-ed out
286 * but the code in regtry() is needed or otherwise tests
287 * requiring null fields (pat.t#187 and split.t#{13,14}
288 * (as of patchlevel 7877) will fail. Then again,
289 * this code seems to be necessary or otherwise
290 * building DynaLoader will fail:
291 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
293 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
295 PL_regstartp[i] = -1;
302 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
305 * pregexec and friends
308 #ifndef PERL_IN_XSUB_RE
310 - pregexec - match a regexp against a string
313 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
314 char *strbeg, I32 minend, SV *screamer, U32 nosave)
315 /* strend: pointer to null at end of string */
316 /* strbeg: real beginning of string */
317 /* minend: end of match must be >=minend after stringarg. */
318 /* nosave: For optimizations. */
321 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
322 nosave ? 0 : REXEC_COPY_STR);
327 * Need to implement the following flags for reg_anch:
329 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
331 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
332 * INTUIT_AUTORITATIVE_ML
333 * INTUIT_ONCE_NOML - Intuit can match in one location only.
336 * Another flag for this function: SECOND_TIME (so that float substrs
337 * with giant delta may be not rechecked).
340 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
342 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
343 Otherwise, only SvCUR(sv) is used to get strbeg. */
345 /* XXXX We assume that strpos is strbeg unless sv. */
347 /* XXXX Some places assume that there is a fixed substring.
348 An update may be needed if optimizer marks as "INTUITable"
349 RExen without fixed substrings. Similarly, it is assumed that
350 lengths of all the strings are no more than minlen, thus they
351 cannot come from lookahead.
352 (Or minlen should take into account lookahead.)
353 NOTE: Some of this comment is not correct. minlen does now take account
354 of lookahead/behind. Further research is required. -- demerphq
358 /* A failure to find a constant substring means that there is no need to make
359 an expensive call to REx engine, thus we celebrate a failure. Similarly,
360 finding a substring too deep into the string means that less calls to
361 regtry() should be needed.
363 REx compiler's optimizer found 4 possible hints:
364 a) Anchored substring;
366 c) Whether we are anchored (beginning-of-line or \G);
367 d) First node (of those at offset 0) which may distingush positions;
368 We use a)b)d) and multiline-part of c), and try to find a position in the
369 string which does not contradict any of them.
372 /* Most of decisions we do here should have been done at compile time.
373 The nodes of the REx which we used for the search should have been
374 deleted from the finite automaton. */
377 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
378 char *strend, U32 flags, re_scream_pos_data *data)
381 register I32 start_shift = 0;
382 /* Should be nonnegative! */
383 register I32 end_shift = 0;
388 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
390 register char *other_last = NULL; /* other substr checked before this */
391 char *check_at = NULL; /* check substr found at this pos */
392 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
393 RXi_GET_DECL(prog,progi);
395 const char * const i_strpos = strpos;
398 GET_RE_DEBUG_FLAGS_DECL;
400 RX_MATCH_UTF8_set(prog,do_utf8);
402 if (prog->extflags & RXf_UTF8) {
403 PL_reg_flags |= RF_utf8;
406 debug_start_match(prog, do_utf8, strpos, strend,
407 sv ? "Guessing start of match in sv for"
408 : "Guessing start of match in string for");
411 /* CHR_DIST() would be more correct here but it makes things slow. */
412 if (prog->minlen > strend - strpos) {
413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
414 "String too short... [re_intuit_start]\n"));
418 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
421 if (!prog->check_utf8 && prog->check_substr)
422 to_utf8_substr(prog);
423 check = prog->check_utf8;
425 if (!prog->check_substr && prog->check_utf8)
426 to_byte_substr(prog);
427 check = prog->check_substr;
429 if (check == &PL_sv_undef) {
430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
431 "Non-utf8 string cannot match utf8 check string\n"));
434 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
435 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
436 || ( (prog->extflags & RXf_ANCH_BOL)
437 && !multiline ) ); /* Check after \n? */
440 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
441 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
442 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
444 && (strpos != strbeg)) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
448 if (prog->check_offset_min == prog->check_offset_max &&
449 !(prog->extflags & RXf_CANY_SEEN)) {
450 /* Substring at constant offset from beg-of-str... */
453 s = HOP3c(strpos, prog->check_offset_min, strend);
456 slen = SvCUR(check); /* >= 1 */
458 if ( strend - s > slen || strend - s < slen - 1
459 || (strend - s == slen && strend[-1] != '\n')) {
460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
463 /* Now should match s[0..slen-2] */
465 if (slen && (*SvPVX_const(check) != *s
467 && memNE(SvPVX_const(check), s, slen)))) {
469 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
473 else if (*SvPVX_const(check) != *s
474 || ((slen = SvCUR(check)) > 1
475 && memNE(SvPVX_const(check), s, slen)))
478 goto success_at_start;
481 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
484 end_shift = prog->check_end_shift;
487 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
488 - (SvTAIL(check) != 0);
489 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
491 if (end_shift < eshift)
495 else { /* Can match at random position */
498 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
499 end_shift = prog->check_end_shift;
501 /* end shift should be non negative here */
504 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
506 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
507 (IV)end_shift, prog->precomp);
511 /* Find a possible match in the region s..strend by looking for
512 the "check" substring in the region corrected by start/end_shift. */
515 I32 srch_start_shift = start_shift;
516 I32 srch_end_shift = end_shift;
517 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
518 srch_end_shift -= ((strbeg - s) - srch_start_shift);
519 srch_start_shift = strbeg - s;
521 DEBUG_OPTIMISE_MORE_r({
522 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
523 (IV)prog->check_offset_min,
524 (IV)srch_start_shift,
526 (IV)prog->check_end_shift);
529 if (flags & REXEC_SCREAM) {
530 I32 p = -1; /* Internal iterator of scream. */
531 I32 * const pp = data ? data->scream_pos : &p;
533 if (PL_screamfirst[BmRARE(check)] >= 0
534 || ( BmRARE(check) == '\n'
535 && (BmPREVIOUS(check) == SvCUR(check) - 1)
537 s = screaminstr(sv, check,
538 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
541 /* we may be pointing at the wrong string */
542 if (s && RX_MATCH_COPIED(prog))
543 s = strbeg + (s - SvPVX_const(sv));
545 *data->scream_olds = s;
550 if (prog->extflags & RXf_CANY_SEEN) {
551 start_point= (U8*)(s + srch_start_shift);
552 end_point= (U8*)(strend - srch_end_shift);
554 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
555 end_point= HOP3(strend, -srch_end_shift, strbeg);
557 DEBUG_OPTIMISE_MORE_r({
558 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
559 (int)(end_point - start_point),
560 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
564 s = fbm_instr( start_point, end_point,
565 check, multiline ? FBMrf_MULTILINE : 0);
568 /* Update the count-of-usability, remove useless subpatterns,
572 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
573 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
574 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
575 (s ? "Found" : "Did not find"),
576 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
577 ? "anchored" : "floating"),
580 (s ? " at offset " : "...\n") );
585 /* Finish the diagnostic message */
586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
588 /* XXX dmq: first branch is for positive lookbehind...
589 Our check string is offset from the beginning of the pattern.
590 So we need to do any stclass tests offset forward from that
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
602 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
608 : (prog->float_substr && prog->anchored_substr))
610 /* Take into account the "other" substring. */
611 /* XXXX May be hopelessly wrong for UTF... */
614 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
617 char * const last = HOP3c(s, -start_shift, strbeg);
619 char * const saved_s = s;
622 t = s - prog->check_offset_max;
623 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
625 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
630 t = HOP3c(t, prog->anchored_offset, strend);
631 if (t < other_last) /* These positions already checked */
633 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
636 /* XXXX It is not documented what units *_offsets are in.
637 We assume bytes, but this is clearly wrong.
638 Meaning this code needs to be carefully reviewed for errors.
642 /* On end-of-str: see comment below. */
643 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
644 if (must == &PL_sv_undef) {
646 DEBUG_r(must = prog->anchored_utf8); /* for debug */
651 HOP3(HOP3(last1, prog->anchored_offset, strend)
652 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
654 multiline ? FBMrf_MULTILINE : 0
657 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
658 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
659 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
660 (s ? "Found" : "Contradicts"),
661 quoted, RE_SV_TAIL(must));
666 if (last1 >= last2) {
667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
668 ", giving up...\n"));
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
672 ", trying floating at offset %ld...\n",
673 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
674 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
675 s = HOP3c(last, 1, strend);
679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
680 (long)(s - i_strpos)));
681 t = HOP3c(s, -prog->anchored_offset, strbeg);
682 other_last = HOP3c(s, 1, strend);
690 else { /* Take into account the floating substring. */
692 char * const saved_s = s;
695 t = HOP3c(s, -start_shift, strbeg);
697 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
698 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
699 last = HOP3c(t, prog->float_max_offset, strend);
700 s = HOP3c(t, prog->float_min_offset, strend);
703 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
704 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
705 /* fbm_instr() takes into account exact value of end-of-str
706 if the check is SvTAIL(ed). Since false positives are OK,
707 and end-of-str is not later than strend we are OK. */
708 if (must == &PL_sv_undef) {
710 DEBUG_r(must = prog->float_utf8); /* for debug message */
713 s = fbm_instr((unsigned char*)s,
714 (unsigned char*)last + SvCUR(must)
716 must, multiline ? FBMrf_MULTILINE : 0);
718 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
719 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
720 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
721 (s ? "Found" : "Contradicts"),
722 quoted, RE_SV_TAIL(must));
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
727 ", giving up...\n"));
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
731 ", trying anchored starting at offset %ld...\n",
732 (long)(saved_s + 1 - i_strpos)));
734 s = HOP3c(t, 1, strend);
738 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
739 (long)(s - i_strpos)));
740 other_last = s; /* Fix this later. --Hugo */
750 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
752 DEBUG_OPTIMISE_MORE_r(
753 PerlIO_printf(Perl_debug_log,
754 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
755 (IV)prog->check_offset_min,
756 (IV)prog->check_offset_max,
764 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
766 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
769 /* Fixed substring is found far enough so that the match
770 cannot start at strpos. */
772 if (ml_anch && t[-1] != '\n') {
773 /* Eventually fbm_*() should handle this, but often
774 anchored_offset is not 0, so this check will not be wasted. */
775 /* XXXX In the code below we prefer to look for "^" even in
776 presence of anchored substrings. And we search even
777 beyond the found float position. These pessimizations
778 are historical artefacts only. */
780 while (t < strend - prog->minlen) {
782 if (t < check_at - prog->check_offset_min) {
783 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
784 /* Since we moved from the found position,
785 we definitely contradict the found anchored
786 substr. Due to the above check we do not
787 contradict "check" substr.
788 Thus we can arrive here only if check substr
789 is float. Redo checking for "other"=="fixed".
792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
793 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
794 goto do_other_anchored;
796 /* We don't contradict the found floating substring. */
797 /* XXXX Why not check for STCLASS? */
799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
800 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
803 /* Position contradicts check-string */
804 /* XXXX probably better to look for check-string
805 than for "\n", so one should lower the limit for t? */
806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
807 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
808 other_last = strpos = s = t + 1;
813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
814 PL_colors[0], PL_colors[1]));
818 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
819 PL_colors[0], PL_colors[1]));
823 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
826 /* The found string does not prohibit matching at strpos,
827 - no optimization of calling REx engine can be performed,
828 unless it was an MBOL and we are not after MBOL,
829 or a future STCLASS check will fail this. */
831 /* Even in this situation we may use MBOL flag if strpos is offset
832 wrt the start of the string. */
833 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
834 && (strpos != strbeg) && strpos[-1] != '\n'
835 /* May be due to an implicit anchor of m{.*foo} */
836 && !(prog->intflags & PREGf_IMPLICIT))
841 DEBUG_EXECUTE_r( if (ml_anch)
842 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
843 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
846 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
848 prog->check_utf8 /* Could be deleted already */
849 && --BmUSEFUL(prog->check_utf8) < 0
850 && (prog->check_utf8 == prog->float_utf8)
852 prog->check_substr /* Could be deleted already */
853 && --BmUSEFUL(prog->check_substr) < 0
854 && (prog->check_substr == prog->float_substr)
857 /* If flags & SOMETHING - do not do it many times on the same match */
858 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
859 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
860 if (do_utf8 ? prog->check_substr : prog->check_utf8)
861 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
862 prog->check_substr = prog->check_utf8 = NULL; /* disable */
863 prog->float_substr = prog->float_utf8 = NULL; /* clear */
864 check = NULL; /* abort */
866 /* XXXX This is a remnant of the old implementation. It
867 looks wasteful, since now INTUIT can use many
869 prog->extflags &= ~RXf_USE_INTUIT;
876 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
877 /* trie stclasses are too expensive to use here, we are better off to
878 leave it to regmatch itself */
879 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
880 /* minlen == 0 is possible if regstclass is \b or \B,
881 and the fixed substr is ''$.
882 Since minlen is already taken into account, s+1 is before strend;
883 accidentally, minlen >= 1 guaranties no false positives at s + 1
884 even for \b or \B. But (minlen? 1 : 0) below assumes that
885 regstclass does not come from lookahead... */
886 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
887 This leaves EXACTF only, which is dealt with in find_byclass(). */
888 const U8* const str = (U8*)STRING(progi->regstclass);
889 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
890 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
893 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
894 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
895 else if (prog->float_substr || prog->float_utf8)
896 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
901 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
904 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
907 const char *what = NULL;
909 if (endpos == strend) {
910 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
911 "Could not match STCLASS...\n") );
914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
915 "This position contradicts STCLASS...\n") );
916 if ((prog->extflags & RXf_ANCH) && !ml_anch)
918 /* Contradict one of substrings */
919 if (prog->anchored_substr || prog->anchored_utf8) {
920 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
921 DEBUG_EXECUTE_r( what = "anchored" );
923 s = HOP3c(t, 1, strend);
924 if (s + start_shift + end_shift > strend) {
925 /* XXXX Should be taken into account earlier? */
926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
927 "Could not match STCLASS...\n") );
932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
933 "Looking for %s substr starting at offset %ld...\n",
934 what, (long)(s + start_shift - i_strpos)) );
937 /* Have both, check_string is floating */
938 if (t + start_shift >= check_at) /* Contradicts floating=check */
939 goto retry_floating_check;
940 /* Recheck anchored substring, but not floating... */
944 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
945 "Looking for anchored substr starting at offset %ld...\n",
946 (long)(other_last - i_strpos)) );
947 goto do_other_anchored;
949 /* Another way we could have checked stclass at the
950 current position only: */
955 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
956 "Looking for /%s^%s/m starting at offset %ld...\n",
957 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
960 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
962 /* Check is floating subtring. */
963 retry_floating_check:
964 t = check_at - start_shift;
965 DEBUG_EXECUTE_r( what = "floating" );
966 goto hop_and_restart;
969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
970 "By STCLASS: moving %ld --> %ld\n",
971 (long)(t - i_strpos), (long)(s - i_strpos))
975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
976 "Does not contradict STCLASS...\n");
981 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
982 PL_colors[4], (check ? "Guessed" : "Giving up"),
983 PL_colors[5], (long)(s - i_strpos)) );
986 fail_finish: /* Substring not found */
987 if (prog->check_substr || prog->check_utf8) /* could be removed already */
988 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
991 PL_colors[4], PL_colors[5]));
997 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
998 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
999 switch (trie_type) { \
1000 case trie_utf8_fold: \
1001 if ( foldlen>0 ) { \
1002 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1007 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1008 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1009 foldlen -= UNISKIP( uvc ); \
1010 uscan = foldbuf + UNISKIP( uvc ); \
1014 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1022 charid = trie->charmap[ uvc ]; \
1026 if (widecharmap) { \
1027 SV** const svpp = hv_fetch(widecharmap, \
1028 (char*)&uvc, sizeof(UV), 0); \
1030 charid = (U16)SvIV(*svpp); \
1035 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1038 ibcmp_utf8(s, NULL, 0, do_utf8, \
1039 m, NULL, ln, (bool)UTF)) \
1040 && (!reginfo || regtry(reginfo, &s)) ) \
1043 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1044 uvchr_to_utf8(tmpbuf, c); \
1045 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1047 && (f == c1 || f == c2) \
1048 && (ln == foldlen || \
1049 !ibcmp_utf8((char *) foldbuf, \
1050 NULL, foldlen, do_utf8, \
1052 NULL, ln, (bool)UTF)) \
1053 && (!reginfo || regtry(reginfo, &s)) ) \
1058 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1062 && (ln == 1 || !(OP(c) == EXACTF \
1064 : ibcmp_locale(s, m, ln))) \
1065 && (!reginfo || regtry(reginfo, &s)) ) \
1071 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1073 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1079 #define REXEC_FBC_SCAN(CoDe) \
1081 while (s < strend) { \
1087 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1088 REXEC_FBC_UTF8_SCAN( \
1090 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1099 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1102 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1111 #define REXEC_FBC_TRYIT \
1112 if ((!reginfo || regtry(reginfo, &s))) \
1115 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1118 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1121 REXEC_FBC_CLASS_SCAN(CoNd); \
1125 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1126 PL_reg_flags |= RF_tainted; \
1128 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1131 REXEC_FBC_CLASS_SCAN(CoNd); \
1135 #define DUMP_EXEC_POS(li,s,doutf8) \
1136 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1138 /* We know what class REx starts with. Try to find this position... */
1139 /* if reginfo is NULL, its a dryrun */
1140 /* annoyingly all the vars in this routine have different names from their counterparts
1141 in regmatch. /grrr */
1144 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1145 const char *strend, regmatch_info *reginfo)
1148 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1152 register STRLEN uskip;
1156 register I32 tmp = 1; /* Scratch variable? */
1157 register const bool do_utf8 = PL_reg_match_utf8;
1158 RXi_GET_DECL(prog,progi);
1160 /* We know what class it must start with. */
1164 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1165 !UTF8_IS_INVARIANT((U8)s[0]) ?
1166 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1167 REGINCLASS(prog, c, (U8*)s));
1170 while (s < strend) {
1173 if (REGINCLASS(prog, c, (U8*)s) ||
1174 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1175 /* The assignment of 2 is intentional:
1176 * for the folded sharp s, the skip is 2. */
1177 (skip = SHARP_S_SKIP))) {
1178 if (tmp && (!reginfo || regtry(reginfo, &s)))
1191 if (tmp && (!reginfo || regtry(reginfo, &s)))
1199 ln = STR_LEN(c); /* length to match in octets/bytes */
1200 lnc = (I32) ln; /* length to match in characters */
1202 STRLEN ulen1, ulen2;
1204 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1205 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1208 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1209 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1211 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1213 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1216 while (sm < ((U8 *) m + ln)) {
1231 c2 = PL_fold_locale[c1];
1233 e = HOP3c(strend, -((I32)lnc), s);
1235 if (!reginfo && e < s)
1236 e = s; /* Due to minlen logic of intuit() */
1238 /* The idea in the EXACTF* cases is to first find the
1239 * first character of the EXACTF* node and then, if
1240 * necessary, case-insensitively compare the full
1241 * text of the node. The c1 and c2 are the first
1242 * characters (though in Unicode it gets a bit
1243 * more complicated because there are more cases
1244 * than just upper and lower: one needs to use
1245 * the so-called folding case for case-insensitive
1246 * matching (called "loose matching" in Unicode).
1247 * ibcmp_utf8() will do just that. */
1251 U8 tmpbuf [UTF8_MAXBYTES+1];
1252 STRLEN len, foldlen;
1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1255 /* Upper and lower of 1st char are equal -
1256 * probably not a "letter". */
1258 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1260 REXEC_FBC_EXACTISH_CHECK(c == c1);
1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1268 /* Handle some of the three Greek sigmas cases.
1269 * Note that not all the possible combinations
1270 * are handled here: some of them are handled
1271 * by the standard folding rules, and some of
1272 * them (the character class or ANYOF cases)
1273 * are handled during compiletime in
1274 * regexec.c:S_regclass(). */
1275 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1276 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1277 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1279 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1285 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1287 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1291 PL_reg_flags |= RF_tainted;
1298 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1299 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1301 tmp = ((OP(c) == BOUND ?
1302 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1303 LOAD_UTF8_CHARCLASS_ALNUM();
1304 REXEC_FBC_UTF8_SCAN(
1305 if (tmp == !(OP(c) == BOUND ?
1306 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1307 isALNUM_LC_utf8((U8*)s)))
1315 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1316 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1319 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1325 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1329 PL_reg_flags |= RF_tainted;
1336 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1337 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1339 tmp = ((OP(c) == NBOUND ?
1340 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1341 LOAD_UTF8_CHARCLASS_ALNUM();
1342 REXEC_FBC_UTF8_SCAN(
1343 if (tmp == !(OP(c) == NBOUND ?
1344 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1345 isALNUM_LC_utf8((U8*)s)))
1347 else REXEC_FBC_TRYIT;
1351 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1352 tmp = ((OP(c) == NBOUND ?
1353 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1356 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1358 else REXEC_FBC_TRYIT;
1361 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1365 REXEC_FBC_CSCAN_PRELOAD(
1366 LOAD_UTF8_CHARCLASS_ALNUM(),
1367 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1371 REXEC_FBC_CSCAN_TAINT(
1372 isALNUM_LC_utf8((U8*)s),
1376 REXEC_FBC_CSCAN_PRELOAD(
1377 LOAD_UTF8_CHARCLASS_ALNUM(),
1378 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1382 REXEC_FBC_CSCAN_TAINT(
1383 !isALNUM_LC_utf8((U8*)s),
1387 REXEC_FBC_CSCAN_PRELOAD(
1388 LOAD_UTF8_CHARCLASS_SPACE(),
1389 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1393 REXEC_FBC_CSCAN_TAINT(
1394 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1398 REXEC_FBC_CSCAN_PRELOAD(
1399 LOAD_UTF8_CHARCLASS_SPACE(),
1400 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1404 REXEC_FBC_CSCAN_TAINT(
1405 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1409 REXEC_FBC_CSCAN_PRELOAD(
1410 LOAD_UTF8_CHARCLASS_DIGIT(),
1411 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1415 REXEC_FBC_CSCAN_TAINT(
1416 isDIGIT_LC_utf8((U8*)s),
1420 REXEC_FBC_CSCAN_PRELOAD(
1421 LOAD_UTF8_CHARCLASS_DIGIT(),
1422 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1426 REXEC_FBC_CSCAN_TAINT(
1427 !isDIGIT_LC_utf8((U8*)s),
1433 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1434 trie_type = do_utf8 ?
1435 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1437 /* what trie are we using right now */
1439 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1441 = (reg_trie_data*)progi->data->data[ aho->trie ];
1442 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1444 const char *last_start = strend - trie->minlen;
1446 const char *real_start = s;
1448 STRLEN maxlen = trie->maxlen;
1450 U8 **points; /* map of where we were in the input string
1451 when reading a given char. For ASCII this
1452 is unnecessary overhead as the relationship
1453 is always 1:1, but for unicode, especially
1454 case folded unicode this is not true. */
1455 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459 GET_RE_DEBUG_FLAGS_DECL;
1461 /* We can't just allocate points here. We need to wrap it in
1462 * an SV so it gets freed properly if there is a croak while
1463 * running the match */
1466 sv_points=newSV(maxlen * sizeof(U8 *));
1467 SvCUR_set(sv_points,
1468 maxlen * sizeof(U8 *));
1469 SvPOK_on(sv_points);
1470 sv_2mortal(sv_points);
1471 points=(U8**)SvPV_nolen(sv_points );
1472 if ( trie_type != trie_utf8_fold
1473 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1476 bitmap=(U8*)trie->bitmap;
1478 bitmap=(U8*)ANYOF_BITMAP(c);
1480 /* this is the Aho-Corasick algorithm modified a touch
1481 to include special handling for long "unknown char"
1482 sequences. The basic idea being that we use AC as long
1483 as we are dealing with a possible matching char, when
1484 we encounter an unknown char (and we have not encountered
1485 an accepting state) we scan forward until we find a legal
1487 AC matching is basically that of trie matching, except
1488 that when we encounter a failing transition, we fall back
1489 to the current states "fail state", and try the current char
1490 again, a process we repeat until we reach the root state,
1491 state 1, or a legal transition. If we fail on the root state
1492 then we can either terminate if we have reached an accepting
1493 state previously, or restart the entire process from the beginning
1497 while (s <= last_start) {
1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1506 U8 *uscan = (U8*)NULL;
1507 U8 *leftmost = NULL;
1509 U32 accepted_word= 0;
1513 while ( state && uc <= (U8*)strend ) {
1515 U32 word = aho->states[ state ].wordnum;
1519 DEBUG_TRIE_EXECUTE_r(
1520 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 (char *)uc, do_utf8 );
1523 PerlIO_printf( Perl_debug_log,
1524 " Scanning for legal start char...\n");
1527 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1532 if (uc >(U8*)last_start) break;
1536 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1537 if (!leftmost || lpos < leftmost) {
1538 DEBUG_r(accepted_word=word);
1544 points[pointpos++ % maxlen]= uc;
1545 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1546 uscan, len, uvc, charid, foldlen,
1548 DEBUG_TRIE_EXECUTE_r({
1549 dump_exec_pos( (char *)uc, c, strend, real_start,
1551 PerlIO_printf(Perl_debug_log,
1552 " Charid:%3u CP:%4"UVxf" ",
1558 word = aho->states[ state ].wordnum;
1560 base = aho->states[ state ].trans.base;
1562 DEBUG_TRIE_EXECUTE_r({
1564 dump_exec_pos( (char *)uc, c, strend, real_start,
1566 PerlIO_printf( Perl_debug_log,
1567 "%sState: %4"UVxf", word=%"UVxf,
1568 failed ? " Fail transition to " : "",
1569 (UV)state, (UV)word);
1574 (base + charid > trie->uniquecharcount )
1575 && (base + charid - 1 - trie->uniquecharcount
1577 && trie->trans[base + charid - 1 -
1578 trie->uniquecharcount].check == state
1579 && (tmp=trie->trans[base + charid - 1 -
1580 trie->uniquecharcount ].next))
1582 DEBUG_TRIE_EXECUTE_r(
1583 PerlIO_printf( Perl_debug_log," - legal\n"));
1588 DEBUG_TRIE_EXECUTE_r(
1589 PerlIO_printf( Perl_debug_log," - fail\n"));
1591 state = aho->fail[state];
1595 /* we must be accepting here */
1596 DEBUG_TRIE_EXECUTE_r(
1597 PerlIO_printf( Perl_debug_log," - accepting\n"));
1606 if (!state) state = 1;
1609 if ( aho->states[ state ].wordnum ) {
1610 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1611 if (!leftmost || lpos < leftmost) {
1612 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1617 s = (char*)leftmost;
1618 DEBUG_TRIE_EXECUTE_r({
1620 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1621 (UV)accepted_word, (IV)(s - real_start)
1624 if (!reginfo || regtry(reginfo, &s)) {
1630 DEBUG_TRIE_EXECUTE_r({
1631 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1634 DEBUG_TRIE_EXECUTE_r(
1635 PerlIO_printf( Perl_debug_log,"No match.\n"));
1644 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1653 S_swap_match_buff (pTHX_ regexp *prog) {
1655 RXi_GET_DECL(prog,progi);
1658 /* We have to be careful. If the previous successful match
1659 was from this regex we don't want a subsequent paritally
1660 successful match to clobber the old results.
1661 So when we detect this possibility we add a swap buffer
1662 to the re, and switch the buffer each match. If we fail
1663 we switch it back, otherwise we leave it swapped.
1665 Newxz(progi->swap, 1, regexp_paren_ofs);
1666 /* no need to copy these */
1667 Newxz(progi->swap->startp, prog->nparens + 1, I32);
1668 Newxz(progi->swap->endp, prog->nparens + 1, I32);
1670 t = progi->swap->startp;
1671 progi->swap->startp = prog->startp;
1673 t = progi->swap->endp;
1674 progi->swap->endp = prog->endp;
1680 - regexec_flags - match a regexp against a string
1683 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1684 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1685 /* strend: pointer to null at end of string */
1686 /* strbeg: real beginning of string */
1687 /* minend: end of match must be >=minend after stringarg. */
1688 /* data: May be used for some additional optimizations.
1689 Currently its only used, with a U32 cast, for transmitting
1690 the ganch offset when doing a /g match. This will change */
1691 /* nosave: For optimizations. */
1694 /*register*/ char *s;
1695 register regnode *c;
1696 /*register*/ char *startpos = stringarg;
1697 I32 minlen; /* must match at least this many chars */
1698 I32 dontbother = 0; /* how many characters not to try at end */
1699 I32 end_shift = 0; /* Same for the end. */ /* CC */
1700 I32 scream_pos = -1; /* Internal iterator of scream. */
1701 char *scream_olds = NULL;
1702 SV* const oreplsv = GvSV(PL_replgv);
1703 const bool do_utf8 = (bool)DO_UTF8(sv);
1705 RXi_GET_DECL(prog,progi);
1706 regmatch_info reginfo; /* create some info to pass to regtry etc */
1707 bool swap_on_fail = 0;
1709 GET_RE_DEBUG_FLAGS_DECL;
1711 PERL_UNUSED_ARG(data);
1713 /* Be paranoid... */
1714 if (prog == NULL || startpos == NULL) {
1715 Perl_croak(aTHX_ "NULL regexp parameter");
1719 multiline = prog->extflags & RXf_PMf_MULTILINE;
1720 reginfo.prog = prog;
1722 RX_MATCH_UTF8_set(prog, do_utf8);
1724 debug_start_match(prog, do_utf8, startpos, strend,
1728 minlen = prog->minlen;
1730 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1731 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1732 "String too short [regexec_flags]...\n"));
1737 /* Check validity of program. */
1738 if (UCHARAT(progi->program) != REG_MAGIC) {
1739 Perl_croak(aTHX_ "corrupted regexp program");
1743 PL_reg_eval_set = 0;
1746 if (prog->extflags & RXf_UTF8)
1747 PL_reg_flags |= RF_utf8;
1749 /* Mark beginning of line for ^ and lookbehind. */
1750 reginfo.bol = startpos; /* XXX not used ??? */
1754 /* Mark end of line for $ (and such) */
1757 /* see how far we have to get to not match where we matched before */
1758 reginfo.till = startpos+minend;
1760 /* If there is a "must appear" string, look for it. */
1763 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1766 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1767 reginfo.ganch = startpos + prog->gofs;
1768 else if (sv && SvTYPE(sv) >= SVt_PVMG
1770 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1771 && mg->mg_len >= 0) {
1772 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1773 if (prog->extflags & RXf_ANCH_GPOS) {
1774 if (s > reginfo.ganch)
1776 s = reginfo.ganch - prog->gofs;
1780 reginfo.ganch = strbeg + PTR2UV(data);
1781 } else /* pos() not defined */
1782 reginfo.ganch = strbeg;
1784 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1786 swap_match_buff(prog); /* do we need a save destructor here for
1789 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1790 re_scream_pos_data d;
1792 d.scream_olds = &scream_olds;
1793 d.scream_pos = &scream_pos;
1794 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1797 goto phooey; /* not present */
1803 /* Simplest case: anchored match need be tried only once. */
1804 /* [unless only anchor is BOL and multiline is set] */
1805 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1806 if (s == startpos && regtry(®info, &startpos))
1808 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1809 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1814 dontbother = minlen - 1;
1815 end = HOP3c(strend, -dontbother, strbeg) - 1;
1816 /* for multiline we only have to try after newlines */
1817 if (prog->check_substr || prog->check_utf8) {
1821 if (regtry(®info, &s))
1826 if (prog->extflags & RXf_USE_INTUIT) {
1827 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1838 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1839 if (regtry(®info, &s))
1846 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1848 /* the warning about reginfo.ganch being used without intialization
1849 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1850 and we only enter this block when the same bit is set. */
1851 char *tmp_s = reginfo.ganch - prog->gofs;
1852 if (regtry(®info, &tmp_s))
1857 /* Messy cases: unanchored match. */
1858 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1859 /* we have /x+whatever/ */
1860 /* it must be a one character string (XXXX Except UTF?) */
1865 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1872 DEBUG_EXECUTE_r( did_match = 1 );
1873 if (regtry(®info, &s)) goto got_it;
1875 while (s < strend && *s == ch)
1883 DEBUG_EXECUTE_r( did_match = 1 );
1884 if (regtry(®info, &s)) goto got_it;
1886 while (s < strend && *s == ch)
1891 DEBUG_EXECUTE_r(if (!did_match)
1892 PerlIO_printf(Perl_debug_log,
1893 "Did not find anchored character...\n")
1896 else if (prog->anchored_substr != NULL
1897 || prog->anchored_utf8 != NULL
1898 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1899 && prog->float_max_offset < strend - s)) {
1904 char *last1; /* Last position checked before */
1908 if (prog->anchored_substr || prog->anchored_utf8) {
1909 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1910 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1911 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1912 back_max = back_min = prog->anchored_offset;
1914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1915 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1916 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1917 back_max = prog->float_max_offset;
1918 back_min = prog->float_min_offset;
1922 if (must == &PL_sv_undef)
1923 /* could not downgrade utf8 check substring, so must fail */
1929 last = HOP3c(strend, /* Cannot start after this */
1930 -(I32)(CHR_SVLEN(must)
1931 - (SvTAIL(must) != 0) + back_min), strbeg);
1934 last1 = HOPc(s, -1);
1936 last1 = s - 1; /* bogus */
1938 /* XXXX check_substr already used to find "s", can optimize if
1939 check_substr==must. */
1941 dontbother = end_shift;
1942 strend = HOPc(strend, -dontbother);
1943 while ( (s <= last) &&
1944 ((flags & REXEC_SCREAM)
1945 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1946 end_shift, &scream_pos, 0))
1947 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1948 (unsigned char*)strend, must,
1949 multiline ? FBMrf_MULTILINE : 0))) ) {
1950 /* we may be pointing at the wrong string */
1951 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1952 s = strbeg + (s - SvPVX_const(sv));
1953 DEBUG_EXECUTE_r( did_match = 1 );
1954 if (HOPc(s, -back_max) > last1) {
1955 last1 = HOPc(s, -back_min);
1956 s = HOPc(s, -back_max);
1959 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1961 last1 = HOPc(s, -back_min);
1965 while (s <= last1) {
1966 if (regtry(®info, &s))
1972 while (s <= last1) {
1973 if (regtry(®info, &s))
1979 DEBUG_EXECUTE_r(if (!did_match) {
1980 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1981 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1982 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1983 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1984 ? "anchored" : "floating"),
1985 quoted, RE_SV_TAIL(must));
1989 else if ( (c = progi->regstclass) ) {
1991 const OPCODE op = OP(progi->regstclass);
1992 /* don't bother with what can't match */
1993 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1994 strend = HOPc(strend, -(minlen - 1));
1997 SV * const prop = sv_newmortal();
1998 regprop(prog, prop, c);
2000 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
2002 PerlIO_printf(Perl_debug_log,
2003 "Matching stclass %.*s against %s (%d chars)\n",
2004 (int)SvCUR(prop), SvPVX_const(prop),
2005 quoted, (int)(strend - s));
2008 if (find_byclass(prog, c, s, strend, ®info))
2010 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2014 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2019 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2020 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2021 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2023 if (flags & REXEC_SCREAM) {
2024 last = screaminstr(sv, float_real, s - strbeg,
2025 end_shift, &scream_pos, 1); /* last one */
2027 last = scream_olds; /* Only one occurrence. */
2028 /* we may be pointing at the wrong string */
2029 else if (RX_MATCH_COPIED(prog))
2030 s = strbeg + (s - SvPVX_const(sv));
2034 const char * const little = SvPV_const(float_real, len);
2036 if (SvTAIL(float_real)) {
2037 if (memEQ(strend - len + 1, little, len - 1))
2038 last = strend - len + 1;
2039 else if (!multiline)
2040 last = memEQ(strend - len, little, len)
2041 ? strend - len : NULL;
2047 last = rninstr(s, strend, little, little + len);
2049 last = strend; /* matching "$" */
2054 PerlIO_printf(Perl_debug_log,
2055 "%sCan't trim the tail, match fails (should not happen)%s\n",
2056 PL_colors[4], PL_colors[5]));
2057 goto phooey; /* Should not happen! */
2059 dontbother = strend - last + prog->float_min_offset;
2061 if (minlen && (dontbother < minlen))
2062 dontbother = minlen - 1;
2063 strend -= dontbother; /* this one's always in bytes! */
2064 /* We don't know much -- general case. */
2067 if (regtry(®info, &s))
2076 if (regtry(®info, &s))
2078 } while (s++ < strend);
2086 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2088 if (PL_reg_eval_set) {
2089 /* Preserve the current value of $^R */
2090 if (oreplsv != GvSV(PL_replgv))
2091 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2092 restored, the value remains
2094 restore_pos(aTHX_ prog);
2096 if (prog->paren_names)
2097 (void)hv_iterinit(prog->paren_names);
2099 /* make sure $`, $&, $', and $digit will work later */
2100 if ( !(flags & REXEC_NOT_FIRST) ) {
2101 RX_MATCH_COPY_FREE(prog);
2102 if (flags & REXEC_COPY_STR) {
2103 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2104 #ifdef PERL_OLD_COPY_ON_WRITE
2106 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2108 PerlIO_printf(Perl_debug_log,
2109 "Copy on write: regexp capture, type %d\n",
2112 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2113 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2114 assert (SvPOKp(prog->saved_copy));
2118 RX_MATCH_COPIED_on(prog);
2119 s = savepvn(strbeg, i);
2125 prog->subbeg = strbeg;
2126 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2133 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2134 PL_colors[4], PL_colors[5]));
2135 if (PL_reg_eval_set)
2136 restore_pos(aTHX_ prog);
2138 /* we failed :-( roll it back */
2139 swap_match_buff(prog);
2148 - regtry - try match at specific point
2150 STATIC I32 /* 0 failure, 1 success */
2151 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2157 regexp *prog = reginfo->prog;
2158 RXi_GET_DECL(prog,progi);
2159 GET_RE_DEBUG_FLAGS_DECL;
2160 reginfo->cutpoint=NULL;
2162 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2165 PL_reg_eval_set = RS_init;
2166 DEBUG_EXECUTE_r(DEBUG_s(
2167 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2168 (IV)(PL_stack_sp - PL_stack_base));
2171 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2172 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2174 /* Apparently this is not needed, judging by wantarray. */
2175 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2176 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2179 /* Make $_ available to executed code. */
2180 if (reginfo->sv != DEFSV) {
2182 DEFSV = reginfo->sv;
2185 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2186 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2187 /* prepare for quick setting of pos */
2188 #ifdef PERL_OLD_COPY_ON_WRITE
2189 if (SvIsCOW(reginfo->sv))
2190 sv_force_normal_flags(reginfo->sv, 0);
2192 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2193 &PL_vtbl_mglob, NULL, 0);
2197 PL_reg_oldpos = mg->mg_len;
2198 SAVEDESTRUCTOR_X(restore_pos, prog);
2200 if (!PL_reg_curpm) {
2201 Newxz(PL_reg_curpm, 1, PMOP);
2204 SV* const repointer = newSViv(0);
2205 /* so we know which PL_regex_padav element is PL_reg_curpm */
2206 SvFLAGS(repointer) |= SVf_BREAK;
2207 av_push(PL_regex_padav,repointer);
2208 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2209 PL_regex_pad = AvARRAY(PL_regex_padav);
2213 PM_SETRE(PL_reg_curpm, prog);
2214 PL_reg_oldcurpm = PL_curpm;
2215 PL_curpm = PL_reg_curpm;
2216 if (RX_MATCH_COPIED(prog)) {
2217 /* Here is a serious problem: we cannot rewrite subbeg,
2218 since it may be needed if this match fails. Thus
2219 $` inside (?{}) could fail... */
2220 PL_reg_oldsaved = prog->subbeg;
2221 PL_reg_oldsavedlen = prog->sublen;
2222 #ifdef PERL_OLD_COPY_ON_WRITE
2223 PL_nrs = prog->saved_copy;
2225 RX_MATCH_COPIED_off(prog);
2228 PL_reg_oldsaved = NULL;
2229 prog->subbeg = PL_bostr;
2230 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2232 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2233 prog->startp[0] = *startpos - PL_bostr;
2234 PL_reginput = *startpos;
2235 PL_reglastparen = &prog->lastparen;
2236 PL_reglastcloseparen = &prog->lastcloseparen;
2237 prog->lastparen = 0;
2238 prog->lastcloseparen = 0;
2240 PL_regstartp = prog->startp;
2241 PL_regendp = prog->endp;
2242 if (PL_reg_start_tmpl <= prog->nparens) {
2243 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2244 if(PL_reg_start_tmp)
2245 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2247 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2250 /* XXXX What this code is doing here?!!! There should be no need
2251 to do this again and again, PL_reglastparen should take care of
2254 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2255 * Actually, the code in regcppop() (which Ilya may be meaning by
2256 * PL_reglastparen), is not needed at all by the test suite
2257 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2258 * enough, for building DynaLoader, or otherwise this
2259 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2260 * will happen. Meanwhile, this code *is* needed for the
2261 * above-mentioned test suite tests to succeed. The common theme
2262 * on those tests seems to be returning null fields from matches.
2267 if (prog->nparens) {
2269 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2276 if (regmatch(reginfo, progi->program + 1)) {
2277 PL_regendp[0] = PL_reginput - PL_bostr;
2280 if (reginfo->cutpoint)
2281 *startpos= reginfo->cutpoint;
2282 REGCP_UNWIND(lastcp);
2287 #define sayYES goto yes
2288 #define sayNO goto no
2289 #define sayNO_SILENT goto no_silent
2291 /* we dont use STMT_START/END here because it leads to
2292 "unreachable code" warnings, which are bogus, but distracting. */
2293 #define CACHEsayNO \
2294 if (ST.cache_mask) \
2295 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2298 /* this is used to determine how far from the left messages like
2299 'failed...' are printed. It should be set such that messages
2300 are inline with the regop output that created them.
2302 #define REPORT_CODE_OFF 32
2305 /* Make sure there is a test for this +1 options in re_tests */
2306 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2308 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2309 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2311 #define SLAB_FIRST(s) (&(s)->states[0])
2312 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2314 /* grab a new slab and return the first slot in it */
2316 STATIC regmatch_state *
2319 #if PERL_VERSION < 9
2322 regmatch_slab *s = PL_regmatch_slab->next;
2324 Newx(s, 1, regmatch_slab);
2325 s->prev = PL_regmatch_slab;
2327 PL_regmatch_slab->next = s;
2329 PL_regmatch_slab = s;
2330 return SLAB_FIRST(s);
2334 /* push a new state then goto it */
2336 #define PUSH_STATE_GOTO(state, node) \
2338 st->resume_state = state; \
2341 /* push a new state with success backtracking, then goto it */
2343 #define PUSH_YES_STATE_GOTO(state, node) \
2345 st->resume_state = state; \
2346 goto push_yes_state;
2352 regmatch() - main matching routine
2354 This is basically one big switch statement in a loop. We execute an op,
2355 set 'next' to point the next op, and continue. If we come to a point which
2356 we may need to backtrack to on failure such as (A|B|C), we push a
2357 backtrack state onto the backtrack stack. On failure, we pop the top
2358 state, and re-enter the loop at the state indicated. If there are no more
2359 states to pop, we return failure.
2361 Sometimes we also need to backtrack on success; for example /A+/, where
2362 after successfully matching one A, we need to go back and try to
2363 match another one; similarly for lookahead assertions: if the assertion
2364 completes successfully, we backtrack to the state just before the assertion
2365 and then carry on. In these cases, the pushed state is marked as
2366 'backtrack on success too'. This marking is in fact done by a chain of
2367 pointers, each pointing to the previous 'yes' state. On success, we pop to
2368 the nearest yes state, discarding any intermediate failure-only states.
2369 Sometimes a yes state is pushed just to force some cleanup code to be
2370 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2371 it to free the inner regex.
2373 Note that failure backtracking rewinds the cursor position, while
2374 success backtracking leaves it alone.
2376 A pattern is complete when the END op is executed, while a subpattern
2377 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2378 ops trigger the "pop to last yes state if any, otherwise return true"
2381 A common convention in this function is to use A and B to refer to the two
2382 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2383 the subpattern to be matched possibly multiple times, while B is the entire
2384 rest of the pattern. Variable and state names reflect this convention.
2386 The states in the main switch are the union of ops and failure/success of
2387 substates associated with with that op. For example, IFMATCH is the op
2388 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2389 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2390 successfully matched A and IFMATCH_A_fail is a state saying that we have
2391 just failed to match A. Resume states always come in pairs. The backtrack
2392 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2393 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2394 on success or failure.
2396 The struct that holds a backtracking state is actually a big union, with
2397 one variant for each major type of op. The variable st points to the
2398 top-most backtrack struct. To make the code clearer, within each
2399 block of code we #define ST to alias the relevant union.
2401 Here's a concrete example of a (vastly oversimplified) IFMATCH
2407 #define ST st->u.ifmatch
2409 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2410 ST.foo = ...; // some state we wish to save
2412 // push a yes backtrack state with a resume value of
2413 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2415 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2418 case IFMATCH_A: // we have successfully executed A; now continue with B
2420 bar = ST.foo; // do something with the preserved value
2423 case IFMATCH_A_fail: // A failed, so the assertion failed
2424 ...; // do some housekeeping, then ...
2425 sayNO; // propagate the failure
2432 For any old-timers reading this who are familiar with the old recursive
2433 approach, the code above is equivalent to:
2435 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2444 ...; // do some housekeeping, then ...
2445 sayNO; // propagate the failure
2448 The topmost backtrack state, pointed to by st, is usually free. If you
2449 want to claim it, populate any ST.foo fields in it with values you wish to
2450 save, then do one of
2452 PUSH_STATE_GOTO(resume_state, node);
2453 PUSH_YES_STATE_GOTO(resume_state, node);
2455 which sets that backtrack state's resume value to 'resume_state', pushes a
2456 new free entry to the top of the backtrack stack, then goes to 'node'.
2457 On backtracking, the free slot is popped, and the saved state becomes the
2458 new free state. An ST.foo field in this new top state can be temporarily
2459 accessed to retrieve values, but once the main loop is re-entered, it
2460 becomes available for reuse.
2462 Note that the depth of the backtrack stack constantly increases during the
2463 left-to-right execution of the pattern, rather than going up and down with
2464 the pattern nesting. For example the stack is at its maximum at Z at the
2465 end of the pattern, rather than at X in the following:
2467 /(((X)+)+)+....(Y)+....Z/
2469 The only exceptions to this are lookahead/behind assertions and the cut,
2470 (?>A), which pop all the backtrack states associated with A before
2473 Bascktrack state structs are allocated in slabs of about 4K in size.
2474 PL_regmatch_state and st always point to the currently active state,
2475 and PL_regmatch_slab points to the slab currently containing
2476 PL_regmatch_state. The first time regmatch() is called, the first slab is
2477 allocated, and is never freed until interpreter destruction. When the slab
2478 is full, a new one is allocated and chained to the end. At exit from
2479 regmatch(), slabs allocated since entry are freed.
2484 #define DEBUG_STATE_pp(pp) \
2486 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2487 PerlIO_printf(Perl_debug_log, \
2488 " %*s"pp" %s%s%s%s%s\n", \
2490 reg_name[st->resume_state], \
2491 ((st==yes_state||st==mark_state) ? "[" : ""), \
2492 ((st==yes_state) ? "Y" : ""), \
2493 ((st==mark_state) ? "M" : ""), \
2494 ((st==yes_state||st==mark_state) ? "]" : "") \
2499 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2504 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2505 const char *start, const char *end, const char *blurb)
2507 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2511 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2512 prog->precomp, prog->prelen, 60);
2514 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2515 start, end - start, 60);
2517 PerlIO_printf(Perl_debug_log,
2518 "%s%s REx%s %s against %s\n",
2519 PL_colors[4], blurb, PL_colors[5], s0, s1);
2521 if (do_utf8||utf8_pat)
2522 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2523 utf8_pat ? "pattern" : "",
2524 utf8_pat && do_utf8 ? " and " : "",
2525 do_utf8 ? "string" : ""
2531 S_dump_exec_pos(pTHX_ const char *locinput,
2532 const regnode *scan,
2533 const char *loc_regeol,
2534 const char *loc_bostr,
2535 const char *loc_reg_starttry,
2538 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2539 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2540 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2541 /* The part of the string before starttry has one color
2542 (pref0_len chars), between starttry and current
2543 position another one (pref_len - pref0_len chars),
2544 after the current position the third one.
2545 We assume that pref0_len <= pref_len, otherwise we
2546 decrease pref0_len. */
2547 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2548 ? (5 + taill) - l : locinput - loc_bostr;
2551 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2553 pref0_len = pref_len - (locinput - loc_reg_starttry);
2554 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2555 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2556 ? (5 + taill) - pref_len : loc_regeol - locinput);
2557 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2561 if (pref0_len > pref_len)
2562 pref0_len = pref_len;
2564 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2566 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2567 (locinput - pref_len),pref0_len, 60, 4, 5);
2569 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2570 (locinput - pref_len + pref0_len),
2571 pref_len - pref0_len, 60, 2, 3);
2573 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2574 locinput, loc_regeol - locinput, 10, 0, 1);
2576 const STRLEN tlen=len0+len1+len2;
2577 PerlIO_printf(Perl_debug_log,
2578 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2579 (IV)(locinput - loc_bostr),
2582 (docolor ? "" : "> <"),
2584 (int)(tlen > 19 ? 0 : 19 - tlen),
2591 /* reg_check_named_buff_matched()
2592 * Checks to see if a named buffer has matched. The data array of
2593 * buffer numbers corresponding to the buffer is expected to reside
2594 * in the regexp->data->data array in the slot stored in the ARG() of
2595 * node involved. Note that this routine doesn't actually care about the
2596 * name, that information is not preserved from compilation to execution.
2597 * Returns the index of the leftmost defined buffer with the given name
2598 * or 0 if non of the buffers matched.
2601 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2603 RXi_GET_DECL(rex,rexi);
2604 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2605 I32 *nums=(I32*)SvPVX(sv_dat);
2606 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2607 if ((I32)*PL_reglastparen >= nums[n] &&
2608 PL_regendp[nums[n]] != -1)
2616 STATIC I32 /* 0 failure, 1 success */
2617 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2619 #if PERL_VERSION < 9
2623 register const bool do_utf8 = PL_reg_match_utf8;
2624 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2626 regexp *rex = reginfo->prog;
2627 RXi_GET_DECL(rex,rexi);
2629 regmatch_slab *orig_slab;
2630 regmatch_state *orig_state;
2632 /* the current state. This is a cached copy of PL_regmatch_state */
2633 register regmatch_state *st;
2635 /* cache heavy used fields of st in registers */
2636 register regnode *scan;
2637 register regnode *next;
2638 register U32 n = 0; /* general value; init to avoid compiler warning */
2639 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2640 register char *locinput = PL_reginput;
2641 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2643 bool result = 0; /* return value of S_regmatch */
2644 int depth = 0; /* depth of backtrack stack */
2645 int nochange_depth = 0; /* depth of GOSUB recursion with nochange*/
2646 regmatch_state *yes_state = NULL; /* state to pop to on success of
2648 /* mark_state piggy backs on the yes_state logic so that when we unwind
2649 the stack on success we can update the mark_state as we go */
2650 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2652 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2653 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2655 bool no_final = 0; /* prevent failure from backtracking? */
2656 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2657 char *startpoint = PL_reginput;
2658 SV *popmark = NULL; /* are we looking for a mark? */
2659 SV *sv_commit = NULL; /* last mark name seen in failure */
2660 SV *sv_yes_mark = NULL; /* last mark name we have seen
2661 during a successfull match */
2662 U32 lastopen = 0; /* last open we saw */
2663 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2666 /* these three flags are set by various ops to signal information to
2667 * the very next op. They have a useful lifetime of exactly one loop
2668 * iteration, and are not preserved or restored by state pushes/pops
2670 bool sw = 0; /* the condition value in (?(cond)a|b) */
2671 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2672 int logical = 0; /* the following EVAL is:
2676 or the following IFMATCH/UNLESSM is:
2677 false: plain (?=foo)
2678 true: used as a condition: (?(?=foo))
2682 GET_RE_DEBUG_FLAGS_DECL;
2686 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2688 /* on first ever call to regmatch, allocate first slab */
2689 if (!PL_regmatch_slab) {
2690 Newx(PL_regmatch_slab, 1, regmatch_slab);
2691 PL_regmatch_slab->prev = NULL;
2692 PL_regmatch_slab->next = NULL;
2693 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2696 /* remember current high-water mark for exit */
2697 /* XXX this should be done with SAVE* instead */
2698 orig_slab = PL_regmatch_slab;
2699 orig_state = PL_regmatch_state;
2701 /* grab next free state slot */
2702 st = ++PL_regmatch_state;
2703 if (st > SLAB_LAST(PL_regmatch_slab))
2704 st = PL_regmatch_state = S_push_slab(aTHX);
2706 /* Note that nextchr is a byte even in UTF */
2707 nextchr = UCHARAT(locinput);
2709 while (scan != NULL) {
2712 SV * const prop = sv_newmortal();
2713 regnode *rnext=regnext(scan);
2714 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2715 regprop(rex, prop, scan);
2717 PerlIO_printf(Perl_debug_log,
2718 "%3"IVdf":%*s%s(%"IVdf")\n",
2719 (IV)(scan - rexi->program), depth*2, "",
2721 (PL_regkind[OP(scan)] == END || !rnext) ?
2722 0 : (IV)(rnext - rexi->program));
2725 next = scan + NEXT_OFF(scan);
2728 state_num = OP(scan);
2731 switch (state_num) {
2733 if (locinput == PL_bostr)
2735 /* reginfo->till = reginfo->bol; */
2740 if (locinput == PL_bostr ||
2741 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2747 if (locinput == PL_bostr)
2751 if (locinput == reginfo->ganch)
2756 /* update the startpoint */
2757 st->u.keeper.val = PL_regstartp[0];
2758 PL_reginput = locinput;
2759 PL_regstartp[0] = locinput - PL_bostr;
2760 PUSH_STATE_GOTO(KEEPS_next, next);
2762 case KEEPS_next_fail:
2763 /* rollback the start point change */
2764 PL_regstartp[0] = st->u.keeper.val;
2770 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2775 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2777 if (PL_regeol - locinput > 1)
2781 if (PL_regeol != locinput)
2785 if (!nextchr && locinput >= PL_regeol)
2788 locinput += PL_utf8skip[nextchr];
2789 if (locinput > PL_regeol)
2791 nextchr = UCHARAT(locinput);
2794 nextchr = UCHARAT(++locinput);
2797 if (!nextchr && locinput >= PL_regeol)
2799 nextchr = UCHARAT(++locinput);
2802 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2805 locinput += PL_utf8skip[nextchr];
2806 if (locinput > PL_regeol)
2808 nextchr = UCHARAT(locinput);
2811 nextchr = UCHARAT(++locinput);
2815 #define ST st->u.trie
2817 /* In this case the charclass data is available inline so
2818 we can fail fast without a lot of extra overhead.
2820 if (scan->flags == EXACT || !do_utf8) {
2821 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2823 PerlIO_printf(Perl_debug_log,
2824 "%*s %sfailed to match trie start class...%s\n",
2825 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2834 /* what type of TRIE am I? (utf8 makes this contextual) */
2835 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2836 trie_type = do_utf8 ?
2837 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2840 /* what trie are we using right now */
2841 reg_trie_data * const trie
2842 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2843 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2844 U32 state = trie->startstate;
2846 if (trie->bitmap && trie_type != trie_utf8_fold &&
2847 !TRIE_BITMAP_TEST(trie,*locinput)
2849 if (trie->states[ state ].wordnum) {
2851 PerlIO_printf(Perl_debug_log,
2852 "%*s %smatched empty string...%s\n",
2853 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2858 PerlIO_printf(Perl_debug_log,
2859 "%*s %sfailed to match trie start class...%s\n",
2860 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2867 U8 *uc = ( U8* )locinput;
2871 U8 *uscan = (U8*)NULL;
2873 SV *sv_accept_buff = NULL;
2874 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2876 ST.accepted = 0; /* how many accepting states we have seen */
2878 ST.jump = trie->jump;
2881 traverse the TRIE keeping track of all accepting states
2882 we transition through until we get to a failing node.
2885 while ( state && uc <= (U8*)PL_regeol ) {
2886 U32 base = trie->states[ state ].trans.base;
2889 /* We use charid to hold the wordnum as we don't use it
2890 for charid until after we have done the wordnum logic.
2891 We define an alias just so that the wordnum logic reads
2894 #define got_wordnum charid
2895 got_wordnum = trie->states[ state ].wordnum;
2897 if ( got_wordnum ) {
2898 if ( ! ST.accepted ) {
2901 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2902 sv_accept_buff=newSV(bufflen *
2903 sizeof(reg_trie_accepted) - 1);
2904 SvCUR_set(sv_accept_buff, 0);
2905 SvPOK_on(sv_accept_buff);
2906 sv_2mortal(sv_accept_buff);
2909 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2912 if (ST.accepted >= bufflen) {
2914 ST.accept_buff =(reg_trie_accepted*)
2915 SvGROW(sv_accept_buff,
2916 bufflen * sizeof(reg_trie_accepted));
2918 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2919 + sizeof(reg_trie_accepted));
2922 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2923 ST.accept_buff[ST.accepted].endpos = uc;
2925 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2929 DEBUG_TRIE_EXECUTE_r({
2930 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2931 PerlIO_printf( Perl_debug_log,
2932 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2933 2+depth * 2, "", PL_colors[4],
2934 (UV)state, (UV)ST.accepted );
2938 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2939 uscan, len, uvc, charid, foldlen,
2943 (base + charid > trie->uniquecharcount )
2944 && (base + charid - 1 - trie->uniquecharcount
2946 && trie->trans[base + charid - 1 -
2947 trie->uniquecharcount].check == state)
2949 state = trie->trans[base + charid - 1 -
2950 trie->uniquecharcount ].next;
2961 DEBUG_TRIE_EXECUTE_r(
2962 PerlIO_printf( Perl_debug_log,
2963 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2964 charid, uvc, (UV)state, PL_colors[5] );
2971 PerlIO_printf( Perl_debug_log,
2972 "%*s %sgot %"IVdf" possible matches%s\n",
2973 REPORT_CODE_OFF + depth * 2, "",
2974 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2977 goto trie_first_try; /* jump into the fail handler */
2979 case TRIE_next_fail: /* we failed - try next alterative */
2981 REGCP_UNWIND(ST.cp);
2982 for (n = *PL_reglastparen; n > ST.lastparen; n--)
2984 *PL_reglastparen = n;
2993 ST.lastparen = *PL_reglastparen;
2996 if ( ST.accepted == 1 ) {
2997 /* only one choice left - just continue */
2999 AV *const trie_words
3000 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3001 SV ** const tmp = av_fetch( trie_words,
3002 ST.accept_buff[ 0 ].wordnum-1, 0 );
3003 SV *sv= tmp ? sv_newmortal() : NULL;
3005 PerlIO_printf( Perl_debug_log,
3006 "%*s %sonly one match left: #%d <%s>%s\n",
3007 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3008 ST.accept_buff[ 0 ].wordnum,
3009 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3010 PL_colors[0], PL_colors[1],
3011 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3013 : "not compiled under -Dr",
3016 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3017 /* in this case we free tmps/leave before we call regmatch
3018 as we wont be using accept_buff again. */
3020 locinput = PL_reginput;
3021 nextchr = UCHARAT(locinput);
3022 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3025 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3026 if (!has_cutgroup) {
3031 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3034 continue; /* execute rest of RE */
3037 if ( !ST.accepted-- ) {
3039 PerlIO_printf( Perl_debug_log,
3040 "%*s %sTRIE failed...%s\n",
3041 REPORT_CODE_OFF+depth*2, "",
3052 There are at least two accepting states left. Presumably
3053 the number of accepting states is going to be low,
3054 typically two. So we simply scan through to find the one
3055 with lowest wordnum. Once we find it, we swap the last
3056 state into its place and decrement the size. We then try to
3057 match the rest of the pattern at the point where the word
3058 ends. If we succeed, control just continues along the
3059 regex; if we fail we return here to try the next accepting
3066 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3067 DEBUG_TRIE_EXECUTE_r(
3068 PerlIO_printf( Perl_debug_log,
3069 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3070 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3071 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3072 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3075 if (ST.accept_buff[cur].wordnum <
3076 ST.accept_buff[best].wordnum)
3081 AV *const trie_words
3082 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3083 SV ** const tmp = av_fetch( trie_words,
3084 ST.accept_buff[ best ].wordnum - 1, 0 );
3085 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3087 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3088 SV *sv= tmp ? sv_newmortal() : NULL;
3090 PerlIO_printf( Perl_debug_log,
3091 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3092 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3093 ST.accept_buff[best].wordnum,
3094 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3095 PL_colors[0], PL_colors[1],
3096 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3097 ) : "not compiled under -Dr",
3098 REG_NODE_NUM(nextop),
3102 if ( best<ST.accepted ) {
3103 reg_trie_accepted tmp = ST.accept_buff[ best ];
3104 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3105 ST.accept_buff[ ST.accepted ] = tmp;
3108 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3109 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3113 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3117 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3120 PUSH_STATE_GOTO(TRIE_next, scan);
3133 char *s = STRING(scan);
3135 if (do_utf8 != UTF) {
3136 /* The target and the pattern have differing utf8ness. */
3138 const char * const e = s + ln;
3141 /* The target is utf8, the pattern is not utf8. */
3146 if (NATIVE_TO_UNI(*(U8*)s) !=
3147 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3155 /* The target is not utf8, the pattern is utf8. */
3160 if (NATIVE_TO_UNI(*((U8*)l)) !=
3161 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3169 nextchr = UCHARAT(locinput);
3172 /* The target and the pattern have the same utf8ness. */
3173 /* Inline the first character, for speed. */
3174 if (UCHARAT(s) != nextchr)
3176 if (PL_regeol - locinput < ln)
3178 if (ln > 1 && memNE(s, locinput, ln))
3181 nextchr = UCHARAT(locinput);
3185 PL_reg_flags |= RF_tainted;
3188 char * const s = STRING(scan);
3191 if (do_utf8 || UTF) {
3192 /* Either target or the pattern are utf8. */
3193 const char * const l = locinput;
3194 char *e = PL_regeol;
3196 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3197 l, &e, 0, do_utf8)) {
3198 /* One more case for the sharp s:
3199 * pack("U0U*", 0xDF) =~ /ss/i,
3200 * the 0xC3 0x9F are the UTF-8
3201 * byte sequence for the U+00DF. */
3203 toLOWER(s[0]) == 's' &&
3205 toLOWER(s[1]) == 's' &&
3212 nextchr = UCHARAT(locinput);
3216 /* Neither the target and the pattern are utf8. */
3218 /* Inline the first character, for speed. */
3219 if (UCHARAT(s) != nextchr &&
3220 UCHARAT(s) != ((OP(scan) == EXACTF)
3221 ? PL_fold : PL_fold_locale)[nextchr])
3223 if (PL_regeol - locinput < ln)
3225 if (ln > 1 && (OP(scan) == EXACTF
3226 ? ibcmp(s, locinput, ln)
3227 : ibcmp_locale(s, locinput, ln)))
3230 nextchr = UCHARAT(locinput);
3235 STRLEN inclasslen = PL_regeol - locinput;
3237 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3239 if (locinput >= PL_regeol)
3241 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3242 nextchr = UCHARAT(locinput);
3247 nextchr = UCHARAT(locinput);
3248 if (!REGINCLASS(rex, scan, (U8*)locinput))
3250 if (!nextchr && locinput >= PL_regeol)
3252 nextchr = UCHARAT(++locinput);
3256 /* If we might have the case of the German sharp s
3257 * in a casefolding Unicode character class. */
3259 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3260 locinput += SHARP_S_SKIP;
3261 nextchr = UCHARAT(locinput);
3267 PL_reg_flags |= RF_tainted;
3273 LOAD_UTF8_CHARCLASS_ALNUM();
3274 if (!(OP(scan) == ALNUM
3275 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3276 : isALNUM_LC_utf8((U8*)locinput)))
3280 locinput += PL_utf8skip[nextchr];
3281 nextchr = UCHARAT(locinput);
3284 if (!(OP(scan) == ALNUM
3285 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3287 nextchr = UCHARAT(++locinput);
3290 PL_reg_flags |= RF_tainted;
3293 if (!nextchr && locinput >= PL_regeol)
3296 LOAD_UTF8_CHARCLASS_ALNUM();
3297 if (OP(scan) == NALNUM
3298 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3299 : isALNUM_LC_utf8((U8*)locinput))
3303 locinput += PL_utf8skip[nextchr];
3304 nextchr = UCHARAT(locinput);
3307 if (OP(scan) == NALNUM
3308 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3310 nextchr = UCHARAT(++locinput);
3314 PL_reg_flags |= RF_tainted;
3318 /* was last char in word? */
3320 if (locinput == PL_bostr)
3323 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3325 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3327 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3328 ln = isALNUM_uni(ln);
3329 LOAD_UTF8_CHARCLASS_ALNUM();
3330 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3333 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3334 n = isALNUM_LC_utf8((U8*)locinput);
3338 ln = (locinput != PL_bostr) ?
3339 UCHARAT(locinput - 1) : '\n';
3340 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3342 n = isALNUM(nextchr);
3345 ln = isALNUM_LC(ln);
3346 n = isALNUM_LC(nextchr);
3349 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3350 OP(scan) == BOUNDL))
3354 PL_reg_flags |= RF_tainted;
3360 if (UTF8_IS_CONTINUED(nextchr)) {
3361 LOAD_UTF8_CHARCLASS_SPACE();
3362 if (!(OP(scan) == SPACE
3363 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3364 : isSPACE_LC_utf8((U8*)locinput)))
3368 locinput += PL_utf8skip[nextchr];
3369 nextchr = UCHARAT(locinput);
3372 if (!(OP(scan) == SPACE
3373 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3375 nextchr = UCHARAT(++locinput);
3378 if (!(OP(scan) == SPACE
3379 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3381 nextchr = UCHARAT(++locinput);
3385 PL_reg_flags |= RF_tainted;
3388 if (!nextchr && locinput >= PL_regeol)
3391 LOAD_UTF8_CHARCLASS_SPACE();
3392 if (OP(scan) == NSPACE
3393 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3394 : isSPACE_LC_utf8((U8*)locinput))
3398 locinput += PL_utf8skip[nextchr];
3399 nextchr = UCHARAT(locinput);
3402 if (OP(scan) == NSPACE
3403 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3405 nextchr = UCHARAT(++locinput);
3408 PL_reg_flags |= RF_tainted;
3414 LOAD_UTF8_CHARCLASS_DIGIT();
3415 if (!(OP(scan) == DIGIT
3416 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3417 : isDIGIT_LC_utf8((U8*)locinput)))
3421 locinput += PL_utf8skip[nextchr];
3422 nextchr = UCHARAT(locinput);
3425 if (!(OP(scan) == DIGIT
3426 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3428 nextchr = UCHARAT(++locinput);
3431 PL_reg_flags |= RF_tainted;
3434 if (!nextchr && locinput >= PL_regeol)
3437 LOAD_UTF8_CHARCLASS_DIGIT();
3438 if (OP(scan) == NDIGIT
3439 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3440 : isDIGIT_LC_utf8((U8*)locinput))
3444 locinput += PL_utf8skip[nextchr];
3445 nextchr = UCHARAT(locinput);
3448 if (OP(scan) == NDIGIT
3449 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3451 nextchr = UCHARAT(++locinput);
3454 if (locinput >= PL_regeol)
3457 LOAD_UTF8_CHARCLASS_MARK();
3458 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3460 locinput += PL_utf8skip[nextchr];
3461 while (locinput < PL_regeol &&
3462 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3463 locinput += UTF8SKIP(locinput);
3464 if (locinput > PL_regeol)
3469 nextchr = UCHARAT(locinput);
3476 PL_reg_flags |= RF_tainted;
3481 n = reg_check_named_buff_matched(rex,scan);
3484 type = REF + ( type - NREF );
3491 PL_reg_flags |= RF_tainted;
3495 n = ARG(scan); /* which paren pair */
3498 ln = PL_regstartp[n];
3499 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3500 if (*PL_reglastparen < n || ln == -1)
3501 sayNO; /* Do not match unless seen CLOSEn. */
3502 if (ln == PL_regendp[n])
3506 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3508 const char *e = PL_bostr + PL_regendp[n];
3510 * Note that we can't do the "other character" lookup trick as
3511 * in the 8-bit case (no pun intended) because in Unicode we
3512 * have to map both upper and title case to lower case.
3516 STRLEN ulen1, ulen2;
3517 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3518 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3522 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3523 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3524 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3531 nextchr = UCHARAT(locinput);
3535 /* Inline the first character, for speed. */
3536 if (UCHARAT(s) != nextchr &&
3538 (UCHARAT(s) != (type == REFF
3539 ? PL_fold : PL_fold_locale)[nextchr])))
3541 ln = PL_regendp[n] - ln;
3542 if (locinput + ln > PL_regeol)
3544 if (ln > 1 && (type == REF
3545 ? memNE(s, locinput, ln)
3547 ? ibcmp(s, locinput, ln)
3548 : ibcmp_locale(s, locinput, ln))))
3551 nextchr = UCHARAT(locinput);
3561 #define ST st->u.eval
3565 regexp_internal *rei;
3566 regnode *startpoint;
3569 case GOSUB: /* /(...(?1))/ */
3570 if (cur_eval && cur_eval->locinput==locinput) {
3571 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3572 Perl_croak(aTHX_ "Infinite recursion in regex");
3573 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3575 "Pattern subroutine nesting without pos change"
3576 " exceeded limit in regex");
3582 (void)ReREFCNT_inc(rex);
3583 if (OP(scan)==GOSUB) {
3584 startpoint = scan + ARG2L(scan);
3585 ST.close_paren = ARG(scan);
3587 startpoint = rei->program+1;
3590 goto eval_recurse_doit;
3592 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3593 if (cur_eval && cur_eval->locinput==locinput) {
3594 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3595 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3599 { regexp *ocurpm = PM_GETRE(PL_curpm);
3600 char *osubbeg = rex->subbeg;
3601 STRLEN osublen = rex->sublen;
3603 /* execute the code in the {...} */
3605 SV ** const before = SP;
3606 OP_4tree * const oop = PL_op;
3607 COP * const ocurcop = PL_curcop;
3612 PL_op = (OP_4tree*)rexi->data->data[n];
3613 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3614 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3615 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3616 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3619 SV *sv_mrk = get_sv("REGMARK", 1);
3620 sv_setsv(sv_mrk, sv_yes_mark);
3622 /* make sure that $1 and friends are available with nested eval */
3623 PM_SETRE(PL_curpm,rex);
3624 rex->subbeg = ocurpm->subbeg;
3625 rex->sublen = ocurpm->sublen;
3627 CALLRUNOPS(aTHX); /* Scalar context. */
3630 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3637 PAD_RESTORE_LOCAL(old_comppad);
3638 PL_curcop = ocurcop;
3642 sv_setsv(save_scalar(PL_replgv), ret);
3646 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3649 /* extract RE object from returned value; compiling if
3654 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3655 mg = mg_find(sv, PERL_MAGIC_qr);
3656 else if (SvSMAGICAL(ret)) {
3657 if (SvGMAGICAL(ret))
3658 sv_unmagic(ret, PERL_MAGIC_qr);
3660 mg = mg_find(ret, PERL_MAGIC_qr);
3664 re = (regexp *)mg->mg_obj;
3665 (void)ReREFCNT_inc(re);
3669 const char * const t = SvPV_const(ret, len);
3671 const I32 osize = PL_regsize;
3674 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3675 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3677 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3679 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3686 /* restore PL_curpm after the eval */
3687 PM_SETRE(PL_curpm,ocurpm);
3688 rex->sublen = osublen;
3689 rex->subbeg = osubbeg;
3692 debug_start_match(re, do_utf8, locinput, PL_regeol,
3693 "Matching embedded");
3695 startpoint = rei->program + 1;
3696 ST.close_paren = 0; /* only used for GOSUB */
3697 /* borrowed from regtry */
3698 if (PL_reg_start_tmpl <= re->nparens) {
3699 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3700 if(PL_reg_start_tmp)
3701 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3703 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3707 eval_recurse_doit: /* Share code with GOSUB below this line */
3708 /* run the pattern returned from (??{...}) */
3709 ST.cp = regcppush(0); /* Save *all* the positions. */
3710 REGCP_SET(ST.lastcp);
3712 PL_regstartp = re->startp; /* essentially NOOP on GOSUB */
3713 PL_regendp = re->endp; /* essentially NOOP on GOSUB */
3715 *PL_reglastparen = 0;
3716 *PL_reglastcloseparen = 0;
3717 PL_reginput = locinput;
3720 /* XXXX This is too dramatic a measure... */
3723 ST.toggle_reg_flags = PL_reg_flags;
3724 if (re->extflags & RXf_UTF8)
3725 PL_reg_flags |= RF_utf8;
3727 PL_reg_flags &= ~RF_utf8;
3728 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3731 ST.prev_curlyx = cur_curlyx;
3736 ST.prev_eval = cur_eval;
3738 /* now continue from first node in postoned RE */
3739 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3742 /* restore PL_curpm after the eval */
3743 PM_SETRE(PL_curpm,ocurpm);
3744 rex->sublen = osublen;
3745 rex->subbeg = osubbeg;
3747 /* logical is 1, /(?(?{...})X|Y)/ */
3748 sw = (bool)SvTRUE(ret);
3753 case EVAL_AB: /* cleanup after a successful (??{A})B */
3754 /* note: this is called twice; first after popping B, then A */
3755 PL_reg_flags ^= ST.toggle_reg_flags;
3758 rexi = RXi_GET(rex);
3760 cur_eval = ST.prev_eval;
3761 cur_curlyx = ST.prev_curlyx;
3762 /* XXXX This is too dramatic a measure... */
3767 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3768 /* note: this is called twice; first after popping B, then A */
3769 PL_reg_flags ^= ST.toggle_reg_flags;
3772 rexi = RXi_GET(rex);
3773 PL_reginput = locinput;
3774 REGCP_UNWIND(ST.lastcp);
3776 cur_eval = ST.prev_eval;
3777 cur_curlyx = ST.prev_curlyx;
3778 /* XXXX This is too dramatic a measure... */
3784 n = ARG(scan); /* which paren pair */
3785 PL_reg_start_tmp[n] = locinput;
3791 n = ARG(scan); /* which paren pair */
3792 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3793 PL_regendp[n] = locinput - PL_bostr;
3794 /*if (n > PL_regsize)
3796 if (n > *PL_reglastparen)
3797 *PL_reglastparen = n;
3798 *PL_reglastcloseparen = n;
3799 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3807 cursor && OP(cursor)!=END;
3808 cursor=regnext(cursor))
3810 if ( OP(cursor)==CLOSE ){
3812 if ( n <= lastopen ) {
3813 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3814 PL_regendp[n] = locinput - PL_bostr;
3815 /*if (n > PL_regsize)
3817 if (n > *PL_reglastparen)
3818 *PL_reglastparen = n;
3819 *PL_reglastcloseparen = n;
3820 if ( n == ARG(scan) || (cur_eval &&
3821 cur_eval->u.eval.close_paren == n))
3830 n = ARG(scan); /* which paren pair */
3831 sw = (bool)(*PL_reglastparen >= n && PL_regendp[n] != -1);
3834 /* reg_check_named_buff_matched returns 0 for no match */
3835 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3839 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3845 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3847 next = NEXTOPER(NEXTOPER(scan));
3849 next = scan + ARG(scan);
3850 if (OP(next) == IFTHEN) /* Fake one. */
3851 next = NEXTOPER(NEXTOPER(next));
3855 logical = scan->flags;
3858 /*******************************************************************
3860 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3861 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3862 STAR/PLUS/CURLY/CURLYN are used instead.)
3864 A*B is compiled as <CURLYX><A><WHILEM><B>
3866 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3867 state, which contains the current count, initialised to -1. It also sets
3868 cur_curlyx to point to this state, with any previous value saved in the
3871 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3872 since the pattern may possibly match zero times (i.e. it's a while {} loop
3873 rather than a do {} while loop).
3875 Each entry to WHILEM represents a successful match of A. The count in the
3876 CURLYX block is incremented, another WHILEM state is pushed, and execution
3877 passes to A or B depending on greediness and the current count.
3879 For example, if matching against the string a1a2a3b (where the aN are
3880 substrings that match /A/), then the match progresses as follows: (the
3881 pushed states are interspersed with the bits of strings matched so far):
3884 <CURLYX cnt=0><WHILEM>
3885 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3886 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3887 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3888 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3890 (Contrast this with something like CURLYM, which maintains only a single
3894 a1 <CURLYM cnt=1> a2
3895 a1 a2 <CURLYM cnt=2> a3
3896 a1 a2 a3 <CURLYM cnt=3> b
3899 Each WHILEM state block marks a point to backtrack to upon partial failure
3900 of A or B, and also contains some minor state data related to that
3901 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3902 overall state, such as the count, and pointers to the A and B ops.
3904 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3905 must always point to the *current* CURLYX block, the rules are:
3907 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3908 and set cur_curlyx to point the new block.
3910 When popping the CURLYX block after a successful or unsuccessful match,
3911 restore the previous cur_curlyx.
3913 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3914 to the outer one saved in the CURLYX block.
3916 When popping the WHILEM block after a successful or unsuccessful B match,
3917 restore the previous cur_curlyx.
3919 Here's an example for the pattern (AI* BI)*BO
3920 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3923 curlyx backtrack stack
3924 ------ ---------------
3926 CO <CO prev=NULL> <WO>
3927 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3928 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3929 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3931 At this point the pattern succeeds, and we work back down the stack to
3932 clean up, restoring as we go:
3934 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3935 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3936 CO <CO prev=NULL> <WO>
3939 *******************************************************************/
3941 #define ST st->u.curlyx
3943 case CURLYX: /* start of /A*B/ (for complex A) */
3945 /* No need to save/restore up to this paren */
3946 I32 parenfloor = scan->flags;
3948 assert(next); /* keep Coverity happy */
3949 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3952 /* XXXX Probably it is better to teach regpush to support
3953 parenfloor > PL_regsize... */
3954 if (parenfloor > (I32)*PL_reglastparen)
3955 parenfloor = *PL_reglastparen; /* Pessimization... */
3957 ST.prev_curlyx= cur_curlyx;
3959 ST.cp = PL_savestack_ix;
3961 /* these fields contain the state of the current curly.
3962 * they are accessed by subsequent WHILEMs */
3963 ST.parenfloor = parenfloor;
3964 ST.min = ARG1(scan);
3965 ST.max = ARG2(scan);
3966 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3970 ST.count = -1; /* this will be updated by WHILEM */
3971 ST.lastloc = NULL; /* this will be updated by WHILEM */
3973 PL_reginput = locinput;
3974 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3978 case CURLYX_end: /* just finished matching all of A*B */
3979 if (PL_reg_eval_set){
3980 SV *pres= GvSV(PL_replgv);
3983 sv_setsv(GvSV(PL_replgv), pres);
3988 cur_curlyx = ST.prev_curlyx;
3992 case CURLYX_end_fail: /* just failed to match all of A*B */
3994 cur_curlyx = ST.prev_curlyx;
4000 #define ST st->u.whilem
4002 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4004 /* see the discussion above about CURLYX/WHILEM */
4006 assert(cur_curlyx); /* keep Coverity happy */
4007 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4008 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4009 ST.cache_offset = 0;
4012 PL_reginput = locinput;
4014 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4015 "%*s whilem: matched %ld out of %ld..%ld\n",
4016 REPORT_CODE_OFF+depth*2, "", (long)n,
4017 (long)cur_curlyx->u.curlyx.min,
4018 (long)cur_curlyx->u.curlyx.max)
4021 /* First just match a string of min A's. */
4023 if (n < cur_curlyx->u.curlyx.min) {
4024 cur_curlyx->u.curlyx.lastloc = locinput;
4025 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4029 /* If degenerate A matches "", assume A done. */
4031 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4032 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4033 "%*s whilem: empty match detected, trying continuation...\n",
4034 REPORT_CODE_OFF+depth*2, "")
4036 goto do_whilem_B_max;
4039 /* super-linear cache processing */
4043 if (!PL_reg_maxiter) {
4044 /* start the countdown: Postpone detection until we
4045 * know the match is not *that* much linear. */
4046 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4047 /* possible overflow for long strings and many CURLYX's */
4048 if (PL_reg_maxiter < 0)
4049 PL_reg_maxiter = I32_MAX;
4050 PL_reg_leftiter = PL_reg_maxiter;
4053 if (PL_reg_leftiter-- == 0) {
4054 /* initialise cache */
4055 const I32 size = (PL_reg_maxiter + 7)/8;
4056 if (PL_reg_poscache) {
4057 if ((I32)PL_reg_poscache_size < size) {
4058 Renew(PL_reg_poscache, size, char);
4059 PL_reg_poscache_size = size;
4061 Zero(PL_reg_poscache, size, char);
4064 PL_reg_poscache_size = size;
4065 Newxz(PL_reg_poscache, size, char);
4067 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4068 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4069 PL_colors[4], PL_colors[5])
4073 if (PL_reg_leftiter < 0) {
4074 /* have we already failed at this position? */
4076 offset = (scan->flags & 0xf) - 1
4077 + (locinput - PL_bostr) * (scan->flags>>4);
4078 mask = 1 << (offset % 8);
4080 if (PL_reg_poscache[offset] & mask) {
4081 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4082 "%*s whilem: (cache) already tried at this position...\n",
4083 REPORT_CODE_OFF+depth*2, "")
4085 sayNO; /* cache records failure */
4087 ST.cache_offset = offset;
4088 ST.cache_mask = mask;
4092 /* Prefer B over A for minimal matching. */
4094 if (cur_curlyx->u.curlyx.minmod) {
4095 ST.save_curlyx = cur_curlyx;
4096 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4097 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4098 REGCP_SET(ST.lastcp);
4099 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4103 /* Prefer A over B for maximal matching. */
4105 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4106 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4107 cur_curlyx->u.curlyx.lastloc = locinput;
4108 REGCP_SET(ST.lastcp);
4109 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4112 goto do_whilem_B_max;
4116 case WHILEM_B_min: /* just matched B in a minimal match */
4117 case WHILEM_B_max: /* just matched B in a maximal match */
4118 cur_curlyx = ST.save_curlyx;
4122 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4123 cur_curlyx = ST.save_curlyx;
4124 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4125 cur_curlyx->u.curlyx.count--;
4129 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4130 REGCP_UNWIND(ST.lastcp);
4133 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4134 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4135 cur_curlyx->u.curlyx.count--;
4139 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4140 REGCP_UNWIND(ST.lastcp);
4141 regcppop(rex); /* Restore some previous $<digit>s? */
4142 PL_reginput = locinput;
4143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4144 "%*s whilem: failed, trying continuation...\n",
4145 REPORT_CODE_OFF+depth*2, "")
4148 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4149 && ckWARN(WARN_REGEXP)
4150 && !(PL_reg_flags & RF_warned))
4152 PL_reg_flags |= RF_warned;
4153 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4154 "Complex regular subexpression recursion",
4159 ST.save_curlyx = cur_curlyx;
4160 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4161 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4164 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4165 cur_curlyx = ST.save_curlyx;
4166 REGCP_UNWIND(ST.lastcp);
4169 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4170 /* Maximum greed exceeded */
4171 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4172 && ckWARN(WARN_REGEXP)
4173 && !(PL_reg_flags & RF_warned))
4175 PL_reg_flags |= RF_warned;
4176 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4177 "%s limit (%d) exceeded",
4178 "Complex regular subexpression recursion",
4181 cur_curlyx->u.curlyx.count--;
4185 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4186 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4188 /* Try grabbing another A and see if it helps. */
4189 PL_reginput = locinput;
4190 cur_curlyx->u.curlyx.lastloc = locinput;
4191 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4192 REGCP_SET(ST.lastcp);
4193 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4197 #define ST st->u.branch
4199 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4200 next = scan + ARG(scan);
4203 scan = NEXTOPER(scan);
4206 case BRANCH: /* /(...|A|...)/ */
4207 scan = NEXTOPER(scan); /* scan now points to inner node */
4208 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4211 /* last branch; skip state push and jump direct to node */
4214 ST.lastparen = *PL_reglastparen;
4215 ST.next_branch = next;
4217 PL_reginput = locinput;
4219 /* Now go into the branch */
4221 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4223 PUSH_STATE_GOTO(BRANCH_next, scan);
4227 PL_reginput = locinput;
4228 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4229 (SV*)rexi->data->data[ ARG( scan ) ];
4230 PUSH_STATE_GOTO(CUTGROUP_next,next);
4232 case CUTGROUP_next_fail:
4235 if (st->u.mark.mark_name)
4236 sv_commit = st->u.mark.mark_name;
4242 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4247 REGCP_UNWIND(ST.cp);
4248 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4250 *PL_reglastparen = n;
4251 /*dmq: *PL_reglastcloseparen = n; */
4252 scan = ST.next_branch;
4253 /* no more branches? */
4254 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4256 PerlIO_printf( Perl_debug_log,
4257 "%*s %sBRANCH failed...%s\n",
4258 REPORT_CODE_OFF+depth*2, "",
4264 continue; /* execute next BRANCH[J] op */
4272 #define ST st->u.curlym
4274 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4276 /* This is an optimisation of CURLYX that enables us to push
4277 * only a single backtracking state, no matter now many matches
4278 * there are in {m,n}. It relies on the pattern being constant
4279 * length, with no parens to influence future backrefs
4283 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4285 /* if paren positive, emulate an OPEN/CLOSE around A */
4287 U32 paren = ST.me->flags;
4288 if (paren > PL_regsize)
4290 if (paren > *PL_reglastparen)
4291 *PL_reglastparen = paren;
4292 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4300 ST.c1 = CHRTEST_UNINIT;
4303 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4306 curlym_do_A: /* execute the A in /A{m,n}B/ */
4307 PL_reginput = locinput;
4308 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4311 case CURLYM_A: /* we've just matched an A */
4312 locinput = st->locinput;
4313 nextchr = UCHARAT(locinput);
4316 /* after first match, determine A's length: u.curlym.alen */
4317 if (ST.count == 1) {
4318 if (PL_reg_match_utf8) {
4320 while (s < PL_reginput) {
4326 ST.alen = PL_reginput - locinput;
4329 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4332 PerlIO_printf(Perl_debug_log,
4333 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4334 (int)(REPORT_CODE_OFF+(depth*2)), "",
4335 (IV) ST.count, (IV)ST.alen)
4338 locinput = PL_reginput;
4340 if (cur_eval && cur_eval->u.eval.close_paren &&
4341 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4344 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4345 goto curlym_do_A; /* try to match another A */
4346 goto curlym_do_B; /* try to match B */
4348 case CURLYM_A_fail: /* just failed to match an A */
4349 REGCP_UNWIND(ST.cp);
4351 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4352 || (cur_eval && cur_eval->u.eval.close_paren &&
4353 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4356 curlym_do_B: /* execute the B in /A{m,n}B/ */
4357 PL_reginput = locinput;
4358 if (ST.c1 == CHRTEST_UNINIT) {
4359 /* calculate c1 and c2 for possible match of 1st char
4360 * following curly */
4361 ST.c1 = ST.c2 = CHRTEST_VOID;
4362 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4363 regnode *text_node = ST.B;
4364 if (! HAS_TEXT(text_node))
4365 FIND_NEXT_IMPT(text_node);
4368 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4370 But the former is redundant in light of the latter.
4372 if this changes back then the macro for
4373 IS_TEXT and friends need to change.
4375 if (PL_regkind[OP(text_node)] == EXACT)
4378 ST.c1 = (U8)*STRING(text_node);
4380 (IS_TEXTF(text_node))
4382 : (IS_TEXTFL(text_node))
4383 ? PL_fold_locale[ST.c1]
4390 PerlIO_printf(Perl_debug_log,
4391 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4392 (int)(REPORT_CODE_OFF+(depth*2)),
4395 if (ST.c1 != CHRTEST_VOID
4396 && UCHARAT(PL_reginput) != ST.c1
4397 && UCHARAT(PL_reginput) != ST.c2)
4399 /* simulate B failing */
4401 PerlIO_printf(Perl_debug_log,
4402 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4403 (int)(REPORT_CODE_OFF+(depth*2)),"",
4406 state_num = CURLYM_B_fail;
4407 goto reenter_switch;
4411 /* mark current A as captured */
4412 I32 paren = ST.me->flags;
4415 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4416 PL_regendp[paren] = PL_reginput - PL_bostr;
4417 /*dmq: *PL_reglastcloseparen = paren; */
4420 PL_regendp[paren] = -1;
4421 if (cur_eval && cur_eval->u.eval.close_paren &&
4422 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4431 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4434 case CURLYM_B_fail: /* just failed to match a B */
4435 REGCP_UNWIND(ST.cp);
4437 if (ST.count == ARG2(ST.me) /* max */)
4439 goto curlym_do_A; /* try to match a further A */
4441 /* backtrack one A */
4442 if (ST.count == ARG1(ST.me) /* min */)
4445 locinput = HOPc(locinput, -ST.alen);
4446 goto curlym_do_B; /* try to match B */
4449 #define ST st->u.curly
4451 #define CURLY_SETPAREN(paren, success) \
4454 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4455 PL_regendp[paren] = locinput - PL_bostr; \
4456 *PL_reglastcloseparen = paren; \
4459 PL_regendp[paren] = -1; \
4462 case STAR: /* /A*B/ where A is width 1 */
4466 scan = NEXTOPER(scan);
4468 case PLUS: /* /A+B/ where A is width 1 */
4472 scan = NEXTOPER(scan);
4474 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4475 ST.paren = scan->flags; /* Which paren to set */
4476 if (ST.paren > PL_regsize)
4477 PL_regsize = ST.paren;
4478 if (ST.paren > *PL_reglastparen)
4479 *PL_reglastparen = ST.paren;
4480 ST.min = ARG1(scan); /* min to match */
4481 ST.max = ARG2(scan); /* max to match */
4482 if (cur_eval && cur_eval->u.eval.close_paren &&
4483 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4487 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4489 case CURLY: /* /A{m,n}B/ where A is width 1 */
4491 ST.min = ARG1(scan); /* min to match */
4492 ST.max = ARG2(scan); /* max to match */
4493 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4496 * Lookahead to avoid useless match attempts
4497 * when we know what character comes next.
4499 * Used to only do .*x and .*?x, but now it allows
4500 * for )'s, ('s and (?{ ... })'s to be in the way
4501 * of the quantifier and the EXACT-like node. -- japhy
4504 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4506 if (HAS_TEXT(next) || JUMPABLE(next)) {
4508 regnode *text_node = next;
4510 if (! HAS_TEXT(text_node))
4511 FIND_NEXT_IMPT(text_node);
4513 if (! HAS_TEXT(text_node))
4514 ST.c1 = ST.c2 = CHRTEST_VOID;
4516 if ( PL_regkind[OP(text_node)] != EXACT ) {
4517 ST.c1 = ST.c2 = CHRTEST_VOID;
4518 goto assume_ok_easy;
4521 s = (U8*)STRING(text_node);
4523 /* Currently we only get here when
4525 PL_rekind[OP(text_node)] == EXACT
4527 if this changes back then the macro for IS_TEXT and
4528 friends need to change. */
4531 if (IS_TEXTF(text_node))
4532 ST.c2 = PL_fold[ST.c1];
4533 else if (IS_TEXTFL(text_node))
4534 ST.c2 = PL_fold_locale[ST.c1];
4537 if (IS_TEXTF(text_node)) {
4538 STRLEN ulen1, ulen2;
4539 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4540 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4542 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4543 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4545 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4547 0 : UTF8_ALLOW_ANY);
4548 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4550 0 : UTF8_ALLOW_ANY);
4552 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4554 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4559 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4566 ST.c1 = ST.c2 = CHRTEST_VOID;
4571 PL_reginput = locinput;
4574 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4577 locinput = PL_reginput;
4579 if (ST.c1 == CHRTEST_VOID)
4580 goto curly_try_B_min;
4582 ST.oldloc = locinput;
4584 /* set ST.maxpos to the furthest point along the
4585 * string that could possibly match */
4586 if (ST.max == REG_INFTY) {
4587 ST.maxpos = PL_regeol - 1;
4589 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4593 int m = ST.max - ST.min;
4594 for (ST.maxpos = locinput;
4595 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4596 ST.maxpos += UTF8SKIP(ST.maxpos);
4599 ST.maxpos = locinput + ST.max - ST.min;
4600 if (ST.maxpos >= PL_regeol)
4601 ST.maxpos = PL_regeol - 1;
4603 goto curly_try_B_min_known;
4607 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4608 locinput = PL_reginput;
4609 if (ST.count < ST.min)
4611 if ((ST.count > ST.min)
4612 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4614 /* A{m,n} must come at the end of the string, there's
4615 * no point in backing off ... */
4617 /* ...except that $ and \Z can match before *and* after
4618 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4619 We may back off by one in this case. */
4620 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4624 goto curly_try_B_max;
4629 case CURLY_B_min_known_fail:
4630 /* failed to find B in a non-greedy match where c1,c2 valid */
4631 if (ST.paren && ST.count)
4632 PL_regendp[ST.paren] = -1;
4634 PL_reginput = locinput; /* Could be reset... */
4635 REGCP_UNWIND(ST.cp);
4636 /* Couldn't or didn't -- move forward. */
4637 ST.oldloc = locinput;
4639 locinput += UTF8SKIP(locinput);
4643 curly_try_B_min_known:
4644 /* find the next place where 'B' could work, then call B */
4648 n = (ST.oldloc == locinput) ? 0 : 1;
4649 if (ST.c1 == ST.c2) {
4651 /* set n to utf8_distance(oldloc, locinput) */
4652 while (locinput <= ST.maxpos &&
4653 utf8n_to_uvchr((U8*)locinput,
4654 UTF8_MAXBYTES, &len,
4655 uniflags) != (UV)ST.c1) {
4661 /* set n to utf8_distance(oldloc, locinput) */
4662 while (locinput <= ST.maxpos) {
4664 const UV c = utf8n_to_uvchr((U8*)locinput,
4665 UTF8_MAXBYTES, &len,
4667 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4675 if (ST.c1 == ST.c2) {
4676 while (locinput <= ST.maxpos &&
4677 UCHARAT(locinput) != ST.c1)
4681 while (locinput <= ST.maxpos
4682 && UCHARAT(locinput) != ST.c1
4683 && UCHARAT(locinput) != ST.c2)
4686 n = locinput - ST.oldloc;
4688 if (locinput > ST.maxpos)
4690 /* PL_reginput == oldloc now */
4693 if (regrepeat(rex, ST.A, n, depth) < n)
4696 PL_reginput = locinput;
4697 CURLY_SETPAREN(ST.paren, ST.count);
4698 if (cur_eval && cur_eval->u.eval.close_paren &&
4699 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4702 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4707 case CURLY_B_min_fail:
4708 /* failed to find B in a non-greedy match where c1,c2 invalid */
4709 if (ST.paren && ST.count)
4710 PL_regendp[ST.paren] = -1;
4712 REGCP_UNWIND(ST.cp);
4713 /* failed -- move forward one */
4714 PL_reginput = locinput;
4715 if (regrepeat(rex, ST.A, 1, depth)) {
4717 locinput = PL_reginput;
4718 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4719 ST.count > 0)) /* count overflow ? */
4722 CURLY_SETPAREN(ST.paren, ST.count);
4723 if (cur_eval && cur_eval->u.eval.close_paren &&
4724 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4727 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4735 /* a successful greedy match: now try to match B */
4736 if (cur_eval && cur_eval->u.eval.close_paren &&
4737 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4742 if (ST.c1 != CHRTEST_VOID)
4743 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4744 UTF8_MAXBYTES, 0, uniflags)
4745 : (UV) UCHARAT(PL_reginput);
4746 /* If it could work, try it. */
4747 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4748 CURLY_SETPAREN(ST.paren, ST.count);
4749 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4754 case CURLY_B_max_fail:
4755 /* failed to find B in a greedy match */
4756 if (ST.paren && ST.count)
4757 PL_regendp[ST.paren] = -1;
4759 REGCP_UNWIND(ST.cp);
4761 if (--ST.count < ST.min)
4763 PL_reginput = locinput = HOPc(locinput, -1);
4764 goto curly_try_B_max;
4771 /* we've just finished A in /(??{A})B/; now continue with B */
4775 st->u.eval.toggle_reg_flags
4776 = cur_eval->u.eval.toggle_reg_flags;
4777 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4779 st->u.eval.prev_rex = rex; /* inner */
4780 rex = cur_eval->u.eval.prev_rex; /* outer */
4781 rexi = RXi_GET(rex);
4782 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4784 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4785 REGCP_SET(st->u.eval.lastcp);
4786 PL_reginput = locinput;
4788 /* Restore parens of the outer rex without popping the
4790 tmpix = PL_savestack_ix;
4791 PL_savestack_ix = cur_eval->u.eval.lastcp;
4793 PL_savestack_ix = tmpix;
4795 st->u.eval.prev_eval = cur_eval;
4796 cur_eval = cur_eval->u.eval.prev_eval;
4798 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4799 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4800 PUSH_YES_STATE_GOTO(EVAL_AB,
4801 st->u.eval.prev_eval->u.eval.B); /* match B */
4804 if (locinput < reginfo->till) {
4805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4806 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4808 (long)(locinput - PL_reg_starttry),
4809 (long)(reginfo->till - PL_reg_starttry),
4812 sayNO_SILENT; /* Cannot match: too short. */
4814 PL_reginput = locinput; /* put where regtry can find it */
4815 sayYES; /* Success! */
4817 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4819 PerlIO_printf(Perl_debug_log,
4820 "%*s %ssubpattern success...%s\n",
4821 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4822 PL_reginput = locinput; /* put where regtry can find it */
4823 sayYES; /* Success! */
4826 #define ST st->u.ifmatch
4828 case SUSPEND: /* (?>A) */
4830 PL_reginput = locinput;
4833 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4835 goto ifmatch_trivial_fail_test;
4837 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4839 ifmatch_trivial_fail_test:
4841 char * const s = HOPBACKc(locinput, scan->flags);
4846 sw = 1 - (bool)ST.wanted;
4850 next = scan + ARG(scan);
4858 PL_reginput = locinput;
4862 ST.logical = logical;
4863 /* execute body of (?...A) */
4864 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4867 case IFMATCH_A_fail: /* body of (?...A) failed */
4868 ST.wanted = !ST.wanted;
4871 case IFMATCH_A: /* body of (?...A) succeeded */
4873 sw = (bool)ST.wanted;
4875 else if (!ST.wanted)
4878 if (OP(ST.me) == SUSPEND)
4879 locinput = PL_reginput;
4881 locinput = PL_reginput = st->locinput;
4882 nextchr = UCHARAT(locinput);
4884 scan = ST.me + ARG(ST.me);
4887 continue; /* execute B */
4892 next = scan + ARG(scan);
4897 reginfo->cutpoint = PL_regeol;
4900 PL_reginput = locinput;
4902 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
4903 PUSH_STATE_GOTO(COMMIT_next,next);
4905 case COMMIT_next_fail:
4912 #define ST st->u.mark
4914 ST.prev_mark = mark_state;
4915 ST.mark_name = sv_commit = sv_yes_mark
4916 = (SV*)rexi->data->data[ ARG( scan ) ];
4918 ST.mark_loc = PL_reginput = locinput;
4919 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4921 case MARKPOINT_next:
4922 mark_state = ST.prev_mark;
4925 case MARKPOINT_next_fail:
4926 if (popmark && sv_eq(ST.mark_name,popmark))
4928 if (ST.mark_loc > startpoint)
4929 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4930 popmark = NULL; /* we found our mark */
4931 sv_commit = ST.mark_name;
4934 PerlIO_printf(Perl_debug_log,
4935 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4936 REPORT_CODE_OFF+depth*2, "",
4937 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
4940 mark_state = ST.prev_mark;
4941 sv_yes_mark = mark_state ?
4942 mark_state->u.mark.mark_name : NULL;
4946 PL_reginput = locinput;
4948 /* (*SKIP) : if we fail we cut here*/
4949 ST.mark_name = NULL;
4950 ST.mark_loc = locinput;
4951 PUSH_STATE_GOTO(SKIP_next,next);
4953 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
4954 otherwise do nothing. Meaning we need to scan
4956 regmatch_state *cur = mark_state;
4957 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
4960 if ( sv_eq( cur->u.mark.mark_name,
4963 ST.mark_name = find;
4964 PUSH_STATE_GOTO( SKIP_next, next );
4966 cur = cur->u.mark.prev_mark;
4969 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
4971 case SKIP_next_fail:
4973 /* (*CUT:NAME) - Set up to search for the name as we
4974 collapse the stack*/
4975 popmark = ST.mark_name;
4977 /* (*CUT) - No name, we cut here.*/
4978 if (ST.mark_loc > startpoint)
4979 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4980 /* but we set sv_commit to latest mark_name if there
4981 is one so they can test to see how things lead to this
4984 sv_commit=mark_state->u.mark.mark_name;
4992 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4993 PTR2UV(scan), OP(scan));
4994 Perl_croak(aTHX_ "regexp memory corruption");
4998 /* switch break jumps here */
4999 scan = next; /* prepare to execute the next op and ... */
5000 continue; /* ... jump back to the top, reusing st */
5004 /* push a state that backtracks on success */
5005 st->u.yes.prev_yes_state = yes_state;
5009 /* push a new regex state, then continue at scan */
5011 regmatch_state *newst;
5014 regmatch_state *cur = st;
5015 regmatch_state *curyes = yes_state;
5017 regmatch_slab *slab = PL_regmatch_slab;
5018 for (;curd > -1;cur--,curd--) {
5019 if (cur < SLAB_FIRST(slab)) {
5021 cur = SLAB_LAST(slab);
5023 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5024 REPORT_CODE_OFF + 2 + depth * 2,"",
5025 curd, reg_name[cur->resume_state],
5026 (curyes == cur) ? "yes" : ""
5029 curyes = cur->u.yes.prev_yes_state;
5032 DEBUG_STATE_pp("push")
5035 st->locinput = locinput;
5037 if (newst > SLAB_LAST(PL_regmatch_slab))
5038 newst = S_push_slab(aTHX);
5039 PL_regmatch_state = newst;
5041 locinput = PL_reginput;
5042 nextchr = UCHARAT(locinput);
5050 * We get here only if there's trouble -- normally "case END" is
5051 * the terminating point.
5053 Perl_croak(aTHX_ "corrupted regexp pointers");
5059 /* we have successfully completed a subexpression, but we must now
5060 * pop to the state marked by yes_state and continue from there */
5061 assert(st != yes_state);
5063 while (st != yes_state) {
5065 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5066 PL_regmatch_slab = PL_regmatch_slab->prev;
5067 st = SLAB_LAST(PL_regmatch_slab);
5071 DEBUG_STATE_pp("pop (no final)");
5073 DEBUG_STATE_pp("pop (yes)");
5079 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5080 || yes_state > SLAB_LAST(PL_regmatch_slab))
5082 /* not in this slab, pop slab */
5083 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5084 PL_regmatch_slab = PL_regmatch_slab->prev;
5085 st = SLAB_LAST(PL_regmatch_slab);
5087 depth -= (st - yes_state);
5090 yes_state = st->u.yes.prev_yes_state;
5091 PL_regmatch_state = st;
5094 locinput= st->locinput;
5095 nextchr = UCHARAT(locinput);
5097 state_num = st->resume_state + no_final;
5098 goto reenter_switch;
5101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5102 PL_colors[4], PL_colors[5]));
5109 PerlIO_printf(Perl_debug_log,
5110 "%*s %sfailed...%s\n",
5111 REPORT_CODE_OFF+depth*2, "",
5112 PL_colors[4], PL_colors[5])
5124 /* there's a previous state to backtrack to */
5126 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5127 PL_regmatch_slab = PL_regmatch_slab->prev;
5128 st = SLAB_LAST(PL_regmatch_slab);
5130 PL_regmatch_state = st;
5131 locinput= st->locinput;
5132 nextchr = UCHARAT(locinput);
5134 DEBUG_STATE_pp("pop");
5136 if (yes_state == st)
5137 yes_state = st->u.yes.prev_yes_state;
5139 state_num = st->resume_state + 1; /* failure = success + 1 */
5140 goto reenter_switch;
5145 if (rex->intflags & PREGf_VERBARG_SEEN) {
5146 SV *sv_err = get_sv("REGERROR", 1);
5147 SV *sv_mrk = get_sv("REGMARK", 1);
5149 sv_commit = &PL_sv_no;
5151 sv_yes_mark = &PL_sv_yes;
5154 sv_commit = &PL_sv_yes;
5155 sv_yes_mark = &PL_sv_no;
5157 sv_setsv(sv_err, sv_commit);
5158 sv_setsv(sv_mrk, sv_yes_mark);
5160 /* restore original high-water mark */
5161 PL_regmatch_slab = orig_slab;
5162 PL_regmatch_state = orig_state;
5164 /* free all slabs above current one */
5165 if (orig_slab->next) {
5166 regmatch_slab *sl = orig_slab->next;
5167 orig_slab->next = NULL;
5169 regmatch_slab * const osl = sl;
5179 - regrepeat - repeatedly match something simple, report how many
5182 * [This routine now assumes that it will only match on things of length 1.
5183 * That was true before, but now we assume scan - reginput is the count,
5184 * rather than incrementing count on every character. [Er, except utf8.]]
5187 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5190 register char *scan;
5192 register char *loceol = PL_regeol;
5193 register I32 hardcount = 0;
5194 register bool do_utf8 = PL_reg_match_utf8;
5197 if (max == REG_INFTY)
5199 else if (max < loceol - scan)
5200 loceol = scan + max;
5205 while (scan < loceol && hardcount < max && *scan != '\n') {
5206 scan += UTF8SKIP(scan);
5210 while (scan < loceol && *scan != '\n')
5217 while (scan < loceol && hardcount < max) {
5218 scan += UTF8SKIP(scan);
5228 case EXACT: /* length of string is 1 */
5230 while (scan < loceol && UCHARAT(scan) == c)
5233 case EXACTF: /* length of string is 1 */
5235 while (scan < loceol &&
5236 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5239 case EXACTFL: /* length of string is 1 */
5240 PL_reg_flags |= RF_tainted;
5242 while (scan < loceol &&
5243 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5249 while (hardcount < max && scan < loceol &&
5250 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5251 scan += UTF8SKIP(scan);
5255 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5262 LOAD_UTF8_CHARCLASS_ALNUM();
5263 while (hardcount < max && scan < loceol &&
5264 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5265 scan += UTF8SKIP(scan);
5269 while (scan < loceol && isALNUM(*scan))
5274 PL_reg_flags |= RF_tainted;
5277 while (hardcount < max && scan < loceol &&
5278 isALNUM_LC_utf8((U8*)scan)) {
5279 scan += UTF8SKIP(scan);
5283 while (scan < loceol && isALNUM_LC(*scan))
5290 LOAD_UTF8_CHARCLASS_ALNUM();
5291 while (hardcount < max && scan < loceol &&
5292 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5293 scan += UTF8SKIP(scan);
5297 while (scan < loceol && !isALNUM(*scan))
5302 PL_reg_flags |= RF_tainted;
5305 while (hardcount < max && scan < loceol &&
5306 !isALNUM_LC_utf8((U8*)scan)) {
5307 scan += UTF8SKIP(scan);
5311 while (scan < loceol && !isALNUM_LC(*scan))
5318 LOAD_UTF8_CHARCLASS_SPACE();
5319 while (hardcount < max && scan < loceol &&
5321 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5322 scan += UTF8SKIP(scan);
5326 while (scan < loceol && isSPACE(*scan))
5331 PL_reg_flags |= RF_tainted;
5334 while (hardcount < max && scan < loceol &&
5335 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5336 scan += UTF8SKIP(scan);
5340 while (scan < loceol && isSPACE_LC(*scan))
5347 LOAD_UTF8_CHARCLASS_SPACE();
5348 while (hardcount < max && scan < loceol &&
5350 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5351 scan += UTF8SKIP(scan);
5355 while (scan < loceol && !isSPACE(*scan))
5360 PL_reg_flags |= RF_tainted;
5363 while (hardcount < max && scan < loceol &&
5364 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5365 scan += UTF8SKIP(scan);
5369 while (scan < loceol && !isSPACE_LC(*scan))
5376 LOAD_UTF8_CHARCLASS_DIGIT();
5377 while (hardcount < max && scan < loceol &&
5378 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5379 scan += UTF8SKIP(scan);
5383 while (scan < loceol && isDIGIT(*scan))
5390 LOAD_UTF8_CHARCLASS_DIGIT();
5391 while (hardcount < max && scan < loceol &&
5392 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5393 scan += UTF8SKIP(scan);
5397 while (scan < loceol && !isDIGIT(*scan))
5401 default: /* Called on something of 0 width. */
5402 break; /* So match right here or not at all. */
5408 c = scan - PL_reginput;
5412 GET_RE_DEBUG_FLAGS_DECL;
5414 SV * const prop = sv_newmortal();
5415 regprop(prog, prop, p);
5416 PerlIO_printf(Perl_debug_log,
5417 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5418 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5426 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5428 - regclass_swash - prepare the utf8 swash
5432 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5438 RXi_GET_DECL(prog,progi);
5439 const struct reg_data * const data = prog ? progi->data : NULL;
5441 if (data && data->count) {
5442 const U32 n = ARG(node);
5444 if (data->what[n] == 's') {
5445 SV * const rv = (SV*)data->data[n];
5446 AV * const av = (AV*)SvRV((SV*)rv);
5447 SV **const ary = AvARRAY(av);
5450 /* See the end of regcomp.c:S_regclass() for
5451 * documentation of these array elements. */
5454 a = SvROK(ary[1]) ? &ary[1] : 0;
5455 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5459 else if (si && doinit) {
5460 sw = swash_init("utf8", "", si, 1, 0);
5461 (void)av_store(av, 1, sw);
5478 - reginclass - determine if a character falls into a character class
5480 The n is the ANYOF regnode, the p is the target string, lenp
5481 is pointer to the maximum length of how far to go in the p
5482 (if the lenp is zero, UTF8SKIP(p) is used),
5483 do_utf8 tells whether the target string is in UTF-8.
5488 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5491 const char flags = ANYOF_FLAGS(n);
5497 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5498 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5499 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5500 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5501 if (len == (STRLEN)-1)
5502 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5505 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5506 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5509 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5510 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5513 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5517 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5520 if (swash_fetch(sw, p, do_utf8))
5522 else if (flags & ANYOF_FOLD) {
5523 if (!match && lenp && av) {
5525 for (i = 0; i <= av_len(av); i++) {
5526 SV* const sv = *av_fetch(av, i, FALSE);
5528 const char * const s = SvPV_const(sv, len);
5530 if (len <= plen && memEQ(s, (char*)p, len)) {
5538 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5541 to_utf8_fold(p, tmpbuf, &tmplen);
5542 if (swash_fetch(sw, tmpbuf, do_utf8))
5548 if (match && lenp && *lenp == 0)
5549 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5551 if (!match && c < 256) {
5552 if (ANYOF_BITMAP_TEST(n, c))
5554 else if (flags & ANYOF_FOLD) {
5557 if (flags & ANYOF_LOCALE) {
5558 PL_reg_flags |= RF_tainted;
5559 f = PL_fold_locale[c];
5563 if (f != c && ANYOF_BITMAP_TEST(n, f))
5567 if (!match && (flags & ANYOF_CLASS)) {
5568 PL_reg_flags |= RF_tainted;
5570 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5571 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5572 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5573 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5574 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5575 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5576 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5577 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5578 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5579 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5580 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5581 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5582 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5583 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5584 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5585 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5586 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5587 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5588 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5589 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5590 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5591 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5592 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5593 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5594 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5595 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5596 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5597 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5598 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5599 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5600 ) /* How's that for a conditional? */
5607 return (flags & ANYOF_INVERT) ? !match : match;
5611 S_reghop3(U8 *s, I32 off, const U8* lim)
5615 while (off-- && s < lim) {
5616 /* XXX could check well-formedness here */
5621 while (off++ && s > lim) {
5623 if (UTF8_IS_CONTINUED(*s)) {
5624 while (s > lim && UTF8_IS_CONTINUATION(*s))
5627 /* XXX could check well-formedness here */
5634 /* there are a bunch of places where we use two reghop3's that should
5635 be replaced with this routine. but since thats not done yet
5636 we ifdef it out - dmq
5639 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5643 while (off-- && s < rlim) {
5644 /* XXX could check well-formedness here */
5649 while (off++ && s > llim) {
5651 if (UTF8_IS_CONTINUED(*s)) {
5652 while (s > llim && UTF8_IS_CONTINUATION(*s))
5655 /* XXX could check well-formedness here */
5663 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5667 while (off-- && s < lim) {
5668 /* XXX could check well-formedness here */
5675 while (off++ && s > lim) {
5677 if (UTF8_IS_CONTINUED(*s)) {
5678 while (s > lim && UTF8_IS_CONTINUATION(*s))
5681 /* XXX could check well-formedness here */
5690 restore_pos(pTHX_ void *arg)
5693 regexp * const rex = (regexp *)arg;
5694 if (PL_reg_eval_set) {
5695 if (PL_reg_oldsaved) {
5696 rex->subbeg = PL_reg_oldsaved;
5697 rex->sublen = PL_reg_oldsavedlen;
5698 #ifdef PERL_OLD_COPY_ON_WRITE
5699 rex->saved_copy = PL_nrs;
5701 RX_MATCH_COPIED_on(rex);
5703 PL_reg_magic->mg_len = PL_reg_oldpos;
5704 PL_reg_eval_set = 0;
5705 PL_curpm = PL_reg_oldcurpm;
5710 S_to_utf8_substr(pTHX_ register regexp *prog)
5714 if (prog->substrs->data[i].substr
5715 && !prog->substrs->data[i].utf8_substr) {
5716 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5717 prog->substrs->data[i].utf8_substr = sv;
5718 sv_utf8_upgrade(sv);
5719 if (SvVALID(prog->substrs->data[i].substr)) {
5720 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5721 if (flags & FBMcf_TAIL) {
5722 /* Trim the trailing \n that fbm_compile added last
5724 SvCUR_set(sv, SvCUR(sv) - 1);
5725 /* Whilst this makes the SV technically "invalid" (as its
5726 buffer is no longer followed by "\0") when fbm_compile()
5727 adds the "\n" back, a "\0" is restored. */
5729 fbm_compile(sv, flags);
5731 if (prog->substrs->data[i].substr == prog->check_substr)
5732 prog->check_utf8 = sv;
5738 S_to_byte_substr(pTHX_ register regexp *prog)
5743 if (prog->substrs->data[i].utf8_substr
5744 && !prog->substrs->data[i].substr) {
5745 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5746 if (sv_utf8_downgrade(sv, TRUE)) {
5747 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5749 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5750 if (flags & FBMcf_TAIL) {
5751 /* Trim the trailing \n that fbm_compile added last
5753 SvCUR_set(sv, SvCUR(sv) - 1);
5755 fbm_compile(sv, flags);
5761 prog->substrs->data[i].substr = sv;
5762 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5763 prog->check_substr = sv;
5770 * c-indentation-style: bsd
5772 * indent-tabs-mode: t
5775 * ex: set ts=8 sts=4 sw=4 noet: