5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 8
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regendp[p]);
193 SSPUSHINT(PL_regstartp[p]);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regstartp[p],
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regstartp);
205 SSPUSHPTR(PL_regendp);
206 SSPUSHINT(PL_regsize);
207 SSPUSHINT(*PL_reglastparen);
208 SSPUSHINT(*PL_reglastcloseparen);
209 SSPUSHPTR(PL_reginput);
210 #define REGCP_FRAME_ELEMS 2
211 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
212 * are needed for the regexp context stack bookkeeping. */
213 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
214 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
219 /* These are needed since we do not localize EVAL nodes: */
220 #define REGCP_SET(cp) \
222 PerlIO_printf(Perl_debug_log, \
223 " Setting an EVAL scope, savestack=%"IVdf"\n", \
224 (IV)PL_savestack_ix)); \
227 #define REGCP_UNWIND(cp) \
229 if (cp != PL_savestack_ix) \
230 PerlIO_printf(Perl_debug_log, \
231 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
232 (IV)(cp), (IV)PL_savestack_ix)); \
236 S_regcppop(pTHX_ const regexp *rex)
242 GET_RE_DEBUG_FLAGS_DECL;
244 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
246 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
247 i = SSPOPINT; /* Parentheses elements to pop. */
248 input = (char *) SSPOPPTR;
249 *PL_reglastcloseparen = SSPOPINT;
250 *PL_reglastparen = SSPOPINT;
251 PL_regsize = SSPOPINT;
252 PL_regendp=(I32 *) SSPOPPTR;
253 PL_regstartp=(I32 *) SSPOPPTR;
256 /* Now restore the parentheses context. */
257 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
258 i > 0; i -= REGCP_PAREN_ELEMS) {
260 U32 paren = (U32)SSPOPINT;
261 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
262 PL_regstartp[paren] = SSPOPINT;
264 if (paren <= *PL_reglastparen)
265 PL_regendp[paren] = tmps;
267 PerlIO_printf(Perl_debug_log,
268 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
269 (UV)paren, (IV)PL_regstartp[paren],
270 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
271 (IV)PL_regendp[paren],
272 (paren > *PL_reglastparen ? "(no)" : ""));
276 if (*PL_reglastparen + 1 <= rex->nparens) {
277 PerlIO_printf(Perl_debug_log,
278 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
279 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
283 /* It would seem that the similar code in regtry()
284 * already takes care of this, and in fact it is in
285 * a better location to since this code can #if 0-ed out
286 * but the code in regtry() is needed or otherwise tests
287 * requiring null fields (pat.t#187 and split.t#{13,14}
288 * (as of patchlevel 7877) will fail. Then again,
289 * this code seems to be necessary or otherwise
290 * building DynaLoader will fail:
291 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
293 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
295 PL_regstartp[i] = -1;
302 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
305 * pregexec and friends
308 #ifndef PERL_IN_XSUB_RE
310 - pregexec - match a regexp against a string
313 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
314 char *strbeg, I32 minend, SV *screamer, U32 nosave)
315 /* strend: pointer to null at end of string */
316 /* strbeg: real beginning of string */
317 /* minend: end of match must be >=minend after stringarg. */
318 /* nosave: For optimizations. */
321 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
322 nosave ? 0 : REXEC_COPY_STR);
327 * Need to implement the following flags for reg_anch:
329 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
331 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
332 * INTUIT_AUTORITATIVE_ML
333 * INTUIT_ONCE_NOML - Intuit can match in one location only.
336 * Another flag for this function: SECOND_TIME (so that float substrs
337 * with giant delta may be not rechecked).
340 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
342 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
343 Otherwise, only SvCUR(sv) is used to get strbeg. */
345 /* XXXX We assume that strpos is strbeg unless sv. */
347 /* XXXX Some places assume that there is a fixed substring.
348 An update may be needed if optimizer marks as "INTUITable"
349 RExen without fixed substrings. Similarly, it is assumed that
350 lengths of all the strings are no more than minlen, thus they
351 cannot come from lookahead.
352 (Or minlen should take into account lookahead.)
353 NOTE: Some of this comment is not correct. minlen does now take account
354 of lookahead/behind. Further research is required. -- demerphq
358 /* A failure to find a constant substring means that there is no need to make
359 an expensive call to REx engine, thus we celebrate a failure. Similarly,
360 finding a substring too deep into the string means that less calls to
361 regtry() should be needed.
363 REx compiler's optimizer found 4 possible hints:
364 a) Anchored substring;
366 c) Whether we are anchored (beginning-of-line or \G);
367 d) First node (of those at offset 0) which may distingush positions;
368 We use a)b)d) and multiline-part of c), and try to find a position in the
369 string which does not contradict any of them.
372 /* Most of decisions we do here should have been done at compile time.
373 The nodes of the REx which we used for the search should have been
374 deleted from the finite automaton. */
377 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
378 char *strend, U32 flags, re_scream_pos_data *data)
381 register I32 start_shift = 0;
382 /* Should be nonnegative! */
383 register I32 end_shift = 0;
388 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
390 register char *other_last = NULL; /* other substr checked before this */
391 char *check_at = NULL; /* check substr found at this pos */
392 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
393 RXi_GET_DECL(prog,progi);
395 const char * const i_strpos = strpos;
398 GET_RE_DEBUG_FLAGS_DECL;
400 RX_MATCH_UTF8_set(prog,do_utf8);
402 if (prog->extflags & RXf_UTF8) {
403 PL_reg_flags |= RF_utf8;
406 debug_start_match(prog, do_utf8, strpos, strend,
407 sv ? "Guessing start of match in sv for"
408 : "Guessing start of match in string for");
411 /* CHR_DIST() would be more correct here but it makes things slow. */
412 if (prog->minlen > strend - strpos) {
413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
414 "String too short... [re_intuit_start]\n"));
418 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
421 if (!prog->check_utf8 && prog->check_substr)
422 to_utf8_substr(prog);
423 check = prog->check_utf8;
425 if (!prog->check_substr && prog->check_utf8)
426 to_byte_substr(prog);
427 check = prog->check_substr;
429 if (check == &PL_sv_undef) {
430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
431 "Non-utf8 string cannot match utf8 check string\n"));
434 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
435 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
436 || ( (prog->extflags & RXf_ANCH_BOL)
437 && !multiline ) ); /* Check after \n? */
440 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
441 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
442 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
444 && (strpos != strbeg)) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
448 if (prog->check_offset_min == prog->check_offset_max &&
449 !(prog->extflags & RXf_CANY_SEEN)) {
450 /* Substring at constant offset from beg-of-str... */
453 s = HOP3c(strpos, prog->check_offset_min, strend);
456 slen = SvCUR(check); /* >= 1 */
458 if ( strend - s > slen || strend - s < slen - 1
459 || (strend - s == slen && strend[-1] != '\n')) {
460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
463 /* Now should match s[0..slen-2] */
465 if (slen && (*SvPVX_const(check) != *s
467 && memNE(SvPVX_const(check), s, slen)))) {
469 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
473 else if (*SvPVX_const(check) != *s
474 || ((slen = SvCUR(check)) > 1
475 && memNE(SvPVX_const(check), s, slen)))
478 goto success_at_start;
481 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
484 end_shift = prog->check_end_shift;
487 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
488 - (SvTAIL(check) != 0);
489 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
491 if (end_shift < eshift)
495 else { /* Can match at random position */
498 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
499 end_shift = prog->check_end_shift;
501 /* end shift should be non negative here */
504 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
506 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
507 (IV)end_shift, prog->precomp);
511 /* Find a possible match in the region s..strend by looking for
512 the "check" substring in the region corrected by start/end_shift. */
515 I32 srch_start_shift = start_shift;
516 I32 srch_end_shift = end_shift;
517 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
518 srch_end_shift -= ((strbeg - s) - srch_start_shift);
519 srch_start_shift = strbeg - s;
521 DEBUG_OPTIMISE_MORE_r({
522 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
523 (IV)prog->check_offset_min,
524 (IV)srch_start_shift,
526 (IV)prog->check_end_shift);
529 if (flags & REXEC_SCREAM) {
530 I32 p = -1; /* Internal iterator of scream. */
531 I32 * const pp = data ? data->scream_pos : &p;
533 if (PL_screamfirst[BmRARE(check)] >= 0
534 || ( BmRARE(check) == '\n'
535 && (BmPREVIOUS(check) == SvCUR(check) - 1)
537 s = screaminstr(sv, check,
538 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
541 /* we may be pointing at the wrong string */
542 if (s && RX_MATCH_COPIED(prog))
543 s = strbeg + (s - SvPVX_const(sv));
545 *data->scream_olds = s;
550 if (prog->extflags & RXf_CANY_SEEN) {
551 start_point= (U8*)(s + srch_start_shift);
552 end_point= (U8*)(strend - srch_end_shift);
554 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
555 end_point= HOP3(strend, -srch_end_shift, strbeg);
557 DEBUG_OPTIMISE_MORE_r({
558 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
559 (int)(end_point - start_point),
560 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
564 s = fbm_instr( start_point, end_point,
565 check, multiline ? FBMrf_MULTILINE : 0);
568 /* Update the count-of-usability, remove useless subpatterns,
572 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
573 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
574 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
575 (s ? "Found" : "Did not find"),
576 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
577 ? "anchored" : "floating"),
580 (s ? " at offset " : "...\n") );
585 /* Finish the diagnostic message */
586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
588 /* XXX dmq: first branch is for positive lookbehind...
589 Our check string is offset from the beginning of the pattern.
590 So we need to do any stclass tests offset forward from that
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
602 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
608 : (prog->float_substr && prog->anchored_substr))
610 /* Take into account the "other" substring. */
611 /* XXXX May be hopelessly wrong for UTF... */
614 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
617 char * const last = HOP3c(s, -start_shift, strbeg);
619 char * const saved_s = s;
622 t = s - prog->check_offset_max;
623 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
625 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
630 t = HOP3c(t, prog->anchored_offset, strend);
631 if (t < other_last) /* These positions already checked */
633 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
636 /* XXXX It is not documented what units *_offsets are in.
637 We assume bytes, but this is clearly wrong.
638 Meaning this code needs to be carefully reviewed for errors.
642 /* On end-of-str: see comment below. */
643 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
644 if (must == &PL_sv_undef) {
646 DEBUG_r(must = prog->anchored_utf8); /* for debug */
651 HOP3(HOP3(last1, prog->anchored_offset, strend)
652 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
654 multiline ? FBMrf_MULTILINE : 0
657 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
658 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
659 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
660 (s ? "Found" : "Contradicts"),
661 quoted, RE_SV_TAIL(must));
666 if (last1 >= last2) {
667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
668 ", giving up...\n"));
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
672 ", trying floating at offset %ld...\n",
673 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
674 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
675 s = HOP3c(last, 1, strend);
679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
680 (long)(s - i_strpos)));
681 t = HOP3c(s, -prog->anchored_offset, strbeg);
682 other_last = HOP3c(s, 1, strend);
690 else { /* Take into account the floating substring. */
692 char * const saved_s = s;
695 t = HOP3c(s, -start_shift, strbeg);
697 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
698 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
699 last = HOP3c(t, prog->float_max_offset, strend);
700 s = HOP3c(t, prog->float_min_offset, strend);
703 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
704 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
705 /* fbm_instr() takes into account exact value of end-of-str
706 if the check is SvTAIL(ed). Since false positives are OK,
707 and end-of-str is not later than strend we are OK. */
708 if (must == &PL_sv_undef) {
710 DEBUG_r(must = prog->float_utf8); /* for debug message */
713 s = fbm_instr((unsigned char*)s,
714 (unsigned char*)last + SvCUR(must)
716 must, multiline ? FBMrf_MULTILINE : 0);
718 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
719 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
720 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
721 (s ? "Found" : "Contradicts"),
722 quoted, RE_SV_TAIL(must));
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
727 ", giving up...\n"));
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
731 ", trying anchored starting at offset %ld...\n",
732 (long)(saved_s + 1 - i_strpos)));
734 s = HOP3c(t, 1, strend);
738 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
739 (long)(s - i_strpos)));
740 other_last = s; /* Fix this later. --Hugo */
750 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
752 DEBUG_OPTIMISE_MORE_r(
753 PerlIO_printf(Perl_debug_log,
754 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
755 (IV)prog->check_offset_min,
756 (IV)prog->check_offset_max,
764 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
766 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
769 /* Fixed substring is found far enough so that the match
770 cannot start at strpos. */
772 if (ml_anch && t[-1] != '\n') {
773 /* Eventually fbm_*() should handle this, but often
774 anchored_offset is not 0, so this check will not be wasted. */
775 /* XXXX In the code below we prefer to look for "^" even in
776 presence of anchored substrings. And we search even
777 beyond the found float position. These pessimizations
778 are historical artefacts only. */
780 while (t < strend - prog->minlen) {
782 if (t < check_at - prog->check_offset_min) {
783 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
784 /* Since we moved from the found position,
785 we definitely contradict the found anchored
786 substr. Due to the above check we do not
787 contradict "check" substr.
788 Thus we can arrive here only if check substr
789 is float. Redo checking for "other"=="fixed".
792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
793 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
794 goto do_other_anchored;
796 /* We don't contradict the found floating substring. */
797 /* XXXX Why not check for STCLASS? */
799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
800 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
803 /* Position contradicts check-string */
804 /* XXXX probably better to look for check-string
805 than for "\n", so one should lower the limit for t? */
806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
807 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
808 other_last = strpos = s = t + 1;
813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
814 PL_colors[0], PL_colors[1]));
818 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
819 PL_colors[0], PL_colors[1]));
823 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
826 /* The found string does not prohibit matching at strpos,
827 - no optimization of calling REx engine can be performed,
828 unless it was an MBOL and we are not after MBOL,
829 or a future STCLASS check will fail this. */
831 /* Even in this situation we may use MBOL flag if strpos is offset
832 wrt the start of the string. */
833 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
834 && (strpos != strbeg) && strpos[-1] != '\n'
835 /* May be due to an implicit anchor of m{.*foo} */
836 && !(prog->intflags & PREGf_IMPLICIT))
841 DEBUG_EXECUTE_r( if (ml_anch)
842 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
843 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
846 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
848 prog->check_utf8 /* Could be deleted already */
849 && --BmUSEFUL(prog->check_utf8) < 0
850 && (prog->check_utf8 == prog->float_utf8)
852 prog->check_substr /* Could be deleted already */
853 && --BmUSEFUL(prog->check_substr) < 0
854 && (prog->check_substr == prog->float_substr)
857 /* If flags & SOMETHING - do not do it many times on the same match */
858 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
859 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
860 if (do_utf8 ? prog->check_substr : prog->check_utf8)
861 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
862 prog->check_substr = prog->check_utf8 = NULL; /* disable */
863 prog->float_substr = prog->float_utf8 = NULL; /* clear */
864 check = NULL; /* abort */
866 /* XXXX This is a remnant of the old implementation. It
867 looks wasteful, since now INTUIT can use many
869 prog->extflags &= ~RXf_USE_INTUIT;
876 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
877 /* trie stclasses are too expensive to use here, we are better off to
878 leave it to regmatch itself */
879 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
880 /* minlen == 0 is possible if regstclass is \b or \B,
881 and the fixed substr is ''$.
882 Since minlen is already taken into account, s+1 is before strend;
883 accidentally, minlen >= 1 guaranties no false positives at s + 1
884 even for \b or \B. But (minlen? 1 : 0) below assumes that
885 regstclass does not come from lookahead... */
886 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
887 This leaves EXACTF only, which is dealt with in find_byclass(). */
888 const U8* const str = (U8*)STRING(progi->regstclass);
889 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
890 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
893 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
894 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
895 else if (prog->float_substr || prog->float_utf8)
896 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
901 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
904 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
907 const char *what = NULL;
909 if (endpos == strend) {
910 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
911 "Could not match STCLASS...\n") );
914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
915 "This position contradicts STCLASS...\n") );
916 if ((prog->extflags & RXf_ANCH) && !ml_anch)
918 /* Contradict one of substrings */
919 if (prog->anchored_substr || prog->anchored_utf8) {
920 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
921 DEBUG_EXECUTE_r( what = "anchored" );
923 s = HOP3c(t, 1, strend);
924 if (s + start_shift + end_shift > strend) {
925 /* XXXX Should be taken into account earlier? */
926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
927 "Could not match STCLASS...\n") );
932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
933 "Looking for %s substr starting at offset %ld...\n",
934 what, (long)(s + start_shift - i_strpos)) );
937 /* Have both, check_string is floating */
938 if (t + start_shift >= check_at) /* Contradicts floating=check */
939 goto retry_floating_check;
940 /* Recheck anchored substring, but not floating... */
944 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
945 "Looking for anchored substr starting at offset %ld...\n",
946 (long)(other_last - i_strpos)) );
947 goto do_other_anchored;
949 /* Another way we could have checked stclass at the
950 current position only: */
955 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
956 "Looking for /%s^%s/m starting at offset %ld...\n",
957 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
960 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
962 /* Check is floating subtring. */
963 retry_floating_check:
964 t = check_at - start_shift;
965 DEBUG_EXECUTE_r( what = "floating" );
966 goto hop_and_restart;
969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
970 "By STCLASS: moving %ld --> %ld\n",
971 (long)(t - i_strpos), (long)(s - i_strpos))
975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
976 "Does not contradict STCLASS...\n");
981 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
982 PL_colors[4], (check ? "Guessed" : "Giving up"),
983 PL_colors[5], (long)(s - i_strpos)) );
986 fail_finish: /* Substring not found */
987 if (prog->check_substr || prog->check_utf8) /* could be removed already */
988 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
991 PL_colors[4], PL_colors[5]));
997 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
998 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
999 switch (trie_type) { \
1000 case trie_utf8_fold: \
1001 if ( foldlen>0 ) { \
1002 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1007 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1008 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1009 foldlen -= UNISKIP( uvc ); \
1010 uscan = foldbuf + UNISKIP( uvc ); \
1014 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1022 charid = trie->charmap[ uvc ]; \
1026 if (widecharmap) { \
1027 SV** const svpp = hv_fetch(widecharmap, \
1028 (char*)&uvc, sizeof(UV), 0); \
1030 charid = (U16)SvIV(*svpp); \
1035 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1038 ibcmp_utf8(s, NULL, 0, do_utf8, \
1039 m, NULL, ln, (bool)UTF)) \
1040 && (!reginfo || regtry(reginfo, &s)) ) \
1043 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1044 uvchr_to_utf8(tmpbuf, c); \
1045 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1047 && (f == c1 || f == c2) \
1048 && (ln == foldlen || \
1049 !ibcmp_utf8((char *) foldbuf, \
1050 NULL, foldlen, do_utf8, \
1052 NULL, ln, (bool)UTF)) \
1053 && (!reginfo || regtry(reginfo, &s)) ) \
1058 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1062 && (ln == 1 || !(OP(c) == EXACTF \
1064 : ibcmp_locale(s, m, ln))) \
1065 && (!reginfo || regtry(reginfo, &s)) ) \
1071 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1073 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1079 #define REXEC_FBC_SCAN(CoDe) \
1081 while (s < strend) { \
1087 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1088 REXEC_FBC_UTF8_SCAN( \
1090 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1099 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1102 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1111 #define REXEC_FBC_TRYIT \
1112 if ((!reginfo || regtry(reginfo, &s))) \
1115 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1118 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1121 REXEC_FBC_CLASS_SCAN(CoNd); \
1125 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1126 PL_reg_flags |= RF_tainted; \
1128 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1131 REXEC_FBC_CLASS_SCAN(CoNd); \
1135 #define DUMP_EXEC_POS(li,s,doutf8) \
1136 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1138 /* We know what class REx starts with. Try to find this position... */
1139 /* if reginfo is NULL, its a dryrun */
1140 /* annoyingly all the vars in this routine have different names from their counterparts
1141 in regmatch. /grrr */
1144 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1145 const char *strend, regmatch_info *reginfo)
1148 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1152 register STRLEN uskip;
1156 register I32 tmp = 1; /* Scratch variable? */
1157 register const bool do_utf8 = PL_reg_match_utf8;
1158 RXi_GET_DECL(prog,progi);
1160 /* We know what class it must start with. */
1164 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1165 !UTF8_IS_INVARIANT((U8)s[0]) ?
1166 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1167 REGINCLASS(prog, c, (U8*)s));
1170 while (s < strend) {
1173 if (REGINCLASS(prog, c, (U8*)s) ||
1174 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1175 /* The assignment of 2 is intentional:
1176 * for the folded sharp s, the skip is 2. */
1177 (skip = SHARP_S_SKIP))) {
1178 if (tmp && (!reginfo || regtry(reginfo, &s)))
1191 if (tmp && (!reginfo || regtry(reginfo, &s)))
1199 ln = STR_LEN(c); /* length to match in octets/bytes */
1200 lnc = (I32) ln; /* length to match in characters */
1202 STRLEN ulen1, ulen2;
1204 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1205 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1208 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1209 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1211 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1213 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1216 while (sm < ((U8 *) m + ln)) {
1231 c2 = PL_fold_locale[c1];
1233 e = HOP3c(strend, -((I32)lnc), s);
1235 if (!reginfo && e < s)
1236 e = s; /* Due to minlen logic of intuit() */
1238 /* The idea in the EXACTF* cases is to first find the
1239 * first character of the EXACTF* node and then, if
1240 * necessary, case-insensitively compare the full
1241 * text of the node. The c1 and c2 are the first
1242 * characters (though in Unicode it gets a bit
1243 * more complicated because there are more cases
1244 * than just upper and lower: one needs to use
1245 * the so-called folding case for case-insensitive
1246 * matching (called "loose matching" in Unicode).
1247 * ibcmp_utf8() will do just that. */
1251 U8 tmpbuf [UTF8_MAXBYTES+1];
1252 STRLEN len, foldlen;
1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1255 /* Upper and lower of 1st char are equal -
1256 * probably not a "letter". */
1258 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1260 REXEC_FBC_EXACTISH_CHECK(c == c1);
1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1268 /* Handle some of the three Greek sigmas cases.
1269 * Note that not all the possible combinations
1270 * are handled here: some of them are handled
1271 * by the standard folding rules, and some of
1272 * them (the character class or ANYOF cases)
1273 * are handled during compiletime in
1274 * regexec.c:S_regclass(). */
1275 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1276 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1277 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1279 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1285 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1287 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1291 PL_reg_flags |= RF_tainted;
1298 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1299 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1301 tmp = ((OP(c) == BOUND ?
1302 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1303 LOAD_UTF8_CHARCLASS_ALNUM();
1304 REXEC_FBC_UTF8_SCAN(
1305 if (tmp == !(OP(c) == BOUND ?
1306 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1307 isALNUM_LC_utf8((U8*)s)))
1315 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1316 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1319 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1325 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1329 PL_reg_flags |= RF_tainted;
1336 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1337 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1339 tmp = ((OP(c) == NBOUND ?
1340 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1341 LOAD_UTF8_CHARCLASS_ALNUM();
1342 REXEC_FBC_UTF8_SCAN(
1343 if (tmp == !(OP(c) == NBOUND ?
1344 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1345 isALNUM_LC_utf8((U8*)s)))
1347 else REXEC_FBC_TRYIT;
1351 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1352 tmp = ((OP(c) == NBOUND ?
1353 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1356 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1358 else REXEC_FBC_TRYIT;
1361 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1365 REXEC_FBC_CSCAN_PRELOAD(
1366 LOAD_UTF8_CHARCLASS_ALNUM(),
1367 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1371 REXEC_FBC_CSCAN_TAINT(
1372 isALNUM_LC_utf8((U8*)s),
1376 REXEC_FBC_CSCAN_PRELOAD(
1377 LOAD_UTF8_CHARCLASS_ALNUM(),
1378 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1382 REXEC_FBC_CSCAN_TAINT(
1383 !isALNUM_LC_utf8((U8*)s),
1387 REXEC_FBC_CSCAN_PRELOAD(
1388 LOAD_UTF8_CHARCLASS_SPACE(),
1389 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1393 REXEC_FBC_CSCAN_TAINT(
1394 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1398 REXEC_FBC_CSCAN_PRELOAD(
1399 LOAD_UTF8_CHARCLASS_SPACE(),
1400 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1404 REXEC_FBC_CSCAN_TAINT(
1405 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1409 REXEC_FBC_CSCAN_PRELOAD(
1410 LOAD_UTF8_CHARCLASS_DIGIT(),
1411 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1415 REXEC_FBC_CSCAN_TAINT(
1416 isDIGIT_LC_utf8((U8*)s),
1420 REXEC_FBC_CSCAN_PRELOAD(
1421 LOAD_UTF8_CHARCLASS_DIGIT(),
1422 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1426 REXEC_FBC_CSCAN_TAINT(
1427 !isDIGIT_LC_utf8((U8*)s),
1433 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1434 trie_type = do_utf8 ?
1435 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1437 /* what trie are we using right now */
1439 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1441 = (reg_trie_data*)progi->data->data[ aho->trie ];
1442 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1444 const char *last_start = strend - trie->minlen;
1446 const char *real_start = s;
1448 STRLEN maxlen = trie->maxlen;
1450 U8 **points; /* map of where we were in the input string
1451 when reading a given char. For ASCII this
1452 is unnecessary overhead as the relationship
1453 is always 1:1, but for unicode, especially
1454 case folded unicode this is not true. */
1455 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459 GET_RE_DEBUG_FLAGS_DECL;
1461 /* We can't just allocate points here. We need to wrap it in
1462 * an SV so it gets freed properly if there is a croak while
1463 * running the match */
1466 sv_points=newSV(maxlen * sizeof(U8 *));
1467 SvCUR_set(sv_points,
1468 maxlen * sizeof(U8 *));
1469 SvPOK_on(sv_points);
1470 sv_2mortal(sv_points);
1471 points=(U8**)SvPV_nolen(sv_points );
1472 if ( trie_type != trie_utf8_fold
1473 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1476 bitmap=(U8*)trie->bitmap;
1478 bitmap=(U8*)ANYOF_BITMAP(c);
1480 /* this is the Aho-Corasick algorithm modified a touch
1481 to include special handling for long "unknown char"
1482 sequences. The basic idea being that we use AC as long
1483 as we are dealing with a possible matching char, when
1484 we encounter an unknown char (and we have not encountered
1485 an accepting state) we scan forward until we find a legal
1487 AC matching is basically that of trie matching, except
1488 that when we encounter a failing transition, we fall back
1489 to the current states "fail state", and try the current char
1490 again, a process we repeat until we reach the root state,
1491 state 1, or a legal transition. If we fail on the root state
1492 then we can either terminate if we have reached an accepting
1493 state previously, or restart the entire process from the beginning
1497 while (s <= last_start) {
1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1506 U8 *uscan = (U8*)NULL;
1507 U8 *leftmost = NULL;
1509 U32 accepted_word= 0;
1513 while ( state && uc <= (U8*)strend ) {
1515 U32 word = aho->states[ state ].wordnum;
1519 DEBUG_TRIE_EXECUTE_r(
1520 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 (char *)uc, do_utf8 );
1523 PerlIO_printf( Perl_debug_log,
1524 " Scanning for legal start char...\n");
1527 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1532 if (uc >(U8*)last_start) break;
1536 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1537 if (!leftmost || lpos < leftmost) {
1538 DEBUG_r(accepted_word=word);
1544 points[pointpos++ % maxlen]= uc;
1545 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1546 uscan, len, uvc, charid, foldlen,
1548 DEBUG_TRIE_EXECUTE_r({
1549 dump_exec_pos( (char *)uc, c, strend, real_start,
1551 PerlIO_printf(Perl_debug_log,
1552 " Charid:%3u CP:%4"UVxf" ",
1558 word = aho->states[ state ].wordnum;
1560 base = aho->states[ state ].trans.base;
1562 DEBUG_TRIE_EXECUTE_r({
1564 dump_exec_pos( (char *)uc, c, strend, real_start,
1566 PerlIO_printf( Perl_debug_log,
1567 "%sState: %4"UVxf", word=%"UVxf,
1568 failed ? " Fail transition to " : "",
1569 (UV)state, (UV)word);
1574 (base + charid > trie->uniquecharcount )
1575 && (base + charid - 1 - trie->uniquecharcount
1577 && trie->trans[base + charid - 1 -
1578 trie->uniquecharcount].check == state
1579 && (tmp=trie->trans[base + charid - 1 -
1580 trie->uniquecharcount ].next))
1582 DEBUG_TRIE_EXECUTE_r(
1583 PerlIO_printf( Perl_debug_log," - legal\n"));
1588 DEBUG_TRIE_EXECUTE_r(
1589 PerlIO_printf( Perl_debug_log," - fail\n"));
1591 state = aho->fail[state];
1595 /* we must be accepting here */
1596 DEBUG_TRIE_EXECUTE_r(
1597 PerlIO_printf( Perl_debug_log," - accepting\n"));
1606 if (!state) state = 1;
1609 if ( aho->states[ state ].wordnum ) {
1610 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1611 if (!leftmost || lpos < leftmost) {
1612 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1617 s = (char*)leftmost;
1618 DEBUG_TRIE_EXECUTE_r({
1620 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1621 (UV)accepted_word, (IV)(s - real_start)
1624 if (!reginfo || regtry(reginfo, &s)) {
1630 DEBUG_TRIE_EXECUTE_r({
1631 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1634 DEBUG_TRIE_EXECUTE_r(
1635 PerlIO_printf( Perl_debug_log,"No match.\n"));
1644 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1653 S_swap_match_buff (pTHX_ regexp *prog) {
1657 /* We have to be careful. If the previous successful match
1658 was from this regex we don't want a subsequent paritally
1659 successful match to clobber the old results.
1660 So when we detect this possibility we add a swap buffer
1661 to the re, and switch the buffer each match. If we fail
1662 we switch it back, otherwise we leave it swapped.
1664 Newxz(prog->swap, 1, regexp_paren_ofs);
1665 /* no need to copy these */
1666 Newxz(prog->swap->startp, 2 * (prog->nparens + 1), I32);
1667 prog->swap->endp = prog->swap->startp + prog->nparens + 1;
1669 t = prog->swap->startp;
1670 prog->swap->startp = prog->startp;
1672 t = prog->swap->endp;
1673 prog->swap->endp = prog->endp;
1679 - regexec_flags - match a regexp against a string
1682 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1683 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1684 /* strend: pointer to null at end of string */
1685 /* strbeg: real beginning of string */
1686 /* minend: end of match must be >=minend after stringarg. */
1687 /* data: May be used for some additional optimizations.
1688 Currently its only used, with a U32 cast, for transmitting
1689 the ganch offset when doing a /g match. This will change */
1690 /* nosave: For optimizations. */
1693 /*register*/ char *s;
1694 register regnode *c;
1695 /*register*/ char *startpos = stringarg;
1696 I32 minlen; /* must match at least this many chars */
1697 I32 dontbother = 0; /* how many characters not to try at end */
1698 I32 end_shift = 0; /* Same for the end. */ /* CC */
1699 I32 scream_pos = -1; /* Internal iterator of scream. */
1700 char *scream_olds = NULL;
1701 SV* const oreplsv = GvSV(PL_replgv);
1702 const bool do_utf8 = (bool)DO_UTF8(sv);
1704 RXi_GET_DECL(prog,progi);
1705 regmatch_info reginfo; /* create some info to pass to regtry etc */
1706 bool swap_on_fail = 0;
1708 GET_RE_DEBUG_FLAGS_DECL;
1710 PERL_UNUSED_ARG(data);
1712 /* Be paranoid... */
1713 if (prog == NULL || startpos == NULL) {
1714 Perl_croak(aTHX_ "NULL regexp parameter");
1718 multiline = prog->extflags & RXf_PMf_MULTILINE;
1719 reginfo.prog = prog;
1721 RX_MATCH_UTF8_set(prog, do_utf8);
1723 debug_start_match(prog, do_utf8, startpos, strend,
1727 minlen = prog->minlen;
1729 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1731 "String too short [regexec_flags]...\n"));
1736 /* Check validity of program. */
1737 if (UCHARAT(progi->program) != REG_MAGIC) {
1738 Perl_croak(aTHX_ "corrupted regexp program");
1742 PL_reg_eval_set = 0;
1745 if (prog->extflags & RXf_UTF8)
1746 PL_reg_flags |= RF_utf8;
1748 /* Mark beginning of line for ^ and lookbehind. */
1749 reginfo.bol = startpos; /* XXX not used ??? */
1753 /* Mark end of line for $ (and such) */
1756 /* see how far we have to get to not match where we matched before */
1757 reginfo.till = startpos+minend;
1759 /* If there is a "must appear" string, look for it. */
1762 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1765 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1766 reginfo.ganch = startpos + prog->gofs;
1767 else if (sv && SvTYPE(sv) >= SVt_PVMG
1769 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1770 && mg->mg_len >= 0) {
1771 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1772 if (prog->extflags & RXf_ANCH_GPOS) {
1773 if (s > reginfo.ganch)
1775 s = reginfo.ganch - prog->gofs;
1779 reginfo.ganch = strbeg + PTR2UV(data);
1780 } else /* pos() not defined */
1781 reginfo.ganch = strbeg;
1783 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1785 swap_match_buff(prog); /* do we need a save destructor here for
1788 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1789 re_scream_pos_data d;
1791 d.scream_olds = &scream_olds;
1792 d.scream_pos = &scream_pos;
1793 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1796 goto phooey; /* not present */
1802 /* Simplest case: anchored match need be tried only once. */
1803 /* [unless only anchor is BOL and multiline is set] */
1804 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1805 if (s == startpos && regtry(®info, &startpos))
1807 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1808 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1813 dontbother = minlen - 1;
1814 end = HOP3c(strend, -dontbother, strbeg) - 1;
1815 /* for multiline we only have to try after newlines */
1816 if (prog->check_substr || prog->check_utf8) {
1820 if (regtry(®info, &s))
1825 if (prog->extflags & RXf_USE_INTUIT) {
1826 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1837 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1838 if (regtry(®info, &s))
1845 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1847 /* the warning about reginfo.ganch being used without intialization
1848 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1849 and we only enter this block when the same bit is set. */
1850 char *tmp_s = reginfo.ganch - prog->gofs;
1851 if (regtry(®info, &tmp_s))
1856 /* Messy cases: unanchored match. */
1857 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1858 /* we have /x+whatever/ */
1859 /* it must be a one character string (XXXX Except UTF?) */
1864 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1865 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1866 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1871 DEBUG_EXECUTE_r( did_match = 1 );
1872 if (regtry(®info, &s)) goto got_it;
1874 while (s < strend && *s == ch)
1882 DEBUG_EXECUTE_r( did_match = 1 );
1883 if (regtry(®info, &s)) goto got_it;
1885 while (s < strend && *s == ch)
1890 DEBUG_EXECUTE_r(if (!did_match)
1891 PerlIO_printf(Perl_debug_log,
1892 "Did not find anchored character...\n")
1895 else if (prog->anchored_substr != NULL
1896 || prog->anchored_utf8 != NULL
1897 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1898 && prog->float_max_offset < strend - s)) {
1903 char *last1; /* Last position checked before */
1907 if (prog->anchored_substr || prog->anchored_utf8) {
1908 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1909 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1910 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1911 back_max = back_min = prog->anchored_offset;
1913 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1914 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1915 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1916 back_max = prog->float_max_offset;
1917 back_min = prog->float_min_offset;
1921 if (must == &PL_sv_undef)
1922 /* could not downgrade utf8 check substring, so must fail */
1928 last = HOP3c(strend, /* Cannot start after this */
1929 -(I32)(CHR_SVLEN(must)
1930 - (SvTAIL(must) != 0) + back_min), strbeg);
1933 last1 = HOPc(s, -1);
1935 last1 = s - 1; /* bogus */
1937 /* XXXX check_substr already used to find "s", can optimize if
1938 check_substr==must. */
1940 dontbother = end_shift;
1941 strend = HOPc(strend, -dontbother);
1942 while ( (s <= last) &&
1943 ((flags & REXEC_SCREAM)
1944 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1945 end_shift, &scream_pos, 0))
1946 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1947 (unsigned char*)strend, must,
1948 multiline ? FBMrf_MULTILINE : 0))) ) {
1949 /* we may be pointing at the wrong string */
1950 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1951 s = strbeg + (s - SvPVX_const(sv));
1952 DEBUG_EXECUTE_r( did_match = 1 );
1953 if (HOPc(s, -back_max) > last1) {
1954 last1 = HOPc(s, -back_min);
1955 s = HOPc(s, -back_max);
1958 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1960 last1 = HOPc(s, -back_min);
1964 while (s <= last1) {
1965 if (regtry(®info, &s))
1971 while (s <= last1) {
1972 if (regtry(®info, &s))
1978 DEBUG_EXECUTE_r(if (!did_match) {
1979 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1980 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1981 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1982 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1983 ? "anchored" : "floating"),
1984 quoted, RE_SV_TAIL(must));
1988 else if ( (c = progi->regstclass) ) {
1990 const OPCODE op = OP(progi->regstclass);
1991 /* don't bother with what can't match */
1992 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1993 strend = HOPc(strend, -(minlen - 1));
1996 SV * const prop = sv_newmortal();
1997 regprop(prog, prop, c);
1999 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2001 PerlIO_printf(Perl_debug_log,
2002 "Matching stclass %.*s against %s (%d chars)\n",
2003 (int)SvCUR(prop), SvPVX_const(prop),
2004 quoted, (int)(strend - s));
2007 if (find_byclass(prog, c, s, strend, ®info))
2009 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2013 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2018 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2019 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2020 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2022 if (flags & REXEC_SCREAM) {
2023 last = screaminstr(sv, float_real, s - strbeg,
2024 end_shift, &scream_pos, 1); /* last one */
2026 last = scream_olds; /* Only one occurrence. */
2027 /* we may be pointing at the wrong string */
2028 else if (RX_MATCH_COPIED(prog))
2029 s = strbeg + (s - SvPVX_const(sv));
2033 const char * const little = SvPV_const(float_real, len);
2035 if (SvTAIL(float_real)) {
2036 if (memEQ(strend - len + 1, little, len - 1))
2037 last = strend - len + 1;
2038 else if (!multiline)
2039 last = memEQ(strend - len, little, len)
2040 ? strend - len : NULL;
2046 last = rninstr(s, strend, little, little + len);
2048 last = strend; /* matching "$" */
2053 PerlIO_printf(Perl_debug_log,
2054 "%sCan't trim the tail, match fails (should not happen)%s\n",
2055 PL_colors[4], PL_colors[5]));
2056 goto phooey; /* Should not happen! */
2058 dontbother = strend - last + prog->float_min_offset;
2060 if (minlen && (dontbother < minlen))
2061 dontbother = minlen - 1;
2062 strend -= dontbother; /* this one's always in bytes! */
2063 /* We don't know much -- general case. */
2066 if (regtry(®info, &s))
2075 if (regtry(®info, &s))
2077 } while (s++ < strend);
2085 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2087 if (PL_reg_eval_set) {
2088 /* Preserve the current value of $^R */
2089 if (oreplsv != GvSV(PL_replgv))
2090 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2091 restored, the value remains
2093 restore_pos(aTHX_ prog);
2095 if (prog->paren_names)
2096 (void)hv_iterinit(prog->paren_names);
2098 /* make sure $`, $&, $', and $digit will work later */
2099 if ( !(flags & REXEC_NOT_FIRST) ) {
2100 RX_MATCH_COPY_FREE(prog);
2101 if (flags & REXEC_COPY_STR) {
2102 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2103 #ifdef PERL_OLD_COPY_ON_WRITE
2105 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2107 PerlIO_printf(Perl_debug_log,
2108 "Copy on write: regexp capture, type %d\n",
2111 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2112 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2113 assert (SvPOKp(prog->saved_copy));
2117 RX_MATCH_COPIED_on(prog);
2118 s = savepvn(strbeg, i);
2124 prog->subbeg = strbeg;
2125 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2132 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2133 PL_colors[4], PL_colors[5]));
2134 if (PL_reg_eval_set)
2135 restore_pos(aTHX_ prog);
2137 /* we failed :-( roll it back */
2138 swap_match_buff(prog);
2145 - regtry - try match at specific point
2147 STATIC I32 /* 0 failure, 1 success */
2148 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2152 regexp *prog = reginfo->prog;
2153 RXi_GET_DECL(prog,progi);
2154 GET_RE_DEBUG_FLAGS_DECL;
2155 reginfo->cutpoint=NULL;
2157 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2160 PL_reg_eval_set = RS_init;
2161 DEBUG_EXECUTE_r(DEBUG_s(
2162 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2163 (IV)(PL_stack_sp - PL_stack_base));
2166 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2167 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2169 /* Apparently this is not needed, judging by wantarray. */
2170 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2171 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2174 /* Make $_ available to executed code. */
2175 if (reginfo->sv != DEFSV) {
2177 DEFSV = reginfo->sv;
2180 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2181 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2182 /* prepare for quick setting of pos */
2183 #ifdef PERL_OLD_COPY_ON_WRITE
2184 if (SvIsCOW(reginfo->sv))
2185 sv_force_normal_flags(reginfo->sv, 0);
2187 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2188 &PL_vtbl_mglob, NULL, 0);
2192 PL_reg_oldpos = mg->mg_len;
2193 SAVEDESTRUCTOR_X(restore_pos, prog);
2195 if (!PL_reg_curpm) {
2196 Newxz(PL_reg_curpm, 1, PMOP);
2199 SV* const repointer = newSViv(0);
2200 /* so we know which PL_regex_padav element is PL_reg_curpm */
2201 SvFLAGS(repointer) |= SVf_BREAK;
2202 av_push(PL_regex_padav,repointer);
2203 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2204 PL_regex_pad = AvARRAY(PL_regex_padav);
2208 PM_SETRE(PL_reg_curpm, prog);
2209 PL_reg_oldcurpm = PL_curpm;
2210 PL_curpm = PL_reg_curpm;
2211 if (RX_MATCH_COPIED(prog)) {
2212 /* Here is a serious problem: we cannot rewrite subbeg,
2213 since it may be needed if this match fails. Thus
2214 $` inside (?{}) could fail... */
2215 PL_reg_oldsaved = prog->subbeg;
2216 PL_reg_oldsavedlen = prog->sublen;
2217 #ifdef PERL_OLD_COPY_ON_WRITE
2218 PL_nrs = prog->saved_copy;
2220 RX_MATCH_COPIED_off(prog);
2223 PL_reg_oldsaved = NULL;
2224 prog->subbeg = PL_bostr;
2225 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2227 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2228 prog->startp[0] = *startpos - PL_bostr;
2229 PL_reginput = *startpos;
2230 PL_reglastparen = &prog->lastparen;
2231 PL_reglastcloseparen = &prog->lastcloseparen;
2232 prog->lastparen = 0;
2233 prog->lastcloseparen = 0;
2235 PL_regstartp = prog->startp;
2236 PL_regendp = prog->endp;
2237 if (PL_reg_start_tmpl <= prog->nparens) {
2238 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2239 if(PL_reg_start_tmp)
2240 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2242 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2245 /* XXXX What this code is doing here?!!! There should be no need
2246 to do this again and again, PL_reglastparen should take care of
2249 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2250 * Actually, the code in regcppop() (which Ilya may be meaning by
2251 * PL_reglastparen), is not needed at all by the test suite
2252 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2253 * enough, for building DynaLoader, or otherwise this
2254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2255 * will happen. Meanwhile, this code *is* needed for the
2256 * above-mentioned test suite tests to succeed. The common theme
2257 * on those tests seems to be returning null fields from matches.
2260 if (prog->nparens) {
2261 I32 *sp = PL_regstartp;
2262 I32 *ep = PL_regendp;
2264 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2271 if (regmatch(reginfo, progi->program + 1)) {
2272 PL_regendp[0] = PL_reginput - PL_bostr;
2275 if (reginfo->cutpoint)
2276 *startpos= reginfo->cutpoint;
2277 REGCP_UNWIND(lastcp);
2282 #define sayYES goto yes
2283 #define sayNO goto no
2284 #define sayNO_SILENT goto no_silent
2286 /* we dont use STMT_START/END here because it leads to
2287 "unreachable code" warnings, which are bogus, but distracting. */
2288 #define CACHEsayNO \
2289 if (ST.cache_mask) \
2290 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2293 /* this is used to determine how far from the left messages like
2294 'failed...' are printed. It should be set such that messages
2295 are inline with the regop output that created them.
2297 #define REPORT_CODE_OFF 32
2300 /* Make sure there is a test for this +1 options in re_tests */
2301 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2303 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2304 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2306 #define SLAB_FIRST(s) (&(s)->states[0])
2307 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2309 /* grab a new slab and return the first slot in it */
2311 STATIC regmatch_state *
2314 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2317 regmatch_slab *s = PL_regmatch_slab->next;
2319 Newx(s, 1, regmatch_slab);
2320 s->prev = PL_regmatch_slab;
2322 PL_regmatch_slab->next = s;
2324 PL_regmatch_slab = s;
2325 return SLAB_FIRST(s);
2329 /* push a new state then goto it */
2331 #define PUSH_STATE_GOTO(state, node) \
2333 st->resume_state = state; \
2336 /* push a new state with success backtracking, then goto it */
2338 #define PUSH_YES_STATE_GOTO(state, node) \
2340 st->resume_state = state; \
2341 goto push_yes_state;
2347 regmatch() - main matching routine
2349 This is basically one big switch statement in a loop. We execute an op,
2350 set 'next' to point the next op, and continue. If we come to a point which
2351 we may need to backtrack to on failure such as (A|B|C), we push a
2352 backtrack state onto the backtrack stack. On failure, we pop the top
2353 state, and re-enter the loop at the state indicated. If there are no more
2354 states to pop, we return failure.
2356 Sometimes we also need to backtrack on success; for example /A+/, where
2357 after successfully matching one A, we need to go back and try to
2358 match another one; similarly for lookahead assertions: if the assertion
2359 completes successfully, we backtrack to the state just before the assertion
2360 and then carry on. In these cases, the pushed state is marked as
2361 'backtrack on success too'. This marking is in fact done by a chain of
2362 pointers, each pointing to the previous 'yes' state. On success, we pop to
2363 the nearest yes state, discarding any intermediate failure-only states.
2364 Sometimes a yes state is pushed just to force some cleanup code to be
2365 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2366 it to free the inner regex.
2368 Note that failure backtracking rewinds the cursor position, while
2369 success backtracking leaves it alone.
2371 A pattern is complete when the END op is executed, while a subpattern
2372 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2373 ops trigger the "pop to last yes state if any, otherwise return true"
2376 A common convention in this function is to use A and B to refer to the two
2377 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2378 the subpattern to be matched possibly multiple times, while B is the entire
2379 rest of the pattern. Variable and state names reflect this convention.
2381 The states in the main switch are the union of ops and failure/success of
2382 substates associated with with that op. For example, IFMATCH is the op
2383 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2384 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2385 successfully matched A and IFMATCH_A_fail is a state saying that we have
2386 just failed to match A. Resume states always come in pairs. The backtrack
2387 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2388 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2389 on success or failure.
2391 The struct that holds a backtracking state is actually a big union, with
2392 one variant for each major type of op. The variable st points to the
2393 top-most backtrack struct. To make the code clearer, within each
2394 block of code we #define ST to alias the relevant union.
2396 Here's a concrete example of a (vastly oversimplified) IFMATCH
2402 #define ST st->u.ifmatch
2404 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2405 ST.foo = ...; // some state we wish to save
2407 // push a yes backtrack state with a resume value of
2408 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2410 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2413 case IFMATCH_A: // we have successfully executed A; now continue with B
2415 bar = ST.foo; // do something with the preserved value
2418 case IFMATCH_A_fail: // A failed, so the assertion failed
2419 ...; // do some housekeeping, then ...
2420 sayNO; // propagate the failure
2427 For any old-timers reading this who are familiar with the old recursive
2428 approach, the code above is equivalent to:
2430 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2439 ...; // do some housekeeping, then ...
2440 sayNO; // propagate the failure
2443 The topmost backtrack state, pointed to by st, is usually free. If you
2444 want to claim it, populate any ST.foo fields in it with values you wish to
2445 save, then do one of
2447 PUSH_STATE_GOTO(resume_state, node);
2448 PUSH_YES_STATE_GOTO(resume_state, node);
2450 which sets that backtrack state's resume value to 'resume_state', pushes a
2451 new free entry to the top of the backtrack stack, then goes to 'node'.
2452 On backtracking, the free slot is popped, and the saved state becomes the
2453 new free state. An ST.foo field in this new top state can be temporarily
2454 accessed to retrieve values, but once the main loop is re-entered, it
2455 becomes available for reuse.
2457 Note that the depth of the backtrack stack constantly increases during the
2458 left-to-right execution of the pattern, rather than going up and down with
2459 the pattern nesting. For example the stack is at its maximum at Z at the
2460 end of the pattern, rather than at X in the following:
2462 /(((X)+)+)+....(Y)+....Z/
2464 The only exceptions to this are lookahead/behind assertions and the cut,
2465 (?>A), which pop all the backtrack states associated with A before
2468 Bascktrack state structs are allocated in slabs of about 4K in size.
2469 PL_regmatch_state and st always point to the currently active state,
2470 and PL_regmatch_slab points to the slab currently containing
2471 PL_regmatch_state. The first time regmatch() is called, the first slab is
2472 allocated, and is never freed until interpreter destruction. When the slab
2473 is full, a new one is allocated and chained to the end. At exit from
2474 regmatch(), slabs allocated since entry are freed.
2479 #define DEBUG_STATE_pp(pp) \
2481 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2482 PerlIO_printf(Perl_debug_log, \
2483 " %*s"pp" %s%s%s%s%s\n", \
2485 PL_reg_name[st->resume_state], \
2486 ((st==yes_state||st==mark_state) ? "[" : ""), \
2487 ((st==yes_state) ? "Y" : ""), \
2488 ((st==mark_state) ? "M" : ""), \
2489 ((st==yes_state||st==mark_state) ? "]" : "") \
2494 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2499 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2500 const char *start, const char *end, const char *blurb)
2502 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2506 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2507 prog->precomp, prog->prelen, 60);
2509 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2510 start, end - start, 60);
2512 PerlIO_printf(Perl_debug_log,
2513 "%s%s REx%s %s against %s\n",
2514 PL_colors[4], blurb, PL_colors[5], s0, s1);
2516 if (do_utf8||utf8_pat)
2517 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2518 utf8_pat ? "pattern" : "",
2519 utf8_pat && do_utf8 ? " and " : "",
2520 do_utf8 ? "string" : ""
2526 S_dump_exec_pos(pTHX_ const char *locinput,
2527 const regnode *scan,
2528 const char *loc_regeol,
2529 const char *loc_bostr,
2530 const char *loc_reg_starttry,
2533 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2534 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2535 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2536 /* The part of the string before starttry has one color
2537 (pref0_len chars), between starttry and current
2538 position another one (pref_len - pref0_len chars),
2539 after the current position the third one.
2540 We assume that pref0_len <= pref_len, otherwise we
2541 decrease pref0_len. */
2542 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2543 ? (5 + taill) - l : locinput - loc_bostr;
2546 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2548 pref0_len = pref_len - (locinput - loc_reg_starttry);
2549 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2550 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2551 ? (5 + taill) - pref_len : loc_regeol - locinput);
2552 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2556 if (pref0_len > pref_len)
2557 pref0_len = pref_len;
2559 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2561 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2562 (locinput - pref_len),pref0_len, 60, 4, 5);
2564 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2565 (locinput - pref_len + pref0_len),
2566 pref_len - pref0_len, 60, 2, 3);
2568 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2569 locinput, loc_regeol - locinput, 10, 0, 1);
2571 const STRLEN tlen=len0+len1+len2;
2572 PerlIO_printf(Perl_debug_log,
2573 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2574 (IV)(locinput - loc_bostr),
2577 (docolor ? "" : "> <"),
2579 (int)(tlen > 19 ? 0 : 19 - tlen),
2586 /* reg_check_named_buff_matched()
2587 * Checks to see if a named buffer has matched. The data array of
2588 * buffer numbers corresponding to the buffer is expected to reside
2589 * in the regexp->data->data array in the slot stored in the ARG() of
2590 * node involved. Note that this routine doesn't actually care about the
2591 * name, that information is not preserved from compilation to execution.
2592 * Returns the index of the leftmost defined buffer with the given name
2593 * or 0 if non of the buffers matched.
2596 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2598 RXi_GET_DECL(rex,rexi);
2599 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2600 I32 *nums=(I32*)SvPVX(sv_dat);
2601 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2602 if ((I32)*PL_reglastparen >= nums[n] &&
2603 PL_regendp[nums[n]] != -1)
2611 #define SETREX(Re1,Re2) \
2612 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2615 STATIC I32 /* 0 failure, 1 success */
2616 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2618 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2622 register const bool do_utf8 = PL_reg_match_utf8;
2623 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2625 regexp *rex = reginfo->prog;
2626 RXi_GET_DECL(rex,rexi);
2628 regmatch_slab *orig_slab;
2629 regmatch_state *orig_state;
2631 /* the current state. This is a cached copy of PL_regmatch_state */
2632 register regmatch_state *st;
2634 /* cache heavy used fields of st in registers */
2635 register regnode *scan;
2636 register regnode *next;
2637 register U32 n = 0; /* general value; init to avoid compiler warning */
2638 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2639 register char *locinput = PL_reginput;
2640 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2642 bool result = 0; /* return value of S_regmatch */
2643 int depth = 0; /* depth of backtrack stack */
2644 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2645 const U32 max_nochange_depth =
2646 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2647 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2649 regmatch_state *yes_state = NULL; /* state to pop to on success of
2651 /* mark_state piggy backs on the yes_state logic so that when we unwind
2652 the stack on success we can update the mark_state as we go */
2653 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2655 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2656 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2658 bool no_final = 0; /* prevent failure from backtracking? */
2659 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2660 char *startpoint = PL_reginput;
2661 SV *popmark = NULL; /* are we looking for a mark? */
2662 SV *sv_commit = NULL; /* last mark name seen in failure */
2663 SV *sv_yes_mark = NULL; /* last mark name we have seen
2664 during a successfull match */
2665 U32 lastopen = 0; /* last open we saw */
2666 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2669 /* these three flags are set by various ops to signal information to
2670 * the very next op. They have a useful lifetime of exactly one loop
2671 * iteration, and are not preserved or restored by state pushes/pops
2673 bool sw = 0; /* the condition value in (?(cond)a|b) */
2674 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2675 int logical = 0; /* the following EVAL is:
2679 or the following IFMATCH/UNLESSM is:
2680 false: plain (?=foo)
2681 true: used as a condition: (?(?=foo))
2685 GET_RE_DEBUG_FLAGS_DECL;
2688 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2689 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2691 /* on first ever call to regmatch, allocate first slab */
2692 if (!PL_regmatch_slab) {
2693 Newx(PL_regmatch_slab, 1, regmatch_slab);
2694 PL_regmatch_slab->prev = NULL;
2695 PL_regmatch_slab->next = NULL;
2696 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2699 /* remember current high-water mark for exit */
2700 /* XXX this should be done with SAVE* instead */
2701 orig_slab = PL_regmatch_slab;
2702 orig_state = PL_regmatch_state;
2704 /* grab next free state slot */
2705 st = ++PL_regmatch_state;
2706 if (st > SLAB_LAST(PL_regmatch_slab))
2707 st = PL_regmatch_state = S_push_slab(aTHX);
2709 /* Note that nextchr is a byte even in UTF */
2710 nextchr = UCHARAT(locinput);
2712 while (scan != NULL) {
2715 SV * const prop = sv_newmortal();
2716 regnode *rnext=regnext(scan);
2717 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2718 regprop(rex, prop, scan);
2720 PerlIO_printf(Perl_debug_log,
2721 "%3"IVdf":%*s%s(%"IVdf")\n",
2722 (IV)(scan - rexi->program), depth*2, "",
2724 (PL_regkind[OP(scan)] == END || !rnext) ?
2725 0 : (IV)(rnext - rexi->program));
2728 next = scan + NEXT_OFF(scan);
2731 state_num = OP(scan);
2734 switch (state_num) {
2736 if (locinput == PL_bostr)
2738 /* reginfo->till = reginfo->bol; */
2743 if (locinput == PL_bostr ||
2744 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2750 if (locinput == PL_bostr)
2754 if (locinput == reginfo->ganch)
2759 /* update the startpoint */
2760 st->u.keeper.val = PL_regstartp[0];
2761 PL_reginput = locinput;
2762 PL_regstartp[0] = locinput - PL_bostr;
2763 PUSH_STATE_GOTO(KEEPS_next, next);
2765 case KEEPS_next_fail:
2766 /* rollback the start point change */
2767 PL_regstartp[0] = st->u.keeper.val;
2773 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2778 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2780 if (PL_regeol - locinput > 1)
2784 if (PL_regeol != locinput)
2788 if (!nextchr && locinput >= PL_regeol)
2791 locinput += PL_utf8skip[nextchr];
2792 if (locinput > PL_regeol)
2794 nextchr = UCHARAT(locinput);
2797 nextchr = UCHARAT(++locinput);
2800 if (!nextchr && locinput >= PL_regeol)
2802 nextchr = UCHARAT(++locinput);
2805 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2808 locinput += PL_utf8skip[nextchr];
2809 if (locinput > PL_regeol)
2811 nextchr = UCHARAT(locinput);
2814 nextchr = UCHARAT(++locinput);
2818 #define ST st->u.trie
2820 /* In this case the charclass data is available inline so
2821 we can fail fast without a lot of extra overhead.
2823 if (scan->flags == EXACT || !do_utf8) {
2824 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2826 PerlIO_printf(Perl_debug_log,
2827 "%*s %sfailed to match trie start class...%s\n",
2828 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2837 /* what type of TRIE am I? (utf8 makes this contextual) */
2838 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2839 trie_type = do_utf8 ?
2840 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2843 /* what trie are we using right now */
2844 reg_trie_data * const trie
2845 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2846 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2847 U32 state = trie->startstate;
2849 if (trie->bitmap && trie_type != trie_utf8_fold &&
2850 !TRIE_BITMAP_TEST(trie,*locinput)
2852 if (trie->states[ state ].wordnum) {
2854 PerlIO_printf(Perl_debug_log,
2855 "%*s %smatched empty string...%s\n",
2856 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2861 PerlIO_printf(Perl_debug_log,
2862 "%*s %sfailed to match trie start class...%s\n",
2863 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2870 U8 *uc = ( U8* )locinput;
2874 U8 *uscan = (U8*)NULL;
2876 SV *sv_accept_buff = NULL;
2877 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2879 ST.accepted = 0; /* how many accepting states we have seen */
2881 ST.jump = trie->jump;
2884 traverse the TRIE keeping track of all accepting states
2885 we transition through until we get to a failing node.
2888 while ( state && uc <= (U8*)PL_regeol ) {
2889 U32 base = trie->states[ state ].trans.base;
2892 /* We use charid to hold the wordnum as we don't use it
2893 for charid until after we have done the wordnum logic.
2894 We define an alias just so that the wordnum logic reads
2897 #define got_wordnum charid
2898 got_wordnum = trie->states[ state ].wordnum;
2900 if ( got_wordnum ) {
2901 if ( ! ST.accepted ) {
2904 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2905 sv_accept_buff=newSV(bufflen *
2906 sizeof(reg_trie_accepted) - 1);
2907 SvCUR_set(sv_accept_buff, 0);
2908 SvPOK_on(sv_accept_buff);
2909 sv_2mortal(sv_accept_buff);
2912 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2915 if (ST.accepted >= bufflen) {
2917 ST.accept_buff =(reg_trie_accepted*)
2918 SvGROW(sv_accept_buff,
2919 bufflen * sizeof(reg_trie_accepted));
2921 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2922 + sizeof(reg_trie_accepted));
2925 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2926 ST.accept_buff[ST.accepted].endpos = uc;
2928 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2932 DEBUG_TRIE_EXECUTE_r({
2933 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2934 PerlIO_printf( Perl_debug_log,
2935 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2936 2+depth * 2, "", PL_colors[4],
2937 (UV)state, (UV)ST.accepted );
2941 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2942 uscan, len, uvc, charid, foldlen,
2946 (base + charid > trie->uniquecharcount )
2947 && (base + charid - 1 - trie->uniquecharcount
2949 && trie->trans[base + charid - 1 -
2950 trie->uniquecharcount].check == state)
2952 state = trie->trans[base + charid - 1 -
2953 trie->uniquecharcount ].next;
2964 DEBUG_TRIE_EXECUTE_r(
2965 PerlIO_printf( Perl_debug_log,
2966 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2967 charid, uvc, (UV)state, PL_colors[5] );
2974 PerlIO_printf( Perl_debug_log,
2975 "%*s %sgot %"IVdf" possible matches%s\n",
2976 REPORT_CODE_OFF + depth * 2, "",
2977 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2980 goto trie_first_try; /* jump into the fail handler */
2982 case TRIE_next_fail: /* we failed - try next alterative */
2984 REGCP_UNWIND(ST.cp);
2985 for (n = *PL_reglastparen; n > ST.lastparen; n--)
2987 *PL_reglastparen = n;
2996 ST.lastparen = *PL_reglastparen;
2999 if ( ST.accepted == 1 ) {
3000 /* only one choice left - just continue */
3002 AV *const trie_words
3003 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3004 SV ** const tmp = av_fetch( trie_words,
3005 ST.accept_buff[ 0 ].wordnum-1, 0 );
3006 SV *sv= tmp ? sv_newmortal() : NULL;
3008 PerlIO_printf( Perl_debug_log,
3009 "%*s %sonly one match left: #%d <%s>%s\n",
3010 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3011 ST.accept_buff[ 0 ].wordnum,
3012 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3013 PL_colors[0], PL_colors[1],
3014 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3016 : "not compiled under -Dr",
3019 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3020 /* in this case we free tmps/leave before we call regmatch
3021 as we wont be using accept_buff again. */
3023 locinput = PL_reginput;
3024 nextchr = UCHARAT(locinput);
3025 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3028 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3029 if (!has_cutgroup) {
3034 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3037 continue; /* execute rest of RE */
3040 if ( !ST.accepted-- ) {
3042 PerlIO_printf( Perl_debug_log,
3043 "%*s %sTRIE failed...%s\n",
3044 REPORT_CODE_OFF+depth*2, "",
3055 There are at least two accepting states left. Presumably
3056 the number of accepting states is going to be low,
3057 typically two. So we simply scan through to find the one
3058 with lowest wordnum. Once we find it, we swap the last
3059 state into its place and decrement the size. We then try to
3060 match the rest of the pattern at the point where the word
3061 ends. If we succeed, control just continues along the
3062 regex; if we fail we return here to try the next accepting
3069 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3070 DEBUG_TRIE_EXECUTE_r(
3071 PerlIO_printf( Perl_debug_log,
3072 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3073 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3074 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3075 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3078 if (ST.accept_buff[cur].wordnum <
3079 ST.accept_buff[best].wordnum)
3084 AV *const trie_words
3085 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3086 SV ** const tmp = av_fetch( trie_words,
3087 ST.accept_buff[ best ].wordnum - 1, 0 );
3088 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3090 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3091 SV *sv= tmp ? sv_newmortal() : NULL;
3093 PerlIO_printf( Perl_debug_log,
3094 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3095 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3096 ST.accept_buff[best].wordnum,
3097 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3098 PL_colors[0], PL_colors[1],
3099 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3100 ) : "not compiled under -Dr",
3101 REG_NODE_NUM(nextop),
3105 if ( best<ST.accepted ) {
3106 reg_trie_accepted tmp = ST.accept_buff[ best ];
3107 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3108 ST.accept_buff[ ST.accepted ] = tmp;
3111 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3112 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3116 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3120 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3123 PUSH_STATE_GOTO(TRIE_next, scan);
3136 char *s = STRING(scan);
3138 if (do_utf8 != UTF) {
3139 /* The target and the pattern have differing utf8ness. */
3141 const char * const e = s + ln;
3144 /* The target is utf8, the pattern is not utf8. */
3149 if (NATIVE_TO_UNI(*(U8*)s) !=
3150 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3158 /* The target is not utf8, the pattern is utf8. */
3163 if (NATIVE_TO_UNI(*((U8*)l)) !=
3164 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3172 nextchr = UCHARAT(locinput);
3175 /* The target and the pattern have the same utf8ness. */
3176 /* Inline the first character, for speed. */
3177 if (UCHARAT(s) != nextchr)
3179 if (PL_regeol - locinput < ln)
3181 if (ln > 1 && memNE(s, locinput, ln))
3184 nextchr = UCHARAT(locinput);
3188 PL_reg_flags |= RF_tainted;
3191 char * const s = STRING(scan);
3194 if (do_utf8 || UTF) {
3195 /* Either target or the pattern are utf8. */
3196 const char * const l = locinput;
3197 char *e = PL_regeol;
3199 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3200 l, &e, 0, do_utf8)) {
3201 /* One more case for the sharp s:
3202 * pack("U0U*", 0xDF) =~ /ss/i,
3203 * the 0xC3 0x9F are the UTF-8
3204 * byte sequence for the U+00DF. */
3206 toLOWER(s[0]) == 's' &&
3208 toLOWER(s[1]) == 's' &&
3215 nextchr = UCHARAT(locinput);
3219 /* Neither the target and the pattern are utf8. */
3221 /* Inline the first character, for speed. */
3222 if (UCHARAT(s) != nextchr &&
3223 UCHARAT(s) != ((OP(scan) == EXACTF)
3224 ? PL_fold : PL_fold_locale)[nextchr])
3226 if (PL_regeol - locinput < ln)
3228 if (ln > 1 && (OP(scan) == EXACTF
3229 ? ibcmp(s, locinput, ln)
3230 : ibcmp_locale(s, locinput, ln)))
3233 nextchr = UCHARAT(locinput);
3238 STRLEN inclasslen = PL_regeol - locinput;
3240 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3242 if (locinput >= PL_regeol)
3244 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3245 nextchr = UCHARAT(locinput);
3250 nextchr = UCHARAT(locinput);
3251 if (!REGINCLASS(rex, scan, (U8*)locinput))
3253 if (!nextchr && locinput >= PL_regeol)
3255 nextchr = UCHARAT(++locinput);
3259 /* If we might have the case of the German sharp s
3260 * in a casefolding Unicode character class. */
3262 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3263 locinput += SHARP_S_SKIP;
3264 nextchr = UCHARAT(locinput);
3270 PL_reg_flags |= RF_tainted;
3276 LOAD_UTF8_CHARCLASS_ALNUM();
3277 if (!(OP(scan) == ALNUM
3278 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3279 : isALNUM_LC_utf8((U8*)locinput)))
3283 locinput += PL_utf8skip[nextchr];
3284 nextchr = UCHARAT(locinput);
3287 if (!(OP(scan) == ALNUM
3288 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3290 nextchr = UCHARAT(++locinput);
3293 PL_reg_flags |= RF_tainted;
3296 if (!nextchr && locinput >= PL_regeol)
3299 LOAD_UTF8_CHARCLASS_ALNUM();
3300 if (OP(scan) == NALNUM
3301 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3302 : isALNUM_LC_utf8((U8*)locinput))
3306 locinput += PL_utf8skip[nextchr];
3307 nextchr = UCHARAT(locinput);
3310 if (OP(scan) == NALNUM
3311 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3313 nextchr = UCHARAT(++locinput);
3317 PL_reg_flags |= RF_tainted;
3321 /* was last char in word? */
3323 if (locinput == PL_bostr)
3326 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3328 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3330 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3331 ln = isALNUM_uni(ln);
3332 LOAD_UTF8_CHARCLASS_ALNUM();
3333 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3336 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3337 n = isALNUM_LC_utf8((U8*)locinput);
3341 ln = (locinput != PL_bostr) ?
3342 UCHARAT(locinput - 1) : '\n';
3343 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3345 n = isALNUM(nextchr);
3348 ln = isALNUM_LC(ln);
3349 n = isALNUM_LC(nextchr);
3352 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3353 OP(scan) == BOUNDL))
3357 PL_reg_flags |= RF_tainted;
3363 if (UTF8_IS_CONTINUED(nextchr)) {
3364 LOAD_UTF8_CHARCLASS_SPACE();
3365 if (!(OP(scan) == SPACE
3366 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3367 : isSPACE_LC_utf8((U8*)locinput)))
3371 locinput += PL_utf8skip[nextchr];
3372 nextchr = UCHARAT(locinput);
3375 if (!(OP(scan) == SPACE
3376 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3378 nextchr = UCHARAT(++locinput);
3381 if (!(OP(scan) == SPACE
3382 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3384 nextchr = UCHARAT(++locinput);
3388 PL_reg_flags |= RF_tainted;
3391 if (!nextchr && locinput >= PL_regeol)
3394 LOAD_UTF8_CHARCLASS_SPACE();
3395 if (OP(scan) == NSPACE
3396 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3397 : isSPACE_LC_utf8((U8*)locinput))
3401 locinput += PL_utf8skip[nextchr];
3402 nextchr = UCHARAT(locinput);
3405 if (OP(scan) == NSPACE
3406 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3408 nextchr = UCHARAT(++locinput);
3411 PL_reg_flags |= RF_tainted;
3417 LOAD_UTF8_CHARCLASS_DIGIT();
3418 if (!(OP(scan) == DIGIT
3419 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3420 : isDIGIT_LC_utf8((U8*)locinput)))
3424 locinput += PL_utf8skip[nextchr];
3425 nextchr = UCHARAT(locinput);
3428 if (!(OP(scan) == DIGIT
3429 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3431 nextchr = UCHARAT(++locinput);
3434 PL_reg_flags |= RF_tainted;
3437 if (!nextchr && locinput >= PL_regeol)
3440 LOAD_UTF8_CHARCLASS_DIGIT();
3441 if (OP(scan) == NDIGIT
3442 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3443 : isDIGIT_LC_utf8((U8*)locinput))
3447 locinput += PL_utf8skip[nextchr];
3448 nextchr = UCHARAT(locinput);
3451 if (OP(scan) == NDIGIT
3452 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3454 nextchr = UCHARAT(++locinput);
3457 if (locinput >= PL_regeol)
3460 LOAD_UTF8_CHARCLASS_MARK();
3461 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3463 locinput += PL_utf8skip[nextchr];
3464 while (locinput < PL_regeol &&
3465 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3466 locinput += UTF8SKIP(locinput);
3467 if (locinput > PL_regeol)
3472 nextchr = UCHARAT(locinput);
3479 PL_reg_flags |= RF_tainted;
3484 n = reg_check_named_buff_matched(rex,scan);
3487 type = REF + ( type - NREF );
3494 PL_reg_flags |= RF_tainted;
3498 n = ARG(scan); /* which paren pair */
3501 ln = PL_regstartp[n];
3502 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3503 if (*PL_reglastparen < n || ln == -1)
3504 sayNO; /* Do not match unless seen CLOSEn. */
3505 if (ln == PL_regendp[n])
3509 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3511 const char *e = PL_bostr + PL_regendp[n];
3513 * Note that we can't do the "other character" lookup trick as
3514 * in the 8-bit case (no pun intended) because in Unicode we
3515 * have to map both upper and title case to lower case.
3519 STRLEN ulen1, ulen2;
3520 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3521 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3525 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3526 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3527 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3534 nextchr = UCHARAT(locinput);
3538 /* Inline the first character, for speed. */
3539 if (UCHARAT(s) != nextchr &&
3541 (UCHARAT(s) != (type == REFF
3542 ? PL_fold : PL_fold_locale)[nextchr])))
3544 ln = PL_regendp[n] - ln;
3545 if (locinput + ln > PL_regeol)
3547 if (ln > 1 && (type == REF
3548 ? memNE(s, locinput, ln)
3550 ? ibcmp(s, locinput, ln)
3551 : ibcmp_locale(s, locinput, ln))))
3554 nextchr = UCHARAT(locinput);
3564 #define ST st->u.eval
3568 regexp_internal *rei;
3569 regnode *startpoint;
3572 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3573 if (cur_eval && cur_eval->locinput==locinput) {
3574 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3575 Perl_croak(aTHX_ "Infinite recursion in regex");
3576 if ( ++nochange_depth > max_nochange_depth )
3578 "Pattern subroutine nesting without pos change"
3579 " exceeded limit in regex");
3585 (void)ReREFCNT_inc(rex);
3586 if (OP(scan)==GOSUB) {
3587 startpoint = scan + ARG2L(scan);
3588 ST.close_paren = ARG(scan);
3590 startpoint = rei->program+1;
3593 goto eval_recurse_doit;
3595 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3596 if (cur_eval && cur_eval->locinput==locinput) {
3597 if ( ++nochange_depth > max_nochange_depth )
3598 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3603 /* execute the code in the {...} */
3605 SV ** const before = SP;
3606 OP_4tree * const oop = PL_op;
3607 COP * const ocurcop = PL_curcop;
3611 PL_op = (OP_4tree*)rexi->data->data[n];
3612 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3613 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3614 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3615 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3618 SV *sv_mrk = get_sv("REGMARK", 1);
3619 sv_setsv(sv_mrk, sv_yes_mark);
3622 CALLRUNOPS(aTHX); /* Scalar context. */
3625 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3632 PAD_RESTORE_LOCAL(old_comppad);
3633 PL_curcop = ocurcop;
3636 sv_setsv(save_scalar(PL_replgv), ret);
3640 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3643 /* extract RE object from returned value; compiling if
3648 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3649 mg = mg_find(sv, PERL_MAGIC_qr);
3650 else if (SvSMAGICAL(ret)) {
3651 if (SvGMAGICAL(ret))
3652 sv_unmagic(ret, PERL_MAGIC_qr);
3654 mg = mg_find(ret, PERL_MAGIC_qr);
3658 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
3662 const char * const t = SvPV_const(ret, len);
3664 const I32 osize = PL_regsize;
3667 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3668 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3670 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3672 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3677 RX_MATCH_COPIED_off(re);
3678 re->subbeg = rex->subbeg;
3679 re->sublen = rex->sublen;
3682 debug_start_match(re, do_utf8, locinput, PL_regeol,
3683 "Matching embedded");
3685 startpoint = rei->program + 1;
3686 ST.close_paren = 0; /* only used for GOSUB */
3687 /* borrowed from regtry */
3688 if (PL_reg_start_tmpl <= re->nparens) {
3689 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3690 if(PL_reg_start_tmp)
3691 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3693 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3696 eval_recurse_doit: /* Share code with GOSUB below this line */
3697 /* run the pattern returned from (??{...}) */
3698 ST.cp = regcppush(0); /* Save *all* the positions. */
3699 REGCP_SET(ST.lastcp);
3701 PL_regstartp = re->startp; /* essentially NOOP on GOSUB */
3702 PL_regendp = re->endp; /* essentially NOOP on GOSUB */
3704 *PL_reglastparen = 0;
3705 *PL_reglastcloseparen = 0;
3706 PL_reginput = locinput;
3709 /* XXXX This is too dramatic a measure... */
3712 ST.toggle_reg_flags = PL_reg_flags;
3713 if (re->extflags & RXf_UTF8)
3714 PL_reg_flags |= RF_utf8;
3716 PL_reg_flags &= ~RF_utf8;
3717 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3720 ST.prev_curlyx = cur_curlyx;
3725 ST.prev_eval = cur_eval;
3727 /* now continue from first node in postoned RE */
3728 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3731 /* logical is 1, /(?(?{...})X|Y)/ */
3732 sw = (bool)SvTRUE(ret);
3737 case EVAL_AB: /* cleanup after a successful (??{A})B */
3738 /* note: this is called twice; first after popping B, then A */
3739 PL_reg_flags ^= ST.toggle_reg_flags;
3741 SETREX(rex,ST.prev_rex);
3742 rexi = RXi_GET(rex);
3744 cur_eval = ST.prev_eval;
3745 cur_curlyx = ST.prev_curlyx;
3746 /* XXXX This is too dramatic a measure... */
3748 if ( nochange_depth )
3753 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3754 /* note: this is called twice; first after popping B, then A */
3755 PL_reg_flags ^= ST.toggle_reg_flags;
3757 SETREX(rex,ST.prev_rex);
3758 rexi = RXi_GET(rex);
3759 PL_reginput = locinput;
3760 REGCP_UNWIND(ST.lastcp);
3762 cur_eval = ST.prev_eval;
3763 cur_curlyx = ST.prev_curlyx;
3764 /* XXXX This is too dramatic a measure... */
3766 if ( nochange_depth )
3772 n = ARG(scan); /* which paren pair */
3773 PL_reg_start_tmp[n] = locinput;
3779 n = ARG(scan); /* which paren pair */
3780 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3781 PL_regendp[n] = locinput - PL_bostr;
3782 /*if (n > PL_regsize)
3784 if (n > *PL_reglastparen)
3785 *PL_reglastparen = n;
3786 *PL_reglastcloseparen = n;
3787 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3795 cursor && OP(cursor)!=END;
3796 cursor=regnext(cursor))
3798 if ( OP(cursor)==CLOSE ){
3800 if ( n <= lastopen ) {
3801 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3802 PL_regendp[n] = locinput - PL_bostr;
3803 /*if (n > PL_regsize)
3805 if (n > *PL_reglastparen)
3806 *PL_reglastparen = n;
3807 *PL_reglastcloseparen = n;
3808 if ( n == ARG(scan) || (cur_eval &&
3809 cur_eval->u.eval.close_paren == n))
3818 n = ARG(scan); /* which paren pair */
3819 sw = (bool)(*PL_reglastparen >= n && PL_regendp[n] != -1);
3822 /* reg_check_named_buff_matched returns 0 for no match */
3823 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3827 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3833 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3835 next = NEXTOPER(NEXTOPER(scan));
3837 next = scan + ARG(scan);
3838 if (OP(next) == IFTHEN) /* Fake one. */
3839 next = NEXTOPER(NEXTOPER(next));
3843 logical = scan->flags;
3846 /*******************************************************************
3848 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3849 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3850 STAR/PLUS/CURLY/CURLYN are used instead.)
3852 A*B is compiled as <CURLYX><A><WHILEM><B>
3854 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3855 state, which contains the current count, initialised to -1. It also sets
3856 cur_curlyx to point to this state, with any previous value saved in the
3859 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3860 since the pattern may possibly match zero times (i.e. it's a while {} loop
3861 rather than a do {} while loop).
3863 Each entry to WHILEM represents a successful match of A. The count in the
3864 CURLYX block is incremented, another WHILEM state is pushed, and execution
3865 passes to A or B depending on greediness and the current count.
3867 For example, if matching against the string a1a2a3b (where the aN are
3868 substrings that match /A/), then the match progresses as follows: (the
3869 pushed states are interspersed with the bits of strings matched so far):
3872 <CURLYX cnt=0><WHILEM>
3873 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3874 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3875 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3876 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3878 (Contrast this with something like CURLYM, which maintains only a single
3882 a1 <CURLYM cnt=1> a2
3883 a1 a2 <CURLYM cnt=2> a3
3884 a1 a2 a3 <CURLYM cnt=3> b
3887 Each WHILEM state block marks a point to backtrack to upon partial failure
3888 of A or B, and also contains some minor state data related to that
3889 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3890 overall state, such as the count, and pointers to the A and B ops.
3892 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3893 must always point to the *current* CURLYX block, the rules are:
3895 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3896 and set cur_curlyx to point the new block.
3898 When popping the CURLYX block after a successful or unsuccessful match,
3899 restore the previous cur_curlyx.
3901 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3902 to the outer one saved in the CURLYX block.
3904 When popping the WHILEM block after a successful or unsuccessful B match,
3905 restore the previous cur_curlyx.
3907 Here's an example for the pattern (AI* BI)*BO
3908 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3911 curlyx backtrack stack
3912 ------ ---------------
3914 CO <CO prev=NULL> <WO>
3915 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3916 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3917 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3919 At this point the pattern succeeds, and we work back down the stack to
3920 clean up, restoring as we go:
3922 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3923 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3924 CO <CO prev=NULL> <WO>
3927 *******************************************************************/
3929 #define ST st->u.curlyx
3931 case CURLYX: /* start of /A*B/ (for complex A) */
3933 /* No need to save/restore up to this paren */
3934 I32 parenfloor = scan->flags;
3936 assert(next); /* keep Coverity happy */
3937 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3940 /* XXXX Probably it is better to teach regpush to support
3941 parenfloor > PL_regsize... */
3942 if (parenfloor > (I32)*PL_reglastparen)
3943 parenfloor = *PL_reglastparen; /* Pessimization... */
3945 ST.prev_curlyx= cur_curlyx;
3947 ST.cp = PL_savestack_ix;
3949 /* these fields contain the state of the current curly.
3950 * they are accessed by subsequent WHILEMs */
3951 ST.parenfloor = parenfloor;
3952 ST.min = ARG1(scan);
3953 ST.max = ARG2(scan);
3954 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3958 ST.count = -1; /* this will be updated by WHILEM */
3959 ST.lastloc = NULL; /* this will be updated by WHILEM */
3961 PL_reginput = locinput;
3962 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3966 case CURLYX_end: /* just finished matching all of A*B */
3967 if (PL_reg_eval_set){
3968 SV *pres= GvSV(PL_replgv);
3971 sv_setsv(GvSV(PL_replgv), pres);
3976 cur_curlyx = ST.prev_curlyx;
3980 case CURLYX_end_fail: /* just failed to match all of A*B */
3982 cur_curlyx = ST.prev_curlyx;
3988 #define ST st->u.whilem
3990 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3992 /* see the discussion above about CURLYX/WHILEM */
3994 assert(cur_curlyx); /* keep Coverity happy */
3995 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3996 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3997 ST.cache_offset = 0;
4000 PL_reginput = locinput;
4002 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4003 "%*s whilem: matched %ld out of %ld..%ld\n",
4004 REPORT_CODE_OFF+depth*2, "", (long)n,
4005 (long)cur_curlyx->u.curlyx.min,
4006 (long)cur_curlyx->u.curlyx.max)
4009 /* First just match a string of min A's. */
4011 if (n < cur_curlyx->u.curlyx.min) {
4012 cur_curlyx->u.curlyx.lastloc = locinput;
4013 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4017 /* If degenerate A matches "", assume A done. */
4019 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4020 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4021 "%*s whilem: empty match detected, trying continuation...\n",
4022 REPORT_CODE_OFF+depth*2, "")
4024 goto do_whilem_B_max;
4027 /* super-linear cache processing */
4031 if (!PL_reg_maxiter) {
4032 /* start the countdown: Postpone detection until we
4033 * know the match is not *that* much linear. */
4034 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4035 /* possible overflow for long strings and many CURLYX's */
4036 if (PL_reg_maxiter < 0)
4037 PL_reg_maxiter = I32_MAX;
4038 PL_reg_leftiter = PL_reg_maxiter;
4041 if (PL_reg_leftiter-- == 0) {
4042 /* initialise cache */
4043 const I32 size = (PL_reg_maxiter + 7)/8;
4044 if (PL_reg_poscache) {
4045 if ((I32)PL_reg_poscache_size < size) {
4046 Renew(PL_reg_poscache, size, char);
4047 PL_reg_poscache_size = size;
4049 Zero(PL_reg_poscache, size, char);
4052 PL_reg_poscache_size = size;
4053 Newxz(PL_reg_poscache, size, char);
4055 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4056 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4057 PL_colors[4], PL_colors[5])
4061 if (PL_reg_leftiter < 0) {
4062 /* have we already failed at this position? */
4064 offset = (scan->flags & 0xf) - 1
4065 + (locinput - PL_bostr) * (scan->flags>>4);
4066 mask = 1 << (offset % 8);
4068 if (PL_reg_poscache[offset] & mask) {
4069 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4070 "%*s whilem: (cache) already tried at this position...\n",
4071 REPORT_CODE_OFF+depth*2, "")
4073 sayNO; /* cache records failure */
4075 ST.cache_offset = offset;
4076 ST.cache_mask = mask;
4080 /* Prefer B over A for minimal matching. */
4082 if (cur_curlyx->u.curlyx.minmod) {
4083 ST.save_curlyx = cur_curlyx;
4084 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4085 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4086 REGCP_SET(ST.lastcp);
4087 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4091 /* Prefer A over B for maximal matching. */
4093 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4094 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4095 cur_curlyx->u.curlyx.lastloc = locinput;
4096 REGCP_SET(ST.lastcp);
4097 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4100 goto do_whilem_B_max;
4104 case WHILEM_B_min: /* just matched B in a minimal match */
4105 case WHILEM_B_max: /* just matched B in a maximal match */
4106 cur_curlyx = ST.save_curlyx;
4110 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4111 cur_curlyx = ST.save_curlyx;
4112 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4113 cur_curlyx->u.curlyx.count--;
4117 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4118 REGCP_UNWIND(ST.lastcp);
4121 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4122 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4123 cur_curlyx->u.curlyx.count--;
4127 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4128 REGCP_UNWIND(ST.lastcp);
4129 regcppop(rex); /* Restore some previous $<digit>s? */
4130 PL_reginput = locinput;
4131 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4132 "%*s whilem: failed, trying continuation...\n",
4133 REPORT_CODE_OFF+depth*2, "")
4136 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4137 && ckWARN(WARN_REGEXP)
4138 && !(PL_reg_flags & RF_warned))
4140 PL_reg_flags |= RF_warned;
4141 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4142 "Complex regular subexpression recursion",
4147 ST.save_curlyx = cur_curlyx;
4148 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4149 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4152 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4153 cur_curlyx = ST.save_curlyx;
4154 REGCP_UNWIND(ST.lastcp);
4157 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4158 /* Maximum greed exceeded */
4159 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4160 && ckWARN(WARN_REGEXP)
4161 && !(PL_reg_flags & RF_warned))
4163 PL_reg_flags |= RF_warned;
4164 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4165 "%s limit (%d) exceeded",
4166 "Complex regular subexpression recursion",
4169 cur_curlyx->u.curlyx.count--;
4173 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4174 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4176 /* Try grabbing another A and see if it helps. */
4177 PL_reginput = locinput;
4178 cur_curlyx->u.curlyx.lastloc = locinput;
4179 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4180 REGCP_SET(ST.lastcp);
4181 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4185 #define ST st->u.branch
4187 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4188 next = scan + ARG(scan);
4191 scan = NEXTOPER(scan);
4194 case BRANCH: /* /(...|A|...)/ */
4195 scan = NEXTOPER(scan); /* scan now points to inner node */
4196 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4199 /* last branch; skip state push and jump direct to node */
4202 ST.lastparen = *PL_reglastparen;
4203 ST.next_branch = next;
4205 PL_reginput = locinput;
4207 /* Now go into the branch */
4209 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4211 PUSH_STATE_GOTO(BRANCH_next, scan);
4215 PL_reginput = locinput;
4216 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4217 (SV*)rexi->data->data[ ARG( scan ) ];
4218 PUSH_STATE_GOTO(CUTGROUP_next,next);
4220 case CUTGROUP_next_fail:
4223 if (st->u.mark.mark_name)
4224 sv_commit = st->u.mark.mark_name;
4230 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4235 REGCP_UNWIND(ST.cp);
4236 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4238 *PL_reglastparen = n;
4239 /*dmq: *PL_reglastcloseparen = n; */
4240 scan = ST.next_branch;
4241 /* no more branches? */
4242 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4244 PerlIO_printf( Perl_debug_log,
4245 "%*s %sBRANCH failed...%s\n",
4246 REPORT_CODE_OFF+depth*2, "",
4252 continue; /* execute next BRANCH[J] op */
4260 #define ST st->u.curlym
4262 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4264 /* This is an optimisation of CURLYX that enables us to push
4265 * only a single backtracking state, no matter now many matches
4266 * there are in {m,n}. It relies on the pattern being constant
4267 * length, with no parens to influence future backrefs
4271 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4273 /* if paren positive, emulate an OPEN/CLOSE around A */
4275 U32 paren = ST.me->flags;
4276 if (paren > PL_regsize)
4278 if (paren > *PL_reglastparen)
4279 *PL_reglastparen = paren;
4280 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4288 ST.c1 = CHRTEST_UNINIT;
4291 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4294 curlym_do_A: /* execute the A in /A{m,n}B/ */
4295 PL_reginput = locinput;
4296 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4299 case CURLYM_A: /* we've just matched an A */
4300 locinput = st->locinput;
4301 nextchr = UCHARAT(locinput);
4304 /* after first match, determine A's length: u.curlym.alen */
4305 if (ST.count == 1) {
4306 if (PL_reg_match_utf8) {
4308 while (s < PL_reginput) {
4314 ST.alen = PL_reginput - locinput;
4317 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4320 PerlIO_printf(Perl_debug_log,
4321 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4322 (int)(REPORT_CODE_OFF+(depth*2)), "",
4323 (IV) ST.count, (IV)ST.alen)
4326 locinput = PL_reginput;
4328 if (cur_eval && cur_eval->u.eval.close_paren &&
4329 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4332 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4333 goto curlym_do_A; /* try to match another A */
4334 goto curlym_do_B; /* try to match B */
4336 case CURLYM_A_fail: /* just failed to match an A */
4337 REGCP_UNWIND(ST.cp);
4339 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4340 || (cur_eval && cur_eval->u.eval.close_paren &&
4341 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4344 curlym_do_B: /* execute the B in /A{m,n}B/ */
4345 PL_reginput = locinput;
4346 if (ST.c1 == CHRTEST_UNINIT) {
4347 /* calculate c1 and c2 for possible match of 1st char
4348 * following curly */
4349 ST.c1 = ST.c2 = CHRTEST_VOID;
4350 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4351 regnode *text_node = ST.B;
4352 if (! HAS_TEXT(text_node))
4353 FIND_NEXT_IMPT(text_node);
4356 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4358 But the former is redundant in light of the latter.
4360 if this changes back then the macro for
4361 IS_TEXT and friends need to change.
4363 if (PL_regkind[OP(text_node)] == EXACT)
4366 ST.c1 = (U8)*STRING(text_node);
4368 (IS_TEXTF(text_node))
4370 : (IS_TEXTFL(text_node))
4371 ? PL_fold_locale[ST.c1]
4378 PerlIO_printf(Perl_debug_log,
4379 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4380 (int)(REPORT_CODE_OFF+(depth*2)),
4383 if (ST.c1 != CHRTEST_VOID
4384 && UCHARAT(PL_reginput) != ST.c1
4385 && UCHARAT(PL_reginput) != ST.c2)
4387 /* simulate B failing */
4389 PerlIO_printf(Perl_debug_log,
4390 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4391 (int)(REPORT_CODE_OFF+(depth*2)),"",
4394 state_num = CURLYM_B_fail;
4395 goto reenter_switch;
4399 /* mark current A as captured */
4400 I32 paren = ST.me->flags;
4403 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4404 PL_regendp[paren] = PL_reginput - PL_bostr;
4405 /*dmq: *PL_reglastcloseparen = paren; */
4408 PL_regendp[paren] = -1;
4409 if (cur_eval && cur_eval->u.eval.close_paren &&
4410 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4419 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4422 case CURLYM_B_fail: /* just failed to match a B */
4423 REGCP_UNWIND(ST.cp);
4425 if (ST.count == ARG2(ST.me) /* max */)
4427 goto curlym_do_A; /* try to match a further A */
4429 /* backtrack one A */
4430 if (ST.count == ARG1(ST.me) /* min */)
4433 locinput = HOPc(locinput, -ST.alen);
4434 goto curlym_do_B; /* try to match B */
4437 #define ST st->u.curly
4439 #define CURLY_SETPAREN(paren, success) \
4442 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4443 PL_regendp[paren] = locinput - PL_bostr; \
4444 *PL_reglastcloseparen = paren; \
4447 PL_regendp[paren] = -1; \
4450 case STAR: /* /A*B/ where A is width 1 */
4454 scan = NEXTOPER(scan);
4456 case PLUS: /* /A+B/ where A is width 1 */
4460 scan = NEXTOPER(scan);
4462 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4463 ST.paren = scan->flags; /* Which paren to set */
4464 if (ST.paren > PL_regsize)
4465 PL_regsize = ST.paren;
4466 if (ST.paren > *PL_reglastparen)
4467 *PL_reglastparen = ST.paren;
4468 ST.min = ARG1(scan); /* min to match */
4469 ST.max = ARG2(scan); /* max to match */
4470 if (cur_eval && cur_eval->u.eval.close_paren &&
4471 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4475 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4477 case CURLY: /* /A{m,n}B/ where A is width 1 */
4479 ST.min = ARG1(scan); /* min to match */
4480 ST.max = ARG2(scan); /* max to match */
4481 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4484 * Lookahead to avoid useless match attempts
4485 * when we know what character comes next.
4487 * Used to only do .*x and .*?x, but now it allows
4488 * for )'s, ('s and (?{ ... })'s to be in the way
4489 * of the quantifier and the EXACT-like node. -- japhy
4492 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4494 if (HAS_TEXT(next) || JUMPABLE(next)) {
4496 regnode *text_node = next;
4498 if (! HAS_TEXT(text_node))
4499 FIND_NEXT_IMPT(text_node);
4501 if (! HAS_TEXT(text_node))
4502 ST.c1 = ST.c2 = CHRTEST_VOID;
4504 if ( PL_regkind[OP(text_node)] != EXACT ) {
4505 ST.c1 = ST.c2 = CHRTEST_VOID;
4506 goto assume_ok_easy;
4509 s = (U8*)STRING(text_node);
4511 /* Currently we only get here when
4513 PL_rekind[OP(text_node)] == EXACT
4515 if this changes back then the macro for IS_TEXT and
4516 friends need to change. */
4519 if (IS_TEXTF(text_node))
4520 ST.c2 = PL_fold[ST.c1];
4521 else if (IS_TEXTFL(text_node))
4522 ST.c2 = PL_fold_locale[ST.c1];
4525 if (IS_TEXTF(text_node)) {
4526 STRLEN ulen1, ulen2;
4527 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4528 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4530 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4531 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4533 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4535 0 : UTF8_ALLOW_ANY);
4536 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4538 0 : UTF8_ALLOW_ANY);
4540 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4542 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4547 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4554 ST.c1 = ST.c2 = CHRTEST_VOID;
4559 PL_reginput = locinput;
4562 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4565 locinput = PL_reginput;
4567 if (ST.c1 == CHRTEST_VOID)
4568 goto curly_try_B_min;
4570 ST.oldloc = locinput;
4572 /* set ST.maxpos to the furthest point along the
4573 * string that could possibly match */
4574 if (ST.max == REG_INFTY) {
4575 ST.maxpos = PL_regeol - 1;
4577 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4581 int m = ST.max - ST.min;
4582 for (ST.maxpos = locinput;
4583 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4584 ST.maxpos += UTF8SKIP(ST.maxpos);
4587 ST.maxpos = locinput + ST.max - ST.min;
4588 if (ST.maxpos >= PL_regeol)
4589 ST.maxpos = PL_regeol - 1;
4591 goto curly_try_B_min_known;
4595 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4596 locinput = PL_reginput;
4597 if (ST.count < ST.min)
4599 if ((ST.count > ST.min)
4600 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4602 /* A{m,n} must come at the end of the string, there's
4603 * no point in backing off ... */
4605 /* ...except that $ and \Z can match before *and* after
4606 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4607 We may back off by one in this case. */
4608 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4612 goto curly_try_B_max;
4617 case CURLY_B_min_known_fail:
4618 /* failed to find B in a non-greedy match where c1,c2 valid */
4619 if (ST.paren && ST.count)
4620 PL_regendp[ST.paren] = -1;
4622 PL_reginput = locinput; /* Could be reset... */
4623 REGCP_UNWIND(ST.cp);
4624 /* Couldn't or didn't -- move forward. */
4625 ST.oldloc = locinput;
4627 locinput += UTF8SKIP(locinput);
4631 curly_try_B_min_known:
4632 /* find the next place where 'B' could work, then call B */
4636 n = (ST.oldloc == locinput) ? 0 : 1;
4637 if (ST.c1 == ST.c2) {
4639 /* set n to utf8_distance(oldloc, locinput) */
4640 while (locinput <= ST.maxpos &&
4641 utf8n_to_uvchr((U8*)locinput,
4642 UTF8_MAXBYTES, &len,
4643 uniflags) != (UV)ST.c1) {
4649 /* set n to utf8_distance(oldloc, locinput) */
4650 while (locinput <= ST.maxpos) {
4652 const UV c = utf8n_to_uvchr((U8*)locinput,
4653 UTF8_MAXBYTES, &len,
4655 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4663 if (ST.c1 == ST.c2) {
4664 while (locinput <= ST.maxpos &&
4665 UCHARAT(locinput) != ST.c1)
4669 while (locinput <= ST.maxpos
4670 && UCHARAT(locinput) != ST.c1
4671 && UCHARAT(locinput) != ST.c2)
4674 n = locinput - ST.oldloc;
4676 if (locinput > ST.maxpos)
4678 /* PL_reginput == oldloc now */
4681 if (regrepeat(rex, ST.A, n, depth) < n)
4684 PL_reginput = locinput;
4685 CURLY_SETPAREN(ST.paren, ST.count);
4686 if (cur_eval && cur_eval->u.eval.close_paren &&
4687 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4690 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4695 case CURLY_B_min_fail:
4696 /* failed to find B in a non-greedy match where c1,c2 invalid */
4697 if (ST.paren && ST.count)
4698 PL_regendp[ST.paren] = -1;
4700 REGCP_UNWIND(ST.cp);
4701 /* failed -- move forward one */
4702 PL_reginput = locinput;
4703 if (regrepeat(rex, ST.A, 1, depth)) {
4705 locinput = PL_reginput;
4706 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4707 ST.count > 0)) /* count overflow ? */
4710 CURLY_SETPAREN(ST.paren, ST.count);
4711 if (cur_eval && cur_eval->u.eval.close_paren &&
4712 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4715 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4723 /* a successful greedy match: now try to match B */
4724 if (cur_eval && cur_eval->u.eval.close_paren &&
4725 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4730 if (ST.c1 != CHRTEST_VOID)
4731 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4732 UTF8_MAXBYTES, 0, uniflags)
4733 : (UV) UCHARAT(PL_reginput);
4734 /* If it could work, try it. */
4735 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4736 CURLY_SETPAREN(ST.paren, ST.count);
4737 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4742 case CURLY_B_max_fail:
4743 /* failed to find B in a greedy match */
4744 if (ST.paren && ST.count)
4745 PL_regendp[ST.paren] = -1;
4747 REGCP_UNWIND(ST.cp);
4749 if (--ST.count < ST.min)
4751 PL_reginput = locinput = HOPc(locinput, -1);
4752 goto curly_try_B_max;
4759 /* we've just finished A in /(??{A})B/; now continue with B */
4761 st->u.eval.toggle_reg_flags
4762 = cur_eval->u.eval.toggle_reg_flags;
4763 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4765 st->u.eval.prev_rex = rex; /* inner */
4766 SETREX(rex,cur_eval->u.eval.prev_rex);
4767 rexi = RXi_GET(rex);
4768 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4770 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4771 REGCP_SET(st->u.eval.lastcp);
4772 PL_reginput = locinput;
4774 /* Restore parens of the outer rex without popping the
4776 tmpix = PL_savestack_ix;
4777 PL_savestack_ix = cur_eval->u.eval.lastcp;
4779 PL_savestack_ix = tmpix;
4781 st->u.eval.prev_eval = cur_eval;
4782 cur_eval = cur_eval->u.eval.prev_eval;
4784 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4785 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4786 if ( nochange_depth )
4789 PUSH_YES_STATE_GOTO(EVAL_AB,
4790 st->u.eval.prev_eval->u.eval.B); /* match B */
4793 if (locinput < reginfo->till) {
4794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4795 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4797 (long)(locinput - PL_reg_starttry),
4798 (long)(reginfo->till - PL_reg_starttry),
4801 sayNO_SILENT; /* Cannot match: too short. */
4803 PL_reginput = locinput; /* put where regtry can find it */
4804 sayYES; /* Success! */
4806 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4808 PerlIO_printf(Perl_debug_log,
4809 "%*s %ssubpattern success...%s\n",
4810 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4811 PL_reginput = locinput; /* put where regtry can find it */
4812 sayYES; /* Success! */
4815 #define ST st->u.ifmatch
4817 case SUSPEND: /* (?>A) */
4819 PL_reginput = locinput;
4822 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4824 goto ifmatch_trivial_fail_test;
4826 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4828 ifmatch_trivial_fail_test:
4830 char * const s = HOPBACKc(locinput, scan->flags);
4835 sw = 1 - (bool)ST.wanted;
4839 next = scan + ARG(scan);
4847 PL_reginput = locinput;
4851 ST.logical = logical;
4852 /* execute body of (?...A) */
4853 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4856 case IFMATCH_A_fail: /* body of (?...A) failed */
4857 ST.wanted = !ST.wanted;
4860 case IFMATCH_A: /* body of (?...A) succeeded */
4862 sw = (bool)ST.wanted;
4864 else if (!ST.wanted)
4867 if (OP(ST.me) == SUSPEND)
4868 locinput = PL_reginput;
4870 locinput = PL_reginput = st->locinput;
4871 nextchr = UCHARAT(locinput);
4873 scan = ST.me + ARG(ST.me);
4876 continue; /* execute B */
4881 next = scan + ARG(scan);
4886 reginfo->cutpoint = PL_regeol;
4889 PL_reginput = locinput;
4891 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
4892 PUSH_STATE_GOTO(COMMIT_next,next);
4894 case COMMIT_next_fail:
4901 #define ST st->u.mark
4903 ST.prev_mark = mark_state;
4904 ST.mark_name = sv_commit = sv_yes_mark
4905 = (SV*)rexi->data->data[ ARG( scan ) ];
4907 ST.mark_loc = PL_reginput = locinput;
4908 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4910 case MARKPOINT_next:
4911 mark_state = ST.prev_mark;
4914 case MARKPOINT_next_fail:
4915 if (popmark && sv_eq(ST.mark_name,popmark))
4917 if (ST.mark_loc > startpoint)
4918 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4919 popmark = NULL; /* we found our mark */
4920 sv_commit = ST.mark_name;
4923 PerlIO_printf(Perl_debug_log,
4924 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4925 REPORT_CODE_OFF+depth*2, "",
4926 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
4929 mark_state = ST.prev_mark;
4930 sv_yes_mark = mark_state ?
4931 mark_state->u.mark.mark_name : NULL;
4935 PL_reginput = locinput;
4937 /* (*SKIP) : if we fail we cut here*/
4938 ST.mark_name = NULL;
4939 ST.mark_loc = locinput;
4940 PUSH_STATE_GOTO(SKIP_next,next);
4942 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
4943 otherwise do nothing. Meaning we need to scan
4945 regmatch_state *cur = mark_state;
4946 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
4949 if ( sv_eq( cur->u.mark.mark_name,
4952 ST.mark_name = find;
4953 PUSH_STATE_GOTO( SKIP_next, next );
4955 cur = cur->u.mark.prev_mark;
4958 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
4960 case SKIP_next_fail:
4962 /* (*CUT:NAME) - Set up to search for the name as we
4963 collapse the stack*/
4964 popmark = ST.mark_name;
4966 /* (*CUT) - No name, we cut here.*/
4967 if (ST.mark_loc > startpoint)
4968 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4969 /* but we set sv_commit to latest mark_name if there
4970 is one so they can test to see how things lead to this
4973 sv_commit=mark_state->u.mark.mark_name;
4981 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4982 PTR2UV(scan), OP(scan));
4983 Perl_croak(aTHX_ "regexp memory corruption");
4987 /* switch break jumps here */
4988 scan = next; /* prepare to execute the next op and ... */
4989 continue; /* ... jump back to the top, reusing st */
4993 /* push a state that backtracks on success */
4994 st->u.yes.prev_yes_state = yes_state;
4998 /* push a new regex state, then continue at scan */
5000 regmatch_state *newst;
5003 regmatch_state *cur = st;
5004 regmatch_state *curyes = yes_state;
5006 regmatch_slab *slab = PL_regmatch_slab;
5007 for (;curd > -1;cur--,curd--) {
5008 if (cur < SLAB_FIRST(slab)) {
5010 cur = SLAB_LAST(slab);
5012 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5013 REPORT_CODE_OFF + 2 + depth * 2,"",
5014 curd, PL_reg_name[cur->resume_state],
5015 (curyes == cur) ? "yes" : ""
5018 curyes = cur->u.yes.prev_yes_state;
5021 DEBUG_STATE_pp("push")
5024 st->locinput = locinput;
5026 if (newst > SLAB_LAST(PL_regmatch_slab))
5027 newst = S_push_slab(aTHX);
5028 PL_regmatch_state = newst;
5030 locinput = PL_reginput;
5031 nextchr = UCHARAT(locinput);
5039 * We get here only if there's trouble -- normally "case END" is
5040 * the terminating point.
5042 Perl_croak(aTHX_ "corrupted regexp pointers");
5048 /* we have successfully completed a subexpression, but we must now
5049 * pop to the state marked by yes_state and continue from there */
5050 assert(st != yes_state);
5052 while (st != yes_state) {
5054 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5055 PL_regmatch_slab = PL_regmatch_slab->prev;
5056 st = SLAB_LAST(PL_regmatch_slab);
5060 DEBUG_STATE_pp("pop (no final)");
5062 DEBUG_STATE_pp("pop (yes)");
5068 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5069 || yes_state > SLAB_LAST(PL_regmatch_slab))
5071 /* not in this slab, pop slab */
5072 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5073 PL_regmatch_slab = PL_regmatch_slab->prev;
5074 st = SLAB_LAST(PL_regmatch_slab);
5076 depth -= (st - yes_state);
5079 yes_state = st->u.yes.prev_yes_state;
5080 PL_regmatch_state = st;
5083 locinput= st->locinput;
5084 nextchr = UCHARAT(locinput);
5086 state_num = st->resume_state + no_final;
5087 goto reenter_switch;
5090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5091 PL_colors[4], PL_colors[5]));
5098 PerlIO_printf(Perl_debug_log,
5099 "%*s %sfailed...%s\n",
5100 REPORT_CODE_OFF+depth*2, "",
5101 PL_colors[4], PL_colors[5])
5113 /* there's a previous state to backtrack to */
5115 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5116 PL_regmatch_slab = PL_regmatch_slab->prev;
5117 st = SLAB_LAST(PL_regmatch_slab);
5119 PL_regmatch_state = st;
5120 locinput= st->locinput;
5121 nextchr = UCHARAT(locinput);
5123 DEBUG_STATE_pp("pop");
5125 if (yes_state == st)
5126 yes_state = st->u.yes.prev_yes_state;
5128 state_num = st->resume_state + 1; /* failure = success + 1 */
5129 goto reenter_switch;
5134 if (rex->intflags & PREGf_VERBARG_SEEN) {
5135 SV *sv_err = get_sv("REGERROR", 1);
5136 SV *sv_mrk = get_sv("REGMARK", 1);
5138 sv_commit = &PL_sv_no;
5140 sv_yes_mark = &PL_sv_yes;
5143 sv_commit = &PL_sv_yes;
5144 sv_yes_mark = &PL_sv_no;
5146 sv_setsv(sv_err, sv_commit);
5147 sv_setsv(sv_mrk, sv_yes_mark);
5149 /* restore original high-water mark */
5150 PL_regmatch_slab = orig_slab;
5151 PL_regmatch_state = orig_state;
5153 /* free all slabs above current one */
5154 if (orig_slab->next) {
5155 regmatch_slab *sl = orig_slab->next;
5156 orig_slab->next = NULL;
5158 regmatch_slab * const osl = sl;
5168 - regrepeat - repeatedly match something simple, report how many
5171 * [This routine now assumes that it will only match on things of length 1.
5172 * That was true before, but now we assume scan - reginput is the count,
5173 * rather than incrementing count on every character. [Er, except utf8.]]
5176 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5179 register char *scan;
5181 register char *loceol = PL_regeol;
5182 register I32 hardcount = 0;
5183 register bool do_utf8 = PL_reg_match_utf8;
5185 PERL_UNUSED_ARG(depth);
5189 if (max == REG_INFTY)
5191 else if (max < loceol - scan)
5192 loceol = scan + max;
5197 while (scan < loceol && hardcount < max && *scan != '\n') {
5198 scan += UTF8SKIP(scan);
5202 while (scan < loceol && *scan != '\n')
5209 while (scan < loceol && hardcount < max) {
5210 scan += UTF8SKIP(scan);
5220 case EXACT: /* length of string is 1 */
5222 while (scan < loceol && UCHARAT(scan) == c)
5225 case EXACTF: /* length of string is 1 */
5227 while (scan < loceol &&
5228 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5231 case EXACTFL: /* length of string is 1 */
5232 PL_reg_flags |= RF_tainted;
5234 while (scan < loceol &&
5235 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5241 while (hardcount < max && scan < loceol &&
5242 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5243 scan += UTF8SKIP(scan);
5247 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5254 LOAD_UTF8_CHARCLASS_ALNUM();
5255 while (hardcount < max && scan < loceol &&
5256 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5257 scan += UTF8SKIP(scan);
5261 while (scan < loceol && isALNUM(*scan))
5266 PL_reg_flags |= RF_tainted;
5269 while (hardcount < max && scan < loceol &&
5270 isALNUM_LC_utf8((U8*)scan)) {
5271 scan += UTF8SKIP(scan);
5275 while (scan < loceol && isALNUM_LC(*scan))
5282 LOAD_UTF8_CHARCLASS_ALNUM();
5283 while (hardcount < max && scan < loceol &&
5284 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5285 scan += UTF8SKIP(scan);
5289 while (scan < loceol && !isALNUM(*scan))
5294 PL_reg_flags |= RF_tainted;
5297 while (hardcount < max && scan < loceol &&
5298 !isALNUM_LC_utf8((U8*)scan)) {
5299 scan += UTF8SKIP(scan);
5303 while (scan < loceol && !isALNUM_LC(*scan))
5310 LOAD_UTF8_CHARCLASS_SPACE();
5311 while (hardcount < max && scan < loceol &&
5313 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5314 scan += UTF8SKIP(scan);
5318 while (scan < loceol && isSPACE(*scan))
5323 PL_reg_flags |= RF_tainted;
5326 while (hardcount < max && scan < loceol &&
5327 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5328 scan += UTF8SKIP(scan);
5332 while (scan < loceol && isSPACE_LC(*scan))
5339 LOAD_UTF8_CHARCLASS_SPACE();
5340 while (hardcount < max && scan < loceol &&
5342 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5343 scan += UTF8SKIP(scan);
5347 while (scan < loceol && !isSPACE(*scan))
5352 PL_reg_flags |= RF_tainted;
5355 while (hardcount < max && scan < loceol &&
5356 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5357 scan += UTF8SKIP(scan);
5361 while (scan < loceol && !isSPACE_LC(*scan))
5368 LOAD_UTF8_CHARCLASS_DIGIT();
5369 while (hardcount < max && scan < loceol &&
5370 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5371 scan += UTF8SKIP(scan);
5375 while (scan < loceol && isDIGIT(*scan))
5382 LOAD_UTF8_CHARCLASS_DIGIT();
5383 while (hardcount < max && scan < loceol &&
5384 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5385 scan += UTF8SKIP(scan);
5389 while (scan < loceol && !isDIGIT(*scan))
5393 default: /* Called on something of 0 width. */
5394 break; /* So match right here or not at all. */
5400 c = scan - PL_reginput;
5404 GET_RE_DEBUG_FLAGS_DECL;
5406 SV * const prop = sv_newmortal();
5407 regprop(prog, prop, p);
5408 PerlIO_printf(Perl_debug_log,
5409 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5410 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5418 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5420 - regclass_swash - prepare the utf8 swash
5424 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5430 RXi_GET_DECL(prog,progi);
5431 const struct reg_data * const data = prog ? progi->data : NULL;
5433 if (data && data->count) {
5434 const U32 n = ARG(node);
5436 if (data->what[n] == 's') {
5437 SV * const rv = (SV*)data->data[n];
5438 AV * const av = (AV*)SvRV((SV*)rv);
5439 SV **const ary = AvARRAY(av);
5442 /* See the end of regcomp.c:S_regclass() for
5443 * documentation of these array elements. */
5446 a = SvROK(ary[1]) ? &ary[1] : 0;
5447 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5451 else if (si && doinit) {
5452 sw = swash_init("utf8", "", si, 1, 0);
5453 (void)av_store(av, 1, sw);
5470 - reginclass - determine if a character falls into a character class
5472 The n is the ANYOF regnode, the p is the target string, lenp
5473 is pointer to the maximum length of how far to go in the p
5474 (if the lenp is zero, UTF8SKIP(p) is used),
5475 do_utf8 tells whether the target string is in UTF-8.
5480 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5483 const char flags = ANYOF_FLAGS(n);
5489 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5490 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5491 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5492 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5493 if (len == (STRLEN)-1)
5494 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5497 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5498 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5501 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5502 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5505 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5509 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5512 if (swash_fetch(sw, p, do_utf8))
5514 else if (flags & ANYOF_FOLD) {
5515 if (!match && lenp && av) {
5517 for (i = 0; i <= av_len(av); i++) {
5518 SV* const sv = *av_fetch(av, i, FALSE);
5520 const char * const s = SvPV_const(sv, len);
5522 if (len <= plen && memEQ(s, (char*)p, len)) {
5530 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5533 to_utf8_fold(p, tmpbuf, &tmplen);
5534 if (swash_fetch(sw, tmpbuf, do_utf8))
5540 if (match && lenp && *lenp == 0)
5541 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5543 if (!match && c < 256) {
5544 if (ANYOF_BITMAP_TEST(n, c))
5546 else if (flags & ANYOF_FOLD) {
5549 if (flags & ANYOF_LOCALE) {
5550 PL_reg_flags |= RF_tainted;
5551 f = PL_fold_locale[c];
5555 if (f != c && ANYOF_BITMAP_TEST(n, f))
5559 if (!match && (flags & ANYOF_CLASS)) {
5560 PL_reg_flags |= RF_tainted;
5562 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5563 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5564 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5565 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5566 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5567 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5568 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5569 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5570 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5571 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5572 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5573 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5574 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5575 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5576 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5577 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5578 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5579 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5580 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5581 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5582 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5583 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5584 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5585 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5586 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5587 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5588 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5589 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5590 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5591 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5592 ) /* How's that for a conditional? */
5599 return (flags & ANYOF_INVERT) ? !match : match;
5603 S_reghop3(U8 *s, I32 off, const U8* lim)
5607 while (off-- && s < lim) {
5608 /* XXX could check well-formedness here */
5613 while (off++ && s > lim) {
5615 if (UTF8_IS_CONTINUED(*s)) {
5616 while (s > lim && UTF8_IS_CONTINUATION(*s))
5619 /* XXX could check well-formedness here */
5626 /* there are a bunch of places where we use two reghop3's that should
5627 be replaced with this routine. but since thats not done yet
5628 we ifdef it out - dmq
5631 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5635 while (off-- && s < rlim) {
5636 /* XXX could check well-formedness here */
5641 while (off++ && s > llim) {
5643 if (UTF8_IS_CONTINUED(*s)) {
5644 while (s > llim && UTF8_IS_CONTINUATION(*s))
5647 /* XXX could check well-formedness here */
5655 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5659 while (off-- && s < lim) {
5660 /* XXX could check well-formedness here */
5667 while (off++ && s > lim) {
5669 if (UTF8_IS_CONTINUED(*s)) {
5670 while (s > lim && UTF8_IS_CONTINUATION(*s))
5673 /* XXX could check well-formedness here */
5682 restore_pos(pTHX_ void *arg)
5685 regexp * const rex = (regexp *)arg;
5686 if (PL_reg_eval_set) {
5687 if (PL_reg_oldsaved) {
5688 rex->subbeg = PL_reg_oldsaved;
5689 rex->sublen = PL_reg_oldsavedlen;
5690 #ifdef PERL_OLD_COPY_ON_WRITE
5691 rex->saved_copy = PL_nrs;
5693 RX_MATCH_COPIED_on(rex);
5695 PL_reg_magic->mg_len = PL_reg_oldpos;
5696 PL_reg_eval_set = 0;
5697 PL_curpm = PL_reg_oldcurpm;
5702 S_to_utf8_substr(pTHX_ register regexp *prog)
5706 if (prog->substrs->data[i].substr
5707 && !prog->substrs->data[i].utf8_substr) {
5708 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5709 prog->substrs->data[i].utf8_substr = sv;
5710 sv_utf8_upgrade(sv);
5711 if (SvVALID(prog->substrs->data[i].substr)) {
5712 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5713 if (flags & FBMcf_TAIL) {
5714 /* Trim the trailing \n that fbm_compile added last
5716 SvCUR_set(sv, SvCUR(sv) - 1);
5717 /* Whilst this makes the SV technically "invalid" (as its
5718 buffer is no longer followed by "\0") when fbm_compile()
5719 adds the "\n" back, a "\0" is restored. */
5721 fbm_compile(sv, flags);
5723 if (prog->substrs->data[i].substr == prog->check_substr)
5724 prog->check_utf8 = sv;
5730 S_to_byte_substr(pTHX_ register regexp *prog)
5735 if (prog->substrs->data[i].utf8_substr
5736 && !prog->substrs->data[i].substr) {
5737 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5738 if (sv_utf8_downgrade(sv, TRUE)) {
5739 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5741 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5742 if (flags & FBMcf_TAIL) {
5743 /* Trim the trailing \n that fbm_compile added last
5745 SvCUR_set(sv, SvCUR(sv) - 1);
5747 fbm_compile(sv, flags);
5753 prog->substrs->data[i].substr = sv;
5754 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5755 prog->check_substr = sv;
5762 * c-indentation-style: bsd
5764 * indent-tabs-mode: t
5767 * ex: set ts=8 sts=4 sw=4 noet: