5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 8
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regendp[p]);
193 SSPUSHINT(PL_regstartp[p]);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regstartp[p],
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regstartp);
205 SSPUSHPTR(PL_regendp);
206 SSPUSHINT(PL_regsize);
207 SSPUSHINT(*PL_reglastparen);
208 SSPUSHINT(*PL_reglastcloseparen);
209 SSPUSHPTR(PL_reginput);
210 #define REGCP_FRAME_ELEMS 2
211 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
212 * are needed for the regexp context stack bookkeeping. */
213 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
214 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
219 /* These are needed since we do not localize EVAL nodes: */
220 #define REGCP_SET(cp) \
222 PerlIO_printf(Perl_debug_log, \
223 " Setting an EVAL scope, savestack=%"IVdf"\n", \
224 (IV)PL_savestack_ix)); \
227 #define REGCP_UNWIND(cp) \
229 if (cp != PL_savestack_ix) \
230 PerlIO_printf(Perl_debug_log, \
231 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
232 (IV)(cp), (IV)PL_savestack_ix)); \
236 S_regcppop(pTHX_ const regexp *rex)
242 GET_RE_DEBUG_FLAGS_DECL;
244 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
246 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
247 i = SSPOPINT; /* Parentheses elements to pop. */
248 input = (char *) SSPOPPTR;
249 *PL_reglastcloseparen = SSPOPINT;
250 *PL_reglastparen = SSPOPINT;
251 PL_regsize = SSPOPINT;
252 PL_regendp=(I32 *) SSPOPPTR;
253 PL_regstartp=(I32 *) SSPOPPTR;
256 /* Now restore the parentheses context. */
257 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
258 i > 0; i -= REGCP_PAREN_ELEMS) {
260 U32 paren = (U32)SSPOPINT;
261 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
262 PL_regstartp[paren] = SSPOPINT;
264 if (paren <= *PL_reglastparen)
265 PL_regendp[paren] = tmps;
267 PerlIO_printf(Perl_debug_log,
268 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
269 (UV)paren, (IV)PL_regstartp[paren],
270 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
271 (IV)PL_regendp[paren],
272 (paren > *PL_reglastparen ? "(no)" : ""));
276 if (*PL_reglastparen + 1 <= rex->nparens) {
277 PerlIO_printf(Perl_debug_log,
278 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
279 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
283 /* It would seem that the similar code in regtry()
284 * already takes care of this, and in fact it is in
285 * a better location to since this code can #if 0-ed out
286 * but the code in regtry() is needed or otherwise tests
287 * requiring null fields (pat.t#187 and split.t#{13,14}
288 * (as of patchlevel 7877) will fail. Then again,
289 * this code seems to be necessary or otherwise
290 * building DynaLoader will fail:
291 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
293 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
295 PL_regstartp[i] = -1;
302 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
305 * pregexec and friends
308 #ifndef PERL_IN_XSUB_RE
310 - pregexec - match a regexp against a string
313 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
314 char *strbeg, I32 minend, SV *screamer, U32 nosave)
315 /* strend: pointer to null at end of string */
316 /* strbeg: real beginning of string */
317 /* minend: end of match must be >=minend after stringarg. */
318 /* nosave: For optimizations. */
321 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
322 nosave ? 0 : REXEC_COPY_STR);
327 * Need to implement the following flags for reg_anch:
329 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
331 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
332 * INTUIT_AUTORITATIVE_ML
333 * INTUIT_ONCE_NOML - Intuit can match in one location only.
336 * Another flag for this function: SECOND_TIME (so that float substrs
337 * with giant delta may be not rechecked).
340 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
342 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
343 Otherwise, only SvCUR(sv) is used to get strbeg. */
345 /* XXXX We assume that strpos is strbeg unless sv. */
347 /* XXXX Some places assume that there is a fixed substring.
348 An update may be needed if optimizer marks as "INTUITable"
349 RExen without fixed substrings. Similarly, it is assumed that
350 lengths of all the strings are no more than minlen, thus they
351 cannot come from lookahead.
352 (Or minlen should take into account lookahead.)
353 NOTE: Some of this comment is not correct. minlen does now take account
354 of lookahead/behind. Further research is required. -- demerphq
358 /* A failure to find a constant substring means that there is no need to make
359 an expensive call to REx engine, thus we celebrate a failure. Similarly,
360 finding a substring too deep into the string means that less calls to
361 regtry() should be needed.
363 REx compiler's optimizer found 4 possible hints:
364 a) Anchored substring;
366 c) Whether we are anchored (beginning-of-line or \G);
367 d) First node (of those at offset 0) which may distingush positions;
368 We use a)b)d) and multiline-part of c), and try to find a position in the
369 string which does not contradict any of them.
372 /* Most of decisions we do here should have been done at compile time.
373 The nodes of the REx which we used for the search should have been
374 deleted from the finite automaton. */
377 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
378 char *strend, U32 flags, re_scream_pos_data *data)
381 register I32 start_shift = 0;
382 /* Should be nonnegative! */
383 register I32 end_shift = 0;
388 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
390 register char *other_last = NULL; /* other substr checked before this */
391 char *check_at = NULL; /* check substr found at this pos */
392 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
393 RXi_GET_DECL(prog,progi);
395 const char * const i_strpos = strpos;
398 GET_RE_DEBUG_FLAGS_DECL;
400 RX_MATCH_UTF8_set(prog,do_utf8);
402 if (prog->extflags & RXf_UTF8) {
403 PL_reg_flags |= RF_utf8;
406 debug_start_match(prog, do_utf8, strpos, strend,
407 sv ? "Guessing start of match in sv for"
408 : "Guessing start of match in string for");
411 /* CHR_DIST() would be more correct here but it makes things slow. */
412 if (prog->minlen > strend - strpos) {
413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
414 "String too short... [re_intuit_start]\n"));
418 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
421 if (!prog->check_utf8 && prog->check_substr)
422 to_utf8_substr(prog);
423 check = prog->check_utf8;
425 if (!prog->check_substr && prog->check_utf8)
426 to_byte_substr(prog);
427 check = prog->check_substr;
429 if (check == &PL_sv_undef) {
430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
431 "Non-utf8 string cannot match utf8 check string\n"));
434 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
435 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
436 || ( (prog->extflags & RXf_ANCH_BOL)
437 && !multiline ) ); /* Check after \n? */
440 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
441 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
442 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
444 && (strpos != strbeg)) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
448 if (prog->check_offset_min == prog->check_offset_max &&
449 !(prog->extflags & RXf_CANY_SEEN)) {
450 /* Substring at constant offset from beg-of-str... */
453 s = HOP3c(strpos, prog->check_offset_min, strend);
456 slen = SvCUR(check); /* >= 1 */
458 if ( strend - s > slen || strend - s < slen - 1
459 || (strend - s == slen && strend[-1] != '\n')) {
460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
463 /* Now should match s[0..slen-2] */
465 if (slen && (*SvPVX_const(check) != *s
467 && memNE(SvPVX_const(check), s, slen)))) {
469 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
473 else if (*SvPVX_const(check) != *s
474 || ((slen = SvCUR(check)) > 1
475 && memNE(SvPVX_const(check), s, slen)))
478 goto success_at_start;
481 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
484 end_shift = prog->check_end_shift;
487 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
488 - (SvTAIL(check) != 0);
489 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
491 if (end_shift < eshift)
495 else { /* Can match at random position */
498 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
499 end_shift = prog->check_end_shift;
501 /* end shift should be non negative here */
504 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
506 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
507 (IV)end_shift, prog->precomp);
511 /* Find a possible match in the region s..strend by looking for
512 the "check" substring in the region corrected by start/end_shift. */
515 I32 srch_start_shift = start_shift;
516 I32 srch_end_shift = end_shift;
517 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
518 srch_end_shift -= ((strbeg - s) - srch_start_shift);
519 srch_start_shift = strbeg - s;
521 DEBUG_OPTIMISE_MORE_r({
522 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
523 (IV)prog->check_offset_min,
524 (IV)srch_start_shift,
526 (IV)prog->check_end_shift);
529 if (flags & REXEC_SCREAM) {
530 I32 p = -1; /* Internal iterator of scream. */
531 I32 * const pp = data ? data->scream_pos : &p;
533 if (PL_screamfirst[BmRARE(check)] >= 0
534 || ( BmRARE(check) == '\n'
535 && (BmPREVIOUS(check) == SvCUR(check) - 1)
537 s = screaminstr(sv, check,
538 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
541 /* we may be pointing at the wrong string */
542 if (s && RX_MATCH_COPIED(prog))
543 s = strbeg + (s - SvPVX_const(sv));
545 *data->scream_olds = s;
550 if (prog->extflags & RXf_CANY_SEEN) {
551 start_point= (U8*)(s + srch_start_shift);
552 end_point= (U8*)(strend - srch_end_shift);
554 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
555 end_point= HOP3(strend, -srch_end_shift, strbeg);
557 DEBUG_OPTIMISE_MORE_r({
558 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
559 (int)(end_point - start_point),
560 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
564 s = fbm_instr( start_point, end_point,
565 check, multiline ? FBMrf_MULTILINE : 0);
568 /* Update the count-of-usability, remove useless subpatterns,
572 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
573 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
574 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
575 (s ? "Found" : "Did not find"),
576 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
577 ? "anchored" : "floating"),
580 (s ? " at offset " : "...\n") );
585 /* Finish the diagnostic message */
586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
588 /* XXX dmq: first branch is for positive lookbehind...
589 Our check string is offset from the beginning of the pattern.
590 So we need to do any stclass tests offset forward from that
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
602 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
608 : (prog->float_substr && prog->anchored_substr))
610 /* Take into account the "other" substring. */
611 /* XXXX May be hopelessly wrong for UTF... */
614 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
617 char * const last = HOP3c(s, -start_shift, strbeg);
619 char * const saved_s = s;
622 t = s - prog->check_offset_max;
623 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
625 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
630 t = HOP3c(t, prog->anchored_offset, strend);
631 if (t < other_last) /* These positions already checked */
633 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
636 /* XXXX It is not documented what units *_offsets are in.
637 We assume bytes, but this is clearly wrong.
638 Meaning this code needs to be carefully reviewed for errors.
642 /* On end-of-str: see comment below. */
643 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
644 if (must == &PL_sv_undef) {
646 DEBUG_r(must = prog->anchored_utf8); /* for debug */
651 HOP3(HOP3(last1, prog->anchored_offset, strend)
652 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
654 multiline ? FBMrf_MULTILINE : 0
657 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
658 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
659 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
660 (s ? "Found" : "Contradicts"),
661 quoted, RE_SV_TAIL(must));
666 if (last1 >= last2) {
667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
668 ", giving up...\n"));
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
672 ", trying floating at offset %ld...\n",
673 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
674 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
675 s = HOP3c(last, 1, strend);
679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
680 (long)(s - i_strpos)));
681 t = HOP3c(s, -prog->anchored_offset, strbeg);
682 other_last = HOP3c(s, 1, strend);
690 else { /* Take into account the floating substring. */
692 char * const saved_s = s;
695 t = HOP3c(s, -start_shift, strbeg);
697 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
698 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
699 last = HOP3c(t, prog->float_max_offset, strend);
700 s = HOP3c(t, prog->float_min_offset, strend);
703 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
704 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
705 /* fbm_instr() takes into account exact value of end-of-str
706 if the check is SvTAIL(ed). Since false positives are OK,
707 and end-of-str is not later than strend we are OK. */
708 if (must == &PL_sv_undef) {
710 DEBUG_r(must = prog->float_utf8); /* for debug message */
713 s = fbm_instr((unsigned char*)s,
714 (unsigned char*)last + SvCUR(must)
716 must, multiline ? FBMrf_MULTILINE : 0);
718 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
719 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
720 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
721 (s ? "Found" : "Contradicts"),
722 quoted, RE_SV_TAIL(must));
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
727 ", giving up...\n"));
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
731 ", trying anchored starting at offset %ld...\n",
732 (long)(saved_s + 1 - i_strpos)));
734 s = HOP3c(t, 1, strend);
738 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
739 (long)(s - i_strpos)));
740 other_last = s; /* Fix this later. --Hugo */
750 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
752 DEBUG_OPTIMISE_MORE_r(
753 PerlIO_printf(Perl_debug_log,
754 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
755 (IV)prog->check_offset_min,
756 (IV)prog->check_offset_max,
764 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
766 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
769 /* Fixed substring is found far enough so that the match
770 cannot start at strpos. */
772 if (ml_anch && t[-1] != '\n') {
773 /* Eventually fbm_*() should handle this, but often
774 anchored_offset is not 0, so this check will not be wasted. */
775 /* XXXX In the code below we prefer to look for "^" even in
776 presence of anchored substrings. And we search even
777 beyond the found float position. These pessimizations
778 are historical artefacts only. */
780 while (t < strend - prog->minlen) {
782 if (t < check_at - prog->check_offset_min) {
783 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
784 /* Since we moved from the found position,
785 we definitely contradict the found anchored
786 substr. Due to the above check we do not
787 contradict "check" substr.
788 Thus we can arrive here only if check substr
789 is float. Redo checking for "other"=="fixed".
792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
793 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
794 goto do_other_anchored;
796 /* We don't contradict the found floating substring. */
797 /* XXXX Why not check for STCLASS? */
799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
800 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
803 /* Position contradicts check-string */
804 /* XXXX probably better to look for check-string
805 than for "\n", so one should lower the limit for t? */
806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
807 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
808 other_last = strpos = s = t + 1;
813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
814 PL_colors[0], PL_colors[1]));
818 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
819 PL_colors[0], PL_colors[1]));
823 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
826 /* The found string does not prohibit matching at strpos,
827 - no optimization of calling REx engine can be performed,
828 unless it was an MBOL and we are not after MBOL,
829 or a future STCLASS check will fail this. */
831 /* Even in this situation we may use MBOL flag if strpos is offset
832 wrt the start of the string. */
833 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
834 && (strpos != strbeg) && strpos[-1] != '\n'
835 /* May be due to an implicit anchor of m{.*foo} */
836 && !(prog->intflags & PREGf_IMPLICIT))
841 DEBUG_EXECUTE_r( if (ml_anch)
842 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
843 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
846 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
848 prog->check_utf8 /* Could be deleted already */
849 && --BmUSEFUL(prog->check_utf8) < 0
850 && (prog->check_utf8 == prog->float_utf8)
852 prog->check_substr /* Could be deleted already */
853 && --BmUSEFUL(prog->check_substr) < 0
854 && (prog->check_substr == prog->float_substr)
857 /* If flags & SOMETHING - do not do it many times on the same match */
858 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
859 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
860 if (do_utf8 ? prog->check_substr : prog->check_utf8)
861 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
862 prog->check_substr = prog->check_utf8 = NULL; /* disable */
863 prog->float_substr = prog->float_utf8 = NULL; /* clear */
864 check = NULL; /* abort */
866 /* XXXX This is a remnant of the old implementation. It
867 looks wasteful, since now INTUIT can use many
869 prog->extflags &= ~RXf_USE_INTUIT;
876 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
877 /* trie stclasses are too expensive to use here, we are better off to
878 leave it to regmatch itself */
879 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
880 /* minlen == 0 is possible if regstclass is \b or \B,
881 and the fixed substr is ''$.
882 Since minlen is already taken into account, s+1 is before strend;
883 accidentally, minlen >= 1 guaranties no false positives at s + 1
884 even for \b or \B. But (minlen? 1 : 0) below assumes that
885 regstclass does not come from lookahead... */
886 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
887 This leaves EXACTF only, which is dealt with in find_byclass(). */
888 const U8* const str = (U8*)STRING(progi->regstclass);
889 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
890 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
893 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
894 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
895 else if (prog->float_substr || prog->float_utf8)
896 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
901 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
904 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
907 const char *what = NULL;
909 if (endpos == strend) {
910 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
911 "Could not match STCLASS...\n") );
914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
915 "This position contradicts STCLASS...\n") );
916 if ((prog->extflags & RXf_ANCH) && !ml_anch)
918 /* Contradict one of substrings */
919 if (prog->anchored_substr || prog->anchored_utf8) {
920 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
921 DEBUG_EXECUTE_r( what = "anchored" );
923 s = HOP3c(t, 1, strend);
924 if (s + start_shift + end_shift > strend) {
925 /* XXXX Should be taken into account earlier? */
926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
927 "Could not match STCLASS...\n") );
932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
933 "Looking for %s substr starting at offset %ld...\n",
934 what, (long)(s + start_shift - i_strpos)) );
937 /* Have both, check_string is floating */
938 if (t + start_shift >= check_at) /* Contradicts floating=check */
939 goto retry_floating_check;
940 /* Recheck anchored substring, but not floating... */
944 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
945 "Looking for anchored substr starting at offset %ld...\n",
946 (long)(other_last - i_strpos)) );
947 goto do_other_anchored;
949 /* Another way we could have checked stclass at the
950 current position only: */
955 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
956 "Looking for /%s^%s/m starting at offset %ld...\n",
957 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
960 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
962 /* Check is floating subtring. */
963 retry_floating_check:
964 t = check_at - start_shift;
965 DEBUG_EXECUTE_r( what = "floating" );
966 goto hop_and_restart;
969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
970 "By STCLASS: moving %ld --> %ld\n",
971 (long)(t - i_strpos), (long)(s - i_strpos))
975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
976 "Does not contradict STCLASS...\n");
981 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
982 PL_colors[4], (check ? "Guessed" : "Giving up"),
983 PL_colors[5], (long)(s - i_strpos)) );
986 fail_finish: /* Substring not found */
987 if (prog->check_substr || prog->check_utf8) /* could be removed already */
988 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
991 PL_colors[4], PL_colors[5]));
997 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
998 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
999 switch (trie_type) { \
1000 case trie_utf8_fold: \
1001 if ( foldlen>0 ) { \
1002 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1007 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1008 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1009 foldlen -= UNISKIP( uvc ); \
1010 uscan = foldbuf + UNISKIP( uvc ); \
1014 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1022 charid = trie->charmap[ uvc ]; \
1026 if (widecharmap) { \
1027 SV** const svpp = hv_fetch(widecharmap, \
1028 (char*)&uvc, sizeof(UV), 0); \
1030 charid = (U16)SvIV(*svpp); \
1035 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1038 ibcmp_utf8(s, NULL, 0, do_utf8, \
1039 m, NULL, ln, (bool)UTF)) \
1040 && (!reginfo || regtry(reginfo, &s)) ) \
1043 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1044 uvchr_to_utf8(tmpbuf, c); \
1045 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1047 && (f == c1 || f == c2) \
1048 && (ln == foldlen || \
1049 !ibcmp_utf8((char *) foldbuf, \
1050 NULL, foldlen, do_utf8, \
1052 NULL, ln, (bool)UTF)) \
1053 && (!reginfo || regtry(reginfo, &s)) ) \
1058 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1062 && (ln == 1 || !(OP(c) == EXACTF \
1064 : ibcmp_locale(s, m, ln))) \
1065 && (!reginfo || regtry(reginfo, &s)) ) \
1071 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1073 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1079 #define REXEC_FBC_SCAN(CoDe) \
1081 while (s < strend) { \
1087 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1088 REXEC_FBC_UTF8_SCAN( \
1090 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1099 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1102 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1111 #define REXEC_FBC_TRYIT \
1112 if ((!reginfo || regtry(reginfo, &s))) \
1115 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1118 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1121 REXEC_FBC_CLASS_SCAN(CoNd); \
1125 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1126 PL_reg_flags |= RF_tainted; \
1128 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1131 REXEC_FBC_CLASS_SCAN(CoNd); \
1135 #define DUMP_EXEC_POS(li,s,doutf8) \
1136 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1138 /* We know what class REx starts with. Try to find this position... */
1139 /* if reginfo is NULL, its a dryrun */
1140 /* annoyingly all the vars in this routine have different names from their counterparts
1141 in regmatch. /grrr */
1144 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1145 const char *strend, regmatch_info *reginfo)
1148 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1152 register STRLEN uskip;
1156 register I32 tmp = 1; /* Scratch variable? */
1157 register const bool do_utf8 = PL_reg_match_utf8;
1158 RXi_GET_DECL(prog,progi);
1160 /* We know what class it must start with. */
1164 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1165 !UTF8_IS_INVARIANT((U8)s[0]) ?
1166 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1167 REGINCLASS(prog, c, (U8*)s));
1170 while (s < strend) {
1173 if (REGINCLASS(prog, c, (U8*)s) ||
1174 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1175 /* The assignment of 2 is intentional:
1176 * for the folded sharp s, the skip is 2. */
1177 (skip = SHARP_S_SKIP))) {
1178 if (tmp && (!reginfo || regtry(reginfo, &s)))
1191 if (tmp && (!reginfo || regtry(reginfo, &s)))
1199 ln = STR_LEN(c); /* length to match in octets/bytes */
1200 lnc = (I32) ln; /* length to match in characters */
1202 STRLEN ulen1, ulen2;
1204 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1205 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1208 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1209 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1211 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1213 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1216 while (sm < ((U8 *) m + ln)) {
1231 c2 = PL_fold_locale[c1];
1233 e = HOP3c(strend, -((I32)lnc), s);
1235 if (!reginfo && e < s)
1236 e = s; /* Due to minlen logic of intuit() */
1238 /* The idea in the EXACTF* cases is to first find the
1239 * first character of the EXACTF* node and then, if
1240 * necessary, case-insensitively compare the full
1241 * text of the node. The c1 and c2 are the first
1242 * characters (though in Unicode it gets a bit
1243 * more complicated because there are more cases
1244 * than just upper and lower: one needs to use
1245 * the so-called folding case for case-insensitive
1246 * matching (called "loose matching" in Unicode).
1247 * ibcmp_utf8() will do just that. */
1251 U8 tmpbuf [UTF8_MAXBYTES+1];
1252 STRLEN len, foldlen;
1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1255 /* Upper and lower of 1st char are equal -
1256 * probably not a "letter". */
1258 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1260 REXEC_FBC_EXACTISH_CHECK(c == c1);
1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1268 /* Handle some of the three Greek sigmas cases.
1269 * Note that not all the possible combinations
1270 * are handled here: some of them are handled
1271 * by the standard folding rules, and some of
1272 * them (the character class or ANYOF cases)
1273 * are handled during compiletime in
1274 * regexec.c:S_regclass(). */
1275 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1276 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1277 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1279 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1285 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1287 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1291 PL_reg_flags |= RF_tainted;
1298 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1299 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1301 tmp = ((OP(c) == BOUND ?
1302 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1303 LOAD_UTF8_CHARCLASS_ALNUM();
1304 REXEC_FBC_UTF8_SCAN(
1305 if (tmp == !(OP(c) == BOUND ?
1306 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1307 isALNUM_LC_utf8((U8*)s)))
1315 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1316 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1319 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1325 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1329 PL_reg_flags |= RF_tainted;
1336 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1337 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1339 tmp = ((OP(c) == NBOUND ?
1340 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1341 LOAD_UTF8_CHARCLASS_ALNUM();
1342 REXEC_FBC_UTF8_SCAN(
1343 if (tmp == !(OP(c) == NBOUND ?
1344 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1345 isALNUM_LC_utf8((U8*)s)))
1347 else REXEC_FBC_TRYIT;
1351 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1352 tmp = ((OP(c) == NBOUND ?
1353 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1356 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1358 else REXEC_FBC_TRYIT;
1361 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1365 REXEC_FBC_CSCAN_PRELOAD(
1366 LOAD_UTF8_CHARCLASS_ALNUM(),
1367 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1371 REXEC_FBC_CSCAN_TAINT(
1372 isALNUM_LC_utf8((U8*)s),
1376 REXEC_FBC_CSCAN_PRELOAD(
1377 LOAD_UTF8_CHARCLASS_ALNUM(),
1378 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1382 REXEC_FBC_CSCAN_TAINT(
1383 !isALNUM_LC_utf8((U8*)s),
1387 REXEC_FBC_CSCAN_PRELOAD(
1388 LOAD_UTF8_CHARCLASS_SPACE(),
1389 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1393 REXEC_FBC_CSCAN_TAINT(
1394 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1398 REXEC_FBC_CSCAN_PRELOAD(
1399 LOAD_UTF8_CHARCLASS_SPACE(),
1400 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1404 REXEC_FBC_CSCAN_TAINT(
1405 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1409 REXEC_FBC_CSCAN_PRELOAD(
1410 LOAD_UTF8_CHARCLASS_DIGIT(),
1411 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1415 REXEC_FBC_CSCAN_TAINT(
1416 isDIGIT_LC_utf8((U8*)s),
1420 REXEC_FBC_CSCAN_PRELOAD(
1421 LOAD_UTF8_CHARCLASS_DIGIT(),
1422 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1426 REXEC_FBC_CSCAN_TAINT(
1427 !isDIGIT_LC_utf8((U8*)s),
1433 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1434 trie_type = do_utf8 ?
1435 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1437 /* what trie are we using right now */
1439 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1441 = (reg_trie_data*)progi->data->data[ aho->trie ];
1442 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1444 const char *last_start = strend - trie->minlen;
1446 const char *real_start = s;
1448 STRLEN maxlen = trie->maxlen;
1450 U8 **points; /* map of where we were in the input string
1451 when reading a given char. For ASCII this
1452 is unnecessary overhead as the relationship
1453 is always 1:1, but for unicode, especially
1454 case folded unicode this is not true. */
1455 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459 GET_RE_DEBUG_FLAGS_DECL;
1461 /* We can't just allocate points here. We need to wrap it in
1462 * an SV so it gets freed properly if there is a croak while
1463 * running the match */
1466 sv_points=newSV(maxlen * sizeof(U8 *));
1467 SvCUR_set(sv_points,
1468 maxlen * sizeof(U8 *));
1469 SvPOK_on(sv_points);
1470 sv_2mortal(sv_points);
1471 points=(U8**)SvPV_nolen(sv_points );
1472 if ( trie_type != trie_utf8_fold
1473 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1476 bitmap=(U8*)trie->bitmap;
1478 bitmap=(U8*)ANYOF_BITMAP(c);
1480 /* this is the Aho-Corasick algorithm modified a touch
1481 to include special handling for long "unknown char"
1482 sequences. The basic idea being that we use AC as long
1483 as we are dealing with a possible matching char, when
1484 we encounter an unknown char (and we have not encountered
1485 an accepting state) we scan forward until we find a legal
1487 AC matching is basically that of trie matching, except
1488 that when we encounter a failing transition, we fall back
1489 to the current states "fail state", and try the current char
1490 again, a process we repeat until we reach the root state,
1491 state 1, or a legal transition. If we fail on the root state
1492 then we can either terminate if we have reached an accepting
1493 state previously, or restart the entire process from the beginning
1497 while (s <= last_start) {
1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1506 U8 *uscan = (U8*)NULL;
1507 U8 *leftmost = NULL;
1509 U32 accepted_word= 0;
1513 while ( state && uc <= (U8*)strend ) {
1515 U32 word = aho->states[ state ].wordnum;
1519 DEBUG_TRIE_EXECUTE_r(
1520 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 (char *)uc, do_utf8 );
1523 PerlIO_printf( Perl_debug_log,
1524 " Scanning for legal start char...\n");
1527 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1532 if (uc >(U8*)last_start) break;
1536 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1537 if (!leftmost || lpos < leftmost) {
1538 DEBUG_r(accepted_word=word);
1544 points[pointpos++ % maxlen]= uc;
1545 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1546 uscan, len, uvc, charid, foldlen,
1548 DEBUG_TRIE_EXECUTE_r({
1549 dump_exec_pos( (char *)uc, c, strend, real_start,
1551 PerlIO_printf(Perl_debug_log,
1552 " Charid:%3u CP:%4"UVxf" ",
1558 word = aho->states[ state ].wordnum;
1560 base = aho->states[ state ].trans.base;
1562 DEBUG_TRIE_EXECUTE_r({
1564 dump_exec_pos( (char *)uc, c, strend, real_start,
1566 PerlIO_printf( Perl_debug_log,
1567 "%sState: %4"UVxf", word=%"UVxf,
1568 failed ? " Fail transition to " : "",
1569 (UV)state, (UV)word);
1574 (base + charid > trie->uniquecharcount )
1575 && (base + charid - 1 - trie->uniquecharcount
1577 && trie->trans[base + charid - 1 -
1578 trie->uniquecharcount].check == state
1579 && (tmp=trie->trans[base + charid - 1 -
1580 trie->uniquecharcount ].next))
1582 DEBUG_TRIE_EXECUTE_r(
1583 PerlIO_printf( Perl_debug_log," - legal\n"));
1588 DEBUG_TRIE_EXECUTE_r(
1589 PerlIO_printf( Perl_debug_log," - fail\n"));
1591 state = aho->fail[state];
1595 /* we must be accepting here */
1596 DEBUG_TRIE_EXECUTE_r(
1597 PerlIO_printf( Perl_debug_log," - accepting\n"));
1606 if (!state) state = 1;
1609 if ( aho->states[ state ].wordnum ) {
1610 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1611 if (!leftmost || lpos < leftmost) {
1612 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1617 s = (char*)leftmost;
1618 DEBUG_TRIE_EXECUTE_r({
1620 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1621 (UV)accepted_word, (IV)(s - real_start)
1624 if (!reginfo || regtry(reginfo, &s)) {
1630 DEBUG_TRIE_EXECUTE_r({
1631 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1634 DEBUG_TRIE_EXECUTE_r(
1635 PerlIO_printf( Perl_debug_log,"No match.\n"));
1644 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1653 S_swap_match_buff (pTHX_ regexp *prog) {
1657 /* We have to be careful. If the previous successful match
1658 was from this regex we don't want a subsequent paritally
1659 successful match to clobber the old results.
1660 So when we detect this possibility we add a swap buffer
1661 to the re, and switch the buffer each match. If we fail
1662 we switch it back, otherwise we leave it swapped.
1664 Newxz(prog->swap, 1, regexp_paren_ofs);
1665 /* no need to copy these */
1666 Newxz(prog->swap->startp, 2 * (prog->nparens + 1), I32);
1667 prog->swap->endp = prog->swap->startp + prog->nparens + 1;
1669 t = prog->swap->startp;
1670 prog->swap->startp = prog->startp;
1672 t = prog->swap->endp;
1673 prog->swap->endp = prog->endp;
1679 - regexec_flags - match a regexp against a string
1682 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1683 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1684 /* strend: pointer to null at end of string */
1685 /* strbeg: real beginning of string */
1686 /* minend: end of match must be >=minend after stringarg. */
1687 /* data: May be used for some additional optimizations.
1688 Currently its only used, with a U32 cast, for transmitting
1689 the ganch offset when doing a /g match. This will change */
1690 /* nosave: For optimizations. */
1693 /*register*/ char *s;
1694 register regnode *c;
1695 /*register*/ char *startpos = stringarg;
1696 I32 minlen; /* must match at least this many chars */
1697 I32 dontbother = 0; /* how many characters not to try at end */
1698 I32 end_shift = 0; /* Same for the end. */ /* CC */
1699 I32 scream_pos = -1; /* Internal iterator of scream. */
1700 char *scream_olds = NULL;
1701 SV* const oreplsv = GvSV(PL_replgv);
1702 const bool do_utf8 = (bool)DO_UTF8(sv);
1704 RXi_GET_DECL(prog,progi);
1705 regmatch_info reginfo; /* create some info to pass to regtry etc */
1706 bool swap_on_fail = 0;
1708 GET_RE_DEBUG_FLAGS_DECL;
1710 PERL_UNUSED_ARG(data);
1712 /* Be paranoid... */
1713 if (prog == NULL || startpos == NULL) {
1714 Perl_croak(aTHX_ "NULL regexp parameter");
1718 multiline = prog->extflags & RXf_PMf_MULTILINE;
1719 reginfo.prog = prog;
1721 RX_MATCH_UTF8_set(prog, do_utf8);
1723 debug_start_match(prog, do_utf8, startpos, strend,
1727 minlen = prog->minlen;
1729 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1731 "String too short [regexec_flags]...\n"));
1736 /* Check validity of program. */
1737 if (UCHARAT(progi->program) != REG_MAGIC) {
1738 Perl_croak(aTHX_ "corrupted regexp program");
1742 PL_reg_eval_set = 0;
1745 if (prog->extflags & RXf_UTF8)
1746 PL_reg_flags |= RF_utf8;
1748 /* Mark beginning of line for ^ and lookbehind. */
1749 reginfo.bol = startpos; /* XXX not used ??? */
1753 /* Mark end of line for $ (and such) */
1756 /* see how far we have to get to not match where we matched before */
1757 reginfo.till = startpos+minend;
1759 /* If there is a "must appear" string, look for it. */
1762 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1765 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1766 reginfo.ganch = startpos + prog->gofs;
1767 else if (sv && SvTYPE(sv) >= SVt_PVMG
1769 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1770 && mg->mg_len >= 0) {
1771 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1772 if (prog->extflags & RXf_ANCH_GPOS) {
1773 if (s > reginfo.ganch)
1775 s = reginfo.ganch - prog->gofs;
1779 reginfo.ganch = strbeg + PTR2UV(data);
1780 } else /* pos() not defined */
1781 reginfo.ganch = strbeg;
1783 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1785 swap_match_buff(prog); /* do we need a save destructor here for
1788 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1789 re_scream_pos_data d;
1791 d.scream_olds = &scream_olds;
1792 d.scream_pos = &scream_pos;
1793 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1796 goto phooey; /* not present */
1802 /* Simplest case: anchored match need be tried only once. */
1803 /* [unless only anchor is BOL and multiline is set] */
1804 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1805 if (s == startpos && regtry(®info, &startpos))
1807 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1808 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1813 dontbother = minlen - 1;
1814 end = HOP3c(strend, -dontbother, strbeg) - 1;
1815 /* for multiline we only have to try after newlines */
1816 if (prog->check_substr || prog->check_utf8) {
1820 if (regtry(®info, &s))
1825 if (prog->extflags & RXf_USE_INTUIT) {
1826 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1837 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1838 if (regtry(®info, &s))
1845 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1847 /* the warning about reginfo.ganch being used without intialization
1848 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1849 and we only enter this block when the same bit is set. */
1850 char *tmp_s = reginfo.ganch - prog->gofs;
1851 if (regtry(®info, &tmp_s))
1856 /* Messy cases: unanchored match. */
1857 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1858 /* we have /x+whatever/ */
1859 /* it must be a one character string (XXXX Except UTF?) */
1864 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1865 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1866 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1871 DEBUG_EXECUTE_r( did_match = 1 );
1872 if (regtry(®info, &s)) goto got_it;
1874 while (s < strend && *s == ch)
1882 DEBUG_EXECUTE_r( did_match = 1 );
1883 if (regtry(®info, &s)) goto got_it;
1885 while (s < strend && *s == ch)
1890 DEBUG_EXECUTE_r(if (!did_match)
1891 PerlIO_printf(Perl_debug_log,
1892 "Did not find anchored character...\n")
1895 else if (prog->anchored_substr != NULL
1896 || prog->anchored_utf8 != NULL
1897 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1898 && prog->float_max_offset < strend - s)) {
1903 char *last1; /* Last position checked before */
1907 if (prog->anchored_substr || prog->anchored_utf8) {
1908 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1909 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1910 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1911 back_max = back_min = prog->anchored_offset;
1913 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1914 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1915 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1916 back_max = prog->float_max_offset;
1917 back_min = prog->float_min_offset;
1921 if (must == &PL_sv_undef)
1922 /* could not downgrade utf8 check substring, so must fail */
1928 last = HOP3c(strend, /* Cannot start after this */
1929 -(I32)(CHR_SVLEN(must)
1930 - (SvTAIL(must) != 0) + back_min), strbeg);
1933 last1 = HOPc(s, -1);
1935 last1 = s - 1; /* bogus */
1937 /* XXXX check_substr already used to find "s", can optimize if
1938 check_substr==must. */
1940 dontbother = end_shift;
1941 strend = HOPc(strend, -dontbother);
1942 while ( (s <= last) &&
1943 ((flags & REXEC_SCREAM)
1944 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1945 end_shift, &scream_pos, 0))
1946 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1947 (unsigned char*)strend, must,
1948 multiline ? FBMrf_MULTILINE : 0))) ) {
1949 /* we may be pointing at the wrong string */
1950 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1951 s = strbeg + (s - SvPVX_const(sv));
1952 DEBUG_EXECUTE_r( did_match = 1 );
1953 if (HOPc(s, -back_max) > last1) {
1954 last1 = HOPc(s, -back_min);
1955 s = HOPc(s, -back_max);
1958 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1960 last1 = HOPc(s, -back_min);
1964 while (s <= last1) {
1965 if (regtry(®info, &s))
1971 while (s <= last1) {
1972 if (regtry(®info, &s))
1978 DEBUG_EXECUTE_r(if (!did_match) {
1979 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1980 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1981 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1982 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1983 ? "anchored" : "floating"),
1984 quoted, RE_SV_TAIL(must));
1988 else if ( (c = progi->regstclass) ) {
1990 const OPCODE op = OP(progi->regstclass);
1991 /* don't bother with what can't match */
1992 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1993 strend = HOPc(strend, -(minlen - 1));
1996 SV * const prop = sv_newmortal();
1997 regprop(prog, prop, c);
1999 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2001 PerlIO_printf(Perl_debug_log,
2002 "Matching stclass %.*s against %s (%d chars)\n",
2003 (int)SvCUR(prop), SvPVX_const(prop),
2004 quoted, (int)(strend - s));
2007 if (find_byclass(prog, c, s, strend, ®info))
2009 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2013 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2018 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2019 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2020 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2022 if (flags & REXEC_SCREAM) {
2023 last = screaminstr(sv, float_real, s - strbeg,
2024 end_shift, &scream_pos, 1); /* last one */
2026 last = scream_olds; /* Only one occurrence. */
2027 /* we may be pointing at the wrong string */
2028 else if (RX_MATCH_COPIED(prog))
2029 s = strbeg + (s - SvPVX_const(sv));
2033 const char * const little = SvPV_const(float_real, len);
2035 if (SvTAIL(float_real)) {
2036 if (memEQ(strend - len + 1, little, len - 1))
2037 last = strend - len + 1;
2038 else if (!multiline)
2039 last = memEQ(strend - len, little, len)
2040 ? strend - len : NULL;
2046 last = rninstr(s, strend, little, little + len);
2048 last = strend; /* matching "$" */
2053 PerlIO_printf(Perl_debug_log,
2054 "%sCan't trim the tail, match fails (should not happen)%s\n",
2055 PL_colors[4], PL_colors[5]));
2056 goto phooey; /* Should not happen! */
2058 dontbother = strend - last + prog->float_min_offset;
2060 if (minlen && (dontbother < minlen))
2061 dontbother = minlen - 1;
2062 strend -= dontbother; /* this one's always in bytes! */
2063 /* We don't know much -- general case. */
2066 if (regtry(®info, &s))
2075 if (regtry(®info, &s))
2077 } while (s++ < strend);
2085 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2087 if (PL_reg_eval_set) {
2088 /* Preserve the current value of $^R */
2089 if (oreplsv != GvSV(PL_replgv))
2090 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2091 restored, the value remains
2093 restore_pos(aTHX_ prog);
2095 if (prog->paren_names)
2096 (void)hv_iterinit(prog->paren_names);
2098 /* make sure $`, $&, $', and $digit will work later */
2099 if ( !(flags & REXEC_NOT_FIRST) ) {
2100 RX_MATCH_COPY_FREE(prog);
2101 if (flags & REXEC_COPY_STR) {
2102 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2103 #ifdef PERL_OLD_COPY_ON_WRITE
2105 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2107 PerlIO_printf(Perl_debug_log,
2108 "Copy on write: regexp capture, type %d\n",
2111 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2112 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2113 assert (SvPOKp(prog->saved_copy));
2117 RX_MATCH_COPIED_on(prog);
2118 s = savepvn(strbeg, i);
2124 prog->subbeg = strbeg;
2125 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2132 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2133 PL_colors[4], PL_colors[5]));
2134 if (PL_reg_eval_set)
2135 restore_pos(aTHX_ prog);
2137 /* we failed :-( roll it back */
2138 swap_match_buff(prog);
2145 - regtry - try match at specific point
2147 STATIC I32 /* 0 failure, 1 success */
2148 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2154 regexp *prog = reginfo->prog;
2155 RXi_GET_DECL(prog,progi);
2156 GET_RE_DEBUG_FLAGS_DECL;
2157 reginfo->cutpoint=NULL;
2159 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2162 PL_reg_eval_set = RS_init;
2163 DEBUG_EXECUTE_r(DEBUG_s(
2164 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2165 (IV)(PL_stack_sp - PL_stack_base));
2168 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2169 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2171 /* Apparently this is not needed, judging by wantarray. */
2172 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2173 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2176 /* Make $_ available to executed code. */
2177 if (reginfo->sv != DEFSV) {
2179 DEFSV = reginfo->sv;
2182 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2183 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2184 /* prepare for quick setting of pos */
2185 #ifdef PERL_OLD_COPY_ON_WRITE
2186 if (SvIsCOW(reginfo->sv))
2187 sv_force_normal_flags(reginfo->sv, 0);
2189 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2190 &PL_vtbl_mglob, NULL, 0);
2194 PL_reg_oldpos = mg->mg_len;
2195 SAVEDESTRUCTOR_X(restore_pos, prog);
2197 if (!PL_reg_curpm) {
2198 Newxz(PL_reg_curpm, 1, PMOP);
2201 SV* const repointer = newSViv(0);
2202 /* so we know which PL_regex_padav element is PL_reg_curpm */
2203 SvFLAGS(repointer) |= SVf_BREAK;
2204 av_push(PL_regex_padav,repointer);
2205 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2206 PL_regex_pad = AvARRAY(PL_regex_padav);
2210 PM_SETRE(PL_reg_curpm, prog);
2211 PL_reg_oldcurpm = PL_curpm;
2212 PL_curpm = PL_reg_curpm;
2213 if (RX_MATCH_COPIED(prog)) {
2214 /* Here is a serious problem: we cannot rewrite subbeg,
2215 since it may be needed if this match fails. Thus
2216 $` inside (?{}) could fail... */
2217 PL_reg_oldsaved = prog->subbeg;
2218 PL_reg_oldsavedlen = prog->sublen;
2219 #ifdef PERL_OLD_COPY_ON_WRITE
2220 PL_nrs = prog->saved_copy;
2222 RX_MATCH_COPIED_off(prog);
2225 PL_reg_oldsaved = NULL;
2226 prog->subbeg = PL_bostr;
2227 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2229 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2230 prog->startp[0] = *startpos - PL_bostr;
2231 PL_reginput = *startpos;
2232 PL_reglastparen = &prog->lastparen;
2233 PL_reglastcloseparen = &prog->lastcloseparen;
2234 prog->lastparen = 0;
2235 prog->lastcloseparen = 0;
2237 PL_regstartp = prog->startp;
2238 PL_regendp = prog->endp;
2239 if (PL_reg_start_tmpl <= prog->nparens) {
2240 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2241 if(PL_reg_start_tmp)
2242 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2244 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2247 /* XXXX What this code is doing here?!!! There should be no need
2248 to do this again and again, PL_reglastparen should take care of
2251 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2252 * Actually, the code in regcppop() (which Ilya may be meaning by
2253 * PL_reglastparen), is not needed at all by the test suite
2254 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2255 * enough, for building DynaLoader, or otherwise this
2256 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2257 * will happen. Meanwhile, this code *is* needed for the
2258 * above-mentioned test suite tests to succeed. The common theme
2259 * on those tests seems to be returning null fields from matches.
2264 if (prog->nparens) {
2266 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2273 if (regmatch(reginfo, progi->program + 1)) {
2274 PL_regendp[0] = PL_reginput - PL_bostr;
2277 if (reginfo->cutpoint)
2278 *startpos= reginfo->cutpoint;
2279 REGCP_UNWIND(lastcp);
2284 #define sayYES goto yes
2285 #define sayNO goto no
2286 #define sayNO_SILENT goto no_silent
2288 /* we dont use STMT_START/END here because it leads to
2289 "unreachable code" warnings, which are bogus, but distracting. */
2290 #define CACHEsayNO \
2291 if (ST.cache_mask) \
2292 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2295 /* this is used to determine how far from the left messages like
2296 'failed...' are printed. It should be set such that messages
2297 are inline with the regop output that created them.
2299 #define REPORT_CODE_OFF 32
2302 /* Make sure there is a test for this +1 options in re_tests */
2303 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2305 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2306 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2308 #define SLAB_FIRST(s) (&(s)->states[0])
2309 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2311 /* grab a new slab and return the first slot in it */
2313 STATIC regmatch_state *
2316 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2319 regmatch_slab *s = PL_regmatch_slab->next;
2321 Newx(s, 1, regmatch_slab);
2322 s->prev = PL_regmatch_slab;
2324 PL_regmatch_slab->next = s;
2326 PL_regmatch_slab = s;
2327 return SLAB_FIRST(s);
2331 /* push a new state then goto it */
2333 #define PUSH_STATE_GOTO(state, node) \
2335 st->resume_state = state; \
2338 /* push a new state with success backtracking, then goto it */
2340 #define PUSH_YES_STATE_GOTO(state, node) \
2342 st->resume_state = state; \
2343 goto push_yes_state;
2349 regmatch() - main matching routine
2351 This is basically one big switch statement in a loop. We execute an op,
2352 set 'next' to point the next op, and continue. If we come to a point which
2353 we may need to backtrack to on failure such as (A|B|C), we push a
2354 backtrack state onto the backtrack stack. On failure, we pop the top
2355 state, and re-enter the loop at the state indicated. If there are no more
2356 states to pop, we return failure.
2358 Sometimes we also need to backtrack on success; for example /A+/, where
2359 after successfully matching one A, we need to go back and try to
2360 match another one; similarly for lookahead assertions: if the assertion
2361 completes successfully, we backtrack to the state just before the assertion
2362 and then carry on. In these cases, the pushed state is marked as
2363 'backtrack on success too'. This marking is in fact done by a chain of
2364 pointers, each pointing to the previous 'yes' state. On success, we pop to
2365 the nearest yes state, discarding any intermediate failure-only states.
2366 Sometimes a yes state is pushed just to force some cleanup code to be
2367 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2368 it to free the inner regex.
2370 Note that failure backtracking rewinds the cursor position, while
2371 success backtracking leaves it alone.
2373 A pattern is complete when the END op is executed, while a subpattern
2374 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2375 ops trigger the "pop to last yes state if any, otherwise return true"
2378 A common convention in this function is to use A and B to refer to the two
2379 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2380 the subpattern to be matched possibly multiple times, while B is the entire
2381 rest of the pattern. Variable and state names reflect this convention.
2383 The states in the main switch are the union of ops and failure/success of
2384 substates associated with with that op. For example, IFMATCH is the op
2385 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2386 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2387 successfully matched A and IFMATCH_A_fail is a state saying that we have
2388 just failed to match A. Resume states always come in pairs. The backtrack
2389 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2390 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2391 on success or failure.
2393 The struct that holds a backtracking state is actually a big union, with
2394 one variant for each major type of op. The variable st points to the
2395 top-most backtrack struct. To make the code clearer, within each
2396 block of code we #define ST to alias the relevant union.
2398 Here's a concrete example of a (vastly oversimplified) IFMATCH
2404 #define ST st->u.ifmatch
2406 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2407 ST.foo = ...; // some state we wish to save
2409 // push a yes backtrack state with a resume value of
2410 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2412 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2415 case IFMATCH_A: // we have successfully executed A; now continue with B
2417 bar = ST.foo; // do something with the preserved value
2420 case IFMATCH_A_fail: // A failed, so the assertion failed
2421 ...; // do some housekeeping, then ...
2422 sayNO; // propagate the failure
2429 For any old-timers reading this who are familiar with the old recursive
2430 approach, the code above is equivalent to:
2432 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2441 ...; // do some housekeeping, then ...
2442 sayNO; // propagate the failure
2445 The topmost backtrack state, pointed to by st, is usually free. If you
2446 want to claim it, populate any ST.foo fields in it with values you wish to
2447 save, then do one of
2449 PUSH_STATE_GOTO(resume_state, node);
2450 PUSH_YES_STATE_GOTO(resume_state, node);
2452 which sets that backtrack state's resume value to 'resume_state', pushes a
2453 new free entry to the top of the backtrack stack, then goes to 'node'.
2454 On backtracking, the free slot is popped, and the saved state becomes the
2455 new free state. An ST.foo field in this new top state can be temporarily
2456 accessed to retrieve values, but once the main loop is re-entered, it
2457 becomes available for reuse.
2459 Note that the depth of the backtrack stack constantly increases during the
2460 left-to-right execution of the pattern, rather than going up and down with
2461 the pattern nesting. For example the stack is at its maximum at Z at the
2462 end of the pattern, rather than at X in the following:
2464 /(((X)+)+)+....(Y)+....Z/
2466 The only exceptions to this are lookahead/behind assertions and the cut,
2467 (?>A), which pop all the backtrack states associated with A before
2470 Bascktrack state structs are allocated in slabs of about 4K in size.
2471 PL_regmatch_state and st always point to the currently active state,
2472 and PL_regmatch_slab points to the slab currently containing
2473 PL_regmatch_state. The first time regmatch() is called, the first slab is
2474 allocated, and is never freed until interpreter destruction. When the slab
2475 is full, a new one is allocated and chained to the end. At exit from
2476 regmatch(), slabs allocated since entry are freed.
2481 #define DEBUG_STATE_pp(pp) \
2483 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2484 PerlIO_printf(Perl_debug_log, \
2485 " %*s"pp" %s%s%s%s%s\n", \
2487 PL_reg_name[st->resume_state], \
2488 ((st==yes_state||st==mark_state) ? "[" : ""), \
2489 ((st==yes_state) ? "Y" : ""), \
2490 ((st==mark_state) ? "M" : ""), \
2491 ((st==yes_state||st==mark_state) ? "]" : "") \
2496 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2501 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2502 const char *start, const char *end, const char *blurb)
2504 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2508 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2509 prog->precomp, prog->prelen, 60);
2511 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2512 start, end - start, 60);
2514 PerlIO_printf(Perl_debug_log,
2515 "%s%s REx%s %s against %s\n",
2516 PL_colors[4], blurb, PL_colors[5], s0, s1);
2518 if (do_utf8||utf8_pat)
2519 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2520 utf8_pat ? "pattern" : "",
2521 utf8_pat && do_utf8 ? " and " : "",
2522 do_utf8 ? "string" : ""
2528 S_dump_exec_pos(pTHX_ const char *locinput,
2529 const regnode *scan,
2530 const char *loc_regeol,
2531 const char *loc_bostr,
2532 const char *loc_reg_starttry,
2535 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2536 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2537 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2538 /* The part of the string before starttry has one color
2539 (pref0_len chars), between starttry and current
2540 position another one (pref_len - pref0_len chars),
2541 after the current position the third one.
2542 We assume that pref0_len <= pref_len, otherwise we
2543 decrease pref0_len. */
2544 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2545 ? (5 + taill) - l : locinput - loc_bostr;
2548 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2550 pref0_len = pref_len - (locinput - loc_reg_starttry);
2551 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2552 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2553 ? (5 + taill) - pref_len : loc_regeol - locinput);
2554 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2558 if (pref0_len > pref_len)
2559 pref0_len = pref_len;
2561 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2563 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2564 (locinput - pref_len),pref0_len, 60, 4, 5);
2566 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2567 (locinput - pref_len + pref0_len),
2568 pref_len - pref0_len, 60, 2, 3);
2570 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2571 locinput, loc_regeol - locinput, 10, 0, 1);
2573 const STRLEN tlen=len0+len1+len2;
2574 PerlIO_printf(Perl_debug_log,
2575 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2576 (IV)(locinput - loc_bostr),
2579 (docolor ? "" : "> <"),
2581 (int)(tlen > 19 ? 0 : 19 - tlen),
2588 /* reg_check_named_buff_matched()
2589 * Checks to see if a named buffer has matched. The data array of
2590 * buffer numbers corresponding to the buffer is expected to reside
2591 * in the regexp->data->data array in the slot stored in the ARG() of
2592 * node involved. Note that this routine doesn't actually care about the
2593 * name, that information is not preserved from compilation to execution.
2594 * Returns the index of the leftmost defined buffer with the given name
2595 * or 0 if non of the buffers matched.
2598 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2600 RXi_GET_DECL(rex,rexi);
2601 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2602 I32 *nums=(I32*)SvPVX(sv_dat);
2603 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2604 if ((I32)*PL_reglastparen >= nums[n] &&
2605 PL_regendp[nums[n]] != -1)
2613 #define SETREX(Re1,Re2) \
2614 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2617 STATIC I32 /* 0 failure, 1 success */
2618 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2620 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2624 register const bool do_utf8 = PL_reg_match_utf8;
2625 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2627 regexp *rex = reginfo->prog;
2628 RXi_GET_DECL(rex,rexi);
2630 regmatch_slab *orig_slab;
2631 regmatch_state *orig_state;
2633 /* the current state. This is a cached copy of PL_regmatch_state */
2634 register regmatch_state *st;
2636 /* cache heavy used fields of st in registers */
2637 register regnode *scan;
2638 register regnode *next;
2639 register U32 n = 0; /* general value; init to avoid compiler warning */
2640 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2641 register char *locinput = PL_reginput;
2642 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2644 bool result = 0; /* return value of S_regmatch */
2645 int depth = 0; /* depth of backtrack stack */
2646 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2647 const U32 max_nochange_depth =
2648 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2649 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2651 regmatch_state *yes_state = NULL; /* state to pop to on success of
2653 /* mark_state piggy backs on the yes_state logic so that when we unwind
2654 the stack on success we can update the mark_state as we go */
2655 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2657 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2658 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2660 bool no_final = 0; /* prevent failure from backtracking? */
2661 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2662 char *startpoint = PL_reginput;
2663 SV *popmark = NULL; /* are we looking for a mark? */
2664 SV *sv_commit = NULL; /* last mark name seen in failure */
2665 SV *sv_yes_mark = NULL; /* last mark name we have seen
2666 during a successfull match */
2667 U32 lastopen = 0; /* last open we saw */
2668 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2671 /* these three flags are set by various ops to signal information to
2672 * the very next op. They have a useful lifetime of exactly one loop
2673 * iteration, and are not preserved or restored by state pushes/pops
2675 bool sw = 0; /* the condition value in (?(cond)a|b) */
2676 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2677 int logical = 0; /* the following EVAL is:
2681 or the following IFMATCH/UNLESSM is:
2682 false: plain (?=foo)
2683 true: used as a condition: (?(?=foo))
2687 GET_RE_DEBUG_FLAGS_DECL;
2690 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2691 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2693 /* on first ever call to regmatch, allocate first slab */
2694 if (!PL_regmatch_slab) {
2695 Newx(PL_regmatch_slab, 1, regmatch_slab);
2696 PL_regmatch_slab->prev = NULL;
2697 PL_regmatch_slab->next = NULL;
2698 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2701 /* remember current high-water mark for exit */
2702 /* XXX this should be done with SAVE* instead */
2703 orig_slab = PL_regmatch_slab;
2704 orig_state = PL_regmatch_state;
2706 /* grab next free state slot */
2707 st = ++PL_regmatch_state;
2708 if (st > SLAB_LAST(PL_regmatch_slab))
2709 st = PL_regmatch_state = S_push_slab(aTHX);
2711 /* Note that nextchr is a byte even in UTF */
2712 nextchr = UCHARAT(locinput);
2714 while (scan != NULL) {
2717 SV * const prop = sv_newmortal();
2718 regnode *rnext=regnext(scan);
2719 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2720 regprop(rex, prop, scan);
2722 PerlIO_printf(Perl_debug_log,
2723 "%3"IVdf":%*s%s(%"IVdf")\n",
2724 (IV)(scan - rexi->program), depth*2, "",
2726 (PL_regkind[OP(scan)] == END || !rnext) ?
2727 0 : (IV)(rnext - rexi->program));
2730 next = scan + NEXT_OFF(scan);
2733 state_num = OP(scan);
2736 switch (state_num) {
2738 if (locinput == PL_bostr)
2740 /* reginfo->till = reginfo->bol; */
2745 if (locinput == PL_bostr ||
2746 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2752 if (locinput == PL_bostr)
2756 if (locinput == reginfo->ganch)
2761 /* update the startpoint */
2762 st->u.keeper.val = PL_regstartp[0];
2763 PL_reginput = locinput;
2764 PL_regstartp[0] = locinput - PL_bostr;
2765 PUSH_STATE_GOTO(KEEPS_next, next);
2767 case KEEPS_next_fail:
2768 /* rollback the start point change */
2769 PL_regstartp[0] = st->u.keeper.val;
2775 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2780 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2782 if (PL_regeol - locinput > 1)
2786 if (PL_regeol != locinput)
2790 if (!nextchr && locinput >= PL_regeol)
2793 locinput += PL_utf8skip[nextchr];
2794 if (locinput > PL_regeol)
2796 nextchr = UCHARAT(locinput);
2799 nextchr = UCHARAT(++locinput);
2802 if (!nextchr && locinput >= PL_regeol)
2804 nextchr = UCHARAT(++locinput);
2807 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2810 locinput += PL_utf8skip[nextchr];
2811 if (locinput > PL_regeol)
2813 nextchr = UCHARAT(locinput);
2816 nextchr = UCHARAT(++locinput);
2820 #define ST st->u.trie
2822 /* In this case the charclass data is available inline so
2823 we can fail fast without a lot of extra overhead.
2825 if (scan->flags == EXACT || !do_utf8) {
2826 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2828 PerlIO_printf(Perl_debug_log,
2829 "%*s %sfailed to match trie start class...%s\n",
2830 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2839 /* what type of TRIE am I? (utf8 makes this contextual) */
2840 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2841 trie_type = do_utf8 ?
2842 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2845 /* what trie are we using right now */
2846 reg_trie_data * const trie
2847 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2848 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2849 U32 state = trie->startstate;
2851 if (trie->bitmap && trie_type != trie_utf8_fold &&
2852 !TRIE_BITMAP_TEST(trie,*locinput)
2854 if (trie->states[ state ].wordnum) {
2856 PerlIO_printf(Perl_debug_log,
2857 "%*s %smatched empty string...%s\n",
2858 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2863 PerlIO_printf(Perl_debug_log,
2864 "%*s %sfailed to match trie start class...%s\n",
2865 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2872 U8 *uc = ( U8* )locinput;
2876 U8 *uscan = (U8*)NULL;
2878 SV *sv_accept_buff = NULL;
2879 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2881 ST.accepted = 0; /* how many accepting states we have seen */
2883 ST.jump = trie->jump;
2886 traverse the TRIE keeping track of all accepting states
2887 we transition through until we get to a failing node.
2890 while ( state && uc <= (U8*)PL_regeol ) {
2891 U32 base = trie->states[ state ].trans.base;
2894 /* We use charid to hold the wordnum as we don't use it
2895 for charid until after we have done the wordnum logic.
2896 We define an alias just so that the wordnum logic reads
2899 #define got_wordnum charid
2900 got_wordnum = trie->states[ state ].wordnum;
2902 if ( got_wordnum ) {
2903 if ( ! ST.accepted ) {
2906 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2907 sv_accept_buff=newSV(bufflen *
2908 sizeof(reg_trie_accepted) - 1);
2909 SvCUR_set(sv_accept_buff, 0);
2910 SvPOK_on(sv_accept_buff);
2911 sv_2mortal(sv_accept_buff);
2914 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2917 if (ST.accepted >= bufflen) {
2919 ST.accept_buff =(reg_trie_accepted*)
2920 SvGROW(sv_accept_buff,
2921 bufflen * sizeof(reg_trie_accepted));
2923 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2924 + sizeof(reg_trie_accepted));
2927 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2928 ST.accept_buff[ST.accepted].endpos = uc;
2930 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2934 DEBUG_TRIE_EXECUTE_r({
2935 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2936 PerlIO_printf( Perl_debug_log,
2937 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2938 2+depth * 2, "", PL_colors[4],
2939 (UV)state, (UV)ST.accepted );
2943 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2944 uscan, len, uvc, charid, foldlen,
2948 (base + charid > trie->uniquecharcount )
2949 && (base + charid - 1 - trie->uniquecharcount
2951 && trie->trans[base + charid - 1 -
2952 trie->uniquecharcount].check == state)
2954 state = trie->trans[base + charid - 1 -
2955 trie->uniquecharcount ].next;
2966 DEBUG_TRIE_EXECUTE_r(
2967 PerlIO_printf( Perl_debug_log,
2968 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2969 charid, uvc, (UV)state, PL_colors[5] );
2976 PerlIO_printf( Perl_debug_log,
2977 "%*s %sgot %"IVdf" possible matches%s\n",
2978 REPORT_CODE_OFF + depth * 2, "",
2979 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2982 goto trie_first_try; /* jump into the fail handler */
2984 case TRIE_next_fail: /* we failed - try next alterative */
2986 REGCP_UNWIND(ST.cp);
2987 for (n = *PL_reglastparen; n > ST.lastparen; n--)
2989 *PL_reglastparen = n;
2998 ST.lastparen = *PL_reglastparen;
3001 if ( ST.accepted == 1 ) {
3002 /* only one choice left - just continue */
3004 AV *const trie_words
3005 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3006 SV ** const tmp = av_fetch( trie_words,
3007 ST.accept_buff[ 0 ].wordnum-1, 0 );
3008 SV *sv= tmp ? sv_newmortal() : NULL;
3010 PerlIO_printf( Perl_debug_log,
3011 "%*s %sonly one match left: #%d <%s>%s\n",
3012 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3013 ST.accept_buff[ 0 ].wordnum,
3014 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3015 PL_colors[0], PL_colors[1],
3016 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3018 : "not compiled under -Dr",
3021 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3022 /* in this case we free tmps/leave before we call regmatch
3023 as we wont be using accept_buff again. */
3025 locinput = PL_reginput;
3026 nextchr = UCHARAT(locinput);
3027 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3030 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3031 if (!has_cutgroup) {
3036 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3039 continue; /* execute rest of RE */
3042 if ( !ST.accepted-- ) {
3044 PerlIO_printf( Perl_debug_log,
3045 "%*s %sTRIE failed...%s\n",
3046 REPORT_CODE_OFF+depth*2, "",
3057 There are at least two accepting states left. Presumably
3058 the number of accepting states is going to be low,
3059 typically two. So we simply scan through to find the one
3060 with lowest wordnum. Once we find it, we swap the last
3061 state into its place and decrement the size. We then try to
3062 match the rest of the pattern at the point where the word
3063 ends. If we succeed, control just continues along the
3064 regex; if we fail we return here to try the next accepting
3071 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3072 DEBUG_TRIE_EXECUTE_r(
3073 PerlIO_printf( Perl_debug_log,
3074 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3075 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3076 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3077 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3080 if (ST.accept_buff[cur].wordnum <
3081 ST.accept_buff[best].wordnum)
3086 AV *const trie_words
3087 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3088 SV ** const tmp = av_fetch( trie_words,
3089 ST.accept_buff[ best ].wordnum - 1, 0 );
3090 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3092 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3093 SV *sv= tmp ? sv_newmortal() : NULL;
3095 PerlIO_printf( Perl_debug_log,
3096 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3097 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3098 ST.accept_buff[best].wordnum,
3099 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3100 PL_colors[0], PL_colors[1],
3101 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3102 ) : "not compiled under -Dr",
3103 REG_NODE_NUM(nextop),
3107 if ( best<ST.accepted ) {
3108 reg_trie_accepted tmp = ST.accept_buff[ best ];
3109 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3110 ST.accept_buff[ ST.accepted ] = tmp;
3113 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3114 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3118 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3122 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3125 PUSH_STATE_GOTO(TRIE_next, scan);
3138 char *s = STRING(scan);
3140 if (do_utf8 != UTF) {
3141 /* The target and the pattern have differing utf8ness. */
3143 const char * const e = s + ln;
3146 /* The target is utf8, the pattern is not utf8. */
3151 if (NATIVE_TO_UNI(*(U8*)s) !=
3152 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3160 /* The target is not utf8, the pattern is utf8. */
3165 if (NATIVE_TO_UNI(*((U8*)l)) !=
3166 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3174 nextchr = UCHARAT(locinput);
3177 /* The target and the pattern have the same utf8ness. */
3178 /* Inline the first character, for speed. */
3179 if (UCHARAT(s) != nextchr)
3181 if (PL_regeol - locinput < ln)
3183 if (ln > 1 && memNE(s, locinput, ln))
3186 nextchr = UCHARAT(locinput);
3190 PL_reg_flags |= RF_tainted;
3193 char * const s = STRING(scan);
3196 if (do_utf8 || UTF) {
3197 /* Either target or the pattern are utf8. */
3198 const char * const l = locinput;
3199 char *e = PL_regeol;
3201 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3202 l, &e, 0, do_utf8)) {
3203 /* One more case for the sharp s:
3204 * pack("U0U*", 0xDF) =~ /ss/i,
3205 * the 0xC3 0x9F are the UTF-8
3206 * byte sequence for the U+00DF. */
3208 toLOWER(s[0]) == 's' &&
3210 toLOWER(s[1]) == 's' &&
3217 nextchr = UCHARAT(locinput);
3221 /* Neither the target and the pattern are utf8. */
3223 /* Inline the first character, for speed. */
3224 if (UCHARAT(s) != nextchr &&
3225 UCHARAT(s) != ((OP(scan) == EXACTF)
3226 ? PL_fold : PL_fold_locale)[nextchr])
3228 if (PL_regeol - locinput < ln)
3230 if (ln > 1 && (OP(scan) == EXACTF
3231 ? ibcmp(s, locinput, ln)
3232 : ibcmp_locale(s, locinput, ln)))
3235 nextchr = UCHARAT(locinput);
3240 STRLEN inclasslen = PL_regeol - locinput;
3242 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3244 if (locinput >= PL_regeol)
3246 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3247 nextchr = UCHARAT(locinput);
3252 nextchr = UCHARAT(locinput);
3253 if (!REGINCLASS(rex, scan, (U8*)locinput))
3255 if (!nextchr && locinput >= PL_regeol)
3257 nextchr = UCHARAT(++locinput);
3261 /* If we might have the case of the German sharp s
3262 * in a casefolding Unicode character class. */
3264 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3265 locinput += SHARP_S_SKIP;
3266 nextchr = UCHARAT(locinput);
3272 PL_reg_flags |= RF_tainted;
3278 LOAD_UTF8_CHARCLASS_ALNUM();
3279 if (!(OP(scan) == ALNUM
3280 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3281 : isALNUM_LC_utf8((U8*)locinput)))
3285 locinput += PL_utf8skip[nextchr];
3286 nextchr = UCHARAT(locinput);
3289 if (!(OP(scan) == ALNUM
3290 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3292 nextchr = UCHARAT(++locinput);
3295 PL_reg_flags |= RF_tainted;
3298 if (!nextchr && locinput >= PL_regeol)
3301 LOAD_UTF8_CHARCLASS_ALNUM();
3302 if (OP(scan) == NALNUM
3303 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3304 : isALNUM_LC_utf8((U8*)locinput))
3308 locinput += PL_utf8skip[nextchr];
3309 nextchr = UCHARAT(locinput);
3312 if (OP(scan) == NALNUM
3313 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3315 nextchr = UCHARAT(++locinput);
3319 PL_reg_flags |= RF_tainted;
3323 /* was last char in word? */
3325 if (locinput == PL_bostr)
3328 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3330 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3332 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3333 ln = isALNUM_uni(ln);
3334 LOAD_UTF8_CHARCLASS_ALNUM();
3335 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3338 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3339 n = isALNUM_LC_utf8((U8*)locinput);
3343 ln = (locinput != PL_bostr) ?
3344 UCHARAT(locinput - 1) : '\n';
3345 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3347 n = isALNUM(nextchr);
3350 ln = isALNUM_LC(ln);
3351 n = isALNUM_LC(nextchr);
3354 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3355 OP(scan) == BOUNDL))
3359 PL_reg_flags |= RF_tainted;
3365 if (UTF8_IS_CONTINUED(nextchr)) {
3366 LOAD_UTF8_CHARCLASS_SPACE();
3367 if (!(OP(scan) == SPACE
3368 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3369 : isSPACE_LC_utf8((U8*)locinput)))
3373 locinput += PL_utf8skip[nextchr];
3374 nextchr = UCHARAT(locinput);
3377 if (!(OP(scan) == SPACE
3378 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3380 nextchr = UCHARAT(++locinput);
3383 if (!(OP(scan) == SPACE
3384 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3386 nextchr = UCHARAT(++locinput);
3390 PL_reg_flags |= RF_tainted;
3393 if (!nextchr && locinput >= PL_regeol)
3396 LOAD_UTF8_CHARCLASS_SPACE();
3397 if (OP(scan) == NSPACE
3398 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3399 : isSPACE_LC_utf8((U8*)locinput))
3403 locinput += PL_utf8skip[nextchr];
3404 nextchr = UCHARAT(locinput);
3407 if (OP(scan) == NSPACE
3408 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3410 nextchr = UCHARAT(++locinput);
3413 PL_reg_flags |= RF_tainted;
3419 LOAD_UTF8_CHARCLASS_DIGIT();
3420 if (!(OP(scan) == DIGIT
3421 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3422 : isDIGIT_LC_utf8((U8*)locinput)))
3426 locinput += PL_utf8skip[nextchr];
3427 nextchr = UCHARAT(locinput);
3430 if (!(OP(scan) == DIGIT
3431 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3433 nextchr = UCHARAT(++locinput);
3436 PL_reg_flags |= RF_tainted;
3439 if (!nextchr && locinput >= PL_regeol)
3442 LOAD_UTF8_CHARCLASS_DIGIT();
3443 if (OP(scan) == NDIGIT
3444 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3445 : isDIGIT_LC_utf8((U8*)locinput))
3449 locinput += PL_utf8skip[nextchr];
3450 nextchr = UCHARAT(locinput);
3453 if (OP(scan) == NDIGIT
3454 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3456 nextchr = UCHARAT(++locinput);
3459 if (locinput >= PL_regeol)
3462 LOAD_UTF8_CHARCLASS_MARK();
3463 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3465 locinput += PL_utf8skip[nextchr];
3466 while (locinput < PL_regeol &&
3467 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3468 locinput += UTF8SKIP(locinput);
3469 if (locinput > PL_regeol)
3474 nextchr = UCHARAT(locinput);
3481 PL_reg_flags |= RF_tainted;
3486 n = reg_check_named_buff_matched(rex,scan);
3489 type = REF + ( type - NREF );
3496 PL_reg_flags |= RF_tainted;
3500 n = ARG(scan); /* which paren pair */
3503 ln = PL_regstartp[n];
3504 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3505 if (*PL_reglastparen < n || ln == -1)
3506 sayNO; /* Do not match unless seen CLOSEn. */
3507 if (ln == PL_regendp[n])
3511 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3513 const char *e = PL_bostr + PL_regendp[n];
3515 * Note that we can't do the "other character" lookup trick as
3516 * in the 8-bit case (no pun intended) because in Unicode we
3517 * have to map both upper and title case to lower case.
3521 STRLEN ulen1, ulen2;
3522 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3523 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3527 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3528 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3529 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3536 nextchr = UCHARAT(locinput);
3540 /* Inline the first character, for speed. */
3541 if (UCHARAT(s) != nextchr &&
3543 (UCHARAT(s) != (type == REFF
3544 ? PL_fold : PL_fold_locale)[nextchr])))
3546 ln = PL_regendp[n] - ln;
3547 if (locinput + ln > PL_regeol)
3549 if (ln > 1 && (type == REF
3550 ? memNE(s, locinput, ln)
3552 ? ibcmp(s, locinput, ln)
3553 : ibcmp_locale(s, locinput, ln))))
3556 nextchr = UCHARAT(locinput);
3566 #define ST st->u.eval
3570 regexp_internal *rei;
3571 regnode *startpoint;
3574 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3575 if (cur_eval && cur_eval->locinput==locinput) {
3576 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3577 Perl_croak(aTHX_ "Infinite recursion in regex");
3578 if ( ++nochange_depth > max_nochange_depth )
3580 "Pattern subroutine nesting without pos change"
3581 " exceeded limit in regex");
3587 (void)ReREFCNT_inc(rex);
3588 if (OP(scan)==GOSUB) {
3589 startpoint = scan + ARG2L(scan);
3590 ST.close_paren = ARG(scan);
3592 startpoint = rei->program+1;
3595 goto eval_recurse_doit;
3597 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3598 if (cur_eval && cur_eval->locinput==locinput) {
3599 if ( ++nochange_depth > max_nochange_depth )
3600 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3605 /* execute the code in the {...} */
3607 SV ** const before = SP;
3608 OP_4tree * const oop = PL_op;
3609 COP * const ocurcop = PL_curcop;
3613 PL_op = (OP_4tree*)rexi->data->data[n];
3614 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3615 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3616 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3617 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3620 SV *sv_mrk = get_sv("REGMARK", 1);
3621 sv_setsv(sv_mrk, sv_yes_mark);
3624 CALLRUNOPS(aTHX); /* Scalar context. */
3627 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3634 PAD_RESTORE_LOCAL(old_comppad);
3635 PL_curcop = ocurcop;
3638 sv_setsv(save_scalar(PL_replgv), ret);
3642 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3645 /* extract RE object from returned value; compiling if
3650 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3651 mg = mg_find(sv, PERL_MAGIC_qr);
3652 else if (SvSMAGICAL(ret)) {
3653 if (SvGMAGICAL(ret))
3654 sv_unmagic(ret, PERL_MAGIC_qr);
3656 mg = mg_find(ret, PERL_MAGIC_qr);
3660 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
3664 const char * const t = SvPV_const(ret, len);
3666 const I32 osize = PL_regsize;
3669 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3670 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3672 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3674 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3679 RX_MATCH_COPIED_off(re);
3680 re->subbeg = rex->subbeg;
3681 re->sublen = rex->sublen;
3684 debug_start_match(re, do_utf8, locinput, PL_regeol,
3685 "Matching embedded");
3687 startpoint = rei->program + 1;
3688 ST.close_paren = 0; /* only used for GOSUB */
3689 /* borrowed from regtry */
3690 if (PL_reg_start_tmpl <= re->nparens) {
3691 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3692 if(PL_reg_start_tmp)
3693 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3695 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3698 eval_recurse_doit: /* Share code with GOSUB below this line */
3699 /* run the pattern returned from (??{...}) */
3700 ST.cp = regcppush(0); /* Save *all* the positions. */
3701 REGCP_SET(ST.lastcp);
3703 PL_regstartp = re->startp; /* essentially NOOP on GOSUB */
3704 PL_regendp = re->endp; /* essentially NOOP on GOSUB */
3706 *PL_reglastparen = 0;
3707 *PL_reglastcloseparen = 0;
3708 PL_reginput = locinput;
3711 /* XXXX This is too dramatic a measure... */
3714 ST.toggle_reg_flags = PL_reg_flags;
3715 if (re->extflags & RXf_UTF8)
3716 PL_reg_flags |= RF_utf8;
3718 PL_reg_flags &= ~RF_utf8;
3719 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3722 ST.prev_curlyx = cur_curlyx;
3727 ST.prev_eval = cur_eval;
3729 /* now continue from first node in postoned RE */
3730 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3733 /* logical is 1, /(?(?{...})X|Y)/ */
3734 sw = (bool)SvTRUE(ret);
3739 case EVAL_AB: /* cleanup after a successful (??{A})B */
3740 /* note: this is called twice; first after popping B, then A */
3741 PL_reg_flags ^= ST.toggle_reg_flags;
3743 SETREX(rex,ST.prev_rex);
3744 rexi = RXi_GET(rex);
3746 cur_eval = ST.prev_eval;
3747 cur_curlyx = ST.prev_curlyx;
3748 /* XXXX This is too dramatic a measure... */
3750 if ( nochange_depth )
3755 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3756 /* note: this is called twice; first after popping B, then A */
3757 PL_reg_flags ^= ST.toggle_reg_flags;
3759 SETREX(rex,ST.prev_rex);
3760 rexi = RXi_GET(rex);
3761 PL_reginput = locinput;
3762 REGCP_UNWIND(ST.lastcp);
3764 cur_eval = ST.prev_eval;
3765 cur_curlyx = ST.prev_curlyx;
3766 /* XXXX This is too dramatic a measure... */
3768 if ( nochange_depth )
3774 n = ARG(scan); /* which paren pair */
3775 PL_reg_start_tmp[n] = locinput;
3781 n = ARG(scan); /* which paren pair */
3782 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3783 PL_regendp[n] = locinput - PL_bostr;
3784 /*if (n > PL_regsize)
3786 if (n > *PL_reglastparen)
3787 *PL_reglastparen = n;
3788 *PL_reglastcloseparen = n;
3789 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3797 cursor && OP(cursor)!=END;
3798 cursor=regnext(cursor))
3800 if ( OP(cursor)==CLOSE ){
3802 if ( n <= lastopen ) {
3803 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3804 PL_regendp[n] = locinput - PL_bostr;
3805 /*if (n > PL_regsize)
3807 if (n > *PL_reglastparen)
3808 *PL_reglastparen = n;
3809 *PL_reglastcloseparen = n;
3810 if ( n == ARG(scan) || (cur_eval &&
3811 cur_eval->u.eval.close_paren == n))
3820 n = ARG(scan); /* which paren pair */
3821 sw = (bool)(*PL_reglastparen >= n && PL_regendp[n] != -1);
3824 /* reg_check_named_buff_matched returns 0 for no match */
3825 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3829 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3835 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3837 next = NEXTOPER(NEXTOPER(scan));
3839 next = scan + ARG(scan);
3840 if (OP(next) == IFTHEN) /* Fake one. */
3841 next = NEXTOPER(NEXTOPER(next));
3845 logical = scan->flags;
3848 /*******************************************************************
3850 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3851 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3852 STAR/PLUS/CURLY/CURLYN are used instead.)
3854 A*B is compiled as <CURLYX><A><WHILEM><B>
3856 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3857 state, which contains the current count, initialised to -1. It also sets
3858 cur_curlyx to point to this state, with any previous value saved in the
3861 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3862 since the pattern may possibly match zero times (i.e. it's a while {} loop
3863 rather than a do {} while loop).
3865 Each entry to WHILEM represents a successful match of A. The count in the
3866 CURLYX block is incremented, another WHILEM state is pushed, and execution
3867 passes to A or B depending on greediness and the current count.
3869 For example, if matching against the string a1a2a3b (where the aN are
3870 substrings that match /A/), then the match progresses as follows: (the
3871 pushed states are interspersed with the bits of strings matched so far):
3874 <CURLYX cnt=0><WHILEM>
3875 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3876 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3877 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3878 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3880 (Contrast this with something like CURLYM, which maintains only a single
3884 a1 <CURLYM cnt=1> a2
3885 a1 a2 <CURLYM cnt=2> a3
3886 a1 a2 a3 <CURLYM cnt=3> b
3889 Each WHILEM state block marks a point to backtrack to upon partial failure
3890 of A or B, and also contains some minor state data related to that
3891 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3892 overall state, such as the count, and pointers to the A and B ops.
3894 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3895 must always point to the *current* CURLYX block, the rules are:
3897 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3898 and set cur_curlyx to point the new block.
3900 When popping the CURLYX block after a successful or unsuccessful match,
3901 restore the previous cur_curlyx.
3903 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3904 to the outer one saved in the CURLYX block.
3906 When popping the WHILEM block after a successful or unsuccessful B match,
3907 restore the previous cur_curlyx.
3909 Here's an example for the pattern (AI* BI)*BO
3910 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3913 curlyx backtrack stack
3914 ------ ---------------
3916 CO <CO prev=NULL> <WO>
3917 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3918 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3919 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3921 At this point the pattern succeeds, and we work back down the stack to
3922 clean up, restoring as we go:
3924 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3925 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3926 CO <CO prev=NULL> <WO>
3929 *******************************************************************/
3931 #define ST st->u.curlyx
3933 case CURLYX: /* start of /A*B/ (for complex A) */
3935 /* No need to save/restore up to this paren */
3936 I32 parenfloor = scan->flags;
3938 assert(next); /* keep Coverity happy */
3939 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3942 /* XXXX Probably it is better to teach regpush to support
3943 parenfloor > PL_regsize... */
3944 if (parenfloor > (I32)*PL_reglastparen)
3945 parenfloor = *PL_reglastparen; /* Pessimization... */
3947 ST.prev_curlyx= cur_curlyx;
3949 ST.cp = PL_savestack_ix;
3951 /* these fields contain the state of the current curly.
3952 * they are accessed by subsequent WHILEMs */
3953 ST.parenfloor = parenfloor;
3954 ST.min = ARG1(scan);
3955 ST.max = ARG2(scan);
3956 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3960 ST.count = -1; /* this will be updated by WHILEM */
3961 ST.lastloc = NULL; /* this will be updated by WHILEM */
3963 PL_reginput = locinput;
3964 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3968 case CURLYX_end: /* just finished matching all of A*B */
3969 if (PL_reg_eval_set){
3970 SV *pres= GvSV(PL_replgv);
3973 sv_setsv(GvSV(PL_replgv), pres);
3978 cur_curlyx = ST.prev_curlyx;
3982 case CURLYX_end_fail: /* just failed to match all of A*B */
3984 cur_curlyx = ST.prev_curlyx;
3990 #define ST st->u.whilem
3992 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3994 /* see the discussion above about CURLYX/WHILEM */
3996 assert(cur_curlyx); /* keep Coverity happy */
3997 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3998 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3999 ST.cache_offset = 0;
4002 PL_reginput = locinput;
4004 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4005 "%*s whilem: matched %ld out of %ld..%ld\n",
4006 REPORT_CODE_OFF+depth*2, "", (long)n,
4007 (long)cur_curlyx->u.curlyx.min,
4008 (long)cur_curlyx->u.curlyx.max)
4011 /* First just match a string of min A's. */
4013 if (n < cur_curlyx->u.curlyx.min) {
4014 cur_curlyx->u.curlyx.lastloc = locinput;
4015 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4019 /* If degenerate A matches "", assume A done. */
4021 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4022 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4023 "%*s whilem: empty match detected, trying continuation...\n",
4024 REPORT_CODE_OFF+depth*2, "")
4026 goto do_whilem_B_max;
4029 /* super-linear cache processing */
4033 if (!PL_reg_maxiter) {
4034 /* start the countdown: Postpone detection until we
4035 * know the match is not *that* much linear. */
4036 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4037 /* possible overflow for long strings and many CURLYX's */
4038 if (PL_reg_maxiter < 0)
4039 PL_reg_maxiter = I32_MAX;
4040 PL_reg_leftiter = PL_reg_maxiter;
4043 if (PL_reg_leftiter-- == 0) {
4044 /* initialise cache */
4045 const I32 size = (PL_reg_maxiter + 7)/8;
4046 if (PL_reg_poscache) {
4047 if ((I32)PL_reg_poscache_size < size) {
4048 Renew(PL_reg_poscache, size, char);
4049 PL_reg_poscache_size = size;
4051 Zero(PL_reg_poscache, size, char);
4054 PL_reg_poscache_size = size;
4055 Newxz(PL_reg_poscache, size, char);
4057 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4058 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4059 PL_colors[4], PL_colors[5])
4063 if (PL_reg_leftiter < 0) {
4064 /* have we already failed at this position? */
4066 offset = (scan->flags & 0xf) - 1
4067 + (locinput - PL_bostr) * (scan->flags>>4);
4068 mask = 1 << (offset % 8);
4070 if (PL_reg_poscache[offset] & mask) {
4071 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4072 "%*s whilem: (cache) already tried at this position...\n",
4073 REPORT_CODE_OFF+depth*2, "")
4075 sayNO; /* cache records failure */
4077 ST.cache_offset = offset;
4078 ST.cache_mask = mask;
4082 /* Prefer B over A for minimal matching. */
4084 if (cur_curlyx->u.curlyx.minmod) {
4085 ST.save_curlyx = cur_curlyx;
4086 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4087 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4088 REGCP_SET(ST.lastcp);
4089 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4093 /* Prefer A over B for maximal matching. */
4095 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4096 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4097 cur_curlyx->u.curlyx.lastloc = locinput;
4098 REGCP_SET(ST.lastcp);
4099 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4102 goto do_whilem_B_max;
4106 case WHILEM_B_min: /* just matched B in a minimal match */
4107 case WHILEM_B_max: /* just matched B in a maximal match */
4108 cur_curlyx = ST.save_curlyx;
4112 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4113 cur_curlyx = ST.save_curlyx;
4114 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4115 cur_curlyx->u.curlyx.count--;
4119 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4120 REGCP_UNWIND(ST.lastcp);
4123 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4124 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4125 cur_curlyx->u.curlyx.count--;
4129 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4130 REGCP_UNWIND(ST.lastcp);
4131 regcppop(rex); /* Restore some previous $<digit>s? */
4132 PL_reginput = locinput;
4133 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4134 "%*s whilem: failed, trying continuation...\n",
4135 REPORT_CODE_OFF+depth*2, "")
4138 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4139 && ckWARN(WARN_REGEXP)
4140 && !(PL_reg_flags & RF_warned))
4142 PL_reg_flags |= RF_warned;
4143 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4144 "Complex regular subexpression recursion",
4149 ST.save_curlyx = cur_curlyx;
4150 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4151 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4154 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4155 cur_curlyx = ST.save_curlyx;
4156 REGCP_UNWIND(ST.lastcp);
4159 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4160 /* Maximum greed exceeded */
4161 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4162 && ckWARN(WARN_REGEXP)
4163 && !(PL_reg_flags & RF_warned))
4165 PL_reg_flags |= RF_warned;
4166 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4167 "%s limit (%d) exceeded",
4168 "Complex regular subexpression recursion",
4171 cur_curlyx->u.curlyx.count--;
4175 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4176 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4178 /* Try grabbing another A and see if it helps. */
4179 PL_reginput = locinput;
4180 cur_curlyx->u.curlyx.lastloc = locinput;
4181 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4182 REGCP_SET(ST.lastcp);
4183 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4187 #define ST st->u.branch
4189 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4190 next = scan + ARG(scan);
4193 scan = NEXTOPER(scan);
4196 case BRANCH: /* /(...|A|...)/ */
4197 scan = NEXTOPER(scan); /* scan now points to inner node */
4198 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4201 /* last branch; skip state push and jump direct to node */
4204 ST.lastparen = *PL_reglastparen;
4205 ST.next_branch = next;
4207 PL_reginput = locinput;
4209 /* Now go into the branch */
4211 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4213 PUSH_STATE_GOTO(BRANCH_next, scan);
4217 PL_reginput = locinput;
4218 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4219 (SV*)rexi->data->data[ ARG( scan ) ];
4220 PUSH_STATE_GOTO(CUTGROUP_next,next);
4222 case CUTGROUP_next_fail:
4225 if (st->u.mark.mark_name)
4226 sv_commit = st->u.mark.mark_name;
4232 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4237 REGCP_UNWIND(ST.cp);
4238 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4240 *PL_reglastparen = n;
4241 /*dmq: *PL_reglastcloseparen = n; */
4242 scan = ST.next_branch;
4243 /* no more branches? */
4244 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4246 PerlIO_printf( Perl_debug_log,
4247 "%*s %sBRANCH failed...%s\n",
4248 REPORT_CODE_OFF+depth*2, "",
4254 continue; /* execute next BRANCH[J] op */
4262 #define ST st->u.curlym
4264 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4266 /* This is an optimisation of CURLYX that enables us to push
4267 * only a single backtracking state, no matter now many matches
4268 * there are in {m,n}. It relies on the pattern being constant
4269 * length, with no parens to influence future backrefs
4273 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4275 /* if paren positive, emulate an OPEN/CLOSE around A */
4277 U32 paren = ST.me->flags;
4278 if (paren > PL_regsize)
4280 if (paren > *PL_reglastparen)
4281 *PL_reglastparen = paren;
4282 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4290 ST.c1 = CHRTEST_UNINIT;
4293 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4296 curlym_do_A: /* execute the A in /A{m,n}B/ */
4297 PL_reginput = locinput;
4298 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4301 case CURLYM_A: /* we've just matched an A */
4302 locinput = st->locinput;
4303 nextchr = UCHARAT(locinput);
4306 /* after first match, determine A's length: u.curlym.alen */
4307 if (ST.count == 1) {
4308 if (PL_reg_match_utf8) {
4310 while (s < PL_reginput) {
4316 ST.alen = PL_reginput - locinput;
4319 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4322 PerlIO_printf(Perl_debug_log,
4323 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4324 (int)(REPORT_CODE_OFF+(depth*2)), "",
4325 (IV) ST.count, (IV)ST.alen)
4328 locinput = PL_reginput;
4330 if (cur_eval && cur_eval->u.eval.close_paren &&
4331 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4334 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4335 goto curlym_do_A; /* try to match another A */
4336 goto curlym_do_B; /* try to match B */
4338 case CURLYM_A_fail: /* just failed to match an A */
4339 REGCP_UNWIND(ST.cp);
4341 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4342 || (cur_eval && cur_eval->u.eval.close_paren &&
4343 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4346 curlym_do_B: /* execute the B in /A{m,n}B/ */
4347 PL_reginput = locinput;
4348 if (ST.c1 == CHRTEST_UNINIT) {
4349 /* calculate c1 and c2 for possible match of 1st char
4350 * following curly */
4351 ST.c1 = ST.c2 = CHRTEST_VOID;
4352 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4353 regnode *text_node = ST.B;
4354 if (! HAS_TEXT(text_node))
4355 FIND_NEXT_IMPT(text_node);
4358 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4360 But the former is redundant in light of the latter.
4362 if this changes back then the macro for
4363 IS_TEXT and friends need to change.
4365 if (PL_regkind[OP(text_node)] == EXACT)
4368 ST.c1 = (U8)*STRING(text_node);
4370 (IS_TEXTF(text_node))
4372 : (IS_TEXTFL(text_node))
4373 ? PL_fold_locale[ST.c1]
4380 PerlIO_printf(Perl_debug_log,
4381 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4382 (int)(REPORT_CODE_OFF+(depth*2)),
4385 if (ST.c1 != CHRTEST_VOID
4386 && UCHARAT(PL_reginput) != ST.c1
4387 && UCHARAT(PL_reginput) != ST.c2)
4389 /* simulate B failing */
4391 PerlIO_printf(Perl_debug_log,
4392 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4393 (int)(REPORT_CODE_OFF+(depth*2)),"",
4396 state_num = CURLYM_B_fail;
4397 goto reenter_switch;
4401 /* mark current A as captured */
4402 I32 paren = ST.me->flags;
4405 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4406 PL_regendp[paren] = PL_reginput - PL_bostr;
4407 /*dmq: *PL_reglastcloseparen = paren; */
4410 PL_regendp[paren] = -1;
4411 if (cur_eval && cur_eval->u.eval.close_paren &&
4412 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4421 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4424 case CURLYM_B_fail: /* just failed to match a B */
4425 REGCP_UNWIND(ST.cp);
4427 if (ST.count == ARG2(ST.me) /* max */)
4429 goto curlym_do_A; /* try to match a further A */
4431 /* backtrack one A */
4432 if (ST.count == ARG1(ST.me) /* min */)
4435 locinput = HOPc(locinput, -ST.alen);
4436 goto curlym_do_B; /* try to match B */
4439 #define ST st->u.curly
4441 #define CURLY_SETPAREN(paren, success) \
4444 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4445 PL_regendp[paren] = locinput - PL_bostr; \
4446 *PL_reglastcloseparen = paren; \
4449 PL_regendp[paren] = -1; \
4452 case STAR: /* /A*B/ where A is width 1 */
4456 scan = NEXTOPER(scan);
4458 case PLUS: /* /A+B/ where A is width 1 */
4462 scan = NEXTOPER(scan);
4464 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4465 ST.paren = scan->flags; /* Which paren to set */
4466 if (ST.paren > PL_regsize)
4467 PL_regsize = ST.paren;
4468 if (ST.paren > *PL_reglastparen)
4469 *PL_reglastparen = ST.paren;
4470 ST.min = ARG1(scan); /* min to match */
4471 ST.max = ARG2(scan); /* max to match */
4472 if (cur_eval && cur_eval->u.eval.close_paren &&
4473 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4477 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4479 case CURLY: /* /A{m,n}B/ where A is width 1 */
4481 ST.min = ARG1(scan); /* min to match */
4482 ST.max = ARG2(scan); /* max to match */
4483 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4486 * Lookahead to avoid useless match attempts
4487 * when we know what character comes next.
4489 * Used to only do .*x and .*?x, but now it allows
4490 * for )'s, ('s and (?{ ... })'s to be in the way
4491 * of the quantifier and the EXACT-like node. -- japhy
4494 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4496 if (HAS_TEXT(next) || JUMPABLE(next)) {
4498 regnode *text_node = next;
4500 if (! HAS_TEXT(text_node))
4501 FIND_NEXT_IMPT(text_node);
4503 if (! HAS_TEXT(text_node))
4504 ST.c1 = ST.c2 = CHRTEST_VOID;
4506 if ( PL_regkind[OP(text_node)] != EXACT ) {
4507 ST.c1 = ST.c2 = CHRTEST_VOID;
4508 goto assume_ok_easy;
4511 s = (U8*)STRING(text_node);
4513 /* Currently we only get here when
4515 PL_rekind[OP(text_node)] == EXACT
4517 if this changes back then the macro for IS_TEXT and
4518 friends need to change. */
4521 if (IS_TEXTF(text_node))
4522 ST.c2 = PL_fold[ST.c1];
4523 else if (IS_TEXTFL(text_node))
4524 ST.c2 = PL_fold_locale[ST.c1];
4527 if (IS_TEXTF(text_node)) {
4528 STRLEN ulen1, ulen2;
4529 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4530 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4532 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4533 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4535 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4537 0 : UTF8_ALLOW_ANY);
4538 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4540 0 : UTF8_ALLOW_ANY);
4542 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4544 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4549 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4556 ST.c1 = ST.c2 = CHRTEST_VOID;
4561 PL_reginput = locinput;
4564 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4567 locinput = PL_reginput;
4569 if (ST.c1 == CHRTEST_VOID)
4570 goto curly_try_B_min;
4572 ST.oldloc = locinput;
4574 /* set ST.maxpos to the furthest point along the
4575 * string that could possibly match */
4576 if (ST.max == REG_INFTY) {
4577 ST.maxpos = PL_regeol - 1;
4579 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4583 int m = ST.max - ST.min;
4584 for (ST.maxpos = locinput;
4585 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4586 ST.maxpos += UTF8SKIP(ST.maxpos);
4589 ST.maxpos = locinput + ST.max - ST.min;
4590 if (ST.maxpos >= PL_regeol)
4591 ST.maxpos = PL_regeol - 1;
4593 goto curly_try_B_min_known;
4597 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4598 locinput = PL_reginput;
4599 if (ST.count < ST.min)
4601 if ((ST.count > ST.min)
4602 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4604 /* A{m,n} must come at the end of the string, there's
4605 * no point in backing off ... */
4607 /* ...except that $ and \Z can match before *and* after
4608 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4609 We may back off by one in this case. */
4610 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4614 goto curly_try_B_max;
4619 case CURLY_B_min_known_fail:
4620 /* failed to find B in a non-greedy match where c1,c2 valid */
4621 if (ST.paren && ST.count)
4622 PL_regendp[ST.paren] = -1;
4624 PL_reginput = locinput; /* Could be reset... */
4625 REGCP_UNWIND(ST.cp);
4626 /* Couldn't or didn't -- move forward. */
4627 ST.oldloc = locinput;
4629 locinput += UTF8SKIP(locinput);
4633 curly_try_B_min_known:
4634 /* find the next place where 'B' could work, then call B */
4638 n = (ST.oldloc == locinput) ? 0 : 1;
4639 if (ST.c1 == ST.c2) {
4641 /* set n to utf8_distance(oldloc, locinput) */
4642 while (locinput <= ST.maxpos &&
4643 utf8n_to_uvchr((U8*)locinput,
4644 UTF8_MAXBYTES, &len,
4645 uniflags) != (UV)ST.c1) {
4651 /* set n to utf8_distance(oldloc, locinput) */
4652 while (locinput <= ST.maxpos) {
4654 const UV c = utf8n_to_uvchr((U8*)locinput,
4655 UTF8_MAXBYTES, &len,
4657 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4665 if (ST.c1 == ST.c2) {
4666 while (locinput <= ST.maxpos &&
4667 UCHARAT(locinput) != ST.c1)
4671 while (locinput <= ST.maxpos
4672 && UCHARAT(locinput) != ST.c1
4673 && UCHARAT(locinput) != ST.c2)
4676 n = locinput - ST.oldloc;
4678 if (locinput > ST.maxpos)
4680 /* PL_reginput == oldloc now */
4683 if (regrepeat(rex, ST.A, n, depth) < n)
4686 PL_reginput = locinput;
4687 CURLY_SETPAREN(ST.paren, ST.count);
4688 if (cur_eval && cur_eval->u.eval.close_paren &&
4689 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4692 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4697 case CURLY_B_min_fail:
4698 /* failed to find B in a non-greedy match where c1,c2 invalid */
4699 if (ST.paren && ST.count)
4700 PL_regendp[ST.paren] = -1;
4702 REGCP_UNWIND(ST.cp);
4703 /* failed -- move forward one */
4704 PL_reginput = locinput;
4705 if (regrepeat(rex, ST.A, 1, depth)) {
4707 locinput = PL_reginput;
4708 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4709 ST.count > 0)) /* count overflow ? */
4712 CURLY_SETPAREN(ST.paren, ST.count);
4713 if (cur_eval && cur_eval->u.eval.close_paren &&
4714 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4717 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4725 /* a successful greedy match: now try to match B */
4726 if (cur_eval && cur_eval->u.eval.close_paren &&
4727 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4732 if (ST.c1 != CHRTEST_VOID)
4733 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4734 UTF8_MAXBYTES, 0, uniflags)
4735 : (UV) UCHARAT(PL_reginput);
4736 /* If it could work, try it. */
4737 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4738 CURLY_SETPAREN(ST.paren, ST.count);
4739 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4744 case CURLY_B_max_fail:
4745 /* failed to find B in a greedy match */
4746 if (ST.paren && ST.count)
4747 PL_regendp[ST.paren] = -1;
4749 REGCP_UNWIND(ST.cp);
4751 if (--ST.count < ST.min)
4753 PL_reginput = locinput = HOPc(locinput, -1);
4754 goto curly_try_B_max;
4761 /* we've just finished A in /(??{A})B/; now continue with B */
4763 st->u.eval.toggle_reg_flags
4764 = cur_eval->u.eval.toggle_reg_flags;
4765 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4767 st->u.eval.prev_rex = rex; /* inner */
4768 SETREX(rex,cur_eval->u.eval.prev_rex);
4769 rexi = RXi_GET(rex);
4770 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4772 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4773 REGCP_SET(st->u.eval.lastcp);
4774 PL_reginput = locinput;
4776 /* Restore parens of the outer rex without popping the
4778 tmpix = PL_savestack_ix;
4779 PL_savestack_ix = cur_eval->u.eval.lastcp;
4781 PL_savestack_ix = tmpix;
4783 st->u.eval.prev_eval = cur_eval;
4784 cur_eval = cur_eval->u.eval.prev_eval;
4786 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4787 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4788 if ( nochange_depth )
4791 PUSH_YES_STATE_GOTO(EVAL_AB,
4792 st->u.eval.prev_eval->u.eval.B); /* match B */
4795 if (locinput < reginfo->till) {
4796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4797 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4799 (long)(locinput - PL_reg_starttry),
4800 (long)(reginfo->till - PL_reg_starttry),
4803 sayNO_SILENT; /* Cannot match: too short. */
4805 PL_reginput = locinput; /* put where regtry can find it */
4806 sayYES; /* Success! */
4808 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4810 PerlIO_printf(Perl_debug_log,
4811 "%*s %ssubpattern success...%s\n",
4812 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4813 PL_reginput = locinput; /* put where regtry can find it */
4814 sayYES; /* Success! */
4817 #define ST st->u.ifmatch
4819 case SUSPEND: /* (?>A) */
4821 PL_reginput = locinput;
4824 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4826 goto ifmatch_trivial_fail_test;
4828 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4830 ifmatch_trivial_fail_test:
4832 char * const s = HOPBACKc(locinput, scan->flags);
4837 sw = 1 - (bool)ST.wanted;
4841 next = scan + ARG(scan);
4849 PL_reginput = locinput;
4853 ST.logical = logical;
4854 /* execute body of (?...A) */
4855 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4858 case IFMATCH_A_fail: /* body of (?...A) failed */
4859 ST.wanted = !ST.wanted;
4862 case IFMATCH_A: /* body of (?...A) succeeded */
4864 sw = (bool)ST.wanted;
4866 else if (!ST.wanted)
4869 if (OP(ST.me) == SUSPEND)
4870 locinput = PL_reginput;
4872 locinput = PL_reginput = st->locinput;
4873 nextchr = UCHARAT(locinput);
4875 scan = ST.me + ARG(ST.me);
4878 continue; /* execute B */
4883 next = scan + ARG(scan);
4888 reginfo->cutpoint = PL_regeol;
4891 PL_reginput = locinput;
4893 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
4894 PUSH_STATE_GOTO(COMMIT_next,next);
4896 case COMMIT_next_fail:
4903 #define ST st->u.mark
4905 ST.prev_mark = mark_state;
4906 ST.mark_name = sv_commit = sv_yes_mark
4907 = (SV*)rexi->data->data[ ARG( scan ) ];
4909 ST.mark_loc = PL_reginput = locinput;
4910 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4912 case MARKPOINT_next:
4913 mark_state = ST.prev_mark;
4916 case MARKPOINT_next_fail:
4917 if (popmark && sv_eq(ST.mark_name,popmark))
4919 if (ST.mark_loc > startpoint)
4920 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4921 popmark = NULL; /* we found our mark */
4922 sv_commit = ST.mark_name;
4925 PerlIO_printf(Perl_debug_log,
4926 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4927 REPORT_CODE_OFF+depth*2, "",
4928 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
4931 mark_state = ST.prev_mark;
4932 sv_yes_mark = mark_state ?
4933 mark_state->u.mark.mark_name : NULL;
4937 PL_reginput = locinput;
4939 /* (*SKIP) : if we fail we cut here*/
4940 ST.mark_name = NULL;
4941 ST.mark_loc = locinput;
4942 PUSH_STATE_GOTO(SKIP_next,next);
4944 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
4945 otherwise do nothing. Meaning we need to scan
4947 regmatch_state *cur = mark_state;
4948 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
4951 if ( sv_eq( cur->u.mark.mark_name,
4954 ST.mark_name = find;
4955 PUSH_STATE_GOTO( SKIP_next, next );
4957 cur = cur->u.mark.prev_mark;
4960 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
4962 case SKIP_next_fail:
4964 /* (*CUT:NAME) - Set up to search for the name as we
4965 collapse the stack*/
4966 popmark = ST.mark_name;
4968 /* (*CUT) - No name, we cut here.*/
4969 if (ST.mark_loc > startpoint)
4970 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4971 /* but we set sv_commit to latest mark_name if there
4972 is one so they can test to see how things lead to this
4975 sv_commit=mark_state->u.mark.mark_name;
4983 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4984 PTR2UV(scan), OP(scan));
4985 Perl_croak(aTHX_ "regexp memory corruption");
4989 /* switch break jumps here */
4990 scan = next; /* prepare to execute the next op and ... */
4991 continue; /* ... jump back to the top, reusing st */
4995 /* push a state that backtracks on success */
4996 st->u.yes.prev_yes_state = yes_state;
5000 /* push a new regex state, then continue at scan */
5002 regmatch_state *newst;
5005 regmatch_state *cur = st;
5006 regmatch_state *curyes = yes_state;
5008 regmatch_slab *slab = PL_regmatch_slab;
5009 for (;curd > -1;cur--,curd--) {
5010 if (cur < SLAB_FIRST(slab)) {
5012 cur = SLAB_LAST(slab);
5014 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5015 REPORT_CODE_OFF + 2 + depth * 2,"",
5016 curd, PL_reg_name[cur->resume_state],
5017 (curyes == cur) ? "yes" : ""
5020 curyes = cur->u.yes.prev_yes_state;
5023 DEBUG_STATE_pp("push")
5026 st->locinput = locinput;
5028 if (newst > SLAB_LAST(PL_regmatch_slab))
5029 newst = S_push_slab(aTHX);
5030 PL_regmatch_state = newst;
5032 locinput = PL_reginput;
5033 nextchr = UCHARAT(locinput);
5041 * We get here only if there's trouble -- normally "case END" is
5042 * the terminating point.
5044 Perl_croak(aTHX_ "corrupted regexp pointers");
5050 /* we have successfully completed a subexpression, but we must now
5051 * pop to the state marked by yes_state and continue from there */
5052 assert(st != yes_state);
5054 while (st != yes_state) {
5056 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5057 PL_regmatch_slab = PL_regmatch_slab->prev;
5058 st = SLAB_LAST(PL_regmatch_slab);
5062 DEBUG_STATE_pp("pop (no final)");
5064 DEBUG_STATE_pp("pop (yes)");
5070 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5071 || yes_state > SLAB_LAST(PL_regmatch_slab))
5073 /* not in this slab, pop slab */
5074 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5075 PL_regmatch_slab = PL_regmatch_slab->prev;
5076 st = SLAB_LAST(PL_regmatch_slab);
5078 depth -= (st - yes_state);
5081 yes_state = st->u.yes.prev_yes_state;
5082 PL_regmatch_state = st;
5085 locinput= st->locinput;
5086 nextchr = UCHARAT(locinput);
5088 state_num = st->resume_state + no_final;
5089 goto reenter_switch;
5092 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5093 PL_colors[4], PL_colors[5]));
5100 PerlIO_printf(Perl_debug_log,
5101 "%*s %sfailed...%s\n",
5102 REPORT_CODE_OFF+depth*2, "",
5103 PL_colors[4], PL_colors[5])
5115 /* there's a previous state to backtrack to */
5117 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5118 PL_regmatch_slab = PL_regmatch_slab->prev;
5119 st = SLAB_LAST(PL_regmatch_slab);
5121 PL_regmatch_state = st;
5122 locinput= st->locinput;
5123 nextchr = UCHARAT(locinput);
5125 DEBUG_STATE_pp("pop");
5127 if (yes_state == st)
5128 yes_state = st->u.yes.prev_yes_state;
5130 state_num = st->resume_state + 1; /* failure = success + 1 */
5131 goto reenter_switch;
5136 if (rex->intflags & PREGf_VERBARG_SEEN) {
5137 SV *sv_err = get_sv("REGERROR", 1);
5138 SV *sv_mrk = get_sv("REGMARK", 1);
5140 sv_commit = &PL_sv_no;
5142 sv_yes_mark = &PL_sv_yes;
5145 sv_commit = &PL_sv_yes;
5146 sv_yes_mark = &PL_sv_no;
5148 sv_setsv(sv_err, sv_commit);
5149 sv_setsv(sv_mrk, sv_yes_mark);
5151 /* restore original high-water mark */
5152 PL_regmatch_slab = orig_slab;
5153 PL_regmatch_state = orig_state;
5155 /* free all slabs above current one */
5156 if (orig_slab->next) {
5157 regmatch_slab *sl = orig_slab->next;
5158 orig_slab->next = NULL;
5160 regmatch_slab * const osl = sl;
5170 - regrepeat - repeatedly match something simple, report how many
5173 * [This routine now assumes that it will only match on things of length 1.
5174 * That was true before, but now we assume scan - reginput is the count,
5175 * rather than incrementing count on every character. [Er, except utf8.]]
5178 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5181 register char *scan;
5183 register char *loceol = PL_regeol;
5184 register I32 hardcount = 0;
5185 register bool do_utf8 = PL_reg_match_utf8;
5187 PERL_UNUSED_ARG(depth);
5191 if (max == REG_INFTY)
5193 else if (max < loceol - scan)
5194 loceol = scan + max;
5199 while (scan < loceol && hardcount < max && *scan != '\n') {
5200 scan += UTF8SKIP(scan);
5204 while (scan < loceol && *scan != '\n')
5211 while (scan < loceol && hardcount < max) {
5212 scan += UTF8SKIP(scan);
5222 case EXACT: /* length of string is 1 */
5224 while (scan < loceol && UCHARAT(scan) == c)
5227 case EXACTF: /* length of string is 1 */
5229 while (scan < loceol &&
5230 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5233 case EXACTFL: /* length of string is 1 */
5234 PL_reg_flags |= RF_tainted;
5236 while (scan < loceol &&
5237 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5243 while (hardcount < max && scan < loceol &&
5244 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5245 scan += UTF8SKIP(scan);
5249 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5256 LOAD_UTF8_CHARCLASS_ALNUM();
5257 while (hardcount < max && scan < loceol &&
5258 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5259 scan += UTF8SKIP(scan);
5263 while (scan < loceol && isALNUM(*scan))
5268 PL_reg_flags |= RF_tainted;
5271 while (hardcount < max && scan < loceol &&
5272 isALNUM_LC_utf8((U8*)scan)) {
5273 scan += UTF8SKIP(scan);
5277 while (scan < loceol && isALNUM_LC(*scan))
5284 LOAD_UTF8_CHARCLASS_ALNUM();
5285 while (hardcount < max && scan < loceol &&
5286 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5287 scan += UTF8SKIP(scan);
5291 while (scan < loceol && !isALNUM(*scan))
5296 PL_reg_flags |= RF_tainted;
5299 while (hardcount < max && scan < loceol &&
5300 !isALNUM_LC_utf8((U8*)scan)) {
5301 scan += UTF8SKIP(scan);
5305 while (scan < loceol && !isALNUM_LC(*scan))
5312 LOAD_UTF8_CHARCLASS_SPACE();
5313 while (hardcount < max && scan < loceol &&
5315 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5316 scan += UTF8SKIP(scan);
5320 while (scan < loceol && isSPACE(*scan))
5325 PL_reg_flags |= RF_tainted;
5328 while (hardcount < max && scan < loceol &&
5329 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5330 scan += UTF8SKIP(scan);
5334 while (scan < loceol && isSPACE_LC(*scan))
5341 LOAD_UTF8_CHARCLASS_SPACE();
5342 while (hardcount < max && scan < loceol &&
5344 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5345 scan += UTF8SKIP(scan);
5349 while (scan < loceol && !isSPACE(*scan))
5354 PL_reg_flags |= RF_tainted;
5357 while (hardcount < max && scan < loceol &&
5358 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5359 scan += UTF8SKIP(scan);
5363 while (scan < loceol && !isSPACE_LC(*scan))
5370 LOAD_UTF8_CHARCLASS_DIGIT();
5371 while (hardcount < max && scan < loceol &&
5372 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5373 scan += UTF8SKIP(scan);
5377 while (scan < loceol && isDIGIT(*scan))
5384 LOAD_UTF8_CHARCLASS_DIGIT();
5385 while (hardcount < max && scan < loceol &&
5386 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5387 scan += UTF8SKIP(scan);
5391 while (scan < loceol && !isDIGIT(*scan))
5395 default: /* Called on something of 0 width. */
5396 break; /* So match right here or not at all. */
5402 c = scan - PL_reginput;
5406 GET_RE_DEBUG_FLAGS_DECL;
5408 SV * const prop = sv_newmortal();
5409 regprop(prog, prop, p);
5410 PerlIO_printf(Perl_debug_log,
5411 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5412 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5420 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5422 - regclass_swash - prepare the utf8 swash
5426 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5432 RXi_GET_DECL(prog,progi);
5433 const struct reg_data * const data = prog ? progi->data : NULL;
5435 if (data && data->count) {
5436 const U32 n = ARG(node);
5438 if (data->what[n] == 's') {
5439 SV * const rv = (SV*)data->data[n];
5440 AV * const av = (AV*)SvRV((SV*)rv);
5441 SV **const ary = AvARRAY(av);
5444 /* See the end of regcomp.c:S_regclass() for
5445 * documentation of these array elements. */
5448 a = SvROK(ary[1]) ? &ary[1] : 0;
5449 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5453 else if (si && doinit) {
5454 sw = swash_init("utf8", "", si, 1, 0);
5455 (void)av_store(av, 1, sw);
5472 - reginclass - determine if a character falls into a character class
5474 The n is the ANYOF regnode, the p is the target string, lenp
5475 is pointer to the maximum length of how far to go in the p
5476 (if the lenp is zero, UTF8SKIP(p) is used),
5477 do_utf8 tells whether the target string is in UTF-8.
5482 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5485 const char flags = ANYOF_FLAGS(n);
5491 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5492 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5493 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5494 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5495 if (len == (STRLEN)-1)
5496 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5499 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5500 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5503 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5504 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5507 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5511 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5514 if (swash_fetch(sw, p, do_utf8))
5516 else if (flags & ANYOF_FOLD) {
5517 if (!match && lenp && av) {
5519 for (i = 0; i <= av_len(av); i++) {
5520 SV* const sv = *av_fetch(av, i, FALSE);
5522 const char * const s = SvPV_const(sv, len);
5524 if (len <= plen && memEQ(s, (char*)p, len)) {
5532 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5535 to_utf8_fold(p, tmpbuf, &tmplen);
5536 if (swash_fetch(sw, tmpbuf, do_utf8))
5542 if (match && lenp && *lenp == 0)
5543 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5545 if (!match && c < 256) {
5546 if (ANYOF_BITMAP_TEST(n, c))
5548 else if (flags & ANYOF_FOLD) {
5551 if (flags & ANYOF_LOCALE) {
5552 PL_reg_flags |= RF_tainted;
5553 f = PL_fold_locale[c];
5557 if (f != c && ANYOF_BITMAP_TEST(n, f))
5561 if (!match && (flags & ANYOF_CLASS)) {
5562 PL_reg_flags |= RF_tainted;
5564 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5565 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5566 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5567 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5568 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5569 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5570 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5571 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5572 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5573 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5574 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5575 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5576 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5577 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5578 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5579 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5580 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5581 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5582 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5583 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5584 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5585 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5586 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5587 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5588 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5589 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5590 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5591 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5592 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5593 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5594 ) /* How's that for a conditional? */
5601 return (flags & ANYOF_INVERT) ? !match : match;
5605 S_reghop3(U8 *s, I32 off, const U8* lim)
5609 while (off-- && s < lim) {
5610 /* XXX could check well-formedness here */
5615 while (off++ && s > lim) {
5617 if (UTF8_IS_CONTINUED(*s)) {
5618 while (s > lim && UTF8_IS_CONTINUATION(*s))
5621 /* XXX could check well-formedness here */
5628 /* there are a bunch of places where we use two reghop3's that should
5629 be replaced with this routine. but since thats not done yet
5630 we ifdef it out - dmq
5633 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5637 while (off-- && s < rlim) {
5638 /* XXX could check well-formedness here */
5643 while (off++ && s > llim) {
5645 if (UTF8_IS_CONTINUED(*s)) {
5646 while (s > llim && UTF8_IS_CONTINUATION(*s))
5649 /* XXX could check well-formedness here */
5657 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5661 while (off-- && s < lim) {
5662 /* XXX could check well-formedness here */
5669 while (off++ && s > lim) {
5671 if (UTF8_IS_CONTINUED(*s)) {
5672 while (s > lim && UTF8_IS_CONTINUATION(*s))
5675 /* XXX could check well-formedness here */
5684 restore_pos(pTHX_ void *arg)
5687 regexp * const rex = (regexp *)arg;
5688 if (PL_reg_eval_set) {
5689 if (PL_reg_oldsaved) {
5690 rex->subbeg = PL_reg_oldsaved;
5691 rex->sublen = PL_reg_oldsavedlen;
5692 #ifdef PERL_OLD_COPY_ON_WRITE
5693 rex->saved_copy = PL_nrs;
5695 RX_MATCH_COPIED_on(rex);
5697 PL_reg_magic->mg_len = PL_reg_oldpos;
5698 PL_reg_eval_set = 0;
5699 PL_curpm = PL_reg_oldcurpm;
5704 S_to_utf8_substr(pTHX_ register regexp *prog)
5708 if (prog->substrs->data[i].substr
5709 && !prog->substrs->data[i].utf8_substr) {
5710 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5711 prog->substrs->data[i].utf8_substr = sv;
5712 sv_utf8_upgrade(sv);
5713 if (SvVALID(prog->substrs->data[i].substr)) {
5714 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5715 if (flags & FBMcf_TAIL) {
5716 /* Trim the trailing \n that fbm_compile added last
5718 SvCUR_set(sv, SvCUR(sv) - 1);
5719 /* Whilst this makes the SV technically "invalid" (as its
5720 buffer is no longer followed by "\0") when fbm_compile()
5721 adds the "\n" back, a "\0" is restored. */
5723 fbm_compile(sv, flags);
5725 if (prog->substrs->data[i].substr == prog->check_substr)
5726 prog->check_utf8 = sv;
5732 S_to_byte_substr(pTHX_ register regexp *prog)
5737 if (prog->substrs->data[i].utf8_substr
5738 && !prog->substrs->data[i].substr) {
5739 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5740 if (sv_utf8_downgrade(sv, TRUE)) {
5741 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5743 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5744 if (flags & FBMcf_TAIL) {
5745 /* Trim the trailing \n that fbm_compile added last
5747 SvCUR_set(sv, SvCUR(sv) - 1);
5749 fbm_compile(sv, flags);
5755 prog->substrs->data[i].substr = sv;
5756 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5757 prog->check_substr = sv;
5764 * c-indentation-style: bsd
5766 * indent-tabs-mode: t
5769 * ex: set ts=8 sts=4 sw=4 noet: