5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 8
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
172 for (p = PL_regsize; p > parenfloor; p--) {
173 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
174 SSPUSHINT(PL_regendp[p]);
175 SSPUSHINT(PL_regstartp[p]);
176 SSPUSHPTR(PL_reg_start_tmp[p]);
178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
179 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
180 (UV)p, (IV)PL_regstartp[p],
181 (IV)(PL_reg_start_tmp[p] - PL_bostr),
185 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
186 SSPUSHPTR(PL_regstartp);
187 SSPUSHPTR(PL_regendp);
188 SSPUSHINT(PL_regsize);
189 SSPUSHINT(*PL_reglastparen);
190 SSPUSHINT(*PL_reglastcloseparen);
191 SSPUSHPTR(PL_reginput);
192 #define REGCP_FRAME_ELEMS 2
193 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
194 * are needed for the regexp context stack bookkeeping. */
195 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
196 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
201 /* These are needed since we do not localize EVAL nodes: */
202 #define REGCP_SET(cp) \
204 PerlIO_printf(Perl_debug_log, \
205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
206 (IV)PL_savestack_ix)); \
209 #define REGCP_UNWIND(cp) \
211 if (cp != PL_savestack_ix) \
212 PerlIO_printf(Perl_debug_log, \
213 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
214 (IV)(cp), (IV)PL_savestack_ix)); \
218 S_regcppop(pTHX_ const regexp *rex)
224 GET_RE_DEBUG_FLAGS_DECL;
226 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
228 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
229 i = SSPOPINT; /* Parentheses elements to pop. */
230 input = (char *) SSPOPPTR;
231 *PL_reglastcloseparen = SSPOPINT;
232 *PL_reglastparen = SSPOPINT;
233 PL_regsize = SSPOPINT;
234 PL_regendp=(I32 *) SSPOPPTR;
235 PL_regstartp=(I32 *) SSPOPPTR;
238 /* Now restore the parentheses context. */
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
242 U32 paren = (U32)SSPOPINT;
243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
244 PL_regstartp[paren] = SSPOPINT;
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
249 PerlIO_printf(Perl_debug_log,
250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
251 (UV)paren, (IV)PL_regstartp[paren],
252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
253 (IV)PL_regendp[paren],
254 (paren > *PL_reglastparen ? "(no)" : ""));
258 if (*PL_reglastparen + 1 <= rex->nparens) {
259 PerlIO_printf(Perl_debug_log,
260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
261 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
275 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
277 PL_regstartp[i] = -1;
284 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
287 * pregexec and friends
290 #ifndef PERL_IN_XSUB_RE
292 - pregexec - match a regexp against a string
295 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
296 char *strbeg, I32 minend, SV *screamer, U32 nosave)
297 /* strend: pointer to null at end of string */
298 /* strbeg: real beginning of string */
299 /* minend: end of match must be >=minend after stringarg. */
300 /* nosave: For optimizations. */
303 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
304 nosave ? 0 : REXEC_COPY_STR);
309 * Need to implement the following flags for reg_anch:
311 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
313 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
314 * INTUIT_AUTORITATIVE_ML
315 * INTUIT_ONCE_NOML - Intuit can match in one location only.
318 * Another flag for this function: SECOND_TIME (so that float substrs
319 * with giant delta may be not rechecked).
322 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
324 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
325 Otherwise, only SvCUR(sv) is used to get strbeg. */
327 /* XXXX We assume that strpos is strbeg unless sv. */
329 /* XXXX Some places assume that there is a fixed substring.
330 An update may be needed if optimizer marks as "INTUITable"
331 RExen without fixed substrings. Similarly, it is assumed that
332 lengths of all the strings are no more than minlen, thus they
333 cannot come from lookahead.
334 (Or minlen should take into account lookahead.)
335 NOTE: Some of this comment is not correct. minlen does now take account
336 of lookahead/behind. Further research is required. -- demerphq
340 /* A failure to find a constant substring means that there is no need to make
341 an expensive call to REx engine, thus we celebrate a failure. Similarly,
342 finding a substring too deep into the string means that less calls to
343 regtry() should be needed.
345 REx compiler's optimizer found 4 possible hints:
346 a) Anchored substring;
348 c) Whether we are anchored (beginning-of-line or \G);
349 d) First node (of those at offset 0) which may distingush positions;
350 We use a)b)d) and multiline-part of c), and try to find a position in the
351 string which does not contradict any of them.
354 /* Most of decisions we do here should have been done at compile time.
355 The nodes of the REx which we used for the search should have been
356 deleted from the finite automaton. */
359 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
360 char *strend, U32 flags, re_scream_pos_data *data)
363 register I32 start_shift = 0;
364 /* Should be nonnegative! */
365 register I32 end_shift = 0;
370 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
372 register char *other_last = NULL; /* other substr checked before this */
373 char *check_at = NULL; /* check substr found at this pos */
374 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
376 const char * const i_strpos = strpos;
379 GET_RE_DEBUG_FLAGS_DECL;
381 RX_MATCH_UTF8_set(prog,do_utf8);
383 if (prog->extflags & RXf_UTF8) {
384 PL_reg_flags |= RF_utf8;
387 debug_start_match(prog, do_utf8, strpos, strend,
388 sv ? "Guessing start of match in sv for"
389 : "Guessing start of match in string for");
392 /* CHR_DIST() would be more correct here but it makes things slow. */
393 if (prog->minlen > strend - strpos) {
394 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
395 "String too short... [re_intuit_start]\n"));
399 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
402 if (!prog->check_utf8 && prog->check_substr)
403 to_utf8_substr(prog);
404 check = prog->check_utf8;
406 if (!prog->check_substr && prog->check_utf8)
407 to_byte_substr(prog);
408 check = prog->check_substr;
410 if (check == &PL_sv_undef) {
411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
412 "Non-utf8 string cannot match utf8 check string\n"));
415 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
416 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
417 || ( (prog->extflags & RXf_ANCH_BOL)
418 && !multiline ) ); /* Check after \n? */
421 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
422 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
423 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
425 && (strpos != strbeg)) {
426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
429 if (prog->check_offset_min == prog->check_offset_max &&
430 !(prog->extflags & RXf_CANY_SEEN)) {
431 /* Substring at constant offset from beg-of-str... */
434 s = HOP3c(strpos, prog->check_offset_min, strend);
437 slen = SvCUR(check); /* >= 1 */
439 if ( strend - s > slen || strend - s < slen - 1
440 || (strend - s == slen && strend[-1] != '\n')) {
441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
444 /* Now should match s[0..slen-2] */
446 if (slen && (*SvPVX_const(check) != *s
448 && memNE(SvPVX_const(check), s, slen)))) {
450 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
454 else if (*SvPVX_const(check) != *s
455 || ((slen = SvCUR(check)) > 1
456 && memNE(SvPVX_const(check), s, slen)))
459 goto success_at_start;
462 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
464 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
465 end_shift = prog->check_end_shift;
468 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
469 - (SvTAIL(check) != 0);
470 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
472 if (end_shift < eshift)
476 else { /* Can match at random position */
479 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
480 end_shift = prog->check_end_shift;
482 /* end shift should be non negative here */
485 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
487 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
488 (IV)end_shift, prog->precomp);
492 /* Find a possible match in the region s..strend by looking for
493 the "check" substring in the region corrected by start/end_shift. */
496 I32 srch_start_shift = start_shift;
497 I32 srch_end_shift = end_shift;
498 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
499 srch_end_shift -= ((strbeg - s) - srch_start_shift);
500 srch_start_shift = strbeg - s;
502 DEBUG_OPTIMISE_MORE_r({
503 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
504 (IV)prog->check_offset_min,
505 (IV)srch_start_shift,
507 (IV)prog->check_end_shift);
510 if (flags & REXEC_SCREAM) {
511 I32 p = -1; /* Internal iterator of scream. */
512 I32 * const pp = data ? data->scream_pos : &p;
514 if (PL_screamfirst[BmRARE(check)] >= 0
515 || ( BmRARE(check) == '\n'
516 && (BmPREVIOUS(check) == SvCUR(check) - 1)
518 s = screaminstr(sv, check,
519 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
522 /* we may be pointing at the wrong string */
523 if (s && RX_MATCH_COPIED(prog))
524 s = strbeg + (s - SvPVX_const(sv));
526 *data->scream_olds = s;
531 if (prog->extflags & RXf_CANY_SEEN) {
532 start_point= (U8*)(s + srch_start_shift);
533 end_point= (U8*)(strend - srch_end_shift);
535 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
536 end_point= HOP3(strend, -srch_end_shift, strbeg);
538 DEBUG_OPTIMISE_MORE_r({
539 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
540 (int)(end_point - start_point),
541 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
545 s = fbm_instr( start_point, end_point,
546 check, multiline ? FBMrf_MULTILINE : 0);
549 /* Update the count-of-usability, remove useless subpatterns,
553 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
554 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
555 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
556 (s ? "Found" : "Did not find"),
557 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
558 ? "anchored" : "floating"),
561 (s ? " at offset " : "...\n") );
566 /* Finish the diagnostic message */
567 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
569 /* XXX dmq: first branch is for positive lookbehind...
570 Our check string is offset from the beginning of the pattern.
571 So we need to do any stclass tests offset forward from that
580 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
581 Start with the other substr.
582 XXXX no SCREAM optimization yet - and a very coarse implementation
583 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
584 *always* match. Probably should be marked during compile...
585 Probably it is right to do no SCREAM here...
588 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
589 : (prog->float_substr && prog->anchored_substr))
591 /* Take into account the "other" substring. */
592 /* XXXX May be hopelessly wrong for UTF... */
595 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
598 char * const last = HOP3c(s, -start_shift, strbeg);
600 char * const saved_s = s;
603 t = s - prog->check_offset_max;
604 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
606 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
611 t = HOP3c(t, prog->anchored_offset, strend);
612 if (t < other_last) /* These positions already checked */
614 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
617 /* XXXX It is not documented what units *_offsets are in.
618 We assume bytes, but this is clearly wrong.
619 Meaning this code needs to be carefully reviewed for errors.
623 /* On end-of-str: see comment below. */
624 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
625 if (must == &PL_sv_undef) {
627 DEBUG_r(must = prog->anchored_utf8); /* for debug */
632 HOP3(HOP3(last1, prog->anchored_offset, strend)
633 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
635 multiline ? FBMrf_MULTILINE : 0
638 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
639 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
640 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
641 (s ? "Found" : "Contradicts"),
642 quoted, RE_SV_TAIL(must));
647 if (last1 >= last2) {
648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
649 ", giving up...\n"));
652 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
653 ", trying floating at offset %ld...\n",
654 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
655 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
656 s = HOP3c(last, 1, strend);
660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
661 (long)(s - i_strpos)));
662 t = HOP3c(s, -prog->anchored_offset, strbeg);
663 other_last = HOP3c(s, 1, strend);
671 else { /* Take into account the floating substring. */
673 char * const saved_s = s;
676 t = HOP3c(s, -start_shift, strbeg);
678 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
679 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
680 last = HOP3c(t, prog->float_max_offset, strend);
681 s = HOP3c(t, prog->float_min_offset, strend);
684 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
685 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
686 /* fbm_instr() takes into account exact value of end-of-str
687 if the check is SvTAIL(ed). Since false positives are OK,
688 and end-of-str is not later than strend we are OK. */
689 if (must == &PL_sv_undef) {
691 DEBUG_r(must = prog->float_utf8); /* for debug message */
694 s = fbm_instr((unsigned char*)s,
695 (unsigned char*)last + SvCUR(must)
697 must, multiline ? FBMrf_MULTILINE : 0);
699 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
700 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
701 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
702 (s ? "Found" : "Contradicts"),
703 quoted, RE_SV_TAIL(must));
707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
708 ", giving up...\n"));
711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
712 ", trying anchored starting at offset %ld...\n",
713 (long)(saved_s + 1 - i_strpos)));
715 s = HOP3c(t, 1, strend);
719 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
720 (long)(s - i_strpos)));
721 other_last = s; /* Fix this later. --Hugo */
731 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
733 DEBUG_OPTIMISE_MORE_r(
734 PerlIO_printf(Perl_debug_log,
735 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
736 (IV)prog->check_offset_min,
737 (IV)prog->check_offset_max,
745 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
747 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
750 /* Fixed substring is found far enough so that the match
751 cannot start at strpos. */
753 if (ml_anch && t[-1] != '\n') {
754 /* Eventually fbm_*() should handle this, but often
755 anchored_offset is not 0, so this check will not be wasted. */
756 /* XXXX In the code below we prefer to look for "^" even in
757 presence of anchored substrings. And we search even
758 beyond the found float position. These pessimizations
759 are historical artefacts only. */
761 while (t < strend - prog->minlen) {
763 if (t < check_at - prog->check_offset_min) {
764 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
765 /* Since we moved from the found position,
766 we definitely contradict the found anchored
767 substr. Due to the above check we do not
768 contradict "check" substr.
769 Thus we can arrive here only if check substr
770 is float. Redo checking for "other"=="fixed".
773 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
774 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
775 goto do_other_anchored;
777 /* We don't contradict the found floating substring. */
778 /* XXXX Why not check for STCLASS? */
780 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
781 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
784 /* Position contradicts check-string */
785 /* XXXX probably better to look for check-string
786 than for "\n", so one should lower the limit for t? */
787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
788 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
789 other_last = strpos = s = t + 1;
794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
795 PL_colors[0], PL_colors[1]));
799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
800 PL_colors[0], PL_colors[1]));
804 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
807 /* The found string does not prohibit matching at strpos,
808 - no optimization of calling REx engine can be performed,
809 unless it was an MBOL and we are not after MBOL,
810 or a future STCLASS check will fail this. */
812 /* Even in this situation we may use MBOL flag if strpos is offset
813 wrt the start of the string. */
814 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
815 && (strpos != strbeg) && strpos[-1] != '\n'
816 /* May be due to an implicit anchor of m{.*foo} */
817 && !(prog->intflags & PREGf_IMPLICIT))
822 DEBUG_EXECUTE_r( if (ml_anch)
823 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
824 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
827 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
829 prog->check_utf8 /* Could be deleted already */
830 && --BmUSEFUL(prog->check_utf8) < 0
831 && (prog->check_utf8 == prog->float_utf8)
833 prog->check_substr /* Could be deleted already */
834 && --BmUSEFUL(prog->check_substr) < 0
835 && (prog->check_substr == prog->float_substr)
838 /* If flags & SOMETHING - do not do it many times on the same match */
839 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
840 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
841 if (do_utf8 ? prog->check_substr : prog->check_utf8)
842 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
843 prog->check_substr = prog->check_utf8 = NULL; /* disable */
844 prog->float_substr = prog->float_utf8 = NULL; /* clear */
845 check = NULL; /* abort */
847 /* XXXX This is a remnant of the old implementation. It
848 looks wasteful, since now INTUIT can use many
850 prog->extflags &= ~RXf_USE_INTUIT;
857 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
858 /* trie stclasses are too expensive to use here, we are better off to
859 leave it to regmatch itself */
860 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
861 /* minlen == 0 is possible if regstclass is \b or \B,
862 and the fixed substr is ''$.
863 Since minlen is already taken into account, s+1 is before strend;
864 accidentally, minlen >= 1 guaranties no false positives at s + 1
865 even for \b or \B. But (minlen? 1 : 0) below assumes that
866 regstclass does not come from lookahead... */
867 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
868 This leaves EXACTF only, which is dealt with in find_byclass(). */
869 const U8* const str = (U8*)STRING(prog->regstclass);
870 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
871 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
874 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
875 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
876 else if (prog->float_substr || prog->float_utf8)
877 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
882 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
885 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
888 const char *what = NULL;
890 if (endpos == strend) {
891 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
892 "Could not match STCLASS...\n") );
895 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
896 "This position contradicts STCLASS...\n") );
897 if ((prog->extflags & RXf_ANCH) && !ml_anch)
899 /* Contradict one of substrings */
900 if (prog->anchored_substr || prog->anchored_utf8) {
901 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
902 DEBUG_EXECUTE_r( what = "anchored" );
904 s = HOP3c(t, 1, strend);
905 if (s + start_shift + end_shift > strend) {
906 /* XXXX Should be taken into account earlier? */
907 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
908 "Could not match STCLASS...\n") );
913 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
914 "Looking for %s substr starting at offset %ld...\n",
915 what, (long)(s + start_shift - i_strpos)) );
918 /* Have both, check_string is floating */
919 if (t + start_shift >= check_at) /* Contradicts floating=check */
920 goto retry_floating_check;
921 /* Recheck anchored substring, but not floating... */
925 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
926 "Looking for anchored substr starting at offset %ld...\n",
927 (long)(other_last - i_strpos)) );
928 goto do_other_anchored;
930 /* Another way we could have checked stclass at the
931 current position only: */
936 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
937 "Looking for /%s^%s/m starting at offset %ld...\n",
938 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
941 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
943 /* Check is floating subtring. */
944 retry_floating_check:
945 t = check_at - start_shift;
946 DEBUG_EXECUTE_r( what = "floating" );
947 goto hop_and_restart;
950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
951 "By STCLASS: moving %ld --> %ld\n",
952 (long)(t - i_strpos), (long)(s - i_strpos))
956 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
957 "Does not contradict STCLASS...\n");
962 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
963 PL_colors[4], (check ? "Guessed" : "Giving up"),
964 PL_colors[5], (long)(s - i_strpos)) );
967 fail_finish: /* Substring not found */
968 if (prog->check_substr || prog->check_utf8) /* could be removed already */
969 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
971 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
972 PL_colors[4], PL_colors[5]));
978 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
979 foldlen, foldbuf, uniflags) STMT_START { \
980 switch (trie_type) { \
981 case trie_utf8_fold: \
983 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
988 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
989 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
990 foldlen -= UNISKIP( uvc ); \
991 uscan = foldbuf + UNISKIP( uvc ); \
995 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1003 charid = trie->charmap[ uvc ]; \
1007 if (trie->widecharmap) { \
1008 SV** const svpp = hv_fetch(trie->widecharmap, \
1009 (char*)&uvc, sizeof(UV), 0); \
1011 charid = (U16)SvIV(*svpp); \
1016 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1019 ibcmp_utf8(s, NULL, 0, do_utf8, \
1020 m, NULL, ln, (bool)UTF)) \
1021 && (!reginfo || regtry(reginfo, &s)) ) \
1024 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1025 uvchr_to_utf8(tmpbuf, c); \
1026 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1028 && (f == c1 || f == c2) \
1029 && (ln == foldlen || \
1030 !ibcmp_utf8((char *) foldbuf, \
1031 NULL, foldlen, do_utf8, \
1033 NULL, ln, (bool)UTF)) \
1034 && (!reginfo || regtry(reginfo, &s)) ) \
1039 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1043 && (ln == 1 || !(OP(c) == EXACTF \
1045 : ibcmp_locale(s, m, ln))) \
1046 && (!reginfo || regtry(reginfo, &s)) ) \
1052 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1054 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1060 #define REXEC_FBC_SCAN(CoDe) \
1062 while (s < strend) { \
1068 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1069 REXEC_FBC_UTF8_SCAN( \
1071 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1080 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1083 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1092 #define REXEC_FBC_TRYIT \
1093 if ((!reginfo || regtry(reginfo, &s))) \
1096 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1099 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1102 REXEC_FBC_CLASS_SCAN(CoNd); \
1106 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1107 PL_reg_flags |= RF_tainted; \
1109 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1112 REXEC_FBC_CLASS_SCAN(CoNd); \
1116 #define DUMP_EXEC_POS(li,s,doutf8) \
1117 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1119 /* We know what class REx starts with. Try to find this position... */
1120 /* if reginfo is NULL, its a dryrun */
1121 /* annoyingly all the vars in this routine have different names from their counterparts
1122 in regmatch. /grrr */
1125 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1126 const char *strend, regmatch_info *reginfo)
1129 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1133 register STRLEN uskip;
1137 register I32 tmp = 1; /* Scratch variable? */
1138 register const bool do_utf8 = PL_reg_match_utf8;
1140 /* We know what class it must start with. */
1144 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1145 !UTF8_IS_INVARIANT((U8)s[0]) ?
1146 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1147 REGINCLASS(prog, c, (U8*)s));
1150 while (s < strend) {
1153 if (REGINCLASS(prog, c, (U8*)s) ||
1154 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1155 /* The assignment of 2 is intentional:
1156 * for the folded sharp s, the skip is 2. */
1157 (skip = SHARP_S_SKIP))) {
1158 if (tmp && (!reginfo || regtry(reginfo, &s)))
1171 if (tmp && (!reginfo || regtry(reginfo, &s)))
1179 ln = STR_LEN(c); /* length to match in octets/bytes */
1180 lnc = (I32) ln; /* length to match in characters */
1182 STRLEN ulen1, ulen2;
1184 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1185 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1186 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1188 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1189 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1191 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1193 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1196 while (sm < ((U8 *) m + ln)) {
1211 c2 = PL_fold_locale[c1];
1213 e = HOP3c(strend, -((I32)lnc), s);
1215 if (!reginfo && e < s)
1216 e = s; /* Due to minlen logic of intuit() */
1218 /* The idea in the EXACTF* cases is to first find the
1219 * first character of the EXACTF* node and then, if
1220 * necessary, case-insensitively compare the full
1221 * text of the node. The c1 and c2 are the first
1222 * characters (though in Unicode it gets a bit
1223 * more complicated because there are more cases
1224 * than just upper and lower: one needs to use
1225 * the so-called folding case for case-insensitive
1226 * matching (called "loose matching" in Unicode).
1227 * ibcmp_utf8() will do just that. */
1231 U8 tmpbuf [UTF8_MAXBYTES+1];
1232 STRLEN len, foldlen;
1233 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1235 /* Upper and lower of 1st char are equal -
1236 * probably not a "letter". */
1238 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1240 REXEC_FBC_EXACTISH_CHECK(c == c1);
1245 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1248 /* Handle some of the three Greek sigmas cases.
1249 * Note that not all the possible combinations
1250 * are handled here: some of them are handled
1251 * by the standard folding rules, and some of
1252 * them (the character class or ANYOF cases)
1253 * are handled during compiletime in
1254 * regexec.c:S_regclass(). */
1255 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1256 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1257 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1259 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1265 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1267 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1271 PL_reg_flags |= RF_tainted;
1278 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1279 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1281 tmp = ((OP(c) == BOUND ?
1282 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1283 LOAD_UTF8_CHARCLASS_ALNUM();
1284 REXEC_FBC_UTF8_SCAN(
1285 if (tmp == !(OP(c) == BOUND ?
1286 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1287 isALNUM_LC_utf8((U8*)s)))
1295 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1296 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1299 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1305 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1309 PL_reg_flags |= RF_tainted;
1316 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1317 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1319 tmp = ((OP(c) == NBOUND ?
1320 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1321 LOAD_UTF8_CHARCLASS_ALNUM();
1322 REXEC_FBC_UTF8_SCAN(
1323 if (tmp == !(OP(c) == NBOUND ?
1324 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1325 isALNUM_LC_utf8((U8*)s)))
1327 else REXEC_FBC_TRYIT;
1331 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1332 tmp = ((OP(c) == NBOUND ?
1333 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1336 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1338 else REXEC_FBC_TRYIT;
1341 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1345 REXEC_FBC_CSCAN_PRELOAD(
1346 LOAD_UTF8_CHARCLASS_ALNUM(),
1347 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1351 REXEC_FBC_CSCAN_TAINT(
1352 isALNUM_LC_utf8((U8*)s),
1356 REXEC_FBC_CSCAN_PRELOAD(
1357 LOAD_UTF8_CHARCLASS_ALNUM(),
1358 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1362 REXEC_FBC_CSCAN_TAINT(
1363 !isALNUM_LC_utf8((U8*)s),
1367 REXEC_FBC_CSCAN_PRELOAD(
1368 LOAD_UTF8_CHARCLASS_SPACE(),
1369 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1373 REXEC_FBC_CSCAN_TAINT(
1374 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1378 REXEC_FBC_CSCAN_PRELOAD(
1379 LOAD_UTF8_CHARCLASS_SPACE(),
1380 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1384 REXEC_FBC_CSCAN_TAINT(
1385 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1389 REXEC_FBC_CSCAN_PRELOAD(
1390 LOAD_UTF8_CHARCLASS_DIGIT(),
1391 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1395 REXEC_FBC_CSCAN_TAINT(
1396 isDIGIT_LC_utf8((U8*)s),
1400 REXEC_FBC_CSCAN_PRELOAD(
1401 LOAD_UTF8_CHARCLASS_DIGIT(),
1402 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1406 REXEC_FBC_CSCAN_TAINT(
1407 !isDIGIT_LC_utf8((U8*)s),
1413 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1414 trie_type = do_utf8 ?
1415 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1417 /* what trie are we using right now */
1419 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1420 reg_trie_data *trie=aho->trie;
1422 const char *last_start = strend - trie->minlen;
1424 const char *real_start = s;
1426 STRLEN maxlen = trie->maxlen;
1428 U8 **points; /* map of where we were in the input string
1429 when reading a given char. For ASCII this
1430 is unnecessary overhead as the relationship
1431 is always 1:1, but for unicode, especially
1432 case folded unicode this is not true. */
1433 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1437 GET_RE_DEBUG_FLAGS_DECL;
1439 /* We can't just allocate points here. We need to wrap it in
1440 * an SV so it gets freed properly if there is a croak while
1441 * running the match */
1444 sv_points=newSV(maxlen * sizeof(U8 *));
1445 SvCUR_set(sv_points,
1446 maxlen * sizeof(U8 *));
1447 SvPOK_on(sv_points);
1448 sv_2mortal(sv_points);
1449 points=(U8**)SvPV_nolen(sv_points );
1450 if ( trie_type != trie_utf8_fold
1451 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1454 bitmap=(U8*)trie->bitmap;
1456 bitmap=(U8*)ANYOF_BITMAP(c);
1458 /* this is the Aho-Corasick algorithm modified a touch
1459 to include special handling for long "unknown char"
1460 sequences. The basic idea being that we use AC as long
1461 as we are dealing with a possible matching char, when
1462 we encounter an unknown char (and we have not encountered
1463 an accepting state) we scan forward until we find a legal
1465 AC matching is basically that of trie matching, except
1466 that when we encounter a failing transition, we fall back
1467 to the current states "fail state", and try the current char
1468 again, a process we repeat until we reach the root state,
1469 state 1, or a legal transition. If we fail on the root state
1470 then we can either terminate if we have reached an accepting
1471 state previously, or restart the entire process from the beginning
1475 while (s <= last_start) {
1476 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1484 U8 *uscan = (U8*)NULL;
1485 U8 *leftmost = NULL;
1487 U32 accepted_word= 0;
1491 while ( state && uc <= (U8*)strend ) {
1493 U32 word = aho->states[ state ].wordnum;
1497 DEBUG_TRIE_EXECUTE_r(
1498 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1499 dump_exec_pos( (char *)uc, c, strend, real_start,
1500 (char *)uc, do_utf8 );
1501 PerlIO_printf( Perl_debug_log,
1502 " Scanning for legal start char...\n");
1505 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1510 if (uc >(U8*)last_start) break;
1514 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1515 if (!leftmost || lpos < leftmost) {
1516 DEBUG_r(accepted_word=word);
1522 points[pointpos++ % maxlen]= uc;
1523 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1524 uvc, charid, foldlen, foldbuf, uniflags);
1525 DEBUG_TRIE_EXECUTE_r({
1526 dump_exec_pos( (char *)uc, c, strend, real_start,
1528 PerlIO_printf(Perl_debug_log,
1529 " Charid:%3u CP:%4"UVxf" ",
1535 word = aho->states[ state ].wordnum;
1537 base = aho->states[ state ].trans.base;
1539 DEBUG_TRIE_EXECUTE_r({
1541 dump_exec_pos( (char *)uc, c, strend, real_start,
1543 PerlIO_printf( Perl_debug_log,
1544 "%sState: %4"UVxf", word=%"UVxf,
1545 failed ? " Fail transition to " : "",
1546 (UV)state, (UV)word);
1551 (base + charid > trie->uniquecharcount )
1552 && (base + charid - 1 - trie->uniquecharcount
1554 && trie->trans[base + charid - 1 -
1555 trie->uniquecharcount].check == state
1556 && (tmp=trie->trans[base + charid - 1 -
1557 trie->uniquecharcount ].next))
1559 DEBUG_TRIE_EXECUTE_r(
1560 PerlIO_printf( Perl_debug_log," - legal\n"));
1565 DEBUG_TRIE_EXECUTE_r(
1566 PerlIO_printf( Perl_debug_log," - fail\n"));
1568 state = aho->fail[state];
1572 /* we must be accepting here */
1573 DEBUG_TRIE_EXECUTE_r(
1574 PerlIO_printf( Perl_debug_log," - accepting\n"));
1583 if (!state) state = 1;
1586 if ( aho->states[ state ].wordnum ) {
1587 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1588 if (!leftmost || lpos < leftmost) {
1589 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1594 s = (char*)leftmost;
1595 DEBUG_TRIE_EXECUTE_r({
1597 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1598 (UV)accepted_word, s - real_start
1601 if (!reginfo || regtry(reginfo, &s)) {
1607 DEBUG_TRIE_EXECUTE_r({
1608 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1611 DEBUG_TRIE_EXECUTE_r(
1612 PerlIO_printf( Perl_debug_log,"No match.\n"));
1621 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1630 - regexec_flags - match a regexp against a string
1633 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1634 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1635 /* strend: pointer to null at end of string */
1636 /* strbeg: real beginning of string */
1637 /* minend: end of match must be >=minend after stringarg. */
1638 /* data: May be used for some additional optimizations.
1639 Currently its only used, with a U32 cast, for transmitting
1640 the ganch offset when doing a /g match. This will change */
1641 /* nosave: For optimizations. */
1644 /*register*/ char *s;
1645 register regnode *c;
1646 /*register*/ char *startpos = stringarg;
1647 I32 minlen; /* must match at least this many chars */
1648 I32 dontbother = 0; /* how many characters not to try at end */
1649 I32 end_shift = 0; /* Same for the end. */ /* CC */
1650 I32 scream_pos = -1; /* Internal iterator of scream. */
1651 char *scream_olds = NULL;
1652 SV* const oreplsv = GvSV(PL_replgv);
1653 const bool do_utf8 = (bool)DO_UTF8(sv);
1656 regmatch_info reginfo; /* create some info to pass to regtry etc */
1658 GET_RE_DEBUG_FLAGS_DECL;
1660 PERL_UNUSED_ARG(data);
1662 /* Be paranoid... */
1663 if (prog == NULL || startpos == NULL) {
1664 Perl_croak(aTHX_ "NULL regexp parameter");
1668 multiline = prog->extflags & RXf_PMf_MULTILINE;
1669 reginfo.prog = prog;
1671 RX_MATCH_UTF8_set(prog, do_utf8);
1673 debug_start_match(prog, do_utf8, startpos, strend,
1677 minlen = prog->minlen;
1679 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1680 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1681 "String too short [regexec_flags]...\n"));
1686 /* Check validity of program. */
1687 if (UCHARAT(prog->program) != REG_MAGIC) {
1688 Perl_croak(aTHX_ "corrupted regexp program");
1692 PL_reg_eval_set = 0;
1695 if (prog->extflags & RXf_UTF8)
1696 PL_reg_flags |= RF_utf8;
1698 /* Mark beginning of line for ^ and lookbehind. */
1699 reginfo.bol = startpos; /* XXX not used ??? */
1703 /* Mark end of line for $ (and such) */
1706 /* see how far we have to get to not match where we matched before */
1707 reginfo.till = startpos+minend;
1709 /* If there is a "must appear" string, look for it. */
1712 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1715 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1716 reginfo.ganch = startpos + prog->gofs;
1717 else if (sv && SvTYPE(sv) >= SVt_PVMG
1719 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1720 && mg->mg_len >= 0) {
1721 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1722 if (prog->extflags & RXf_ANCH_GPOS) {
1723 if (s > reginfo.ganch)
1725 s = reginfo.ganch - prog->gofs;
1729 reginfo.ganch = strbeg + (UV)data;
1730 } else /* pos() not defined */
1731 reginfo.ganch = strbeg;
1733 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1736 /* We have to be careful. If the previous successful match
1737 was from this regex we don't want a subsequent paritally
1738 successful match to clobber the old results.
1739 So when we detect this possibility we add a swap buffer
1740 to the re, and switch the buffer each match. If we fail
1741 we switch it back, otherwise we leave it swapped.
1743 Newxz(prog->swap, 1, regexp_paren_ofs);
1744 /* no need to copy these */
1745 Newxz(prog->swap->startp, prog->nparens + 1, I32);
1746 Newxz(prog->swap->endp, prog->nparens + 1, I32);
1748 t = prog->swap->startp;
1749 prog->swap->startp = prog->startp;
1751 t = prog->swap->endp;
1752 prog->swap->endp = prog->endp;
1755 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1756 re_scream_pos_data d;
1758 d.scream_olds = &scream_olds;
1759 d.scream_pos = &scream_pos;
1760 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1763 goto phooey; /* not present */
1769 /* Simplest case: anchored match need be tried only once. */
1770 /* [unless only anchor is BOL and multiline is set] */
1771 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1772 if (s == startpos && regtry(®info, &startpos))
1774 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1775 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1780 dontbother = minlen - 1;
1781 end = HOP3c(strend, -dontbother, strbeg) - 1;
1782 /* for multiline we only have to try after newlines */
1783 if (prog->check_substr || prog->check_utf8) {
1787 if (regtry(®info, &s))
1792 if (prog->extflags & RXf_USE_INTUIT) {
1793 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1804 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1805 if (regtry(®info, &s))
1812 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1814 /* the warning about reginfo.ganch being used without intialization
1815 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1816 and we only enter this block when the same bit is set. */
1817 char *tmp_s = reginfo.ganch - prog->gofs;
1818 if (regtry(®info, &tmp_s))
1823 /* Messy cases: unanchored match. */
1824 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1825 /* we have /x+whatever/ */
1826 /* it must be a one character string (XXXX Except UTF?) */
1831 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1832 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1833 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1838 DEBUG_EXECUTE_r( did_match = 1 );
1839 if (regtry(®info, &s)) goto got_it;
1841 while (s < strend && *s == ch)
1849 DEBUG_EXECUTE_r( did_match = 1 );
1850 if (regtry(®info, &s)) goto got_it;
1852 while (s < strend && *s == ch)
1857 DEBUG_EXECUTE_r(if (!did_match)
1858 PerlIO_printf(Perl_debug_log,
1859 "Did not find anchored character...\n")
1862 else if (prog->anchored_substr != NULL
1863 || prog->anchored_utf8 != NULL
1864 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1865 && prog->float_max_offset < strend - s)) {
1870 char *last1; /* Last position checked before */
1874 if (prog->anchored_substr || prog->anchored_utf8) {
1875 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1876 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1877 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1878 back_max = back_min = prog->anchored_offset;
1880 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1881 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1882 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1883 back_max = prog->float_max_offset;
1884 back_min = prog->float_min_offset;
1888 if (must == &PL_sv_undef)
1889 /* could not downgrade utf8 check substring, so must fail */
1895 last = HOP3c(strend, /* Cannot start after this */
1896 -(I32)(CHR_SVLEN(must)
1897 - (SvTAIL(must) != 0) + back_min), strbeg);
1900 last1 = HOPc(s, -1);
1902 last1 = s - 1; /* bogus */
1904 /* XXXX check_substr already used to find "s", can optimize if
1905 check_substr==must. */
1907 dontbother = end_shift;
1908 strend = HOPc(strend, -dontbother);
1909 while ( (s <= last) &&
1910 ((flags & REXEC_SCREAM)
1911 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1912 end_shift, &scream_pos, 0))
1913 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1914 (unsigned char*)strend, must,
1915 multiline ? FBMrf_MULTILINE : 0))) ) {
1916 /* we may be pointing at the wrong string */
1917 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1918 s = strbeg + (s - SvPVX_const(sv));
1919 DEBUG_EXECUTE_r( did_match = 1 );
1920 if (HOPc(s, -back_max) > last1) {
1921 last1 = HOPc(s, -back_min);
1922 s = HOPc(s, -back_max);
1925 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1927 last1 = HOPc(s, -back_min);
1931 while (s <= last1) {
1932 if (regtry(®info, &s))
1938 while (s <= last1) {
1939 if (regtry(®info, &s))
1945 DEBUG_EXECUTE_r(if (!did_match) {
1946 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1947 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1948 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1949 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1950 ? "anchored" : "floating"),
1951 quoted, RE_SV_TAIL(must));
1955 else if ( (c = prog->regstclass) ) {
1957 const OPCODE op = OP(prog->regstclass);
1958 /* don't bother with what can't match */
1959 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1960 strend = HOPc(strend, -(minlen - 1));
1963 SV * const prop = sv_newmortal();
1964 regprop(prog, prop, c);
1966 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1968 PerlIO_printf(Perl_debug_log,
1969 "Matching stclass %.*s against %s (%d chars)\n",
1970 (int)SvCUR(prop), SvPVX_const(prop),
1971 quoted, (int)(strend - s));
1974 if (find_byclass(prog, c, s, strend, ®info))
1976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1980 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1985 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1986 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1987 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1989 if (flags & REXEC_SCREAM) {
1990 last = screaminstr(sv, float_real, s - strbeg,
1991 end_shift, &scream_pos, 1); /* last one */
1993 last = scream_olds; /* Only one occurrence. */
1994 /* we may be pointing at the wrong string */
1995 else if (RX_MATCH_COPIED(prog))
1996 s = strbeg + (s - SvPVX_const(sv));
2000 const char * const little = SvPV_const(float_real, len);
2002 if (SvTAIL(float_real)) {
2003 if (memEQ(strend - len + 1, little, len - 1))
2004 last = strend - len + 1;
2005 else if (!multiline)
2006 last = memEQ(strend - len, little, len)
2007 ? strend - len : NULL;
2013 last = rninstr(s, strend, little, little + len);
2015 last = strend; /* matching "$" */
2020 PerlIO_printf(Perl_debug_log,
2021 "%sCan't trim the tail, match fails (should not happen)%s\n",
2022 PL_colors[4], PL_colors[5]));
2023 goto phooey; /* Should not happen! */
2025 dontbother = strend - last + prog->float_min_offset;
2027 if (minlen && (dontbother < minlen))
2028 dontbother = minlen - 1;
2029 strend -= dontbother; /* this one's always in bytes! */
2030 /* We don't know much -- general case. */
2033 if (regtry(®info, &s))
2042 if (regtry(®info, &s))
2044 } while (s++ < strend);
2052 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2054 if (PL_reg_eval_set) {
2055 /* Preserve the current value of $^R */
2056 if (oreplsv != GvSV(PL_replgv))
2057 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2058 restored, the value remains
2060 restore_pos(aTHX_ prog);
2062 if (prog->paren_names)
2063 (void)hv_iterinit(prog->paren_names);
2065 /* make sure $`, $&, $', and $digit will work later */
2066 if ( !(flags & REXEC_NOT_FIRST) ) {
2067 RX_MATCH_COPY_FREE(prog);
2068 if (flags & REXEC_COPY_STR) {
2069 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2070 #ifdef PERL_OLD_COPY_ON_WRITE
2072 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2074 PerlIO_printf(Perl_debug_log,
2075 "Copy on write: regexp capture, type %d\n",
2078 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2079 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2080 assert (SvPOKp(prog->saved_copy));
2084 RX_MATCH_COPIED_on(prog);
2085 s = savepvn(strbeg, i);
2091 prog->subbeg = strbeg;
2092 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2100 PL_colors[4], PL_colors[5]));
2101 if (PL_reg_eval_set)
2102 restore_pos(aTHX_ prog);
2104 /* we failed :-( roll it back */
2106 t = prog->swap->startp;
2107 prog->swap->startp = prog->startp;
2109 t = prog->swap->endp;
2110 prog->swap->endp = prog->endp;
2118 - regtry - try match at specific point
2120 STATIC I32 /* 0 failure, 1 success */
2121 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2127 regexp *prog = reginfo->prog;
2128 GET_RE_DEBUG_FLAGS_DECL;
2129 reginfo->cutpoint=NULL;
2131 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2134 PL_reg_eval_set = RS_init;
2135 DEBUG_EXECUTE_r(DEBUG_s(
2136 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2137 (IV)(PL_stack_sp - PL_stack_base));
2140 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2141 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2143 /* Apparently this is not needed, judging by wantarray. */
2144 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2145 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2148 /* Make $_ available to executed code. */
2149 if (reginfo->sv != DEFSV) {
2151 DEFSV = reginfo->sv;
2154 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2155 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2156 /* prepare for quick setting of pos */
2157 #ifdef PERL_OLD_COPY_ON_WRITE
2158 if (SvIsCOW(reginfo->sv))
2159 sv_force_normal_flags(reginfo->sv, 0);
2161 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2162 &PL_vtbl_mglob, NULL, 0);
2166 PL_reg_oldpos = mg->mg_len;
2167 SAVEDESTRUCTOR_X(restore_pos, prog);
2169 if (!PL_reg_curpm) {
2170 Newxz(PL_reg_curpm, 1, PMOP);
2173 SV* const repointer = newSViv(0);
2174 /* so we know which PL_regex_padav element is PL_reg_curpm */
2175 SvFLAGS(repointer) |= SVf_BREAK;
2176 av_push(PL_regex_padav,repointer);
2177 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2178 PL_regex_pad = AvARRAY(PL_regex_padav);
2182 PM_SETRE(PL_reg_curpm, prog);
2183 PL_reg_oldcurpm = PL_curpm;
2184 PL_curpm = PL_reg_curpm;
2185 if (RX_MATCH_COPIED(prog)) {
2186 /* Here is a serious problem: we cannot rewrite subbeg,
2187 since it may be needed if this match fails. Thus
2188 $` inside (?{}) could fail... */
2189 PL_reg_oldsaved = prog->subbeg;
2190 PL_reg_oldsavedlen = prog->sublen;
2191 #ifdef PERL_OLD_COPY_ON_WRITE
2192 PL_nrs = prog->saved_copy;
2194 RX_MATCH_COPIED_off(prog);
2197 PL_reg_oldsaved = NULL;
2198 prog->subbeg = PL_bostr;
2199 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2201 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2202 prog->startp[0] = *startpos - PL_bostr;
2203 PL_reginput = *startpos;
2204 PL_reglastparen = &prog->lastparen;
2205 PL_reglastcloseparen = &prog->lastcloseparen;
2206 prog->lastparen = 0;
2207 prog->lastcloseparen = 0;
2209 PL_regstartp = prog->startp;
2210 PL_regendp = prog->endp;
2211 if (PL_reg_start_tmpl <= prog->nparens) {
2212 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2213 if(PL_reg_start_tmp)
2214 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2216 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2219 /* XXXX What this code is doing here?!!! There should be no need
2220 to do this again and again, PL_reglastparen should take care of
2223 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2224 * Actually, the code in regcppop() (which Ilya may be meaning by
2225 * PL_reglastparen), is not needed at all by the test suite
2226 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2227 * enough, for building DynaLoader, or otherwise this
2228 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2229 * will happen. Meanwhile, this code *is* needed for the
2230 * above-mentioned test suite tests to succeed. The common theme
2231 * on those tests seems to be returning null fields from matches.
2236 if (prog->nparens) {
2238 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2245 if (regmatch(reginfo, prog->program + 1)) {
2246 PL_regendp[0] = PL_reginput - PL_bostr;
2249 if (reginfo->cutpoint)
2250 *startpos= reginfo->cutpoint;
2251 REGCP_UNWIND(lastcp);
2256 #define sayYES goto yes
2257 #define sayNO goto no
2258 #define sayNO_SILENT goto no_silent
2260 /* we dont use STMT_START/END here because it leads to
2261 "unreachable code" warnings, which are bogus, but distracting. */
2262 #define CACHEsayNO \
2263 if (ST.cache_mask) \
2264 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2267 /* this is used to determine how far from the left messages like
2268 'failed...' are printed. It should be set such that messages
2269 are inline with the regop output that created them.
2271 #define REPORT_CODE_OFF 32
2274 /* Make sure there is a test for this +1 options in re_tests */
2275 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2277 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2278 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2280 #define SLAB_FIRST(s) (&(s)->states[0])
2281 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2283 /* grab a new slab and return the first slot in it */
2285 STATIC regmatch_state *
2288 #if PERL_VERSION < 9
2291 regmatch_slab *s = PL_regmatch_slab->next;
2293 Newx(s, 1, regmatch_slab);
2294 s->prev = PL_regmatch_slab;
2296 PL_regmatch_slab->next = s;
2298 PL_regmatch_slab = s;
2299 return SLAB_FIRST(s);
2303 /* push a new state then goto it */
2305 #define PUSH_STATE_GOTO(state, node) \
2307 st->resume_state = state; \
2310 /* push a new state with success backtracking, then goto it */
2312 #define PUSH_YES_STATE_GOTO(state, node) \
2314 st->resume_state = state; \
2315 goto push_yes_state;
2321 regmatch() - main matching routine
2323 This is basically one big switch statement in a loop. We execute an op,
2324 set 'next' to point the next op, and continue. If we come to a point which
2325 we may need to backtrack to on failure such as (A|B|C), we push a
2326 backtrack state onto the backtrack stack. On failure, we pop the top
2327 state, and re-enter the loop at the state indicated. If there are no more
2328 states to pop, we return failure.
2330 Sometimes we also need to backtrack on success; for example /A+/, where
2331 after successfully matching one A, we need to go back and try to
2332 match another one; similarly for lookahead assertions: if the assertion
2333 completes successfully, we backtrack to the state just before the assertion
2334 and then carry on. In these cases, the pushed state is marked as
2335 'backtrack on success too'. This marking is in fact done by a chain of
2336 pointers, each pointing to the previous 'yes' state. On success, we pop to
2337 the nearest yes state, discarding any intermediate failure-only states.
2338 Sometimes a yes state is pushed just to force some cleanup code to be
2339 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2340 it to free the inner regex.
2342 Note that failure backtracking rewinds the cursor position, while
2343 success backtracking leaves it alone.
2345 A pattern is complete when the END op is executed, while a subpattern
2346 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2347 ops trigger the "pop to last yes state if any, otherwise return true"
2350 A common convention in this function is to use A and B to refer to the two
2351 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2352 the subpattern to be matched possibly multiple times, while B is the entire
2353 rest of the pattern. Variable and state names reflect this convention.
2355 The states in the main switch are the union of ops and failure/success of
2356 substates associated with with that op. For example, IFMATCH is the op
2357 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2358 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2359 successfully matched A and IFMATCH_A_fail is a state saying that we have
2360 just failed to match A. Resume states always come in pairs. The backtrack
2361 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2362 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2363 on success or failure.
2365 The struct that holds a backtracking state is actually a big union, with
2366 one variant for each major type of op. The variable st points to the
2367 top-most backtrack struct. To make the code clearer, within each
2368 block of code we #define ST to alias the relevant union.
2370 Here's a concrete example of a (vastly oversimplified) IFMATCH
2376 #define ST st->u.ifmatch
2378 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2379 ST.foo = ...; // some state we wish to save
2381 // push a yes backtrack state with a resume value of
2382 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2384 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2387 case IFMATCH_A: // we have successfully executed A; now continue with B
2389 bar = ST.foo; // do something with the preserved value
2392 case IFMATCH_A_fail: // A failed, so the assertion failed
2393 ...; // do some housekeeping, then ...
2394 sayNO; // propagate the failure
2401 For any old-timers reading this who are familiar with the old recursive
2402 approach, the code above is equivalent to:
2404 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2413 ...; // do some housekeeping, then ...
2414 sayNO; // propagate the failure
2417 The topmost backtrack state, pointed to by st, is usually free. If you
2418 want to claim it, populate any ST.foo fields in it with values you wish to
2419 save, then do one of
2421 PUSH_STATE_GOTO(resume_state, node);
2422 PUSH_YES_STATE_GOTO(resume_state, node);
2424 which sets that backtrack state's resume value to 'resume_state', pushes a
2425 new free entry to the top of the backtrack stack, then goes to 'node'.
2426 On backtracking, the free slot is popped, and the saved state becomes the
2427 new free state. An ST.foo field in this new top state can be temporarily
2428 accessed to retrieve values, but once the main loop is re-entered, it
2429 becomes available for reuse.
2431 Note that the depth of the backtrack stack constantly increases during the
2432 left-to-right execution of the pattern, rather than going up and down with
2433 the pattern nesting. For example the stack is at its maximum at Z at the
2434 end of the pattern, rather than at X in the following:
2436 /(((X)+)+)+....(Y)+....Z/
2438 The only exceptions to this are lookahead/behind assertions and the cut,
2439 (?>A), which pop all the backtrack states associated with A before
2442 Bascktrack state structs are allocated in slabs of about 4K in size.
2443 PL_regmatch_state and st always point to the currently active state,
2444 and PL_regmatch_slab points to the slab currently containing
2445 PL_regmatch_state. The first time regmatch() is called, the first slab is
2446 allocated, and is never freed until interpreter destruction. When the slab
2447 is full, a new one is allocated and chained to the end. At exit from
2448 regmatch(), slabs allocated since entry are freed.
2453 #define DEBUG_STATE_pp(pp) \
2455 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2456 PerlIO_printf(Perl_debug_log, \
2457 " %*s"pp" %s%s%s%s%s\n", \
2459 reg_name[st->resume_state], \
2460 ((st==yes_state||st==mark_state) ? "[" : ""), \
2461 ((st==yes_state) ? "Y" : ""), \
2462 ((st==mark_state) ? "M" : ""), \
2463 ((st==yes_state||st==mark_state) ? "]" : "") \
2468 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2473 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2474 const char *start, const char *end, const char *blurb)
2476 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2480 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2481 prog->precomp, prog->prelen, 60);
2483 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2484 start, end - start, 60);
2486 PerlIO_printf(Perl_debug_log,
2487 "%s%s REx%s %s against %s\n",
2488 PL_colors[4], blurb, PL_colors[5], s0, s1);
2490 if (do_utf8||utf8_pat)
2491 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2492 utf8_pat ? "pattern" : "",
2493 utf8_pat && do_utf8 ? " and " : "",
2494 do_utf8 ? "string" : ""
2500 S_dump_exec_pos(pTHX_ const char *locinput,
2501 const regnode *scan,
2502 const char *loc_regeol,
2503 const char *loc_bostr,
2504 const char *loc_reg_starttry,
2507 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2508 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2509 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2510 /* The part of the string before starttry has one color
2511 (pref0_len chars), between starttry and current
2512 position another one (pref_len - pref0_len chars),
2513 after the current position the third one.
2514 We assume that pref0_len <= pref_len, otherwise we
2515 decrease pref0_len. */
2516 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2517 ? (5 + taill) - l : locinput - loc_bostr;
2520 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2522 pref0_len = pref_len - (locinput - loc_reg_starttry);
2523 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2524 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2525 ? (5 + taill) - pref_len : loc_regeol - locinput);
2526 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2530 if (pref0_len > pref_len)
2531 pref0_len = pref_len;
2533 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2535 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2536 (locinput - pref_len),pref0_len, 60, 4, 5);
2538 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2539 (locinput - pref_len + pref0_len),
2540 pref_len - pref0_len, 60, 2, 3);
2542 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2543 locinput, loc_regeol - locinput, 10, 0, 1);
2545 const STRLEN tlen=len0+len1+len2;
2546 PerlIO_printf(Perl_debug_log,
2547 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2548 (IV)(locinput - loc_bostr),
2551 (docolor ? "" : "> <"),
2553 (int)(tlen > 19 ? 0 : 19 - tlen),
2560 /* reg_check_named_buff_matched()
2561 * Checks to see if a named buffer has matched. The data array of
2562 * buffer numbers corresponding to the buffer is expected to reside
2563 * in the regexp->data->data array in the slot stored in the ARG() of
2564 * node involved. Note that this routine doesn't actually care about the
2565 * name, that information is not preserved from compilation to execution.
2566 * Returns the index of the leftmost defined buffer with the given name
2567 * or 0 if non of the buffers matched.
2570 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2572 SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];
2573 I32 *nums=(I32*)SvPVX(sv_dat);
2574 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2575 if ((I32)*PL_reglastparen >= nums[n] &&
2576 PL_regendp[nums[n]] != -1)
2584 STATIC I32 /* 0 failure, 1 success */
2585 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2587 #if PERL_VERSION < 9
2591 register const bool do_utf8 = PL_reg_match_utf8;
2592 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2594 regexp *rex = reginfo->prog;
2596 regmatch_slab *orig_slab;
2597 regmatch_state *orig_state;
2599 /* the current state. This is a cached copy of PL_regmatch_state */
2600 register regmatch_state *st;
2602 /* cache heavy used fields of st in registers */
2603 register regnode *scan;
2604 register regnode *next;
2605 register U32 n = 0; /* general value; init to avoid compiler warning */
2606 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2607 register char *locinput = PL_reginput;
2608 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2610 bool result = 0; /* return value of S_regmatch */
2611 int depth = 0; /* depth of backtrack stack */
2612 int nochange_depth = 0; /* depth of GOSUB recursion with nochange*/
2613 regmatch_state *yes_state = NULL; /* state to pop to on success of
2615 /* mark_state piggy backs on the yes_state logic so that when we unwind
2616 the stack on success we can update the mark_state as we go */
2617 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2619 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2620 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2622 bool no_final = 0; /* prevent failure from backtracking? */
2623 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2624 char *startpoint = PL_reginput;
2625 SV *popmark = NULL; /* are we looking for a mark? */
2626 SV *sv_commit = NULL; /* last mark name seen in failure */
2627 SV *sv_yes_mark = NULL; /* last mark name we have seen
2628 during a successfull match */
2629 U32 lastopen = 0; /* last open we saw */
2630 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2633 /* these three flags are set by various ops to signal information to
2634 * the very next op. They have a useful lifetime of exactly one loop
2635 * iteration, and are not preserved or restored by state pushes/pops
2637 bool sw = 0; /* the condition value in (?(cond)a|b) */
2638 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2639 int logical = 0; /* the following EVAL is:
2643 or the following IFMATCH/UNLESSM is:
2644 false: plain (?=foo)
2645 true: used as a condition: (?(?=foo))
2649 GET_RE_DEBUG_FLAGS_DECL;
2653 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2655 /* on first ever call to regmatch, allocate first slab */
2656 if (!PL_regmatch_slab) {
2657 Newx(PL_regmatch_slab, 1, regmatch_slab);
2658 PL_regmatch_slab->prev = NULL;
2659 PL_regmatch_slab->next = NULL;
2660 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2663 /* remember current high-water mark for exit */
2664 /* XXX this should be done with SAVE* instead */
2665 orig_slab = PL_regmatch_slab;
2666 orig_state = PL_regmatch_state;
2668 /* grab next free state slot */
2669 st = ++PL_regmatch_state;
2670 if (st > SLAB_LAST(PL_regmatch_slab))
2671 st = PL_regmatch_state = S_push_slab(aTHX);
2673 /* Note that nextchr is a byte even in UTF */
2674 nextchr = UCHARAT(locinput);
2676 while (scan != NULL) {
2679 SV * const prop = sv_newmortal();
2680 regnode *rnext=regnext(scan);
2681 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2682 regprop(rex, prop, scan);
2684 PerlIO_printf(Perl_debug_log,
2685 "%3"IVdf":%*s%s(%"IVdf")\n",
2686 (IV)(scan - rex->program), depth*2, "",
2688 (PL_regkind[OP(scan)] == END || !rnext) ?
2689 0 : (IV)(rnext - rex->program));
2692 next = scan + NEXT_OFF(scan);
2695 state_num = OP(scan);
2698 switch (state_num) {
2700 if (locinput == PL_bostr)
2702 /* reginfo->till = reginfo->bol; */
2707 if (locinput == PL_bostr ||
2708 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2714 if (locinput == PL_bostr)
2718 if (locinput == reginfo->ganch)
2724 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2729 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2731 if (PL_regeol - locinput > 1)
2735 if (PL_regeol != locinput)
2739 if (!nextchr && locinput >= PL_regeol)
2742 locinput += PL_utf8skip[nextchr];
2743 if (locinput > PL_regeol)
2745 nextchr = UCHARAT(locinput);
2748 nextchr = UCHARAT(++locinput);
2751 if (!nextchr && locinput >= PL_regeol)
2753 nextchr = UCHARAT(++locinput);
2756 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2759 locinput += PL_utf8skip[nextchr];
2760 if (locinput > PL_regeol)
2762 nextchr = UCHARAT(locinput);
2765 nextchr = UCHARAT(++locinput);
2769 #define ST st->u.trie
2771 /* In this case the charclass data is available inline so
2772 we can fail fast without a lot of extra overhead.
2774 if (scan->flags == EXACT || !do_utf8) {
2775 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2777 PerlIO_printf(Perl_debug_log,
2778 "%*s %sfailed to match trie start class...%s\n",
2779 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2788 /* what type of TRIE am I? (utf8 makes this contextual) */
2789 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2790 trie_type = do_utf8 ?
2791 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2794 /* what trie are we using right now */
2795 reg_trie_data * const trie
2796 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2797 U32 state = trie->startstate;
2799 if (trie->bitmap && trie_type != trie_utf8_fold &&
2800 !TRIE_BITMAP_TEST(trie,*locinput)
2802 if (trie->states[ state ].wordnum) {
2804 PerlIO_printf(Perl_debug_log,
2805 "%*s %smatched empty string...%s\n",
2806 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2811 PerlIO_printf(Perl_debug_log,
2812 "%*s %sfailed to match trie start class...%s\n",
2813 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2820 U8 *uc = ( U8* )locinput;
2824 U8 *uscan = (U8*)NULL;
2826 SV *sv_accept_buff = NULL;
2827 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2829 ST.accepted = 0; /* how many accepting states we have seen */
2831 ST.jump = trie->jump;
2835 traverse the TRIE keeping track of all accepting states
2836 we transition through until we get to a failing node.
2839 while ( state && uc <= (U8*)PL_regeol ) {
2840 U32 base = trie->states[ state ].trans.base;
2843 /* We use charid to hold the wordnum as we don't use it
2844 for charid until after we have done the wordnum logic.
2845 We define an alias just so that the wordnum logic reads
2848 #define got_wordnum charid
2849 got_wordnum = trie->states[ state ].wordnum;
2851 if ( got_wordnum ) {
2852 if ( ! ST.accepted ) {
2855 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2856 sv_accept_buff=newSV(bufflen *
2857 sizeof(reg_trie_accepted) - 1);
2858 SvCUR_set(sv_accept_buff, 0);
2859 SvPOK_on(sv_accept_buff);
2860 sv_2mortal(sv_accept_buff);
2863 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2866 if (ST.accepted >= bufflen) {
2868 ST.accept_buff =(reg_trie_accepted*)
2869 SvGROW(sv_accept_buff,
2870 bufflen * sizeof(reg_trie_accepted));
2872 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2873 + sizeof(reg_trie_accepted));
2876 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2877 ST.accept_buff[ST.accepted].endpos = uc;
2879 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2883 DEBUG_TRIE_EXECUTE_r({
2884 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2885 PerlIO_printf( Perl_debug_log,
2886 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2887 2+depth * 2, "", PL_colors[4],
2888 (UV)state, (UV)ST.accepted );
2892 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2893 uvc, charid, foldlen, foldbuf, uniflags);
2896 (base + charid > trie->uniquecharcount )
2897 && (base + charid - 1 - trie->uniquecharcount
2899 && trie->trans[base + charid - 1 -
2900 trie->uniquecharcount].check == state)
2902 state = trie->trans[base + charid - 1 -
2903 trie->uniquecharcount ].next;
2914 DEBUG_TRIE_EXECUTE_r(
2915 PerlIO_printf( Perl_debug_log,
2916 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2917 charid, uvc, (UV)state, PL_colors[5] );
2924 PerlIO_printf( Perl_debug_log,
2925 "%*s %sgot %"IVdf" possible matches%s\n",
2926 REPORT_CODE_OFF + depth * 2, "",
2927 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2932 case TRIE_next_fail: /* we failed - try next alterative */
2937 if ( ST.accepted == 1 ) {
2938 /* only one choice left - just continue */
2940 reg_trie_data * const trie
2941 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2942 SV ** const tmp = av_fetch( trie->words,
2943 ST.accept_buff[ 0 ].wordnum-1, 0 );
2944 SV *sv= tmp ? sv_newmortal() : NULL;
2946 PerlIO_printf( Perl_debug_log,
2947 "%*s %sonly one match left: #%d <%s>%s\n",
2948 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2949 ST.accept_buff[ 0 ].wordnum,
2950 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
2951 PL_colors[0], PL_colors[1],
2952 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2954 : "not compiled under -Dr",
2957 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2958 /* in this case we free tmps/leave before we call regmatch
2959 as we wont be using accept_buff again. */
2961 locinput = PL_reginput;
2962 nextchr = UCHARAT(locinput);
2963 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
2966 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
2967 if (!has_cutgroup) {
2972 PUSH_YES_STATE_GOTO(TRIE_next, scan);
2975 continue; /* execute rest of RE */
2978 if (!ST.accepted-- ) {
2980 PerlIO_printf( Perl_debug_log,
2981 "%*s %sTRIE failed...%s\n",
2982 REPORT_CODE_OFF+depth*2, "",
2992 There are at least two accepting states left. Presumably
2993 the number of accepting states is going to be low,
2994 typically two. So we simply scan through to find the one
2995 with lowest wordnum. Once we find it, we swap the last
2996 state into its place and decrement the size. We then try to
2997 match the rest of the pattern at the point where the word
2998 ends. If we succeed, control just continues along the
2999 regex; if we fail we return here to try the next accepting
3006 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3007 DEBUG_TRIE_EXECUTE_r(
3008 PerlIO_printf( Perl_debug_log,
3009 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3010 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3011 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3012 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3015 if (ST.accept_buff[cur].wordnum <
3016 ST.accept_buff[best].wordnum)
3021 reg_trie_data * const trie
3022 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3023 SV ** const tmp = av_fetch( trie->words,
3024 ST.accept_buff[ best ].wordnum - 1, 0 );
3025 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3027 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3028 SV *sv= tmp ? sv_newmortal() : NULL;
3030 PerlIO_printf( Perl_debug_log,
3031 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3032 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3033 ST.accept_buff[best].wordnum,
3034 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3035 PL_colors[0], PL_colors[1],
3036 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3037 ) : "not compiled under -Dr",
3038 REG_NODE_NUM(nextop),
3042 if ( best<ST.accepted ) {
3043 reg_trie_accepted tmp = ST.accept_buff[ best ];
3044 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3045 ST.accept_buff[ ST.accepted ] = tmp;
3048 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3049 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3053 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3057 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3060 PUSH_STATE_GOTO(TRIE_next, scan);
3073 char *s = STRING(scan);
3075 if (do_utf8 != UTF) {
3076 /* The target and the pattern have differing utf8ness. */
3078 const char * const e = s + ln;
3081 /* The target is utf8, the pattern is not utf8. */
3086 if (NATIVE_TO_UNI(*(U8*)s) !=
3087 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3095 /* The target is not utf8, the pattern is utf8. */
3100 if (NATIVE_TO_UNI(*((U8*)l)) !=
3101 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3109 nextchr = UCHARAT(locinput);
3112 /* The target and the pattern have the same utf8ness. */
3113 /* Inline the first character, for speed. */
3114 if (UCHARAT(s) != nextchr)
3116 if (PL_regeol - locinput < ln)
3118 if (ln > 1 && memNE(s, locinput, ln))
3121 nextchr = UCHARAT(locinput);
3125 PL_reg_flags |= RF_tainted;
3128 char * const s = STRING(scan);
3131 if (do_utf8 || UTF) {
3132 /* Either target or the pattern are utf8. */
3133 const char * const l = locinput;
3134 char *e = PL_regeol;
3136 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3137 l, &e, 0, do_utf8)) {
3138 /* One more case for the sharp s:
3139 * pack("U0U*", 0xDF) =~ /ss/i,
3140 * the 0xC3 0x9F are the UTF-8
3141 * byte sequence for the U+00DF. */
3143 toLOWER(s[0]) == 's' &&
3145 toLOWER(s[1]) == 's' &&
3152 nextchr = UCHARAT(locinput);
3156 /* Neither the target and the pattern are utf8. */
3158 /* Inline the first character, for speed. */
3159 if (UCHARAT(s) != nextchr &&
3160 UCHARAT(s) != ((OP(scan) == EXACTF)
3161 ? PL_fold : PL_fold_locale)[nextchr])
3163 if (PL_regeol - locinput < ln)
3165 if (ln > 1 && (OP(scan) == EXACTF
3166 ? ibcmp(s, locinput, ln)
3167 : ibcmp_locale(s, locinput, ln)))
3170 nextchr = UCHARAT(locinput);
3175 STRLEN inclasslen = PL_regeol - locinput;
3177 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3179 if (locinput >= PL_regeol)
3181 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3182 nextchr = UCHARAT(locinput);
3187 nextchr = UCHARAT(locinput);
3188 if (!REGINCLASS(rex, scan, (U8*)locinput))
3190 if (!nextchr && locinput >= PL_regeol)
3192 nextchr = UCHARAT(++locinput);
3196 /* If we might have the case of the German sharp s
3197 * in a casefolding Unicode character class. */
3199 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3200 locinput += SHARP_S_SKIP;
3201 nextchr = UCHARAT(locinput);
3207 PL_reg_flags |= RF_tainted;
3213 LOAD_UTF8_CHARCLASS_ALNUM();
3214 if (!(OP(scan) == ALNUM
3215 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3216 : isALNUM_LC_utf8((U8*)locinput)))
3220 locinput += PL_utf8skip[nextchr];
3221 nextchr = UCHARAT(locinput);
3224 if (!(OP(scan) == ALNUM
3225 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3227 nextchr = UCHARAT(++locinput);
3230 PL_reg_flags |= RF_tainted;
3233 if (!nextchr && locinput >= PL_regeol)
3236 LOAD_UTF8_CHARCLASS_ALNUM();
3237 if (OP(scan) == NALNUM
3238 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3239 : isALNUM_LC_utf8((U8*)locinput))
3243 locinput += PL_utf8skip[nextchr];
3244 nextchr = UCHARAT(locinput);
3247 if (OP(scan) == NALNUM
3248 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3250 nextchr = UCHARAT(++locinput);
3254 PL_reg_flags |= RF_tainted;
3258 /* was last char in word? */
3260 if (locinput == PL_bostr)
3263 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3265 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3267 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3268 ln = isALNUM_uni(ln);
3269 LOAD_UTF8_CHARCLASS_ALNUM();
3270 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3273 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3274 n = isALNUM_LC_utf8((U8*)locinput);
3278 ln = (locinput != PL_bostr) ?
3279 UCHARAT(locinput - 1) : '\n';
3280 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3282 n = isALNUM(nextchr);
3285 ln = isALNUM_LC(ln);
3286 n = isALNUM_LC(nextchr);
3289 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3290 OP(scan) == BOUNDL))
3294 PL_reg_flags |= RF_tainted;
3300 if (UTF8_IS_CONTINUED(nextchr)) {
3301 LOAD_UTF8_CHARCLASS_SPACE();
3302 if (!(OP(scan) == SPACE
3303 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3304 : isSPACE_LC_utf8((U8*)locinput)))
3308 locinput += PL_utf8skip[nextchr];
3309 nextchr = UCHARAT(locinput);
3312 if (!(OP(scan) == SPACE
3313 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3315 nextchr = UCHARAT(++locinput);
3318 if (!(OP(scan) == SPACE
3319 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3321 nextchr = UCHARAT(++locinput);
3325 PL_reg_flags |= RF_tainted;
3328 if (!nextchr && locinput >= PL_regeol)
3331 LOAD_UTF8_CHARCLASS_SPACE();
3332 if (OP(scan) == NSPACE
3333 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3334 : isSPACE_LC_utf8((U8*)locinput))
3338 locinput += PL_utf8skip[nextchr];
3339 nextchr = UCHARAT(locinput);
3342 if (OP(scan) == NSPACE
3343 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3345 nextchr = UCHARAT(++locinput);
3348 PL_reg_flags |= RF_tainted;
3354 LOAD_UTF8_CHARCLASS_DIGIT();
3355 if (!(OP(scan) == DIGIT
3356 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3357 : isDIGIT_LC_utf8((U8*)locinput)))
3361 locinput += PL_utf8skip[nextchr];
3362 nextchr = UCHARAT(locinput);
3365 if (!(OP(scan) == DIGIT
3366 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3368 nextchr = UCHARAT(++locinput);
3371 PL_reg_flags |= RF_tainted;
3374 if (!nextchr && locinput >= PL_regeol)
3377 LOAD_UTF8_CHARCLASS_DIGIT();
3378 if (OP(scan) == NDIGIT
3379 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3380 : isDIGIT_LC_utf8((U8*)locinput))
3384 locinput += PL_utf8skip[nextchr];
3385 nextchr = UCHARAT(locinput);
3388 if (OP(scan) == NDIGIT
3389 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3391 nextchr = UCHARAT(++locinput);
3394 if (locinput >= PL_regeol)
3397 LOAD_UTF8_CHARCLASS_MARK();
3398 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3400 locinput += PL_utf8skip[nextchr];
3401 while (locinput < PL_regeol &&
3402 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3403 locinput += UTF8SKIP(locinput);
3404 if (locinput > PL_regeol)
3409 nextchr = UCHARAT(locinput);
3416 PL_reg_flags |= RF_tainted;
3421 n = reg_check_named_buff_matched(rex,scan);
3424 type = REF + ( type - NREF );
3431 PL_reg_flags |= RF_tainted;
3435 n = ARG(scan); /* which paren pair */
3438 ln = PL_regstartp[n];
3439 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3440 if (*PL_reglastparen < n || ln == -1)
3441 sayNO; /* Do not match unless seen CLOSEn. */
3442 if (ln == PL_regendp[n])
3446 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3448 const char *e = PL_bostr + PL_regendp[n];
3450 * Note that we can't do the "other character" lookup trick as
3451 * in the 8-bit case (no pun intended) because in Unicode we
3452 * have to map both upper and title case to lower case.
3456 STRLEN ulen1, ulen2;
3457 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3458 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3462 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3463 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3464 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3471 nextchr = UCHARAT(locinput);
3475 /* Inline the first character, for speed. */
3476 if (UCHARAT(s) != nextchr &&
3478 (UCHARAT(s) != (type == REFF
3479 ? PL_fold : PL_fold_locale)[nextchr])))
3481 ln = PL_regendp[n] - ln;
3482 if (locinput + ln > PL_regeol)
3484 if (ln > 1 && (type == REF
3485 ? memNE(s, locinput, ln)
3487 ? ibcmp(s, locinput, ln)
3488 : ibcmp_locale(s, locinput, ln))))
3491 nextchr = UCHARAT(locinput);
3501 #define ST st->u.eval
3505 regnode *startpoint;
3508 case GOSUB: /* /(...(?1))/ */
3509 if (cur_eval && cur_eval->locinput==locinput) {
3510 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3511 Perl_croak(aTHX_ "Infinite recursion in regex");
3512 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3514 "Pattern subroutine nesting without pos change"
3515 " exceeded limit in regex");
3520 (void)ReREFCNT_inc(rex);
3521 if (OP(scan)==GOSUB) {
3522 startpoint = scan + ARG2L(scan);
3523 ST.close_paren = ARG(scan);
3525 startpoint = re->program+1;
3528 goto eval_recurse_doit;
3530 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3531 if (cur_eval && cur_eval->locinput==locinput) {
3532 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3533 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3538 /* execute the code in the {...} */
3540 SV ** const before = SP;
3541 OP_4tree * const oop = PL_op;
3542 COP * const ocurcop = PL_curcop;
3546 PL_op = (OP_4tree*)rex->data->data[n];
3547 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3548 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3549 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3550 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3552 CALLRUNOPS(aTHX); /* Scalar context. */
3555 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3562 PAD_RESTORE_LOCAL(old_comppad);
3563 PL_curcop = ocurcop;
3566 sv_setsv(save_scalar(PL_replgv), ret);
3570 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3573 /* extract RE object from returned value; compiling if
3578 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3579 mg = mg_find(sv, PERL_MAGIC_qr);
3580 else if (SvSMAGICAL(ret)) {
3581 if (SvGMAGICAL(ret))
3582 sv_unmagic(ret, PERL_MAGIC_qr);
3584 mg = mg_find(ret, PERL_MAGIC_qr);
3588 re = (regexp *)mg->mg_obj;
3589 (void)ReREFCNT_inc(re);
3593 const char * const t = SvPV_const(ret, len);
3595 const I32 osize = PL_regsize;
3598 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3599 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3601 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3603 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3609 debug_start_match(re, do_utf8, locinput, PL_regeol,
3610 "Matching embedded");
3612 startpoint = re->program + 1;
3613 ST.close_paren = 0; /* only used for GOSUB */
3614 /* borrowed from regtry */
3615 if (PL_reg_start_tmpl <= re->nparens) {
3616 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3617 if(PL_reg_start_tmp)
3618 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3620 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3623 eval_recurse_doit: /* Share code with GOSUB below this line */
3624 /* run the pattern returned from (??{...}) */
3625 ST.cp = regcppush(0); /* Save *all* the positions. */
3626 REGCP_SET(ST.lastcp);
3628 PL_regstartp = re->startp; /* essentially NOOP on GOSUB */
3629 PL_regendp = re->endp; /* essentially NOOP on GOSUB */
3631 *PL_reglastparen = 0;
3632 *PL_reglastcloseparen = 0;
3633 PL_reginput = locinput;
3636 /* XXXX This is too dramatic a measure... */
3639 ST.toggle_reg_flags = PL_reg_flags;
3640 if (re->extflags & RXf_UTF8)
3641 PL_reg_flags |= RF_utf8;
3643 PL_reg_flags &= ~RF_utf8;
3644 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3647 ST.prev_curlyx = cur_curlyx;
3651 ST.prev_eval = cur_eval;
3653 /* now continue from first node in postoned RE */
3654 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3657 /* logical is 1, /(?(?{...})X|Y)/ */
3658 sw = (bool)SvTRUE(ret);
3663 case EVAL_AB: /* cleanup after a successful (??{A})B */
3664 /* note: this is called twice; first after popping B, then A */
3665 PL_reg_flags ^= ST.toggle_reg_flags;
3669 cur_eval = ST.prev_eval;
3670 cur_curlyx = ST.prev_curlyx;
3671 /* XXXX This is too dramatic a measure... */
3676 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3677 /* note: this is called twice; first after popping B, then A */
3678 PL_reg_flags ^= ST.toggle_reg_flags;
3681 PL_reginput = locinput;
3682 REGCP_UNWIND(ST.lastcp);
3684 cur_eval = ST.prev_eval;
3685 cur_curlyx = ST.prev_curlyx;
3686 /* XXXX This is too dramatic a measure... */
3692 n = ARG(scan); /* which paren pair */
3693 PL_reg_start_tmp[n] = locinput;
3699 n = ARG(scan); /* which paren pair */
3700 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3701 PL_regendp[n] = locinput - PL_bostr;
3702 /*if (n > PL_regsize)
3704 if (n > *PL_reglastparen)
3705 *PL_reglastparen = n;
3706 *PL_reglastcloseparen = n;
3707 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3715 cursor && OP(cursor)!=END;
3716 cursor=regnext(cursor))
3718 if ( OP(cursor)==CLOSE ){
3720 if ( n <= lastopen ) {
3721 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3722 PL_regendp[n] = locinput - PL_bostr;
3723 /*if (n > PL_regsize)
3725 if (n > *PL_reglastparen)
3726 *PL_reglastparen = n;
3727 *PL_reglastcloseparen = n;
3728 if ( n == ARG(scan) || (cur_eval &&
3729 cur_eval->u.eval.close_paren == n))
3738 n = ARG(scan); /* which paren pair */
3739 sw = (bool)(*PL_reglastparen >= n && PL_regendp[n] != -1);
3742 /* reg_check_named_buff_matched returns 0 for no match */
3743 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3747 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3753 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3755 next = NEXTOPER(NEXTOPER(scan));
3757 next = scan + ARG(scan);
3758 if (OP(next) == IFTHEN) /* Fake one. */
3759 next = NEXTOPER(NEXTOPER(next));
3763 logical = scan->flags;
3766 /*******************************************************************
3768 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3769 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3770 STAR/PLUS/CURLY/CURLYN are used instead.)
3772 A*B is compiled as <CURLYX><A><WHILEM><B>
3774 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3775 state, which contains the current count, initialised to -1. It also sets
3776 cur_curlyx to point to this state, with any previous value saved in the
3779 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3780 since the pattern may possibly match zero times (i.e. it's a while {} loop
3781 rather than a do {} while loop).
3783 Each entry to WHILEM represents a successful match of A. The count in the
3784 CURLYX block is incremented, another WHILEM state is pushed, and execution
3785 passes to A or B depending on greediness and the current count.
3787 For example, if matching against the string a1a2a3b (where the aN are
3788 substrings that match /A/), then the match progresses as follows: (the
3789 pushed states are interspersed with the bits of strings matched so far):
3792 <CURLYX cnt=0><WHILEM>
3793 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3794 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3795 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3796 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3798 (Contrast this with something like CURLYM, which maintains only a single
3802 a1 <CURLYM cnt=1> a2
3803 a1 a2 <CURLYM cnt=2> a3
3804 a1 a2 a3 <CURLYM cnt=3> b
3807 Each WHILEM state block marks a point to backtrack to upon partial failure
3808 of A or B, and also contains some minor state data related to that
3809 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3810 overall state, such as the count, and pointers to the A and B ops.
3812 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3813 must always point to the *current* CURLYX block, the rules are:
3815 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3816 and set cur_curlyx to point the new block.
3818 When popping the CURLYX block after a successful or unsuccessful match,
3819 restore the previous cur_curlyx.
3821 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3822 to the outer one saved in the CURLYX block.
3824 When popping the WHILEM block after a successful or unsuccessful B match,
3825 restore the previous cur_curlyx.
3827 Here's an example for the pattern (AI* BI)*BO
3828 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3831 curlyx backtrack stack
3832 ------ ---------------
3834 CO <CO prev=NULL> <WO>
3835 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3836 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3837 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3839 At this point the pattern succeeds, and we work back down the stack to
3840 clean up, restoring as we go:
3842 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3843 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3844 CO <CO prev=NULL> <WO>
3847 *******************************************************************/
3849 #define ST st->u.curlyx
3851 case CURLYX: /* start of /A*B/ (for complex A) */
3853 /* No need to save/restore up to this paren */
3854 I32 parenfloor = scan->flags;
3856 assert(next); /* keep Coverity happy */
3857 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3860 /* XXXX Probably it is better to teach regpush to support
3861 parenfloor > PL_regsize... */
3862 if (parenfloor > (I32)*PL_reglastparen)
3863 parenfloor = *PL_reglastparen; /* Pessimization... */
3865 ST.prev_curlyx= cur_curlyx;
3867 ST.cp = PL_savestack_ix;
3869 /* these fields contain the state of the current curly.
3870 * they are accessed by subsequent WHILEMs */
3871 ST.parenfloor = parenfloor;
3872 ST.min = ARG1(scan);
3873 ST.max = ARG2(scan);
3874 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3878 ST.count = -1; /* this will be updated by WHILEM */
3879 ST.lastloc = NULL; /* this will be updated by WHILEM */
3881 PL_reginput = locinput;
3882 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3886 case CURLYX_end: /* just finished matching all of A*B */
3887 if (PL_reg_eval_set){
3888 SV *pres= GvSV(PL_replgv);
3891 sv_setsv(GvSV(PL_replgv), pres);
3896 cur_curlyx = ST.prev_curlyx;
3900 case CURLYX_end_fail: /* just failed to match all of A*B */
3902 cur_curlyx = ST.prev_curlyx;
3908 #define ST st->u.whilem
3910 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3912 /* see the discussion above about CURLYX/WHILEM */
3914 assert(cur_curlyx); /* keep Coverity happy */
3915 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3916 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3917 ST.cache_offset = 0;
3920 PL_reginput = locinput;
3922 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3923 "%*s whilem: matched %ld out of %ld..%ld\n",
3924 REPORT_CODE_OFF+depth*2, "", (long)n,
3925 (long)cur_curlyx->u.curlyx.min,
3926 (long)cur_curlyx->u.curlyx.max)
3929 /* First just match a string of min A's. */
3931 if (n < cur_curlyx->u.curlyx.min) {
3932 cur_curlyx->u.curlyx.lastloc = locinput;
3933 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3937 /* If degenerate A matches "", assume A done. */
3939 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3940 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3941 "%*s whilem: empty match detected, trying continuation...\n",
3942 REPORT_CODE_OFF+depth*2, "")
3944 goto do_whilem_B_max;
3947 /* super-linear cache processing */
3951 if (!PL_reg_maxiter) {
3952 /* start the countdown: Postpone detection until we
3953 * know the match is not *that* much linear. */
3954 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3955 /* possible overflow for long strings and many CURLYX's */
3956 if (PL_reg_maxiter < 0)
3957 PL_reg_maxiter = I32_MAX;
3958 PL_reg_leftiter = PL_reg_maxiter;
3961 if (PL_reg_leftiter-- == 0) {
3962 /* initialise cache */
3963 const I32 size = (PL_reg_maxiter + 7)/8;
3964 if (PL_reg_poscache) {
3965 if ((I32)PL_reg_poscache_size < size) {
3966 Renew(PL_reg_poscache, size, char);
3967 PL_reg_poscache_size = size;
3969 Zero(PL_reg_poscache, size, char);
3972 PL_reg_poscache_size = size;
3973 Newxz(PL_reg_poscache, size, char);
3975 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3976 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3977 PL_colors[4], PL_colors[5])
3981 if (PL_reg_leftiter < 0) {
3982 /* have we already failed at this position? */
3984 offset = (scan->flags & 0xf) - 1
3985 + (locinput - PL_bostr) * (scan->flags>>4);
3986 mask = 1 << (offset % 8);
3988 if (PL_reg_poscache[offset] & mask) {
3989 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3990 "%*s whilem: (cache) already tried at this position...\n",
3991 REPORT_CODE_OFF+depth*2, "")
3993 sayNO; /* cache records failure */
3995 ST.cache_offset = offset;
3996 ST.cache_mask = mask;
4000 /* Prefer B over A for minimal matching. */
4002 if (cur_curlyx->u.curlyx.minmod) {
4003 ST.save_curlyx = cur_curlyx;
4004 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4005 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4006 REGCP_SET(ST.lastcp);
4007 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4011 /* Prefer A over B for maximal matching. */
4013 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4014 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4015 cur_curlyx->u.curlyx.lastloc = locinput;
4016 REGCP_SET(ST.lastcp);
4017 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4020 goto do_whilem_B_max;
4024 case WHILEM_B_min: /* just matched B in a minimal match */
4025 case WHILEM_B_max: /* just matched B in a maximal match */
4026 cur_curlyx = ST.save_curlyx;
4030 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4031 cur_curlyx = ST.save_curlyx;
4032 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4033 cur_curlyx->u.curlyx.count--;
4037 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4038 REGCP_UNWIND(ST.lastcp);
4041 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4042 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4043 cur_curlyx->u.curlyx.count--;
4047 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4048 REGCP_UNWIND(ST.lastcp);
4049 regcppop(rex); /* Restore some previous $<digit>s? */
4050 PL_reginput = locinput;
4051 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4052 "%*s whilem: failed, trying continuation...\n",
4053 REPORT_CODE_OFF+depth*2, "")
4056 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4057 && ckWARN(WARN_REGEXP)
4058 && !(PL_reg_flags & RF_warned))
4060 PL_reg_flags |= RF_warned;
4061 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4062 "Complex regular subexpression recursion",
4067 ST.save_curlyx = cur_curlyx;
4068 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4069 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4072 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4073 cur_curlyx = ST.save_curlyx;
4074 REGCP_UNWIND(ST.lastcp);
4077 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4078 /* Maximum greed exceeded */
4079 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4080 && ckWARN(WARN_REGEXP)
4081 && !(PL_reg_flags & RF_warned))
4083 PL_reg_flags |= RF_warned;
4084 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4085 "%s limit (%d) exceeded",
4086 "Complex regular subexpression recursion",
4089 cur_curlyx->u.curlyx.count--;
4093 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4094 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4096 /* Try grabbing another A and see if it helps. */
4097 PL_reginput = locinput;
4098 cur_curlyx->u.curlyx.lastloc = locinput;
4099 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4100 REGCP_SET(ST.lastcp);
4101 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4105 #define ST st->u.branch
4107 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4108 next = scan + ARG(scan);
4111 scan = NEXTOPER(scan);
4114 case BRANCH: /* /(...|A|...)/ */
4115 scan = NEXTOPER(scan); /* scan now points to inner node */
4116 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4119 /* last branch; skip state push and jump direct to node */
4122 ST.lastparen = *PL_reglastparen;
4123 ST.next_branch = next;
4125 PL_reginput = locinput;
4127 /* Now go into the branch */
4129 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4131 PUSH_STATE_GOTO(BRANCH_next, scan);
4135 PL_reginput = locinput;
4136 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4137 (SV*)rex->data->data[ ARG( scan ) ];
4138 PUSH_STATE_GOTO(CUTGROUP_next,next);
4140 case CUTGROUP_next_fail:
4143 if (st->u.mark.mark_name)
4144 sv_commit = st->u.mark.mark_name;
4150 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4155 REGCP_UNWIND(ST.cp);
4156 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4158 *PL_reglastparen = n;
4159 /*dmq: *PL_reglastcloseparen = n; */
4160 scan = ST.next_branch;
4161 /* no more branches? */
4162 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4164 PerlIO_printf( Perl_debug_log,
4165 "%*s %sBRANCH failed...%s\n",
4166 REPORT_CODE_OFF+depth*2, "",
4172 continue; /* execute next BRANCH[J] op */
4180 #define ST st->u.curlym
4182 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4184 /* This is an optimisation of CURLYX that enables us to push
4185 * only a single backtracking state, no matter now many matches
4186 * there are in {m,n}. It relies on the pattern being constant
4187 * length, with no parens to influence future backrefs
4191 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4193 /* if paren positive, emulate an OPEN/CLOSE around A */
4195 U32 paren = ST.me->flags;
4196 if (paren > PL_regsize)
4198 if (paren > *PL_reglastparen)
4199 *PL_reglastparen = paren;
4200 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4208 ST.c1 = CHRTEST_UNINIT;
4211 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4214 curlym_do_A: /* execute the A in /A{m,n}B/ */
4215 PL_reginput = locinput;
4216 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4219 case CURLYM_A: /* we've just matched an A */
4220 locinput = st->locinput;
4221 nextchr = UCHARAT(locinput);
4224 /* after first match, determine A's length: u.curlym.alen */
4225 if (ST.count == 1) {
4226 if (PL_reg_match_utf8) {
4228 while (s < PL_reginput) {
4234 ST.alen = PL_reginput - locinput;
4237 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4240 PerlIO_printf(Perl_debug_log,
4241 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4242 (int)(REPORT_CODE_OFF+(depth*2)), "",
4243 (IV) ST.count, (IV)ST.alen)
4246 locinput = PL_reginput;
4248 if (cur_eval && cur_eval->u.eval.close_paren &&
4249 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4252 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4253 goto curlym_do_A; /* try to match another A */
4254 goto curlym_do_B; /* try to match B */
4256 case CURLYM_A_fail: /* just failed to match an A */
4257 REGCP_UNWIND(ST.cp);
4259 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4260 || (cur_eval && cur_eval->u.eval.close_paren &&
4261 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4264 curlym_do_B: /* execute the B in /A{m,n}B/ */
4265 PL_reginput = locinput;
4266 if (ST.c1 == CHRTEST_UNINIT) {
4267 /* calculate c1 and c2 for possible match of 1st char
4268 * following curly */
4269 ST.c1 = ST.c2 = CHRTEST_VOID;
4270 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4271 regnode *text_node = ST.B;
4272 if (! HAS_TEXT(text_node))
4273 FIND_NEXT_IMPT(text_node);
4274 if (HAS_TEXT(text_node)
4275 && PL_regkind[OP(text_node)] != REF)
4277 ST.c1 = (U8)*STRING(text_node);
4279 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4281 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4282 ? PL_fold_locale[ST.c1]
4289 PerlIO_printf(Perl_debug_log,
4290 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4291 (int)(REPORT_CODE_OFF+(depth*2)),
4294 if (ST.c1 != CHRTEST_VOID
4295 && UCHARAT(PL_reginput) != ST.c1
4296 && UCHARAT(PL_reginput) != ST.c2)
4298 /* simulate B failing */
4299 state_num = CURLYM_B_fail;
4300 goto reenter_switch;
4304 /* mark current A as captured */
4305 I32 paren = ST.me->flags;
4308 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4309 PL_regendp[paren] = PL_reginput - PL_bostr;
4310 /*dmq: *PL_reglastcloseparen = paren; */
4313 PL_regendp[paren] = -1;
4314 if (cur_eval && cur_eval->u.eval.close_paren &&
4315 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4324 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4327 case CURLYM_B_fail: /* just failed to match a B */
4328 REGCP_UNWIND(ST.cp);
4330 if (ST.count == ARG2(ST.me) /* max */)
4332 goto curlym_do_A; /* try to match a further A */
4334 /* backtrack one A */
4335 if (ST.count == ARG1(ST.me) /* min */)
4338 locinput = HOPc(locinput, -ST.alen);
4339 goto curlym_do_B; /* try to match B */
4342 #define ST st->u.curly
4344 #define CURLY_SETPAREN(paren, success) \
4347 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4348 PL_regendp[paren] = locinput - PL_bostr; \
4349 *PL_reglastcloseparen = paren; \
4352 PL_regendp[paren] = -1; \
4355 case STAR: /* /A*B/ where A is width 1 */
4359 scan = NEXTOPER(scan);
4361 case PLUS: /* /A+B/ where A is width 1 */
4365 scan = NEXTOPER(scan);
4367 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4368 ST.paren = scan->flags; /* Which paren to set */
4369 if (ST.paren > PL_regsize)
4370 PL_regsize = ST.paren;
4371 if (ST.paren > *PL_reglastparen)
4372 *PL_reglastparen = ST.paren;
4373 ST.min = ARG1(scan); /* min to match */
4374 ST.max = ARG2(scan); /* max to match */
4375 if (cur_eval && cur_eval->u.eval.close_paren &&
4376 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4380 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4382 case CURLY: /* /A{m,n}B/ where A is width 1 */
4384 ST.min = ARG1(scan); /* min to match */
4385 ST.max = ARG2(scan); /* max to match */
4386 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4389 * Lookahead to avoid useless match attempts
4390 * when we know what character comes next.
4392 * Used to only do .*x and .*?x, but now it allows
4393 * for )'s, ('s and (?{ ... })'s to be in the way
4394 * of the quantifier and the EXACT-like node. -- japhy
4397 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4399 if (HAS_TEXT(next) || JUMPABLE(next)) {
4401 regnode *text_node = next;
4403 if (! HAS_TEXT(text_node))
4404 FIND_NEXT_IMPT(text_node);
4406 if (! HAS_TEXT(text_node))
4407 ST.c1 = ST.c2 = CHRTEST_VOID;
4409 if (PL_regkind[OP(text_node)] == REF) {
4410 ST.c1 = ST.c2 = CHRTEST_VOID;
4411 goto assume_ok_easy;
4414 s = (U8*)STRING(text_node);
4418 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4419 ST.c2 = PL_fold[ST.c1];
4420 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4421 ST.c2 = PL_fold_locale[ST.c1];
4424 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4425 STRLEN ulen1, ulen2;
4426 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4427 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4429 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4430 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4432 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4434 0 : UTF8_ALLOW_ANY);
4435 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4437 0 : UTF8_ALLOW_ANY);
4439 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4441 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4446 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4453 ST.c1 = ST.c2 = CHRTEST_VOID;
4458 PL_reginput = locinput;
4461 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4464 locinput = PL_reginput;
4466 if (ST.c1 == CHRTEST_VOID)
4467 goto curly_try_B_min;
4469 ST.oldloc = locinput;
4471 /* set ST.maxpos to the furthest point along the
4472 * string that could possibly match */
4473 if (ST.max == REG_INFTY) {
4474 ST.maxpos = PL_regeol - 1;
4476 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4480 int m = ST.max - ST.min;
4481 for (ST.maxpos = locinput;
4482 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4483 ST.maxpos += UTF8SKIP(ST.maxpos);
4486 ST.maxpos = locinput + ST.max - ST.min;
4487 if (ST.maxpos >= PL_regeol)
4488 ST.maxpos = PL_regeol - 1;
4490 goto curly_try_B_min_known;
4494 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4495 locinput = PL_reginput;
4496 if (ST.count < ST.min)
4498 if ((ST.count > ST.min)
4499 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4501 /* A{m,n} must come at the end of the string, there's
4502 * no point in backing off ... */
4504 /* ...except that $ and \Z can match before *and* after
4505 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4506 We may back off by one in this case. */
4507 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4511 goto curly_try_B_max;
4516 case CURLY_B_min_known_fail:
4517 /* failed to find B in a non-greedy match where c1,c2 valid */
4518 if (ST.paren && ST.count)
4519 PL_regendp[ST.paren] = -1;
4521 PL_reginput = locinput; /* Could be reset... */
4522 REGCP_UNWIND(ST.cp);
4523 /* Couldn't or didn't -- move forward. */
4524 ST.oldloc = locinput;
4526 locinput += UTF8SKIP(locinput);
4530 curly_try_B_min_known:
4531 /* find the next place where 'B' could work, then call B */
4535 n = (ST.oldloc == locinput) ? 0 : 1;
4536 if (ST.c1 == ST.c2) {
4538 /* set n to utf8_distance(oldloc, locinput) */
4539 while (locinput <= ST.maxpos &&
4540 utf8n_to_uvchr((U8*)locinput,
4541 UTF8_MAXBYTES, &len,
4542 uniflags) != (UV)ST.c1) {
4548 /* set n to utf8_distance(oldloc, locinput) */
4549 while (locinput <= ST.maxpos) {
4551 const UV c = utf8n_to_uvchr((U8*)locinput,
4552 UTF8_MAXBYTES, &len,
4554 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4562 if (ST.c1 == ST.c2) {
4563 while (locinput <= ST.maxpos &&
4564 UCHARAT(locinput) != ST.c1)
4568 while (locinput <= ST.maxpos
4569 && UCHARAT(locinput) != ST.c1
4570 && UCHARAT(locinput) != ST.c2)
4573 n = locinput - ST.oldloc;
4575 if (locinput > ST.maxpos)
4577 /* PL_reginput == oldloc now */
4580 if (regrepeat(rex, ST.A, n, depth) < n)
4583 PL_reginput = locinput;
4584 CURLY_SETPAREN(ST.paren, ST.count);
4585 if (cur_eval && cur_eval->u.eval.close_paren &&
4586 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4589 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4594 case CURLY_B_min_fail:
4595 /* failed to find B in a non-greedy match where c1,c2 invalid */
4596 if (ST.paren && ST.count)
4597 PL_regendp[ST.paren] = -1;
4599 REGCP_UNWIND(ST.cp);
4600 /* failed -- move forward one */
4601 PL_reginput = locinput;
4602 if (regrepeat(rex, ST.A, 1, depth)) {
4604 locinput = PL_reginput;
4605 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4606 ST.count > 0)) /* count overflow ? */
4609 CURLY_SETPAREN(ST.paren, ST.count);
4610 if (cur_eval && cur_eval->u.eval.close_paren &&
4611 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4614 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4622 /* a successful greedy match: now try to match B */
4623 if (cur_eval && cur_eval->u.eval.close_paren &&
4624 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4629 if (ST.c1 != CHRTEST_VOID)
4630 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4631 UTF8_MAXBYTES, 0, uniflags)
4632 : (UV) UCHARAT(PL_reginput);
4633 /* If it could work, try it. */
4634 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4635 CURLY_SETPAREN(ST.paren, ST.count);
4636 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4641 case CURLY_B_max_fail:
4642 /* failed to find B in a greedy match */
4643 if (ST.paren && ST.count)
4644 PL_regendp[ST.paren] = -1;
4646 REGCP_UNWIND(ST.cp);
4648 if (--ST.count < ST.min)
4650 PL_reginput = locinput = HOPc(locinput, -1);
4651 goto curly_try_B_max;
4658 /* we've just finished A in /(??{A})B/; now continue with B */
4662 st->u.eval.toggle_reg_flags
4663 = cur_eval->u.eval.toggle_reg_flags;
4664 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4666 st->u.eval.prev_rex = rex; /* inner */
4667 rex = cur_eval->u.eval.prev_rex; /* outer */
4668 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4670 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4671 REGCP_SET(st->u.eval.lastcp);
4672 PL_reginput = locinput;
4674 /* Restore parens of the outer rex without popping the
4676 tmpix = PL_savestack_ix;
4677 PL_savestack_ix = cur_eval->u.eval.lastcp;
4679 PL_savestack_ix = tmpix;
4681 st->u.eval.prev_eval = cur_eval;
4682 cur_eval = cur_eval->u.eval.prev_eval;
4684 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4685 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4686 PUSH_YES_STATE_GOTO(EVAL_AB,
4687 st->u.eval.prev_eval->u.eval.B); /* match B */
4690 if (locinput < reginfo->till) {
4691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4692 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4694 (long)(locinput - PL_reg_starttry),
4695 (long)(reginfo->till - PL_reg_starttry),
4698 sayNO_SILENT; /* Cannot match: too short. */
4700 PL_reginput = locinput; /* put where regtry can find it */
4701 sayYES; /* Success! */
4703 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4705 PerlIO_printf(Perl_debug_log,
4706 "%*s %ssubpattern success...%s\n",
4707 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4708 PL_reginput = locinput; /* put where regtry can find it */
4709 sayYES; /* Success! */
4712 #define ST st->u.ifmatch
4714 case SUSPEND: /* (?>A) */
4716 PL_reginput = locinput;
4719 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4721 goto ifmatch_trivial_fail_test;
4723 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4725 ifmatch_trivial_fail_test:
4727 char * const s = HOPBACKc(locinput, scan->flags);
4732 sw = 1 - (bool)ST.wanted;
4736 next = scan + ARG(scan);
4744 PL_reginput = locinput;
4748 ST.logical = logical;
4749 /* execute body of (?...A) */
4750 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4753 case IFMATCH_A_fail: /* body of (?...A) failed */
4754 ST.wanted = !ST.wanted;
4757 case IFMATCH_A: /* body of (?...A) succeeded */
4759 sw = (bool)ST.wanted;
4761 else if (!ST.wanted)
4764 if (OP(ST.me) == SUSPEND)
4765 locinput = PL_reginput;
4767 locinput = PL_reginput = st->locinput;
4768 nextchr = UCHARAT(locinput);
4770 scan = ST.me + ARG(ST.me);
4773 continue; /* execute B */
4778 next = scan + ARG(scan);
4783 reginfo->cutpoint = PL_regeol;
4786 PL_reginput = locinput;
4788 sv_yes_mark = sv_commit = (SV*)rex->data->data[ ARG( scan ) ];
4789 PUSH_STATE_GOTO(COMMIT_next,next);
4791 case COMMIT_next_fail:
4798 #define ST st->u.mark
4800 ST.prev_mark = mark_state;
4801 ST.mark_name = sv_commit = sv_yes_mark
4802 = (SV*)rex->data->data[ ARG( scan ) ];
4804 ST.mark_loc = PL_reginput = locinput;
4805 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4807 case MARKPOINT_next:
4808 mark_state = ST.prev_mark;
4811 case MARKPOINT_next_fail:
4812 if (popmark && sv_eq(ST.mark_name,popmark))
4814 if (ST.mark_loc > startpoint)
4815 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4816 popmark = NULL; /* we found our mark */
4817 sv_commit = ST.mark_name;
4820 PerlIO_printf(Perl_debug_log,
4821 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4822 REPORT_CODE_OFF+depth*2, "",
4823 PL_colors[4], sv_commit, PL_colors[5]);
4826 mark_state = ST.prev_mark;
4827 sv_yes_mark = mark_state ?
4828 mark_state->u.mark.mark_name : NULL;
4832 PL_reginput = locinput;
4834 /* (*CUT) : if we fail we cut here*/
4835 ST.mark_name = NULL;
4836 ST.mark_loc = locinput;
4837 PUSH_STATE_GOTO(SKIP_next,next);
4839 /* (*CUT:NAME) : if there is a (*MARK:NAME) fail where it was,
4840 otherwise do nothing. Meaning we need to scan
4842 regmatch_state *cur = mark_state;
4843 SV *find = (SV*)rex->data->data[ ARG( scan ) ];
4846 if ( sv_eq( cur->u.mark.mark_name,
4849 ST.mark_name = find;
4850 PUSH_STATE_GOTO( SKIP_next, next );
4852 cur = cur->u.mark.prev_mark;
4855 /* Didn't find our (*MARK:NAME) so ignore this (*CUT:NAME) */
4857 case SKIP_next_fail:
4859 /* (*CUT:NAME) - Set up to search for the name as we
4860 collapse the stack*/
4861 popmark = ST.mark_name;
4863 /* (*CUT) - No name, we cut here.*/
4864 if (ST.mark_loc > startpoint)
4865 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4866 /* but we set sv_commit to latest mark_name if there
4867 is one so they can test to see how things lead to this
4870 sv_commit=mark_state->u.mark.mark_name;
4878 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4879 PTR2UV(scan), OP(scan));
4880 Perl_croak(aTHX_ "regexp memory corruption");
4884 /* switch break jumps here */
4885 scan = next; /* prepare to execute the next op and ... */
4886 continue; /* ... jump back to the top, reusing st */
4890 /* push a state that backtracks on success */
4891 st->u.yes.prev_yes_state = yes_state;
4895 /* push a new regex state, then continue at scan */
4897 regmatch_state *newst;
4900 regmatch_state *cur = st;
4901 regmatch_state *curyes = yes_state;
4903 regmatch_slab *slab = PL_regmatch_slab;
4904 for (;curd > -1;cur--,curd--) {
4905 if (cur < SLAB_FIRST(slab)) {
4907 cur = SLAB_LAST(slab);
4909 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
4910 REPORT_CODE_OFF + 2 + depth * 2,"",
4911 curd, reg_name[cur->resume_state],
4912 (curyes == cur) ? "yes" : ""
4915 curyes = cur->u.yes.prev_yes_state;
4918 DEBUG_STATE_pp("push")
4921 st->locinput = locinput;
4923 if (newst > SLAB_LAST(PL_regmatch_slab))
4924 newst = S_push_slab(aTHX);
4925 PL_regmatch_state = newst;
4927 locinput = PL_reginput;
4928 nextchr = UCHARAT(locinput);
4936 * We get here only if there's trouble -- normally "case END" is
4937 * the terminating point.
4939 Perl_croak(aTHX_ "corrupted regexp pointers");
4945 /* we have successfully completed a subexpression, but we must now
4946 * pop to the state marked by yes_state and continue from there */
4947 assert(st != yes_state);
4949 while (st != yes_state) {
4951 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4952 PL_regmatch_slab = PL_regmatch_slab->prev;
4953 st = SLAB_LAST(PL_regmatch_slab);
4957 DEBUG_STATE_pp("pop (no final)");
4959 DEBUG_STATE_pp("pop (yes)");
4965 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4966 || yes_state > SLAB_LAST(PL_regmatch_slab))
4968 /* not in this slab, pop slab */
4969 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4970 PL_regmatch_slab = PL_regmatch_slab->prev;
4971 st = SLAB_LAST(PL_regmatch_slab);
4973 depth -= (st - yes_state);
4976 yes_state = st->u.yes.prev_yes_state;
4977 PL_regmatch_state = st;
4980 locinput= st->locinput;
4981 nextchr = UCHARAT(locinput);
4983 state_num = st->resume_state + no_final;
4984 goto reenter_switch;
4987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4988 PL_colors[4], PL_colors[5]));
4995 PerlIO_printf(Perl_debug_log,
4996 "%*s %sfailed...%s\n",
4997 REPORT_CODE_OFF+depth*2, "",
4998 PL_colors[4], PL_colors[5])
5010 /* there's a previous state to backtrack to */
5012 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5013 PL_regmatch_slab = PL_regmatch_slab->prev;
5014 st = SLAB_LAST(PL_regmatch_slab);
5016 PL_regmatch_state = st;
5017 locinput= st->locinput;
5018 nextchr = UCHARAT(locinput);
5020 DEBUG_STATE_pp("pop");
5022 if (yes_state == st)
5023 yes_state = st->u.yes.prev_yes_state;
5025 state_num = st->resume_state + 1; /* failure = success + 1 */
5026 goto reenter_switch;
5031 if (rex->intflags & PREGf_VERBARG_SEEN) {
5032 SV *sv_err = get_sv("REGERROR", 1);
5033 SV *sv_mrk = get_sv("REGMARK", 1);
5035 sv_commit = &PL_sv_no;
5037 sv_yes_mark = &PL_sv_yes;
5040 sv_commit = &PL_sv_yes;
5041 sv_yes_mark = &PL_sv_no;
5043 sv_setsv(sv_err, sv_commit);
5044 sv_setsv(sv_mrk, sv_yes_mark);
5046 /* restore original high-water mark */
5047 PL_regmatch_slab = orig_slab;
5048 PL_regmatch_state = orig_state;
5050 /* free all slabs above current one */
5051 if (orig_slab->next) {
5052 regmatch_slab *sl = orig_slab->next;
5053 orig_slab->next = NULL;
5055 regmatch_slab * const osl = sl;
5065 - regrepeat - repeatedly match something simple, report how many
5068 * [This routine now assumes that it will only match on things of length 1.
5069 * That was true before, but now we assume scan - reginput is the count,
5070 * rather than incrementing count on every character. [Er, except utf8.]]
5073 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5076 register char *scan;
5078 register char *loceol = PL_regeol;
5079 register I32 hardcount = 0;
5080 register bool do_utf8 = PL_reg_match_utf8;
5083 if (max == REG_INFTY)
5085 else if (max < loceol - scan)
5086 loceol = scan + max;
5091 while (scan < loceol && hardcount < max && *scan != '\n') {
5092 scan += UTF8SKIP(scan);
5096 while (scan < loceol && *scan != '\n')
5103 while (scan < loceol && hardcount < max) {
5104 scan += UTF8SKIP(scan);
5114 case EXACT: /* length of string is 1 */
5116 while (scan < loceol && UCHARAT(scan) == c)
5119 case EXACTF: /* length of string is 1 */
5121 while (scan < loceol &&
5122 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5125 case EXACTFL: /* length of string is 1 */
5126 PL_reg_flags |= RF_tainted;
5128 while (scan < loceol &&
5129 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5135 while (hardcount < max && scan < loceol &&
5136 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5137 scan += UTF8SKIP(scan);
5141 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5148 LOAD_UTF8_CHARCLASS_ALNUM();
5149 while (hardcount < max && scan < loceol &&
5150 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5151 scan += UTF8SKIP(scan);
5155 while (scan < loceol && isALNUM(*scan))
5160 PL_reg_flags |= RF_tainted;
5163 while (hardcount < max && scan < loceol &&
5164 isALNUM_LC_utf8((U8*)scan)) {
5165 scan += UTF8SKIP(scan);
5169 while (scan < loceol && isALNUM_LC(*scan))
5176 LOAD_UTF8_CHARCLASS_ALNUM();
5177 while (hardcount < max && scan < loceol &&
5178 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5179 scan += UTF8SKIP(scan);
5183 while (scan < loceol && !isALNUM(*scan))
5188 PL_reg_flags |= RF_tainted;
5191 while (hardcount < max && scan < loceol &&
5192 !isALNUM_LC_utf8((U8*)scan)) {
5193 scan += UTF8SKIP(scan);
5197 while (scan < loceol && !isALNUM_LC(*scan))
5204 LOAD_UTF8_CHARCLASS_SPACE();
5205 while (hardcount < max && scan < loceol &&
5207 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5208 scan += UTF8SKIP(scan);
5212 while (scan < loceol && isSPACE(*scan))
5217 PL_reg_flags |= RF_tainted;
5220 while (hardcount < max && scan < loceol &&
5221 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5222 scan += UTF8SKIP(scan);
5226 while (scan < loceol && isSPACE_LC(*scan))
5233 LOAD_UTF8_CHARCLASS_SPACE();
5234 while (hardcount < max && scan < loceol &&
5236 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5237 scan += UTF8SKIP(scan);
5241 while (scan < loceol && !isSPACE(*scan))
5246 PL_reg_flags |= RF_tainted;
5249 while (hardcount < max && scan < loceol &&
5250 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5251 scan += UTF8SKIP(scan);
5255 while (scan < loceol && !isSPACE_LC(*scan))
5262 LOAD_UTF8_CHARCLASS_DIGIT();
5263 while (hardcount < max && scan < loceol &&
5264 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5265 scan += UTF8SKIP(scan);
5269 while (scan < loceol && isDIGIT(*scan))
5276 LOAD_UTF8_CHARCLASS_DIGIT();
5277 while (hardcount < max && scan < loceol &&
5278 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5279 scan += UTF8SKIP(scan);
5283 while (scan < loceol && !isDIGIT(*scan))
5287 default: /* Called on something of 0 width. */
5288 break; /* So match right here or not at all. */
5294 c = scan - PL_reginput;
5298 GET_RE_DEBUG_FLAGS_DECL;
5300 SV * const prop = sv_newmortal();
5301 regprop(prog, prop, p);
5302 PerlIO_printf(Perl_debug_log,
5303 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5304 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5312 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5314 - regclass_swash - prepare the utf8 swash
5318 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5324 const struct reg_data * const data = prog ? prog->data : NULL;
5326 if (data && data->count) {
5327 const U32 n = ARG(node);
5329 if (data->what[n] == 's') {
5330 SV * const rv = (SV*)data->data[n];
5331 AV * const av = (AV*)SvRV((SV*)rv);
5332 SV **const ary = AvARRAY(av);
5335 /* See the end of regcomp.c:S_regclass() for
5336 * documentation of these array elements. */
5339 a = SvROK(ary[1]) ? &ary[1] : 0;
5340 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5344 else if (si && doinit) {
5345 sw = swash_init("utf8", "", si, 1, 0);
5346 (void)av_store(av, 1, sw);
5363 - reginclass - determine if a character falls into a character class
5365 The n is the ANYOF regnode, the p is the target string, lenp
5366 is pointer to the maximum length of how far to go in the p
5367 (if the lenp is zero, UTF8SKIP(p) is used),
5368 do_utf8 tells whether the target string is in UTF-8.
5373 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5376 const char flags = ANYOF_FLAGS(n);
5382 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5383 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5384 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5385 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5386 if (len == (STRLEN)-1)
5387 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5390 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5391 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5394 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5395 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5398 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5402 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5405 if (swash_fetch(sw, p, do_utf8))
5407 else if (flags & ANYOF_FOLD) {
5408 if (!match && lenp && av) {
5410 for (i = 0; i <= av_len(av); i++) {
5411 SV* const sv = *av_fetch(av, i, FALSE);
5413 const char * const s = SvPV_const(sv, len);
5415 if (len <= plen && memEQ(s, (char*)p, len)) {
5423 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5426 to_utf8_fold(p, tmpbuf, &tmplen);
5427 if (swash_fetch(sw, tmpbuf, do_utf8))
5433 if (match && lenp && *lenp == 0)
5434 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5436 if (!match && c < 256) {
5437 if (ANYOF_BITMAP_TEST(n, c))
5439 else if (flags & ANYOF_FOLD) {
5442 if (flags & ANYOF_LOCALE) {
5443 PL_reg_flags |= RF_tainted;
5444 f = PL_fold_locale[c];
5448 if (f != c && ANYOF_BITMAP_TEST(n, f))
5452 if (!match && (flags & ANYOF_CLASS)) {
5453 PL_reg_flags |= RF_tainted;
5455 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5456 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5457 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5458 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5459 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5460 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5461 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5462 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5463 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5464 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5465 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5466 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5467 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5468 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5469 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5470 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5471 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5472 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5473 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5474 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5475 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5476 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5477 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5478 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5479 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5480 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5481 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5482 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5483 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5484 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5485 ) /* How's that for a conditional? */
5492 return (flags & ANYOF_INVERT) ? !match : match;
5496 S_reghop3(U8 *s, I32 off, const U8* lim)
5500 while (off-- && s < lim) {
5501 /* XXX could check well-formedness here */
5506 while (off++ && s > lim) {
5508 if (UTF8_IS_CONTINUED(*s)) {
5509 while (s > lim && UTF8_IS_CONTINUATION(*s))
5512 /* XXX could check well-formedness here */
5519 /* there are a bunch of places where we use two reghop3's that should
5520 be replaced with this routine. but since thats not done yet
5521 we ifdef it out - dmq
5524 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5528 while (off-- && s < rlim) {
5529 /* XXX could check well-formedness here */
5534 while (off++ && s > llim) {
5536 if (UTF8_IS_CONTINUED(*s)) {
5537 while (s > llim && UTF8_IS_CONTINUATION(*s))
5540 /* XXX could check well-formedness here */
5548 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5552 while (off-- && s < lim) {
5553 /* XXX could check well-formedness here */
5560 while (off++ && s > lim) {
5562 if (UTF8_IS_CONTINUED(*s)) {
5563 while (s > lim && UTF8_IS_CONTINUATION(*s))
5566 /* XXX could check well-formedness here */
5575 restore_pos(pTHX_ void *arg)
5578 regexp * const rex = (regexp *)arg;
5579 if (PL_reg_eval_set) {
5580 if (PL_reg_oldsaved) {
5581 rex->subbeg = PL_reg_oldsaved;
5582 rex->sublen = PL_reg_oldsavedlen;
5583 #ifdef PERL_OLD_COPY_ON_WRITE
5584 rex->saved_copy = PL_nrs;
5586 RX_MATCH_COPIED_on(rex);
5588 PL_reg_magic->mg_len = PL_reg_oldpos;
5589 PL_reg_eval_set = 0;
5590 PL_curpm = PL_reg_oldcurpm;
5595 S_to_utf8_substr(pTHX_ register regexp *prog)
5597 if (prog->float_substr && !prog->float_utf8) {
5598 SV* const sv = newSVsv(prog->float_substr);
5599 prog->float_utf8 = sv;
5600 sv_utf8_upgrade(sv);
5601 if (SvTAIL(prog->float_substr))
5603 if (prog->float_substr == prog->check_substr)
5604 prog->check_utf8 = sv;
5606 if (prog->anchored_substr && !prog->anchored_utf8) {
5607 SV* const sv = newSVsv(prog->anchored_substr);
5608 prog->anchored_utf8 = sv;
5609 sv_utf8_upgrade(sv);
5610 if (SvTAIL(prog->anchored_substr))
5612 if (prog->anchored_substr == prog->check_substr)
5613 prog->check_utf8 = sv;
5618 S_to_byte_substr(pTHX_ register regexp *prog)
5621 if (prog->float_utf8 && !prog->float_substr) {
5622 SV* sv = newSVsv(prog->float_utf8);
5623 prog->float_substr = sv;
5624 if (sv_utf8_downgrade(sv, TRUE)) {
5625 if (SvTAIL(prog->float_utf8))
5629 prog->float_substr = sv = &PL_sv_undef;
5631 if (prog->float_utf8 == prog->check_utf8)
5632 prog->check_substr = sv;
5634 if (prog->anchored_utf8 && !prog->anchored_substr) {
5635 SV* sv = newSVsv(prog->anchored_utf8);
5636 prog->anchored_substr = sv;
5637 if (sv_utf8_downgrade(sv, TRUE)) {
5638 if (SvTAIL(prog->anchored_utf8))
5642 prog->anchored_substr = sv = &PL_sv_undef;
5644 if (prog->anchored_utf8 == prog->check_utf8)
5645 prog->check_substr = sv;
5651 * c-indentation-style: bsd
5653 * indent-tabs-mode: t
5656 * ex: set ts=8 sts=4 sw=4 noet: