5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 7
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regoffs[p].end);
193 SSPUSHINT(PL_regoffs[p].start);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regoffs[p].start,
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
200 (IV)PL_regoffs[p].end
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regoffs);
205 SSPUSHINT(PL_regsize);
206 SSPUSHINT(*PL_reglastparen);
207 SSPUSHINT(*PL_reglastcloseparen);
208 SSPUSHPTR(PL_reginput);
209 #define REGCP_FRAME_ELEMS 2
210 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
211 * are needed for the regexp context stack bookkeeping. */
212 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
213 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
218 /* These are needed since we do not localize EVAL nodes: */
219 #define REGCP_SET(cp) \
221 PerlIO_printf(Perl_debug_log, \
222 " Setting an EVAL scope, savestack=%"IVdf"\n", \
223 (IV)PL_savestack_ix)); \
226 #define REGCP_UNWIND(cp) \
228 if (cp != PL_savestack_ix) \
229 PerlIO_printf(Perl_debug_log, \
230 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
231 (IV)(cp), (IV)PL_savestack_ix)); \
235 S_regcppop(pTHX_ const regexp *rex)
241 GET_RE_DEBUG_FLAGS_DECL;
243 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
245 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
246 i = SSPOPINT; /* Parentheses elements to pop. */
247 input = (char *) SSPOPPTR;
248 *PL_reglastcloseparen = SSPOPINT;
249 *PL_reglastparen = SSPOPINT;
250 PL_regsize = SSPOPINT;
251 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
254 /* Now restore the parentheses context. */
255 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
256 i > 0; i -= REGCP_PAREN_ELEMS) {
258 U32 paren = (U32)SSPOPINT;
259 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
260 PL_regoffs[paren].start = SSPOPINT;
262 if (paren <= *PL_reglastparen)
263 PL_regoffs[paren].end = tmps;
265 PerlIO_printf(Perl_debug_log,
266 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
267 (UV)paren, (IV)PL_regoffs[paren].start,
268 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
269 (IV)PL_regoffs[paren].end,
270 (paren > *PL_reglastparen ? "(no)" : ""));
274 if (*PL_reglastparen + 1 <= rex->nparens) {
275 PerlIO_printf(Perl_debug_log,
276 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
277 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
281 /* It would seem that the similar code in regtry()
282 * already takes care of this, and in fact it is in
283 * a better location to since this code can #if 0-ed out
284 * but the code in regtry() is needed or otherwise tests
285 * requiring null fields (pat.t#187 and split.t#{13,14}
286 * (as of patchlevel 7877) will fail. Then again,
287 * this code seems to be necessary or otherwise
288 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
289 * --jhi updated by dapm */
290 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
292 PL_regoffs[i].start = -1;
293 PL_regoffs[i].end = -1;
299 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
302 * pregexec and friends
305 #ifndef PERL_IN_XSUB_RE
307 - pregexec - match a regexp against a string
310 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
311 char *strbeg, I32 minend, SV *screamer, U32 nosave)
312 /* strend: pointer to null at end of string */
313 /* strbeg: real beginning of string */
314 /* minend: end of match must be >=minend after stringarg. */
315 /* nosave: For optimizations. */
318 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
319 nosave ? 0 : REXEC_COPY_STR);
324 * Need to implement the following flags for reg_anch:
326 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
328 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
329 * INTUIT_AUTORITATIVE_ML
330 * INTUIT_ONCE_NOML - Intuit can match in one location only.
333 * Another flag for this function: SECOND_TIME (so that float substrs
334 * with giant delta may be not rechecked).
337 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
339 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
340 Otherwise, only SvCUR(sv) is used to get strbeg. */
342 /* XXXX We assume that strpos is strbeg unless sv. */
344 /* XXXX Some places assume that there is a fixed substring.
345 An update may be needed if optimizer marks as "INTUITable"
346 RExen without fixed substrings. Similarly, it is assumed that
347 lengths of all the strings are no more than minlen, thus they
348 cannot come from lookahead.
349 (Or minlen should take into account lookahead.)
350 NOTE: Some of this comment is not correct. minlen does now take account
351 of lookahead/behind. Further research is required. -- demerphq
355 /* A failure to find a constant substring means that there is no need to make
356 an expensive call to REx engine, thus we celebrate a failure. Similarly,
357 finding a substring too deep into the string means that less calls to
358 regtry() should be needed.
360 REx compiler's optimizer found 4 possible hints:
361 a) Anchored substring;
363 c) Whether we are anchored (beginning-of-line or \G);
364 d) First node (of those at offset 0) which may distingush positions;
365 We use a)b)d) and multiline-part of c), and try to find a position in the
366 string which does not contradict any of them.
369 /* Most of decisions we do here should have been done at compile time.
370 The nodes of the REx which we used for the search should have been
371 deleted from the finite automaton. */
374 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
375 char *strend, const U32 flags, re_scream_pos_data *data)
378 struct regexp *const prog = (struct regexp *)SvANY(rx);
379 register I32 start_shift = 0;
380 /* Should be nonnegative! */
381 register I32 end_shift = 0;
386 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
388 register char *other_last = NULL; /* other substr checked before this */
389 char *check_at = NULL; /* check substr found at this pos */
390 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
391 RXi_GET_DECL(prog,progi);
393 const char * const i_strpos = strpos;
396 GET_RE_DEBUG_FLAGS_DECL;
398 RX_MATCH_UTF8_set(rx,do_utf8);
401 PL_reg_flags |= RF_utf8;
404 debug_start_match(rx, do_utf8, strpos, strend,
405 sv ? "Guessing start of match in sv for"
406 : "Guessing start of match in string for");
409 /* CHR_DIST() would be more correct here but it makes things slow. */
410 if (prog->minlen > strend - strpos) {
411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
412 "String too short... [re_intuit_start]\n"));
416 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
419 if (!prog->check_utf8 && prog->check_substr)
420 to_utf8_substr(prog);
421 check = prog->check_utf8;
423 if (!prog->check_substr && prog->check_utf8)
424 to_byte_substr(prog);
425 check = prog->check_substr;
427 if (check == &PL_sv_undef) {
428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
429 "Non-utf8 string cannot match utf8 check string\n"));
432 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
433 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
434 || ( (prog->extflags & RXf_ANCH_BOL)
435 && !multiline ) ); /* Check after \n? */
438 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
439 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
440 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
442 && (strpos != strbeg)) {
443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
446 if (prog->check_offset_min == prog->check_offset_max &&
447 !(prog->extflags & RXf_CANY_SEEN)) {
448 /* Substring at constant offset from beg-of-str... */
451 s = HOP3c(strpos, prog->check_offset_min, strend);
454 slen = SvCUR(check); /* >= 1 */
456 if ( strend - s > slen || strend - s < slen - 1
457 || (strend - s == slen && strend[-1] != '\n')) {
458 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
461 /* Now should match s[0..slen-2] */
463 if (slen && (*SvPVX_const(check) != *s
465 && memNE(SvPVX_const(check), s, slen)))) {
467 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
471 else if (*SvPVX_const(check) != *s
472 || ((slen = SvCUR(check)) > 1
473 && memNE(SvPVX_const(check), s, slen)))
476 goto success_at_start;
479 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
481 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
482 end_shift = prog->check_end_shift;
485 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
486 - (SvTAIL(check) != 0);
487 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
489 if (end_shift < eshift)
493 else { /* Can match at random position */
496 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
497 end_shift = prog->check_end_shift;
499 /* end shift should be non negative here */
502 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
504 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
505 (IV)end_shift, RX_PRECOMP(prog));
509 /* Find a possible match in the region s..strend by looking for
510 the "check" substring in the region corrected by start/end_shift. */
513 I32 srch_start_shift = start_shift;
514 I32 srch_end_shift = end_shift;
515 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
516 srch_end_shift -= ((strbeg - s) - srch_start_shift);
517 srch_start_shift = strbeg - s;
519 DEBUG_OPTIMISE_MORE_r({
520 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
521 (IV)prog->check_offset_min,
522 (IV)srch_start_shift,
524 (IV)prog->check_end_shift);
527 if (flags & REXEC_SCREAM) {
528 I32 p = -1; /* Internal iterator of scream. */
529 I32 * const pp = data ? data->scream_pos : &p;
531 if (PL_screamfirst[BmRARE(check)] >= 0
532 || ( BmRARE(check) == '\n'
533 && (BmPREVIOUS(check) == SvCUR(check) - 1)
535 s = screaminstr(sv, check,
536 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
539 /* we may be pointing at the wrong string */
540 if (s && RXp_MATCH_COPIED(prog))
541 s = strbeg + (s - SvPVX_const(sv));
543 *data->scream_olds = s;
548 if (prog->extflags & RXf_CANY_SEEN) {
549 start_point= (U8*)(s + srch_start_shift);
550 end_point= (U8*)(strend - srch_end_shift);
552 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
553 end_point= HOP3(strend, -srch_end_shift, strbeg);
555 DEBUG_OPTIMISE_MORE_r({
556 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
557 (int)(end_point - start_point),
558 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
562 s = fbm_instr( start_point, end_point,
563 check, multiline ? FBMrf_MULTILINE : 0);
566 /* Update the count-of-usability, remove useless subpatterns,
570 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
571 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
572 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
573 (s ? "Found" : "Did not find"),
574 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
575 ? "anchored" : "floating"),
578 (s ? " at offset " : "...\n") );
583 /* Finish the diagnostic message */
584 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
586 /* XXX dmq: first branch is for positive lookbehind...
587 Our check string is offset from the beginning of the pattern.
588 So we need to do any stclass tests offset forward from that
597 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
598 Start with the other substr.
599 XXXX no SCREAM optimization yet - and a very coarse implementation
600 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
601 *always* match. Probably should be marked during compile...
602 Probably it is right to do no SCREAM here...
605 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
606 : (prog->float_substr && prog->anchored_substr))
608 /* Take into account the "other" substring. */
609 /* XXXX May be hopelessly wrong for UTF... */
612 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
615 char * const last = HOP3c(s, -start_shift, strbeg);
617 char * const saved_s = s;
620 t = s - prog->check_offset_max;
621 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
623 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
628 t = HOP3c(t, prog->anchored_offset, strend);
629 if (t < other_last) /* These positions already checked */
631 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
634 /* XXXX It is not documented what units *_offsets are in.
635 We assume bytes, but this is clearly wrong.
636 Meaning this code needs to be carefully reviewed for errors.
640 /* On end-of-str: see comment below. */
641 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
642 if (must == &PL_sv_undef) {
644 DEBUG_r(must = prog->anchored_utf8); /* for debug */
649 HOP3(HOP3(last1, prog->anchored_offset, strend)
650 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
652 multiline ? FBMrf_MULTILINE : 0
655 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
656 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
657 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
658 (s ? "Found" : "Contradicts"),
659 quoted, RE_SV_TAIL(must));
664 if (last1 >= last2) {
665 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
666 ", giving up...\n"));
669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
670 ", trying floating at offset %ld...\n",
671 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
672 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
673 s = HOP3c(last, 1, strend);
677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
678 (long)(s - i_strpos)));
679 t = HOP3c(s, -prog->anchored_offset, strbeg);
680 other_last = HOP3c(s, 1, strend);
688 else { /* Take into account the floating substring. */
690 char * const saved_s = s;
693 t = HOP3c(s, -start_shift, strbeg);
695 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
696 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
697 last = HOP3c(t, prog->float_max_offset, strend);
698 s = HOP3c(t, prog->float_min_offset, strend);
701 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
702 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
703 /* fbm_instr() takes into account exact value of end-of-str
704 if the check is SvTAIL(ed). Since false positives are OK,
705 and end-of-str is not later than strend we are OK. */
706 if (must == &PL_sv_undef) {
708 DEBUG_r(must = prog->float_utf8); /* for debug message */
711 s = fbm_instr((unsigned char*)s,
712 (unsigned char*)last + SvCUR(must)
714 must, multiline ? FBMrf_MULTILINE : 0);
716 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
717 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
718 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
719 (s ? "Found" : "Contradicts"),
720 quoted, RE_SV_TAIL(must));
724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
725 ", giving up...\n"));
728 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
729 ", trying anchored starting at offset %ld...\n",
730 (long)(saved_s + 1 - i_strpos)));
732 s = HOP3c(t, 1, strend);
736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
737 (long)(s - i_strpos)));
738 other_last = s; /* Fix this later. --Hugo */
748 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
750 DEBUG_OPTIMISE_MORE_r(
751 PerlIO_printf(Perl_debug_log,
752 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
753 (IV)prog->check_offset_min,
754 (IV)prog->check_offset_max,
762 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
764 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
767 /* Fixed substring is found far enough so that the match
768 cannot start at strpos. */
770 if (ml_anch && t[-1] != '\n') {
771 /* Eventually fbm_*() should handle this, but often
772 anchored_offset is not 0, so this check will not be wasted. */
773 /* XXXX In the code below we prefer to look for "^" even in
774 presence of anchored substrings. And we search even
775 beyond the found float position. These pessimizations
776 are historical artefacts only. */
778 while (t < strend - prog->minlen) {
780 if (t < check_at - prog->check_offset_min) {
781 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
782 /* Since we moved from the found position,
783 we definitely contradict the found anchored
784 substr. Due to the above check we do not
785 contradict "check" substr.
786 Thus we can arrive here only if check substr
787 is float. Redo checking for "other"=="fixed".
790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
791 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
792 goto do_other_anchored;
794 /* We don't contradict the found floating substring. */
795 /* XXXX Why not check for STCLASS? */
797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
798 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
801 /* Position contradicts check-string */
802 /* XXXX probably better to look for check-string
803 than for "\n", so one should lower the limit for t? */
804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
805 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
806 other_last = strpos = s = t + 1;
811 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
812 PL_colors[0], PL_colors[1]));
816 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
817 PL_colors[0], PL_colors[1]));
821 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
824 /* The found string does not prohibit matching at strpos,
825 - no optimization of calling REx engine can be performed,
826 unless it was an MBOL and we are not after MBOL,
827 or a future STCLASS check will fail this. */
829 /* Even in this situation we may use MBOL flag if strpos is offset
830 wrt the start of the string. */
831 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
832 && (strpos != strbeg) && strpos[-1] != '\n'
833 /* May be due to an implicit anchor of m{.*foo} */
834 && !(prog->intflags & PREGf_IMPLICIT))
839 DEBUG_EXECUTE_r( if (ml_anch)
840 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
841 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
844 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
846 prog->check_utf8 /* Could be deleted already */
847 && --BmUSEFUL(prog->check_utf8) < 0
848 && (prog->check_utf8 == prog->float_utf8)
850 prog->check_substr /* Could be deleted already */
851 && --BmUSEFUL(prog->check_substr) < 0
852 && (prog->check_substr == prog->float_substr)
855 /* If flags & SOMETHING - do not do it many times on the same match */
856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
857 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
858 if (do_utf8 ? prog->check_substr : prog->check_utf8)
859 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
860 prog->check_substr = prog->check_utf8 = NULL; /* disable */
861 prog->float_substr = prog->float_utf8 = NULL; /* clear */
862 check = NULL; /* abort */
864 /* XXXX This is a remnant of the old implementation. It
865 looks wasteful, since now INTUIT can use many
867 prog->extflags &= ~RXf_USE_INTUIT;
874 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
875 /* trie stclasses are too expensive to use here, we are better off to
876 leave it to regmatch itself */
877 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
878 /* minlen == 0 is possible if regstclass is \b or \B,
879 and the fixed substr is ''$.
880 Since minlen is already taken into account, s+1 is before strend;
881 accidentally, minlen >= 1 guaranties no false positives at s + 1
882 even for \b or \B. But (minlen? 1 : 0) below assumes that
883 regstclass does not come from lookahead... */
884 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
885 This leaves EXACTF only, which is dealt with in find_byclass(). */
886 const U8* const str = (U8*)STRING(progi->regstclass);
887 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
888 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
891 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
892 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
893 else if (prog->float_substr || prog->float_utf8)
894 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
899 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
902 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
905 const char *what = NULL;
907 if (endpos == strend) {
908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
909 "Could not match STCLASS...\n") );
912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
913 "This position contradicts STCLASS...\n") );
914 if ((prog->extflags & RXf_ANCH) && !ml_anch)
916 /* Contradict one of substrings */
917 if (prog->anchored_substr || prog->anchored_utf8) {
918 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
919 DEBUG_EXECUTE_r( what = "anchored" );
921 s = HOP3c(t, 1, strend);
922 if (s + start_shift + end_shift > strend) {
923 /* XXXX Should be taken into account earlier? */
924 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
925 "Could not match STCLASS...\n") );
930 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
931 "Looking for %s substr starting at offset %ld...\n",
932 what, (long)(s + start_shift - i_strpos)) );
935 /* Have both, check_string is floating */
936 if (t + start_shift >= check_at) /* Contradicts floating=check */
937 goto retry_floating_check;
938 /* Recheck anchored substring, but not floating... */
942 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
943 "Looking for anchored substr starting at offset %ld...\n",
944 (long)(other_last - i_strpos)) );
945 goto do_other_anchored;
947 /* Another way we could have checked stclass at the
948 current position only: */
953 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
954 "Looking for /%s^%s/m starting at offset %ld...\n",
955 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
958 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
960 /* Check is floating subtring. */
961 retry_floating_check:
962 t = check_at - start_shift;
963 DEBUG_EXECUTE_r( what = "floating" );
964 goto hop_and_restart;
967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
968 "By STCLASS: moving %ld --> %ld\n",
969 (long)(t - i_strpos), (long)(s - i_strpos))
973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
974 "Does not contradict STCLASS...\n");
979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
980 PL_colors[4], (check ? "Guessed" : "Giving up"),
981 PL_colors[5], (long)(s - i_strpos)) );
984 fail_finish: /* Substring not found */
985 if (prog->check_substr || prog->check_utf8) /* could be removed already */
986 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
988 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
989 PL_colors[4], PL_colors[5]));
993 #define DECL_TRIE_TYPE(scan) \
994 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
995 trie_type = (scan->flags != EXACT) \
996 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
997 : (do_utf8 ? trie_utf8 : trie_plain)
999 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1000 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1001 switch (trie_type) { \
1002 case trie_utf8_fold: \
1003 if ( foldlen>0 ) { \
1004 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1009 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1010 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1011 foldlen -= UNISKIP( uvc ); \
1012 uscan = foldbuf + UNISKIP( uvc ); \
1015 case trie_latin_utf8_fold: \
1016 if ( foldlen>0 ) { \
1017 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1023 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1024 foldlen -= UNISKIP( uvc ); \
1025 uscan = foldbuf + UNISKIP( uvc ); \
1029 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1037 charid = trie->charmap[ uvc ]; \
1041 if (widecharmap) { \
1042 SV** const svpp = hv_fetch(widecharmap, \
1043 (char*)&uvc, sizeof(UV), 0); \
1045 charid = (U16)SvIV(*svpp); \
1050 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1052 char *my_strend= (char *)strend; \
1055 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
1056 m, NULL, ln, (bool)UTF)) \
1057 && (!reginfo || regtry(reginfo, &s)) ) \
1060 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1061 uvchr_to_utf8(tmpbuf, c); \
1062 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1064 && (f == c1 || f == c2) \
1066 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1067 m, NULL, ln, (bool)UTF)) \
1068 && (!reginfo || regtry(reginfo, &s)) ) \
1074 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1078 && (ln == 1 || !(OP(c) == EXACTF \
1080 : ibcmp_locale(s, m, ln))) \
1081 && (!reginfo || regtry(reginfo, &s)) ) \
1087 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1089 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1095 #define REXEC_FBC_SCAN(CoDe) \
1097 while (s < strend) { \
1103 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1104 REXEC_FBC_UTF8_SCAN( \
1106 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1115 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1118 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1127 #define REXEC_FBC_TRYIT \
1128 if ((!reginfo || regtry(reginfo, &s))) \
1131 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1133 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1136 REXEC_FBC_CLASS_SCAN(CoNd); \
1140 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1143 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1146 REXEC_FBC_CLASS_SCAN(CoNd); \
1150 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1151 PL_reg_flags |= RF_tainted; \
1153 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1156 REXEC_FBC_CLASS_SCAN(CoNd); \
1160 #define DUMP_EXEC_POS(li,s,doutf8) \
1161 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1163 /* We know what class REx starts with. Try to find this position... */
1164 /* if reginfo is NULL, its a dryrun */
1165 /* annoyingly all the vars in this routine have different names from their counterparts
1166 in regmatch. /grrr */
1169 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1170 const char *strend, regmatch_info *reginfo)
1173 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1177 register STRLEN uskip;
1181 register I32 tmp = 1; /* Scratch variable? */
1182 register const bool do_utf8 = PL_reg_match_utf8;
1183 RXi_GET_DECL(prog,progi);
1185 /* We know what class it must start with. */
1189 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1190 !UTF8_IS_INVARIANT((U8)s[0]) ?
1191 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1192 REGINCLASS(prog, c, (U8*)s));
1195 while (s < strend) {
1198 if (REGINCLASS(prog, c, (U8*)s) ||
1199 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1200 /* The assignment of 2 is intentional:
1201 * for the folded sharp s, the skip is 2. */
1202 (skip = SHARP_S_SKIP))) {
1203 if (tmp && (!reginfo || regtry(reginfo, &s)))
1216 if (tmp && (!reginfo || regtry(reginfo, &s)))
1224 ln = STR_LEN(c); /* length to match in octets/bytes */
1225 lnc = (I32) ln; /* length to match in characters */
1227 STRLEN ulen1, ulen2;
1229 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1230 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1231 /* used by commented-out code below */
1232 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1234 /* XXX: Since the node will be case folded at compile
1235 time this logic is a little odd, although im not
1236 sure that its actually wrong. --dmq */
1238 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1239 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1241 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1242 codepoint of the first character in the converted
1243 form, yet originally we did the extra step.
1244 No tests fail by commenting this code out however
1245 so Ive left it out. -- dmq.
1247 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1249 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1254 while (sm < ((U8 *) m + ln)) {
1269 c2 = PL_fold_locale[c1];
1271 e = HOP3c(strend, -((I32)lnc), s);
1273 if (!reginfo && e < s)
1274 e = s; /* Due to minlen logic of intuit() */
1276 /* The idea in the EXACTF* cases is to first find the
1277 * first character of the EXACTF* node and then, if
1278 * necessary, case-insensitively compare the full
1279 * text of the node. The c1 and c2 are the first
1280 * characters (though in Unicode it gets a bit
1281 * more complicated because there are more cases
1282 * than just upper and lower: one needs to use
1283 * the so-called folding case for case-insensitive
1284 * matching (called "loose matching" in Unicode).
1285 * ibcmp_utf8() will do just that. */
1287 if (do_utf8 || UTF) {
1289 U8 tmpbuf [UTF8_MAXBYTES+1];
1292 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1294 /* Upper and lower of 1st char are equal -
1295 * probably not a "letter". */
1298 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1303 REXEC_FBC_EXACTISH_CHECK(c == c1);
1309 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1315 /* Handle some of the three Greek sigmas cases.
1316 * Note that not all the possible combinations
1317 * are handled here: some of them are handled
1318 * by the standard folding rules, and some of
1319 * them (the character class or ANYOF cases)
1320 * are handled during compiletime in
1321 * regexec.c:S_regclass(). */
1322 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1323 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1324 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1326 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1331 /* Neither pattern nor string are UTF8 */
1333 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1335 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1339 PL_reg_flags |= RF_tainted;
1346 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1347 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1349 tmp = ((OP(c) == BOUND ?
1350 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1351 LOAD_UTF8_CHARCLASS_ALNUM();
1352 REXEC_FBC_UTF8_SCAN(
1353 if (tmp == !(OP(c) == BOUND ?
1354 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1355 isALNUM_LC_utf8((U8*)s)))
1363 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1364 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1367 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1373 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1377 PL_reg_flags |= RF_tainted;
1384 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1385 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1387 tmp = ((OP(c) == NBOUND ?
1388 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1389 LOAD_UTF8_CHARCLASS_ALNUM();
1390 REXEC_FBC_UTF8_SCAN(
1391 if (tmp == !(OP(c) == NBOUND ?
1392 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1393 isALNUM_LC_utf8((U8*)s)))
1395 else REXEC_FBC_TRYIT;
1399 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1400 tmp = ((OP(c) == NBOUND ?
1401 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1404 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1406 else REXEC_FBC_TRYIT;
1409 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1413 REXEC_FBC_CSCAN_PRELOAD(
1414 LOAD_UTF8_CHARCLASS_ALNUM(),
1415 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1419 REXEC_FBC_CSCAN_TAINT(
1420 isALNUM_LC_utf8((U8*)s),
1424 REXEC_FBC_CSCAN_PRELOAD(
1425 LOAD_UTF8_CHARCLASS_ALNUM(),
1426 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1430 REXEC_FBC_CSCAN_TAINT(
1431 !isALNUM_LC_utf8((U8*)s),
1435 REXEC_FBC_CSCAN_PRELOAD(
1436 LOAD_UTF8_CHARCLASS_SPACE(),
1437 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1441 REXEC_FBC_CSCAN_TAINT(
1442 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1446 REXEC_FBC_CSCAN_PRELOAD(
1447 LOAD_UTF8_CHARCLASS_SPACE(),
1448 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1452 REXEC_FBC_CSCAN_TAINT(
1453 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1457 REXEC_FBC_CSCAN_PRELOAD(
1458 LOAD_UTF8_CHARCLASS_DIGIT(),
1459 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1463 REXEC_FBC_CSCAN_TAINT(
1464 isDIGIT_LC_utf8((U8*)s),
1468 REXEC_FBC_CSCAN_PRELOAD(
1469 LOAD_UTF8_CHARCLASS_DIGIT(),
1470 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1474 REXEC_FBC_CSCAN_TAINT(
1475 !isDIGIT_LC_utf8((U8*)s),
1481 is_LNBREAK_latin1(s)
1491 !is_VERTWS_latin1(s)
1496 is_HORIZWS_latin1(s)
1500 !is_HORIZWS_utf8(s),
1501 !is_HORIZWS_latin1(s)
1507 /* what trie are we using right now */
1509 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1511 = (reg_trie_data*)progi->data->data[ aho->trie ];
1512 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1514 const char *last_start = strend - trie->minlen;
1516 const char *real_start = s;
1518 STRLEN maxlen = trie->maxlen;
1520 U8 **points; /* map of where we were in the input string
1521 when reading a given char. For ASCII this
1522 is unnecessary overhead as the relationship
1523 is always 1:1, but for Unicode, especially
1524 case folded Unicode this is not true. */
1525 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1529 GET_RE_DEBUG_FLAGS_DECL;
1531 /* We can't just allocate points here. We need to wrap it in
1532 * an SV so it gets freed properly if there is a croak while
1533 * running the match */
1536 sv_points=newSV(maxlen * sizeof(U8 *));
1537 SvCUR_set(sv_points,
1538 maxlen * sizeof(U8 *));
1539 SvPOK_on(sv_points);
1540 sv_2mortal(sv_points);
1541 points=(U8**)SvPV_nolen(sv_points );
1542 if ( trie_type != trie_utf8_fold
1543 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1546 bitmap=(U8*)trie->bitmap;
1548 bitmap=(U8*)ANYOF_BITMAP(c);
1550 /* this is the Aho-Corasick algorithm modified a touch
1551 to include special handling for long "unknown char"
1552 sequences. The basic idea being that we use AC as long
1553 as we are dealing with a possible matching char, when
1554 we encounter an unknown char (and we have not encountered
1555 an accepting state) we scan forward until we find a legal
1557 AC matching is basically that of trie matching, except
1558 that when we encounter a failing transition, we fall back
1559 to the current states "fail state", and try the current char
1560 again, a process we repeat until we reach the root state,
1561 state 1, or a legal transition. If we fail on the root state
1562 then we can either terminate if we have reached an accepting
1563 state previously, or restart the entire process from the beginning
1567 while (s <= last_start) {
1568 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1576 U8 *uscan = (U8*)NULL;
1577 U8 *leftmost = NULL;
1579 U32 accepted_word= 0;
1583 while ( state && uc <= (U8*)strend ) {
1585 U32 word = aho->states[ state ].wordnum;
1589 DEBUG_TRIE_EXECUTE_r(
1590 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1591 dump_exec_pos( (char *)uc, c, strend, real_start,
1592 (char *)uc, do_utf8 );
1593 PerlIO_printf( Perl_debug_log,
1594 " Scanning for legal start char...\n");
1597 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1602 if (uc >(U8*)last_start) break;
1606 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1607 if (!leftmost || lpos < leftmost) {
1608 DEBUG_r(accepted_word=word);
1614 points[pointpos++ % maxlen]= uc;
1615 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1616 uscan, len, uvc, charid, foldlen,
1618 DEBUG_TRIE_EXECUTE_r({
1619 dump_exec_pos( (char *)uc, c, strend, real_start,
1621 PerlIO_printf(Perl_debug_log,
1622 " Charid:%3u CP:%4"UVxf" ",
1628 word = aho->states[ state ].wordnum;
1630 base = aho->states[ state ].trans.base;
1632 DEBUG_TRIE_EXECUTE_r({
1634 dump_exec_pos( (char *)uc, c, strend, real_start,
1636 PerlIO_printf( Perl_debug_log,
1637 "%sState: %4"UVxf", word=%"UVxf,
1638 failed ? " Fail transition to " : "",
1639 (UV)state, (UV)word);
1644 (base + charid > trie->uniquecharcount )
1645 && (base + charid - 1 - trie->uniquecharcount
1647 && trie->trans[base + charid - 1 -
1648 trie->uniquecharcount].check == state
1649 && (tmp=trie->trans[base + charid - 1 -
1650 trie->uniquecharcount ].next))
1652 DEBUG_TRIE_EXECUTE_r(
1653 PerlIO_printf( Perl_debug_log," - legal\n"));
1658 DEBUG_TRIE_EXECUTE_r(
1659 PerlIO_printf( Perl_debug_log," - fail\n"));
1661 state = aho->fail[state];
1665 /* we must be accepting here */
1666 DEBUG_TRIE_EXECUTE_r(
1667 PerlIO_printf( Perl_debug_log," - accepting\n"));
1676 if (!state) state = 1;
1679 if ( aho->states[ state ].wordnum ) {
1680 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1681 if (!leftmost || lpos < leftmost) {
1682 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1687 s = (char*)leftmost;
1688 DEBUG_TRIE_EXECUTE_r({
1690 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1691 (UV)accepted_word, (IV)(s - real_start)
1694 if (!reginfo || regtry(reginfo, &s)) {
1700 DEBUG_TRIE_EXECUTE_r({
1701 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1704 DEBUG_TRIE_EXECUTE_r(
1705 PerlIO_printf( Perl_debug_log,"No match.\n"));
1714 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1723 S_swap_match_buff (pTHX_ regexp *prog) {
1724 regexp_paren_pair *t;
1727 /* We have to be careful. If the previous successful match
1728 was from this regex we don't want a subsequent paritally
1729 successful match to clobber the old results.
1730 So when we detect this possibility we add a swap buffer
1731 to the re, and switch the buffer each match. If we fail
1732 we switch it back, otherwise we leave it swapped.
1734 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
1737 prog->swap = prog->offs;
1743 - regexec_flags - match a regexp against a string
1746 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1747 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1748 /* strend: pointer to null at end of string */
1749 /* strbeg: real beginning of string */
1750 /* minend: end of match must be >=minend after stringarg. */
1751 /* data: May be used for some additional optimizations.
1752 Currently its only used, with a U32 cast, for transmitting
1753 the ganch offset when doing a /g match. This will change */
1754 /* nosave: For optimizations. */
1757 struct regexp *const prog = (struct regexp *)SvANY(rx);
1758 /*register*/ char *s;
1759 register regnode *c;
1760 /*register*/ char *startpos = stringarg;
1761 I32 minlen; /* must match at least this many chars */
1762 I32 dontbother = 0; /* how many characters not to try at end */
1763 I32 end_shift = 0; /* Same for the end. */ /* CC */
1764 I32 scream_pos = -1; /* Internal iterator of scream. */
1765 char *scream_olds = NULL;
1766 const bool do_utf8 = (bool)DO_UTF8(sv);
1768 RXi_GET_DECL(prog,progi);
1769 regmatch_info reginfo; /* create some info to pass to regtry etc */
1770 bool swap_on_fail = 0;
1772 GET_RE_DEBUG_FLAGS_DECL;
1774 PERL_UNUSED_ARG(data);
1776 /* Be paranoid... */
1777 if (prog == NULL || startpos == NULL) {
1778 Perl_croak(aTHX_ "NULL regexp parameter");
1782 multiline = prog->extflags & RXf_PMf_MULTILINE;
1783 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
1785 RX_MATCH_UTF8_set(rx, do_utf8);
1787 debug_start_match(rx, do_utf8, startpos, strend,
1791 minlen = prog->minlen;
1793 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1795 "String too short [regexec_flags]...\n"));
1800 /* Check validity of program. */
1801 if (UCHARAT(progi->program) != REG_MAGIC) {
1802 Perl_croak(aTHX_ "corrupted regexp program");
1806 PL_reg_eval_set = 0;
1810 PL_reg_flags |= RF_utf8;
1812 /* Mark beginning of line for ^ and lookbehind. */
1813 reginfo.bol = startpos; /* XXX not used ??? */
1817 /* Mark end of line for $ (and such) */
1820 /* see how far we have to get to not match where we matched before */
1821 reginfo.till = startpos+minend;
1823 /* If there is a "must appear" string, look for it. */
1826 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1829 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1830 reginfo.ganch = startpos + prog->gofs;
1831 else if (sv && SvTYPE(sv) >= SVt_PVMG
1833 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1834 && mg->mg_len >= 0) {
1835 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1836 if (prog->extflags & RXf_ANCH_GPOS) {
1837 if (s > reginfo.ganch)
1839 s = reginfo.ganch - prog->gofs;
1843 reginfo.ganch = strbeg + PTR2UV(data);
1844 } else /* pos() not defined */
1845 reginfo.ganch = strbeg;
1847 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1849 swap_match_buff(prog); /* do we need a save destructor here for
1852 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1853 re_scream_pos_data d;
1855 d.scream_olds = &scream_olds;
1856 d.scream_pos = &scream_pos;
1857 s = re_intuit_start(rx, sv, s, strend, flags, &d);
1859 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1860 goto phooey; /* not present */
1866 /* Simplest case: anchored match need be tried only once. */
1867 /* [unless only anchor is BOL and multiline is set] */
1868 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1869 if (s == startpos && regtry(®info, &startpos))
1871 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1872 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1877 dontbother = minlen - 1;
1878 end = HOP3c(strend, -dontbother, strbeg) - 1;
1879 /* for multiline we only have to try after newlines */
1880 if (prog->check_substr || prog->check_utf8) {
1884 if (regtry(®info, &s))
1889 if (prog->extflags & RXf_USE_INTUIT) {
1890 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
1901 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1902 if (regtry(®info, &s))
1909 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1911 /* the warning about reginfo.ganch being used without intialization
1912 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1913 and we only enter this block when the same bit is set. */
1914 char *tmp_s = reginfo.ganch - prog->gofs;
1915 if (regtry(®info, &tmp_s))
1920 /* Messy cases: unanchored match. */
1921 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1922 /* we have /x+whatever/ */
1923 /* it must be a one character string (XXXX Except UTF?) */
1928 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1929 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1930 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1935 DEBUG_EXECUTE_r( did_match = 1 );
1936 if (regtry(®info, &s)) goto got_it;
1938 while (s < strend && *s == ch)
1946 DEBUG_EXECUTE_r( did_match = 1 );
1947 if (regtry(®info, &s)) goto got_it;
1949 while (s < strend && *s == ch)
1954 DEBUG_EXECUTE_r(if (!did_match)
1955 PerlIO_printf(Perl_debug_log,
1956 "Did not find anchored character...\n")
1959 else if (prog->anchored_substr != NULL
1960 || prog->anchored_utf8 != NULL
1961 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1962 && prog->float_max_offset < strend - s)) {
1967 char *last1; /* Last position checked before */
1971 if (prog->anchored_substr || prog->anchored_utf8) {
1972 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1973 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1974 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1975 back_max = back_min = prog->anchored_offset;
1977 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1978 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1979 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1980 back_max = prog->float_max_offset;
1981 back_min = prog->float_min_offset;
1985 if (must == &PL_sv_undef)
1986 /* could not downgrade utf8 check substring, so must fail */
1992 last = HOP3c(strend, /* Cannot start after this */
1993 -(I32)(CHR_SVLEN(must)
1994 - (SvTAIL(must) != 0) + back_min), strbeg);
1997 last1 = HOPc(s, -1);
1999 last1 = s - 1; /* bogus */
2001 /* XXXX check_substr already used to find "s", can optimize if
2002 check_substr==must. */
2004 dontbother = end_shift;
2005 strend = HOPc(strend, -dontbother);
2006 while ( (s <= last) &&
2007 ((flags & REXEC_SCREAM)
2008 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2009 end_shift, &scream_pos, 0))
2010 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2011 (unsigned char*)strend, must,
2012 multiline ? FBMrf_MULTILINE : 0))) ) {
2013 /* we may be pointing at the wrong string */
2014 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2015 s = strbeg + (s - SvPVX_const(sv));
2016 DEBUG_EXECUTE_r( did_match = 1 );
2017 if (HOPc(s, -back_max) > last1) {
2018 last1 = HOPc(s, -back_min);
2019 s = HOPc(s, -back_max);
2022 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2024 last1 = HOPc(s, -back_min);
2028 while (s <= last1) {
2029 if (regtry(®info, &s))
2035 while (s <= last1) {
2036 if (regtry(®info, &s))
2042 DEBUG_EXECUTE_r(if (!did_match) {
2043 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2044 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2045 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2046 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2047 ? "anchored" : "floating"),
2048 quoted, RE_SV_TAIL(must));
2052 else if ( (c = progi->regstclass) ) {
2054 const OPCODE op = OP(progi->regstclass);
2055 /* don't bother with what can't match */
2056 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2057 strend = HOPc(strend, -(minlen - 1));
2060 SV * const prop = sv_newmortal();
2061 regprop(prog, prop, c);
2063 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2065 PerlIO_printf(Perl_debug_log,
2066 "Matching stclass %.*s against %s (%d chars)\n",
2067 (int)SvCUR(prop), SvPVX_const(prop),
2068 quoted, (int)(strend - s));
2071 if (find_byclass(prog, c, s, strend, ®info))
2073 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2077 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2082 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2083 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2084 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2086 if (flags & REXEC_SCREAM) {
2087 last = screaminstr(sv, float_real, s - strbeg,
2088 end_shift, &scream_pos, 1); /* last one */
2090 last = scream_olds; /* Only one occurrence. */
2091 /* we may be pointing at the wrong string */
2092 else if (RXp_MATCH_COPIED(prog))
2093 s = strbeg + (s - SvPVX_const(sv));
2097 const char * const little = SvPV_const(float_real, len);
2099 if (SvTAIL(float_real)) {
2100 if (memEQ(strend - len + 1, little, len - 1))
2101 last = strend - len + 1;
2102 else if (!multiline)
2103 last = memEQ(strend - len, little, len)
2104 ? strend - len : NULL;
2110 last = rninstr(s, strend, little, little + len);
2112 last = strend; /* matching "$" */
2117 PerlIO_printf(Perl_debug_log,
2118 "%sCan't trim the tail, match fails (should not happen)%s\n",
2119 PL_colors[4], PL_colors[5]));
2120 goto phooey; /* Should not happen! */
2122 dontbother = strend - last + prog->float_min_offset;
2124 if (minlen && (dontbother < minlen))
2125 dontbother = minlen - 1;
2126 strend -= dontbother; /* this one's always in bytes! */
2127 /* We don't know much -- general case. */
2130 if (regtry(®info, &s))
2139 if (regtry(®info, &s))
2141 } while (s++ < strend);
2149 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2151 if (PL_reg_eval_set)
2152 restore_pos(aTHX_ prog);
2153 if (RXp_PAREN_NAMES(prog))
2154 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2156 /* make sure $`, $&, $', and $digit will work later */
2157 if ( !(flags & REXEC_NOT_FIRST) ) {
2158 RX_MATCH_COPY_FREE(rx);
2159 if (flags & REXEC_COPY_STR) {
2160 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2161 #ifdef PERL_OLD_COPY_ON_WRITE
2163 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2165 PerlIO_printf(Perl_debug_log,
2166 "Copy on write: regexp capture, type %d\n",
2169 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2170 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2171 assert (SvPOKp(prog->saved_copy));
2175 RX_MATCH_COPIED_on(rx);
2176 s = savepvn(strbeg, i);
2182 prog->subbeg = strbeg;
2183 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2190 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2191 PL_colors[4], PL_colors[5]));
2192 if (PL_reg_eval_set)
2193 restore_pos(aTHX_ prog);
2195 /* we failed :-( roll it back */
2196 swap_match_buff(prog);
2203 - regtry - try match at specific point
2205 STATIC I32 /* 0 failure, 1 success */
2206 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2210 REGEXP *const rx = reginfo->prog;
2211 regexp *const prog = (struct regexp *)SvANY(rx);
2212 RXi_GET_DECL(prog,progi);
2213 GET_RE_DEBUG_FLAGS_DECL;
2214 reginfo->cutpoint=NULL;
2216 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2219 PL_reg_eval_set = RS_init;
2220 DEBUG_EXECUTE_r(DEBUG_s(
2221 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2222 (IV)(PL_stack_sp - PL_stack_base));
2225 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2226 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2228 /* Apparently this is not needed, judging by wantarray. */
2229 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2230 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2233 /* Make $_ available to executed code. */
2234 if (reginfo->sv != DEFSV) {
2236 DEFSV = reginfo->sv;
2239 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2240 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2241 /* prepare for quick setting of pos */
2242 #ifdef PERL_OLD_COPY_ON_WRITE
2243 if (SvIsCOW(reginfo->sv))
2244 sv_force_normal_flags(reginfo->sv, 0);
2246 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2247 &PL_vtbl_mglob, NULL, 0);
2251 PL_reg_oldpos = mg->mg_len;
2252 SAVEDESTRUCTOR_X(restore_pos, prog);
2254 if (!PL_reg_curpm) {
2255 Newxz(PL_reg_curpm, 1, PMOP);
2258 SV* const repointer = &PL_sv_undef;
2259 /* this regexp is also owned by the new PL_reg_curpm, which
2260 will try to free it. */
2261 av_push(PL_regex_padav, repointer);
2262 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2263 PL_regex_pad = AvARRAY(PL_regex_padav);
2268 /* It seems that non-ithreads works both with and without this code.
2269 So for efficiency reasons it seems best not to have the code
2270 compiled when it is not needed. */
2271 /* This is safe against NULLs: */
2272 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2273 /* PM_reg_curpm owns a reference to this regexp. */
2276 PM_SETRE(PL_reg_curpm, rx);
2277 PL_reg_oldcurpm = PL_curpm;
2278 PL_curpm = PL_reg_curpm;
2279 if (RXp_MATCH_COPIED(prog)) {
2280 /* Here is a serious problem: we cannot rewrite subbeg,
2281 since it may be needed if this match fails. Thus
2282 $` inside (?{}) could fail... */
2283 PL_reg_oldsaved = prog->subbeg;
2284 PL_reg_oldsavedlen = prog->sublen;
2285 #ifdef PERL_OLD_COPY_ON_WRITE
2286 PL_nrs = prog->saved_copy;
2288 RXp_MATCH_COPIED_off(prog);
2291 PL_reg_oldsaved = NULL;
2292 prog->subbeg = PL_bostr;
2293 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2295 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2296 prog->offs[0].start = *startpos - PL_bostr;
2297 PL_reginput = *startpos;
2298 PL_reglastparen = &prog->lastparen;
2299 PL_reglastcloseparen = &prog->lastcloseparen;
2300 prog->lastparen = 0;
2301 prog->lastcloseparen = 0;
2303 PL_regoffs = prog->offs;
2304 if (PL_reg_start_tmpl <= prog->nparens) {
2305 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2306 if(PL_reg_start_tmp)
2307 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2309 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2312 /* XXXX What this code is doing here?!!! There should be no need
2313 to do this again and again, PL_reglastparen should take care of
2316 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2317 * Actually, the code in regcppop() (which Ilya may be meaning by
2318 * PL_reglastparen), is not needed at all by the test suite
2319 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2320 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2321 * Meanwhile, this code *is* needed for the
2322 * above-mentioned test suite tests to succeed. The common theme
2323 * on those tests seems to be returning null fields from matches.
2324 * --jhi updated by dapm */
2326 if (prog->nparens) {
2327 regexp_paren_pair *pp = PL_regoffs;
2329 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2337 if (regmatch(reginfo, progi->program + 1)) {
2338 PL_regoffs[0].end = PL_reginput - PL_bostr;
2341 if (reginfo->cutpoint)
2342 *startpos= reginfo->cutpoint;
2343 REGCP_UNWIND(lastcp);
2348 #define sayYES goto yes
2349 #define sayNO goto no
2350 #define sayNO_SILENT goto no_silent
2352 /* we dont use STMT_START/END here because it leads to
2353 "unreachable code" warnings, which are bogus, but distracting. */
2354 #define CACHEsayNO \
2355 if (ST.cache_mask) \
2356 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2359 /* this is used to determine how far from the left messages like
2360 'failed...' are printed. It should be set such that messages
2361 are inline with the regop output that created them.
2363 #define REPORT_CODE_OFF 32
2366 /* Make sure there is a test for this +1 options in re_tests */
2367 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2369 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2370 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2372 #define SLAB_FIRST(s) (&(s)->states[0])
2373 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2375 /* grab a new slab and return the first slot in it */
2377 STATIC regmatch_state *
2380 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2383 regmatch_slab *s = PL_regmatch_slab->next;
2385 Newx(s, 1, regmatch_slab);
2386 s->prev = PL_regmatch_slab;
2388 PL_regmatch_slab->next = s;
2390 PL_regmatch_slab = s;
2391 return SLAB_FIRST(s);
2395 /* push a new state then goto it */
2397 #define PUSH_STATE_GOTO(state, node) \
2399 st->resume_state = state; \
2402 /* push a new state with success backtracking, then goto it */
2404 #define PUSH_YES_STATE_GOTO(state, node) \
2406 st->resume_state = state; \
2407 goto push_yes_state;
2413 regmatch() - main matching routine
2415 This is basically one big switch statement in a loop. We execute an op,
2416 set 'next' to point the next op, and continue. If we come to a point which
2417 we may need to backtrack to on failure such as (A|B|C), we push a
2418 backtrack state onto the backtrack stack. On failure, we pop the top
2419 state, and re-enter the loop at the state indicated. If there are no more
2420 states to pop, we return failure.
2422 Sometimes we also need to backtrack on success; for example /A+/, where
2423 after successfully matching one A, we need to go back and try to
2424 match another one; similarly for lookahead assertions: if the assertion
2425 completes successfully, we backtrack to the state just before the assertion
2426 and then carry on. In these cases, the pushed state is marked as
2427 'backtrack on success too'. This marking is in fact done by a chain of
2428 pointers, each pointing to the previous 'yes' state. On success, we pop to
2429 the nearest yes state, discarding any intermediate failure-only states.
2430 Sometimes a yes state is pushed just to force some cleanup code to be
2431 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2432 it to free the inner regex.
2434 Note that failure backtracking rewinds the cursor position, while
2435 success backtracking leaves it alone.
2437 A pattern is complete when the END op is executed, while a subpattern
2438 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2439 ops trigger the "pop to last yes state if any, otherwise return true"
2442 A common convention in this function is to use A and B to refer to the two
2443 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2444 the subpattern to be matched possibly multiple times, while B is the entire
2445 rest of the pattern. Variable and state names reflect this convention.
2447 The states in the main switch are the union of ops and failure/success of
2448 substates associated with with that op. For example, IFMATCH is the op
2449 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2450 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2451 successfully matched A and IFMATCH_A_fail is a state saying that we have
2452 just failed to match A. Resume states always come in pairs. The backtrack
2453 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2454 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2455 on success or failure.
2457 The struct that holds a backtracking state is actually a big union, with
2458 one variant for each major type of op. The variable st points to the
2459 top-most backtrack struct. To make the code clearer, within each
2460 block of code we #define ST to alias the relevant union.
2462 Here's a concrete example of a (vastly oversimplified) IFMATCH
2468 #define ST st->u.ifmatch
2470 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2471 ST.foo = ...; // some state we wish to save
2473 // push a yes backtrack state with a resume value of
2474 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2476 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2479 case IFMATCH_A: // we have successfully executed A; now continue with B
2481 bar = ST.foo; // do something with the preserved value
2484 case IFMATCH_A_fail: // A failed, so the assertion failed
2485 ...; // do some housekeeping, then ...
2486 sayNO; // propagate the failure
2493 For any old-timers reading this who are familiar with the old recursive
2494 approach, the code above is equivalent to:
2496 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2505 ...; // do some housekeeping, then ...
2506 sayNO; // propagate the failure
2509 The topmost backtrack state, pointed to by st, is usually free. If you
2510 want to claim it, populate any ST.foo fields in it with values you wish to
2511 save, then do one of
2513 PUSH_STATE_GOTO(resume_state, node);
2514 PUSH_YES_STATE_GOTO(resume_state, node);
2516 which sets that backtrack state's resume value to 'resume_state', pushes a
2517 new free entry to the top of the backtrack stack, then goes to 'node'.
2518 On backtracking, the free slot is popped, and the saved state becomes the
2519 new free state. An ST.foo field in this new top state can be temporarily
2520 accessed to retrieve values, but once the main loop is re-entered, it
2521 becomes available for reuse.
2523 Note that the depth of the backtrack stack constantly increases during the
2524 left-to-right execution of the pattern, rather than going up and down with
2525 the pattern nesting. For example the stack is at its maximum at Z at the
2526 end of the pattern, rather than at X in the following:
2528 /(((X)+)+)+....(Y)+....Z/
2530 The only exceptions to this are lookahead/behind assertions and the cut,
2531 (?>A), which pop all the backtrack states associated with A before
2534 Bascktrack state structs are allocated in slabs of about 4K in size.
2535 PL_regmatch_state and st always point to the currently active state,
2536 and PL_regmatch_slab points to the slab currently containing
2537 PL_regmatch_state. The first time regmatch() is called, the first slab is
2538 allocated, and is never freed until interpreter destruction. When the slab
2539 is full, a new one is allocated and chained to the end. At exit from
2540 regmatch(), slabs allocated since entry are freed.
2545 #define DEBUG_STATE_pp(pp) \
2547 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2548 PerlIO_printf(Perl_debug_log, \
2549 " %*s"pp" %s%s%s%s%s\n", \
2551 PL_reg_name[st->resume_state], \
2552 ((st==yes_state||st==mark_state) ? "[" : ""), \
2553 ((st==yes_state) ? "Y" : ""), \
2554 ((st==mark_state) ? "M" : ""), \
2555 ((st==yes_state||st==mark_state) ? "]" : "") \
2560 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2565 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2566 const char *start, const char *end, const char *blurb)
2568 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2572 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2573 RX_PRECOMP(prog), RX_PRELEN(prog), 60);
2575 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2576 start, end - start, 60);
2578 PerlIO_printf(Perl_debug_log,
2579 "%s%s REx%s %s against %s\n",
2580 PL_colors[4], blurb, PL_colors[5], s0, s1);
2582 if (do_utf8||utf8_pat)
2583 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2584 utf8_pat ? "pattern" : "",
2585 utf8_pat && do_utf8 ? " and " : "",
2586 do_utf8 ? "string" : ""
2592 S_dump_exec_pos(pTHX_ const char *locinput,
2593 const regnode *scan,
2594 const char *loc_regeol,
2595 const char *loc_bostr,
2596 const char *loc_reg_starttry,
2599 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2600 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2601 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2602 /* The part of the string before starttry has one color
2603 (pref0_len chars), between starttry and current
2604 position another one (pref_len - pref0_len chars),
2605 after the current position the third one.
2606 We assume that pref0_len <= pref_len, otherwise we
2607 decrease pref0_len. */
2608 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2609 ? (5 + taill) - l : locinput - loc_bostr;
2612 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2614 pref0_len = pref_len - (locinput - loc_reg_starttry);
2615 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2616 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2617 ? (5 + taill) - pref_len : loc_regeol - locinput);
2618 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2622 if (pref0_len > pref_len)
2623 pref0_len = pref_len;
2625 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2627 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2628 (locinput - pref_len),pref0_len, 60, 4, 5);
2630 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2631 (locinput - pref_len + pref0_len),
2632 pref_len - pref0_len, 60, 2, 3);
2634 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2635 locinput, loc_regeol - locinput, 10, 0, 1);
2637 const STRLEN tlen=len0+len1+len2;
2638 PerlIO_printf(Perl_debug_log,
2639 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2640 (IV)(locinput - loc_bostr),
2643 (docolor ? "" : "> <"),
2645 (int)(tlen > 19 ? 0 : 19 - tlen),
2652 /* reg_check_named_buff_matched()
2653 * Checks to see if a named buffer has matched. The data array of
2654 * buffer numbers corresponding to the buffer is expected to reside
2655 * in the regexp->data->data array in the slot stored in the ARG() of
2656 * node involved. Note that this routine doesn't actually care about the
2657 * name, that information is not preserved from compilation to execution.
2658 * Returns the index of the leftmost defined buffer with the given name
2659 * or 0 if non of the buffers matched.
2662 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2664 RXi_GET_DECL(rex,rexi);
2665 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2666 I32 *nums=(I32*)SvPVX(sv_dat);
2667 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2668 if ((I32)*PL_reglastparen >= nums[n] &&
2669 PL_regoffs[nums[n]].end != -1)
2678 /* free all slabs above current one - called during LEAVE_SCOPE */
2681 S_clear_backtrack_stack(pTHX_ void *p)
2683 regmatch_slab *s = PL_regmatch_slab->next;
2688 PL_regmatch_slab->next = NULL;
2690 regmatch_slab * const osl = s;
2697 #define SETREX(Re1,Re2) \
2698 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2701 STATIC I32 /* 0 failure, 1 success */
2702 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2704 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2708 register const bool do_utf8 = PL_reg_match_utf8;
2709 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2711 REGEXP *rex_sv = reginfo->prog;
2712 regexp *rex = (struct regexp *)SvANY(rex_sv);
2713 RXi_GET_DECL(rex,rexi);
2717 /* the current state. This is a cached copy of PL_regmatch_state */
2718 register regmatch_state *st;
2720 /* cache heavy used fields of st in registers */
2721 register regnode *scan;
2722 register regnode *next;
2723 register U32 n = 0; /* general value; init to avoid compiler warning */
2724 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2725 register char *locinput = PL_reginput;
2726 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2728 bool result = 0; /* return value of S_regmatch */
2729 int depth = 0; /* depth of backtrack stack */
2730 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2731 const U32 max_nochange_depth =
2732 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2733 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2735 regmatch_state *yes_state = NULL; /* state to pop to on success of
2737 /* mark_state piggy backs on the yes_state logic so that when we unwind
2738 the stack on success we can update the mark_state as we go */
2739 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2741 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2742 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2744 bool no_final = 0; /* prevent failure from backtracking? */
2745 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2746 char *startpoint = PL_reginput;
2747 SV *popmark = NULL; /* are we looking for a mark? */
2748 SV *sv_commit = NULL; /* last mark name seen in failure */
2749 SV *sv_yes_mark = NULL; /* last mark name we have seen
2750 during a successfull match */
2751 U32 lastopen = 0; /* last open we saw */
2752 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2754 SV* const oreplsv = GvSV(PL_replgv);
2757 /* these three flags are set by various ops to signal information to
2758 * the very next op. They have a useful lifetime of exactly one loop
2759 * iteration, and are not preserved or restored by state pushes/pops
2761 bool sw = 0; /* the condition value in (?(cond)a|b) */
2762 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2763 int logical = 0; /* the following EVAL is:
2767 or the following IFMATCH/UNLESSM is:
2768 false: plain (?=foo)
2769 true: used as a condition: (?(?=foo))
2773 GET_RE_DEBUG_FLAGS_DECL;
2776 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2777 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2779 /* on first ever call to regmatch, allocate first slab */
2780 if (!PL_regmatch_slab) {
2781 Newx(PL_regmatch_slab, 1, regmatch_slab);
2782 PL_regmatch_slab->prev = NULL;
2783 PL_regmatch_slab->next = NULL;
2784 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2787 oldsave = PL_savestack_ix;
2788 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2789 SAVEVPTR(PL_regmatch_slab);
2790 SAVEVPTR(PL_regmatch_state);
2792 /* grab next free state slot */
2793 st = ++PL_regmatch_state;
2794 if (st > SLAB_LAST(PL_regmatch_slab))
2795 st = PL_regmatch_state = S_push_slab(aTHX);
2797 /* Note that nextchr is a byte even in UTF */
2798 nextchr = UCHARAT(locinput);
2800 while (scan != NULL) {
2803 SV * const prop = sv_newmortal();
2804 regnode *rnext=regnext(scan);
2805 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2806 regprop(rex, prop, scan);
2808 PerlIO_printf(Perl_debug_log,
2809 "%3"IVdf":%*s%s(%"IVdf")\n",
2810 (IV)(scan - rexi->program), depth*2, "",
2812 (PL_regkind[OP(scan)] == END || !rnext) ?
2813 0 : (IV)(rnext - rexi->program));
2816 next = scan + NEXT_OFF(scan);
2819 state_num = OP(scan);
2822 switch (state_num) {
2824 if (locinput == PL_bostr)
2826 /* reginfo->till = reginfo->bol; */
2831 if (locinput == PL_bostr ||
2832 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2838 if (locinput == PL_bostr)
2842 if (locinput == reginfo->ganch)
2847 /* update the startpoint */
2848 st->u.keeper.val = PL_regoffs[0].start;
2849 PL_reginput = locinput;
2850 PL_regoffs[0].start = locinput - PL_bostr;
2851 PUSH_STATE_GOTO(KEEPS_next, next);
2853 case KEEPS_next_fail:
2854 /* rollback the start point change */
2855 PL_regoffs[0].start = st->u.keeper.val;
2861 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2866 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2868 if (PL_regeol - locinput > 1)
2872 if (PL_regeol != locinput)
2876 if (!nextchr && locinput >= PL_regeol)
2879 locinput += PL_utf8skip[nextchr];
2880 if (locinput > PL_regeol)
2882 nextchr = UCHARAT(locinput);
2885 nextchr = UCHARAT(++locinput);
2888 if (!nextchr && locinput >= PL_regeol)
2890 nextchr = UCHARAT(++locinput);
2893 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2896 locinput += PL_utf8skip[nextchr];
2897 if (locinput > PL_regeol)
2899 nextchr = UCHARAT(locinput);
2902 nextchr = UCHARAT(++locinput);
2906 #define ST st->u.trie
2908 /* In this case the charclass data is available inline so
2909 we can fail fast without a lot of extra overhead.
2911 if (scan->flags == EXACT || !do_utf8) {
2912 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2914 PerlIO_printf(Perl_debug_log,
2915 "%*s %sfailed to match trie start class...%s\n",
2916 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2925 /* what type of TRIE am I? (utf8 makes this contextual) */
2926 DECL_TRIE_TYPE(scan);
2928 /* what trie are we using right now */
2929 reg_trie_data * const trie
2930 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2931 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2932 U32 state = trie->startstate;
2934 if (trie->bitmap && trie_type != trie_utf8_fold &&
2935 !TRIE_BITMAP_TEST(trie,*locinput)
2937 if (trie->states[ state ].wordnum) {
2939 PerlIO_printf(Perl_debug_log,
2940 "%*s %smatched empty string...%s\n",
2941 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2946 PerlIO_printf(Perl_debug_log,
2947 "%*s %sfailed to match trie start class...%s\n",
2948 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2955 U8 *uc = ( U8* )locinput;
2959 U8 *uscan = (U8*)NULL;
2961 SV *sv_accept_buff = NULL;
2962 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2964 ST.accepted = 0; /* how many accepting states we have seen */
2966 ST.jump = trie->jump;
2969 traverse the TRIE keeping track of all accepting states
2970 we transition through until we get to a failing node.
2973 while ( state && uc <= (U8*)PL_regeol ) {
2974 U32 base = trie->states[ state ].trans.base;
2977 /* We use charid to hold the wordnum as we don't use it
2978 for charid until after we have done the wordnum logic.
2979 We define an alias just so that the wordnum logic reads
2982 #define got_wordnum charid
2983 got_wordnum = trie->states[ state ].wordnum;
2985 if ( got_wordnum ) {
2986 if ( ! ST.accepted ) {
2988 /* SAVETMPS; */ /* XXX is this necessary? dmq */
2989 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2990 sv_accept_buff=newSV(bufflen *
2991 sizeof(reg_trie_accepted) - 1);
2992 SvCUR_set(sv_accept_buff, 0);
2993 SvPOK_on(sv_accept_buff);
2994 sv_2mortal(sv_accept_buff);
2997 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3000 if (ST.accepted >= bufflen) {
3002 ST.accept_buff =(reg_trie_accepted*)
3003 SvGROW(sv_accept_buff,
3004 bufflen * sizeof(reg_trie_accepted));
3006 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3007 + sizeof(reg_trie_accepted));
3010 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3011 ST.accept_buff[ST.accepted].endpos = uc;
3013 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3017 DEBUG_TRIE_EXECUTE_r({
3018 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3019 PerlIO_printf( Perl_debug_log,
3020 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
3021 2+depth * 2, "", PL_colors[4],
3022 (UV)state, (UV)ST.accepted );
3026 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3027 uscan, len, uvc, charid, foldlen,
3031 (base + charid > trie->uniquecharcount )
3032 && (base + charid - 1 - trie->uniquecharcount
3034 && trie->trans[base + charid - 1 -
3035 trie->uniquecharcount].check == state)
3037 state = trie->trans[base + charid - 1 -
3038 trie->uniquecharcount ].next;
3049 DEBUG_TRIE_EXECUTE_r(
3050 PerlIO_printf( Perl_debug_log,
3051 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3052 charid, uvc, (UV)state, PL_colors[5] );
3059 PerlIO_printf( Perl_debug_log,
3060 "%*s %sgot %"IVdf" possible matches%s\n",
3061 REPORT_CODE_OFF + depth * 2, "",
3062 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3065 goto trie_first_try; /* jump into the fail handler */
3067 case TRIE_next_fail: /* we failed - try next alterative */
3069 REGCP_UNWIND(ST.cp);
3070 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3071 PL_regoffs[n].end = -1;
3072 *PL_reglastparen = n;
3081 ST.lastparen = *PL_reglastparen;
3084 if ( ST.accepted == 1 ) {
3085 /* only one choice left - just continue */
3087 AV *const trie_words
3088 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3089 SV ** const tmp = av_fetch( trie_words,
3090 ST.accept_buff[ 0 ].wordnum-1, 0 );
3091 SV *sv= tmp ? sv_newmortal() : NULL;
3093 PerlIO_printf( Perl_debug_log,
3094 "%*s %sonly one match left: #%d <%s>%s\n",
3095 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3096 ST.accept_buff[ 0 ].wordnum,
3097 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3098 PL_colors[0], PL_colors[1],
3099 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3101 : "not compiled under -Dr",
3104 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3105 /* in this case we free tmps/leave before we call regmatch
3106 as we wont be using accept_buff again. */
3108 locinput = PL_reginput;
3109 nextchr = UCHARAT(locinput);
3110 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3113 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3114 if (!has_cutgroup) {
3119 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3122 continue; /* execute rest of RE */
3125 if ( !ST.accepted-- ) {
3127 PerlIO_printf( Perl_debug_log,
3128 "%*s %sTRIE failed...%s\n",
3129 REPORT_CODE_OFF+depth*2, "",
3140 There are at least two accepting states left. Presumably
3141 the number of accepting states is going to be low,
3142 typically two. So we simply scan through to find the one
3143 with lowest wordnum. Once we find it, we swap the last
3144 state into its place and decrement the size. We then try to
3145 match the rest of the pattern at the point where the word
3146 ends. If we succeed, control just continues along the
3147 regex; if we fail we return here to try the next accepting
3154 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3155 DEBUG_TRIE_EXECUTE_r(
3156 PerlIO_printf( Perl_debug_log,
3157 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3158 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3159 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3160 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3163 if (ST.accept_buff[cur].wordnum <
3164 ST.accept_buff[best].wordnum)
3169 AV *const trie_words
3170 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3171 SV ** const tmp = av_fetch( trie_words,
3172 ST.accept_buff[ best ].wordnum - 1, 0 );
3173 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3175 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3176 SV *sv= tmp ? sv_newmortal() : NULL;
3178 PerlIO_printf( Perl_debug_log,
3179 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3180 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3181 ST.accept_buff[best].wordnum,
3182 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3183 PL_colors[0], PL_colors[1],
3184 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3185 ) : "not compiled under -Dr",
3186 REG_NODE_NUM(nextop),
3190 if ( best<ST.accepted ) {
3191 reg_trie_accepted tmp = ST.accept_buff[ best ];
3192 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3193 ST.accept_buff[ ST.accepted ] = tmp;
3196 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3197 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3200 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3202 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3213 char *s = STRING(scan);
3215 if (do_utf8 != UTF) {
3216 /* The target and the pattern have differing utf8ness. */
3218 const char * const e = s + ln;
3221 /* The target is utf8, the pattern is not utf8. */
3226 if (NATIVE_TO_UNI(*(U8*)s) !=
3227 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3235 /* The target is not utf8, the pattern is utf8. */
3240 if (NATIVE_TO_UNI(*((U8*)l)) !=
3241 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3249 nextchr = UCHARAT(locinput);
3252 /* The target and the pattern have the same utf8ness. */
3253 /* Inline the first character, for speed. */
3254 if (UCHARAT(s) != nextchr)
3256 if (PL_regeol - locinput < ln)
3258 if (ln > 1 && memNE(s, locinput, ln))
3261 nextchr = UCHARAT(locinput);
3265 PL_reg_flags |= RF_tainted;
3268 char * const s = STRING(scan);
3271 if (do_utf8 || UTF) {
3272 /* Either target or the pattern are utf8. */
3273 const char * const l = locinput;
3274 char *e = PL_regeol;
3276 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3277 l, &e, 0, do_utf8)) {
3278 /* One more case for the sharp s:
3279 * pack("U0U*", 0xDF) =~ /ss/i,
3280 * the 0xC3 0x9F are the UTF-8
3281 * byte sequence for the U+00DF. */
3284 toLOWER(s[0]) == 's' &&
3286 toLOWER(s[1]) == 's' &&
3293 nextchr = UCHARAT(locinput);
3297 /* Neither the target and the pattern are utf8. */
3299 /* Inline the first character, for speed. */
3300 if (UCHARAT(s) != nextchr &&
3301 UCHARAT(s) != ((OP(scan) == EXACTF)
3302 ? PL_fold : PL_fold_locale)[nextchr])
3304 if (PL_regeol - locinput < ln)
3306 if (ln > 1 && (OP(scan) == EXACTF
3307 ? ibcmp(s, locinput, ln)
3308 : ibcmp_locale(s, locinput, ln)))
3311 nextchr = UCHARAT(locinput);
3316 STRLEN inclasslen = PL_regeol - locinput;
3318 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3320 if (locinput >= PL_regeol)
3322 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3323 nextchr = UCHARAT(locinput);
3328 nextchr = UCHARAT(locinput);
3329 if (!REGINCLASS(rex, scan, (U8*)locinput))
3331 if (!nextchr && locinput >= PL_regeol)
3333 nextchr = UCHARAT(++locinput);
3337 /* If we might have the case of the German sharp s
3338 * in a casefolding Unicode character class. */
3340 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3341 locinput += SHARP_S_SKIP;
3342 nextchr = UCHARAT(locinput);
3348 PL_reg_flags |= RF_tainted;
3354 LOAD_UTF8_CHARCLASS_ALNUM();
3355 if (!(OP(scan) == ALNUM
3356 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3357 : isALNUM_LC_utf8((U8*)locinput)))
3361 locinput += PL_utf8skip[nextchr];
3362 nextchr = UCHARAT(locinput);
3365 if (!(OP(scan) == ALNUM
3366 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3368 nextchr = UCHARAT(++locinput);
3371 PL_reg_flags |= RF_tainted;
3374 if (!nextchr && locinput >= PL_regeol)
3377 LOAD_UTF8_CHARCLASS_ALNUM();
3378 if (OP(scan) == NALNUM
3379 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3380 : isALNUM_LC_utf8((U8*)locinput))
3384 locinput += PL_utf8skip[nextchr];
3385 nextchr = UCHARAT(locinput);
3388 if (OP(scan) == NALNUM
3389 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3391 nextchr = UCHARAT(++locinput);
3395 PL_reg_flags |= RF_tainted;
3399 /* was last char in word? */
3401 if (locinput == PL_bostr)
3404 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3406 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3408 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3409 ln = isALNUM_uni(ln);
3410 LOAD_UTF8_CHARCLASS_ALNUM();
3411 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3414 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3415 n = isALNUM_LC_utf8((U8*)locinput);
3419 ln = (locinput != PL_bostr) ?
3420 UCHARAT(locinput - 1) : '\n';
3421 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3423 n = isALNUM(nextchr);
3426 ln = isALNUM_LC(ln);
3427 n = isALNUM_LC(nextchr);
3430 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3431 OP(scan) == BOUNDL))
3435 PL_reg_flags |= RF_tainted;
3441 if (UTF8_IS_CONTINUED(nextchr)) {
3442 LOAD_UTF8_CHARCLASS_SPACE();
3443 if (!(OP(scan) == SPACE
3444 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3445 : isSPACE_LC_utf8((U8*)locinput)))
3449 locinput += PL_utf8skip[nextchr];
3450 nextchr = UCHARAT(locinput);
3453 if (!(OP(scan) == SPACE
3454 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3456 nextchr = UCHARAT(++locinput);
3459 if (!(OP(scan) == SPACE
3460 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3462 nextchr = UCHARAT(++locinput);
3466 PL_reg_flags |= RF_tainted;
3469 if (!nextchr && locinput >= PL_regeol)
3472 LOAD_UTF8_CHARCLASS_SPACE();
3473 if (OP(scan) == NSPACE
3474 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3475 : isSPACE_LC_utf8((U8*)locinput))
3479 locinput += PL_utf8skip[nextchr];
3480 nextchr = UCHARAT(locinput);
3483 if (OP(scan) == NSPACE
3484 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3486 nextchr = UCHARAT(++locinput);
3489 PL_reg_flags |= RF_tainted;
3495 LOAD_UTF8_CHARCLASS_DIGIT();
3496 if (!(OP(scan) == DIGIT
3497 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3498 : isDIGIT_LC_utf8((U8*)locinput)))
3502 locinput += PL_utf8skip[nextchr];
3503 nextchr = UCHARAT(locinput);
3506 if (!(OP(scan) == DIGIT
3507 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3509 nextchr = UCHARAT(++locinput);
3512 PL_reg_flags |= RF_tainted;
3515 if (!nextchr && locinput >= PL_regeol)
3518 LOAD_UTF8_CHARCLASS_DIGIT();
3519 if (OP(scan) == NDIGIT
3520 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3521 : isDIGIT_LC_utf8((U8*)locinput))
3525 locinput += PL_utf8skip[nextchr];
3526 nextchr = UCHARAT(locinput);
3529 if (OP(scan) == NDIGIT
3530 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3532 nextchr = UCHARAT(++locinput);
3535 if (locinput >= PL_regeol)
3538 LOAD_UTF8_CHARCLASS_MARK();
3539 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3541 locinput += PL_utf8skip[nextchr];
3542 while (locinput < PL_regeol &&
3543 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3544 locinput += UTF8SKIP(locinput);
3545 if (locinput > PL_regeol)
3550 nextchr = UCHARAT(locinput);
3557 PL_reg_flags |= RF_tainted;
3562 n = reg_check_named_buff_matched(rex,scan);
3565 type = REF + ( type - NREF );
3572 PL_reg_flags |= RF_tainted;
3576 n = ARG(scan); /* which paren pair */
3579 ln = PL_regoffs[n].start;
3580 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3581 if (*PL_reglastparen < n || ln == -1)
3582 sayNO; /* Do not match unless seen CLOSEn. */
3583 if (ln == PL_regoffs[n].end)
3587 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3589 const char *e = PL_bostr + PL_regoffs[n].end;
3591 * Note that we can't do the "other character" lookup trick as
3592 * in the 8-bit case (no pun intended) because in Unicode we
3593 * have to map both upper and title case to lower case.
3597 STRLEN ulen1, ulen2;
3598 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3599 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3603 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3604 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3605 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3612 nextchr = UCHARAT(locinput);
3616 /* Inline the first character, for speed. */
3617 if (UCHARAT(s) != nextchr &&
3619 (UCHARAT(s) != (type == REFF
3620 ? PL_fold : PL_fold_locale)[nextchr])))
3622 ln = PL_regoffs[n].end - ln;
3623 if (locinput + ln > PL_regeol)
3625 if (ln > 1 && (type == REF
3626 ? memNE(s, locinput, ln)
3628 ? ibcmp(s, locinput, ln)
3629 : ibcmp_locale(s, locinput, ln))))
3632 nextchr = UCHARAT(locinput);
3642 #define ST st->u.eval
3647 regexp_internal *rei;
3648 regnode *startpoint;
3651 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3652 if (cur_eval && cur_eval->locinput==locinput) {
3653 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3654 Perl_croak(aTHX_ "Infinite recursion in regex");
3655 if ( ++nochange_depth > max_nochange_depth )
3657 "Pattern subroutine nesting without pos change"
3658 " exceeded limit in regex");
3665 (void)ReREFCNT_inc(rex_sv);
3666 if (OP(scan)==GOSUB) {
3667 startpoint = scan + ARG2L(scan);
3668 ST.close_paren = ARG(scan);
3670 startpoint = rei->program+1;
3673 goto eval_recurse_doit;
3675 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3676 if (cur_eval && cur_eval->locinput==locinput) {
3677 if ( ++nochange_depth > max_nochange_depth )
3678 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3683 /* execute the code in the {...} */
3685 SV ** const before = SP;
3686 OP_4tree * const oop = PL_op;
3687 COP * const ocurcop = PL_curcop;
3691 PL_op = (OP_4tree*)rexi->data->data[n];
3692 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3693 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3694 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3695 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3698 SV *sv_mrk = get_sv("REGMARK", 1);
3699 sv_setsv(sv_mrk, sv_yes_mark);
3702 CALLRUNOPS(aTHX); /* Scalar context. */
3705 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3712 PAD_RESTORE_LOCAL(old_comppad);
3713 PL_curcop = ocurcop;
3716 sv_setsv(save_scalar(PL_replgv), ret);
3720 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3723 /* extract RE object from returned value; compiling if
3729 SV *const sv = SvRV(ret);
3731 if (SvTYPE(sv) == SVt_REGEXP) {
3733 } else if (SvSMAGICAL(sv)) {
3734 mg = mg_find(sv, PERL_MAGIC_qr);
3737 } else if (SvTYPE(ret) == SVt_REGEXP) {
3739 } else if (SvSMAGICAL(ret)) {
3740 if (SvGMAGICAL(ret)) {
3741 /* I don't believe that there is ever qr magic
3743 assert(!mg_find(ret, PERL_MAGIC_qr));
3744 sv_unmagic(ret, PERL_MAGIC_qr);
3747 mg = mg_find(ret, PERL_MAGIC_qr);
3748 /* testing suggests mg only ends up non-NULL for
3749 scalars who were upgraded and compiled in the
3750 else block below. In turn, this is only
3751 triggered in the "postponed utf8 string" tests
3757 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3761 rx = reg_temp_copy(rx);
3765 const I32 osize = PL_regsize;
3768 assert (SvUTF8(ret));
3769 } else if (SvUTF8(ret)) {
3770 /* Not doing UTF-8, despite what the SV says. Is
3771 this only if we're trapped in use 'bytes'? */
3772 /* Make a copy of the octet sequence, but without
3773 the flag on, as the compiler now honours the
3774 SvUTF8 flag on ret. */
3776 const char *const p = SvPV(ret, len);
3777 ret = newSVpvn_flags(p, len, SVs_TEMP);
3779 rx = CALLREGCOMP(ret, pm_flags);
3781 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3783 /* This isn't a first class regexp. Instead, it's
3784 caching a regexp onto an existing, Perl visible
3786 sv_magic(ret, (SV*) rx, PERL_MAGIC_qr, 0, 0);
3791 re = (struct regexp *)SvANY(rx);
3793 RXp_MATCH_COPIED_off(re);
3794 re->subbeg = rex->subbeg;
3795 re->sublen = rex->sublen;
3798 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
3799 "Matching embedded");
3801 startpoint = rei->program + 1;
3802 ST.close_paren = 0; /* only used for GOSUB */
3803 /* borrowed from regtry */
3804 if (PL_reg_start_tmpl <= re->nparens) {
3805 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3806 if(PL_reg_start_tmp)
3807 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3809 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3812 eval_recurse_doit: /* Share code with GOSUB below this line */
3813 /* run the pattern returned from (??{...}) */
3814 ST.cp = regcppush(0); /* Save *all* the positions. */
3815 REGCP_SET(ST.lastcp);
3817 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
3819 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3820 PL_reglastparen = &re->lastparen;
3821 PL_reglastcloseparen = &re->lastcloseparen;
3823 re->lastcloseparen = 0;
3825 PL_reginput = locinput;
3828 /* XXXX This is too dramatic a measure... */
3831 ST.toggle_reg_flags = PL_reg_flags;
3833 PL_reg_flags |= RF_utf8;
3835 PL_reg_flags &= ~RF_utf8;
3836 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3838 ST.prev_rex = rex_sv;
3839 ST.prev_curlyx = cur_curlyx;
3840 SETREX(rex_sv,re_sv);
3845 ST.prev_eval = cur_eval;
3847 /* now continue from first node in postoned RE */
3848 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3851 /* logical is 1, /(?(?{...})X|Y)/ */
3852 sw = (bool)SvTRUE(ret);
3857 case EVAL_AB: /* cleanup after a successful (??{A})B */
3858 /* note: this is called twice; first after popping B, then A */
3859 PL_reg_flags ^= ST.toggle_reg_flags;
3860 ReREFCNT_dec(rex_sv);
3861 SETREX(rex_sv,ST.prev_rex);
3862 rex = (struct regexp *)SvANY(rex_sv);
3863 rexi = RXi_GET(rex);
3865 cur_eval = ST.prev_eval;
3866 cur_curlyx = ST.prev_curlyx;
3868 PL_reglastparen = &rex->lastparen;
3869 PL_reglastcloseparen = &rex->lastcloseparen;
3871 /* XXXX This is too dramatic a measure... */
3873 if ( nochange_depth )
3878 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3879 /* note: this is called twice; first after popping B, then A */
3880 PL_reg_flags ^= ST.toggle_reg_flags;
3881 ReREFCNT_dec(rex_sv);
3882 SETREX(rex_sv,ST.prev_rex);
3883 rex = (struct regexp *)SvANY(rex_sv);
3884 rexi = RXi_GET(rex);
3885 PL_reglastparen = &rex->lastparen;
3886 PL_reglastcloseparen = &rex->lastcloseparen;
3888 PL_reginput = locinput;
3889 REGCP_UNWIND(ST.lastcp);
3891 cur_eval = ST.prev_eval;
3892 cur_curlyx = ST.prev_curlyx;
3893 /* XXXX This is too dramatic a measure... */
3895 if ( nochange_depth )
3901 n = ARG(scan); /* which paren pair */
3902 PL_reg_start_tmp[n] = locinput;
3908 n = ARG(scan); /* which paren pair */
3909 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3910 PL_regoffs[n].end = locinput - PL_bostr;
3911 /*if (n > PL_regsize)
3913 if (n > *PL_reglastparen)
3914 *PL_reglastparen = n;
3915 *PL_reglastcloseparen = n;
3916 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3924 cursor && OP(cursor)!=END;
3925 cursor=regnext(cursor))
3927 if ( OP(cursor)==CLOSE ){
3929 if ( n <= lastopen ) {
3931 = PL_reg_start_tmp[n] - PL_bostr;
3932 PL_regoffs[n].end = locinput - PL_bostr;
3933 /*if (n > PL_regsize)
3935 if (n > *PL_reglastparen)
3936 *PL_reglastparen = n;
3937 *PL_reglastcloseparen = n;
3938 if ( n == ARG(scan) || (cur_eval &&
3939 cur_eval->u.eval.close_paren == n))
3948 n = ARG(scan); /* which paren pair */
3949 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
3952 /* reg_check_named_buff_matched returns 0 for no match */
3953 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3957 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3963 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3965 next = NEXTOPER(NEXTOPER(scan));
3967 next = scan + ARG(scan);
3968 if (OP(next) == IFTHEN) /* Fake one. */
3969 next = NEXTOPER(NEXTOPER(next));
3973 logical = scan->flags;
3976 /*******************************************************************
3978 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3979 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3980 STAR/PLUS/CURLY/CURLYN are used instead.)
3982 A*B is compiled as <CURLYX><A><WHILEM><B>
3984 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3985 state, which contains the current count, initialised to -1. It also sets
3986 cur_curlyx to point to this state, with any previous value saved in the
3989 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3990 since the pattern may possibly match zero times (i.e. it's a while {} loop
3991 rather than a do {} while loop).
3993 Each entry to WHILEM represents a successful match of A. The count in the
3994 CURLYX block is incremented, another WHILEM state is pushed, and execution
3995 passes to A or B depending on greediness and the current count.
3997 For example, if matching against the string a1a2a3b (where the aN are
3998 substrings that match /A/), then the match progresses as follows: (the
3999 pushed states are interspersed with the bits of strings matched so far):
4002 <CURLYX cnt=0><WHILEM>
4003 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4004 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4005 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4006 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4008 (Contrast this with something like CURLYM, which maintains only a single
4012 a1 <CURLYM cnt=1> a2
4013 a1 a2 <CURLYM cnt=2> a3
4014 a1 a2 a3 <CURLYM cnt=3> b
4017 Each WHILEM state block marks a point to backtrack to upon partial failure
4018 of A or B, and also contains some minor state data related to that
4019 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4020 overall state, such as the count, and pointers to the A and B ops.
4022 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4023 must always point to the *current* CURLYX block, the rules are:
4025 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4026 and set cur_curlyx to point the new block.
4028 When popping the CURLYX block after a successful or unsuccessful match,
4029 restore the previous cur_curlyx.
4031 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4032 to the outer one saved in the CURLYX block.
4034 When popping the WHILEM block after a successful or unsuccessful B match,
4035 restore the previous cur_curlyx.
4037 Here's an example for the pattern (AI* BI)*BO
4038 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4041 curlyx backtrack stack
4042 ------ ---------------
4044 CO <CO prev=NULL> <WO>
4045 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4046 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4047 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4049 At this point the pattern succeeds, and we work back down the stack to
4050 clean up, restoring as we go:
4052 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4053 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4054 CO <CO prev=NULL> <WO>
4057 *******************************************************************/
4059 #define ST st->u.curlyx
4061 case CURLYX: /* start of /A*B/ (for complex A) */
4063 /* No need to save/restore up to this paren */
4064 I32 parenfloor = scan->flags;
4066 assert(next); /* keep Coverity happy */
4067 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4070 /* XXXX Probably it is better to teach regpush to support
4071 parenfloor > PL_regsize... */
4072 if (parenfloor > (I32)*PL_reglastparen)
4073 parenfloor = *PL_reglastparen; /* Pessimization... */
4075 ST.prev_curlyx= cur_curlyx;
4077 ST.cp = PL_savestack_ix;
4079 /* these fields contain the state of the current curly.
4080 * they are accessed by subsequent WHILEMs */
4081 ST.parenfloor = parenfloor;
4082 ST.min = ARG1(scan);
4083 ST.max = ARG2(scan);
4084 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4088 ST.count = -1; /* this will be updated by WHILEM */
4089 ST.lastloc = NULL; /* this will be updated by WHILEM */
4091 PL_reginput = locinput;
4092 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4096 case CURLYX_end: /* just finished matching all of A*B */
4097 cur_curlyx = ST.prev_curlyx;
4101 case CURLYX_end_fail: /* just failed to match all of A*B */
4103 cur_curlyx = ST.prev_curlyx;
4109 #define ST st->u.whilem
4111 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4113 /* see the discussion above about CURLYX/WHILEM */
4115 assert(cur_curlyx); /* keep Coverity happy */
4116 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4117 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4118 ST.cache_offset = 0;
4121 PL_reginput = locinput;
4123 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4124 "%*s whilem: matched %ld out of %ld..%ld\n",
4125 REPORT_CODE_OFF+depth*2, "", (long)n,
4126 (long)cur_curlyx->u.curlyx.min,
4127 (long)cur_curlyx->u.curlyx.max)
4130 /* First just match a string of min A's. */
4132 if (n < cur_curlyx->u.curlyx.min) {
4133 cur_curlyx->u.curlyx.lastloc = locinput;
4134 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4138 /* If degenerate A matches "", assume A done. */
4140 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4141 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4142 "%*s whilem: empty match detected, trying continuation...\n",
4143 REPORT_CODE_OFF+depth*2, "")
4145 goto do_whilem_B_max;
4148 /* super-linear cache processing */
4152 if (!PL_reg_maxiter) {
4153 /* start the countdown: Postpone detection until we
4154 * know the match is not *that* much linear. */
4155 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4156 /* possible overflow for long strings and many CURLYX's */
4157 if (PL_reg_maxiter < 0)
4158 PL_reg_maxiter = I32_MAX;
4159 PL_reg_leftiter = PL_reg_maxiter;
4162 if (PL_reg_leftiter-- == 0) {
4163 /* initialise cache */
4164 const I32 size = (PL_reg_maxiter + 7)/8;
4165 if (PL_reg_poscache) {
4166 if ((I32)PL_reg_poscache_size < size) {
4167 Renew(PL_reg_poscache, size, char);
4168 PL_reg_poscache_size = size;
4170 Zero(PL_reg_poscache, size, char);
4173 PL_reg_poscache_size = size;
4174 Newxz(PL_reg_poscache, size, char);
4176 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4177 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4178 PL_colors[4], PL_colors[5])
4182 if (PL_reg_leftiter < 0) {
4183 /* have we already failed at this position? */
4185 offset = (scan->flags & 0xf) - 1
4186 + (locinput - PL_bostr) * (scan->flags>>4);
4187 mask = 1 << (offset % 8);
4189 if (PL_reg_poscache[offset] & mask) {
4190 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4191 "%*s whilem: (cache) already tried at this position...\n",
4192 REPORT_CODE_OFF+depth*2, "")
4194 sayNO; /* cache records failure */
4196 ST.cache_offset = offset;
4197 ST.cache_mask = mask;
4201 /* Prefer B over A for minimal matching. */
4203 if (cur_curlyx->u.curlyx.minmod) {
4204 ST.save_curlyx = cur_curlyx;
4205 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4206 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4207 REGCP_SET(ST.lastcp);
4208 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4212 /* Prefer A over B for maximal matching. */
4214 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4215 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4216 cur_curlyx->u.curlyx.lastloc = locinput;
4217 REGCP_SET(ST.lastcp);
4218 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4221 goto do_whilem_B_max;
4225 case WHILEM_B_min: /* just matched B in a minimal match */
4226 case WHILEM_B_max: /* just matched B in a maximal match */
4227 cur_curlyx = ST.save_curlyx;
4231 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4232 cur_curlyx = ST.save_curlyx;
4233 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4234 cur_curlyx->u.curlyx.count--;
4238 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4239 REGCP_UNWIND(ST.lastcp);
4242 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4243 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4244 cur_curlyx->u.curlyx.count--;
4248 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4249 REGCP_UNWIND(ST.lastcp);
4250 regcppop(rex); /* Restore some previous $<digit>s? */
4251 PL_reginput = locinput;
4252 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4253 "%*s whilem: failed, trying continuation...\n",
4254 REPORT_CODE_OFF+depth*2, "")
4257 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4258 && ckWARN(WARN_REGEXP)
4259 && !(PL_reg_flags & RF_warned))
4261 PL_reg_flags |= RF_warned;
4262 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4263 "Complex regular subexpression recursion",
4268 ST.save_curlyx = cur_curlyx;
4269 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4270 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4273 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4274 cur_curlyx = ST.save_curlyx;
4275 REGCP_UNWIND(ST.lastcp);
4278 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4279 /* Maximum greed exceeded */
4280 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4281 && ckWARN(WARN_REGEXP)
4282 && !(PL_reg_flags & RF_warned))
4284 PL_reg_flags |= RF_warned;
4285 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4286 "%s limit (%d) exceeded",
4287 "Complex regular subexpression recursion",
4290 cur_curlyx->u.curlyx.count--;
4294 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4295 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4297 /* Try grabbing another A and see if it helps. */
4298 PL_reginput = locinput;
4299 cur_curlyx->u.curlyx.lastloc = locinput;
4300 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4301 REGCP_SET(ST.lastcp);
4302 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4306 #define ST st->u.branch
4308 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4309 next = scan + ARG(scan);
4312 scan = NEXTOPER(scan);
4315 case BRANCH: /* /(...|A|...)/ */
4316 scan = NEXTOPER(scan); /* scan now points to inner node */
4317 ST.lastparen = *PL_reglastparen;
4318 ST.next_branch = next;
4320 PL_reginput = locinput;
4322 /* Now go into the branch */
4324 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4326 PUSH_STATE_GOTO(BRANCH_next, scan);
4330 PL_reginput = locinput;
4331 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4332 (SV*)rexi->data->data[ ARG( scan ) ];
4333 PUSH_STATE_GOTO(CUTGROUP_next,next);
4335 case CUTGROUP_next_fail:
4338 if (st->u.mark.mark_name)
4339 sv_commit = st->u.mark.mark_name;
4345 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4350 REGCP_UNWIND(ST.cp);
4351 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4352 PL_regoffs[n].end = -1;
4353 *PL_reglastparen = n;
4354 /*dmq: *PL_reglastcloseparen = n; */
4355 scan = ST.next_branch;
4356 /* no more branches? */
4357 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4359 PerlIO_printf( Perl_debug_log,
4360 "%*s %sBRANCH failed...%s\n",
4361 REPORT_CODE_OFF+depth*2, "",
4367 continue; /* execute next BRANCH[J] op */
4375 #define ST st->u.curlym
4377 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4379 /* This is an optimisation of CURLYX that enables us to push
4380 * only a single backtracking state, no matter now many matches
4381 * there are in {m,n}. It relies on the pattern being constant
4382 * length, with no parens to influence future backrefs
4386 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4388 /* if paren positive, emulate an OPEN/CLOSE around A */
4390 U32 paren = ST.me->flags;
4391 if (paren > PL_regsize)
4393 if (paren > *PL_reglastparen)
4394 *PL_reglastparen = paren;
4395 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4403 ST.c1 = CHRTEST_UNINIT;
4406 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4409 curlym_do_A: /* execute the A in /A{m,n}B/ */
4410 PL_reginput = locinput;
4411 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4414 case CURLYM_A: /* we've just matched an A */
4415 locinput = st->locinput;
4416 nextchr = UCHARAT(locinput);
4419 /* after first match, determine A's length: u.curlym.alen */
4420 if (ST.count == 1) {
4421 if (PL_reg_match_utf8) {
4423 while (s < PL_reginput) {
4429 ST.alen = PL_reginput - locinput;
4432 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4435 PerlIO_printf(Perl_debug_log,
4436 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4437 (int)(REPORT_CODE_OFF+(depth*2)), "",
4438 (IV) ST.count, (IV)ST.alen)
4441 locinput = PL_reginput;
4443 if (cur_eval && cur_eval->u.eval.close_paren &&
4444 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4447 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4448 goto curlym_do_A; /* try to match another A */
4449 goto curlym_do_B; /* try to match B */
4451 case CURLYM_A_fail: /* just failed to match an A */
4452 REGCP_UNWIND(ST.cp);
4454 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4455 || (cur_eval && cur_eval->u.eval.close_paren &&
4456 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4459 curlym_do_B: /* execute the B in /A{m,n}B/ */
4460 PL_reginput = locinput;
4461 if (ST.c1 == CHRTEST_UNINIT) {
4462 /* calculate c1 and c2 for possible match of 1st char
4463 * following curly */
4464 ST.c1 = ST.c2 = CHRTEST_VOID;
4465 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4466 regnode *text_node = ST.B;
4467 if (! HAS_TEXT(text_node))
4468 FIND_NEXT_IMPT(text_node);
4471 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4473 But the former is redundant in light of the latter.
4475 if this changes back then the macro for
4476 IS_TEXT and friends need to change.
4478 if (PL_regkind[OP(text_node)] == EXACT)
4481 ST.c1 = (U8)*STRING(text_node);
4483 (IS_TEXTF(text_node))
4485 : (IS_TEXTFL(text_node))
4486 ? PL_fold_locale[ST.c1]
4493 PerlIO_printf(Perl_debug_log,
4494 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4495 (int)(REPORT_CODE_OFF+(depth*2)),
4498 if (ST.c1 != CHRTEST_VOID
4499 && UCHARAT(PL_reginput) != ST.c1
4500 && UCHARAT(PL_reginput) != ST.c2)
4502 /* simulate B failing */
4504 PerlIO_printf(Perl_debug_log,
4505 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4506 (int)(REPORT_CODE_OFF+(depth*2)),"",
4509 state_num = CURLYM_B_fail;
4510 goto reenter_switch;
4514 /* mark current A as captured */
4515 I32 paren = ST.me->flags;
4517 PL_regoffs[paren].start
4518 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4519 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4520 /*dmq: *PL_reglastcloseparen = paren; */
4523 PL_regoffs[paren].end = -1;
4524 if (cur_eval && cur_eval->u.eval.close_paren &&
4525 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4534 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4537 case CURLYM_B_fail: /* just failed to match a B */
4538 REGCP_UNWIND(ST.cp);
4540 if (ST.count == ARG2(ST.me) /* max */)
4542 goto curlym_do_A; /* try to match a further A */
4544 /* backtrack one A */
4545 if (ST.count == ARG1(ST.me) /* min */)
4548 locinput = HOPc(locinput, -ST.alen);
4549 goto curlym_do_B; /* try to match B */
4552 #define ST st->u.curly
4554 #define CURLY_SETPAREN(paren, success) \
4557 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4558 PL_regoffs[paren].end = locinput - PL_bostr; \
4559 *PL_reglastcloseparen = paren; \
4562 PL_regoffs[paren].end = -1; \
4565 case STAR: /* /A*B/ where A is width 1 */
4569 scan = NEXTOPER(scan);
4571 case PLUS: /* /A+B/ where A is width 1 */
4575 scan = NEXTOPER(scan);
4577 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4578 ST.paren = scan->flags; /* Which paren to set */
4579 if (ST.paren > PL_regsize)
4580 PL_regsize = ST.paren;
4581 if (ST.paren > *PL_reglastparen)
4582 *PL_reglastparen = ST.paren;
4583 ST.min = ARG1(scan); /* min to match */
4584 ST.max = ARG2(scan); /* max to match */
4585 if (cur_eval && cur_eval->u.eval.close_paren &&
4586 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4590 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4592 case CURLY: /* /A{m,n}B/ where A is width 1 */
4594 ST.min = ARG1(scan); /* min to match */
4595 ST.max = ARG2(scan); /* max to match */
4596 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4599 * Lookahead to avoid useless match attempts
4600 * when we know what character comes next.
4602 * Used to only do .*x and .*?x, but now it allows
4603 * for )'s, ('s and (?{ ... })'s to be in the way
4604 * of the quantifier and the EXACT-like node. -- japhy
4607 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4609 if (HAS_TEXT(next) || JUMPABLE(next)) {
4611 regnode *text_node = next;
4613 if (! HAS_TEXT(text_node))
4614 FIND_NEXT_IMPT(text_node);
4616 if (! HAS_TEXT(text_node))
4617 ST.c1 = ST.c2 = CHRTEST_VOID;
4619 if ( PL_regkind[OP(text_node)] != EXACT ) {
4620 ST.c1 = ST.c2 = CHRTEST_VOID;
4621 goto assume_ok_easy;
4624 s = (U8*)STRING(text_node);
4626 /* Currently we only get here when
4628 PL_rekind[OP(text_node)] == EXACT
4630 if this changes back then the macro for IS_TEXT and
4631 friends need to change. */
4634 if (IS_TEXTF(text_node))
4635 ST.c2 = PL_fold[ST.c1];
4636 else if (IS_TEXTFL(text_node))
4637 ST.c2 = PL_fold_locale[ST.c1];
4640 if (IS_TEXTF(text_node)) {
4641 STRLEN ulen1, ulen2;
4642 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4643 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4645 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4646 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4648 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4650 0 : UTF8_ALLOW_ANY);
4651 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4653 0 : UTF8_ALLOW_ANY);
4655 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4657 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4662 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4669 ST.c1 = ST.c2 = CHRTEST_VOID;
4674 PL_reginput = locinput;
4677 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4680 locinput = PL_reginput;
4682 if (ST.c1 == CHRTEST_VOID)
4683 goto curly_try_B_min;
4685 ST.oldloc = locinput;
4687 /* set ST.maxpos to the furthest point along the
4688 * string that could possibly match */
4689 if (ST.max == REG_INFTY) {
4690 ST.maxpos = PL_regeol - 1;
4692 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4696 int m = ST.max - ST.min;
4697 for (ST.maxpos = locinput;
4698 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4699 ST.maxpos += UTF8SKIP(ST.maxpos);
4702 ST.maxpos = locinput + ST.max - ST.min;
4703 if (ST.maxpos >= PL_regeol)
4704 ST.maxpos = PL_regeol - 1;
4706 goto curly_try_B_min_known;
4710 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4711 locinput = PL_reginput;
4712 if (ST.count < ST.min)
4714 if ((ST.count > ST.min)
4715 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4717 /* A{m,n} must come at the end of the string, there's
4718 * no point in backing off ... */
4720 /* ...except that $ and \Z can match before *and* after
4721 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4722 We may back off by one in this case. */
4723 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4727 goto curly_try_B_max;
4732 case CURLY_B_min_known_fail:
4733 /* failed to find B in a non-greedy match where c1,c2 valid */
4734 if (ST.paren && ST.count)
4735 PL_regoffs[ST.paren].end = -1;
4737 PL_reginput = locinput; /* Could be reset... */
4738 REGCP_UNWIND(ST.cp);
4739 /* Couldn't or didn't -- move forward. */
4740 ST.oldloc = locinput;
4742 locinput += UTF8SKIP(locinput);
4746 curly_try_B_min_known:
4747 /* find the next place where 'B' could work, then call B */
4751 n = (ST.oldloc == locinput) ? 0 : 1;
4752 if (ST.c1 == ST.c2) {
4754 /* set n to utf8_distance(oldloc, locinput) */
4755 while (locinput <= ST.maxpos &&
4756 utf8n_to_uvchr((U8*)locinput,
4757 UTF8_MAXBYTES, &len,
4758 uniflags) != (UV)ST.c1) {
4764 /* set n to utf8_distance(oldloc, locinput) */
4765 while (locinput <= ST.maxpos) {
4767 const UV c = utf8n_to_uvchr((U8*)locinput,
4768 UTF8_MAXBYTES, &len,
4770 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4778 if (ST.c1 == ST.c2) {
4779 while (locinput <= ST.maxpos &&
4780 UCHARAT(locinput) != ST.c1)
4784 while (locinput <= ST.maxpos
4785 && UCHARAT(locinput) != ST.c1
4786 && UCHARAT(locinput) != ST.c2)
4789 n = locinput - ST.oldloc;
4791 if (locinput > ST.maxpos)
4793 /* PL_reginput == oldloc now */
4796 if (regrepeat(rex, ST.A, n, depth) < n)
4799 PL_reginput = locinput;
4800 CURLY_SETPAREN(ST.paren, ST.count);
4801 if (cur_eval && cur_eval->u.eval.close_paren &&
4802 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4805 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4810 case CURLY_B_min_fail:
4811 /* failed to find B in a non-greedy match where c1,c2 invalid */
4812 if (ST.paren && ST.count)
4813 PL_regoffs[ST.paren].end = -1;
4815 REGCP_UNWIND(ST.cp);
4816 /* failed -- move forward one */
4817 PL_reginput = locinput;
4818 if (regrepeat(rex, ST.A, 1, depth)) {
4820 locinput = PL_reginput;
4821 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4822 ST.count > 0)) /* count overflow ? */
4825 CURLY_SETPAREN(ST.paren, ST.count);
4826 if (cur_eval && cur_eval->u.eval.close_paren &&
4827 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4830 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4838 /* a successful greedy match: now try to match B */
4839 if (cur_eval && cur_eval->u.eval.close_paren &&
4840 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4845 if (ST.c1 != CHRTEST_VOID)
4846 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4847 UTF8_MAXBYTES, 0, uniflags)
4848 : (UV) UCHARAT(PL_reginput);
4849 /* If it could work, try it. */
4850 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4851 CURLY_SETPAREN(ST.paren, ST.count);
4852 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4857 case CURLY_B_max_fail:
4858 /* failed to find B in a greedy match */
4859 if (ST.paren && ST.count)
4860 PL_regoffs[ST.paren].end = -1;
4862 REGCP_UNWIND(ST.cp);
4864 if (--ST.count < ST.min)
4866 PL_reginput = locinput = HOPc(locinput, -1);
4867 goto curly_try_B_max;
4874 /* we've just finished A in /(??{A})B/; now continue with B */
4876 st->u.eval.toggle_reg_flags
4877 = cur_eval->u.eval.toggle_reg_flags;
4878 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4880 st->u.eval.prev_rex = rex_sv; /* inner */
4881 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4882 rex = (struct regexp *)SvANY(rex_sv);
4883 rexi = RXi_GET(rex);
4884 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4885 ReREFCNT_inc(rex_sv);
4886 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4887 REGCP_SET(st->u.eval.lastcp);
4888 PL_reginput = locinput;
4890 /* Restore parens of the outer rex without popping the
4892 tmpix = PL_savestack_ix;
4893 PL_savestack_ix = cur_eval->u.eval.lastcp;
4895 PL_savestack_ix = tmpix;
4897 st->u.eval.prev_eval = cur_eval;
4898 cur_eval = cur_eval->u.eval.prev_eval;
4900 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4901 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4902 if ( nochange_depth )
4905 PUSH_YES_STATE_GOTO(EVAL_AB,
4906 st->u.eval.prev_eval->u.eval.B); /* match B */
4909 if (locinput < reginfo->till) {
4910 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4911 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4913 (long)(locinput - PL_reg_starttry),
4914 (long)(reginfo->till - PL_reg_starttry),
4917 sayNO_SILENT; /* Cannot match: too short. */
4919 PL_reginput = locinput; /* put where regtry can find it */
4920 sayYES; /* Success! */
4922 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4924 PerlIO_printf(Perl_debug_log,
4925 "%*s %ssubpattern success...%s\n",
4926 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4927 PL_reginput = locinput; /* put where regtry can find it */
4928 sayYES; /* Success! */
4931 #define ST st->u.ifmatch
4933 case SUSPEND: /* (?>A) */
4935 PL_reginput = locinput;
4938 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4940 goto ifmatch_trivial_fail_test;
4942 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4944 ifmatch_trivial_fail_test:
4946 char * const s = HOPBACKc(locinput, scan->flags);
4951 sw = 1 - (bool)ST.wanted;
4955 next = scan + ARG(scan);
4963 PL_reginput = locinput;
4967 ST.logical = logical;
4968 /* execute body of (?...A) */
4969 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4972 case IFMATCH_A_fail: /* body of (?...A) failed */
4973 ST.wanted = !ST.wanted;
4976 case IFMATCH_A: /* body of (?...A) succeeded */
4978 sw = (bool)ST.wanted;
4980 else if (!ST.wanted)
4983 if (OP(ST.me) == SUSPEND)
4984 locinput = PL_reginput;
4986 locinput = PL_reginput = st->locinput;
4987 nextchr = UCHARAT(locinput);
4989 scan = ST.me + ARG(ST.me);
4992 continue; /* execute B */
4997 next = scan + ARG(scan);
5002 reginfo->cutpoint = PL_regeol;
5005 PL_reginput = locinput;
5007 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
5008 PUSH_STATE_GOTO(COMMIT_next,next);
5010 case COMMIT_next_fail:
5017 #define ST st->u.mark
5019 ST.prev_mark = mark_state;
5020 ST.mark_name = sv_commit = sv_yes_mark
5021 = (SV*)rexi->data->data[ ARG( scan ) ];
5023 ST.mark_loc = PL_reginput = locinput;
5024 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5026 case MARKPOINT_next:
5027 mark_state = ST.prev_mark;
5030 case MARKPOINT_next_fail:
5031 if (popmark && sv_eq(ST.mark_name,popmark))
5033 if (ST.mark_loc > startpoint)
5034 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5035 popmark = NULL; /* we found our mark */
5036 sv_commit = ST.mark_name;
5039 PerlIO_printf(Perl_debug_log,
5040 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5041 REPORT_CODE_OFF+depth*2, "",
5042 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5045 mark_state = ST.prev_mark;
5046 sv_yes_mark = mark_state ?
5047 mark_state->u.mark.mark_name : NULL;
5051 PL_reginput = locinput;
5053 /* (*SKIP) : if we fail we cut here*/
5054 ST.mark_name = NULL;
5055 ST.mark_loc = locinput;
5056 PUSH_STATE_GOTO(SKIP_next,next);
5058 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5059 otherwise do nothing. Meaning we need to scan
5061 regmatch_state *cur = mark_state;
5062 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
5065 if ( sv_eq( cur->u.mark.mark_name,
5068 ST.mark_name = find;
5069 PUSH_STATE_GOTO( SKIP_next, next );
5071 cur = cur->u.mark.prev_mark;
5074 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5076 case SKIP_next_fail:
5078 /* (*CUT:NAME) - Set up to search for the name as we
5079 collapse the stack*/
5080 popmark = ST.mark_name;
5082 /* (*CUT) - No name, we cut here.*/
5083 if (ST.mark_loc > startpoint)
5084 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5085 /* but we set sv_commit to latest mark_name if there
5086 is one so they can test to see how things lead to this
5089 sv_commit=mark_state->u.mark.mark_name;
5097 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5099 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5102 U8 folded[UTF8_MAXBYTES_CASE+1];
5104 const char * const l = locinput;
5105 char *e = PL_regeol;
5106 to_uni_fold(n, folded, &foldlen);
5108 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
5109 l, &e, 0, do_utf8)) {
5114 nextchr = UCHARAT(locinput);
5117 if ((n=is_LNBREAK(locinput,do_utf8))) {
5119 nextchr = UCHARAT(locinput);
5124 #define CASE_CLASS(nAmE) \
5126 if ((n=is_##nAmE(locinput,do_utf8))) { \
5128 nextchr = UCHARAT(locinput); \
5133 if ((n=is_##nAmE(locinput,do_utf8))) { \
5136 locinput += UTF8SKIP(locinput); \
5137 nextchr = UCHARAT(locinput); \
5142 CASE_CLASS(HORIZWS);
5146 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5147 PTR2UV(scan), OP(scan));
5148 Perl_croak(aTHX_ "regexp memory corruption");
5152 /* switch break jumps here */
5153 scan = next; /* prepare to execute the next op and ... */
5154 continue; /* ... jump back to the top, reusing st */
5158 /* push a state that backtracks on success */
5159 st->u.yes.prev_yes_state = yes_state;
5163 /* push a new regex state, then continue at scan */
5165 regmatch_state *newst;
5168 regmatch_state *cur = st;
5169 regmatch_state *curyes = yes_state;
5171 regmatch_slab *slab = PL_regmatch_slab;
5172 for (;curd > -1;cur--,curd--) {
5173 if (cur < SLAB_FIRST(slab)) {
5175 cur = SLAB_LAST(slab);
5177 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5178 REPORT_CODE_OFF + 2 + depth * 2,"",
5179 curd, PL_reg_name[cur->resume_state],
5180 (curyes == cur) ? "yes" : ""
5183 curyes = cur->u.yes.prev_yes_state;
5186 DEBUG_STATE_pp("push")
5189 st->locinput = locinput;
5191 if (newst > SLAB_LAST(PL_regmatch_slab))
5192 newst = S_push_slab(aTHX);
5193 PL_regmatch_state = newst;
5195 locinput = PL_reginput;
5196 nextchr = UCHARAT(locinput);
5204 * We get here only if there's trouble -- normally "case END" is
5205 * the terminating point.
5207 Perl_croak(aTHX_ "corrupted regexp pointers");
5213 /* we have successfully completed a subexpression, but we must now
5214 * pop to the state marked by yes_state and continue from there */
5215 assert(st != yes_state);
5217 while (st != yes_state) {
5219 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5220 PL_regmatch_slab = PL_regmatch_slab->prev;
5221 st = SLAB_LAST(PL_regmatch_slab);
5225 DEBUG_STATE_pp("pop (no final)");
5227 DEBUG_STATE_pp("pop (yes)");
5233 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5234 || yes_state > SLAB_LAST(PL_regmatch_slab))
5236 /* not in this slab, pop slab */
5237 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5238 PL_regmatch_slab = PL_regmatch_slab->prev;
5239 st = SLAB_LAST(PL_regmatch_slab);
5241 depth -= (st - yes_state);
5244 yes_state = st->u.yes.prev_yes_state;
5245 PL_regmatch_state = st;
5248 locinput= st->locinput;
5249 nextchr = UCHARAT(locinput);
5251 state_num = st->resume_state + no_final;
5252 goto reenter_switch;
5255 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5256 PL_colors[4], PL_colors[5]));
5258 if (PL_reg_eval_set) {
5259 /* each successfully executed (?{...}) block does the equivalent of
5260 * local $^R = do {...}
5261 * When popping the save stack, all these locals would be undone;
5262 * bypass this by setting the outermost saved $^R to the latest
5264 if (oreplsv != GvSV(PL_replgv))
5265 sv_setsv(oreplsv, GvSV(PL_replgv));
5272 PerlIO_printf(Perl_debug_log,
5273 "%*s %sfailed...%s\n",
5274 REPORT_CODE_OFF+depth*2, "",
5275 PL_colors[4], PL_colors[5])
5287 /* there's a previous state to backtrack to */
5289 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5290 PL_regmatch_slab = PL_regmatch_slab->prev;
5291 st = SLAB_LAST(PL_regmatch_slab);
5293 PL_regmatch_state = st;
5294 locinput= st->locinput;
5295 nextchr = UCHARAT(locinput);
5297 DEBUG_STATE_pp("pop");
5299 if (yes_state == st)
5300 yes_state = st->u.yes.prev_yes_state;
5302 state_num = st->resume_state + 1; /* failure = success + 1 */
5303 goto reenter_switch;
5308 if (rex->intflags & PREGf_VERBARG_SEEN) {
5309 SV *sv_err = get_sv("REGERROR", 1);
5310 SV *sv_mrk = get_sv("REGMARK", 1);
5312 sv_commit = &PL_sv_no;
5314 sv_yes_mark = &PL_sv_yes;
5317 sv_commit = &PL_sv_yes;
5318 sv_yes_mark = &PL_sv_no;
5320 sv_setsv(sv_err, sv_commit);
5321 sv_setsv(sv_mrk, sv_yes_mark);
5324 /* clean up; in particular, free all slabs above current one */
5325 LEAVE_SCOPE(oldsave);
5331 - regrepeat - repeatedly match something simple, report how many
5334 * [This routine now assumes that it will only match on things of length 1.
5335 * That was true before, but now we assume scan - reginput is the count,
5336 * rather than incrementing count on every character. [Er, except utf8.]]
5339 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5342 register char *scan;
5344 register char *loceol = PL_regeol;
5345 register I32 hardcount = 0;
5346 register bool do_utf8 = PL_reg_match_utf8;
5348 PERL_UNUSED_ARG(depth);
5352 if (max == REG_INFTY)
5354 else if (max < loceol - scan)
5355 loceol = scan + max;
5360 while (scan < loceol && hardcount < max && *scan != '\n') {
5361 scan += UTF8SKIP(scan);
5365 while (scan < loceol && *scan != '\n')
5372 while (scan < loceol && hardcount < max) {
5373 scan += UTF8SKIP(scan);
5383 case EXACT: /* length of string is 1 */
5385 while (scan < loceol && UCHARAT(scan) == c)
5388 case EXACTF: /* length of string is 1 */
5390 while (scan < loceol &&
5391 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5394 case EXACTFL: /* length of string is 1 */
5395 PL_reg_flags |= RF_tainted;
5397 while (scan < loceol &&
5398 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5404 while (hardcount < max && scan < loceol &&
5405 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5406 scan += UTF8SKIP(scan);
5410 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5417 LOAD_UTF8_CHARCLASS_ALNUM();
5418 while (hardcount < max && scan < loceol &&
5419 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5420 scan += UTF8SKIP(scan);
5424 while (scan < loceol && isALNUM(*scan))
5429 PL_reg_flags |= RF_tainted;
5432 while (hardcount < max && scan < loceol &&
5433 isALNUM_LC_utf8((U8*)scan)) {
5434 scan += UTF8SKIP(scan);
5438 while (scan < loceol && isALNUM_LC(*scan))
5445 LOAD_UTF8_CHARCLASS_ALNUM();
5446 while (hardcount < max && scan < loceol &&
5447 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5448 scan += UTF8SKIP(scan);
5452 while (scan < loceol && !isALNUM(*scan))
5457 PL_reg_flags |= RF_tainted;
5460 while (hardcount < max && scan < loceol &&
5461 !isALNUM_LC_utf8((U8*)scan)) {
5462 scan += UTF8SKIP(scan);
5466 while (scan < loceol && !isALNUM_LC(*scan))
5473 LOAD_UTF8_CHARCLASS_SPACE();
5474 while (hardcount < max && scan < loceol &&
5476 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5477 scan += UTF8SKIP(scan);
5481 while (scan < loceol && isSPACE(*scan))
5486 PL_reg_flags |= RF_tainted;
5489 while (hardcount < max && scan < loceol &&
5490 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5491 scan += UTF8SKIP(scan);
5495 while (scan < loceol && isSPACE_LC(*scan))
5502 LOAD_UTF8_CHARCLASS_SPACE();
5503 while (hardcount < max && scan < loceol &&
5505 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5506 scan += UTF8SKIP(scan);
5510 while (scan < loceol && !isSPACE(*scan))
5515 PL_reg_flags |= RF_tainted;
5518 while (hardcount < max && scan < loceol &&
5519 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5520 scan += UTF8SKIP(scan);
5524 while (scan < loceol && !isSPACE_LC(*scan))
5531 LOAD_UTF8_CHARCLASS_DIGIT();
5532 while (hardcount < max && scan < loceol &&
5533 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5534 scan += UTF8SKIP(scan);
5538 while (scan < loceol && isDIGIT(*scan))
5545 LOAD_UTF8_CHARCLASS_DIGIT();
5546 while (hardcount < max && scan < loceol &&
5547 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5548 scan += UTF8SKIP(scan);
5552 while (scan < loceol && !isDIGIT(*scan))
5558 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5564 LNBREAK can match two latin chars, which is ok,
5565 because we have a null terminated string, but we
5566 have to use hardcount in this situation
5568 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5577 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5582 while (scan < loceol && is_HORIZWS_latin1(scan))
5589 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5590 scan += UTF8SKIP(scan);
5594 while (scan < loceol && !is_HORIZWS_latin1(scan))
5602 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5607 while (scan < loceol && is_VERTWS_latin1(scan))
5615 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5616 scan += UTF8SKIP(scan);
5620 while (scan < loceol && !is_VERTWS_latin1(scan))
5626 default: /* Called on something of 0 width. */
5627 break; /* So match right here or not at all. */
5633 c = scan - PL_reginput;
5637 GET_RE_DEBUG_FLAGS_DECL;
5639 SV * const prop = sv_newmortal();
5640 regprop(prog, prop, p);
5641 PerlIO_printf(Perl_debug_log,
5642 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5643 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5651 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5653 - regclass_swash - prepare the utf8 swash
5657 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5663 RXi_GET_DECL(prog,progi);
5664 const struct reg_data * const data = prog ? progi->data : NULL;
5666 if (data && data->count) {
5667 const U32 n = ARG(node);
5669 if (data->what[n] == 's') {
5670 SV * const rv = (SV*)data->data[n];
5671 AV * const av = (AV*)SvRV((SV*)rv);
5672 SV **const ary = AvARRAY(av);
5675 /* See the end of regcomp.c:S_regclass() for
5676 * documentation of these array elements. */
5679 a = SvROK(ary[1]) ? &ary[1] : NULL;
5680 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5684 else if (si && doinit) {
5685 sw = swash_init("utf8", "", si, 1, 0);
5686 (void)av_store(av, 1, sw);
5703 - reginclass - determine if a character falls into a character class
5705 The n is the ANYOF regnode, the p is the target string, lenp
5706 is pointer to the maximum length of how far to go in the p
5707 (if the lenp is zero, UTF8SKIP(p) is used),
5708 do_utf8 tells whether the target string is in UTF-8.
5713 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5716 const char flags = ANYOF_FLAGS(n);
5722 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5723 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5724 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5725 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5726 if (len == (STRLEN)-1)
5727 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5730 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5731 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5734 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5735 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5738 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5742 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5745 if (swash_fetch(sw, p, do_utf8))
5747 else if (flags & ANYOF_FOLD) {
5748 if (!match && lenp && av) {
5750 for (i = 0; i <= av_len(av); i++) {
5751 SV* const sv = *av_fetch(av, i, FALSE);
5753 const char * const s = SvPV_const(sv, len);
5755 if (len <= plen && memEQ(s, (char*)p, len)) {
5763 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5766 to_utf8_fold(p, tmpbuf, &tmplen);
5767 if (swash_fetch(sw, tmpbuf, do_utf8))
5773 if (match && lenp && *lenp == 0)
5774 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5776 if (!match && c < 256) {
5777 if (ANYOF_BITMAP_TEST(n, c))
5779 else if (flags & ANYOF_FOLD) {
5782 if (flags & ANYOF_LOCALE) {
5783 PL_reg_flags |= RF_tainted;
5784 f = PL_fold_locale[c];
5788 if (f != c && ANYOF_BITMAP_TEST(n, f))
5792 if (!match && (flags & ANYOF_CLASS)) {
5793 PL_reg_flags |= RF_tainted;
5795 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5796 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5797 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5798 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5799 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5800 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5801 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5802 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5803 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5804 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5805 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5806 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5807 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5808 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5809 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5810 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5811 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5812 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5813 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5814 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5815 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5816 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5817 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5818 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5819 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5820 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5821 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5822 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5823 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5824 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5825 ) /* How's that for a conditional? */
5832 return (flags & ANYOF_INVERT) ? !match : match;
5836 S_reghop3(U8 *s, I32 off, const U8* lim)
5840 while (off-- && s < lim) {
5841 /* XXX could check well-formedness here */
5846 while (off++ && s > lim) {
5848 if (UTF8_IS_CONTINUED(*s)) {
5849 while (s > lim && UTF8_IS_CONTINUATION(*s))
5852 /* XXX could check well-formedness here */
5859 /* there are a bunch of places where we use two reghop3's that should
5860 be replaced with this routine. but since thats not done yet
5861 we ifdef it out - dmq
5864 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5868 while (off-- && s < rlim) {
5869 /* XXX could check well-formedness here */
5874 while (off++ && s > llim) {
5876 if (UTF8_IS_CONTINUED(*s)) {
5877 while (s > llim && UTF8_IS_CONTINUATION(*s))
5880 /* XXX could check well-formedness here */
5888 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5892 while (off-- && s < lim) {
5893 /* XXX could check well-formedness here */
5900 while (off++ && s > lim) {
5902 if (UTF8_IS_CONTINUED(*s)) {
5903 while (s > lim && UTF8_IS_CONTINUATION(*s))
5906 /* XXX could check well-formedness here */
5915 restore_pos(pTHX_ void *arg)
5918 regexp * const rex = (regexp *)arg;
5919 if (PL_reg_eval_set) {
5920 if (PL_reg_oldsaved) {
5921 rex->subbeg = PL_reg_oldsaved;
5922 rex->sublen = PL_reg_oldsavedlen;
5923 #ifdef PERL_OLD_COPY_ON_WRITE
5924 rex->saved_copy = PL_nrs;
5926 RXp_MATCH_COPIED_on(rex);
5928 PL_reg_magic->mg_len = PL_reg_oldpos;
5929 PL_reg_eval_set = 0;
5930 PL_curpm = PL_reg_oldcurpm;
5935 S_to_utf8_substr(pTHX_ register regexp *prog)
5939 if (prog->substrs->data[i].substr
5940 && !prog->substrs->data[i].utf8_substr) {
5941 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5942 prog->substrs->data[i].utf8_substr = sv;
5943 sv_utf8_upgrade(sv);
5944 if (SvVALID(prog->substrs->data[i].substr)) {
5945 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5946 if (flags & FBMcf_TAIL) {
5947 /* Trim the trailing \n that fbm_compile added last
5949 SvCUR_set(sv, SvCUR(sv) - 1);
5950 /* Whilst this makes the SV technically "invalid" (as its
5951 buffer is no longer followed by "\0") when fbm_compile()
5952 adds the "\n" back, a "\0" is restored. */
5954 fbm_compile(sv, flags);
5956 if (prog->substrs->data[i].substr == prog->check_substr)
5957 prog->check_utf8 = sv;
5963 S_to_byte_substr(pTHX_ register regexp *prog)
5968 if (prog->substrs->data[i].utf8_substr
5969 && !prog->substrs->data[i].substr) {
5970 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5971 if (sv_utf8_downgrade(sv, TRUE)) {
5972 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5974 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5975 if (flags & FBMcf_TAIL) {
5976 /* Trim the trailing \n that fbm_compile added last
5978 SvCUR_set(sv, SvCUR(sv) - 1);
5980 fbm_compile(sv, flags);
5986 prog->substrs->data[i].substr = sv;
5987 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5988 prog->check_substr = sv;
5995 * c-indentation-style: bsd
5997 * indent-tabs-mode: t
6000 * ex: set ts=8 sts=4 sw=4 noet: