5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 7
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regoffs[p].end);
193 SSPUSHINT(PL_regoffs[p].start);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regoffs[p].start,
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
200 (IV)PL_regoffs[p].end
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regoffs);
205 SSPUSHINT(PL_regsize);
206 SSPUSHINT(*PL_reglastparen);
207 SSPUSHINT(*PL_reglastcloseparen);
208 SSPUSHPTR(PL_reginput);
209 #define REGCP_FRAME_ELEMS 2
210 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
211 * are needed for the regexp context stack bookkeeping. */
212 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
213 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
218 /* These are needed since we do not localize EVAL nodes: */
219 #define REGCP_SET(cp) \
221 PerlIO_printf(Perl_debug_log, \
222 " Setting an EVAL scope, savestack=%"IVdf"\n", \
223 (IV)PL_savestack_ix)); \
226 #define REGCP_UNWIND(cp) \
228 if (cp != PL_savestack_ix) \
229 PerlIO_printf(Perl_debug_log, \
230 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
231 (IV)(cp), (IV)PL_savestack_ix)); \
235 S_regcppop(pTHX_ const regexp *rex)
240 GET_RE_DEBUG_FLAGS_DECL;
242 PERL_ARGS_ASSERT_REGCPPOP;
244 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
246 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
247 i = SSPOPINT; /* Parentheses elements to pop. */
248 input = (char *) SSPOPPTR;
249 *PL_reglastcloseparen = SSPOPINT;
250 *PL_reglastparen = SSPOPINT;
251 PL_regsize = SSPOPINT;
252 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
255 /* Now restore the parentheses context. */
256 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
257 i > 0; i -= REGCP_PAREN_ELEMS) {
259 U32 paren = (U32)SSPOPINT;
260 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
261 PL_regoffs[paren].start = SSPOPINT;
263 if (paren <= *PL_reglastparen)
264 PL_regoffs[paren].end = tmps;
266 PerlIO_printf(Perl_debug_log,
267 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
268 (UV)paren, (IV)PL_regoffs[paren].start,
269 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
270 (IV)PL_regoffs[paren].end,
271 (paren > *PL_reglastparen ? "(no)" : ""));
275 if (*PL_reglastparen + 1 <= rex->nparens) {
276 PerlIO_printf(Perl_debug_log,
277 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
278 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
282 /* It would seem that the similar code in regtry()
283 * already takes care of this, and in fact it is in
284 * a better location to since this code can #if 0-ed out
285 * but the code in regtry() is needed or otherwise tests
286 * requiring null fields (pat.t#187 and split.t#{13,14}
287 * (as of patchlevel 7877) will fail. Then again,
288 * this code seems to be necessary or otherwise
289 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
290 * --jhi updated by dapm */
291 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
293 PL_regoffs[i].start = -1;
294 PL_regoffs[i].end = -1;
300 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
303 * pregexec and friends
306 #ifndef PERL_IN_XSUB_RE
308 - pregexec - match a regexp against a string
311 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
312 char *strbeg, I32 minend, SV *screamer, U32 nosave)
313 /* strend: pointer to null at end of string */
314 /* strbeg: real beginning of string */
315 /* minend: end of match must be >=minend after stringarg. */
316 /* nosave: For optimizations. */
318 PERL_ARGS_ASSERT_PREGEXEC;
321 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
322 nosave ? 0 : REXEC_COPY_STR);
327 * Need to implement the following flags for reg_anch:
329 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
331 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
332 * INTUIT_AUTORITATIVE_ML
333 * INTUIT_ONCE_NOML - Intuit can match in one location only.
336 * Another flag for this function: SECOND_TIME (so that float substrs
337 * with giant delta may be not rechecked).
340 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
342 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
343 Otherwise, only SvCUR(sv) is used to get strbeg. */
345 /* XXXX We assume that strpos is strbeg unless sv. */
347 /* XXXX Some places assume that there is a fixed substring.
348 An update may be needed if optimizer marks as "INTUITable"
349 RExen without fixed substrings. Similarly, it is assumed that
350 lengths of all the strings are no more than minlen, thus they
351 cannot come from lookahead.
352 (Or minlen should take into account lookahead.)
353 NOTE: Some of this comment is not correct. minlen does now take account
354 of lookahead/behind. Further research is required. -- demerphq
358 /* A failure to find a constant substring means that there is no need to make
359 an expensive call to REx engine, thus we celebrate a failure. Similarly,
360 finding a substring too deep into the string means that less calls to
361 regtry() should be needed.
363 REx compiler's optimizer found 4 possible hints:
364 a) Anchored substring;
366 c) Whether we are anchored (beginning-of-line or \G);
367 d) First node (of those at offset 0) which may distingush positions;
368 We use a)b)d) and multiline-part of c), and try to find a position in the
369 string which does not contradict any of them.
372 /* Most of decisions we do here should have been done at compile time.
373 The nodes of the REx which we used for the search should have been
374 deleted from the finite automaton. */
377 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
378 char *strend, const U32 flags, re_scream_pos_data *data)
381 struct regexp *const prog = (struct regexp *)SvANY(rx);
382 register I32 start_shift = 0;
383 /* Should be nonnegative! */
384 register I32 end_shift = 0;
389 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
391 register char *other_last = NULL; /* other substr checked before this */
392 char *check_at = NULL; /* check substr found at this pos */
393 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
394 RXi_GET_DECL(prog,progi);
396 const char * const i_strpos = strpos;
398 GET_RE_DEBUG_FLAGS_DECL;
400 PERL_ARGS_ASSERT_RE_INTUIT_START;
402 RX_MATCH_UTF8_set(rx,do_utf8);
405 PL_reg_flags |= RF_utf8;
408 debug_start_match(rx, do_utf8, strpos, strend,
409 sv ? "Guessing start of match in sv for"
410 : "Guessing start of match in string for");
413 /* CHR_DIST() would be more correct here but it makes things slow. */
414 if (prog->minlen > strend - strpos) {
415 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
416 "String too short... [re_intuit_start]\n"));
420 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
423 if (!prog->check_utf8 && prog->check_substr)
424 to_utf8_substr(prog);
425 check = prog->check_utf8;
427 if (!prog->check_substr && prog->check_utf8)
428 to_byte_substr(prog);
429 check = prog->check_substr;
431 if (check == &PL_sv_undef) {
432 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
433 "Non-utf8 string cannot match utf8 check string\n"));
436 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
437 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
438 || ( (prog->extflags & RXf_ANCH_BOL)
439 && !multiline ) ); /* Check after \n? */
442 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
443 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
444 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
446 && (strpos != strbeg)) {
447 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
450 if (prog->check_offset_min == prog->check_offset_max &&
451 !(prog->extflags & RXf_CANY_SEEN)) {
452 /* Substring at constant offset from beg-of-str... */
455 s = HOP3c(strpos, prog->check_offset_min, strend);
458 slen = SvCUR(check); /* >= 1 */
460 if ( strend - s > slen || strend - s < slen - 1
461 || (strend - s == slen && strend[-1] != '\n')) {
462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
465 /* Now should match s[0..slen-2] */
467 if (slen && (*SvPVX_const(check) != *s
469 && memNE(SvPVX_const(check), s, slen)))) {
471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
475 else if (*SvPVX_const(check) != *s
476 || ((slen = SvCUR(check)) > 1
477 && memNE(SvPVX_const(check), s, slen)))
480 goto success_at_start;
483 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
485 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
486 end_shift = prog->check_end_shift;
489 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
490 - (SvTAIL(check) != 0);
491 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
493 if (end_shift < eshift)
497 else { /* Can match at random position */
500 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
501 end_shift = prog->check_end_shift;
503 /* end shift should be non negative here */
506 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
508 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
509 (IV)end_shift, RX_PRECOMP(prog));
513 /* Find a possible match in the region s..strend by looking for
514 the "check" substring in the region corrected by start/end_shift. */
517 I32 srch_start_shift = start_shift;
518 I32 srch_end_shift = end_shift;
519 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
520 srch_end_shift -= ((strbeg - s) - srch_start_shift);
521 srch_start_shift = strbeg - s;
523 DEBUG_OPTIMISE_MORE_r({
524 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
525 (IV)prog->check_offset_min,
526 (IV)srch_start_shift,
528 (IV)prog->check_end_shift);
531 if (flags & REXEC_SCREAM) {
532 I32 p = -1; /* Internal iterator of scream. */
533 I32 * const pp = data ? data->scream_pos : &p;
535 if (PL_screamfirst[BmRARE(check)] >= 0
536 || ( BmRARE(check) == '\n'
537 && (BmPREVIOUS(check) == SvCUR(check) - 1)
539 s = screaminstr(sv, check,
540 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
543 /* we may be pointing at the wrong string */
544 if (s && RXp_MATCH_COPIED(prog))
545 s = strbeg + (s - SvPVX_const(sv));
547 *data->scream_olds = s;
552 if (prog->extflags & RXf_CANY_SEEN) {
553 start_point= (U8*)(s + srch_start_shift);
554 end_point= (U8*)(strend - srch_end_shift);
556 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
557 end_point= HOP3(strend, -srch_end_shift, strbeg);
559 DEBUG_OPTIMISE_MORE_r({
560 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
561 (int)(end_point - start_point),
562 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
566 s = fbm_instr( start_point, end_point,
567 check, multiline ? FBMrf_MULTILINE : 0);
570 /* Update the count-of-usability, remove useless subpatterns,
574 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
575 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
576 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
577 (s ? "Found" : "Did not find"),
578 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
579 ? "anchored" : "floating"),
582 (s ? " at offset " : "...\n") );
587 /* Finish the diagnostic message */
588 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
590 /* XXX dmq: first branch is for positive lookbehind...
591 Our check string is offset from the beginning of the pattern.
592 So we need to do any stclass tests offset forward from that
601 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
602 Start with the other substr.
603 XXXX no SCREAM optimization yet - and a very coarse implementation
604 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
605 *always* match. Probably should be marked during compile...
606 Probably it is right to do no SCREAM here...
609 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
610 : (prog->float_substr && prog->anchored_substr))
612 /* Take into account the "other" substring. */
613 /* XXXX May be hopelessly wrong for UTF... */
616 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
619 char * const last = HOP3c(s, -start_shift, strbeg);
621 char * const saved_s = s;
624 t = s - prog->check_offset_max;
625 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
627 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
632 t = HOP3c(t, prog->anchored_offset, strend);
633 if (t < other_last) /* These positions already checked */
635 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
638 /* XXXX It is not documented what units *_offsets are in.
639 We assume bytes, but this is clearly wrong.
640 Meaning this code needs to be carefully reviewed for errors.
644 /* On end-of-str: see comment below. */
645 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
646 if (must == &PL_sv_undef) {
648 DEBUG_r(must = prog->anchored_utf8); /* for debug */
653 HOP3(HOP3(last1, prog->anchored_offset, strend)
654 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
656 multiline ? FBMrf_MULTILINE : 0
659 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
660 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
661 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
662 (s ? "Found" : "Contradicts"),
663 quoted, RE_SV_TAIL(must));
668 if (last1 >= last2) {
669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
670 ", giving up...\n"));
673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
674 ", trying floating at offset %ld...\n",
675 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
676 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
677 s = HOP3c(last, 1, strend);
681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
682 (long)(s - i_strpos)));
683 t = HOP3c(s, -prog->anchored_offset, strbeg);
684 other_last = HOP3c(s, 1, strend);
692 else { /* Take into account the floating substring. */
694 char * const saved_s = s;
697 t = HOP3c(s, -start_shift, strbeg);
699 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
700 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
701 last = HOP3c(t, prog->float_max_offset, strend);
702 s = HOP3c(t, prog->float_min_offset, strend);
705 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
706 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
707 /* fbm_instr() takes into account exact value of end-of-str
708 if the check is SvTAIL(ed). Since false positives are OK,
709 and end-of-str is not later than strend we are OK. */
710 if (must == &PL_sv_undef) {
712 DEBUG_r(must = prog->float_utf8); /* for debug message */
715 s = fbm_instr((unsigned char*)s,
716 (unsigned char*)last + SvCUR(must)
718 must, multiline ? FBMrf_MULTILINE : 0);
720 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
721 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
722 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
723 (s ? "Found" : "Contradicts"),
724 quoted, RE_SV_TAIL(must));
728 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
729 ", giving up...\n"));
732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
733 ", trying anchored starting at offset %ld...\n",
734 (long)(saved_s + 1 - i_strpos)));
736 s = HOP3c(t, 1, strend);
740 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
741 (long)(s - i_strpos)));
742 other_last = s; /* Fix this later. --Hugo */
752 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
754 DEBUG_OPTIMISE_MORE_r(
755 PerlIO_printf(Perl_debug_log,
756 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
757 (IV)prog->check_offset_min,
758 (IV)prog->check_offset_max,
766 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
768 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
771 /* Fixed substring is found far enough so that the match
772 cannot start at strpos. */
774 if (ml_anch && t[-1] != '\n') {
775 /* Eventually fbm_*() should handle this, but often
776 anchored_offset is not 0, so this check will not be wasted. */
777 /* XXXX In the code below we prefer to look for "^" even in
778 presence of anchored substrings. And we search even
779 beyond the found float position. These pessimizations
780 are historical artefacts only. */
782 while (t < strend - prog->minlen) {
784 if (t < check_at - prog->check_offset_min) {
785 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
786 /* Since we moved from the found position,
787 we definitely contradict the found anchored
788 substr. Due to the above check we do not
789 contradict "check" substr.
790 Thus we can arrive here only if check substr
791 is float. Redo checking for "other"=="fixed".
794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
795 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
796 goto do_other_anchored;
798 /* We don't contradict the found floating substring. */
799 /* XXXX Why not check for STCLASS? */
801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
802 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
805 /* Position contradicts check-string */
806 /* XXXX probably better to look for check-string
807 than for "\n", so one should lower the limit for t? */
808 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
809 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
810 other_last = strpos = s = t + 1;
815 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
816 PL_colors[0], PL_colors[1]));
820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
821 PL_colors[0], PL_colors[1]));
825 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
828 /* The found string does not prohibit matching at strpos,
829 - no optimization of calling REx engine can be performed,
830 unless it was an MBOL and we are not after MBOL,
831 or a future STCLASS check will fail this. */
833 /* Even in this situation we may use MBOL flag if strpos is offset
834 wrt the start of the string. */
835 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
836 && (strpos != strbeg) && strpos[-1] != '\n'
837 /* May be due to an implicit anchor of m{.*foo} */
838 && !(prog->intflags & PREGf_IMPLICIT))
843 DEBUG_EXECUTE_r( if (ml_anch)
844 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
845 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
848 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
850 prog->check_utf8 /* Could be deleted already */
851 && --BmUSEFUL(prog->check_utf8) < 0
852 && (prog->check_utf8 == prog->float_utf8)
854 prog->check_substr /* Could be deleted already */
855 && --BmUSEFUL(prog->check_substr) < 0
856 && (prog->check_substr == prog->float_substr)
859 /* If flags & SOMETHING - do not do it many times on the same match */
860 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
861 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
862 if (do_utf8 ? prog->check_substr : prog->check_utf8)
863 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
864 prog->check_substr = prog->check_utf8 = NULL; /* disable */
865 prog->float_substr = prog->float_utf8 = NULL; /* clear */
866 check = NULL; /* abort */
868 /* XXXX This is a remnant of the old implementation. It
869 looks wasteful, since now INTUIT can use many
871 prog->extflags &= ~RXf_USE_INTUIT;
878 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
879 /* trie stclasses are too expensive to use here, we are better off to
880 leave it to regmatch itself */
881 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
882 /* minlen == 0 is possible if regstclass is \b or \B,
883 and the fixed substr is ''$.
884 Since minlen is already taken into account, s+1 is before strend;
885 accidentally, minlen >= 1 guaranties no false positives at s + 1
886 even for \b or \B. But (minlen? 1 : 0) below assumes that
887 regstclass does not come from lookahead... */
888 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
889 This leaves EXACTF only, which is dealt with in find_byclass(). */
890 const U8* const str = (U8*)STRING(progi->regstclass);
891 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
892 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
895 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
896 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
897 else if (prog->float_substr || prog->float_utf8)
898 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
903 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
906 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
909 const char *what = NULL;
911 if (endpos == strend) {
912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
913 "Could not match STCLASS...\n") );
916 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
917 "This position contradicts STCLASS...\n") );
918 if ((prog->extflags & RXf_ANCH) && !ml_anch)
920 /* Contradict one of substrings */
921 if (prog->anchored_substr || prog->anchored_utf8) {
922 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
923 DEBUG_EXECUTE_r( what = "anchored" );
925 s = HOP3c(t, 1, strend);
926 if (s + start_shift + end_shift > strend) {
927 /* XXXX Should be taken into account earlier? */
928 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
929 "Could not match STCLASS...\n") );
934 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
935 "Looking for %s substr starting at offset %ld...\n",
936 what, (long)(s + start_shift - i_strpos)) );
939 /* Have both, check_string is floating */
940 if (t + start_shift >= check_at) /* Contradicts floating=check */
941 goto retry_floating_check;
942 /* Recheck anchored substring, but not floating... */
946 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
947 "Looking for anchored substr starting at offset %ld...\n",
948 (long)(other_last - i_strpos)) );
949 goto do_other_anchored;
951 /* Another way we could have checked stclass at the
952 current position only: */
957 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
958 "Looking for /%s^%s/m starting at offset %ld...\n",
959 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
962 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
964 /* Check is floating subtring. */
965 retry_floating_check:
966 t = check_at - start_shift;
967 DEBUG_EXECUTE_r( what = "floating" );
968 goto hop_and_restart;
971 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
972 "By STCLASS: moving %ld --> %ld\n",
973 (long)(t - i_strpos), (long)(s - i_strpos))
977 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
978 "Does not contradict STCLASS...\n");
983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
984 PL_colors[4], (check ? "Guessed" : "Giving up"),
985 PL_colors[5], (long)(s - i_strpos)) );
988 fail_finish: /* Substring not found */
989 if (prog->check_substr || prog->check_utf8) /* could be removed already */
990 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
992 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
993 PL_colors[4], PL_colors[5]));
997 #define DECL_TRIE_TYPE(scan) \
998 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
999 trie_type = (scan->flags != EXACT) \
1000 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1001 : (do_utf8 ? trie_utf8 : trie_plain)
1003 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1004 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1005 switch (trie_type) { \
1006 case trie_utf8_fold: \
1007 if ( foldlen>0 ) { \
1008 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1013 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1014 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1015 foldlen -= UNISKIP( uvc ); \
1016 uscan = foldbuf + UNISKIP( uvc ); \
1019 case trie_latin_utf8_fold: \
1020 if ( foldlen>0 ) { \
1021 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1027 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1028 foldlen -= UNISKIP( uvc ); \
1029 uscan = foldbuf + UNISKIP( uvc ); \
1033 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1041 charid = trie->charmap[ uvc ]; \
1045 if (widecharmap) { \
1046 SV** const svpp = hv_fetch(widecharmap, \
1047 (char*)&uvc, sizeof(UV), 0); \
1049 charid = (U16)SvIV(*svpp); \
1054 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1056 char *my_strend= (char *)strend; \
1059 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
1060 m, NULL, ln, (bool)UTF)) \
1061 && (!reginfo || regtry(reginfo, &s)) ) \
1064 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1065 uvchr_to_utf8(tmpbuf, c); \
1066 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1068 && (f == c1 || f == c2) \
1070 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1071 m, NULL, ln, (bool)UTF)) \
1072 && (!reginfo || regtry(reginfo, &s)) ) \
1078 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1082 && (ln == 1 || !(OP(c) == EXACTF \
1084 : ibcmp_locale(s, m, ln))) \
1085 && (!reginfo || regtry(reginfo, &s)) ) \
1091 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1093 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1099 #define REXEC_FBC_SCAN(CoDe) \
1101 while (s < strend) { \
1107 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1108 REXEC_FBC_UTF8_SCAN( \
1110 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1119 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1122 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1131 #define REXEC_FBC_TRYIT \
1132 if ((!reginfo || regtry(reginfo, &s))) \
1135 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1137 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1140 REXEC_FBC_CLASS_SCAN(CoNd); \
1144 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1147 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1150 REXEC_FBC_CLASS_SCAN(CoNd); \
1154 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1155 PL_reg_flags |= RF_tainted; \
1157 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1160 REXEC_FBC_CLASS_SCAN(CoNd); \
1164 #define DUMP_EXEC_POS(li,s,doutf8) \
1165 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1167 /* We know what class REx starts with. Try to find this position... */
1168 /* if reginfo is NULL, its a dryrun */
1169 /* annoyingly all the vars in this routine have different names from their counterparts
1170 in regmatch. /grrr */
1173 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1174 const char *strend, regmatch_info *reginfo)
1177 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1181 register STRLEN uskip;
1185 register I32 tmp = 1; /* Scratch variable? */
1186 register const bool do_utf8 = PL_reg_match_utf8;
1187 RXi_GET_DECL(prog,progi);
1189 PERL_ARGS_ASSERT_FIND_BYCLASS;
1191 /* We know what class it must start with. */
1195 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1196 !UTF8_IS_INVARIANT((U8)s[0]) ?
1197 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1198 REGINCLASS(prog, c, (U8*)s));
1201 while (s < strend) {
1204 if (REGINCLASS(prog, c, (U8*)s) ||
1205 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1206 /* The assignment of 2 is intentional:
1207 * for the folded sharp s, the skip is 2. */
1208 (skip = SHARP_S_SKIP))) {
1209 if (tmp && (!reginfo || regtry(reginfo, &s)))
1222 if (tmp && (!reginfo || regtry(reginfo, &s)))
1230 ln = STR_LEN(c); /* length to match in octets/bytes */
1231 lnc = (I32) ln; /* length to match in characters */
1233 STRLEN ulen1, ulen2;
1235 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1236 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1237 /* used by commented-out code below */
1238 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1240 /* XXX: Since the node will be case folded at compile
1241 time this logic is a little odd, although im not
1242 sure that its actually wrong. --dmq */
1244 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1245 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1247 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1248 codepoint of the first character in the converted
1249 form, yet originally we did the extra step.
1250 No tests fail by commenting this code out however
1251 so Ive left it out. -- dmq.
1253 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1255 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1260 while (sm < ((U8 *) m + ln)) {
1275 c2 = PL_fold_locale[c1];
1277 e = HOP3c(strend, -((I32)lnc), s);
1279 if (!reginfo && e < s)
1280 e = s; /* Due to minlen logic of intuit() */
1282 /* The idea in the EXACTF* cases is to first find the
1283 * first character of the EXACTF* node and then, if
1284 * necessary, case-insensitively compare the full
1285 * text of the node. The c1 and c2 are the first
1286 * characters (though in Unicode it gets a bit
1287 * more complicated because there are more cases
1288 * than just upper and lower: one needs to use
1289 * the so-called folding case for case-insensitive
1290 * matching (called "loose matching" in Unicode).
1291 * ibcmp_utf8() will do just that. */
1293 if (do_utf8 || UTF) {
1295 U8 tmpbuf [UTF8_MAXBYTES+1];
1298 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1300 /* Upper and lower of 1st char are equal -
1301 * probably not a "letter". */
1304 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1309 REXEC_FBC_EXACTISH_CHECK(c == c1);
1315 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1321 /* Handle some of the three Greek sigmas cases.
1322 * Note that not all the possible combinations
1323 * are handled here: some of them are handled
1324 * by the standard folding rules, and some of
1325 * them (the character class or ANYOF cases)
1326 * are handled during compiletime in
1327 * regexec.c:S_regclass(). */
1328 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1329 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1330 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1332 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1337 /* Neither pattern nor string are UTF8 */
1339 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1341 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1345 PL_reg_flags |= RF_tainted;
1352 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1353 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1355 tmp = ((OP(c) == BOUND ?
1356 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1357 LOAD_UTF8_CHARCLASS_ALNUM();
1358 REXEC_FBC_UTF8_SCAN(
1359 if (tmp == !(OP(c) == BOUND ?
1360 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1361 isALNUM_LC_utf8((U8*)s)))
1369 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1370 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1373 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1379 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1383 PL_reg_flags |= RF_tainted;
1390 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1391 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1393 tmp = ((OP(c) == NBOUND ?
1394 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1395 LOAD_UTF8_CHARCLASS_ALNUM();
1396 REXEC_FBC_UTF8_SCAN(
1397 if (tmp == !(OP(c) == NBOUND ?
1398 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1399 isALNUM_LC_utf8((U8*)s)))
1401 else REXEC_FBC_TRYIT;
1405 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1406 tmp = ((OP(c) == NBOUND ?
1407 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1410 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1412 else REXEC_FBC_TRYIT;
1415 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1419 REXEC_FBC_CSCAN_PRELOAD(
1420 LOAD_UTF8_CHARCLASS_ALNUM(),
1421 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1425 REXEC_FBC_CSCAN_TAINT(
1426 isALNUM_LC_utf8((U8*)s),
1430 REXEC_FBC_CSCAN_PRELOAD(
1431 LOAD_UTF8_CHARCLASS_ALNUM(),
1432 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1436 REXEC_FBC_CSCAN_TAINT(
1437 !isALNUM_LC_utf8((U8*)s),
1441 REXEC_FBC_CSCAN_PRELOAD(
1442 LOAD_UTF8_CHARCLASS_SPACE(),
1443 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1447 REXEC_FBC_CSCAN_TAINT(
1448 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1452 REXEC_FBC_CSCAN_PRELOAD(
1453 LOAD_UTF8_CHARCLASS_SPACE(),
1454 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1458 REXEC_FBC_CSCAN_TAINT(
1459 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1463 REXEC_FBC_CSCAN_PRELOAD(
1464 LOAD_UTF8_CHARCLASS_DIGIT(),
1465 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1469 REXEC_FBC_CSCAN_TAINT(
1470 isDIGIT_LC_utf8((U8*)s),
1474 REXEC_FBC_CSCAN_PRELOAD(
1475 LOAD_UTF8_CHARCLASS_DIGIT(),
1476 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1480 REXEC_FBC_CSCAN_TAINT(
1481 !isDIGIT_LC_utf8((U8*)s),
1487 is_LNBREAK_latin1(s)
1497 !is_VERTWS_latin1(s)
1502 is_HORIZWS_latin1(s)
1506 !is_HORIZWS_utf8(s),
1507 !is_HORIZWS_latin1(s)
1513 /* what trie are we using right now */
1515 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1517 = (reg_trie_data*)progi->data->data[ aho->trie ];
1518 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1520 const char *last_start = strend - trie->minlen;
1522 const char *real_start = s;
1524 STRLEN maxlen = trie->maxlen;
1526 U8 **points; /* map of where we were in the input string
1527 when reading a given char. For ASCII this
1528 is unnecessary overhead as the relationship
1529 is always 1:1, but for Unicode, especially
1530 case folded Unicode this is not true. */
1531 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1535 GET_RE_DEBUG_FLAGS_DECL;
1537 /* We can't just allocate points here. We need to wrap it in
1538 * an SV so it gets freed properly if there is a croak while
1539 * running the match */
1542 sv_points=newSV(maxlen * sizeof(U8 *));
1543 SvCUR_set(sv_points,
1544 maxlen * sizeof(U8 *));
1545 SvPOK_on(sv_points);
1546 sv_2mortal(sv_points);
1547 points=(U8**)SvPV_nolen(sv_points );
1548 if ( trie_type != trie_utf8_fold
1549 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1552 bitmap=(U8*)trie->bitmap;
1554 bitmap=(U8*)ANYOF_BITMAP(c);
1556 /* this is the Aho-Corasick algorithm modified a touch
1557 to include special handling for long "unknown char"
1558 sequences. The basic idea being that we use AC as long
1559 as we are dealing with a possible matching char, when
1560 we encounter an unknown char (and we have not encountered
1561 an accepting state) we scan forward until we find a legal
1563 AC matching is basically that of trie matching, except
1564 that when we encounter a failing transition, we fall back
1565 to the current states "fail state", and try the current char
1566 again, a process we repeat until we reach the root state,
1567 state 1, or a legal transition. If we fail on the root state
1568 then we can either terminate if we have reached an accepting
1569 state previously, or restart the entire process from the beginning
1573 while (s <= last_start) {
1574 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1582 U8 *uscan = (U8*)NULL;
1583 U8 *leftmost = NULL;
1585 U32 accepted_word= 0;
1589 while ( state && uc <= (U8*)strend ) {
1591 U32 word = aho->states[ state ].wordnum;
1595 DEBUG_TRIE_EXECUTE_r(
1596 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1597 dump_exec_pos( (char *)uc, c, strend, real_start,
1598 (char *)uc, do_utf8 );
1599 PerlIO_printf( Perl_debug_log,
1600 " Scanning for legal start char...\n");
1603 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1608 if (uc >(U8*)last_start) break;
1612 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1613 if (!leftmost || lpos < leftmost) {
1614 DEBUG_r(accepted_word=word);
1620 points[pointpos++ % maxlen]= uc;
1621 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1622 uscan, len, uvc, charid, foldlen,
1624 DEBUG_TRIE_EXECUTE_r({
1625 dump_exec_pos( (char *)uc, c, strend, real_start,
1627 PerlIO_printf(Perl_debug_log,
1628 " Charid:%3u CP:%4"UVxf" ",
1634 word = aho->states[ state ].wordnum;
1636 base = aho->states[ state ].trans.base;
1638 DEBUG_TRIE_EXECUTE_r({
1640 dump_exec_pos( (char *)uc, c, strend, real_start,
1642 PerlIO_printf( Perl_debug_log,
1643 "%sState: %4"UVxf", word=%"UVxf,
1644 failed ? " Fail transition to " : "",
1645 (UV)state, (UV)word);
1650 (base + charid > trie->uniquecharcount )
1651 && (base + charid - 1 - trie->uniquecharcount
1653 && trie->trans[base + charid - 1 -
1654 trie->uniquecharcount].check == state
1655 && (tmp=trie->trans[base + charid - 1 -
1656 trie->uniquecharcount ].next))
1658 DEBUG_TRIE_EXECUTE_r(
1659 PerlIO_printf( Perl_debug_log," - legal\n"));
1664 DEBUG_TRIE_EXECUTE_r(
1665 PerlIO_printf( Perl_debug_log," - fail\n"));
1667 state = aho->fail[state];
1671 /* we must be accepting here */
1672 DEBUG_TRIE_EXECUTE_r(
1673 PerlIO_printf( Perl_debug_log," - accepting\n"));
1682 if (!state) state = 1;
1685 if ( aho->states[ state ].wordnum ) {
1686 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1687 if (!leftmost || lpos < leftmost) {
1688 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1693 s = (char*)leftmost;
1694 DEBUG_TRIE_EXECUTE_r({
1696 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1697 (UV)accepted_word, (IV)(s - real_start)
1700 if (!reginfo || regtry(reginfo, &s)) {
1706 DEBUG_TRIE_EXECUTE_r({
1707 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1710 DEBUG_TRIE_EXECUTE_r(
1711 PerlIO_printf( Perl_debug_log,"No match.\n"));
1720 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1729 S_swap_match_buff (pTHX_ regexp *prog)
1731 regexp_paren_pair *t;
1733 PERL_ARGS_ASSERT_SWAP_MATCH_BUFF;
1736 /* We have to be careful. If the previous successful match
1737 was from this regex we don't want a subsequent paritally
1738 successful match to clobber the old results.
1739 So when we detect this possibility we add a swap buffer
1740 to the re, and switch the buffer each match. If we fail
1741 we switch it back, otherwise we leave it swapped.
1743 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
1746 prog->swap = prog->offs;
1752 - regexec_flags - match a regexp against a string
1755 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1756 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1757 /* strend: pointer to null at end of string */
1758 /* strbeg: real beginning of string */
1759 /* minend: end of match must be >=minend after stringarg. */
1760 /* data: May be used for some additional optimizations.
1761 Currently its only used, with a U32 cast, for transmitting
1762 the ganch offset when doing a /g match. This will change */
1763 /* nosave: For optimizations. */
1766 struct regexp *const prog = (struct regexp *)SvANY(rx);
1767 /*register*/ char *s;
1768 register regnode *c;
1769 /*register*/ char *startpos = stringarg;
1770 I32 minlen; /* must match at least this many chars */
1771 I32 dontbother = 0; /* how many characters not to try at end */
1772 I32 end_shift = 0; /* Same for the end. */ /* CC */
1773 I32 scream_pos = -1; /* Internal iterator of scream. */
1774 char *scream_olds = NULL;
1775 const bool do_utf8 = (bool)DO_UTF8(sv);
1777 RXi_GET_DECL(prog,progi);
1778 regmatch_info reginfo; /* create some info to pass to regtry etc */
1779 bool swap_on_fail = 0;
1780 GET_RE_DEBUG_FLAGS_DECL;
1782 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1783 PERL_UNUSED_ARG(data);
1785 /* Be paranoid... */
1786 if (prog == NULL || startpos == NULL) {
1787 Perl_croak(aTHX_ "NULL regexp parameter");
1791 multiline = prog->extflags & RXf_PMf_MULTILINE;
1792 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
1794 RX_MATCH_UTF8_set(rx, do_utf8);
1796 debug_start_match(rx, do_utf8, startpos, strend,
1800 minlen = prog->minlen;
1802 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1804 "String too short [regexec_flags]...\n"));
1809 /* Check validity of program. */
1810 if (UCHARAT(progi->program) != REG_MAGIC) {
1811 Perl_croak(aTHX_ "corrupted regexp program");
1815 PL_reg_eval_set = 0;
1819 PL_reg_flags |= RF_utf8;
1821 /* Mark beginning of line for ^ and lookbehind. */
1822 reginfo.bol = startpos; /* XXX not used ??? */
1826 /* Mark end of line for $ (and such) */
1829 /* see how far we have to get to not match where we matched before */
1830 reginfo.till = startpos+minend;
1832 /* If there is a "must appear" string, look for it. */
1835 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1838 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1839 reginfo.ganch = startpos + prog->gofs;
1840 else if (sv && SvTYPE(sv) >= SVt_PVMG
1842 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1843 && mg->mg_len >= 0) {
1844 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1845 if (prog->extflags & RXf_ANCH_GPOS) {
1846 if (s > reginfo.ganch)
1848 s = reginfo.ganch - prog->gofs;
1852 reginfo.ganch = strbeg + PTR2UV(data);
1853 } else /* pos() not defined */
1854 reginfo.ganch = strbeg;
1856 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1858 swap_match_buff(prog); /* do we need a save destructor here for
1861 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1862 re_scream_pos_data d;
1864 d.scream_olds = &scream_olds;
1865 d.scream_pos = &scream_pos;
1866 s = re_intuit_start(rx, sv, s, strend, flags, &d);
1868 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1869 goto phooey; /* not present */
1875 /* Simplest case: anchored match need be tried only once. */
1876 /* [unless only anchor is BOL and multiline is set] */
1877 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1878 if (s == startpos && regtry(®info, &startpos))
1880 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1881 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1886 dontbother = minlen - 1;
1887 end = HOP3c(strend, -dontbother, strbeg) - 1;
1888 /* for multiline we only have to try after newlines */
1889 if (prog->check_substr || prog->check_utf8) {
1893 if (regtry(®info, &s))
1898 if (prog->extflags & RXf_USE_INTUIT) {
1899 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
1910 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1911 if (regtry(®info, &s))
1918 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1920 /* the warning about reginfo.ganch being used without intialization
1921 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1922 and we only enter this block when the same bit is set. */
1923 char *tmp_s = reginfo.ganch - prog->gofs;
1924 if (regtry(®info, &tmp_s))
1929 /* Messy cases: unanchored match. */
1930 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1931 /* we have /x+whatever/ */
1932 /* it must be a one character string (XXXX Except UTF?) */
1937 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1938 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1939 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1944 DEBUG_EXECUTE_r( did_match = 1 );
1945 if (regtry(®info, &s)) goto got_it;
1947 while (s < strend && *s == ch)
1955 DEBUG_EXECUTE_r( did_match = 1 );
1956 if (regtry(®info, &s)) goto got_it;
1958 while (s < strend && *s == ch)
1963 DEBUG_EXECUTE_r(if (!did_match)
1964 PerlIO_printf(Perl_debug_log,
1965 "Did not find anchored character...\n")
1968 else if (prog->anchored_substr != NULL
1969 || prog->anchored_utf8 != NULL
1970 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1971 && prog->float_max_offset < strend - s)) {
1976 char *last1; /* Last position checked before */
1980 if (prog->anchored_substr || prog->anchored_utf8) {
1981 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1982 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1983 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1984 back_max = back_min = prog->anchored_offset;
1986 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1987 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1988 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1989 back_max = prog->float_max_offset;
1990 back_min = prog->float_min_offset;
1994 if (must == &PL_sv_undef)
1995 /* could not downgrade utf8 check substring, so must fail */
2001 last = HOP3c(strend, /* Cannot start after this */
2002 -(I32)(CHR_SVLEN(must)
2003 - (SvTAIL(must) != 0) + back_min), strbeg);
2006 last1 = HOPc(s, -1);
2008 last1 = s - 1; /* bogus */
2010 /* XXXX check_substr already used to find "s", can optimize if
2011 check_substr==must. */
2013 dontbother = end_shift;
2014 strend = HOPc(strend, -dontbother);
2015 while ( (s <= last) &&
2016 ((flags & REXEC_SCREAM)
2017 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2018 end_shift, &scream_pos, 0))
2019 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2020 (unsigned char*)strend, must,
2021 multiline ? FBMrf_MULTILINE : 0))) ) {
2022 /* we may be pointing at the wrong string */
2023 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2024 s = strbeg + (s - SvPVX_const(sv));
2025 DEBUG_EXECUTE_r( did_match = 1 );
2026 if (HOPc(s, -back_max) > last1) {
2027 last1 = HOPc(s, -back_min);
2028 s = HOPc(s, -back_max);
2031 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2033 last1 = HOPc(s, -back_min);
2037 while (s <= last1) {
2038 if (regtry(®info, &s))
2044 while (s <= last1) {
2045 if (regtry(®info, &s))
2051 DEBUG_EXECUTE_r(if (!did_match) {
2052 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2053 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2054 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2055 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2056 ? "anchored" : "floating"),
2057 quoted, RE_SV_TAIL(must));
2061 else if ( (c = progi->regstclass) ) {
2063 const OPCODE op = OP(progi->regstclass);
2064 /* don't bother with what can't match */
2065 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2066 strend = HOPc(strend, -(minlen - 1));
2069 SV * const prop = sv_newmortal();
2070 regprop(prog, prop, c);
2072 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2074 PerlIO_printf(Perl_debug_log,
2075 "Matching stclass %.*s against %s (%d chars)\n",
2076 (int)SvCUR(prop), SvPVX_const(prop),
2077 quoted, (int)(strend - s));
2080 if (find_byclass(prog, c, s, strend, ®info))
2082 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2086 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2091 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2092 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2093 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2095 if (flags & REXEC_SCREAM) {
2096 last = screaminstr(sv, float_real, s - strbeg,
2097 end_shift, &scream_pos, 1); /* last one */
2099 last = scream_olds; /* Only one occurrence. */
2100 /* we may be pointing at the wrong string */
2101 else if (RXp_MATCH_COPIED(prog))
2102 s = strbeg + (s - SvPVX_const(sv));
2106 const char * const little = SvPV_const(float_real, len);
2108 if (SvTAIL(float_real)) {
2109 if (memEQ(strend - len + 1, little, len - 1))
2110 last = strend - len + 1;
2111 else if (!multiline)
2112 last = memEQ(strend - len, little, len)
2113 ? strend - len : NULL;
2119 last = rninstr(s, strend, little, little + len);
2121 last = strend; /* matching "$" */
2126 PerlIO_printf(Perl_debug_log,
2127 "%sCan't trim the tail, match fails (should not happen)%s\n",
2128 PL_colors[4], PL_colors[5]));
2129 goto phooey; /* Should not happen! */
2131 dontbother = strend - last + prog->float_min_offset;
2133 if (minlen && (dontbother < minlen))
2134 dontbother = minlen - 1;
2135 strend -= dontbother; /* this one's always in bytes! */
2136 /* We don't know much -- general case. */
2139 if (regtry(®info, &s))
2148 if (regtry(®info, &s))
2150 } while (s++ < strend);
2158 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2160 if (PL_reg_eval_set)
2161 restore_pos(aTHX_ prog);
2162 if (RXp_PAREN_NAMES(prog))
2163 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2165 /* make sure $`, $&, $', and $digit will work later */
2166 if ( !(flags & REXEC_NOT_FIRST) ) {
2167 RX_MATCH_COPY_FREE(rx);
2168 if (flags & REXEC_COPY_STR) {
2169 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2170 #ifdef PERL_OLD_COPY_ON_WRITE
2172 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2174 PerlIO_printf(Perl_debug_log,
2175 "Copy on write: regexp capture, type %d\n",
2178 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2179 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2180 assert (SvPOKp(prog->saved_copy));
2184 RX_MATCH_COPIED_on(rx);
2185 s = savepvn(strbeg, i);
2191 prog->subbeg = strbeg;
2192 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2199 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2200 PL_colors[4], PL_colors[5]));
2201 if (PL_reg_eval_set)
2202 restore_pos(aTHX_ prog);
2204 /* we failed :-( roll it back */
2205 swap_match_buff(prog);
2212 - regtry - try match at specific point
2214 STATIC I32 /* 0 failure, 1 success */
2215 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2219 REGEXP *const rx = reginfo->prog;
2220 regexp *const prog = (struct regexp *)SvANY(rx);
2221 RXi_GET_DECL(prog,progi);
2222 GET_RE_DEBUG_FLAGS_DECL;
2224 PERL_ARGS_ASSERT_REGTRY;
2226 reginfo->cutpoint=NULL;
2228 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2231 PL_reg_eval_set = RS_init;
2232 DEBUG_EXECUTE_r(DEBUG_s(
2233 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2234 (IV)(PL_stack_sp - PL_stack_base));
2237 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2238 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2240 /* Apparently this is not needed, judging by wantarray. */
2241 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2242 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2245 /* Make $_ available to executed code. */
2246 if (reginfo->sv != DEFSV) {
2248 DEFSV = reginfo->sv;
2251 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2252 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2253 /* prepare for quick setting of pos */
2254 #ifdef PERL_OLD_COPY_ON_WRITE
2255 if (SvIsCOW(reginfo->sv))
2256 sv_force_normal_flags(reginfo->sv, 0);
2258 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2259 &PL_vtbl_mglob, NULL, 0);
2263 PL_reg_oldpos = mg->mg_len;
2264 SAVEDESTRUCTOR_X(restore_pos, prog);
2266 if (!PL_reg_curpm) {
2267 Newxz(PL_reg_curpm, 1, PMOP);
2270 SV* const repointer = &PL_sv_undef;
2271 /* this regexp is also owned by the new PL_reg_curpm, which
2272 will try to free it. */
2273 av_push(PL_regex_padav, repointer);
2274 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2275 PL_regex_pad = AvARRAY(PL_regex_padav);
2280 /* It seems that non-ithreads works both with and without this code.
2281 So for efficiency reasons it seems best not to have the code
2282 compiled when it is not needed. */
2283 /* This is safe against NULLs: */
2284 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2285 /* PM_reg_curpm owns a reference to this regexp. */
2288 PM_SETRE(PL_reg_curpm, rx);
2289 PL_reg_oldcurpm = PL_curpm;
2290 PL_curpm = PL_reg_curpm;
2291 if (RXp_MATCH_COPIED(prog)) {
2292 /* Here is a serious problem: we cannot rewrite subbeg,
2293 since it may be needed if this match fails. Thus
2294 $` inside (?{}) could fail... */
2295 PL_reg_oldsaved = prog->subbeg;
2296 PL_reg_oldsavedlen = prog->sublen;
2297 #ifdef PERL_OLD_COPY_ON_WRITE
2298 PL_nrs = prog->saved_copy;
2300 RXp_MATCH_COPIED_off(prog);
2303 PL_reg_oldsaved = NULL;
2304 prog->subbeg = PL_bostr;
2305 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2307 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2308 prog->offs[0].start = *startpos - PL_bostr;
2309 PL_reginput = *startpos;
2310 PL_reglastparen = &prog->lastparen;
2311 PL_reglastcloseparen = &prog->lastcloseparen;
2312 prog->lastparen = 0;
2313 prog->lastcloseparen = 0;
2315 PL_regoffs = prog->offs;
2316 if (PL_reg_start_tmpl <= prog->nparens) {
2317 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2318 if(PL_reg_start_tmp)
2319 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2321 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2324 /* XXXX What this code is doing here?!!! There should be no need
2325 to do this again and again, PL_reglastparen should take care of
2328 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2329 * Actually, the code in regcppop() (which Ilya may be meaning by
2330 * PL_reglastparen), is not needed at all by the test suite
2331 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2332 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2333 * Meanwhile, this code *is* needed for the
2334 * above-mentioned test suite tests to succeed. The common theme
2335 * on those tests seems to be returning null fields from matches.
2336 * --jhi updated by dapm */
2338 if (prog->nparens) {
2339 regexp_paren_pair *pp = PL_regoffs;
2341 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2349 if (regmatch(reginfo, progi->program + 1)) {
2350 PL_regoffs[0].end = PL_reginput - PL_bostr;
2353 if (reginfo->cutpoint)
2354 *startpos= reginfo->cutpoint;
2355 REGCP_UNWIND(lastcp);
2360 #define sayYES goto yes
2361 #define sayNO goto no
2362 #define sayNO_SILENT goto no_silent
2364 /* we dont use STMT_START/END here because it leads to
2365 "unreachable code" warnings, which are bogus, but distracting. */
2366 #define CACHEsayNO \
2367 if (ST.cache_mask) \
2368 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2371 /* this is used to determine how far from the left messages like
2372 'failed...' are printed. It should be set such that messages
2373 are inline with the regop output that created them.
2375 #define REPORT_CODE_OFF 32
2378 /* Make sure there is a test for this +1 options in re_tests */
2379 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2381 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2382 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2384 #define SLAB_FIRST(s) (&(s)->states[0])
2385 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2387 /* grab a new slab and return the first slot in it */
2389 STATIC regmatch_state *
2392 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2395 regmatch_slab *s = PL_regmatch_slab->next;
2397 Newx(s, 1, regmatch_slab);
2398 s->prev = PL_regmatch_slab;
2400 PL_regmatch_slab->next = s;
2402 PL_regmatch_slab = s;
2403 return SLAB_FIRST(s);
2407 /* push a new state then goto it */
2409 #define PUSH_STATE_GOTO(state, node) \
2411 st->resume_state = state; \
2414 /* push a new state with success backtracking, then goto it */
2416 #define PUSH_YES_STATE_GOTO(state, node) \
2418 st->resume_state = state; \
2419 goto push_yes_state;
2425 regmatch() - main matching routine
2427 This is basically one big switch statement in a loop. We execute an op,
2428 set 'next' to point the next op, and continue. If we come to a point which
2429 we may need to backtrack to on failure such as (A|B|C), we push a
2430 backtrack state onto the backtrack stack. On failure, we pop the top
2431 state, and re-enter the loop at the state indicated. If there are no more
2432 states to pop, we return failure.
2434 Sometimes we also need to backtrack on success; for example /A+/, where
2435 after successfully matching one A, we need to go back and try to
2436 match another one; similarly for lookahead assertions: if the assertion
2437 completes successfully, we backtrack to the state just before the assertion
2438 and then carry on. In these cases, the pushed state is marked as
2439 'backtrack on success too'. This marking is in fact done by a chain of
2440 pointers, each pointing to the previous 'yes' state. On success, we pop to
2441 the nearest yes state, discarding any intermediate failure-only states.
2442 Sometimes a yes state is pushed just to force some cleanup code to be
2443 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2444 it to free the inner regex.
2446 Note that failure backtracking rewinds the cursor position, while
2447 success backtracking leaves it alone.
2449 A pattern is complete when the END op is executed, while a subpattern
2450 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2451 ops trigger the "pop to last yes state if any, otherwise return true"
2454 A common convention in this function is to use A and B to refer to the two
2455 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2456 the subpattern to be matched possibly multiple times, while B is the entire
2457 rest of the pattern. Variable and state names reflect this convention.
2459 The states in the main switch are the union of ops and failure/success of
2460 substates associated with with that op. For example, IFMATCH is the op
2461 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2462 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2463 successfully matched A and IFMATCH_A_fail is a state saying that we have
2464 just failed to match A. Resume states always come in pairs. The backtrack
2465 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2466 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2467 on success or failure.
2469 The struct that holds a backtracking state is actually a big union, with
2470 one variant for each major type of op. The variable st points to the
2471 top-most backtrack struct. To make the code clearer, within each
2472 block of code we #define ST to alias the relevant union.
2474 Here's a concrete example of a (vastly oversimplified) IFMATCH
2480 #define ST st->u.ifmatch
2482 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2483 ST.foo = ...; // some state we wish to save
2485 // push a yes backtrack state with a resume value of
2486 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2488 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2491 case IFMATCH_A: // we have successfully executed A; now continue with B
2493 bar = ST.foo; // do something with the preserved value
2496 case IFMATCH_A_fail: // A failed, so the assertion failed
2497 ...; // do some housekeeping, then ...
2498 sayNO; // propagate the failure
2505 For any old-timers reading this who are familiar with the old recursive
2506 approach, the code above is equivalent to:
2508 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2517 ...; // do some housekeeping, then ...
2518 sayNO; // propagate the failure
2521 The topmost backtrack state, pointed to by st, is usually free. If you
2522 want to claim it, populate any ST.foo fields in it with values you wish to
2523 save, then do one of
2525 PUSH_STATE_GOTO(resume_state, node);
2526 PUSH_YES_STATE_GOTO(resume_state, node);
2528 which sets that backtrack state's resume value to 'resume_state', pushes a
2529 new free entry to the top of the backtrack stack, then goes to 'node'.
2530 On backtracking, the free slot is popped, and the saved state becomes the
2531 new free state. An ST.foo field in this new top state can be temporarily
2532 accessed to retrieve values, but once the main loop is re-entered, it
2533 becomes available for reuse.
2535 Note that the depth of the backtrack stack constantly increases during the
2536 left-to-right execution of the pattern, rather than going up and down with
2537 the pattern nesting. For example the stack is at its maximum at Z at the
2538 end of the pattern, rather than at X in the following:
2540 /(((X)+)+)+....(Y)+....Z/
2542 The only exceptions to this are lookahead/behind assertions and the cut,
2543 (?>A), which pop all the backtrack states associated with A before
2546 Bascktrack state structs are allocated in slabs of about 4K in size.
2547 PL_regmatch_state and st always point to the currently active state,
2548 and PL_regmatch_slab points to the slab currently containing
2549 PL_regmatch_state. The first time regmatch() is called, the first slab is
2550 allocated, and is never freed until interpreter destruction. When the slab
2551 is full, a new one is allocated and chained to the end. At exit from
2552 regmatch(), slabs allocated since entry are freed.
2557 #define DEBUG_STATE_pp(pp) \
2559 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2560 PerlIO_printf(Perl_debug_log, \
2561 " %*s"pp" %s%s%s%s%s\n", \
2563 PL_reg_name[st->resume_state], \
2564 ((st==yes_state||st==mark_state) ? "[" : ""), \
2565 ((st==yes_state) ? "Y" : ""), \
2566 ((st==mark_state) ? "M" : ""), \
2567 ((st==yes_state||st==mark_state) ? "]" : "") \
2572 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2577 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2578 const char *start, const char *end, const char *blurb)
2580 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2582 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2587 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2588 RX_PRECOMP(prog), RX_PRELEN(prog), 60);
2590 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2591 start, end - start, 60);
2593 PerlIO_printf(Perl_debug_log,
2594 "%s%s REx%s %s against %s\n",
2595 PL_colors[4], blurb, PL_colors[5], s0, s1);
2597 if (do_utf8||utf8_pat)
2598 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2599 utf8_pat ? "pattern" : "",
2600 utf8_pat && do_utf8 ? " and " : "",
2601 do_utf8 ? "string" : ""
2607 S_dump_exec_pos(pTHX_ const char *locinput,
2608 const regnode *scan,
2609 const char *loc_regeol,
2610 const char *loc_bostr,
2611 const char *loc_reg_starttry,
2614 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2615 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2616 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2617 /* The part of the string before starttry has one color
2618 (pref0_len chars), between starttry and current
2619 position another one (pref_len - pref0_len chars),
2620 after the current position the third one.
2621 We assume that pref0_len <= pref_len, otherwise we
2622 decrease pref0_len. */
2623 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2624 ? (5 + taill) - l : locinput - loc_bostr;
2627 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2629 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2631 pref0_len = pref_len - (locinput - loc_reg_starttry);
2632 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2633 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2634 ? (5 + taill) - pref_len : loc_regeol - locinput);
2635 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2639 if (pref0_len > pref_len)
2640 pref0_len = pref_len;
2642 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2644 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2645 (locinput - pref_len),pref0_len, 60, 4, 5);
2647 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2648 (locinput - pref_len + pref0_len),
2649 pref_len - pref0_len, 60, 2, 3);
2651 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2652 locinput, loc_regeol - locinput, 10, 0, 1);
2654 const STRLEN tlen=len0+len1+len2;
2655 PerlIO_printf(Perl_debug_log,
2656 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2657 (IV)(locinput - loc_bostr),
2660 (docolor ? "" : "> <"),
2662 (int)(tlen > 19 ? 0 : 19 - tlen),
2669 /* reg_check_named_buff_matched()
2670 * Checks to see if a named buffer has matched. The data array of
2671 * buffer numbers corresponding to the buffer is expected to reside
2672 * in the regexp->data->data array in the slot stored in the ARG() of
2673 * node involved. Note that this routine doesn't actually care about the
2674 * name, that information is not preserved from compilation to execution.
2675 * Returns the index of the leftmost defined buffer with the given name
2676 * or 0 if non of the buffers matched.
2679 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2682 RXi_GET_DECL(rex,rexi);
2683 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2684 I32 *nums=(I32*)SvPVX(sv_dat);
2686 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2688 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2689 if ((I32)*PL_reglastparen >= nums[n] &&
2690 PL_regoffs[nums[n]].end != -1)
2699 /* free all slabs above current one - called during LEAVE_SCOPE */
2702 S_clear_backtrack_stack(pTHX_ void *p)
2704 regmatch_slab *s = PL_regmatch_slab->next;
2709 PL_regmatch_slab->next = NULL;
2711 regmatch_slab * const osl = s;
2718 #define SETREX(Re1,Re2) \
2719 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2722 STATIC I32 /* 0 failure, 1 success */
2723 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2725 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2729 register const bool do_utf8 = PL_reg_match_utf8;
2730 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2731 REGEXP *rex_sv = reginfo->prog;
2732 regexp *rex = (struct regexp *)SvANY(rex_sv);
2733 RXi_GET_DECL(rex,rexi);
2735 /* the current state. This is a cached copy of PL_regmatch_state */
2736 register regmatch_state *st;
2737 /* cache heavy used fields of st in registers */
2738 register regnode *scan;
2739 register regnode *next;
2740 register U32 n = 0; /* general value; init to avoid compiler warning */
2741 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2742 register char *locinput = PL_reginput;
2743 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2745 bool result = 0; /* return value of S_regmatch */
2746 int depth = 0; /* depth of backtrack stack */
2747 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2748 const U32 max_nochange_depth =
2749 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2750 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2751 regmatch_state *yes_state = NULL; /* state to pop to on success of
2753 /* mark_state piggy backs on the yes_state logic so that when we unwind
2754 the stack on success we can update the mark_state as we go */
2755 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2756 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2757 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2759 bool no_final = 0; /* prevent failure from backtracking? */
2760 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2761 char *startpoint = PL_reginput;
2762 SV *popmark = NULL; /* are we looking for a mark? */
2763 SV *sv_commit = NULL; /* last mark name seen in failure */
2764 SV *sv_yes_mark = NULL; /* last mark name we have seen
2765 during a successfull match */
2766 U32 lastopen = 0; /* last open we saw */
2767 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2768 SV* const oreplsv = GvSV(PL_replgv);
2769 /* these three flags are set by various ops to signal information to
2770 * the very next op. They have a useful lifetime of exactly one loop
2771 * iteration, and are not preserved or restored by state pushes/pops
2773 bool sw = 0; /* the condition value in (?(cond)a|b) */
2774 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2775 int logical = 0; /* the following EVAL is:
2779 or the following IFMATCH/UNLESSM is:
2780 false: plain (?=foo)
2781 true: used as a condition: (?(?=foo))
2784 GET_RE_DEBUG_FLAGS_DECL;
2787 PERL_ARGS_ASSERT_REGMATCH;
2789 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2790 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2792 /* on first ever call to regmatch, allocate first slab */
2793 if (!PL_regmatch_slab) {
2794 Newx(PL_regmatch_slab, 1, regmatch_slab);
2795 PL_regmatch_slab->prev = NULL;
2796 PL_regmatch_slab->next = NULL;
2797 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2800 oldsave = PL_savestack_ix;
2801 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2802 SAVEVPTR(PL_regmatch_slab);
2803 SAVEVPTR(PL_regmatch_state);
2805 /* grab next free state slot */
2806 st = ++PL_regmatch_state;
2807 if (st > SLAB_LAST(PL_regmatch_slab))
2808 st = PL_regmatch_state = S_push_slab(aTHX);
2810 /* Note that nextchr is a byte even in UTF */
2811 nextchr = UCHARAT(locinput);
2813 while (scan != NULL) {
2816 SV * const prop = sv_newmortal();
2817 regnode *rnext=regnext(scan);
2818 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2819 regprop(rex, prop, scan);
2821 PerlIO_printf(Perl_debug_log,
2822 "%3"IVdf":%*s%s(%"IVdf")\n",
2823 (IV)(scan - rexi->program), depth*2, "",
2825 (PL_regkind[OP(scan)] == END || !rnext) ?
2826 0 : (IV)(rnext - rexi->program));
2829 next = scan + NEXT_OFF(scan);
2832 state_num = OP(scan);
2835 switch (state_num) {
2837 if (locinput == PL_bostr)
2839 /* reginfo->till = reginfo->bol; */
2844 if (locinput == PL_bostr ||
2845 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2851 if (locinput == PL_bostr)
2855 if (locinput == reginfo->ganch)
2860 /* update the startpoint */
2861 st->u.keeper.val = PL_regoffs[0].start;
2862 PL_reginput = locinput;
2863 PL_regoffs[0].start = locinput - PL_bostr;
2864 PUSH_STATE_GOTO(KEEPS_next, next);
2866 case KEEPS_next_fail:
2867 /* rollback the start point change */
2868 PL_regoffs[0].start = st->u.keeper.val;
2874 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2879 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2881 if (PL_regeol - locinput > 1)
2885 if (PL_regeol != locinput)
2889 if (!nextchr && locinput >= PL_regeol)
2892 locinput += PL_utf8skip[nextchr];
2893 if (locinput > PL_regeol)
2895 nextchr = UCHARAT(locinput);
2898 nextchr = UCHARAT(++locinput);
2901 if (!nextchr && locinput >= PL_regeol)
2903 nextchr = UCHARAT(++locinput);
2906 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2909 locinput += PL_utf8skip[nextchr];
2910 if (locinput > PL_regeol)
2912 nextchr = UCHARAT(locinput);
2915 nextchr = UCHARAT(++locinput);
2919 #define ST st->u.trie
2921 /* In this case the charclass data is available inline so
2922 we can fail fast without a lot of extra overhead.
2924 if (scan->flags == EXACT || !do_utf8) {
2925 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2927 PerlIO_printf(Perl_debug_log,
2928 "%*s %sfailed to match trie start class...%s\n",
2929 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2938 /* what type of TRIE am I? (utf8 makes this contextual) */
2939 DECL_TRIE_TYPE(scan);
2941 /* what trie are we using right now */
2942 reg_trie_data * const trie
2943 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2944 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2945 U32 state = trie->startstate;
2947 if (trie->bitmap && trie_type != trie_utf8_fold &&
2948 !TRIE_BITMAP_TEST(trie,*locinput)
2950 if (trie->states[ state ].wordnum) {
2952 PerlIO_printf(Perl_debug_log,
2953 "%*s %smatched empty string...%s\n",
2954 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2959 PerlIO_printf(Perl_debug_log,
2960 "%*s %sfailed to match trie start class...%s\n",
2961 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2968 U8 *uc = ( U8* )locinput;
2972 U8 *uscan = (U8*)NULL;
2974 SV *sv_accept_buff = NULL;
2975 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2977 ST.accepted = 0; /* how many accepting states we have seen */
2979 ST.jump = trie->jump;
2982 traverse the TRIE keeping track of all accepting states
2983 we transition through until we get to a failing node.
2986 while ( state && uc <= (U8*)PL_regeol ) {
2987 U32 base = trie->states[ state ].trans.base;
2990 /* We use charid to hold the wordnum as we don't use it
2991 for charid until after we have done the wordnum logic.
2992 We define an alias just so that the wordnum logic reads
2995 #define got_wordnum charid
2996 got_wordnum = trie->states[ state ].wordnum;
2998 if ( got_wordnum ) {
2999 if ( ! ST.accepted ) {
3001 /* SAVETMPS; */ /* XXX is this necessary? dmq */
3002 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3003 sv_accept_buff=newSV(bufflen *
3004 sizeof(reg_trie_accepted) - 1);
3005 SvCUR_set(sv_accept_buff, 0);
3006 SvPOK_on(sv_accept_buff);
3007 sv_2mortal(sv_accept_buff);
3010 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3013 if (ST.accepted >= bufflen) {
3015 ST.accept_buff =(reg_trie_accepted*)
3016 SvGROW(sv_accept_buff,
3017 bufflen * sizeof(reg_trie_accepted));
3019 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3020 + sizeof(reg_trie_accepted));
3023 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3024 ST.accept_buff[ST.accepted].endpos = uc;
3026 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3030 DEBUG_TRIE_EXECUTE_r({
3031 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3032 PerlIO_printf( Perl_debug_log,
3033 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
3034 2+depth * 2, "", PL_colors[4],
3035 (UV)state, (UV)ST.accepted );
3039 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3040 uscan, len, uvc, charid, foldlen,
3044 (base + charid > trie->uniquecharcount )
3045 && (base + charid - 1 - trie->uniquecharcount
3047 && trie->trans[base + charid - 1 -
3048 trie->uniquecharcount].check == state)
3050 state = trie->trans[base + charid - 1 -
3051 trie->uniquecharcount ].next;
3062 DEBUG_TRIE_EXECUTE_r(
3063 PerlIO_printf( Perl_debug_log,
3064 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3065 charid, uvc, (UV)state, PL_colors[5] );
3072 PerlIO_printf( Perl_debug_log,
3073 "%*s %sgot %"IVdf" possible matches%s\n",
3074 REPORT_CODE_OFF + depth * 2, "",
3075 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3078 goto trie_first_try; /* jump into the fail handler */
3080 case TRIE_next_fail: /* we failed - try next alterative */
3082 REGCP_UNWIND(ST.cp);
3083 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3084 PL_regoffs[n].end = -1;
3085 *PL_reglastparen = n;
3094 ST.lastparen = *PL_reglastparen;
3097 if ( ST.accepted == 1 ) {
3098 /* only one choice left - just continue */
3100 AV *const trie_words
3101 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3102 SV ** const tmp = av_fetch( trie_words,
3103 ST.accept_buff[ 0 ].wordnum-1, 0 );
3104 SV *sv= tmp ? sv_newmortal() : NULL;
3106 PerlIO_printf( Perl_debug_log,
3107 "%*s %sonly one match left: #%d <%s>%s\n",
3108 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3109 ST.accept_buff[ 0 ].wordnum,
3110 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3111 PL_colors[0], PL_colors[1],
3112 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3114 : "not compiled under -Dr",
3117 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3118 /* in this case we free tmps/leave before we call regmatch
3119 as we wont be using accept_buff again. */
3121 locinput = PL_reginput;
3122 nextchr = UCHARAT(locinput);
3123 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3126 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3127 if (!has_cutgroup) {
3132 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3135 continue; /* execute rest of RE */
3138 if ( !ST.accepted-- ) {
3140 PerlIO_printf( Perl_debug_log,
3141 "%*s %sTRIE failed...%s\n",
3142 REPORT_CODE_OFF+depth*2, "",
3153 There are at least two accepting states left. Presumably
3154 the number of accepting states is going to be low,
3155 typically two. So we simply scan through to find the one
3156 with lowest wordnum. Once we find it, we swap the last
3157 state into its place and decrement the size. We then try to
3158 match the rest of the pattern at the point where the word
3159 ends. If we succeed, control just continues along the
3160 regex; if we fail we return here to try the next accepting
3167 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3168 DEBUG_TRIE_EXECUTE_r(
3169 PerlIO_printf( Perl_debug_log,
3170 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3171 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3172 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3173 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3176 if (ST.accept_buff[cur].wordnum <
3177 ST.accept_buff[best].wordnum)
3182 AV *const trie_words
3183 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3184 SV ** const tmp = av_fetch( trie_words,
3185 ST.accept_buff[ best ].wordnum - 1, 0 );
3186 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3188 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3189 SV *sv= tmp ? sv_newmortal() : NULL;
3191 PerlIO_printf( Perl_debug_log,
3192 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3193 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3194 ST.accept_buff[best].wordnum,
3195 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3196 PL_colors[0], PL_colors[1],
3197 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3198 ) : "not compiled under -Dr",
3199 REG_NODE_NUM(nextop),
3203 if ( best<ST.accepted ) {
3204 reg_trie_accepted tmp = ST.accept_buff[ best ];
3205 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3206 ST.accept_buff[ ST.accepted ] = tmp;
3209 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3210 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3213 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3215 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3226 char *s = STRING(scan);
3228 if (do_utf8 != UTF) {
3229 /* The target and the pattern have differing utf8ness. */
3231 const char * const e = s + ln;
3234 /* The target is utf8, the pattern is not utf8. */
3239 if (NATIVE_TO_UNI(*(U8*)s) !=
3240 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3248 /* The target is not utf8, the pattern is utf8. */
3253 if (NATIVE_TO_UNI(*((U8*)l)) !=
3254 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3262 nextchr = UCHARAT(locinput);
3265 /* The target and the pattern have the same utf8ness. */
3266 /* Inline the first character, for speed. */
3267 if (UCHARAT(s) != nextchr)
3269 if (PL_regeol - locinput < ln)
3271 if (ln > 1 && memNE(s, locinput, ln))
3274 nextchr = UCHARAT(locinput);
3278 PL_reg_flags |= RF_tainted;
3281 char * const s = STRING(scan);
3284 if (do_utf8 || UTF) {
3285 /* Either target or the pattern are utf8. */
3286 const char * const l = locinput;
3287 char *e = PL_regeol;
3289 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3290 l, &e, 0, do_utf8)) {
3291 /* One more case for the sharp s:
3292 * pack("U0U*", 0xDF) =~ /ss/i,
3293 * the 0xC3 0x9F are the UTF-8
3294 * byte sequence for the U+00DF. */
3297 toLOWER(s[0]) == 's' &&
3299 toLOWER(s[1]) == 's' &&
3306 nextchr = UCHARAT(locinput);
3310 /* Neither the target and the pattern are utf8. */
3312 /* Inline the first character, for speed. */
3313 if (UCHARAT(s) != nextchr &&
3314 UCHARAT(s) != ((OP(scan) == EXACTF)
3315 ? PL_fold : PL_fold_locale)[nextchr])
3317 if (PL_regeol - locinput < ln)
3319 if (ln > 1 && (OP(scan) == EXACTF
3320 ? ibcmp(s, locinput, ln)
3321 : ibcmp_locale(s, locinput, ln)))
3324 nextchr = UCHARAT(locinput);
3329 STRLEN inclasslen = PL_regeol - locinput;
3331 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3333 if (locinput >= PL_regeol)
3335 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3336 nextchr = UCHARAT(locinput);
3341 nextchr = UCHARAT(locinput);
3342 if (!REGINCLASS(rex, scan, (U8*)locinput))
3344 if (!nextchr && locinput >= PL_regeol)
3346 nextchr = UCHARAT(++locinput);
3350 /* If we might have the case of the German sharp s
3351 * in a casefolding Unicode character class. */
3353 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3354 locinput += SHARP_S_SKIP;
3355 nextchr = UCHARAT(locinput);
3361 PL_reg_flags |= RF_tainted;
3367 LOAD_UTF8_CHARCLASS_ALNUM();
3368 if (!(OP(scan) == ALNUM
3369 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3370 : isALNUM_LC_utf8((U8*)locinput)))
3374 locinput += PL_utf8skip[nextchr];
3375 nextchr = UCHARAT(locinput);
3378 if (!(OP(scan) == ALNUM
3379 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3381 nextchr = UCHARAT(++locinput);
3384 PL_reg_flags |= RF_tainted;
3387 if (!nextchr && locinput >= PL_regeol)
3390 LOAD_UTF8_CHARCLASS_ALNUM();
3391 if (OP(scan) == NALNUM
3392 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3393 : isALNUM_LC_utf8((U8*)locinput))
3397 locinput += PL_utf8skip[nextchr];
3398 nextchr = UCHARAT(locinput);
3401 if (OP(scan) == NALNUM
3402 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3404 nextchr = UCHARAT(++locinput);
3408 PL_reg_flags |= RF_tainted;
3412 /* was last char in word? */
3414 if (locinput == PL_bostr)
3417 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3419 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3421 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3422 ln = isALNUM_uni(ln);
3423 LOAD_UTF8_CHARCLASS_ALNUM();
3424 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3427 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3428 n = isALNUM_LC_utf8((U8*)locinput);
3432 ln = (locinput != PL_bostr) ?
3433 UCHARAT(locinput - 1) : '\n';
3434 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3436 n = isALNUM(nextchr);
3439 ln = isALNUM_LC(ln);
3440 n = isALNUM_LC(nextchr);
3443 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3444 OP(scan) == BOUNDL))
3448 PL_reg_flags |= RF_tainted;
3454 if (UTF8_IS_CONTINUED(nextchr)) {
3455 LOAD_UTF8_CHARCLASS_SPACE();
3456 if (!(OP(scan) == SPACE
3457 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3458 : isSPACE_LC_utf8((U8*)locinput)))
3462 locinput += PL_utf8skip[nextchr];
3463 nextchr = UCHARAT(locinput);
3466 if (!(OP(scan) == SPACE
3467 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3469 nextchr = UCHARAT(++locinput);
3472 if (!(OP(scan) == SPACE
3473 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3475 nextchr = UCHARAT(++locinput);
3479 PL_reg_flags |= RF_tainted;
3482 if (!nextchr && locinput >= PL_regeol)
3485 LOAD_UTF8_CHARCLASS_SPACE();
3486 if (OP(scan) == NSPACE
3487 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3488 : isSPACE_LC_utf8((U8*)locinput))
3492 locinput += PL_utf8skip[nextchr];
3493 nextchr = UCHARAT(locinput);
3496 if (OP(scan) == NSPACE
3497 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3499 nextchr = UCHARAT(++locinput);
3502 PL_reg_flags |= RF_tainted;
3508 LOAD_UTF8_CHARCLASS_DIGIT();
3509 if (!(OP(scan) == DIGIT
3510 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3511 : isDIGIT_LC_utf8((U8*)locinput)))
3515 locinput += PL_utf8skip[nextchr];
3516 nextchr = UCHARAT(locinput);
3519 if (!(OP(scan) == DIGIT
3520 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3522 nextchr = UCHARAT(++locinput);
3525 PL_reg_flags |= RF_tainted;
3528 if (!nextchr && locinput >= PL_regeol)
3531 LOAD_UTF8_CHARCLASS_DIGIT();
3532 if (OP(scan) == NDIGIT
3533 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3534 : isDIGIT_LC_utf8((U8*)locinput))
3538 locinput += PL_utf8skip[nextchr];
3539 nextchr = UCHARAT(locinput);
3542 if (OP(scan) == NDIGIT
3543 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3545 nextchr = UCHARAT(++locinput);
3548 if (locinput >= PL_regeol)
3551 LOAD_UTF8_CHARCLASS_MARK();
3552 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3554 locinput += PL_utf8skip[nextchr];
3555 while (locinput < PL_regeol &&
3556 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3557 locinput += UTF8SKIP(locinput);
3558 if (locinput > PL_regeol)
3563 nextchr = UCHARAT(locinput);
3570 PL_reg_flags |= RF_tainted;
3575 n = reg_check_named_buff_matched(rex,scan);
3578 type = REF + ( type - NREF );
3585 PL_reg_flags |= RF_tainted;
3589 n = ARG(scan); /* which paren pair */
3592 ln = PL_regoffs[n].start;
3593 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3594 if (*PL_reglastparen < n || ln == -1)
3595 sayNO; /* Do not match unless seen CLOSEn. */
3596 if (ln == PL_regoffs[n].end)
3600 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3602 const char *e = PL_bostr + PL_regoffs[n].end;
3604 * Note that we can't do the "other character" lookup trick as
3605 * in the 8-bit case (no pun intended) because in Unicode we
3606 * have to map both upper and title case to lower case.
3610 STRLEN ulen1, ulen2;
3611 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3612 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3616 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3617 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3618 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3625 nextchr = UCHARAT(locinput);
3629 /* Inline the first character, for speed. */
3630 if (UCHARAT(s) != nextchr &&
3632 (UCHARAT(s) != (type == REFF
3633 ? PL_fold : PL_fold_locale)[nextchr])))
3635 ln = PL_regoffs[n].end - ln;
3636 if (locinput + ln > PL_regeol)
3638 if (ln > 1 && (type == REF
3639 ? memNE(s, locinput, ln)
3641 ? ibcmp(s, locinput, ln)
3642 : ibcmp_locale(s, locinput, ln))))
3645 nextchr = UCHARAT(locinput);
3655 #define ST st->u.eval
3660 regexp_internal *rei;
3661 regnode *startpoint;
3664 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3665 if (cur_eval && cur_eval->locinput==locinput) {
3666 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3667 Perl_croak(aTHX_ "Infinite recursion in regex");
3668 if ( ++nochange_depth > max_nochange_depth )
3670 "Pattern subroutine nesting without pos change"
3671 " exceeded limit in regex");
3678 (void)ReREFCNT_inc(rex_sv);
3679 if (OP(scan)==GOSUB) {
3680 startpoint = scan + ARG2L(scan);
3681 ST.close_paren = ARG(scan);
3683 startpoint = rei->program+1;
3686 goto eval_recurse_doit;
3688 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3689 if (cur_eval && cur_eval->locinput==locinput) {
3690 if ( ++nochange_depth > max_nochange_depth )
3691 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3696 /* execute the code in the {...} */
3698 SV ** const before = SP;
3699 OP_4tree * const oop = PL_op;
3700 COP * const ocurcop = PL_curcop;
3704 PL_op = (OP_4tree*)rexi->data->data[n];
3705 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3706 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3707 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3708 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3711 SV *sv_mrk = get_sv("REGMARK", 1);
3712 sv_setsv(sv_mrk, sv_yes_mark);
3715 CALLRUNOPS(aTHX); /* Scalar context. */
3718 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3725 PAD_RESTORE_LOCAL(old_comppad);
3726 PL_curcop = ocurcop;
3729 sv_setsv(save_scalar(PL_replgv), ret);
3733 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3736 /* extract RE object from returned value; compiling if
3742 SV *const sv = SvRV(ret);
3744 if (SvTYPE(sv) == SVt_REGEXP) {
3746 } else if (SvSMAGICAL(sv)) {
3747 mg = mg_find(sv, PERL_MAGIC_qr);
3750 } else if (SvTYPE(ret) == SVt_REGEXP) {
3752 } else if (SvSMAGICAL(ret)) {
3753 if (SvGMAGICAL(ret)) {
3754 /* I don't believe that there is ever qr magic
3756 assert(!mg_find(ret, PERL_MAGIC_qr));
3757 sv_unmagic(ret, PERL_MAGIC_qr);
3760 mg = mg_find(ret, PERL_MAGIC_qr);
3761 /* testing suggests mg only ends up non-NULL for
3762 scalars who were upgraded and compiled in the
3763 else block below. In turn, this is only
3764 triggered in the "postponed utf8 string" tests
3770 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3774 rx = reg_temp_copy(rx);
3778 const I32 osize = PL_regsize;
3781 assert (SvUTF8(ret));
3782 } else if (SvUTF8(ret)) {
3783 /* Not doing UTF-8, despite what the SV says. Is
3784 this only if we're trapped in use 'bytes'? */
3785 /* Make a copy of the octet sequence, but without
3786 the flag on, as the compiler now honours the
3787 SvUTF8 flag on ret. */
3789 const char *const p = SvPV(ret, len);
3790 ret = newSVpvn_flags(p, len, SVs_TEMP);
3792 rx = CALLREGCOMP(ret, pm_flags);
3794 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3796 /* This isn't a first class regexp. Instead, it's
3797 caching a regexp onto an existing, Perl visible
3799 sv_magic(ret, (SV*) rx, PERL_MAGIC_qr, 0, 0);
3804 re = (struct regexp *)SvANY(rx);
3806 RXp_MATCH_COPIED_off(re);
3807 re->subbeg = rex->subbeg;
3808 re->sublen = rex->sublen;
3811 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
3812 "Matching embedded");
3814 startpoint = rei->program + 1;
3815 ST.close_paren = 0; /* only used for GOSUB */
3816 /* borrowed from regtry */
3817 if (PL_reg_start_tmpl <= re->nparens) {
3818 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3819 if(PL_reg_start_tmp)
3820 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3822 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3825 eval_recurse_doit: /* Share code with GOSUB below this line */
3826 /* run the pattern returned from (??{...}) */
3827 ST.cp = regcppush(0); /* Save *all* the positions. */
3828 REGCP_SET(ST.lastcp);
3830 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
3832 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3833 PL_reglastparen = &re->lastparen;
3834 PL_reglastcloseparen = &re->lastcloseparen;
3836 re->lastcloseparen = 0;
3838 PL_reginput = locinput;
3841 /* XXXX This is too dramatic a measure... */
3844 ST.toggle_reg_flags = PL_reg_flags;
3846 PL_reg_flags |= RF_utf8;
3848 PL_reg_flags &= ~RF_utf8;
3849 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3851 ST.prev_rex = rex_sv;
3852 ST.prev_curlyx = cur_curlyx;
3853 SETREX(rex_sv,re_sv);
3858 ST.prev_eval = cur_eval;
3860 /* now continue from first node in postoned RE */
3861 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3864 /* logical is 1, /(?(?{...})X|Y)/ */
3865 sw = (bool)SvTRUE(ret);
3870 case EVAL_AB: /* cleanup after a successful (??{A})B */
3871 /* note: this is called twice; first after popping B, then A */
3872 PL_reg_flags ^= ST.toggle_reg_flags;
3873 ReREFCNT_dec(rex_sv);
3874 SETREX(rex_sv,ST.prev_rex);
3875 rex = (struct regexp *)SvANY(rex_sv);
3876 rexi = RXi_GET(rex);
3878 cur_eval = ST.prev_eval;
3879 cur_curlyx = ST.prev_curlyx;
3881 PL_reglastparen = &rex->lastparen;
3882 PL_reglastcloseparen = &rex->lastcloseparen;
3884 /* XXXX This is too dramatic a measure... */
3886 if ( nochange_depth )
3891 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3892 /* note: this is called twice; first after popping B, then A */
3893 PL_reg_flags ^= ST.toggle_reg_flags;
3894 ReREFCNT_dec(rex_sv);
3895 SETREX(rex_sv,ST.prev_rex);
3896 rex = (struct regexp *)SvANY(rex_sv);
3897 rexi = RXi_GET(rex);
3898 PL_reglastparen = &rex->lastparen;
3899 PL_reglastcloseparen = &rex->lastcloseparen;
3901 PL_reginput = locinput;
3902 REGCP_UNWIND(ST.lastcp);
3904 cur_eval = ST.prev_eval;
3905 cur_curlyx = ST.prev_curlyx;
3906 /* XXXX This is too dramatic a measure... */
3908 if ( nochange_depth )
3914 n = ARG(scan); /* which paren pair */
3915 PL_reg_start_tmp[n] = locinput;
3921 n = ARG(scan); /* which paren pair */
3922 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3923 PL_regoffs[n].end = locinput - PL_bostr;
3924 /*if (n > PL_regsize)
3926 if (n > *PL_reglastparen)
3927 *PL_reglastparen = n;
3928 *PL_reglastcloseparen = n;
3929 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3937 cursor && OP(cursor)!=END;
3938 cursor=regnext(cursor))
3940 if ( OP(cursor)==CLOSE ){
3942 if ( n <= lastopen ) {
3944 = PL_reg_start_tmp[n] - PL_bostr;
3945 PL_regoffs[n].end = locinput - PL_bostr;
3946 /*if (n > PL_regsize)
3948 if (n > *PL_reglastparen)
3949 *PL_reglastparen = n;
3950 *PL_reglastcloseparen = n;
3951 if ( n == ARG(scan) || (cur_eval &&
3952 cur_eval->u.eval.close_paren == n))
3961 n = ARG(scan); /* which paren pair */
3962 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
3965 /* reg_check_named_buff_matched returns 0 for no match */
3966 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3970 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3976 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3978 next = NEXTOPER(NEXTOPER(scan));
3980 next = scan + ARG(scan);
3981 if (OP(next) == IFTHEN) /* Fake one. */
3982 next = NEXTOPER(NEXTOPER(next));
3986 logical = scan->flags;
3989 /*******************************************************************
3991 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3992 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3993 STAR/PLUS/CURLY/CURLYN are used instead.)
3995 A*B is compiled as <CURLYX><A><WHILEM><B>
3997 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3998 state, which contains the current count, initialised to -1. It also sets
3999 cur_curlyx to point to this state, with any previous value saved in the
4002 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4003 since the pattern may possibly match zero times (i.e. it's a while {} loop
4004 rather than a do {} while loop).
4006 Each entry to WHILEM represents a successful match of A. The count in the
4007 CURLYX block is incremented, another WHILEM state is pushed, and execution
4008 passes to A or B depending on greediness and the current count.
4010 For example, if matching against the string a1a2a3b (where the aN are
4011 substrings that match /A/), then the match progresses as follows: (the
4012 pushed states are interspersed with the bits of strings matched so far):
4015 <CURLYX cnt=0><WHILEM>
4016 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4017 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4018 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4019 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4021 (Contrast this with something like CURLYM, which maintains only a single
4025 a1 <CURLYM cnt=1> a2
4026 a1 a2 <CURLYM cnt=2> a3
4027 a1 a2 a3 <CURLYM cnt=3> b
4030 Each WHILEM state block marks a point to backtrack to upon partial failure
4031 of A or B, and also contains some minor state data related to that
4032 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4033 overall state, such as the count, and pointers to the A and B ops.
4035 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4036 must always point to the *current* CURLYX block, the rules are:
4038 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4039 and set cur_curlyx to point the new block.
4041 When popping the CURLYX block after a successful or unsuccessful match,
4042 restore the previous cur_curlyx.
4044 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4045 to the outer one saved in the CURLYX block.
4047 When popping the WHILEM block after a successful or unsuccessful B match,
4048 restore the previous cur_curlyx.
4050 Here's an example for the pattern (AI* BI)*BO
4051 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4054 curlyx backtrack stack
4055 ------ ---------------
4057 CO <CO prev=NULL> <WO>
4058 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4059 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4060 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4062 At this point the pattern succeeds, and we work back down the stack to
4063 clean up, restoring as we go:
4065 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4066 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4067 CO <CO prev=NULL> <WO>
4070 *******************************************************************/
4072 #define ST st->u.curlyx
4074 case CURLYX: /* start of /A*B/ (for complex A) */
4076 /* No need to save/restore up to this paren */
4077 I32 parenfloor = scan->flags;
4079 assert(next); /* keep Coverity happy */
4080 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4083 /* XXXX Probably it is better to teach regpush to support
4084 parenfloor > PL_regsize... */
4085 if (parenfloor > (I32)*PL_reglastparen)
4086 parenfloor = *PL_reglastparen; /* Pessimization... */
4088 ST.prev_curlyx= cur_curlyx;
4090 ST.cp = PL_savestack_ix;
4092 /* these fields contain the state of the current curly.
4093 * they are accessed by subsequent WHILEMs */
4094 ST.parenfloor = parenfloor;
4095 ST.min = ARG1(scan);
4096 ST.max = ARG2(scan);
4097 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4101 ST.count = -1; /* this will be updated by WHILEM */
4102 ST.lastloc = NULL; /* this will be updated by WHILEM */
4104 PL_reginput = locinput;
4105 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4109 case CURLYX_end: /* just finished matching all of A*B */
4110 cur_curlyx = ST.prev_curlyx;
4114 case CURLYX_end_fail: /* just failed to match all of A*B */
4116 cur_curlyx = ST.prev_curlyx;
4122 #define ST st->u.whilem
4124 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4126 /* see the discussion above about CURLYX/WHILEM */
4128 assert(cur_curlyx); /* keep Coverity happy */
4129 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4130 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4131 ST.cache_offset = 0;
4134 PL_reginput = locinput;
4136 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4137 "%*s whilem: matched %ld out of %ld..%ld\n",
4138 REPORT_CODE_OFF+depth*2, "", (long)n,
4139 (long)cur_curlyx->u.curlyx.min,
4140 (long)cur_curlyx->u.curlyx.max)
4143 /* First just match a string of min A's. */
4145 if (n < cur_curlyx->u.curlyx.min) {
4146 cur_curlyx->u.curlyx.lastloc = locinput;
4147 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4151 /* If degenerate A matches "", assume A done. */
4153 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4154 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4155 "%*s whilem: empty match detected, trying continuation...\n",
4156 REPORT_CODE_OFF+depth*2, "")
4158 goto do_whilem_B_max;
4161 /* super-linear cache processing */
4165 if (!PL_reg_maxiter) {
4166 /* start the countdown: Postpone detection until we
4167 * know the match is not *that* much linear. */
4168 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4169 /* possible overflow for long strings and many CURLYX's */
4170 if (PL_reg_maxiter < 0)
4171 PL_reg_maxiter = I32_MAX;
4172 PL_reg_leftiter = PL_reg_maxiter;
4175 if (PL_reg_leftiter-- == 0) {
4176 /* initialise cache */
4177 const I32 size = (PL_reg_maxiter + 7)/8;
4178 if (PL_reg_poscache) {
4179 if ((I32)PL_reg_poscache_size < size) {
4180 Renew(PL_reg_poscache, size, char);
4181 PL_reg_poscache_size = size;
4183 Zero(PL_reg_poscache, size, char);
4186 PL_reg_poscache_size = size;
4187 Newxz(PL_reg_poscache, size, char);
4189 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4190 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4191 PL_colors[4], PL_colors[5])
4195 if (PL_reg_leftiter < 0) {
4196 /* have we already failed at this position? */
4198 offset = (scan->flags & 0xf) - 1
4199 + (locinput - PL_bostr) * (scan->flags>>4);
4200 mask = 1 << (offset % 8);
4202 if (PL_reg_poscache[offset] & mask) {
4203 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4204 "%*s whilem: (cache) already tried at this position...\n",
4205 REPORT_CODE_OFF+depth*2, "")
4207 sayNO; /* cache records failure */
4209 ST.cache_offset = offset;
4210 ST.cache_mask = mask;
4214 /* Prefer B over A for minimal matching. */
4216 if (cur_curlyx->u.curlyx.minmod) {
4217 ST.save_curlyx = cur_curlyx;
4218 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4219 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4220 REGCP_SET(ST.lastcp);
4221 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4225 /* Prefer A over B for maximal matching. */
4227 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4228 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4229 cur_curlyx->u.curlyx.lastloc = locinput;
4230 REGCP_SET(ST.lastcp);
4231 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4234 goto do_whilem_B_max;
4238 case WHILEM_B_min: /* just matched B in a minimal match */
4239 case WHILEM_B_max: /* just matched B in a maximal match */
4240 cur_curlyx = ST.save_curlyx;
4244 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4245 cur_curlyx = ST.save_curlyx;
4246 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4247 cur_curlyx->u.curlyx.count--;
4251 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4252 REGCP_UNWIND(ST.lastcp);
4255 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4256 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4257 cur_curlyx->u.curlyx.count--;
4261 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4262 REGCP_UNWIND(ST.lastcp);
4263 regcppop(rex); /* Restore some previous $<digit>s? */
4264 PL_reginput = locinput;
4265 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4266 "%*s whilem: failed, trying continuation...\n",
4267 REPORT_CODE_OFF+depth*2, "")
4270 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4271 && ckWARN(WARN_REGEXP)
4272 && !(PL_reg_flags & RF_warned))
4274 PL_reg_flags |= RF_warned;
4275 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4276 "Complex regular subexpression recursion",
4281 ST.save_curlyx = cur_curlyx;
4282 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4283 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4286 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4287 cur_curlyx = ST.save_curlyx;
4288 REGCP_UNWIND(ST.lastcp);
4291 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4292 /* Maximum greed exceeded */
4293 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4294 && ckWARN(WARN_REGEXP)
4295 && !(PL_reg_flags & RF_warned))
4297 PL_reg_flags |= RF_warned;
4298 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4299 "%s limit (%d) exceeded",
4300 "Complex regular subexpression recursion",
4303 cur_curlyx->u.curlyx.count--;
4307 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4308 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4310 /* Try grabbing another A and see if it helps. */
4311 PL_reginput = locinput;
4312 cur_curlyx->u.curlyx.lastloc = locinput;
4313 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4314 REGCP_SET(ST.lastcp);
4315 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4319 #define ST st->u.branch
4321 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4322 next = scan + ARG(scan);
4325 scan = NEXTOPER(scan);
4328 case BRANCH: /* /(...|A|...)/ */
4329 scan = NEXTOPER(scan); /* scan now points to inner node */
4330 ST.lastparen = *PL_reglastparen;
4331 ST.next_branch = next;
4333 PL_reginput = locinput;
4335 /* Now go into the branch */
4337 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4339 PUSH_STATE_GOTO(BRANCH_next, scan);
4343 PL_reginput = locinput;
4344 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4345 (SV*)rexi->data->data[ ARG( scan ) ];
4346 PUSH_STATE_GOTO(CUTGROUP_next,next);
4348 case CUTGROUP_next_fail:
4351 if (st->u.mark.mark_name)
4352 sv_commit = st->u.mark.mark_name;
4358 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4363 REGCP_UNWIND(ST.cp);
4364 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4365 PL_regoffs[n].end = -1;
4366 *PL_reglastparen = n;
4367 /*dmq: *PL_reglastcloseparen = n; */
4368 scan = ST.next_branch;
4369 /* no more branches? */
4370 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4372 PerlIO_printf( Perl_debug_log,
4373 "%*s %sBRANCH failed...%s\n",
4374 REPORT_CODE_OFF+depth*2, "",
4380 continue; /* execute next BRANCH[J] op */
4388 #define ST st->u.curlym
4390 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4392 /* This is an optimisation of CURLYX that enables us to push
4393 * only a single backtracking state, no matter now many matches
4394 * there are in {m,n}. It relies on the pattern being constant
4395 * length, with no parens to influence future backrefs
4399 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4401 /* if paren positive, emulate an OPEN/CLOSE around A */
4403 U32 paren = ST.me->flags;
4404 if (paren > PL_regsize)
4406 if (paren > *PL_reglastparen)
4407 *PL_reglastparen = paren;
4408 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4416 ST.c1 = CHRTEST_UNINIT;
4419 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4422 curlym_do_A: /* execute the A in /A{m,n}B/ */
4423 PL_reginput = locinput;
4424 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4427 case CURLYM_A: /* we've just matched an A */
4428 locinput = st->locinput;
4429 nextchr = UCHARAT(locinput);
4432 /* after first match, determine A's length: u.curlym.alen */
4433 if (ST.count == 1) {
4434 if (PL_reg_match_utf8) {
4436 while (s < PL_reginput) {
4442 ST.alen = PL_reginput - locinput;
4445 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4448 PerlIO_printf(Perl_debug_log,
4449 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4450 (int)(REPORT_CODE_OFF+(depth*2)), "",
4451 (IV) ST.count, (IV)ST.alen)
4454 locinput = PL_reginput;
4456 if (cur_eval && cur_eval->u.eval.close_paren &&
4457 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4460 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4461 goto curlym_do_A; /* try to match another A */
4462 goto curlym_do_B; /* try to match B */
4464 case CURLYM_A_fail: /* just failed to match an A */
4465 REGCP_UNWIND(ST.cp);
4467 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4468 || (cur_eval && cur_eval->u.eval.close_paren &&
4469 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4472 curlym_do_B: /* execute the B in /A{m,n}B/ */
4473 PL_reginput = locinput;
4474 if (ST.c1 == CHRTEST_UNINIT) {
4475 /* calculate c1 and c2 for possible match of 1st char
4476 * following curly */
4477 ST.c1 = ST.c2 = CHRTEST_VOID;
4478 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4479 regnode *text_node = ST.B;
4480 if (! HAS_TEXT(text_node))
4481 FIND_NEXT_IMPT(text_node);
4484 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4486 But the former is redundant in light of the latter.
4488 if this changes back then the macro for
4489 IS_TEXT and friends need to change.
4491 if (PL_regkind[OP(text_node)] == EXACT)
4494 ST.c1 = (U8)*STRING(text_node);
4496 (IS_TEXTF(text_node))
4498 : (IS_TEXTFL(text_node))
4499 ? PL_fold_locale[ST.c1]
4506 PerlIO_printf(Perl_debug_log,
4507 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4508 (int)(REPORT_CODE_OFF+(depth*2)),
4511 if (ST.c1 != CHRTEST_VOID
4512 && UCHARAT(PL_reginput) != ST.c1
4513 && UCHARAT(PL_reginput) != ST.c2)
4515 /* simulate B failing */
4517 PerlIO_printf(Perl_debug_log,
4518 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4519 (int)(REPORT_CODE_OFF+(depth*2)),"",
4522 state_num = CURLYM_B_fail;
4523 goto reenter_switch;
4527 /* mark current A as captured */
4528 I32 paren = ST.me->flags;
4530 PL_regoffs[paren].start
4531 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4532 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4533 /*dmq: *PL_reglastcloseparen = paren; */
4536 PL_regoffs[paren].end = -1;
4537 if (cur_eval && cur_eval->u.eval.close_paren &&
4538 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4547 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4550 case CURLYM_B_fail: /* just failed to match a B */
4551 REGCP_UNWIND(ST.cp);
4553 if (ST.count == ARG2(ST.me) /* max */)
4555 goto curlym_do_A; /* try to match a further A */
4557 /* backtrack one A */
4558 if (ST.count == ARG1(ST.me) /* min */)
4561 locinput = HOPc(locinput, -ST.alen);
4562 goto curlym_do_B; /* try to match B */
4565 #define ST st->u.curly
4567 #define CURLY_SETPAREN(paren, success) \
4570 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4571 PL_regoffs[paren].end = locinput - PL_bostr; \
4572 *PL_reglastcloseparen = paren; \
4575 PL_regoffs[paren].end = -1; \
4578 case STAR: /* /A*B/ where A is width 1 */
4582 scan = NEXTOPER(scan);
4584 case PLUS: /* /A+B/ where A is width 1 */
4588 scan = NEXTOPER(scan);
4590 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4591 ST.paren = scan->flags; /* Which paren to set */
4592 if (ST.paren > PL_regsize)
4593 PL_regsize = ST.paren;
4594 if (ST.paren > *PL_reglastparen)
4595 *PL_reglastparen = ST.paren;
4596 ST.min = ARG1(scan); /* min to match */
4597 ST.max = ARG2(scan); /* max to match */
4598 if (cur_eval && cur_eval->u.eval.close_paren &&
4599 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4603 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4605 case CURLY: /* /A{m,n}B/ where A is width 1 */
4607 ST.min = ARG1(scan); /* min to match */
4608 ST.max = ARG2(scan); /* max to match */
4609 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4612 * Lookahead to avoid useless match attempts
4613 * when we know what character comes next.
4615 * Used to only do .*x and .*?x, but now it allows
4616 * for )'s, ('s and (?{ ... })'s to be in the way
4617 * of the quantifier and the EXACT-like node. -- japhy
4620 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4622 if (HAS_TEXT(next) || JUMPABLE(next)) {
4624 regnode *text_node = next;
4626 if (! HAS_TEXT(text_node))
4627 FIND_NEXT_IMPT(text_node);
4629 if (! HAS_TEXT(text_node))
4630 ST.c1 = ST.c2 = CHRTEST_VOID;
4632 if ( PL_regkind[OP(text_node)] != EXACT ) {
4633 ST.c1 = ST.c2 = CHRTEST_VOID;
4634 goto assume_ok_easy;
4637 s = (U8*)STRING(text_node);
4639 /* Currently we only get here when
4641 PL_rekind[OP(text_node)] == EXACT
4643 if this changes back then the macro for IS_TEXT and
4644 friends need to change. */
4647 if (IS_TEXTF(text_node))
4648 ST.c2 = PL_fold[ST.c1];
4649 else if (IS_TEXTFL(text_node))
4650 ST.c2 = PL_fold_locale[ST.c1];
4653 if (IS_TEXTF(text_node)) {
4654 STRLEN ulen1, ulen2;
4655 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4656 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4658 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4659 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4661 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4663 0 : UTF8_ALLOW_ANY);
4664 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4666 0 : UTF8_ALLOW_ANY);
4668 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4670 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4675 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4682 ST.c1 = ST.c2 = CHRTEST_VOID;
4687 PL_reginput = locinput;
4690 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4693 locinput = PL_reginput;
4695 if (ST.c1 == CHRTEST_VOID)
4696 goto curly_try_B_min;
4698 ST.oldloc = locinput;
4700 /* set ST.maxpos to the furthest point along the
4701 * string that could possibly match */
4702 if (ST.max == REG_INFTY) {
4703 ST.maxpos = PL_regeol - 1;
4705 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4709 int m = ST.max - ST.min;
4710 for (ST.maxpos = locinput;
4711 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4712 ST.maxpos += UTF8SKIP(ST.maxpos);
4715 ST.maxpos = locinput + ST.max - ST.min;
4716 if (ST.maxpos >= PL_regeol)
4717 ST.maxpos = PL_regeol - 1;
4719 goto curly_try_B_min_known;
4723 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4724 locinput = PL_reginput;
4725 if (ST.count < ST.min)
4727 if ((ST.count > ST.min)
4728 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4730 /* A{m,n} must come at the end of the string, there's
4731 * no point in backing off ... */
4733 /* ...except that $ and \Z can match before *and* after
4734 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4735 We may back off by one in this case. */
4736 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4740 goto curly_try_B_max;
4745 case CURLY_B_min_known_fail:
4746 /* failed to find B in a non-greedy match where c1,c2 valid */
4747 if (ST.paren && ST.count)
4748 PL_regoffs[ST.paren].end = -1;
4750 PL_reginput = locinput; /* Could be reset... */
4751 REGCP_UNWIND(ST.cp);
4752 /* Couldn't or didn't -- move forward. */
4753 ST.oldloc = locinput;
4755 locinput += UTF8SKIP(locinput);
4759 curly_try_B_min_known:
4760 /* find the next place where 'B' could work, then call B */
4764 n = (ST.oldloc == locinput) ? 0 : 1;
4765 if (ST.c1 == ST.c2) {
4767 /* set n to utf8_distance(oldloc, locinput) */
4768 while (locinput <= ST.maxpos &&
4769 utf8n_to_uvchr((U8*)locinput,
4770 UTF8_MAXBYTES, &len,
4771 uniflags) != (UV)ST.c1) {
4777 /* set n to utf8_distance(oldloc, locinput) */
4778 while (locinput <= ST.maxpos) {
4780 const UV c = utf8n_to_uvchr((U8*)locinput,
4781 UTF8_MAXBYTES, &len,
4783 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4791 if (ST.c1 == ST.c2) {
4792 while (locinput <= ST.maxpos &&
4793 UCHARAT(locinput) != ST.c1)
4797 while (locinput <= ST.maxpos
4798 && UCHARAT(locinput) != ST.c1
4799 && UCHARAT(locinput) != ST.c2)
4802 n = locinput - ST.oldloc;
4804 if (locinput > ST.maxpos)
4806 /* PL_reginput == oldloc now */
4809 if (regrepeat(rex, ST.A, n, depth) < n)
4812 PL_reginput = locinput;
4813 CURLY_SETPAREN(ST.paren, ST.count);
4814 if (cur_eval && cur_eval->u.eval.close_paren &&
4815 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4818 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4823 case CURLY_B_min_fail:
4824 /* failed to find B in a non-greedy match where c1,c2 invalid */
4825 if (ST.paren && ST.count)
4826 PL_regoffs[ST.paren].end = -1;
4828 REGCP_UNWIND(ST.cp);
4829 /* failed -- move forward one */
4830 PL_reginput = locinput;
4831 if (regrepeat(rex, ST.A, 1, depth)) {
4833 locinput = PL_reginput;
4834 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4835 ST.count > 0)) /* count overflow ? */
4838 CURLY_SETPAREN(ST.paren, ST.count);
4839 if (cur_eval && cur_eval->u.eval.close_paren &&
4840 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4843 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4851 /* a successful greedy match: now try to match B */
4852 if (cur_eval && cur_eval->u.eval.close_paren &&
4853 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4858 if (ST.c1 != CHRTEST_VOID)
4859 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4860 UTF8_MAXBYTES, 0, uniflags)
4861 : (UV) UCHARAT(PL_reginput);
4862 /* If it could work, try it. */
4863 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4864 CURLY_SETPAREN(ST.paren, ST.count);
4865 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4870 case CURLY_B_max_fail:
4871 /* failed to find B in a greedy match */
4872 if (ST.paren && ST.count)
4873 PL_regoffs[ST.paren].end = -1;
4875 REGCP_UNWIND(ST.cp);
4877 if (--ST.count < ST.min)
4879 PL_reginput = locinput = HOPc(locinput, -1);
4880 goto curly_try_B_max;
4887 /* we've just finished A in /(??{A})B/; now continue with B */
4889 st->u.eval.toggle_reg_flags
4890 = cur_eval->u.eval.toggle_reg_flags;
4891 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4893 st->u.eval.prev_rex = rex_sv; /* inner */
4894 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4895 rex = (struct regexp *)SvANY(rex_sv);
4896 rexi = RXi_GET(rex);
4897 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4898 ReREFCNT_inc(rex_sv);
4899 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4900 REGCP_SET(st->u.eval.lastcp);
4901 PL_reginput = locinput;
4903 /* Restore parens of the outer rex without popping the
4905 tmpix = PL_savestack_ix;
4906 PL_savestack_ix = cur_eval->u.eval.lastcp;
4908 PL_savestack_ix = tmpix;
4910 st->u.eval.prev_eval = cur_eval;
4911 cur_eval = cur_eval->u.eval.prev_eval;
4913 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4914 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4915 if ( nochange_depth )
4918 PUSH_YES_STATE_GOTO(EVAL_AB,
4919 st->u.eval.prev_eval->u.eval.B); /* match B */
4922 if (locinput < reginfo->till) {
4923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4924 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4926 (long)(locinput - PL_reg_starttry),
4927 (long)(reginfo->till - PL_reg_starttry),
4930 sayNO_SILENT; /* Cannot match: too short. */
4932 PL_reginput = locinput; /* put where regtry can find it */
4933 sayYES; /* Success! */
4935 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4937 PerlIO_printf(Perl_debug_log,
4938 "%*s %ssubpattern success...%s\n",
4939 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4940 PL_reginput = locinput; /* put where regtry can find it */
4941 sayYES; /* Success! */
4944 #define ST st->u.ifmatch
4946 case SUSPEND: /* (?>A) */
4948 PL_reginput = locinput;
4951 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4953 goto ifmatch_trivial_fail_test;
4955 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4957 ifmatch_trivial_fail_test:
4959 char * const s = HOPBACKc(locinput, scan->flags);
4964 sw = 1 - (bool)ST.wanted;
4968 next = scan + ARG(scan);
4976 PL_reginput = locinput;
4980 ST.logical = logical;
4981 /* execute body of (?...A) */
4982 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4985 case IFMATCH_A_fail: /* body of (?...A) failed */
4986 ST.wanted = !ST.wanted;
4989 case IFMATCH_A: /* body of (?...A) succeeded */
4991 sw = (bool)ST.wanted;
4993 else if (!ST.wanted)
4996 if (OP(ST.me) == SUSPEND)
4997 locinput = PL_reginput;
4999 locinput = PL_reginput = st->locinput;
5000 nextchr = UCHARAT(locinput);
5002 scan = ST.me + ARG(ST.me);
5005 continue; /* execute B */
5010 next = scan + ARG(scan);
5015 reginfo->cutpoint = PL_regeol;
5018 PL_reginput = locinput;
5020 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
5021 PUSH_STATE_GOTO(COMMIT_next,next);
5023 case COMMIT_next_fail:
5030 #define ST st->u.mark
5032 ST.prev_mark = mark_state;
5033 ST.mark_name = sv_commit = sv_yes_mark
5034 = (SV*)rexi->data->data[ ARG( scan ) ];
5036 ST.mark_loc = PL_reginput = locinput;
5037 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5039 case MARKPOINT_next:
5040 mark_state = ST.prev_mark;
5043 case MARKPOINT_next_fail:
5044 if (popmark && sv_eq(ST.mark_name,popmark))
5046 if (ST.mark_loc > startpoint)
5047 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5048 popmark = NULL; /* we found our mark */
5049 sv_commit = ST.mark_name;
5052 PerlIO_printf(Perl_debug_log,
5053 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5054 REPORT_CODE_OFF+depth*2, "",
5055 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5058 mark_state = ST.prev_mark;
5059 sv_yes_mark = mark_state ?
5060 mark_state->u.mark.mark_name : NULL;
5064 PL_reginput = locinput;
5066 /* (*SKIP) : if we fail we cut here*/
5067 ST.mark_name = NULL;
5068 ST.mark_loc = locinput;
5069 PUSH_STATE_GOTO(SKIP_next,next);
5071 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5072 otherwise do nothing. Meaning we need to scan
5074 regmatch_state *cur = mark_state;
5075 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
5078 if ( sv_eq( cur->u.mark.mark_name,
5081 ST.mark_name = find;
5082 PUSH_STATE_GOTO( SKIP_next, next );
5084 cur = cur->u.mark.prev_mark;
5087 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5089 case SKIP_next_fail:
5091 /* (*CUT:NAME) - Set up to search for the name as we
5092 collapse the stack*/
5093 popmark = ST.mark_name;
5095 /* (*CUT) - No name, we cut here.*/
5096 if (ST.mark_loc > startpoint)
5097 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5098 /* but we set sv_commit to latest mark_name if there
5099 is one so they can test to see how things lead to this
5102 sv_commit=mark_state->u.mark.mark_name;
5110 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5112 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5115 U8 folded[UTF8_MAXBYTES_CASE+1];
5117 const char * const l = locinput;
5118 char *e = PL_regeol;
5119 to_uni_fold(n, folded, &foldlen);
5121 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
5122 l, &e, 0, do_utf8)) {
5127 nextchr = UCHARAT(locinput);
5130 if ((n=is_LNBREAK(locinput,do_utf8))) {
5132 nextchr = UCHARAT(locinput);
5137 #define CASE_CLASS(nAmE) \
5139 if ((n=is_##nAmE(locinput,do_utf8))) { \
5141 nextchr = UCHARAT(locinput); \
5146 if ((n=is_##nAmE(locinput,do_utf8))) { \
5149 locinput += UTF8SKIP(locinput); \
5150 nextchr = UCHARAT(locinput); \
5155 CASE_CLASS(HORIZWS);
5159 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5160 PTR2UV(scan), OP(scan));
5161 Perl_croak(aTHX_ "regexp memory corruption");
5165 /* switch break jumps here */
5166 scan = next; /* prepare to execute the next op and ... */
5167 continue; /* ... jump back to the top, reusing st */
5171 /* push a state that backtracks on success */
5172 st->u.yes.prev_yes_state = yes_state;
5176 /* push a new regex state, then continue at scan */
5178 regmatch_state *newst;
5181 regmatch_state *cur = st;
5182 regmatch_state *curyes = yes_state;
5184 regmatch_slab *slab = PL_regmatch_slab;
5185 for (;curd > -1;cur--,curd--) {
5186 if (cur < SLAB_FIRST(slab)) {
5188 cur = SLAB_LAST(slab);
5190 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5191 REPORT_CODE_OFF + 2 + depth * 2,"",
5192 curd, PL_reg_name[cur->resume_state],
5193 (curyes == cur) ? "yes" : ""
5196 curyes = cur->u.yes.prev_yes_state;
5199 DEBUG_STATE_pp("push")
5202 st->locinput = locinput;
5204 if (newst > SLAB_LAST(PL_regmatch_slab))
5205 newst = S_push_slab(aTHX);
5206 PL_regmatch_state = newst;
5208 locinput = PL_reginput;
5209 nextchr = UCHARAT(locinput);
5217 * We get here only if there's trouble -- normally "case END" is
5218 * the terminating point.
5220 Perl_croak(aTHX_ "corrupted regexp pointers");
5226 /* we have successfully completed a subexpression, but we must now
5227 * pop to the state marked by yes_state and continue from there */
5228 assert(st != yes_state);
5230 while (st != yes_state) {
5232 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5233 PL_regmatch_slab = PL_regmatch_slab->prev;
5234 st = SLAB_LAST(PL_regmatch_slab);
5238 DEBUG_STATE_pp("pop (no final)");
5240 DEBUG_STATE_pp("pop (yes)");
5246 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5247 || yes_state > SLAB_LAST(PL_regmatch_slab))
5249 /* not in this slab, pop slab */
5250 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5251 PL_regmatch_slab = PL_regmatch_slab->prev;
5252 st = SLAB_LAST(PL_regmatch_slab);
5254 depth -= (st - yes_state);
5257 yes_state = st->u.yes.prev_yes_state;
5258 PL_regmatch_state = st;
5261 locinput= st->locinput;
5262 nextchr = UCHARAT(locinput);
5264 state_num = st->resume_state + no_final;
5265 goto reenter_switch;
5268 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5269 PL_colors[4], PL_colors[5]));
5271 if (PL_reg_eval_set) {
5272 /* each successfully executed (?{...}) block does the equivalent of
5273 * local $^R = do {...}
5274 * When popping the save stack, all these locals would be undone;
5275 * bypass this by setting the outermost saved $^R to the latest
5277 if (oreplsv != GvSV(PL_replgv))
5278 sv_setsv(oreplsv, GvSV(PL_replgv));
5285 PerlIO_printf(Perl_debug_log,
5286 "%*s %sfailed...%s\n",
5287 REPORT_CODE_OFF+depth*2, "",
5288 PL_colors[4], PL_colors[5])
5300 /* there's a previous state to backtrack to */
5302 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5303 PL_regmatch_slab = PL_regmatch_slab->prev;
5304 st = SLAB_LAST(PL_regmatch_slab);
5306 PL_regmatch_state = st;
5307 locinput= st->locinput;
5308 nextchr = UCHARAT(locinput);
5310 DEBUG_STATE_pp("pop");
5312 if (yes_state == st)
5313 yes_state = st->u.yes.prev_yes_state;
5315 state_num = st->resume_state + 1; /* failure = success + 1 */
5316 goto reenter_switch;
5321 if (rex->intflags & PREGf_VERBARG_SEEN) {
5322 SV *sv_err = get_sv("REGERROR", 1);
5323 SV *sv_mrk = get_sv("REGMARK", 1);
5325 sv_commit = &PL_sv_no;
5327 sv_yes_mark = &PL_sv_yes;
5330 sv_commit = &PL_sv_yes;
5331 sv_yes_mark = &PL_sv_no;
5333 sv_setsv(sv_err, sv_commit);
5334 sv_setsv(sv_mrk, sv_yes_mark);
5337 /* clean up; in particular, free all slabs above current one */
5338 LEAVE_SCOPE(oldsave);
5344 - regrepeat - repeatedly match something simple, report how many
5347 * [This routine now assumes that it will only match on things of length 1.
5348 * That was true before, but now we assume scan - reginput is the count,
5349 * rather than incrementing count on every character. [Er, except utf8.]]
5352 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5355 register char *scan;
5357 register char *loceol = PL_regeol;
5358 register I32 hardcount = 0;
5359 register bool do_utf8 = PL_reg_match_utf8;
5361 PERL_UNUSED_ARG(depth);
5364 PERL_ARGS_ASSERT_REGREPEAT;
5367 if (max == REG_INFTY)
5369 else if (max < loceol - scan)
5370 loceol = scan + max;
5375 while (scan < loceol && hardcount < max && *scan != '\n') {
5376 scan += UTF8SKIP(scan);
5380 while (scan < loceol && *scan != '\n')
5387 while (scan < loceol && hardcount < max) {
5388 scan += UTF8SKIP(scan);
5398 case EXACT: /* length of string is 1 */
5400 while (scan < loceol && UCHARAT(scan) == c)
5403 case EXACTF: /* length of string is 1 */
5405 while (scan < loceol &&
5406 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5409 case EXACTFL: /* length of string is 1 */
5410 PL_reg_flags |= RF_tainted;
5412 while (scan < loceol &&
5413 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5419 while (hardcount < max && scan < loceol &&
5420 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5421 scan += UTF8SKIP(scan);
5425 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5432 LOAD_UTF8_CHARCLASS_ALNUM();
5433 while (hardcount < max && scan < loceol &&
5434 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5435 scan += UTF8SKIP(scan);
5439 while (scan < loceol && isALNUM(*scan))
5444 PL_reg_flags |= RF_tainted;
5447 while (hardcount < max && scan < loceol &&
5448 isALNUM_LC_utf8((U8*)scan)) {
5449 scan += UTF8SKIP(scan);
5453 while (scan < loceol && isALNUM_LC(*scan))
5460 LOAD_UTF8_CHARCLASS_ALNUM();
5461 while (hardcount < max && scan < loceol &&
5462 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5463 scan += UTF8SKIP(scan);
5467 while (scan < loceol && !isALNUM(*scan))
5472 PL_reg_flags |= RF_tainted;
5475 while (hardcount < max && scan < loceol &&
5476 !isALNUM_LC_utf8((U8*)scan)) {
5477 scan += UTF8SKIP(scan);
5481 while (scan < loceol && !isALNUM_LC(*scan))
5488 LOAD_UTF8_CHARCLASS_SPACE();
5489 while (hardcount < max && scan < loceol &&
5491 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5492 scan += UTF8SKIP(scan);
5496 while (scan < loceol && isSPACE(*scan))
5501 PL_reg_flags |= RF_tainted;
5504 while (hardcount < max && scan < loceol &&
5505 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5506 scan += UTF8SKIP(scan);
5510 while (scan < loceol && isSPACE_LC(*scan))
5517 LOAD_UTF8_CHARCLASS_SPACE();
5518 while (hardcount < max && scan < loceol &&
5520 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5521 scan += UTF8SKIP(scan);
5525 while (scan < loceol && !isSPACE(*scan))
5530 PL_reg_flags |= RF_tainted;
5533 while (hardcount < max && scan < loceol &&
5534 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5535 scan += UTF8SKIP(scan);
5539 while (scan < loceol && !isSPACE_LC(*scan))
5546 LOAD_UTF8_CHARCLASS_DIGIT();
5547 while (hardcount < max && scan < loceol &&
5548 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5549 scan += UTF8SKIP(scan);
5553 while (scan < loceol && isDIGIT(*scan))
5560 LOAD_UTF8_CHARCLASS_DIGIT();
5561 while (hardcount < max && scan < loceol &&
5562 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5563 scan += UTF8SKIP(scan);
5567 while (scan < loceol && !isDIGIT(*scan))
5573 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5579 LNBREAK can match two latin chars, which is ok,
5580 because we have a null terminated string, but we
5581 have to use hardcount in this situation
5583 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5592 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5597 while (scan < loceol && is_HORIZWS_latin1(scan))
5604 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5605 scan += UTF8SKIP(scan);
5609 while (scan < loceol && !is_HORIZWS_latin1(scan))
5617 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5622 while (scan < loceol && is_VERTWS_latin1(scan))
5630 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5631 scan += UTF8SKIP(scan);
5635 while (scan < loceol && !is_VERTWS_latin1(scan))
5641 default: /* Called on something of 0 width. */
5642 break; /* So match right here or not at all. */
5648 c = scan - PL_reginput;
5652 GET_RE_DEBUG_FLAGS_DECL;
5654 SV * const prop = sv_newmortal();
5655 regprop(prog, prop, p);
5656 PerlIO_printf(Perl_debug_log,
5657 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5658 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5666 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5668 - regclass_swash - prepare the utf8 swash
5672 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5678 RXi_GET_DECL(prog,progi);
5679 const struct reg_data * const data = prog ? progi->data : NULL;
5681 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5683 if (data && data->count) {
5684 const U32 n = ARG(node);
5686 if (data->what[n] == 's') {
5687 SV * const rv = (SV*)data->data[n];
5688 AV * const av = (AV*)SvRV((SV*)rv);
5689 SV **const ary = AvARRAY(av);
5692 /* See the end of regcomp.c:S_regclass() for
5693 * documentation of these array elements. */
5696 a = SvROK(ary[1]) ? &ary[1] : NULL;
5697 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5701 else if (si && doinit) {
5702 sw = swash_init("utf8", "", si, 1, 0);
5703 (void)av_store(av, 1, sw);
5720 - reginclass - determine if a character falls into a character class
5722 The n is the ANYOF regnode, the p is the target string, lenp
5723 is pointer to the maximum length of how far to go in the p
5724 (if the lenp is zero, UTF8SKIP(p) is used),
5725 do_utf8 tells whether the target string is in UTF-8.
5730 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5733 const char flags = ANYOF_FLAGS(n);
5739 PERL_ARGS_ASSERT_REGINCLASS;
5741 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5742 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5743 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5744 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5745 if (len == (STRLEN)-1)
5746 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5749 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5750 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5753 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5754 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5757 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5761 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5764 if (swash_fetch(sw, p, do_utf8))
5766 else if (flags & ANYOF_FOLD) {
5767 if (!match && lenp && av) {
5769 for (i = 0; i <= av_len(av); i++) {
5770 SV* const sv = *av_fetch(av, i, FALSE);
5772 const char * const s = SvPV_const(sv, len);
5774 if (len <= plen && memEQ(s, (char*)p, len)) {
5782 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5785 to_utf8_fold(p, tmpbuf, &tmplen);
5786 if (swash_fetch(sw, tmpbuf, do_utf8))
5792 if (match && lenp && *lenp == 0)
5793 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5795 if (!match && c < 256) {
5796 if (ANYOF_BITMAP_TEST(n, c))
5798 else if (flags & ANYOF_FOLD) {
5801 if (flags & ANYOF_LOCALE) {
5802 PL_reg_flags |= RF_tainted;
5803 f = PL_fold_locale[c];
5807 if (f != c && ANYOF_BITMAP_TEST(n, f))
5811 if (!match && (flags & ANYOF_CLASS)) {
5812 PL_reg_flags |= RF_tainted;
5814 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5815 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5816 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5817 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5818 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5819 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5820 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5821 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5822 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5823 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5824 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5825 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5826 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5827 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5828 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5829 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5830 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5831 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5832 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5833 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5834 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5835 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5836 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5837 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5838 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5839 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5840 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5841 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5842 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5843 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5844 ) /* How's that for a conditional? */
5851 return (flags & ANYOF_INVERT) ? !match : match;
5855 S_reghop3(U8 *s, I32 off, const U8* lim)
5859 PERL_ARGS_ASSERT_REGHOP3;
5862 while (off-- && s < lim) {
5863 /* XXX could check well-formedness here */
5868 while (off++ && s > lim) {
5870 if (UTF8_IS_CONTINUED(*s)) {
5871 while (s > lim && UTF8_IS_CONTINUATION(*s))
5874 /* XXX could check well-formedness here */
5881 /* there are a bunch of places where we use two reghop3's that should
5882 be replaced with this routine. but since thats not done yet
5883 we ifdef it out - dmq
5886 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5890 PERL_ARGS_ASSERT_REGHOP4;
5893 while (off-- && s < rlim) {
5894 /* XXX could check well-formedness here */
5899 while (off++ && s > llim) {
5901 if (UTF8_IS_CONTINUED(*s)) {
5902 while (s > llim && UTF8_IS_CONTINUATION(*s))
5905 /* XXX could check well-formedness here */
5913 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5917 PERL_ARGS_ASSERT_REGHOPMAYBE3;
5920 while (off-- && s < lim) {
5921 /* XXX could check well-formedness here */
5928 while (off++ && s > lim) {
5930 if (UTF8_IS_CONTINUED(*s)) {
5931 while (s > lim && UTF8_IS_CONTINUATION(*s))
5934 /* XXX could check well-formedness here */
5943 restore_pos(pTHX_ void *arg)
5946 regexp * const rex = (regexp *)arg;
5947 if (PL_reg_eval_set) {
5948 if (PL_reg_oldsaved) {
5949 rex->subbeg = PL_reg_oldsaved;
5950 rex->sublen = PL_reg_oldsavedlen;
5951 #ifdef PERL_OLD_COPY_ON_WRITE
5952 rex->saved_copy = PL_nrs;
5954 RXp_MATCH_COPIED_on(rex);
5956 PL_reg_magic->mg_len = PL_reg_oldpos;
5957 PL_reg_eval_set = 0;
5958 PL_curpm = PL_reg_oldcurpm;
5963 S_to_utf8_substr(pTHX_ register regexp *prog)
5967 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
5970 if (prog->substrs->data[i].substr
5971 && !prog->substrs->data[i].utf8_substr) {
5972 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5973 prog->substrs->data[i].utf8_substr = sv;
5974 sv_utf8_upgrade(sv);
5975 if (SvVALID(prog->substrs->data[i].substr)) {
5976 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5977 if (flags & FBMcf_TAIL) {
5978 /* Trim the trailing \n that fbm_compile added last
5980 SvCUR_set(sv, SvCUR(sv) - 1);
5981 /* Whilst this makes the SV technically "invalid" (as its
5982 buffer is no longer followed by "\0") when fbm_compile()
5983 adds the "\n" back, a "\0" is restored. */
5985 fbm_compile(sv, flags);
5987 if (prog->substrs->data[i].substr == prog->check_substr)
5988 prog->check_utf8 = sv;
5994 S_to_byte_substr(pTHX_ register regexp *prog)
5999 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6002 if (prog->substrs->data[i].utf8_substr
6003 && !prog->substrs->data[i].substr) {
6004 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6005 if (sv_utf8_downgrade(sv, TRUE)) {
6006 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6008 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6009 if (flags & FBMcf_TAIL) {
6010 /* Trim the trailing \n that fbm_compile added last
6012 SvCUR_set(sv, SvCUR(sv) - 1);
6014 fbm_compile(sv, flags);
6020 prog->substrs->data[i].substr = sv;
6021 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6022 prog->check_substr = sv;
6029 * c-indentation-style: bsd
6031 * indent-tabs-mode: t
6034 * ex: set ts=8 sts=4 sw=4 noet: