5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 7
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regoffs[p].end);
193 SSPUSHINT(PL_regoffs[p].start);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regoffs[p].start,
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
200 (IV)PL_regoffs[p].end
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regoffs);
205 SSPUSHINT(PL_regsize);
206 SSPUSHINT(*PL_reglastparen);
207 SSPUSHINT(*PL_reglastcloseparen);
208 SSPUSHPTR(PL_reginput);
209 #define REGCP_FRAME_ELEMS 2
210 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
211 * are needed for the regexp context stack bookkeeping. */
212 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
213 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
218 /* These are needed since we do not localize EVAL nodes: */
219 #define REGCP_SET(cp) \
221 PerlIO_printf(Perl_debug_log, \
222 " Setting an EVAL scope, savestack=%"IVdf"\n", \
223 (IV)PL_savestack_ix)); \
226 #define REGCP_UNWIND(cp) \
228 if (cp != PL_savestack_ix) \
229 PerlIO_printf(Perl_debug_log, \
230 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
231 (IV)(cp), (IV)PL_savestack_ix)); \
235 S_regcppop(pTHX_ const regexp *rex)
241 GET_RE_DEBUG_FLAGS_DECL;
243 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
245 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
246 i = SSPOPINT; /* Parentheses elements to pop. */
247 input = (char *) SSPOPPTR;
248 *PL_reglastcloseparen = SSPOPINT;
249 *PL_reglastparen = SSPOPINT;
250 PL_regsize = SSPOPINT;
251 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
254 /* Now restore the parentheses context. */
255 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
256 i > 0; i -= REGCP_PAREN_ELEMS) {
258 U32 paren = (U32)SSPOPINT;
259 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
260 PL_regoffs[paren].start = SSPOPINT;
262 if (paren <= *PL_reglastparen)
263 PL_regoffs[paren].end = tmps;
265 PerlIO_printf(Perl_debug_log,
266 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
267 (UV)paren, (IV)PL_regoffs[paren].start,
268 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
269 (IV)PL_regoffs[paren].end,
270 (paren > *PL_reglastparen ? "(no)" : ""));
274 if (*PL_reglastparen + 1 <= rex->nparens) {
275 PerlIO_printf(Perl_debug_log,
276 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
277 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
281 /* It would seem that the similar code in regtry()
282 * already takes care of this, and in fact it is in
283 * a better location to since this code can #if 0-ed out
284 * but the code in regtry() is needed or otherwise tests
285 * requiring null fields (pat.t#187 and split.t#{13,14}
286 * (as of patchlevel 7877) will fail. Then again,
287 * this code seems to be necessary or otherwise
288 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
289 * --jhi updated by dapm */
290 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
292 PL_regoffs[i].start = -1;
293 PL_regoffs[i].end = -1;
299 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
302 * pregexec and friends
305 #ifndef PERL_IN_XSUB_RE
307 - pregexec - match a regexp against a string
310 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
311 char *strbeg, I32 minend, SV *screamer, U32 nosave)
312 /* strend: pointer to null at end of string */
313 /* strbeg: real beginning of string */
314 /* minend: end of match must be >=minend after stringarg. */
315 /* nosave: For optimizations. */
318 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
319 nosave ? 0 : REXEC_COPY_STR);
324 * Need to implement the following flags for reg_anch:
326 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
328 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
329 * INTUIT_AUTORITATIVE_ML
330 * INTUIT_ONCE_NOML - Intuit can match in one location only.
333 * Another flag for this function: SECOND_TIME (so that float substrs
334 * with giant delta may be not rechecked).
337 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
339 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
340 Otherwise, only SvCUR(sv) is used to get strbeg. */
342 /* XXXX We assume that strpos is strbeg unless sv. */
344 /* XXXX Some places assume that there is a fixed substring.
345 An update may be needed if optimizer marks as "INTUITable"
346 RExen without fixed substrings. Similarly, it is assumed that
347 lengths of all the strings are no more than minlen, thus they
348 cannot come from lookahead.
349 (Or minlen should take into account lookahead.)
350 NOTE: Some of this comment is not correct. minlen does now take account
351 of lookahead/behind. Further research is required. -- demerphq
355 /* A failure to find a constant substring means that there is no need to make
356 an expensive call to REx engine, thus we celebrate a failure. Similarly,
357 finding a substring too deep into the string means that less calls to
358 regtry() should be needed.
360 REx compiler's optimizer found 4 possible hints:
361 a) Anchored substring;
363 c) Whether we are anchored (beginning-of-line or \G);
364 d) First node (of those at offset 0) which may distingush positions;
365 We use a)b)d) and multiline-part of c), and try to find a position in the
366 string which does not contradict any of them.
369 /* Most of decisions we do here should have been done at compile time.
370 The nodes of the REx which we used for the search should have been
371 deleted from the finite automaton. */
374 Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
375 char *strend, const U32 flags, re_scream_pos_data *data)
378 register I32 start_shift = 0;
379 /* Should be nonnegative! */
380 register I32 end_shift = 0;
385 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
387 register char *other_last = NULL; /* other substr checked before this */
388 char *check_at = NULL; /* check substr found at this pos */
389 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
390 RXi_GET_DECL(prog,progi);
392 const char * const i_strpos = strpos;
395 GET_RE_DEBUG_FLAGS_DECL;
397 RX_MATCH_UTF8_set(prog,do_utf8);
399 if (prog->extflags & RXf_UTF8) {
400 PL_reg_flags |= RF_utf8;
403 debug_start_match(prog, do_utf8, strpos, strend,
404 sv ? "Guessing start of match in sv for"
405 : "Guessing start of match in string for");
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
411 "String too short... [re_intuit_start]\n"));
415 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
418 if (!prog->check_utf8 && prog->check_substr)
419 to_utf8_substr(prog);
420 check = prog->check_utf8;
422 if (!prog->check_substr && prog->check_utf8)
423 to_byte_substr(prog);
424 check = prog->check_substr;
426 if (check == &PL_sv_undef) {
427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
428 "Non-utf8 string cannot match utf8 check string\n"));
431 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
432 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
433 || ( (prog->extflags & RXf_ANCH_BOL)
434 && !multiline ) ); /* Check after \n? */
437 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
438 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
439 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
441 && (strpos != strbeg)) {
442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
445 if (prog->check_offset_min == prog->check_offset_max &&
446 !(prog->extflags & RXf_CANY_SEEN)) {
447 /* Substring at constant offset from beg-of-str... */
450 s = HOP3c(strpos, prog->check_offset_min, strend);
453 slen = SvCUR(check); /* >= 1 */
455 if ( strend - s > slen || strend - s < slen - 1
456 || (strend - s == slen && strend[-1] != '\n')) {
457 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
460 /* Now should match s[0..slen-2] */
462 if (slen && (*SvPVX_const(check) != *s
464 && memNE(SvPVX_const(check), s, slen)))) {
466 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
470 else if (*SvPVX_const(check) != *s
471 || ((slen = SvCUR(check)) > 1
472 && memNE(SvPVX_const(check), s, slen)))
475 goto success_at_start;
478 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
480 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
481 end_shift = prog->check_end_shift;
484 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
485 - (SvTAIL(check) != 0);
486 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
488 if (end_shift < eshift)
492 else { /* Can match at random position */
495 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
496 end_shift = prog->check_end_shift;
498 /* end shift should be non negative here */
501 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
503 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
504 (IV)end_shift, prog->precomp);
508 /* Find a possible match in the region s..strend by looking for
509 the "check" substring in the region corrected by start/end_shift. */
512 I32 srch_start_shift = start_shift;
513 I32 srch_end_shift = end_shift;
514 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
515 srch_end_shift -= ((strbeg - s) - srch_start_shift);
516 srch_start_shift = strbeg - s;
518 DEBUG_OPTIMISE_MORE_r({
519 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
520 (IV)prog->check_offset_min,
521 (IV)srch_start_shift,
523 (IV)prog->check_end_shift);
526 if (flags & REXEC_SCREAM) {
527 I32 p = -1; /* Internal iterator of scream. */
528 I32 * const pp = data ? data->scream_pos : &p;
530 if (PL_screamfirst[BmRARE(check)] >= 0
531 || ( BmRARE(check) == '\n'
532 && (BmPREVIOUS(check) == SvCUR(check) - 1)
534 s = screaminstr(sv, check,
535 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
538 /* we may be pointing at the wrong string */
539 if (s && RX_MATCH_COPIED(prog))
540 s = strbeg + (s - SvPVX_const(sv));
542 *data->scream_olds = s;
547 if (prog->extflags & RXf_CANY_SEEN) {
548 start_point= (U8*)(s + srch_start_shift);
549 end_point= (U8*)(strend - srch_end_shift);
551 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
552 end_point= HOP3(strend, -srch_end_shift, strbeg);
554 DEBUG_OPTIMISE_MORE_r({
555 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
556 (int)(end_point - start_point),
557 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
561 s = fbm_instr( start_point, end_point,
562 check, multiline ? FBMrf_MULTILINE : 0);
565 /* Update the count-of-usability, remove useless subpatterns,
569 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
570 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
571 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
572 (s ? "Found" : "Did not find"),
573 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
574 ? "anchored" : "floating"),
577 (s ? " at offset " : "...\n") );
582 /* Finish the diagnostic message */
583 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
585 /* XXX dmq: first branch is for positive lookbehind...
586 Our check string is offset from the beginning of the pattern.
587 So we need to do any stclass tests offset forward from that
596 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
597 Start with the other substr.
598 XXXX no SCREAM optimization yet - and a very coarse implementation
599 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
600 *always* match. Probably should be marked during compile...
601 Probably it is right to do no SCREAM here...
604 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
605 : (prog->float_substr && prog->anchored_substr))
607 /* Take into account the "other" substring. */
608 /* XXXX May be hopelessly wrong for UTF... */
611 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
614 char * const last = HOP3c(s, -start_shift, strbeg);
616 char * const saved_s = s;
619 t = s - prog->check_offset_max;
620 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
622 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
627 t = HOP3c(t, prog->anchored_offset, strend);
628 if (t < other_last) /* These positions already checked */
630 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
633 /* XXXX It is not documented what units *_offsets are in.
634 We assume bytes, but this is clearly wrong.
635 Meaning this code needs to be carefully reviewed for errors.
639 /* On end-of-str: see comment below. */
640 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
641 if (must == &PL_sv_undef) {
643 DEBUG_r(must = prog->anchored_utf8); /* for debug */
648 HOP3(HOP3(last1, prog->anchored_offset, strend)
649 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
651 multiline ? FBMrf_MULTILINE : 0
654 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
655 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
656 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
657 (s ? "Found" : "Contradicts"),
658 quoted, RE_SV_TAIL(must));
663 if (last1 >= last2) {
664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
665 ", giving up...\n"));
668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
669 ", trying floating at offset %ld...\n",
670 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
671 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
672 s = HOP3c(last, 1, strend);
676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
677 (long)(s - i_strpos)));
678 t = HOP3c(s, -prog->anchored_offset, strbeg);
679 other_last = HOP3c(s, 1, strend);
687 else { /* Take into account the floating substring. */
689 char * const saved_s = s;
692 t = HOP3c(s, -start_shift, strbeg);
694 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
695 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
696 last = HOP3c(t, prog->float_max_offset, strend);
697 s = HOP3c(t, prog->float_min_offset, strend);
700 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
701 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
702 /* fbm_instr() takes into account exact value of end-of-str
703 if the check is SvTAIL(ed). Since false positives are OK,
704 and end-of-str is not later than strend we are OK. */
705 if (must == &PL_sv_undef) {
707 DEBUG_r(must = prog->float_utf8); /* for debug message */
710 s = fbm_instr((unsigned char*)s,
711 (unsigned char*)last + SvCUR(must)
713 must, multiline ? FBMrf_MULTILINE : 0);
715 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
716 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
717 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
718 (s ? "Found" : "Contradicts"),
719 quoted, RE_SV_TAIL(must));
723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
724 ", giving up...\n"));
727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
728 ", trying anchored starting at offset %ld...\n",
729 (long)(saved_s + 1 - i_strpos)));
731 s = HOP3c(t, 1, strend);
735 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
736 (long)(s - i_strpos)));
737 other_last = s; /* Fix this later. --Hugo */
747 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
749 DEBUG_OPTIMISE_MORE_r(
750 PerlIO_printf(Perl_debug_log,
751 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
752 (IV)prog->check_offset_min,
753 (IV)prog->check_offset_max,
761 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
763 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
766 /* Fixed substring is found far enough so that the match
767 cannot start at strpos. */
769 if (ml_anch && t[-1] != '\n') {
770 /* Eventually fbm_*() should handle this, but often
771 anchored_offset is not 0, so this check will not be wasted. */
772 /* XXXX In the code below we prefer to look for "^" even in
773 presence of anchored substrings. And we search even
774 beyond the found float position. These pessimizations
775 are historical artefacts only. */
777 while (t < strend - prog->minlen) {
779 if (t < check_at - prog->check_offset_min) {
780 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
781 /* Since we moved from the found position,
782 we definitely contradict the found anchored
783 substr. Due to the above check we do not
784 contradict "check" substr.
785 Thus we can arrive here only if check substr
786 is float. Redo checking for "other"=="fixed".
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
790 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
791 goto do_other_anchored;
793 /* We don't contradict the found floating substring. */
794 /* XXXX Why not check for STCLASS? */
796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
797 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
800 /* Position contradicts check-string */
801 /* XXXX probably better to look for check-string
802 than for "\n", so one should lower the limit for t? */
803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
804 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
805 other_last = strpos = s = t + 1;
810 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
811 PL_colors[0], PL_colors[1]));
815 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
816 PL_colors[0], PL_colors[1]));
820 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
823 /* The found string does not prohibit matching at strpos,
824 - no optimization of calling REx engine can be performed,
825 unless it was an MBOL and we are not after MBOL,
826 or a future STCLASS check will fail this. */
828 /* Even in this situation we may use MBOL flag if strpos is offset
829 wrt the start of the string. */
830 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
831 && (strpos != strbeg) && strpos[-1] != '\n'
832 /* May be due to an implicit anchor of m{.*foo} */
833 && !(prog->intflags & PREGf_IMPLICIT))
838 DEBUG_EXECUTE_r( if (ml_anch)
839 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
840 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
843 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
845 prog->check_utf8 /* Could be deleted already */
846 && --BmUSEFUL(prog->check_utf8) < 0
847 && (prog->check_utf8 == prog->float_utf8)
849 prog->check_substr /* Could be deleted already */
850 && --BmUSEFUL(prog->check_substr) < 0
851 && (prog->check_substr == prog->float_substr)
854 /* If flags & SOMETHING - do not do it many times on the same match */
855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
856 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
857 if (do_utf8 ? prog->check_substr : prog->check_utf8)
858 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
859 prog->check_substr = prog->check_utf8 = NULL; /* disable */
860 prog->float_substr = prog->float_utf8 = NULL; /* clear */
861 check = NULL; /* abort */
863 /* XXXX This is a remnant of the old implementation. It
864 looks wasteful, since now INTUIT can use many
866 prog->extflags &= ~RXf_USE_INTUIT;
873 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
874 /* trie stclasses are too expensive to use here, we are better off to
875 leave it to regmatch itself */
876 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
877 /* minlen == 0 is possible if regstclass is \b or \B,
878 and the fixed substr is ''$.
879 Since minlen is already taken into account, s+1 is before strend;
880 accidentally, minlen >= 1 guaranties no false positives at s + 1
881 even for \b or \B. But (minlen? 1 : 0) below assumes that
882 regstclass does not come from lookahead... */
883 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
884 This leaves EXACTF only, which is dealt with in find_byclass(). */
885 const U8* const str = (U8*)STRING(progi->regstclass);
886 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
887 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
890 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
891 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
892 else if (prog->float_substr || prog->float_utf8)
893 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
898 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
901 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
904 const char *what = NULL;
906 if (endpos == strend) {
907 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
908 "Could not match STCLASS...\n") );
911 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
912 "This position contradicts STCLASS...\n") );
913 if ((prog->extflags & RXf_ANCH) && !ml_anch)
915 /* Contradict one of substrings */
916 if (prog->anchored_substr || prog->anchored_utf8) {
917 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
918 DEBUG_EXECUTE_r( what = "anchored" );
920 s = HOP3c(t, 1, strend);
921 if (s + start_shift + end_shift > strend) {
922 /* XXXX Should be taken into account earlier? */
923 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
924 "Could not match STCLASS...\n") );
929 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
930 "Looking for %s substr starting at offset %ld...\n",
931 what, (long)(s + start_shift - i_strpos)) );
934 /* Have both, check_string is floating */
935 if (t + start_shift >= check_at) /* Contradicts floating=check */
936 goto retry_floating_check;
937 /* Recheck anchored substring, but not floating... */
941 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
942 "Looking for anchored substr starting at offset %ld...\n",
943 (long)(other_last - i_strpos)) );
944 goto do_other_anchored;
946 /* Another way we could have checked stclass at the
947 current position only: */
952 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
953 "Looking for /%s^%s/m starting at offset %ld...\n",
954 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
957 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
959 /* Check is floating subtring. */
960 retry_floating_check:
961 t = check_at - start_shift;
962 DEBUG_EXECUTE_r( what = "floating" );
963 goto hop_and_restart;
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
967 "By STCLASS: moving %ld --> %ld\n",
968 (long)(t - i_strpos), (long)(s - i_strpos))
972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
973 "Does not contradict STCLASS...\n");
978 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
979 PL_colors[4], (check ? "Guessed" : "Giving up"),
980 PL_colors[5], (long)(s - i_strpos)) );
983 fail_finish: /* Substring not found */
984 if (prog->check_substr || prog->check_utf8) /* could be removed already */
985 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
988 PL_colors[4], PL_colors[5]));
994 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
995 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
996 switch (trie_type) { \
997 case trie_utf8_fold: \
999 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1004 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1005 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1006 foldlen -= UNISKIP( uvc ); \
1007 uscan = foldbuf + UNISKIP( uvc ); \
1011 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1019 charid = trie->charmap[ uvc ]; \
1023 if (widecharmap) { \
1024 SV** const svpp = hv_fetch(widecharmap, \
1025 (char*)&uvc, sizeof(UV), 0); \
1027 charid = (U16)SvIV(*svpp); \
1032 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1035 ibcmp_utf8(s, NULL, 0, do_utf8, \
1036 m, NULL, ln, (bool)UTF)) \
1037 && (!reginfo || regtry(reginfo, &s)) ) \
1040 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1041 uvchr_to_utf8(tmpbuf, c); \
1042 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1044 && (f == c1 || f == c2) \
1045 && (ln == foldlen || \
1046 !ibcmp_utf8((char *) foldbuf, \
1047 NULL, foldlen, do_utf8, \
1049 NULL, ln, (bool)UTF)) \
1050 && (!reginfo || regtry(reginfo, &s)) ) \
1055 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1059 && (ln == 1 || !(OP(c) == EXACTF \
1061 : ibcmp_locale(s, m, ln))) \
1062 && (!reginfo || regtry(reginfo, &s)) ) \
1068 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1070 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1076 #define REXEC_FBC_SCAN(CoDe) \
1078 while (s < strend) { \
1084 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1085 REXEC_FBC_UTF8_SCAN( \
1087 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1096 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1099 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1108 #define REXEC_FBC_TRYIT \
1109 if ((!reginfo || regtry(reginfo, &s))) \
1112 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1114 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1117 REXEC_FBC_CLASS_SCAN(CoNd); \
1121 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1124 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1127 REXEC_FBC_CLASS_SCAN(CoNd); \
1131 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1132 PL_reg_flags |= RF_tainted; \
1134 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1137 REXEC_FBC_CLASS_SCAN(CoNd); \
1141 #define DUMP_EXEC_POS(li,s,doutf8) \
1142 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1144 /* We know what class REx starts with. Try to find this position... */
1145 /* if reginfo is NULL, its a dryrun */
1146 /* annoyingly all the vars in this routine have different names from their counterparts
1147 in regmatch. /grrr */
1150 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1151 const char *strend, regmatch_info *reginfo)
1154 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1158 register STRLEN uskip;
1162 register I32 tmp = 1; /* Scratch variable? */
1163 register const bool do_utf8 = PL_reg_match_utf8;
1164 RXi_GET_DECL(prog,progi);
1166 /* We know what class it must start with. */
1170 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1171 !UTF8_IS_INVARIANT((U8)s[0]) ?
1172 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1173 REGINCLASS(prog, c, (U8*)s));
1176 while (s < strend) {
1179 if (REGINCLASS(prog, c, (U8*)s) ||
1180 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1181 /* The assignment of 2 is intentional:
1182 * for the folded sharp s, the skip is 2. */
1183 (skip = SHARP_S_SKIP))) {
1184 if (tmp && (!reginfo || regtry(reginfo, &s)))
1197 if (tmp && (!reginfo || regtry(reginfo, &s)))
1205 ln = STR_LEN(c); /* length to match in octets/bytes */
1206 lnc = (I32) ln; /* length to match in characters */
1208 STRLEN ulen1, ulen2;
1210 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1211 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1212 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1214 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1215 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1217 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1219 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1222 while (sm < ((U8 *) m + ln)) {
1237 c2 = PL_fold_locale[c1];
1239 e = HOP3c(strend, -((I32)lnc), s);
1241 if (!reginfo && e < s)
1242 e = s; /* Due to minlen logic of intuit() */
1244 /* The idea in the EXACTF* cases is to first find the
1245 * first character of the EXACTF* node and then, if
1246 * necessary, case-insensitively compare the full
1247 * text of the node. The c1 and c2 are the first
1248 * characters (though in Unicode it gets a bit
1249 * more complicated because there are more cases
1250 * than just upper and lower: one needs to use
1251 * the so-called folding case for case-insensitive
1252 * matching (called "loose matching" in Unicode).
1253 * ibcmp_utf8() will do just that. */
1257 U8 tmpbuf [UTF8_MAXBYTES+1];
1258 STRLEN len, foldlen;
1259 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1261 /* Upper and lower of 1st char are equal -
1262 * probably not a "letter". */
1264 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1266 REXEC_FBC_EXACTISH_CHECK(c == c1);
1271 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1274 /* Handle some of the three Greek sigmas cases.
1275 * Note that not all the possible combinations
1276 * are handled here: some of them are handled
1277 * by the standard folding rules, and some of
1278 * them (the character class or ANYOF cases)
1279 * are handled during compiletime in
1280 * regexec.c:S_regclass(). */
1281 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1282 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1283 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1285 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1291 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1293 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1297 PL_reg_flags |= RF_tainted;
1304 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1305 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1307 tmp = ((OP(c) == BOUND ?
1308 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1309 LOAD_UTF8_CHARCLASS_ALNUM();
1310 REXEC_FBC_UTF8_SCAN(
1311 if (tmp == !(OP(c) == BOUND ?
1312 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1313 isALNUM_LC_utf8((U8*)s)))
1321 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1322 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1325 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1331 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1335 PL_reg_flags |= RF_tainted;
1342 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1343 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1345 tmp = ((OP(c) == NBOUND ?
1346 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1347 LOAD_UTF8_CHARCLASS_ALNUM();
1348 REXEC_FBC_UTF8_SCAN(
1349 if (tmp == !(OP(c) == NBOUND ?
1350 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1351 isALNUM_LC_utf8((U8*)s)))
1353 else REXEC_FBC_TRYIT;
1357 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1358 tmp = ((OP(c) == NBOUND ?
1359 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1362 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1364 else REXEC_FBC_TRYIT;
1367 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1371 REXEC_FBC_CSCAN_PRELOAD(
1372 LOAD_UTF8_CHARCLASS_ALNUM(),
1373 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1377 REXEC_FBC_CSCAN_TAINT(
1378 isALNUM_LC_utf8((U8*)s),
1382 REXEC_FBC_CSCAN_PRELOAD(
1383 LOAD_UTF8_CHARCLASS_ALNUM(),
1384 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1388 REXEC_FBC_CSCAN_TAINT(
1389 !isALNUM_LC_utf8((U8*)s),
1393 REXEC_FBC_CSCAN_PRELOAD(
1394 LOAD_UTF8_CHARCLASS_SPACE(),
1395 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1399 REXEC_FBC_CSCAN_TAINT(
1400 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1404 REXEC_FBC_CSCAN_PRELOAD(
1405 LOAD_UTF8_CHARCLASS_SPACE(),
1406 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1410 REXEC_FBC_CSCAN_TAINT(
1411 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1415 REXEC_FBC_CSCAN_PRELOAD(
1416 LOAD_UTF8_CHARCLASS_DIGIT(),
1417 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1421 REXEC_FBC_CSCAN_TAINT(
1422 isDIGIT_LC_utf8((U8*)s),
1426 REXEC_FBC_CSCAN_PRELOAD(
1427 LOAD_UTF8_CHARCLASS_DIGIT(),
1428 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1432 REXEC_FBC_CSCAN_TAINT(
1433 !isDIGIT_LC_utf8((U8*)s),
1439 is_LNBREAK_latin1(s)
1449 !is_VERTWS_latin1(s)
1454 is_HORIZWS_latin1(s)
1458 !is_HORIZWS_utf8(s),
1459 !is_HORIZWS_latin1(s)
1464 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1465 trie_type = do_utf8 ?
1466 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1468 /* what trie are we using right now */
1470 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1472 = (reg_trie_data*)progi->data->data[ aho->trie ];
1473 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1475 const char *last_start = strend - trie->minlen;
1477 const char *real_start = s;
1479 STRLEN maxlen = trie->maxlen;
1481 U8 **points; /* map of where we were in the input string
1482 when reading a given char. For ASCII this
1483 is unnecessary overhead as the relationship
1484 is always 1:1, but for Unicode, especially
1485 case folded Unicode this is not true. */
1486 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1490 GET_RE_DEBUG_FLAGS_DECL;
1492 /* We can't just allocate points here. We need to wrap it in
1493 * an SV so it gets freed properly if there is a croak while
1494 * running the match */
1497 sv_points=newSV(maxlen * sizeof(U8 *));
1498 SvCUR_set(sv_points,
1499 maxlen * sizeof(U8 *));
1500 SvPOK_on(sv_points);
1501 sv_2mortal(sv_points);
1502 points=(U8**)SvPV_nolen(sv_points );
1503 if ( trie_type != trie_utf8_fold
1504 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1507 bitmap=(U8*)trie->bitmap;
1509 bitmap=(U8*)ANYOF_BITMAP(c);
1511 /* this is the Aho-Corasick algorithm modified a touch
1512 to include special handling for long "unknown char"
1513 sequences. The basic idea being that we use AC as long
1514 as we are dealing with a possible matching char, when
1515 we encounter an unknown char (and we have not encountered
1516 an accepting state) we scan forward until we find a legal
1518 AC matching is basically that of trie matching, except
1519 that when we encounter a failing transition, we fall back
1520 to the current states "fail state", and try the current char
1521 again, a process we repeat until we reach the root state,
1522 state 1, or a legal transition. If we fail on the root state
1523 then we can either terminate if we have reached an accepting
1524 state previously, or restart the entire process from the beginning
1528 while (s <= last_start) {
1529 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1537 U8 *uscan = (U8*)NULL;
1538 U8 *leftmost = NULL;
1540 U32 accepted_word= 0;
1544 while ( state && uc <= (U8*)strend ) {
1546 U32 word = aho->states[ state ].wordnum;
1550 DEBUG_TRIE_EXECUTE_r(
1551 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1552 dump_exec_pos( (char *)uc, c, strend, real_start,
1553 (char *)uc, do_utf8 );
1554 PerlIO_printf( Perl_debug_log,
1555 " Scanning for legal start char...\n");
1558 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1563 if (uc >(U8*)last_start) break;
1567 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1568 if (!leftmost || lpos < leftmost) {
1569 DEBUG_r(accepted_word=word);
1575 points[pointpos++ % maxlen]= uc;
1576 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1577 uscan, len, uvc, charid, foldlen,
1579 DEBUG_TRIE_EXECUTE_r({
1580 dump_exec_pos( (char *)uc, c, strend, real_start,
1582 PerlIO_printf(Perl_debug_log,
1583 " Charid:%3u CP:%4"UVxf" ",
1589 word = aho->states[ state ].wordnum;
1591 base = aho->states[ state ].trans.base;
1593 DEBUG_TRIE_EXECUTE_r({
1595 dump_exec_pos( (char *)uc, c, strend, real_start,
1597 PerlIO_printf( Perl_debug_log,
1598 "%sState: %4"UVxf", word=%"UVxf,
1599 failed ? " Fail transition to " : "",
1600 (UV)state, (UV)word);
1605 (base + charid > trie->uniquecharcount )
1606 && (base + charid - 1 - trie->uniquecharcount
1608 && trie->trans[base + charid - 1 -
1609 trie->uniquecharcount].check == state
1610 && (tmp=trie->trans[base + charid - 1 -
1611 trie->uniquecharcount ].next))
1613 DEBUG_TRIE_EXECUTE_r(
1614 PerlIO_printf( Perl_debug_log," - legal\n"));
1619 DEBUG_TRIE_EXECUTE_r(
1620 PerlIO_printf( Perl_debug_log," - fail\n"));
1622 state = aho->fail[state];
1626 /* we must be accepting here */
1627 DEBUG_TRIE_EXECUTE_r(
1628 PerlIO_printf( Perl_debug_log," - accepting\n"));
1637 if (!state) state = 1;
1640 if ( aho->states[ state ].wordnum ) {
1641 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1642 if (!leftmost || lpos < leftmost) {
1643 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1648 s = (char*)leftmost;
1649 DEBUG_TRIE_EXECUTE_r({
1651 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1652 (UV)accepted_word, (IV)(s - real_start)
1655 if (!reginfo || regtry(reginfo, &s)) {
1661 DEBUG_TRIE_EXECUTE_r({
1662 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1665 DEBUG_TRIE_EXECUTE_r(
1666 PerlIO_printf( Perl_debug_log,"No match.\n"));
1675 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1684 S_swap_match_buff (pTHX_ regexp *prog) {
1685 regexp_paren_pair *t;
1688 /* We have to be careful. If the previous successful match
1689 was from this regex we don't want a subsequent paritally
1690 successful match to clobber the old results.
1691 So when we detect this possibility we add a swap buffer
1692 to the re, and switch the buffer each match. If we fail
1693 we switch it back, otherwise we leave it swapped.
1695 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
1698 prog->swap = prog->offs;
1704 - regexec_flags - match a regexp against a string
1707 Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
1708 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1709 /* strend: pointer to null at end of string */
1710 /* strbeg: real beginning of string */
1711 /* minend: end of match must be >=minend after stringarg. */
1712 /* data: May be used for some additional optimizations.
1713 Currently its only used, with a U32 cast, for transmitting
1714 the ganch offset when doing a /g match. This will change */
1715 /* nosave: For optimizations. */
1718 /*register*/ char *s;
1719 register regnode *c;
1720 /*register*/ char *startpos = stringarg;
1721 I32 minlen; /* must match at least this many chars */
1722 I32 dontbother = 0; /* how many characters not to try at end */
1723 I32 end_shift = 0; /* Same for the end. */ /* CC */
1724 I32 scream_pos = -1; /* Internal iterator of scream. */
1725 char *scream_olds = NULL;
1726 const bool do_utf8 = (bool)DO_UTF8(sv);
1728 RXi_GET_DECL(prog,progi);
1729 regmatch_info reginfo; /* create some info to pass to regtry etc */
1730 bool swap_on_fail = 0;
1732 GET_RE_DEBUG_FLAGS_DECL;
1734 PERL_UNUSED_ARG(data);
1736 /* Be paranoid... */
1737 if (prog == NULL || startpos == NULL) {
1738 Perl_croak(aTHX_ "NULL regexp parameter");
1742 multiline = prog->extflags & RXf_PMf_MULTILINE;
1743 reginfo.prog = prog;
1745 RX_MATCH_UTF8_set(prog, do_utf8);
1747 debug_start_match(prog, do_utf8, startpos, strend,
1751 minlen = prog->minlen;
1753 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1754 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1755 "String too short [regexec_flags]...\n"));
1760 /* Check validity of program. */
1761 if (UCHARAT(progi->program) != REG_MAGIC) {
1762 Perl_croak(aTHX_ "corrupted regexp program");
1766 PL_reg_eval_set = 0;
1769 if (prog->extflags & RXf_UTF8)
1770 PL_reg_flags |= RF_utf8;
1772 /* Mark beginning of line for ^ and lookbehind. */
1773 reginfo.bol = startpos; /* XXX not used ??? */
1777 /* Mark end of line for $ (and such) */
1780 /* see how far we have to get to not match where we matched before */
1781 reginfo.till = startpos+minend;
1783 /* If there is a "must appear" string, look for it. */
1786 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1789 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1790 reginfo.ganch = startpos + prog->gofs;
1791 else if (sv && SvTYPE(sv) >= SVt_PVMG
1793 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1794 && mg->mg_len >= 0) {
1795 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1796 if (prog->extflags & RXf_ANCH_GPOS) {
1797 if (s > reginfo.ganch)
1799 s = reginfo.ganch - prog->gofs;
1803 reginfo.ganch = strbeg + PTR2UV(data);
1804 } else /* pos() not defined */
1805 reginfo.ganch = strbeg;
1807 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1809 swap_match_buff(prog); /* do we need a save destructor here for
1812 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1813 re_scream_pos_data d;
1815 d.scream_olds = &scream_olds;
1816 d.scream_pos = &scream_pos;
1817 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1819 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1820 goto phooey; /* not present */
1826 /* Simplest case: anchored match need be tried only once. */
1827 /* [unless only anchor is BOL and multiline is set] */
1828 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1829 if (s == startpos && regtry(®info, &startpos))
1831 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1832 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1837 dontbother = minlen - 1;
1838 end = HOP3c(strend, -dontbother, strbeg) - 1;
1839 /* for multiline we only have to try after newlines */
1840 if (prog->check_substr || prog->check_utf8) {
1844 if (regtry(®info, &s))
1849 if (prog->extflags & RXf_USE_INTUIT) {
1850 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1861 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1862 if (regtry(®info, &s))
1869 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1871 /* the warning about reginfo.ganch being used without intialization
1872 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1873 and we only enter this block when the same bit is set. */
1874 char *tmp_s = reginfo.ganch - prog->gofs;
1875 if (regtry(®info, &tmp_s))
1880 /* Messy cases: unanchored match. */
1881 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1882 /* we have /x+whatever/ */
1883 /* it must be a one character string (XXXX Except UTF?) */
1888 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1889 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1890 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1895 DEBUG_EXECUTE_r( did_match = 1 );
1896 if (regtry(®info, &s)) goto got_it;
1898 while (s < strend && *s == ch)
1906 DEBUG_EXECUTE_r( did_match = 1 );
1907 if (regtry(®info, &s)) goto got_it;
1909 while (s < strend && *s == ch)
1914 DEBUG_EXECUTE_r(if (!did_match)
1915 PerlIO_printf(Perl_debug_log,
1916 "Did not find anchored character...\n")
1919 else if (prog->anchored_substr != NULL
1920 || prog->anchored_utf8 != NULL
1921 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1922 && prog->float_max_offset < strend - s)) {
1927 char *last1; /* Last position checked before */
1931 if (prog->anchored_substr || prog->anchored_utf8) {
1932 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1933 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1934 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1935 back_max = back_min = prog->anchored_offset;
1937 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1938 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1939 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1940 back_max = prog->float_max_offset;
1941 back_min = prog->float_min_offset;
1945 if (must == &PL_sv_undef)
1946 /* could not downgrade utf8 check substring, so must fail */
1952 last = HOP3c(strend, /* Cannot start after this */
1953 -(I32)(CHR_SVLEN(must)
1954 - (SvTAIL(must) != 0) + back_min), strbeg);
1957 last1 = HOPc(s, -1);
1959 last1 = s - 1; /* bogus */
1961 /* XXXX check_substr already used to find "s", can optimize if
1962 check_substr==must. */
1964 dontbother = end_shift;
1965 strend = HOPc(strend, -dontbother);
1966 while ( (s <= last) &&
1967 ((flags & REXEC_SCREAM)
1968 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1969 end_shift, &scream_pos, 0))
1970 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1971 (unsigned char*)strend, must,
1972 multiline ? FBMrf_MULTILINE : 0))) ) {
1973 /* we may be pointing at the wrong string */
1974 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1975 s = strbeg + (s - SvPVX_const(sv));
1976 DEBUG_EXECUTE_r( did_match = 1 );
1977 if (HOPc(s, -back_max) > last1) {
1978 last1 = HOPc(s, -back_min);
1979 s = HOPc(s, -back_max);
1982 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1984 last1 = HOPc(s, -back_min);
1988 while (s <= last1) {
1989 if (regtry(®info, &s))
1995 while (s <= last1) {
1996 if (regtry(®info, &s))
2002 DEBUG_EXECUTE_r(if (!did_match) {
2003 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2004 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2005 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2006 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2007 ? "anchored" : "floating"),
2008 quoted, RE_SV_TAIL(must));
2012 else if ( (c = progi->regstclass) ) {
2014 const OPCODE op = OP(progi->regstclass);
2015 /* don't bother with what can't match */
2016 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2017 strend = HOPc(strend, -(minlen - 1));
2020 SV * const prop = sv_newmortal();
2021 regprop(prog, prop, c);
2023 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2025 PerlIO_printf(Perl_debug_log,
2026 "Matching stclass %.*s against %s (%d chars)\n",
2027 (int)SvCUR(prop), SvPVX_const(prop),
2028 quoted, (int)(strend - s));
2031 if (find_byclass(prog, c, s, strend, ®info))
2033 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2037 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2042 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2043 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2044 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2046 if (flags & REXEC_SCREAM) {
2047 last = screaminstr(sv, float_real, s - strbeg,
2048 end_shift, &scream_pos, 1); /* last one */
2050 last = scream_olds; /* Only one occurrence. */
2051 /* we may be pointing at the wrong string */
2052 else if (RX_MATCH_COPIED(prog))
2053 s = strbeg + (s - SvPVX_const(sv));
2057 const char * const little = SvPV_const(float_real, len);
2059 if (SvTAIL(float_real)) {
2060 if (memEQ(strend - len + 1, little, len - 1))
2061 last = strend - len + 1;
2062 else if (!multiline)
2063 last = memEQ(strend - len, little, len)
2064 ? strend - len : NULL;
2070 last = rninstr(s, strend, little, little + len);
2072 last = strend; /* matching "$" */
2077 PerlIO_printf(Perl_debug_log,
2078 "%sCan't trim the tail, match fails (should not happen)%s\n",
2079 PL_colors[4], PL_colors[5]));
2080 goto phooey; /* Should not happen! */
2082 dontbother = strend - last + prog->float_min_offset;
2084 if (minlen && (dontbother < minlen))
2085 dontbother = minlen - 1;
2086 strend -= dontbother; /* this one's always in bytes! */
2087 /* We don't know much -- general case. */
2090 if (regtry(®info, &s))
2099 if (regtry(®info, &s))
2101 } while (s++ < strend);
2109 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2111 if (PL_reg_eval_set)
2112 restore_pos(aTHX_ prog);
2113 if (prog->paren_names)
2114 (void)hv_iterinit(prog->paren_names);
2116 /* make sure $`, $&, $', and $digit will work later */
2117 if ( !(flags & REXEC_NOT_FIRST) ) {
2118 RX_MATCH_COPY_FREE(prog);
2119 if (flags & REXEC_COPY_STR) {
2120 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2121 #ifdef PERL_OLD_COPY_ON_WRITE
2123 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2125 PerlIO_printf(Perl_debug_log,
2126 "Copy on write: regexp capture, type %d\n",
2129 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2130 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2131 assert (SvPOKp(prog->saved_copy));
2135 RX_MATCH_COPIED_on(prog);
2136 s = savepvn(strbeg, i);
2142 prog->subbeg = strbeg;
2143 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2151 PL_colors[4], PL_colors[5]));
2152 if (PL_reg_eval_set)
2153 restore_pos(aTHX_ prog);
2155 /* we failed :-( roll it back */
2156 swap_match_buff(prog);
2163 - regtry - try match at specific point
2165 STATIC I32 /* 0 failure, 1 success */
2166 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2170 regexp *prog = reginfo->prog;
2171 RXi_GET_DECL(prog,progi);
2172 GET_RE_DEBUG_FLAGS_DECL;
2173 reginfo->cutpoint=NULL;
2175 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2178 PL_reg_eval_set = RS_init;
2179 DEBUG_EXECUTE_r(DEBUG_s(
2180 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2181 (IV)(PL_stack_sp - PL_stack_base));
2184 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2185 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2187 /* Apparently this is not needed, judging by wantarray. */
2188 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2189 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2192 /* Make $_ available to executed code. */
2193 if (reginfo->sv != DEFSV) {
2195 DEFSV = reginfo->sv;
2198 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2199 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2200 /* prepare for quick setting of pos */
2201 #ifdef PERL_OLD_COPY_ON_WRITE
2202 if (SvIsCOW(reginfo->sv))
2203 sv_force_normal_flags(reginfo->sv, 0);
2205 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2206 &PL_vtbl_mglob, NULL, 0);
2210 PL_reg_oldpos = mg->mg_len;
2211 SAVEDESTRUCTOR_X(restore_pos, prog);
2213 if (!PL_reg_curpm) {
2214 Newxz(PL_reg_curpm, 1, PMOP);
2217 SV* const repointer = newSViv(0);
2218 /* so we know which PL_regex_padav element is PL_reg_curpm */
2219 SvFLAGS(repointer) |= SVf_BREAK;
2220 av_push(PL_regex_padav,repointer);
2221 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2222 PL_regex_pad = AvARRAY(PL_regex_padav);
2226 PM_SETRE(PL_reg_curpm, prog);
2227 PL_reg_oldcurpm = PL_curpm;
2228 PL_curpm = PL_reg_curpm;
2229 if (RX_MATCH_COPIED(prog)) {
2230 /* Here is a serious problem: we cannot rewrite subbeg,
2231 since it may be needed if this match fails. Thus
2232 $` inside (?{}) could fail... */
2233 PL_reg_oldsaved = prog->subbeg;
2234 PL_reg_oldsavedlen = prog->sublen;
2235 #ifdef PERL_OLD_COPY_ON_WRITE
2236 PL_nrs = prog->saved_copy;
2238 RX_MATCH_COPIED_off(prog);
2241 PL_reg_oldsaved = NULL;
2242 prog->subbeg = PL_bostr;
2243 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2245 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2246 prog->offs[0].start = *startpos - PL_bostr;
2247 PL_reginput = *startpos;
2248 PL_reglastparen = &prog->lastparen;
2249 PL_reglastcloseparen = &prog->lastcloseparen;
2250 prog->lastparen = 0;
2251 prog->lastcloseparen = 0;
2253 PL_regoffs = prog->offs;
2254 if (PL_reg_start_tmpl <= prog->nparens) {
2255 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2256 if(PL_reg_start_tmp)
2257 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2259 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2262 /* XXXX What this code is doing here?!!! There should be no need
2263 to do this again and again, PL_reglastparen should take care of
2266 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2267 * Actually, the code in regcppop() (which Ilya may be meaning by
2268 * PL_reglastparen), is not needed at all by the test suite
2269 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2270 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2271 * Meanwhile, this code *is* needed for the
2272 * above-mentioned test suite tests to succeed. The common theme
2273 * on those tests seems to be returning null fields from matches.
2274 * --jhi updated by dapm */
2276 if (prog->nparens) {
2277 regexp_paren_pair *pp = PL_regoffs;
2279 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2287 if (regmatch(reginfo, progi->program + 1)) {
2288 PL_regoffs[0].end = PL_reginput - PL_bostr;
2291 if (reginfo->cutpoint)
2292 *startpos= reginfo->cutpoint;
2293 REGCP_UNWIND(lastcp);
2298 #define sayYES goto yes
2299 #define sayNO goto no
2300 #define sayNO_SILENT goto no_silent
2302 /* we dont use STMT_START/END here because it leads to
2303 "unreachable code" warnings, which are bogus, but distracting. */
2304 #define CACHEsayNO \
2305 if (ST.cache_mask) \
2306 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2309 /* this is used to determine how far from the left messages like
2310 'failed...' are printed. It should be set such that messages
2311 are inline with the regop output that created them.
2313 #define REPORT_CODE_OFF 32
2316 /* Make sure there is a test for this +1 options in re_tests */
2317 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2319 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2320 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2322 #define SLAB_FIRST(s) (&(s)->states[0])
2323 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2325 /* grab a new slab and return the first slot in it */
2327 STATIC regmatch_state *
2330 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2333 regmatch_slab *s = PL_regmatch_slab->next;
2335 Newx(s, 1, regmatch_slab);
2336 s->prev = PL_regmatch_slab;
2338 PL_regmatch_slab->next = s;
2340 PL_regmatch_slab = s;
2341 return SLAB_FIRST(s);
2345 /* push a new state then goto it */
2347 #define PUSH_STATE_GOTO(state, node) \
2349 st->resume_state = state; \
2352 /* push a new state with success backtracking, then goto it */
2354 #define PUSH_YES_STATE_GOTO(state, node) \
2356 st->resume_state = state; \
2357 goto push_yes_state;
2363 regmatch() - main matching routine
2365 This is basically one big switch statement in a loop. We execute an op,
2366 set 'next' to point the next op, and continue. If we come to a point which
2367 we may need to backtrack to on failure such as (A|B|C), we push a
2368 backtrack state onto the backtrack stack. On failure, we pop the top
2369 state, and re-enter the loop at the state indicated. If there are no more
2370 states to pop, we return failure.
2372 Sometimes we also need to backtrack on success; for example /A+/, where
2373 after successfully matching one A, we need to go back and try to
2374 match another one; similarly for lookahead assertions: if the assertion
2375 completes successfully, we backtrack to the state just before the assertion
2376 and then carry on. In these cases, the pushed state is marked as
2377 'backtrack on success too'. This marking is in fact done by a chain of
2378 pointers, each pointing to the previous 'yes' state. On success, we pop to
2379 the nearest yes state, discarding any intermediate failure-only states.
2380 Sometimes a yes state is pushed just to force some cleanup code to be
2381 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2382 it to free the inner regex.
2384 Note that failure backtracking rewinds the cursor position, while
2385 success backtracking leaves it alone.
2387 A pattern is complete when the END op is executed, while a subpattern
2388 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2389 ops trigger the "pop to last yes state if any, otherwise return true"
2392 A common convention in this function is to use A and B to refer to the two
2393 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2394 the subpattern to be matched possibly multiple times, while B is the entire
2395 rest of the pattern. Variable and state names reflect this convention.
2397 The states in the main switch are the union of ops and failure/success of
2398 substates associated with with that op. For example, IFMATCH is the op
2399 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2400 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2401 successfully matched A and IFMATCH_A_fail is a state saying that we have
2402 just failed to match A. Resume states always come in pairs. The backtrack
2403 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2404 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2405 on success or failure.
2407 The struct that holds a backtracking state is actually a big union, with
2408 one variant for each major type of op. The variable st points to the
2409 top-most backtrack struct. To make the code clearer, within each
2410 block of code we #define ST to alias the relevant union.
2412 Here's a concrete example of a (vastly oversimplified) IFMATCH
2418 #define ST st->u.ifmatch
2420 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2421 ST.foo = ...; // some state we wish to save
2423 // push a yes backtrack state with a resume value of
2424 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2426 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2429 case IFMATCH_A: // we have successfully executed A; now continue with B
2431 bar = ST.foo; // do something with the preserved value
2434 case IFMATCH_A_fail: // A failed, so the assertion failed
2435 ...; // do some housekeeping, then ...
2436 sayNO; // propagate the failure
2443 For any old-timers reading this who are familiar with the old recursive
2444 approach, the code above is equivalent to:
2446 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2455 ...; // do some housekeeping, then ...
2456 sayNO; // propagate the failure
2459 The topmost backtrack state, pointed to by st, is usually free. If you
2460 want to claim it, populate any ST.foo fields in it with values you wish to
2461 save, then do one of
2463 PUSH_STATE_GOTO(resume_state, node);
2464 PUSH_YES_STATE_GOTO(resume_state, node);
2466 which sets that backtrack state's resume value to 'resume_state', pushes a
2467 new free entry to the top of the backtrack stack, then goes to 'node'.
2468 On backtracking, the free slot is popped, and the saved state becomes the
2469 new free state. An ST.foo field in this new top state can be temporarily
2470 accessed to retrieve values, but once the main loop is re-entered, it
2471 becomes available for reuse.
2473 Note that the depth of the backtrack stack constantly increases during the
2474 left-to-right execution of the pattern, rather than going up and down with
2475 the pattern nesting. For example the stack is at its maximum at Z at the
2476 end of the pattern, rather than at X in the following:
2478 /(((X)+)+)+....(Y)+....Z/
2480 The only exceptions to this are lookahead/behind assertions and the cut,
2481 (?>A), which pop all the backtrack states associated with A before
2484 Bascktrack state structs are allocated in slabs of about 4K in size.
2485 PL_regmatch_state and st always point to the currently active state,
2486 and PL_regmatch_slab points to the slab currently containing
2487 PL_regmatch_state. The first time regmatch() is called, the first slab is
2488 allocated, and is never freed until interpreter destruction. When the slab
2489 is full, a new one is allocated and chained to the end. At exit from
2490 regmatch(), slabs allocated since entry are freed.
2495 #define DEBUG_STATE_pp(pp) \
2497 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2498 PerlIO_printf(Perl_debug_log, \
2499 " %*s"pp" %s%s%s%s%s\n", \
2501 PL_reg_name[st->resume_state], \
2502 ((st==yes_state||st==mark_state) ? "[" : ""), \
2503 ((st==yes_state) ? "Y" : ""), \
2504 ((st==mark_state) ? "M" : ""), \
2505 ((st==yes_state||st==mark_state) ? "]" : "") \
2510 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2515 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2516 const char *start, const char *end, const char *blurb)
2518 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2522 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2523 prog->precomp, prog->prelen, 60);
2525 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2526 start, end - start, 60);
2528 PerlIO_printf(Perl_debug_log,
2529 "%s%s REx%s %s against %s\n",
2530 PL_colors[4], blurb, PL_colors[5], s0, s1);
2532 if (do_utf8||utf8_pat)
2533 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2534 utf8_pat ? "pattern" : "",
2535 utf8_pat && do_utf8 ? " and " : "",
2536 do_utf8 ? "string" : ""
2542 S_dump_exec_pos(pTHX_ const char *locinput,
2543 const regnode *scan,
2544 const char *loc_regeol,
2545 const char *loc_bostr,
2546 const char *loc_reg_starttry,
2549 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2550 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2551 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2552 /* The part of the string before starttry has one color
2553 (pref0_len chars), between starttry and current
2554 position another one (pref_len - pref0_len chars),
2555 after the current position the third one.
2556 We assume that pref0_len <= pref_len, otherwise we
2557 decrease pref0_len. */
2558 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2559 ? (5 + taill) - l : locinput - loc_bostr;
2562 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2564 pref0_len = pref_len - (locinput - loc_reg_starttry);
2565 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2566 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2567 ? (5 + taill) - pref_len : loc_regeol - locinput);
2568 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2572 if (pref0_len > pref_len)
2573 pref0_len = pref_len;
2575 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2577 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2578 (locinput - pref_len),pref0_len, 60, 4, 5);
2580 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2581 (locinput - pref_len + pref0_len),
2582 pref_len - pref0_len, 60, 2, 3);
2584 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2585 locinput, loc_regeol - locinput, 10, 0, 1);
2587 const STRLEN tlen=len0+len1+len2;
2588 PerlIO_printf(Perl_debug_log,
2589 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2590 (IV)(locinput - loc_bostr),
2593 (docolor ? "" : "> <"),
2595 (int)(tlen > 19 ? 0 : 19 - tlen),
2602 /* reg_check_named_buff_matched()
2603 * Checks to see if a named buffer has matched. The data array of
2604 * buffer numbers corresponding to the buffer is expected to reside
2605 * in the regexp->data->data array in the slot stored in the ARG() of
2606 * node involved. Note that this routine doesn't actually care about the
2607 * name, that information is not preserved from compilation to execution.
2608 * Returns the index of the leftmost defined buffer with the given name
2609 * or 0 if non of the buffers matched.
2612 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2614 RXi_GET_DECL(rex,rexi);
2615 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2616 I32 *nums=(I32*)SvPVX(sv_dat);
2617 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2618 if ((I32)*PL_reglastparen >= nums[n] &&
2619 PL_regoffs[nums[n]].end != -1)
2628 /* free all slabs above current one - called during LEAVE_SCOPE */
2631 S_clear_backtrack_stack(pTHX_ void *p)
2633 regmatch_slab *s = PL_regmatch_slab->next;
2638 PL_regmatch_slab->next = NULL;
2640 regmatch_slab * const osl = s;
2647 #define SETREX(Re1,Re2) \
2648 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2651 STATIC I32 /* 0 failure, 1 success */
2652 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2654 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2658 register const bool do_utf8 = PL_reg_match_utf8;
2659 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2661 regexp *rex = reginfo->prog;
2662 RXi_GET_DECL(rex,rexi);
2666 /* the current state. This is a cached copy of PL_regmatch_state */
2667 register regmatch_state *st;
2669 /* cache heavy used fields of st in registers */
2670 register regnode *scan;
2671 register regnode *next;
2672 register U32 n = 0; /* general value; init to avoid compiler warning */
2673 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2674 register char *locinput = PL_reginput;
2675 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2677 bool result = 0; /* return value of S_regmatch */
2678 int depth = 0; /* depth of backtrack stack */
2679 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2680 const U32 max_nochange_depth =
2681 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2682 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2684 regmatch_state *yes_state = NULL; /* state to pop to on success of
2686 /* mark_state piggy backs on the yes_state logic so that when we unwind
2687 the stack on success we can update the mark_state as we go */
2688 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2690 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2691 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2693 bool no_final = 0; /* prevent failure from backtracking? */
2694 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2695 char *startpoint = PL_reginput;
2696 SV *popmark = NULL; /* are we looking for a mark? */
2697 SV *sv_commit = NULL; /* last mark name seen in failure */
2698 SV *sv_yes_mark = NULL; /* last mark name we have seen
2699 during a successfull match */
2700 U32 lastopen = 0; /* last open we saw */
2701 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2703 SV* const oreplsv = GvSV(PL_replgv);
2706 /* these three flags are set by various ops to signal information to
2707 * the very next op. They have a useful lifetime of exactly one loop
2708 * iteration, and are not preserved or restored by state pushes/pops
2710 bool sw = 0; /* the condition value in (?(cond)a|b) */
2711 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2712 int logical = 0; /* the following EVAL is:
2716 or the following IFMATCH/UNLESSM is:
2717 false: plain (?=foo)
2718 true: used as a condition: (?(?=foo))
2722 GET_RE_DEBUG_FLAGS_DECL;
2725 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2726 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2728 /* on first ever call to regmatch, allocate first slab */
2729 if (!PL_regmatch_slab) {
2730 Newx(PL_regmatch_slab, 1, regmatch_slab);
2731 PL_regmatch_slab->prev = NULL;
2732 PL_regmatch_slab->next = NULL;
2733 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2736 oldsave = PL_savestack_ix;
2737 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2738 SAVEVPTR(PL_regmatch_slab);
2739 SAVEVPTR(PL_regmatch_state);
2741 /* grab next free state slot */
2742 st = ++PL_regmatch_state;
2743 if (st > SLAB_LAST(PL_regmatch_slab))
2744 st = PL_regmatch_state = S_push_slab(aTHX);
2746 /* Note that nextchr is a byte even in UTF */
2747 nextchr = UCHARAT(locinput);
2749 while (scan != NULL) {
2752 SV * const prop = sv_newmortal();
2753 regnode *rnext=regnext(scan);
2754 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2755 regprop(rex, prop, scan);
2757 PerlIO_printf(Perl_debug_log,
2758 "%3"IVdf":%*s%s(%"IVdf")\n",
2759 (IV)(scan - rexi->program), depth*2, "",
2761 (PL_regkind[OP(scan)] == END || !rnext) ?
2762 0 : (IV)(rnext - rexi->program));
2765 next = scan + NEXT_OFF(scan);
2768 state_num = OP(scan);
2771 switch (state_num) {
2773 if (locinput == PL_bostr)
2775 /* reginfo->till = reginfo->bol; */
2780 if (locinput == PL_bostr ||
2781 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2787 if (locinput == PL_bostr)
2791 if (locinput == reginfo->ganch)
2796 /* update the startpoint */
2797 st->u.keeper.val = PL_regoffs[0].start;
2798 PL_reginput = locinput;
2799 PL_regoffs[0].start = locinput - PL_bostr;
2800 PUSH_STATE_GOTO(KEEPS_next, next);
2802 case KEEPS_next_fail:
2803 /* rollback the start point change */
2804 PL_regoffs[0].start = st->u.keeper.val;
2810 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2815 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2817 if (PL_regeol - locinput > 1)
2821 if (PL_regeol != locinput)
2825 if (!nextchr && locinput >= PL_regeol)
2828 locinput += PL_utf8skip[nextchr];
2829 if (locinput > PL_regeol)
2831 nextchr = UCHARAT(locinput);
2834 nextchr = UCHARAT(++locinput);
2837 if (!nextchr && locinput >= PL_regeol)
2839 nextchr = UCHARAT(++locinput);
2842 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2845 locinput += PL_utf8skip[nextchr];
2846 if (locinput > PL_regeol)
2848 nextchr = UCHARAT(locinput);
2851 nextchr = UCHARAT(++locinput);
2855 #define ST st->u.trie
2857 /* In this case the charclass data is available inline so
2858 we can fail fast without a lot of extra overhead.
2860 if (scan->flags == EXACT || !do_utf8) {
2861 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2863 PerlIO_printf(Perl_debug_log,
2864 "%*s %sfailed to match trie start class...%s\n",
2865 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2874 /* what type of TRIE am I? (utf8 makes this contextual) */
2875 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2876 trie_type = do_utf8 ?
2877 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2880 /* what trie are we using right now */
2881 reg_trie_data * const trie
2882 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2883 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2884 U32 state = trie->startstate;
2886 if (trie->bitmap && trie_type != trie_utf8_fold &&
2887 !TRIE_BITMAP_TEST(trie,*locinput)
2889 if (trie->states[ state ].wordnum) {
2891 PerlIO_printf(Perl_debug_log,
2892 "%*s %smatched empty string...%s\n",
2893 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2898 PerlIO_printf(Perl_debug_log,
2899 "%*s %sfailed to match trie start class...%s\n",
2900 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2907 U8 *uc = ( U8* )locinput;
2911 U8 *uscan = (U8*)NULL;
2913 SV *sv_accept_buff = NULL;
2914 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2916 ST.accepted = 0; /* how many accepting states we have seen */
2918 ST.jump = trie->jump;
2921 traverse the TRIE keeping track of all accepting states
2922 we transition through until we get to a failing node.
2925 while ( state && uc <= (U8*)PL_regeol ) {
2926 U32 base = trie->states[ state ].trans.base;
2929 /* We use charid to hold the wordnum as we don't use it
2930 for charid until after we have done the wordnum logic.
2931 We define an alias just so that the wordnum logic reads
2934 #define got_wordnum charid
2935 got_wordnum = trie->states[ state ].wordnum;
2937 if ( got_wordnum ) {
2938 if ( ! ST.accepted ) {
2940 /* SAVETMPS; */ /* XXX is this necessary? dmq */
2941 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2942 sv_accept_buff=newSV(bufflen *
2943 sizeof(reg_trie_accepted) - 1);
2944 SvCUR_set(sv_accept_buff, 0);
2945 SvPOK_on(sv_accept_buff);
2946 sv_2mortal(sv_accept_buff);
2949 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2952 if (ST.accepted >= bufflen) {
2954 ST.accept_buff =(reg_trie_accepted*)
2955 SvGROW(sv_accept_buff,
2956 bufflen * sizeof(reg_trie_accepted));
2958 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2959 + sizeof(reg_trie_accepted));
2962 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2963 ST.accept_buff[ST.accepted].endpos = uc;
2965 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2969 DEBUG_TRIE_EXECUTE_r({
2970 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2971 PerlIO_printf( Perl_debug_log,
2972 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2973 2+depth * 2, "", PL_colors[4],
2974 (UV)state, (UV)ST.accepted );
2978 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2979 uscan, len, uvc, charid, foldlen,
2983 (base + charid > trie->uniquecharcount )
2984 && (base + charid - 1 - trie->uniquecharcount
2986 && trie->trans[base + charid - 1 -
2987 trie->uniquecharcount].check == state)
2989 state = trie->trans[base + charid - 1 -
2990 trie->uniquecharcount ].next;
3001 DEBUG_TRIE_EXECUTE_r(
3002 PerlIO_printf( Perl_debug_log,
3003 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3004 charid, uvc, (UV)state, PL_colors[5] );
3011 PerlIO_printf( Perl_debug_log,
3012 "%*s %sgot %"IVdf" possible matches%s\n",
3013 REPORT_CODE_OFF + depth * 2, "",
3014 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3017 goto trie_first_try; /* jump into the fail handler */
3019 case TRIE_next_fail: /* we failed - try next alterative */
3021 REGCP_UNWIND(ST.cp);
3022 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3023 PL_regoffs[n].end = -1;
3024 *PL_reglastparen = n;
3033 ST.lastparen = *PL_reglastparen;
3036 if ( ST.accepted == 1 ) {
3037 /* only one choice left - just continue */
3039 AV *const trie_words
3040 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3041 SV ** const tmp = av_fetch( trie_words,
3042 ST.accept_buff[ 0 ].wordnum-1, 0 );
3043 SV *sv= tmp ? sv_newmortal() : NULL;
3045 PerlIO_printf( Perl_debug_log,
3046 "%*s %sonly one match left: #%d <%s>%s\n",
3047 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3048 ST.accept_buff[ 0 ].wordnum,
3049 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3050 PL_colors[0], PL_colors[1],
3051 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3053 : "not compiled under -Dr",
3056 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3057 /* in this case we free tmps/leave before we call regmatch
3058 as we wont be using accept_buff again. */
3060 locinput = PL_reginput;
3061 nextchr = UCHARAT(locinput);
3062 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3065 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3066 if (!has_cutgroup) {
3071 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3074 continue; /* execute rest of RE */
3077 if ( !ST.accepted-- ) {
3079 PerlIO_printf( Perl_debug_log,
3080 "%*s %sTRIE failed...%s\n",
3081 REPORT_CODE_OFF+depth*2, "",
3092 There are at least two accepting states left. Presumably
3093 the number of accepting states is going to be low,
3094 typically two. So we simply scan through to find the one
3095 with lowest wordnum. Once we find it, we swap the last
3096 state into its place and decrement the size. We then try to
3097 match the rest of the pattern at the point where the word
3098 ends. If we succeed, control just continues along the
3099 regex; if we fail we return here to try the next accepting
3106 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3107 DEBUG_TRIE_EXECUTE_r(
3108 PerlIO_printf( Perl_debug_log,
3109 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3110 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3111 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3112 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3115 if (ST.accept_buff[cur].wordnum <
3116 ST.accept_buff[best].wordnum)
3121 AV *const trie_words
3122 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3123 SV ** const tmp = av_fetch( trie_words,
3124 ST.accept_buff[ best ].wordnum - 1, 0 );
3125 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3127 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3128 SV *sv= tmp ? sv_newmortal() : NULL;
3130 PerlIO_printf( Perl_debug_log,
3131 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3132 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3133 ST.accept_buff[best].wordnum,
3134 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3135 PL_colors[0], PL_colors[1],
3136 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3137 ) : "not compiled under -Dr",
3138 REG_NODE_NUM(nextop),
3142 if ( best<ST.accepted ) {
3143 reg_trie_accepted tmp = ST.accept_buff[ best ];
3144 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3145 ST.accept_buff[ ST.accepted ] = tmp;
3148 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3149 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3152 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3154 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3165 char *s = STRING(scan);
3167 if (do_utf8 != UTF) {
3168 /* The target and the pattern have differing utf8ness. */
3170 const char * const e = s + ln;
3173 /* The target is utf8, the pattern is not utf8. */
3178 if (NATIVE_TO_UNI(*(U8*)s) !=
3179 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3187 /* The target is not utf8, the pattern is utf8. */
3192 if (NATIVE_TO_UNI(*((U8*)l)) !=
3193 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3201 nextchr = UCHARAT(locinput);
3204 /* The target and the pattern have the same utf8ness. */
3205 /* Inline the first character, for speed. */
3206 if (UCHARAT(s) != nextchr)
3208 if (PL_regeol - locinput < ln)
3210 if (ln > 1 && memNE(s, locinput, ln))
3213 nextchr = UCHARAT(locinput);
3217 PL_reg_flags |= RF_tainted;
3220 char * const s = STRING(scan);
3223 if (do_utf8 || UTF) {
3224 /* Either target or the pattern are utf8. */
3225 const char * const l = locinput;
3226 char *e = PL_regeol;
3228 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3229 l, &e, 0, do_utf8)) {
3230 /* One more case for the sharp s:
3231 * pack("U0U*", 0xDF) =~ /ss/i,
3232 * the 0xC3 0x9F are the UTF-8
3233 * byte sequence for the U+00DF. */
3236 toLOWER(s[0]) == 's' &&
3238 toLOWER(s[1]) == 's' &&
3245 nextchr = UCHARAT(locinput);
3249 /* Neither the target and the pattern are utf8. */
3251 /* Inline the first character, for speed. */
3252 if (UCHARAT(s) != nextchr &&
3253 UCHARAT(s) != ((OP(scan) == EXACTF)
3254 ? PL_fold : PL_fold_locale)[nextchr])
3256 if (PL_regeol - locinput < ln)
3258 if (ln > 1 && (OP(scan) == EXACTF
3259 ? ibcmp(s, locinput, ln)
3260 : ibcmp_locale(s, locinput, ln)))
3263 nextchr = UCHARAT(locinput);
3268 STRLEN inclasslen = PL_regeol - locinput;
3270 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3272 if (locinput >= PL_regeol)
3274 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3275 nextchr = UCHARAT(locinput);
3280 nextchr = UCHARAT(locinput);
3281 if (!REGINCLASS(rex, scan, (U8*)locinput))
3283 if (!nextchr && locinput >= PL_regeol)
3285 nextchr = UCHARAT(++locinput);
3289 /* If we might have the case of the German sharp s
3290 * in a casefolding Unicode character class. */
3292 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3293 locinput += SHARP_S_SKIP;
3294 nextchr = UCHARAT(locinput);
3300 PL_reg_flags |= RF_tainted;
3306 LOAD_UTF8_CHARCLASS_ALNUM();
3307 if (!(OP(scan) == ALNUM
3308 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3309 : isALNUM_LC_utf8((U8*)locinput)))
3313 locinput += PL_utf8skip[nextchr];
3314 nextchr = UCHARAT(locinput);
3317 if (!(OP(scan) == ALNUM
3318 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3320 nextchr = UCHARAT(++locinput);
3323 PL_reg_flags |= RF_tainted;
3326 if (!nextchr && locinput >= PL_regeol)
3329 LOAD_UTF8_CHARCLASS_ALNUM();
3330 if (OP(scan) == NALNUM
3331 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3332 : isALNUM_LC_utf8((U8*)locinput))
3336 locinput += PL_utf8skip[nextchr];
3337 nextchr = UCHARAT(locinput);
3340 if (OP(scan) == NALNUM
3341 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3343 nextchr = UCHARAT(++locinput);
3347 PL_reg_flags |= RF_tainted;
3351 /* was last char in word? */
3353 if (locinput == PL_bostr)
3356 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3358 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3360 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3361 ln = isALNUM_uni(ln);
3362 LOAD_UTF8_CHARCLASS_ALNUM();
3363 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3366 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3367 n = isALNUM_LC_utf8((U8*)locinput);
3371 ln = (locinput != PL_bostr) ?
3372 UCHARAT(locinput - 1) : '\n';
3373 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3375 n = isALNUM(nextchr);
3378 ln = isALNUM_LC(ln);
3379 n = isALNUM_LC(nextchr);
3382 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3383 OP(scan) == BOUNDL))
3387 PL_reg_flags |= RF_tainted;
3393 if (UTF8_IS_CONTINUED(nextchr)) {
3394 LOAD_UTF8_CHARCLASS_SPACE();
3395 if (!(OP(scan) == SPACE
3396 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3397 : isSPACE_LC_utf8((U8*)locinput)))
3401 locinput += PL_utf8skip[nextchr];
3402 nextchr = UCHARAT(locinput);
3405 if (!(OP(scan) == SPACE
3406 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3408 nextchr = UCHARAT(++locinput);
3411 if (!(OP(scan) == SPACE
3412 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3414 nextchr = UCHARAT(++locinput);
3418 PL_reg_flags |= RF_tainted;
3421 if (!nextchr && locinput >= PL_regeol)
3424 LOAD_UTF8_CHARCLASS_SPACE();
3425 if (OP(scan) == NSPACE
3426 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3427 : isSPACE_LC_utf8((U8*)locinput))
3431 locinput += PL_utf8skip[nextchr];
3432 nextchr = UCHARAT(locinput);
3435 if (OP(scan) == NSPACE
3436 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3438 nextchr = UCHARAT(++locinput);
3441 PL_reg_flags |= RF_tainted;
3447 LOAD_UTF8_CHARCLASS_DIGIT();
3448 if (!(OP(scan) == DIGIT
3449 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3450 : isDIGIT_LC_utf8((U8*)locinput)))
3454 locinput += PL_utf8skip[nextchr];
3455 nextchr = UCHARAT(locinput);
3458 if (!(OP(scan) == DIGIT
3459 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3461 nextchr = UCHARAT(++locinput);
3464 PL_reg_flags |= RF_tainted;
3467 if (!nextchr && locinput >= PL_regeol)
3470 LOAD_UTF8_CHARCLASS_DIGIT();
3471 if (OP(scan) == NDIGIT
3472 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3473 : isDIGIT_LC_utf8((U8*)locinput))
3477 locinput += PL_utf8skip[nextchr];
3478 nextchr = UCHARAT(locinput);
3481 if (OP(scan) == NDIGIT
3482 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3484 nextchr = UCHARAT(++locinput);
3487 if (locinput >= PL_regeol)
3490 LOAD_UTF8_CHARCLASS_MARK();
3491 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3493 locinput += PL_utf8skip[nextchr];
3494 while (locinput < PL_regeol &&
3495 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3496 locinput += UTF8SKIP(locinput);
3497 if (locinput > PL_regeol)
3502 nextchr = UCHARAT(locinput);
3509 PL_reg_flags |= RF_tainted;
3514 n = reg_check_named_buff_matched(rex,scan);
3517 type = REF + ( type - NREF );
3524 PL_reg_flags |= RF_tainted;
3528 n = ARG(scan); /* which paren pair */
3531 ln = PL_regoffs[n].start;
3532 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3533 if (*PL_reglastparen < n || ln == -1)
3534 sayNO; /* Do not match unless seen CLOSEn. */
3535 if (ln == PL_regoffs[n].end)
3539 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3541 const char *e = PL_bostr + PL_regoffs[n].end;
3543 * Note that we can't do the "other character" lookup trick as
3544 * in the 8-bit case (no pun intended) because in Unicode we
3545 * have to map both upper and title case to lower case.
3549 STRLEN ulen1, ulen2;
3550 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3551 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3555 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3556 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3557 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3564 nextchr = UCHARAT(locinput);
3568 /* Inline the first character, for speed. */
3569 if (UCHARAT(s) != nextchr &&
3571 (UCHARAT(s) != (type == REFF
3572 ? PL_fold : PL_fold_locale)[nextchr])))
3574 ln = PL_regoffs[n].end - ln;
3575 if (locinput + ln > PL_regeol)
3577 if (ln > 1 && (type == REF
3578 ? memNE(s, locinput, ln)
3580 ? ibcmp(s, locinput, ln)
3581 : ibcmp_locale(s, locinput, ln))))
3584 nextchr = UCHARAT(locinput);
3594 #define ST st->u.eval
3598 regexp_internal *rei;
3599 regnode *startpoint;
3602 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3603 if (cur_eval && cur_eval->locinput==locinput) {
3604 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3605 Perl_croak(aTHX_ "Infinite recursion in regex");
3606 if ( ++nochange_depth > max_nochange_depth )
3608 "Pattern subroutine nesting without pos change"
3609 " exceeded limit in regex");
3615 (void)ReREFCNT_inc(rex);
3616 if (OP(scan)==GOSUB) {
3617 startpoint = scan + ARG2L(scan);
3618 ST.close_paren = ARG(scan);
3620 startpoint = rei->program+1;
3623 goto eval_recurse_doit;
3625 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3626 if (cur_eval && cur_eval->locinput==locinput) {
3627 if ( ++nochange_depth > max_nochange_depth )
3628 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3633 /* execute the code in the {...} */
3635 SV ** const before = SP;
3636 OP_4tree * const oop = PL_op;
3637 COP * const ocurcop = PL_curcop;
3641 PL_op = (OP_4tree*)rexi->data->data[n];
3642 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3643 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3644 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3645 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3648 SV *sv_mrk = get_sv("REGMARK", 1);
3649 sv_setsv(sv_mrk, sv_yes_mark);
3652 CALLRUNOPS(aTHX); /* Scalar context. */
3655 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3662 PAD_RESTORE_LOCAL(old_comppad);
3663 PL_curcop = ocurcop;
3666 sv_setsv(save_scalar(PL_replgv), ret);
3670 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3673 /* extract RE object from returned value; compiling if
3678 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3679 mg = mg_find(sv, PERL_MAGIC_qr);
3680 else if (SvSMAGICAL(ret)) {
3681 if (SvGMAGICAL(ret))
3682 sv_unmagic(ret, PERL_MAGIC_qr);
3684 mg = mg_find(ret, PERL_MAGIC_qr);
3688 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
3692 const I32 osize = PL_regsize;
3694 if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
3695 re = CALLREGCOMP(ret, pm_flags);
3697 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3699 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3704 RX_MATCH_COPIED_off(re);
3705 re->subbeg = rex->subbeg;
3706 re->sublen = rex->sublen;
3709 debug_start_match(re, do_utf8, locinput, PL_regeol,
3710 "Matching embedded");
3712 startpoint = rei->program + 1;
3713 ST.close_paren = 0; /* only used for GOSUB */
3714 /* borrowed from regtry */
3715 if (PL_reg_start_tmpl <= re->nparens) {
3716 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3717 if(PL_reg_start_tmp)
3718 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3720 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3723 eval_recurse_doit: /* Share code with GOSUB below this line */
3724 /* run the pattern returned from (??{...}) */
3725 ST.cp = regcppush(0); /* Save *all* the positions. */
3726 REGCP_SET(ST.lastcp);
3728 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
3730 *PL_reglastparen = 0;
3731 *PL_reglastcloseparen = 0;
3732 PL_reginput = locinput;
3735 /* XXXX This is too dramatic a measure... */
3738 ST.toggle_reg_flags = PL_reg_flags;
3739 if (re->extflags & RXf_UTF8)
3740 PL_reg_flags |= RF_utf8;
3742 PL_reg_flags &= ~RF_utf8;
3743 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3746 ST.prev_curlyx = cur_curlyx;
3751 ST.prev_eval = cur_eval;
3753 /* now continue from first node in postoned RE */
3754 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3757 /* logical is 1, /(?(?{...})X|Y)/ */
3758 sw = (bool)SvTRUE(ret);
3763 case EVAL_AB: /* cleanup after a successful (??{A})B */
3764 /* note: this is called twice; first after popping B, then A */
3765 PL_reg_flags ^= ST.toggle_reg_flags;
3767 SETREX(rex,ST.prev_rex);
3768 rexi = RXi_GET(rex);
3770 cur_eval = ST.prev_eval;
3771 cur_curlyx = ST.prev_curlyx;
3772 /* XXXX This is too dramatic a measure... */
3774 if ( nochange_depth )
3779 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3780 /* note: this is called twice; first after popping B, then A */
3781 PL_reg_flags ^= ST.toggle_reg_flags;
3783 SETREX(rex,ST.prev_rex);
3784 rexi = RXi_GET(rex);
3785 PL_reginput = locinput;
3786 REGCP_UNWIND(ST.lastcp);
3788 cur_eval = ST.prev_eval;
3789 cur_curlyx = ST.prev_curlyx;
3790 /* XXXX This is too dramatic a measure... */
3792 if ( nochange_depth )
3798 n = ARG(scan); /* which paren pair */
3799 PL_reg_start_tmp[n] = locinput;
3805 n = ARG(scan); /* which paren pair */
3806 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3807 PL_regoffs[n].end = locinput - PL_bostr;
3808 /*if (n > PL_regsize)
3810 if (n > *PL_reglastparen)
3811 *PL_reglastparen = n;
3812 *PL_reglastcloseparen = n;
3813 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3821 cursor && OP(cursor)!=END;
3822 cursor=regnext(cursor))
3824 if ( OP(cursor)==CLOSE ){
3826 if ( n <= lastopen ) {
3828 = PL_reg_start_tmp[n] - PL_bostr;
3829 PL_regoffs[n].end = locinput - PL_bostr;
3830 /*if (n > PL_regsize)
3832 if (n > *PL_reglastparen)
3833 *PL_reglastparen = n;
3834 *PL_reglastcloseparen = n;
3835 if ( n == ARG(scan) || (cur_eval &&
3836 cur_eval->u.eval.close_paren == n))
3845 n = ARG(scan); /* which paren pair */
3846 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
3849 /* reg_check_named_buff_matched returns 0 for no match */
3850 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3854 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3860 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3862 next = NEXTOPER(NEXTOPER(scan));
3864 next = scan + ARG(scan);
3865 if (OP(next) == IFTHEN) /* Fake one. */
3866 next = NEXTOPER(NEXTOPER(next));
3870 logical = scan->flags;
3873 /*******************************************************************
3875 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3876 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3877 STAR/PLUS/CURLY/CURLYN are used instead.)
3879 A*B is compiled as <CURLYX><A><WHILEM><B>
3881 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3882 state, which contains the current count, initialised to -1. It also sets
3883 cur_curlyx to point to this state, with any previous value saved in the
3886 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3887 since the pattern may possibly match zero times (i.e. it's a while {} loop
3888 rather than a do {} while loop).
3890 Each entry to WHILEM represents a successful match of A. The count in the
3891 CURLYX block is incremented, another WHILEM state is pushed, and execution
3892 passes to A or B depending on greediness and the current count.
3894 For example, if matching against the string a1a2a3b (where the aN are
3895 substrings that match /A/), then the match progresses as follows: (the
3896 pushed states are interspersed with the bits of strings matched so far):
3899 <CURLYX cnt=0><WHILEM>
3900 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3901 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3902 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3903 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3905 (Contrast this with something like CURLYM, which maintains only a single
3909 a1 <CURLYM cnt=1> a2
3910 a1 a2 <CURLYM cnt=2> a3
3911 a1 a2 a3 <CURLYM cnt=3> b
3914 Each WHILEM state block marks a point to backtrack to upon partial failure
3915 of A or B, and also contains some minor state data related to that
3916 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3917 overall state, such as the count, and pointers to the A and B ops.
3919 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3920 must always point to the *current* CURLYX block, the rules are:
3922 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3923 and set cur_curlyx to point the new block.
3925 When popping the CURLYX block after a successful or unsuccessful match,
3926 restore the previous cur_curlyx.
3928 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3929 to the outer one saved in the CURLYX block.
3931 When popping the WHILEM block after a successful or unsuccessful B match,
3932 restore the previous cur_curlyx.
3934 Here's an example for the pattern (AI* BI)*BO
3935 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3938 curlyx backtrack stack
3939 ------ ---------------
3941 CO <CO prev=NULL> <WO>
3942 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3943 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3944 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3946 At this point the pattern succeeds, and we work back down the stack to
3947 clean up, restoring as we go:
3949 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3950 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3951 CO <CO prev=NULL> <WO>
3954 *******************************************************************/
3956 #define ST st->u.curlyx
3958 case CURLYX: /* start of /A*B/ (for complex A) */
3960 /* No need to save/restore up to this paren */
3961 I32 parenfloor = scan->flags;
3963 assert(next); /* keep Coverity happy */
3964 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3967 /* XXXX Probably it is better to teach regpush to support
3968 parenfloor > PL_regsize... */
3969 if (parenfloor > (I32)*PL_reglastparen)
3970 parenfloor = *PL_reglastparen; /* Pessimization... */
3972 ST.prev_curlyx= cur_curlyx;
3974 ST.cp = PL_savestack_ix;
3976 /* these fields contain the state of the current curly.
3977 * they are accessed by subsequent WHILEMs */
3978 ST.parenfloor = parenfloor;
3979 ST.min = ARG1(scan);
3980 ST.max = ARG2(scan);
3981 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3985 ST.count = -1; /* this will be updated by WHILEM */
3986 ST.lastloc = NULL; /* this will be updated by WHILEM */
3988 PL_reginput = locinput;
3989 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3993 case CURLYX_end: /* just finished matching all of A*B */
3994 cur_curlyx = ST.prev_curlyx;
3998 case CURLYX_end_fail: /* just failed to match all of A*B */
4000 cur_curlyx = ST.prev_curlyx;
4006 #define ST st->u.whilem
4008 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4010 /* see the discussion above about CURLYX/WHILEM */
4012 assert(cur_curlyx); /* keep Coverity happy */
4013 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4014 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4015 ST.cache_offset = 0;
4018 PL_reginput = locinput;
4020 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4021 "%*s whilem: matched %ld out of %ld..%ld\n",
4022 REPORT_CODE_OFF+depth*2, "", (long)n,
4023 (long)cur_curlyx->u.curlyx.min,
4024 (long)cur_curlyx->u.curlyx.max)
4027 /* First just match a string of min A's. */
4029 if (n < cur_curlyx->u.curlyx.min) {
4030 cur_curlyx->u.curlyx.lastloc = locinput;
4031 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4035 /* If degenerate A matches "", assume A done. */
4037 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4038 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4039 "%*s whilem: empty match detected, trying continuation...\n",
4040 REPORT_CODE_OFF+depth*2, "")
4042 goto do_whilem_B_max;
4045 /* super-linear cache processing */
4049 if (!PL_reg_maxiter) {
4050 /* start the countdown: Postpone detection until we
4051 * know the match is not *that* much linear. */
4052 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4053 /* possible overflow for long strings and many CURLYX's */
4054 if (PL_reg_maxiter < 0)
4055 PL_reg_maxiter = I32_MAX;
4056 PL_reg_leftiter = PL_reg_maxiter;
4059 if (PL_reg_leftiter-- == 0) {
4060 /* initialise cache */
4061 const I32 size = (PL_reg_maxiter + 7)/8;
4062 if (PL_reg_poscache) {
4063 if ((I32)PL_reg_poscache_size < size) {
4064 Renew(PL_reg_poscache, size, char);
4065 PL_reg_poscache_size = size;
4067 Zero(PL_reg_poscache, size, char);
4070 PL_reg_poscache_size = size;
4071 Newxz(PL_reg_poscache, size, char);
4073 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4074 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4075 PL_colors[4], PL_colors[5])
4079 if (PL_reg_leftiter < 0) {
4080 /* have we already failed at this position? */
4082 offset = (scan->flags & 0xf) - 1
4083 + (locinput - PL_bostr) * (scan->flags>>4);
4084 mask = 1 << (offset % 8);
4086 if (PL_reg_poscache[offset] & mask) {
4087 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4088 "%*s whilem: (cache) already tried at this position...\n",
4089 REPORT_CODE_OFF+depth*2, "")
4091 sayNO; /* cache records failure */
4093 ST.cache_offset = offset;
4094 ST.cache_mask = mask;
4098 /* Prefer B over A for minimal matching. */
4100 if (cur_curlyx->u.curlyx.minmod) {
4101 ST.save_curlyx = cur_curlyx;
4102 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4103 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4104 REGCP_SET(ST.lastcp);
4105 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4109 /* Prefer A over B for maximal matching. */
4111 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4112 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4113 cur_curlyx->u.curlyx.lastloc = locinput;
4114 REGCP_SET(ST.lastcp);
4115 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4118 goto do_whilem_B_max;
4122 case WHILEM_B_min: /* just matched B in a minimal match */
4123 case WHILEM_B_max: /* just matched B in a maximal match */
4124 cur_curlyx = ST.save_curlyx;
4128 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4129 cur_curlyx = ST.save_curlyx;
4130 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4131 cur_curlyx->u.curlyx.count--;
4135 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4136 REGCP_UNWIND(ST.lastcp);
4139 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4140 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4141 cur_curlyx->u.curlyx.count--;
4145 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4146 REGCP_UNWIND(ST.lastcp);
4147 regcppop(rex); /* Restore some previous $<digit>s? */
4148 PL_reginput = locinput;
4149 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4150 "%*s whilem: failed, trying continuation...\n",
4151 REPORT_CODE_OFF+depth*2, "")
4154 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4155 && ckWARN(WARN_REGEXP)
4156 && !(PL_reg_flags & RF_warned))
4158 PL_reg_flags |= RF_warned;
4159 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4160 "Complex regular subexpression recursion",
4165 ST.save_curlyx = cur_curlyx;
4166 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4167 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4170 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4171 cur_curlyx = ST.save_curlyx;
4172 REGCP_UNWIND(ST.lastcp);
4175 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4176 /* Maximum greed exceeded */
4177 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4178 && ckWARN(WARN_REGEXP)
4179 && !(PL_reg_flags & RF_warned))
4181 PL_reg_flags |= RF_warned;
4182 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4183 "%s limit (%d) exceeded",
4184 "Complex regular subexpression recursion",
4187 cur_curlyx->u.curlyx.count--;
4191 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4192 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4194 /* Try grabbing another A and see if it helps. */
4195 PL_reginput = locinput;
4196 cur_curlyx->u.curlyx.lastloc = locinput;
4197 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4198 REGCP_SET(ST.lastcp);
4199 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4203 #define ST st->u.branch
4205 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4206 next = scan + ARG(scan);
4209 scan = NEXTOPER(scan);
4212 case BRANCH: /* /(...|A|...)/ */
4213 scan = NEXTOPER(scan); /* scan now points to inner node */
4214 ST.lastparen = *PL_reglastparen;
4215 ST.next_branch = next;
4217 PL_reginput = locinput;
4219 /* Now go into the branch */
4221 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4223 PUSH_STATE_GOTO(BRANCH_next, scan);
4227 PL_reginput = locinput;
4228 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4229 (SV*)rexi->data->data[ ARG( scan ) ];
4230 PUSH_STATE_GOTO(CUTGROUP_next,next);
4232 case CUTGROUP_next_fail:
4235 if (st->u.mark.mark_name)
4236 sv_commit = st->u.mark.mark_name;
4242 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4247 REGCP_UNWIND(ST.cp);
4248 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4249 PL_regoffs[n].end = -1;
4250 *PL_reglastparen = n;
4251 /*dmq: *PL_reglastcloseparen = n; */
4252 scan = ST.next_branch;
4253 /* no more branches? */
4254 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4256 PerlIO_printf( Perl_debug_log,
4257 "%*s %sBRANCH failed...%s\n",
4258 REPORT_CODE_OFF+depth*2, "",
4264 continue; /* execute next BRANCH[J] op */
4272 #define ST st->u.curlym
4274 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4276 /* This is an optimisation of CURLYX that enables us to push
4277 * only a single backtracking state, no matter now many matches
4278 * there are in {m,n}. It relies on the pattern being constant
4279 * length, with no parens to influence future backrefs
4283 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4285 /* if paren positive, emulate an OPEN/CLOSE around A */
4287 U32 paren = ST.me->flags;
4288 if (paren > PL_regsize)
4290 if (paren > *PL_reglastparen)
4291 *PL_reglastparen = paren;
4292 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4300 ST.c1 = CHRTEST_UNINIT;
4303 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4306 curlym_do_A: /* execute the A in /A{m,n}B/ */
4307 PL_reginput = locinput;
4308 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4311 case CURLYM_A: /* we've just matched an A */
4312 locinput = st->locinput;
4313 nextchr = UCHARAT(locinput);
4316 /* after first match, determine A's length: u.curlym.alen */
4317 if (ST.count == 1) {
4318 if (PL_reg_match_utf8) {
4320 while (s < PL_reginput) {
4326 ST.alen = PL_reginput - locinput;
4329 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4332 PerlIO_printf(Perl_debug_log,
4333 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4334 (int)(REPORT_CODE_OFF+(depth*2)), "",
4335 (IV) ST.count, (IV)ST.alen)
4338 locinput = PL_reginput;
4340 if (cur_eval && cur_eval->u.eval.close_paren &&
4341 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4344 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4345 goto curlym_do_A; /* try to match another A */
4346 goto curlym_do_B; /* try to match B */
4348 case CURLYM_A_fail: /* just failed to match an A */
4349 REGCP_UNWIND(ST.cp);
4351 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4352 || (cur_eval && cur_eval->u.eval.close_paren &&
4353 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4356 curlym_do_B: /* execute the B in /A{m,n}B/ */
4357 PL_reginput = locinput;
4358 if (ST.c1 == CHRTEST_UNINIT) {
4359 /* calculate c1 and c2 for possible match of 1st char
4360 * following curly */
4361 ST.c1 = ST.c2 = CHRTEST_VOID;
4362 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4363 regnode *text_node = ST.B;
4364 if (! HAS_TEXT(text_node))
4365 FIND_NEXT_IMPT(text_node);
4368 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4370 But the former is redundant in light of the latter.
4372 if this changes back then the macro for
4373 IS_TEXT and friends need to change.
4375 if (PL_regkind[OP(text_node)] == EXACT)
4378 ST.c1 = (U8)*STRING(text_node);
4380 (IS_TEXTF(text_node))
4382 : (IS_TEXTFL(text_node))
4383 ? PL_fold_locale[ST.c1]
4390 PerlIO_printf(Perl_debug_log,
4391 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4392 (int)(REPORT_CODE_OFF+(depth*2)),
4395 if (ST.c1 != CHRTEST_VOID
4396 && UCHARAT(PL_reginput) != ST.c1
4397 && UCHARAT(PL_reginput) != ST.c2)
4399 /* simulate B failing */
4401 PerlIO_printf(Perl_debug_log,
4402 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4403 (int)(REPORT_CODE_OFF+(depth*2)),"",
4406 state_num = CURLYM_B_fail;
4407 goto reenter_switch;
4411 /* mark current A as captured */
4412 I32 paren = ST.me->flags;
4414 PL_regoffs[paren].start
4415 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4416 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4417 /*dmq: *PL_reglastcloseparen = paren; */
4420 PL_regoffs[paren].end = -1;
4421 if (cur_eval && cur_eval->u.eval.close_paren &&
4422 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4431 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4434 case CURLYM_B_fail: /* just failed to match a B */
4435 REGCP_UNWIND(ST.cp);
4437 if (ST.count == ARG2(ST.me) /* max */)
4439 goto curlym_do_A; /* try to match a further A */
4441 /* backtrack one A */
4442 if (ST.count == ARG1(ST.me) /* min */)
4445 locinput = HOPc(locinput, -ST.alen);
4446 goto curlym_do_B; /* try to match B */
4449 #define ST st->u.curly
4451 #define CURLY_SETPAREN(paren, success) \
4454 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4455 PL_regoffs[paren].end = locinput - PL_bostr; \
4456 *PL_reglastcloseparen = paren; \
4459 PL_regoffs[paren].end = -1; \
4462 case STAR: /* /A*B/ where A is width 1 */
4466 scan = NEXTOPER(scan);
4468 case PLUS: /* /A+B/ where A is width 1 */
4472 scan = NEXTOPER(scan);
4474 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4475 ST.paren = scan->flags; /* Which paren to set */
4476 if (ST.paren > PL_regsize)
4477 PL_regsize = ST.paren;
4478 if (ST.paren > *PL_reglastparen)
4479 *PL_reglastparen = ST.paren;
4480 ST.min = ARG1(scan); /* min to match */
4481 ST.max = ARG2(scan); /* max to match */
4482 if (cur_eval && cur_eval->u.eval.close_paren &&
4483 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4487 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4489 case CURLY: /* /A{m,n}B/ where A is width 1 */
4491 ST.min = ARG1(scan); /* min to match */
4492 ST.max = ARG2(scan); /* max to match */
4493 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4496 * Lookahead to avoid useless match attempts
4497 * when we know what character comes next.
4499 * Used to only do .*x and .*?x, but now it allows
4500 * for )'s, ('s and (?{ ... })'s to be in the way
4501 * of the quantifier and the EXACT-like node. -- japhy
4504 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4506 if (HAS_TEXT(next) || JUMPABLE(next)) {
4508 regnode *text_node = next;
4510 if (! HAS_TEXT(text_node))
4511 FIND_NEXT_IMPT(text_node);
4513 if (! HAS_TEXT(text_node))
4514 ST.c1 = ST.c2 = CHRTEST_VOID;
4516 if ( PL_regkind[OP(text_node)] != EXACT ) {
4517 ST.c1 = ST.c2 = CHRTEST_VOID;
4518 goto assume_ok_easy;
4521 s = (U8*)STRING(text_node);
4523 /* Currently we only get here when
4525 PL_rekind[OP(text_node)] == EXACT
4527 if this changes back then the macro for IS_TEXT and
4528 friends need to change. */
4531 if (IS_TEXTF(text_node))
4532 ST.c2 = PL_fold[ST.c1];
4533 else if (IS_TEXTFL(text_node))
4534 ST.c2 = PL_fold_locale[ST.c1];
4537 if (IS_TEXTF(text_node)) {
4538 STRLEN ulen1, ulen2;
4539 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4540 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4542 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4543 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4545 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4547 0 : UTF8_ALLOW_ANY);
4548 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4550 0 : UTF8_ALLOW_ANY);
4552 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4554 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4559 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4566 ST.c1 = ST.c2 = CHRTEST_VOID;
4571 PL_reginput = locinput;
4574 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4577 locinput = PL_reginput;
4579 if (ST.c1 == CHRTEST_VOID)
4580 goto curly_try_B_min;
4582 ST.oldloc = locinput;
4584 /* set ST.maxpos to the furthest point along the
4585 * string that could possibly match */
4586 if (ST.max == REG_INFTY) {
4587 ST.maxpos = PL_regeol - 1;
4589 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4593 int m = ST.max - ST.min;
4594 for (ST.maxpos = locinput;
4595 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4596 ST.maxpos += UTF8SKIP(ST.maxpos);
4599 ST.maxpos = locinput + ST.max - ST.min;
4600 if (ST.maxpos >= PL_regeol)
4601 ST.maxpos = PL_regeol - 1;
4603 goto curly_try_B_min_known;
4607 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4608 locinput = PL_reginput;
4609 if (ST.count < ST.min)
4611 if ((ST.count > ST.min)
4612 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4614 /* A{m,n} must come at the end of the string, there's
4615 * no point in backing off ... */
4617 /* ...except that $ and \Z can match before *and* after
4618 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4619 We may back off by one in this case. */
4620 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4624 goto curly_try_B_max;
4629 case CURLY_B_min_known_fail:
4630 /* failed to find B in a non-greedy match where c1,c2 valid */
4631 if (ST.paren && ST.count)
4632 PL_regoffs[ST.paren].end = -1;
4634 PL_reginput = locinput; /* Could be reset... */
4635 REGCP_UNWIND(ST.cp);
4636 /* Couldn't or didn't -- move forward. */
4637 ST.oldloc = locinput;
4639 locinput += UTF8SKIP(locinput);
4643 curly_try_B_min_known:
4644 /* find the next place where 'B' could work, then call B */
4648 n = (ST.oldloc == locinput) ? 0 : 1;
4649 if (ST.c1 == ST.c2) {
4651 /* set n to utf8_distance(oldloc, locinput) */
4652 while (locinput <= ST.maxpos &&
4653 utf8n_to_uvchr((U8*)locinput,
4654 UTF8_MAXBYTES, &len,
4655 uniflags) != (UV)ST.c1) {
4661 /* set n to utf8_distance(oldloc, locinput) */
4662 while (locinput <= ST.maxpos) {
4664 const UV c = utf8n_to_uvchr((U8*)locinput,
4665 UTF8_MAXBYTES, &len,
4667 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4675 if (ST.c1 == ST.c2) {
4676 while (locinput <= ST.maxpos &&
4677 UCHARAT(locinput) != ST.c1)
4681 while (locinput <= ST.maxpos
4682 && UCHARAT(locinput) != ST.c1
4683 && UCHARAT(locinput) != ST.c2)
4686 n = locinput - ST.oldloc;
4688 if (locinput > ST.maxpos)
4690 /* PL_reginput == oldloc now */
4693 if (regrepeat(rex, ST.A, n, depth) < n)
4696 PL_reginput = locinput;
4697 CURLY_SETPAREN(ST.paren, ST.count);
4698 if (cur_eval && cur_eval->u.eval.close_paren &&
4699 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4702 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4707 case CURLY_B_min_fail:
4708 /* failed to find B in a non-greedy match where c1,c2 invalid */
4709 if (ST.paren && ST.count)
4710 PL_regoffs[ST.paren].end = -1;
4712 REGCP_UNWIND(ST.cp);
4713 /* failed -- move forward one */
4714 PL_reginput = locinput;
4715 if (regrepeat(rex, ST.A, 1, depth)) {
4717 locinput = PL_reginput;
4718 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4719 ST.count > 0)) /* count overflow ? */
4722 CURLY_SETPAREN(ST.paren, ST.count);
4723 if (cur_eval && cur_eval->u.eval.close_paren &&
4724 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4727 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4735 /* a successful greedy match: now try to match B */
4736 if (cur_eval && cur_eval->u.eval.close_paren &&
4737 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4742 if (ST.c1 != CHRTEST_VOID)
4743 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4744 UTF8_MAXBYTES, 0, uniflags)
4745 : (UV) UCHARAT(PL_reginput);
4746 /* If it could work, try it. */
4747 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4748 CURLY_SETPAREN(ST.paren, ST.count);
4749 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4754 case CURLY_B_max_fail:
4755 /* failed to find B in a greedy match */
4756 if (ST.paren && ST.count)
4757 PL_regoffs[ST.paren].end = -1;
4759 REGCP_UNWIND(ST.cp);
4761 if (--ST.count < ST.min)
4763 PL_reginput = locinput = HOPc(locinput, -1);
4764 goto curly_try_B_max;
4771 /* we've just finished A in /(??{A})B/; now continue with B */
4773 st->u.eval.toggle_reg_flags
4774 = cur_eval->u.eval.toggle_reg_flags;
4775 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4777 st->u.eval.prev_rex = rex; /* inner */
4778 SETREX(rex,cur_eval->u.eval.prev_rex);
4779 rexi = RXi_GET(rex);
4780 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4782 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4783 REGCP_SET(st->u.eval.lastcp);
4784 PL_reginput = locinput;
4786 /* Restore parens of the outer rex without popping the
4788 tmpix = PL_savestack_ix;
4789 PL_savestack_ix = cur_eval->u.eval.lastcp;
4791 PL_savestack_ix = tmpix;
4793 st->u.eval.prev_eval = cur_eval;
4794 cur_eval = cur_eval->u.eval.prev_eval;
4796 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4797 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4798 if ( nochange_depth )
4801 PUSH_YES_STATE_GOTO(EVAL_AB,
4802 st->u.eval.prev_eval->u.eval.B); /* match B */
4805 if (locinput < reginfo->till) {
4806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4807 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4809 (long)(locinput - PL_reg_starttry),
4810 (long)(reginfo->till - PL_reg_starttry),
4813 sayNO_SILENT; /* Cannot match: too short. */
4815 PL_reginput = locinput; /* put where regtry can find it */
4816 sayYES; /* Success! */
4818 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4820 PerlIO_printf(Perl_debug_log,
4821 "%*s %ssubpattern success...%s\n",
4822 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4823 PL_reginput = locinput; /* put where regtry can find it */
4824 sayYES; /* Success! */
4827 #define ST st->u.ifmatch
4829 case SUSPEND: /* (?>A) */
4831 PL_reginput = locinput;
4834 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4836 goto ifmatch_trivial_fail_test;
4838 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4840 ifmatch_trivial_fail_test:
4842 char * const s = HOPBACKc(locinput, scan->flags);
4847 sw = 1 - (bool)ST.wanted;
4851 next = scan + ARG(scan);
4859 PL_reginput = locinput;
4863 ST.logical = logical;
4864 /* execute body of (?...A) */
4865 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4868 case IFMATCH_A_fail: /* body of (?...A) failed */
4869 ST.wanted = !ST.wanted;
4872 case IFMATCH_A: /* body of (?...A) succeeded */
4874 sw = (bool)ST.wanted;
4876 else if (!ST.wanted)
4879 if (OP(ST.me) == SUSPEND)
4880 locinput = PL_reginput;
4882 locinput = PL_reginput = st->locinput;
4883 nextchr = UCHARAT(locinput);
4885 scan = ST.me + ARG(ST.me);
4888 continue; /* execute B */
4893 next = scan + ARG(scan);
4898 reginfo->cutpoint = PL_regeol;
4901 PL_reginput = locinput;
4903 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
4904 PUSH_STATE_GOTO(COMMIT_next,next);
4906 case COMMIT_next_fail:
4913 #define ST st->u.mark
4915 ST.prev_mark = mark_state;
4916 ST.mark_name = sv_commit = sv_yes_mark
4917 = (SV*)rexi->data->data[ ARG( scan ) ];
4919 ST.mark_loc = PL_reginput = locinput;
4920 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4922 case MARKPOINT_next:
4923 mark_state = ST.prev_mark;
4926 case MARKPOINT_next_fail:
4927 if (popmark && sv_eq(ST.mark_name,popmark))
4929 if (ST.mark_loc > startpoint)
4930 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4931 popmark = NULL; /* we found our mark */
4932 sv_commit = ST.mark_name;
4935 PerlIO_printf(Perl_debug_log,
4936 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4937 REPORT_CODE_OFF+depth*2, "",
4938 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
4941 mark_state = ST.prev_mark;
4942 sv_yes_mark = mark_state ?
4943 mark_state->u.mark.mark_name : NULL;
4947 PL_reginput = locinput;
4949 /* (*SKIP) : if we fail we cut here*/
4950 ST.mark_name = NULL;
4951 ST.mark_loc = locinput;
4952 PUSH_STATE_GOTO(SKIP_next,next);
4954 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
4955 otherwise do nothing. Meaning we need to scan
4957 regmatch_state *cur = mark_state;
4958 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
4961 if ( sv_eq( cur->u.mark.mark_name,
4964 ST.mark_name = find;
4965 PUSH_STATE_GOTO( SKIP_next, next );
4967 cur = cur->u.mark.prev_mark;
4970 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
4972 case SKIP_next_fail:
4974 /* (*CUT:NAME) - Set up to search for the name as we
4975 collapse the stack*/
4976 popmark = ST.mark_name;
4978 /* (*CUT) - No name, we cut here.*/
4979 if (ST.mark_loc > startpoint)
4980 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4981 /* but we set sv_commit to latest mark_name if there
4982 is one so they can test to see how things lead to this
4985 sv_commit=mark_state->u.mark.mark_name;
4993 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
4995 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
4998 U8 folded[UTF8_MAXBYTES_CASE+1];
5000 const char * const l = locinput;
5001 char *e = PL_regeol;
5002 to_uni_fold(n, folded, &foldlen);
5004 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
5005 l, &e, 0, do_utf8)) {
5010 nextchr = UCHARAT(locinput);
5013 if ((n=is_LNBREAK(locinput,do_utf8))) {
5015 nextchr = UCHARAT(locinput);
5020 #define CASE_CLASS(nAmE) \
5022 if ((n=is_##nAmE(locinput,do_utf8))) { \
5024 nextchr = UCHARAT(locinput); \
5029 if ((n=is_##nAmE(locinput,do_utf8))) { \
5032 locinput += UTF8SKIP(locinput); \
5033 nextchr = UCHARAT(locinput); \
5038 CASE_CLASS(HORIZWS);
5042 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5043 PTR2UV(scan), OP(scan));
5044 Perl_croak(aTHX_ "regexp memory corruption");
5048 /* switch break jumps here */
5049 scan = next; /* prepare to execute the next op and ... */
5050 continue; /* ... jump back to the top, reusing st */
5054 /* push a state that backtracks on success */
5055 st->u.yes.prev_yes_state = yes_state;
5059 /* push a new regex state, then continue at scan */
5061 regmatch_state *newst;
5064 regmatch_state *cur = st;
5065 regmatch_state *curyes = yes_state;
5067 regmatch_slab *slab = PL_regmatch_slab;
5068 for (;curd > -1;cur--,curd--) {
5069 if (cur < SLAB_FIRST(slab)) {
5071 cur = SLAB_LAST(slab);
5073 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5074 REPORT_CODE_OFF + 2 + depth * 2,"",
5075 curd, PL_reg_name[cur->resume_state],
5076 (curyes == cur) ? "yes" : ""
5079 curyes = cur->u.yes.prev_yes_state;
5082 DEBUG_STATE_pp("push")
5085 st->locinput = locinput;
5087 if (newst > SLAB_LAST(PL_regmatch_slab))
5088 newst = S_push_slab(aTHX);
5089 PL_regmatch_state = newst;
5091 locinput = PL_reginput;
5092 nextchr = UCHARAT(locinput);
5100 * We get here only if there's trouble -- normally "case END" is
5101 * the terminating point.
5103 Perl_croak(aTHX_ "corrupted regexp pointers");
5109 /* we have successfully completed a subexpression, but we must now
5110 * pop to the state marked by yes_state and continue from there */
5111 assert(st != yes_state);
5113 while (st != yes_state) {
5115 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5116 PL_regmatch_slab = PL_regmatch_slab->prev;
5117 st = SLAB_LAST(PL_regmatch_slab);
5121 DEBUG_STATE_pp("pop (no final)");
5123 DEBUG_STATE_pp("pop (yes)");
5129 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5130 || yes_state > SLAB_LAST(PL_regmatch_slab))
5132 /* not in this slab, pop slab */
5133 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5134 PL_regmatch_slab = PL_regmatch_slab->prev;
5135 st = SLAB_LAST(PL_regmatch_slab);
5137 depth -= (st - yes_state);
5140 yes_state = st->u.yes.prev_yes_state;
5141 PL_regmatch_state = st;
5144 locinput= st->locinput;
5145 nextchr = UCHARAT(locinput);
5147 state_num = st->resume_state + no_final;
5148 goto reenter_switch;
5151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5152 PL_colors[4], PL_colors[5]));
5154 if (PL_reg_eval_set) {
5155 /* each successfully executed (?{...}) block does the equivalent of
5156 * local $^R = do {...}
5157 * When popping the save stack, all these locals would be undone;
5158 * bypass this by setting the outermost saved $^R to the latest
5160 if (oreplsv != GvSV(PL_replgv))
5161 sv_setsv(oreplsv, GvSV(PL_replgv));
5168 PerlIO_printf(Perl_debug_log,
5169 "%*s %sfailed...%s\n",
5170 REPORT_CODE_OFF+depth*2, "",
5171 PL_colors[4], PL_colors[5])
5183 /* there's a previous state to backtrack to */
5185 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5186 PL_regmatch_slab = PL_regmatch_slab->prev;
5187 st = SLAB_LAST(PL_regmatch_slab);
5189 PL_regmatch_state = st;
5190 locinput= st->locinput;
5191 nextchr = UCHARAT(locinput);
5193 DEBUG_STATE_pp("pop");
5195 if (yes_state == st)
5196 yes_state = st->u.yes.prev_yes_state;
5198 state_num = st->resume_state + 1; /* failure = success + 1 */
5199 goto reenter_switch;
5204 if (rex->intflags & PREGf_VERBARG_SEEN) {
5205 SV *sv_err = get_sv("REGERROR", 1);
5206 SV *sv_mrk = get_sv("REGMARK", 1);
5208 sv_commit = &PL_sv_no;
5210 sv_yes_mark = &PL_sv_yes;
5213 sv_commit = &PL_sv_yes;
5214 sv_yes_mark = &PL_sv_no;
5216 sv_setsv(sv_err, sv_commit);
5217 sv_setsv(sv_mrk, sv_yes_mark);
5220 /* clean up; in particular, free all slabs above current one */
5221 LEAVE_SCOPE(oldsave);
5227 - regrepeat - repeatedly match something simple, report how many
5230 * [This routine now assumes that it will only match on things of length 1.
5231 * That was true before, but now we assume scan - reginput is the count,
5232 * rather than incrementing count on every character. [Er, except utf8.]]
5235 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5238 register char *scan;
5240 register char *loceol = PL_regeol;
5241 register I32 hardcount = 0;
5242 register bool do_utf8 = PL_reg_match_utf8;
5244 PERL_UNUSED_ARG(depth);
5248 if (max == REG_INFTY)
5250 else if (max < loceol - scan)
5251 loceol = scan + max;
5256 while (scan < loceol && hardcount < max && *scan != '\n') {
5257 scan += UTF8SKIP(scan);
5261 while (scan < loceol && *scan != '\n')
5268 while (scan < loceol && hardcount < max) {
5269 scan += UTF8SKIP(scan);
5279 case EXACT: /* length of string is 1 */
5281 while (scan < loceol && UCHARAT(scan) == c)
5284 case EXACTF: /* length of string is 1 */
5286 while (scan < loceol &&
5287 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5290 case EXACTFL: /* length of string is 1 */
5291 PL_reg_flags |= RF_tainted;
5293 while (scan < loceol &&
5294 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5300 while (hardcount < max && scan < loceol &&
5301 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5302 scan += UTF8SKIP(scan);
5306 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5313 LOAD_UTF8_CHARCLASS_ALNUM();
5314 while (hardcount < max && scan < loceol &&
5315 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5316 scan += UTF8SKIP(scan);
5320 while (scan < loceol && isALNUM(*scan))
5325 PL_reg_flags |= RF_tainted;
5328 while (hardcount < max && scan < loceol &&
5329 isALNUM_LC_utf8((U8*)scan)) {
5330 scan += UTF8SKIP(scan);
5334 while (scan < loceol && isALNUM_LC(*scan))
5341 LOAD_UTF8_CHARCLASS_ALNUM();
5342 while (hardcount < max && scan < loceol &&
5343 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5344 scan += UTF8SKIP(scan);
5348 while (scan < loceol && !isALNUM(*scan))
5353 PL_reg_flags |= RF_tainted;
5356 while (hardcount < max && scan < loceol &&
5357 !isALNUM_LC_utf8((U8*)scan)) {
5358 scan += UTF8SKIP(scan);
5362 while (scan < loceol && !isALNUM_LC(*scan))
5369 LOAD_UTF8_CHARCLASS_SPACE();
5370 while (hardcount < max && scan < loceol &&
5372 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5373 scan += UTF8SKIP(scan);
5377 while (scan < loceol && isSPACE(*scan))
5382 PL_reg_flags |= RF_tainted;
5385 while (hardcount < max && scan < loceol &&
5386 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5387 scan += UTF8SKIP(scan);
5391 while (scan < loceol && isSPACE_LC(*scan))
5398 LOAD_UTF8_CHARCLASS_SPACE();
5399 while (hardcount < max && scan < loceol &&
5401 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5402 scan += UTF8SKIP(scan);
5406 while (scan < loceol && !isSPACE(*scan))
5411 PL_reg_flags |= RF_tainted;
5414 while (hardcount < max && scan < loceol &&
5415 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5416 scan += UTF8SKIP(scan);
5420 while (scan < loceol && !isSPACE_LC(*scan))
5427 LOAD_UTF8_CHARCLASS_DIGIT();
5428 while (hardcount < max && scan < loceol &&
5429 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5430 scan += UTF8SKIP(scan);
5434 while (scan < loceol && isDIGIT(*scan))
5441 LOAD_UTF8_CHARCLASS_DIGIT();
5442 while (hardcount < max && scan < loceol &&
5443 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5444 scan += UTF8SKIP(scan);
5448 while (scan < loceol && !isDIGIT(*scan))
5454 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5460 LNBREAK can match two latin chars, which is ok,
5461 because we have a null terminated string, but we
5462 have to use hardcount in this situation
5464 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5473 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5478 while (scan < loceol && is_HORIZWS_latin1(scan))
5485 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5486 scan += UTF8SKIP(scan);
5490 while (scan < loceol && !is_HORIZWS_latin1(scan))
5498 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5503 while (scan < loceol && is_VERTWS_latin1(scan))
5511 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5512 scan += UTF8SKIP(scan);
5516 while (scan < loceol && !is_VERTWS_latin1(scan))
5522 default: /* Called on something of 0 width. */
5523 break; /* So match right here or not at all. */
5529 c = scan - PL_reginput;
5533 GET_RE_DEBUG_FLAGS_DECL;
5535 SV * const prop = sv_newmortal();
5536 regprop(prog, prop, p);
5537 PerlIO_printf(Perl_debug_log,
5538 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5539 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5547 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5549 - regclass_swash - prepare the utf8 swash
5553 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5559 RXi_GET_DECL(prog,progi);
5560 const struct reg_data * const data = prog ? progi->data : NULL;
5562 if (data && data->count) {
5563 const U32 n = ARG(node);
5565 if (data->what[n] == 's') {
5566 SV * const rv = (SV*)data->data[n];
5567 AV * const av = (AV*)SvRV((SV*)rv);
5568 SV **const ary = AvARRAY(av);
5571 /* See the end of regcomp.c:S_regclass() for
5572 * documentation of these array elements. */
5575 a = SvROK(ary[1]) ? &ary[1] : NULL;
5576 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5580 else if (si && doinit) {
5581 sw = swash_init("utf8", "", si, 1, 0);
5582 (void)av_store(av, 1, sw);
5599 - reginclass - determine if a character falls into a character class
5601 The n is the ANYOF regnode, the p is the target string, lenp
5602 is pointer to the maximum length of how far to go in the p
5603 (if the lenp is zero, UTF8SKIP(p) is used),
5604 do_utf8 tells whether the target string is in UTF-8.
5609 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5612 const char flags = ANYOF_FLAGS(n);
5618 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5619 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5620 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5621 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5622 if (len == (STRLEN)-1)
5623 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5626 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5627 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5630 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5631 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5634 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5638 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5641 if (swash_fetch(sw, p, do_utf8))
5643 else if (flags & ANYOF_FOLD) {
5644 if (!match && lenp && av) {
5646 for (i = 0; i <= av_len(av); i++) {
5647 SV* const sv = *av_fetch(av, i, FALSE);
5649 const char * const s = SvPV_const(sv, len);
5651 if (len <= plen && memEQ(s, (char*)p, len)) {
5659 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5662 to_utf8_fold(p, tmpbuf, &tmplen);
5663 if (swash_fetch(sw, tmpbuf, do_utf8))
5669 if (match && lenp && *lenp == 0)
5670 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5672 if (!match && c < 256) {
5673 if (ANYOF_BITMAP_TEST(n, c))
5675 else if (flags & ANYOF_FOLD) {
5678 if (flags & ANYOF_LOCALE) {
5679 PL_reg_flags |= RF_tainted;
5680 f = PL_fold_locale[c];
5684 if (f != c && ANYOF_BITMAP_TEST(n, f))
5688 if (!match && (flags & ANYOF_CLASS)) {
5689 PL_reg_flags |= RF_tainted;
5691 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5692 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5693 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5694 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5695 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5696 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5697 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5698 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5699 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5700 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5701 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5702 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5703 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5704 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5705 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5706 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5707 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5708 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5709 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5710 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5711 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5712 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5713 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5714 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5715 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5716 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5717 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5718 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5719 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5720 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5721 ) /* How's that for a conditional? */
5728 return (flags & ANYOF_INVERT) ? !match : match;
5732 S_reghop3(U8 *s, I32 off, const U8* lim)
5736 while (off-- && s < lim) {
5737 /* XXX could check well-formedness here */
5742 while (off++ && s > lim) {
5744 if (UTF8_IS_CONTINUED(*s)) {
5745 while (s > lim && UTF8_IS_CONTINUATION(*s))
5748 /* XXX could check well-formedness here */
5755 /* there are a bunch of places where we use two reghop3's that should
5756 be replaced with this routine. but since thats not done yet
5757 we ifdef it out - dmq
5760 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5764 while (off-- && s < rlim) {
5765 /* XXX could check well-formedness here */
5770 while (off++ && s > llim) {
5772 if (UTF8_IS_CONTINUED(*s)) {
5773 while (s > llim && UTF8_IS_CONTINUATION(*s))
5776 /* XXX could check well-formedness here */
5784 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5788 while (off-- && s < lim) {
5789 /* XXX could check well-formedness here */
5796 while (off++ && s > lim) {
5798 if (UTF8_IS_CONTINUED(*s)) {
5799 while (s > lim && UTF8_IS_CONTINUATION(*s))
5802 /* XXX could check well-formedness here */
5811 restore_pos(pTHX_ void *arg)
5814 regexp * const rex = (regexp *)arg;
5815 if (PL_reg_eval_set) {
5816 if (PL_reg_oldsaved) {
5817 rex->subbeg = PL_reg_oldsaved;
5818 rex->sublen = PL_reg_oldsavedlen;
5819 #ifdef PERL_OLD_COPY_ON_WRITE
5820 rex->saved_copy = PL_nrs;
5822 RX_MATCH_COPIED_on(rex);
5824 PL_reg_magic->mg_len = PL_reg_oldpos;
5825 PL_reg_eval_set = 0;
5826 PL_curpm = PL_reg_oldcurpm;
5831 S_to_utf8_substr(pTHX_ register regexp *prog)
5835 if (prog->substrs->data[i].substr
5836 && !prog->substrs->data[i].utf8_substr) {
5837 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5838 prog->substrs->data[i].utf8_substr = sv;
5839 sv_utf8_upgrade(sv);
5840 if (SvVALID(prog->substrs->data[i].substr)) {
5841 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5842 if (flags & FBMcf_TAIL) {
5843 /* Trim the trailing \n that fbm_compile added last
5845 SvCUR_set(sv, SvCUR(sv) - 1);
5846 /* Whilst this makes the SV technically "invalid" (as its
5847 buffer is no longer followed by "\0") when fbm_compile()
5848 adds the "\n" back, a "\0" is restored. */
5850 fbm_compile(sv, flags);
5852 if (prog->substrs->data[i].substr == prog->check_substr)
5853 prog->check_utf8 = sv;
5859 S_to_byte_substr(pTHX_ register regexp *prog)
5864 if (prog->substrs->data[i].utf8_substr
5865 && !prog->substrs->data[i].substr) {
5866 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5867 if (sv_utf8_downgrade(sv, TRUE)) {
5868 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5870 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5871 if (flags & FBMcf_TAIL) {
5872 /* Trim the trailing \n that fbm_compile added last
5874 SvCUR_set(sv, SvCUR(sv) - 1);
5876 fbm_compile(sv, flags);
5882 prog->substrs->data[i].substr = sv;
5883 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5884 prog->check_substr = sv;
5891 * c-indentation-style: bsd
5893 * indent-tabs-mode: t
5896 * ex: set ts=8 sts=4 sw=4 noet: