5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126 #define JUMPABLE(rn) ( \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
135 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
137 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
140 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
147 /* ... so we use this as its faster. */
148 #define IS_TEXT(rn) ( OP(rn)==EXACT )
149 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
158 #define FIND_NEXT_IMPT(rn) STMT_START { \
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
162 rn = NEXTOPER(NEXTOPER(rn)); \
163 else if (type == PLUS) \
165 else if (type == IFMATCH) \
166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
167 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
178 const int retval = PL_savestack_ix;
179 #define REGCP_PAREN_ELEMS 4
180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 GET_RE_DEBUG_FLAGS_DECL;
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
187 #define REGCP_OTHER_ELEMS 7
188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
190 for (p = PL_regsize; p > parenfloor; p--) {
191 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
192 SSPUSHINT(PL_regoffs[p].end);
193 SSPUSHINT(PL_regoffs[p].start);
194 SSPUSHPTR(PL_reg_start_tmp[p]);
196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regoffs[p].start,
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
200 (IV)PL_regoffs[p].end
203 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
204 SSPUSHPTR(PL_regoffs);
205 SSPUSHINT(PL_regsize);
206 SSPUSHINT(*PL_reglastparen);
207 SSPUSHINT(*PL_reglastcloseparen);
208 SSPUSHPTR(PL_reginput);
209 #define REGCP_FRAME_ELEMS 2
210 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
211 * are needed for the regexp context stack bookkeeping. */
212 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
213 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
218 /* These are needed since we do not localize EVAL nodes: */
219 #define REGCP_SET(cp) \
221 PerlIO_printf(Perl_debug_log, \
222 " Setting an EVAL scope, savestack=%"IVdf"\n", \
223 (IV)PL_savestack_ix)); \
226 #define REGCP_UNWIND(cp) \
228 if (cp != PL_savestack_ix) \
229 PerlIO_printf(Perl_debug_log, \
230 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
231 (IV)(cp), (IV)PL_savestack_ix)); \
235 S_regcppop(pTHX_ const regexp *rex)
241 GET_RE_DEBUG_FLAGS_DECL;
243 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
245 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
246 i = SSPOPINT; /* Parentheses elements to pop. */
247 input = (char *) SSPOPPTR;
248 *PL_reglastcloseparen = SSPOPINT;
249 *PL_reglastparen = SSPOPINT;
250 PL_regsize = SSPOPINT;
251 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
254 /* Now restore the parentheses context. */
255 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
256 i > 0; i -= REGCP_PAREN_ELEMS) {
258 U32 paren = (U32)SSPOPINT;
259 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
260 PL_regoffs[paren].start = SSPOPINT;
262 if (paren <= *PL_reglastparen)
263 PL_regoffs[paren].end = tmps;
265 PerlIO_printf(Perl_debug_log,
266 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
267 (UV)paren, (IV)PL_regoffs[paren].start,
268 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
269 (IV)PL_regoffs[paren].end,
270 (paren > *PL_reglastparen ? "(no)" : ""));
274 if (*PL_reglastparen + 1 <= rex->nparens) {
275 PerlIO_printf(Perl_debug_log,
276 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
277 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
281 /* It would seem that the similar code in regtry()
282 * already takes care of this, and in fact it is in
283 * a better location to since this code can #if 0-ed out
284 * but the code in regtry() is needed or otherwise tests
285 * requiring null fields (pat.t#187 and split.t#{13,14}
286 * (as of patchlevel 7877) will fail. Then again,
287 * this code seems to be necessary or otherwise
288 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
289 * --jhi updated by dapm */
290 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
292 PL_regoffs[i].start = -1;
293 PL_regoffs[i].end = -1;
299 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
302 * pregexec and friends
305 #ifndef PERL_IN_XSUB_RE
307 - pregexec - match a regexp against a string
310 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
311 char *strbeg, I32 minend, SV *screamer, U32 nosave)
312 /* strend: pointer to null at end of string */
313 /* strbeg: real beginning of string */
314 /* minend: end of match must be >=minend after stringarg. */
315 /* nosave: For optimizations. */
318 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
319 nosave ? 0 : REXEC_COPY_STR);
324 * Need to implement the following flags for reg_anch:
326 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
328 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
329 * INTUIT_AUTORITATIVE_ML
330 * INTUIT_ONCE_NOML - Intuit can match in one location only.
333 * Another flag for this function: SECOND_TIME (so that float substrs
334 * with giant delta may be not rechecked).
337 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
339 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
340 Otherwise, only SvCUR(sv) is used to get strbeg. */
342 /* XXXX We assume that strpos is strbeg unless sv. */
344 /* XXXX Some places assume that there is a fixed substring.
345 An update may be needed if optimizer marks as "INTUITable"
346 RExen without fixed substrings. Similarly, it is assumed that
347 lengths of all the strings are no more than minlen, thus they
348 cannot come from lookahead.
349 (Or minlen should take into account lookahead.)
350 NOTE: Some of this comment is not correct. minlen does now take account
351 of lookahead/behind. Further research is required. -- demerphq
355 /* A failure to find a constant substring means that there is no need to make
356 an expensive call to REx engine, thus we celebrate a failure. Similarly,
357 finding a substring too deep into the string means that less calls to
358 regtry() should be needed.
360 REx compiler's optimizer found 4 possible hints:
361 a) Anchored substring;
363 c) Whether we are anchored (beginning-of-line or \G);
364 d) First node (of those at offset 0) which may distingush positions;
365 We use a)b)d) and multiline-part of c), and try to find a position in the
366 string which does not contradict any of them.
369 /* Most of decisions we do here should have been done at compile time.
370 The nodes of the REx which we used for the search should have been
371 deleted from the finite automaton. */
374 Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
375 char *strend, const U32 flags, re_scream_pos_data *data)
378 register I32 start_shift = 0;
379 /* Should be nonnegative! */
380 register I32 end_shift = 0;
385 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
387 register char *other_last = NULL; /* other substr checked before this */
388 char *check_at = NULL; /* check substr found at this pos */
389 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
390 RXi_GET_DECL(prog,progi);
392 const char * const i_strpos = strpos;
395 GET_RE_DEBUG_FLAGS_DECL;
397 RX_MATCH_UTF8_set(prog,do_utf8);
399 if (prog->extflags & RXf_UTF8) {
400 PL_reg_flags |= RF_utf8;
403 debug_start_match(prog, do_utf8, strpos, strend,
404 sv ? "Guessing start of match in sv for"
405 : "Guessing start of match in string for");
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
411 "String too short... [re_intuit_start]\n"));
415 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
418 if (!prog->check_utf8 && prog->check_substr)
419 to_utf8_substr(prog);
420 check = prog->check_utf8;
422 if (!prog->check_substr && prog->check_utf8)
423 to_byte_substr(prog);
424 check = prog->check_substr;
426 if (check == &PL_sv_undef) {
427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
428 "Non-utf8 string cannot match utf8 check string\n"));
431 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
432 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
433 || ( (prog->extflags & RXf_ANCH_BOL)
434 && !multiline ) ); /* Check after \n? */
437 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
438 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
439 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
441 && (strpos != strbeg)) {
442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
445 if (prog->check_offset_min == prog->check_offset_max &&
446 !(prog->extflags & RXf_CANY_SEEN)) {
447 /* Substring at constant offset from beg-of-str... */
450 s = HOP3c(strpos, prog->check_offset_min, strend);
453 slen = SvCUR(check); /* >= 1 */
455 if ( strend - s > slen || strend - s < slen - 1
456 || (strend - s == slen && strend[-1] != '\n')) {
457 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
460 /* Now should match s[0..slen-2] */
462 if (slen && (*SvPVX_const(check) != *s
464 && memNE(SvPVX_const(check), s, slen)))) {
466 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
470 else if (*SvPVX_const(check) != *s
471 || ((slen = SvCUR(check)) > 1
472 && memNE(SvPVX_const(check), s, slen)))
475 goto success_at_start;
478 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
480 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
481 end_shift = prog->check_end_shift;
484 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
485 - (SvTAIL(check) != 0);
486 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
488 if (end_shift < eshift)
492 else { /* Can match at random position */
495 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
496 end_shift = prog->check_end_shift;
498 /* end shift should be non negative here */
501 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
503 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
504 (IV)end_shift, prog->precomp);
508 /* Find a possible match in the region s..strend by looking for
509 the "check" substring in the region corrected by start/end_shift. */
512 I32 srch_start_shift = start_shift;
513 I32 srch_end_shift = end_shift;
514 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
515 srch_end_shift -= ((strbeg - s) - srch_start_shift);
516 srch_start_shift = strbeg - s;
518 DEBUG_OPTIMISE_MORE_r({
519 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
520 (IV)prog->check_offset_min,
521 (IV)srch_start_shift,
523 (IV)prog->check_end_shift);
526 if (flags & REXEC_SCREAM) {
527 I32 p = -1; /* Internal iterator of scream. */
528 I32 * const pp = data ? data->scream_pos : &p;
530 if (PL_screamfirst[BmRARE(check)] >= 0
531 || ( BmRARE(check) == '\n'
532 && (BmPREVIOUS(check) == SvCUR(check) - 1)
534 s = screaminstr(sv, check,
535 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
538 /* we may be pointing at the wrong string */
539 if (s && RX_MATCH_COPIED(prog))
540 s = strbeg + (s - SvPVX_const(sv));
542 *data->scream_olds = s;
547 if (prog->extflags & RXf_CANY_SEEN) {
548 start_point= (U8*)(s + srch_start_shift);
549 end_point= (U8*)(strend - srch_end_shift);
551 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
552 end_point= HOP3(strend, -srch_end_shift, strbeg);
554 DEBUG_OPTIMISE_MORE_r({
555 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
556 (int)(end_point - start_point),
557 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
561 s = fbm_instr( start_point, end_point,
562 check, multiline ? FBMrf_MULTILINE : 0);
565 /* Update the count-of-usability, remove useless subpatterns,
569 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
570 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
571 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
572 (s ? "Found" : "Did not find"),
573 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
574 ? "anchored" : "floating"),
577 (s ? " at offset " : "...\n") );
582 /* Finish the diagnostic message */
583 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
585 /* XXX dmq: first branch is for positive lookbehind...
586 Our check string is offset from the beginning of the pattern.
587 So we need to do any stclass tests offset forward from that
596 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
597 Start with the other substr.
598 XXXX no SCREAM optimization yet - and a very coarse implementation
599 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
600 *always* match. Probably should be marked during compile...
601 Probably it is right to do no SCREAM here...
604 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
605 : (prog->float_substr && prog->anchored_substr))
607 /* Take into account the "other" substring. */
608 /* XXXX May be hopelessly wrong for UTF... */
611 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
614 char * const last = HOP3c(s, -start_shift, strbeg);
616 char * const saved_s = s;
619 t = s - prog->check_offset_max;
620 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
622 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
627 t = HOP3c(t, prog->anchored_offset, strend);
628 if (t < other_last) /* These positions already checked */
630 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
633 /* XXXX It is not documented what units *_offsets are in.
634 We assume bytes, but this is clearly wrong.
635 Meaning this code needs to be carefully reviewed for errors.
639 /* On end-of-str: see comment below. */
640 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
641 if (must == &PL_sv_undef) {
643 DEBUG_r(must = prog->anchored_utf8); /* for debug */
648 HOP3(HOP3(last1, prog->anchored_offset, strend)
649 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
651 multiline ? FBMrf_MULTILINE : 0
654 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
655 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
656 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
657 (s ? "Found" : "Contradicts"),
658 quoted, RE_SV_TAIL(must));
663 if (last1 >= last2) {
664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
665 ", giving up...\n"));
668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
669 ", trying floating at offset %ld...\n",
670 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
671 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
672 s = HOP3c(last, 1, strend);
676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
677 (long)(s - i_strpos)));
678 t = HOP3c(s, -prog->anchored_offset, strbeg);
679 other_last = HOP3c(s, 1, strend);
687 else { /* Take into account the floating substring. */
689 char * const saved_s = s;
692 t = HOP3c(s, -start_shift, strbeg);
694 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
695 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
696 last = HOP3c(t, prog->float_max_offset, strend);
697 s = HOP3c(t, prog->float_min_offset, strend);
700 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
701 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
702 /* fbm_instr() takes into account exact value of end-of-str
703 if the check is SvTAIL(ed). Since false positives are OK,
704 and end-of-str is not later than strend we are OK. */
705 if (must == &PL_sv_undef) {
707 DEBUG_r(must = prog->float_utf8); /* for debug message */
710 s = fbm_instr((unsigned char*)s,
711 (unsigned char*)last + SvCUR(must)
713 must, multiline ? FBMrf_MULTILINE : 0);
715 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
716 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
717 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
718 (s ? "Found" : "Contradicts"),
719 quoted, RE_SV_TAIL(must));
723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
724 ", giving up...\n"));
727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
728 ", trying anchored starting at offset %ld...\n",
729 (long)(saved_s + 1 - i_strpos)));
731 s = HOP3c(t, 1, strend);
735 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
736 (long)(s - i_strpos)));
737 other_last = s; /* Fix this later. --Hugo */
747 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
749 DEBUG_OPTIMISE_MORE_r(
750 PerlIO_printf(Perl_debug_log,
751 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
752 (IV)prog->check_offset_min,
753 (IV)prog->check_offset_max,
761 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
763 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
766 /* Fixed substring is found far enough so that the match
767 cannot start at strpos. */
769 if (ml_anch && t[-1] != '\n') {
770 /* Eventually fbm_*() should handle this, but often
771 anchored_offset is not 0, so this check will not be wasted. */
772 /* XXXX In the code below we prefer to look for "^" even in
773 presence of anchored substrings. And we search even
774 beyond the found float position. These pessimizations
775 are historical artefacts only. */
777 while (t < strend - prog->minlen) {
779 if (t < check_at - prog->check_offset_min) {
780 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
781 /* Since we moved from the found position,
782 we definitely contradict the found anchored
783 substr. Due to the above check we do not
784 contradict "check" substr.
785 Thus we can arrive here only if check substr
786 is float. Redo checking for "other"=="fixed".
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
790 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
791 goto do_other_anchored;
793 /* We don't contradict the found floating substring. */
794 /* XXXX Why not check for STCLASS? */
796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
797 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
800 /* Position contradicts check-string */
801 /* XXXX probably better to look for check-string
802 than for "\n", so one should lower the limit for t? */
803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
804 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
805 other_last = strpos = s = t + 1;
810 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
811 PL_colors[0], PL_colors[1]));
815 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
816 PL_colors[0], PL_colors[1]));
820 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
823 /* The found string does not prohibit matching at strpos,
824 - no optimization of calling REx engine can be performed,
825 unless it was an MBOL and we are not after MBOL,
826 or a future STCLASS check will fail this. */
828 /* Even in this situation we may use MBOL flag if strpos is offset
829 wrt the start of the string. */
830 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
831 && (strpos != strbeg) && strpos[-1] != '\n'
832 /* May be due to an implicit anchor of m{.*foo} */
833 && !(prog->intflags & PREGf_IMPLICIT))
838 DEBUG_EXECUTE_r( if (ml_anch)
839 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
840 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
843 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
845 prog->check_utf8 /* Could be deleted already */
846 && --BmUSEFUL(prog->check_utf8) < 0
847 && (prog->check_utf8 == prog->float_utf8)
849 prog->check_substr /* Could be deleted already */
850 && --BmUSEFUL(prog->check_substr) < 0
851 && (prog->check_substr == prog->float_substr)
854 /* If flags & SOMETHING - do not do it many times on the same match */
855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
856 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
857 if (do_utf8 ? prog->check_substr : prog->check_utf8)
858 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
859 prog->check_substr = prog->check_utf8 = NULL; /* disable */
860 prog->float_substr = prog->float_utf8 = NULL; /* clear */
861 check = NULL; /* abort */
863 /* XXXX This is a remnant of the old implementation. It
864 looks wasteful, since now INTUIT can use many
866 prog->extflags &= ~RXf_USE_INTUIT;
873 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
874 /* trie stclasses are too expensive to use here, we are better off to
875 leave it to regmatch itself */
876 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
877 /* minlen == 0 is possible if regstclass is \b or \B,
878 and the fixed substr is ''$.
879 Since minlen is already taken into account, s+1 is before strend;
880 accidentally, minlen >= 1 guaranties no false positives at s + 1
881 even for \b or \B. But (minlen? 1 : 0) below assumes that
882 regstclass does not come from lookahead... */
883 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
884 This leaves EXACTF only, which is dealt with in find_byclass(). */
885 const U8* const str = (U8*)STRING(progi->regstclass);
886 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
887 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
890 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
891 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
892 else if (prog->float_substr || prog->float_utf8)
893 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
898 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
901 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
904 const char *what = NULL;
906 if (endpos == strend) {
907 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
908 "Could not match STCLASS...\n") );
911 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
912 "This position contradicts STCLASS...\n") );
913 if ((prog->extflags & RXf_ANCH) && !ml_anch)
915 /* Contradict one of substrings */
916 if (prog->anchored_substr || prog->anchored_utf8) {
917 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
918 DEBUG_EXECUTE_r( what = "anchored" );
920 s = HOP3c(t, 1, strend);
921 if (s + start_shift + end_shift > strend) {
922 /* XXXX Should be taken into account earlier? */
923 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
924 "Could not match STCLASS...\n") );
929 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
930 "Looking for %s substr starting at offset %ld...\n",
931 what, (long)(s + start_shift - i_strpos)) );
934 /* Have both, check_string is floating */
935 if (t + start_shift >= check_at) /* Contradicts floating=check */
936 goto retry_floating_check;
937 /* Recheck anchored substring, but not floating... */
941 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
942 "Looking for anchored substr starting at offset %ld...\n",
943 (long)(other_last - i_strpos)) );
944 goto do_other_anchored;
946 /* Another way we could have checked stclass at the
947 current position only: */
952 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
953 "Looking for /%s^%s/m starting at offset %ld...\n",
954 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
957 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
959 /* Check is floating subtring. */
960 retry_floating_check:
961 t = check_at - start_shift;
962 DEBUG_EXECUTE_r( what = "floating" );
963 goto hop_and_restart;
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
967 "By STCLASS: moving %ld --> %ld\n",
968 (long)(t - i_strpos), (long)(s - i_strpos))
972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
973 "Does not contradict STCLASS...\n");
978 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
979 PL_colors[4], (check ? "Guessed" : "Giving up"),
980 PL_colors[5], (long)(s - i_strpos)) );
983 fail_finish: /* Substring not found */
984 if (prog->check_substr || prog->check_utf8) /* could be removed already */
985 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
988 PL_colors[4], PL_colors[5]));
994 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
995 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
996 switch (trie_type) { \
997 case trie_utf8_fold: \
999 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1004 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1005 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1006 foldlen -= UNISKIP( uvc ); \
1007 uscan = foldbuf + UNISKIP( uvc ); \
1011 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1019 charid = trie->charmap[ uvc ]; \
1023 if (widecharmap) { \
1024 SV** const svpp = hv_fetch(widecharmap, \
1025 (char*)&uvc, sizeof(UV), 0); \
1027 charid = (U16)SvIV(*svpp); \
1032 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1035 ibcmp_utf8(s, NULL, 0, do_utf8, \
1036 m, NULL, ln, (bool)UTF)) \
1037 && (!reginfo || regtry(reginfo, &s)) ) \
1040 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1041 uvchr_to_utf8(tmpbuf, c); \
1042 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1044 && (f == c1 || f == c2) \
1045 && (ln == foldlen || \
1046 !ibcmp_utf8((char *) foldbuf, \
1047 NULL, foldlen, do_utf8, \
1049 NULL, ln, (bool)UTF)) \
1050 && (!reginfo || regtry(reginfo, &s)) ) \
1055 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1059 && (ln == 1 || !(OP(c) == EXACTF \
1061 : ibcmp_locale(s, m, ln))) \
1062 && (!reginfo || regtry(reginfo, &s)) ) \
1068 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1070 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1076 #define REXEC_FBC_SCAN(CoDe) \
1078 while (s < strend) { \
1084 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1085 REXEC_FBC_UTF8_SCAN( \
1087 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1096 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1099 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1108 #define REXEC_FBC_TRYIT \
1109 if ((!reginfo || regtry(reginfo, &s))) \
1112 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1114 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1117 REXEC_FBC_CLASS_SCAN(CoNd); \
1121 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1124 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1127 REXEC_FBC_CLASS_SCAN(CoNd); \
1131 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1132 PL_reg_flags |= RF_tainted; \
1134 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1137 REXEC_FBC_CLASS_SCAN(CoNd); \
1141 #define DUMP_EXEC_POS(li,s,doutf8) \
1142 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1144 /* We know what class REx starts with. Try to find this position... */
1145 /* if reginfo is NULL, its a dryrun */
1146 /* annoyingly all the vars in this routine have different names from their counterparts
1147 in regmatch. /grrr */
1150 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1151 const char *strend, regmatch_info *reginfo)
1154 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1158 register STRLEN uskip;
1162 register I32 tmp = 1; /* Scratch variable? */
1163 register const bool do_utf8 = PL_reg_match_utf8;
1164 RXi_GET_DECL(prog,progi);
1166 /* We know what class it must start with. */
1170 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1171 !UTF8_IS_INVARIANT((U8)s[0]) ?
1172 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1173 REGINCLASS(prog, c, (U8*)s));
1176 while (s < strend) {
1179 if (REGINCLASS(prog, c, (U8*)s) ||
1180 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1181 /* The assignment of 2 is intentional:
1182 * for the folded sharp s, the skip is 2. */
1183 (skip = SHARP_S_SKIP))) {
1184 if (tmp && (!reginfo || regtry(reginfo, &s)))
1197 if (tmp && (!reginfo || regtry(reginfo, &s)))
1205 ln = STR_LEN(c); /* length to match in octets/bytes */
1206 lnc = (I32) ln; /* length to match in characters */
1208 STRLEN ulen1, ulen2;
1210 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1211 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1212 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1214 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1215 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1217 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1219 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1222 while (sm < ((U8 *) m + ln)) {
1237 c2 = PL_fold_locale[c1];
1239 e = HOP3c(strend, -((I32)lnc), s);
1241 if (!reginfo && e < s)
1242 e = s; /* Due to minlen logic of intuit() */
1244 /* The idea in the EXACTF* cases is to first find the
1245 * first character of the EXACTF* node and then, if
1246 * necessary, case-insensitively compare the full
1247 * text of the node. The c1 and c2 are the first
1248 * characters (though in Unicode it gets a bit
1249 * more complicated because there are more cases
1250 * than just upper and lower: one needs to use
1251 * the so-called folding case for case-insensitive
1252 * matching (called "loose matching" in Unicode).
1253 * ibcmp_utf8() will do just that. */
1257 U8 tmpbuf [UTF8_MAXBYTES+1];
1258 STRLEN len, foldlen;
1259 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1261 /* Upper and lower of 1st char are equal -
1262 * probably not a "letter". */
1264 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1266 REXEC_FBC_EXACTISH_CHECK(c == c1);
1271 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1274 /* Handle some of the three Greek sigmas cases.
1275 * Note that not all the possible combinations
1276 * are handled here: some of them are handled
1277 * by the standard folding rules, and some of
1278 * them (the character class or ANYOF cases)
1279 * are handled during compiletime in
1280 * regexec.c:S_regclass(). */
1281 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1282 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1283 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1285 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1291 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1293 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1297 PL_reg_flags |= RF_tainted;
1304 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1305 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1307 tmp = ((OP(c) == BOUND ?
1308 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1309 LOAD_UTF8_CHARCLASS_ALNUM();
1310 REXEC_FBC_UTF8_SCAN(
1311 if (tmp == !(OP(c) == BOUND ?
1312 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1313 isALNUM_LC_utf8((U8*)s)))
1321 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1322 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1325 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1331 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1335 PL_reg_flags |= RF_tainted;
1342 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1343 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1345 tmp = ((OP(c) == NBOUND ?
1346 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1347 LOAD_UTF8_CHARCLASS_ALNUM();
1348 REXEC_FBC_UTF8_SCAN(
1349 if (tmp == !(OP(c) == NBOUND ?
1350 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1351 isALNUM_LC_utf8((U8*)s)))
1353 else REXEC_FBC_TRYIT;
1357 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1358 tmp = ((OP(c) == NBOUND ?
1359 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1362 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1364 else REXEC_FBC_TRYIT;
1367 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1371 REXEC_FBC_CSCAN_PRELOAD(
1372 LOAD_UTF8_CHARCLASS_ALNUM(),
1373 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1377 REXEC_FBC_CSCAN_TAINT(
1378 isALNUM_LC_utf8((U8*)s),
1382 REXEC_FBC_CSCAN_PRELOAD(
1383 LOAD_UTF8_CHARCLASS_ALNUM(),
1384 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1388 REXEC_FBC_CSCAN_TAINT(
1389 !isALNUM_LC_utf8((U8*)s),
1393 REXEC_FBC_CSCAN_PRELOAD(
1394 LOAD_UTF8_CHARCLASS_SPACE(),
1395 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1399 REXEC_FBC_CSCAN_TAINT(
1400 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1404 REXEC_FBC_CSCAN_PRELOAD(
1405 LOAD_UTF8_CHARCLASS_SPACE(),
1406 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1410 REXEC_FBC_CSCAN_TAINT(
1411 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1415 REXEC_FBC_CSCAN_PRELOAD(
1416 LOAD_UTF8_CHARCLASS_DIGIT(),
1417 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1421 REXEC_FBC_CSCAN_TAINT(
1422 isDIGIT_LC_utf8((U8*)s),
1426 REXEC_FBC_CSCAN_PRELOAD(
1427 LOAD_UTF8_CHARCLASS_DIGIT(),
1428 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1432 REXEC_FBC_CSCAN_TAINT(
1433 !isDIGIT_LC_utf8((U8*)s),
1439 is_LNBREAK_latin1(s)
1449 !is_VERTWS_latin1(s)
1454 is_HORIZWS_latin1(s)
1458 !is_HORIZWS_utf8(s),
1459 !is_HORIZWS_latin1(s)
1464 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1465 trie_type = do_utf8 ?
1466 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1468 /* what trie are we using right now */
1470 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1472 = (reg_trie_data*)progi->data->data[ aho->trie ];
1473 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
1475 const char *last_start = strend - trie->minlen;
1477 const char *real_start = s;
1479 STRLEN maxlen = trie->maxlen;
1481 U8 **points; /* map of where we were in the input string
1482 when reading a given char. For ASCII this
1483 is unnecessary overhead as the relationship
1484 is always 1:1, but for Unicode, especially
1485 case folded Unicode this is not true. */
1486 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1490 GET_RE_DEBUG_FLAGS_DECL;
1492 /* We can't just allocate points here. We need to wrap it in
1493 * an SV so it gets freed properly if there is a croak while
1494 * running the match */
1497 sv_points=newSV(maxlen * sizeof(U8 *));
1498 SvCUR_set(sv_points,
1499 maxlen * sizeof(U8 *));
1500 SvPOK_on(sv_points);
1501 sv_2mortal(sv_points);
1502 points=(U8**)SvPV_nolen(sv_points );
1503 if ( trie_type != trie_utf8_fold
1504 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1507 bitmap=(U8*)trie->bitmap;
1509 bitmap=(U8*)ANYOF_BITMAP(c);
1511 /* this is the Aho-Corasick algorithm modified a touch
1512 to include special handling for long "unknown char"
1513 sequences. The basic idea being that we use AC as long
1514 as we are dealing with a possible matching char, when
1515 we encounter an unknown char (and we have not encountered
1516 an accepting state) we scan forward until we find a legal
1518 AC matching is basically that of trie matching, except
1519 that when we encounter a failing transition, we fall back
1520 to the current states "fail state", and try the current char
1521 again, a process we repeat until we reach the root state,
1522 state 1, or a legal transition. If we fail on the root state
1523 then we can either terminate if we have reached an accepting
1524 state previously, or restart the entire process from the beginning
1528 while (s <= last_start) {
1529 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1537 U8 *uscan = (U8*)NULL;
1538 U8 *leftmost = NULL;
1540 U32 accepted_word= 0;
1544 while ( state && uc <= (U8*)strend ) {
1546 U32 word = aho->states[ state ].wordnum;
1550 DEBUG_TRIE_EXECUTE_r(
1551 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1552 dump_exec_pos( (char *)uc, c, strend, real_start,
1553 (char *)uc, do_utf8 );
1554 PerlIO_printf( Perl_debug_log,
1555 " Scanning for legal start char...\n");
1558 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1563 if (uc >(U8*)last_start) break;
1567 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1568 if (!leftmost || lpos < leftmost) {
1569 DEBUG_r(accepted_word=word);
1575 points[pointpos++ % maxlen]= uc;
1576 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1577 uscan, len, uvc, charid, foldlen,
1579 DEBUG_TRIE_EXECUTE_r({
1580 dump_exec_pos( (char *)uc, c, strend, real_start,
1582 PerlIO_printf(Perl_debug_log,
1583 " Charid:%3u CP:%4"UVxf" ",
1589 word = aho->states[ state ].wordnum;
1591 base = aho->states[ state ].trans.base;
1593 DEBUG_TRIE_EXECUTE_r({
1595 dump_exec_pos( (char *)uc, c, strend, real_start,
1597 PerlIO_printf( Perl_debug_log,
1598 "%sState: %4"UVxf", word=%"UVxf,
1599 failed ? " Fail transition to " : "",
1600 (UV)state, (UV)word);
1605 (base + charid > trie->uniquecharcount )
1606 && (base + charid - 1 - trie->uniquecharcount
1608 && trie->trans[base + charid - 1 -
1609 trie->uniquecharcount].check == state
1610 && (tmp=trie->trans[base + charid - 1 -
1611 trie->uniquecharcount ].next))
1613 DEBUG_TRIE_EXECUTE_r(
1614 PerlIO_printf( Perl_debug_log," - legal\n"));
1619 DEBUG_TRIE_EXECUTE_r(
1620 PerlIO_printf( Perl_debug_log," - fail\n"));
1622 state = aho->fail[state];
1626 /* we must be accepting here */
1627 DEBUG_TRIE_EXECUTE_r(
1628 PerlIO_printf( Perl_debug_log," - accepting\n"));
1637 if (!state) state = 1;
1640 if ( aho->states[ state ].wordnum ) {
1641 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1642 if (!leftmost || lpos < leftmost) {
1643 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1648 s = (char*)leftmost;
1649 DEBUG_TRIE_EXECUTE_r({
1651 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1652 (UV)accepted_word, (IV)(s - real_start)
1655 if (!reginfo || regtry(reginfo, &s)) {
1661 DEBUG_TRIE_EXECUTE_r({
1662 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1665 DEBUG_TRIE_EXECUTE_r(
1666 PerlIO_printf( Perl_debug_log,"No match.\n"));
1675 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1684 S_swap_match_buff (pTHX_ regexp *prog) {
1685 regexp_paren_pair *t;
1688 /* We have to be careful. If the previous successful match
1689 was from this regex we don't want a subsequent paritally
1690 successful match to clobber the old results.
1691 So when we detect this possibility we add a swap buffer
1692 to the re, and switch the buffer each match. If we fail
1693 we switch it back, otherwise we leave it swapped.
1695 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair);
1698 prog->swap = prog->offs;
1704 - regexec_flags - match a regexp against a string
1707 Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
1708 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1709 /* strend: pointer to null at end of string */
1710 /* strbeg: real beginning of string */
1711 /* minend: end of match must be >=minend after stringarg. */
1712 /* data: May be used for some additional optimizations.
1713 Currently its only used, with a U32 cast, for transmitting
1714 the ganch offset when doing a /g match. This will change */
1715 /* nosave: For optimizations. */
1718 /*register*/ char *s;
1719 register regnode *c;
1720 /*register*/ char *startpos = stringarg;
1721 I32 minlen; /* must match at least this many chars */
1722 I32 dontbother = 0; /* how many characters not to try at end */
1723 I32 end_shift = 0; /* Same for the end. */ /* CC */
1724 I32 scream_pos = -1; /* Internal iterator of scream. */
1725 char *scream_olds = NULL;
1726 const bool do_utf8 = (bool)DO_UTF8(sv);
1728 RXi_GET_DECL(prog,progi);
1729 regmatch_info reginfo; /* create some info to pass to regtry etc */
1730 bool swap_on_fail = 0;
1732 GET_RE_DEBUG_FLAGS_DECL;
1734 PERL_UNUSED_ARG(data);
1736 /* Be paranoid... */
1737 if (prog == NULL || startpos == NULL) {
1738 Perl_croak(aTHX_ "NULL regexp parameter");
1742 multiline = prog->extflags & RXf_PMf_MULTILINE;
1743 reginfo.prog = prog;
1745 RX_MATCH_UTF8_set(prog, do_utf8);
1747 debug_start_match(prog, do_utf8, startpos, strend,
1751 minlen = prog->minlen;
1753 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1754 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1755 "String too short [regexec_flags]...\n"));
1760 /* Check validity of program. */
1761 if (UCHARAT(progi->program) != REG_MAGIC) {
1762 Perl_croak(aTHX_ "corrupted regexp program");
1766 PL_reg_eval_set = 0;
1769 if (prog->extflags & RXf_UTF8)
1770 PL_reg_flags |= RF_utf8;
1772 /* Mark beginning of line for ^ and lookbehind. */
1773 reginfo.bol = startpos; /* XXX not used ??? */
1777 /* Mark end of line for $ (and such) */
1780 /* see how far we have to get to not match where we matched before */
1781 reginfo.till = startpos+minend;
1783 /* If there is a "must appear" string, look for it. */
1786 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1789 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1790 reginfo.ganch = startpos + prog->gofs;
1791 else if (sv && SvTYPE(sv) >= SVt_PVMG
1793 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1794 && mg->mg_len >= 0) {
1795 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1796 if (prog->extflags & RXf_ANCH_GPOS) {
1797 if (s > reginfo.ganch)
1799 s = reginfo.ganch - prog->gofs;
1803 reginfo.ganch = strbeg + PTR2UV(data);
1804 } else /* pos() not defined */
1805 reginfo.ganch = strbeg;
1807 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
1809 swap_match_buff(prog); /* do we need a save destructor here for
1812 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1813 re_scream_pos_data d;
1815 d.scream_olds = &scream_olds;
1816 d.scream_pos = &scream_pos;
1817 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1819 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1820 goto phooey; /* not present */
1826 /* Simplest case: anchored match need be tried only once. */
1827 /* [unless only anchor is BOL and multiline is set] */
1828 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1829 if (s == startpos && regtry(®info, &startpos))
1831 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1832 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1837 dontbother = minlen - 1;
1838 end = HOP3c(strend, -dontbother, strbeg) - 1;
1839 /* for multiline we only have to try after newlines */
1840 if (prog->check_substr || prog->check_utf8) {
1844 if (regtry(®info, &s))
1849 if (prog->extflags & RXf_USE_INTUIT) {
1850 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1861 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1862 if (regtry(®info, &s))
1869 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
1871 /* the warning about reginfo.ganch being used without intialization
1872 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
1873 and we only enter this block when the same bit is set. */
1874 char *tmp_s = reginfo.ganch - prog->gofs;
1875 if (regtry(®info, &tmp_s))
1880 /* Messy cases: unanchored match. */
1881 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
1882 /* we have /x+whatever/ */
1883 /* it must be a one character string (XXXX Except UTF?) */
1888 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1889 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1890 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1895 DEBUG_EXECUTE_r( did_match = 1 );
1896 if (regtry(®info, &s)) goto got_it;
1898 while (s < strend && *s == ch)
1906 DEBUG_EXECUTE_r( did_match = 1 );
1907 if (regtry(®info, &s)) goto got_it;
1909 while (s < strend && *s == ch)
1914 DEBUG_EXECUTE_r(if (!did_match)
1915 PerlIO_printf(Perl_debug_log,
1916 "Did not find anchored character...\n")
1919 else if (prog->anchored_substr != NULL
1920 || prog->anchored_utf8 != NULL
1921 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1922 && prog->float_max_offset < strend - s)) {
1927 char *last1; /* Last position checked before */
1931 if (prog->anchored_substr || prog->anchored_utf8) {
1932 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1933 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1934 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1935 back_max = back_min = prog->anchored_offset;
1937 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1938 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1939 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1940 back_max = prog->float_max_offset;
1941 back_min = prog->float_min_offset;
1945 if (must == &PL_sv_undef)
1946 /* could not downgrade utf8 check substring, so must fail */
1952 last = HOP3c(strend, /* Cannot start after this */
1953 -(I32)(CHR_SVLEN(must)
1954 - (SvTAIL(must) != 0) + back_min), strbeg);
1957 last1 = HOPc(s, -1);
1959 last1 = s - 1; /* bogus */
1961 /* XXXX check_substr already used to find "s", can optimize if
1962 check_substr==must. */
1964 dontbother = end_shift;
1965 strend = HOPc(strend, -dontbother);
1966 while ( (s <= last) &&
1967 ((flags & REXEC_SCREAM)
1968 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1969 end_shift, &scream_pos, 0))
1970 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1971 (unsigned char*)strend, must,
1972 multiline ? FBMrf_MULTILINE : 0))) ) {
1973 /* we may be pointing at the wrong string */
1974 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1975 s = strbeg + (s - SvPVX_const(sv));
1976 DEBUG_EXECUTE_r( did_match = 1 );
1977 if (HOPc(s, -back_max) > last1) {
1978 last1 = HOPc(s, -back_min);
1979 s = HOPc(s, -back_max);
1982 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1984 last1 = HOPc(s, -back_min);
1988 while (s <= last1) {
1989 if (regtry(®info, &s))
1995 while (s <= last1) {
1996 if (regtry(®info, &s))
2002 DEBUG_EXECUTE_r(if (!did_match) {
2003 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2004 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2005 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2006 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2007 ? "anchored" : "floating"),
2008 quoted, RE_SV_TAIL(must));
2012 else if ( (c = progi->regstclass) ) {
2014 const OPCODE op = OP(progi->regstclass);
2015 /* don't bother with what can't match */
2016 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2017 strend = HOPc(strend, -(minlen - 1));
2020 SV * const prop = sv_newmortal();
2021 regprop(prog, prop, c);
2023 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2025 PerlIO_printf(Perl_debug_log,
2026 "Matching stclass %.*s against %s (%d chars)\n",
2027 (int)SvCUR(prop), SvPVX_const(prop),
2028 quoted, (int)(strend - s));
2031 if (find_byclass(prog, c, s, strend, ®info))
2033 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2037 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2042 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2043 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2044 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2046 if (flags & REXEC_SCREAM) {
2047 last = screaminstr(sv, float_real, s - strbeg,
2048 end_shift, &scream_pos, 1); /* last one */
2050 last = scream_olds; /* Only one occurrence. */
2051 /* we may be pointing at the wrong string */
2052 else if (RX_MATCH_COPIED(prog))
2053 s = strbeg + (s - SvPVX_const(sv));
2057 const char * const little = SvPV_const(float_real, len);
2059 if (SvTAIL(float_real)) {
2060 if (memEQ(strend - len + 1, little, len - 1))
2061 last = strend - len + 1;
2062 else if (!multiline)
2063 last = memEQ(strend - len, little, len)
2064 ? strend - len : NULL;
2070 last = rninstr(s, strend, little, little + len);
2072 last = strend; /* matching "$" */
2077 PerlIO_printf(Perl_debug_log,
2078 "%sCan't trim the tail, match fails (should not happen)%s\n",
2079 PL_colors[4], PL_colors[5]));
2080 goto phooey; /* Should not happen! */
2082 dontbother = strend - last + prog->float_min_offset;
2084 if (minlen && (dontbother < minlen))
2085 dontbother = minlen - 1;
2086 strend -= dontbother; /* this one's always in bytes! */
2087 /* We don't know much -- general case. */
2090 if (regtry(®info, &s))
2099 if (regtry(®info, &s))
2101 } while (s++ < strend);
2109 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2111 if (PL_reg_eval_set)
2112 restore_pos(aTHX_ prog);
2113 if (prog->paren_names)
2114 (void)hv_iterinit(prog->paren_names);
2116 /* make sure $`, $&, $', and $digit will work later */
2117 if ( !(flags & REXEC_NOT_FIRST) ) {
2118 RX_MATCH_COPY_FREE(prog);
2119 if (flags & REXEC_COPY_STR) {
2120 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2121 #ifdef PERL_OLD_COPY_ON_WRITE
2123 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2125 PerlIO_printf(Perl_debug_log,
2126 "Copy on write: regexp capture, type %d\n",
2129 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2130 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2131 assert (SvPOKp(prog->saved_copy));
2135 RX_MATCH_COPIED_on(prog);
2136 s = savepvn(strbeg, i);
2142 prog->subbeg = strbeg;
2143 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2151 PL_colors[4], PL_colors[5]));
2152 if (PL_reg_eval_set)
2153 restore_pos(aTHX_ prog);
2155 /* we failed :-( roll it back */
2156 swap_match_buff(prog);
2163 - regtry - try match at specific point
2165 STATIC I32 /* 0 failure, 1 success */
2166 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2170 regexp *prog = reginfo->prog;
2171 RXi_GET_DECL(prog,progi);
2172 GET_RE_DEBUG_FLAGS_DECL;
2173 reginfo->cutpoint=NULL;
2175 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2178 PL_reg_eval_set = RS_init;
2179 DEBUG_EXECUTE_r(DEBUG_s(
2180 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2181 (IV)(PL_stack_sp - PL_stack_base));
2184 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2185 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2187 /* Apparently this is not needed, judging by wantarray. */
2188 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2189 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2192 /* Make $_ available to executed code. */
2193 if (reginfo->sv != DEFSV) {
2195 DEFSV = reginfo->sv;
2198 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2199 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2200 /* prepare for quick setting of pos */
2201 #ifdef PERL_OLD_COPY_ON_WRITE
2202 if (SvIsCOW(reginfo->sv))
2203 sv_force_normal_flags(reginfo->sv, 0);
2205 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2206 &PL_vtbl_mglob, NULL, 0);
2210 PL_reg_oldpos = mg->mg_len;
2211 SAVEDESTRUCTOR_X(restore_pos, prog);
2213 if (!PL_reg_curpm) {
2214 Newxz(PL_reg_curpm, 1, PMOP);
2217 SV* const repointer = newSViv(0);
2218 /* so we know which PL_regex_padav element is PL_reg_curpm */
2219 SvFLAGS(repointer) |= SVf_BREAK;
2220 av_push(PL_regex_padav,repointer);
2221 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2222 PL_regex_pad = AvARRAY(PL_regex_padav);
2226 PM_SETRE(PL_reg_curpm, prog);
2227 PL_reg_oldcurpm = PL_curpm;
2228 PL_curpm = PL_reg_curpm;
2229 if (RX_MATCH_COPIED(prog)) {
2230 /* Here is a serious problem: we cannot rewrite subbeg,
2231 since it may be needed if this match fails. Thus
2232 $` inside (?{}) could fail... */
2233 PL_reg_oldsaved = prog->subbeg;
2234 PL_reg_oldsavedlen = prog->sublen;
2235 #ifdef PERL_OLD_COPY_ON_WRITE
2236 PL_nrs = prog->saved_copy;
2238 RX_MATCH_COPIED_off(prog);
2241 PL_reg_oldsaved = NULL;
2242 prog->subbeg = PL_bostr;
2243 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2245 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2246 prog->offs[0].start = *startpos - PL_bostr;
2247 PL_reginput = *startpos;
2248 PL_reglastparen = &prog->lastparen;
2249 PL_reglastcloseparen = &prog->lastcloseparen;
2250 prog->lastparen = 0;
2251 prog->lastcloseparen = 0;
2253 PL_regoffs = prog->offs;
2254 if (PL_reg_start_tmpl <= prog->nparens) {
2255 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2256 if(PL_reg_start_tmp)
2257 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2259 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2262 /* XXXX What this code is doing here?!!! There should be no need
2263 to do this again and again, PL_reglastparen should take care of
2266 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2267 * Actually, the code in regcppop() (which Ilya may be meaning by
2268 * PL_reglastparen), is not needed at all by the test suite
2269 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2270 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2271 * Meanwhile, this code *is* needed for the
2272 * above-mentioned test suite tests to succeed. The common theme
2273 * on those tests seems to be returning null fields from matches.
2274 * --jhi updated by dapm */
2276 if (prog->nparens) {
2277 regexp_paren_pair *pp = PL_regoffs;
2279 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2287 if (regmatch(reginfo, progi->program + 1)) {
2288 PL_regoffs[0].end = PL_reginput - PL_bostr;
2291 if (reginfo->cutpoint)
2292 *startpos= reginfo->cutpoint;
2293 REGCP_UNWIND(lastcp);
2298 #define sayYES goto yes
2299 #define sayNO goto no
2300 #define sayNO_SILENT goto no_silent
2302 /* we dont use STMT_START/END here because it leads to
2303 "unreachable code" warnings, which are bogus, but distracting. */
2304 #define CACHEsayNO \
2305 if (ST.cache_mask) \
2306 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2309 /* this is used to determine how far from the left messages like
2310 'failed...' are printed. It should be set such that messages
2311 are inline with the regop output that created them.
2313 #define REPORT_CODE_OFF 32
2316 /* Make sure there is a test for this +1 options in re_tests */
2317 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2319 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2320 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2322 #define SLAB_FIRST(s) (&(s)->states[0])
2323 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2325 /* grab a new slab and return the first slot in it */
2327 STATIC regmatch_state *
2330 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2333 regmatch_slab *s = PL_regmatch_slab->next;
2335 Newx(s, 1, regmatch_slab);
2336 s->prev = PL_regmatch_slab;
2338 PL_regmatch_slab->next = s;
2340 PL_regmatch_slab = s;
2341 return SLAB_FIRST(s);
2345 /* push a new state then goto it */
2347 #define PUSH_STATE_GOTO(state, node) \
2349 st->resume_state = state; \
2352 /* push a new state with success backtracking, then goto it */
2354 #define PUSH_YES_STATE_GOTO(state, node) \
2356 st->resume_state = state; \
2357 goto push_yes_state;
2363 regmatch() - main matching routine
2365 This is basically one big switch statement in a loop. We execute an op,
2366 set 'next' to point the next op, and continue. If we come to a point which
2367 we may need to backtrack to on failure such as (A|B|C), we push a
2368 backtrack state onto the backtrack stack. On failure, we pop the top
2369 state, and re-enter the loop at the state indicated. If there are no more
2370 states to pop, we return failure.
2372 Sometimes we also need to backtrack on success; for example /A+/, where
2373 after successfully matching one A, we need to go back and try to
2374 match another one; similarly for lookahead assertions: if the assertion
2375 completes successfully, we backtrack to the state just before the assertion
2376 and then carry on. In these cases, the pushed state is marked as
2377 'backtrack on success too'. This marking is in fact done by a chain of
2378 pointers, each pointing to the previous 'yes' state. On success, we pop to
2379 the nearest yes state, discarding any intermediate failure-only states.
2380 Sometimes a yes state is pushed just to force some cleanup code to be
2381 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2382 it to free the inner regex.
2384 Note that failure backtracking rewinds the cursor position, while
2385 success backtracking leaves it alone.
2387 A pattern is complete when the END op is executed, while a subpattern
2388 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2389 ops trigger the "pop to last yes state if any, otherwise return true"
2392 A common convention in this function is to use A and B to refer to the two
2393 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2394 the subpattern to be matched possibly multiple times, while B is the entire
2395 rest of the pattern. Variable and state names reflect this convention.
2397 The states in the main switch are the union of ops and failure/success of
2398 substates associated with with that op. For example, IFMATCH is the op
2399 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2400 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2401 successfully matched A and IFMATCH_A_fail is a state saying that we have
2402 just failed to match A. Resume states always come in pairs. The backtrack
2403 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2404 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2405 on success or failure.
2407 The struct that holds a backtracking state is actually a big union, with
2408 one variant for each major type of op. The variable st points to the
2409 top-most backtrack struct. To make the code clearer, within each
2410 block of code we #define ST to alias the relevant union.
2412 Here's a concrete example of a (vastly oversimplified) IFMATCH
2418 #define ST st->u.ifmatch
2420 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2421 ST.foo = ...; // some state we wish to save
2423 // push a yes backtrack state with a resume value of
2424 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2426 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2429 case IFMATCH_A: // we have successfully executed A; now continue with B
2431 bar = ST.foo; // do something with the preserved value
2434 case IFMATCH_A_fail: // A failed, so the assertion failed
2435 ...; // do some housekeeping, then ...
2436 sayNO; // propagate the failure
2443 For any old-timers reading this who are familiar with the old recursive
2444 approach, the code above is equivalent to:
2446 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2455 ...; // do some housekeeping, then ...
2456 sayNO; // propagate the failure
2459 The topmost backtrack state, pointed to by st, is usually free. If you
2460 want to claim it, populate any ST.foo fields in it with values you wish to
2461 save, then do one of
2463 PUSH_STATE_GOTO(resume_state, node);
2464 PUSH_YES_STATE_GOTO(resume_state, node);
2466 which sets that backtrack state's resume value to 'resume_state', pushes a
2467 new free entry to the top of the backtrack stack, then goes to 'node'.
2468 On backtracking, the free slot is popped, and the saved state becomes the
2469 new free state. An ST.foo field in this new top state can be temporarily
2470 accessed to retrieve values, but once the main loop is re-entered, it
2471 becomes available for reuse.
2473 Note that the depth of the backtrack stack constantly increases during the
2474 left-to-right execution of the pattern, rather than going up and down with
2475 the pattern nesting. For example the stack is at its maximum at Z at the
2476 end of the pattern, rather than at X in the following:
2478 /(((X)+)+)+....(Y)+....Z/
2480 The only exceptions to this are lookahead/behind assertions and the cut,
2481 (?>A), which pop all the backtrack states associated with A before
2484 Bascktrack state structs are allocated in slabs of about 4K in size.
2485 PL_regmatch_state and st always point to the currently active state,
2486 and PL_regmatch_slab points to the slab currently containing
2487 PL_regmatch_state. The first time regmatch() is called, the first slab is
2488 allocated, and is never freed until interpreter destruction. When the slab
2489 is full, a new one is allocated and chained to the end. At exit from
2490 regmatch(), slabs allocated since entry are freed.
2495 #define DEBUG_STATE_pp(pp) \
2497 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2498 PerlIO_printf(Perl_debug_log, \
2499 " %*s"pp" %s%s%s%s%s\n", \
2501 PL_reg_name[st->resume_state], \
2502 ((st==yes_state||st==mark_state) ? "[" : ""), \
2503 ((st==yes_state) ? "Y" : ""), \
2504 ((st==mark_state) ? "M" : ""), \
2505 ((st==yes_state||st==mark_state) ? "]" : "") \
2510 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2515 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2516 const char *start, const char *end, const char *blurb)
2518 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
2522 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2523 prog->precomp, prog->prelen, 60);
2525 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2526 start, end - start, 60);
2528 PerlIO_printf(Perl_debug_log,
2529 "%s%s REx%s %s against %s\n",
2530 PL_colors[4], blurb, PL_colors[5], s0, s1);
2532 if (do_utf8||utf8_pat)
2533 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2534 utf8_pat ? "pattern" : "",
2535 utf8_pat && do_utf8 ? " and " : "",
2536 do_utf8 ? "string" : ""
2542 S_dump_exec_pos(pTHX_ const char *locinput,
2543 const regnode *scan,
2544 const char *loc_regeol,
2545 const char *loc_bostr,
2546 const char *loc_reg_starttry,
2549 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2550 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2551 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2552 /* The part of the string before starttry has one color
2553 (pref0_len chars), between starttry and current
2554 position another one (pref_len - pref0_len chars),
2555 after the current position the third one.
2556 We assume that pref0_len <= pref_len, otherwise we
2557 decrease pref0_len. */
2558 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2559 ? (5 + taill) - l : locinput - loc_bostr;
2562 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2564 pref0_len = pref_len - (locinput - loc_reg_starttry);
2565 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2566 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2567 ? (5 + taill) - pref_len : loc_regeol - locinput);
2568 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2572 if (pref0_len > pref_len)
2573 pref0_len = pref_len;
2575 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2577 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2578 (locinput - pref_len),pref0_len, 60, 4, 5);
2580 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2581 (locinput - pref_len + pref0_len),
2582 pref_len - pref0_len, 60, 2, 3);
2584 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2585 locinput, loc_regeol - locinput, 10, 0, 1);
2587 const STRLEN tlen=len0+len1+len2;
2588 PerlIO_printf(Perl_debug_log,
2589 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2590 (IV)(locinput - loc_bostr),
2593 (docolor ? "" : "> <"),
2595 (int)(tlen > 19 ? 0 : 19 - tlen),
2602 /* reg_check_named_buff_matched()
2603 * Checks to see if a named buffer has matched. The data array of
2604 * buffer numbers corresponding to the buffer is expected to reside
2605 * in the regexp->data->data array in the slot stored in the ARG() of
2606 * node involved. Note that this routine doesn't actually care about the
2607 * name, that information is not preserved from compilation to execution.
2608 * Returns the index of the leftmost defined buffer with the given name
2609 * or 0 if non of the buffers matched.
2612 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2614 RXi_GET_DECL(rex,rexi);
2615 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
2616 I32 *nums=(I32*)SvPVX(sv_dat);
2617 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2618 if ((I32)*PL_reglastparen >= nums[n] &&
2619 PL_regoffs[nums[n]].end != -1)
2628 /* free all slabs above current one - called during LEAVE_SCOPE */
2631 S_clear_backtrack_stack(pTHX_ void *p)
2633 regmatch_slab *s = PL_regmatch_slab->next;
2638 PL_regmatch_slab->next = NULL;
2640 regmatch_slab * const osl = s;
2647 #define SETREX(Re1,Re2) \
2648 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2651 STATIC I32 /* 0 failure, 1 success */
2652 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2654 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2658 register const bool do_utf8 = PL_reg_match_utf8;
2659 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2661 regexp *rex = reginfo->prog;
2662 RXi_GET_DECL(rex,rexi);
2666 /* the current state. This is a cached copy of PL_regmatch_state */
2667 register regmatch_state *st;
2669 /* cache heavy used fields of st in registers */
2670 register regnode *scan;
2671 register regnode *next;
2672 register U32 n = 0; /* general value; init to avoid compiler warning */
2673 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2674 register char *locinput = PL_reginput;
2675 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2677 bool result = 0; /* return value of S_regmatch */
2678 int depth = 0; /* depth of backtrack stack */
2679 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2680 const U32 max_nochange_depth =
2681 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2682 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2684 regmatch_state *yes_state = NULL; /* state to pop to on success of
2686 /* mark_state piggy backs on the yes_state logic so that when we unwind
2687 the stack on success we can update the mark_state as we go */
2688 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2690 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2691 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2693 bool no_final = 0; /* prevent failure from backtracking? */
2694 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2695 char *startpoint = PL_reginput;
2696 SV *popmark = NULL; /* are we looking for a mark? */
2697 SV *sv_commit = NULL; /* last mark name seen in failure */
2698 SV *sv_yes_mark = NULL; /* last mark name we have seen
2699 during a successfull match */
2700 U32 lastopen = 0; /* last open we saw */
2701 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2703 SV* const oreplsv = GvSV(PL_replgv);
2706 /* these three flags are set by various ops to signal information to
2707 * the very next op. They have a useful lifetime of exactly one loop
2708 * iteration, and are not preserved or restored by state pushes/pops
2710 bool sw = 0; /* the condition value in (?(cond)a|b) */
2711 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2712 int logical = 0; /* the following EVAL is:
2716 or the following IFMATCH/UNLESSM is:
2717 false: plain (?=foo)
2718 true: used as a condition: (?(?=foo))
2722 GET_RE_DEBUG_FLAGS_DECL;
2725 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2726 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2728 /* on first ever call to regmatch, allocate first slab */
2729 if (!PL_regmatch_slab) {
2730 Newx(PL_regmatch_slab, 1, regmatch_slab);
2731 PL_regmatch_slab->prev = NULL;
2732 PL_regmatch_slab->next = NULL;
2733 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2736 oldsave = PL_savestack_ix;
2737 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2738 SAVEVPTR(PL_regmatch_slab);
2739 SAVEVPTR(PL_regmatch_state);
2741 /* grab next free state slot */
2742 st = ++PL_regmatch_state;
2743 if (st > SLAB_LAST(PL_regmatch_slab))
2744 st = PL_regmatch_state = S_push_slab(aTHX);
2746 /* Note that nextchr is a byte even in UTF */
2747 nextchr = UCHARAT(locinput);
2749 while (scan != NULL) {
2752 SV * const prop = sv_newmortal();
2753 regnode *rnext=regnext(scan);
2754 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2755 regprop(rex, prop, scan);
2757 PerlIO_printf(Perl_debug_log,
2758 "%3"IVdf":%*s%s(%"IVdf")\n",
2759 (IV)(scan - rexi->program), depth*2, "",
2761 (PL_regkind[OP(scan)] == END || !rnext) ?
2762 0 : (IV)(rnext - rexi->program));
2765 next = scan + NEXT_OFF(scan);
2768 state_num = OP(scan);
2771 switch (state_num) {
2773 if (locinput == PL_bostr)
2775 /* reginfo->till = reginfo->bol; */
2780 if (locinput == PL_bostr ||
2781 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2787 if (locinput == PL_bostr)
2791 if (locinput == reginfo->ganch)
2796 /* update the startpoint */
2797 st->u.keeper.val = PL_regoffs[0].start;
2798 PL_reginput = locinput;
2799 PL_regoffs[0].start = locinput - PL_bostr;
2800 PUSH_STATE_GOTO(KEEPS_next, next);
2802 case KEEPS_next_fail:
2803 /* rollback the start point change */
2804 PL_regoffs[0].start = st->u.keeper.val;
2810 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2815 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2817 if (PL_regeol - locinput > 1)
2821 if (PL_regeol != locinput)
2825 if (!nextchr && locinput >= PL_regeol)
2828 locinput += PL_utf8skip[nextchr];
2829 if (locinput > PL_regeol)
2831 nextchr = UCHARAT(locinput);
2834 nextchr = UCHARAT(++locinput);
2837 if (!nextchr && locinput >= PL_regeol)
2839 nextchr = UCHARAT(++locinput);
2842 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2845 locinput += PL_utf8skip[nextchr];
2846 if (locinput > PL_regeol)
2848 nextchr = UCHARAT(locinput);
2851 nextchr = UCHARAT(++locinput);
2855 #define ST st->u.trie
2857 /* In this case the charclass data is available inline so
2858 we can fail fast without a lot of extra overhead.
2860 if (scan->flags == EXACT || !do_utf8) {
2861 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2863 PerlIO_printf(Perl_debug_log,
2864 "%*s %sfailed to match trie start class...%s\n",
2865 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2874 /* what type of TRIE am I? (utf8 makes this contextual) */
2875 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2876 trie_type = do_utf8 ?
2877 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2880 /* what trie are we using right now */
2881 reg_trie_data * const trie
2882 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
2883 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
2884 U32 state = trie->startstate;
2886 if (trie->bitmap && trie_type != trie_utf8_fold &&
2887 !TRIE_BITMAP_TEST(trie,*locinput)
2889 if (trie->states[ state ].wordnum) {
2891 PerlIO_printf(Perl_debug_log,
2892 "%*s %smatched empty string...%s\n",
2893 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2898 PerlIO_printf(Perl_debug_log,
2899 "%*s %sfailed to match trie start class...%s\n",
2900 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2907 U8 *uc = ( U8* )locinput;
2911 U8 *uscan = (U8*)NULL;
2913 SV *sv_accept_buff = NULL;
2914 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2916 ST.accepted = 0; /* how many accepting states we have seen */
2918 ST.jump = trie->jump;
2921 traverse the TRIE keeping track of all accepting states
2922 we transition through until we get to a failing node.
2925 while ( state && uc <= (U8*)PL_regeol ) {
2926 U32 base = trie->states[ state ].trans.base;
2929 /* We use charid to hold the wordnum as we don't use it
2930 for charid until after we have done the wordnum logic.
2931 We define an alias just so that the wordnum logic reads
2934 #define got_wordnum charid
2935 got_wordnum = trie->states[ state ].wordnum;
2937 if ( got_wordnum ) {
2938 if ( ! ST.accepted ) {
2941 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2942 sv_accept_buff=newSV(bufflen *
2943 sizeof(reg_trie_accepted) - 1);
2944 SvCUR_set(sv_accept_buff, 0);
2945 SvPOK_on(sv_accept_buff);
2946 sv_2mortal(sv_accept_buff);
2949 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2952 if (ST.accepted >= bufflen) {
2954 ST.accept_buff =(reg_trie_accepted*)
2955 SvGROW(sv_accept_buff,
2956 bufflen * sizeof(reg_trie_accepted));
2958 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2959 + sizeof(reg_trie_accepted));
2962 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2963 ST.accept_buff[ST.accepted].endpos = uc;
2965 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2969 DEBUG_TRIE_EXECUTE_r({
2970 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2971 PerlIO_printf( Perl_debug_log,
2972 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2973 2+depth * 2, "", PL_colors[4],
2974 (UV)state, (UV)ST.accepted );
2978 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2979 uscan, len, uvc, charid, foldlen,
2983 (base + charid > trie->uniquecharcount )
2984 && (base + charid - 1 - trie->uniquecharcount
2986 && trie->trans[base + charid - 1 -
2987 trie->uniquecharcount].check == state)
2989 state = trie->trans[base + charid - 1 -
2990 trie->uniquecharcount ].next;
3001 DEBUG_TRIE_EXECUTE_r(
3002 PerlIO_printf( Perl_debug_log,
3003 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3004 charid, uvc, (UV)state, PL_colors[5] );
3011 PerlIO_printf( Perl_debug_log,
3012 "%*s %sgot %"IVdf" possible matches%s\n",
3013 REPORT_CODE_OFF + depth * 2, "",
3014 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3017 goto trie_first_try; /* jump into the fail handler */
3019 case TRIE_next_fail: /* we failed - try next alterative */
3021 REGCP_UNWIND(ST.cp);
3022 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3023 PL_regoffs[n].end = -1;
3024 *PL_reglastparen = n;
3033 ST.lastparen = *PL_reglastparen;
3036 if ( ST.accepted == 1 ) {
3037 /* only one choice left - just continue */
3039 AV *const trie_words
3040 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3041 SV ** const tmp = av_fetch( trie_words,
3042 ST.accept_buff[ 0 ].wordnum-1, 0 );
3043 SV *sv= tmp ? sv_newmortal() : NULL;
3045 PerlIO_printf( Perl_debug_log,
3046 "%*s %sonly one match left: #%d <%s>%s\n",
3047 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3048 ST.accept_buff[ 0 ].wordnum,
3049 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3050 PL_colors[0], PL_colors[1],
3051 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3053 : "not compiled under -Dr",
3056 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3057 /* in this case we free tmps/leave before we call regmatch
3058 as we wont be using accept_buff again. */
3060 locinput = PL_reginput;
3061 nextchr = UCHARAT(locinput);
3062 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3065 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3066 if (!has_cutgroup) {
3071 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3074 continue; /* execute rest of RE */
3077 if ( !ST.accepted-- ) {
3079 PerlIO_printf( Perl_debug_log,
3080 "%*s %sTRIE failed...%s\n",
3081 REPORT_CODE_OFF+depth*2, "",
3092 There are at least two accepting states left. Presumably
3093 the number of accepting states is going to be low,
3094 typically two. So we simply scan through to find the one
3095 with lowest wordnum. Once we find it, we swap the last
3096 state into its place and decrement the size. We then try to
3097 match the rest of the pattern at the point where the word
3098 ends. If we succeed, control just continues along the
3099 regex; if we fail we return here to try the next accepting
3106 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3107 DEBUG_TRIE_EXECUTE_r(
3108 PerlIO_printf( Perl_debug_log,
3109 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3110 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3111 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3112 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3115 if (ST.accept_buff[cur].wordnum <
3116 ST.accept_buff[best].wordnum)
3121 AV *const trie_words
3122 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3123 SV ** const tmp = av_fetch( trie_words,
3124 ST.accept_buff[ best ].wordnum - 1, 0 );
3125 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3127 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3128 SV *sv= tmp ? sv_newmortal() : NULL;
3130 PerlIO_printf( Perl_debug_log,
3131 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3132 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3133 ST.accept_buff[best].wordnum,
3134 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3135 PL_colors[0], PL_colors[1],
3136 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3137 ) : "not compiled under -Dr",
3138 REG_NODE_NUM(nextop),
3142 if ( best<ST.accepted ) {
3143 reg_trie_accepted tmp = ST.accept_buff[ best ];
3144 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3145 ST.accept_buff[ ST.accepted ] = tmp;
3148 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3149 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3153 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3157 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3160 PUSH_STATE_GOTO(TRIE_next, scan);
3173 char *s = STRING(scan);
3175 if (do_utf8 != UTF) {
3176 /* The target and the pattern have differing utf8ness. */
3178 const char * const e = s + ln;
3181 /* The target is utf8, the pattern is not utf8. */
3186 if (NATIVE_TO_UNI(*(U8*)s) !=
3187 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3195 /* The target is not utf8, the pattern is utf8. */
3200 if (NATIVE_TO_UNI(*((U8*)l)) !=
3201 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3209 nextchr = UCHARAT(locinput);
3212 /* The target and the pattern have the same utf8ness. */
3213 /* Inline the first character, for speed. */
3214 if (UCHARAT(s) != nextchr)
3216 if (PL_regeol - locinput < ln)
3218 if (ln > 1 && memNE(s, locinput, ln))
3221 nextchr = UCHARAT(locinput);
3225 PL_reg_flags |= RF_tainted;
3228 char * const s = STRING(scan);
3231 if (do_utf8 || UTF) {
3232 /* Either target or the pattern are utf8. */
3233 const char * const l = locinput;
3234 char *e = PL_regeol;
3236 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3237 l, &e, 0, do_utf8)) {
3238 /* One more case for the sharp s:
3239 * pack("U0U*", 0xDF) =~ /ss/i,
3240 * the 0xC3 0x9F are the UTF-8
3241 * byte sequence for the U+00DF. */
3244 toLOWER(s[0]) == 's' &&
3246 toLOWER(s[1]) == 's' &&
3253 nextchr = UCHARAT(locinput);
3257 /* Neither the target and the pattern are utf8. */
3259 /* Inline the first character, for speed. */
3260 if (UCHARAT(s) != nextchr &&
3261 UCHARAT(s) != ((OP(scan) == EXACTF)
3262 ? PL_fold : PL_fold_locale)[nextchr])
3264 if (PL_regeol - locinput < ln)
3266 if (ln > 1 && (OP(scan) == EXACTF
3267 ? ibcmp(s, locinput, ln)
3268 : ibcmp_locale(s, locinput, ln)))
3271 nextchr = UCHARAT(locinput);
3276 STRLEN inclasslen = PL_regeol - locinput;
3278 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3280 if (locinput >= PL_regeol)
3282 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3283 nextchr = UCHARAT(locinput);
3288 nextchr = UCHARAT(locinput);
3289 if (!REGINCLASS(rex, scan, (U8*)locinput))
3291 if (!nextchr && locinput >= PL_regeol)
3293 nextchr = UCHARAT(++locinput);
3297 /* If we might have the case of the German sharp s
3298 * in a casefolding Unicode character class. */
3300 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3301 locinput += SHARP_S_SKIP;
3302 nextchr = UCHARAT(locinput);
3308 PL_reg_flags |= RF_tainted;
3314 LOAD_UTF8_CHARCLASS_ALNUM();
3315 if (!(OP(scan) == ALNUM
3316 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3317 : isALNUM_LC_utf8((U8*)locinput)))
3321 locinput += PL_utf8skip[nextchr];
3322 nextchr = UCHARAT(locinput);
3325 if (!(OP(scan) == ALNUM
3326 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3328 nextchr = UCHARAT(++locinput);
3331 PL_reg_flags |= RF_tainted;
3334 if (!nextchr && locinput >= PL_regeol)
3337 LOAD_UTF8_CHARCLASS_ALNUM();
3338 if (OP(scan) == NALNUM
3339 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3340 : isALNUM_LC_utf8((U8*)locinput))
3344 locinput += PL_utf8skip[nextchr];
3345 nextchr = UCHARAT(locinput);
3348 if (OP(scan) == NALNUM
3349 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3351 nextchr = UCHARAT(++locinput);
3355 PL_reg_flags |= RF_tainted;
3359 /* was last char in word? */
3361 if (locinput == PL_bostr)
3364 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3366 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3368 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3369 ln = isALNUM_uni(ln);
3370 LOAD_UTF8_CHARCLASS_ALNUM();
3371 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3374 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3375 n = isALNUM_LC_utf8((U8*)locinput);
3379 ln = (locinput != PL_bostr) ?
3380 UCHARAT(locinput - 1) : '\n';
3381 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3383 n = isALNUM(nextchr);
3386 ln = isALNUM_LC(ln);
3387 n = isALNUM_LC(nextchr);
3390 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3391 OP(scan) == BOUNDL))
3395 PL_reg_flags |= RF_tainted;
3401 if (UTF8_IS_CONTINUED(nextchr)) {
3402 LOAD_UTF8_CHARCLASS_SPACE();
3403 if (!(OP(scan) == SPACE
3404 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3405 : isSPACE_LC_utf8((U8*)locinput)))
3409 locinput += PL_utf8skip[nextchr];
3410 nextchr = UCHARAT(locinput);
3413 if (!(OP(scan) == SPACE
3414 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3416 nextchr = UCHARAT(++locinput);
3419 if (!(OP(scan) == SPACE
3420 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3422 nextchr = UCHARAT(++locinput);
3426 PL_reg_flags |= RF_tainted;
3429 if (!nextchr && locinput >= PL_regeol)
3432 LOAD_UTF8_CHARCLASS_SPACE();
3433 if (OP(scan) == NSPACE
3434 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3435 : isSPACE_LC_utf8((U8*)locinput))
3439 locinput += PL_utf8skip[nextchr];
3440 nextchr = UCHARAT(locinput);
3443 if (OP(scan) == NSPACE
3444 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3446 nextchr = UCHARAT(++locinput);
3449 PL_reg_flags |= RF_tainted;
3455 LOAD_UTF8_CHARCLASS_DIGIT();
3456 if (!(OP(scan) == DIGIT
3457 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3458 : isDIGIT_LC_utf8((U8*)locinput)))
3462 locinput += PL_utf8skip[nextchr];
3463 nextchr = UCHARAT(locinput);
3466 if (!(OP(scan) == DIGIT
3467 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3469 nextchr = UCHARAT(++locinput);
3472 PL_reg_flags |= RF_tainted;
3475 if (!nextchr && locinput >= PL_regeol)
3478 LOAD_UTF8_CHARCLASS_DIGIT();
3479 if (OP(scan) == NDIGIT
3480 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3481 : isDIGIT_LC_utf8((U8*)locinput))
3485 locinput += PL_utf8skip[nextchr];
3486 nextchr = UCHARAT(locinput);
3489 if (OP(scan) == NDIGIT
3490 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3492 nextchr = UCHARAT(++locinput);
3495 if (locinput >= PL_regeol)
3498 LOAD_UTF8_CHARCLASS_MARK();
3499 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3501 locinput += PL_utf8skip[nextchr];
3502 while (locinput < PL_regeol &&
3503 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3504 locinput += UTF8SKIP(locinput);
3505 if (locinput > PL_regeol)
3510 nextchr = UCHARAT(locinput);
3517 PL_reg_flags |= RF_tainted;
3522 n = reg_check_named_buff_matched(rex,scan);
3525 type = REF + ( type - NREF );
3532 PL_reg_flags |= RF_tainted;
3536 n = ARG(scan); /* which paren pair */
3539 ln = PL_regoffs[n].start;
3540 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3541 if (*PL_reglastparen < n || ln == -1)
3542 sayNO; /* Do not match unless seen CLOSEn. */
3543 if (ln == PL_regoffs[n].end)
3547 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3549 const char *e = PL_bostr + PL_regoffs[n].end;
3551 * Note that we can't do the "other character" lookup trick as
3552 * in the 8-bit case (no pun intended) because in Unicode we
3553 * have to map both upper and title case to lower case.
3557 STRLEN ulen1, ulen2;
3558 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3559 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3563 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3564 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3565 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3572 nextchr = UCHARAT(locinput);
3576 /* Inline the first character, for speed. */
3577 if (UCHARAT(s) != nextchr &&
3579 (UCHARAT(s) != (type == REFF
3580 ? PL_fold : PL_fold_locale)[nextchr])))
3582 ln = PL_regoffs[n].end - ln;
3583 if (locinput + ln > PL_regeol)
3585 if (ln > 1 && (type == REF
3586 ? memNE(s, locinput, ln)
3588 ? ibcmp(s, locinput, ln)
3589 : ibcmp_locale(s, locinput, ln))))
3592 nextchr = UCHARAT(locinput);
3602 #define ST st->u.eval
3606 regexp_internal *rei;
3607 regnode *startpoint;
3610 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3611 if (cur_eval && cur_eval->locinput==locinput) {
3612 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3613 Perl_croak(aTHX_ "Infinite recursion in regex");
3614 if ( ++nochange_depth > max_nochange_depth )
3616 "Pattern subroutine nesting without pos change"
3617 " exceeded limit in regex");
3623 (void)ReREFCNT_inc(rex);
3624 if (OP(scan)==GOSUB) {
3625 startpoint = scan + ARG2L(scan);
3626 ST.close_paren = ARG(scan);
3628 startpoint = rei->program+1;
3631 goto eval_recurse_doit;
3633 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3634 if (cur_eval && cur_eval->locinput==locinput) {
3635 if ( ++nochange_depth > max_nochange_depth )
3636 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3641 /* execute the code in the {...} */
3643 SV ** const before = SP;
3644 OP_4tree * const oop = PL_op;
3645 COP * const ocurcop = PL_curcop;
3649 PL_op = (OP_4tree*)rexi->data->data[n];
3650 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3651 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3652 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3653 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3656 SV *sv_mrk = get_sv("REGMARK", 1);
3657 sv_setsv(sv_mrk, sv_yes_mark);
3660 CALLRUNOPS(aTHX); /* Scalar context. */
3663 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3670 PAD_RESTORE_LOCAL(old_comppad);
3671 PL_curcop = ocurcop;
3674 sv_setsv(save_scalar(PL_replgv), ret);
3678 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3681 /* extract RE object from returned value; compiling if
3686 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3687 mg = mg_find(sv, PERL_MAGIC_qr);
3688 else if (SvSMAGICAL(ret)) {
3689 if (SvGMAGICAL(ret))
3690 sv_unmagic(ret, PERL_MAGIC_qr);
3692 mg = mg_find(ret, PERL_MAGIC_qr);
3696 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
3700 const I32 osize = PL_regsize;
3702 if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
3703 re = CALLREGCOMP(ret, pm_flags);
3705 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3707 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3712 RX_MATCH_COPIED_off(re);
3713 re->subbeg = rex->subbeg;
3714 re->sublen = rex->sublen;
3717 debug_start_match(re, do_utf8, locinput, PL_regeol,
3718 "Matching embedded");
3720 startpoint = rei->program + 1;
3721 ST.close_paren = 0; /* only used for GOSUB */
3722 /* borrowed from regtry */
3723 if (PL_reg_start_tmpl <= re->nparens) {
3724 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3725 if(PL_reg_start_tmp)
3726 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3728 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3731 eval_recurse_doit: /* Share code with GOSUB below this line */
3732 /* run the pattern returned from (??{...}) */
3733 ST.cp = regcppush(0); /* Save *all* the positions. */
3734 REGCP_SET(ST.lastcp);
3736 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
3738 *PL_reglastparen = 0;
3739 *PL_reglastcloseparen = 0;
3740 PL_reginput = locinput;
3743 /* XXXX This is too dramatic a measure... */
3746 ST.toggle_reg_flags = PL_reg_flags;
3747 if (re->extflags & RXf_UTF8)
3748 PL_reg_flags |= RF_utf8;
3750 PL_reg_flags &= ~RF_utf8;
3751 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3754 ST.prev_curlyx = cur_curlyx;
3759 ST.prev_eval = cur_eval;
3761 /* now continue from first node in postoned RE */
3762 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3765 /* logical is 1, /(?(?{...})X|Y)/ */
3766 sw = (bool)SvTRUE(ret);
3771 case EVAL_AB: /* cleanup after a successful (??{A})B */
3772 /* note: this is called twice; first after popping B, then A */
3773 PL_reg_flags ^= ST.toggle_reg_flags;
3775 SETREX(rex,ST.prev_rex);
3776 rexi = RXi_GET(rex);
3778 cur_eval = ST.prev_eval;
3779 cur_curlyx = ST.prev_curlyx;
3780 /* XXXX This is too dramatic a measure... */
3782 if ( nochange_depth )
3787 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3788 /* note: this is called twice; first after popping B, then A */
3789 PL_reg_flags ^= ST.toggle_reg_flags;
3791 SETREX(rex,ST.prev_rex);
3792 rexi = RXi_GET(rex);
3793 PL_reginput = locinput;
3794 REGCP_UNWIND(ST.lastcp);
3796 cur_eval = ST.prev_eval;
3797 cur_curlyx = ST.prev_curlyx;
3798 /* XXXX This is too dramatic a measure... */
3800 if ( nochange_depth )
3806 n = ARG(scan); /* which paren pair */
3807 PL_reg_start_tmp[n] = locinput;
3813 n = ARG(scan); /* which paren pair */
3814 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3815 PL_regoffs[n].end = locinput - PL_bostr;
3816 /*if (n > PL_regsize)
3818 if (n > *PL_reglastparen)
3819 *PL_reglastparen = n;
3820 *PL_reglastcloseparen = n;
3821 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3829 cursor && OP(cursor)!=END;
3830 cursor=regnext(cursor))
3832 if ( OP(cursor)==CLOSE ){
3834 if ( n <= lastopen ) {
3836 = PL_reg_start_tmp[n] - PL_bostr;
3837 PL_regoffs[n].end = locinput - PL_bostr;
3838 /*if (n > PL_regsize)
3840 if (n > *PL_reglastparen)
3841 *PL_reglastparen = n;
3842 *PL_reglastcloseparen = n;
3843 if ( n == ARG(scan) || (cur_eval &&
3844 cur_eval->u.eval.close_paren == n))
3853 n = ARG(scan); /* which paren pair */
3854 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
3857 /* reg_check_named_buff_matched returns 0 for no match */
3858 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3862 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3868 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3870 next = NEXTOPER(NEXTOPER(scan));
3872 next = scan + ARG(scan);
3873 if (OP(next) == IFTHEN) /* Fake one. */
3874 next = NEXTOPER(NEXTOPER(next));
3878 logical = scan->flags;
3881 /*******************************************************************
3883 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3884 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3885 STAR/PLUS/CURLY/CURLYN are used instead.)
3887 A*B is compiled as <CURLYX><A><WHILEM><B>
3889 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3890 state, which contains the current count, initialised to -1. It also sets
3891 cur_curlyx to point to this state, with any previous value saved in the
3894 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3895 since the pattern may possibly match zero times (i.e. it's a while {} loop
3896 rather than a do {} while loop).
3898 Each entry to WHILEM represents a successful match of A. The count in the
3899 CURLYX block is incremented, another WHILEM state is pushed, and execution
3900 passes to A or B depending on greediness and the current count.
3902 For example, if matching against the string a1a2a3b (where the aN are
3903 substrings that match /A/), then the match progresses as follows: (the
3904 pushed states are interspersed with the bits of strings matched so far):
3907 <CURLYX cnt=0><WHILEM>
3908 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3909 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3910 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3911 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3913 (Contrast this with something like CURLYM, which maintains only a single
3917 a1 <CURLYM cnt=1> a2
3918 a1 a2 <CURLYM cnt=2> a3
3919 a1 a2 a3 <CURLYM cnt=3> b
3922 Each WHILEM state block marks a point to backtrack to upon partial failure
3923 of A or B, and also contains some minor state data related to that
3924 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3925 overall state, such as the count, and pointers to the A and B ops.
3927 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3928 must always point to the *current* CURLYX block, the rules are:
3930 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3931 and set cur_curlyx to point the new block.
3933 When popping the CURLYX block after a successful or unsuccessful match,
3934 restore the previous cur_curlyx.
3936 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3937 to the outer one saved in the CURLYX block.
3939 When popping the WHILEM block after a successful or unsuccessful B match,
3940 restore the previous cur_curlyx.
3942 Here's an example for the pattern (AI* BI)*BO
3943 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3946 curlyx backtrack stack
3947 ------ ---------------
3949 CO <CO prev=NULL> <WO>
3950 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3951 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3952 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3954 At this point the pattern succeeds, and we work back down the stack to
3955 clean up, restoring as we go:
3957 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3958 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3959 CO <CO prev=NULL> <WO>
3962 *******************************************************************/
3964 #define ST st->u.curlyx
3966 case CURLYX: /* start of /A*B/ (for complex A) */
3968 /* No need to save/restore up to this paren */
3969 I32 parenfloor = scan->flags;
3971 assert(next); /* keep Coverity happy */
3972 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3975 /* XXXX Probably it is better to teach regpush to support
3976 parenfloor > PL_regsize... */
3977 if (parenfloor > (I32)*PL_reglastparen)
3978 parenfloor = *PL_reglastparen; /* Pessimization... */
3980 ST.prev_curlyx= cur_curlyx;
3982 ST.cp = PL_savestack_ix;
3984 /* these fields contain the state of the current curly.
3985 * they are accessed by subsequent WHILEMs */
3986 ST.parenfloor = parenfloor;
3987 ST.min = ARG1(scan);
3988 ST.max = ARG2(scan);
3989 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3993 ST.count = -1; /* this will be updated by WHILEM */
3994 ST.lastloc = NULL; /* this will be updated by WHILEM */
3996 PL_reginput = locinput;
3997 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4001 case CURLYX_end: /* just finished matching all of A*B */
4002 cur_curlyx = ST.prev_curlyx;
4006 case CURLYX_end_fail: /* just failed to match all of A*B */
4008 cur_curlyx = ST.prev_curlyx;
4014 #define ST st->u.whilem
4016 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4018 /* see the discussion above about CURLYX/WHILEM */
4020 assert(cur_curlyx); /* keep Coverity happy */
4021 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4022 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4023 ST.cache_offset = 0;
4026 PL_reginput = locinput;
4028 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4029 "%*s whilem: matched %ld out of %ld..%ld\n",
4030 REPORT_CODE_OFF+depth*2, "", (long)n,
4031 (long)cur_curlyx->u.curlyx.min,
4032 (long)cur_curlyx->u.curlyx.max)
4035 /* First just match a string of min A's. */
4037 if (n < cur_curlyx->u.curlyx.min) {
4038 cur_curlyx->u.curlyx.lastloc = locinput;
4039 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4043 /* If degenerate A matches "", assume A done. */
4045 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4046 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4047 "%*s whilem: empty match detected, trying continuation...\n",
4048 REPORT_CODE_OFF+depth*2, "")
4050 goto do_whilem_B_max;
4053 /* super-linear cache processing */
4057 if (!PL_reg_maxiter) {
4058 /* start the countdown: Postpone detection until we
4059 * know the match is not *that* much linear. */
4060 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4061 /* possible overflow for long strings and many CURLYX's */
4062 if (PL_reg_maxiter < 0)
4063 PL_reg_maxiter = I32_MAX;
4064 PL_reg_leftiter = PL_reg_maxiter;
4067 if (PL_reg_leftiter-- == 0) {
4068 /* initialise cache */
4069 const I32 size = (PL_reg_maxiter + 7)/8;
4070 if (PL_reg_poscache) {
4071 if ((I32)PL_reg_poscache_size < size) {
4072 Renew(PL_reg_poscache, size, char);
4073 PL_reg_poscache_size = size;
4075 Zero(PL_reg_poscache, size, char);
4078 PL_reg_poscache_size = size;
4079 Newxz(PL_reg_poscache, size, char);
4081 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4082 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4083 PL_colors[4], PL_colors[5])
4087 if (PL_reg_leftiter < 0) {
4088 /* have we already failed at this position? */
4090 offset = (scan->flags & 0xf) - 1
4091 + (locinput - PL_bostr) * (scan->flags>>4);
4092 mask = 1 << (offset % 8);
4094 if (PL_reg_poscache[offset] & mask) {
4095 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4096 "%*s whilem: (cache) already tried at this position...\n",
4097 REPORT_CODE_OFF+depth*2, "")
4099 sayNO; /* cache records failure */
4101 ST.cache_offset = offset;
4102 ST.cache_mask = mask;
4106 /* Prefer B over A for minimal matching. */
4108 if (cur_curlyx->u.curlyx.minmod) {
4109 ST.save_curlyx = cur_curlyx;
4110 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4111 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4112 REGCP_SET(ST.lastcp);
4113 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4117 /* Prefer A over B for maximal matching. */
4119 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4120 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4121 cur_curlyx->u.curlyx.lastloc = locinput;
4122 REGCP_SET(ST.lastcp);
4123 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4126 goto do_whilem_B_max;
4130 case WHILEM_B_min: /* just matched B in a minimal match */
4131 case WHILEM_B_max: /* just matched B in a maximal match */
4132 cur_curlyx = ST.save_curlyx;
4136 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4137 cur_curlyx = ST.save_curlyx;
4138 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4139 cur_curlyx->u.curlyx.count--;
4143 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4144 REGCP_UNWIND(ST.lastcp);
4147 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4148 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4149 cur_curlyx->u.curlyx.count--;
4153 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4154 REGCP_UNWIND(ST.lastcp);
4155 regcppop(rex); /* Restore some previous $<digit>s? */
4156 PL_reginput = locinput;
4157 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4158 "%*s whilem: failed, trying continuation...\n",
4159 REPORT_CODE_OFF+depth*2, "")
4162 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4163 && ckWARN(WARN_REGEXP)
4164 && !(PL_reg_flags & RF_warned))
4166 PL_reg_flags |= RF_warned;
4167 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4168 "Complex regular subexpression recursion",
4173 ST.save_curlyx = cur_curlyx;
4174 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4175 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4178 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4179 cur_curlyx = ST.save_curlyx;
4180 REGCP_UNWIND(ST.lastcp);
4183 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4184 /* Maximum greed exceeded */
4185 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4186 && ckWARN(WARN_REGEXP)
4187 && !(PL_reg_flags & RF_warned))
4189 PL_reg_flags |= RF_warned;
4190 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4191 "%s limit (%d) exceeded",
4192 "Complex regular subexpression recursion",
4195 cur_curlyx->u.curlyx.count--;
4199 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4200 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4202 /* Try grabbing another A and see if it helps. */
4203 PL_reginput = locinput;
4204 cur_curlyx->u.curlyx.lastloc = locinput;
4205 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4206 REGCP_SET(ST.lastcp);
4207 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4211 #define ST st->u.branch
4213 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4214 next = scan + ARG(scan);
4217 scan = NEXTOPER(scan);
4220 case BRANCH: /* /(...|A|...)/ */
4221 scan = NEXTOPER(scan); /* scan now points to inner node */
4222 ST.lastparen = *PL_reglastparen;
4223 ST.next_branch = next;
4225 PL_reginput = locinput;
4227 /* Now go into the branch */
4229 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4231 PUSH_STATE_GOTO(BRANCH_next, scan);
4235 PL_reginput = locinput;
4236 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4237 (SV*)rexi->data->data[ ARG( scan ) ];
4238 PUSH_STATE_GOTO(CUTGROUP_next,next);
4240 case CUTGROUP_next_fail:
4243 if (st->u.mark.mark_name)
4244 sv_commit = st->u.mark.mark_name;
4250 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4255 REGCP_UNWIND(ST.cp);
4256 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4257 PL_regoffs[n].end = -1;
4258 *PL_reglastparen = n;
4259 /*dmq: *PL_reglastcloseparen = n; */
4260 scan = ST.next_branch;
4261 /* no more branches? */
4262 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4264 PerlIO_printf( Perl_debug_log,
4265 "%*s %sBRANCH failed...%s\n",
4266 REPORT_CODE_OFF+depth*2, "",
4272 continue; /* execute next BRANCH[J] op */
4280 #define ST st->u.curlym
4282 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4284 /* This is an optimisation of CURLYX that enables us to push
4285 * only a single backtracking state, no matter now many matches
4286 * there are in {m,n}. It relies on the pattern being constant
4287 * length, with no parens to influence future backrefs
4291 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4293 /* if paren positive, emulate an OPEN/CLOSE around A */
4295 U32 paren = ST.me->flags;
4296 if (paren > PL_regsize)
4298 if (paren > *PL_reglastparen)
4299 *PL_reglastparen = paren;
4300 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4308 ST.c1 = CHRTEST_UNINIT;
4311 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4314 curlym_do_A: /* execute the A in /A{m,n}B/ */
4315 PL_reginput = locinput;
4316 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4319 case CURLYM_A: /* we've just matched an A */
4320 locinput = st->locinput;
4321 nextchr = UCHARAT(locinput);
4324 /* after first match, determine A's length: u.curlym.alen */
4325 if (ST.count == 1) {
4326 if (PL_reg_match_utf8) {
4328 while (s < PL_reginput) {
4334 ST.alen = PL_reginput - locinput;
4337 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4340 PerlIO_printf(Perl_debug_log,
4341 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4342 (int)(REPORT_CODE_OFF+(depth*2)), "",
4343 (IV) ST.count, (IV)ST.alen)
4346 locinput = PL_reginput;
4348 if (cur_eval && cur_eval->u.eval.close_paren &&
4349 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4352 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
4353 goto curlym_do_A; /* try to match another A */
4354 goto curlym_do_B; /* try to match B */
4356 case CURLYM_A_fail: /* just failed to match an A */
4357 REGCP_UNWIND(ST.cp);
4359 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4360 || (cur_eval && cur_eval->u.eval.close_paren &&
4361 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4364 curlym_do_B: /* execute the B in /A{m,n}B/ */
4365 PL_reginput = locinput;
4366 if (ST.c1 == CHRTEST_UNINIT) {
4367 /* calculate c1 and c2 for possible match of 1st char
4368 * following curly */
4369 ST.c1 = ST.c2 = CHRTEST_VOID;
4370 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4371 regnode *text_node = ST.B;
4372 if (! HAS_TEXT(text_node))
4373 FIND_NEXT_IMPT(text_node);
4376 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4378 But the former is redundant in light of the latter.
4380 if this changes back then the macro for
4381 IS_TEXT and friends need to change.
4383 if (PL_regkind[OP(text_node)] == EXACT)
4386 ST.c1 = (U8)*STRING(text_node);
4388 (IS_TEXTF(text_node))
4390 : (IS_TEXTFL(text_node))
4391 ? PL_fold_locale[ST.c1]
4398 PerlIO_printf(Perl_debug_log,
4399 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4400 (int)(REPORT_CODE_OFF+(depth*2)),
4403 if (ST.c1 != CHRTEST_VOID
4404 && UCHARAT(PL_reginput) != ST.c1
4405 && UCHARAT(PL_reginput) != ST.c2)
4407 /* simulate B failing */
4409 PerlIO_printf(Perl_debug_log,
4410 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4411 (int)(REPORT_CODE_OFF+(depth*2)),"",
4414 state_num = CURLYM_B_fail;
4415 goto reenter_switch;
4419 /* mark current A as captured */
4420 I32 paren = ST.me->flags;
4422 PL_regoffs[paren].start
4423 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4424 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4425 /*dmq: *PL_reglastcloseparen = paren; */
4428 PL_regoffs[paren].end = -1;
4429 if (cur_eval && cur_eval->u.eval.close_paren &&
4430 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4439 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4442 case CURLYM_B_fail: /* just failed to match a B */
4443 REGCP_UNWIND(ST.cp);
4445 if (ST.count == ARG2(ST.me) /* max */)
4447 goto curlym_do_A; /* try to match a further A */
4449 /* backtrack one A */
4450 if (ST.count == ARG1(ST.me) /* min */)
4453 locinput = HOPc(locinput, -ST.alen);
4454 goto curlym_do_B; /* try to match B */
4457 #define ST st->u.curly
4459 #define CURLY_SETPAREN(paren, success) \
4462 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4463 PL_regoffs[paren].end = locinput - PL_bostr; \
4464 *PL_reglastcloseparen = paren; \
4467 PL_regoffs[paren].end = -1; \
4470 case STAR: /* /A*B/ where A is width 1 */
4474 scan = NEXTOPER(scan);
4476 case PLUS: /* /A+B/ where A is width 1 */
4480 scan = NEXTOPER(scan);
4482 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4483 ST.paren = scan->flags; /* Which paren to set */
4484 if (ST.paren > PL_regsize)
4485 PL_regsize = ST.paren;
4486 if (ST.paren > *PL_reglastparen)
4487 *PL_reglastparen = ST.paren;
4488 ST.min = ARG1(scan); /* min to match */
4489 ST.max = ARG2(scan); /* max to match */
4490 if (cur_eval && cur_eval->u.eval.close_paren &&
4491 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4495 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4497 case CURLY: /* /A{m,n}B/ where A is width 1 */
4499 ST.min = ARG1(scan); /* min to match */
4500 ST.max = ARG2(scan); /* max to match */
4501 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4504 * Lookahead to avoid useless match attempts
4505 * when we know what character comes next.
4507 * Used to only do .*x and .*?x, but now it allows
4508 * for )'s, ('s and (?{ ... })'s to be in the way
4509 * of the quantifier and the EXACT-like node. -- japhy
4512 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4514 if (HAS_TEXT(next) || JUMPABLE(next)) {
4516 regnode *text_node = next;
4518 if (! HAS_TEXT(text_node))
4519 FIND_NEXT_IMPT(text_node);
4521 if (! HAS_TEXT(text_node))
4522 ST.c1 = ST.c2 = CHRTEST_VOID;
4524 if ( PL_regkind[OP(text_node)] != EXACT ) {
4525 ST.c1 = ST.c2 = CHRTEST_VOID;
4526 goto assume_ok_easy;
4529 s = (U8*)STRING(text_node);
4531 /* Currently we only get here when
4533 PL_rekind[OP(text_node)] == EXACT
4535 if this changes back then the macro for IS_TEXT and
4536 friends need to change. */
4539 if (IS_TEXTF(text_node))
4540 ST.c2 = PL_fold[ST.c1];
4541 else if (IS_TEXTFL(text_node))
4542 ST.c2 = PL_fold_locale[ST.c1];
4545 if (IS_TEXTF(text_node)) {
4546 STRLEN ulen1, ulen2;
4547 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4548 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4550 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4551 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4553 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4555 0 : UTF8_ALLOW_ANY);
4556 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4558 0 : UTF8_ALLOW_ANY);
4560 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4562 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4567 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4574 ST.c1 = ST.c2 = CHRTEST_VOID;
4579 PL_reginput = locinput;
4582 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4585 locinput = PL_reginput;
4587 if (ST.c1 == CHRTEST_VOID)
4588 goto curly_try_B_min;
4590 ST.oldloc = locinput;
4592 /* set ST.maxpos to the furthest point along the
4593 * string that could possibly match */
4594 if (ST.max == REG_INFTY) {
4595 ST.maxpos = PL_regeol - 1;
4597 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4601 int m = ST.max - ST.min;
4602 for (ST.maxpos = locinput;
4603 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4604 ST.maxpos += UTF8SKIP(ST.maxpos);
4607 ST.maxpos = locinput + ST.max - ST.min;
4608 if (ST.maxpos >= PL_regeol)
4609 ST.maxpos = PL_regeol - 1;
4611 goto curly_try_B_min_known;
4615 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4616 locinput = PL_reginput;
4617 if (ST.count < ST.min)
4619 if ((ST.count > ST.min)
4620 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4622 /* A{m,n} must come at the end of the string, there's
4623 * no point in backing off ... */
4625 /* ...except that $ and \Z can match before *and* after
4626 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4627 We may back off by one in this case. */
4628 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4632 goto curly_try_B_max;
4637 case CURLY_B_min_known_fail:
4638 /* failed to find B in a non-greedy match where c1,c2 valid */
4639 if (ST.paren && ST.count)
4640 PL_regoffs[ST.paren].end = -1;
4642 PL_reginput = locinput; /* Could be reset... */
4643 REGCP_UNWIND(ST.cp);
4644 /* Couldn't or didn't -- move forward. */
4645 ST.oldloc = locinput;
4647 locinput += UTF8SKIP(locinput);
4651 curly_try_B_min_known:
4652 /* find the next place where 'B' could work, then call B */
4656 n = (ST.oldloc == locinput) ? 0 : 1;
4657 if (ST.c1 == ST.c2) {
4659 /* set n to utf8_distance(oldloc, locinput) */
4660 while (locinput <= ST.maxpos &&
4661 utf8n_to_uvchr((U8*)locinput,
4662 UTF8_MAXBYTES, &len,
4663 uniflags) != (UV)ST.c1) {
4669 /* set n to utf8_distance(oldloc, locinput) */
4670 while (locinput <= ST.maxpos) {
4672 const UV c = utf8n_to_uvchr((U8*)locinput,
4673 UTF8_MAXBYTES, &len,
4675 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4683 if (ST.c1 == ST.c2) {
4684 while (locinput <= ST.maxpos &&
4685 UCHARAT(locinput) != ST.c1)
4689 while (locinput <= ST.maxpos
4690 && UCHARAT(locinput) != ST.c1
4691 && UCHARAT(locinput) != ST.c2)
4694 n = locinput - ST.oldloc;
4696 if (locinput > ST.maxpos)
4698 /* PL_reginput == oldloc now */
4701 if (regrepeat(rex, ST.A, n, depth) < n)
4704 PL_reginput = locinput;
4705 CURLY_SETPAREN(ST.paren, ST.count);
4706 if (cur_eval && cur_eval->u.eval.close_paren &&
4707 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4710 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4715 case CURLY_B_min_fail:
4716 /* failed to find B in a non-greedy match where c1,c2 invalid */
4717 if (ST.paren && ST.count)
4718 PL_regoffs[ST.paren].end = -1;
4720 REGCP_UNWIND(ST.cp);
4721 /* failed -- move forward one */
4722 PL_reginput = locinput;
4723 if (regrepeat(rex, ST.A, 1, depth)) {
4725 locinput = PL_reginput;
4726 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4727 ST.count > 0)) /* count overflow ? */
4730 CURLY_SETPAREN(ST.paren, ST.count);
4731 if (cur_eval && cur_eval->u.eval.close_paren &&
4732 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4735 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4743 /* a successful greedy match: now try to match B */
4744 if (cur_eval && cur_eval->u.eval.close_paren &&
4745 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4750 if (ST.c1 != CHRTEST_VOID)
4751 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4752 UTF8_MAXBYTES, 0, uniflags)
4753 : (UV) UCHARAT(PL_reginput);
4754 /* If it could work, try it. */
4755 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4756 CURLY_SETPAREN(ST.paren, ST.count);
4757 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4762 case CURLY_B_max_fail:
4763 /* failed to find B in a greedy match */
4764 if (ST.paren && ST.count)
4765 PL_regoffs[ST.paren].end = -1;
4767 REGCP_UNWIND(ST.cp);
4769 if (--ST.count < ST.min)
4771 PL_reginput = locinput = HOPc(locinput, -1);
4772 goto curly_try_B_max;
4779 /* we've just finished A in /(??{A})B/; now continue with B */
4781 st->u.eval.toggle_reg_flags
4782 = cur_eval->u.eval.toggle_reg_flags;
4783 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4785 st->u.eval.prev_rex = rex; /* inner */
4786 SETREX(rex,cur_eval->u.eval.prev_rex);
4787 rexi = RXi_GET(rex);
4788 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4790 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4791 REGCP_SET(st->u.eval.lastcp);
4792 PL_reginput = locinput;
4794 /* Restore parens of the outer rex without popping the
4796 tmpix = PL_savestack_ix;
4797 PL_savestack_ix = cur_eval->u.eval.lastcp;
4799 PL_savestack_ix = tmpix;
4801 st->u.eval.prev_eval = cur_eval;
4802 cur_eval = cur_eval->u.eval.prev_eval;
4804 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4805 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4806 if ( nochange_depth )
4809 PUSH_YES_STATE_GOTO(EVAL_AB,
4810 st->u.eval.prev_eval->u.eval.B); /* match B */
4813 if (locinput < reginfo->till) {
4814 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4815 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4817 (long)(locinput - PL_reg_starttry),
4818 (long)(reginfo->till - PL_reg_starttry),
4821 sayNO_SILENT; /* Cannot match: too short. */
4823 PL_reginput = locinput; /* put where regtry can find it */
4824 sayYES; /* Success! */
4826 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4828 PerlIO_printf(Perl_debug_log,
4829 "%*s %ssubpattern success...%s\n",
4830 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4831 PL_reginput = locinput; /* put where regtry can find it */
4832 sayYES; /* Success! */
4835 #define ST st->u.ifmatch
4837 case SUSPEND: /* (?>A) */
4839 PL_reginput = locinput;
4842 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4844 goto ifmatch_trivial_fail_test;
4846 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4848 ifmatch_trivial_fail_test:
4850 char * const s = HOPBACKc(locinput, scan->flags);
4855 sw = 1 - (bool)ST.wanted;
4859 next = scan + ARG(scan);
4867 PL_reginput = locinput;
4871 ST.logical = logical;
4872 /* execute body of (?...A) */
4873 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4876 case IFMATCH_A_fail: /* body of (?...A) failed */
4877 ST.wanted = !ST.wanted;
4880 case IFMATCH_A: /* body of (?...A) succeeded */
4882 sw = (bool)ST.wanted;
4884 else if (!ST.wanted)
4887 if (OP(ST.me) == SUSPEND)
4888 locinput = PL_reginput;
4890 locinput = PL_reginput = st->locinput;
4891 nextchr = UCHARAT(locinput);
4893 scan = ST.me + ARG(ST.me);
4896 continue; /* execute B */
4901 next = scan + ARG(scan);
4906 reginfo->cutpoint = PL_regeol;
4909 PL_reginput = locinput;
4911 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
4912 PUSH_STATE_GOTO(COMMIT_next,next);
4914 case COMMIT_next_fail:
4921 #define ST st->u.mark
4923 ST.prev_mark = mark_state;
4924 ST.mark_name = sv_commit = sv_yes_mark
4925 = (SV*)rexi->data->data[ ARG( scan ) ];
4927 ST.mark_loc = PL_reginput = locinput;
4928 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4930 case MARKPOINT_next:
4931 mark_state = ST.prev_mark;
4934 case MARKPOINT_next_fail:
4935 if (popmark && sv_eq(ST.mark_name,popmark))
4937 if (ST.mark_loc > startpoint)
4938 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4939 popmark = NULL; /* we found our mark */
4940 sv_commit = ST.mark_name;
4943 PerlIO_printf(Perl_debug_log,
4944 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4945 REPORT_CODE_OFF+depth*2, "",
4946 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
4949 mark_state = ST.prev_mark;
4950 sv_yes_mark = mark_state ?
4951 mark_state->u.mark.mark_name : NULL;
4955 PL_reginput = locinput;
4957 /* (*SKIP) : if we fail we cut here*/
4958 ST.mark_name = NULL;
4959 ST.mark_loc = locinput;
4960 PUSH_STATE_GOTO(SKIP_next,next);
4962 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
4963 otherwise do nothing. Meaning we need to scan
4965 regmatch_state *cur = mark_state;
4966 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
4969 if ( sv_eq( cur->u.mark.mark_name,
4972 ST.mark_name = find;
4973 PUSH_STATE_GOTO( SKIP_next, next );
4975 cur = cur->u.mark.prev_mark;
4978 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
4980 case SKIP_next_fail:
4982 /* (*CUT:NAME) - Set up to search for the name as we
4983 collapse the stack*/
4984 popmark = ST.mark_name;
4986 /* (*CUT) - No name, we cut here.*/
4987 if (ST.mark_loc > startpoint)
4988 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4989 /* but we set sv_commit to latest mark_name if there
4990 is one so they can test to see how things lead to this
4993 sv_commit=mark_state->u.mark.mark_name;
5001 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5003 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5006 U8 folded[UTF8_MAXBYTES_CASE+1];
5008 const char * const l = locinput;
5009 char *e = PL_regeol;
5010 to_uni_fold(n, folded, &foldlen);
5012 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
5013 l, &e, 0, do_utf8)) {
5018 nextchr = UCHARAT(locinput);
5021 if ((n=is_LNBREAK(locinput,do_utf8))) {
5023 nextchr = UCHARAT(locinput);
5028 #define CASE_CLASS(nAmE) \
5030 if ((n=is_##nAmE(locinput,do_utf8))) { \
5032 nextchr = UCHARAT(locinput); \
5037 if ((n=is_##nAmE(locinput,do_utf8))) { \
5040 locinput += UTF8SKIP(locinput); \
5041 nextchr = UCHARAT(locinput); \
5046 CASE_CLASS(HORIZWS);
5050 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5051 PTR2UV(scan), OP(scan));
5052 Perl_croak(aTHX_ "regexp memory corruption");
5056 /* switch break jumps here */
5057 scan = next; /* prepare to execute the next op and ... */
5058 continue; /* ... jump back to the top, reusing st */
5062 /* push a state that backtracks on success */
5063 st->u.yes.prev_yes_state = yes_state;
5067 /* push a new regex state, then continue at scan */
5069 regmatch_state *newst;
5072 regmatch_state *cur = st;
5073 regmatch_state *curyes = yes_state;
5075 regmatch_slab *slab = PL_regmatch_slab;
5076 for (;curd > -1;cur--,curd--) {
5077 if (cur < SLAB_FIRST(slab)) {
5079 cur = SLAB_LAST(slab);
5081 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5082 REPORT_CODE_OFF + 2 + depth * 2,"",
5083 curd, PL_reg_name[cur->resume_state],
5084 (curyes == cur) ? "yes" : ""
5087 curyes = cur->u.yes.prev_yes_state;
5090 DEBUG_STATE_pp("push")
5093 st->locinput = locinput;
5095 if (newst > SLAB_LAST(PL_regmatch_slab))
5096 newst = S_push_slab(aTHX);
5097 PL_regmatch_state = newst;
5099 locinput = PL_reginput;
5100 nextchr = UCHARAT(locinput);
5108 * We get here only if there's trouble -- normally "case END" is
5109 * the terminating point.
5111 Perl_croak(aTHX_ "corrupted regexp pointers");
5117 /* we have successfully completed a subexpression, but we must now
5118 * pop to the state marked by yes_state and continue from there */
5119 assert(st != yes_state);
5121 while (st != yes_state) {
5123 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5124 PL_regmatch_slab = PL_regmatch_slab->prev;
5125 st = SLAB_LAST(PL_regmatch_slab);
5129 DEBUG_STATE_pp("pop (no final)");
5131 DEBUG_STATE_pp("pop (yes)");
5137 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5138 || yes_state > SLAB_LAST(PL_regmatch_slab))
5140 /* not in this slab, pop slab */
5141 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5142 PL_regmatch_slab = PL_regmatch_slab->prev;
5143 st = SLAB_LAST(PL_regmatch_slab);
5145 depth -= (st - yes_state);
5148 yes_state = st->u.yes.prev_yes_state;
5149 PL_regmatch_state = st;
5152 locinput= st->locinput;
5153 nextchr = UCHARAT(locinput);
5155 state_num = st->resume_state + no_final;
5156 goto reenter_switch;
5159 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5160 PL_colors[4], PL_colors[5]));
5162 if (PL_reg_eval_set) {
5163 /* each successfully executed (?{...}) block does the equivalent of
5164 * local $^R = do {...}
5165 * When popping the save stack, all these locals would be undone;
5166 * bypass this by setting the outermost saved $^R to the latest
5168 if (oreplsv != GvSV(PL_replgv))
5169 sv_setsv(oreplsv, GvSV(PL_replgv));
5176 PerlIO_printf(Perl_debug_log,
5177 "%*s %sfailed...%s\n",
5178 REPORT_CODE_OFF+depth*2, "",
5179 PL_colors[4], PL_colors[5])
5191 /* there's a previous state to backtrack to */
5193 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5194 PL_regmatch_slab = PL_regmatch_slab->prev;
5195 st = SLAB_LAST(PL_regmatch_slab);
5197 PL_regmatch_state = st;
5198 locinput= st->locinput;
5199 nextchr = UCHARAT(locinput);
5201 DEBUG_STATE_pp("pop");
5203 if (yes_state == st)
5204 yes_state = st->u.yes.prev_yes_state;
5206 state_num = st->resume_state + 1; /* failure = success + 1 */
5207 goto reenter_switch;
5212 if (rex->intflags & PREGf_VERBARG_SEEN) {
5213 SV *sv_err = get_sv("REGERROR", 1);
5214 SV *sv_mrk = get_sv("REGMARK", 1);
5216 sv_commit = &PL_sv_no;
5218 sv_yes_mark = &PL_sv_yes;
5221 sv_commit = &PL_sv_yes;
5222 sv_yes_mark = &PL_sv_no;
5224 sv_setsv(sv_err, sv_commit);
5225 sv_setsv(sv_mrk, sv_yes_mark);
5228 /* clean up; in particular, free all slabs above current one */
5229 LEAVE_SCOPE(oldsave);
5235 - regrepeat - repeatedly match something simple, report how many
5238 * [This routine now assumes that it will only match on things of length 1.
5239 * That was true before, but now we assume scan - reginput is the count,
5240 * rather than incrementing count on every character. [Er, except utf8.]]
5243 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5246 register char *scan;
5248 register char *loceol = PL_regeol;
5249 register I32 hardcount = 0;
5250 register bool do_utf8 = PL_reg_match_utf8;
5252 PERL_UNUSED_ARG(depth);
5256 if (max == REG_INFTY)
5258 else if (max < loceol - scan)
5259 loceol = scan + max;
5264 while (scan < loceol && hardcount < max && *scan != '\n') {
5265 scan += UTF8SKIP(scan);
5269 while (scan < loceol && *scan != '\n')
5276 while (scan < loceol && hardcount < max) {
5277 scan += UTF8SKIP(scan);
5287 case EXACT: /* length of string is 1 */
5289 while (scan < loceol && UCHARAT(scan) == c)
5292 case EXACTF: /* length of string is 1 */
5294 while (scan < loceol &&
5295 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5298 case EXACTFL: /* length of string is 1 */
5299 PL_reg_flags |= RF_tainted;
5301 while (scan < loceol &&
5302 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5308 while (hardcount < max && scan < loceol &&
5309 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5310 scan += UTF8SKIP(scan);
5314 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5321 LOAD_UTF8_CHARCLASS_ALNUM();
5322 while (hardcount < max && scan < loceol &&
5323 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5324 scan += UTF8SKIP(scan);
5328 while (scan < loceol && isALNUM(*scan))
5333 PL_reg_flags |= RF_tainted;
5336 while (hardcount < max && scan < loceol &&
5337 isALNUM_LC_utf8((U8*)scan)) {
5338 scan += UTF8SKIP(scan);
5342 while (scan < loceol && isALNUM_LC(*scan))
5349 LOAD_UTF8_CHARCLASS_ALNUM();
5350 while (hardcount < max && scan < loceol &&
5351 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5352 scan += UTF8SKIP(scan);
5356 while (scan < loceol && !isALNUM(*scan))
5361 PL_reg_flags |= RF_tainted;
5364 while (hardcount < max && scan < loceol &&
5365 !isALNUM_LC_utf8((U8*)scan)) {
5366 scan += UTF8SKIP(scan);
5370 while (scan < loceol && !isALNUM_LC(*scan))
5377 LOAD_UTF8_CHARCLASS_SPACE();
5378 while (hardcount < max && scan < loceol &&
5380 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5381 scan += UTF8SKIP(scan);
5385 while (scan < loceol && isSPACE(*scan))
5390 PL_reg_flags |= RF_tainted;
5393 while (hardcount < max && scan < loceol &&
5394 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5395 scan += UTF8SKIP(scan);
5399 while (scan < loceol && isSPACE_LC(*scan))
5406 LOAD_UTF8_CHARCLASS_SPACE();
5407 while (hardcount < max && scan < loceol &&
5409 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5410 scan += UTF8SKIP(scan);
5414 while (scan < loceol && !isSPACE(*scan))
5419 PL_reg_flags |= RF_tainted;
5422 while (hardcount < max && scan < loceol &&
5423 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5424 scan += UTF8SKIP(scan);
5428 while (scan < loceol && !isSPACE_LC(*scan))
5435 LOAD_UTF8_CHARCLASS_DIGIT();
5436 while (hardcount < max && scan < loceol &&
5437 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5438 scan += UTF8SKIP(scan);
5442 while (scan < loceol && isDIGIT(*scan))
5449 LOAD_UTF8_CHARCLASS_DIGIT();
5450 while (hardcount < max && scan < loceol &&
5451 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5452 scan += UTF8SKIP(scan);
5456 while (scan < loceol && !isDIGIT(*scan))
5462 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5468 LNBREAK can match two latin chars, which is ok,
5469 because we have a null terminated string, but we
5470 have to use hardcount in this situation
5472 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5481 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5486 while (scan < loceol && is_HORIZWS_latin1(scan))
5493 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5494 scan += UTF8SKIP(scan);
5498 while (scan < loceol && !is_HORIZWS_latin1(scan))
5506 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5511 while (scan < loceol && is_VERTWS_latin1(scan))
5519 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5520 scan += UTF8SKIP(scan);
5524 while (scan < loceol && !is_VERTWS_latin1(scan))
5530 default: /* Called on something of 0 width. */
5531 break; /* So match right here or not at all. */
5537 c = scan - PL_reginput;
5541 GET_RE_DEBUG_FLAGS_DECL;
5543 SV * const prop = sv_newmortal();
5544 regprop(prog, prop, p);
5545 PerlIO_printf(Perl_debug_log,
5546 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5547 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5555 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5557 - regclass_swash - prepare the utf8 swash
5561 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5567 RXi_GET_DECL(prog,progi);
5568 const struct reg_data * const data = prog ? progi->data : NULL;
5570 if (data && data->count) {
5571 const U32 n = ARG(node);
5573 if (data->what[n] == 's') {
5574 SV * const rv = (SV*)data->data[n];
5575 AV * const av = (AV*)SvRV((SV*)rv);
5576 SV **const ary = AvARRAY(av);
5579 /* See the end of regcomp.c:S_regclass() for
5580 * documentation of these array elements. */
5583 a = SvROK(ary[1]) ? &ary[1] : 0;
5584 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5588 else if (si && doinit) {
5589 sw = swash_init("utf8", "", si, 1, 0);
5590 (void)av_store(av, 1, sw);
5607 - reginclass - determine if a character falls into a character class
5609 The n is the ANYOF regnode, the p is the target string, lenp
5610 is pointer to the maximum length of how far to go in the p
5611 (if the lenp is zero, UTF8SKIP(p) is used),
5612 do_utf8 tells whether the target string is in UTF-8.
5617 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5620 const char flags = ANYOF_FLAGS(n);
5626 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5627 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5628 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5629 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5630 if (len == (STRLEN)-1)
5631 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5634 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5635 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5638 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5639 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5642 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5646 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5649 if (swash_fetch(sw, p, do_utf8))
5651 else if (flags & ANYOF_FOLD) {
5652 if (!match && lenp && av) {
5654 for (i = 0; i <= av_len(av); i++) {
5655 SV* const sv = *av_fetch(av, i, FALSE);
5657 const char * const s = SvPV_const(sv, len);
5659 if (len <= plen && memEQ(s, (char*)p, len)) {
5667 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5670 to_utf8_fold(p, tmpbuf, &tmplen);
5671 if (swash_fetch(sw, tmpbuf, do_utf8))
5677 if (match && lenp && *lenp == 0)
5678 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5680 if (!match && c < 256) {
5681 if (ANYOF_BITMAP_TEST(n, c))
5683 else if (flags & ANYOF_FOLD) {
5686 if (flags & ANYOF_LOCALE) {
5687 PL_reg_flags |= RF_tainted;
5688 f = PL_fold_locale[c];
5692 if (f != c && ANYOF_BITMAP_TEST(n, f))
5696 if (!match && (flags & ANYOF_CLASS)) {
5697 PL_reg_flags |= RF_tainted;
5699 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5700 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5701 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5702 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5703 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5704 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5705 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5706 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5707 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5708 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5709 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5710 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5711 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5712 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5713 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5714 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5715 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5716 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5717 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5718 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5719 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5720 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5721 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5722 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5723 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5724 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5725 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5726 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5727 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5728 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5729 ) /* How's that for a conditional? */
5736 return (flags & ANYOF_INVERT) ? !match : match;
5740 S_reghop3(U8 *s, I32 off, const U8* lim)
5744 while (off-- && s < lim) {
5745 /* XXX could check well-formedness here */
5750 while (off++ && s > lim) {
5752 if (UTF8_IS_CONTINUED(*s)) {
5753 while (s > lim && UTF8_IS_CONTINUATION(*s))
5756 /* XXX could check well-formedness here */
5763 /* there are a bunch of places where we use two reghop3's that should
5764 be replaced with this routine. but since thats not done yet
5765 we ifdef it out - dmq
5768 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5772 while (off-- && s < rlim) {
5773 /* XXX could check well-formedness here */
5778 while (off++ && s > llim) {
5780 if (UTF8_IS_CONTINUED(*s)) {
5781 while (s > llim && UTF8_IS_CONTINUATION(*s))
5784 /* XXX could check well-formedness here */
5792 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5796 while (off-- && s < lim) {
5797 /* XXX could check well-formedness here */
5804 while (off++ && s > lim) {
5806 if (UTF8_IS_CONTINUED(*s)) {
5807 while (s > lim && UTF8_IS_CONTINUATION(*s))
5810 /* XXX could check well-formedness here */
5819 restore_pos(pTHX_ void *arg)
5822 regexp * const rex = (regexp *)arg;
5823 if (PL_reg_eval_set) {
5824 if (PL_reg_oldsaved) {
5825 rex->subbeg = PL_reg_oldsaved;
5826 rex->sublen = PL_reg_oldsavedlen;
5827 #ifdef PERL_OLD_COPY_ON_WRITE
5828 rex->saved_copy = PL_nrs;
5830 RX_MATCH_COPIED_on(rex);
5832 PL_reg_magic->mg_len = PL_reg_oldpos;
5833 PL_reg_eval_set = 0;
5834 PL_curpm = PL_reg_oldcurpm;
5839 S_to_utf8_substr(pTHX_ register regexp *prog)
5843 if (prog->substrs->data[i].substr
5844 && !prog->substrs->data[i].utf8_substr) {
5845 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5846 prog->substrs->data[i].utf8_substr = sv;
5847 sv_utf8_upgrade(sv);
5848 if (SvVALID(prog->substrs->data[i].substr)) {
5849 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5850 if (flags & FBMcf_TAIL) {
5851 /* Trim the trailing \n that fbm_compile added last
5853 SvCUR_set(sv, SvCUR(sv) - 1);
5854 /* Whilst this makes the SV technically "invalid" (as its
5855 buffer is no longer followed by "\0") when fbm_compile()
5856 adds the "\n" back, a "\0" is restored. */
5858 fbm_compile(sv, flags);
5860 if (prog->substrs->data[i].substr == prog->check_substr)
5861 prog->check_utf8 = sv;
5867 S_to_byte_substr(pTHX_ register regexp *prog)
5872 if (prog->substrs->data[i].utf8_substr
5873 && !prog->substrs->data[i].substr) {
5874 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5875 if (sv_utf8_downgrade(sv, TRUE)) {
5876 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5878 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5879 if (flags & FBMcf_TAIL) {
5880 /* Trim the trailing \n that fbm_compile added last
5882 SvCUR_set(sv, SvCUR(sv) - 1);
5884 fbm_compile(sv, flags);
5890 prog->substrs->data[i].substr = sv;
5891 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5892 prog->check_substr = sv;
5899 * c-indentation-style: bsd
5901 * indent-tabs-mode: t
5904 * ex: set ts=8 sts=4 sw=4 noet: