5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
80 #define RF_evaled 4 /* Did an EVAL with setting? */
81 #define RF_utf8 8 /* String contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
159 const int retval = PL_savestack_ix;
160 #define REGCP_PAREN_ELEMS 4
161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 GET_RE_DEBUG_FLAGS_DECL;
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168 #define REGCP_OTHER_ELEMS 6
169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
170 for (p = PL_regsize; p > parenfloor; p--) {
171 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
174 SSPUSHPTR(PL_reg_start_tmp[p]);
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
183 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
186 SSPUSHINT(*PL_reglastcloseparen);
187 SSPUSHPTR(PL_reginput);
188 #define REGCP_FRAME_ELEMS 2
189 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
197 /* These are needed since we do not localize EVAL nodes: */
198 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
202 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
203 PerlIO_printf(Perl_debug_log, \
204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
208 S_regcppop(pTHX_ const regexp *rex)
214 GET_RE_DEBUG_FLAGS_DECL;
216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
220 input = (char *) SSPOPPTR;
221 *PL_reglastcloseparen = SSPOPINT;
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
225 /* Now restore the parentheses context. */
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
229 U32 paren = (U32)SSPOPINT;
230 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
231 PL_regstartp[paren] = SSPOPINT;
233 if (paren <= *PL_reglastparen)
234 PL_regendp[paren] = tmps;
236 PerlIO_printf(Perl_debug_log,
237 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
238 (UV)paren, (IV)PL_regstartp[paren],
239 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
240 (IV)PL_regendp[paren],
241 (paren > *PL_reglastparen ? "(no)" : ""));
245 if (*PL_reglastparen + 1 <= rex->nparens) {
246 PerlIO_printf(Perl_debug_log,
247 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
248 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
252 /* It would seem that the similar code in regtry()
253 * already takes care of this, and in fact it is in
254 * a better location to since this code can #if 0-ed out
255 * but the code in regtry() is needed or otherwise tests
256 * requiring null fields (pat.t#187 and split.t#{13,14}
257 * (as of patchlevel 7877) will fail. Then again,
258 * this code seems to be necessary or otherwise
259 * building DynaLoader will fail:
260 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
264 PL_regstartp[i] = -1;
271 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
274 * pregexec and friends
277 #ifndef PERL_IN_XSUB_RE
279 - pregexec - match a regexp against a string
282 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
283 char *strbeg, I32 minend, SV *screamer, U32 nosave)
284 /* strend: pointer to null at end of string */
285 /* strbeg: real beginning of string */
286 /* minend: end of match must be >=minend after stringarg. */
287 /* nosave: For optimizations. */
290 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
291 nosave ? 0 : REXEC_COPY_STR);
296 * Need to implement the following flags for reg_anch:
298 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
301 * INTUIT_AUTORITATIVE_ML
302 * INTUIT_ONCE_NOML - Intuit can match in one location only.
305 * Another flag for this function: SECOND_TIME (so that float substrs
306 * with giant delta may be not rechecked).
309 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
312 Otherwise, only SvCUR(sv) is used to get strbeg. */
314 /* XXXX We assume that strpos is strbeg unless sv. */
316 /* XXXX Some places assume that there is a fixed substring.
317 An update may be needed if optimizer marks as "INTUITable"
318 RExen without fixed substrings. Similarly, it is assumed that
319 lengths of all the strings are no more than minlen, thus they
320 cannot come from lookahead.
321 (Or minlen should take into account lookahead.) */
323 /* A failure to find a constant substring means that there is no need to make
324 an expensive call to REx engine, thus we celebrate a failure. Similarly,
325 finding a substring too deep into the string means that less calls to
326 regtry() should be needed.
328 REx compiler's optimizer found 4 possible hints:
329 a) Anchored substring;
331 c) Whether we are anchored (beginning-of-line or \G);
332 d) First node (of those at offset 0) which may distingush positions;
333 We use a)b)d) and multiline-part of c), and try to find a position in the
334 string which does not contradict any of them.
337 /* Most of decisions we do here should have been done at compile time.
338 The nodes of the REx which we used for the search should have been
339 deleted from the finite automaton. */
342 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
343 char *strend, U32 flags, re_scream_pos_data *data)
346 register I32 start_shift = 0;
347 /* Should be nonnegative! */
348 register I32 end_shift = 0;
353 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
355 register char *other_last = NULL; /* other substr checked before this */
356 char *check_at = NULL; /* check substr found at this pos */
357 const I32 multiline = prog->reganch & PMf_MULTILINE;
359 const char * const i_strpos = strpos;
362 GET_RE_DEBUG_FLAGS_DECL;
364 RX_MATCH_UTF8_set(prog,do_utf8);
366 if (prog->reganch & ROPT_UTF8) {
367 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
368 "UTF-8 regex...\n"));
369 PL_reg_flags |= RF_utf8;
373 RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
374 PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
378 if (PL_reg_match_utf8)
379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
380 "UTF-8 target...\n"));
381 PerlIO_printf(Perl_debug_log,
382 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
383 PL_colors[4], PL_colors[5], PL_colors[0],
386 (strlen(prog->precomp) > 60 ? "..." : ""),
388 (int)(len > 60 ? 60 : len),
390 (len > 60 ? "..." : "")
394 /* CHR_DIST() would be more correct here but it makes things slow. */
395 if (prog->minlen > strend - strpos) {
396 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
397 "String too short... [re_intuit_start]\n"));
400 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
403 if (!prog->check_utf8 && prog->check_substr)
404 to_utf8_substr(prog);
405 check = prog->check_utf8;
407 if (!prog->check_substr && prog->check_utf8)
408 to_byte_substr(prog);
409 check = prog->check_substr;
411 if (check == &PL_sv_undef) {
412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
413 "Non-utf string cannot match utf check string\n"));
416 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
417 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
418 || ( (prog->reganch & ROPT_ANCH_BOL)
419 && !multiline ) ); /* Check after \n? */
422 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
423 | ROPT_IMPLICIT)) /* not a real BOL */
424 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
426 && (strpos != strbeg)) {
427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
430 if (prog->check_offset_min == prog->check_offset_max &&
431 !(prog->reganch & ROPT_CANY_SEEN)) {
432 /* Substring at constant offset from beg-of-str... */
435 s = HOP3c(strpos, prog->check_offset_min, strend);
437 slen = SvCUR(check); /* >= 1 */
439 if ( strend - s > slen || strend - s < slen - 1
440 || (strend - s == slen && strend[-1] != '\n')) {
441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
444 /* Now should match s[0..slen-2] */
446 if (slen && (*SvPVX_const(check) != *s
448 && memNE(SvPVX_const(check), s, slen)))) {
450 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
454 else if (*SvPVX_const(check) != *s
455 || ((slen = SvCUR(check)) > 1
456 && memNE(SvPVX_const(check), s, slen)))
459 goto success_at_start;
462 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
464 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
465 end_shift = prog->minlen - start_shift -
466 CHR_SVLEN(check) + (SvTAIL(check) != 0);
468 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
469 - (SvTAIL(check) != 0);
470 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
472 if (end_shift < eshift)
476 else { /* Can match at random position */
479 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
480 /* Should be nonnegative! */
481 end_shift = prog->minlen - start_shift -
482 CHR_SVLEN(check) + (SvTAIL(check) != 0);
485 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
487 Perl_croak(aTHX_ "panic: end_shift");
491 /* Find a possible match in the region s..strend by looking for
492 the "check" substring in the region corrected by start/end_shift. */
493 if (flags & REXEC_SCREAM) {
494 I32 p = -1; /* Internal iterator of scream. */
495 I32 * const pp = data ? data->scream_pos : &p;
497 if (PL_screamfirst[BmRARE(check)] >= 0
498 || ( BmRARE(check) == '\n'
499 && (BmPREVIOUS(check) == SvCUR(check) - 1)
501 s = screaminstr(sv, check,
502 start_shift + (s - strbeg), end_shift, pp, 0);
505 /* we may be pointing at the wrong string */
506 if (s && RX_MATCH_COPIED(prog))
507 s = strbeg + (s - SvPVX_const(sv));
509 *data->scream_olds = s;
511 else if (prog->reganch & ROPT_CANY_SEEN)
512 s = fbm_instr((U8*)(s + start_shift),
513 (U8*)(strend - end_shift),
514 check, multiline ? FBMrf_MULTILINE : 0);
516 s = fbm_instr(HOP3(s, start_shift, strend),
517 HOP3(strend, -end_shift, strbeg),
518 check, multiline ? FBMrf_MULTILINE : 0);
520 /* Update the count-of-usability, remove useless subpatterns,
523 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
524 (s ? "Found" : "Did not find"),
525 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
527 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
529 PL_colors[1], (SvTAIL(check) ? "$" : ""),
530 (s ? " at offset " : "...\n") ) );
537 /* Finish the diagnostic message */
538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
540 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
541 Start with the other substr.
542 XXXX no SCREAM optimization yet - and a very coarse implementation
543 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
544 *always* match. Probably should be marked during compile...
545 Probably it is right to do no SCREAM here...
548 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
549 /* Take into account the "other" substring. */
550 /* XXXX May be hopelessly wrong for UTF... */
553 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
556 char * const last = HOP3c(s, -start_shift, strbeg);
558 char * const saved_s = s;
561 t = s - prog->check_offset_max;
562 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
564 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
569 t = HOP3c(t, prog->anchored_offset, strend);
570 if (t < other_last) /* These positions already checked */
572 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
575 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
576 /* On end-of-str: see comment below. */
577 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
578 if (must == &PL_sv_undef) {
580 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
585 HOP3(HOP3(last1, prog->anchored_offset, strend)
586 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
588 multiline ? FBMrf_MULTILINE : 0
590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
591 "%s anchored substr \"%s%.*s%s\"%s",
592 (s ? "Found" : "Contradicts"),
595 - (SvTAIL(must)!=0)),
597 PL_colors[1], (SvTAIL(must) ? "$" : "")));
599 if (last1 >= last2) {
600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
601 ", giving up...\n"));
604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
605 ", trying floating at offset %ld...\n",
606 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
607 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
608 s = HOP3c(last, 1, strend);
612 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
613 (long)(s - i_strpos)));
614 t = HOP3c(s, -prog->anchored_offset, strbeg);
615 other_last = HOP3c(s, 1, strend);
623 else { /* Take into account the floating substring. */
625 char * const saved_s = s;
628 t = HOP3c(s, -start_shift, strbeg);
630 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
631 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
632 last = HOP3c(t, prog->float_max_offset, strend);
633 s = HOP3c(t, prog->float_min_offset, strend);
636 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
637 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
638 /* fbm_instr() takes into account exact value of end-of-str
639 if the check is SvTAIL(ed). Since false positives are OK,
640 and end-of-str is not later than strend we are OK. */
641 if (must == &PL_sv_undef) {
643 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
646 s = fbm_instr((unsigned char*)s,
647 (unsigned char*)last + SvCUR(must)
649 must, multiline ? FBMrf_MULTILINE : 0);
650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
651 (s ? "Found" : "Contradicts"),
653 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
659 ", giving up...\n"));
662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
663 ", trying anchored starting at offset %ld...\n",
664 (long)(saved_s + 1 - i_strpos)));
666 s = HOP3c(t, 1, strend);
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
671 (long)(s - i_strpos)));
672 other_last = s; /* Fix this later. --Hugo */
681 t = s - prog->check_offset_max;
682 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
684 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
686 /* Fixed substring is found far enough so that the match
687 cannot start at strpos. */
689 if (ml_anch && t[-1] != '\n') {
690 /* Eventually fbm_*() should handle this, but often
691 anchored_offset is not 0, so this check will not be wasted. */
692 /* XXXX In the code below we prefer to look for "^" even in
693 presence of anchored substrings. And we search even
694 beyond the found float position. These pessimizations
695 are historical artefacts only. */
697 while (t < strend - prog->minlen) {
699 if (t < check_at - prog->check_offset_min) {
700 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
701 /* Since we moved from the found position,
702 we definitely contradict the found anchored
703 substr. Due to the above check we do not
704 contradict "check" substr.
705 Thus we can arrive here only if check substr
706 is float. Redo checking for "other"=="fixed".
709 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
710 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
711 goto do_other_anchored;
713 /* We don't contradict the found floating substring. */
714 /* XXXX Why not check for STCLASS? */
716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
717 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
720 /* Position contradicts check-string */
721 /* XXXX probably better to look for check-string
722 than for "\n", so one should lower the limit for t? */
723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
724 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
725 other_last = strpos = s = t + 1;
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
731 PL_colors[0], PL_colors[1]));
735 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
736 PL_colors[0], PL_colors[1]));
740 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
743 /* The found string does not prohibit matching at strpos,
744 - no optimization of calling REx engine can be performed,
745 unless it was an MBOL and we are not after MBOL,
746 or a future STCLASS check will fail this. */
748 /* Even in this situation we may use MBOL flag if strpos is offset
749 wrt the start of the string. */
750 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
751 && (strpos != strbeg) && strpos[-1] != '\n'
752 /* May be due to an implicit anchor of m{.*foo} */
753 && !(prog->reganch & ROPT_IMPLICIT))
758 DEBUG_EXECUTE_r( if (ml_anch)
759 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
760 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
763 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
765 prog->check_utf8 /* Could be deleted already */
766 && --BmUSEFUL(prog->check_utf8) < 0
767 && (prog->check_utf8 == prog->float_utf8)
769 prog->check_substr /* Could be deleted already */
770 && --BmUSEFUL(prog->check_substr) < 0
771 && (prog->check_substr == prog->float_substr)
774 /* If flags & SOMETHING - do not do it many times on the same match */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
776 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
777 if (do_utf8 ? prog->check_substr : prog->check_utf8)
778 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
779 prog->check_substr = prog->check_utf8 = NULL; /* disable */
780 prog->float_substr = prog->float_utf8 = NULL; /* clear */
781 check = NULL; /* abort */
783 /* XXXX This is a remnant of the old implementation. It
784 looks wasteful, since now INTUIT can use many
786 prog->reganch &= ~RE_USE_INTUIT;
793 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
794 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
795 /* minlen == 0 is possible if regstclass is \b or \B,
796 and the fixed substr is ''$.
797 Since minlen is already taken into account, s+1 is before strend;
798 accidentally, minlen >= 1 guaranties no false positives at s + 1
799 even for \b or \B. But (minlen? 1 : 0) below assumes that
800 regstclass does not come from lookahead... */
801 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
802 This leaves EXACTF only, which is dealt with in find_byclass(). */
803 const U8* const str = (U8*)STRING(prog->regstclass);
804 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
805 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
807 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
808 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
809 : (prog->float_substr || prog->float_utf8
810 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
813 /*if (OP(prog->regstclass) == TRIE)
816 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
819 const char *what = NULL;
821 if (endpos == strend) {
822 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
823 "Could not match STCLASS...\n") );
826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
827 "This position contradicts STCLASS...\n") );
828 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
830 /* Contradict one of substrings */
831 if (prog->anchored_substr || prog->anchored_utf8) {
832 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
833 DEBUG_EXECUTE_r( what = "anchored" );
835 s = HOP3c(t, 1, strend);
836 if (s + start_shift + end_shift > strend) {
837 /* XXXX Should be taken into account earlier? */
838 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
839 "Could not match STCLASS...\n") );
844 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
845 "Looking for %s substr starting at offset %ld...\n",
846 what, (long)(s + start_shift - i_strpos)) );
849 /* Have both, check_string is floating */
850 if (t + start_shift >= check_at) /* Contradicts floating=check */
851 goto retry_floating_check;
852 /* Recheck anchored substring, but not floating... */
856 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
857 "Looking for anchored substr starting at offset %ld...\n",
858 (long)(other_last - i_strpos)) );
859 goto do_other_anchored;
861 /* Another way we could have checked stclass at the
862 current position only: */
867 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
868 "Looking for /%s^%s/m starting at offset %ld...\n",
869 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
872 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
874 /* Check is floating subtring. */
875 retry_floating_check:
876 t = check_at - start_shift;
877 DEBUG_EXECUTE_r( what = "floating" );
878 goto hop_and_restart;
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
882 "By STCLASS: moving %ld --> %ld\n",
883 (long)(t - i_strpos), (long)(s - i_strpos))
887 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
888 "Does not contradict STCLASS...\n");
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
894 PL_colors[4], (check ? "Guessed" : "Giving up"),
895 PL_colors[5], (long)(s - i_strpos)) );
898 fail_finish: /* Substring not found */
899 if (prog->check_substr || prog->check_utf8) /* could be removed already */
900 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
903 PL_colors[4], PL_colors[5]));
907 /* We know what class REx starts with. Try to find this position... */
908 /* if reginfo is NULL, its a dryrun */
909 /* annoyingly all the vars in this routine have different names from their counterparts
910 in regmatch. /grrr */
912 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
913 foldlen, foldbuf, uniflags) STMT_START { \
914 switch (trie_type) { \
915 case trie_utf8_fold: \
917 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
922 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
923 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
924 foldlen -= UNISKIP( uvc ); \
925 uscan = foldbuf + UNISKIP( uvc ); \
929 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
937 charid = trie->charmap[ uvc ]; \
941 if (trie->widecharmap) { \
942 SV** const svpp = hv_fetch(trie->widecharmap, \
943 (char*)&uvc, sizeof(UV), 0); \
945 charid = (U16)SvIV(*svpp); \
950 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
953 ibcmp_utf8(s, NULL, 0, do_utf8, \
954 m, NULL, ln, (bool)UTF)) \
955 && (!reginfo || regtry(reginfo, s)) ) \
958 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
959 uvchr_to_utf8(tmpbuf, c); \
960 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
962 && (f == c1 || f == c2) \
963 && (ln == foldlen || \
964 !ibcmp_utf8((char *) foldbuf, \
965 NULL, foldlen, do_utf8, \
967 NULL, ln, (bool)UTF)) \
968 && (!reginfo || regtry(reginfo, s)) ) \
973 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
977 && (ln == 1 || !(OP(c) == EXACTF \
979 : ibcmp_locale(s, m, ln))) \
980 && (!reginfo || regtry(reginfo, s)) ) \
986 #define REXEC_FBC_UTF8_SCAN(CoDe) \
988 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
994 #define REXEC_FBC_SCAN(CoDe) \
996 while (s < strend) { \
1002 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1003 REXEC_FBC_UTF8_SCAN( \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1014 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1017 if (tmp && (!reginfo || regtry(reginfo, s))) \
1026 #define REXEC_FBC_TRYIT \
1027 if ((!reginfo || regtry(reginfo, s))) \
1030 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1033 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1036 REXEC_FBC_CLASS_SCAN(CoNd); \
1040 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1041 PL_reg_flags |= RF_tainted; \
1043 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1046 REXEC_FBC_CLASS_SCAN(CoNd); \
1051 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1052 const char *strend, const regmatch_info *reginfo)
1055 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1059 register STRLEN uskip;
1063 register I32 tmp = 1; /* Scratch variable? */
1064 register const bool do_utf8 = PL_reg_match_utf8;
1066 /* We know what class it must start with. */
1070 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1071 !UTF8_IS_INVARIANT((U8)s[0]) ?
1072 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1073 REGINCLASS(prog, c, (U8*)s));
1076 while (s < strend) {
1079 if (REGINCLASS(prog, c, (U8*)s) ||
1080 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1081 /* The assignment of 2 is intentional:
1082 * for the folded sharp s, the skip is 2. */
1083 (skip = SHARP_S_SKIP))) {
1084 if (tmp && (!reginfo || regtry(reginfo, s)))
1097 if (tmp && (!reginfo || regtry(reginfo, s)))
1105 ln = STR_LEN(c); /* length to match in octets/bytes */
1106 lnc = (I32) ln; /* length to match in characters */
1108 STRLEN ulen1, ulen2;
1110 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1111 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1112 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1114 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1115 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1117 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1119 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1122 while (sm < ((U8 *) m + ln)) {
1137 c2 = PL_fold_locale[c1];
1139 e = HOP3c(strend, -((I32)lnc), s);
1141 if (!reginfo && e < s)
1142 e = s; /* Due to minlen logic of intuit() */
1144 /* The idea in the EXACTF* cases is to first find the
1145 * first character of the EXACTF* node and then, if
1146 * necessary, case-insensitively compare the full
1147 * text of the node. The c1 and c2 are the first
1148 * characters (though in Unicode it gets a bit
1149 * more complicated because there are more cases
1150 * than just upper and lower: one needs to use
1151 * the so-called folding case for case-insensitive
1152 * matching (called "loose matching" in Unicode).
1153 * ibcmp_utf8() will do just that. */
1157 U8 tmpbuf [UTF8_MAXBYTES+1];
1158 STRLEN len, foldlen;
1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1161 /* Upper and lower of 1st char are equal -
1162 * probably not a "letter". */
1164 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1166 REXEC_FBC_EXACTISH_CHECK(c == c1);
1171 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1174 /* Handle some of the three Greek sigmas cases.
1175 * Note that not all the possible combinations
1176 * are handled here: some of them are handled
1177 * by the standard folding rules, and some of
1178 * them (the character class or ANYOF cases)
1179 * are handled during compiletime in
1180 * regexec.c:S_regclass(). */
1181 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1182 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1183 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1185 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1191 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1193 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1197 PL_reg_flags |= RF_tainted;
1204 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1205 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1207 tmp = ((OP(c) == BOUND ?
1208 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1209 LOAD_UTF8_CHARCLASS_ALNUM();
1210 REXEC_FBC_UTF8_SCAN(
1211 if (tmp == !(OP(c) == BOUND ?
1212 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1213 isALNUM_LC_utf8((U8*)s)))
1221 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1222 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1225 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1231 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1235 PL_reg_flags |= RF_tainted;
1242 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1243 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1245 tmp = ((OP(c) == NBOUND ?
1246 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1247 LOAD_UTF8_CHARCLASS_ALNUM();
1248 REXEC_FBC_UTF8_SCAN(
1249 if (tmp == !(OP(c) == NBOUND ?
1250 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1251 isALNUM_LC_utf8((U8*)s)))
1253 else REXEC_FBC_TRYIT;
1257 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1258 tmp = ((OP(c) == NBOUND ?
1259 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1262 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1264 else REXEC_FBC_TRYIT;
1267 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1271 REXEC_FBC_CSCAN_PRELOAD(
1272 LOAD_UTF8_CHARCLASS_ALNUM(),
1273 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1277 REXEC_FBC_CSCAN_TAINT(
1278 isALNUM_LC_utf8((U8*)s),
1282 REXEC_FBC_CSCAN_PRELOAD(
1283 LOAD_UTF8_CHARCLASS_ALNUM(),
1284 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1288 REXEC_FBC_CSCAN_TAINT(
1289 !isALNUM_LC_utf8((U8*)s),
1293 REXEC_FBC_CSCAN_PRELOAD(
1294 LOAD_UTF8_CHARCLASS_SPACE(),
1295 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1299 REXEC_FBC_CSCAN_TAINT(
1300 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1304 REXEC_FBC_CSCAN_PRELOAD(
1305 LOAD_UTF8_CHARCLASS_SPACE(),
1306 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1310 REXEC_FBC_CSCAN_TAINT(
1311 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1315 REXEC_FBC_CSCAN_PRELOAD(
1316 LOAD_UTF8_CHARCLASS_DIGIT(),
1317 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1321 REXEC_FBC_CSCAN_TAINT(
1322 isDIGIT_LC_utf8((U8*)s),
1326 REXEC_FBC_CSCAN_PRELOAD(
1327 LOAD_UTF8_CHARCLASS_DIGIT(),
1328 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1332 REXEC_FBC_CSCAN_TAINT(
1333 !isDIGIT_LC_utf8((U8*)s),
1337 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1339 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1340 trie_type = do_utf8 ?
1341 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1343 /* what trie are we using right now */
1345 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1346 reg_trie_data *trie=aho->trie;
1348 const char *last_start = strend - trie->minlen;
1349 const char *real_start = s;
1350 STRLEN maxlen = trie->maxlen;
1352 U8 **points; /* map of where we were in the input string
1353 when reading a given string. For ASCII this
1354 is unnecessary overhead as the relationship
1355 is always 1:1, but for unicode, especially
1356 case folded unicode this is not true. */
1357 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1359 GET_RE_DEBUG_FLAGS_DECL;
1361 /* We can't just allocate points here. We need to wrap it in
1362 * an SV so it gets freed properly if there is a croak while
1363 * running the match */
1366 sv_points=newSV(maxlen * sizeof(U8 *));
1367 SvCUR_set(sv_points,
1368 maxlen * sizeof(U8 *));
1369 SvPOK_on(sv_points);
1370 sv_2mortal(sv_points);
1371 points=(U8**)SvPV_nolen(sv_points );
1373 if (trie->bitmap && trie_type != trie_utf8_fold) {
1374 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1379 while (s <= last_start) {
1380 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1388 U8 *uscan = (U8*)NULL;
1389 U8 *leftmost = NULL;
1393 while ( state && uc <= (U8*)strend ) {
1395 if (aho->states[ state ].wordnum) {
1396 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1397 if (!leftmost || lpos < leftmost)
1401 points[pointpos++ % maxlen]= uc;
1402 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1403 uvc, charid, foldlen, foldbuf, uniflags);
1404 DEBUG_TRIE_EXECUTE_r(
1405 PerlIO_printf(Perl_debug_log,
1406 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1407 (int)((const char*)uc - real_start), charid, uvc)
1412 U32 word = aho->states[ state ].wordnum;
1413 base = aho->states[ state ].trans.base;
1415 DEBUG_TRIE_EXECUTE_r(
1416 PerlIO_printf( Perl_debug_log,
1417 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1418 failed ? "Fail transition to " : "",
1419 state, base, uvc, word)
1424 (base + charid > trie->uniquecharcount )
1425 && (base + charid - 1 - trie->uniquecharcount
1427 && trie->trans[base + charid - 1 -
1428 trie->uniquecharcount].check == state
1429 && (tmp=trie->trans[base + charid - 1 -
1430 trie->uniquecharcount ].next))
1440 state = aho->fail[state];
1444 /* we must be accepting here */
1452 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1453 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1459 if ( aho->states[ state ].wordnum ) {
1460 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1461 if (!leftmost || lpos < leftmost)
1464 DEBUG_TRIE_EXECUTE_r(
1465 PerlIO_printf( Perl_debug_log,
1466 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1471 s = (char*)leftmost;
1472 if (!reginfo || regtry(reginfo, s)) {
1487 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1496 - regexec_flags - match a regexp against a string
1499 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1500 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1501 /* strend: pointer to null at end of string */
1502 /* strbeg: real beginning of string */
1503 /* minend: end of match must be >=minend after stringarg. */
1504 /* data: May be used for some additional optimizations. */
1505 /* nosave: For optimizations. */
1509 register regnode *c;
1510 register char *startpos = stringarg;
1511 I32 minlen; /* must match at least this many chars */
1512 I32 dontbother = 0; /* how many characters not to try at end */
1513 I32 end_shift = 0; /* Same for the end. */ /* CC */
1514 I32 scream_pos = -1; /* Internal iterator of scream. */
1515 char *scream_olds = NULL;
1516 SV* const oreplsv = GvSV(PL_replgv);
1517 const bool do_utf8 = DO_UTF8(sv);
1520 regmatch_info reginfo; /* create some info to pass to regtry etc */
1522 GET_RE_DEBUG_FLAGS_DECL;
1524 PERL_UNUSED_ARG(data);
1526 /* Be paranoid... */
1527 if (prog == NULL || startpos == NULL) {
1528 Perl_croak(aTHX_ "NULL regexp parameter");
1532 multiline = prog->reganch & PMf_MULTILINE;
1533 reginfo.prog = prog;
1535 RX_MATCH_UTF8_set(prog, do_utf8);
1537 minlen = prog->minlen;
1538 if (strend - startpos < minlen) {
1539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1540 "String too short [regexec_flags]...\n"));
1544 /* Check validity of program. */
1545 if (UCHARAT(prog->program) != REG_MAGIC) {
1546 Perl_croak(aTHX_ "corrupted regexp program");
1550 PL_reg_eval_set = 0;
1553 if (prog->reganch & ROPT_UTF8)
1554 PL_reg_flags |= RF_utf8;
1556 /* Mark beginning of line for ^ and lookbehind. */
1557 reginfo.bol = startpos; /* XXX not used ??? */
1561 /* Mark end of line for $ (and such) */
1564 /* see how far we have to get to not match where we matched before */
1565 reginfo.till = startpos+minend;
1567 /* If there is a "must appear" string, look for it. */
1570 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1573 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1574 reginfo.ganch = startpos;
1575 else if (sv && SvTYPE(sv) >= SVt_PVMG
1577 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1578 && mg->mg_len >= 0) {
1579 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1580 if (prog->reganch & ROPT_ANCH_GPOS) {
1581 if (s > reginfo.ganch)
1586 else /* pos() not defined */
1587 reginfo.ganch = strbeg;
1590 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1591 re_scream_pos_data d;
1593 d.scream_olds = &scream_olds;
1594 d.scream_pos = &scream_pos;
1595 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1597 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1598 goto phooey; /* not present */
1603 RE_PV_DISPLAY_DECL(s0, len0, UTF,
1604 PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
1605 RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
1606 PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
1610 PerlIO_printf(Perl_debug_log,
1611 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1612 PL_colors[4], PL_colors[5], PL_colors[0],
1615 len0 > 60 ? "..." : "",
1617 (int)(len1 > 60 ? 60 : len1),
1619 (len1 > 60 ? "..." : "")
1623 /* Simplest case: anchored match need be tried only once. */
1624 /* [unless only anchor is BOL and multiline is set] */
1625 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1626 if (s == startpos && regtry(®info, startpos))
1628 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1629 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1634 dontbother = minlen - 1;
1635 end = HOP3c(strend, -dontbother, strbeg) - 1;
1636 /* for multiline we only have to try after newlines */
1637 if (prog->check_substr || prog->check_utf8) {
1641 if (regtry(®info, s))
1646 if (prog->reganch & RE_USE_INTUIT) {
1647 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1658 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1659 if (regtry(®info, s))
1666 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1667 if (regtry(®info, reginfo.ganch))
1672 /* Messy cases: unanchored match. */
1673 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1674 /* we have /x+whatever/ */
1675 /* it must be a one character string (XXXX Except UTF?) */
1680 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1681 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1682 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1687 DEBUG_EXECUTE_r( did_match = 1 );
1688 if (regtry(®info, s)) goto got_it;
1690 while (s < strend && *s == ch)
1698 DEBUG_EXECUTE_r( did_match = 1 );
1699 if (regtry(®info, s)) goto got_it;
1701 while (s < strend && *s == ch)
1706 DEBUG_EXECUTE_r(if (!did_match)
1707 PerlIO_printf(Perl_debug_log,
1708 "Did not find anchored character...\n")
1711 else if (prog->anchored_substr != NULL
1712 || prog->anchored_utf8 != NULL
1713 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1714 && prog->float_max_offset < strend - s)) {
1719 char *last1; /* Last position checked before */
1723 if (prog->anchored_substr || prog->anchored_utf8) {
1724 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1725 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1726 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1727 back_max = back_min = prog->anchored_offset;
1729 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1730 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1731 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1732 back_max = prog->float_max_offset;
1733 back_min = prog->float_min_offset;
1735 if (must == &PL_sv_undef)
1736 /* could not downgrade utf8 check substring, so must fail */
1739 last = HOP3c(strend, /* Cannot start after this */
1740 -(I32)(CHR_SVLEN(must)
1741 - (SvTAIL(must) != 0) + back_min), strbeg);
1744 last1 = HOPc(s, -1);
1746 last1 = s - 1; /* bogus */
1748 /* XXXX check_substr already used to find "s", can optimize if
1749 check_substr==must. */
1751 dontbother = end_shift;
1752 strend = HOPc(strend, -dontbother);
1753 while ( (s <= last) &&
1754 ((flags & REXEC_SCREAM)
1755 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1756 end_shift, &scream_pos, 0))
1757 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1758 (unsigned char*)strend, must,
1759 multiline ? FBMrf_MULTILINE : 0))) ) {
1760 /* we may be pointing at the wrong string */
1761 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1762 s = strbeg + (s - SvPVX_const(sv));
1763 DEBUG_EXECUTE_r( did_match = 1 );
1764 if (HOPc(s, -back_max) > last1) {
1765 last1 = HOPc(s, -back_min);
1766 s = HOPc(s, -back_max);
1769 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1771 last1 = HOPc(s, -back_min);
1775 while (s <= last1) {
1776 if (regtry(®info, s))
1782 while (s <= last1) {
1783 if (regtry(®info, s))
1789 DEBUG_EXECUTE_r(if (!did_match)
1790 PerlIO_printf(Perl_debug_log,
1791 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1792 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1793 ? "anchored" : "floating"),
1795 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1797 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1801 else if ((c = prog->regstclass)) {
1803 const OPCODE op = OP(prog->regstclass);
1804 /* don't bother with what can't match */
1805 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
1806 strend = HOPc(strend, -(minlen - 1));
1809 SV * const prop = sv_newmortal();
1810 regprop(prog, prop, c);
1812 RE_PV_DISPLAY_DECL(s0,len0,UTF,
1813 PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
1814 RE_PV_DISPLAY_DECL(s1,len1,UTF,
1815 PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
1816 PerlIO_printf(Perl_debug_log,
1817 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
1819 len1, len1, s1, (int)(strend - s));
1822 if (find_byclass(prog, c, s, strend, ®info))
1824 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1828 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1833 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1834 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1835 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1837 if (flags & REXEC_SCREAM) {
1838 last = screaminstr(sv, float_real, s - strbeg,
1839 end_shift, &scream_pos, 1); /* last one */
1841 last = scream_olds; /* Only one occurrence. */
1842 /* we may be pointing at the wrong string */
1843 else if (RX_MATCH_COPIED(prog))
1844 s = strbeg + (s - SvPVX_const(sv));
1848 const char * const little = SvPV_const(float_real, len);
1850 if (SvTAIL(float_real)) {
1851 if (memEQ(strend - len + 1, little, len - 1))
1852 last = strend - len + 1;
1853 else if (!multiline)
1854 last = memEQ(strend - len, little, len)
1855 ? strend - len : NULL;
1861 last = rninstr(s, strend, little, little + len);
1863 last = strend; /* matching "$" */
1867 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1868 "%sCan't trim the tail, match fails (should not happen)%s\n",
1869 PL_colors[4], PL_colors[5]));
1870 goto phooey; /* Should not happen! */
1872 dontbother = strend - last + prog->float_min_offset;
1874 if (minlen && (dontbother < minlen))
1875 dontbother = minlen - 1;
1876 strend -= dontbother; /* this one's always in bytes! */
1877 /* We don't know much -- general case. */
1880 if (regtry(®info, s))
1889 if (regtry(®info, s))
1891 } while (s++ < strend);
1899 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1901 if (PL_reg_eval_set) {
1902 /* Preserve the current value of $^R */
1903 if (oreplsv != GvSV(PL_replgv))
1904 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1905 restored, the value remains
1907 restore_pos(aTHX_ prog);
1910 /* make sure $`, $&, $', and $digit will work later */
1911 if ( !(flags & REXEC_NOT_FIRST) ) {
1912 RX_MATCH_COPY_FREE(prog);
1913 if (flags & REXEC_COPY_STR) {
1914 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
1915 #ifdef PERL_OLD_COPY_ON_WRITE
1917 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1919 PerlIO_printf(Perl_debug_log,
1920 "Copy on write: regexp capture, type %d\n",
1923 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
1924 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
1925 assert (SvPOKp(prog->saved_copy));
1929 RX_MATCH_COPIED_on(prog);
1930 s = savepvn(strbeg, i);
1936 prog->subbeg = strbeg;
1937 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1945 PL_colors[4], PL_colors[5]));
1946 if (PL_reg_eval_set)
1947 restore_pos(aTHX_ prog);
1952 - regtry - try match at specific point
1954 STATIC I32 /* 0 failure, 1 success */
1955 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
1961 regexp *prog = reginfo->prog;
1962 GET_RE_DEBUG_FLAGS_DECL;
1965 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1967 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1970 PL_reg_eval_set = RS_init;
1971 DEBUG_EXECUTE_r(DEBUG_s(
1972 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1973 (IV)(PL_stack_sp - PL_stack_base));
1975 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1976 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1977 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1979 /* Apparently this is not needed, judging by wantarray. */
1980 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1981 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1984 /* Make $_ available to executed code. */
1985 if (reginfo->sv != DEFSV) {
1987 DEFSV = reginfo->sv;
1990 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1991 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
1992 /* prepare for quick setting of pos */
1993 #ifdef PERL_OLD_COPY_ON_WRITE
1995 sv_force_normal_flags(sv, 0);
1997 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
1998 &PL_vtbl_mglob, NULL, 0);
2002 PL_reg_oldpos = mg->mg_len;
2003 SAVEDESTRUCTOR_X(restore_pos, prog);
2005 if (!PL_reg_curpm) {
2006 Newxz(PL_reg_curpm, 1, PMOP);
2009 SV* const repointer = newSViv(0);
2010 /* so we know which PL_regex_padav element is PL_reg_curpm */
2011 SvFLAGS(repointer) |= SVf_BREAK;
2012 av_push(PL_regex_padav,repointer);
2013 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2014 PL_regex_pad = AvARRAY(PL_regex_padav);
2018 PM_SETRE(PL_reg_curpm, prog);
2019 PL_reg_oldcurpm = PL_curpm;
2020 PL_curpm = PL_reg_curpm;
2021 if (RX_MATCH_COPIED(prog)) {
2022 /* Here is a serious problem: we cannot rewrite subbeg,
2023 since it may be needed if this match fails. Thus
2024 $` inside (?{}) could fail... */
2025 PL_reg_oldsaved = prog->subbeg;
2026 PL_reg_oldsavedlen = prog->sublen;
2027 #ifdef PERL_OLD_COPY_ON_WRITE
2028 PL_nrs = prog->saved_copy;
2030 RX_MATCH_COPIED_off(prog);
2033 PL_reg_oldsaved = NULL;
2034 prog->subbeg = PL_bostr;
2035 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2037 prog->startp[0] = startpos - PL_bostr;
2038 PL_reginput = startpos;
2039 PL_regstartp = prog->startp;
2040 PL_regendp = prog->endp;
2041 PL_reglastparen = &prog->lastparen;
2042 PL_reglastcloseparen = &prog->lastcloseparen;
2043 prog->lastparen = 0;
2044 prog->lastcloseparen = 0;
2046 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2047 if (PL_reg_start_tmpl <= prog->nparens) {
2048 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2049 if(PL_reg_start_tmp)
2050 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2052 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2055 /* XXXX What this code is doing here?!!! There should be no need
2056 to do this again and again, PL_reglastparen should take care of
2059 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2060 * Actually, the code in regcppop() (which Ilya may be meaning by
2061 * PL_reglastparen), is not needed at all by the test suite
2062 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2063 * enough, for building DynaLoader, or otherwise this
2064 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2065 * will happen. Meanwhile, this code *is* needed for the
2066 * above-mentioned test suite tests to succeed. The common theme
2067 * on those tests seems to be returning null fields from matches.
2072 if (prog->nparens) {
2074 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2081 if (regmatch(reginfo, prog->program + 1)) {
2082 prog->endp[0] = PL_reginput - PL_bostr;
2085 REGCP_UNWIND(lastcp);
2090 #define sayYES goto yes
2091 #define sayNO goto no
2092 #define sayNO_ANYOF goto no_anyof
2093 #define sayYES_FINAL goto yes_final
2094 #define sayNO_FINAL goto no_final
2095 #define sayNO_SILENT goto do_no
2096 #define saySAME(x) if (x) goto yes; else goto no
2098 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2099 #define POSCACHE_SEEN 1 /* we know what we're caching */
2100 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2102 #define CACHEsayYES STMT_START { \
2103 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2104 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2105 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2106 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2108 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2109 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2112 /* cache records failure, but this is success */ \
2114 PerlIO_printf(Perl_debug_log, \
2115 "%*s (remove success from failure cache)\n", \
2116 REPORT_CODE_OFF+PL_regindent*2, "") \
2118 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2124 #define CACHEsayNO STMT_START { \
2125 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2126 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2127 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2128 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2130 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2131 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2134 /* cache records success, but this is failure */ \
2136 PerlIO_printf(Perl_debug_log, \
2137 "%*s (remove failure from success cache)\n", \
2138 REPORT_CODE_OFF+PL_regindent*2, "") \
2140 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2146 /* this is used to determine how far from the left messages like
2147 'failed...' are printed. Currently 29 makes these messages line
2148 up with the opcode they refer to. Earlier perls used 25 which
2149 left these messages outdented making reviewing a debug output
2152 #define REPORT_CODE_OFF 29
2155 /* Make sure there is a test for this +1 options in re_tests */
2156 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2158 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2159 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2161 #define SLAB_FIRST(s) (&(s)->states[0])
2162 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2164 /* grab a new slab and return the first slot in it */
2166 STATIC regmatch_state *
2169 #if PERL_VERSION < 9
2172 regmatch_slab *s = PL_regmatch_slab->next;
2174 Newx(s, 1, regmatch_slab);
2175 s->prev = PL_regmatch_slab;
2177 PL_regmatch_slab->next = s;
2179 PL_regmatch_slab = s;
2180 return SLAB_FIRST(s);
2183 /* simulate a recursive call to regmatch */
2185 #define REGMATCH(ns, where) \
2188 st->resume_state = resume_##where; \
2189 goto start_recurse; \
2190 resume_point_##where:
2192 /* push a new state then goto it */
2194 #define PUSH_STATE_GOTO(state, node) \
2196 st->resume_state = state; \
2199 /* push a new state with success backtracking, then goto it */
2201 #define PUSH_YES_STATE_GOTO(state, node) \
2203 st->resume_state = state; \
2204 goto push_yes_state;
2209 - regmatch - main matching routine
2211 * Conceptually the strategy is simple: check to see whether the current
2212 * node matches, call self recursively to see whether the rest matches,
2213 * and then act accordingly. In practice we make some effort to avoid
2214 * recursion, in particular by going through "ordinary" nodes (that don't
2215 * need to know whether the rest of the match failed) by a loop instead of
2218 /* [lwall] I've hoisted the register declarations to the outer block in order to
2219 * maybe save a little bit of pushing and popping on the stack. It also takes
2220 * advantage of machines that use a register save mask on subroutine entry.
2222 * This function used to be heavily recursive, but since this had the
2223 * effect of blowing the CPU stack on complex regexes, it has been
2224 * restructured to be iterative, and to save state onto the heap rather
2225 * than the stack. Essentially whereever regmatch() used to be called, it
2226 * pushes the current state, notes where to return, then jumps back into
2229 * Originally the structure of this function used to look something like
2234 while (scan != NULL) {
2235 a++; // do stuff with a and b
2241 if (regmatch(...)) // recurse
2251 * Now it looks something like this:
2259 regmatch_state *st = new();
2261 st->a++; // do stuff with a and b
2263 while (scan != NULL) {
2271 st->resume_state = resume_FOO;
2272 goto start_recurse; // recurse
2281 st = new(); push a new state
2282 st->a = 1; st->b = 2;
2289 switch (resume_state) {
2291 goto resume_point_FOO;
2298 * WARNING: this means that any line in this function that contains a
2299 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2300 * regmatch() using gotos instead. Thus the values of any local variables
2301 * not saved in the regmatch_state structure will have been lost when
2302 * execution resumes on the next line .
2304 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2305 * PL_regmatch_state always points to the currently active state, and
2306 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2307 * The first time regmatch is called, the first slab is allocated, and is
2308 * never freed until interpreter desctruction. When the slab is full,
2309 * a new one is allocated chained to the end. At exit from regmatch, slabs
2310 * allocated since entry are freed.
2313 /* *** every FOO_fail should = FOO+1 */
2314 #define TRIE_next (REGNODE_MAX+1)
2315 #define TRIE_next_fail (REGNODE_MAX+2)
2316 #define EVAL_A (REGNODE_MAX+3)
2317 #define EVAL_A_fail (REGNODE_MAX+4)
2318 #define resume_CURLYX (REGNODE_MAX+5)
2319 #define resume_WHILEM1 (REGNODE_MAX+6)
2320 #define resume_WHILEM2 (REGNODE_MAX+7)
2321 #define resume_WHILEM3 (REGNODE_MAX+8)
2322 #define resume_WHILEM4 (REGNODE_MAX+9)
2323 #define resume_WHILEM5 (REGNODE_MAX+10)
2324 #define resume_WHILEM6 (REGNODE_MAX+11)
2325 #define BRANCH_next (REGNODE_MAX+12)
2326 #define BRANCH_next_fail (REGNODE_MAX+13)
2327 #define CURLYM_A (REGNODE_MAX+14)
2328 #define CURLYM_A_fail (REGNODE_MAX+15)
2329 #define CURLYM_B (REGNODE_MAX+16)
2330 #define CURLYM_B_fail (REGNODE_MAX+17)
2331 #define IFMATCH_A (REGNODE_MAX+18)
2332 #define IFMATCH_A_fail (REGNODE_MAX+19)
2333 #define CURLY_B_min_known (REGNODE_MAX+20)
2334 #define CURLY_B_min_known_fail (REGNODE_MAX+21)
2335 #define CURLY_B_min (REGNODE_MAX+22)
2336 #define CURLY_B_min_fail (REGNODE_MAX+23)
2337 #define CURLY_B_max (REGNODE_MAX+24)
2338 #define CURLY_B_max_fail (REGNODE_MAX+25)
2341 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2346 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2348 const int docolor = *PL_colors[0];
2349 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2350 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2351 /* The part of the string before starttry has one color
2352 (pref0_len chars), between starttry and current
2353 position another one (pref_len - pref0_len chars),
2354 after the current position the third one.
2355 We assume that pref0_len <= pref_len, otherwise we
2356 decrease pref0_len. */
2357 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2358 ? (5 + taill) - l : locinput - PL_bostr;
2361 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2363 pref0_len = pref_len - (locinput - PL_reg_starttry);
2364 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2365 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2366 ? (5 + taill) - pref_len : PL_regeol - locinput);
2367 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2371 if (pref0_len > pref_len)
2372 pref0_len = pref_len;
2374 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2376 RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2377 (locinput - pref_len),pref0_len, 60);
2379 RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2380 (locinput - pref_len + pref0_len),
2381 pref_len - pref0_len, 60);
2383 RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2384 locinput, PL_regeol - locinput, 60);
2386 PerlIO_printf(Perl_debug_log,
2387 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2388 (IV)(locinput - PL_bostr),
2395 (docolor ? "" : "> <"),
2399 15 - l - pref_len + 1,
2406 STATIC I32 /* 0 failure, 1 success */
2407 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2409 #if PERL_VERSION < 9
2413 register const bool do_utf8 = PL_reg_match_utf8;
2414 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2416 regexp *rex = reginfo->prog;
2418 regmatch_slab *orig_slab;
2419 regmatch_state *orig_state;
2421 /* the current state. This is a cached copy of PL_regmatch_state */
2422 register regmatch_state *st;
2424 /* cache heavy used fields of st in registers */
2425 register regnode *scan;
2426 register regnode *next;
2427 register I32 n = 0; /* initialize to shut up compiler warning */
2428 register char *locinput = PL_reginput;
2430 /* these variables are NOT saved during a recusive RFEGMATCH: */
2431 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2432 bool result; /* return value of S_regmatch */
2433 int depth = 0; /* depth of recursion */
2434 regmatch_state *yes_state = NULL; /* state to pop to on success of
2439 GET_RE_DEBUG_FLAGS_DECL;
2443 /* on first ever call to regmatch, allocate first slab */
2444 if (!PL_regmatch_slab) {
2445 Newx(PL_regmatch_slab, 1, regmatch_slab);
2446 PL_regmatch_slab->prev = NULL;
2447 PL_regmatch_slab->next = NULL;
2448 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2451 /* remember current high-water mark for exit */
2452 /* XXX this should be done with SAVE* instead */
2453 orig_slab = PL_regmatch_slab;
2454 orig_state = PL_regmatch_state;
2456 /* grab next free state slot */
2457 st = ++PL_regmatch_state;
2458 if (st > SLAB_LAST(PL_regmatch_slab))
2459 st = PL_regmatch_state = S_push_slab(aTHX);
2465 /* Note that nextchr is a byte even in UTF */
2466 nextchr = UCHARAT(locinput);
2468 while (scan != NULL) {
2471 SV * const prop = sv_newmortal();
2472 dump_exec_pos( locinput, scan, do_utf8 );
2473 regprop(rex, prop, scan);
2475 PerlIO_printf(Perl_debug_log,
2476 "%3"IVdf":%*s%s(%"IVdf")\n",
2477 (IV)(scan - rex->program), PL_regindent*2, "",
2479 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2482 next = scan + NEXT_OFF(scan);
2485 state_num = OP(scan);
2488 switch (state_num) {
2490 if (locinput == PL_bostr)
2492 /* reginfo->till = reginfo->bol; */
2497 if (locinput == PL_bostr ||
2498 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2504 if (locinput == PL_bostr)
2508 if (locinput == reginfo->ganch)
2514 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2519 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2521 if (PL_regeol - locinput > 1)
2525 if (PL_regeol != locinput)
2529 if (!nextchr && locinput >= PL_regeol)
2532 locinput += PL_utf8skip[nextchr];
2533 if (locinput > PL_regeol)
2535 nextchr = UCHARAT(locinput);
2538 nextchr = UCHARAT(++locinput);
2541 if (!nextchr && locinput >= PL_regeol)
2543 nextchr = UCHARAT(++locinput);
2546 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2549 locinput += PL_utf8skip[nextchr];
2550 if (locinput > PL_regeol)
2552 nextchr = UCHARAT(locinput);
2555 nextchr = UCHARAT(++locinput);
2559 #define ST st->u.trie
2563 /* what type of TRIE am I? (utf8 makes this contextual) */
2564 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2565 trie_type = do_utf8 ?
2566 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2569 /* what trie are we using right now */
2570 reg_trie_data * const trie
2571 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2572 U32 state = trie->startstate;
2574 U8 *uc = ( U8* )locinput;
2580 U8 *uscan = (U8*)NULL;
2582 SV *sv_accept_buff = NULL;
2583 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2585 ST.accepted = 0; /* how many accepting states we have seen */
2591 if (trie->bitmap && trie_type != trie_utf8_fold &&
2592 !TRIE_BITMAP_TEST(trie,*locinput)
2594 if (trie->states[ state ].wordnum) {
2596 PerlIO_printf(Perl_debug_log,
2597 "%*s %smatched empty string...%s\n",
2598 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2603 PerlIO_printf(Perl_debug_log,
2604 "%*s %sfailed to match start class...%s\n",
2605 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2612 traverse the TRIE keeping track of all accepting states
2613 we transition through until we get to a failing node.
2616 while ( state && uc <= (U8*)PL_regeol ) {
2618 if (trie->states[ state ].wordnum) {
2619 if (!ST.accepted ) {
2622 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2623 sv_accept_buff=newSV(bufflen *
2624 sizeof(reg_trie_accepted) - 1);
2625 SvCUR_set(sv_accept_buff,
2626 sizeof(reg_trie_accepted));
2627 SvPOK_on(sv_accept_buff);
2628 sv_2mortal(sv_accept_buff);
2631 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2634 if (ST.accepted >= bufflen) {
2636 ST.accept_buff =(reg_trie_accepted*)
2637 SvGROW(sv_accept_buff,
2638 bufflen * sizeof(reg_trie_accepted));
2640 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2641 + sizeof(reg_trie_accepted));
2643 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2644 ST.accept_buff[ST.accepted].endpos = uc;
2648 base = trie->states[ state ].trans.base;
2650 DEBUG_TRIE_EXECUTE_r({
2651 dump_exec_pos( (char *)uc, scan, do_utf8 );
2652 PerlIO_printf( Perl_debug_log,
2653 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2654 2+PL_regindent * 2, "", PL_colors[4],
2655 (UV)state, (UV)base, (UV)ST.accepted );
2659 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2660 uvc, charid, foldlen, foldbuf, uniflags);
2663 (base + charid > trie->uniquecharcount )
2664 && (base + charid - 1 - trie->uniquecharcount
2666 && trie->trans[base + charid - 1 -
2667 trie->uniquecharcount].check == state)
2669 state = trie->trans[base + charid - 1 -
2670 trie->uniquecharcount ].next;
2681 DEBUG_TRIE_EXECUTE_r(
2682 PerlIO_printf( Perl_debug_log,
2683 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2684 charid, uvc, (UV)state, PL_colors[5] );
2691 PerlIO_printf( Perl_debug_log,
2692 "%*s %sgot %"IVdf" possible matches%s\n",
2693 REPORT_CODE_OFF + PL_regindent * 2, "",
2694 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2700 case TRIE_next_fail: /* we failed - try next alterative */
2702 if ( ST.accepted == 1 ) {
2703 /* only one choice left - just continue */
2705 reg_trie_data * const trie
2706 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2707 SV ** const tmp = RX_DEBUG(reginfo->prog)
2708 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2710 PerlIO_printf( Perl_debug_log,
2711 "%*s %sonly one match left: #%d <%s>%s\n",
2712 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2713 ST.accept_buff[ 0 ].wordnum,
2714 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2717 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2718 /* in this case we free tmps/leave before we call regmatch
2719 as we wont be using accept_buff again. */
2722 locinput = PL_reginput;
2723 nextchr = UCHARAT(locinput);
2725 continue; /* execute rest of RE */
2728 if (!ST.accepted-- ) {
2735 There are at least two accepting states left. Presumably
2736 the number of accepting states is going to be low,
2737 typically two. So we simply scan through to find the one
2738 with lowest wordnum. Once we find it, we swap the last
2739 state into its place and decrement the size. We then try to
2740 match the rest of the pattern at the point where the word
2741 ends. If we succeed, control just continues along the
2742 regex; if we fail we return here to try the next accepting
2749 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2750 DEBUG_TRIE_EXECUTE_r(
2751 PerlIO_printf( Perl_debug_log,
2752 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2753 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2754 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2755 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2758 if (ST.accept_buff[cur].wordnum <
2759 ST.accept_buff[best].wordnum)
2764 reg_trie_data * const trie
2765 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2766 SV ** const tmp = RX_DEBUG(reginfo->prog)
2767 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2769 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2770 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2771 ST.accept_buff[best].wordnum,
2772 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2776 if ( best<ST.accepted ) {
2777 reg_trie_accepted tmp = ST.accept_buff[ best ];
2778 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2779 ST.accept_buff[ ST.accepted ] = tmp;
2782 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2784 PUSH_STATE_GOTO(TRIE_next, ST.B);
2790 char *s = STRING(scan);
2791 st->ln = STR_LEN(scan);
2792 if (do_utf8 != UTF) {
2793 /* The target and the pattern have differing utf8ness. */
2795 const char * const e = s + st->ln;
2798 /* The target is utf8, the pattern is not utf8. */
2803 if (NATIVE_TO_UNI(*(U8*)s) !=
2804 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2812 /* The target is not utf8, the pattern is utf8. */
2817 if (NATIVE_TO_UNI(*((U8*)l)) !=
2818 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2826 nextchr = UCHARAT(locinput);
2829 /* The target and the pattern have the same utf8ness. */
2830 /* Inline the first character, for speed. */
2831 if (UCHARAT(s) != nextchr)
2833 if (PL_regeol - locinput < st->ln)
2835 if (st->ln > 1 && memNE(s, locinput, st->ln))
2838 nextchr = UCHARAT(locinput);
2842 PL_reg_flags |= RF_tainted;
2845 char * const s = STRING(scan);
2846 st->ln = STR_LEN(scan);
2848 if (do_utf8 || UTF) {
2849 /* Either target or the pattern are utf8. */
2850 const char * const l = locinput;
2851 char *e = PL_regeol;
2853 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2854 l, &e, 0, do_utf8)) {
2855 /* One more case for the sharp s:
2856 * pack("U0U*", 0xDF) =~ /ss/i,
2857 * the 0xC3 0x9F are the UTF-8
2858 * byte sequence for the U+00DF. */
2860 toLOWER(s[0]) == 's' &&
2862 toLOWER(s[1]) == 's' &&
2869 nextchr = UCHARAT(locinput);
2873 /* Neither the target and the pattern are utf8. */
2875 /* Inline the first character, for speed. */
2876 if (UCHARAT(s) != nextchr &&
2877 UCHARAT(s) != ((OP(scan) == EXACTF)
2878 ? PL_fold : PL_fold_locale)[nextchr])
2880 if (PL_regeol - locinput < st->ln)
2882 if (st->ln > 1 && (OP(scan) == EXACTF
2883 ? ibcmp(s, locinput, st->ln)
2884 : ibcmp_locale(s, locinput, st->ln)))
2887 nextchr = UCHARAT(locinput);
2892 STRLEN inclasslen = PL_regeol - locinput;
2894 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
2896 if (locinput >= PL_regeol)
2898 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2899 nextchr = UCHARAT(locinput);
2904 nextchr = UCHARAT(locinput);
2905 if (!REGINCLASS(rex, scan, (U8*)locinput))
2907 if (!nextchr && locinput >= PL_regeol)
2909 nextchr = UCHARAT(++locinput);
2913 /* If we might have the case of the German sharp s
2914 * in a casefolding Unicode character class. */
2916 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2917 locinput += SHARP_S_SKIP;
2918 nextchr = UCHARAT(locinput);
2924 PL_reg_flags |= RF_tainted;
2930 LOAD_UTF8_CHARCLASS_ALNUM();
2931 if (!(OP(scan) == ALNUM
2932 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2933 : isALNUM_LC_utf8((U8*)locinput)))
2937 locinput += PL_utf8skip[nextchr];
2938 nextchr = UCHARAT(locinput);
2941 if (!(OP(scan) == ALNUM
2942 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2944 nextchr = UCHARAT(++locinput);
2947 PL_reg_flags |= RF_tainted;
2950 if (!nextchr && locinput >= PL_regeol)
2953 LOAD_UTF8_CHARCLASS_ALNUM();
2954 if (OP(scan) == NALNUM
2955 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2956 : isALNUM_LC_utf8((U8*)locinput))
2960 locinput += PL_utf8skip[nextchr];
2961 nextchr = UCHARAT(locinput);
2964 if (OP(scan) == NALNUM
2965 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2967 nextchr = UCHARAT(++locinput);
2971 PL_reg_flags |= RF_tainted;
2975 /* was last char in word? */
2977 if (locinput == PL_bostr)
2980 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2982 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
2984 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2985 st->ln = isALNUM_uni(st->ln);
2986 LOAD_UTF8_CHARCLASS_ALNUM();
2987 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2990 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
2991 n = isALNUM_LC_utf8((U8*)locinput);
2995 st->ln = (locinput != PL_bostr) ?
2996 UCHARAT(locinput - 1) : '\n';
2997 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2998 st->ln = isALNUM(st->ln);
2999 n = isALNUM(nextchr);
3002 st->ln = isALNUM_LC(st->ln);
3003 n = isALNUM_LC(nextchr);
3006 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3007 OP(scan) == BOUNDL))
3011 PL_reg_flags |= RF_tainted;
3017 if (UTF8_IS_CONTINUED(nextchr)) {
3018 LOAD_UTF8_CHARCLASS_SPACE();
3019 if (!(OP(scan) == SPACE
3020 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3021 : isSPACE_LC_utf8((U8*)locinput)))
3025 locinput += PL_utf8skip[nextchr];
3026 nextchr = UCHARAT(locinput);
3029 if (!(OP(scan) == SPACE
3030 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3032 nextchr = UCHARAT(++locinput);
3035 if (!(OP(scan) == SPACE
3036 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3038 nextchr = UCHARAT(++locinput);
3042 PL_reg_flags |= RF_tainted;
3045 if (!nextchr && locinput >= PL_regeol)
3048 LOAD_UTF8_CHARCLASS_SPACE();
3049 if (OP(scan) == NSPACE
3050 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3051 : isSPACE_LC_utf8((U8*)locinput))
3055 locinput += PL_utf8skip[nextchr];
3056 nextchr = UCHARAT(locinput);
3059 if (OP(scan) == NSPACE
3060 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3062 nextchr = UCHARAT(++locinput);
3065 PL_reg_flags |= RF_tainted;
3071 LOAD_UTF8_CHARCLASS_DIGIT();
3072 if (!(OP(scan) == DIGIT
3073 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3074 : isDIGIT_LC_utf8((U8*)locinput)))
3078 locinput += PL_utf8skip[nextchr];
3079 nextchr = UCHARAT(locinput);
3082 if (!(OP(scan) == DIGIT
3083 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3085 nextchr = UCHARAT(++locinput);
3088 PL_reg_flags |= RF_tainted;
3091 if (!nextchr && locinput >= PL_regeol)
3094 LOAD_UTF8_CHARCLASS_DIGIT();
3095 if (OP(scan) == NDIGIT
3096 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3097 : isDIGIT_LC_utf8((U8*)locinput))
3101 locinput += PL_utf8skip[nextchr];
3102 nextchr = UCHARAT(locinput);
3105 if (OP(scan) == NDIGIT
3106 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3108 nextchr = UCHARAT(++locinput);
3111 if (locinput >= PL_regeol)
3114 LOAD_UTF8_CHARCLASS_MARK();
3115 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3117 locinput += PL_utf8skip[nextchr];
3118 while (locinput < PL_regeol &&
3119 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3120 locinput += UTF8SKIP(locinput);
3121 if (locinput > PL_regeol)
3126 nextchr = UCHARAT(locinput);
3129 PL_reg_flags |= RF_tainted;
3134 n = ARG(scan); /* which paren pair */
3135 st->ln = PL_regstartp[n];
3136 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3137 if ((I32)*PL_reglastparen < n || st->ln == -1)
3138 sayNO; /* Do not match unless seen CLOSEn. */
3139 if (st->ln == PL_regendp[n])
3142 s = PL_bostr + st->ln;
3143 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3145 const char *e = PL_bostr + PL_regendp[n];
3147 * Note that we can't do the "other character" lookup trick as
3148 * in the 8-bit case (no pun intended) because in Unicode we
3149 * have to map both upper and title case to lower case.
3151 if (OP(scan) == REFF) {
3153 STRLEN ulen1, ulen2;
3154 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3155 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3159 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3160 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3161 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3168 nextchr = UCHARAT(locinput);
3172 /* Inline the first character, for speed. */
3173 if (UCHARAT(s) != nextchr &&
3175 (UCHARAT(s) != ((OP(scan) == REFF
3176 ? PL_fold : PL_fold_locale)[nextchr]))))
3178 st->ln = PL_regendp[n] - st->ln;
3179 if (locinput + st->ln > PL_regeol)
3181 if (st->ln > 1 && (OP(scan) == REF
3182 ? memNE(s, locinput, st->ln)
3184 ? ibcmp(s, locinput, st->ln)
3185 : ibcmp_locale(s, locinput, st->ln))))
3188 nextchr = UCHARAT(locinput);
3199 #define ST st->u.eval
3201 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3205 /* execute the code in the {...} */
3207 SV ** const before = SP;
3208 OP_4tree * const oop = PL_op;
3209 COP * const ocurcop = PL_curcop;
3213 PL_op = (OP_4tree*)rex->data->data[n];
3214 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3215 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3216 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3218 CALLRUNOPS(aTHX); /* Scalar context. */
3221 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3228 PAD_RESTORE_LOCAL(old_comppad);
3229 PL_curcop = ocurcop;
3232 sv_setsv(save_scalar(PL_replgv), ret);
3236 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3239 /* extract RE object from returned value; compiling if
3244 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3245 mg = mg_find(sv, PERL_MAGIC_qr);
3246 else if (SvSMAGICAL(ret)) {
3247 if (SvGMAGICAL(ret))
3248 sv_unmagic(ret, PERL_MAGIC_qr);
3250 mg = mg_find(ret, PERL_MAGIC_qr);
3254 re = (regexp *)mg->mg_obj;
3255 (void)ReREFCNT_inc(re);
3259 const char * const t = SvPV_const(ret, len);
3261 const I32 osize = PL_regsize;
3264 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3265 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3267 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3269 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3275 /* run the pattern returned from (??{...}) */
3278 PerlIO_printf(Perl_debug_log,
3279 "Entering embedded \"%s%.60s%s%s\"\n",
3283 (strlen(re->precomp) > 60 ? "..." : ""))
3286 ST.cp = regcppush(0); /* Save *all* the positions. */
3287 REGCP_SET(ST.lastcp);
3288 *PL_reglastparen = 0;
3289 *PL_reglastcloseparen = 0;
3290 PL_reginput = locinput;
3292 /* XXXX This is too dramatic a measure... */
3296 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3297 ((re->reganch & ROPT_UTF8) != 0);
3298 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3303 /* now continue from first node in postoned RE */
3304 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3307 /* /(?(?{...})X|Y)/ */
3308 st->sw = SvTRUE(ret);
3313 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3315 PL_reg_flags ^= RF_utf8;
3318 /* XXXX This is too dramatic a measure... */
3320 /* Restore parens of the caller without popping the
3323 const I32 tmp = PL_savestack_ix;
3324 PL_savestack_ix = ST.lastcp;
3326 PL_savestack_ix = tmp;
3328 PL_reginput = locinput;
3329 /* continue at the node following the (??{...}) */
3333 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3334 /* Restore state to the outer re then re-throw the failure */
3336 PL_reg_flags ^= RF_utf8;
3340 /* XXXX This is too dramatic a measure... */
3343 PL_reginput = locinput;
3344 REGCP_UNWIND(ST.lastcp);
3351 n = ARG(scan); /* which paren pair */
3352 PL_reg_start_tmp[n] = locinput;
3357 n = ARG(scan); /* which paren pair */
3358 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3359 PL_regendp[n] = locinput - PL_bostr;
3360 if (n > (I32)*PL_reglastparen)
3361 *PL_reglastparen = n;
3362 *PL_reglastcloseparen = n;
3365 n = ARG(scan); /* which paren pair */
3366 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3369 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3371 next = NEXTOPER(NEXTOPER(scan));
3373 next = scan + ARG(scan);
3374 if (OP(next) == IFTHEN) /* Fake one. */
3375 next = NEXTOPER(NEXTOPER(next));
3379 st->logical = scan->flags;
3381 /*******************************************************************
3382 cc points to the regmatch_state associated with the most recent CURLYX.
3383 This struct contains info about the innermost (...)* loop (an
3384 "infoblock"), and a pointer to the next outer cc.
3386 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3388 1) After matching Y, regnode for CURLYX is processed;
3390 2) This regnode populates cc, and calls regmatch() recursively
3391 with the starting point at WHILEM node;
3393 3) Each hit of WHILEM node tries to match A and Z (in the order
3394 depending on the current iteration, min/max of {min,max} and
3395 greediness). The information about where are nodes for "A"
3396 and "Z" is read from cc, as is info on how many times "A"
3397 was already matched, and greediness.
3399 4) After A matches, the same WHILEM node is hit again.
3401 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3402 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3403 resets cc, since this Y(A)*Z can be a part of some other loop:
3404 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3405 of the external loop.
3407 Currently present infoblocks form a tree with a stem formed by st->cc
3408 and whatever it mentions via ->next, and additional attached trees
3409 corresponding to temporarily unset infoblocks as in "5" above.
3411 In the following picture, infoblocks for outer loop of
3412 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3413 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3414 infoblocks are drawn below the "reset" infoblock.
3416 In fact in the picture below we do not show failed matches for Z and T
3417 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3418 more obvious *why* one needs to *temporary* unset infoblocks.]
3420 Matched REx position InfoBlocks Comment
3424 Y A)*?Z)*?T x <- O <- I
3425 YA )*?Z)*?T x <- O <- I
3426 YA A)*?Z)*?T x <- O <- I
3427 YAA )*?Z)*?T x <- O <- I
3428 YAA Z)*?T x <- O # Temporary unset I
3431 YAAZ Y(A)*?Z)*?T x <- O
3434 YAAZY (A)*?Z)*?T x <- O
3437 YAAZY A)*?Z)*?T x <- O <- I
3440 YAAZYA )*?Z)*?T x <- O <- I
3443 YAAZYA Z)*?T x <- O # Temporary unset I
3449 YAAZYAZ T x # Temporary unset O
3456 *******************************************************************/
3459 /* No need to save/restore up to this paren */
3460 I32 parenfloor = scan->flags;
3464 CURLYX and WHILEM are always paired: they're the moral
3465 equivalent of pp_enteriter anbd pp_iter.
3467 The only time next could be null is if the node tree is
3468 corrupt. This was mentioned on p5p a few days ago.
3470 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3471 So we'll assert that this is true:
3474 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3476 /* XXXX Probably it is better to teach regpush to support
3477 parenfloor > PL_regsize... */
3478 if (parenfloor > (I32)*PL_reglastparen)
3479 parenfloor = *PL_reglastparen; /* Pessimization... */
3481 st->u.curlyx.cp = PL_savestack_ix;
3482 st->u.curlyx.outercc = st->cc;
3484 /* these fields contain the state of the current curly.
3485 * they are accessed by subsequent WHILEMs;
3486 * cur and lastloc are also updated by WHILEM */
3487 st->u.curlyx.parenfloor = parenfloor;
3488 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3489 st->u.curlyx.min = ARG1(scan);
3490 st->u.curlyx.max = ARG2(scan);
3491 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3492 st->u.curlyx.lastloc = 0;
3493 /* st->next and st->minmod are also read by WHILEM */
3495 PL_reginput = locinput;
3496 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3497 /*** all unsaved local vars undefined at this point */
3498 regcpblow(st->u.curlyx.cp);
3499 st->cc = st->u.curlyx.outercc;
3505 * This is really hard to understand, because after we match
3506 * what we're trying to match, we must make sure the rest of
3507 * the REx is going to match for sure, and to do that we have
3508 * to go back UP the parse tree by recursing ever deeper. And
3509 * if it fails, we have to reset our parent's current state
3510 * that we can try again after backing off.
3515 st->cc gets initialised by CURLYX ready for use by WHILEM.
3516 So again, unless somethings been corrupted, st->cc cannot
3517 be null at that point in WHILEM.
3519 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3520 So we'll assert that this is true:
3523 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3524 st->u.whilem.cache_offset = 0;
3525 st->u.whilem.cache_bit = 0;
3527 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3528 PL_reginput = locinput;
3531 PerlIO_printf(Perl_debug_log,
3532 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3533 REPORT_CODE_OFF+PL_regindent*2, "",
3534 (long)n, (long)st->cc->u.curlyx.min,
3535 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3538 /* If degenerate scan matches "", assume scan done. */
3540 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3541 st->u.whilem.savecc = st->cc;
3542 st->cc = st->cc->u.curlyx.outercc;
3544 st->ln = st->cc->u.curlyx.cur;
3546 PerlIO_printf(Perl_debug_log,
3547 "%*s empty match detected, try continuation...\n",
3548 REPORT_CODE_OFF+PL_regindent*2, "")
3550 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3551 /*** all unsaved local vars undefined at this point */
3552 st->cc = st->u.whilem.savecc;
3555 if (st->cc->u.curlyx.outercc)
3556 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3560 /* First just match a string of min scans. */
3562 if (n < st->cc->u.curlyx.min) {
3563 st->cc->u.curlyx.cur = n;
3564 st->cc->u.curlyx.lastloc = locinput;
3565 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3566 /*** all unsaved local vars undefined at this point */
3569 st->cc->u.curlyx.cur = n - 1;
3570 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3575 /* Check whether we already were at this position.
3576 Postpone detection until we know the match is not
3577 *that* much linear. */
3578 if (!PL_reg_maxiter) {
3579 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3580 /* possible overflow for long strings and many CURLYX's */
3581 if (PL_reg_maxiter < 0)
3582 PL_reg_maxiter = I32_MAX;
3583 PL_reg_leftiter = PL_reg_maxiter;
3585 if (PL_reg_leftiter-- == 0) {
3586 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3587 if (PL_reg_poscache) {
3588 if ((I32)PL_reg_poscache_size < size) {
3589 Renew(PL_reg_poscache, size, char);
3590 PL_reg_poscache_size = size;
3592 Zero(PL_reg_poscache, size, char);
3595 PL_reg_poscache_size = size;
3596 Newxz(PL_reg_poscache, size, char);
3599 PerlIO_printf(Perl_debug_log,
3600 "%sDetected a super-linear match, switching on caching%s...\n",
3601 PL_colors[4], PL_colors[5])
3604 if (PL_reg_leftiter < 0) {
3605 st->u.whilem.cache_offset = locinput - PL_bostr;
3607 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3608 + st->u.whilem.cache_offset * (scan->flags>>4);
3609 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3610 st->u.whilem.cache_offset /= 8;
3611 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3613 PerlIO_printf(Perl_debug_log,
3614 "%*s already tried at this position...\n",
3615 REPORT_CODE_OFF+PL_regindent*2, "")
3617 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3618 /* cache records success */
3621 /* cache records failure */
3627 /* Prefer next over scan for minimal matching. */
3629 if (st->cc->minmod) {
3630 st->u.whilem.savecc = st->cc;
3631 st->cc = st->cc->u.curlyx.outercc;
3633 st->ln = st->cc->u.curlyx.cur;
3634 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3635 REGCP_SET(st->u.whilem.lastcp);
3636 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3637 /*** all unsaved local vars undefined at this point */
3638 st->cc = st->u.whilem.savecc;
3640 regcpblow(st->u.whilem.cp);
3641 CACHEsayYES; /* All done. */
3643 REGCP_UNWIND(st->u.whilem.lastcp);
3645 if (st->cc->u.curlyx.outercc)
3646 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3648 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3649 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3650 && !(PL_reg_flags & RF_warned)) {
3651 PL_reg_flags |= RF_warned;
3652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3653 "Complex regular subexpression recursion",
3660 PerlIO_printf(Perl_debug_log,
3661 "%*s trying longer...\n",
3662 REPORT_CODE_OFF+PL_regindent*2, "")
3664 /* Try scanning more and see if it helps. */
3665 PL_reginput = locinput;
3666 st->cc->u.curlyx.cur = n;
3667 st->cc->u.curlyx.lastloc = locinput;
3668 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3669 REGCP_SET(st->u.whilem.lastcp);
3670 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3671 /*** all unsaved local vars undefined at this point */
3673 regcpblow(st->u.whilem.cp);
3676 REGCP_UNWIND(st->u.whilem.lastcp);
3678 st->cc->u.curlyx.cur = n - 1;
3679 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3683 /* Prefer scan over next for maximal matching. */
3685 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3686 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3687 st->cc->u.curlyx.cur = n;
3688 st->cc->u.curlyx.lastloc = locinput;
3689 REGCP_SET(st->u.whilem.lastcp);
3690 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3691 /*** all unsaved local vars undefined at this point */
3693 regcpblow(st->u.whilem.cp);
3696 REGCP_UNWIND(st->u.whilem.lastcp);
3697 regcppop(rex); /* Restore some previous $<digit>s? */
3698 PL_reginput = locinput;
3700 PerlIO_printf(Perl_debug_log,
3701 "%*s failed, try continuation...\n",
3702 REPORT_CODE_OFF+PL_regindent*2, "")
3705 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3706 && !(PL_reg_flags & RF_warned)) {
3707 PL_reg_flags |= RF_warned;
3708 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3709 "Complex regular subexpression recursion",
3713 /* Failed deeper matches of scan, so see if this one works. */
3714 st->u.whilem.savecc = st->cc;
3715 st->cc = st->cc->u.curlyx.outercc;
3717 st->ln = st->cc->u.curlyx.cur;
3718 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3719 /*** all unsaved local vars undefined at this point */
3720 st->cc = st->u.whilem.savecc;
3723 if (st->cc->u.curlyx.outercc)
3724 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3725 st->cc->u.curlyx.cur = n - 1;
3726 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3732 #define ST st->u.branch
3734 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3735 next = scan + ARG(scan);
3738 scan = NEXTOPER(scan);
3741 case BRANCH: /* /(...|A|...)/ */
3742 scan = NEXTOPER(scan); /* scan now points to inner node */
3743 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3744 /* last branch; skip state push and jump direct to node */
3746 ST.lastparen = *PL_reglastparen;
3747 ST.next_branch = next;
3749 PL_reginput = locinput;
3751 /* Now go into the branch */
3752 PUSH_STATE_GOTO(BRANCH_next, scan);
3755 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3756 REGCP_UNWIND(ST.cp);
3757 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3759 *PL_reglastparen = n;
3760 scan = ST.next_branch;
3761 /* no more branches? */
3762 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3764 continue; /* execute next BRANCH[J] op */
3772 #define ST st->u.curlym
3774 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3776 /* This is an optimisation of CURLYX that enables us to push
3777 * only a single backtracking state, no matter now many matches
3778 * there are in {m,n}. It relies on the pattern being constant
3779 * length, with no parens to influence future backrefs
3783 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3785 /* if paren positive, emulate an OPEN/CLOSE around A */
3787 I32 paren = ST.me->flags;
3788 if (paren > PL_regsize)
3790 if (paren > (I32)*PL_reglastparen)
3791 *PL_reglastparen = paren;
3792 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3798 ST.minmod = st->minmod;
3800 ST.c1 = CHRTEST_UNINIT;
3803 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3806 curlym_do_A: /* execute the A in /A{m,n}B/ */
3807 PL_reginput = locinput;
3808 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3811 case CURLYM_A: /* we've just matched an A */
3812 locinput = st->locinput;
3813 nextchr = UCHARAT(locinput);
3816 /* after first match, determine A's length: u.curlym.alen */
3817 if (ST.count == 1) {
3818 if (PL_reg_match_utf8) {
3820 while (s < PL_reginput) {
3826 ST.alen = PL_reginput - locinput;
3829 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3832 PerlIO_printf(Perl_debug_log,
3833 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3834 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3835 (IV) ST.count, (IV)ST.alen)
3838 locinput = PL_reginput;
3839 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3840 goto curlym_do_A; /* try to match another A */
3841 goto curlym_do_B; /* try to match B */
3843 case CURLYM_A_fail: /* just failed to match an A */
3844 REGCP_UNWIND(ST.cp);
3845 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3848 curlym_do_B: /* execute the B in /A{m,n}B/ */
3849 PL_reginput = locinput;
3850 if (ST.c1 == CHRTEST_UNINIT) {
3851 /* calculate c1 and c2 for possible match of 1st char
3852 * following curly */
3853 ST.c1 = ST.c2 = CHRTEST_VOID;
3854 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3855 regnode *text_node = ST.B;
3856 if (! HAS_TEXT(text_node))
3857 FIND_NEXT_IMPT(text_node);
3858 if (HAS_TEXT(text_node)
3859 && PL_regkind[OP(text_node)] != REF)
3861 ST.c1 = (U8)*STRING(text_node);
3863 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3865 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3866 ? PL_fold_locale[ST.c1]
3873 PerlIO_printf(Perl_debug_log,
3874 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
3875 (int)(REPORT_CODE_OFF+PL_regindent*2),
3878 if (ST.c1 != CHRTEST_VOID
3879 && UCHARAT(PL_reginput) != ST.c1
3880 && UCHARAT(PL_reginput) != ST.c2)
3882 /* simulate B failing */
3883 state_num = CURLYM_B_fail;
3884 goto reenter_switch;
3888 /* mark current A as captured */
3889 I32 paren = ST.me->flags;
3892 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3893 PL_regendp[paren] = PL_reginput - PL_bostr;
3896 PL_regendp[paren] = -1;
3898 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
3901 case CURLYM_B_fail: /* just failed to match a B */
3902 REGCP_UNWIND(ST.cp);
3904 if (ST.count == ARG2(ST.me) /* max */)
3906 goto curlym_do_A; /* try to match a further A */
3908 /* backtrack one A */
3909 if (ST.count == ARG1(ST.me) /* min */)
3912 locinput = HOPc(locinput, -ST.alen);
3913 goto curlym_do_B; /* try to match B */
3916 #define ST st->u.curly
3918 #define CURLY_SETPAREN(paren, success) \
3921 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3922 PL_regendp[paren] = locinput - PL_bostr; \
3925 PL_regendp[paren] = -1; \
3928 case STAR: /* /A*B/ where A is width 1 */
3932 scan = NEXTOPER(scan);
3934 case PLUS: /* /A+B/ where A is width 1 */
3938 scan = NEXTOPER(scan);
3940 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3941 ST.paren = scan->flags; /* Which paren to set */
3942 if (ST.paren > PL_regsize)
3943 PL_regsize = ST.paren;
3944 if (ST.paren > (I32)*PL_reglastparen)
3945 *PL_reglastparen = ST.paren;
3946 ST.min = ARG1(scan); /* min to match */
3947 ST.max = ARG2(scan); /* max to match */
3948 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3950 case CURLY: /* /A{m,n}B/ where A is width 1 */
3952 ST.min = ARG1(scan); /* min to match */
3953 ST.max = ARG2(scan); /* max to match */
3954 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3957 * Lookahead to avoid useless match attempts
3958 * when we know what character comes next.
3960 * Used to only do .*x and .*?x, but now it allows
3961 * for )'s, ('s and (?{ ... })'s to be in the way
3962 * of the quantifier and the EXACT-like node. -- japhy
3965 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3967 if (HAS_TEXT(next) || JUMPABLE(next)) {
3969 regnode *text_node = next;
3971 if (! HAS_TEXT(text_node))
3972 FIND_NEXT_IMPT(text_node);
3974 if (! HAS_TEXT(text_node))
3975 ST.c1 = ST.c2 = CHRTEST_VOID;
3977 if (PL_regkind[OP(text_node)] == REF) {
3978 ST.c1 = ST.c2 = CHRTEST_VOID;
3979 goto assume_ok_easy;
3982 s = (U8*)STRING(text_node);
3986 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3987 ST.c2 = PL_fold[ST.c1];
3988 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3989 ST.c2 = PL_fold_locale[ST.c1];
3992 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3993 STRLEN ulen1, ulen2;
3994 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3995 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3997 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3998 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4000 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4002 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4006 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4013 ST.c1 = ST.c2 = CHRTEST_VOID;
4018 PL_reginput = locinput;
4021 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4024 locinput = PL_reginput;
4026 if (ST.c1 == CHRTEST_VOID)
4027 goto curly_try_B_min;
4029 ST.oldloc = locinput;
4031 /* set ST.maxpos to the furthest point along the
4032 * string that could possibly match */
4033 if (ST.max == REG_INFTY) {
4034 ST.maxpos = PL_regeol - 1;
4036 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4040 int m = ST.max - ST.min;
4041 for (ST.maxpos = locinput;
4042 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4043 ST.maxpos += UTF8SKIP(ST.maxpos);
4046 ST.maxpos = locinput + ST.max - ST.min;
4047 if (ST.maxpos >= PL_regeol)
4048 ST.maxpos = PL_regeol - 1;
4050 goto curly_try_B_min_known;
4054 ST.count = regrepeat(rex, ST.A, ST.max);
4055 locinput = PL_reginput;
4056 if (ST.count < ST.min)
4058 if ((ST.count > ST.min)
4059 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4061 /* A{m,n} must come at the end of the string, there's
4062 * no point in backing off ... */
4064 /* ...except that $ and \Z can match before *and* after
4065 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4066 We may back off by one in this case. */
4067 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4071 goto curly_try_B_max;
4076 case CURLY_B_min_known_fail:
4077 /* failed to find B in a non-greedy match where c1,c2 valid */
4078 if (ST.paren && ST.count)
4079 PL_regendp[ST.paren] = -1;
4081 PL_reginput = locinput; /* Could be reset... */
4082 REGCP_UNWIND(ST.cp);
4083 /* Couldn't or didn't -- move forward. */
4084 ST.oldloc = locinput;
4086 locinput += UTF8SKIP(locinput);
4090 curly_try_B_min_known:
4091 /* find the next place where 'B' could work, then call B */
4095 n = (ST.oldloc == locinput) ? 0 : 1;
4096 if (ST.c1 == ST.c2) {
4098 /* set n to utf8_distance(oldloc, locinput) */
4099 while (locinput <= ST.maxpos &&
4100 utf8n_to_uvchr((U8*)locinput,
4101 UTF8_MAXBYTES, &len,
4102 uniflags) != (UV)ST.c1) {
4108 /* set n to utf8_distance(oldloc, locinput) */
4109 while (locinput <= ST.maxpos) {
4111 const UV c = utf8n_to_uvchr((U8*)locinput,
4112 UTF8_MAXBYTES, &len,
4114 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4122 if (ST.c1 == ST.c2) {
4123 while (locinput <= ST.maxpos &&
4124 UCHARAT(locinput) != ST.c1)
4128 while (locinput <= ST.maxpos
4129 && UCHARAT(locinput) != ST.c1
4130 && UCHARAT(locinput) != ST.c2)
4133 n = locinput - ST.oldloc;
4135 if (locinput > ST.maxpos)
4137 /* PL_reginput == oldloc now */
4140 if (regrepeat(rex, ST.A, n) < n)
4143 PL_reginput = locinput;
4144 CURLY_SETPAREN(ST.paren, ST.count);
4145 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4150 case CURLY_B_min_fail:
4151 /* failed to find B in a non-greedy match where c1,c2 invalid */
4152 if (ST.paren && ST.count)
4153 PL_regendp[ST.paren] = -1;
4155 REGCP_UNWIND(ST.cp);
4156 /* failed -- move forward one */
4157 PL_reginput = locinput;
4158 if (regrepeat(rex, ST.A, 1)) {
4160 locinput = PL_reginput;
4161 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4162 ST.count > 0)) /* count overflow ? */
4165 CURLY_SETPAREN(ST.paren, ST.count);
4166 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4174 /* a successful greedy match: now try to match B */
4177 if (ST.c1 != CHRTEST_VOID)
4178 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4179 UTF8_MAXBYTES, 0, uniflags)
4180 : (UV) UCHARAT(PL_reginput);
4181 /* If it could work, try it. */
4182 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4183 CURLY_SETPAREN(ST.paren, ST.count);
4184 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4189 case CURLY_B_max_fail:
4190 /* failed to find B in a greedy match */
4191 if (ST.paren && ST.count)
4192 PL_regendp[ST.paren] = -1;
4194 REGCP_UNWIND(ST.cp);
4196 if (--ST.count < ST.min)
4198 PL_reginput = locinput = HOPc(locinput, -1);
4199 goto curly_try_B_max;
4205 if (locinput < reginfo->till) {
4206 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4207 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4209 (long)(locinput - PL_reg_starttry),
4210 (long)(reginfo->till - PL_reg_starttry),
4212 sayNO_FINAL; /* Cannot match: too short. */
4214 PL_reginput = locinput; /* put where regtry can find it */
4215 sayYES_FINAL; /* Success! */
4217 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4219 PerlIO_printf(Perl_debug_log,
4220 "%*s %ssubpattern success...%s\n",
4221 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
4222 PL_reginput = locinput; /* put where regtry can find it */
4223 sayYES_FINAL; /* Success! */
4226 #define ST st->u.ifmatch
4228 case SUSPEND: /* (?>A) */
4230 PL_reginput = locinput;
4233 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4235 goto ifmatch_trivial_fail_test;
4237 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4239 ifmatch_trivial_fail_test:
4241 char * const s = HOPBACKc(locinput, scan->flags);
4246 st->sw = 1 - (bool)ST.wanted;
4250 next = scan + ARG(scan);
4258 PL_reginput = locinput;
4262 /* execute body of (?...A) */
4263 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4266 case IFMATCH_A_fail: /* body of (?...A) failed */
4267 ST.wanted = !ST.wanted;
4270 case IFMATCH_A: /* body of (?...A) succeeded */
4273 st->sw = (bool)ST.wanted;
4275 else if (!ST.wanted)
4278 if (OP(ST.me) == SUSPEND)
4279 locinput = PL_reginput;
4281 locinput = PL_reginput = st->locinput;
4282 nextchr = UCHARAT(locinput);
4284 scan = ST.me + ARG(ST.me);
4287 continue; /* execute B */
4292 next = scan + ARG(scan);
4297 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4298 PTR2UV(scan), OP(scan));
4299 Perl_croak(aTHX_ "regexp memory corruption");
4307 /* push a state that backtracks on success */
4308 st->u.yes.prev_yes_state = yes_state;
4312 /* push a new regex state, then continue at scan */
4314 regmatch_state *newst;
4317 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4318 "PUSH STATE(%d)\n", depth));
4319 st->locinput = locinput;
4321 if (newst > SLAB_LAST(PL_regmatch_slab))
4322 newst = S_push_slab(aTHX);
4323 PL_regmatch_state = newst;
4325 /* XXX probably don't need to initialise these */
4330 locinput = PL_reginput;
4331 nextchr = UCHARAT(locinput);
4337 /* simulate recursively calling regmatch(), but without actually
4338 * recursing - ie save the current state on the heap rather than on
4339 * the stack, then re-enter the loop. This avoids complex regexes
4340 * blowing the processor stack */
4344 /* push new state */
4345 regmatch_state *oldst = st;
4348 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
4350 /* grab the next free state slot */
4352 if (st > SLAB_LAST(PL_regmatch_slab))
4353 st = S_push_slab(aTHX);
4354 PL_regmatch_state = st;
4358 oldst->locinput = locinput;
4361 locinput = PL_reginput;
4362 nextchr = UCHARAT(locinput);
4375 * We get here only if there's trouble -- normally "case END" is
4376 * the terminating point.
4378 Perl_croak(aTHX_ "corrupted regexp pointers");
4385 /* we have successfully completed a subexpression, but we must now
4386 * pop to the state marked by yes_state and continue from there */
4388 assert(st != yes_state);
4389 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4390 || yes_state > SLAB_LAST(PL_regmatch_slab))
4392 /* not in this slab, pop slab */
4393 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4394 PL_regmatch_slab = PL_regmatch_slab->prev;
4395 st = SLAB_LAST(PL_regmatch_slab);
4397 depth -= (st - yes_state);
4398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4399 depth+1, depth+(st - yes_state)));
4401 yes_state = st->u.yes.prev_yes_state;
4402 PL_regmatch_state = st;
4404 switch (st->resume_state) {
4408 state_num = st->resume_state;
4409 goto reenter_switch;
4416 Perl_croak(aTHX_ "unexpected yes resume state");
4420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4421 PL_colors[4], PL_colors[5]));
4428 /* XXX this is duplicate(ish) code to that in the do_no section.
4429 * will disappear when REGFMATCH goes */
4431 /* restore previous state and re-enter */
4432 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4435 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4436 PL_regmatch_slab = PL_regmatch_slab->prev;
4437 st = SLAB_LAST(PL_regmatch_slab);
4439 PL_regmatch_state = st;
4443 locinput= st->locinput;
4444 nextchr = UCHARAT(locinput);
4446 switch (st->resume_state) {
4448 goto resume_point_CURLYX;
4449 case resume_WHILEM1:
4450 goto resume_point_WHILEM1;
4451 case resume_WHILEM2:
4452 goto resume_point_WHILEM2;
4453 case resume_WHILEM3:
4454 goto resume_point_WHILEM3;
4455 case resume_WHILEM4:
4456 goto resume_point_WHILEM4;
4457 case resume_WHILEM5:
4458 goto resume_point_WHILEM5;
4459 case resume_WHILEM6:
4460 goto resume_point_WHILEM6;
4470 case CURLY_B_min_known:
4474 Perl_croak(aTHX_ "regexp resume memory corruption");
4481 PerlIO_printf(Perl_debug_log,
4482 "%*s %sfailed...%s\n",
4483 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4494 /* there's a previous state to backtrack to */
4495 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4498 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4499 PL_regmatch_slab = PL_regmatch_slab->prev;
4500 st = SLAB_LAST(PL_regmatch_slab);
4502 PL_regmatch_state = st;
4506 locinput= st->locinput;
4507 nextchr = UCHARAT(locinput);
4509 switch (st->resume_state) {
4511 goto resume_point_CURLYX;
4512 case resume_WHILEM1:
4513 goto resume_point_WHILEM1;
4514 case resume_WHILEM2:
4515 goto resume_point_WHILEM2;
4516 case resume_WHILEM3:
4517 goto resume_point_WHILEM3;
4518 case resume_WHILEM4:
4519 goto resume_point_WHILEM4;
4520 case resume_WHILEM5:
4521 goto resume_point_WHILEM5;
4522 case resume_WHILEM6:
4523 goto resume_point_WHILEM6;
4533 case CURLY_B_min_known:
4534 if (yes_state == st)
4535 yes_state = st->u.yes.prev_yes_state;
4536 state_num = st->resume_state + 1; /* failure = success + 1 */
4537 goto reenter_switch;
4540 Perl_croak(aTHX_ "regexp resume memory corruption");
4546 /* restore original high-water mark */
4547 PL_regmatch_slab = orig_slab;
4548 PL_regmatch_state = orig_state;
4550 /* free all slabs above current one */
4551 if (orig_slab->next) {
4552 regmatch_slab *sl = orig_slab->next;
4553 orig_slab->next = NULL;
4555 regmatch_slab * const osl = sl;
4566 - regrepeat - repeatedly match something simple, report how many
4569 * [This routine now assumes that it will only match on things of length 1.
4570 * That was true before, but now we assume scan - reginput is the count,
4571 * rather than incrementing count on every character. [Er, except utf8.]]
4574 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4577 register char *scan;
4579 register char *loceol = PL_regeol;
4580 register I32 hardcount = 0;
4581 register bool do_utf8 = PL_reg_match_utf8;
4584 if (max == REG_INFTY)
4586 else if (max < loceol - scan)
4587 loceol = scan + max;
4592 while (scan < loceol && hardcount < max && *scan != '\n') {
4593 scan += UTF8SKIP(scan);
4597 while (scan < loceol && *scan != '\n')
4604 while (scan < loceol && hardcount < max) {
4605 scan += UTF8SKIP(scan);
4615 case EXACT: /* length of string is 1 */
4617 while (scan < loceol && UCHARAT(scan) == c)
4620 case EXACTF: /* length of string is 1 */
4622 while (scan < loceol &&
4623 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4626 case EXACTFL: /* length of string is 1 */
4627 PL_reg_flags |= RF_tainted;
4629 while (scan < loceol &&
4630 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4636 while (hardcount < max && scan < loceol &&
4637 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4638 scan += UTF8SKIP(scan);
4642 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4649 LOAD_UTF8_CHARCLASS_ALNUM();
4650 while (hardcount < max && scan < loceol &&
4651 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4652 scan += UTF8SKIP(scan);
4656 while (scan < loceol && isALNUM(*scan))
4661 PL_reg_flags |= RF_tainted;
4664 while (hardcount < max && scan < loceol &&
4665 isALNUM_LC_utf8((U8*)scan)) {
4666 scan += UTF8SKIP(scan);
4670 while (scan < loceol && isALNUM_LC(*scan))
4677 LOAD_UTF8_CHARCLASS_ALNUM();
4678 while (hardcount < max && scan < loceol &&
4679 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4680 scan += UTF8SKIP(scan);
4684 while (scan < loceol && !isALNUM(*scan))
4689 PL_reg_flags |= RF_tainted;
4692 while (hardcount < max && scan < loceol &&
4693 !isALNUM_LC_utf8((U8*)scan)) {
4694 scan += UTF8SKIP(scan);
4698 while (scan < loceol && !isALNUM_LC(*scan))
4705 LOAD_UTF8_CHARCLASS_SPACE();
4706 while (hardcount < max && scan < loceol &&
4708 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4709 scan += UTF8SKIP(scan);
4713 while (scan < loceol && isSPACE(*scan))
4718 PL_reg_flags |= RF_tainted;
4721 while (hardcount < max && scan < loceol &&
4722 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4723 scan += UTF8SKIP(scan);
4727 while (scan < loceol && isSPACE_LC(*scan))
4734 LOAD_UTF8_CHARCLASS_SPACE();
4735 while (hardcount < max && scan < loceol &&
4737 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4738 scan += UTF8SKIP(scan);
4742 while (scan < loceol && !isSPACE(*scan))
4747 PL_reg_flags |= RF_tainted;
4750 while (hardcount < max && scan < loceol &&
4751 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4752 scan += UTF8SKIP(scan);
4756 while (scan < loceol && !isSPACE_LC(*scan))
4763 LOAD_UTF8_CHARCLASS_DIGIT();
4764 while (hardcount < max && scan < loceol &&
4765 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4766 scan += UTF8SKIP(scan);
4770 while (scan < loceol && isDIGIT(*scan))
4777 LOAD_UTF8_CHARCLASS_DIGIT();
4778 while (hardcount < max && scan < loceol &&
4779 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4780 scan += UTF8SKIP(scan);
4784 while (scan < loceol && !isDIGIT(*scan))
4788 default: /* Called on something of 0 width. */
4789 break; /* So match right here or not at all. */
4795 c = scan - PL_reginput;
4799 GET_RE_DEBUG_FLAGS_DECL;
4801 SV * const prop = sv_newmortal();
4802 regprop(prog, prop, p);
4803 PerlIO_printf(Perl_debug_log,
4804 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4805 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4813 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4815 - regclass_swash - prepare the utf8 swash
4819 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4825 const struct reg_data * const data = prog ? prog->data : NULL;
4827 if (data && data->count) {
4828 const U32 n = ARG(node);
4830 if (data->what[n] == 's') {
4831 SV * const rv = (SV*)data->data[n];
4832 AV * const av = (AV*)SvRV((SV*)rv);
4833 SV **const ary = AvARRAY(av);
4836 /* See the end of regcomp.c:S_regclass() for
4837 * documentation of these array elements. */
4840 a = SvROK(ary[1]) ? &ary[1] : 0;
4841 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4845 else if (si && doinit) {
4846 sw = swash_init("utf8", "", si, 1, 0);
4847 (void)av_store(av, 1, sw);
4864 - reginclass - determine if a character falls into a character class
4866 The n is the ANYOF regnode, the p is the target string, lenp
4867 is pointer to the maximum length of how far to go in the p
4868 (if the lenp is zero, UTF8SKIP(p) is used),
4869 do_utf8 tells whether the target string is in UTF-8.
4874 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4877 const char flags = ANYOF_FLAGS(n);
4883 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4884 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4885 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4886 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
4887 if (len == (STRLEN)-1)
4888 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4891 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4892 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4895 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4896 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4899 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4903 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
4906 if (swash_fetch(sw, p, do_utf8))
4908 else if (flags & ANYOF_FOLD) {
4909 if (!match && lenp && av) {
4911 for (i = 0; i <= av_len(av); i++) {
4912 SV* const sv = *av_fetch(av, i, FALSE);
4914 const char * const s = SvPV_const(sv, len);
4916 if (len <= plen && memEQ(s, (char*)p, len)) {
4924 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4927 to_utf8_fold(p, tmpbuf, &tmplen);
4928 if (swash_fetch(sw, tmpbuf, do_utf8))
4934 if (match && lenp && *lenp == 0)
4935 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4937 if (!match && c < 256) {
4938 if (ANYOF_BITMAP_TEST(n, c))
4940 else if (flags & ANYOF_FOLD) {
4943 if (flags & ANYOF_LOCALE) {
4944 PL_reg_flags |= RF_tainted;
4945 f = PL_fold_locale[c];
4949 if (f != c && ANYOF_BITMAP_TEST(n, f))
4953 if (!match && (flags & ANYOF_CLASS)) {
4954 PL_reg_flags |= RF_tainted;
4956 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4957 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4958 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4959 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4960 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4961 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4962 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4963 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4964 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4965 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4966 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4967 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4968 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4969 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4970 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4971 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4972 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4973 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4974 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4975 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4976 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4977 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4978 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4979 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4980 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4981 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4982 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4983 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4984 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4985 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4986 ) /* How's that for a conditional? */
4993 return (flags & ANYOF_INVERT) ? !match : match;
4997 S_reghop3(U8 *s, I32 off, const U8* lim)
5001 while (off-- && s < lim) {
5002 /* XXX could check well-formedness here */
5010 if (UTF8_IS_CONTINUED(*s)) {
5011 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5014 /* XXX could check well-formedness here */
5022 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5026 while (off-- && s < lim) {
5027 /* XXX could check well-formedness here */
5037 if (UTF8_IS_CONTINUED(*s)) {
5038 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5041 /* XXX could check well-formedness here */
5053 restore_pos(pTHX_ void *arg)
5056 regexp * const rex = (regexp *)arg;
5057 if (PL_reg_eval_set) {
5058 if (PL_reg_oldsaved) {
5059 rex->subbeg = PL_reg_oldsaved;
5060 rex->sublen = PL_reg_oldsavedlen;
5061 #ifdef PERL_OLD_COPY_ON_WRITE
5062 rex->saved_copy = PL_nrs;
5064 RX_MATCH_COPIED_on(rex);
5066 PL_reg_magic->mg_len = PL_reg_oldpos;
5067 PL_reg_eval_set = 0;
5068 PL_curpm = PL_reg_oldcurpm;
5073 S_to_utf8_substr(pTHX_ register regexp *prog)
5075 if (prog->float_substr && !prog->float_utf8) {
5076 SV* const sv = newSVsv(prog->float_substr);
5077 prog->float_utf8 = sv;
5078 sv_utf8_upgrade(sv);
5079 if (SvTAIL(prog->float_substr))
5081 if (prog->float_substr == prog->check_substr)
5082 prog->check_utf8 = sv;
5084 if (prog->anchored_substr && !prog->anchored_utf8) {
5085 SV* const sv = newSVsv(prog->anchored_substr);
5086 prog->anchored_utf8 = sv;
5087 sv_utf8_upgrade(sv);
5088 if (SvTAIL(prog->anchored_substr))
5090 if (prog->anchored_substr == prog->check_substr)
5091 prog->check_utf8 = sv;
5096 S_to_byte_substr(pTHX_ register regexp *prog)
5099 if (prog->float_utf8 && !prog->float_substr) {
5100 SV* sv = newSVsv(prog->float_utf8);
5101 prog->float_substr = sv;
5102 if (sv_utf8_downgrade(sv, TRUE)) {
5103 if (SvTAIL(prog->float_utf8))
5107 prog->float_substr = sv = &PL_sv_undef;
5109 if (prog->float_utf8 == prog->check_utf8)
5110 prog->check_substr = sv;
5112 if (prog->anchored_utf8 && !prog->anchored_substr) {
5113 SV* sv = newSVsv(prog->anchored_utf8);
5114 prog->anchored_substr = sv;
5115 if (sv_utf8_downgrade(sv, TRUE)) {
5116 if (SvTAIL(prog->anchored_utf8))
5120 prog->anchored_substr = sv = &PL_sv_undef;
5122 if (prog->anchored_utf8 == prog->check_utf8)
5123 prog->check_substr = sv;
5129 * c-indentation-style: bsd
5131 * indent-tabs-mode: t
5134 * ex: set ts=8 sts=4 sw=4 noet: