5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
80 #define RF_evaled 4 /* Did an EVAL with setting? */
81 #define RF_utf8 8 /* String contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
153 static void restore_pos(pTHX_ void *arg);
156 S_regcppush(pTHX_ I32 parenfloor)
159 const int retval = PL_savestack_ix;
160 #define REGCP_PAREN_ELEMS 4
161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
163 GET_RE_DEBUG_FLAGS_DECL;
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168 #define REGCP_OTHER_ELEMS 6
169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
170 for (p = PL_regsize; p > parenfloor; p--) {
171 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
174 SSPUSHPTR(PL_reg_start_tmp[p]);
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
183 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
186 SSPUSHINT(*PL_reglastcloseparen);
187 SSPUSHPTR(PL_reginput);
188 #define REGCP_FRAME_ELEMS 2
189 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
197 /* These are needed since we do not localize EVAL nodes: */
198 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
202 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
203 PerlIO_printf(Perl_debug_log, \
204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
208 S_regcppop(pTHX_ const regexp *rex)
214 GET_RE_DEBUG_FLAGS_DECL;
216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
220 input = (char *) SSPOPPTR;
221 *PL_reglastcloseparen = SSPOPINT;
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
225 /* Now restore the parentheses context. */
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
229 U32 paren = (U32)SSPOPINT;
230 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
231 PL_regstartp[paren] = SSPOPINT;
233 if (paren <= *PL_reglastparen)
234 PL_regendp[paren] = tmps;
236 PerlIO_printf(Perl_debug_log,
237 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
238 (UV)paren, (IV)PL_regstartp[paren],
239 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
240 (IV)PL_regendp[paren],
241 (paren > *PL_reglastparen ? "(no)" : ""));
245 if (*PL_reglastparen + 1 <= rex->nparens) {
246 PerlIO_printf(Perl_debug_log,
247 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
248 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
252 /* It would seem that the similar code in regtry()
253 * already takes care of this, and in fact it is in
254 * a better location to since this code can #if 0-ed out
255 * but the code in regtry() is needed or otherwise tests
256 * requiring null fields (pat.t#187 and split.t#{13,14}
257 * (as of patchlevel 7877) will fail. Then again,
258 * this code seems to be necessary or otherwise
259 * building DynaLoader will fail:
260 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
264 PL_regstartp[i] = -1;
271 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
274 * pregexec and friends
277 #ifndef PERL_IN_XSUB_RE
279 - pregexec - match a regexp against a string
282 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
283 char *strbeg, I32 minend, SV *screamer, U32 nosave)
284 /* strend: pointer to null at end of string */
285 /* strbeg: real beginning of string */
286 /* minend: end of match must be >=minend after stringarg. */
287 /* nosave: For optimizations. */
290 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
291 nosave ? 0 : REXEC_COPY_STR);
296 * Need to implement the following flags for reg_anch:
298 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
301 * INTUIT_AUTORITATIVE_ML
302 * INTUIT_ONCE_NOML - Intuit can match in one location only.
305 * Another flag for this function: SECOND_TIME (so that float substrs
306 * with giant delta may be not rechecked).
309 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
312 Otherwise, only SvCUR(sv) is used to get strbeg. */
314 /* XXXX We assume that strpos is strbeg unless sv. */
316 /* XXXX Some places assume that there is a fixed substring.
317 An update may be needed if optimizer marks as "INTUITable"
318 RExen without fixed substrings. Similarly, it is assumed that
319 lengths of all the strings are no more than minlen, thus they
320 cannot come from lookahead.
321 (Or minlen should take into account lookahead.) */
323 /* A failure to find a constant substring means that there is no need to make
324 an expensive call to REx engine, thus we celebrate a failure. Similarly,
325 finding a substring too deep into the string means that less calls to
326 regtry() should be needed.
328 REx compiler's optimizer found 4 possible hints:
329 a) Anchored substring;
331 c) Whether we are anchored (beginning-of-line or \G);
332 d) First node (of those at offset 0) which may distingush positions;
333 We use a)b)d) and multiline-part of c), and try to find a position in the
334 string which does not contradict any of them.
337 /* Most of decisions we do here should have been done at compile time.
338 The nodes of the REx which we used for the search should have been
339 deleted from the finite automaton. */
342 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
343 char *strend, U32 flags, re_scream_pos_data *data)
346 register I32 start_shift = 0;
347 /* Should be nonnegative! */
348 register I32 end_shift = 0;
353 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
355 register char *other_last = NULL; /* other substr checked before this */
356 char *check_at = NULL; /* check substr found at this pos */
357 const I32 multiline = prog->reganch & PMf_MULTILINE;
359 const char * const i_strpos = strpos;
362 GET_RE_DEBUG_FLAGS_DECL;
364 RX_MATCH_UTF8_set(prog,do_utf8);
366 if (prog->reganch & ROPT_UTF8) {
367 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
368 "UTF-8 regex...\n"));
369 PL_reg_flags |= RF_utf8;
373 RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
374 PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
378 if (PL_reg_match_utf8)
379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
380 "UTF-8 target...\n"));
381 PerlIO_printf(Perl_debug_log,
382 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
383 PL_colors[4], PL_colors[5], PL_colors[0],
386 (strlen(prog->precomp) > 60 ? "..." : ""),
388 (int)(len > 60 ? 60 : len),
390 (len > 60 ? "..." : "")
394 /* CHR_DIST() would be more correct here but it makes things slow. */
395 if (prog->minlen > strend - strpos) {
396 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
397 "String too short... [re_intuit_start]\n"));
400 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
403 if (!prog->check_utf8 && prog->check_substr)
404 to_utf8_substr(prog);
405 check = prog->check_utf8;
407 if (!prog->check_substr && prog->check_utf8)
408 to_byte_substr(prog);
409 check = prog->check_substr;
411 if (check == &PL_sv_undef) {
412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
413 "Non-utf string cannot match utf check string\n"));
416 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
417 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
418 || ( (prog->reganch & ROPT_ANCH_BOL)
419 && !multiline ) ); /* Check after \n? */
422 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
423 | ROPT_IMPLICIT)) /* not a real BOL */
424 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
426 && (strpos != strbeg)) {
427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
430 if (prog->check_offset_min == prog->check_offset_max &&
431 !(prog->reganch & ROPT_CANY_SEEN)) {
432 /* Substring at constant offset from beg-of-str... */
435 s = HOP3c(strpos, prog->check_offset_min, strend);
437 slen = SvCUR(check); /* >= 1 */
439 if ( strend - s > slen || strend - s < slen - 1
440 || (strend - s == slen && strend[-1] != '\n')) {
441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
444 /* Now should match s[0..slen-2] */
446 if (slen && (*SvPVX_const(check) != *s
448 && memNE(SvPVX_const(check), s, slen)))) {
450 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
454 else if (*SvPVX_const(check) != *s
455 || ((slen = SvCUR(check)) > 1
456 && memNE(SvPVX_const(check), s, slen)))
459 goto success_at_start;
462 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
464 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
465 end_shift = prog->minlen - start_shift -
466 CHR_SVLEN(check) + (SvTAIL(check) != 0);
468 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
469 - (SvTAIL(check) != 0);
470 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
472 if (end_shift < eshift)
476 else { /* Can match at random position */
479 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
480 /* Should be nonnegative! */
481 end_shift = prog->minlen - start_shift -
482 CHR_SVLEN(check) + (SvTAIL(check) != 0);
485 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
487 Perl_croak(aTHX_ "panic: end_shift");
491 /* Find a possible match in the region s..strend by looking for
492 the "check" substring in the region corrected by start/end_shift. */
493 if (flags & REXEC_SCREAM) {
494 I32 p = -1; /* Internal iterator of scream. */
495 I32 * const pp = data ? data->scream_pos : &p;
497 if (PL_screamfirst[BmRARE(check)] >= 0
498 || ( BmRARE(check) == '\n'
499 && (BmPREVIOUS(check) == SvCUR(check) - 1)
501 s = screaminstr(sv, check,
502 start_shift + (s - strbeg), end_shift, pp, 0);
505 /* we may be pointing at the wrong string */
506 if (s && RX_MATCH_COPIED(prog))
507 s = strbeg + (s - SvPVX_const(sv));
509 *data->scream_olds = s;
511 else if (prog->reganch & ROPT_CANY_SEEN)
512 s = fbm_instr((U8*)(s + start_shift),
513 (U8*)(strend - end_shift),
514 check, multiline ? FBMrf_MULTILINE : 0);
516 s = fbm_instr(HOP3(s, start_shift, strend),
517 HOP3(strend, -end_shift, strbeg),
518 check, multiline ? FBMrf_MULTILINE : 0);
520 /* Update the count-of-usability, remove useless subpatterns,
523 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
524 (s ? "Found" : "Did not find"),
525 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
527 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
529 PL_colors[1], (SvTAIL(check) ? "$" : ""),
530 (s ? " at offset " : "...\n") ) );
537 /* Finish the diagnostic message */
538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
540 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
541 Start with the other substr.
542 XXXX no SCREAM optimization yet - and a very coarse implementation
543 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
544 *always* match. Probably should be marked during compile...
545 Probably it is right to do no SCREAM here...
548 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
549 /* Take into account the "other" substring. */
550 /* XXXX May be hopelessly wrong for UTF... */
553 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
556 char * const last = HOP3c(s, -start_shift, strbeg);
558 char * const saved_s = s;
561 t = s - prog->check_offset_max;
562 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
564 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
569 t = HOP3c(t, prog->anchored_offset, strend);
570 if (t < other_last) /* These positions already checked */
572 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
575 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
576 /* On end-of-str: see comment below. */
577 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
578 if (must == &PL_sv_undef) {
580 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
585 HOP3(HOP3(last1, prog->anchored_offset, strend)
586 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
588 multiline ? FBMrf_MULTILINE : 0
590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
591 "%s anchored substr \"%s%.*s%s\"%s",
592 (s ? "Found" : "Contradicts"),
595 - (SvTAIL(must)!=0)),
597 PL_colors[1], (SvTAIL(must) ? "$" : "")));
599 if (last1 >= last2) {
600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
601 ", giving up...\n"));
604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
605 ", trying floating at offset %ld...\n",
606 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
607 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
608 s = HOP3c(last, 1, strend);
612 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
613 (long)(s - i_strpos)));
614 t = HOP3c(s, -prog->anchored_offset, strbeg);
615 other_last = HOP3c(s, 1, strend);
623 else { /* Take into account the floating substring. */
625 char * const saved_s = s;
628 t = HOP3c(s, -start_shift, strbeg);
630 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
631 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
632 last = HOP3c(t, prog->float_max_offset, strend);
633 s = HOP3c(t, prog->float_min_offset, strend);
636 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
637 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
638 /* fbm_instr() takes into account exact value of end-of-str
639 if the check is SvTAIL(ed). Since false positives are OK,
640 and end-of-str is not later than strend we are OK. */
641 if (must == &PL_sv_undef) {
643 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
646 s = fbm_instr((unsigned char*)s,
647 (unsigned char*)last + SvCUR(must)
649 must, multiline ? FBMrf_MULTILINE : 0);
650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
651 (s ? "Found" : "Contradicts"),
653 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
659 ", giving up...\n"));
662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
663 ", trying anchored starting at offset %ld...\n",
664 (long)(saved_s + 1 - i_strpos)));
666 s = HOP3c(t, 1, strend);
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
671 (long)(s - i_strpos)));
672 other_last = s; /* Fix this later. --Hugo */
681 t = s - prog->check_offset_max;
682 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
684 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
686 /* Fixed substring is found far enough so that the match
687 cannot start at strpos. */
689 if (ml_anch && t[-1] != '\n') {
690 /* Eventually fbm_*() should handle this, but often
691 anchored_offset is not 0, so this check will not be wasted. */
692 /* XXXX In the code below we prefer to look for "^" even in
693 presence of anchored substrings. And we search even
694 beyond the found float position. These pessimizations
695 are historical artefacts only. */
697 while (t < strend - prog->minlen) {
699 if (t < check_at - prog->check_offset_min) {
700 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
701 /* Since we moved from the found position,
702 we definitely contradict the found anchored
703 substr. Due to the above check we do not
704 contradict "check" substr.
705 Thus we can arrive here only if check substr
706 is float. Redo checking for "other"=="fixed".
709 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
710 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
711 goto do_other_anchored;
713 /* We don't contradict the found floating substring. */
714 /* XXXX Why not check for STCLASS? */
716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
717 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
720 /* Position contradicts check-string */
721 /* XXXX probably better to look for check-string
722 than for "\n", so one should lower the limit for t? */
723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
724 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
725 other_last = strpos = s = t + 1;
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
731 PL_colors[0], PL_colors[1]));
735 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
736 PL_colors[0], PL_colors[1]));
740 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
743 /* The found string does not prohibit matching at strpos,
744 - no optimization of calling REx engine can be performed,
745 unless it was an MBOL and we are not after MBOL,
746 or a future STCLASS check will fail this. */
748 /* Even in this situation we may use MBOL flag if strpos is offset
749 wrt the start of the string. */
750 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
751 && (strpos != strbeg) && strpos[-1] != '\n'
752 /* May be due to an implicit anchor of m{.*foo} */
753 && !(prog->reganch & ROPT_IMPLICIT))
758 DEBUG_EXECUTE_r( if (ml_anch)
759 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
760 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
763 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
765 prog->check_utf8 /* Could be deleted already */
766 && --BmUSEFUL(prog->check_utf8) < 0
767 && (prog->check_utf8 == prog->float_utf8)
769 prog->check_substr /* Could be deleted already */
770 && --BmUSEFUL(prog->check_substr) < 0
771 && (prog->check_substr == prog->float_substr)
774 /* If flags & SOMETHING - do not do it many times on the same match */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
776 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
777 if (do_utf8 ? prog->check_substr : prog->check_utf8)
778 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
779 prog->check_substr = prog->check_utf8 = NULL; /* disable */
780 prog->float_substr = prog->float_utf8 = NULL; /* clear */
781 check = NULL; /* abort */
783 /* XXXX This is a remnant of the old implementation. It
784 looks wasteful, since now INTUIT can use many
786 prog->reganch &= ~RE_USE_INTUIT;
793 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
794 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
795 /* minlen == 0 is possible if regstclass is \b or \B,
796 and the fixed substr is ''$.
797 Since minlen is already taken into account, s+1 is before strend;
798 accidentally, minlen >= 1 guaranties no false positives at s + 1
799 even for \b or \B. But (minlen? 1 : 0) below assumes that
800 regstclass does not come from lookahead... */
801 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
802 This leaves EXACTF only, which is dealt with in find_byclass(). */
803 const U8* const str = (U8*)STRING(prog->regstclass);
804 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
805 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
807 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
808 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
809 : (prog->float_substr || prog->float_utf8
810 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
813 /*if (OP(prog->regstclass) == TRIE)
816 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
819 const char *what = NULL;
821 if (endpos == strend) {
822 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
823 "Could not match STCLASS...\n") );
826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
827 "This position contradicts STCLASS...\n") );
828 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
830 /* Contradict one of substrings */
831 if (prog->anchored_substr || prog->anchored_utf8) {
832 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
833 DEBUG_EXECUTE_r( what = "anchored" );
835 s = HOP3c(t, 1, strend);
836 if (s + start_shift + end_shift > strend) {
837 /* XXXX Should be taken into account earlier? */
838 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
839 "Could not match STCLASS...\n") );
844 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
845 "Looking for %s substr starting at offset %ld...\n",
846 what, (long)(s + start_shift - i_strpos)) );
849 /* Have both, check_string is floating */
850 if (t + start_shift >= check_at) /* Contradicts floating=check */
851 goto retry_floating_check;
852 /* Recheck anchored substring, but not floating... */
856 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
857 "Looking for anchored substr starting at offset %ld...\n",
858 (long)(other_last - i_strpos)) );
859 goto do_other_anchored;
861 /* Another way we could have checked stclass at the
862 current position only: */
867 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
868 "Looking for /%s^%s/m starting at offset %ld...\n",
869 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
872 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
874 /* Check is floating subtring. */
875 retry_floating_check:
876 t = check_at - start_shift;
877 DEBUG_EXECUTE_r( what = "floating" );
878 goto hop_and_restart;
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
882 "By STCLASS: moving %ld --> %ld\n",
883 (long)(t - i_strpos), (long)(s - i_strpos))
887 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
888 "Does not contradict STCLASS...\n");
893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
894 PL_colors[4], (check ? "Guessed" : "Giving up"),
895 PL_colors[5], (long)(s - i_strpos)) );
898 fail_finish: /* Substring not found */
899 if (prog->check_substr || prog->check_utf8) /* could be removed already */
900 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
903 PL_colors[4], PL_colors[5]));
907 /* We know what class REx starts with. Try to find this position... */
908 /* if reginfo is NULL, its a dryrun */
909 /* annoyingly all the vars in this routine have different names from their counterparts
910 in regmatch. /grrr */
912 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
913 foldlen, foldbuf, uniflags) STMT_START { \
914 switch (trie_type) { \
915 case trie_utf8_fold: \
917 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
922 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
923 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
924 foldlen -= UNISKIP( uvc ); \
925 uscan = foldbuf + UNISKIP( uvc ); \
929 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
937 charid = trie->charmap[ uvc ]; \
941 if (trie->widecharmap) { \
942 SV** const svpp = hv_fetch(trie->widecharmap, \
943 (char*)&uvc, sizeof(UV), 0); \
945 charid = (U16)SvIV(*svpp); \
950 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
953 ibcmp_utf8(s, NULL, 0, do_utf8, \
954 m, NULL, ln, (bool)UTF)) \
955 && (!reginfo || regtry(reginfo, s)) ) \
958 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
959 uvchr_to_utf8(tmpbuf, c); \
960 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
962 && (f == c1 || f == c2) \
963 && (ln == foldlen || \
964 !ibcmp_utf8((char *) foldbuf, \
965 NULL, foldlen, do_utf8, \
967 NULL, ln, (bool)UTF)) \
968 && (!reginfo || regtry(reginfo, s)) ) \
973 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
977 && (ln == 1 || !(OP(c) == EXACTF \
979 : ibcmp_locale(s, m, ln))) \
980 && (!reginfo || regtry(reginfo, s)) ) \
986 #define REXEC_FBC_UTF8_SCAN(CoDe) \
988 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
994 #define REXEC_FBC_SCAN(CoDe) \
996 while (s < strend) { \
1002 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1003 REXEC_FBC_UTF8_SCAN( \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1014 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1017 if (tmp && (!reginfo || regtry(reginfo, s))) \
1026 #define REXEC_FBC_TRYIT \
1027 if ((!reginfo || regtry(reginfo, s))) \
1030 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1033 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1036 REXEC_FBC_CLASS_SCAN(CoNd); \
1040 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1041 PL_reg_flags |= RF_tainted; \
1043 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1046 REXEC_FBC_CLASS_SCAN(CoNd); \
1051 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1052 const char *strend, const regmatch_info *reginfo)
1055 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1059 register STRLEN uskip;
1063 register I32 tmp = 1; /* Scratch variable? */
1064 register const bool do_utf8 = PL_reg_match_utf8;
1066 /* We know what class it must start with. */
1070 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1071 !UTF8_IS_INVARIANT((U8)s[0]) ?
1072 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1073 REGINCLASS(prog, c, (U8*)s));
1076 while (s < strend) {
1079 if (REGINCLASS(prog, c, (U8*)s) ||
1080 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1081 /* The assignment of 2 is intentional:
1082 * for the folded sharp s, the skip is 2. */
1083 (skip = SHARP_S_SKIP))) {
1084 if (tmp && (!reginfo || regtry(reginfo, s)))
1097 if (tmp && (!reginfo || regtry(reginfo, s)))
1105 ln = STR_LEN(c); /* length to match in octets/bytes */
1106 lnc = (I32) ln; /* length to match in characters */
1108 STRLEN ulen1, ulen2;
1110 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1111 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1112 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1114 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1115 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1117 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1119 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1122 while (sm < ((U8 *) m + ln)) {
1137 c2 = PL_fold_locale[c1];
1139 e = HOP3c(strend, -((I32)lnc), s);
1141 if (!reginfo && e < s)
1142 e = s; /* Due to minlen logic of intuit() */
1144 /* The idea in the EXACTF* cases is to first find the
1145 * first character of the EXACTF* node and then, if
1146 * necessary, case-insensitively compare the full
1147 * text of the node. The c1 and c2 are the first
1148 * characters (though in Unicode it gets a bit
1149 * more complicated because there are more cases
1150 * than just upper and lower: one needs to use
1151 * the so-called folding case for case-insensitive
1152 * matching (called "loose matching" in Unicode).
1153 * ibcmp_utf8() will do just that. */
1157 U8 tmpbuf [UTF8_MAXBYTES+1];
1158 STRLEN len, foldlen;
1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1161 /* Upper and lower of 1st char are equal -
1162 * probably not a "letter". */
1164 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1166 REXEC_FBC_EXACTISH_CHECK(c == c1);
1171 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1174 /* Handle some of the three Greek sigmas cases.
1175 * Note that not all the possible combinations
1176 * are handled here: some of them are handled
1177 * by the standard folding rules, and some of
1178 * them (the character class or ANYOF cases)
1179 * are handled during compiletime in
1180 * regexec.c:S_regclass(). */
1181 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1182 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1183 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1185 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1191 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1193 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1197 PL_reg_flags |= RF_tainted;
1204 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1205 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1207 tmp = ((OP(c) == BOUND ?
1208 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1209 LOAD_UTF8_CHARCLASS_ALNUM();
1210 REXEC_FBC_UTF8_SCAN(
1211 if (tmp == !(OP(c) == BOUND ?
1212 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1213 isALNUM_LC_utf8((U8*)s)))
1221 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1222 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1225 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1231 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1235 PL_reg_flags |= RF_tainted;
1242 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1243 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1245 tmp = ((OP(c) == NBOUND ?
1246 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1247 LOAD_UTF8_CHARCLASS_ALNUM();
1248 REXEC_FBC_UTF8_SCAN(
1249 if (tmp == !(OP(c) == NBOUND ?
1250 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1251 isALNUM_LC_utf8((U8*)s)))
1253 else REXEC_FBC_TRYIT;
1257 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1258 tmp = ((OP(c) == NBOUND ?
1259 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1262 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1264 else REXEC_FBC_TRYIT;
1267 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1271 REXEC_FBC_CSCAN_PRELOAD(
1272 LOAD_UTF8_CHARCLASS_ALNUM(),
1273 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1277 REXEC_FBC_CSCAN_TAINT(
1278 isALNUM_LC_utf8((U8*)s),
1282 REXEC_FBC_CSCAN_PRELOAD(
1283 LOAD_UTF8_CHARCLASS_ALNUM(),
1284 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1288 REXEC_FBC_CSCAN_TAINT(
1289 !isALNUM_LC_utf8((U8*)s),
1293 REXEC_FBC_CSCAN_PRELOAD(
1294 LOAD_UTF8_CHARCLASS_SPACE(),
1295 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1299 REXEC_FBC_CSCAN_TAINT(
1300 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1304 REXEC_FBC_CSCAN_PRELOAD(
1305 LOAD_UTF8_CHARCLASS_SPACE(),
1306 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1310 REXEC_FBC_CSCAN_TAINT(
1311 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1315 REXEC_FBC_CSCAN_PRELOAD(
1316 LOAD_UTF8_CHARCLASS_DIGIT(),
1317 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1321 REXEC_FBC_CSCAN_TAINT(
1322 isDIGIT_LC_utf8((U8*)s),
1326 REXEC_FBC_CSCAN_PRELOAD(
1327 LOAD_UTF8_CHARCLASS_DIGIT(),
1328 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1332 REXEC_FBC_CSCAN_TAINT(
1333 !isDIGIT_LC_utf8((U8*)s),
1337 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1339 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1340 trie_type = do_utf8 ?
1341 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1343 /* what trie are we using right now */
1345 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1346 reg_trie_data *trie=aho->trie;
1348 const char *last_start = strend - trie->minlen;
1350 const char *real_start = s;
1352 STRLEN maxlen = trie->maxlen;
1354 U8 **points; /* map of where we were in the input string
1355 when reading a given string. For ASCII this
1356 is unnecessary overhead as the relationship
1357 is always 1:1, but for unicode, especially
1358 case folded unicode this is not true. */
1359 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1361 GET_RE_DEBUG_FLAGS_DECL;
1363 /* We can't just allocate points here. We need to wrap it in
1364 * an SV so it gets freed properly if there is a croak while
1365 * running the match */
1368 sv_points=newSV(maxlen * sizeof(U8 *));
1369 SvCUR_set(sv_points,
1370 maxlen * sizeof(U8 *));
1371 SvPOK_on(sv_points);
1372 sv_2mortal(sv_points);
1373 points=(U8**)SvPV_nolen(sv_points );
1375 if (trie->bitmap && trie_type != trie_utf8_fold) {
1376 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1381 while (s <= last_start) {
1382 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1390 U8 *uscan = (U8*)NULL;
1391 U8 *leftmost = NULL;
1395 while ( state && uc <= (U8*)strend ) {
1397 if (aho->states[ state ].wordnum) {
1398 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1399 if (!leftmost || lpos < leftmost)
1403 points[pointpos++ % maxlen]= uc;
1404 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1405 uvc, charid, foldlen, foldbuf, uniflags);
1406 DEBUG_TRIE_EXECUTE_r(
1407 PerlIO_printf(Perl_debug_log,
1408 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1409 (int)((const char*)uc - real_start), charid, uvc)
1415 U32 word = aho->states[ state ].wordnum;
1417 base = aho->states[ state ].trans.base;
1419 DEBUG_TRIE_EXECUTE_r(
1420 PerlIO_printf( Perl_debug_log,
1421 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1422 failed ? "Fail transition to " : "",
1423 state, base, uvc, word)
1428 (base + charid > trie->uniquecharcount )
1429 && (base + charid - 1 - trie->uniquecharcount
1431 && trie->trans[base + charid - 1 -
1432 trie->uniquecharcount].check == state
1433 && (tmp=trie->trans[base + charid - 1 -
1434 trie->uniquecharcount ].next))
1444 state = aho->fail[state];
1448 /* we must be accepting here */
1456 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1457 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1463 if ( aho->states[ state ].wordnum ) {
1464 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1465 if (!leftmost || lpos < leftmost)
1468 DEBUG_TRIE_EXECUTE_r(
1469 PerlIO_printf( Perl_debug_log,
1470 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1475 s = (char*)leftmost;
1476 if (!reginfo || regtry(reginfo, s)) {
1491 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1500 - regexec_flags - match a regexp against a string
1503 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1504 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1505 /* strend: pointer to null at end of string */
1506 /* strbeg: real beginning of string */
1507 /* minend: end of match must be >=minend after stringarg. */
1508 /* data: May be used for some additional optimizations. */
1509 /* nosave: For optimizations. */
1513 register regnode *c;
1514 register char *startpos = stringarg;
1515 I32 minlen; /* must match at least this many chars */
1516 I32 dontbother = 0; /* how many characters not to try at end */
1517 I32 end_shift = 0; /* Same for the end. */ /* CC */
1518 I32 scream_pos = -1; /* Internal iterator of scream. */
1519 char *scream_olds = NULL;
1520 SV* const oreplsv = GvSV(PL_replgv);
1521 const bool do_utf8 = DO_UTF8(sv);
1524 regmatch_info reginfo; /* create some info to pass to regtry etc */
1526 GET_RE_DEBUG_FLAGS_DECL;
1528 PERL_UNUSED_ARG(data);
1530 /* Be paranoid... */
1531 if (prog == NULL || startpos == NULL) {
1532 Perl_croak(aTHX_ "NULL regexp parameter");
1536 multiline = prog->reganch & PMf_MULTILINE;
1537 reginfo.prog = prog;
1539 RX_MATCH_UTF8_set(prog, do_utf8);
1541 minlen = prog->minlen;
1542 if (strend - startpos < minlen) {
1543 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1544 "String too short [regexec_flags]...\n"));
1548 /* Check validity of program. */
1549 if (UCHARAT(prog->program) != REG_MAGIC) {
1550 Perl_croak(aTHX_ "corrupted regexp program");
1554 PL_reg_eval_set = 0;
1557 if (prog->reganch & ROPT_UTF8)
1558 PL_reg_flags |= RF_utf8;
1560 /* Mark beginning of line for ^ and lookbehind. */
1561 reginfo.bol = startpos; /* XXX not used ??? */
1565 /* Mark end of line for $ (and such) */
1568 /* see how far we have to get to not match where we matched before */
1569 reginfo.till = startpos+minend;
1571 /* If there is a "must appear" string, look for it. */
1574 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1577 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1578 reginfo.ganch = startpos;
1579 else if (sv && SvTYPE(sv) >= SVt_PVMG
1581 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1582 && mg->mg_len >= 0) {
1583 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1584 if (prog->reganch & ROPT_ANCH_GPOS) {
1585 if (s > reginfo.ganch)
1590 else /* pos() not defined */
1591 reginfo.ganch = strbeg;
1594 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1595 re_scream_pos_data d;
1597 d.scream_olds = &scream_olds;
1598 d.scream_pos = &scream_pos;
1599 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1602 goto phooey; /* not present */
1607 RE_PV_DISPLAY_DECL(s0, len0, UTF,
1608 PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
1609 RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
1610 PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
1614 PerlIO_printf(Perl_debug_log,
1615 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1616 PL_colors[4], PL_colors[5], PL_colors[0],
1619 len0 > 60 ? "..." : "",
1621 (int)(len1 > 60 ? 60 : len1),
1623 (len1 > 60 ? "..." : "")
1627 /* Simplest case: anchored match need be tried only once. */
1628 /* [unless only anchor is BOL and multiline is set] */
1629 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1630 if (s == startpos && regtry(®info, startpos))
1632 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1633 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1638 dontbother = minlen - 1;
1639 end = HOP3c(strend, -dontbother, strbeg) - 1;
1640 /* for multiline we only have to try after newlines */
1641 if (prog->check_substr || prog->check_utf8) {
1645 if (regtry(®info, s))
1650 if (prog->reganch & RE_USE_INTUIT) {
1651 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1662 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1663 if (regtry(®info, s))
1670 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1671 if (regtry(®info, reginfo.ganch))
1676 /* Messy cases: unanchored match. */
1677 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1678 /* we have /x+whatever/ */
1679 /* it must be a one character string (XXXX Except UTF?) */
1684 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1685 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1686 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1691 DEBUG_EXECUTE_r( did_match = 1 );
1692 if (regtry(®info, s)) goto got_it;
1694 while (s < strend && *s == ch)
1702 DEBUG_EXECUTE_r( did_match = 1 );
1703 if (regtry(®info, s)) goto got_it;
1705 while (s < strend && *s == ch)
1710 DEBUG_EXECUTE_r(if (!did_match)
1711 PerlIO_printf(Perl_debug_log,
1712 "Did not find anchored character...\n")
1715 else if (prog->anchored_substr != NULL
1716 || prog->anchored_utf8 != NULL
1717 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1718 && prog->float_max_offset < strend - s)) {
1723 char *last1; /* Last position checked before */
1727 if (prog->anchored_substr || prog->anchored_utf8) {
1728 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1729 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1730 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1731 back_max = back_min = prog->anchored_offset;
1733 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1734 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1735 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1736 back_max = prog->float_max_offset;
1737 back_min = prog->float_min_offset;
1739 if (must == &PL_sv_undef)
1740 /* could not downgrade utf8 check substring, so must fail */
1743 last = HOP3c(strend, /* Cannot start after this */
1744 -(I32)(CHR_SVLEN(must)
1745 - (SvTAIL(must) != 0) + back_min), strbeg);
1748 last1 = HOPc(s, -1);
1750 last1 = s - 1; /* bogus */
1752 /* XXXX check_substr already used to find "s", can optimize if
1753 check_substr==must. */
1755 dontbother = end_shift;
1756 strend = HOPc(strend, -dontbother);
1757 while ( (s <= last) &&
1758 ((flags & REXEC_SCREAM)
1759 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1760 end_shift, &scream_pos, 0))
1761 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1762 (unsigned char*)strend, must,
1763 multiline ? FBMrf_MULTILINE : 0))) ) {
1764 /* we may be pointing at the wrong string */
1765 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1766 s = strbeg + (s - SvPVX_const(sv));
1767 DEBUG_EXECUTE_r( did_match = 1 );
1768 if (HOPc(s, -back_max) > last1) {
1769 last1 = HOPc(s, -back_min);
1770 s = HOPc(s, -back_max);
1773 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1775 last1 = HOPc(s, -back_min);
1779 while (s <= last1) {
1780 if (regtry(®info, s))
1786 while (s <= last1) {
1787 if (regtry(®info, s))
1793 DEBUG_EXECUTE_r(if (!did_match)
1794 PerlIO_printf(Perl_debug_log,
1795 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1796 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1797 ? "anchored" : "floating"),
1799 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1801 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1805 else if ((c = prog->regstclass)) {
1807 const OPCODE op = OP(prog->regstclass);
1808 /* don't bother with what can't match */
1809 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
1810 strend = HOPc(strend, -(minlen - 1));
1813 SV * const prop = sv_newmortal();
1814 regprop(prog, prop, c);
1816 RE_PV_DISPLAY_DECL(s0,len0,UTF,
1817 PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
1818 RE_PV_DISPLAY_DECL(s1,len1,UTF,
1819 PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
1820 PerlIO_printf(Perl_debug_log,
1821 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
1823 len1, len1, s1, (int)(strend - s));
1826 if (find_byclass(prog, c, s, strend, ®info))
1828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1832 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1837 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1838 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1839 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1841 if (flags & REXEC_SCREAM) {
1842 last = screaminstr(sv, float_real, s - strbeg,
1843 end_shift, &scream_pos, 1); /* last one */
1845 last = scream_olds; /* Only one occurrence. */
1846 /* we may be pointing at the wrong string */
1847 else if (RX_MATCH_COPIED(prog))
1848 s = strbeg + (s - SvPVX_const(sv));
1852 const char * const little = SvPV_const(float_real, len);
1854 if (SvTAIL(float_real)) {
1855 if (memEQ(strend - len + 1, little, len - 1))
1856 last = strend - len + 1;
1857 else if (!multiline)
1858 last = memEQ(strend - len, little, len)
1859 ? strend - len : NULL;
1865 last = rninstr(s, strend, little, little + len);
1867 last = strend; /* matching "$" */
1871 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1872 "%sCan't trim the tail, match fails (should not happen)%s\n",
1873 PL_colors[4], PL_colors[5]));
1874 goto phooey; /* Should not happen! */
1876 dontbother = strend - last + prog->float_min_offset;
1878 if (minlen && (dontbother < minlen))
1879 dontbother = minlen - 1;
1880 strend -= dontbother; /* this one's always in bytes! */
1881 /* We don't know much -- general case. */
1884 if (regtry(®info, s))
1893 if (regtry(®info, s))
1895 } while (s++ < strend);
1903 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1905 if (PL_reg_eval_set) {
1906 /* Preserve the current value of $^R */
1907 if (oreplsv != GvSV(PL_replgv))
1908 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1909 restored, the value remains
1911 restore_pos(aTHX_ prog);
1914 /* make sure $`, $&, $', and $digit will work later */
1915 if ( !(flags & REXEC_NOT_FIRST) ) {
1916 RX_MATCH_COPY_FREE(prog);
1917 if (flags & REXEC_COPY_STR) {
1918 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
1919 #ifdef PERL_OLD_COPY_ON_WRITE
1921 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1923 PerlIO_printf(Perl_debug_log,
1924 "Copy on write: regexp capture, type %d\n",
1927 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
1928 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
1929 assert (SvPOKp(prog->saved_copy));
1933 RX_MATCH_COPIED_on(prog);
1934 s = savepvn(strbeg, i);
1940 prog->subbeg = strbeg;
1941 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1948 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1949 PL_colors[4], PL_colors[5]));
1950 if (PL_reg_eval_set)
1951 restore_pos(aTHX_ prog);
1956 - regtry - try match at specific point
1958 STATIC I32 /* 0 failure, 1 success */
1959 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
1965 regexp *prog = reginfo->prog;
1966 GET_RE_DEBUG_FLAGS_DECL;
1969 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1971 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1974 PL_reg_eval_set = RS_init;
1975 DEBUG_EXECUTE_r(DEBUG_s(
1976 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1977 (IV)(PL_stack_sp - PL_stack_base));
1979 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
1980 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1981 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1983 /* Apparently this is not needed, judging by wantarray. */
1984 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1985 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1988 /* Make $_ available to executed code. */
1989 if (reginfo->sv != DEFSV) {
1991 DEFSV = reginfo->sv;
1994 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1995 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
1996 /* prepare for quick setting of pos */
1997 #ifdef PERL_OLD_COPY_ON_WRITE
1999 sv_force_normal_flags(sv, 0);
2001 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2002 &PL_vtbl_mglob, NULL, 0);
2006 PL_reg_oldpos = mg->mg_len;
2007 SAVEDESTRUCTOR_X(restore_pos, prog);
2009 if (!PL_reg_curpm) {
2010 Newxz(PL_reg_curpm, 1, PMOP);
2013 SV* const repointer = newSViv(0);
2014 /* so we know which PL_regex_padav element is PL_reg_curpm */
2015 SvFLAGS(repointer) |= SVf_BREAK;
2016 av_push(PL_regex_padav,repointer);
2017 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2018 PL_regex_pad = AvARRAY(PL_regex_padav);
2022 PM_SETRE(PL_reg_curpm, prog);
2023 PL_reg_oldcurpm = PL_curpm;
2024 PL_curpm = PL_reg_curpm;
2025 if (RX_MATCH_COPIED(prog)) {
2026 /* Here is a serious problem: we cannot rewrite subbeg,
2027 since it may be needed if this match fails. Thus
2028 $` inside (?{}) could fail... */
2029 PL_reg_oldsaved = prog->subbeg;
2030 PL_reg_oldsavedlen = prog->sublen;
2031 #ifdef PERL_OLD_COPY_ON_WRITE
2032 PL_nrs = prog->saved_copy;
2034 RX_MATCH_COPIED_off(prog);
2037 PL_reg_oldsaved = NULL;
2038 prog->subbeg = PL_bostr;
2039 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2041 prog->startp[0] = startpos - PL_bostr;
2042 PL_reginput = startpos;
2043 PL_regstartp = prog->startp;
2044 PL_regendp = prog->endp;
2045 PL_reglastparen = &prog->lastparen;
2046 PL_reglastcloseparen = &prog->lastcloseparen;
2047 prog->lastparen = 0;
2048 prog->lastcloseparen = 0;
2050 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2051 if (PL_reg_start_tmpl <= prog->nparens) {
2052 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2053 if(PL_reg_start_tmp)
2054 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2056 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2059 /* XXXX What this code is doing here?!!! There should be no need
2060 to do this again and again, PL_reglastparen should take care of
2063 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2064 * Actually, the code in regcppop() (which Ilya may be meaning by
2065 * PL_reglastparen), is not needed at all by the test suite
2066 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2067 * enough, for building DynaLoader, or otherwise this
2068 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2069 * will happen. Meanwhile, this code *is* needed for the
2070 * above-mentioned test suite tests to succeed. The common theme
2071 * on those tests seems to be returning null fields from matches.
2076 if (prog->nparens) {
2078 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2085 if (regmatch(reginfo, prog->program + 1)) {
2086 prog->endp[0] = PL_reginput - PL_bostr;
2089 REGCP_UNWIND(lastcp);
2094 #define sayYES goto yes
2095 #define sayNO goto no
2096 #define sayNO_ANYOF goto no_anyof
2097 #define sayYES_FINAL goto yes_final
2098 #define sayNO_FINAL goto no_final
2099 #define sayNO_SILENT goto do_no
2100 #define saySAME(x) if (x) goto yes; else goto no
2102 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2103 #define POSCACHE_SEEN 1 /* we know what we're caching */
2104 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2106 #define CACHEsayYES STMT_START { \
2107 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2108 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2109 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2110 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2112 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2113 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2116 /* cache records failure, but this is success */ \
2118 PerlIO_printf(Perl_debug_log, \
2119 "%*s (remove success from failure cache)\n", \
2120 REPORT_CODE_OFF+PL_regindent*2, "") \
2122 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2128 #define CACHEsayNO STMT_START { \
2129 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2130 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2131 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2132 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2134 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2135 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2138 /* cache records success, but this is failure */ \
2140 PerlIO_printf(Perl_debug_log, \
2141 "%*s (remove failure from success cache)\n", \
2142 REPORT_CODE_OFF+PL_regindent*2, "") \
2144 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2150 /* this is used to determine how far from the left messages like
2151 'failed...' are printed. Currently 29 makes these messages line
2152 up with the opcode they refer to. Earlier perls used 25 which
2153 left these messages outdented making reviewing a debug output
2156 #define REPORT_CODE_OFF 29
2159 /* Make sure there is a test for this +1 options in re_tests */
2160 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2162 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2163 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2165 #define SLAB_FIRST(s) (&(s)->states[0])
2166 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2168 /* grab a new slab and return the first slot in it */
2170 STATIC regmatch_state *
2173 #if PERL_VERSION < 9
2176 regmatch_slab *s = PL_regmatch_slab->next;
2178 Newx(s, 1, regmatch_slab);
2179 s->prev = PL_regmatch_slab;
2181 PL_regmatch_slab->next = s;
2183 PL_regmatch_slab = s;
2184 return SLAB_FIRST(s);
2187 /* simulate a recursive call to regmatch */
2189 #define REGMATCH(ns, where) \
2192 st->resume_state = resume_##where; \
2193 goto start_recurse; \
2194 resume_point_##where:
2196 /* push a new state then goto it */
2198 #define PUSH_STATE_GOTO(state, node) \
2200 st->resume_state = state; \
2203 /* push a new state with success backtracking, then goto it */
2205 #define PUSH_YES_STATE_GOTO(state, node) \
2207 st->resume_state = state; \
2208 goto push_yes_state;
2213 - regmatch - main matching routine
2215 * Conceptually the strategy is simple: check to see whether the current
2216 * node matches, call self recursively to see whether the rest matches,
2217 * and then act accordingly. In practice we make some effort to avoid
2218 * recursion, in particular by going through "ordinary" nodes (that don't
2219 * need to know whether the rest of the match failed) by a loop instead of
2222 /* [lwall] I've hoisted the register declarations to the outer block in order to
2223 * maybe save a little bit of pushing and popping on the stack. It also takes
2224 * advantage of machines that use a register save mask on subroutine entry.
2226 * This function used to be heavily recursive, but since this had the
2227 * effect of blowing the CPU stack on complex regexes, it has been
2228 * restructured to be iterative, and to save state onto the heap rather
2229 * than the stack. Essentially whereever regmatch() used to be called, it
2230 * pushes the current state, notes where to return, then jumps back into
2233 * Originally the structure of this function used to look something like
2238 while (scan != NULL) {
2239 a++; // do stuff with a and b
2245 if (regmatch(...)) // recurse
2255 * Now it looks something like this:
2263 regmatch_state *st = new();
2265 st->a++; // do stuff with a and b
2267 while (scan != NULL) {
2275 st->resume_state = resume_FOO;
2276 goto start_recurse; // recurse
2285 st = new(); push a new state
2286 st->a = 1; st->b = 2;
2293 switch (resume_state) {
2295 goto resume_point_FOO;
2302 * WARNING: this means that any line in this function that contains a
2303 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2304 * regmatch() using gotos instead. Thus the values of any local variables
2305 * not saved in the regmatch_state structure will have been lost when
2306 * execution resumes on the next line .
2308 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2309 * PL_regmatch_state always points to the currently active state, and
2310 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2311 * The first time regmatch is called, the first slab is allocated, and is
2312 * never freed until interpreter desctruction. When the slab is full,
2313 * a new one is allocated chained to the end. At exit from regmatch, slabs
2314 * allocated since entry are freed.
2317 /* *** every FOO_fail should = FOO+1 */
2318 #define TRIE_next (REGNODE_MAX+1)
2319 #define TRIE_next_fail (REGNODE_MAX+2)
2320 #define EVAL_A (REGNODE_MAX+3)
2321 #define EVAL_A_fail (REGNODE_MAX+4)
2322 #define resume_CURLYX (REGNODE_MAX+5)
2323 #define resume_WHILEM1 (REGNODE_MAX+6)
2324 #define resume_WHILEM2 (REGNODE_MAX+7)
2325 #define resume_WHILEM3 (REGNODE_MAX+8)
2326 #define resume_WHILEM4 (REGNODE_MAX+9)
2327 #define resume_WHILEM5 (REGNODE_MAX+10)
2328 #define resume_WHILEM6 (REGNODE_MAX+11)
2329 #define BRANCH_next (REGNODE_MAX+12)
2330 #define BRANCH_next_fail (REGNODE_MAX+13)
2331 #define CURLYM_A (REGNODE_MAX+14)
2332 #define CURLYM_A_fail (REGNODE_MAX+15)
2333 #define CURLYM_B (REGNODE_MAX+16)
2334 #define CURLYM_B_fail (REGNODE_MAX+17)
2335 #define IFMATCH_A (REGNODE_MAX+18)
2336 #define IFMATCH_A_fail (REGNODE_MAX+19)
2337 #define CURLY_B_min_known (REGNODE_MAX+20)
2338 #define CURLY_B_min_known_fail (REGNODE_MAX+21)
2339 #define CURLY_B_min (REGNODE_MAX+22)
2340 #define CURLY_B_min_fail (REGNODE_MAX+23)
2341 #define CURLY_B_max (REGNODE_MAX+24)
2342 #define CURLY_B_max_fail (REGNODE_MAX+25)
2345 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2350 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2352 const int docolor = *PL_colors[0];
2353 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2354 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2355 /* The part of the string before starttry has one color
2356 (pref0_len chars), between starttry and current
2357 position another one (pref_len - pref0_len chars),
2358 after the current position the third one.
2359 We assume that pref0_len <= pref_len, otherwise we
2360 decrease pref0_len. */
2361 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2362 ? (5 + taill) - l : locinput - PL_bostr;
2365 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2367 pref0_len = pref_len - (locinput - PL_reg_starttry);
2368 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2369 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2370 ? (5 + taill) - pref_len : PL_regeol - locinput);
2371 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2375 if (pref0_len > pref_len)
2376 pref0_len = pref_len;
2378 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2380 RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2381 (locinput - pref_len),pref0_len, 60);
2383 RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2384 (locinput - pref_len + pref0_len),
2385 pref_len - pref0_len, 60);
2387 RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2388 locinput, PL_regeol - locinput, 60);
2390 PerlIO_printf(Perl_debug_log,
2391 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2392 (IV)(locinput - PL_bostr),
2399 (docolor ? "" : "> <"),
2403 15 - l - pref_len + 1,
2410 STATIC I32 /* 0 failure, 1 success */
2411 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2413 #if PERL_VERSION < 9
2417 register const bool do_utf8 = PL_reg_match_utf8;
2418 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2420 regexp *rex = reginfo->prog;
2422 regmatch_slab *orig_slab;
2423 regmatch_state *orig_state;
2425 /* the current state. This is a cached copy of PL_regmatch_state */
2426 register regmatch_state *st;
2428 /* cache heavy used fields of st in registers */
2429 register regnode *scan;
2430 register regnode *next;
2431 register I32 n = 0; /* initialize to shut up compiler warning */
2432 register char *locinput = PL_reginput;
2434 /* these variables are NOT saved during a recusive RFEGMATCH: */
2435 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2436 bool result; /* return value of S_regmatch */
2437 int depth = 0; /* depth of recursion */
2438 regmatch_state *yes_state = NULL; /* state to pop to on success of
2443 GET_RE_DEBUG_FLAGS_DECL;
2447 /* on first ever call to regmatch, allocate first slab */
2448 if (!PL_regmatch_slab) {
2449 Newx(PL_regmatch_slab, 1, regmatch_slab);
2450 PL_regmatch_slab->prev = NULL;
2451 PL_regmatch_slab->next = NULL;
2452 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2455 /* remember current high-water mark for exit */
2456 /* XXX this should be done with SAVE* instead */
2457 orig_slab = PL_regmatch_slab;
2458 orig_state = PL_regmatch_state;
2460 /* grab next free state slot */
2461 st = ++PL_regmatch_state;
2462 if (st > SLAB_LAST(PL_regmatch_slab))
2463 st = PL_regmatch_state = S_push_slab(aTHX);
2469 /* Note that nextchr is a byte even in UTF */
2470 nextchr = UCHARAT(locinput);
2472 while (scan != NULL) {
2475 SV * const prop = sv_newmortal();
2476 dump_exec_pos( locinput, scan, do_utf8 );
2477 regprop(rex, prop, scan);
2479 PerlIO_printf(Perl_debug_log,
2480 "%3"IVdf":%*s%s(%"IVdf")\n",
2481 (IV)(scan - rex->program), PL_regindent*2, "",
2483 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2486 next = scan + NEXT_OFF(scan);
2489 state_num = OP(scan);
2492 switch (state_num) {
2494 if (locinput == PL_bostr)
2496 /* reginfo->till = reginfo->bol; */
2501 if (locinput == PL_bostr ||
2502 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2508 if (locinput == PL_bostr)
2512 if (locinput == reginfo->ganch)
2518 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2523 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2525 if (PL_regeol - locinput > 1)
2529 if (PL_regeol != locinput)
2533 if (!nextchr && locinput >= PL_regeol)
2536 locinput += PL_utf8skip[nextchr];
2537 if (locinput > PL_regeol)
2539 nextchr = UCHARAT(locinput);
2542 nextchr = UCHARAT(++locinput);
2545 if (!nextchr && locinput >= PL_regeol)
2547 nextchr = UCHARAT(++locinput);
2550 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2553 locinput += PL_utf8skip[nextchr];
2554 if (locinput > PL_regeol)
2556 nextchr = UCHARAT(locinput);
2559 nextchr = UCHARAT(++locinput);
2563 #define ST st->u.trie
2567 /* what type of TRIE am I? (utf8 makes this contextual) */
2568 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2569 trie_type = do_utf8 ?
2570 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2573 /* what trie are we using right now */
2574 reg_trie_data * const trie
2575 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2576 U32 state = trie->startstate;
2578 U8 *uc = ( U8* )locinput;
2584 U8 *uscan = (U8*)NULL;
2586 SV *sv_accept_buff = NULL;
2587 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2589 ST.accepted = 0; /* how many accepting states we have seen */
2595 if (trie->bitmap && trie_type != trie_utf8_fold &&
2596 !TRIE_BITMAP_TEST(trie,*locinput)
2598 if (trie->states[ state ].wordnum) {
2600 PerlIO_printf(Perl_debug_log,
2601 "%*s %smatched empty string...%s\n",
2602 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2607 PerlIO_printf(Perl_debug_log,
2608 "%*s %sfailed to match start class...%s\n",
2609 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2616 traverse the TRIE keeping track of all accepting states
2617 we transition through until we get to a failing node.
2620 while ( state && uc <= (U8*)PL_regeol ) {
2622 if (trie->states[ state ].wordnum) {
2623 if (!ST.accepted ) {
2626 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2627 sv_accept_buff=newSV(bufflen *
2628 sizeof(reg_trie_accepted) - 1);
2629 SvCUR_set(sv_accept_buff,
2630 sizeof(reg_trie_accepted));
2631 SvPOK_on(sv_accept_buff);
2632 sv_2mortal(sv_accept_buff);
2635 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2638 if (ST.accepted >= bufflen) {
2640 ST.accept_buff =(reg_trie_accepted*)
2641 SvGROW(sv_accept_buff,
2642 bufflen * sizeof(reg_trie_accepted));
2644 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2645 + sizeof(reg_trie_accepted));
2647 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2648 ST.accept_buff[ST.accepted].endpos = uc;
2652 base = trie->states[ state ].trans.base;
2654 DEBUG_TRIE_EXECUTE_r({
2655 dump_exec_pos( (char *)uc, scan, do_utf8 );
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2658 2+PL_regindent * 2, "", PL_colors[4],
2659 (UV)state, (UV)base, (UV)ST.accepted );
2663 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2664 uvc, charid, foldlen, foldbuf, uniflags);
2667 (base + charid > trie->uniquecharcount )
2668 && (base + charid - 1 - trie->uniquecharcount
2670 && trie->trans[base + charid - 1 -
2671 trie->uniquecharcount].check == state)
2673 state = trie->trans[base + charid - 1 -
2674 trie->uniquecharcount ].next;
2685 DEBUG_TRIE_EXECUTE_r(
2686 PerlIO_printf( Perl_debug_log,
2687 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2688 charid, uvc, (UV)state, PL_colors[5] );
2695 PerlIO_printf( Perl_debug_log,
2696 "%*s %sgot %"IVdf" possible matches%s\n",
2697 REPORT_CODE_OFF + PL_regindent * 2, "",
2698 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2704 case TRIE_next_fail: /* we failed - try next alterative */
2706 if ( ST.accepted == 1 ) {
2707 /* only one choice left - just continue */
2709 reg_trie_data * const trie
2710 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2711 SV ** const tmp = RX_DEBUG(reginfo->prog)
2712 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2714 PerlIO_printf( Perl_debug_log,
2715 "%*s %sonly one match left: #%d <%s>%s\n",
2716 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2717 ST.accept_buff[ 0 ].wordnum,
2718 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2721 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2722 /* in this case we free tmps/leave before we call regmatch
2723 as we wont be using accept_buff again. */
2726 locinput = PL_reginput;
2727 nextchr = UCHARAT(locinput);
2729 continue; /* execute rest of RE */
2732 if (!ST.accepted-- ) {
2739 There are at least two accepting states left. Presumably
2740 the number of accepting states is going to be low,
2741 typically two. So we simply scan through to find the one
2742 with lowest wordnum. Once we find it, we swap the last
2743 state into its place and decrement the size. We then try to
2744 match the rest of the pattern at the point where the word
2745 ends. If we succeed, control just continues along the
2746 regex; if we fail we return here to try the next accepting
2753 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2754 DEBUG_TRIE_EXECUTE_r(
2755 PerlIO_printf( Perl_debug_log,
2756 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2757 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2758 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2759 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2762 if (ST.accept_buff[cur].wordnum <
2763 ST.accept_buff[best].wordnum)
2768 reg_trie_data * const trie
2769 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2770 SV ** const tmp = RX_DEBUG(reginfo->prog)
2771 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2773 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2774 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2775 ST.accept_buff[best].wordnum,
2776 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2780 if ( best<ST.accepted ) {
2781 reg_trie_accepted tmp = ST.accept_buff[ best ];
2782 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2783 ST.accept_buff[ ST.accepted ] = tmp;
2786 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2788 PUSH_STATE_GOTO(TRIE_next, ST.B);
2794 char *s = STRING(scan);
2795 st->ln = STR_LEN(scan);
2796 if (do_utf8 != UTF) {
2797 /* The target and the pattern have differing utf8ness. */
2799 const char * const e = s + st->ln;
2802 /* The target is utf8, the pattern is not utf8. */
2807 if (NATIVE_TO_UNI(*(U8*)s) !=
2808 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2816 /* The target is not utf8, the pattern is utf8. */
2821 if (NATIVE_TO_UNI(*((U8*)l)) !=
2822 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2830 nextchr = UCHARAT(locinput);
2833 /* The target and the pattern have the same utf8ness. */
2834 /* Inline the first character, for speed. */
2835 if (UCHARAT(s) != nextchr)
2837 if (PL_regeol - locinput < st->ln)
2839 if (st->ln > 1 && memNE(s, locinput, st->ln))
2842 nextchr = UCHARAT(locinput);
2846 PL_reg_flags |= RF_tainted;
2849 char * const s = STRING(scan);
2850 st->ln = STR_LEN(scan);
2852 if (do_utf8 || UTF) {
2853 /* Either target or the pattern are utf8. */
2854 const char * const l = locinput;
2855 char *e = PL_regeol;
2857 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2858 l, &e, 0, do_utf8)) {
2859 /* One more case for the sharp s:
2860 * pack("U0U*", 0xDF) =~ /ss/i,
2861 * the 0xC3 0x9F are the UTF-8
2862 * byte sequence for the U+00DF. */
2864 toLOWER(s[0]) == 's' &&
2866 toLOWER(s[1]) == 's' &&
2873 nextchr = UCHARAT(locinput);
2877 /* Neither the target and the pattern are utf8. */
2879 /* Inline the first character, for speed. */
2880 if (UCHARAT(s) != nextchr &&
2881 UCHARAT(s) != ((OP(scan) == EXACTF)
2882 ? PL_fold : PL_fold_locale)[nextchr])
2884 if (PL_regeol - locinput < st->ln)
2886 if (st->ln > 1 && (OP(scan) == EXACTF
2887 ? ibcmp(s, locinput, st->ln)
2888 : ibcmp_locale(s, locinput, st->ln)))
2891 nextchr = UCHARAT(locinput);
2896 STRLEN inclasslen = PL_regeol - locinput;
2898 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
2900 if (locinput >= PL_regeol)
2902 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2903 nextchr = UCHARAT(locinput);
2908 nextchr = UCHARAT(locinput);
2909 if (!REGINCLASS(rex, scan, (U8*)locinput))
2911 if (!nextchr && locinput >= PL_regeol)
2913 nextchr = UCHARAT(++locinput);
2917 /* If we might have the case of the German sharp s
2918 * in a casefolding Unicode character class. */
2920 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2921 locinput += SHARP_S_SKIP;
2922 nextchr = UCHARAT(locinput);
2928 PL_reg_flags |= RF_tainted;
2934 LOAD_UTF8_CHARCLASS_ALNUM();
2935 if (!(OP(scan) == ALNUM
2936 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2937 : isALNUM_LC_utf8((U8*)locinput)))
2941 locinput += PL_utf8skip[nextchr];
2942 nextchr = UCHARAT(locinput);
2945 if (!(OP(scan) == ALNUM
2946 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2948 nextchr = UCHARAT(++locinput);
2951 PL_reg_flags |= RF_tainted;
2954 if (!nextchr && locinput >= PL_regeol)
2957 LOAD_UTF8_CHARCLASS_ALNUM();
2958 if (OP(scan) == NALNUM
2959 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2960 : isALNUM_LC_utf8((U8*)locinput))
2964 locinput += PL_utf8skip[nextchr];
2965 nextchr = UCHARAT(locinput);
2968 if (OP(scan) == NALNUM
2969 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2971 nextchr = UCHARAT(++locinput);
2975 PL_reg_flags |= RF_tainted;
2979 /* was last char in word? */
2981 if (locinput == PL_bostr)
2984 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2986 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
2988 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2989 st->ln = isALNUM_uni(st->ln);
2990 LOAD_UTF8_CHARCLASS_ALNUM();
2991 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2994 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
2995 n = isALNUM_LC_utf8((U8*)locinput);
2999 st->ln = (locinput != PL_bostr) ?
3000 UCHARAT(locinput - 1) : '\n';
3001 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3002 st->ln = isALNUM(st->ln);
3003 n = isALNUM(nextchr);
3006 st->ln = isALNUM_LC(st->ln);
3007 n = isALNUM_LC(nextchr);
3010 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3011 OP(scan) == BOUNDL))
3015 PL_reg_flags |= RF_tainted;
3021 if (UTF8_IS_CONTINUED(nextchr)) {
3022 LOAD_UTF8_CHARCLASS_SPACE();
3023 if (!(OP(scan) == SPACE
3024 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3025 : isSPACE_LC_utf8((U8*)locinput)))
3029 locinput += PL_utf8skip[nextchr];
3030 nextchr = UCHARAT(locinput);
3033 if (!(OP(scan) == SPACE
3034 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3036 nextchr = UCHARAT(++locinput);
3039 if (!(OP(scan) == SPACE
3040 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3042 nextchr = UCHARAT(++locinput);
3046 PL_reg_flags |= RF_tainted;
3049 if (!nextchr && locinput >= PL_regeol)
3052 LOAD_UTF8_CHARCLASS_SPACE();
3053 if (OP(scan) == NSPACE
3054 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3055 : isSPACE_LC_utf8((U8*)locinput))
3059 locinput += PL_utf8skip[nextchr];
3060 nextchr = UCHARAT(locinput);
3063 if (OP(scan) == NSPACE
3064 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3066 nextchr = UCHARAT(++locinput);
3069 PL_reg_flags |= RF_tainted;
3075 LOAD_UTF8_CHARCLASS_DIGIT();
3076 if (!(OP(scan) == DIGIT
3077 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3078 : isDIGIT_LC_utf8((U8*)locinput)))
3082 locinput += PL_utf8skip[nextchr];
3083 nextchr = UCHARAT(locinput);
3086 if (!(OP(scan) == DIGIT
3087 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3089 nextchr = UCHARAT(++locinput);
3092 PL_reg_flags |= RF_tainted;
3095 if (!nextchr && locinput >= PL_regeol)
3098 LOAD_UTF8_CHARCLASS_DIGIT();
3099 if (OP(scan) == NDIGIT
3100 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3101 : isDIGIT_LC_utf8((U8*)locinput))
3105 locinput += PL_utf8skip[nextchr];
3106 nextchr = UCHARAT(locinput);
3109 if (OP(scan) == NDIGIT
3110 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3112 nextchr = UCHARAT(++locinput);
3115 if (locinput >= PL_regeol)
3118 LOAD_UTF8_CHARCLASS_MARK();
3119 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3121 locinput += PL_utf8skip[nextchr];
3122 while (locinput < PL_regeol &&
3123 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3124 locinput += UTF8SKIP(locinput);
3125 if (locinput > PL_regeol)
3130 nextchr = UCHARAT(locinput);
3133 PL_reg_flags |= RF_tainted;
3138 n = ARG(scan); /* which paren pair */
3139 st->ln = PL_regstartp[n];
3140 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3141 if ((I32)*PL_reglastparen < n || st->ln == -1)
3142 sayNO; /* Do not match unless seen CLOSEn. */
3143 if (st->ln == PL_regendp[n])
3146 s = PL_bostr + st->ln;
3147 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3149 const char *e = PL_bostr + PL_regendp[n];
3151 * Note that we can't do the "other character" lookup trick as
3152 * in the 8-bit case (no pun intended) because in Unicode we
3153 * have to map both upper and title case to lower case.
3155 if (OP(scan) == REFF) {
3157 STRLEN ulen1, ulen2;
3158 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3159 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3163 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3164 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3165 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3172 nextchr = UCHARAT(locinput);
3176 /* Inline the first character, for speed. */
3177 if (UCHARAT(s) != nextchr &&
3179 (UCHARAT(s) != ((OP(scan) == REFF
3180 ? PL_fold : PL_fold_locale)[nextchr]))))
3182 st->ln = PL_regendp[n] - st->ln;
3183 if (locinput + st->ln > PL_regeol)
3185 if (st->ln > 1 && (OP(scan) == REF
3186 ? memNE(s, locinput, st->ln)
3188 ? ibcmp(s, locinput, st->ln)
3189 : ibcmp_locale(s, locinput, st->ln))))
3192 nextchr = UCHARAT(locinput);
3203 #define ST st->u.eval
3205 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3209 /* execute the code in the {...} */
3211 SV ** const before = SP;
3212 OP_4tree * const oop = PL_op;
3213 COP * const ocurcop = PL_curcop;
3217 PL_op = (OP_4tree*)rex->data->data[n];
3218 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3219 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3220 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3222 CALLRUNOPS(aTHX); /* Scalar context. */
3225 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3232 PAD_RESTORE_LOCAL(old_comppad);
3233 PL_curcop = ocurcop;
3236 sv_setsv(save_scalar(PL_replgv), ret);
3240 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3243 /* extract RE object from returned value; compiling if
3248 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3249 mg = mg_find(sv, PERL_MAGIC_qr);
3250 else if (SvSMAGICAL(ret)) {
3251 if (SvGMAGICAL(ret))
3252 sv_unmagic(ret, PERL_MAGIC_qr);
3254 mg = mg_find(ret, PERL_MAGIC_qr);
3258 re = (regexp *)mg->mg_obj;
3259 (void)ReREFCNT_inc(re);
3263 const char * const t = SvPV_const(ret, len);
3265 const I32 osize = PL_regsize;
3268 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3269 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3271 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3273 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3279 /* run the pattern returned from (??{...}) */
3282 PerlIO_printf(Perl_debug_log,
3283 "Entering embedded \"%s%.60s%s%s\"\n",
3287 (strlen(re->precomp) > 60 ? "..." : ""))
3290 ST.cp = regcppush(0); /* Save *all* the positions. */
3291 REGCP_SET(ST.lastcp);
3292 *PL_reglastparen = 0;
3293 *PL_reglastcloseparen = 0;
3294 PL_reginput = locinput;
3296 /* XXXX This is too dramatic a measure... */
3300 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3301 ((re->reganch & ROPT_UTF8) != 0);
3302 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3307 /* now continue from first node in postoned RE */
3308 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3311 /* /(?(?{...})X|Y)/ */
3312 st->sw = SvTRUE(ret);
3317 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3319 PL_reg_flags ^= RF_utf8;
3322 /* XXXX This is too dramatic a measure... */
3324 /* Restore parens of the caller without popping the
3327 const I32 tmp = PL_savestack_ix;
3328 PL_savestack_ix = ST.lastcp;
3330 PL_savestack_ix = tmp;
3332 PL_reginput = locinput;
3333 /* continue at the node following the (??{...}) */
3337 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3338 /* Restore state to the outer re then re-throw the failure */
3340 PL_reg_flags ^= RF_utf8;
3344 /* XXXX This is too dramatic a measure... */
3347 PL_reginput = locinput;
3348 REGCP_UNWIND(ST.lastcp);
3355 n = ARG(scan); /* which paren pair */
3356 PL_reg_start_tmp[n] = locinput;
3361 n = ARG(scan); /* which paren pair */
3362 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3363 PL_regendp[n] = locinput - PL_bostr;
3364 if (n > (I32)*PL_reglastparen)
3365 *PL_reglastparen = n;
3366 *PL_reglastcloseparen = n;
3369 n = ARG(scan); /* which paren pair */
3370 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3373 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3375 next = NEXTOPER(NEXTOPER(scan));
3377 next = scan + ARG(scan);
3378 if (OP(next) == IFTHEN) /* Fake one. */
3379 next = NEXTOPER(NEXTOPER(next));
3383 st->logical = scan->flags;
3385 /*******************************************************************
3386 cc points to the regmatch_state associated with the most recent CURLYX.
3387 This struct contains info about the innermost (...)* loop (an
3388 "infoblock"), and a pointer to the next outer cc.
3390 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3392 1) After matching Y, regnode for CURLYX is processed;
3394 2) This regnode populates cc, and calls regmatch() recursively
3395 with the starting point at WHILEM node;
3397 3) Each hit of WHILEM node tries to match A and Z (in the order
3398 depending on the current iteration, min/max of {min,max} and
3399 greediness). The information about where are nodes for "A"
3400 and "Z" is read from cc, as is info on how many times "A"
3401 was already matched, and greediness.
3403 4) After A matches, the same WHILEM node is hit again.
3405 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3406 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3407 resets cc, since this Y(A)*Z can be a part of some other loop:
3408 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3409 of the external loop.
3411 Currently present infoblocks form a tree with a stem formed by st->cc
3412 and whatever it mentions via ->next, and additional attached trees
3413 corresponding to temporarily unset infoblocks as in "5" above.
3415 In the following picture, infoblocks for outer loop of
3416 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3417 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3418 infoblocks are drawn below the "reset" infoblock.
3420 In fact in the picture below we do not show failed matches for Z and T
3421 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3422 more obvious *why* one needs to *temporary* unset infoblocks.]
3424 Matched REx position InfoBlocks Comment
3428 Y A)*?Z)*?T x <- O <- I
3429 YA )*?Z)*?T x <- O <- I
3430 YA A)*?Z)*?T x <- O <- I
3431 YAA )*?Z)*?T x <- O <- I
3432 YAA Z)*?T x <- O # Temporary unset I
3435 YAAZ Y(A)*?Z)*?T x <- O
3438 YAAZY (A)*?Z)*?T x <- O
3441 YAAZY A)*?Z)*?T x <- O <- I
3444 YAAZYA )*?Z)*?T x <- O <- I
3447 YAAZYA Z)*?T x <- O # Temporary unset I
3453 YAAZYAZ T x # Temporary unset O
3460 *******************************************************************/
3463 /* No need to save/restore up to this paren */
3464 I32 parenfloor = scan->flags;
3468 CURLYX and WHILEM are always paired: they're the moral
3469 equivalent of pp_enteriter anbd pp_iter.
3471 The only time next could be null is if the node tree is
3472 corrupt. This was mentioned on p5p a few days ago.
3474 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3475 So we'll assert that this is true:
3478 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3480 /* XXXX Probably it is better to teach regpush to support
3481 parenfloor > PL_regsize... */
3482 if (parenfloor > (I32)*PL_reglastparen)
3483 parenfloor = *PL_reglastparen; /* Pessimization... */
3485 st->u.curlyx.cp = PL_savestack_ix;
3486 st->u.curlyx.outercc = st->cc;
3488 /* these fields contain the state of the current curly.
3489 * they are accessed by subsequent WHILEMs;
3490 * cur and lastloc are also updated by WHILEM */
3491 st->u.curlyx.parenfloor = parenfloor;
3492 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3493 st->u.curlyx.min = ARG1(scan);
3494 st->u.curlyx.max = ARG2(scan);
3495 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3496 st->u.curlyx.lastloc = 0;
3497 /* st->next and st->minmod are also read by WHILEM */
3499 PL_reginput = locinput;
3500 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3501 /*** all unsaved local vars undefined at this point */
3502 regcpblow(st->u.curlyx.cp);
3503 st->cc = st->u.curlyx.outercc;
3509 * This is really hard to understand, because after we match
3510 * what we're trying to match, we must make sure the rest of
3511 * the REx is going to match for sure, and to do that we have
3512 * to go back UP the parse tree by recursing ever deeper. And
3513 * if it fails, we have to reset our parent's current state
3514 * that we can try again after backing off.
3519 st->cc gets initialised by CURLYX ready for use by WHILEM.
3520 So again, unless somethings been corrupted, st->cc cannot
3521 be null at that point in WHILEM.
3523 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3524 So we'll assert that this is true:
3527 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3528 st->u.whilem.cache_offset = 0;
3529 st->u.whilem.cache_bit = 0;
3531 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3532 PL_reginput = locinput;
3535 PerlIO_printf(Perl_debug_log,
3536 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3537 REPORT_CODE_OFF+PL_regindent*2, "",
3538 (long)n, (long)st->cc->u.curlyx.min,
3539 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3542 /* If degenerate scan matches "", assume scan done. */
3544 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3545 st->u.whilem.savecc = st->cc;
3546 st->cc = st->cc->u.curlyx.outercc;
3548 st->ln = st->cc->u.curlyx.cur;
3550 PerlIO_printf(Perl_debug_log,
3551 "%*s empty match detected, try continuation...\n",
3552 REPORT_CODE_OFF+PL_regindent*2, "")
3554 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3555 /*** all unsaved local vars undefined at this point */
3556 st->cc = st->u.whilem.savecc;
3559 if (st->cc->u.curlyx.outercc)
3560 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3564 /* First just match a string of min scans. */
3566 if (n < st->cc->u.curlyx.min) {
3567 st->cc->u.curlyx.cur = n;
3568 st->cc->u.curlyx.lastloc = locinput;
3569 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3570 /*** all unsaved local vars undefined at this point */
3573 st->cc->u.curlyx.cur = n - 1;
3574 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3579 /* Check whether we already were at this position.
3580 Postpone detection until we know the match is not
3581 *that* much linear. */
3582 if (!PL_reg_maxiter) {
3583 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3584 /* possible overflow for long strings and many CURLYX's */
3585 if (PL_reg_maxiter < 0)
3586 PL_reg_maxiter = I32_MAX;
3587 PL_reg_leftiter = PL_reg_maxiter;
3589 if (PL_reg_leftiter-- == 0) {
3590 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3591 if (PL_reg_poscache) {
3592 if ((I32)PL_reg_poscache_size < size) {
3593 Renew(PL_reg_poscache, size, char);
3594 PL_reg_poscache_size = size;
3596 Zero(PL_reg_poscache, size, char);
3599 PL_reg_poscache_size = size;
3600 Newxz(PL_reg_poscache, size, char);
3603 PerlIO_printf(Perl_debug_log,
3604 "%sDetected a super-linear match, switching on caching%s...\n",
3605 PL_colors[4], PL_colors[5])
3608 if (PL_reg_leftiter < 0) {
3609 st->u.whilem.cache_offset = locinput - PL_bostr;
3611 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3612 + st->u.whilem.cache_offset * (scan->flags>>4);
3613 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3614 st->u.whilem.cache_offset /= 8;
3615 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3617 PerlIO_printf(Perl_debug_log,
3618 "%*s already tried at this position...\n",
3619 REPORT_CODE_OFF+PL_regindent*2, "")
3621 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3622 /* cache records success */
3625 /* cache records failure */
3631 /* Prefer next over scan for minimal matching. */
3633 if (st->cc->minmod) {
3634 st->u.whilem.savecc = st->cc;
3635 st->cc = st->cc->u.curlyx.outercc;
3637 st->ln = st->cc->u.curlyx.cur;
3638 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3639 REGCP_SET(st->u.whilem.lastcp);
3640 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3641 /*** all unsaved local vars undefined at this point */
3642 st->cc = st->u.whilem.savecc;
3644 regcpblow(st->u.whilem.cp);
3645 CACHEsayYES; /* All done. */
3647 REGCP_UNWIND(st->u.whilem.lastcp);
3649 if (st->cc->u.curlyx.outercc)
3650 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3652 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3653 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3654 && !(PL_reg_flags & RF_warned)) {
3655 PL_reg_flags |= RF_warned;
3656 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3657 "Complex regular subexpression recursion",
3664 PerlIO_printf(Perl_debug_log,
3665 "%*s trying longer...\n",
3666 REPORT_CODE_OFF+PL_regindent*2, "")
3668 /* Try scanning more and see if it helps. */
3669 PL_reginput = locinput;
3670 st->cc->u.curlyx.cur = n;
3671 st->cc->u.curlyx.lastloc = locinput;
3672 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3673 REGCP_SET(st->u.whilem.lastcp);
3674 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3675 /*** all unsaved local vars undefined at this point */
3677 regcpblow(st->u.whilem.cp);
3680 REGCP_UNWIND(st->u.whilem.lastcp);
3682 st->cc->u.curlyx.cur = n - 1;
3683 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3687 /* Prefer scan over next for maximal matching. */
3689 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3690 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3691 st->cc->u.curlyx.cur = n;
3692 st->cc->u.curlyx.lastloc = locinput;
3693 REGCP_SET(st->u.whilem.lastcp);
3694 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3695 /*** all unsaved local vars undefined at this point */
3697 regcpblow(st->u.whilem.cp);
3700 REGCP_UNWIND(st->u.whilem.lastcp);
3701 regcppop(rex); /* Restore some previous $<digit>s? */
3702 PL_reginput = locinput;
3704 PerlIO_printf(Perl_debug_log,
3705 "%*s failed, try continuation...\n",
3706 REPORT_CODE_OFF+PL_regindent*2, "")
3709 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3710 && !(PL_reg_flags & RF_warned)) {
3711 PL_reg_flags |= RF_warned;
3712 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3713 "Complex regular subexpression recursion",
3717 /* Failed deeper matches of scan, so see if this one works. */
3718 st->u.whilem.savecc = st->cc;
3719 st->cc = st->cc->u.curlyx.outercc;
3721 st->ln = st->cc->u.curlyx.cur;
3722 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
3723 /*** all unsaved local vars undefined at this point */
3724 st->cc = st->u.whilem.savecc;
3727 if (st->cc->u.curlyx.outercc)
3728 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3729 st->cc->u.curlyx.cur = n - 1;
3730 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3736 #define ST st->u.branch
3738 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3739 next = scan + ARG(scan);
3742 scan = NEXTOPER(scan);
3745 case BRANCH: /* /(...|A|...)/ */
3746 scan = NEXTOPER(scan); /* scan now points to inner node */
3747 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3748 /* last branch; skip state push and jump direct to node */
3750 ST.lastparen = *PL_reglastparen;
3751 ST.next_branch = next;
3753 PL_reginput = locinput;
3755 /* Now go into the branch */
3756 PUSH_STATE_GOTO(BRANCH_next, scan);
3759 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3760 REGCP_UNWIND(ST.cp);
3761 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3763 *PL_reglastparen = n;
3764 scan = ST.next_branch;
3765 /* no more branches? */
3766 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3768 continue; /* execute next BRANCH[J] op */
3776 #define ST st->u.curlym
3778 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3780 /* This is an optimisation of CURLYX that enables us to push
3781 * only a single backtracking state, no matter now many matches
3782 * there are in {m,n}. It relies on the pattern being constant
3783 * length, with no parens to influence future backrefs
3787 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3789 /* if paren positive, emulate an OPEN/CLOSE around A */
3791 I32 paren = ST.me->flags;
3792 if (paren > PL_regsize)
3794 if (paren > (I32)*PL_reglastparen)
3795 *PL_reglastparen = paren;
3796 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3802 ST.minmod = st->minmod;
3804 ST.c1 = CHRTEST_UNINIT;
3807 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3810 curlym_do_A: /* execute the A in /A{m,n}B/ */
3811 PL_reginput = locinput;
3812 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3815 case CURLYM_A: /* we've just matched an A */
3816 locinput = st->locinput;
3817 nextchr = UCHARAT(locinput);
3820 /* after first match, determine A's length: u.curlym.alen */
3821 if (ST.count == 1) {
3822 if (PL_reg_match_utf8) {
3824 while (s < PL_reginput) {
3830 ST.alen = PL_reginput - locinput;
3833 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3836 PerlIO_printf(Perl_debug_log,
3837 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3838 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3839 (IV) ST.count, (IV)ST.alen)
3842 locinput = PL_reginput;
3843 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3844 goto curlym_do_A; /* try to match another A */
3845 goto curlym_do_B; /* try to match B */
3847 case CURLYM_A_fail: /* just failed to match an A */
3848 REGCP_UNWIND(ST.cp);
3849 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3852 curlym_do_B: /* execute the B in /A{m,n}B/ */
3853 PL_reginput = locinput;
3854 if (ST.c1 == CHRTEST_UNINIT) {
3855 /* calculate c1 and c2 for possible match of 1st char
3856 * following curly */
3857 ST.c1 = ST.c2 = CHRTEST_VOID;
3858 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3859 regnode *text_node = ST.B;
3860 if (! HAS_TEXT(text_node))
3861 FIND_NEXT_IMPT(text_node);
3862 if (HAS_TEXT(text_node)
3863 && PL_regkind[OP(text_node)] != REF)
3865 ST.c1 = (U8)*STRING(text_node);
3867 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3869 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3870 ? PL_fold_locale[ST.c1]
3877 PerlIO_printf(Perl_debug_log,
3878 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
3879 (int)(REPORT_CODE_OFF+PL_regindent*2),
3882 if (ST.c1 != CHRTEST_VOID
3883 && UCHARAT(PL_reginput) != ST.c1
3884 && UCHARAT(PL_reginput) != ST.c2)
3886 /* simulate B failing */
3887 state_num = CURLYM_B_fail;
3888 goto reenter_switch;
3892 /* mark current A as captured */
3893 I32 paren = ST.me->flags;
3896 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3897 PL_regendp[paren] = PL_reginput - PL_bostr;
3900 PL_regendp[paren] = -1;
3902 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
3905 case CURLYM_B_fail: /* just failed to match a B */
3906 REGCP_UNWIND(ST.cp);
3908 if (ST.count == ARG2(ST.me) /* max */)
3910 goto curlym_do_A; /* try to match a further A */
3912 /* backtrack one A */
3913 if (ST.count == ARG1(ST.me) /* min */)
3916 locinput = HOPc(locinput, -ST.alen);
3917 goto curlym_do_B; /* try to match B */
3920 #define ST st->u.curly
3922 #define CURLY_SETPAREN(paren, success) \
3925 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3926 PL_regendp[paren] = locinput - PL_bostr; \
3929 PL_regendp[paren] = -1; \
3932 case STAR: /* /A*B/ where A is width 1 */
3936 scan = NEXTOPER(scan);
3938 case PLUS: /* /A+B/ where A is width 1 */
3942 scan = NEXTOPER(scan);
3944 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3945 ST.paren = scan->flags; /* Which paren to set */
3946 if (ST.paren > PL_regsize)
3947 PL_regsize = ST.paren;
3948 if (ST.paren > (I32)*PL_reglastparen)
3949 *PL_reglastparen = ST.paren;
3950 ST.min = ARG1(scan); /* min to match */
3951 ST.max = ARG2(scan); /* max to match */
3952 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3954 case CURLY: /* /A{m,n}B/ where A is width 1 */
3956 ST.min = ARG1(scan); /* min to match */
3957 ST.max = ARG2(scan); /* max to match */
3958 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3961 * Lookahead to avoid useless match attempts
3962 * when we know what character comes next.
3964 * Used to only do .*x and .*?x, but now it allows
3965 * for )'s, ('s and (?{ ... })'s to be in the way
3966 * of the quantifier and the EXACT-like node. -- japhy
3969 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3971 if (HAS_TEXT(next) || JUMPABLE(next)) {
3973 regnode *text_node = next;
3975 if (! HAS_TEXT(text_node))
3976 FIND_NEXT_IMPT(text_node);
3978 if (! HAS_TEXT(text_node))
3979 ST.c1 = ST.c2 = CHRTEST_VOID;
3981 if (PL_regkind[OP(text_node)] == REF) {
3982 ST.c1 = ST.c2 = CHRTEST_VOID;
3983 goto assume_ok_easy;
3986 s = (U8*)STRING(text_node);
3990 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3991 ST.c2 = PL_fold[ST.c1];
3992 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3993 ST.c2 = PL_fold_locale[ST.c1];
3996 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3997 STRLEN ulen1, ulen2;
3998 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3999 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4001 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4002 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4004 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4006 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4010 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4017 ST.c1 = ST.c2 = CHRTEST_VOID;
4022 PL_reginput = locinput;
4025 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4028 locinput = PL_reginput;
4030 if (ST.c1 == CHRTEST_VOID)
4031 goto curly_try_B_min;
4033 ST.oldloc = locinput;
4035 /* set ST.maxpos to the furthest point along the
4036 * string that could possibly match */
4037 if (ST.max == REG_INFTY) {
4038 ST.maxpos = PL_regeol - 1;
4040 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4044 int m = ST.max - ST.min;
4045 for (ST.maxpos = locinput;
4046 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4047 ST.maxpos += UTF8SKIP(ST.maxpos);
4050 ST.maxpos = locinput + ST.max - ST.min;
4051 if (ST.maxpos >= PL_regeol)
4052 ST.maxpos = PL_regeol - 1;
4054 goto curly_try_B_min_known;
4058 ST.count = regrepeat(rex, ST.A, ST.max);
4059 locinput = PL_reginput;
4060 if (ST.count < ST.min)
4062 if ((ST.count > ST.min)
4063 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4065 /* A{m,n} must come at the end of the string, there's
4066 * no point in backing off ... */
4068 /* ...except that $ and \Z can match before *and* after
4069 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4070 We may back off by one in this case. */
4071 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4075 goto curly_try_B_max;
4080 case CURLY_B_min_known_fail:
4081 /* failed to find B in a non-greedy match where c1,c2 valid */
4082 if (ST.paren && ST.count)
4083 PL_regendp[ST.paren] = -1;
4085 PL_reginput = locinput; /* Could be reset... */
4086 REGCP_UNWIND(ST.cp);
4087 /* Couldn't or didn't -- move forward. */
4088 ST.oldloc = locinput;
4090 locinput += UTF8SKIP(locinput);
4094 curly_try_B_min_known:
4095 /* find the next place where 'B' could work, then call B */
4099 n = (ST.oldloc == locinput) ? 0 : 1;
4100 if (ST.c1 == ST.c2) {
4102 /* set n to utf8_distance(oldloc, locinput) */
4103 while (locinput <= ST.maxpos &&
4104 utf8n_to_uvchr((U8*)locinput,
4105 UTF8_MAXBYTES, &len,
4106 uniflags) != (UV)ST.c1) {
4112 /* set n to utf8_distance(oldloc, locinput) */
4113 while (locinput <= ST.maxpos) {
4115 const UV c = utf8n_to_uvchr((U8*)locinput,
4116 UTF8_MAXBYTES, &len,
4118 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4126 if (ST.c1 == ST.c2) {
4127 while (locinput <= ST.maxpos &&
4128 UCHARAT(locinput) != ST.c1)
4132 while (locinput <= ST.maxpos
4133 && UCHARAT(locinput) != ST.c1
4134 && UCHARAT(locinput) != ST.c2)
4137 n = locinput - ST.oldloc;
4139 if (locinput > ST.maxpos)
4141 /* PL_reginput == oldloc now */
4144 if (regrepeat(rex, ST.A, n) < n)
4147 PL_reginput = locinput;
4148 CURLY_SETPAREN(ST.paren, ST.count);
4149 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4154 case CURLY_B_min_fail:
4155 /* failed to find B in a non-greedy match where c1,c2 invalid */
4156 if (ST.paren && ST.count)
4157 PL_regendp[ST.paren] = -1;
4159 REGCP_UNWIND(ST.cp);
4160 /* failed -- move forward one */
4161 PL_reginput = locinput;
4162 if (regrepeat(rex, ST.A, 1)) {
4164 locinput = PL_reginput;
4165 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4166 ST.count > 0)) /* count overflow ? */
4169 CURLY_SETPAREN(ST.paren, ST.count);
4170 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4178 /* a successful greedy match: now try to match B */
4181 if (ST.c1 != CHRTEST_VOID)
4182 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4183 UTF8_MAXBYTES, 0, uniflags)
4184 : (UV) UCHARAT(PL_reginput);
4185 /* If it could work, try it. */
4186 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4187 CURLY_SETPAREN(ST.paren, ST.count);
4188 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4193 case CURLY_B_max_fail:
4194 /* failed to find B in a greedy match */
4195 if (ST.paren && ST.count)
4196 PL_regendp[ST.paren] = -1;
4198 REGCP_UNWIND(ST.cp);
4200 if (--ST.count < ST.min)
4202 PL_reginput = locinput = HOPc(locinput, -1);
4203 goto curly_try_B_max;
4209 if (locinput < reginfo->till) {
4210 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4211 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4213 (long)(locinput - PL_reg_starttry),
4214 (long)(reginfo->till - PL_reg_starttry),
4216 sayNO_FINAL; /* Cannot match: too short. */
4218 PL_reginput = locinput; /* put where regtry can find it */
4219 sayYES_FINAL; /* Success! */
4221 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4223 PerlIO_printf(Perl_debug_log,
4224 "%*s %ssubpattern success...%s\n",
4225 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
4226 PL_reginput = locinput; /* put where regtry can find it */
4227 sayYES_FINAL; /* Success! */
4230 #define ST st->u.ifmatch
4232 case SUSPEND: /* (?>A) */
4234 PL_reginput = locinput;
4237 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4239 goto ifmatch_trivial_fail_test;
4241 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4243 ifmatch_trivial_fail_test:
4245 char * const s = HOPBACKc(locinput, scan->flags);
4250 st->sw = 1 - (bool)ST.wanted;
4254 next = scan + ARG(scan);
4262 PL_reginput = locinput;
4266 /* execute body of (?...A) */
4267 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4270 case IFMATCH_A_fail: /* body of (?...A) failed */
4271 ST.wanted = !ST.wanted;
4274 case IFMATCH_A: /* body of (?...A) succeeded */
4277 st->sw = (bool)ST.wanted;
4279 else if (!ST.wanted)
4282 if (OP(ST.me) == SUSPEND)
4283 locinput = PL_reginput;
4285 locinput = PL_reginput = st->locinput;
4286 nextchr = UCHARAT(locinput);
4288 scan = ST.me + ARG(ST.me);
4291 continue; /* execute B */
4296 next = scan + ARG(scan);
4301 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4302 PTR2UV(scan), OP(scan));
4303 Perl_croak(aTHX_ "regexp memory corruption");
4311 /* push a state that backtracks on success */
4312 st->u.yes.prev_yes_state = yes_state;
4316 /* push a new regex state, then continue at scan */
4318 regmatch_state *newst;
4321 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4322 "PUSH STATE(%d)\n", depth));
4323 st->locinput = locinput;
4325 if (newst > SLAB_LAST(PL_regmatch_slab))
4326 newst = S_push_slab(aTHX);
4327 PL_regmatch_state = newst;
4329 /* XXX probably don't need to initialise these */
4334 locinput = PL_reginput;
4335 nextchr = UCHARAT(locinput);
4341 /* simulate recursively calling regmatch(), but without actually
4342 * recursing - ie save the current state on the heap rather than on
4343 * the stack, then re-enter the loop. This avoids complex regexes
4344 * blowing the processor stack */
4348 /* push new state */
4349 regmatch_state *oldst = st;
4352 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
4354 /* grab the next free state slot */
4356 if (st > SLAB_LAST(PL_regmatch_slab))
4357 st = S_push_slab(aTHX);
4358 PL_regmatch_state = st;
4362 oldst->locinput = locinput;
4365 locinput = PL_reginput;
4366 nextchr = UCHARAT(locinput);
4379 * We get here only if there's trouble -- normally "case END" is
4380 * the terminating point.
4382 Perl_croak(aTHX_ "corrupted regexp pointers");
4389 /* we have successfully completed a subexpression, but we must now
4390 * pop to the state marked by yes_state and continue from there */
4392 assert(st != yes_state);
4393 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4394 || yes_state > SLAB_LAST(PL_regmatch_slab))
4396 /* not in this slab, pop slab */
4397 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4398 PL_regmatch_slab = PL_regmatch_slab->prev;
4399 st = SLAB_LAST(PL_regmatch_slab);
4401 depth -= (st - yes_state);
4402 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4403 depth+1, depth+(st - yes_state)));
4405 yes_state = st->u.yes.prev_yes_state;
4406 PL_regmatch_state = st;
4408 switch (st->resume_state) {
4412 state_num = st->resume_state;
4413 goto reenter_switch;
4420 Perl_croak(aTHX_ "unexpected yes resume state");
4424 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4425 PL_colors[4], PL_colors[5]));
4432 /* XXX this is duplicate(ish) code to that in the do_no section.
4433 * will disappear when REGFMATCH goes */
4435 /* restore previous state and re-enter */
4436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4439 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4440 PL_regmatch_slab = PL_regmatch_slab->prev;
4441 st = SLAB_LAST(PL_regmatch_slab);
4443 PL_regmatch_state = st;
4447 locinput= st->locinput;
4448 nextchr = UCHARAT(locinput);
4450 switch (st->resume_state) {
4452 goto resume_point_CURLYX;
4453 case resume_WHILEM1:
4454 goto resume_point_WHILEM1;
4455 case resume_WHILEM2:
4456 goto resume_point_WHILEM2;
4457 case resume_WHILEM3:
4458 goto resume_point_WHILEM3;
4459 case resume_WHILEM4:
4460 goto resume_point_WHILEM4;
4461 case resume_WHILEM5:
4462 goto resume_point_WHILEM5;
4463 case resume_WHILEM6:
4464 goto resume_point_WHILEM6;
4474 case CURLY_B_min_known:
4478 Perl_croak(aTHX_ "regexp resume memory corruption");
4485 PerlIO_printf(Perl_debug_log,
4486 "%*s %sfailed...%s\n",
4487 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4498 /* there's a previous state to backtrack to */
4499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4502 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4503 PL_regmatch_slab = PL_regmatch_slab->prev;
4504 st = SLAB_LAST(PL_regmatch_slab);
4506 PL_regmatch_state = st;
4510 locinput= st->locinput;
4511 nextchr = UCHARAT(locinput);
4513 switch (st->resume_state) {
4515 goto resume_point_CURLYX;
4516 case resume_WHILEM1:
4517 goto resume_point_WHILEM1;
4518 case resume_WHILEM2:
4519 goto resume_point_WHILEM2;
4520 case resume_WHILEM3:
4521 goto resume_point_WHILEM3;
4522 case resume_WHILEM4:
4523 goto resume_point_WHILEM4;
4524 case resume_WHILEM5:
4525 goto resume_point_WHILEM5;
4526 case resume_WHILEM6:
4527 goto resume_point_WHILEM6;
4537 case CURLY_B_min_known:
4538 if (yes_state == st)
4539 yes_state = st->u.yes.prev_yes_state;
4540 state_num = st->resume_state + 1; /* failure = success + 1 */
4541 goto reenter_switch;
4544 Perl_croak(aTHX_ "regexp resume memory corruption");
4550 /* restore original high-water mark */
4551 PL_regmatch_slab = orig_slab;
4552 PL_regmatch_state = orig_state;
4554 /* free all slabs above current one */
4555 if (orig_slab->next) {
4556 regmatch_slab *sl = orig_slab->next;
4557 orig_slab->next = NULL;
4559 regmatch_slab * const osl = sl;
4570 - regrepeat - repeatedly match something simple, report how many
4573 * [This routine now assumes that it will only match on things of length 1.
4574 * That was true before, but now we assume scan - reginput is the count,
4575 * rather than incrementing count on every character. [Er, except utf8.]]
4578 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4581 register char *scan;
4583 register char *loceol = PL_regeol;
4584 register I32 hardcount = 0;
4585 register bool do_utf8 = PL_reg_match_utf8;
4588 if (max == REG_INFTY)
4590 else if (max < loceol - scan)
4591 loceol = scan + max;
4596 while (scan < loceol && hardcount < max && *scan != '\n') {
4597 scan += UTF8SKIP(scan);
4601 while (scan < loceol && *scan != '\n')
4608 while (scan < loceol && hardcount < max) {
4609 scan += UTF8SKIP(scan);
4619 case EXACT: /* length of string is 1 */
4621 while (scan < loceol && UCHARAT(scan) == c)
4624 case EXACTF: /* length of string is 1 */
4626 while (scan < loceol &&
4627 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4630 case EXACTFL: /* length of string is 1 */
4631 PL_reg_flags |= RF_tainted;
4633 while (scan < loceol &&
4634 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4640 while (hardcount < max && scan < loceol &&
4641 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4642 scan += UTF8SKIP(scan);
4646 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4653 LOAD_UTF8_CHARCLASS_ALNUM();
4654 while (hardcount < max && scan < loceol &&
4655 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4656 scan += UTF8SKIP(scan);
4660 while (scan < loceol && isALNUM(*scan))
4665 PL_reg_flags |= RF_tainted;
4668 while (hardcount < max && scan < loceol &&
4669 isALNUM_LC_utf8((U8*)scan)) {
4670 scan += UTF8SKIP(scan);
4674 while (scan < loceol && isALNUM_LC(*scan))
4681 LOAD_UTF8_CHARCLASS_ALNUM();
4682 while (hardcount < max && scan < loceol &&
4683 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4684 scan += UTF8SKIP(scan);
4688 while (scan < loceol && !isALNUM(*scan))
4693 PL_reg_flags |= RF_tainted;
4696 while (hardcount < max && scan < loceol &&
4697 !isALNUM_LC_utf8((U8*)scan)) {
4698 scan += UTF8SKIP(scan);
4702 while (scan < loceol && !isALNUM_LC(*scan))
4709 LOAD_UTF8_CHARCLASS_SPACE();
4710 while (hardcount < max && scan < loceol &&
4712 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4713 scan += UTF8SKIP(scan);
4717 while (scan < loceol && isSPACE(*scan))
4722 PL_reg_flags |= RF_tainted;
4725 while (hardcount < max && scan < loceol &&
4726 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4727 scan += UTF8SKIP(scan);
4731 while (scan < loceol && isSPACE_LC(*scan))
4738 LOAD_UTF8_CHARCLASS_SPACE();
4739 while (hardcount < max && scan < loceol &&
4741 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4742 scan += UTF8SKIP(scan);
4746 while (scan < loceol && !isSPACE(*scan))
4751 PL_reg_flags |= RF_tainted;
4754 while (hardcount < max && scan < loceol &&
4755 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4756 scan += UTF8SKIP(scan);
4760 while (scan < loceol && !isSPACE_LC(*scan))
4767 LOAD_UTF8_CHARCLASS_DIGIT();
4768 while (hardcount < max && scan < loceol &&
4769 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4770 scan += UTF8SKIP(scan);
4774 while (scan < loceol && isDIGIT(*scan))
4781 LOAD_UTF8_CHARCLASS_DIGIT();
4782 while (hardcount < max && scan < loceol &&
4783 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4784 scan += UTF8SKIP(scan);
4788 while (scan < loceol && !isDIGIT(*scan))
4792 default: /* Called on something of 0 width. */
4793 break; /* So match right here or not at all. */
4799 c = scan - PL_reginput;
4803 GET_RE_DEBUG_FLAGS_DECL;
4805 SV * const prop = sv_newmortal();
4806 regprop(prog, prop, p);
4807 PerlIO_printf(Perl_debug_log,
4808 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4809 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4817 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4819 - regclass_swash - prepare the utf8 swash
4823 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4829 const struct reg_data * const data = prog ? prog->data : NULL;
4831 if (data && data->count) {
4832 const U32 n = ARG(node);
4834 if (data->what[n] == 's') {
4835 SV * const rv = (SV*)data->data[n];
4836 AV * const av = (AV*)SvRV((SV*)rv);
4837 SV **const ary = AvARRAY(av);
4840 /* See the end of regcomp.c:S_regclass() for
4841 * documentation of these array elements. */
4844 a = SvROK(ary[1]) ? &ary[1] : 0;
4845 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4849 else if (si && doinit) {
4850 sw = swash_init("utf8", "", si, 1, 0);
4851 (void)av_store(av, 1, sw);
4868 - reginclass - determine if a character falls into a character class
4870 The n is the ANYOF regnode, the p is the target string, lenp
4871 is pointer to the maximum length of how far to go in the p
4872 (if the lenp is zero, UTF8SKIP(p) is used),
4873 do_utf8 tells whether the target string is in UTF-8.
4878 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4881 const char flags = ANYOF_FLAGS(n);
4887 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4888 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4889 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4890 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
4891 if (len == (STRLEN)-1)
4892 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4895 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4896 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4899 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4900 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4903 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4907 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
4910 if (swash_fetch(sw, p, do_utf8))
4912 else if (flags & ANYOF_FOLD) {
4913 if (!match && lenp && av) {
4915 for (i = 0; i <= av_len(av); i++) {
4916 SV* const sv = *av_fetch(av, i, FALSE);
4918 const char * const s = SvPV_const(sv, len);
4920 if (len <= plen && memEQ(s, (char*)p, len)) {
4928 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4931 to_utf8_fold(p, tmpbuf, &tmplen);
4932 if (swash_fetch(sw, tmpbuf, do_utf8))
4938 if (match && lenp && *lenp == 0)
4939 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4941 if (!match && c < 256) {
4942 if (ANYOF_BITMAP_TEST(n, c))
4944 else if (flags & ANYOF_FOLD) {
4947 if (flags & ANYOF_LOCALE) {
4948 PL_reg_flags |= RF_tainted;
4949 f = PL_fold_locale[c];
4953 if (f != c && ANYOF_BITMAP_TEST(n, f))
4957 if (!match && (flags & ANYOF_CLASS)) {
4958 PL_reg_flags |= RF_tainted;
4960 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4961 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4962 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4963 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4964 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4965 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4966 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4967 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4968 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4969 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4970 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4971 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4972 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4973 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4974 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4975 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4976 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4977 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4978 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4979 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4980 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4981 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4982 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4983 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4984 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4985 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4986 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4987 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4988 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4989 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4990 ) /* How's that for a conditional? */
4997 return (flags & ANYOF_INVERT) ? !match : match;
5001 S_reghop3(U8 *s, I32 off, const U8* lim)
5005 while (off-- && s < lim) {
5006 /* XXX could check well-formedness here */
5014 if (UTF8_IS_CONTINUED(*s)) {
5015 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5018 /* XXX could check well-formedness here */
5026 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5030 while (off-- && s < lim) {
5031 /* XXX could check well-formedness here */
5041 if (UTF8_IS_CONTINUED(*s)) {
5042 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5045 /* XXX could check well-formedness here */
5057 restore_pos(pTHX_ void *arg)
5060 regexp * const rex = (regexp *)arg;
5061 if (PL_reg_eval_set) {
5062 if (PL_reg_oldsaved) {
5063 rex->subbeg = PL_reg_oldsaved;
5064 rex->sublen = PL_reg_oldsavedlen;
5065 #ifdef PERL_OLD_COPY_ON_WRITE
5066 rex->saved_copy = PL_nrs;
5068 RX_MATCH_COPIED_on(rex);
5070 PL_reg_magic->mg_len = PL_reg_oldpos;
5071 PL_reg_eval_set = 0;
5072 PL_curpm = PL_reg_oldcurpm;
5077 S_to_utf8_substr(pTHX_ register regexp *prog)
5079 if (prog->float_substr && !prog->float_utf8) {
5080 SV* const sv = newSVsv(prog->float_substr);
5081 prog->float_utf8 = sv;
5082 sv_utf8_upgrade(sv);
5083 if (SvTAIL(prog->float_substr))
5085 if (prog->float_substr == prog->check_substr)
5086 prog->check_utf8 = sv;
5088 if (prog->anchored_substr && !prog->anchored_utf8) {
5089 SV* const sv = newSVsv(prog->anchored_substr);
5090 prog->anchored_utf8 = sv;
5091 sv_utf8_upgrade(sv);
5092 if (SvTAIL(prog->anchored_substr))
5094 if (prog->anchored_substr == prog->check_substr)
5095 prog->check_utf8 = sv;
5100 S_to_byte_substr(pTHX_ register regexp *prog)
5103 if (prog->float_utf8 && !prog->float_substr) {
5104 SV* sv = newSVsv(prog->float_utf8);
5105 prog->float_substr = sv;
5106 if (sv_utf8_downgrade(sv, TRUE)) {
5107 if (SvTAIL(prog->float_utf8))
5111 prog->float_substr = sv = &PL_sv_undef;
5113 if (prog->float_utf8 == prog->check_utf8)
5114 prog->check_substr = sv;
5116 if (prog->anchored_utf8 && !prog->anchored_substr) {
5117 SV* sv = newSVsv(prog->anchored_utf8);
5118 prog->anchored_substr = sv;
5119 if (sv_utf8_downgrade(sv, TRUE)) {
5120 if (SvTAIL(prog->anchored_utf8))
5124 prog->anchored_substr = sv = &PL_sv_undef;
5126 if (prog->anchored_utf8 == prog->check_utf8)
5127 prog->check_substr = sv;
5133 * c-indentation-style: bsd
5135 * indent-tabs-mode: t
5138 * ex: set ts=8 sts=4 sw=4 noet: