5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
78 **** Alterations to Henry's code are...
80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
91 #define PERL_IN_REGEXEC_C
96 #define RF_tainted 1 /* tainted information used? */
97 #define RF_warned 2 /* warned about big count? */
98 #define RF_evaled 4 /* Did an EVAL with setting? */
99 #define RF_utf8 8 /* String contains multibyte chars? */
101 #define UTF ((PL_reg_flags & RF_utf8) != 0)
103 #define RS_init 1 /* eval environment created */
104 #define RS_set 2 /* replsv value is set */
107 #define STATIC static
110 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
116 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
117 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
119 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
121 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
123 #define HOPc(pos,off) ((char*)HOP(pos,off))
124 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
126 #define HOPBACK(pos, off) ( \
127 (PL_reg_match_utf8) \
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
133 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
135 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
137 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
142 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
149 /* for use after a quantifier and before an EXACT-like node -- japhy */
150 #define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
157 #define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
165 #define FIND_NEXT_IMPT(rn) STMT_START { \
166 while (JUMPABLE(rn)) \
167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
168 rn = NEXTOPER(NEXTOPER(rn)); \
169 else if (OP(rn) == PLUS) \
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
173 else rn += NEXT_OFF(rn); \
176 static void restore_pos(pTHX_ void *arg);
179 S_regcppush(pTHX_ I32 parenfloor)
182 const int retval = PL_savestack_ix;
183 #define REGCP_PAREN_ELEMS 4
184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
190 #define REGCP_OTHER_ELEMS 6
191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
192 for (p = PL_regsize; p > parenfloor; p--) {
193 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
196 SSPUSHPTR(PL_reg_start_tmp[p]);
199 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
202 SSPUSHINT(*PL_reglastcloseparen);
203 SSPUSHPTR(PL_reginput);
204 #define REGCP_FRAME_ELEMS 2
205 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
213 /* These are needed since we do not localize EVAL nodes: */
214 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
218 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
219 PerlIO_printf(Perl_debug_log, \
220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
231 GET_RE_DEBUG_FLAGS_DECL;
233 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
235 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
236 i = SSPOPINT; /* Parentheses elements to pop. */
237 input = (char *) SSPOPPTR;
238 *PL_reglastcloseparen = SSPOPINT;
239 *PL_reglastparen = SSPOPINT;
240 PL_regsize = SSPOPINT;
242 /* Now restore the parentheses context. */
243 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
244 i > 0; i -= REGCP_PAREN_ELEMS) {
246 paren = (U32)SSPOPINT;
247 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
248 PL_regstartp[paren] = SSPOPINT;
250 if (paren <= *PL_reglastparen)
251 PL_regendp[paren] = tmps;
253 PerlIO_printf(Perl_debug_log,
254 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
255 (UV)paren, (IV)PL_regstartp[paren],
256 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
257 (IV)PL_regendp[paren],
258 (paren > *PL_reglastparen ? "(no)" : ""));
262 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
263 PerlIO_printf(Perl_debug_log,
264 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
265 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
269 /* It would seem that the similar code in regtry()
270 * already takes care of this, and in fact it is in
271 * a better location to since this code can #if 0-ed out
272 * but the code in regtry() is needed or otherwise tests
273 * requiring null fields (pat.t#187 and split.t#{13,14}
274 * (as of patchlevel 7877) will fail. Then again,
275 * this code seems to be necessary or otherwise
276 * building DynaLoader will fail:
277 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
279 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
280 if ((I32)paren > PL_regsize)
281 PL_regstartp[paren] = -1;
282 PL_regendp[paren] = -1;
289 S_regcp_set_to(pTHX_ I32 ss)
292 const I32 tmp = PL_savestack_ix;
294 PL_savestack_ix = ss;
296 PL_savestack_ix = tmp;
300 typedef struct re_cc_state
304 struct re_cc_state *prev;
309 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
311 #define TRYPAREN(paren, n, input) { \
314 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
315 PL_regendp[paren] = input - PL_bostr; \
318 PL_regendp[paren] = -1; \
320 if (regmatch(next)) \
323 PL_regendp[paren] = -1; \
328 * pregexec and friends
332 - pregexec - match a regexp against a string
335 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
336 char *strbeg, I32 minend, SV *screamer, U32 nosave)
337 /* strend: pointer to null at end of string */
338 /* strbeg: real beginning of string */
339 /* minend: end of match must be >=minend after stringarg. */
340 /* nosave: For optimizations. */
343 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
344 nosave ? 0 : REXEC_COPY_STR);
348 S_cache_re(pTHX_ regexp *prog)
351 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
353 PL_regprogram = prog->program;
355 PL_regnpar = prog->nparens;
356 PL_regdata = prog->data;
361 * Need to implement the following flags for reg_anch:
363 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
365 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
366 * INTUIT_AUTORITATIVE_ML
367 * INTUIT_ONCE_NOML - Intuit can match in one location only.
370 * Another flag for this function: SECOND_TIME (so that float substrs
371 * with giant delta may be not rechecked).
374 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
376 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
377 Otherwise, only SvCUR(sv) is used to get strbeg. */
379 /* XXXX We assume that strpos is strbeg unless sv. */
381 /* XXXX Some places assume that there is a fixed substring.
382 An update may be needed if optimizer marks as "INTUITable"
383 RExen without fixed substrings. Similarly, it is assumed that
384 lengths of all the strings are no more than minlen, thus they
385 cannot come from lookahead.
386 (Or minlen should take into account lookahead.) */
388 /* A failure to find a constant substring means that there is no need to make
389 an expensive call to REx engine, thus we celebrate a failure. Similarly,
390 finding a substring too deep into the string means that less calls to
391 regtry() should be needed.
393 REx compiler's optimizer found 4 possible hints:
394 a) Anchored substring;
396 c) Whether we are anchored (beginning-of-line or \G);
397 d) First node (of those at offset 0) which may distingush positions;
398 We use a)b)d) and multiline-part of c), and try to find a position in the
399 string which does not contradict any of them.
402 /* Most of decisions we do here should have been done at compile time.
403 The nodes of the REx which we used for the search should have been
404 deleted from the finite automaton. */
407 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
408 char *strend, U32 flags, re_scream_pos_data *data)
411 register I32 start_shift = 0;
412 /* Should be nonnegative! */
413 register I32 end_shift = 0;
418 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
420 register char *other_last = Nullch; /* other substr checked before this */
421 char *check_at = Nullch; /* check substr found at this pos */
422 const I32 multiline = prog->reganch & PMf_MULTILINE;
424 const char * const i_strpos = strpos;
425 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
428 GET_RE_DEBUG_FLAGS_DECL;
430 RX_MATCH_UTF8_set(prog,do_utf8);
432 if (prog->reganch & ROPT_UTF8) {
433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
434 "UTF-8 regex...\n"));
435 PL_reg_flags |= RF_utf8;
439 const char *s = PL_reg_match_utf8 ?
440 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
442 const int len = PL_reg_match_utf8 ?
443 strlen(s) : strend - strpos;
446 if (PL_reg_match_utf8)
447 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
448 "UTF-8 target...\n"));
449 PerlIO_printf(Perl_debug_log,
450 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
451 PL_colors[4], PL_colors[5], PL_colors[0],
454 (strlen(prog->precomp) > 60 ? "..." : ""),
456 (int)(len > 60 ? 60 : len),
458 (len > 60 ? "..." : "")
462 /* CHR_DIST() would be more correct here but it makes things slow. */
463 if (prog->minlen > strend - strpos) {
464 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
465 "String too short... [re_intuit_start]\n"));
468 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
471 if (!prog->check_utf8 && prog->check_substr)
472 to_utf8_substr(prog);
473 check = prog->check_utf8;
475 if (!prog->check_substr && prog->check_utf8)
476 to_byte_substr(prog);
477 check = prog->check_substr;
479 if (check == &PL_sv_undef) {
480 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
481 "Non-utf string cannot match utf check string\n"));
484 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
485 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
486 || ( (prog->reganch & ROPT_ANCH_BOL)
487 && !multiline ) ); /* Check after \n? */
490 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
491 | ROPT_IMPLICIT)) /* not a real BOL */
492 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
494 && (strpos != strbeg)) {
495 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
498 if (prog->check_offset_min == prog->check_offset_max &&
499 !(prog->reganch & ROPT_CANY_SEEN)) {
500 /* Substring at constant offset from beg-of-str... */
503 s = HOP3c(strpos, prog->check_offset_min, strend);
505 slen = SvCUR(check); /* >= 1 */
507 if ( strend - s > slen || strend - s < slen - 1
508 || (strend - s == slen && strend[-1] != '\n')) {
509 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
512 /* Now should match s[0..slen-2] */
514 if (slen && (*SvPVX_const(check) != *s
516 && memNE(SvPVX_const(check), s, slen)))) {
518 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
522 else if (*SvPVX_const(check) != *s
523 || ((slen = SvCUR(check)) > 1
524 && memNE(SvPVX_const(check), s, slen)))
527 goto success_at_start;
530 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
532 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
533 end_shift = prog->minlen - start_shift -
534 CHR_SVLEN(check) + (SvTAIL(check) != 0);
536 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
537 - (SvTAIL(check) != 0);
538 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
540 if (end_shift < eshift)
544 else { /* Can match at random position */
547 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
548 /* Should be nonnegative! */
549 end_shift = prog->minlen - start_shift -
550 CHR_SVLEN(check) + (SvTAIL(check) != 0);
553 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
555 Perl_croak(aTHX_ "panic: end_shift");
559 /* Find a possible match in the region s..strend by looking for
560 the "check" substring in the region corrected by start/end_shift. */
561 if (flags & REXEC_SCREAM) {
562 I32 p = -1; /* Internal iterator of scream. */
563 I32 * const pp = data ? data->scream_pos : &p;
565 if (PL_screamfirst[BmRARE(check)] >= 0
566 || ( BmRARE(check) == '\n'
567 && (BmPREVIOUS(check) == SvCUR(check) - 1)
569 s = screaminstr(sv, check,
570 start_shift + (s - strbeg), end_shift, pp, 0);
573 /* we may be pointing at the wrong string */
574 if (s && RX_MATCH_COPIED(prog))
575 s = strbeg + (s - SvPVX_const(sv));
577 *data->scream_olds = s;
579 else if (prog->reganch & ROPT_CANY_SEEN)
580 s = fbm_instr((U8*)(s + start_shift),
581 (U8*)(strend - end_shift),
582 check, multiline ? FBMrf_MULTILINE : 0);
584 s = fbm_instr(HOP3(s, start_shift, strend),
585 HOP3(strend, -end_shift, strbeg),
586 check, multiline ? FBMrf_MULTILINE : 0);
588 /* Update the count-of-usability, remove useless subpatterns,
591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
592 (s ? "Found" : "Did not find"),
593 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
595 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
597 PL_colors[1], (SvTAIL(check) ? "$" : ""),
598 (s ? " at offset " : "...\n") ) );
605 /* Finish the diagnostic message */
606 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
608 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
609 Start with the other substr.
610 XXXX no SCREAM optimization yet - and a very coarse implementation
611 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
612 *always* match. Probably should be marked during compile...
613 Probably it is right to do no SCREAM here...
616 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
617 /* Take into account the "other" substring. */
618 /* XXXX May be hopelessly wrong for UTF... */
621 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
624 char * const last = HOP3c(s, -start_shift, strbeg);
629 t = s - prog->check_offset_max;
630 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
632 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
637 t = HOP3c(t, prog->anchored_offset, strend);
638 if (t < other_last) /* These positions already checked */
640 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
643 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
644 /* On end-of-str: see comment below. */
645 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
646 if (must == &PL_sv_undef) {
648 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
653 HOP3(HOP3(last1, prog->anchored_offset, strend)
654 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
656 multiline ? FBMrf_MULTILINE : 0
658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
659 "%s anchored substr \"%s%.*s%s\"%s",
660 (s ? "Found" : "Contradicts"),
663 - (SvTAIL(must)!=0)),
665 PL_colors[1], (SvTAIL(must) ? "$" : "")));
667 if (last1 >= last2) {
668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
669 ", giving up...\n"));
672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
673 ", trying floating at offset %ld...\n",
674 (long)(HOP3c(s1, 1, strend) - i_strpos)));
675 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
676 s = HOP3c(last, 1, strend);
680 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
681 (long)(s - i_strpos)));
682 t = HOP3c(s, -prog->anchored_offset, strbeg);
683 other_last = HOP3c(s, 1, strend);
691 else { /* Take into account the floating substring. */
696 t = HOP3c(s, -start_shift, strbeg);
698 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
699 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
700 last = HOP3c(t, prog->float_max_offset, strend);
701 s = HOP3c(t, prog->float_min_offset, strend);
704 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
705 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
706 /* fbm_instr() takes into account exact value of end-of-str
707 if the check is SvTAIL(ed). Since false positives are OK,
708 and end-of-str is not later than strend we are OK. */
709 if (must == &PL_sv_undef) {
711 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
714 s = fbm_instr((unsigned char*)s,
715 (unsigned char*)last + SvCUR(must)
717 must, multiline ? FBMrf_MULTILINE : 0);
718 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
719 (s ? "Found" : "Contradicts"),
721 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
723 PL_colors[1], (SvTAIL(must) ? "$" : "")));
726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
727 ", giving up...\n"));
730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
731 ", trying anchored starting at offset %ld...\n",
732 (long)(s1 + 1 - i_strpos)));
734 s = HOP3c(t, 1, strend);
738 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
739 (long)(s - i_strpos)));
740 other_last = s; /* Fix this later. --Hugo */
749 t = s - prog->check_offset_max;
750 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
752 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
754 /* Fixed substring is found far enough so that the match
755 cannot start at strpos. */
757 if (ml_anch && t[-1] != '\n') {
758 /* Eventually fbm_*() should handle this, but often
759 anchored_offset is not 0, so this check will not be wasted. */
760 /* XXXX In the code below we prefer to look for "^" even in
761 presence of anchored substrings. And we search even
762 beyond the found float position. These pessimizations
763 are historical artefacts only. */
765 while (t < strend - prog->minlen) {
767 if (t < check_at - prog->check_offset_min) {
768 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
769 /* Since we moved from the found position,
770 we definitely contradict the found anchored
771 substr. Due to the above check we do not
772 contradict "check" substr.
773 Thus we can arrive here only if check substr
774 is float. Redo checking for "other"=="fixed".
777 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
778 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
779 goto do_other_anchored;
781 /* We don't contradict the found floating substring. */
782 /* XXXX Why not check for STCLASS? */
784 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
785 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
788 /* Position contradicts check-string */
789 /* XXXX probably better to look for check-string
790 than for "\n", so one should lower the limit for t? */
791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
792 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
793 other_last = strpos = s = t + 1;
798 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
799 PL_colors[0], PL_colors[1]));
803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
804 PL_colors[0], PL_colors[1]));
808 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
811 /* The found string does not prohibit matching at strpos,
812 - no optimization of calling REx engine can be performed,
813 unless it was an MBOL and we are not after MBOL,
814 or a future STCLASS check will fail this. */
816 /* Even in this situation we may use MBOL flag if strpos is offset
817 wrt the start of the string. */
818 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
819 && (strpos != strbeg) && strpos[-1] != '\n'
820 /* May be due to an implicit anchor of m{.*foo} */
821 && !(prog->reganch & ROPT_IMPLICIT))
826 DEBUG_EXECUTE_r( if (ml_anch)
827 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
828 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
831 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
833 prog->check_utf8 /* Could be deleted already */
834 && --BmUSEFUL(prog->check_utf8) < 0
835 && (prog->check_utf8 == prog->float_utf8)
837 prog->check_substr /* Could be deleted already */
838 && --BmUSEFUL(prog->check_substr) < 0
839 && (prog->check_substr == prog->float_substr)
842 /* If flags & SOMETHING - do not do it many times on the same match */
843 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
844 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
845 if (do_utf8 ? prog->check_substr : prog->check_utf8)
846 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
847 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
848 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
849 check = Nullsv; /* abort */
851 /* XXXX This is a remnant of the old implementation. It
852 looks wasteful, since now INTUIT can use many
854 prog->reganch &= ~RE_USE_INTUIT;
861 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
862 if (prog->regstclass) {
863 /* minlen == 0 is possible if regstclass is \b or \B,
864 and the fixed substr is ''$.
865 Since minlen is already taken into account, s+1 is before strend;
866 accidentally, minlen >= 1 guaranties no false positives at s + 1
867 even for \b or \B. But (minlen? 1 : 0) below assumes that
868 regstclass does not come from lookahead... */
869 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
870 This leaves EXACTF only, which is dealt with in find_byclass(). */
871 const U8* const str = (U8*)STRING(prog->regstclass);
872 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
873 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
875 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
876 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
877 : (prog->float_substr || prog->float_utf8
878 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
884 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
887 const char *what = NULL;
889 if (endpos == strend) {
890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
891 "Could not match STCLASS...\n") );
894 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
895 "This position contradicts STCLASS...\n") );
896 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
898 /* Contradict one of substrings */
899 if (prog->anchored_substr || prog->anchored_utf8) {
900 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
901 DEBUG_EXECUTE_r( what = "anchored" );
903 s = HOP3c(t, 1, strend);
904 if (s + start_shift + end_shift > strend) {
905 /* XXXX Should be taken into account earlier? */
906 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
907 "Could not match STCLASS...\n") );
912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
913 "Looking for %s substr starting at offset %ld...\n",
914 what, (long)(s + start_shift - i_strpos)) );
917 /* Have both, check_string is floating */
918 if (t + start_shift >= check_at) /* Contradicts floating=check */
919 goto retry_floating_check;
920 /* Recheck anchored substring, but not floating... */
924 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
925 "Looking for anchored substr starting at offset %ld...\n",
926 (long)(other_last - i_strpos)) );
927 goto do_other_anchored;
929 /* Another way we could have checked stclass at the
930 current position only: */
935 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
936 "Looking for /%s^%s/m starting at offset %ld...\n",
937 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
940 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
942 /* Check is floating subtring. */
943 retry_floating_check:
944 t = check_at - start_shift;
945 DEBUG_EXECUTE_r( what = "floating" );
946 goto hop_and_restart;
949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
950 "By STCLASS: moving %ld --> %ld\n",
951 (long)(t - i_strpos), (long)(s - i_strpos))
955 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
956 "Does not contradict STCLASS...\n");
961 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
962 PL_colors[4], (check ? "Guessed" : "Giving up"),
963 PL_colors[5], (long)(s - i_strpos)) );
966 fail_finish: /* Substring not found */
967 if (prog->check_substr || prog->check_utf8) /* could be removed already */
968 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
970 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
971 PL_colors[4], PL_colors[5]));
975 /* We know what class REx starts with. Try to find this position... */
977 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
980 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
984 register STRLEN uskip;
988 register I32 tmp = 1; /* Scratch variable? */
989 register const bool do_utf8 = PL_reg_match_utf8;
991 /* We know what class it must start with. */
995 while (s + (uskip = UTF8SKIP(s)) <= strend) {
996 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
997 !UTF8_IS_INVARIANT((U8)s[0]) ?
998 reginclass(c, (U8*)s, 0, do_utf8) :
999 REGINCLASS(c, (U8*)s)) {
1000 if (tmp && (norun || regtry(prog, s)))
1011 while (s < strend) {
1014 if (REGINCLASS(c, (U8*)s) ||
1015 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1016 /* The assignment of 2 is intentional:
1017 * for the folded sharp s, the skip is 2. */
1018 (skip = SHARP_S_SKIP))) {
1019 if (tmp && (norun || regtry(prog, s)))
1031 while (s < strend) {
1032 if (tmp && (norun || regtry(prog, s)))
1041 ln = STR_LEN(c); /* length to match in octets/bytes */
1042 lnc = (I32) ln; /* length to match in characters */
1044 STRLEN ulen1, ulen2;
1046 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1047 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1048 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1050 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1051 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1053 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1055 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1058 while (sm < ((U8 *) m + ln)) {
1073 c2 = PL_fold_locale[c1];
1075 e = HOP3c(strend, -((I32)lnc), s);
1078 e = s; /* Due to minlen logic of intuit() */
1080 /* The idea in the EXACTF* cases is to first find the
1081 * first character of the EXACTF* node and then, if
1082 * necessary, case-insensitively compare the full
1083 * text of the node. The c1 and c2 are the first
1084 * characters (though in Unicode it gets a bit
1085 * more complicated because there are more cases
1086 * than just upper and lower: one needs to use
1087 * the so-called folding case for case-insensitive
1088 * matching (called "loose matching" in Unicode).
1089 * ibcmp_utf8() will do just that. */
1093 U8 tmpbuf [UTF8_MAXBYTES+1];
1094 STRLEN len, foldlen;
1095 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1097 /* Upper and lower of 1st char are equal -
1098 * probably not a "letter". */
1100 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1104 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1105 m, (char **)0, ln, (bool)UTF))
1106 && (norun || regtry(prog, s)) )
1109 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1110 uvchr_to_utf8(tmpbuf, c);
1111 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1113 && (f == c1 || f == c2)
1114 && (ln == foldlen ||
1115 !ibcmp_utf8((char *) foldbuf,
1116 (char **)0, foldlen, do_utf8,
1118 (char **)0, ln, (bool)UTF))
1119 && (norun || regtry(prog, s)) )
1127 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1130 /* Handle some of the three Greek sigmas cases.
1131 * Note that not all the possible combinations
1132 * are handled here: some of them are handled
1133 * by the standard folding rules, and some of
1134 * them (the character class or ANYOF cases)
1135 * are handled during compiletime in
1136 * regexec.c:S_regclass(). */
1137 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1138 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1139 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1141 if ( (c == c1 || c == c2)
1143 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1144 m, (char **)0, ln, (bool)UTF))
1145 && (norun || regtry(prog, s)) )
1148 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1149 uvchr_to_utf8(tmpbuf, c);
1150 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1152 && (f == c1 || f == c2)
1153 && (ln == foldlen ||
1154 !ibcmp_utf8((char *) foldbuf,
1155 (char **)0, foldlen, do_utf8,
1157 (char **)0, ln, (bool)UTF))
1158 && (norun || regtry(prog, s)) )
1169 && (ln == 1 || !(OP(c) == EXACTF
1171 : ibcmp_locale(s, m, ln)))
1172 && (norun || regtry(prog, s)) )
1178 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1179 && (ln == 1 || !(OP(c) == EXACTF
1181 : ibcmp_locale(s, m, ln)))
1182 && (norun || regtry(prog, s)) )
1189 PL_reg_flags |= RF_tainted;
1196 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1198 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1200 tmp = ((OP(c) == BOUND ?
1201 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1202 LOAD_UTF8_CHARCLASS_ALNUM();
1203 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1204 if (tmp == !(OP(c) == BOUND ?
1205 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1206 isALNUM_LC_utf8((U8*)s)))
1209 if ((norun || regtry(prog, s)))
1216 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1217 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1218 while (s < strend) {
1220 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1222 if ((norun || regtry(prog, s)))
1228 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1232 PL_reg_flags |= RF_tainted;
1239 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1241 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1243 tmp = ((OP(c) == NBOUND ?
1244 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1245 LOAD_UTF8_CHARCLASS_ALNUM();
1246 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1247 if (tmp == !(OP(c) == NBOUND ?
1248 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1249 isALNUM_LC_utf8((U8*)s)))
1251 else if ((norun || regtry(prog, s)))
1257 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1258 tmp = ((OP(c) == NBOUND ?
1259 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1260 while (s < strend) {
1262 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1264 else if ((norun || regtry(prog, s)))
1269 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1274 LOAD_UTF8_CHARCLASS_ALNUM();
1275 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1276 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1277 if (tmp && (norun || regtry(prog, s)))
1288 while (s < strend) {
1290 if (tmp && (norun || regtry(prog, s)))
1302 PL_reg_flags |= RF_tainted;
1304 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1305 if (isALNUM_LC_utf8((U8*)s)) {
1306 if (tmp && (norun || regtry(prog, s)))
1317 while (s < strend) {
1318 if (isALNUM_LC(*s)) {
1319 if (tmp && (norun || regtry(prog, s)))
1332 LOAD_UTF8_CHARCLASS_ALNUM();
1333 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1334 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1335 if (tmp && (norun || regtry(prog, s)))
1346 while (s < strend) {
1348 if (tmp && (norun || regtry(prog, s)))
1360 PL_reg_flags |= RF_tainted;
1362 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1363 if (!isALNUM_LC_utf8((U8*)s)) {
1364 if (tmp && (norun || regtry(prog, s)))
1375 while (s < strend) {
1376 if (!isALNUM_LC(*s)) {
1377 if (tmp && (norun || regtry(prog, s)))
1390 LOAD_UTF8_CHARCLASS_SPACE();
1391 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1392 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1393 if (tmp && (norun || regtry(prog, s)))
1404 while (s < strend) {
1406 if (tmp && (norun || regtry(prog, s)))
1418 PL_reg_flags |= RF_tainted;
1420 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1421 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1422 if (tmp && (norun || regtry(prog, s)))
1433 while (s < strend) {
1434 if (isSPACE_LC(*s)) {
1435 if (tmp && (norun || regtry(prog, s)))
1448 LOAD_UTF8_CHARCLASS_SPACE();
1449 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1450 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1451 if (tmp && (norun || regtry(prog, s)))
1462 while (s < strend) {
1464 if (tmp && (norun || regtry(prog, s)))
1476 PL_reg_flags |= RF_tainted;
1478 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1479 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1480 if (tmp && (norun || regtry(prog, s)))
1491 while (s < strend) {
1492 if (!isSPACE_LC(*s)) {
1493 if (tmp && (norun || regtry(prog, s)))
1506 LOAD_UTF8_CHARCLASS_DIGIT();
1507 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1508 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1509 if (tmp && (norun || regtry(prog, s)))
1520 while (s < strend) {
1522 if (tmp && (norun || regtry(prog, s)))
1534 PL_reg_flags |= RF_tainted;
1536 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1537 if (isDIGIT_LC_utf8((U8*)s)) {
1538 if (tmp && (norun || regtry(prog, s)))
1549 while (s < strend) {
1550 if (isDIGIT_LC(*s)) {
1551 if (tmp && (norun || regtry(prog, s)))
1564 LOAD_UTF8_CHARCLASS_DIGIT();
1565 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1566 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1567 if (tmp && (norun || regtry(prog, s)))
1578 while (s < strend) {
1580 if (tmp && (norun || regtry(prog, s)))
1592 PL_reg_flags |= RF_tainted;
1594 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1595 if (!isDIGIT_LC_utf8((U8*)s)) {
1596 if (tmp && (norun || regtry(prog, s)))
1607 while (s < strend) {
1608 if (!isDIGIT_LC(*s)) {
1609 if (tmp && (norun || regtry(prog, s)))
1621 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1630 - regexec_flags - match a regexp against a string
1633 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1634 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1635 /* strend: pointer to null at end of string */
1636 /* strbeg: real beginning of string */
1637 /* minend: end of match must be >=minend after stringarg. */
1638 /* data: May be used for some additional optimizations. */
1639 /* nosave: For optimizations. */
1643 register regnode *c;
1644 register char *startpos = stringarg;
1645 I32 minlen; /* must match at least this many chars */
1646 I32 dontbother = 0; /* how many characters not to try at end */
1647 I32 end_shift = 0; /* Same for the end. */ /* CC */
1648 I32 scream_pos = -1; /* Internal iterator of scream. */
1650 SV* oreplsv = GvSV(PL_replgv);
1651 const bool do_utf8 = DO_UTF8(sv);
1652 const I32 multiline = prog->reganch & PMf_MULTILINE;
1654 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1655 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1658 GET_RE_DEBUG_FLAGS_DECL;
1660 PERL_UNUSED_ARG(data);
1661 RX_MATCH_UTF8_set(prog,do_utf8);
1667 PL_regnarrate = DEBUG_r_TEST;
1670 /* Be paranoid... */
1671 if (prog == NULL || startpos == NULL) {
1672 Perl_croak(aTHX_ "NULL regexp parameter");
1676 minlen = prog->minlen;
1677 if (strend - startpos < minlen) {
1678 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1679 "String too short [regexec_flags]...\n"));
1683 /* Check validity of program. */
1684 if (UCHARAT(prog->program) != REG_MAGIC) {
1685 Perl_croak(aTHX_ "corrupted regexp program");
1689 PL_reg_eval_set = 0;
1692 if (prog->reganch & ROPT_UTF8)
1693 PL_reg_flags |= RF_utf8;
1695 /* Mark beginning of line for ^ and lookbehind. */
1696 PL_regbol = startpos;
1700 /* Mark end of line for $ (and such) */
1703 /* see how far we have to get to not match where we matched before */
1704 PL_regtill = startpos+minend;
1706 /* We start without call_cc context. */
1709 /* If there is a "must appear" string, look for it. */
1712 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1715 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1716 PL_reg_ganch = startpos;
1717 else if (sv && SvTYPE(sv) >= SVt_PVMG
1719 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1720 && mg->mg_len >= 0) {
1721 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1722 if (prog->reganch & ROPT_ANCH_GPOS) {
1723 if (s > PL_reg_ganch)
1728 else /* pos() not defined */
1729 PL_reg_ganch = strbeg;
1732 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1733 re_scream_pos_data d;
1735 d.scream_olds = &scream_olds;
1736 d.scream_pos = &scream_pos;
1737 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1739 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1740 goto phooey; /* not present */
1745 const char * const s0 = UTF
1746 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1749 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1750 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1751 UNI_DISPLAY_REGEX) : startpos;
1752 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1755 PerlIO_printf(Perl_debug_log,
1756 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1757 PL_colors[4], PL_colors[5], PL_colors[0],
1760 len0 > 60 ? "..." : "",
1762 (int)(len1 > 60 ? 60 : len1),
1764 (len1 > 60 ? "..." : "")
1768 /* Simplest case: anchored match need be tried only once. */
1769 /* [unless only anchor is BOL and multiline is set] */
1770 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1771 if (s == startpos && regtry(prog, startpos))
1773 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1774 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1779 dontbother = minlen - 1;
1780 end = HOP3c(strend, -dontbother, strbeg) - 1;
1781 /* for multiline we only have to try after newlines */
1782 if (prog->check_substr || prog->check_utf8) {
1786 if (regtry(prog, s))
1791 if (prog->reganch & RE_USE_INTUIT) {
1792 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1803 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1804 if (regtry(prog, s))
1811 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1812 if (regtry(prog, PL_reg_ganch))
1817 /* Messy cases: unanchored match. */
1818 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1819 /* we have /x+whatever/ */
1820 /* it must be a one character string (XXXX Except UTF?) */
1825 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1826 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1827 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1830 while (s < strend) {
1832 DEBUG_EXECUTE_r( did_match = 1 );
1833 if (regtry(prog, s)) goto got_it;
1835 while (s < strend && *s == ch)
1842 while (s < strend) {
1844 DEBUG_EXECUTE_r( did_match = 1 );
1845 if (regtry(prog, s)) goto got_it;
1847 while (s < strend && *s == ch)
1853 DEBUG_EXECUTE_r(if (!did_match)
1854 PerlIO_printf(Perl_debug_log,
1855 "Did not find anchored character...\n")
1858 else if (prog->anchored_substr != Nullsv
1859 || prog->anchored_utf8 != Nullsv
1860 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1861 && prog->float_max_offset < strend - s)) {
1866 char *last1; /* Last position checked before */
1870 if (prog->anchored_substr || prog->anchored_utf8) {
1871 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1872 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1873 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1874 back_max = back_min = prog->anchored_offset;
1876 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1877 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1878 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1879 back_max = prog->float_max_offset;
1880 back_min = prog->float_min_offset;
1882 if (must == &PL_sv_undef)
1883 /* could not downgrade utf8 check substring, so must fail */
1886 last = HOP3c(strend, /* Cannot start after this */
1887 -(I32)(CHR_SVLEN(must)
1888 - (SvTAIL(must) != 0) + back_min), strbeg);
1891 last1 = HOPc(s, -1);
1893 last1 = s - 1; /* bogus */
1895 /* XXXX check_substr already used to find "s", can optimize if
1896 check_substr==must. */
1898 dontbother = end_shift;
1899 strend = HOPc(strend, -dontbother);
1900 while ( (s <= last) &&
1901 ((flags & REXEC_SCREAM)
1902 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1903 end_shift, &scream_pos, 0))
1904 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1905 (unsigned char*)strend, must,
1906 multiline ? FBMrf_MULTILINE : 0))) ) {
1907 /* we may be pointing at the wrong string */
1908 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1909 s = strbeg + (s - SvPVX_const(sv));
1910 DEBUG_EXECUTE_r( did_match = 1 );
1911 if (HOPc(s, -back_max) > last1) {
1912 last1 = HOPc(s, -back_min);
1913 s = HOPc(s, -back_max);
1916 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1918 last1 = HOPc(s, -back_min);
1922 while (s <= last1) {
1923 if (regtry(prog, s))
1929 while (s <= last1) {
1930 if (regtry(prog, s))
1936 DEBUG_EXECUTE_r(if (!did_match)
1937 PerlIO_printf(Perl_debug_log,
1938 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1939 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1940 ? "anchored" : "floating"),
1942 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1944 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1948 else if ((c = prog->regstclass)) {
1950 I32 op = (U8)OP(prog->regstclass);
1951 /* don't bother with what can't match */
1952 if (PL_regkind[op] != EXACT && op != CANY)
1953 strend = HOPc(strend, -(minlen - 1));
1956 SV *prop = sv_newmortal();
1964 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1965 UNI_DISPLAY_REGEX) :
1967 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1969 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1970 len1 = UTF ? SvCUR(dsv1) : strend - s;
1971 PerlIO_printf(Perl_debug_log,
1972 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1976 if (find_byclass(prog, c, s, strend, 0))
1978 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1982 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1987 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1988 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1989 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1991 if (flags & REXEC_SCREAM) {
1992 last = screaminstr(sv, float_real, s - strbeg,
1993 end_shift, &scream_pos, 1); /* last one */
1995 last = scream_olds; /* Only one occurrence. */
1996 /* we may be pointing at the wrong string */
1997 else if (RX_MATCH_COPIED(prog))
1998 s = strbeg + (s - SvPVX_const(sv));
2002 const char * const little = SvPV_const(float_real, len);
2004 if (SvTAIL(float_real)) {
2005 if (memEQ(strend - len + 1, little, len - 1))
2006 last = strend - len + 1;
2007 else if (!multiline)
2008 last = memEQ(strend - len, little, len)
2009 ? strend - len : Nullch;
2015 last = rninstr(s, strend, little, little + len);
2017 last = strend; /* matching "$" */
2021 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2022 "%sCan't trim the tail, match fails (should not happen)%s\n",
2023 PL_colors[4], PL_colors[5]));
2024 goto phooey; /* Should not happen! */
2026 dontbother = strend - last + prog->float_min_offset;
2028 if (minlen && (dontbother < minlen))
2029 dontbother = minlen - 1;
2030 strend -= dontbother; /* this one's always in bytes! */
2031 /* We don't know much -- general case. */
2034 if (regtry(prog, s))
2043 if (regtry(prog, s))
2045 } while (s++ < strend);
2053 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2055 if (PL_reg_eval_set) {
2056 /* Preserve the current value of $^R */
2057 if (oreplsv != GvSV(PL_replgv))
2058 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2059 restored, the value remains
2061 restore_pos(aTHX_ 0);
2064 /* make sure $`, $&, $', and $digit will work later */
2065 if ( !(flags & REXEC_NOT_FIRST) ) {
2066 RX_MATCH_COPY_FREE(prog);
2067 if (flags & REXEC_COPY_STR) {
2068 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2069 #ifdef PERL_OLD_COPY_ON_WRITE
2071 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2073 PerlIO_printf(Perl_debug_log,
2074 "Copy on write: regexp capture, type %d\n",
2077 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2078 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2079 assert (SvPOKp(prog->saved_copy));
2083 RX_MATCH_COPIED_on(prog);
2084 s = savepvn(strbeg, i);
2090 prog->subbeg = strbeg;
2091 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2098 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2099 PL_colors[4], PL_colors[5]));
2100 if (PL_reg_eval_set)
2101 restore_pos(aTHX_ 0);
2106 - regtry - try match at specific point
2108 STATIC I32 /* 0 failure, 1 success */
2109 S_regtry(pTHX_ regexp *prog, char *startpos)
2116 GET_RE_DEBUG_FLAGS_DECL;
2119 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2121 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2124 PL_reg_eval_set = RS_init;
2125 DEBUG_EXECUTE_r(DEBUG_s(
2126 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2127 (IV)(PL_stack_sp - PL_stack_base));
2129 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2130 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2131 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2133 /* Apparently this is not needed, judging by wantarray. */
2134 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2135 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2138 /* Make $_ available to executed code. */
2139 if (PL_reg_sv != DEFSV) {
2144 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2145 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2146 /* prepare for quick setting of pos */
2147 sv_magic(PL_reg_sv, (SV*)0,
2148 PERL_MAGIC_regex_global, Nullch, 0);
2149 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2153 PL_reg_oldpos = mg->mg_len;
2154 SAVEDESTRUCTOR_X(restore_pos, 0);
2156 if (!PL_reg_curpm) {
2157 Newxz(PL_reg_curpm, 1, PMOP);
2160 SV* repointer = newSViv(0);
2161 /* so we know which PL_regex_padav element is PL_reg_curpm */
2162 SvFLAGS(repointer) |= SVf_BREAK;
2163 av_push(PL_regex_padav,repointer);
2164 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2165 PL_regex_pad = AvARRAY(PL_regex_padav);
2169 PM_SETRE(PL_reg_curpm, prog);
2170 PL_reg_oldcurpm = PL_curpm;
2171 PL_curpm = PL_reg_curpm;
2172 if (RX_MATCH_COPIED(prog)) {
2173 /* Here is a serious problem: we cannot rewrite subbeg,
2174 since it may be needed if this match fails. Thus
2175 $` inside (?{}) could fail... */
2176 PL_reg_oldsaved = prog->subbeg;
2177 PL_reg_oldsavedlen = prog->sublen;
2178 #ifdef PERL_OLD_COPY_ON_WRITE
2179 PL_nrs = prog->saved_copy;
2181 RX_MATCH_COPIED_off(prog);
2184 PL_reg_oldsaved = Nullch;
2185 prog->subbeg = PL_bostr;
2186 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2188 prog->startp[0] = startpos - PL_bostr;
2189 PL_reginput = startpos;
2190 PL_regstartp = prog->startp;
2191 PL_regendp = prog->endp;
2192 PL_reglastparen = &prog->lastparen;
2193 PL_reglastcloseparen = &prog->lastcloseparen;
2194 prog->lastparen = 0;
2195 prog->lastcloseparen = 0;
2197 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2198 if (PL_reg_start_tmpl <= prog->nparens) {
2199 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2200 if(PL_reg_start_tmp)
2201 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2203 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2206 /* XXXX What this code is doing here?!!! There should be no need
2207 to do this again and again, PL_reglastparen should take care of
2210 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2211 * Actually, the code in regcppop() (which Ilya may be meaning by
2212 * PL_reglastparen), is not needed at all by the test suite
2213 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2214 * enough, for building DynaLoader, or otherwise this
2215 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2216 * will happen. Meanwhile, this code *is* needed for the
2217 * above-mentioned test suite tests to succeed. The common theme
2218 * on those tests seems to be returning null fields from matches.
2223 if (prog->nparens) {
2224 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2231 if (regmatch(prog->program + 1)) {
2232 prog->endp[0] = PL_reginput - PL_bostr;
2235 REGCP_UNWIND(lastcp);
2239 #define RE_UNWIND_BRANCH 1
2240 #define RE_UNWIND_BRANCHJ 2
2244 typedef struct { /* XX: makes sense to enlarge it... */
2248 } re_unwind_generic_t;
2261 } re_unwind_branch_t;
2263 typedef union re_unwind_t {
2265 re_unwind_generic_t generic;
2266 re_unwind_branch_t branch;
2269 #define sayYES goto yes
2270 #define sayNO goto no
2271 #define sayNO_ANYOF goto no_anyof
2272 #define sayYES_FINAL goto yes_final
2273 #define sayYES_LOUD goto yes_loud
2274 #define sayNO_FINAL goto no_final
2275 #define sayNO_SILENT goto do_no
2276 #define saySAME(x) if (x) goto yes; else goto no
2278 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2279 #define POSCACHE_SEEN 1 /* we know what we're caching */
2280 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2281 #define CACHEsayYES STMT_START { \
2282 if (cache_offset | cache_bit) { \
2283 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2284 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2285 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2286 /* cache records failure, but this is success */ \
2288 PerlIO_printf(Perl_debug_log, \
2289 "%*s (remove success from failure cache)\n", \
2290 REPORT_CODE_OFF+PL_regindent*2, "") \
2292 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2297 #define CACHEsayNO STMT_START { \
2298 if (cache_offset | cache_bit) { \
2299 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2300 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2301 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2302 /* cache records success, but this is failure */ \
2304 PerlIO_printf(Perl_debug_log, \
2305 "%*s (remove failure from success cache)\n", \
2306 REPORT_CODE_OFF+PL_regindent*2, "") \
2308 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2314 /* this is used to determine how far from the left messages like
2315 'failed...' are printed. Currently 29 makes these messages line
2316 up with the opcode they refer to. Earlier perls used 25 which
2317 left these messages outdented making reviewing a debug output
2320 #define REPORT_CODE_OFF 29
2323 /* Make sure there is a test for this +1 options in re_tests */
2324 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2326 #define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2327 if ( trie->states[ state ].wordnum ) { \
2328 if ( !accepted ) { \
2331 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2332 sv_accept_buff=NEWSV( 1234, \
2333 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2334 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2335 SvPOK_on( sv_accept_buff ); \
2336 sv_2mortal( sv_accept_buff ); \
2337 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2339 if ( accepted >= bufflen ) { \
2341 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2342 bufflen * sizeof(reg_trie_accepted) ); \
2344 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2345 + sizeof( reg_trie_accepted ) ); \
2347 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2348 accept_buff[ accepted ].endpos = uc; \
2352 #define TRIE_HANDLE_CHAR STMT_START { \
2353 if ( uvc < 256 ) { \
2354 charid = trie->charmap[ uvc ]; \
2357 if( trie->widecharmap ) { \
2358 SV** svpp = (SV**)NULL; \
2359 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2360 sizeof( UV ), 0 ); \
2362 charid = (U16)SvIV( *svpp ); \
2367 ( base + charid > trie->uniquecharcount ) && \
2368 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
2369 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2371 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2379 - regmatch - main matching routine
2381 * Conceptually the strategy is simple: check to see whether the current
2382 * node matches, call self recursively to see whether the rest matches,
2383 * and then act accordingly. In practice we make some effort to avoid
2384 * recursion, in particular by going through "ordinary" nodes (that don't
2385 * need to know whether the rest of the match failed) by a loop instead of
2388 /* [lwall] I've hoisted the register declarations to the outer block in order to
2389 * maybe save a little bit of pushing and popping on the stack. It also takes
2390 * advantage of machines that use a register save mask on subroutine entry.
2392 STATIC I32 /* 0 failure, 1 success */
2393 S_regmatch(pTHX_ regnode *prog)
2396 register regnode *scan; /* Current node. */
2397 regnode *next; /* Next node. */
2398 regnode *inner; /* Next node in internal branch. */
2399 register I32 nextchr; /* renamed nextchr - nextchar colides with
2400 function of same name */
2401 register I32 n; /* no or next */
2402 register I32 ln = 0; /* len or last */
2403 register char *s = Nullch; /* operand or save */
2404 register char *locinput = PL_reginput;
2405 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2406 int minmod = 0, sw = 0, logical = 0;
2409 /* used by the trie code */
2410 SV *sv_accept_buff = NULL; /* accepting states we have traversed */
2411 reg_trie_accepted *accept_buff = NULL; /* "" */
2412 reg_trie_data *trie; /* what trie are we using right now */
2413 U32 accepted = 0; /* how many accepting states we have seen*/
2416 I32 firstcp = PL_savestack_ix;
2418 register const bool do_utf8 = PL_reg_match_utf8;
2420 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2421 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2422 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2424 SV *re_debug_flags = NULL;
2426 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
2435 /* Note that nextchr is a byte even in UTF */
2436 nextchr = UCHARAT(locinput);
2438 while (scan != NULL) {
2441 SV *prop = sv_newmortal();
2442 const int docolor = *PL_colors[0];
2443 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2444 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2445 /* The part of the string before starttry has one color
2446 (pref0_len chars), between starttry and current
2447 position another one (pref_len - pref0_len chars),
2448 after the current position the third one.
2449 We assume that pref0_len <= pref_len, otherwise we
2450 decrease pref0_len. */
2451 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2452 ? (5 + taill) - l : locinput - PL_bostr;
2455 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2457 pref0_len = pref_len - (locinput - PL_reg_starttry);
2458 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2459 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2460 ? (5 + taill) - pref_len : PL_regeol - locinput);
2461 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2465 if (pref0_len > pref_len)
2466 pref0_len = pref_len;
2467 regprop(prop, scan);
2469 const char * const s0 =
2470 do_utf8 && OP(scan) != CANY ?
2471 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2472 pref0_len, 60, UNI_DISPLAY_REGEX) :
2473 locinput - pref_len;
2474 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2475 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2476 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2477 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2478 locinput - pref_len + pref0_len;
2479 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2480 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2481 pv_uni_display(dsv2, (U8*)locinput,
2482 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2484 const int len2 = do_utf8 ? strlen(s2) : l;
2485 PerlIO_printf(Perl_debug_log,
2486 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2487 (IV)(locinput - PL_bostr),
2494 (docolor ? "" : "> <"),
2498 15 - l - pref_len + 1,
2500 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2505 next = scan + NEXT_OFF(scan);
2511 if (locinput == PL_bostr)
2513 /* regtill = regbol; */
2518 if (locinput == PL_bostr ||
2519 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2525 if (locinput == PL_bostr)
2529 if (locinput == PL_reg_ganch)
2535 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2540 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2542 if (PL_regeol - locinput > 1)
2546 if (PL_regeol != locinput)
2550 if (!nextchr && locinput >= PL_regeol)
2553 locinput += PL_utf8skip[nextchr];
2554 if (locinput > PL_regeol)
2556 nextchr = UCHARAT(locinput);
2559 nextchr = UCHARAT(++locinput);
2562 if (!nextchr && locinput >= PL_regeol)
2564 nextchr = UCHARAT(++locinput);
2567 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2570 locinput += PL_utf8skip[nextchr];
2571 if (locinput > PL_regeol)
2573 nextchr = UCHARAT(locinput);
2576 nextchr = UCHARAT(++locinput);
2582 traverse the TRIE keeping track of all accepting states
2583 we transition through until we get to a failing node.
2585 we use two slightly different pieces of code to handle
2586 the traversal depending on whether its case sensitive or
2587 not. we reuse the accept code however. (this should probably
2588 be turned into a macro.)
2594 U8 *uc = ( U8* )locinput;
2601 U8 *uscan = (U8*)NULL;
2605 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2607 while ( state && uc <= (U8*)PL_regeol ) {
2609 TRIE_CHECK_STATE_IS_ACCEPTING;
2611 base = trie->states[ state ].trans.base;
2613 DEBUG_TRIE_EXECUTE_r(
2614 PerlIO_printf( Perl_debug_log,
2615 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2616 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2617 (UV)state, (UV)base, (UV)accepted );
2624 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2629 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2630 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2631 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2632 foldlen -= UNISKIP( uvc );
2633 uscan = foldbuf + UNISKIP( uvc );
2645 DEBUG_TRIE_EXECUTE_r(
2646 PerlIO_printf( Perl_debug_log,
2647 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2648 charid, uvc, (UV)state, PL_colors[5] );
2657 /* unreached codepoint: we jump into the middle of the next case
2658 from previous if blocks */
2661 U8 *uc = (U8*)locinput;
2670 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2672 while ( state && uc <= (U8*)PL_regeol ) {
2674 TRIE_CHECK_STATE_IS_ACCEPTING;
2676 base = trie->states[ state ].trans.base;
2678 DEBUG_TRIE_EXECUTE_r(
2679 PerlIO_printf( Perl_debug_log,
2680 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2681 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2682 (UV)state, (UV)base, (UV)accepted );
2688 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2699 DEBUG_TRIE_EXECUTE_r(
2700 PerlIO_printf( Perl_debug_log,
2701 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2702 charid, uvc, (UV)state, PL_colors[5] );
2712 There was at least one accepting state that we
2713 transitioned through. Presumably the number of accepting
2714 states is going to be low, typically one or two. So we
2715 simply scan through to find the one with lowest wordnum.
2716 Once we find it, we swap the last state into its place
2717 and decrement the size. We then try to match the rest of
2718 the pattern at the point where the word ends, if we
2719 succeed then we end the loop, otherwise the loop
2720 eventually terminates once all of the accepting states
2727 if ( accepted == 1 ) {
2729 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2730 PerlIO_printf( Perl_debug_log,
2731 "%*s %sonly one match : #%d <%s>%s\n",
2732 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2733 accept_buff[ 0 ].wordnum,
2734 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2737 PL_reginput = (char *)accept_buff[ 0 ].endpos;
2738 /* in this case we free tmps/leave before we call regmatch
2739 as we wont be using accept_buff again. */
2742 gotit = regmatch( scan + NEXT_OFF( scan ) );
2745 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2746 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
2749 while ( !gotit && accepted-- ) {
2752 for( cur = 1 ; cur <= accepted ; cur++ ) {
2753 DEBUG_TRIE_EXECUTE_r(
2754 PerlIO_printf( Perl_debug_log,
2755 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2756 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2757 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2758 accept_buff[ cur ].wordnum, PL_colors[5] );
2761 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2765 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2766 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2767 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2768 accept_buff[best].wordnum,
2769 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
2772 if ( best<accepted ) {
2773 reg_trie_accepted tmp = accept_buff[ best ];
2774 accept_buff[ best ] = accept_buff[ accepted ];
2775 accept_buff[ accepted ] = tmp;
2778 PL_reginput = (char *)accept_buff[ best ].endpos;
2781 as far as I can tell we only need the SAVETMPS/FREETMPS
2782 for re's with EVAL in them but I'm leaving them in for
2783 all until I can be sure.
2786 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2799 /* unreached codepoint */
2803 if (do_utf8 != UTF) {
2804 /* The target and the pattern have differing utf8ness. */
2806 const char *e = s + ln;
2809 /* The target is utf8, the pattern is not utf8. */
2814 if (NATIVE_TO_UNI(*(U8*)s) !=
2815 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2823 /* The target is not utf8, the pattern is utf8. */
2828 if (NATIVE_TO_UNI(*((U8*)l)) !=
2829 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2837 nextchr = UCHARAT(locinput);
2840 /* The target and the pattern have the same utf8ness. */
2841 /* Inline the first character, for speed. */
2842 if (UCHARAT(s) != nextchr)
2844 if (PL_regeol - locinput < ln)
2846 if (ln > 1 && memNE(s, locinput, ln))
2849 nextchr = UCHARAT(locinput);
2852 PL_reg_flags |= RF_tainted;
2858 if (do_utf8 || UTF) {
2859 /* Either target or the pattern are utf8. */
2861 char *e = PL_regeol;
2863 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2864 l, &e, 0, do_utf8)) {
2865 /* One more case for the sharp s:
2866 * pack("U0U*", 0xDF) =~ /ss/i,
2867 * the 0xC3 0x9F are the UTF-8
2868 * byte sequence for the U+00DF. */
2870 toLOWER(s[0]) == 's' &&
2872 toLOWER(s[1]) == 's' &&
2879 nextchr = UCHARAT(locinput);
2883 /* Neither the target and the pattern are utf8. */
2885 /* Inline the first character, for speed. */
2886 if (UCHARAT(s) != nextchr &&
2887 UCHARAT(s) != ((OP(scan) == EXACTF)
2888 ? PL_fold : PL_fold_locale)[nextchr])
2890 if (PL_regeol - locinput < ln)
2892 if (ln > 1 && (OP(scan) == EXACTF
2893 ? ibcmp(s, locinput, ln)
2894 : ibcmp_locale(s, locinput, ln)))
2897 nextchr = UCHARAT(locinput);
2901 STRLEN inclasslen = PL_regeol - locinput;
2903 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2905 if (locinput >= PL_regeol)
2907 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2908 nextchr = UCHARAT(locinput);
2913 nextchr = UCHARAT(locinput);
2914 if (!REGINCLASS(scan, (U8*)locinput))
2916 if (!nextchr && locinput >= PL_regeol)
2918 nextchr = UCHARAT(++locinput);
2922 /* If we might have the case of the German sharp s
2923 * in a casefolding Unicode character class. */
2925 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2926 locinput += SHARP_S_SKIP;
2927 nextchr = UCHARAT(locinput);
2933 PL_reg_flags |= RF_tainted;
2939 LOAD_UTF8_CHARCLASS_ALNUM();
2940 if (!(OP(scan) == ALNUM
2941 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2942 : isALNUM_LC_utf8((U8*)locinput)))
2946 locinput += PL_utf8skip[nextchr];
2947 nextchr = UCHARAT(locinput);
2950 if (!(OP(scan) == ALNUM
2951 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2953 nextchr = UCHARAT(++locinput);
2956 PL_reg_flags |= RF_tainted;
2959 if (!nextchr && locinput >= PL_regeol)
2962 LOAD_UTF8_CHARCLASS_ALNUM();
2963 if (OP(scan) == NALNUM
2964 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2965 : isALNUM_LC_utf8((U8*)locinput))
2969 locinput += PL_utf8skip[nextchr];
2970 nextchr = UCHARAT(locinput);
2973 if (OP(scan) == NALNUM
2974 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2976 nextchr = UCHARAT(++locinput);
2980 PL_reg_flags |= RF_tainted;
2984 /* was last char in word? */
2986 if (locinput == PL_bostr)
2989 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2991 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2993 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2994 ln = isALNUM_uni(ln);
2995 LOAD_UTF8_CHARCLASS_ALNUM();
2996 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2999 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3000 n = isALNUM_LC_utf8((U8*)locinput);
3004 ln = (locinput != PL_bostr) ?
3005 UCHARAT(locinput - 1) : '\n';
3006 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3008 n = isALNUM(nextchr);
3011 ln = isALNUM_LC(ln);
3012 n = isALNUM_LC(nextchr);
3015 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3016 OP(scan) == BOUNDL))
3020 PL_reg_flags |= RF_tainted;
3026 if (UTF8_IS_CONTINUED(nextchr)) {
3027 LOAD_UTF8_CHARCLASS_SPACE();
3028 if (!(OP(scan) == SPACE
3029 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3030 : isSPACE_LC_utf8((U8*)locinput)))
3034 locinput += PL_utf8skip[nextchr];
3035 nextchr = UCHARAT(locinput);
3038 if (!(OP(scan) == SPACE
3039 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3041 nextchr = UCHARAT(++locinput);
3044 if (!(OP(scan) == SPACE
3045 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3047 nextchr = UCHARAT(++locinput);
3051 PL_reg_flags |= RF_tainted;
3054 if (!nextchr && locinput >= PL_regeol)
3057 LOAD_UTF8_CHARCLASS_SPACE();
3058 if (OP(scan) == NSPACE
3059 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3060 : isSPACE_LC_utf8((U8*)locinput))
3064 locinput += PL_utf8skip[nextchr];
3065 nextchr = UCHARAT(locinput);
3068 if (OP(scan) == NSPACE
3069 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3071 nextchr = UCHARAT(++locinput);
3074 PL_reg_flags |= RF_tainted;
3080 LOAD_UTF8_CHARCLASS_DIGIT();
3081 if (!(OP(scan) == DIGIT
3082 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3083 : isDIGIT_LC_utf8((U8*)locinput)))
3087 locinput += PL_utf8skip[nextchr];
3088 nextchr = UCHARAT(locinput);
3091 if (!(OP(scan) == DIGIT
3092 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3094 nextchr = UCHARAT(++locinput);
3097 PL_reg_flags |= RF_tainted;
3100 if (!nextchr && locinput >= PL_regeol)
3103 LOAD_UTF8_CHARCLASS_DIGIT();
3104 if (OP(scan) == NDIGIT
3105 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3106 : isDIGIT_LC_utf8((U8*)locinput))
3110 locinput += PL_utf8skip[nextchr];
3111 nextchr = UCHARAT(locinput);
3114 if (OP(scan) == NDIGIT
3115 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3117 nextchr = UCHARAT(++locinput);
3120 if (locinput >= PL_regeol)
3123 LOAD_UTF8_CHARCLASS_MARK();
3124 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3126 locinput += PL_utf8skip[nextchr];
3127 while (locinput < PL_regeol &&
3128 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3129 locinput += UTF8SKIP(locinput);
3130 if (locinput > PL_regeol)
3135 nextchr = UCHARAT(locinput);
3138 PL_reg_flags |= RF_tainted;
3142 n = ARG(scan); /* which paren pair */
3143 ln = PL_regstartp[n];
3144 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3145 if ((I32)*PL_reglastparen < n || ln == -1)
3146 sayNO; /* Do not match unless seen CLOSEn. */
3147 if (ln == PL_regendp[n])
3151 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3153 const char *e = PL_bostr + PL_regendp[n];
3155 * Note that we can't do the "other character" lookup trick as
3156 * in the 8-bit case (no pun intended) because in Unicode we
3157 * have to map both upper and title case to lower case.
3159 if (OP(scan) == REFF) {
3161 STRLEN ulen1, ulen2;
3162 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3163 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3167 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3168 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3169 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3176 nextchr = UCHARAT(locinput);
3180 /* Inline the first character, for speed. */
3181 if (UCHARAT(s) != nextchr &&
3183 (UCHARAT(s) != ((OP(scan) == REFF
3184 ? PL_fold : PL_fold_locale)[nextchr]))))
3186 ln = PL_regendp[n] - ln;
3187 if (locinput + ln > PL_regeol)
3189 if (ln > 1 && (OP(scan) == REF
3190 ? memNE(s, locinput, ln)
3192 ? ibcmp(s, locinput, ln)
3193 : ibcmp_locale(s, locinput, ln))))
3196 nextchr = UCHARAT(locinput);
3207 OP_4tree *oop = PL_op;
3208 COP *ocurcop = PL_curcop;
3211 struct regexp *oreg = PL_reg_re;
3214 PL_op = (OP_4tree*)PL_regdata->data[n];
3215 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3216 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
3217 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3221 CALLRUNOPS(aTHX); /* Scalar context. */
3224 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3232 PAD_RESTORE_LOCAL(old_comppad);
3233 PL_curcop = ocurcop;
3235 if (logical == 2) { /* Postponed subexpression. */
3237 MAGIC *mg = Null(MAGIC*);
3239 CHECKPOINT cp, lastcp;
3243 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3244 mg = mg_find(sv, PERL_MAGIC_qr);
3245 else if (SvSMAGICAL(ret)) {
3246 if (SvGMAGICAL(ret))
3247 sv_unmagic(ret, PERL_MAGIC_qr);
3249 mg = mg_find(ret, PERL_MAGIC_qr);
3253 re = (regexp *)mg->mg_obj;
3254 (void)ReREFCNT_inc(re);
3258 const char *t = SvPV_const(ret, len);
3260 char * const oprecomp = PL_regprecomp;
3261 const I32 osize = PL_regsize;
3262 const I32 onpar = PL_regnpar;
3265 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3266 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3268 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3270 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3272 PL_regprecomp = oprecomp;
3277 PerlIO_printf(Perl_debug_log,
3278 "Entering embedded \"%s%.60s%s%s\"\n",
3282 (strlen(re->precomp) > 60 ? "..." : ""))
3285 state.prev = PL_reg_call_cc;
3286 state.cc = PL_regcc;
3287 state.re = PL_reg_re;
3291 cp = regcppush(0); /* Save *all* the positions. */
3294 state.ss = PL_savestack_ix;
3295 *PL_reglastparen = 0;
3296 *PL_reglastcloseparen = 0;
3297 PL_reg_call_cc = &state;
3298 PL_reginput = locinput;
3299 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3300 ((re->reganch & ROPT_UTF8) != 0);
3301 if (toggleutf) PL_reg_flags ^= RF_utf8;
3303 /* XXXX This is too dramatic a measure... */
3306 if (regmatch(re->program + 1)) {
3307 /* Even though we succeeded, we need to restore
3308 global variables, since we may be wrapped inside
3309 SUSPEND, thus the match may be not finished yet. */
3311 /* XXXX Do this only if SUSPENDed? */
3312 PL_reg_call_cc = state.prev;
3313 PL_regcc = state.cc;
3314 PL_reg_re = state.re;
3315 cache_re(PL_reg_re);
3316 if (toggleutf) PL_reg_flags ^= RF_utf8;
3318 /* XXXX This is too dramatic a measure... */
3321 /* These are needed even if not SUSPEND. */
3327 REGCP_UNWIND(lastcp);
3329 PL_reg_call_cc = state.prev;
3330 PL_regcc = state.cc;
3331 PL_reg_re = state.re;
3332 cache_re(PL_reg_re);
3333 if (toggleutf) PL_reg_flags ^= RF_utf8;
3335 /* XXXX This is too dramatic a measure... */
3345 sv_setsv(save_scalar(PL_replgv), ret);
3351 n = ARG(scan); /* which paren pair */
3352 PL_reg_start_tmp[n] = locinput;
3357 n = ARG(scan); /* which paren pair */
3358 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3359 PL_regendp[n] = locinput - PL_bostr;
3360 if (n > (I32)*PL_reglastparen)
3361 *PL_reglastparen = n;
3362 *PL_reglastcloseparen = n;
3365 n = ARG(scan); /* which paren pair */
3366 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3369 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3371 next = NEXTOPER(NEXTOPER(scan));
3373 next = scan + ARG(scan);
3374 if (OP(next) == IFTHEN) /* Fake one. */
3375 next = NEXTOPER(NEXTOPER(next));
3379 logical = scan->flags;
3381 /*******************************************************************
3382 PL_regcc contains infoblock about the innermost (...)* loop, and
3383 a pointer to the next outer infoblock.
3385 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3387 1) After matching X, regnode for CURLYX is processed;
3389 2) This regnode creates infoblock on the stack, and calls
3390 regmatch() recursively with the starting point at WHILEM node;
3392 3) Each hit of WHILEM node tries to match A and Z (in the order
3393 depending on the current iteration, min/max of {min,max} and
3394 greediness). The information about where are nodes for "A"
3395 and "Z" is read from the infoblock, as is info on how many times "A"
3396 was already matched, and greediness.
3398 4) After A matches, the same WHILEM node is hit again.
3400 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3401 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3402 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3403 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3404 of the external loop.
3406 Currently present infoblocks form a tree with a stem formed by PL_curcc
3407 and whatever it mentions via ->next, and additional attached trees
3408 corresponding to temporarily unset infoblocks as in "5" above.
3410 In the following picture infoblocks for outer loop of
3411 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3412 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3413 infoblocks are drawn below the "reset" infoblock.
3415 In fact in the picture below we do not show failed matches for Z and T
3416 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3417 more obvious *why* one needs to *temporary* unset infoblocks.]
3419 Matched REx position InfoBlocks Comment
3423 Y A)*?Z)*?T x <- O <- I
3424 YA )*?Z)*?T x <- O <- I
3425 YA A)*?Z)*?T x <- O <- I
3426 YAA )*?Z)*?T x <- O <- I
3427 YAA Z)*?T x <- O # Temporary unset I
3430 YAAZ Y(A)*?Z)*?T x <- O
3433 YAAZY (A)*?Z)*?T x <- O
3436 YAAZY A)*?Z)*?T x <- O <- I
3439 YAAZYA )*?Z)*?T x <- O <- I
3442 YAAZYA Z)*?T x <- O # Temporary unset I
3448 YAAZYAZ T x # Temporary unset O
3455 *******************************************************************/
3458 CHECKPOINT cp = PL_savestack_ix;
3459 /* No need to save/restore up to this paren */
3460 I32 parenfloor = scan->flags;
3462 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3464 cc.oldcc = PL_regcc;
3466 /* XXXX Probably it is better to teach regpush to support
3467 parenfloor > PL_regsize... */
3468 if (parenfloor > (I32)*PL_reglastparen)
3469 parenfloor = *PL_reglastparen; /* Pessimization... */
3470 cc.parenfloor = parenfloor;
3472 cc.min = ARG1(scan);
3473 cc.max = ARG2(scan);
3474 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3478 PL_reginput = locinput;
3479 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3481 PL_regcc = cc.oldcc;
3487 * This is really hard to understand, because after we match
3488 * what we're trying to match, we must make sure the rest of
3489 * the REx is going to match for sure, and to do that we have
3490 * to go back UP the parse tree by recursing ever deeper. And
3491 * if it fails, we have to reset our parent's current state
3492 * that we can try again after backing off.
3495 CHECKPOINT cp, lastcp;
3496 CURCUR* cc = PL_regcc;
3497 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3498 I32 cache_offset = 0, cache_bit = 0;
3500 n = cc->cur + 1; /* how many we know we matched */
3501 PL_reginput = locinput;
3504 PerlIO_printf(Perl_debug_log,
3505 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3506 REPORT_CODE_OFF+PL_regindent*2, "",
3507 (long)n, (long)cc->min,
3508 (long)cc->max, PTR2UV(cc))
3511 /* If degenerate scan matches "", assume scan done. */
3513 if (locinput == cc->lastloc && n >= cc->min) {
3514 PL_regcc = cc->oldcc;
3518 PerlIO_printf(Perl_debug_log,
3519 "%*s empty match detected, try continuation...\n",
3520 REPORT_CODE_OFF+PL_regindent*2, "")
3522 if (regmatch(cc->next))
3530 /* First just match a string of min scans. */
3534 cc->lastloc = locinput;
3535 if (regmatch(cc->scan))
3538 cc->lastloc = lastloc;
3543 /* Check whether we already were at this position.
3544 Postpone detection until we know the match is not
3545 *that* much linear. */
3546 if (!PL_reg_maxiter) {
3547 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3548 PL_reg_leftiter = PL_reg_maxiter;
3550 if (PL_reg_leftiter-- == 0) {
3551 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3552 if (PL_reg_poscache) {
3553 if ((I32)PL_reg_poscache_size < size) {
3554 Renew(PL_reg_poscache, size, char);
3555 PL_reg_poscache_size = size;
3557 Zero(PL_reg_poscache, size, char);
3560 PL_reg_poscache_size = size;
3561 Newxz(PL_reg_poscache, size, char);
3564 PerlIO_printf(Perl_debug_log,
3565 "%sDetected a super-linear match, switching on caching%s...\n",
3566 PL_colors[4], PL_colors[5])
3569 if (PL_reg_leftiter < 0) {
3570 cache_offset = locinput - PL_bostr;
3572 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3573 + cache_offset * (scan->flags>>4);
3574 cache_bit = cache_offset % 8;
3576 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3578 PerlIO_printf(Perl_debug_log,
3579 "%*s already tried at this position...\n",
3580 REPORT_CODE_OFF+PL_regindent*2, "")
3582 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3583 /* cache records success */
3586 /* cache records failure */
3589 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3593 /* Prefer next over scan for minimal matching. */
3596 PL_regcc = cc->oldcc;
3599 cp = regcppush(cc->parenfloor);
3601 if (regmatch(cc->next)) {
3603 CACHEsayYES; /* All done. */
3605 REGCP_UNWIND(lastcp);
3611 if (n >= cc->max) { /* Maximum greed exceeded? */
3612 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3613 && !(PL_reg_flags & RF_warned)) {
3614 PL_reg_flags |= RF_warned;
3615 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3616 "Complex regular subexpression recursion",
3623 PerlIO_printf(Perl_debug_log,
3624 "%*s trying longer...\n",
3625 REPORT_CODE_OFF+PL_regindent*2, "")
3627 /* Try scanning more and see if it helps. */
3628 PL_reginput = locinput;
3630 cc->lastloc = locinput;
3631 cp = regcppush(cc->parenfloor);
3633 if (regmatch(cc->scan)) {
3637 REGCP_UNWIND(lastcp);
3640 cc->lastloc = lastloc;
3644 /* Prefer scan over next for maximal matching. */
3646 if (n < cc->max) { /* More greed allowed? */
3647 cp = regcppush(cc->parenfloor);
3649 cc->lastloc = locinput;
3651 if (regmatch(cc->scan)) {
3655 REGCP_UNWIND(lastcp);
3656 regcppop(); /* Restore some previous $<digit>s? */
3657 PL_reginput = locinput;
3659 PerlIO_printf(Perl_debug_log,
3660 "%*s failed, try continuation...\n",
3661 REPORT_CODE_OFF+PL_regindent*2, "")
3664 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3665 && !(PL_reg_flags & RF_warned)) {
3666 PL_reg_flags |= RF_warned;
3667 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3668 "Complex regular subexpression recursion",
3672 /* Failed deeper matches of scan, so see if this one works. */
3673 PL_regcc = cc->oldcc;
3676 if (regmatch(cc->next))
3682 cc->lastloc = lastloc;
3687 next = scan + ARG(scan);
3690 inner = NEXTOPER(NEXTOPER(scan));
3693 inner = NEXTOPER(scan);
3697 if (OP(next) != c1) /* No choice. */
3698 next = inner; /* Avoid recursion. */
3700 const I32 lastparen = *PL_reglastparen;
3702 re_unwind_branch_t *uw;
3704 /* Put unwinding data on stack */
3705 unwind1 = SSNEWt(1,re_unwind_branch_t);
3706 uw = SSPTRt(unwind1,re_unwind_branch_t);
3709 uw->type = ((c1 == BRANCH)
3711 : RE_UNWIND_BRANCHJ);
3712 uw->lastparen = lastparen;
3714 uw->locinput = locinput;
3715 uw->nextchr = nextchr;
3717 uw->regindent = ++PL_regindent;
3720 REGCP_SET(uw->lastcp);
3722 /* Now go into the first branch */
3735 /* We suppose that the next guy does not need
3736 backtracking: in particular, it is of constant non-zero length,
3737 and has no parenths to influence future backrefs. */
3738 ln = ARG1(scan); /* min to match */
3739 n = ARG2(scan); /* max to match */
3740 paren = scan->flags;
3742 if (paren > PL_regsize)
3744 if (paren > (I32)*PL_reglastparen)
3745 *PL_reglastparen = paren;
3747 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3749 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3750 PL_reginput = locinput;
3753 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3755 locinput = PL_reginput;
3756 if (HAS_TEXT(next) || JUMPABLE(next)) {
3757 regnode *text_node = next;
3759 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3761 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3763 if (PL_regkind[(U8)OP(text_node)] == REF) {
3767 else { c1 = (U8)*STRING(text_node); }
3768 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3770 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3771 c2 = PL_fold_locale[c1];
3780 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3781 /* If it could work, try it. */
3783 UCHARAT(PL_reginput) == c1 ||
3784 UCHARAT(PL_reginput) == c2)
3788 PL_regstartp[paren] =
3789 HOPc(PL_reginput, -l) - PL_bostr;
3790 PL_regendp[paren] = PL_reginput - PL_bostr;
3793 PL_regendp[paren] = -1;
3797 REGCP_UNWIND(lastcp);
3799 /* Couldn't or didn't -- move forward. */
3800 PL_reginput = locinput;
3801 if (regrepeat_hard(scan, 1, &l)) {
3803 locinput = PL_reginput;
3810 n = regrepeat_hard(scan, n, &l);
3811 locinput = PL_reginput;
3813 PerlIO_printf(Perl_debug_log,
3814 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3815 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3819 if (HAS_TEXT(next) || JUMPABLE(next)) {
3820 regnode *text_node = next;
3822 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3824 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3826 if (PL_regkind[(U8)OP(text_node)] == REF) {
3830 else { c1 = (U8)*STRING(text_node); }
3832 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3834 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3835 c2 = PL_fold_locale[c1];
3846 /* If it could work, try it. */
3848 UCHARAT(PL_reginput) == c1 ||
3849 UCHARAT(PL_reginput) == c2)
3852 PerlIO_printf(Perl_debug_log,
3853 "%*s trying tail with n=%"IVdf"...\n",
3854 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3858 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3859 PL_regendp[paren] = PL_reginput - PL_bostr;
3862 PL_regendp[paren] = -1;
3866 REGCP_UNWIND(lastcp);
3868 /* Couldn't or didn't -- back up. */
3870 locinput = HOPc(locinput, -l);
3871 PL_reginput = locinput;
3878 paren = scan->flags; /* Which paren to set */
3879 if (paren > PL_regsize)
3881 if (paren > (I32)*PL_reglastparen)
3882 *PL_reglastparen = paren;
3883 ln = ARG1(scan); /* min to match */
3884 n = ARG2(scan); /* max to match */
3885 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3889 ln = ARG1(scan); /* min to match */
3890 n = ARG2(scan); /* max to match */
3891 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3896 scan = NEXTOPER(scan);
3902 scan = NEXTOPER(scan);
3906 * Lookahead to avoid useless match attempts
3907 * when we know what character comes next.
3911 * Used to only do .*x and .*?x, but now it allows
3912 * for )'s, ('s and (?{ ... })'s to be in the way
3913 * of the quantifier and the EXACT-like node. -- japhy
3916 if (HAS_TEXT(next) || JUMPABLE(next)) {
3918 regnode *text_node = next;
3920 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3922 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3924 if (PL_regkind[(U8)OP(text_node)] == REF) {
3926 goto assume_ok_easy;
3928 else { s = (U8*)STRING(text_node); }
3932 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3934 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3935 c2 = PL_fold_locale[c1];
3938 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3939 STRLEN ulen1, ulen2;
3940 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3941 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3943 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3944 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3946 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3948 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3952 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3961 PL_reginput = locinput;
3965 if (ln && regrepeat(scan, ln) < ln)
3967 locinput = PL_reginput;
3970 char *e; /* Should not check after this */
3971 char *old = locinput;
3974 if (n == REG_INFTY) {
3977 while (UTF8_IS_CONTINUATION(*(U8*)e))
3983 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3987 e = locinput + n - ln;
3992 /* Find place 'next' could work */
3995 while (locinput <= e &&
3996 UCHARAT(locinput) != c1)
3999 while (locinput <= e
4000 && UCHARAT(locinput) != c1
4001 && UCHARAT(locinput) != c2)
4004 count = locinput - old;
4009 /* count initialised to
4010 * utf8_distance(old, locinput) */
4011 while (locinput <= e &&
4012 utf8n_to_uvchr((U8*)locinput,
4013 UTF8_MAXBYTES, &len,
4014 uniflags) != (UV)c1) {
4020 /* count initialised to
4021 * utf8_distance(old, locinput) */
4022 while (locinput <= e) {
4023 UV c = utf8n_to_uvchr((U8*)locinput,
4024 UTF8_MAXBYTES, &len,
4026 if (c == (UV)c1 || c == (UV)c2)
4035 /* PL_reginput == old now */
4036 if (locinput != old) {
4037 ln = 1; /* Did some */
4038 if (regrepeat(scan, count) < count)
4041 /* PL_reginput == locinput now */
4042 TRYPAREN(paren, ln, locinput);
4043 PL_reginput = locinput; /* Could be reset... */
4044 REGCP_UNWIND(lastcp);
4045 /* Couldn't or didn't -- move forward. */
4048 locinput += UTF8SKIP(locinput);
4055 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
4059 c = utf8n_to_uvchr((U8*)PL_reginput,
4063 c = UCHARAT(PL_reginput);
4064 /* If it could work, try it. */
4065 if (c == (UV)c1 || c == (UV)c2)
4067 TRYPAREN(paren, ln, PL_reginput);
4068 REGCP_UNWIND(lastcp);
4071 /* If it could work, try it. */
4072 else if (c1 == -1000)
4074 TRYPAREN(paren, ln, PL_reginput);
4075 REGCP_UNWIND(lastcp);
4077 /* Couldn't or didn't -- move forward. */
4078 PL_reginput = locinput;
4079 if (regrepeat(scan, 1)) {
4081 locinput = PL_reginput;
4089 n = regrepeat(scan, n);
4090 locinput = PL_reginput;
4091 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
4092 (OP(next) != MEOL ||
4093 OP(next) == SEOL || OP(next) == EOS))
4095 ln = n; /* why back off? */
4096 /* ...because $ and \Z can match before *and* after
4097 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4098 We should back off by one in this case. */
4099 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4108 c = utf8n_to_uvchr((U8*)PL_reginput,
4112 c = UCHARAT(PL_reginput);
4114 /* If it could work, try it. */
4115 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
4117 TRYPAREN(paren, n, PL_reginput);
4118 REGCP_UNWIND(lastcp);
4120 /* Couldn't or didn't -- back up. */
4122 PL_reginput = locinput = HOPc(locinput, -1);
4130 c = utf8n_to_uvchr((U8*)PL_reginput,
4134 c = UCHARAT(PL_reginput);
4136 /* If it could work, try it. */
4137 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
4139 TRYPAREN(paren, n, PL_reginput);
4140 REGCP_UNWIND(lastcp);
4142 /* Couldn't or didn't -- back up. */
4144 PL_reginput = locinput = HOPc(locinput, -1);
4151 if (PL_reg_call_cc) {
4152 re_cc_state *cur_call_cc = PL_reg_call_cc;
4153 CURCUR *cctmp = PL_regcc;
4154 regexp *re = PL_reg_re;
4155 CHECKPOINT cp, lastcp;
4157 cp = regcppush(0); /* Save *all* the positions. */
4159 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
4161 PL_reginput = locinput; /* Make position available to
4163 cache_re(PL_reg_call_cc->re);
4164 PL_regcc = PL_reg_call_cc->cc;
4165 PL_reg_call_cc = PL_reg_call_cc->prev;
4166 if (regmatch(cur_call_cc->node)) {
4167 PL_reg_call_cc = cur_call_cc;
4171 REGCP_UNWIND(lastcp);
4173 PL_reg_call_cc = cur_call_cc;
4179 PerlIO_printf(Perl_debug_log,
4180 "%*s continuation failed...\n",
4181 REPORT_CODE_OFF+PL_regindent*2, "")
4185 if (locinput < PL_regtill) {
4186 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4187 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4189 (long)(locinput - PL_reg_starttry),
4190 (long)(PL_regtill - PL_reg_starttry),
4192 sayNO_FINAL; /* Cannot match: too short. */
4194 PL_reginput = locinput; /* put where regtry can find it */
4195 sayYES_FINAL; /* Success! */
4197 PL_reginput = locinput; /* put where regtry can find it */
4198 sayYES_LOUD; /* Success! */
4201 PL_reginput = locinput;
4206 s = HOPBACKc(locinput, scan->flags);
4212 PL_reginput = locinput;
4217 s = HOPBACKc(locinput, scan->flags);
4223 PL_reginput = locinput;
4226 inner = NEXTOPER(NEXTOPER(scan));
4227 if (regmatch(inner) != n) {
4242 if (OP(scan) == SUSPEND) {
4243 locinput = PL_reginput;
4244 nextchr = UCHARAT(locinput);
4249 next = scan + ARG(scan);
4254 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4255 PTR2UV(scan), OP(scan));
4256 Perl_croak(aTHX_ "regexp memory corruption");
4263 * We get here only if there's trouble -- normally "case END" is
4264 * the terminating point.
4266 Perl_croak(aTHX_ "corrupted regexp pointers");
4272 PerlIO_printf(Perl_debug_log,
4273 "%*s %scould match...%s\n",
4274 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4278 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4279 PL_colors[4], PL_colors[5]));
4285 #if 0 /* Breaks $^R */
4293 PerlIO_printf(Perl_debug_log,
4294 "%*s %sfailed...%s\n",
4295 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4301 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
4304 case RE_UNWIND_BRANCH:
4305 case RE_UNWIND_BRANCHJ:
4307 re_unwind_branch_t *uwb = &(uw->branch);
4308 const I32 lastparen = uwb->lastparen;
4310 REGCP_UNWIND(uwb->lastcp);
4311 for (n = *PL_reglastparen; n > lastparen; n--)
4313 *PL_reglastparen = n;
4314 scan = next = uwb->next;
4316 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
4317 ? BRANCH : BRANCHJ) ) { /* Failure */
4324 /* Have more choice yet. Reuse the same uwb. */
4325 if ((n = (uwb->type == RE_UNWIND_BRANCH
4326 ? NEXT_OFF(next) : ARG(next))))
4329 next = NULL; /* XXXX Needn't unwinding in this case... */
4331 next = NEXTOPER(scan);
4332 if (uwb->type == RE_UNWIND_BRANCHJ)
4333 next = NEXTOPER(next);
4334 locinput = uwb->locinput;
4335 nextchr = uwb->nextchr;
4337 PL_regindent = uwb->regindent;
4344 Perl_croak(aTHX_ "regexp unwind memory corruption");
4355 - regrepeat - repeatedly match something simple, report how many
4358 * [This routine now assumes that it will only match on things of length 1.
4359 * That was true before, but now we assume scan - reginput is the count,
4360 * rather than incrementing count on every character. [Er, except utf8.]]
4363 S_regrepeat(pTHX_ const regnode *p, I32 max)
4366 register char *scan;
4368 register char *loceol = PL_regeol;
4369 register I32 hardcount = 0;
4370 register bool do_utf8 = PL_reg_match_utf8;
4373 if (max == REG_INFTY)
4375 else if (max < loceol - scan)
4376 loceol = scan + max;
4381 while (scan < loceol && hardcount < max && *scan != '\n') {
4382 scan += UTF8SKIP(scan);
4386 while (scan < loceol && *scan != '\n')
4393 while (scan < loceol && hardcount < max) {
4394 scan += UTF8SKIP(scan);
4404 case EXACT: /* length of string is 1 */
4406 while (scan < loceol && UCHARAT(scan) == c)
4409 case EXACTF: /* length of string is 1 */
4411 while (scan < loceol &&
4412 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4415 case EXACTFL: /* length of string is 1 */
4416 PL_reg_flags |= RF_tainted;
4418 while (scan < loceol &&
4419 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4425 while (hardcount < max && scan < loceol &&
4426 reginclass(p, (U8*)scan, 0, do_utf8)) {
4427 scan += UTF8SKIP(scan);
4431 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4438 LOAD_UTF8_CHARCLASS_ALNUM();
4439 while (hardcount < max && scan < loceol &&
4440 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4441 scan += UTF8SKIP(scan);
4445 while (scan < loceol && isALNUM(*scan))
4450 PL_reg_flags |= RF_tainted;
4453 while (hardcount < max && scan < loceol &&
4454 isALNUM_LC_utf8((U8*)scan)) {
4455 scan += UTF8SKIP(scan);
4459 while (scan < loceol && isALNUM_LC(*scan))
4466 LOAD_UTF8_CHARCLASS_ALNUM();
4467 while (hardcount < max && scan < loceol &&
4468 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4469 scan += UTF8SKIP(scan);
4473 while (scan < loceol && !isALNUM(*scan))
4478 PL_reg_flags |= RF_tainted;
4481 while (hardcount < max && scan < loceol &&
4482 !isALNUM_LC_utf8((U8*)scan)) {
4483 scan += UTF8SKIP(scan);
4487 while (scan < loceol && !isALNUM_LC(*scan))
4494 LOAD_UTF8_CHARCLASS_SPACE();
4495 while (hardcount < max && scan < loceol &&
4497 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4498 scan += UTF8SKIP(scan);
4502 while (scan < loceol && isSPACE(*scan))
4507 PL_reg_flags |= RF_tainted;
4510 while (hardcount < max && scan < loceol &&
4511 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4512 scan += UTF8SKIP(scan);
4516 while (scan < loceol && isSPACE_LC(*scan))
4523 LOAD_UTF8_CHARCLASS_SPACE();
4524 while (hardcount < max && scan < loceol &&
4526 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4527 scan += UTF8SKIP(scan);
4531 while (scan < loceol && !isSPACE(*scan))
4536 PL_reg_flags |= RF_tainted;
4539 while (hardcount < max && scan < loceol &&
4540 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4541 scan += UTF8SKIP(scan);
4545 while (scan < loceol && !isSPACE_LC(*scan))
4552 LOAD_UTF8_CHARCLASS_DIGIT();
4553 while (hardcount < max && scan < loceol &&
4554 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4555 scan += UTF8SKIP(scan);
4559 while (scan < loceol && isDIGIT(*scan))
4566 LOAD_UTF8_CHARCLASS_DIGIT();
4567 while (hardcount < max && scan < loceol &&
4568 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4569 scan += UTF8SKIP(scan);
4573 while (scan < loceol && !isDIGIT(*scan))
4577 default: /* Called on something of 0 width. */
4578 break; /* So match right here or not at all. */
4584 c = scan - PL_reginput;
4588 SV *re_debug_flags = NULL;
4589 SV *prop = sv_newmortal();
4593 PerlIO_printf(Perl_debug_log,
4594 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4595 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4603 - regrepeat_hard - repeatedly match something, report total lenth and length
4605 * The repeater is supposed to have constant non-zero length.
4609 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4612 register char *scan = Nullch;
4613 register char *start;
4614 register char *loceol = PL_regeol;
4616 I32 count = 0, res = 1;
4621 start = PL_reginput;
4622 if (PL_reg_match_utf8) {
4623 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4626 while (start < PL_reginput) {
4628 start += UTF8SKIP(start);
4639 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4641 *lp = l = PL_reginput - start;
4642 if (max != REG_INFTY && l*max < loceol - scan)
4643 loceol = scan + l*max;
4656 - regclass_swash - prepare the utf8 swash
4660 Perl_regclass_swash(pTHX_ register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4667 if (PL_regdata && PL_regdata->count) {
4668 const U32 n = ARG(node);
4670 if (PL_regdata->what[n] == 's') {
4671 SV * const rv = (SV*)PL_regdata->data[n];
4672 AV * const av = (AV*)SvRV((SV*)rv);
4673 SV **const ary = AvARRAY(av);
4676 /* See the end of regcomp.c:S_regclass() for
4677 * documentation of these array elements. */
4680 a = SvROK(ary[1]) ? &ary[1] : 0;
4681 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4685 else if (si && doinit) {
4686 sw = swash_init("utf8", "", si, 1, 0);
4687 (void)av_store(av, 1, sw);
4703 - reginclass - determine if a character falls into a character class
4705 The n is the ANYOF regnode, the p is the target string, lenp
4706 is pointer to the maximum length of how far to go in the p
4707 (if the lenp is zero, UTF8SKIP(p) is used),
4708 do_utf8 tells whether the target string is in UTF-8.
4713 S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4716 const char flags = ANYOF_FLAGS(n);
4722 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4723 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4724 ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
4725 UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
4726 if (len == (STRLEN)-1)
4727 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4730 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4731 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4734 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4735 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4738 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4742 SV * const sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4745 if (swash_fetch(sw, p, do_utf8))
4747 else if (flags & ANYOF_FOLD) {
4748 if (!match && lenp && av) {
4750 for (i = 0; i <= av_len(av); i++) {
4751 SV* const sv = *av_fetch(av, i, FALSE);
4753 const char * const s = SvPV_const(sv, len);
4755 if (len <= plen && memEQ(s, (char*)p, len)) {
4763 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4766 to_utf8_fold(p, tmpbuf, &tmplen);
4767 if (swash_fetch(sw, tmpbuf, do_utf8))
4773 if (match && lenp && *lenp == 0)
4774 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4776 if (!match && c < 256) {
4777 if (ANYOF_BITMAP_TEST(n, c))
4779 else if (flags & ANYOF_FOLD) {
4782 if (flags & ANYOF_LOCALE) {
4783 PL_reg_flags |= RF_tainted;
4784 f = PL_fold_locale[c];
4788 if (f != c && ANYOF_BITMAP_TEST(n, f))
4792 if (!match && (flags & ANYOF_CLASS)) {
4793 PL_reg_flags |= RF_tainted;
4795 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4796 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4797 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4798 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4799 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4800 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4801 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4802 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4803 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4804 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4805 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4806 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4807 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4808 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4809 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4810 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4811 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4812 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4813 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4814 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4815 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4816 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4817 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4818 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4819 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4820 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4821 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4822 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4823 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4824 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4825 ) /* How's that for a conditional? */
4832 return (flags & ANYOF_INVERT) ? !match : match;
4836 S_reghop(pTHX_ U8 *s, I32 off)
4839 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4843 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4847 while (off-- && s < lim) {
4848 /* XXX could check well-formedness here */
4856 if (UTF8_IS_CONTINUED(*s)) {
4857 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4860 /* XXX could check well-formedness here */
4868 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4871 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4875 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4879 while (off-- && s < lim) {
4880 /* XXX could check well-formedness here */
4890 if (UTF8_IS_CONTINUED(*s)) {
4891 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4894 /* XXX could check well-formedness here */
4906 restore_pos(pTHX_ void *arg)
4909 PERL_UNUSED_ARG(arg);
4910 if (PL_reg_eval_set) {
4911 if (PL_reg_oldsaved) {
4912 PL_reg_re->subbeg = PL_reg_oldsaved;
4913 PL_reg_re->sublen = PL_reg_oldsavedlen;
4914 #ifdef PERL_OLD_COPY_ON_WRITE
4915 PL_reg_re->saved_copy = PL_nrs;
4917 RX_MATCH_COPIED_on(PL_reg_re);
4919 PL_reg_magic->mg_len = PL_reg_oldpos;
4920 PL_reg_eval_set = 0;
4921 PL_curpm = PL_reg_oldcurpm;
4926 S_to_utf8_substr(pTHX_ register regexp *prog)
4928 if (prog->float_substr && !prog->float_utf8) {
4930 prog->float_utf8 = sv = newSVsv(prog->float_substr);
4931 sv_utf8_upgrade(sv);
4932 if (SvTAIL(prog->float_substr))
4934 if (prog->float_substr == prog->check_substr)
4935 prog->check_utf8 = sv;
4937 if (prog->anchored_substr && !prog->anchored_utf8) {
4939 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
4940 sv_utf8_upgrade(sv);
4941 if (SvTAIL(prog->anchored_substr))
4943 if (prog->anchored_substr == prog->check_substr)
4944 prog->check_utf8 = sv;
4949 S_to_byte_substr(pTHX_ register regexp *prog)
4952 if (prog->float_utf8 && !prog->float_substr) {
4954 prog->float_substr = sv = newSVsv(prog->float_utf8);
4955 if (sv_utf8_downgrade(sv, TRUE)) {
4956 if (SvTAIL(prog->float_utf8))
4960 prog->float_substr = sv = &PL_sv_undef;
4962 if (prog->float_utf8 == prog->check_utf8)
4963 prog->check_substr = sv;
4965 if (prog->anchored_utf8 && !prog->anchored_substr) {
4967 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
4968 if (sv_utf8_downgrade(sv, TRUE)) {
4969 if (SvTAIL(prog->anchored_utf8))
4973 prog->anchored_substr = sv = &PL_sv_undef;
4975 if (prog->anchored_utf8 == prog->check_utf8)
4976 prog->check_substr = sv;
4982 * c-indentation-style: bsd
4984 * indent-tabs-mode: t
4987 * ex: set ts=8 sts=4 sw=4 noet: