5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
56 **** Alterations to Henry's code are...
58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
69 #define PERL_IN_REGEXEC_C
72 #ifdef PERL_IN_XSUB_RE
78 #define RF_tainted 1 /* tainted information used? */
79 #define RF_warned 2 /* warned about big count? */
81 #define RF_utf8 8 /* Pattern contains multibyte chars? */
83 #define UTF ((PL_reg_flags & RF_utf8) != 0)
85 #define RS_init 1 /* eval environment created */
86 #define RS_set 2 /* replsv value is set */
92 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
98 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
99 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
101 #define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
105 #define HOPBACKc(pos, off) \
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
112 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
113 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
115 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
122 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124 /* for use after a quantifier and before an EXACT-like node -- japhy */
125 #define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
132 #define HAS_TEXT(rn) ( \
133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140 #define FIND_NEXT_IMPT(rn) STMT_START { \
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
144 rn = NEXTOPER(NEXTOPER(rn)); \
145 else if (type == PLUS) \
147 else if (type == IFMATCH) \
148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
149 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 8
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHPTR(PL_regstartp);
186 SSPUSHPTR(PL_regendp);
187 SSPUSHINT(PL_regsize);
188 SSPUSHINT(*PL_reglastparen);
189 SSPUSHINT(*PL_reglastcloseparen);
190 SSPUSHPTR(PL_reginput);
191 #define REGCP_FRAME_ELEMS 2
192 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
193 * are needed for the regexp context stack bookkeeping. */
194 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
195 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
200 /* These are needed since we do not localize EVAL nodes: */
201 #define REGCP_SET(cp) \
203 PerlIO_printf(Perl_debug_log, \
204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
205 (IV)PL_savestack_ix)); \
208 #define REGCP_UNWIND(cp) \
210 if (cp != PL_savestack_ix) \
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
213 (IV)(cp), (IV)PL_savestack_ix)); \
217 S_regcppop(pTHX_ const regexp *rex)
223 GET_RE_DEBUG_FLAGS_DECL;
225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
229 input = (char *) SSPOPPTR;
230 *PL_reglastcloseparen = SSPOPINT;
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
233 PL_regendp=(I32 *) SSPOPPTR;
234 PL_regstartp=(I32 *) SSPOPPTR;
237 /* Now restore the parentheses context. */
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
241 U32 paren = (U32)SSPOPINT;
242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
243 PL_regstartp[paren] = SSPOPINT;
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
248 PerlIO_printf(Perl_debug_log,
249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
250 (UV)paren, (IV)PL_regstartp[paren],
251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
252 (IV)PL_regendp[paren],
253 (paren > *PL_reglastparen ? "(no)" : ""));
257 if (*PL_reglastparen + 1 <= rex->nparens) {
258 PerlIO_printf(Perl_debug_log,
259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
260 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
276 PL_regstartp[i] = -1;
283 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
286 * pregexec and friends
289 #ifndef PERL_IN_XSUB_RE
291 - pregexec - match a regexp against a string
294 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
295 char *strbeg, I32 minend, SV *screamer, U32 nosave)
296 /* strend: pointer to null at end of string */
297 /* strbeg: real beginning of string */
298 /* minend: end of match must be >=minend after stringarg. */
299 /* nosave: For optimizations. */
302 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
303 nosave ? 0 : REXEC_COPY_STR);
308 * Need to implement the following flags for reg_anch:
310 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
313 * INTUIT_AUTORITATIVE_ML
314 * INTUIT_ONCE_NOML - Intuit can match in one location only.
317 * Another flag for this function: SECOND_TIME (so that float substrs
318 * with giant delta may be not rechecked).
321 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
324 Otherwise, only SvCUR(sv) is used to get strbeg. */
326 /* XXXX We assume that strpos is strbeg unless sv. */
328 /* XXXX Some places assume that there is a fixed substring.
329 An update may be needed if optimizer marks as "INTUITable"
330 RExen without fixed substrings. Similarly, it is assumed that
331 lengths of all the strings are no more than minlen, thus they
332 cannot come from lookahead.
333 (Or minlen should take into account lookahead.) */
335 /* A failure to find a constant substring means that there is no need to make
336 an expensive call to REx engine, thus we celebrate a failure. Similarly,
337 finding a substring too deep into the string means that less calls to
338 regtry() should be needed.
340 REx compiler's optimizer found 4 possible hints:
341 a) Anchored substring;
343 c) Whether we are anchored (beginning-of-line or \G);
344 d) First node (of those at offset 0) which may distingush positions;
345 We use a)b)d) and multiline-part of c), and try to find a position in the
346 string which does not contradict any of them.
349 /* Most of decisions we do here should have been done at compile time.
350 The nodes of the REx which we used for the search should have been
351 deleted from the finite automaton. */
354 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
355 char *strend, U32 flags, re_scream_pos_data *data)
358 register I32 start_shift = 0;
359 /* Should be nonnegative! */
360 register I32 end_shift = 0;
365 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
367 register char *other_last = NULL; /* other substr checked before this */
368 char *check_at = NULL; /* check substr found at this pos */
369 const I32 multiline = prog->reganch & PMf_MULTILINE;
371 const char * const i_strpos = strpos;
374 GET_RE_DEBUG_FLAGS_DECL;
376 RX_MATCH_UTF8_set(prog,do_utf8);
378 if (prog->reganch & ROPT_UTF8) {
379 PL_reg_flags |= RF_utf8;
382 debug_start_match(prog, do_utf8, strpos, strend,
383 sv ? "Guessing start of match in sv for"
384 : "Guessing start of match in string for");
387 /* CHR_DIST() would be more correct here but it makes things slow. */
388 if (prog->minlen > strend - strpos) {
389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
390 "String too short... [re_intuit_start]\n"));
394 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
397 if (!prog->check_utf8 && prog->check_substr)
398 to_utf8_substr(prog);
399 check = prog->check_utf8;
401 if (!prog->check_substr && prog->check_utf8)
402 to_byte_substr(prog);
403 check = prog->check_substr;
405 if (check == &PL_sv_undef) {
406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
407 "Non-utf8 string cannot match utf8 check string\n"));
410 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
411 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
412 || ( (prog->reganch & ROPT_ANCH_BOL)
413 && !multiline ) ); /* Check after \n? */
416 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
417 | ROPT_IMPLICIT)) /* not a real BOL */
418 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
420 && (strpos != strbeg)) {
421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
424 if (prog->check_offset_min == prog->check_offset_max &&
425 !(prog->reganch & ROPT_CANY_SEEN)) {
426 /* Substring at constant offset from beg-of-str... */
429 s = HOP3c(strpos, prog->check_offset_min, strend);
432 slen = SvCUR(check); /* >= 1 */
434 if ( strend - s > slen || strend - s < slen - 1
435 || (strend - s == slen && strend[-1] != '\n')) {
436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
439 /* Now should match s[0..slen-2] */
441 if (slen && (*SvPVX_const(check) != *s
443 && memNE(SvPVX_const(check), s, slen)))) {
445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
449 else if (*SvPVX_const(check) != *s
450 || ((slen = SvCUR(check)) > 1
451 && memNE(SvPVX_const(check), s, slen)))
454 goto success_at_start;
457 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
459 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
460 end_shift = prog->check_end_shift;
463 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
464 - (SvTAIL(check) != 0);
465 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
467 if (end_shift < eshift)
471 else { /* Can match at random position */
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->check_end_shift;
477 /* end shift should be non negative here */
480 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
482 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
483 (IV)end_shift, prog->precomp);
487 /* Find a possible match in the region s..strend by looking for
488 the "check" substring in the region corrected by start/end_shift. */
491 I32 srch_start_shift = start_shift;
492 I32 srch_end_shift = end_shift;
493 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
494 srch_end_shift -= ((strbeg - s) - srch_start_shift);
495 srch_start_shift = strbeg - s;
497 DEBUG_OPTIMISE_MORE_r({
498 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
499 (IV)prog->check_offset_min,
500 (IV)srch_start_shift,
502 (IV)prog->check_end_shift);
505 if (flags & REXEC_SCREAM) {
506 I32 p = -1; /* Internal iterator of scream. */
507 I32 * const pp = data ? data->scream_pos : &p;
509 if (PL_screamfirst[BmRARE(check)] >= 0
510 || ( BmRARE(check) == '\n'
511 && (BmPREVIOUS(check) == SvCUR(check) - 1)
513 s = screaminstr(sv, check,
514 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
517 /* we may be pointing at the wrong string */
518 if (s && RX_MATCH_COPIED(prog))
519 s = strbeg + (s - SvPVX_const(sv));
521 *data->scream_olds = s;
526 if (prog->reganch & ROPT_CANY_SEEN) {
527 start_point= (U8*)(s + srch_start_shift);
528 end_point= (U8*)(strend - srch_end_shift);
530 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
531 end_point= HOP3(strend, -srch_end_shift, strbeg);
533 DEBUG_OPTIMISE_MORE_r({
534 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
535 (int)(end_point - start_point),
536 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
540 s = fbm_instr( start_point, end_point,
541 check, multiline ? FBMrf_MULTILINE : 0);
544 /* Update the count-of-usability, remove useless subpatterns,
548 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
549 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
550 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
551 (s ? "Found" : "Did not find"),
552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
553 ? "anchored" : "floating"),
556 (s ? " at offset " : "...\n") );
561 /* Finish the diagnostic message */
562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
564 /* XXX dmq: first branch is for positive lookbehind...
565 Our check string is offset from the beginning of the pattern.
566 So we need to do any stclass tests offset forward from that
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
578 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
584 : (prog->float_substr && prog->anchored_substr))
586 /* Take into account the "other" substring. */
587 /* XXXX May be hopelessly wrong for UTF... */
590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
593 char * const last = HOP3c(s, -start_shift, strbeg);
595 char * const saved_s = s;
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
606 t = HOP3c(t, prog->anchored_offset, strend);
607 if (t < other_last) /* These positions already checked */
609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
612 /* XXXX It is not documented what units *_offsets are in.
613 We assume bytes, but this is clearly wrong.
614 Meaning this code needs to be carefully reviewed for errors.
618 /* On end-of-str: see comment below. */
619 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
620 if (must == &PL_sv_undef) {
622 DEBUG_r(must = prog->anchored_utf8); /* for debug */
627 HOP3(HOP3(last1, prog->anchored_offset, strend)
628 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
630 multiline ? FBMrf_MULTILINE : 0
633 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
634 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
635 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
636 (s ? "Found" : "Contradicts"),
637 quoted, RE_SV_TAIL(must));
642 if (last1 >= last2) {
643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
644 ", giving up...\n"));
647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
648 ", trying floating at offset %ld...\n",
649 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
650 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
651 s = HOP3c(last, 1, strend);
655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
656 (long)(s - i_strpos)));
657 t = HOP3c(s, -prog->anchored_offset, strbeg);
658 other_last = HOP3c(s, 1, strend);
666 else { /* Take into account the floating substring. */
668 char * const saved_s = s;
671 t = HOP3c(s, -start_shift, strbeg);
673 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
674 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
675 last = HOP3c(t, prog->float_max_offset, strend);
676 s = HOP3c(t, prog->float_min_offset, strend);
679 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
680 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
681 /* fbm_instr() takes into account exact value of end-of-str
682 if the check is SvTAIL(ed). Since false positives are OK,
683 and end-of-str is not later than strend we are OK. */
684 if (must == &PL_sv_undef) {
686 DEBUG_r(must = prog->float_utf8); /* for debug message */
689 s = fbm_instr((unsigned char*)s,
690 (unsigned char*)last + SvCUR(must)
692 must, multiline ? FBMrf_MULTILINE : 0);
694 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
695 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
696 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
697 (s ? "Found" : "Contradicts"),
698 quoted, RE_SV_TAIL(must));
702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
703 ", giving up...\n"));
706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
707 ", trying anchored starting at offset %ld...\n",
708 (long)(saved_s + 1 - i_strpos)));
710 s = HOP3c(t, 1, strend);
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
715 (long)(s - i_strpos)));
716 other_last = s; /* Fix this later. --Hugo */
726 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
728 DEBUG_OPTIMISE_MORE_r(
729 PerlIO_printf(Perl_debug_log,
730 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
731 (IV)prog->check_offset_min,
732 (IV)prog->check_offset_max,
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
748 if (ml_anch && t[-1] != '\n') {
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
756 while (t < strend - prog->minlen) {
758 if (t < check_at - prog->check_offset_min) {
759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
769 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
770 goto do_other_anchored;
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
776 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
783 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
784 other_last = strpos = s = t + 1;
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
790 PL_colors[0], PL_colors[1]));
794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
795 PL_colors[0], PL_colors[1]));
799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
802 /* The found string does not prohibit matching at strpos,
803 - no optimization of calling REx engine can be performed,
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
810 && (strpos != strbeg) && strpos[-1] != '\n'
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
817 DEBUG_EXECUTE_r( if (ml_anch)
818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
819 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
833 /* If flags & SOMETHING - do not do it many times on the same match */
834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
838 prog->check_substr = prog->check_utf8 = NULL; /* disable */
839 prog->float_substr = prog->float_utf8 = NULL; /* clear */
840 check = NULL; /* abort */
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
845 prog->reganch &= ~RE_USE_INTUIT;
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
853 /* trie stclasses are too expensive to use here, we are better off to
854 leave it to regmatch itself */
855 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
864 const U8* const str = (U8*)STRING(prog->regstclass);
865 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
869 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
871 else if (prog->float_substr || prog->float_utf8)
872 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
877 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
880 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
883 const char *what = NULL;
885 if (endpos == strend) {
886 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
887 "Could not match STCLASS...\n") );
890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
891 "This position contradicts STCLASS...\n") );
892 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
894 /* Contradict one of substrings */
895 if (prog->anchored_substr || prog->anchored_utf8) {
896 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
897 DEBUG_EXECUTE_r( what = "anchored" );
899 s = HOP3c(t, 1, strend);
900 if (s + start_shift + end_shift > strend) {
901 /* XXXX Should be taken into account earlier? */
902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
903 "Could not match STCLASS...\n") );
908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
909 "Looking for %s substr starting at offset %ld...\n",
910 what, (long)(s + start_shift - i_strpos)) );
913 /* Have both, check_string is floating */
914 if (t + start_shift >= check_at) /* Contradicts floating=check */
915 goto retry_floating_check;
916 /* Recheck anchored substring, but not floating... */
920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
921 "Looking for anchored substr starting at offset %ld...\n",
922 (long)(other_last - i_strpos)) );
923 goto do_other_anchored;
925 /* Another way we could have checked stclass at the
926 current position only: */
931 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
932 "Looking for /%s^%s/m starting at offset %ld...\n",
933 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
936 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
938 /* Check is floating subtring. */
939 retry_floating_check:
940 t = check_at - start_shift;
941 DEBUG_EXECUTE_r( what = "floating" );
942 goto hop_and_restart;
945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
946 "By STCLASS: moving %ld --> %ld\n",
947 (long)(t - i_strpos), (long)(s - i_strpos))
951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
952 "Does not contradict STCLASS...\n");
957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
958 PL_colors[4], (check ? "Guessed" : "Giving up"),
959 PL_colors[5], (long)(s - i_strpos)) );
962 fail_finish: /* Substring not found */
963 if (prog->check_substr || prog->check_utf8) /* could be removed already */
964 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
967 PL_colors[4], PL_colors[5]));
973 #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
974 foldlen, foldbuf, uniflags) STMT_START { \
975 switch (trie_type) { \
976 case trie_utf8_fold: \
978 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
983 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
985 foldlen -= UNISKIP( uvc ); \
986 uscan = foldbuf + UNISKIP( uvc ); \
990 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
998 charid = trie->charmap[ uvc ]; \
1002 if (trie->widecharmap) { \
1003 SV** const svpp = hv_fetch(trie->widecharmap, \
1004 (char*)&uvc, sizeof(UV), 0); \
1006 charid = (U16)SvIV(*svpp); \
1011 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1014 ibcmp_utf8(s, NULL, 0, do_utf8, \
1015 m, NULL, ln, (bool)UTF)) \
1016 && (!reginfo || regtry(reginfo, s)) ) \
1019 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1020 uvchr_to_utf8(tmpbuf, c); \
1021 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1023 && (f == c1 || f == c2) \
1024 && (ln == foldlen || \
1025 !ibcmp_utf8((char *) foldbuf, \
1026 NULL, foldlen, do_utf8, \
1028 NULL, ln, (bool)UTF)) \
1029 && (!reginfo || regtry(reginfo, s)) ) \
1034 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1038 && (ln == 1 || !(OP(c) == EXACTF \
1040 : ibcmp_locale(s, m, ln))) \
1041 && (!reginfo || regtry(reginfo, s)) ) \
1047 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1049 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1055 #define REXEC_FBC_SCAN(CoDe) \
1057 while (s < strend) { \
1063 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1064 REXEC_FBC_UTF8_SCAN( \
1066 if (tmp && (!reginfo || regtry(reginfo, s))) \
1075 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1078 if (tmp && (!reginfo || regtry(reginfo, s))) \
1087 #define REXEC_FBC_TRYIT \
1088 if ((!reginfo || regtry(reginfo, s))) \
1091 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1094 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1097 REXEC_FBC_CLASS_SCAN(CoNd); \
1101 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1102 PL_reg_flags |= RF_tainted; \
1104 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1107 REXEC_FBC_CLASS_SCAN(CoNd); \
1111 #define DUMP_EXEC_POS(li,s,doutf8) \
1112 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1114 /* We know what class REx starts with. Try to find this position... */
1115 /* if reginfo is NULL, its a dryrun */
1116 /* annoyingly all the vars in this routine have different names from their counterparts
1117 in regmatch. /grrr */
1120 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1121 const char *strend, const regmatch_info *reginfo)
1124 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1128 register STRLEN uskip;
1132 register I32 tmp = 1; /* Scratch variable? */
1133 register const bool do_utf8 = PL_reg_match_utf8;
1135 /* We know what class it must start with. */
1139 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1140 !UTF8_IS_INVARIANT((U8)s[0]) ?
1141 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1142 REGINCLASS(prog, c, (U8*)s));
1145 while (s < strend) {
1148 if (REGINCLASS(prog, c, (U8*)s) ||
1149 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1150 /* The assignment of 2 is intentional:
1151 * for the folded sharp s, the skip is 2. */
1152 (skip = SHARP_S_SKIP))) {
1153 if (tmp && (!reginfo || regtry(reginfo, s)))
1166 if (tmp && (!reginfo || regtry(reginfo, s)))
1174 ln = STR_LEN(c); /* length to match in octets/bytes */
1175 lnc = (I32) ln; /* length to match in characters */
1177 STRLEN ulen1, ulen2;
1179 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1180 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1181 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1183 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1184 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1186 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1188 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1191 while (sm < ((U8 *) m + ln)) {
1206 c2 = PL_fold_locale[c1];
1208 e = HOP3c(strend, -((I32)lnc), s);
1210 if (!reginfo && e < s)
1211 e = s; /* Due to minlen logic of intuit() */
1213 /* The idea in the EXACTF* cases is to first find the
1214 * first character of the EXACTF* node and then, if
1215 * necessary, case-insensitively compare the full
1216 * text of the node. The c1 and c2 are the first
1217 * characters (though in Unicode it gets a bit
1218 * more complicated because there are more cases
1219 * than just upper and lower: one needs to use
1220 * the so-called folding case for case-insensitive
1221 * matching (called "loose matching" in Unicode).
1222 * ibcmp_utf8() will do just that. */
1226 U8 tmpbuf [UTF8_MAXBYTES+1];
1227 STRLEN len, foldlen;
1228 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1230 /* Upper and lower of 1st char are equal -
1231 * probably not a "letter". */
1233 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1235 REXEC_FBC_EXACTISH_CHECK(c == c1);
1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1243 /* Handle some of the three Greek sigmas cases.
1244 * Note that not all the possible combinations
1245 * are handled here: some of them are handled
1246 * by the standard folding rules, and some of
1247 * them (the character class or ANYOF cases)
1248 * are handled during compiletime in
1249 * regexec.c:S_regclass(). */
1250 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1251 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1252 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1254 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1260 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1262 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1266 PL_reg_flags |= RF_tainted;
1273 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1274 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1276 tmp = ((OP(c) == BOUND ?
1277 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1278 LOAD_UTF8_CHARCLASS_ALNUM();
1279 REXEC_FBC_UTF8_SCAN(
1280 if (tmp == !(OP(c) == BOUND ?
1281 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1282 isALNUM_LC_utf8((U8*)s)))
1290 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1291 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1294 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1300 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1304 PL_reg_flags |= RF_tainted;
1311 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1312 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1314 tmp = ((OP(c) == NBOUND ?
1315 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1316 LOAD_UTF8_CHARCLASS_ALNUM();
1317 REXEC_FBC_UTF8_SCAN(
1318 if (tmp == !(OP(c) == NBOUND ?
1319 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1320 isALNUM_LC_utf8((U8*)s)))
1322 else REXEC_FBC_TRYIT;
1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1327 tmp = ((OP(c) == NBOUND ?
1328 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1331 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1333 else REXEC_FBC_TRYIT;
1336 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1340 REXEC_FBC_CSCAN_PRELOAD(
1341 LOAD_UTF8_CHARCLASS_ALNUM(),
1342 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1346 REXEC_FBC_CSCAN_TAINT(
1347 isALNUM_LC_utf8((U8*)s),
1351 REXEC_FBC_CSCAN_PRELOAD(
1352 LOAD_UTF8_CHARCLASS_ALNUM(),
1353 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1357 REXEC_FBC_CSCAN_TAINT(
1358 !isALNUM_LC_utf8((U8*)s),
1362 REXEC_FBC_CSCAN_PRELOAD(
1363 LOAD_UTF8_CHARCLASS_SPACE(),
1364 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1368 REXEC_FBC_CSCAN_TAINT(
1369 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1373 REXEC_FBC_CSCAN_PRELOAD(
1374 LOAD_UTF8_CHARCLASS_SPACE(),
1375 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1379 REXEC_FBC_CSCAN_TAINT(
1380 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1384 REXEC_FBC_CSCAN_PRELOAD(
1385 LOAD_UTF8_CHARCLASS_DIGIT(),
1386 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1390 REXEC_FBC_CSCAN_TAINT(
1391 isDIGIT_LC_utf8((U8*)s),
1395 REXEC_FBC_CSCAN_PRELOAD(
1396 LOAD_UTF8_CHARCLASS_DIGIT(),
1397 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1401 REXEC_FBC_CSCAN_TAINT(
1402 !isDIGIT_LC_utf8((U8*)s),
1408 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1409 trie_type = do_utf8 ?
1410 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1412 /* what trie are we using right now */
1414 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1415 reg_trie_data *trie=aho->trie;
1417 const char *last_start = strend - trie->minlen;
1419 const char *real_start = s;
1421 STRLEN maxlen = trie->maxlen;
1423 U8 **points; /* map of where we were in the input string
1424 when reading a given char. For ASCII this
1425 is unnecessary overhead as the relationship
1426 is always 1:1, but for unicode, especially
1427 case folded unicode this is not true. */
1428 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1432 GET_RE_DEBUG_FLAGS_DECL;
1434 /* We can't just allocate points here. We need to wrap it in
1435 * an SV so it gets freed properly if there is a croak while
1436 * running the match */
1439 sv_points=newSV(maxlen * sizeof(U8 *));
1440 SvCUR_set(sv_points,
1441 maxlen * sizeof(U8 *));
1442 SvPOK_on(sv_points);
1443 sv_2mortal(sv_points);
1444 points=(U8**)SvPV_nolen(sv_points );
1445 if ( trie_type != trie_utf8_fold
1446 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1449 bitmap=(U8*)trie->bitmap;
1451 bitmap=(U8*)ANYOF_BITMAP(c);
1453 /* this is the Aho-Corasick algorithm modified a touch
1454 to include special handling for long "unknown char"
1455 sequences. The basic idea being that we use AC as long
1456 as we are dealing with a possible matching char, when
1457 we encounter an unknown char (and we have not encountered
1458 an accepting state) we scan forward until we find a legal
1460 AC matching is basically that of trie matching, except
1461 that when we encounter a failing transition, we fall back
1462 to the current states "fail state", and try the current char
1463 again, a process we repeat until we reach the root state,
1464 state 1, or a legal transition. If we fail on the root state
1465 then we can either terminate if we have reached an accepting
1466 state previously, or restart the entire process from the beginning
1470 while (s <= last_start) {
1471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1479 U8 *uscan = (U8*)NULL;
1480 U8 *leftmost = NULL;
1482 U32 accepted_word= 0;
1486 while ( state && uc <= (U8*)strend ) {
1488 U32 word = aho->states[ state ].wordnum;
1492 DEBUG_TRIE_EXECUTE_r(
1493 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1494 dump_exec_pos( (char *)uc, c, strend, real_start,
1495 (char *)uc, do_utf8 );
1496 PerlIO_printf( Perl_debug_log,
1497 " Scanning for legal start char...\n");
1500 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1505 if (uc >(U8*)last_start) break;
1509 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1510 if (!leftmost || lpos < leftmost) {
1511 DEBUG_r(accepted_word=word);
1517 points[pointpos++ % maxlen]= uc;
1518 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1519 uvc, charid, foldlen, foldbuf, uniflags);
1520 DEBUG_TRIE_EXECUTE_r({
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1523 PerlIO_printf(Perl_debug_log,
1524 " Charid:%3u CP:%4"UVxf" ",
1530 word = aho->states[ state ].wordnum;
1532 base = aho->states[ state ].trans.base;
1534 DEBUG_TRIE_EXECUTE_r({
1536 dump_exec_pos( (char *)uc, c, strend, real_start,
1538 PerlIO_printf( Perl_debug_log,
1539 "%sState: %4"UVxf", word=%"UVxf,
1540 failed ? " Fail transition to " : "",
1541 (UV)state, (UV)word);
1546 (base + charid > trie->uniquecharcount )
1547 && (base + charid - 1 - trie->uniquecharcount
1549 && trie->trans[base + charid - 1 -
1550 trie->uniquecharcount].check == state
1551 && (tmp=trie->trans[base + charid - 1 -
1552 trie->uniquecharcount ].next))
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - legal\n"));
1560 DEBUG_TRIE_EXECUTE_r(
1561 PerlIO_printf( Perl_debug_log," - fail\n"));
1563 state = aho->fail[state];
1567 /* we must be accepting here */
1568 DEBUG_TRIE_EXECUTE_r(
1569 PerlIO_printf( Perl_debug_log," - accepting\n"));
1578 if (!state) state = 1;
1581 if ( aho->states[ state ].wordnum ) {
1582 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1583 if (!leftmost || lpos < leftmost) {
1584 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1589 s = (char*)leftmost;
1590 DEBUG_TRIE_EXECUTE_r({
1592 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1593 (UV)accepted_word, s - real_start
1596 if (!reginfo || regtry(reginfo, s)) {
1602 DEBUG_TRIE_EXECUTE_r({
1603 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1606 DEBUG_TRIE_EXECUTE_r(
1607 PerlIO_printf( Perl_debug_log,"No match.\n"));
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1625 - regexec_flags - match a regexp against a string
1628 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630 /* strend: pointer to null at end of string */
1631 /* strbeg: real beginning of string */
1632 /* minend: end of match must be >=minend after stringarg. */
1633 /* data: May be used for some additional optimizations. */
1634 /* nosave: For optimizations. */
1638 register regnode *c;
1639 register char *startpos = stringarg;
1640 I32 minlen; /* must match at least this many chars */
1641 I32 dontbother = 0; /* how many characters not to try at end */
1642 I32 end_shift = 0; /* Same for the end. */ /* CC */
1643 I32 scream_pos = -1; /* Internal iterator of scream. */
1644 char *scream_olds = NULL;
1645 SV* const oreplsv = GvSV(PL_replgv);
1646 const bool do_utf8 = (bool)DO_UTF8(sv);
1649 regmatch_info reginfo; /* create some info to pass to regtry etc */
1651 GET_RE_DEBUG_FLAGS_DECL;
1653 PERL_UNUSED_ARG(data);
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1661 multiline = prog->reganch & PMf_MULTILINE;
1662 reginfo.prog = prog;
1664 RX_MATCH_UTF8_set(prog, do_utf8);
1666 debug_start_match(prog, do_utf8, startpos, strend,
1670 minlen = prog->minlen;
1672 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1674 "String too short [regexec_flags]...\n"));
1679 /* Check validity of program. */
1680 if (UCHARAT(prog->program) != REG_MAGIC) {
1681 Perl_croak(aTHX_ "corrupted regexp program");
1685 PL_reg_eval_set = 0;
1688 if (prog->reganch & ROPT_UTF8)
1689 PL_reg_flags |= RF_utf8;
1691 /* Mark beginning of line for ^ and lookbehind. */
1692 reginfo.bol = startpos; /* XXX not used ??? */
1696 /* Mark end of line for $ (and such) */
1699 /* see how far we have to get to not match where we matched before */
1700 reginfo.till = startpos+minend;
1702 /* If there is a "must appear" string, look for it. */
1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1709 reginfo.ganch = startpos;
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
1714 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1715 if (prog->reganch & ROPT_ANCH_GPOS) {
1716 if (s > reginfo.ganch)
1721 else /* pos() not defined */
1722 reginfo.ganch = strbeg;
1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1726 re_scream_pos_data d;
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1733 goto phooey; /* not present */
1739 /* Simplest case: anchored match need be tried only once. */
1740 /* [unless only anchor is BOL and multiline is set] */
1741 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1742 if (s == startpos && regtry(®info, startpos))
1744 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1745 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1750 dontbother = minlen - 1;
1751 end = HOP3c(strend, -dontbother, strbeg) - 1;
1752 /* for multiline we only have to try after newlines */
1753 if (prog->check_substr || prog->check_utf8) {
1757 if (regtry(®info, s))
1762 if (prog->reganch & RE_USE_INTUIT) {
1763 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1774 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1775 if (regtry(®info, s))
1782 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1784 /* the warning about reginfo.ganch being used without intialization
1785 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1786 and we only enter this block when the same bit is set. */
1787 if (regtry(®info, reginfo.ganch))
1792 /* Messy cases: unanchored match. */
1793 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1794 /* we have /x+whatever/ */
1795 /* it must be a one character string (XXXX Except UTF?) */
1800 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1801 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1802 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1807 DEBUG_EXECUTE_r( did_match = 1 );
1808 if (regtry(®info, s)) goto got_it;
1810 while (s < strend && *s == ch)
1818 DEBUG_EXECUTE_r( did_match = 1 );
1819 if (regtry(®info, s)) goto got_it;
1821 while (s < strend && *s == ch)
1826 DEBUG_EXECUTE_r(if (!did_match)
1827 PerlIO_printf(Perl_debug_log,
1828 "Did not find anchored character...\n")
1831 else if (prog->anchored_substr != NULL
1832 || prog->anchored_utf8 != NULL
1833 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1834 && prog->float_max_offset < strend - s)) {
1839 char *last1; /* Last position checked before */
1843 if (prog->anchored_substr || prog->anchored_utf8) {
1844 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1847 back_max = back_min = prog->anchored_offset;
1849 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1850 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1851 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1852 back_max = prog->float_max_offset;
1853 back_min = prog->float_min_offset;
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1864 last = HOP3c(strend, /* Cannot start after this */
1865 -(I32)(CHR_SVLEN(must)
1866 - (SvTAIL(must) != 0) + back_min), strbeg);
1869 last1 = HOPc(s, -1);
1871 last1 = s - 1; /* bogus */
1873 /* XXXX check_substr already used to find "s", can optimize if
1874 check_substr==must. */
1876 dontbother = end_shift;
1877 strend = HOPc(strend, -dontbother);
1878 while ( (s <= last) &&
1879 ((flags & REXEC_SCREAM)
1880 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
1881 end_shift, &scream_pos, 0))
1882 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
1883 (unsigned char*)strend, must,
1884 multiline ? FBMrf_MULTILINE : 0))) ) {
1885 /* we may be pointing at the wrong string */
1886 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1887 s = strbeg + (s - SvPVX_const(sv));
1888 DEBUG_EXECUTE_r( did_match = 1 );
1889 if (HOPc(s, -back_max) > last1) {
1890 last1 = HOPc(s, -back_min);
1891 s = HOPc(s, -back_max);
1894 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1896 last1 = HOPc(s, -back_min);
1900 while (s <= last1) {
1901 if (regtry(®info, s))
1907 while (s <= last1) {
1908 if (regtry(®info, s))
1914 DEBUG_EXECUTE_r(if (!did_match) {
1915 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1916 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1917 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
1918 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1919 ? "anchored" : "floating"),
1920 quoted, RE_SV_TAIL(must));
1924 else if ( (c = prog->regstclass) ) {
1926 const OPCODE op = OP(prog->regstclass);
1927 /* don't bother with what can't match */
1928 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
1929 strend = HOPc(strend, -(minlen - 1));
1932 SV * const prop = sv_newmortal();
1933 regprop(prog, prop, c);
1935 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1937 PerlIO_printf(Perl_debug_log,
1938 "Matching stclass %.*s against %s (%d chars)\n",
1939 (int)SvCUR(prop), SvPVX_const(prop),
1940 quoted, (int)(strend - s));
1943 if (find_byclass(prog, c, s, strend, ®info))
1945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
1949 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1954 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1955 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1956 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1958 if (flags & REXEC_SCREAM) {
1959 last = screaminstr(sv, float_real, s - strbeg,
1960 end_shift, &scream_pos, 1); /* last one */
1962 last = scream_olds; /* Only one occurrence. */
1963 /* we may be pointing at the wrong string */
1964 else if (RX_MATCH_COPIED(prog))
1965 s = strbeg + (s - SvPVX_const(sv));
1969 const char * const little = SvPV_const(float_real, len);
1971 if (SvTAIL(float_real)) {
1972 if (memEQ(strend - len + 1, little, len - 1))
1973 last = strend - len + 1;
1974 else if (!multiline)
1975 last = memEQ(strend - len, little, len)
1976 ? strend - len : NULL;
1982 last = rninstr(s, strend, little, little + len);
1984 last = strend; /* matching "$" */
1989 PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
1992 goto phooey; /* Should not happen! */
1994 dontbother = strend - last + prog->float_min_offset;
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
2002 if (regtry(®info, s))
2011 if (regtry(®info, s))
2013 } while (s++ < strend);
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2029 restore_pos(aTHX_ prog);
2032 /* make sure $`, $&, $', and $digit will work later */
2033 if ( !(flags & REXEC_NOT_FIRST) ) {
2034 RX_MATCH_COPY_FREE(prog);
2035 if (flags & REXEC_COPY_STR) {
2036 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2037 #ifdef PERL_OLD_COPY_ON_WRITE
2039 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2041 PerlIO_printf(Perl_debug_log,
2042 "Copy on write: regexp capture, type %d\n",
2045 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2046 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2047 assert (SvPOKp(prog->saved_copy));
2051 RX_MATCH_COPIED_on(prog);
2052 s = savepvn(strbeg, i);
2058 prog->subbeg = strbeg;
2059 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2067 PL_colors[4], PL_colors[5]));
2068 if (PL_reg_eval_set)
2069 restore_pos(aTHX_ prog);
2075 - regtry - try match at specific point
2077 STATIC I32 /* 0 failure, 1 success */
2078 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2084 regexp *prog = reginfo->prog;
2085 GET_RE_DEBUG_FLAGS_DECL;
2087 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2090 PL_reg_eval_set = RS_init;
2091 DEBUG_EXECUTE_r(DEBUG_s(
2092 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2093 (IV)(PL_stack_sp - PL_stack_base));
2095 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2096 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2097 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2099 /* Apparently this is not needed, judging by wantarray. */
2100 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2101 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2104 /* Make $_ available to executed code. */
2105 if (reginfo->sv != DEFSV) {
2107 DEFSV = reginfo->sv;
2110 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2111 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2112 /* prepare for quick setting of pos */
2113 #ifdef PERL_OLD_COPY_ON_WRITE
2115 sv_force_normal_flags(sv, 0);
2117 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2118 &PL_vtbl_mglob, NULL, 0);
2122 PL_reg_oldpos = mg->mg_len;
2123 SAVEDESTRUCTOR_X(restore_pos, prog);
2125 if (!PL_reg_curpm) {
2126 Newxz(PL_reg_curpm, 1, PMOP);
2129 SV* const repointer = newSViv(0);
2130 /* so we know which PL_regex_padav element is PL_reg_curpm */
2131 SvFLAGS(repointer) |= SVf_BREAK;
2132 av_push(PL_regex_padav,repointer);
2133 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2134 PL_regex_pad = AvARRAY(PL_regex_padav);
2138 PM_SETRE(PL_reg_curpm, prog);
2139 PL_reg_oldcurpm = PL_curpm;
2140 PL_curpm = PL_reg_curpm;
2141 if (RX_MATCH_COPIED(prog)) {
2142 /* Here is a serious problem: we cannot rewrite subbeg,
2143 since it may be needed if this match fails. Thus
2144 $` inside (?{}) could fail... */
2145 PL_reg_oldsaved = prog->subbeg;
2146 PL_reg_oldsavedlen = prog->sublen;
2147 #ifdef PERL_OLD_COPY_ON_WRITE
2148 PL_nrs = prog->saved_copy;
2150 RX_MATCH_COPIED_off(prog);
2153 PL_reg_oldsaved = NULL;
2154 prog->subbeg = PL_bostr;
2155 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2157 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2158 prog->startp[0] = startpos - PL_bostr;
2159 PL_reginput = startpos;
2160 PL_reglastparen = &prog->lastparen;
2161 PL_reglastcloseparen = &prog->lastcloseparen;
2162 prog->lastparen = 0;
2163 prog->lastcloseparen = 0;
2165 PL_regstartp = prog->startp;
2166 PL_regendp = prog->endp;
2167 if (PL_reg_start_tmpl <= prog->nparens) {
2168 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2169 if(PL_reg_start_tmp)
2170 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2172 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2175 /* XXXX What this code is doing here?!!! There should be no need
2176 to do this again and again, PL_reglastparen should take care of
2179 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2180 * Actually, the code in regcppop() (which Ilya may be meaning by
2181 * PL_reglastparen), is not needed at all by the test suite
2182 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2183 * enough, for building DynaLoader, or otherwise this
2184 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2185 * will happen. Meanwhile, this code *is* needed for the
2186 * above-mentioned test suite tests to succeed. The common theme
2187 * on those tests seems to be returning null fields from matches.
2192 if (prog->nparens) {
2194 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2201 if (regmatch(reginfo, prog->program + 1)) {
2202 prog->endp[0] = PL_reginput - PL_bostr;
2205 REGCP_UNWIND(lastcp);
2210 #define sayYES goto yes
2211 #define sayNO goto no
2212 #define sayNO_SILENT goto no_silent
2214 /* we dont use STMT_START/END here because it leads to
2215 "unreachable code" warnings, which are bogus, but distracting. */
2216 #define CACHEsayNO \
2217 if (ST.cache_mask) \
2218 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2221 /* this is used to determine how far from the left messages like
2222 'failed...' are printed. It should be set such that messages
2223 are inline with the regop output that created them.
2225 #define REPORT_CODE_OFF 32
2228 /* Make sure there is a test for this +1 options in re_tests */
2229 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2231 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2232 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2234 #define SLAB_FIRST(s) (&(s)->states[0])
2235 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2237 /* grab a new slab and return the first slot in it */
2239 STATIC regmatch_state *
2242 #if PERL_VERSION < 9
2245 regmatch_slab *s = PL_regmatch_slab->next;
2247 Newx(s, 1, regmatch_slab);
2248 s->prev = PL_regmatch_slab;
2250 PL_regmatch_slab->next = s;
2252 PL_regmatch_slab = s;
2253 return SLAB_FIRST(s);
2257 /* push a new state then goto it */
2259 #define PUSH_STATE_GOTO(state, node) \
2261 st->resume_state = state; \
2264 /* push a new state with success backtracking, then goto it */
2266 #define PUSH_YES_STATE_GOTO(state, node) \
2268 st->resume_state = state; \
2269 goto push_yes_state;
2274 - regmatch - main matching routine
2276 * Conceptually the strategy is simple: check to see whether the current
2277 * node matches, call self recursively to see whether the rest matches,
2278 * and then act accordingly. In practice we make some effort to avoid
2279 * recursion, in particular by going through "ordinary" nodes (that don't
2280 * need to know whether the rest of the match failed) by a loop instead of
2283 /* [lwall] I've hoisted the register declarations to the outer block in order to
2284 * maybe save a little bit of pushing and popping on the stack. It also takes
2285 * advantage of machines that use a register save mask on subroutine entry.
2287 * This function used to be heavily recursive, but since this had the
2288 * effect of blowing the CPU stack on complex regexes, it has been
2289 * restructured to be iterative, and to save state onto the heap rather
2290 * than the stack. Essentially whereever regmatch() used to be called, it
2291 * pushes the current state, notes where to return, then jumps back into
2294 * Originally the structure of this function used to look something like
2299 while (scan != NULL) {
2300 a++; // do stuff with a and b
2306 if (regmatch(...)) // recurse
2316 * Now it looks something like this:
2324 regmatch_state *st = new();
2326 st->a++; // do stuff with a and b
2328 while (scan != NULL) {
2336 st->resume_state = resume_FOO;
2337 goto start_recurse; // recurse
2346 st = new(); push a new state
2347 st->a = 1; st->b = 2;
2354 switch (resume_state) {
2356 goto resume_point_FOO;
2363 * WARNING: this means that any line in this function that contains a
2364 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2365 * regmatch() using gotos instead. Thus the values of any local variables
2366 * not saved in the regmatch_state structure will have been lost when
2367 * execution resumes on the next line .
2369 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2370 * PL_regmatch_state always points to the currently active state, and
2371 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2372 * The first time regmatch is called, the first slab is allocated, and is
2373 * never freed until interpreter desctruction. When the slab is full,
2374 * a new one is allocated chained to the end. At exit from regmatch, slabs
2375 * allocated since entry are freed.
2379 #define DEBUG_STATE_pp(pp) \
2381 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2382 PerlIO_printf(Perl_debug_log, \
2385 reg_name[st->resume_state] ); \
2389 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2394 S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2395 const char *start, const char *end, const char *blurb)
2397 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2401 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2402 prog->precomp, prog->prelen, 60);
2404 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2405 start, end - start, 60);
2407 PerlIO_printf(Perl_debug_log,
2408 "%s%s REx%s %s against %s\n",
2409 PL_colors[4], blurb, PL_colors[5], s0, s1);
2411 if (do_utf8||utf8_pat)
2412 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2413 utf8_pat ? "pattern" : "",
2414 utf8_pat && do_utf8 ? " and " : "",
2415 do_utf8 ? "string" : ""
2421 S_dump_exec_pos(pTHX_ const char *locinput,
2422 const regnode *scan,
2423 const char *loc_regeol,
2424 const char *loc_bostr,
2425 const char *loc_reg_starttry,
2428 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2429 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2430 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2431 /* The part of the string before starttry has one color
2432 (pref0_len chars), between starttry and current
2433 position another one (pref_len - pref0_len chars),
2434 after the current position the third one.
2435 We assume that pref0_len <= pref_len, otherwise we
2436 decrease pref0_len. */
2437 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2438 ? (5 + taill) - l : locinput - loc_bostr;
2441 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2443 pref0_len = pref_len - (locinput - loc_reg_starttry);
2444 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2445 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2446 ? (5 + taill) - pref_len : loc_regeol - locinput);
2447 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2451 if (pref0_len > pref_len)
2452 pref0_len = pref_len;
2454 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2456 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2457 (locinput - pref_len),pref0_len, 60, 4, 5);
2459 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2460 (locinput - pref_len + pref0_len),
2461 pref_len - pref0_len, 60, 2, 3);
2463 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2464 locinput, loc_regeol - locinput, 10, 0, 1);
2466 const STRLEN tlen=len0+len1+len2;
2467 PerlIO_printf(Perl_debug_log,
2468 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2469 (IV)(locinput - loc_bostr),
2472 (docolor ? "" : "> <"),
2474 (int)(tlen > 19 ? 0 : 19 - tlen),
2481 STATIC I32 /* 0 failure, 1 success */
2482 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2484 #if PERL_VERSION < 9
2488 register const bool do_utf8 = PL_reg_match_utf8;
2489 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2491 regexp *rex = reginfo->prog;
2493 regmatch_slab *orig_slab;
2494 regmatch_state *orig_state;
2496 /* the current state. This is a cached copy of PL_regmatch_state */
2497 register regmatch_state *st;
2499 /* cache heavy used fields of st in registers */
2500 register regnode *scan;
2501 register regnode *next;
2502 register I32 n = 0; /* initialize to shut up compiler warning */
2503 register char *locinput = PL_reginput;
2505 /* these variables are NOT saved during a recusive RFEGMATCH: */
2506 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2507 bool result = 0; /* return value of S_regmatch */
2508 int depth = 0; /* depth of recursion */
2509 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
2510 regmatch_state *yes_state = NULL; /* state to pop to on success of
2512 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2513 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2517 GET_RE_DEBUG_FLAGS_DECL;
2520 /* on first ever call to regmatch, allocate first slab */
2521 if (!PL_regmatch_slab) {
2522 Newx(PL_regmatch_slab, 1, regmatch_slab);
2523 PL_regmatch_slab->prev = NULL;
2524 PL_regmatch_slab->next = NULL;
2525 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2528 /* remember current high-water mark for exit */
2529 /* XXX this should be done with SAVE* instead */
2530 orig_slab = PL_regmatch_slab;
2531 orig_state = PL_regmatch_state;
2533 /* grab next free state slot */
2534 st = ++PL_regmatch_state;
2535 if (st > SLAB_LAST(PL_regmatch_slab))
2536 st = PL_regmatch_state = S_push_slab(aTHX);
2543 /* Note that nextchr is a byte even in UTF */
2544 nextchr = UCHARAT(locinput);
2546 while (scan != NULL) {
2549 SV * const prop = sv_newmortal();
2550 regnode *rnext=regnext(scan);
2551 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2552 regprop(rex, prop, scan);
2554 PerlIO_printf(Perl_debug_log,
2555 "%3"IVdf":%*s%s(%"IVdf")\n",
2556 (IV)(scan - rex->program), depth*2, "",
2558 (PL_regkind[OP(scan)] == END || !rnext) ?
2559 0 : (IV)(rnext - rex->program));
2562 next = scan + NEXT_OFF(scan);
2565 state_num = OP(scan);
2568 switch (state_num) {
2570 if (locinput == PL_bostr)
2572 /* reginfo->till = reginfo->bol; */
2577 if (locinput == PL_bostr ||
2578 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2584 if (locinput == PL_bostr)
2588 if (locinput == reginfo->ganch)
2594 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2599 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2601 if (PL_regeol - locinput > 1)
2605 if (PL_regeol != locinput)
2609 if (!nextchr && locinput >= PL_regeol)
2612 locinput += PL_utf8skip[nextchr];
2613 if (locinput > PL_regeol)
2615 nextchr = UCHARAT(locinput);
2618 nextchr = UCHARAT(++locinput);
2621 if (!nextchr && locinput >= PL_regeol)
2623 nextchr = UCHARAT(++locinput);
2626 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2629 locinput += PL_utf8skip[nextchr];
2630 if (locinput > PL_regeol)
2632 nextchr = UCHARAT(locinput);
2635 nextchr = UCHARAT(++locinput);
2639 #define ST st->u.trie
2641 /* In this case the charclass data is available inline so
2642 we can fail fast without a lot of extra overhead.
2644 if (scan->flags == EXACT || !do_utf8) {
2645 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2647 PerlIO_printf(Perl_debug_log,
2648 "%*s %sfailed to match trie start class...%s\n",
2649 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2658 /* what type of TRIE am I? (utf8 makes this contextual) */
2659 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2660 trie_type = do_utf8 ?
2661 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2664 /* what trie are we using right now */
2665 reg_trie_data * const trie
2666 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2667 U32 state = trie->startstate;
2669 if (trie->bitmap && trie_type != trie_utf8_fold &&
2670 !TRIE_BITMAP_TEST(trie,*locinput)
2672 if (trie->states[ state ].wordnum) {
2674 PerlIO_printf(Perl_debug_log,
2675 "%*s %smatched empty string...%s\n",
2676 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2681 PerlIO_printf(Perl_debug_log,
2682 "%*s %sfailed to match trie start class...%s\n",
2683 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
2690 U8 *uc = ( U8* )locinput;
2694 U8 *uscan = (U8*)NULL;
2696 SV *sv_accept_buff = NULL;
2697 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2699 ST.accepted = 0; /* how many accepting states we have seen */
2701 ST.jump = trie->jump;
2710 traverse the TRIE keeping track of all accepting states
2711 we transition through until we get to a failing node.
2714 while ( state && uc <= (U8*)PL_regeol ) {
2715 U32 base = trie->states[ state ].trans.base;
2718 /* We use charid to hold the wordnum as we don't use it
2719 for charid until after we have done the wordnum logic.
2720 We define an alias just so that the wordnum logic reads
2723 #define got_wordnum charid
2724 got_wordnum = trie->states[ state ].wordnum;
2726 if ( got_wordnum ) {
2727 if ( ! ST.accepted ) {
2730 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2731 sv_accept_buff=newSV(bufflen *
2732 sizeof(reg_trie_accepted) - 1);
2733 SvCUR_set(sv_accept_buff, 0);
2734 SvPOK_on(sv_accept_buff);
2735 sv_2mortal(sv_accept_buff);
2738 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2741 if (ST.accepted >= bufflen) {
2743 ST.accept_buff =(reg_trie_accepted*)
2744 SvGROW(sv_accept_buff,
2745 bufflen * sizeof(reg_trie_accepted));
2747 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2748 + sizeof(reg_trie_accepted));
2751 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2752 ST.accept_buff[ST.accepted].endpos = uc;
2754 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2758 DEBUG_TRIE_EXECUTE_r({
2759 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
2760 PerlIO_printf( Perl_debug_log,
2761 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2762 2+depth * 2, "", PL_colors[4],
2763 (UV)state, (UV)ST.accepted );
2767 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2768 uvc, charid, foldlen, foldbuf, uniflags);
2771 (base + charid > trie->uniquecharcount )
2772 && (base + charid - 1 - trie->uniquecharcount
2774 && trie->trans[base + charid - 1 -
2775 trie->uniquecharcount].check == state)
2777 state = trie->trans[base + charid - 1 -
2778 trie->uniquecharcount ].next;
2789 DEBUG_TRIE_EXECUTE_r(
2790 PerlIO_printf( Perl_debug_log,
2791 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
2792 charid, uvc, (UV)state, PL_colors[5] );
2799 PerlIO_printf( Perl_debug_log,
2800 "%*s %sgot %"IVdf" possible matches%s\n",
2801 REPORT_CODE_OFF + depth * 2, "",
2802 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2808 case TRIE_next_fail: /* we failed - try next alterative */
2810 if ( ST.accepted == 1 ) {
2811 /* only one choice left - just continue */
2813 reg_trie_data * const trie
2814 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2815 SV ** const tmp = RX_DEBUG(reginfo->prog)
2816 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2818 PerlIO_printf( Perl_debug_log,
2819 "%*s %sonly one match left: #%d <%s>%s\n",
2820 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2821 ST.accept_buff[ 0 ].wordnum,
2822 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2825 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2826 /* in this case we free tmps/leave before we call regmatch
2827 as we wont be using accept_buff again. */
2830 locinput = PL_reginput;
2831 nextchr = UCHARAT(locinput);
2836 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2838 continue; /* execute rest of RE */
2841 if (!ST.accepted-- ) {
2848 There are at least two accepting states left. Presumably
2849 the number of accepting states is going to be low,
2850 typically two. So we simply scan through to find the one
2851 with lowest wordnum. Once we find it, we swap the last
2852 state into its place and decrement the size. We then try to
2853 match the rest of the pattern at the point where the word
2854 ends. If we succeed, control just continues along the
2855 regex; if we fail we return here to try the next accepting
2862 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2863 DEBUG_TRIE_EXECUTE_r(
2864 PerlIO_printf( Perl_debug_log,
2865 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2866 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
2867 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2868 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2871 if (ST.accept_buff[cur].wordnum <
2872 ST.accept_buff[best].wordnum)
2877 reg_trie_data * const trie
2878 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2879 SV ** const tmp = RX_DEBUG(reginfo->prog)
2880 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2882 regnode *nextop=!ST.jump ?
2884 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2885 PerlIO_printf( Perl_debug_log,
2886 "%*s %strying alternation #%d <%s> at node #%d %s\n",
2887 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
2888 ST.accept_buff[best].wordnum,
2889 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2890 REG_NODE_NUM(nextop),
2894 if ( best<ST.accepted ) {
2895 reg_trie_accepted tmp = ST.accept_buff[ best ];
2896 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2897 ST.accept_buff[ ST.accepted ] = tmp;
2900 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2902 PUSH_STATE_GOTO(TRIE_next, ST.B);
2905 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2915 char *s = STRING(scan);
2916 st->ln = STR_LEN(scan);
2917 if (do_utf8 != UTF) {
2918 /* The target and the pattern have differing utf8ness. */
2920 const char * const e = s + st->ln;
2923 /* The target is utf8, the pattern is not utf8. */
2928 if (NATIVE_TO_UNI(*(U8*)s) !=
2929 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2937 /* The target is not utf8, the pattern is utf8. */
2942 if (NATIVE_TO_UNI(*((U8*)l)) !=
2943 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2951 nextchr = UCHARAT(locinput);
2954 /* The target and the pattern have the same utf8ness. */
2955 /* Inline the first character, for speed. */
2956 if (UCHARAT(s) != nextchr)
2958 if (PL_regeol - locinput < st->ln)
2960 if (st->ln > 1 && memNE(s, locinput, st->ln))
2963 nextchr = UCHARAT(locinput);
2967 PL_reg_flags |= RF_tainted;
2970 char * const s = STRING(scan);
2971 st->ln = STR_LEN(scan);
2973 if (do_utf8 || UTF) {
2974 /* Either target or the pattern are utf8. */
2975 const char * const l = locinput;
2976 char *e = PL_regeol;
2978 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
2979 l, &e, 0, do_utf8)) {
2980 /* One more case for the sharp s:
2981 * pack("U0U*", 0xDF) =~ /ss/i,
2982 * the 0xC3 0x9F are the UTF-8
2983 * byte sequence for the U+00DF. */
2985 toLOWER(s[0]) == 's' &&
2987 toLOWER(s[1]) == 's' &&
2994 nextchr = UCHARAT(locinput);
2998 /* Neither the target and the pattern are utf8. */
3000 /* Inline the first character, for speed. */
3001 if (UCHARAT(s) != nextchr &&
3002 UCHARAT(s) != ((OP(scan) == EXACTF)
3003 ? PL_fold : PL_fold_locale)[nextchr])
3005 if (PL_regeol - locinput < st->ln)
3007 if (st->ln > 1 && (OP(scan) == EXACTF
3008 ? ibcmp(s, locinput, st->ln)
3009 : ibcmp_locale(s, locinput, st->ln)))
3012 nextchr = UCHARAT(locinput);
3017 STRLEN inclasslen = PL_regeol - locinput;
3019 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3021 if (locinput >= PL_regeol)
3023 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3024 nextchr = UCHARAT(locinput);
3029 nextchr = UCHARAT(locinput);
3030 if (!REGINCLASS(rex, scan, (U8*)locinput))
3032 if (!nextchr && locinput >= PL_regeol)
3034 nextchr = UCHARAT(++locinput);
3038 /* If we might have the case of the German sharp s
3039 * in a casefolding Unicode character class. */
3041 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3042 locinput += SHARP_S_SKIP;
3043 nextchr = UCHARAT(locinput);
3049 PL_reg_flags |= RF_tainted;
3055 LOAD_UTF8_CHARCLASS_ALNUM();
3056 if (!(OP(scan) == ALNUM
3057 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3058 : isALNUM_LC_utf8((U8*)locinput)))
3062 locinput += PL_utf8skip[nextchr];
3063 nextchr = UCHARAT(locinput);
3066 if (!(OP(scan) == ALNUM
3067 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3069 nextchr = UCHARAT(++locinput);
3072 PL_reg_flags |= RF_tainted;
3075 if (!nextchr && locinput >= PL_regeol)
3078 LOAD_UTF8_CHARCLASS_ALNUM();
3079 if (OP(scan) == NALNUM
3080 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3081 : isALNUM_LC_utf8((U8*)locinput))
3085 locinput += PL_utf8skip[nextchr];
3086 nextchr = UCHARAT(locinput);
3089 if (OP(scan) == NALNUM
3090 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3092 nextchr = UCHARAT(++locinput);
3096 PL_reg_flags |= RF_tainted;
3100 /* was last char in word? */
3102 if (locinput == PL_bostr)
3105 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3107 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3109 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3110 st->ln = isALNUM_uni(st->ln);
3111 LOAD_UTF8_CHARCLASS_ALNUM();
3112 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3115 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3116 n = isALNUM_LC_utf8((U8*)locinput);
3120 st->ln = (locinput != PL_bostr) ?
3121 UCHARAT(locinput - 1) : '\n';
3122 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3123 st->ln = isALNUM(st->ln);
3124 n = isALNUM(nextchr);
3127 st->ln = isALNUM_LC(st->ln);
3128 n = isALNUM_LC(nextchr);
3131 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3132 OP(scan) == BOUNDL))
3136 PL_reg_flags |= RF_tainted;
3142 if (UTF8_IS_CONTINUED(nextchr)) {
3143 LOAD_UTF8_CHARCLASS_SPACE();
3144 if (!(OP(scan) == SPACE
3145 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3146 : isSPACE_LC_utf8((U8*)locinput)))
3150 locinput += PL_utf8skip[nextchr];
3151 nextchr = UCHARAT(locinput);
3154 if (!(OP(scan) == SPACE
3155 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3157 nextchr = UCHARAT(++locinput);
3160 if (!(OP(scan) == SPACE
3161 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3163 nextchr = UCHARAT(++locinput);
3167 PL_reg_flags |= RF_tainted;
3170 if (!nextchr && locinput >= PL_regeol)
3173 LOAD_UTF8_CHARCLASS_SPACE();
3174 if (OP(scan) == NSPACE
3175 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3176 : isSPACE_LC_utf8((U8*)locinput))
3180 locinput += PL_utf8skip[nextchr];
3181 nextchr = UCHARAT(locinput);
3184 if (OP(scan) == NSPACE
3185 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3187 nextchr = UCHARAT(++locinput);
3190 PL_reg_flags |= RF_tainted;
3196 LOAD_UTF8_CHARCLASS_DIGIT();
3197 if (!(OP(scan) == DIGIT
3198 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3199 : isDIGIT_LC_utf8((U8*)locinput)))
3203 locinput += PL_utf8skip[nextchr];
3204 nextchr = UCHARAT(locinput);
3207 if (!(OP(scan) == DIGIT
3208 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3210 nextchr = UCHARAT(++locinput);
3213 PL_reg_flags |= RF_tainted;
3216 if (!nextchr && locinput >= PL_regeol)
3219 LOAD_UTF8_CHARCLASS_DIGIT();
3220 if (OP(scan) == NDIGIT
3221 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3222 : isDIGIT_LC_utf8((U8*)locinput))
3226 locinput += PL_utf8skip[nextchr];
3227 nextchr = UCHARAT(locinput);
3230 if (OP(scan) == NDIGIT
3231 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3233 nextchr = UCHARAT(++locinput);
3236 if (locinput >= PL_regeol)
3239 LOAD_UTF8_CHARCLASS_MARK();
3240 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3242 locinput += PL_utf8skip[nextchr];
3243 while (locinput < PL_regeol &&
3244 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3245 locinput += UTF8SKIP(locinput);
3246 if (locinput > PL_regeol)
3251 nextchr = UCHARAT(locinput);
3254 PL_reg_flags |= RF_tainted;
3259 n = ARG(scan); /* which paren pair */
3260 st->ln = PL_regstartp[n];
3261 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3262 if ((I32)*PL_reglastparen < n || st->ln == -1)
3263 sayNO; /* Do not match unless seen CLOSEn. */
3264 if (st->ln == PL_regendp[n])
3267 s = PL_bostr + st->ln;
3268 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3270 const char *e = PL_bostr + PL_regendp[n];
3272 * Note that we can't do the "other character" lookup trick as
3273 * in the 8-bit case (no pun intended) because in Unicode we
3274 * have to map both upper and title case to lower case.
3276 if (OP(scan) == REFF) {
3278 STRLEN ulen1, ulen2;
3279 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3280 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3284 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3285 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3286 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3293 nextchr = UCHARAT(locinput);
3297 /* Inline the first character, for speed. */
3298 if (UCHARAT(s) != nextchr &&
3300 (UCHARAT(s) != ((OP(scan) == REFF
3301 ? PL_fold : PL_fold_locale)[nextchr]))))
3303 st->ln = PL_regendp[n] - st->ln;
3304 if (locinput + st->ln > PL_regeol)
3306 if (st->ln > 1 && (OP(scan) == REF
3307 ? memNE(s, locinput, st->ln)
3309 ? ibcmp(s, locinput, st->ln)
3310 : ibcmp_locale(s, locinput, st->ln))))
3313 nextchr = UCHARAT(locinput);
3324 #define ST st->u.eval
3328 regnode *startpoint;
3331 case RECURSE: /* /(...(?1))/ */
3332 if (cur_eval && cur_eval->locinput==locinput) {
3333 if (cur_eval->u.eval.close_paren == ARG(scan))
3334 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3335 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3336 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3341 (void)ReREFCNT_inc(rex);
3342 if (OP(scan)==RECURSE) {
3343 startpoint = scan + ARG2L(scan);
3344 ST.close_paren = ARG(scan);
3346 startpoint = re->program+1;
3349 goto eval_recurse_doit;
3351 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3352 if (cur_eval && cur_eval->locinput==locinput) {
3353 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3354 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3359 /* execute the code in the {...} */
3361 SV ** const before = SP;
3362 OP_4tree * const oop = PL_op;
3363 COP * const ocurcop = PL_curcop;
3367 PL_op = (OP_4tree*)rex->data->data[n];
3368 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3369 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3370 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3372 CALLRUNOPS(aTHX); /* Scalar context. */
3375 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3382 PAD_RESTORE_LOCAL(old_comppad);
3383 PL_curcop = ocurcop;
3386 sv_setsv(save_scalar(PL_replgv), ret);
3390 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3393 /* extract RE object from returned value; compiling if
3398 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3399 mg = mg_find(sv, PERL_MAGIC_qr);
3400 else if (SvSMAGICAL(ret)) {
3401 if (SvGMAGICAL(ret))
3402 sv_unmagic(ret, PERL_MAGIC_qr);
3404 mg = mg_find(ret, PERL_MAGIC_qr);
3408 re = (regexp *)mg->mg_obj;
3409 (void)ReREFCNT_inc(re);
3413 const char * const t = SvPV_const(ret, len);
3415 const I32 osize = PL_regsize;
3418 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3419 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
3421 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3423 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3429 debug_start_match(re, do_utf8, locinput, PL_regeol,
3430 "Matching embedded");
3432 startpoint = re->program + 1;
3433 ST.close_paren = 0; /* only used for RECURSE */
3434 /* borrowed from regtry */
3435 if (PL_reg_start_tmpl <= re->nparens) {
3436 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3437 if(PL_reg_start_tmp)
3438 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3440 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3443 eval_recurse_doit: /* Share code with RECURSE below this line */
3444 /* run the pattern returned from (??{...}) */
3445 ST.cp = regcppush(0); /* Save *all* the positions. */
3446 REGCP_SET(ST.lastcp);
3448 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3449 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3451 *PL_reglastparen = 0;
3452 *PL_reglastcloseparen = 0;
3453 PL_reginput = locinput;
3455 /* XXXX This is too dramatic a measure... */
3459 ST.toggle_reg_flags = PL_reg_flags;
3460 if (re->reganch & ROPT_UTF8)
3461 PL_reg_flags |= RF_utf8;
3463 PL_reg_flags &= ~RF_utf8;
3464 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3467 ST.prev_curlyx = cur_curlyx;
3471 ST.prev_eval = cur_eval;
3473 /* now continue from first node in postoned RE */
3474 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3477 /* /(?(?{...})X|Y)/ */
3478 st->sw = (bool)SvTRUE(ret);
3483 case EVAL_AB: /* cleanup after a successful (??{A})B */
3484 /* note: this is called twice; first after popping B, then A */
3485 PL_reg_flags ^= ST.toggle_reg_flags;
3489 cur_eval = ST.prev_eval;
3490 cur_curlyx = ST.prev_curlyx;
3491 /* XXXX This is too dramatic a measure... */
3496 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3497 /* note: this is called twice; first after popping B, then A */
3498 PL_reg_flags ^= ST.toggle_reg_flags;
3501 PL_reginput = locinput;
3502 REGCP_UNWIND(ST.lastcp);
3504 cur_eval = ST.prev_eval;
3505 cur_curlyx = ST.prev_curlyx;
3506 /* XXXX This is too dramatic a measure... */
3512 n = ARG(scan); /* which paren pair */
3513 PL_reg_start_tmp[n] = locinput;
3518 n = ARG(scan); /* which paren pair */
3519 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3520 PL_regendp[n] = locinput - PL_bostr;
3521 if (n > (I32)*PL_reglastparen)
3522 *PL_reglastparen = n;
3523 *PL_reglastcloseparen = n;
3524 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3529 n = ARG(scan); /* which paren pair */
3530 st->sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3533 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3535 next = NEXTOPER(NEXTOPER(scan));
3537 next = scan + ARG(scan);
3538 if (OP(next) == IFTHEN) /* Fake one. */
3539 next = NEXTOPER(NEXTOPER(next));
3543 st->logical = scan->flags;
3546 /*******************************************************************
3548 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3549 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3550 STAR/PLUS/CURLY/CURLYN are used instead.)
3552 A*B is compiled as <CURLYX><A><WHILEM><B>
3554 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3555 state, which contains the current count, initialised to -1. It also sets
3556 cur_curlyx to point to this state, with any previous value saved in the
3559 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3560 since the pattern may possibly match zero times (i.e. it's a while {} loop
3561 rather than a do {} while loop).
3563 Each entry to WHILEM represents a successful match of A. The count in the
3564 CURLYX block is incremented, another WHILEM state is pushed, and execution
3565 passes to A or B depending on greediness and the current count.
3567 For example, if matching against the string a1a2a3b (where the aN are
3568 substrings that match /A/), then the match progresses as follows: (the
3569 pushed states are interspersed with the bits of strings matched so far):
3572 <CURLYX cnt=0><WHILEM>
3573 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3574 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3575 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3576 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
3578 (Contrast this with something like CURLYM, which maintains only a single
3582 a1 <CURLYM cnt=1> a2
3583 a1 a2 <CURLYM cnt=2> a3
3584 a1 a2 a3 <CURLYM cnt=3> b
3587 Each WHILEM state block marks a point to backtrack to upon partial failure
3588 of A or B, and also contains some minor state data related to that
3589 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3590 overall state, such as the count, and pointers to the A and B ops.
3592 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3593 must always point to the *current* CURLYX block, the rules are:
3595 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3596 and set cur_curlyx to point the new block.
3598 When popping the CURLYX block after a successful or unsuccessful match,
3599 restore the previous cur_curlyx.
3601 When WHILEM is about to execute B, save the current cur_curlyx, and set it
3602 to the outer one saved in the CURLYX block.
3604 When popping the WHILEM block after a successful or unsuccessful B match,
3605 restore the previous cur_curlyx.
3607 Here's an example for the pattern (AI* BI)*BO
3608 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
3611 curlyx backtrack stack
3612 ------ ---------------
3614 CO <CO prev=NULL> <WO>
3615 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3616 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3617 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
3619 At this point the pattern succeeds, and we work back down the stack to
3620 clean up, restoring as we go:
3622 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3623 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3624 CO <CO prev=NULL> <WO>
3627 *******************************************************************/
3629 #define ST st->u.curlyx
3631 case CURLYX: /* start of /A*B/ (for complex A) */
3633 /* No need to save/restore up to this paren */
3634 I32 parenfloor = scan->flags;
3636 assert(next); /* keep Coverity happy */
3637 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3640 /* XXXX Probably it is better to teach regpush to support
3641 parenfloor > PL_regsize... */
3642 if (parenfloor > (I32)*PL_reglastparen)
3643 parenfloor = *PL_reglastparen; /* Pessimization... */
3645 ST.prev_curlyx= cur_curlyx;
3647 ST.cp = PL_savestack_ix;
3649 /* these fields contain the state of the current curly.
3650 * they are accessed by subsequent WHILEMs */
3651 ST.parenfloor = parenfloor;
3652 ST.min = ARG1(scan);
3653 ST.max = ARG2(scan);
3654 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3656 ST.minmod = st->minmod;
3657 ST.count = -1; /* this will be updated by WHILEM */
3658 ST.lastloc = NULL; /* this will be updated by WHILEM */
3660 PL_reginput = locinput;
3661 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
3665 case CURLYX_end: /* just finished matching all of A*B */
3667 cur_curlyx = ST.prev_curlyx;
3671 case CURLYX_end_fail: /* just failed to match all of A*B */
3673 cur_curlyx = ST.prev_curlyx;
3679 #define ST st->u.whilem
3681 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3683 /* see the discussion above about CURLYX/WHILEM */
3686 assert(cur_curlyx); /* keep Coverity happy */
3687 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3688 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3689 ST.cache_offset = 0;
3692 PL_reginput = locinput;
3694 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3695 "%*s whilem: matched %ld out of %ld..%ld\n",
3696 REPORT_CODE_OFF+depth*2, "", (long)n,
3697 (long)cur_curlyx->u.curlyx.min,
3698 (long)cur_curlyx->u.curlyx.max)
3701 /* First just match a string of min A's. */
3703 if (n < cur_curlyx->u.curlyx.min) {
3704 cur_curlyx->u.curlyx.lastloc = locinput;
3705 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3709 /* If degenerate A matches "", assume A done. */
3711 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3712 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3713 "%*s whilem: empty match detected, trying continuation...\n",
3714 REPORT_CODE_OFF+depth*2, "")
3716 goto do_whilem_B_max;
3719 /* super-linear cache processing */
3723 if (!PL_reg_maxiter) {
3724 /* start the countdown: Postpone detection until we
3725 * know the match is not *that* much linear. */
3726 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3727 /* possible overflow for long strings and many CURLYX's */
3728 if (PL_reg_maxiter < 0)
3729 PL_reg_maxiter = I32_MAX;
3730 PL_reg_leftiter = PL_reg_maxiter;
3733 if (PL_reg_leftiter-- == 0) {
3734 /* initialise cache */
3735 const I32 size = (PL_reg_maxiter + 7)/8;
3736 if (PL_reg_poscache) {
3737 if ((I32)PL_reg_poscache_size < size) {
3738 Renew(PL_reg_poscache, size, char);
3739 PL_reg_poscache_size = size;
3741 Zero(PL_reg_poscache, size, char);
3744 PL_reg_poscache_size = size;
3745 Newxz(PL_reg_poscache, size, char);
3747 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3748 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3749 PL_colors[4], PL_colors[5])
3753 if (PL_reg_leftiter < 0) {
3754 /* have we already failed at this position? */
3756 offset = (scan->flags & 0xf) - 1
3757 + (locinput - PL_bostr) * (scan->flags>>4);
3758 mask = 1 << (offset % 8);
3760 if (PL_reg_poscache[offset] & mask) {
3761 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3762 "%*s whilem: (cache) already tried at this position...\n",
3763 REPORT_CODE_OFF+depth*2, "")
3765 sayNO; /* cache records failure */
3767 ST.cache_offset = offset;
3768 ST.cache_mask = mask;
3772 /* Prefer B over A for minimal matching. */
3774 if (cur_curlyx->u.curlyx.minmod) {
3775 ST.save_curlyx = cur_curlyx;
3776 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3777 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3778 REGCP_SET(ST.lastcp);
3779 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3783 /* Prefer A over B for maximal matching. */
3785 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3786 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3787 cur_curlyx->u.curlyx.lastloc = locinput;
3788 REGCP_SET(ST.lastcp);
3789 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3792 goto do_whilem_B_max;
3796 case WHILEM_B_min: /* just matched B in a minimal match */
3797 case WHILEM_B_max: /* just matched B in a maximal match */
3798 cur_curlyx = ST.save_curlyx;
3802 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3803 cur_curlyx = ST.save_curlyx;
3804 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3805 cur_curlyx->u.curlyx.count--;
3809 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3810 REGCP_UNWIND(ST.lastcp);
3813 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3814 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3815 cur_curlyx->u.curlyx.count--;
3819 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3820 REGCP_UNWIND(ST.lastcp);
3821 regcppop(rex); /* Restore some previous $<digit>s? */
3822 PL_reginput = locinput;
3823 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3824 "%*s whilem: failed, trying continuation...\n",
3825 REPORT_CODE_OFF+depth*2, "")
3828 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3829 && ckWARN(WARN_REGEXP)
3830 && !(PL_reg_flags & RF_warned))
3832 PL_reg_flags |= RF_warned;
3833 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3834 "Complex regular subexpression recursion",
3839 ST.save_curlyx = cur_curlyx;
3840 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3841 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3844 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3845 cur_curlyx = ST.save_curlyx;
3846 REGCP_UNWIND(ST.lastcp);
3849 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3850 /* Maximum greed exceeded */
3851 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3852 && ckWARN(WARN_REGEXP)
3853 && !(PL_reg_flags & RF_warned))
3855 PL_reg_flags |= RF_warned;
3856 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3857 "%s limit (%d) exceeded",
3858 "Complex regular subexpression recursion",
3861 cur_curlyx->u.curlyx.count--;
3865 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3866 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3868 /* Try grabbing another A and see if it helps. */
3869 PL_reginput = locinput;
3870 cur_curlyx->u.curlyx.lastloc = locinput;
3871 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3872 REGCP_SET(ST.lastcp);
3873 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
3877 #define ST st->u.branch
3879 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
3880 next = scan + ARG(scan);
3883 scan = NEXTOPER(scan);
3886 case BRANCH: /* /(...|A|...)/ */
3887 scan = NEXTOPER(scan); /* scan now points to inner node */
3888 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3889 /* last branch; skip state push and jump direct to node */
3891 ST.lastparen = *PL_reglastparen;
3892 ST.next_branch = next;
3894 PL_reginput = locinput;
3896 /* Now go into the branch */
3897 PUSH_STATE_GOTO(BRANCH_next, scan);
3900 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3901 REGCP_UNWIND(ST.cp);
3902 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3904 *PL_reglastparen = n;
3905 scan = ST.next_branch;
3906 /* no more branches? */
3907 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3909 continue; /* execute next BRANCH[J] op */
3917 #define ST st->u.curlym
3919 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3921 /* This is an optimisation of CURLYX that enables us to push
3922 * only a single backtracking state, no matter now many matches
3923 * there are in {m,n}. It relies on the pattern being constant
3924 * length, with no parens to influence future backrefs
3928 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3930 /* if paren positive, emulate an OPEN/CLOSE around A */
3932 I32 paren = ST.me->flags;
3933 if (paren > PL_regsize)
3935 if (paren > (I32)*PL_reglastparen)
3936 *PL_reglastparen = paren;
3937 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3943 ST.minmod = st->minmod;
3945 ST.c1 = CHRTEST_UNINIT;
3948 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3951 curlym_do_A: /* execute the A in /A{m,n}B/ */
3952 PL_reginput = locinput;
3953 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3956 case CURLYM_A: /* we've just matched an A */
3957 locinput = st->locinput;
3958 nextchr = UCHARAT(locinput);
3961 /* after first match, determine A's length: u.curlym.alen */
3962 if (ST.count == 1) {
3963 if (PL_reg_match_utf8) {
3965 while (s < PL_reginput) {
3971 ST.alen = PL_reginput - locinput;
3974 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3977 PerlIO_printf(Perl_debug_log,
3978 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
3979 (int)(REPORT_CODE_OFF+(depth*2)), "",
3980 (IV) ST.count, (IV)ST.alen)
3983 locinput = PL_reginput;
3984 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3985 goto curlym_do_A; /* try to match another A */
3986 goto curlym_do_B; /* try to match B */
3988 case CURLYM_A_fail: /* just failed to match an A */
3989 REGCP_UNWIND(ST.cp);
3990 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3993 curlym_do_B: /* execute the B in /A{m,n}B/ */
3994 PL_reginput = locinput;
3995 if (ST.c1 == CHRTEST_UNINIT) {
3996 /* calculate c1 and c2 for possible match of 1st char
3997 * following curly */
3998 ST.c1 = ST.c2 = CHRTEST_VOID;
3999 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4000 regnode *text_node = ST.B;
4001 if (! HAS_TEXT(text_node))
4002 FIND_NEXT_IMPT(text_node);
4003 if (HAS_TEXT(text_node)
4004 && PL_regkind[OP(text_node)] != REF)
4006 ST.c1 = (U8)*STRING(text_node);
4008 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4010 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4011 ? PL_fold_locale[ST.c1]
4018 PerlIO_printf(Perl_debug_log,
4019 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4020 (int)(REPORT_CODE_OFF+(depth*2)),
4023 if (ST.c1 != CHRTEST_VOID
4024 && UCHARAT(PL_reginput) != ST.c1
4025 && UCHARAT(PL_reginput) != ST.c2)
4027 /* simulate B failing */
4028 state_num = CURLYM_B_fail;
4029 goto reenter_switch;
4033 /* mark current A as captured */
4034 I32 paren = ST.me->flags;
4037 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4038 PL_regendp[paren] = PL_reginput - PL_bostr;
4041 PL_regendp[paren] = -1;
4043 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4046 case CURLYM_B_fail: /* just failed to match a B */
4047 REGCP_UNWIND(ST.cp);
4049 if (ST.count == ARG2(ST.me) /* max */)
4051 goto curlym_do_A; /* try to match a further A */
4053 /* backtrack one A */
4054 if (ST.count == ARG1(ST.me) /* min */)
4057 locinput = HOPc(locinput, -ST.alen);
4058 goto curlym_do_B; /* try to match B */
4061 #define ST st->u.curly
4063 #define CURLY_SETPAREN(paren, success) \
4066 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4067 PL_regendp[paren] = locinput - PL_bostr; \
4070 PL_regendp[paren] = -1; \
4073 case STAR: /* /A*B/ where A is width 1 */
4077 scan = NEXTOPER(scan);
4079 case PLUS: /* /A+B/ where A is width 1 */
4083 scan = NEXTOPER(scan);
4085 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4086 ST.paren = scan->flags; /* Which paren to set */
4087 if (ST.paren > PL_regsize)
4088 PL_regsize = ST.paren;
4089 if (ST.paren > (I32)*PL_reglastparen)
4090 *PL_reglastparen = ST.paren;
4091 ST.min = ARG1(scan); /* min to match */
4092 ST.max = ARG2(scan); /* max to match */
4093 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4095 case CURLY: /* /A{m,n}B/ where A is width 1 */
4097 ST.min = ARG1(scan); /* min to match */
4098 ST.max = ARG2(scan); /* max to match */
4099 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4102 * Lookahead to avoid useless match attempts
4103 * when we know what character comes next.
4105 * Used to only do .*x and .*?x, but now it allows
4106 * for )'s, ('s and (?{ ... })'s to be in the way
4107 * of the quantifier and the EXACT-like node. -- japhy
4110 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4112 if (HAS_TEXT(next) || JUMPABLE(next)) {
4114 regnode *text_node = next;
4116 if (! HAS_TEXT(text_node))
4117 FIND_NEXT_IMPT(text_node);
4119 if (! HAS_TEXT(text_node))
4120 ST.c1 = ST.c2 = CHRTEST_VOID;
4122 if (PL_regkind[OP(text_node)] == REF) {
4123 ST.c1 = ST.c2 = CHRTEST_VOID;
4124 goto assume_ok_easy;
4127 s = (U8*)STRING(text_node);
4131 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4132 ST.c2 = PL_fold[ST.c1];
4133 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4134 ST.c2 = PL_fold_locale[ST.c1];
4137 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4138 STRLEN ulen1, ulen2;
4139 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4140 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4142 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4143 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4145 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4147 0 : UTF8_ALLOW_ANY);
4148 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4150 0 : UTF8_ALLOW_ANY);
4152 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4154 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4159 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4166 ST.c1 = ST.c2 = CHRTEST_VOID;
4171 PL_reginput = locinput;
4174 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4177 locinput = PL_reginput;
4179 if (ST.c1 == CHRTEST_VOID)
4180 goto curly_try_B_min;
4182 ST.oldloc = locinput;
4184 /* set ST.maxpos to the furthest point along the
4185 * string that could possibly match */
4186 if (ST.max == REG_INFTY) {
4187 ST.maxpos = PL_regeol - 1;
4189 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4193 int m = ST.max - ST.min;
4194 for (ST.maxpos = locinput;
4195 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4196 ST.maxpos += UTF8SKIP(ST.maxpos);
4199 ST.maxpos = locinput + ST.max - ST.min;
4200 if (ST.maxpos >= PL_regeol)
4201 ST.maxpos = PL_regeol - 1;
4203 goto curly_try_B_min_known;
4207 ST.count = regrepeat(rex, ST.A, ST.max);
4208 locinput = PL_reginput;
4209 if (ST.count < ST.min)
4211 if ((ST.count > ST.min)
4212 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4214 /* A{m,n} must come at the end of the string, there's
4215 * no point in backing off ... */
4217 /* ...except that $ and \Z can match before *and* after
4218 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4219 We may back off by one in this case. */
4220 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4224 goto curly_try_B_max;
4229 case CURLY_B_min_known_fail:
4230 /* failed to find B in a non-greedy match where c1,c2 valid */
4231 if (ST.paren && ST.count)
4232 PL_regendp[ST.paren] = -1;
4234 PL_reginput = locinput; /* Could be reset... */
4235 REGCP_UNWIND(ST.cp);
4236 /* Couldn't or didn't -- move forward. */
4237 ST.oldloc = locinput;
4239 locinput += UTF8SKIP(locinput);
4243 curly_try_B_min_known:
4244 /* find the next place where 'B' could work, then call B */
4248 n = (ST.oldloc == locinput) ? 0 : 1;
4249 if (ST.c1 == ST.c2) {
4251 /* set n to utf8_distance(oldloc, locinput) */
4252 while (locinput <= ST.maxpos &&
4253 utf8n_to_uvchr((U8*)locinput,
4254 UTF8_MAXBYTES, &len,
4255 uniflags) != (UV)ST.c1) {
4261 /* set n to utf8_distance(oldloc, locinput) */
4262 while (locinput <= ST.maxpos) {
4264 const UV c = utf8n_to_uvchr((U8*)locinput,
4265 UTF8_MAXBYTES, &len,
4267 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4275 if (ST.c1 == ST.c2) {
4276 while (locinput <= ST.maxpos &&
4277 UCHARAT(locinput) != ST.c1)
4281 while (locinput <= ST.maxpos
4282 && UCHARAT(locinput) != ST.c1
4283 && UCHARAT(locinput) != ST.c2)
4286 n = locinput - ST.oldloc;
4288 if (locinput > ST.maxpos)
4290 /* PL_reginput == oldloc now */
4293 if (regrepeat(rex, ST.A, n) < n)
4296 PL_reginput = locinput;
4297 CURLY_SETPAREN(ST.paren, ST.count);
4298 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4303 case CURLY_B_min_fail:
4304 /* failed to find B in a non-greedy match where c1,c2 invalid */
4305 if (ST.paren && ST.count)
4306 PL_regendp[ST.paren] = -1;
4308 REGCP_UNWIND(ST.cp);
4309 /* failed -- move forward one */
4310 PL_reginput = locinput;
4311 if (regrepeat(rex, ST.A, 1)) {
4313 locinput = PL_reginput;
4314 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4315 ST.count > 0)) /* count overflow ? */
4318 CURLY_SETPAREN(ST.paren, ST.count);
4319 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4327 /* a successful greedy match: now try to match B */
4330 if (ST.c1 != CHRTEST_VOID)
4331 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4332 UTF8_MAXBYTES, 0, uniflags)
4333 : (UV) UCHARAT(PL_reginput);
4334 /* If it could work, try it. */
4335 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4336 CURLY_SETPAREN(ST.paren, ST.count);
4337 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4342 case CURLY_B_max_fail:
4343 /* failed to find B in a greedy match */
4344 if (ST.paren && ST.count)
4345 PL_regendp[ST.paren] = -1;
4347 REGCP_UNWIND(ST.cp);
4349 if (--ST.count < ST.min)
4351 PL_reginput = locinput = HOPc(locinput, -1);
4352 goto curly_try_B_max;
4360 /* we've just finished A in /(??{A})B/; now continue with B */
4364 st->u.eval.toggle_reg_flags
4365 = cur_eval->u.eval.toggle_reg_flags;
4366 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4368 st->u.eval.prev_rex = rex; /* inner */
4369 rex = cur_eval->u.eval.prev_rex; /* outer */
4370 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4372 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4373 REGCP_SET(st->u.eval.lastcp);
4374 PL_reginput = locinput;
4376 /* Restore parens of the outer rex without popping the
4378 tmpix = PL_savestack_ix;
4379 PL_savestack_ix = cur_eval->u.eval.lastcp;
4381 PL_savestack_ix = tmpix;
4383 st->u.eval.prev_eval = cur_eval;
4384 cur_eval = cur_eval->u.eval.prev_eval;
4386 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4387 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
4388 PUSH_YES_STATE_GOTO(EVAL_AB,
4389 st->u.eval.prev_eval->u.eval.B); /* match B */
4392 if (locinput < reginfo->till) {
4393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4394 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4396 (long)(locinput - PL_reg_starttry),
4397 (long)(reginfo->till - PL_reg_starttry),
4399 sayNO_SILENT; /* Cannot match: too short. */
4401 PL_reginput = locinput; /* put where regtry can find it */
4402 sayYES; /* Success! */
4404 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4406 PerlIO_printf(Perl_debug_log,
4407 "%*s %ssubpattern success...%s\n",
4408 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4409 PL_reginput = locinput; /* put where regtry can find it */
4410 sayYES; /* Success! */
4413 #define ST st->u.ifmatch
4415 case SUSPEND: /* (?>A) */
4417 PL_reginput = locinput;
4420 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4422 goto ifmatch_trivial_fail_test;
4424 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4426 ifmatch_trivial_fail_test:
4428 char * const s = HOPBACKc(locinput, scan->flags);
4433 st->sw = 1 - (bool)ST.wanted;
4437 next = scan + ARG(scan);
4445 PL_reginput = locinput;
4449 /* execute body of (?...A) */
4450 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4453 case IFMATCH_A_fail: /* body of (?...A) failed */
4454 ST.wanted = !ST.wanted;
4457 case IFMATCH_A: /* body of (?...A) succeeded */
4460 st->sw = (bool)ST.wanted;
4462 else if (!ST.wanted)
4465 if (OP(ST.me) == SUSPEND)
4466 locinput = PL_reginput;
4468 locinput = PL_reginput = st->locinput;
4469 nextchr = UCHARAT(locinput);
4471 scan = ST.me + ARG(ST.me);
4474 continue; /* execute B */
4479 next = scan + ARG(scan);
4484 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4485 PTR2UV(scan), OP(scan));
4486 Perl_croak(aTHX_ "regexp memory corruption");
4494 /* push a state that backtracks on success */
4495 st->u.yes.prev_yes_state = yes_state;
4499 /* push a new regex state, then continue at scan */
4501 regmatch_state *newst;
4503 DEBUG_STATE_pp("push");
4505 st->locinput = locinput;
4507 if (newst > SLAB_LAST(PL_regmatch_slab))
4508 newst = S_push_slab(aTHX);
4509 PL_regmatch_state = newst;
4510 /* XXX probably don't need to initialise these */
4516 locinput = PL_reginput;
4517 nextchr = UCHARAT(locinput);
4525 * We get here only if there's trouble -- normally "case END" is
4526 * the terminating point.
4528 Perl_croak(aTHX_ "corrupted regexp pointers");
4534 /* we have successfully completed a subexpression, but we must now
4535 * pop to the state marked by yes_state and continue from there */
4536 assert(st != yes_state);
4538 while (st != yes_state) {
4540 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4541 PL_regmatch_slab = PL_regmatch_slab->prev;
4542 st = SLAB_LAST(PL_regmatch_slab);
4544 DEBUG_STATE_pp("pop (yes)");
4548 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4549 || yes_state > SLAB_LAST(PL_regmatch_slab))
4551 /* not in this slab, pop slab */
4552 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4553 PL_regmatch_slab = PL_regmatch_slab->prev;
4554 st = SLAB_LAST(PL_regmatch_slab);
4556 depth -= (st - yes_state);
4559 yes_state = st->u.yes.prev_yes_state;
4560 PL_regmatch_state = st;
4562 switch (st->resume_state) {
4570 state_num = st->resume_state;
4571 goto reenter_switch;
4581 Perl_croak(aTHX_ "unexpected yes resume state");
4585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4586 PL_colors[4], PL_colors[5]));
4593 PerlIO_printf(Perl_debug_log,
4594 "%*s %sfailed...%s\n",
4595 REPORT_CODE_OFF+depth*2, "",
4596 PL_colors[4], PL_colors[5])
4603 /* there's a previous state to backtrack to */
4605 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4606 PL_regmatch_slab = PL_regmatch_slab->prev;
4607 st = SLAB_LAST(PL_regmatch_slab);
4609 PL_regmatch_state = st;
4613 locinput= st->locinput;
4614 nextchr = UCHARAT(locinput);
4616 DEBUG_STATE_pp("pop");
4618 if (yes_state == st)
4619 yes_state = st->u.yes.prev_yes_state;
4621 switch (st->resume_state) {
4630 case CURLY_B_min_known:
4637 state_num = st->resume_state + 1; /* failure = success + 1 */
4638 goto reenter_switch;
4641 Perl_croak(aTHX_ "regexp resume memory corruption");
4647 /* restore original high-water mark */
4648 PL_regmatch_slab = orig_slab;
4649 PL_regmatch_state = orig_state;
4651 /* free all slabs above current one */
4652 if (orig_slab->next) {
4653 regmatch_slab *sl = orig_slab->next;
4654 orig_slab->next = NULL;
4656 regmatch_slab * const osl = sl;
4666 - regrepeat - repeatedly match something simple, report how many
4669 * [This routine now assumes that it will only match on things of length 1.
4670 * That was true before, but now we assume scan - reginput is the count,
4671 * rather than incrementing count on every character. [Er, except utf8.]]
4674 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4677 register char *scan;
4679 register char *loceol = PL_regeol;
4680 register I32 hardcount = 0;
4681 register bool do_utf8 = PL_reg_match_utf8;
4684 if (max == REG_INFTY)
4686 else if (max < loceol - scan)
4687 loceol = scan + max;
4692 while (scan < loceol && hardcount < max && *scan != '\n') {
4693 scan += UTF8SKIP(scan);
4697 while (scan < loceol && *scan != '\n')
4704 while (scan < loceol && hardcount < max) {
4705 scan += UTF8SKIP(scan);
4715 case EXACT: /* length of string is 1 */
4717 while (scan < loceol && UCHARAT(scan) == c)
4720 case EXACTF: /* length of string is 1 */
4722 while (scan < loceol &&
4723 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4726 case EXACTFL: /* length of string is 1 */
4727 PL_reg_flags |= RF_tainted;
4729 while (scan < loceol &&
4730 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4736 while (hardcount < max && scan < loceol &&
4737 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4738 scan += UTF8SKIP(scan);
4742 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4749 LOAD_UTF8_CHARCLASS_ALNUM();
4750 while (hardcount < max && scan < loceol &&
4751 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4752 scan += UTF8SKIP(scan);
4756 while (scan < loceol && isALNUM(*scan))
4761 PL_reg_flags |= RF_tainted;
4764 while (hardcount < max && scan < loceol &&
4765 isALNUM_LC_utf8((U8*)scan)) {
4766 scan += UTF8SKIP(scan);
4770 while (scan < loceol && isALNUM_LC(*scan))
4777 LOAD_UTF8_CHARCLASS_ALNUM();
4778 while (hardcount < max && scan < loceol &&
4779 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4780 scan += UTF8SKIP(scan);
4784 while (scan < loceol && !isALNUM(*scan))
4789 PL_reg_flags |= RF_tainted;
4792 while (hardcount < max && scan < loceol &&
4793 !isALNUM_LC_utf8((U8*)scan)) {
4794 scan += UTF8SKIP(scan);
4798 while (scan < loceol && !isALNUM_LC(*scan))
4805 LOAD_UTF8_CHARCLASS_SPACE();
4806 while (hardcount < max && scan < loceol &&
4808 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4809 scan += UTF8SKIP(scan);
4813 while (scan < loceol && isSPACE(*scan))
4818 PL_reg_flags |= RF_tainted;
4821 while (hardcount < max && scan < loceol &&
4822 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4823 scan += UTF8SKIP(scan);
4827 while (scan < loceol && isSPACE_LC(*scan))
4834 LOAD_UTF8_CHARCLASS_SPACE();
4835 while (hardcount < max && scan < loceol &&
4837 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4838 scan += UTF8SKIP(scan);
4842 while (scan < loceol && !isSPACE(*scan))
4847 PL_reg_flags |= RF_tainted;
4850 while (hardcount < max && scan < loceol &&
4851 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4852 scan += UTF8SKIP(scan);
4856 while (scan < loceol && !isSPACE_LC(*scan))
4863 LOAD_UTF8_CHARCLASS_DIGIT();
4864 while (hardcount < max && scan < loceol &&
4865 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4866 scan += UTF8SKIP(scan);
4870 while (scan < loceol && isDIGIT(*scan))
4877 LOAD_UTF8_CHARCLASS_DIGIT();
4878 while (hardcount < max && scan < loceol &&
4879 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4880 scan += UTF8SKIP(scan);
4884 while (scan < loceol && !isDIGIT(*scan))
4888 default: /* Called on something of 0 width. */
4889 break; /* So match right here or not at all. */
4895 c = scan - PL_reginput;
4899 GET_RE_DEBUG_FLAGS_DECL;
4901 SV * const prop = sv_newmortal();
4902 regprop(prog, prop, p);
4903 PerlIO_printf(Perl_debug_log,
4904 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4905 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4913 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
4915 - regclass_swash - prepare the utf8 swash
4919 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4925 const struct reg_data * const data = prog ? prog->data : NULL;
4927 if (data && data->count) {
4928 const U32 n = ARG(node);
4930 if (data->what[n] == 's') {
4931 SV * const rv = (SV*)data->data[n];
4932 AV * const av = (AV*)SvRV((SV*)rv);
4933 SV **const ary = AvARRAY(av);
4936 /* See the end of regcomp.c:S_regclass() for
4937 * documentation of these array elements. */
4940 a = SvROK(ary[1]) ? &ary[1] : 0;
4941 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4945 else if (si && doinit) {
4946 sw = swash_init("utf8", "", si, 1, 0);
4947 (void)av_store(av, 1, sw);
4964 - reginclass - determine if a character falls into a character class
4966 The n is the ANYOF regnode, the p is the target string, lenp
4967 is pointer to the maximum length of how far to go in the p
4968 (if the lenp is zero, UTF8SKIP(p) is used),
4969 do_utf8 tells whether the target string is in UTF-8.
4974 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4977 const char flags = ANYOF_FLAGS(n);
4983 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4984 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4985 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4986 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
4987 if (len == (STRLEN)-1)
4988 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4991 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4992 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4995 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4996 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4999 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5003 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5006 if (swash_fetch(sw, p, do_utf8))
5008 else if (flags & ANYOF_FOLD) {
5009 if (!match && lenp && av) {
5011 for (i = 0; i <= av_len(av); i++) {
5012 SV* const sv = *av_fetch(av, i, FALSE);
5014 const char * const s = SvPV_const(sv, len);
5016 if (len <= plen && memEQ(s, (char*)p, len)) {
5024 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5027 to_utf8_fold(p, tmpbuf, &tmplen);
5028 if (swash_fetch(sw, tmpbuf, do_utf8))
5034 if (match && lenp && *lenp == 0)
5035 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5037 if (!match && c < 256) {
5038 if (ANYOF_BITMAP_TEST(n, c))
5040 else if (flags & ANYOF_FOLD) {
5043 if (flags & ANYOF_LOCALE) {
5044 PL_reg_flags |= RF_tainted;
5045 f = PL_fold_locale[c];
5049 if (f != c && ANYOF_BITMAP_TEST(n, f))
5053 if (!match && (flags & ANYOF_CLASS)) {
5054 PL_reg_flags |= RF_tainted;
5056 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5057 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5058 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5059 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5060 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5061 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5062 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5063 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5064 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5065 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5066 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5067 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5068 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5069 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5070 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5071 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5072 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5073 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5074 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5075 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5076 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5077 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5078 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5079 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5080 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5081 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5082 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5083 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5084 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5085 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5086 ) /* How's that for a conditional? */
5093 return (flags & ANYOF_INVERT) ? !match : match;
5097 S_reghop3(U8 *s, I32 off, const U8* lim)
5101 while (off-- && s < lim) {
5102 /* XXX could check well-formedness here */
5107 while (off++ && s > lim) {
5109 if (UTF8_IS_CONTINUED(*s)) {
5110 while (s > lim && UTF8_IS_CONTINUATION(*s))
5113 /* XXX could check well-formedness here */
5120 /* there are a bunch of places where we use two reghop3's that should
5121 be replaced with this routine. but since thats not done yet
5122 we ifdef it out - dmq
5125 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5129 while (off-- && s < rlim) {
5130 /* XXX could check well-formedness here */
5135 while (off++ && s > llim) {
5137 if (UTF8_IS_CONTINUED(*s)) {
5138 while (s > llim && UTF8_IS_CONTINUATION(*s))
5141 /* XXX could check well-formedness here */
5149 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5153 while (off-- && s < lim) {
5154 /* XXX could check well-formedness here */
5161 while (off++ && s > lim) {
5163 if (UTF8_IS_CONTINUED(*s)) {
5164 while (s > lim && UTF8_IS_CONTINUATION(*s))
5167 /* XXX could check well-formedness here */
5176 restore_pos(pTHX_ void *arg)
5179 regexp * const rex = (regexp *)arg;
5180 if (PL_reg_eval_set) {
5181 if (PL_reg_oldsaved) {
5182 rex->subbeg = PL_reg_oldsaved;
5183 rex->sublen = PL_reg_oldsavedlen;
5184 #ifdef PERL_OLD_COPY_ON_WRITE
5185 rex->saved_copy = PL_nrs;
5187 RX_MATCH_COPIED_on(rex);
5189 PL_reg_magic->mg_len = PL_reg_oldpos;
5190 PL_reg_eval_set = 0;
5191 PL_curpm = PL_reg_oldcurpm;
5196 S_to_utf8_substr(pTHX_ register regexp *prog)
5198 if (prog->float_substr && !prog->float_utf8) {
5199 SV* const sv = newSVsv(prog->float_substr);
5200 prog->float_utf8 = sv;
5201 sv_utf8_upgrade(sv);
5202 if (SvTAIL(prog->float_substr))
5204 if (prog->float_substr == prog->check_substr)
5205 prog->check_utf8 = sv;
5207 if (prog->anchored_substr && !prog->anchored_utf8) {
5208 SV* const sv = newSVsv(prog->anchored_substr);
5209 prog->anchored_utf8 = sv;
5210 sv_utf8_upgrade(sv);
5211 if (SvTAIL(prog->anchored_substr))
5213 if (prog->anchored_substr == prog->check_substr)
5214 prog->check_utf8 = sv;
5219 S_to_byte_substr(pTHX_ register regexp *prog)
5222 if (prog->float_utf8 && !prog->float_substr) {
5223 SV* sv = newSVsv(prog->float_utf8);
5224 prog->float_substr = sv;
5225 if (sv_utf8_downgrade(sv, TRUE)) {
5226 if (SvTAIL(prog->float_utf8))
5230 prog->float_substr = sv = &PL_sv_undef;
5232 if (prog->float_utf8 == prog->check_utf8)
5233 prog->check_substr = sv;
5235 if (prog->anchored_utf8 && !prog->anchored_substr) {
5236 SV* sv = newSVsv(prog->anchored_utf8);
5237 prog->anchored_substr = sv;
5238 if (sv_utf8_downgrade(sv, TRUE)) {
5239 if (SvTAIL(prog->anchored_utf8))
5243 prog->anchored_substr = sv = &PL_sv_undef;
5245 if (prog->anchored_utf8 == prog->check_utf8)
5246 prog->check_substr = sv;
5252 * c-indentation-style: bsd
5254 * indent-tabs-mode: t
5257 * ex: set ts=8 sts=4 sw=4 noet: