5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
38 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
70 #define PERL_IN_REGEXEC_C
73 #ifdef PERL_IN_XSUB_RE
79 #define RF_tainted 1 /* tainted information used? */
80 #define RF_warned 2 /* warned about big count? */
81 #define RF_evaled 4 /* Did an EVAL with setting? */
82 #define RF_utf8 8 /* String contains multibyte chars? */
84 #define UTF ((PL_reg_flags & RF_utf8) != 0)
86 #define RS_init 1 /* eval environment created */
87 #define RS_set 2 /* replsv value is set */
93 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
99 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
100 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
102 #define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
106 #define HOPBACKc(pos, off) \
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
113 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
114 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
116 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
123 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
125 /* for use after a quantifier and before an EXACT-like node -- japhy */
126 #define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
133 #define HAS_TEXT(rn) ( \
134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
141 #define FIND_NEXT_IMPT(rn) STMT_START { \
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
145 rn = NEXTOPER(NEXTOPER(rn)); \
146 else if (type == PLUS) \
148 else if (type == IFMATCH) \
149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
150 else rn += NEXT_OFF(rn); \
154 static void restore_pos(pTHX_ void *arg);
157 S_regcppush(pTHX_ I32 parenfloor)
160 const int retval = PL_savestack_ix;
161 #define REGCP_PAREN_ELEMS 4
162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
164 GET_RE_DEBUG_FLAGS_DECL;
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
169 #define REGCP_OTHER_ELEMS 6
170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
171 for (p = PL_regsize; p > parenfloor; p--) {
172 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
175 SSPUSHPTR(PL_reg_start_tmp[p]);
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
187 SSPUSHINT(*PL_reglastcloseparen);
188 SSPUSHPTR(PL_reginput);
189 #define REGCP_FRAME_ELEMS 2
190 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
198 /* These are needed since we do not localize EVAL nodes: */
199 # define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
203 # define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
204 PerlIO_printf(Perl_debug_log, \
205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
209 S_regcppop(pTHX_ const regexp *rex)
215 GET_RE_DEBUG_FLAGS_DECL;
217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
221 input = (char *) SSPOPPTR;
222 *PL_reglastcloseparen = SSPOPINT;
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
226 /* Now restore the parentheses context. */
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
230 U32 paren = (U32)SSPOPINT;
231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
232 PL_regstartp[paren] = SSPOPINT;
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
237 PerlIO_printf(Perl_debug_log,
238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
239 (UV)paren, (IV)PL_regstartp[paren],
240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
241 (IV)PL_regendp[paren],
242 (paren > *PL_reglastparen ? "(no)" : ""));
246 if (*PL_reglastparen + 1 <= rex->nparens) {
247 PerlIO_printf(Perl_debug_log,
248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
265 PL_regstartp[i] = -1;
272 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
275 * pregexec and friends
278 #ifndef PERL_IN_XSUB_RE
280 - pregexec - match a regexp against a string
283 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
284 char *strbeg, I32 minend, SV *screamer, U32 nosave)
285 /* strend: pointer to null at end of string */
286 /* strbeg: real beginning of string */
287 /* minend: end of match must be >=minend after stringarg. */
288 /* nosave: For optimizations. */
291 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
292 nosave ? 0 : REXEC_COPY_STR);
297 * Need to implement the following flags for reg_anch:
299 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
301 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
302 * INTUIT_AUTORITATIVE_ML
303 * INTUIT_ONCE_NOML - Intuit can match in one location only.
306 * Another flag for this function: SECOND_TIME (so that float substrs
307 * with giant delta may be not rechecked).
310 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
312 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
313 Otherwise, only SvCUR(sv) is used to get strbeg. */
315 /* XXXX We assume that strpos is strbeg unless sv. */
317 /* XXXX Some places assume that there is a fixed substring.
318 An update may be needed if optimizer marks as "INTUITable"
319 RExen without fixed substrings. Similarly, it is assumed that
320 lengths of all the strings are no more than minlen, thus they
321 cannot come from lookahead.
322 (Or minlen should take into account lookahead.) */
324 /* A failure to find a constant substring means that there is no need to make
325 an expensive call to REx engine, thus we celebrate a failure. Similarly,
326 finding a substring too deep into the string means that less calls to
327 regtry() should be needed.
329 REx compiler's optimizer found 4 possible hints:
330 a) Anchored substring;
332 c) Whether we are anchored (beginning-of-line or \G);
333 d) First node (of those at offset 0) which may distingush positions;
334 We use a)b)d) and multiline-part of c), and try to find a position in the
335 string which does not contradict any of them.
338 /* Most of decisions we do here should have been done at compile time.
339 The nodes of the REx which we used for the search should have been
340 deleted from the finite automaton. */
343 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
344 char *strend, U32 flags, re_scream_pos_data *data)
347 register I32 start_shift = 0;
348 /* Should be nonnegative! */
349 register I32 end_shift = 0;
354 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
356 register char *other_last = NULL; /* other substr checked before this */
357 char *check_at = NULL; /* check substr found at this pos */
358 const I32 multiline = prog->reganch & PMf_MULTILINE;
360 const char * const i_strpos = strpos;
363 GET_RE_DEBUG_FLAGS_DECL;
365 RX_MATCH_UTF8_set(prog,do_utf8);
367 if (prog->reganch & ROPT_UTF8) {
368 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
369 "UTF-8 regex...\n"));
370 PL_reg_flags |= RF_utf8;
374 RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
375 PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
379 if (PL_reg_match_utf8)
380 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
381 "UTF-8 target...\n"));
382 PerlIO_printf(Perl_debug_log,
383 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
384 PL_colors[4], PL_colors[5], PL_colors[0],
387 (strlen(prog->precomp) > 60 ? "..." : ""),
389 (int)(len > 60 ? 60 : len),
391 (len > 60 ? "..." : "")
395 /* CHR_DIST() would be more correct here but it makes things slow. */
396 if (prog->minlen > strend - strpos) {
397 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
398 "String too short... [re_intuit_start]\n"));
401 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
404 if (!prog->check_utf8 && prog->check_substr)
405 to_utf8_substr(prog);
406 check = prog->check_utf8;
408 if (!prog->check_substr && prog->check_utf8)
409 to_byte_substr(prog);
410 check = prog->check_substr;
412 if (check == &PL_sv_undef) {
413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
414 "Non-utf string cannot match utf check string\n"));
417 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
418 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
419 || ( (prog->reganch & ROPT_ANCH_BOL)
420 && !multiline ) ); /* Check after \n? */
423 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
424 | ROPT_IMPLICIT)) /* not a real BOL */
425 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
427 && (strpos != strbeg)) {
428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
431 if (prog->check_offset_min == prog->check_offset_max &&
432 !(prog->reganch & ROPT_CANY_SEEN)) {
433 /* Substring at constant offset from beg-of-str... */
436 s = HOP3c(strpos, prog->check_offset_min, strend);
438 slen = SvCUR(check); /* >= 1 */
440 if ( strend - s > slen || strend - s < slen - 1
441 || (strend - s == slen && strend[-1] != '\n')) {
442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
445 /* Now should match s[0..slen-2] */
447 if (slen && (*SvPVX_const(check) != *s
449 && memNE(SvPVX_const(check), s, slen)))) {
451 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
455 else if (*SvPVX_const(check) != *s
456 || ((slen = SvCUR(check)) > 1
457 && memNE(SvPVX_const(check), s, slen)))
460 goto success_at_start;
463 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
466 end_shift = prog->minlen - start_shift -
467 CHR_SVLEN(check) + (SvTAIL(check) != 0);
469 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
470 - (SvTAIL(check) != 0);
471 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
473 if (end_shift < eshift)
477 else { /* Can match at random position */
480 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
481 /* Should be nonnegative! */
482 end_shift = prog->minlen - start_shift -
483 CHR_SVLEN(check) + (SvTAIL(check) != 0);
486 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
488 Perl_croak(aTHX_ "panic: end_shift");
492 /* Find a possible match in the region s..strend by looking for
493 the "check" substring in the region corrected by start/end_shift. */
494 if (flags & REXEC_SCREAM) {
495 I32 p = -1; /* Internal iterator of scream. */
496 I32 * const pp = data ? data->scream_pos : &p;
498 if (PL_screamfirst[BmRARE(check)] >= 0
499 || ( BmRARE(check) == '\n'
500 && (BmPREVIOUS(check) == SvCUR(check) - 1)
502 s = screaminstr(sv, check,
503 start_shift + (s - strbeg), end_shift, pp, 0);
506 /* we may be pointing at the wrong string */
507 if (s && RX_MATCH_COPIED(prog))
508 s = strbeg + (s - SvPVX_const(sv));
510 *data->scream_olds = s;
512 else if (prog->reganch & ROPT_CANY_SEEN)
513 s = fbm_instr((U8*)(s + start_shift),
514 (U8*)(strend - end_shift),
515 check, multiline ? FBMrf_MULTILINE : 0);
517 s = fbm_instr(HOP3(s, start_shift, strend),
518 HOP3(strend, -end_shift, strbeg),
519 check, multiline ? FBMrf_MULTILINE : 0);
521 /* Update the count-of-usability, remove useless subpatterns,
524 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
525 (s ? "Found" : "Did not find"),
526 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
528 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
530 PL_colors[1], (SvTAIL(check) ? "$" : ""),
531 (s ? " at offset " : "...\n") ) );
538 /* Finish the diagnostic message */
539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
541 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
542 Start with the other substr.
543 XXXX no SCREAM optimization yet - and a very coarse implementation
544 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
545 *always* match. Probably should be marked during compile...
546 Probably it is right to do no SCREAM here...
549 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
550 /* Take into account the "other" substring. */
551 /* XXXX May be hopelessly wrong for UTF... */
554 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
557 char * const last = HOP3c(s, -start_shift, strbeg);
559 char * const saved_s = s;
562 t = s - prog->check_offset_max;
563 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
565 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
570 t = HOP3c(t, prog->anchored_offset, strend);
571 if (t < other_last) /* These positions already checked */
573 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
576 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
577 /* On end-of-str: see comment below. */
578 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
579 if (must == &PL_sv_undef) {
581 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
586 HOP3(HOP3(last1, prog->anchored_offset, strend)
587 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
589 multiline ? FBMrf_MULTILINE : 0
591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
592 "%s anchored substr \"%s%.*s%s\"%s",
593 (s ? "Found" : "Contradicts"),
596 - (SvTAIL(must)!=0)),
598 PL_colors[1], (SvTAIL(must) ? "$" : "")));
600 if (last1 >= last2) {
601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
602 ", giving up...\n"));
605 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
606 ", trying floating at offset %ld...\n",
607 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
608 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
609 s = HOP3c(last, 1, strend);
613 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
614 (long)(s - i_strpos)));
615 t = HOP3c(s, -prog->anchored_offset, strbeg);
616 other_last = HOP3c(s, 1, strend);
624 else { /* Take into account the floating substring. */
626 char * const saved_s = s;
629 t = HOP3c(s, -start_shift, strbeg);
631 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
632 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
633 last = HOP3c(t, prog->float_max_offset, strend);
634 s = HOP3c(t, prog->float_min_offset, strend);
637 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
638 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
639 /* fbm_instr() takes into account exact value of end-of-str
640 if the check is SvTAIL(ed). Since false positives are OK,
641 and end-of-str is not later than strend we are OK. */
642 if (must == &PL_sv_undef) {
644 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
647 s = fbm_instr((unsigned char*)s,
648 (unsigned char*)last + SvCUR(must)
650 must, multiline ? FBMrf_MULTILINE : 0);
651 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
652 (s ? "Found" : "Contradicts"),
654 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
656 PL_colors[1], (SvTAIL(must) ? "$" : "")));
659 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
660 ", giving up...\n"));
663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
664 ", trying anchored starting at offset %ld...\n",
665 (long)(saved_s + 1 - i_strpos)));
667 s = HOP3c(t, 1, strend);
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
672 (long)(s - i_strpos)));
673 other_last = s; /* Fix this later. --Hugo */
682 t = s - prog->check_offset_max;
683 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
685 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
687 /* Fixed substring is found far enough so that the match
688 cannot start at strpos. */
690 if (ml_anch && t[-1] != '\n') {
691 /* Eventually fbm_*() should handle this, but often
692 anchored_offset is not 0, so this check will not be wasted. */
693 /* XXXX In the code below we prefer to look for "^" even in
694 presence of anchored substrings. And we search even
695 beyond the found float position. These pessimizations
696 are historical artefacts only. */
698 while (t < strend - prog->minlen) {
700 if (t < check_at - prog->check_offset_min) {
701 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
702 /* Since we moved from the found position,
703 we definitely contradict the found anchored
704 substr. Due to the above check we do not
705 contradict "check" substr.
706 Thus we can arrive here only if check substr
707 is float. Redo checking for "other"=="fixed".
710 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
711 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
712 goto do_other_anchored;
714 /* We don't contradict the found floating substring. */
715 /* XXXX Why not check for STCLASS? */
717 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
718 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
721 /* Position contradicts check-string */
722 /* XXXX probably better to look for check-string
723 than for "\n", so one should lower the limit for t? */
724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
725 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
726 other_last = strpos = s = t + 1;
731 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
732 PL_colors[0], PL_colors[1]));
736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
737 PL_colors[0], PL_colors[1]));
741 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
744 /* The found string does not prohibit matching at strpos,
745 - no optimization of calling REx engine can be performed,
746 unless it was an MBOL and we are not after MBOL,
747 or a future STCLASS check will fail this. */
749 /* Even in this situation we may use MBOL flag if strpos is offset
750 wrt the start of the string. */
751 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
752 && (strpos != strbeg) && strpos[-1] != '\n'
753 /* May be due to an implicit anchor of m{.*foo} */
754 && !(prog->reganch & ROPT_IMPLICIT))
759 DEBUG_EXECUTE_r( if (ml_anch)
760 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
761 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
764 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
766 prog->check_utf8 /* Could be deleted already */
767 && --BmUSEFUL(prog->check_utf8) < 0
768 && (prog->check_utf8 == prog->float_utf8)
770 prog->check_substr /* Could be deleted already */
771 && --BmUSEFUL(prog->check_substr) < 0
772 && (prog->check_substr == prog->float_substr)
775 /* If flags & SOMETHING - do not do it many times on the same match */
776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
777 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
778 if (do_utf8 ? prog->check_substr : prog->check_utf8)
779 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
780 prog->check_substr = prog->check_utf8 = NULL; /* disable */
781 prog->float_substr = prog->float_utf8 = NULL; /* clear */
782 check = NULL; /* abort */
784 /* XXXX This is a remnant of the old implementation. It
785 looks wasteful, since now INTUIT can use many
787 prog->reganch &= ~RE_USE_INTUIT;
794 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
795 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
796 /* minlen == 0 is possible if regstclass is \b or \B,
797 and the fixed substr is ''$.
798 Since minlen is already taken into account, s+1 is before strend;
799 accidentally, minlen >= 1 guaranties no false positives at s + 1
800 even for \b or \B. But (minlen? 1 : 0) below assumes that
801 regstclass does not come from lookahead... */
802 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
803 This leaves EXACTF only, which is dealt with in find_byclass(). */
804 const U8* const str = (U8*)STRING(prog->regstclass);
805 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
806 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
808 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
809 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
810 : (prog->float_substr || prog->float_utf8
811 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
814 /*if (OP(prog->regstclass) == TRIE)
817 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
820 const char *what = NULL;
822 if (endpos == strend) {
823 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
824 "Could not match STCLASS...\n") );
827 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
828 "This position contradicts STCLASS...\n") );
829 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
831 /* Contradict one of substrings */
832 if (prog->anchored_substr || prog->anchored_utf8) {
833 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
834 DEBUG_EXECUTE_r( what = "anchored" );
836 s = HOP3c(t, 1, strend);
837 if (s + start_shift + end_shift > strend) {
838 /* XXXX Should be taken into account earlier? */
839 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
840 "Could not match STCLASS...\n") );
845 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
846 "Looking for %s substr starting at offset %ld...\n",
847 what, (long)(s + start_shift - i_strpos)) );
850 /* Have both, check_string is floating */
851 if (t + start_shift >= check_at) /* Contradicts floating=check */
852 goto retry_floating_check;
853 /* Recheck anchored substring, but not floating... */
857 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
858 "Looking for anchored substr starting at offset %ld...\n",
859 (long)(other_last - i_strpos)) );
860 goto do_other_anchored;
862 /* Another way we could have checked stclass at the
863 current position only: */
868 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
869 "Looking for /%s^%s/m starting at offset %ld...\n",
870 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
873 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
875 /* Check is floating subtring. */
876 retry_floating_check:
877 t = check_at - start_shift;
878 DEBUG_EXECUTE_r( what = "floating" );
879 goto hop_and_restart;
882 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
883 "By STCLASS: moving %ld --> %ld\n",
884 (long)(t - i_strpos), (long)(s - i_strpos))
888 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
889 "Does not contradict STCLASS...\n");
894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
895 PL_colors[4], (check ? "Guessed" : "Giving up"),
896 PL_colors[5], (long)(s - i_strpos)) );
899 fail_finish: /* Substring not found */
900 if (prog->check_substr || prog->check_utf8) /* could be removed already */
901 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
904 PL_colors[4], PL_colors[5]));
908 /* We know what class REx starts with. Try to find this position... */
909 /* if reginfo is NULL, its a dryrun */
910 /* annoyingly all the vars in this routine have different names from their counterparts
911 in regmatch. /grrr */
914 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
915 const char *strend, const regmatch_info *reginfo)
918 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
922 register STRLEN uskip;
926 register I32 tmp = 1; /* Scratch variable? */
927 register const bool do_utf8 = PL_reg_match_utf8;
929 /* We know what class it must start with. */
933 while (s + (uskip = UTF8SKIP(s)) <= strend) {
934 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
935 !UTF8_IS_INVARIANT((U8)s[0]) ?
936 reginclass(prog, c, (U8*)s, 0, do_utf8) :
937 REGINCLASS(prog, c, (U8*)s)) {
938 if (tmp && (!reginfo || regtry(reginfo, s)))
952 if (REGINCLASS(prog, c, (U8*)s) ||
953 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
954 /* The assignment of 2 is intentional:
955 * for the folded sharp s, the skip is 2. */
956 (skip = SHARP_S_SKIP))) {
957 if (tmp && (!reginfo || regtry(reginfo, s)))
970 if (tmp && (!reginfo || regtry(reginfo, s)))
979 ln = STR_LEN(c); /* length to match in octets/bytes */
980 lnc = (I32) ln; /* length to match in characters */
984 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
985 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
986 const U32 uniflags = UTF8_ALLOW_DEFAULT;
988 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
989 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
991 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
993 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
996 while (sm < ((U8 *) m + ln)) {
1011 c2 = PL_fold_locale[c1];
1013 e = HOP3c(strend, -((I32)lnc), s);
1015 if (!reginfo && e < s)
1016 e = s; /* Due to minlen logic of intuit() */
1018 /* The idea in the EXACTF* cases is to first find the
1019 * first character of the EXACTF* node and then, if
1020 * necessary, case-insensitively compare the full
1021 * text of the node. The c1 and c2 are the first
1022 * characters (though in Unicode it gets a bit
1023 * more complicated because there are more cases
1024 * than just upper and lower: one needs to use
1025 * the so-called folding case for case-insensitive
1026 * matching (called "loose matching" in Unicode).
1027 * ibcmp_utf8() will do just that. */
1031 U8 tmpbuf [UTF8_MAXBYTES+1];
1032 STRLEN len, foldlen;
1033 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1035 /* Upper and lower of 1st char are equal -
1036 * probably not a "letter". */
1038 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1042 ibcmp_utf8(s, NULL, 0, do_utf8,
1043 m, NULL, ln, (bool)UTF))
1044 && (!reginfo || regtry(reginfo, s)) )
1047 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1048 uvchr_to_utf8(tmpbuf, c);
1049 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1051 && (f == c1 || f == c2)
1052 && (ln == foldlen ||
1053 !ibcmp_utf8((char *) foldbuf,
1054 NULL, foldlen, do_utf8,
1056 NULL, ln, (bool)UTF))
1057 && (!reginfo || regtry(reginfo, s)) )
1065 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1068 /* Handle some of the three Greek sigmas cases.
1069 * Note that not all the possible combinations
1070 * are handled here: some of them are handled
1071 * by the standard folding rules, and some of
1072 * them (the character class or ANYOF cases)
1073 * are handled during compiletime in
1074 * regexec.c:S_regclass(). */
1075 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1076 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1077 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1079 if ( (c == c1 || c == c2)
1081 ibcmp_utf8(s, NULL, 0, do_utf8,
1082 m, NULL, ln, (bool)UTF))
1083 && (!reginfo || regtry(reginfo, s)) )
1086 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1087 uvchr_to_utf8(tmpbuf, c);
1088 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1090 && (f == c1 || f == c2)
1091 && (ln == foldlen ||
1092 !ibcmp_utf8((char *) foldbuf,
1093 NULL, foldlen, do_utf8,
1095 NULL, ln, (bool)UTF))
1096 && (!reginfo || regtry(reginfo, s)) )
1107 && (ln == 1 || !(OP(c) == EXACTF
1109 : ibcmp_locale(s, m, ln)))
1110 && (!reginfo || regtry(reginfo, s)) )
1116 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1117 && (ln == 1 || !(OP(c) == EXACTF
1119 : ibcmp_locale(s, m, ln)))
1120 && (!reginfo || regtry(reginfo, s)) )
1127 PL_reg_flags |= RF_tainted;
1134 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1135 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1137 tmp = ((OP(c) == BOUND ?
1138 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1139 LOAD_UTF8_CHARCLASS_ALNUM();
1140 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1141 if (tmp == !(OP(c) == BOUND ?
1142 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1143 isALNUM_LC_utf8((U8*)s)))
1146 if ((!reginfo || regtry(reginfo, s)))
1153 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1154 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1155 while (s < strend) {
1157 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1159 if ((!reginfo || regtry(reginfo, s)))
1165 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
1169 PL_reg_flags |= RF_tainted;
1176 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1177 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1179 tmp = ((OP(c) == NBOUND ?
1180 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1181 LOAD_UTF8_CHARCLASS_ALNUM();
1182 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1183 if (tmp == !(OP(c) == NBOUND ?
1184 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1185 isALNUM_LC_utf8((U8*)s)))
1187 else if ((!reginfo || regtry(reginfo, s)))
1193 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1194 tmp = ((OP(c) == NBOUND ?
1195 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1196 while (s < strend) {
1198 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1200 else if ((!reginfo || regtry(reginfo, s)))
1205 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
1210 LOAD_UTF8_CHARCLASS_ALNUM();
1211 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1212 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1213 if (tmp && (!reginfo || regtry(reginfo, s)))
1224 while (s < strend) {
1226 if (tmp && (!reginfo || regtry(reginfo, s)))
1238 PL_reg_flags |= RF_tainted;
1240 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1241 if (isALNUM_LC_utf8((U8*)s)) {
1242 if (tmp && (!reginfo || regtry(reginfo, s)))
1253 while (s < strend) {
1254 if (isALNUM_LC(*s)) {
1255 if (tmp && (!reginfo || regtry(reginfo, s)))
1268 LOAD_UTF8_CHARCLASS_ALNUM();
1269 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1270 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1271 if (tmp && (!reginfo || regtry(reginfo, s)))
1282 while (s < strend) {
1284 if (tmp && (!reginfo || regtry(reginfo, s)))
1296 PL_reg_flags |= RF_tainted;
1298 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1299 if (!isALNUM_LC_utf8((U8*)s)) {
1300 if (tmp && (!reginfo || regtry(reginfo, s)))
1311 while (s < strend) {
1312 if (!isALNUM_LC(*s)) {
1313 if (tmp && (!reginfo || regtry(reginfo, s)))
1326 LOAD_UTF8_CHARCLASS_SPACE();
1327 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1328 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1329 if (tmp && (!reginfo || regtry(reginfo, s)))
1340 while (s < strend) {
1342 if (tmp && (!reginfo || regtry(reginfo, s)))
1354 PL_reg_flags |= RF_tainted;
1356 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1357 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1358 if (tmp && (!reginfo || regtry(reginfo, s)))
1369 while (s < strend) {
1370 if (isSPACE_LC(*s)) {
1371 if (tmp && (!reginfo || regtry(reginfo, s)))
1384 LOAD_UTF8_CHARCLASS_SPACE();
1385 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1386 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1387 if (tmp && (!reginfo || regtry(reginfo, s)))
1398 while (s < strend) {
1400 if (tmp && (!reginfo || regtry(reginfo, s)))
1412 PL_reg_flags |= RF_tainted;
1414 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1415 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1416 if (tmp && (!reginfo || regtry(reginfo, s)))
1427 while (s < strend) {
1428 if (!isSPACE_LC(*s)) {
1429 if (tmp && (!reginfo || regtry(reginfo, s)))
1442 LOAD_UTF8_CHARCLASS_DIGIT();
1443 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1444 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1445 if (tmp && (!reginfo || regtry(reginfo, s)))
1456 while (s < strend) {
1458 if (tmp && (!reginfo || regtry(reginfo, s)))
1470 PL_reg_flags |= RF_tainted;
1472 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1473 if (isDIGIT_LC_utf8((U8*)s)) {
1474 if (tmp && (!reginfo || regtry(reginfo, s)))
1485 while (s < strend) {
1486 if (isDIGIT_LC(*s)) {
1487 if (tmp && (!reginfo || regtry(reginfo, s)))
1500 LOAD_UTF8_CHARCLASS_DIGIT();
1501 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1502 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1503 if (tmp && (!reginfo || regtry(reginfo, s)))
1514 while (s < strend) {
1516 if (tmp && (!reginfo || regtry(reginfo, s)))
1528 PL_reg_flags |= RF_tainted;
1530 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1531 if (!isDIGIT_LC_utf8((U8*)s)) {
1532 if (tmp && (!reginfo || regtry(reginfo, s)))
1543 while (s < strend) {
1544 if (!isDIGIT_LC(*s)) {
1545 if (tmp && (!reginfo || regtry(reginfo, s)))
1557 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1559 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1560 trie_type = do_utf8 ?
1561 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1563 /* what trie are we using right now */
1565 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1566 reg_trie_data *trie=aho->trie;
1568 const char *last_start = strend - trie->minlen;
1569 const char *real_start = s;
1570 STRLEN maxlen = trie->maxlen;
1572 U8 **points; /* map of where we were in the input string
1573 when reading a given string. For ASCII this
1574 is unnecessary overhead as the relationship
1575 is always 1:1, but for unicode, especially
1576 case folded unicode this is not true. */
1577 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1579 GET_RE_DEBUG_FLAGS_DECL;
1581 /* We can't just allocate points here. We need to wrap it in
1582 * an SV so it gets freed properly if there is a croak while
1583 * running the match */
1586 sv_points=newSV(maxlen * sizeof(U8 *));
1587 SvCUR_set(sv_points,
1588 maxlen * sizeof(U8 *));
1589 SvPOK_on(sv_points);
1590 sv_2mortal(sv_points);
1591 points=(U8**)SvPV_nolen(sv_points );
1593 if (trie->bitmap && trie_type != trie_utf8_fold) {
1594 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
1599 while (s <= last_start) {
1600 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1608 U8 *uscan = (U8*)NULL;
1609 U8 *leftmost = NULL;
1613 while ( state && uc <= (U8*)strend ) {
1615 if (aho->states[ state ].wordnum) {
1616 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1617 if (!leftmost || lpos < leftmost)
1621 points[pointpos++ % maxlen]= uc;
1622 switch (trie_type) {
1623 case trie_utf8_fold:
1625 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1630 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1631 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1632 foldlen -= UNISKIP( uvc );
1633 uscan = foldbuf + UNISKIP( uvc );
1637 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1646 charid = trie->charmap[ uvc ];
1650 if (trie->widecharmap) {
1651 SV** const svpp = hv_fetch(trie->widecharmap,
1652 (char*)&uvc, sizeof(UV), 0);
1654 charid = (U16)SvIV(*svpp);
1657 DEBUG_TRIE_EXECUTE_r(
1658 PerlIO_printf(Perl_debug_log,
1659 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1660 (int)((const char*)uc - real_start), charid, uvc)
1665 U32 word = aho->states[ state ].wordnum;
1666 base = aho->states[ state ].trans.base;
1668 DEBUG_TRIE_EXECUTE_r(
1669 PerlIO_printf( Perl_debug_log,
1670 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1671 failed ? "Fail transition to " : "",
1672 state, base, uvc, word)
1677 (base + charid > trie->uniquecharcount )
1678 && (base + charid - 1 - trie->uniquecharcount
1680 && trie->trans[base + charid - 1 -
1681 trie->uniquecharcount].check == state
1682 && (tmp=trie->trans[base + charid - 1 -
1683 trie->uniquecharcount ].next))
1693 state = aho->fail[state];
1697 /* we must be accepting here */
1705 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1706 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
1712 if ( aho->states[ state ].wordnum ) {
1713 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1714 if (!leftmost || lpos < leftmost)
1717 DEBUG_TRIE_EXECUTE_r(
1718 PerlIO_printf( Perl_debug_log,
1719 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1724 s = (char*)leftmost;
1725 if (!reginfo || regtry(reginfo, s)) {
1740 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1749 - regexec_flags - match a regexp against a string
1752 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1753 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1754 /* strend: pointer to null at end of string */
1755 /* strbeg: real beginning of string */
1756 /* minend: end of match must be >=minend after stringarg. */
1757 /* data: May be used for some additional optimizations. */
1758 /* nosave: For optimizations. */
1762 register regnode *c;
1763 register char *startpos = stringarg;
1764 I32 minlen; /* must match at least this many chars */
1765 I32 dontbother = 0; /* how many characters not to try at end */
1766 I32 end_shift = 0; /* Same for the end. */ /* CC */
1767 I32 scream_pos = -1; /* Internal iterator of scream. */
1768 char *scream_olds = NULL;
1769 SV* const oreplsv = GvSV(PL_replgv);
1770 const bool do_utf8 = DO_UTF8(sv);
1773 regmatch_info reginfo; /* create some info to pass to regtry etc */
1775 GET_RE_DEBUG_FLAGS_DECL;
1777 PERL_UNUSED_ARG(data);
1779 /* Be paranoid... */
1780 if (prog == NULL || startpos == NULL) {
1781 Perl_croak(aTHX_ "NULL regexp parameter");
1785 multiline = prog->reganch & PMf_MULTILINE;
1786 reginfo.prog = prog;
1788 RX_MATCH_UTF8_set(prog, do_utf8);
1790 minlen = prog->minlen;
1791 if (strend - startpos < minlen) {
1792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1793 "String too short [regexec_flags]...\n"));
1797 /* Check validity of program. */
1798 if (UCHARAT(prog->program) != REG_MAGIC) {
1799 Perl_croak(aTHX_ "corrupted regexp program");
1803 PL_reg_eval_set = 0;
1806 if (prog->reganch & ROPT_UTF8)
1807 PL_reg_flags |= RF_utf8;
1809 /* Mark beginning of line for ^ and lookbehind. */
1810 reginfo.bol = startpos; /* XXX not used ??? */
1814 /* Mark end of line for $ (and such) */
1817 /* see how far we have to get to not match where we matched before */
1818 reginfo.till = startpos+minend;
1820 /* If there is a "must appear" string, look for it. */
1823 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
1826 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1827 reginfo.ganch = startpos;
1828 else if (sv && SvTYPE(sv) >= SVt_PVMG
1830 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1831 && mg->mg_len >= 0) {
1832 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1833 if (prog->reganch & ROPT_ANCH_GPOS) {
1834 if (s > reginfo.ganch)
1839 else /* pos() not defined */
1840 reginfo.ganch = strbeg;
1843 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1844 re_scream_pos_data d;
1846 d.scream_olds = &scream_olds;
1847 d.scream_pos = &scream_pos;
1848 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1850 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1851 goto phooey; /* not present */
1856 RE_PV_DISPLAY_DECL(s0, len0, UTF,
1857 PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
1858 RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
1859 PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
1863 PerlIO_printf(Perl_debug_log,
1864 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1865 PL_colors[4], PL_colors[5], PL_colors[0],
1868 len0 > 60 ? "..." : "",
1870 (int)(len1 > 60 ? 60 : len1),
1872 (len1 > 60 ? "..." : "")
1876 /* Simplest case: anchored match need be tried only once. */
1877 /* [unless only anchor is BOL and multiline is set] */
1878 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1879 if (s == startpos && regtry(®info, startpos))
1881 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1882 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1887 dontbother = minlen - 1;
1888 end = HOP3c(strend, -dontbother, strbeg) - 1;
1889 /* for multiline we only have to try after newlines */
1890 if (prog->check_substr || prog->check_utf8) {
1894 if (regtry(®info, s))
1899 if (prog->reganch & RE_USE_INTUIT) {
1900 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1911 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1912 if (regtry(®info, s))
1919 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1920 if (regtry(®info, reginfo.ganch))
1925 /* Messy cases: unanchored match. */
1926 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1927 /* we have /x+whatever/ */
1928 /* it must be a one character string (XXXX Except UTF?) */
1933 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1934 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1935 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1938 while (s < strend) {
1940 DEBUG_EXECUTE_r( did_match = 1 );
1941 if (regtry(®info, s)) goto got_it;
1943 while (s < strend && *s == ch)
1950 while (s < strend) {
1952 DEBUG_EXECUTE_r( did_match = 1 );
1953 if (regtry(®info, s)) goto got_it;
1955 while (s < strend && *s == ch)
1961 DEBUG_EXECUTE_r(if (!did_match)
1962 PerlIO_printf(Perl_debug_log,
1963 "Did not find anchored character...\n")
1966 else if (prog->anchored_substr != NULL
1967 || prog->anchored_utf8 != NULL
1968 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1969 && prog->float_max_offset < strend - s)) {
1974 char *last1; /* Last position checked before */
1978 if (prog->anchored_substr || prog->anchored_utf8) {
1979 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1980 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1981 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1982 back_max = back_min = prog->anchored_offset;
1984 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1985 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1986 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1987 back_max = prog->float_max_offset;
1988 back_min = prog->float_min_offset;
1990 if (must == &PL_sv_undef)
1991 /* could not downgrade utf8 check substring, so must fail */
1994 last = HOP3c(strend, /* Cannot start after this */
1995 -(I32)(CHR_SVLEN(must)
1996 - (SvTAIL(must) != 0) + back_min), strbeg);
1999 last1 = HOPc(s, -1);
2001 last1 = s - 1; /* bogus */
2003 /* XXXX check_substr already used to find "s", can optimize if
2004 check_substr==must. */
2006 dontbother = end_shift;
2007 strend = HOPc(strend, -dontbother);
2008 while ( (s <= last) &&
2009 ((flags & REXEC_SCREAM)
2010 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
2011 end_shift, &scream_pos, 0))
2012 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
2013 (unsigned char*)strend, must,
2014 multiline ? FBMrf_MULTILINE : 0))) ) {
2015 /* we may be pointing at the wrong string */
2016 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
2017 s = strbeg + (s - SvPVX_const(sv));
2018 DEBUG_EXECUTE_r( did_match = 1 );
2019 if (HOPc(s, -back_max) > last1) {
2020 last1 = HOPc(s, -back_min);
2021 s = HOPc(s, -back_max);
2024 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2026 last1 = HOPc(s, -back_min);
2030 while (s <= last1) {
2031 if (regtry(®info, s))
2037 while (s <= last1) {
2038 if (regtry(®info, s))
2044 DEBUG_EXECUTE_r(if (!did_match)
2045 PerlIO_printf(Perl_debug_log,
2046 "Did not find %s substr \"%s%.*s%s\"%s...\n",
2047 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2048 ? "anchored" : "floating"),
2050 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
2052 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2056 else if ((c = prog->regstclass)) {
2058 const OPCODE op = OP(prog->regstclass);
2059 /* don't bother with what can't match */
2060 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
2061 strend = HOPc(strend, -(minlen - 1));
2064 SV * const prop = sv_newmortal();
2065 regprop(prog, prop, c);
2067 RE_PV_DISPLAY_DECL(s0,len0,UTF,
2068 PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
2069 RE_PV_DISPLAY_DECL(s1,len1,UTF,
2070 PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
2071 PerlIO_printf(Perl_debug_log,
2072 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
2074 len1, len1, s1, (int)(strend - s));
2077 if (find_byclass(prog, c, s, strend, ®info))
2079 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2083 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2088 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2089 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2090 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2092 if (flags & REXEC_SCREAM) {
2093 last = screaminstr(sv, float_real, s - strbeg,
2094 end_shift, &scream_pos, 1); /* last one */
2096 last = scream_olds; /* Only one occurrence. */
2097 /* we may be pointing at the wrong string */
2098 else if (RX_MATCH_COPIED(prog))
2099 s = strbeg + (s - SvPVX_const(sv));
2103 const char * const little = SvPV_const(float_real, len);
2105 if (SvTAIL(float_real)) {
2106 if (memEQ(strend - len + 1, little, len - 1))
2107 last = strend - len + 1;
2108 else if (!multiline)
2109 last = memEQ(strend - len, little, len)
2110 ? strend - len : NULL;
2116 last = rninstr(s, strend, little, little + len);
2118 last = strend; /* matching "$" */
2122 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2123 "%sCan't trim the tail, match fails (should not happen)%s\n",
2124 PL_colors[4], PL_colors[5]));
2125 goto phooey; /* Should not happen! */
2127 dontbother = strend - last + prog->float_min_offset;
2129 if (minlen && (dontbother < minlen))
2130 dontbother = minlen - 1;
2131 strend -= dontbother; /* this one's always in bytes! */
2132 /* We don't know much -- general case. */
2135 if (regtry(®info, s))
2144 if (regtry(®info, s))
2146 } while (s++ < strend);
2154 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2156 if (PL_reg_eval_set) {
2157 /* Preserve the current value of $^R */
2158 if (oreplsv != GvSV(PL_replgv))
2159 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2160 restored, the value remains
2162 restore_pos(aTHX_ prog);
2165 /* make sure $`, $&, $', and $digit will work later */
2166 if ( !(flags & REXEC_NOT_FIRST) ) {
2167 RX_MATCH_COPY_FREE(prog);
2168 if (flags & REXEC_COPY_STR) {
2169 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2170 #ifdef PERL_OLD_COPY_ON_WRITE
2172 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2174 PerlIO_printf(Perl_debug_log,
2175 "Copy on write: regexp capture, type %d\n",
2178 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2179 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2180 assert (SvPOKp(prog->saved_copy));
2184 RX_MATCH_COPIED_on(prog);
2185 s = savepvn(strbeg, i);
2191 prog->subbeg = strbeg;
2192 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2199 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2200 PL_colors[4], PL_colors[5]));
2201 if (PL_reg_eval_set)
2202 restore_pos(aTHX_ prog);
2207 - regtry - try match at specific point
2209 STATIC I32 /* 0 failure, 1 success */
2210 S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
2216 regexp *prog = reginfo->prog;
2217 GET_RE_DEBUG_FLAGS_DECL;
2220 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2222 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2225 PL_reg_eval_set = RS_init;
2226 DEBUG_EXECUTE_r(DEBUG_s(
2227 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2228 (IV)(PL_stack_sp - PL_stack_base));
2230 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2231 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2232 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2234 /* Apparently this is not needed, judging by wantarray. */
2235 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2236 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2239 /* Make $_ available to executed code. */
2240 if (reginfo->sv != DEFSV) {
2242 DEFSV = reginfo->sv;
2245 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2246 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2247 /* prepare for quick setting of pos */
2248 #ifdef PERL_OLD_COPY_ON_WRITE
2250 sv_force_normal_flags(sv, 0);
2252 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2253 &PL_vtbl_mglob, NULL, 0);
2257 PL_reg_oldpos = mg->mg_len;
2258 SAVEDESTRUCTOR_X(restore_pos, prog);
2260 if (!PL_reg_curpm) {
2261 Newxz(PL_reg_curpm, 1, PMOP);
2264 SV* const repointer = newSViv(0);
2265 /* so we know which PL_regex_padav element is PL_reg_curpm */
2266 SvFLAGS(repointer) |= SVf_BREAK;
2267 av_push(PL_regex_padav,repointer);
2268 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2269 PL_regex_pad = AvARRAY(PL_regex_padav);
2273 PM_SETRE(PL_reg_curpm, prog);
2274 PL_reg_oldcurpm = PL_curpm;
2275 PL_curpm = PL_reg_curpm;
2276 if (RX_MATCH_COPIED(prog)) {
2277 /* Here is a serious problem: we cannot rewrite subbeg,
2278 since it may be needed if this match fails. Thus
2279 $` inside (?{}) could fail... */
2280 PL_reg_oldsaved = prog->subbeg;
2281 PL_reg_oldsavedlen = prog->sublen;
2282 #ifdef PERL_OLD_COPY_ON_WRITE
2283 PL_nrs = prog->saved_copy;
2285 RX_MATCH_COPIED_off(prog);
2288 PL_reg_oldsaved = NULL;
2289 prog->subbeg = PL_bostr;
2290 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2292 prog->startp[0] = startpos - PL_bostr;
2293 PL_reginput = startpos;
2294 PL_regstartp = prog->startp;
2295 PL_regendp = prog->endp;
2296 PL_reglastparen = &prog->lastparen;
2297 PL_reglastcloseparen = &prog->lastcloseparen;
2298 prog->lastparen = 0;
2299 prog->lastcloseparen = 0;
2301 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
2302 if (PL_reg_start_tmpl <= prog->nparens) {
2303 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2304 if(PL_reg_start_tmp)
2305 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2307 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2310 /* XXXX What this code is doing here?!!! There should be no need
2311 to do this again and again, PL_reglastparen should take care of
2314 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2315 * Actually, the code in regcppop() (which Ilya may be meaning by
2316 * PL_reglastparen), is not needed at all by the test suite
2317 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2318 * enough, for building DynaLoader, or otherwise this
2319 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2320 * will happen. Meanwhile, this code *is* needed for the
2321 * above-mentioned test suite tests to succeed. The common theme
2322 * on those tests seems to be returning null fields from matches.
2327 if (prog->nparens) {
2329 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2336 if (regmatch(reginfo, prog->program + 1)) {
2337 prog->endp[0] = PL_reginput - PL_bostr;
2340 REGCP_UNWIND(lastcp);
2345 #define sayYES goto yes
2346 #define sayNO goto no
2347 #define sayNO_ANYOF goto no_anyof
2348 #define sayYES_FINAL goto yes_final
2349 #define sayNO_FINAL goto no_final
2350 #define sayNO_SILENT goto do_no
2351 #define saySAME(x) if (x) goto yes; else goto no
2353 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2354 #define POSCACHE_SEEN 1 /* we know what we're caching */
2355 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2357 #define CACHEsayYES STMT_START { \
2358 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2359 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2360 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2361 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2363 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2364 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2367 /* cache records failure, but this is success */ \
2369 PerlIO_printf(Perl_debug_log, \
2370 "%*s (remove success from failure cache)\n", \
2371 REPORT_CODE_OFF+PL_regindent*2, "") \
2373 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2379 #define CACHEsayNO STMT_START { \
2380 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
2381 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2382 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2383 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2385 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2386 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2389 /* cache records success, but this is failure */ \
2391 PerlIO_printf(Perl_debug_log, \
2392 "%*s (remove failure from success cache)\n", \
2393 REPORT_CODE_OFF+PL_regindent*2, "") \
2395 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
2401 /* this is used to determine how far from the left messages like
2402 'failed...' are printed. Currently 29 makes these messages line
2403 up with the opcode they refer to. Earlier perls used 25 which
2404 left these messages outdented making reviewing a debug output
2407 #define REPORT_CODE_OFF 29
2410 /* Make sure there is a test for this +1 options in re_tests */
2411 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2413 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2414 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2416 #define SLAB_FIRST(s) (&(s)->states[0])
2417 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2419 /* grab a new slab and return the first slot in it */
2421 STATIC regmatch_state *
2424 #if PERL_VERSION < 9
2427 regmatch_slab *s = PL_regmatch_slab->next;
2429 Newx(s, 1, regmatch_slab);
2430 s->prev = PL_regmatch_slab;
2432 PL_regmatch_slab->next = s;
2434 PL_regmatch_slab = s;
2435 return SLAB_FIRST(s);
2438 /* simulate a recursive call to regmatch */
2440 #define REGMATCH(ns, where) \
2443 st->resume_state = resume_##where; \
2444 goto start_recurse; \
2445 resume_point_##where:
2447 /* push a new state then goto it */
2449 #define PUSH_STATE_GOTO(state, node) \
2451 st->resume_state = state; \
2454 /* push a new state with success backtracking, then goto it */
2456 #define PUSH_YES_STATE_GOTO(state, node) \
2458 st->resume_state = state; \
2459 goto push_yes_state;
2464 - regmatch - main matching routine
2466 * Conceptually the strategy is simple: check to see whether the current
2467 * node matches, call self recursively to see whether the rest matches,
2468 * and then act accordingly. In practice we make some effort to avoid
2469 * recursion, in particular by going through "ordinary" nodes (that don't
2470 * need to know whether the rest of the match failed) by a loop instead of
2473 /* [lwall] I've hoisted the register declarations to the outer block in order to
2474 * maybe save a little bit of pushing and popping on the stack. It also takes
2475 * advantage of machines that use a register save mask on subroutine entry.
2477 * This function used to be heavily recursive, but since this had the
2478 * effect of blowing the CPU stack on complex regexes, it has been
2479 * restructured to be iterative, and to save state onto the heap rather
2480 * than the stack. Essentially whereever regmatch() used to be called, it
2481 * pushes the current state, notes where to return, then jumps back into
2484 * Originally the structure of this function used to look something like
2489 while (scan != NULL) {
2490 a++; // do stuff with a and b
2496 if (regmatch(...)) // recurse
2506 * Now it looks something like this:
2514 regmatch_state *st = new();
2516 st->a++; // do stuff with a and b
2518 while (scan != NULL) {
2526 st->resume_state = resume_FOO;
2527 goto start_recurse; // recurse
2536 st = new(); push a new state
2537 st->a = 1; st->b = 2;
2544 switch (resume_state) {
2546 goto resume_point_FOO;
2553 * WARNING: this means that any line in this function that contains a
2554 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2555 * regmatch() using gotos instead. Thus the values of any local variables
2556 * not saved in the regmatch_state structure will have been lost when
2557 * execution resumes on the next line .
2559 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2560 * PL_regmatch_state always points to the currently active state, and
2561 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2562 * The first time regmatch is called, the first slab is allocated, and is
2563 * never freed until interpreter desctruction. When the slab is full,
2564 * a new one is allocated chained to the end. At exit from regmatch, slabs
2565 * allocated since entry are freed.
2568 /* *** every FOO_fail should = FOO+1 */
2569 #define TRIE_next (REGNODE_MAX+1)
2570 #define TRIE_next_fail (REGNODE_MAX+2)
2571 #define EVAL_A (REGNODE_MAX+3)
2572 #define EVAL_A_fail (REGNODE_MAX+4)
2573 #define resume_CURLYX (REGNODE_MAX+5)
2574 #define resume_WHILEM1 (REGNODE_MAX+6)
2575 #define resume_WHILEM2 (REGNODE_MAX+7)
2576 #define resume_WHILEM3 (REGNODE_MAX+8)
2577 #define resume_WHILEM4 (REGNODE_MAX+9)
2578 #define resume_WHILEM5 (REGNODE_MAX+10)
2579 #define resume_WHILEM6 (REGNODE_MAX+11)
2580 #define BRANCH_next (REGNODE_MAX+12)
2581 #define BRANCH_next_fail (REGNODE_MAX+13)
2582 #define CURLYM_A (REGNODE_MAX+14)
2583 #define CURLYM_A_fail (REGNODE_MAX+15)
2584 #define CURLYM_B (REGNODE_MAX+16)
2585 #define CURLYM_B_fail (REGNODE_MAX+17)
2586 #define IFMATCH_A (REGNODE_MAX+18)
2587 #define IFMATCH_A_fail (REGNODE_MAX+19)
2588 #define CURLY_B_min_known (REGNODE_MAX+20)
2589 #define CURLY_B_min_known_fail (REGNODE_MAX+21)
2590 #define CURLY_B_min (REGNODE_MAX+22)
2591 #define CURLY_B_min_fail (REGNODE_MAX+23)
2592 #define CURLY_B_max (REGNODE_MAX+24)
2593 #define CURLY_B_max_fail (REGNODE_MAX+25)
2596 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2601 S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2603 const int docolor = *PL_colors[0];
2604 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2605 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2606 /* The part of the string before starttry has one color
2607 (pref0_len chars), between starttry and current
2608 position another one (pref_len - pref0_len chars),
2609 after the current position the third one.
2610 We assume that pref0_len <= pref_len, otherwise we
2611 decrease pref0_len. */
2612 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2613 ? (5 + taill) - l : locinput - PL_bostr;
2616 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2618 pref0_len = pref_len - (locinput - PL_reg_starttry);
2619 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2620 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2621 ? (5 + taill) - pref_len : PL_regeol - locinput);
2622 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2626 if (pref0_len > pref_len)
2627 pref0_len = pref_len;
2629 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2631 RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2632 (locinput - pref_len),pref0_len, 60);
2634 RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2635 (locinput - pref_len + pref0_len),
2636 pref_len - pref0_len, 60);
2638 RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2639 locinput, PL_regeol - locinput, 60);
2641 PerlIO_printf(Perl_debug_log,
2642 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2643 (IV)(locinput - PL_bostr),
2650 (docolor ? "" : "> <"),
2654 15 - l - pref_len + 1,
2661 STATIC I32 /* 0 failure, 1 success */
2662 S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
2664 #if PERL_VERSION < 9
2668 register const bool do_utf8 = PL_reg_match_utf8;
2669 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2671 regexp *rex = reginfo->prog;
2673 regmatch_slab *orig_slab;
2674 regmatch_state *orig_state;
2676 /* the current state. This is a cached copy of PL_regmatch_state */
2677 register regmatch_state *st;
2679 /* cache heavy used fields of st in registers */
2680 register regnode *scan;
2681 register regnode *next;
2682 register I32 n = 0; /* initialize to shut up compiler warning */
2683 register char *locinput = PL_reginput;
2685 /* these variables are NOT saved during a recusive RFEGMATCH: */
2686 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2687 bool result; /* return value of S_regmatch */
2688 int depth = 0; /* depth of recursion */
2689 regmatch_state *yes_state = NULL; /* state to pop to on success of
2694 GET_RE_DEBUG_FLAGS_DECL;
2698 /* on first ever call to regmatch, allocate first slab */
2699 if (!PL_regmatch_slab) {
2700 Newx(PL_regmatch_slab, 1, regmatch_slab);
2701 PL_regmatch_slab->prev = NULL;
2702 PL_regmatch_slab->next = NULL;
2703 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2706 /* remember current high-water mark for exit */
2707 /* XXX this should be done with SAVE* instead */
2708 orig_slab = PL_regmatch_slab;
2709 orig_state = PL_regmatch_state;
2711 /* grab next free state slot */
2712 st = ++PL_regmatch_state;
2713 if (st > SLAB_LAST(PL_regmatch_slab))
2714 st = PL_regmatch_state = S_push_slab(aTHX);
2720 /* Note that nextchr is a byte even in UTF */
2721 nextchr = UCHARAT(locinput);
2723 while (scan != NULL) {
2726 SV * const prop = sv_newmortal();
2727 dump_exec_pos( locinput, scan, do_utf8 );
2728 regprop(rex, prop, scan);
2730 PerlIO_printf(Perl_debug_log,
2731 "%3"IVdf":%*s%s(%"IVdf")\n",
2732 (IV)(scan - rex->program), PL_regindent*2, "",
2734 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2737 next = scan + NEXT_OFF(scan);
2740 state_num = OP(scan);
2743 switch (state_num) {
2745 if (locinput == PL_bostr)
2747 /* reginfo->till = reginfo->bol; */
2752 if (locinput == PL_bostr ||
2753 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2759 if (locinput == PL_bostr)
2763 if (locinput == reginfo->ganch)
2769 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2774 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2776 if (PL_regeol - locinput > 1)
2780 if (PL_regeol != locinput)
2784 if (!nextchr && locinput >= PL_regeol)
2787 locinput += PL_utf8skip[nextchr];
2788 if (locinput > PL_regeol)
2790 nextchr = UCHARAT(locinput);
2793 nextchr = UCHARAT(++locinput);
2796 if (!nextchr && locinput >= PL_regeol)
2798 nextchr = UCHARAT(++locinput);
2801 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2804 locinput += PL_utf8skip[nextchr];
2805 if (locinput > PL_regeol)
2807 nextchr = UCHARAT(locinput);
2810 nextchr = UCHARAT(++locinput);
2814 #define ST st->u.trie
2818 /* what type of TRIE am I? (utf8 makes this contextual) */
2819 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2820 trie_type = do_utf8 ?
2821 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2824 /* what trie are we using right now */
2825 reg_trie_data * const trie
2826 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2827 U32 state = trie->startstate;
2829 U8 *uc = ( U8* )locinput;
2835 U8 *uscan = (U8*)NULL;
2837 SV *sv_accept_buff = NULL;
2838 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2840 ST.accepted = 0; /* how many accepting states we have seen */
2846 if (trie->bitmap && trie_type != trie_utf8_fold &&
2847 !TRIE_BITMAP_TEST(trie,*locinput)
2849 if (trie->states[ state ].wordnum) {
2851 PerlIO_printf(Perl_debug_log,
2852 "%*s %smatched empty string...%s\n",
2853 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2858 PerlIO_printf(Perl_debug_log,
2859 "%*s %sfailed to match start class...%s\n",
2860 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2867 traverse the TRIE keeping track of all accepting states
2868 we transition through until we get to a failing node.
2871 while ( state && uc <= (U8*)PL_regeol ) {
2873 if (trie->states[ state ].wordnum) {
2874 if (!ST.accepted ) {
2877 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2878 sv_accept_buff=newSV(bufflen *
2879 sizeof(reg_trie_accepted) - 1);
2880 SvCUR_set(sv_accept_buff,
2881 sizeof(reg_trie_accepted));
2882 SvPOK_on(sv_accept_buff);
2883 sv_2mortal(sv_accept_buff);
2886 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2889 if (ST.accepted >= bufflen) {
2891 ST.accept_buff =(reg_trie_accepted*)
2892 SvGROW(sv_accept_buff,
2893 bufflen * sizeof(reg_trie_accepted));
2895 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2896 + sizeof(reg_trie_accepted));
2898 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2899 ST.accept_buff[ST.accepted].endpos = uc;
2903 base = trie->states[ state ].trans.base;
2905 DEBUG_TRIE_EXECUTE_r({
2906 dump_exec_pos( (char *)uc, scan, do_utf8 );
2907 PerlIO_printf( Perl_debug_log,
2908 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2909 2+PL_regindent * 2, "", PL_colors[4],
2910 (UV)state, (UV)base, (UV)ST.accepted );
2914 switch (trie_type) {
2915 case trie_utf8_fold:
2917 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2922 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2923 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2924 foldlen -= UNISKIP( uvc );
2925 uscan = foldbuf + UNISKIP( uvc );
2929 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2938 charid = trie->charmap[ uvc ];
2942 if (trie->widecharmap) {
2943 SV** const svpp = hv_fetch(trie->widecharmap,
2944 (char*)&uvc, sizeof(UV), 0);
2946 charid = (U16)SvIV(*svpp);
2951 (base + charid > trie->uniquecharcount )
2952 && (base + charid - 1 - trie->uniquecharcount
2954 && trie->trans[base + charid - 1 -
2955 trie->uniquecharcount].check == state)
2957 state = trie->trans[base + charid - 1 -
2958 trie->uniquecharcount ].next;
2969 DEBUG_TRIE_EXECUTE_r(
2970 PerlIO_printf( Perl_debug_log,
2971 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2972 charid, uvc, (UV)state, PL_colors[5] );
2979 PerlIO_printf( Perl_debug_log,
2980 "%*s %sgot %"IVdf" possible matches%s\n",
2981 REPORT_CODE_OFF + PL_regindent * 2, "",
2982 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2988 case TRIE_next_fail: /* we failed - try next alterative */
2990 if ( ST.accepted == 1 ) {
2991 /* only one choice left - just continue */
2993 reg_trie_data * const trie
2994 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2995 SV ** const tmp = RX_DEBUG(reginfo->prog)
2996 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2998 PerlIO_printf( Perl_debug_log,
2999 "%*s %sonly one match left: #%d <%s>%s\n",
3000 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3001 ST.accept_buff[ 0 ].wordnum,
3002 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3005 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3006 /* in this case we free tmps/leave before we call regmatch
3007 as we wont be using accept_buff again. */
3010 locinput = PL_reginput;
3011 nextchr = UCHARAT(locinput);
3013 continue; /* execute rest of RE */
3016 if (!ST.accepted-- ) {
3023 There are at least two accepting states left. Presumably
3024 the number of accepting states is going to be low,
3025 typically two. So we simply scan through to find the one
3026 with lowest wordnum. Once we find it, we swap the last
3027 state into its place and decrement the size. We then try to
3028 match the rest of the pattern at the point where the word
3029 ends. If we succeed, control just continues along the
3030 regex; if we fail we return here to try the next accepting
3037 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3038 DEBUG_TRIE_EXECUTE_r(
3039 PerlIO_printf( Perl_debug_log,
3040 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3041 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3042 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3043 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3046 if (ST.accept_buff[cur].wordnum <
3047 ST.accept_buff[best].wordnum)
3052 reg_trie_data * const trie
3053 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3054 SV ** const tmp = RX_DEBUG(reginfo->prog)
3055 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
3057 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3058 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3059 ST.accept_buff[best].wordnum,
3060 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3064 if ( best<ST.accepted ) {
3065 reg_trie_accepted tmp = ST.accept_buff[ best ];
3066 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3067 ST.accept_buff[ ST.accepted ] = tmp;
3070 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3072 PUSH_STATE_GOTO(TRIE_next, ST.B);
3078 char *s = STRING(scan);
3079 st->ln = STR_LEN(scan);
3080 if (do_utf8 != UTF) {
3081 /* The target and the pattern have differing utf8ness. */
3083 const char * const e = s + st->ln;
3086 /* The target is utf8, the pattern is not utf8. */
3091 if (NATIVE_TO_UNI(*(U8*)s) !=
3092 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3100 /* The target is not utf8, the pattern is utf8. */
3105 if (NATIVE_TO_UNI(*((U8*)l)) !=
3106 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3114 nextchr = UCHARAT(locinput);
3117 /* The target and the pattern have the same utf8ness. */
3118 /* Inline the first character, for speed. */
3119 if (UCHARAT(s) != nextchr)
3121 if (PL_regeol - locinput < st->ln)
3123 if (st->ln > 1 && memNE(s, locinput, st->ln))
3126 nextchr = UCHARAT(locinput);
3130 PL_reg_flags |= RF_tainted;
3133 char * const s = STRING(scan);
3134 st->ln = STR_LEN(scan);
3136 if (do_utf8 || UTF) {
3137 /* Either target or the pattern are utf8. */
3138 const char * const l = locinput;
3139 char *e = PL_regeol;
3141 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
3142 l, &e, 0, do_utf8)) {
3143 /* One more case for the sharp s:
3144 * pack("U0U*", 0xDF) =~ /ss/i,
3145 * the 0xC3 0x9F are the UTF-8
3146 * byte sequence for the U+00DF. */
3148 toLOWER(s[0]) == 's' &&
3150 toLOWER(s[1]) == 's' &&
3157 nextchr = UCHARAT(locinput);
3161 /* Neither the target and the pattern are utf8. */
3163 /* Inline the first character, for speed. */
3164 if (UCHARAT(s) != nextchr &&
3165 UCHARAT(s) != ((OP(scan) == EXACTF)
3166 ? PL_fold : PL_fold_locale)[nextchr])
3168 if (PL_regeol - locinput < st->ln)
3170 if (st->ln > 1 && (OP(scan) == EXACTF
3171 ? ibcmp(s, locinput, st->ln)
3172 : ibcmp_locale(s, locinput, st->ln)))
3175 nextchr = UCHARAT(locinput);
3180 STRLEN inclasslen = PL_regeol - locinput;
3182 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3184 if (locinput >= PL_regeol)
3186 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3187 nextchr = UCHARAT(locinput);
3192 nextchr = UCHARAT(locinput);
3193 if (!REGINCLASS(rex, scan, (U8*)locinput))
3195 if (!nextchr && locinput >= PL_regeol)
3197 nextchr = UCHARAT(++locinput);
3201 /* If we might have the case of the German sharp s
3202 * in a casefolding Unicode character class. */
3204 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3205 locinput += SHARP_S_SKIP;
3206 nextchr = UCHARAT(locinput);
3212 PL_reg_flags |= RF_tainted;
3218 LOAD_UTF8_CHARCLASS_ALNUM();
3219 if (!(OP(scan) == ALNUM
3220 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3221 : isALNUM_LC_utf8((U8*)locinput)))
3225 locinput += PL_utf8skip[nextchr];
3226 nextchr = UCHARAT(locinput);
3229 if (!(OP(scan) == ALNUM
3230 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
3232 nextchr = UCHARAT(++locinput);
3235 PL_reg_flags |= RF_tainted;
3238 if (!nextchr && locinput >= PL_regeol)
3241 LOAD_UTF8_CHARCLASS_ALNUM();
3242 if (OP(scan) == NALNUM
3243 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
3244 : isALNUM_LC_utf8((U8*)locinput))
3248 locinput += PL_utf8skip[nextchr];
3249 nextchr = UCHARAT(locinput);
3252 if (OP(scan) == NALNUM
3253 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
3255 nextchr = UCHARAT(++locinput);
3259 PL_reg_flags |= RF_tainted;
3263 /* was last char in word? */
3265 if (locinput == PL_bostr)
3268 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3270 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3272 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3273 st->ln = isALNUM_uni(st->ln);
3274 LOAD_UTF8_CHARCLASS_ALNUM();
3275 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3278 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
3279 n = isALNUM_LC_utf8((U8*)locinput);
3283 st->ln = (locinput != PL_bostr) ?
3284 UCHARAT(locinput - 1) : '\n';
3285 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3286 st->ln = isALNUM(st->ln);
3287 n = isALNUM(nextchr);
3290 st->ln = isALNUM_LC(st->ln);
3291 n = isALNUM_LC(nextchr);
3294 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
3295 OP(scan) == BOUNDL))
3299 PL_reg_flags |= RF_tainted;
3305 if (UTF8_IS_CONTINUED(nextchr)) {
3306 LOAD_UTF8_CHARCLASS_SPACE();
3307 if (!(OP(scan) == SPACE
3308 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3309 : isSPACE_LC_utf8((U8*)locinput)))
3313 locinput += PL_utf8skip[nextchr];
3314 nextchr = UCHARAT(locinput);
3317 if (!(OP(scan) == SPACE
3318 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3320 nextchr = UCHARAT(++locinput);
3323 if (!(OP(scan) == SPACE
3324 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3326 nextchr = UCHARAT(++locinput);
3330 PL_reg_flags |= RF_tainted;
3333 if (!nextchr && locinput >= PL_regeol)
3336 LOAD_UTF8_CHARCLASS_SPACE();
3337 if (OP(scan) == NSPACE
3338 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
3339 : isSPACE_LC_utf8((U8*)locinput))
3343 locinput += PL_utf8skip[nextchr];
3344 nextchr = UCHARAT(locinput);
3347 if (OP(scan) == NSPACE
3348 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
3350 nextchr = UCHARAT(++locinput);
3353 PL_reg_flags |= RF_tainted;
3359 LOAD_UTF8_CHARCLASS_DIGIT();
3360 if (!(OP(scan) == DIGIT
3361 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3362 : isDIGIT_LC_utf8((U8*)locinput)))
3366 locinput += PL_utf8skip[nextchr];
3367 nextchr = UCHARAT(locinput);
3370 if (!(OP(scan) == DIGIT
3371 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
3373 nextchr = UCHARAT(++locinput);
3376 PL_reg_flags |= RF_tainted;
3379 if (!nextchr && locinput >= PL_regeol)
3382 LOAD_UTF8_CHARCLASS_DIGIT();
3383 if (OP(scan) == NDIGIT
3384 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
3385 : isDIGIT_LC_utf8((U8*)locinput))
3389 locinput += PL_utf8skip[nextchr];
3390 nextchr = UCHARAT(locinput);
3393 if (OP(scan) == NDIGIT
3394 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
3396 nextchr = UCHARAT(++locinput);
3399 if (locinput >= PL_regeol)
3402 LOAD_UTF8_CHARCLASS_MARK();
3403 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3405 locinput += PL_utf8skip[nextchr];
3406 while (locinput < PL_regeol &&
3407 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3408 locinput += UTF8SKIP(locinput);
3409 if (locinput > PL_regeol)
3414 nextchr = UCHARAT(locinput);
3417 PL_reg_flags |= RF_tainted;
3422 n = ARG(scan); /* which paren pair */
3423 st->ln = PL_regstartp[n];
3424 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3425 if ((I32)*PL_reglastparen < n || st->ln == -1)
3426 sayNO; /* Do not match unless seen CLOSEn. */
3427 if (st->ln == PL_regendp[n])
3430 s = PL_bostr + st->ln;
3431 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
3433 const char *e = PL_bostr + PL_regendp[n];
3435 * Note that we can't do the "other character" lookup trick as
3436 * in the 8-bit case (no pun intended) because in Unicode we
3437 * have to map both upper and title case to lower case.
3439 if (OP(scan) == REFF) {
3441 STRLEN ulen1, ulen2;
3442 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3443 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3447 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3448 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3449 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3456 nextchr = UCHARAT(locinput);
3460 /* Inline the first character, for speed. */
3461 if (UCHARAT(s) != nextchr &&
3463 (UCHARAT(s) != ((OP(scan) == REFF
3464 ? PL_fold : PL_fold_locale)[nextchr]))))
3466 st->ln = PL_regendp[n] - st->ln;
3467 if (locinput + st->ln > PL_regeol)
3469 if (st->ln > 1 && (OP(scan) == REF
3470 ? memNE(s, locinput, st->ln)
3472 ? ibcmp(s, locinput, st->ln)
3473 : ibcmp_locale(s, locinput, st->ln))))
3476 nextchr = UCHARAT(locinput);
3487 #define ST st->u.eval
3489 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3493 /* execute the code in the {...} */
3495 SV ** const before = SP;
3496 OP_4tree * const oop = PL_op;
3497 COP * const ocurcop = PL_curcop;
3501 PL_op = (OP_4tree*)rex->data->data[n];
3502 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3503 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
3504 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3506 CALLRUNOPS(aTHX); /* Scalar context. */
3509 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3516 PAD_RESTORE_LOCAL(old_comppad);
3517 PL_curcop = ocurcop;
3520 sv_setsv(save_scalar(PL_replgv), ret);
3524 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3527 /* extract RE object from returned value; compiling if
3532 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3533 mg = mg_find(sv, PERL_MAGIC_qr);
3534 else if (SvSMAGICAL(ret)) {
3535 if (SvGMAGICAL(ret))
3536 sv_unmagic(ret, PERL_MAGIC_qr);
3538 mg = mg_find(ret, PERL_MAGIC_qr);
3542 re = (regexp *)mg->mg_obj;
3543 (void)ReREFCNT_inc(re);
3547 const char * const t = SvPV_const(ret, len);
3549 const I32 osize = PL_regsize;
3552 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
3553 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
3555 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3557 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3563 /* run the pattern returned from (??{...}) */
3566 PerlIO_printf(Perl_debug_log,
3567 "Entering embedded \"%s%.60s%s%s\"\n",
3571 (strlen(re->precomp) > 60 ? "..." : ""))
3574 ST.cp = regcppush(0); /* Save *all* the positions. */
3575 REGCP_SET(ST.lastcp);
3576 *PL_reglastparen = 0;
3577 *PL_reglastcloseparen = 0;
3578 PL_reginput = locinput;
3580 /* XXXX This is too dramatic a measure... */
3584 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3585 ((re->reganch & ROPT_UTF8) != 0);
3586 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3591 /* now continue from first node in postoned RE */
3592 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
3595 /* /(?(?{...})X|Y)/ */
3596 st->sw = SvTRUE(ret);
3601 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3603 PL_reg_flags ^= RF_utf8;
3606 /* XXXX This is too dramatic a measure... */
3608 /* Restore parens of the caller without popping the
3611 const I32 tmp = PL_savestack_ix;
3612 PL_savestack_ix = ST.lastcp;
3614 PL_savestack_ix = tmp;
3616 PL_reginput = locinput;
3617 /* continue at the node following the (??{...}) */
3621 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3622 /* Restore state to the outer re then re-throw the failure */
3624 PL_reg_flags ^= RF_utf8;
3628 /* XXXX This is too dramatic a measure... */
3631 PL_reginput = locinput;
3632 REGCP_UNWIND(ST.lastcp);
3639 n = ARG(scan); /* which paren pair */
3640 PL_reg_start_tmp[n] = locinput;
3645 n = ARG(scan); /* which paren pair */
3646 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3647 PL_regendp[n] = locinput - PL_bostr;
3648 if (n > (I32)*PL_reglastparen)
3649 *PL_reglastparen = n;
3650 *PL_reglastcloseparen = n;
3653 n = ARG(scan); /* which paren pair */
3654 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3657 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3659 next = NEXTOPER(NEXTOPER(scan));
3661 next = scan + ARG(scan);
3662 if (OP(next) == IFTHEN) /* Fake one. */
3663 next = NEXTOPER(NEXTOPER(next));
3667 st->logical = scan->flags;
3669 /*******************************************************************
3670 cc points to the regmatch_state associated with the most recent CURLYX.
3671 This struct contains info about the innermost (...)* loop (an
3672 "infoblock"), and a pointer to the next outer cc.
3674 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3676 1) After matching Y, regnode for CURLYX is processed;
3678 2) This regnode populates cc, and calls regmatch() recursively
3679 with the starting point at WHILEM node;
3681 3) Each hit of WHILEM node tries to match A and Z (in the order
3682 depending on the current iteration, min/max of {min,max} and
3683 greediness). The information about where are nodes for "A"
3684 and "Z" is read from cc, as is info on how many times "A"
3685 was already matched, and greediness.
3687 4) After A matches, the same WHILEM node is hit again.
3689 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
3690 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3691 resets cc, since this Y(A)*Z can be a part of some other loop:
3692 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3693 of the external loop.
3695 Currently present infoblocks form a tree with a stem formed by st->cc
3696 and whatever it mentions via ->next, and additional attached trees
3697 corresponding to temporarily unset infoblocks as in "5" above.
3699 In the following picture, infoblocks for outer loop of
3700 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3701 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3702 infoblocks are drawn below the "reset" infoblock.
3704 In fact in the picture below we do not show failed matches for Z and T
3705 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3706 more obvious *why* one needs to *temporary* unset infoblocks.]
3708 Matched REx position InfoBlocks Comment
3712 Y A)*?Z)*?T x <- O <- I
3713 YA )*?Z)*?T x <- O <- I
3714 YA A)*?Z)*?T x <- O <- I
3715 YAA )*?Z)*?T x <- O <- I
3716 YAA Z)*?T x <- O # Temporary unset I
3719 YAAZ Y(A)*?Z)*?T x <- O
3722 YAAZY (A)*?Z)*?T x <- O
3725 YAAZY A)*?Z)*?T x <- O <- I
3728 YAAZYA )*?Z)*?T x <- O <- I
3731 YAAZYA Z)*?T x <- O # Temporary unset I
3737 YAAZYAZ T x # Temporary unset O
3744 *******************************************************************/
3747 /* No need to save/restore up to this paren */
3748 I32 parenfloor = scan->flags;
3752 CURLYX and WHILEM are always paired: they're the moral
3753 equivalent of pp_enteriter anbd pp_iter.
3755 The only time next could be null is if the node tree is
3756 corrupt. This was mentioned on p5p a few days ago.
3758 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3759 So we'll assert that this is true:
3762 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3764 /* XXXX Probably it is better to teach regpush to support
3765 parenfloor > PL_regsize... */
3766 if (parenfloor > (I32)*PL_reglastparen)
3767 parenfloor = *PL_reglastparen; /* Pessimization... */
3769 st->u.curlyx.cp = PL_savestack_ix;
3770 st->u.curlyx.outercc = st->cc;
3772 /* these fields contain the state of the current curly.
3773 * they are accessed by subsequent WHILEMs;
3774 * cur and lastloc are also updated by WHILEM */
3775 st->u.curlyx.parenfloor = parenfloor;
3776 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3777 st->u.curlyx.min = ARG1(scan);
3778 st->u.curlyx.max = ARG2(scan);
3779 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3780 st->u.curlyx.lastloc = 0;
3781 /* st->next and st->minmod are also read by WHILEM */
3783 PL_reginput = locinput;
3784 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3785 /*** all unsaved local vars undefined at this point */
3786 regcpblow(st->u.curlyx.cp);
3787 st->cc = st->u.curlyx.outercc;
3793 * This is really hard to understand, because after we match
3794 * what we're trying to match, we must make sure the rest of
3795 * the REx is going to match for sure, and to do that we have
3796 * to go back UP the parse tree by recursing ever deeper. And
3797 * if it fails, we have to reset our parent's current state
3798 * that we can try again after backing off.
3803 st->cc gets initialised by CURLYX ready for use by WHILEM.
3804 So again, unless somethings been corrupted, st->cc cannot
3805 be null at that point in WHILEM.
3807 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3808 So we'll assert that this is true:
3811 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3812 st->u.whilem.cache_offset = 0;
3813 st->u.whilem.cache_bit = 0;
3815 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3816 PL_reginput = locinput;
3819 PerlIO_printf(Perl_debug_log,
3820 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3821 REPORT_CODE_OFF+PL_regindent*2, "",
3822 (long)n, (long)st->cc->u.curlyx.min,
3823 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
3826 /* If degenerate scan matches "", assume scan done. */
3828 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3829 st->u.whilem.savecc = st->cc;
3830 st->cc = st->cc->u.curlyx.outercc;
3832 st->ln = st->cc->u.curlyx.cur;
3834 PerlIO_printf(Perl_debug_log,
3835 "%*s empty match detected, try continuation...\n",
3836 REPORT_CODE_OFF+PL_regindent*2, "")
3838 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
3839 /*** all unsaved local vars undefined at this point */
3840 st->cc = st->u.whilem.savecc;
3843 if (st->cc->u.curlyx.outercc)
3844 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3848 /* First just match a string of min scans. */
3850 if (n < st->cc->u.curlyx.min) {
3851 st->cc->u.curlyx.cur = n;
3852 st->cc->u.curlyx.lastloc = locinput;
3853 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
3854 /*** all unsaved local vars undefined at this point */
3857 st->cc->u.curlyx.cur = n - 1;
3858 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3863 /* Check whether we already were at this position.
3864 Postpone detection until we know the match is not
3865 *that* much linear. */
3866 if (!PL_reg_maxiter) {
3867 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3868 /* possible overflow for long strings and many CURLYX's */
3869 if (PL_reg_maxiter < 0)
3870 PL_reg_maxiter = I32_MAX;
3871 PL_reg_leftiter = PL_reg_maxiter;
3873 if (PL_reg_leftiter-- == 0) {
3874 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3875 if (PL_reg_poscache) {
3876 if ((I32)PL_reg_poscache_size < size) {
3877 Renew(PL_reg_poscache, size, char);
3878 PL_reg_poscache_size = size;
3880 Zero(PL_reg_poscache, size, char);
3883 PL_reg_poscache_size = size;
3884 Newxz(PL_reg_poscache, size, char);
3887 PerlIO_printf(Perl_debug_log,
3888 "%sDetected a super-linear match, switching on caching%s...\n",
3889 PL_colors[4], PL_colors[5])
3892 if (PL_reg_leftiter < 0) {
3893 st->u.whilem.cache_offset = locinput - PL_bostr;
3895 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3896 + st->u.whilem.cache_offset * (scan->flags>>4);
3897 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3898 st->u.whilem.cache_offset /= 8;
3899 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
3901 PerlIO_printf(Perl_debug_log,
3902 "%*s already tried at this position...\n",
3903 REPORT_CODE_OFF+PL_regindent*2, "")
3905 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3906 /* cache records success */
3909 /* cache records failure */
3915 /* Prefer next over scan for minimal matching. */
3917 if (st->cc->minmod) {
3918 st->u.whilem.savecc = st->cc;
3919 st->cc = st->cc->u.curlyx.outercc;
3921 st->ln = st->cc->u.curlyx.cur;
3922 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3923 REGCP_SET(st->u.whilem.lastcp);
3924 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
3925 /*** all unsaved local vars undefined at this point */
3926 st->cc = st->u.whilem.savecc;
3928 regcpblow(st->u.whilem.cp);
3929 CACHEsayYES; /* All done. */
3931 REGCP_UNWIND(st->u.whilem.lastcp);
3933 if (st->cc->u.curlyx.outercc)
3934 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3936 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
3937 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3938 && !(PL_reg_flags & RF_warned)) {
3939 PL_reg_flags |= RF_warned;
3940 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3941 "Complex regular subexpression recursion",
3948 PerlIO_printf(Perl_debug_log,
3949 "%*s trying longer...\n",
3950 REPORT_CODE_OFF+PL_regindent*2, "")
3952 /* Try scanning more and see if it helps. */
3953 PL_reginput = locinput;
3954 st->cc->u.curlyx.cur = n;
3955 st->cc->u.curlyx.lastloc = locinput;
3956 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3957 REGCP_SET(st->u.whilem.lastcp);
3958 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
3959 /*** all unsaved local vars undefined at this point */
3961 regcpblow(st->u.whilem.cp);
3964 REGCP_UNWIND(st->u.whilem.lastcp);
3966 st->cc->u.curlyx.cur = n - 1;
3967 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3971 /* Prefer scan over next for maximal matching. */
3973 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3974 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3975 st->cc->u.curlyx.cur = n;
3976 st->cc->u.curlyx.lastloc = locinput;
3977 REGCP_SET(st->u.whilem.lastcp);
3978 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
3979 /*** all unsaved local vars undefined at this point */
3981 regcpblow(st->u.whilem.cp);
3984 REGCP_UNWIND(st->u.whilem.lastcp);
3985 regcppop(rex); /* Restore some previous $<digit>s? */
3986 PL_reginput = locinput;
3988 PerlIO_printf(Perl_debug_log,
3989 "%*s failed, try continuation...\n",
3990 REPORT_CODE_OFF+PL_regindent*2, "")
3993 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3994 && !(PL_reg_flags & RF_warned)) {
3995 PL_reg_flags |= RF_warned;
3996 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3997 "Complex regular subexpression recursion",
4001 /* Failed deeper matches of scan, so see if this one works. */
4002 st->u.whilem.savecc = st->cc;
4003 st->cc = st->cc->u.curlyx.outercc;
4005 st->ln = st->cc->u.curlyx.cur;
4006 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
4007 /*** all unsaved local vars undefined at this point */
4008 st->cc = st->u.whilem.savecc;
4011 if (st->cc->u.curlyx.outercc)
4012 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4013 st->cc->u.curlyx.cur = n - 1;
4014 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4020 #define ST st->u.branch
4022 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4023 next = scan + ARG(scan);
4026 scan = NEXTOPER(scan);
4029 case BRANCH: /* /(...|A|...)/ */
4030 scan = NEXTOPER(scan); /* scan now points to inner node */
4031 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4032 /* last branch; skip state push and jump direct to node */
4034 ST.lastparen = *PL_reglastparen;
4035 ST.next_branch = next;
4037 PL_reginput = locinput;
4039 /* Now go into the branch */
4040 PUSH_STATE_GOTO(BRANCH_next, scan);
4043 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4044 REGCP_UNWIND(ST.cp);
4045 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4047 *PL_reglastparen = n;
4048 scan = ST.next_branch;
4049 /* no more branches? */
4050 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4052 continue; /* execute next BRANCH[J] op */
4060 #define ST st->u.curlym
4062 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4064 /* This is an optimisation of CURLYX that enables us to push
4065 * only a single backtracking state, no matter now many matches
4066 * there are in {m,n}. It relies on the pattern being constant
4067 * length, with no parens to influence future backrefs
4071 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4073 /* if paren positive, emulate an OPEN/CLOSE around A */
4075 I32 paren = ST.me->flags;
4076 if (paren > PL_regsize)
4078 if (paren > (I32)*PL_reglastparen)
4079 *PL_reglastparen = paren;
4080 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4086 ST.minmod = st->minmod;
4088 ST.c1 = CHRTEST_UNINIT;
4091 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4094 curlym_do_A: /* execute the A in /A{m,n}B/ */
4095 PL_reginput = locinput;
4096 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4099 case CURLYM_A: /* we've just matched an A */
4100 locinput = st->locinput;
4101 nextchr = UCHARAT(locinput);
4104 /* after first match, determine A's length: u.curlym.alen */
4105 if (ST.count == 1) {
4106 if (PL_reg_match_utf8) {
4108 while (s < PL_reginput) {
4114 ST.alen = PL_reginput - locinput;
4117 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4120 PerlIO_printf(Perl_debug_log,
4121 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4122 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
4123 (IV) ST.count, (IV)ST.alen)
4126 locinput = PL_reginput;
4127 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4128 goto curlym_do_A; /* try to match another A */
4129 goto curlym_do_B; /* try to match B */
4131 case CURLYM_A_fail: /* just failed to match an A */
4132 REGCP_UNWIND(ST.cp);
4133 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4136 curlym_do_B: /* execute the B in /A{m,n}B/ */
4137 PL_reginput = locinput;
4138 if (ST.c1 == CHRTEST_UNINIT) {
4139 /* calculate c1 and c2 for possible match of 1st char
4140 * following curly */
4141 ST.c1 = ST.c2 = CHRTEST_VOID;
4142 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4143 regnode *text_node = ST.B;
4144 if (! HAS_TEXT(text_node))
4145 FIND_NEXT_IMPT(text_node);
4146 if (HAS_TEXT(text_node)
4147 && PL_regkind[OP(text_node)] != REF)
4149 ST.c1 = (U8)*STRING(text_node);
4151 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4153 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4154 ? PL_fold_locale[ST.c1]
4161 PerlIO_printf(Perl_debug_log,
4162 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4163 (int)(REPORT_CODE_OFF+PL_regindent*2),
4166 if (ST.c1 != CHRTEST_VOID
4167 && UCHARAT(PL_reginput) != ST.c1
4168 && UCHARAT(PL_reginput) != ST.c2)
4170 /* simulate B failing */
4171 state_num = CURLYM_B_fail;
4172 goto reenter_switch;
4176 /* mark current A as captured */
4177 I32 paren = ST.me->flags;
4180 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4181 PL_regendp[paren] = PL_reginput - PL_bostr;
4184 PL_regendp[paren] = -1;
4186 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4189 case CURLYM_B_fail: /* just failed to match a B */
4190 REGCP_UNWIND(ST.cp);
4192 if (ST.count == ARG2(ST.me) /* max */)
4194 goto curlym_do_A; /* try to match a further A */
4196 /* backtrack one A */
4197 if (ST.count == ARG1(ST.me) /* min */)
4200 locinput = HOPc(locinput, -ST.alen);
4201 goto curlym_do_B; /* try to match B */
4204 #define ST st->u.curly
4206 #define CURLY_SETPAREN(paren, success) \
4209 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4210 PL_regendp[paren] = locinput - PL_bostr; \
4213 PL_regendp[paren] = -1; \
4216 case STAR: /* /A*B/ where A is width 1 */
4220 scan = NEXTOPER(scan);
4222 case PLUS: /* /A+B/ where A is width 1 */
4226 scan = NEXTOPER(scan);
4228 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4229 ST.paren = scan->flags; /* Which paren to set */
4230 if (ST.paren > PL_regsize)
4231 PL_regsize = ST.paren;
4232 if (ST.paren > (I32)*PL_reglastparen)
4233 *PL_reglastparen = ST.paren;
4234 ST.min = ARG1(scan); /* min to match */
4235 ST.max = ARG2(scan); /* max to match */
4236 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4238 case CURLY: /* /A{m,n}B/ where A is width 1 */
4240 ST.min = ARG1(scan); /* min to match */
4241 ST.max = ARG2(scan); /* max to match */
4242 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4245 * Lookahead to avoid useless match attempts
4246 * when we know what character comes next.
4248 * Used to only do .*x and .*?x, but now it allows
4249 * for )'s, ('s and (?{ ... })'s to be in the way
4250 * of the quantifier and the EXACT-like node. -- japhy
4253 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4255 if (HAS_TEXT(next) || JUMPABLE(next)) {
4257 regnode *text_node = next;
4259 if (! HAS_TEXT(text_node))
4260 FIND_NEXT_IMPT(text_node);
4262 if (! HAS_TEXT(text_node))
4263 ST.c1 = ST.c2 = CHRTEST_VOID;
4265 if (PL_regkind[OP(text_node)] == REF) {
4266 ST.c1 = ST.c2 = CHRTEST_VOID;
4267 goto assume_ok_easy;
4270 s = (U8*)STRING(text_node);
4274 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
4275 ST.c2 = PL_fold[ST.c1];
4276 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4277 ST.c2 = PL_fold_locale[ST.c1];
4280 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
4281 STRLEN ulen1, ulen2;
4282 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4283 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4285 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4286 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4288 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4290 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4294 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4301 ST.c1 = ST.c2 = CHRTEST_VOID;
4306 PL_reginput = locinput;
4309 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4312 locinput = PL_reginput;
4314 if (ST.c1 == CHRTEST_VOID)
4315 goto curly_try_B_min;
4317 ST.oldloc = locinput;
4319 /* set ST.maxpos to the furthest point along the
4320 * string that could possibly match */
4321 if (ST.max == REG_INFTY) {
4322 ST.maxpos = PL_regeol - 1;
4324 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4328 int m = ST.max - ST.min;
4329 for (ST.maxpos = locinput;
4330 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4331 ST.maxpos += UTF8SKIP(ST.maxpos);
4334 ST.maxpos = locinput + ST.max - ST.min;
4335 if (ST.maxpos >= PL_regeol)
4336 ST.maxpos = PL_regeol - 1;
4338 goto curly_try_B_min_known;
4342 ST.count = regrepeat(rex, ST.A, ST.max);
4343 locinput = PL_reginput;
4344 if (ST.count < ST.min)
4346 if ((ST.count > ST.min)
4347 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4349 /* A{m,n} must come at the end of the string, there's
4350 * no point in backing off ... */
4352 /* ...except that $ and \Z can match before *and* after
4353 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4354 We may back off by one in this case. */
4355 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4359 goto curly_try_B_max;
4364 case CURLY_B_min_known_fail:
4365 /* failed to find B in a non-greedy match where c1,c2 valid */
4366 if (ST.paren && ST.count)
4367 PL_regendp[ST.paren] = -1;
4369 PL_reginput = locinput; /* Could be reset... */
4370 REGCP_UNWIND(ST.cp);
4371 /* Couldn't or didn't -- move forward. */
4372 ST.oldloc = locinput;
4374 locinput += UTF8SKIP(locinput);
4378 curly_try_B_min_known:
4379 /* find the next place where 'B' could work, then call B */
4383 n = (ST.oldloc == locinput) ? 0 : 1;
4384 if (ST.c1 == ST.c2) {
4386 /* set n to utf8_distance(oldloc, locinput) */
4387 while (locinput <= ST.maxpos &&
4388 utf8n_to_uvchr((U8*)locinput,
4389 UTF8_MAXBYTES, &len,
4390 uniflags) != (UV)ST.c1) {
4396 /* set n to utf8_distance(oldloc, locinput) */
4397 while (locinput <= ST.maxpos) {
4399 const UV c = utf8n_to_uvchr((U8*)locinput,
4400 UTF8_MAXBYTES, &len,
4402 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4410 if (ST.c1 == ST.c2) {
4411 while (locinput <= ST.maxpos &&
4412 UCHARAT(locinput) != ST.c1)
4416 while (locinput <= ST.maxpos
4417 && UCHARAT(locinput) != ST.c1
4418 && UCHARAT(locinput) != ST.c2)
4421 n = locinput - ST.oldloc;
4423 if (locinput > ST.maxpos)
4425 /* PL_reginput == oldloc now */
4428 if (regrepeat(rex, ST.A, n) < n)
4431 PL_reginput = locinput;
4432 CURLY_SETPAREN(ST.paren, ST.count);
4433 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4438 case CURLY_B_min_fail:
4439 /* failed to find B in a non-greedy match where c1,c2 invalid */
4440 if (ST.paren && ST.count)
4441 PL_regendp[ST.paren] = -1;
4443 REGCP_UNWIND(ST.cp);
4444 /* failed -- move forward one */
4445 PL_reginput = locinput;
4446 if (regrepeat(rex, ST.A, 1)) {
4448 locinput = PL_reginput;
4449 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4450 ST.count > 0)) /* count overflow ? */
4453 CURLY_SETPAREN(ST.paren, ST.count);
4454 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4462 /* a successful greedy match: now try to match B */
4465 if (ST.c1 != CHRTEST_VOID)
4466 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4467 UTF8_MAXBYTES, 0, uniflags)
4468 : (UV) UCHARAT(PL_reginput);
4469 /* If it could work, try it. */
4470 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4471 CURLY_SETPAREN(ST.paren, ST.count);
4472 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4477 case CURLY_B_max_fail:
4478 /* failed to find B in a greedy match */
4479 if (ST.paren && ST.count)
4480 PL_regendp[ST.paren] = -1;
4482 REGCP_UNWIND(ST.cp);
4484 if (--ST.count < ST.min)
4486 PL_reginput = locinput = HOPc(locinput, -1);
4487 goto curly_try_B_max;
4493 if (locinput < reginfo->till) {
4494 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4495 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4497 (long)(locinput - PL_reg_starttry),
4498 (long)(reginfo->till - PL_reg_starttry),
4500 sayNO_FINAL; /* Cannot match: too short. */
4502 PL_reginput = locinput; /* put where regtry can find it */
4503 sayYES_FINAL; /* Success! */
4505 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4507 PerlIO_printf(Perl_debug_log,
4508 "%*s %ssubpattern success...%s\n",
4509 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
4510 PL_reginput = locinput; /* put where regtry can find it */
4511 sayYES_FINAL; /* Success! */
4514 #define ST st->u.ifmatch
4516 case SUSPEND: /* (?>A) */
4518 PL_reginput = locinput;
4521 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4523 goto ifmatch_trivial_fail_test;
4525 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4527 ifmatch_trivial_fail_test:
4529 char * const s = HOPBACKc(locinput, scan->flags);
4534 st->sw = 1 - (bool)ST.wanted;
4538 next = scan + ARG(scan);
4546 PL_reginput = locinput;
4550 /* execute body of (?...A) */
4551 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4554 case IFMATCH_A_fail: /* body of (?...A) failed */
4555 ST.wanted = !ST.wanted;
4558 case IFMATCH_A: /* body of (?...A) succeeded */
4561 st->sw = (bool)ST.wanted;
4563 else if (!ST.wanted)
4566 if (OP(ST.me) == SUSPEND)
4567 locinput = PL_reginput;
4569 locinput = PL_reginput = st->locinput;
4570 nextchr = UCHARAT(locinput);
4572 scan = ST.me + ARG(ST.me);
4575 continue; /* execute B */
4580 next = scan + ARG(scan);
4585 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
4586 PTR2UV(scan), OP(scan));
4587 Perl_croak(aTHX_ "regexp memory corruption");
4595 /* push a state that backtracks on success */
4596 st->u.yes.prev_yes_state = yes_state;
4600 /* push a new regex state, then continue at scan */
4602 regmatch_state *newst;
4605 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4606 "PUSH STATE(%d)\n", depth));
4607 st->locinput = locinput;
4609 if (newst > SLAB_LAST(PL_regmatch_slab))
4610 newst = S_push_slab(aTHX);
4611 PL_regmatch_state = newst;
4613 /* XXX probably don't need to initialise these */
4618 locinput = PL_reginput;
4619 nextchr = UCHARAT(locinput);
4625 /* simulate recursively calling regmatch(), but without actually
4626 * recursing - ie save the current state on the heap rather than on
4627 * the stack, then re-enter the loop. This avoids complex regexes
4628 * blowing the processor stack */
4632 /* push new state */
4633 regmatch_state *oldst = st;
4636 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
4638 /* grab the next free state slot */
4640 if (st > SLAB_LAST(PL_regmatch_slab))
4641 st = S_push_slab(aTHX);
4642 PL_regmatch_state = st;
4646 oldst->locinput = locinput;
4649 locinput = PL_reginput;
4650 nextchr = UCHARAT(locinput);
4663 * We get here only if there's trouble -- normally "case END" is
4664 * the terminating point.
4666 Perl_croak(aTHX_ "corrupted regexp pointers");
4673 /* we have successfully completed a subexpression, but we must now
4674 * pop to the state marked by yes_state and continue from there */
4676 assert(st != yes_state);
4677 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4678 || yes_state > SLAB_LAST(PL_regmatch_slab))
4680 /* not in this slab, pop slab */
4681 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4682 PL_regmatch_slab = PL_regmatch_slab->prev;
4683 st = SLAB_LAST(PL_regmatch_slab);
4685 depth -= (st - yes_state);
4686 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4687 depth+1, depth+(st - yes_state)));
4689 yes_state = st->u.yes.prev_yes_state;
4690 PL_regmatch_state = st;
4692 switch (st->resume_state) {
4696 state_num = st->resume_state;
4697 goto reenter_switch;
4704 Perl_croak(aTHX_ "unexpected yes resume state");
4708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
4709 PL_colors[4], PL_colors[5]));
4716 /* XXX this is duplicate(ish) code to that in the do_no section.
4717 * will disappear when REGFMATCH goes */
4719 /* restore previous state and re-enter */
4720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4723 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4724 PL_regmatch_slab = PL_regmatch_slab->prev;
4725 st = SLAB_LAST(PL_regmatch_slab);
4727 PL_regmatch_state = st;
4731 locinput= st->locinput;
4732 nextchr = UCHARAT(locinput);
4734 switch (st->resume_state) {
4736 goto resume_point_CURLYX;
4737 case resume_WHILEM1:
4738 goto resume_point_WHILEM1;
4739 case resume_WHILEM2:
4740 goto resume_point_WHILEM2;
4741 case resume_WHILEM3:
4742 goto resume_point_WHILEM3;
4743 case resume_WHILEM4:
4744 goto resume_point_WHILEM4;
4745 case resume_WHILEM5:
4746 goto resume_point_WHILEM5;
4747 case resume_WHILEM6:
4748 goto resume_point_WHILEM6;
4758 case CURLY_B_min_known:
4762 Perl_croak(aTHX_ "regexp resume memory corruption");
4769 PerlIO_printf(Perl_debug_log,
4770 "%*s %sfailed...%s\n",
4771 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
4782 /* there's a previous state to backtrack to */
4783 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4786 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4787 PL_regmatch_slab = PL_regmatch_slab->prev;
4788 st = SLAB_LAST(PL_regmatch_slab);
4790 PL_regmatch_state = st;
4794 locinput= st->locinput;
4795 nextchr = UCHARAT(locinput);
4797 switch (st->resume_state) {
4799 goto resume_point_CURLYX;
4800 case resume_WHILEM1:
4801 goto resume_point_WHILEM1;
4802 case resume_WHILEM2:
4803 goto resume_point_WHILEM2;
4804 case resume_WHILEM3:
4805 goto resume_point_WHILEM3;
4806 case resume_WHILEM4:
4807 goto resume_point_WHILEM4;
4808 case resume_WHILEM5:
4809 goto resume_point_WHILEM5;
4810 case resume_WHILEM6:
4811 goto resume_point_WHILEM6;
4821 case CURLY_B_min_known:
4822 if (yes_state == st)
4823 yes_state = st->u.yes.prev_yes_state;
4824 state_num = st->resume_state + 1; /* failure = success + 1 */
4825 goto reenter_switch;
4828 Perl_croak(aTHX_ "regexp resume memory corruption");
4834 /* restore original high-water mark */
4835 PL_regmatch_slab = orig_slab;
4836 PL_regmatch_state = orig_state;
4838 /* free all slabs above current one */
4839 if (orig_slab->next) {
4840 regmatch_slab *sl = orig_slab->next;
4841 orig_slab->next = NULL;
4843 regmatch_slab * const osl = sl;
4854 - regrepeat - repeatedly match something simple, report how many
4857 * [This routine now assumes that it will only match on things of length 1.
4858 * That was true before, but now we assume scan - reginput is the count,
4859 * rather than incrementing count on every character. [Er, except utf8.]]
4862 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
4865 register char *scan;
4867 register char *loceol = PL_regeol;
4868 register I32 hardcount = 0;
4869 register bool do_utf8 = PL_reg_match_utf8;
4872 if (max == REG_INFTY)
4874 else if (max < loceol - scan)
4875 loceol = scan + max;
4880 while (scan < loceol && hardcount < max && *scan != '\n') {
4881 scan += UTF8SKIP(scan);
4885 while (scan < loceol && *scan != '\n')
4892 while (scan < loceol && hardcount < max) {
4893 scan += UTF8SKIP(scan);
4903 case EXACT: /* length of string is 1 */
4905 while (scan < loceol && UCHARAT(scan) == c)
4908 case EXACTF: /* length of string is 1 */
4910 while (scan < loceol &&
4911 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4914 case EXACTFL: /* length of string is 1 */
4915 PL_reg_flags |= RF_tainted;
4917 while (scan < loceol &&
4918 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4924 while (hardcount < max && scan < loceol &&
4925 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
4926 scan += UTF8SKIP(scan);
4930 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
4937 LOAD_UTF8_CHARCLASS_ALNUM();
4938 while (hardcount < max && scan < loceol &&
4939 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4940 scan += UTF8SKIP(scan);
4944 while (scan < loceol && isALNUM(*scan))
4949 PL_reg_flags |= RF_tainted;
4952 while (hardcount < max && scan < loceol &&
4953 isALNUM_LC_utf8((U8*)scan)) {
4954 scan += UTF8SKIP(scan);
4958 while (scan < loceol && isALNUM_LC(*scan))
4965 LOAD_UTF8_CHARCLASS_ALNUM();
4966 while (hardcount < max && scan < loceol &&
4967 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4968 scan += UTF8SKIP(scan);
4972 while (scan < loceol && !isALNUM(*scan))
4977 PL_reg_flags |= RF_tainted;
4980 while (hardcount < max && scan < loceol &&
4981 !isALNUM_LC_utf8((U8*)scan)) {
4982 scan += UTF8SKIP(scan);
4986 while (scan < loceol && !isALNUM_LC(*scan))
4993 LOAD_UTF8_CHARCLASS_SPACE();
4994 while (hardcount < max && scan < loceol &&
4996 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4997 scan += UTF8SKIP(scan);
5001 while (scan < loceol && isSPACE(*scan))
5006 PL_reg_flags |= RF_tainted;
5009 while (hardcount < max && scan < loceol &&
5010 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5011 scan += UTF8SKIP(scan);
5015 while (scan < loceol && isSPACE_LC(*scan))
5022 LOAD_UTF8_CHARCLASS_SPACE();
5023 while (hardcount < max && scan < loceol &&
5025 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5026 scan += UTF8SKIP(scan);
5030 while (scan < loceol && !isSPACE(*scan))
5035 PL_reg_flags |= RF_tainted;
5038 while (hardcount < max && scan < loceol &&
5039 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5040 scan += UTF8SKIP(scan);
5044 while (scan < loceol && !isSPACE_LC(*scan))
5051 LOAD_UTF8_CHARCLASS_DIGIT();
5052 while (hardcount < max && scan < loceol &&
5053 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5054 scan += UTF8SKIP(scan);
5058 while (scan < loceol && isDIGIT(*scan))
5065 LOAD_UTF8_CHARCLASS_DIGIT();
5066 while (hardcount < max && scan < loceol &&
5067 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5068 scan += UTF8SKIP(scan);
5072 while (scan < loceol && !isDIGIT(*scan))
5076 default: /* Called on something of 0 width. */
5077 break; /* So match right here or not at all. */
5083 c = scan - PL_reginput;
5087 GET_RE_DEBUG_FLAGS_DECL;
5089 SV * const prop = sv_newmortal();
5090 regprop(prog, prop, p);
5091 PerlIO_printf(Perl_debug_log,
5092 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5093 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
5101 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5103 - regclass_swash - prepare the utf8 swash
5107 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5113 const struct reg_data * const data = prog ? prog->data : NULL;
5115 if (data && data->count) {
5116 const U32 n = ARG(node);
5118 if (data->what[n] == 's') {
5119 SV * const rv = (SV*)data->data[n];
5120 AV * const av = (AV*)SvRV((SV*)rv);
5121 SV **const ary = AvARRAY(av);
5124 /* See the end of regcomp.c:S_regclass() for
5125 * documentation of these array elements. */
5128 a = SvROK(ary[1]) ? &ary[1] : 0;
5129 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5133 else if (si && doinit) {
5134 sw = swash_init("utf8", "", si, 1, 0);
5135 (void)av_store(av, 1, sw);
5152 - reginclass - determine if a character falls into a character class
5154 The n is the ANYOF regnode, the p is the target string, lenp
5155 is pointer to the maximum length of how far to go in the p
5156 (if the lenp is zero, UTF8SKIP(p) is used),
5157 do_utf8 tells whether the target string is in UTF-8.
5162 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5165 const char flags = ANYOF_FLAGS(n);
5171 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5172 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5173 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5174 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5175 if (len == (STRLEN)-1)
5176 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5179 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5180 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5183 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5184 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5187 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5191 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5194 if (swash_fetch(sw, p, do_utf8))
5196 else if (flags & ANYOF_FOLD) {
5197 if (!match && lenp && av) {
5199 for (i = 0; i <= av_len(av); i++) {
5200 SV* const sv = *av_fetch(av, i, FALSE);
5202 const char * const s = SvPV_const(sv, len);
5204 if (len <= plen && memEQ(s, (char*)p, len)) {
5212 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5215 to_utf8_fold(p, tmpbuf, &tmplen);
5216 if (swash_fetch(sw, tmpbuf, do_utf8))
5222 if (match && lenp && *lenp == 0)
5223 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5225 if (!match && c < 256) {
5226 if (ANYOF_BITMAP_TEST(n, c))
5228 else if (flags & ANYOF_FOLD) {
5231 if (flags & ANYOF_LOCALE) {
5232 PL_reg_flags |= RF_tainted;
5233 f = PL_fold_locale[c];
5237 if (f != c && ANYOF_BITMAP_TEST(n, f))
5241 if (!match && (flags & ANYOF_CLASS)) {
5242 PL_reg_flags |= RF_tainted;
5244 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5245 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5246 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5247 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5248 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5249 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5250 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5251 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5252 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5253 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5254 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5255 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5256 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5257 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5258 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5259 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5260 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5261 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5262 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5263 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5264 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5265 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5266 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5267 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5268 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5269 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5270 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5271 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5272 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5273 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5274 ) /* How's that for a conditional? */
5281 return (flags & ANYOF_INVERT) ? !match : match;
5285 S_reghop3(U8 *s, I32 off, const U8* lim)
5289 while (off-- && s < lim) {
5290 /* XXX could check well-formedness here */
5298 if (UTF8_IS_CONTINUED(*s)) {
5299 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5302 /* XXX could check well-formedness here */
5310 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5314 while (off-- && s < lim) {
5315 /* XXX could check well-formedness here */
5325 if (UTF8_IS_CONTINUED(*s)) {
5326 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
5329 /* XXX could check well-formedness here */
5341 restore_pos(pTHX_ void *arg)
5344 regexp * const rex = (regexp *)arg;
5345 if (PL_reg_eval_set) {
5346 if (PL_reg_oldsaved) {
5347 rex->subbeg = PL_reg_oldsaved;
5348 rex->sublen = PL_reg_oldsavedlen;
5349 #ifdef PERL_OLD_COPY_ON_WRITE
5350 rex->saved_copy = PL_nrs;
5352 RX_MATCH_COPIED_on(rex);
5354 PL_reg_magic->mg_len = PL_reg_oldpos;
5355 PL_reg_eval_set = 0;
5356 PL_curpm = PL_reg_oldcurpm;
5361 S_to_utf8_substr(pTHX_ register regexp *prog)
5363 if (prog->float_substr && !prog->float_utf8) {
5364 SV* const sv = newSVsv(prog->float_substr);
5365 prog->float_utf8 = sv;
5366 sv_utf8_upgrade(sv);
5367 if (SvTAIL(prog->float_substr))
5369 if (prog->float_substr == prog->check_substr)
5370 prog->check_utf8 = sv;
5372 if (prog->anchored_substr && !prog->anchored_utf8) {
5373 SV* const sv = newSVsv(prog->anchored_substr);
5374 prog->anchored_utf8 = sv;
5375 sv_utf8_upgrade(sv);
5376 if (SvTAIL(prog->anchored_substr))
5378 if (prog->anchored_substr == prog->check_substr)
5379 prog->check_utf8 = sv;
5384 S_to_byte_substr(pTHX_ register regexp *prog)
5387 if (prog->float_utf8 && !prog->float_substr) {
5388 SV* sv = newSVsv(prog->float_utf8);
5389 prog->float_substr = sv;
5390 if (sv_utf8_downgrade(sv, TRUE)) {
5391 if (SvTAIL(prog->float_utf8))
5395 prog->float_substr = sv = &PL_sv_undef;
5397 if (prog->float_utf8 == prog->check_utf8)
5398 prog->check_substr = sv;
5400 if (prog->anchored_utf8 && !prog->anchored_substr) {
5401 SV* sv = newSVsv(prog->anchored_utf8);
5402 prog->anchored_substr = sv;
5403 if (sv_utf8_downgrade(sv, TRUE)) {
5404 if (SvTAIL(prog->anchored_utf8))
5408 prog->anchored_substr = sv = &PL_sv_undef;
5410 if (prog->anchored_utf8 == prog->check_utf8)
5411 prog->check_substr = sv;
5417 * c-indentation-style: bsd
5419 * indent-tabs-mode: t
5422 * ex: set ts=8 sts=4 sw=4 noet: